From 70dce2c89f185c65b436c28404ae5b7bdb32c2d1 Mon Sep 17 00:00:00 2001 From: Elango Date: Wed, 15 Jun 2022 09:52:13 -0700 Subject: [PATCH] ignore Eclipse files + incremental 4-space indent Java formatter (#254) --- .github/workflows/cli-build-instructions.yml | 27 +- .gitignore | 1 + .../com/ibm/icu/impl/MultiComparator.java | 6 +- .../org/unicode/cldr/tool/TablePrinter.java | 995 ++-- .../org/unicode/jsp/AlternateIterator.java | 148 +- .../java/org/unicode/jsp/Annotations.java | 20 +- .../main/java/org/unicode/jsp/BiMultimap.java | 27 +- .../java/org/unicode/jsp/BidiCharMap.java | 146 +- .../java/org/unicode/jsp/BidiReference.java | 2247 +++++---- .../org/unicode/jsp/BranchStringPrepData.java | 780 ++-- .../main/java/org/unicode/jsp/Builder.java | 139 +- .../java/org/unicode/jsp/CachedProps.java | 75 +- .../src/main/java/org/unicode/jsp/Common.java | 236 +- .../org/unicode/jsp/CompressedDataInput.java | 15 +- .../java/org/unicode/jsp/Confusables.java | 416 +- .../org/unicode/jsp/CreateInversions.java | 424 +- .../src/main/java/org/unicode/jsp/Dummy.java | 55 +- .../org/unicode/jsp/GeneralUtilities.java | 34 +- .../org/unicode/jsp/GenerateSubheader.java | 18 +- .../src/main/java/org/unicode/jsp/Globe.java | 4121 +++++++++-------- .../java/org/unicode/jsp/LanguageCode.java | 301 +- .../src/main/java/org/unicode/jsp/NFM.java | 23 +- .../main/java/org/unicode/jsp/Navigator.java | 405 +- .../org/unicode/jsp/PropertyMetadata.java | 55 +- .../main/java/org/unicode/jsp/QuickCheck.java | 6 +- .../org/unicode/jsp/ScriptCategoriesCopy.java | 290 +- .../java/org/unicode/jsp/ScriptTester.java | 966 ++-- .../java/org/unicode/jsp/ScriptTester2.java | 102 +- .../java/org/unicode/jsp/SequenceData.java | 37 +- .../org/unicode/jsp/SimpleTransliterator.java | 18 +- .../main/java/org/unicode/jsp/Subheader.java | 71 +- .../org/unicode/jsp/SubheaderSnapshot.java | 1464 +++--- .../main/java/org/unicode/jsp/Typology.java | 64 +- .../main/java/org/unicode/jsp/UBAVersion.java | 33 +- 
.../org/unicode/jsp/UnicodeDataInput.java | 19 +- .../main/java/org/unicode/jsp/UnicodeJsp.java | 200 +- .../java/org/unicode/jsp/UnicodeProperty.java | 643 ++- .../org/unicode/jsp/UnicodeSetUtilities.java | 214 +- .../org/unicode/jsp/UnicodeUtilities.java | 1114 +++-- .../java/org/unicode/jsp/UtfParameters.java | 133 +- .../org/unicode/jsp/XPropertyFactory.java | 429 +- .../java/org/unicode/jsp/TestUBAVersion.java | 6 +- .../java/org/unicode/jsptest/QuickCheck.java | 30 +- .../java/org/unicode/jsptest/TestAll.java | 34 +- .../jsptest/TestAlternateIterator.java | 94 +- .../unicode/jsptest/TestBasicProperties.java | 21 +- .../java/org/unicode/jsptest/TestBuilder.java | 413 +- .../java/org/unicode/jsptest/TestEmoji.java | 10 +- .../java/org/unicode/jsptest/TestFmwk2.java | 21 +- .../org/unicode/jsptest/TestGenerate.java | 330 +- .../unicode/jsptest/TestIcuProperties.java | 31 +- .../java/org/unicode/jsptest/TestJsp.java | 545 ++- .../org/unicode/jsptest/TestLanguageid.java | 5 +- .../org/unicode/jsptest/TestProperties.java | 214 +- .../org/unicode/jsptest/TestScriptTester.java | 154 +- .../org/unicode/jsptest/TestTypology.java | 277 +- .../org/unicode/jsptest/TestUnicodeSet.java | 306 +- pom.xml | 292 +- .../unicode/unittest/TestFmwkMinusMinus.java | 80 +- .../java/com/ibm/icu/dev/tool/UOption.java | 289 +- .../bidi/BidiConformanceTestBuilder.java | 627 +-- .../java/org/unicode/bidi/BidiReference.java | 610 ++- .../org/unicode/bidi/BidiReferenceTest.java | 53 +- .../bidi/BidiReferenceTestCharmap.java | 262 +- .../bidi/BidiTestIcu4jConformance.java | 4 +- .../org/unicode/bidi/GenerateN1Tests.java | 35 +- .../com/ibm/icu/text/StringTransform.java | 10 +- .../org/unicode/draft/AcceptLanguage.java | 60 +- .../java/org/unicode/draft/Alphagram.java | 5 +- .../org/unicode/draft/CharacterFrequency.java | 80 +- .../java/org/unicode/draft/CheckCollator.java | 20 +- .../org/unicode/draft/CheckComparison.java | 74 +- .../java/org/unicode/draft/CheckPunycode.java | 54 +- 
.../org/unicode/draft/CheckResources.java | 112 +- .../java/org/unicode/draft/CldrUtility.java | 510 +- .../main/java/org/unicode/draft/Cmudict.java | 124 +- .../java/org/unicode/draft/CodePoint.java | 46 +- .../main/java/org/unicode/draft/Compare.java | 28 +- .../unicode/draft/CompareCldrUnihanData.java | 77 +- .../java/org/unicode/draft/ComparePinyin.java | 252 +- .../unicode/draft/CompressedDataInput.java | 15 +- .../unicode/draft/CompressedDataOutput.java | 32 +- .../draft/CountryPopulationByCode.java | 299 +- .../java/org/unicode/draft/FindHanSizes.java | 80 +- .../org/unicode/draft/FormatRegistry.java | 31 +- .../org/unicode/draft/FormatSpecialData2.java | 70 +- .../org/unicode/draft/FrequencyData2.java | 486 +- .../java/org/unicode/draft/FuzzyNumber.java | 17 +- .../java/org/unicode/draft/FuzzyTest.java | 43 +- .../org/unicode/draft/GenerateCasedPairs.java | 24 +- .../GenerateCharacterFrequencyCharts.java | 314 +- .../unicode/draft/GenerateLanguageNames.java | 20 +- .../draft/GenerateNormalizeForMatch2.java | 450 +- .../unicode/draft/GeneratePickerData2.java | 1890 ++++++-- .../draft/GenerateUnihanCollatorFiles.java | 43 +- .../draft/GenerateUnihanCollators.java | 891 ++-- .../java/org/unicode/draft/GetCurrencies.java | 65 +- .../main/java/org/unicode/draft/GetNames.java | 5 +- .../org/unicode/draft/HanFrequencies.java | 108 +- .../main/java/org/unicode/draft/Hello.java | 166 +- .../main/java/org/unicode/draft/IcuCache.java | 25 +- .../java/org/unicode/draft/IdnaFrequency.java | 123 +- .../org/unicode/draft/IdnaLabelTester2.java | 669 ++- .../src/main/java/org/unicode/draft/Ids2.java | 328 +- .../draft/LanguageDetectionVsTags.java | 89 +- .../org/unicode/draft/LanguageQuadgrams.java | 73 +- .../org/unicode/draft/ListTopLanguages.java | 63 +- .../java/org/unicode/draft/MessageFormat.java | 1135 +++-- .../src/main/java/org/unicode/draft/Misc.java | 235 +- .../java/org/unicode/draft/OldPunycode.java | 465 +- .../java/org/unicode/draft/PickerApp.java | 125 +- 
.../java/org/unicode/draft/PickerData2.java | 8 +- .../main/java/org/unicode/draft/Punycode.java | 309 +- .../org/unicode/draft/RadicalStroke2.java | 143 +- .../org/unicode/draft/ScriptCategories2.java | 1190 ++--- .../java/org/unicode/draft/ScriptCount.java | 200 +- .../java/org/unicode/draft/SetComparator.java | 2 +- .../unicode/draft/SimpleFormatRegistry.java | 383 +- .../main/java/org/unicode/draft/Snippet.java | 7 +- .../java/org/unicode/draft/Subheader2.java | 139 +- .../java/org/unicode/draft/TimeEntry.java | 9 +- .../org/unicode/draft/UnicodeDataInput.java | 19 +- .../org/unicode/draft/UnicodeDataOutput.java | 38 +- .../java/org/unicode/draft/UnicodeIntMap.java | 497 +- .../unicode/draft/WebpageCharacterData.java | 106 +- .../java/org/unicode/draft/WordSolver.java | 23 +- .../java/org/unicode/idna/CheckIdna2008.java | 42 +- .../unicode/idna/CompareCompatProperties.java | 64 +- .../idna/FilteredUnicodeTransform.java | 6 +- .../java/org/unicode/idna/GenerateIdna.java | 403 +- .../idna/GenerateIdnaStableSamples.java | 63 +- .../org/unicode/idna/GenerateIdnaTest.java | 1287 +++-- .../src/main/java/org/unicode/idna/Idna.java | 55 +- .../main/java/org/unicode/idna/Idna2003.java | 18 +- .../main/java/org/unicode/idna/Idna2008.java | 83 +- .../main/java/org/unicode/idna/Idna2008t.java | 57 +- .../main/java/org/unicode/idna/IdnaTypes.java | 3 +- .../java/org/unicode/idna/LoadIdnaTest.java | 145 +- .../main/java/org/unicode/idna/Punycode.java | 459 +- .../main/java/org/unicode/idna/Regexes.java | 10 +- .../java/org/unicode/idna/StringPrepData.java | 315 +- .../src/main/java/org/unicode/idna/Uts46.java | 257 +- .../java/org/unicode/jsp/CharEncoder.java | 27 +- .../java/org/unicode/jsp/FileUtilities.java | 107 +- .../org/unicode/jsp/ICUPropertyFactory.java | 1025 ++-- .../java/org/unicode/jsp/MySymbolTable.java | 92 +- .../java/org/unicode/jsp/UnicodeRegex.java | 202 +- .../org/unicode/jsp/XIDModifications.java | 31 +- .../src/main/java/org/unicode/parse/EBNF.java | 93 
+- .../src/main/java/org/unicode/parse/Pick.java | 129 +- .../java/org/unicode/parse/Tokenizer.java | 220 +- .../java/org/unicode/picker/CharData.java | 1526 +++--- .../java/org/unicode/props/BagFormatter.java | 392 +- .../java/org/unicode/props/DefaultValues.java | 18 +- .../java/org/unicode/props/GenerateEnums.java | 517 ++- .../unicode/props/IndexUnicodeProperties.java | 236 +- .../unicode/props/PropNormalizationData.java | 153 +- .../org/unicode/props/PropertyLister.java | 39 +- .../java/org/unicode/props/PropertyNames.java | 39 +- .../unicode/props/PropertyParsingInfo.java | 812 ++-- .../org/unicode/props/PropertyStatus.java | 828 ++-- .../java/org/unicode/props/PropertyType.java | 6 +- .../org/unicode/props/PropertyUtilities.java | 36 +- .../org/unicode/props/PropertyValueSets.java | 110 +- .../unicode/props/RandomStringGenerator.java | 63 +- .../java/org/unicode/props/ScriptInfo.java | 125 +- .../java/org/unicode/props/UcdProperty.java | 121 +- .../org/unicode/props/UcdPropertyValues.java | 731 +-- .../org/unicode/props/UnicodeProperty.java | 699 ++- .../props/UnicodePropertyException.java | 4 + .../props/UnicodePropertySymbolTable.java | 385 +- .../org/unicode/props/UnicodeRelation.java | 89 +- .../unicode/props/UnicodeSetUtilities.java | 7 +- .../org/unicode/props/ValueCardinality.java | 7 +- .../java/org/unicode/props/VersionToAge.java | 122 +- .../main/java/org/unicode/temp/Rounder.java | 180 +- .../org/unicode/temp/UnicodePropertyX.java | 7 +- .../main/java/org/unicode/text/TestICU4J.java | 169 +- .../java/org/unicode/text/UCA/CEList.java | 217 +- .../unicode/text/UCA/CompareDucetToCldr.java | 158 +- .../java/org/unicode/text/UCA/Fractional.java | 130 +- .../org/unicode/text/UCA/FractionalUCA.java | 589 ++- .../java/org/unicode/text/UCA/GenOverlap.java | 231 +- .../java/org/unicode/text/UCA/Implicit.java | 79 +- .../main/java/org/unicode/text/UCA/Main.java | 148 +- .../org/unicode/text/UCA/MakeNamesChart.java | 628 ++- 
.../text/UCA/MappingsForFractionalUCA.java | 149 +- .../text/UCA/PrimariesToFractional.java | 591 +-- .../org/unicode/text/UCA/RadicalStroke.java | 223 +- .../org/unicode/text/UCA/ReorderCodes.java | 115 +- .../unicode/text/UCA/ReorderingTokens.java | 52 +- .../unicode/text/UCA/SecTerToFractional.java | 47 +- .../text/UCA/TestCompatibilityCharacters.java | 112 +- .../main/java/org/unicode/text/UCA/UCA.java | 833 ++-- .../java/org/unicode/text/UCA/UCA_Data.java | 106 +- .../org/unicode/text/UCA/UCA_Statistics.java | 39 +- .../java/org/unicode/text/UCA/UCA_Types.java | 35 +- .../java/org/unicode/text/UCA/Validity.java | 759 +-- .../org/unicode/text/UCA/WriteAllkeys.java | 58 +- .../org/unicode/text/UCA/WriteCharts.java | 866 ++-- .../unicode/text/UCA/WriteCollationData.java | 878 ++-- .../text/UCA/WriteConformanceTest.java | 106 +- .../unicode/text/UCA/WriteHTMLCollation.java | 3253 +++++++------ .../java/org/unicode/text/UCD/BuildNames.java | 182 +- .../java/org/unicode/text/UCD/Charts.java | 16 +- .../org/unicode/text/UCD/CheckCollator.java | 141 +- .../java/org/unicode/text/UCD/CheckICU.java | 152 +- .../unicode/text/UCD/ChineseFrequency.java | 47 +- .../unicode/text/UCD/CodePointProperty.java | 28 +- .../org/unicode/text/UCD/CompactName.java | 27 +- .../org/unicode/text/UCD/Compare14652.java | 260 +- .../unicode/text/UCD/CompareProperties.java | 173 +- .../java/org/unicode/text/UCD/ConvertUCD.java | 300 +- .../java/org/unicode/text/UCD/Default.java | 17 +- .../org/unicode/text/UCD/DerivedProperty.java | 1173 ++--- .../text/UCD/DerivedPropertyLister.java | 52 +- .../unicode/text/UCD/DiffPropertyLister.java | 41 +- .../unicode/text/UCD/GenerateBreakTest.java | 3113 +++++++------ .../unicode/text/UCD/GenerateCaseFolding.java | 406 +- .../unicode/text/UCD/GenerateCaseTest.java | 57 +- .../unicode/text/UCD/GenerateConfusables.java | 1216 +++-- .../text/UCD/GenerateConfusablesCopy.java | 1656 ++++--- .../org/unicode/text/UCD/GenerateData.java | 102 +- 
.../text/UCD/GenerateHanTransliterator.java | 1086 +++-- .../text/UCD/GenerateNamedSequences.java | 121 +- .../UCD/GenerateStandardizedVariants.java | 277 +- .../unicode/text/UCD/GenerateStringPrep.java | 424 +- .../unicode/text/UCD/GenerateThaiBreaks.java | 73 +- .../UCD/GenerateWholeScriptConfusables.java | 840 ++-- .../org/unicode/text/UCD/GetTypology.java | 53 +- .../java/org/unicode/text/UCD/IANANames.java | 74 +- .../java/org/unicode/text/UCD/IDNTester.java | 36 +- .../org/unicode/text/UCD/IdentifierInfo.java | 713 +-- .../java/org/unicode/text/UCD/IntMap.java | 16 +- .../org/unicode/text/UCD/ListNFComplete.java | 7 +- .../org/unicode/text/UCD/MLStreamWriter.java | 191 +- .../main/java/org/unicode/text/UCD/Main.java | 171 +- .../unicode/text/UCD/MakeUnicodeFiles.java | 1030 ++-- .../org/unicode/text/UCD/MyFloatLister.java | 16 +- .../unicode/text/UCD/MyPropertyLister.java | 42 +- .../org/unicode/text/UCD/NFCSkippable.java | 6 +- .../org/unicode/text/UCD/NFSkippable.java | 157 +- .../java/org/unicode/text/UCD/NamesList.java | 336 +- .../unicode/text/UCD/NormalizationData.java | 12 +- .../text/UCD/NormalizationDataStandard.java | 42 +- .../java/org/unicode/text/UCD/Normalizer.java | 310 +- .../unicode/text/UCD/NormalizerSample.java | 174 +- .../org/unicode/text/UCD/OldUnicodeMap.java | 29 +- .../org/unicode/text/UCD/ProcessUnihan.java | 13 +- .../org/unicode/text/UCD/PropertyLister.java | 146 +- .../java/org/unicode/text/UCD/QuickTest.java | 682 +-- .../unicode/text/UCD/ScriptExceptions.java | 750 ++- .../unicode/text/UCD/ScriptExtensions.java | 68 +- .../org/unicode/text/UCD/ScriptTimeline.java | 9 +- .../org/unicode/text/UCD/StripDAndCopy.java | 9 +- .../org/unicode/text/UCD/TernaryStore.java | 93 +- .../org/unicode/text/UCD/TestAsciify.java | 164 +- .../java/org/unicode/text/UCD/TestData.java | 1234 ++--- .../org/unicode/text/UCD/TestIdentifiers.java | 121 +- .../unicode/text/UCD/TestNameUniqueness.java | 69 +- .../unicode/text/UCD/TestNormalization.java | 
102 +- .../text/UCD/TestUnicodeInvariants.java | 727 +-- .../org/unicode/text/UCD/ToolIdna2008.java | 21 +- .../text/UCD/ToolUnicodePropertySource.java | 2502 ++++++---- .../text/UCD/ToolUnicodeTransformFactory.java | 29 +- .../main/java/org/unicode/text/UCD/UCD.java | 1084 +++-- .../org/unicode/text/UCD/UCDProperty.java | 230 +- .../java/org/unicode/text/UCD/UCD_Names.java | 972 ++-- .../java/org/unicode/text/UCD/UCD_Types.java | 1422 +++--- .../main/java/org/unicode/text/UCD/UData.java | 63 +- .../unicode/text/UCD/UnicodeMapParser.java | 237 +- .../text/UCD/UnifiedBinaryProperty.java | 472 +- .../org/unicode/text/UCD/UnifiedProperty.java | 124 +- .../unicode/text/UCD/UseTransliterator.java | 17 +- .../java/org/unicode/text/UCD/VerifyUCD.java | 1511 +++--- .../unicode/text/UCD/WriteJavaScriptInfo.java | 28 +- .../UCD/XObsoleteGenerateLineBreakTest.java | 280 +- .../unicode/text/tools/AddCharacterNames.java | 111 +- .../unicode/text/tools/AnnotationCheck.java | 36 +- .../text/tools/CLDRCharacterUtility.java | 11 +- .../text/tools/CharsByAgeAndCategory.java | 22 +- .../org/unicode/text/tools/CheckFont.java | 44 +- .../org/unicode/text/tools/CheckFonts.java | 43 +- .../java/org/unicode/text/tools/CheckHan.java | 52 +- .../text/tools/CheckSecurityProposals.java | 81 +- .../org/unicode/text/tools/CheckUnihan.java | 9 +- .../org/unicode/text/tools/Collections.java | 4 +- .../org/unicode/text/tools/CompareIDNA.java | 13 +- .../text/tools/CompareIdentifiers.java | 15 +- .../unicode/text/tools/CompareProperties.java | 38 +- .../text/tools/CompareScriptExtensions.java | 14 +- .../text/tools/FindDuplicateFiles.java | 58 +- .../text/tools/GenerateAnnotationTree.java | 146 +- .../unicode/text/tools/GenerateCeMapping.java | 42 +- .../unicode/text/tools/GenerateLabels.java | 144 +- .../unicode/text/tools/GenerateRadicals.java | 169 +- .../text/tools/GenerateSubtagNames.java | 8 +- .../unicode/text/tools/GifSequenceWriter.java | 110 +- 
.../java/org/unicode/text/tools/IcannMsr.java | 194 +- .../org/unicode/text/tools/Linkifier.java | 174 +- .../org/unicode/text/tools/Linkifier2.java | 213 +- .../unicode/text/tools/MakeEmojiTable.java | 51 +- .../unicode/text/tools/NamesListPrint.java | 132 +- .../org/unicode/text/tools/NotoCoverage.java | 42 +- .../text/tools/OldEmojiProcessing.java | 31 +- .../unicode/text/tools/PropertyChanges.java | 20 +- .../text/tools/RecommendedSetGenerator.java | 221 +- .../org/unicode/text/tools/RegexBuilder.java | 63 +- .../org/unicode/text/tools/RenameFiles.java | 210 +- .../unicode/text/tools/ScriptPopulation.java | 787 ++-- .../text/tools/ShowCharacterFrequency.java | 63 +- .../unicode/text/tools/ShowCharacters.java | 228 +- .../unicode/text/tools/ShowPatternSyntax.java | 5 +- .../unicode/text/tools/ShowUnicodeGrowth.java | 121 +- .../text/tools/SimplifiedAndTraditional.java | 77 +- .../org/unicode/text/tools/StringTree.java | 231 +- .../org/unicode/text/tools/TransformFile.java | 8 +- .../unicode/text/tools/UnicodeSetTree.java | 33 +- .../org/unicode/text/tools/VerifyIdna.java | 176 +- .../org/unicode/text/tools/VerifyUCD.java | 302 +- .../org/unicode/text/tools/VerifyXmlUcd.java | 166 +- .../org/unicode/text/tools/WordFrequency.java | 155 +- .../unicode/text/tools/XIDModifications.java | 18 +- .../org/unicode/text/utility/Birelation.java | 54 +- .../org/unicode/text/utility/CallArgs.java | 22 +- .../unicode/text/utility/ChainException.java | 20 +- .../text/utility/CompactByteArray.java | 187 +- .../text/utility/CompactShortArray.java | 293 +- .../text/utility/ComparisonNormalizer.java | 173 +- .../org/unicode/text/utility/DifferTest.java | 41 +- .../text/utility/DirectoryIterator.java | 46 +- .../org/unicode/text/utility/DualWriter.java | 24 +- .../org/unicode/text/utility/EnumBase.java | 52 +- .../text/utility/FastBinarySearch.java | 313 +- .../text/utility/FastBinarySearchTest.java | 42 +- .../text/utility/FastByteBinarySearch.java | 172 +- 
.../text/utility/FastCharBinarySearch.java | 172 +- .../text/utility/FastIntBinarySearch.java | 172 +- .../text/utility/FastLongBinarySearch.java | 172 +- .../text/utility/FastShortBinarySearch.java | 172 +- .../unicode/text/utility/FastUnicodeSet.java | 15 +- .../text/utility/FastUnicodeSetTest.java | 130 +- .../text/utility/FileLineIterator.java | 32 +- .../utility/IcuUnicodeNormalizerFactory.java | 28 +- .../unicode/text/utility/IndentWriter.java | 34 +- .../org/unicode/text/utility/IntStack.java | 19 +- .../text/utility/LengthFirstComparator.java | 14 +- .../java/org/unicode/text/utility/Main.java | 67 +- .../text/utility/OldEquivalenceClass.java | 62 +- .../java/org/unicode/text/utility/Pair.java | 27 +- .../unicode/text/utility/PoorMansEnum.java | 19 +- .../org/unicode/text/utility/SampleEnum.java | 56 +- .../org/unicode/text/utility/Settings.java | 148 +- .../unicode/text/utility/TempPrintWriter.java | 14 +- .../org/unicode/text/utility/TestUtility.java | 242 +- .../org/unicode/text/utility/UTF16Plus.java | 12 +- .../java/org/unicode/text/utility/UTF32.java | 37 +- .../text/utility/UTF8StreamReader.java | 190 +- .../text/utility/UTF8StreamWriter.java | 59 +- .../unicode/text/utility/UnicodeDataFile.java | 66 +- .../unicode/text/utility/UnicodeMapInt.java | 10 +- .../text/utility/UnicodeSetParser.java | 115 +- .../text/utility/UnicodeTransform.java | 34 +- .../org/unicode/text/utility/Utility.java | 851 ++-- .../org/unicode/text/utility/UtilityBase.java | 27 +- .../org/unicode/text/utility/XMLParse.java | 595 +-- .../unicode/text/utility/XMLParseTypes.java | 73 +- .../org/unicode/text/utility/testParser.java | 487 +- .../main/java/org/unicode/tools/AacCheck.java | 145 +- .../main/java/org/unicode/tools/AacOrder.java | 132 +- .../unicode/tools/CharacterCategories.java | 72 +- .../org/unicode/tools/CheckEmojiProps.java | 77 +- .../java/org/unicode/tools/CheckSimpTrad.java | 5 +- .../unicode/tools/CollatorEquivalences.java | 200 +- 
.../tools/CollatorEquivalencesNew.java | 268 +- .../main/java/org/unicode/tools/Common.java | 6 +- .../org/unicode/tools/CompareUnicodeSets.java | 25 +- .../java/org/unicode/tools/Confusables.java | 211 +- .../unicode/tools/CopyPropsToUnicodeJsp.java | 26 +- .../java/org/unicode/tools/DraftUtils.java | 10 +- .../unicode/tools/ExtendedPictographic.java | 146 +- .../main/java/org/unicode/tools/FixNcrs.java | 39 +- .../java/org/unicode/tools/FixedProps.java | 282 +- .../tools/GenerateNormalizeForMatch.java | 756 +-- .../org/unicode/tools/GeneratePickerData.java | 1891 +++++--- .../unicode/tools/GeneratePickerData2.java | 7 +- .../unicode/tools/GenerateRadicalEnum.java | 12 +- .../org/unicode/tools/GenerateXIDModSets.java | 28 +- .../unicode/tools/GetSIUnitTranslations.java | 64 +- .../src/main/java/org/unicode/tools/Ids.java | 1197 +++-- .../java/org/unicode/tools/IdsFileData.java | 29 +- .../java/org/unicode/tools/ListProps.java | 264 +- .../org/unicode/tools/MultiComparator.java | 4 +- .../org/unicode/tools/NormalizeForMatch.java | 49 +- .../unicode/tools/NormalizeForMatchDiff.java | 81 +- .../java/org/unicode/tools/Normalizer3.java | 37 +- .../main/java/org/unicode/tools/Quick.java | 76 +- .../java/org/unicode/tools/RadicalData.java | 116 +- .../java/org/unicode/tools/RadicalEnum.java | 7 +- .../java/org/unicode/tools/RadicalStroke.java | 142 +- .../org/unicode/tools/ScriptDetector.java | 108 +- .../java/org/unicode/tools/Segmenter.java | 1318 +++--- .../unicode/tools/ShowScriptCategories.java | 98 +- .../java/org/unicode/tools/Subheader.java | 139 +- .../java/org/unicode/tools/TestSegments.java | 130 +- .../main/java/org/unicode/tools/Unilex.java | 349 +- .../org/unicode/tools/UpdateJspFiles.java | 93 +- .../org/unicode/tools/emoji/BirthInfo.java | 80 +- .../tools/emoji/CandidateAnnotations.java | 40 +- .../unicode/tools/emoji/CandidateData.java | 865 ++-- .../unicode/tools/emoji/CarrierGlyphs.java | 295 +- .../unicode/tools/emoji/ChartUtilities.java | 153 +- 
.../unicode/tools/emoji/CompareEmojiFreq.java | 121 +- .../unicode/tools/emoji/CopyImagesToCldr.java | 51 +- .../org/unicode/tools/emoji/CountEmoji.java | 317 +- .../unicode/tools/emoji/CountValidEmoji.java | 92 +- .../unicode/tools/emoji/DebugUtilities.java | 9 +- .../org/unicode/tools/emoji/DocRegistry.java | 39 +- .../java/org/unicode/tools/emoji/Emoji.java | 1021 ++-- .../unicode/tools/emoji/EmojiAnnotations.java | 177 +- .../org/unicode/tools/emoji/EmojiData.java | 865 ++-- .../unicode/tools/emoji/EmojiDataSource.java | 41 +- .../tools/emoji/EmojiDataSourceCombined.java | 124 +- .../unicode/tools/emoji/EmojiFlagOrder.java | 125 +- .../unicode/tools/emoji/EmojiFrequency.java | 472 +- .../tools/emoji/EmojiFrequencyOld.java | 64 +- .../unicode/tools/emoji/EmojiImageData.java | 201 +- .../unicode/tools/emoji/EmojiIterator.java | 41 +- .../unicode/tools/emoji/EmojiLinkAdder.java | 18 +- .../unicode/tools/emoji/EmojiLocaleData.java | 10 +- .../org/unicode/tools/emoji/EmojiMatcher.java | 338 +- .../org/unicode/tools/emoji/EmojiOrder.java | 525 ++- .../org/unicode/tools/emoji/EmojiRename.java | 149 +- .../org/unicode/tools/emoji/EmojiStats.java | 183 +- .../unicode/tools/emoji/FindExtraImages.java | 11 +- .../org/unicode/tools/emoji/FixEmojiText.java | 27 +- .../tools/emoji/GenerateAnnotations.java | 95 +- .../unicode/tools/emoji/GenerateCldrData.java | 217 +- .../unicode/tools/emoji/GenerateEmoji.java | 2711 +++++++---- .../tools/emoji/GenerateEmojiData.java | 911 ++-- .../tools/emoji/GenerateEmojiFrequency.java | 37 +- .../tools/emoji/GenerateEmojiTestFile.java | 252 +- .../unicode/tools/emoji/GenerateExtPict.java | 105 +- .../unicode/tools/emoji/GenerateGroups.java | 13 +- .../emoji/GenerateMissingAnnotations.java | 236 +- .../emoji/GenerateOldestAnnotations.java | 228 +- .../tools/emoji/GenerateSpecImage.java | 167 +- .../org/unicode/tools/emoji/GmailEmoji.java | 183 +- .../java/org/unicode/tools/emoji/Hexer.java | 51 +- .../org/unicode/tools/emoji/Keywords.java | 
10 +- .../unicode/tools/emoji/ListAnnotations.java | 39 +- .../org/unicode/tools/emoji/ListFonts.java | 55 +- .../unicode/tools/emoji/ListSegmentation.java | 100 +- .../org/unicode/tools/emoji/LoadImage.java | 850 ++-- .../tools/emoji/OldAnnotationData.java | 226 +- .../emoji/ParseSpreadsheetAnnotations.java | 1058 +++-- .../org/unicode/tools/emoji/ProposalData.java | 384 +- .../unicode/tools/emoji/TempPrintWriter.java | 8 +- .../unicode/tools/emoji/UnicodeFontData.java | 69 +- .../org/unicode/tools/emoji/UnicodeSets.java | 7 +- .../main/java/org/unicode/unused/CaseBit.java | 155 +- .../org/unicode/unused/CheckSystemFonts.java | 592 ++- .../unicode/unused/DataInputCompressor.java | 47 +- .../unicode/unused/DataOutputCompressor.java | 24 +- .../unused/GenerateUcaDecompositions.java | 135 +- .../java/org/unicode/unused/Implicit.java | 164 +- .../org/unicode/unused/OverlayBundle.java | 128 +- .../main/java/org/unicode/unused/Range.java | 13 +- .../unused/TransformTransliterator.java | 12 +- .../unicode/unused/UnicodePropertySource.java | 210 +- .../org/unicode/unused/YMDDateFormatter.java | 150 +- .../org/unicode/utilities/PolaritySet.java | 80 +- .../utilities/UnicodeSetFormatter.java | 128 +- .../org/unicode/draft/MessageFormatCheck.java | 202 +- .../test/java/org/unicode/draft/Test2.java | 366 +- .../test/java/org/unicode/draft/Test3.java | 25 +- .../java/org/unicode/draft/TestCalendar.java | 172 +- .../org/unicode/draft/TestCompressed.java | 233 +- .../org/unicode/draft/TestDateGenerator.java | 18 +- .../org/unicode/draft/TestRandomIDNA.java | 69 +- .../org/unicode/draft/TestSourceTarget.java | 14 +- .../java/org/unicode/draft/TestTranslit.java | 3 +- .../org/unicode/draft/TestUnicodeSet.java | 21 +- .../java/org/unicode/draft/TestZoneName.java | 10 +- .../test/java/org/unicode/idna/TestIdna.java | 298 +- .../java/org/unicode/idna/TestTransform.java | 10 +- .../test/java/org/unicode/idna/TestUts46.java | 1570 ++++--- .../java/org/unicode/parse/TestTokenizer.java | 
20 +- .../propstest/CheckCombiningMarks.java | 46 +- .../propstest/CheckEmojiCollation.java | 84 +- .../unicode/propstest/CheckEmojiProps.java | 4 +- .../unicode/propstest/CheckEmojiProps2.java | 26 +- .../org/unicode/propstest/CheckFaces.java | 20 +- .../java/org/unicode/propstest/CheckGCB.java | 136 +- .../org/unicode/propstest/CheckNames.java | 36 +- .../CheckOtherPropsWithLineBreak.java | 52 +- .../unicode/propstest/CheckProperties.java | 673 ++- .../propstest/CheckPropertyStability.java | 80 +- .../org/unicode/propstest/CheckRadicals.java | 35 +- .../propstest/CheckScriptExtensions.java | 24 +- .../java/org/unicode/propstest/CheckXML.java | 4 +- .../org/unicode/propstest/CheckXidmod.java | 193 +- .../unicode/propstest/CheckXmlProperties.java | 106 +- .../org/unicode/propstest/Collisions.java | 19 +- .../propstest/CompareExemplarsToIdmod.java | 90 +- .../CompareSegmentationProperties.java | 26 +- .../propstest/CompareVersionedProps.java | 67 +- .../java/org/unicode/propstest/Emoji.java | 128 +- .../propstest/FileUnicodeProperty.java | 5 +- .../java/org/unicode/propstest/FindProps.java | 126 +- .../test/java/org/unicode/propstest/Ids.java | 69 +- .../ListAssignedWithDefaultValues.java | 94 +- .../unicode/propstest/ListPropsNfcDiff.java | 127 +- .../org/unicode/propstest/ListVariants.java | 28 +- .../unicode/propstest/PrintUnicodeSet.java | 13 +- .../unicode/propstest/PropertyAliases.java | 4 +- .../org/unicode/propstest/PropertyDemo.java | 39 +- .../org/unicode/propstest/PropertyGrowth.java | 109 +- .../org/unicode/propstest/RegexWordBreak.java | 82 +- .../unicode/propstest/ShowDifferences.java | 480 +- .../ShowDuplicatePropertyValues.java | 122 +- .../unicode/propstest/ShowEmojiDecomps.java | 5 +- .../unicode/propstest/ShowMetaproperties.java | 42 +- .../propstest/ShowPropertyMetadata.java | 30 +- .../org/unicode/propstest/ShowScripts.java | 81 +- .../java/org/unicode/propstest/ShowStats.java | 205 +- .../org/unicode/propstest/ShowTirhuta.java | 34 +- 
.../org/unicode/propstest/ShowUnicodeSet.java | 193 +- .../unicode/propstest/TestCodePointMap.java | 8 +- .../propstest/TestImmutableUnicodeMap.java | 218 +- .../org/unicode/propstest/TestInvariants.java | 247 +- .../propstest/TestPropNormalization.java | 48 +- .../org/unicode/propstest/TestProperties.java | 288 +- .../unicode/propstest/TestPropertyAccess.java | 128 +- .../org/unicode/propstest/TestScriptInfo.java | 65 +- .../unicode/propstest/TestScriptMetadata.java | 109 +- .../java/org/unicode/propstest/TestXSet.java | 136 +- .../unicode/propstest/TestXUnicodeSet.java | 17 +- .../propstest/UnicodeRelationTest.java | 102 +- .../org/unicode/propstest/XMLProperties.java | 409 +- .../org/unicode/propstest/XMPProperties.java | 4 +- .../test/java/org/unicode/propstest/XSet.java | 59 +- .../java/org/unicode/test/CaseBitTest.java | 58 +- .../org/unicode/test/CheckWholeScript.java | 88 +- .../org/unicode/test/CompareBoundaries.java | 331 +- .../java/org/unicode/test/TestBreaks.java | 10 +- .../java/org/unicode/test/TestSecurity.java | 320 +- .../java/org/unicode/test/TestSegment.java | 315 +- .../java/org/unicode/test/TestUnicodeBnf.java | 78 +- .../unicode/test/TestUnicodeMapParser.java | 111 +- .../unicode/text/UCD/TestCodeInvariants.java | 196 +- .../text/UCD/TestTestUnicodeInvariants.java | 23 +- .../java/org/unicode/tools/AacCheckTest.java | 38 +- .../unicode/tools/emoji/unittest/Asserts.java | 46 +- .../unicode/tools/emoji/unittest/TestAll.java | 20 +- .../emoji/unittest/TestCandidateData.java | 50 +- .../tools/emoji/unittest/TestEmojiData.java | 511 +- .../unittest/TestEmojiDataConsistency.java | 238 +- .../tools/emoji/unittest/TestEmojiOrder.java | 152 +- .../tools/emoji/unittest/TestGender.java | 50 +- .../emoji/unittest/TestProposalData.java | 50 +- .../org/unicode/unittest/AliasDataCldr.java | 14 +- .../org/unicode/unittest/AliasDataRB.java | 27 +- .../org/unicode/unittest/AliasDataSource.java | 246 +- .../unicode/unittest/LocaleCanonicalizer.java | 360 +- 
.../unicode/unittest/LocaleExtensions.java | 256 +- .../test/java/org/unicode/unittest/Rule.java | 4 +- .../unicode/unittest/TestIdentifierInfo.java | 100 +- .../org/unicode/unittest/TestIdnaTest.java | 191 +- .../unittest/TestLocaleCanonicalization.java | 19 +- .../unittest/TestLocaleConstruction.java | 442 +- .../unicode/unittest/TestLocaleMatching.java | 125 +- .../unicode/unittest/TestLocaleValidity.java | 122 +- .../unicode/unittest/TestRegexBuilder.java | 35 +- .../org/unicode/unittest/TestSegmenter.java | 103 +- .../org/unicode/unittest/TestSettings.java | 112 +- .../org/unicode/unittest/TestUnicodeSet.java | 71 +- .../unicode/utilities/TestPolaritySet.java | 102 +- 578 files changed, 77084 insertions(+), 57714 deletions(-) diff --git a/.github/workflows/cli-build-instructions.yml b/.github/workflows/cli-build-instructions.yml index fc1fc10f6..83da57aff 100644 --- a/.github/workflows/cli-build-instructions.yml +++ b/.github/workflows/cli-build-instructions.yml @@ -14,6 +14,21 @@ env: jobs: + # Using the Java style formatter google-java-style provided by the Spotless + # plugin configured in the root pom.xml using 4-space indents (AOSP style). + # Spotless is configured to run only on files in this branch (PR) that differ + # from origin/main + formatter: + name: Formatter + Style checker + runs-on: ubuntu-latest + steps: + - name: Checkout Unicode Tools + uses: actions/checkout@v3 + with: + fetch-depth: 0 # fetch all branches so that Spotless can resolve `origin/main` + - name: Check Java style + run: mvn spotless:check || (echo "Style checker failed. Formatting changes can be applied by 'mvn spotless:apply'" && exit 1) + # Only run 1 or 2 commands for in-source build instructions, just to demonstrate # what translating between an out-of-source build and an in-source build would # look like. 
Rely on out-of-source build steps as the canonical way to @@ -23,9 +38,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout Unicode Tools - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Check out CLDR - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: repository: unicode-org/cldr path: cldr @@ -66,12 +81,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout Unicode Tools - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: repository: unicode-org/unicodetools path: unicodetools/mine/src - name: Checkout CLDR - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: repository: unicode-org/cldr path: cldr/mine/src @@ -158,6 +173,8 @@ jobs: cd unicodetools/mine/src # run GenerateEnums mvn -s .github/workflows/mvn-settings.xml exec:java -Dexec.mainClass="org.unicode.props.GenerateEnums" -Dexec.args="" -pl unicodetools -DCLDR_DIR=$(cd ../../../cldr/mine/src ; pwd) -DUNICODETOOLS_GEN_DIR=$(cd ../Generated ; pwd) -DUNICODETOOLS_REPO_DIR=$(pwd) -DUVERSION=$CURRENT_UVERSION + # apply formatting because generated file will not pass Java formatter + mvn spotless:apply # TODO(#100) Get rid of need for fake version 13.1 of Unicode enum val # Fail if we haven't committed changes from Generate Enums, but make exception for fake Unicode version hack (#100) echo "Check if GenerateEnums output has been committed to repo" @@ -170,7 +187,7 @@ jobs: names=`git diff --name-status | awk '{print $2;}'` if [[ "$names" == "unicodetools/src/main/java/org/unicode/props/UcdPropertyValues.java" ]]; then gitdiffstat=`git diff --stat | tail -n 1` - if [[ "$gitdiffstat" = " 1 file changed, 1 insertion(+), 1 deletion(-)" ]]; then + if [[ "$gitdiffstat" = " 1 file changed, 1 insertion(+), 2 deletions(-)" ]]; then git diff | grep 'V13_1("13.1")' if [ $? 
-eq 0 ]; then echo "Inferred: Only non-committed change after GenerateEnums is temporary V13_1 hack (issue #100)" diff --git a/.gitignore b/.gitignore index 1bd8cf5de..1cd50cc3a 100644 --- a/.gitignore +++ b/.gitignore @@ -43,6 +43,7 @@ perf-*.xml test-*.xml # Directories +.settings/ .vs/ .vscode/ ARM/ diff --git a/UnicodeJsps/src/main/java/com/ibm/icu/impl/MultiComparator.java b/UnicodeJsps/src/main/java/com/ibm/icu/impl/MultiComparator.java index 38fdae5d8..cf3cab77b 100644 --- a/UnicodeJsps/src/main/java/com/ibm/icu/impl/MultiComparator.java +++ b/UnicodeJsps/src/main/java/com/ibm/icu/impl/MultiComparator.java @@ -11,13 +11,13 @@ public class MultiComparator implements Comparator { private Comparator[] comparators; - @SuppressWarnings("unchecked") // See ticket #11395, this is safe. - public MultiComparator (Comparator... comparators) { + @SuppressWarnings("unchecked") // See ticket #11395, this is safe. + public MultiComparator(Comparator... comparators) { this.comparators = comparators; } /* Lexigraphic compare. Returns the first difference - * @return zero if equal. Otherwise +/- (i+1) + * @return zero if equal. 
Otherwise +/- (i+1) * where i is the index of the first comparator finding a difference * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) */ diff --git a/UnicodeJsps/src/main/java/org/unicode/cldr/tool/TablePrinter.java b/UnicodeJsps/src/main/java/org/unicode/cldr/tool/TablePrinter.java index 7051ff9e7..6ed8a4d96 100644 --- a/UnicodeJsps/src/main/java/org/unicode/cldr/tool/TablePrinter.java +++ b/UnicodeJsps/src/main/java/org/unicode/cldr/tool/TablePrinter.java @@ -1,5 +1,8 @@ package org.unicode.cldr.tool; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.MessageFormat; +import com.ibm.icu.util.ULocale; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; @@ -7,476 +10,526 @@ import java.util.Comparator; import java.util.List; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.MessageFormat; -import com.ibm.icu.util.ULocale; - public class TablePrinter { - public static void main(String[] args) { - // quick test; - TablePrinter tablePrinter = new TablePrinter() - .setTableAttributes("style='border-collapse: collapse' border='1'") - .addColumn("Language").setSpanRows(true).setSortPriority(0).setBreakSpans(true) - .addColumn("Junk").setSpanRows(true) - .addColumn("Territory").setHeaderAttributes("bgcolor='green'").setCellAttributes("align='right'").setSpanRows(true) - .setSortPriority(1).setSortAscending(false); - Comparable[][] data = { - {"German", 1.3d, 3}, - {"French", 1.3d, 2}, - {"English", 1.3d, 2}, - {"English", 1.3d, 4}, - {"English", 1.3d, 6}, - {"English", 1.3d, 8}, - {"Arabic", 1.3d, 5}, - {"Zebra", 1.3d, 10} - }; - tablePrinter.addRows(data); - tablePrinter.addRow().addCell("Foo").addCell(1.5d).addCell(99).finishRow(); - - String s = tablePrinter.toTable(); - System.out.println(s); - } - - private List columns = new ArrayList(); - private String tableAttributes; - private transient Column[] columnsFlat; - private BitSet blockingRows = new BitSet(); - private List rows = new ArrayList(); - 
private String caption; - - public String getTableAttributes() { - return tableAttributes; - } - - public TablePrinter setTableAttributes(String tableAttributes) { - this.tableAttributes = tableAttributes; - return this; - } - - public TablePrinter setCaption(String caption) { - this.caption = caption; - return this; - } - - public TablePrinter setSortPriority(int priority) { - columnSorter.setSortPriority(columns.size()-1, priority); - return this; - } - - public TablePrinter setSortAscending(boolean ascending) { - columnSorter.setSortAscending(columns.size()-1, ascending); - return this; - } - - public TablePrinter setBreakSpans(boolean breaks) { - breaksSpans.set(columns.size()-1, breaks); - return this; - } - - private static class Column { - String header; - String headerAttributes; - MessageFormat cellAttributes; - - boolean spanRows; - MessageFormat cellPattern; - private boolean repeatHeader = false; - private boolean hidden = false; - private boolean isHeader = false; - private boolean divider = false; - - public Column(String header) { - this.header = header; - } - - public Column setCellAttributes(String cellAttributes) { - this.cellAttributes = new MessageFormat(MessageFormat.autoQuoteApostrophe(cellAttributes)); - return this; - } - - public Column setCellPattern(String cellPattern) { - this.cellPattern = cellPattern == null ? 
null : new MessageFormat(MessageFormat.autoQuoteApostrophe(cellPattern)); - return this; - } - - public Column setCellPattern(MessageFormat cellPattern) { - this.cellPattern = cellPattern; - return this; - } - - public Column setHeader(String header) { - this.header = header; - return this; - } - - public Column setHeaderAttributes(String headerAttributes) { - this.headerAttributes = headerAttributes; - return this; - } - - public Column setSpanRows(boolean spanRows) { - this.spanRows = spanRows; - return this; - } - - public void setRepeatHeader(boolean b) { - repeatHeader = b; - } - - public void setHidden(boolean b) { - hidden = b; - } - - public void setHeaderCell(boolean b) { - isHeader = b; - } - - public void setDivider(boolean b) { - divider = b; - } - } - - public TablePrinter addColumn(String header, String headerAttributes, String cellPattern, String cellAttributes, boolean spanRows) { - columns.add(new Column(header).setHeaderAttributes(headerAttributes).setCellPattern(cellPattern).setCellAttributes(cellAttributes).setSpanRows(spanRows)); - setSortAscending(true); - return this; - } - - public TablePrinter addColumn(String header) { - columns.add(new Column(header)); - setSortAscending(true); - return this; - } - - public TablePrinter addRow(Comparable[] data) { - if (data.length != columns.size()) { - throw new IllegalArgumentException(String.format("Data size (%d) != column count (%d)", data.length, columns.size())); - } - // make sure we can compare; get exception early - if (rows.size() > 0) { - Comparable[] data2 = rows.get(0); - for (int i = 0; i < data.length; ++i) { - try { - data[i].compareTo(data2[i]); - } catch (RuntimeException e) { - throw new IllegalArgumentException("Can't compare column " + i + ", " + data[i] + ", " + data2[i]); - } - } - } - rows.add(data); - return this; - } - - Collection partialRow; - - public TablePrinter addRow() { - if (partialRow != null) { - throw new IllegalArgumentException("Cannot add partial row before 
calling finishRow()"); - } - partialRow = new ArrayList(); - return this; - } - - public TablePrinter addCell(Comparable cell) { - if (rows.size() > 0) { - int i = partialRow.size(); - Comparable cell0 = rows.get(0)[i]; - try { - cell.compareTo(cell0); - } catch (RuntimeException e) { - throw new IllegalArgumentException("Can't compare column " + i + ", " + cell + ", " + cell0); - } - - } - partialRow.add(cell); - return this; - } - - public TablePrinter finishRow() { - if (partialRow.size() != columns.size()) { - throw new IllegalArgumentException("Items in row (" + partialRow.size() - + " not same as number of columns" + columns.size()); - } - addRow(partialRow); - partialRow = null; - return this; - } - - public TablePrinter addRow(Collection data) { - addRow(data.toArray(new Comparable[data.size()])); - return this; - } - - public TablePrinter addRows(Collection data) { - for (Object row : data) { - if (row instanceof Collection) { - addRow((Collection)row); - } else { - addRow((Comparable[])row); - } - } - return this; - } - - public TablePrinter addRows(Comparable[][] data) { - for (Comparable[] row : data) { - addRow(row); - } - return this; - } - - public String toString() { - return toTable(); - } - - public String toTable() { - Comparable[][] sortedFlat = (Comparable[][]) (rows.toArray(new Comparable[rows.size()][])); - return toTableInternal(sortedFlat); - } - - static class ColumnSorter implements Comparator { - private int[] sortPriorities = new int[0]; - private BitSet ascending = new BitSet(); - Collator englishCollator = Collator.getInstance(new ULocale("en-u-co-emoji")); - - public int compare(T[] o1, T[] o2) { - int result; - for (int curr : sortPriorities) { - result = o1[curr] instanceof String ? 
englishCollator.compare((String)o1[curr],(String)o2[curr]) - : o1[curr].compareTo(o2[curr]); - if (0 != result) { - if (ascending.get(curr)) { - return result; - } - return -result; - } - } - return 0; - } - - public void setSortPriority(int column, int priority) { - if (sortPriorities.length <= priority) { - int[] temp = new int[priority+1]; - System.arraycopy(sortPriorities,0,temp,0,sortPriorities.length); - sortPriorities = temp; - } - sortPriorities[priority] = column; - } - - public int[] getSortPriorities() { - return sortPriorities; - } - - public boolean getSortAscending(int bitIndex) { - return ascending.get(bitIndex); - } - - public void setSortAscending(int bitIndex, boolean value) { - ascending.set(bitIndex, value); - } - } - - ColumnSorter columnSorter = new ColumnSorter(); - - public String toTableInternal(Comparable[][] sortedFlat) { - //TreeSet sorted = new TreeSet(); - //sorted.addAll(data); - Object[] patternArgs = new Object[columns.size() + 1]; - - Arrays.sort(sortedFlat, columnSorter); - - columnsFlat = columns.toArray(new Column[0]); - - StringBuilder result = new StringBuilder(); - - result.append("\n"); - - if (caption != null) { - result.append("").append(caption).append(""); - } - - showHeader(result); - int visibleWidth = 0; - for (int j = 0; j < columns.size(); ++j) { - if (!columnsFlat[j].hidden) { - ++visibleWidth; - } - } - - for (int i = 0; i < sortedFlat.length; ++i) { - System.arraycopy(sortedFlat[i], 0, patternArgs, 1, sortedFlat[i].length); - // check to see if we repeat the header - if (i != 0) { - boolean divider = false; - for (int j = 0; j < sortedFlat[i].length; ++j) { - final Column column = columns.get(j); - if (column.repeatHeader && !sortedFlat[i-1][j].equals(sortedFlat[i][j])) { - showHeader(result); - break; - } else if (column.divider && !sortedFlat[i-1][j].equals(sortedFlat[i][j])) { - divider = true; - } - } - if (divider) { - result.append("\t"); - } - } - result.append("\t"); - for (int j = 0; j < 
sortedFlat[i].length; ++j) { - int identical = findIdentical(sortedFlat, i, j); - if (identical == 0) continue; - if (columnsFlat[j].hidden) { - continue; - } - patternArgs[0] = sortedFlat[i][j]; - result.append(columnsFlat[j].isHeader ? " = " + sortedFlat[i][j]).initCause(e); - } - } - if (identical != 1) { - result.append(" rowSpan='").append(identical).append('\''); - } - result.append('>'); - - if (columnsFlat[j].cellPattern != null) { - try { - patternArgs[0] = sortedFlat[i][j]; + public static void main(String[] args) { + // quick test; + TablePrinter tablePrinter = + new TablePrinter() + .setTableAttributes("style='border-collapse: collapse' border='1'") + .addColumn("Language") + .setSpanRows(true) + .setSortPriority(0) + .setBreakSpans(true) + .addColumn("Junk") + .setSpanRows(true) + .addColumn("Territory") + .setHeaderAttributes("bgcolor='green'") + .setCellAttributes("align='right'") + .setSpanRows(true) + .setSortPriority(1) + .setSortAscending(false); + Comparable[][] data = { + {"German", 1.3d, 3}, + {"French", 1.3d, 2}, + {"English", 1.3d, 2}, + {"English", 1.3d, 4}, + {"English", 1.3d, 6}, + {"English", 1.3d, 8}, + {"Arabic", 1.3d, 5}, + {"Zebra", 1.3d, 10} + }; + tablePrinter.addRows(data); + tablePrinter.addRow().addCell("Foo").addCell(1.5d).addCell(99).finishRow(); + + String s = tablePrinter.toTable(); + System.out.println(s); + } + + private List columns = new ArrayList(); + private String tableAttributes; + private transient Column[] columnsFlat; + private BitSet blockingRows = new BitSet(); + private List rows = new ArrayList(); + private String caption; + + public String getTableAttributes() { + return tableAttributes; + } + + public TablePrinter setTableAttributes(String tableAttributes) { + this.tableAttributes = tableAttributes; + return this; + } + + public TablePrinter setCaption(String caption) { + this.caption = caption; + return this; + } + + public TablePrinter setSortPriority(int priority) { + 
columnSorter.setSortPriority(columns.size() - 1, priority); + return this; + } + + public TablePrinter setSortAscending(boolean ascending) { + columnSorter.setSortAscending(columns.size() - 1, ascending); + return this; + } + + public TablePrinter setBreakSpans(boolean breaks) { + breaksSpans.set(columns.size() - 1, breaks); + return this; + } + + private static class Column { + String header; + String headerAttributes; + MessageFormat cellAttributes; + + boolean spanRows; + MessageFormat cellPattern; + private boolean repeatHeader = false; + private boolean hidden = false; + private boolean isHeader = false; + private boolean divider = false; + + public Column(String header) { + this.header = header; + } + + public Column setCellAttributes(String cellAttributes) { + this.cellAttributes = + new MessageFormat(MessageFormat.autoQuoteApostrophe(cellAttributes)); + return this; + } + + public Column setCellPattern(String cellPattern) { + this.cellPattern = + cellPattern == null + ? null + : new MessageFormat(MessageFormat.autoQuoteApostrophe(cellPattern)); + return this; + } + + public Column setCellPattern(MessageFormat cellPattern) { + this.cellPattern = cellPattern; + return this; + } + + public Column setHeader(String header) { + this.header = header; + return this; + } + + public Column setHeaderAttributes(String headerAttributes) { + this.headerAttributes = headerAttributes; + return this; + } + + public Column setSpanRows(boolean spanRows) { + this.spanRows = spanRows; + return this; + } + + public void setRepeatHeader(boolean b) { + repeatHeader = b; + } + + public void setHidden(boolean b) { + hidden = b; + } + + public void setHeaderCell(boolean b) { + isHeader = b; + } + + public void setDivider(boolean b) { + divider = b; + } + } + + public TablePrinter addColumn( + String header, + String headerAttributes, + String cellPattern, + String cellAttributes, + boolean spanRows) { + columns.add( + new Column(header) + .setHeaderAttributes(headerAttributes) + 
.setCellPattern(cellPattern) + .setCellAttributes(cellAttributes) + .setSpanRows(spanRows)); + setSortAscending(true); + return this; + } + + public TablePrinter addColumn(String header) { + columns.add(new Column(header)); + setSortAscending(true); + return this; + } + + public TablePrinter addRow(Comparable[] data) { + if (data.length != columns.size()) { + throw new IllegalArgumentException( + String.format( + "Data size (%d) != column count (%d)", data.length, columns.size())); + } + // make sure we can compare; get exception early + if (rows.size() > 0) { + Comparable[] data2 = rows.get(0); + for (int i = 0; i < data.length; ++i) { + try { + data[i].compareTo(data2[i]); + } catch (RuntimeException e) { + throw new IllegalArgumentException( + "Can't compare column " + i + ", " + data[i] + ", " + data2[i]); + } + } + } + rows.add(data); + return this; + } + + Collection partialRow; + + public TablePrinter addRow() { + if (partialRow != null) { + throw new IllegalArgumentException("Cannot add partial row before calling finishRow()"); + } + partialRow = new ArrayList(); + return this; + } + + public TablePrinter addCell(Comparable cell) { + if (rows.size() > 0) { + int i = partialRow.size(); + Comparable cell0 = rows.get(0)[i]; + try { + cell.compareTo(cell0); + } catch (RuntimeException e) { + throw new IllegalArgumentException( + "Can't compare column " + i + ", " + cell + ", " + cell0); + } + } + partialRow.add(cell); + return this; + } + + public TablePrinter finishRow() { + if (partialRow.size() != columns.size()) { + throw new IllegalArgumentException( + "Items in row (" + + partialRow.size() + + " not same as number of columns" + + columns.size()); + } + addRow(partialRow); + partialRow = null; + return this; + } + + public TablePrinter addRow(Collection data) { + addRow(data.toArray(new Comparable[data.size()])); + return this; + } + + public TablePrinter addRows(Collection data) { + for (Object row : data) { + if (row instanceof Collection) { + 
addRow((Collection) row); + } else { + addRow((Comparable[]) row); + } + } + return this; + } + + public TablePrinter addRows(Comparable[][] data) { + for (Comparable[] row : data) { + addRow(row); + } + return this; + } + + public String toString() { + return toTable(); + } + + public String toTable() { + Comparable[][] sortedFlat = (Comparable[][]) (rows.toArray(new Comparable[rows.size()][])); + return toTableInternal(sortedFlat); + } + + static class ColumnSorter implements Comparator { + private int[] sortPriorities = new int[0]; + private BitSet ascending = new BitSet(); + Collator englishCollator = Collator.getInstance(new ULocale("en-u-co-emoji")); + + public int compare(T[] o1, T[] o2) { + int result; + for (int curr : sortPriorities) { + result = + o1[curr] instanceof String + ? englishCollator.compare((String) o1[curr], (String) o2[curr]) + : o1[curr].compareTo(o2[curr]); + if (0 != result) { + if (ascending.get(curr)) { + return result; + } + return -result; + } + } + return 0; + } + + public void setSortPriority(int column, int priority) { + if (sortPriorities.length <= priority) { + int[] temp = new int[priority + 1]; + System.arraycopy(sortPriorities, 0, temp, 0, sortPriorities.length); + sortPriorities = temp; + } + sortPriorities[priority] = column; + } + + public int[] getSortPriorities() { + return sortPriorities; + } + + public boolean getSortAscending(int bitIndex) { + return ascending.get(bitIndex); + } + + public void setSortAscending(int bitIndex, boolean value) { + ascending.set(bitIndex, value); + } + } + + ColumnSorter columnSorter = new ColumnSorter(); + + public String toTableInternal(Comparable[][] sortedFlat) { + // TreeSet sorted = new TreeSet(); + // sorted.addAll(data); + Object[] patternArgs = new Object[columns.size() + 1]; + + Arrays.sort(sortedFlat, columnSorter); + + columnsFlat = columns.toArray(new Column[0]); + + StringBuilder result = new StringBuilder(); + + result.append("\n"); + + if (caption != null) { + 
result.append("").append(caption).append(""); + } + + showHeader(result); + int visibleWidth = 0; + for (int j = 0; j < columns.size(); ++j) { + if (!columnsFlat[j].hidden) { + ++visibleWidth; + } + } + + for (int i = 0; i < sortedFlat.length; ++i) { System.arraycopy(sortedFlat[i], 0, patternArgs, 1, sortedFlat[i].length); - result.append(columnsFlat[j].cellPattern.format(patternArgs)); - } catch (RuntimeException e) { - throw (RuntimeException) new IllegalArgumentException("cellPattern<" + i + ", " + j + "> = " + sortedFlat[i][j]).initCause(e); - } - } else { - result.append(sortedFlat[i][j]); - } - result.append(columnsFlat[j].isHeader ? "" : ""); - } - result.append("\n"); - } - result.append(""); - return result.toString(); - } - - private void showHeader(StringBuilder result) { - result.append("\t"); - for (int j = 0; j < columnsFlat.length; ++j) { - if (columnsFlat[j].hidden) { - continue; - } - result.append("').append(columnsFlat[j].header).append(""); - - } - result.append("\n"); - } - - /** - * Return 0 if the item is the same as in the row above, otherwise the rowSpan (of equal items) - * @param sortedFlat - * @param rowIndex - * @param colIndex - * @return - */ - private int findIdentical(Comparable[][] sortedFlat, int rowIndex, int colIndex) { - if (!columnsFlat[colIndex].spanRows) return 1; - Comparable item = sortedFlat[rowIndex][colIndex]; - if (rowIndex > 0 && item.equals(sortedFlat[rowIndex-1][colIndex])) { - if (!breakSpans(sortedFlat, rowIndex)) { - return 0; - } - } - for (int k = rowIndex+1; k < sortedFlat.length; ++k) { - if (!item.equals(sortedFlat[k][colIndex]) || breakSpans(sortedFlat, k)) { - return k - rowIndex; - } - } - return sortedFlat.length - rowIndex; - } - // to-do: prevent overlap when it would cause information to be lost. 
- private BitSet breaksSpans = new BitSet(); - - /** - * Only called with rowIndex > 0 - * @param rowIndex - * @return - */ - private boolean breakSpans(Comparable[][] sortedFlat, int rowIndex) { - for (int colIndex = 0; colIndex < breaksSpans.length(); ++colIndex) { - if (!breaksSpans.get(colIndex)) return false; - if (sortedFlat[rowIndex][colIndex].compareTo(sortedFlat[rowIndex-1][colIndex]) != 0) { - return true; - } - } - return false; - } - - public TablePrinter setCellAttributes(String cellAttributes) { - columns.get(columns.size()-1).setCellAttributes(cellAttributes); - return this; - } - - public TablePrinter setCellPattern(String cellPattern) { - columns.get(columns.size()-1).setCellPattern(cellPattern); - return this; - } - - public TablePrinter setHeaderAttributes(String headerAttributes) { - columns.get(columns.size()-1).setHeaderAttributes(headerAttributes); - return this; - } - - public TablePrinter setSpanRows(boolean spanRows) { - columns.get(columns.size()-1).setSpanRows(spanRows); - return this; - } - - public TablePrinter setRepeatHeader(boolean b) { - columns.get(columns.size()-1).setRepeatHeader(b); - if (b) { - breaksSpans.set(columns.size()-1, true); - } - return this; - } - - /** - * In the style section, have something like: - * - * @param color - * @return - */ - public static String bar(String htmlClass, double value, double max, boolean log) { - double width = 100*(log ? Math.log(value)/Math.log(max) : value/max); - if (!(width>=0.5)) return ""; // do the comparison this way to catch NaN - return "
\u200B
"; - } - - public TablePrinter setHidden(boolean b) { - columns.get(columns.size()-1).setHidden(b); - return this; - } - - public TablePrinter setHeaderCell(boolean b) { - columns.get(columns.size()-1).setHeaderCell(b); - return this; - } - - public TablePrinter setRepeatDivider(boolean b) { - columns.get(columns.size()-1).setDivider(b); - return this; - } -} \ No newline at end of file + // check to see if we repeat the header + if (i != 0) { + boolean divider = false; + for (int j = 0; j < sortedFlat[i].length; ++j) { + final Column column = columns.get(j); + if (column.repeatHeader && !sortedFlat[i - 1][j].equals(sortedFlat[i][j])) { + showHeader(result); + break; + } else if (column.divider && !sortedFlat[i - 1][j].equals(sortedFlat[i][j])) { + divider = true; + } + } + if (divider) { + result.append( + "\t"); + } + } + result.append("\t"); + for (int j = 0; j < sortedFlat[i].length; ++j) { + int identical = findIdentical(sortedFlat, i, j); + if (identical == 0) continue; + if (columnsFlat[j].hidden) { + continue; + } + patternArgs[0] = sortedFlat[i][j]; + result.append(columnsFlat[j].isHeader ? " = " + + sortedFlat[i][j]) + .initCause(e); + } + } + if (identical != 1) { + result.append(" rowSpan='").append(identical).append('\''); + } + result.append('>'); + + if (columnsFlat[j].cellPattern != null) { + try { + patternArgs[0] = sortedFlat[i][j]; + System.arraycopy(sortedFlat[i], 0, patternArgs, 1, sortedFlat[i].length); + result.append(columnsFlat[j].cellPattern.format(patternArgs)); + } catch (RuntimeException e) { + throw (RuntimeException) + new IllegalArgumentException( + "cellPattern<" + + i + + ", " + + j + + "> = " + + sortedFlat[i][j]) + .initCause(e); + } + } else { + result.append(sortedFlat[i][j]); + } + result.append(columnsFlat[j].isHeader ? 
"" : ""); + } + result.append("\n"); + } + result.append(""); + return result.toString(); + } + + private void showHeader(StringBuilder result) { + result.append("\t"); + for (int j = 0; j < columnsFlat.length; ++j) { + if (columnsFlat[j].hidden) { + continue; + } + result.append("').append(columnsFlat[j].header).append(""); + } + result.append("\n"); + } + + /** + * Return 0 if the item is the same as in the row above, otherwise the rowSpan (of equal items) + * + * @param sortedFlat + * @param rowIndex + * @param colIndex + * @return + */ + private int findIdentical(Comparable[][] sortedFlat, int rowIndex, int colIndex) { + if (!columnsFlat[colIndex].spanRows) return 1; + Comparable item = sortedFlat[rowIndex][colIndex]; + if (rowIndex > 0 && item.equals(sortedFlat[rowIndex - 1][colIndex])) { + if (!breakSpans(sortedFlat, rowIndex)) { + return 0; + } + } + for (int k = rowIndex + 1; k < sortedFlat.length; ++k) { + if (!item.equals(sortedFlat[k][colIndex]) || breakSpans(sortedFlat, k)) { + return k - rowIndex; + } + } + return sortedFlat.length - rowIndex; + } + // to-do: prevent overlap when it would cause information to be lost. 
+ private BitSet breaksSpans = new BitSet(); + + /** + * Only called with rowIndex > 0 + * + * @param rowIndex + * @return + */ + private boolean breakSpans(Comparable[][] sortedFlat, int rowIndex) { + for (int colIndex = 0; colIndex < breaksSpans.length(); ++colIndex) { + if (!breaksSpans.get(colIndex)) return false; + if (sortedFlat[rowIndex][colIndex].compareTo(sortedFlat[rowIndex - 1][colIndex]) != 0) { + return true; + } + } + return false; + } + + public TablePrinter setCellAttributes(String cellAttributes) { + columns.get(columns.size() - 1).setCellAttributes(cellAttributes); + return this; + } + + public TablePrinter setCellPattern(String cellPattern) { + columns.get(columns.size() - 1).setCellPattern(cellPattern); + return this; + } + + public TablePrinter setHeaderAttributes(String headerAttributes) { + columns.get(columns.size() - 1).setHeaderAttributes(headerAttributes); + return this; + } + + public TablePrinter setSpanRows(boolean spanRows) { + columns.get(columns.size() - 1).setSpanRows(spanRows); + return this; + } + + public TablePrinter setRepeatHeader(boolean b) { + columns.get(columns.size() - 1).setRepeatHeader(b); + if (b) { + breaksSpans.set(columns.size() - 1, true); + } + return this; + } + + /** + * In the style section, have something like: + * + * @param color + * @return + */ + public static String bar(String htmlClass, double value, double max, boolean log) { + double width = 100 * (log ? Math.log(value) / Math.log(max) : value / max); + if (!(width >= 0.5)) return ""; // do the comparison this way to catch NaN + return "
\u200B
"; + } + + public TablePrinter setHidden(boolean b) { + columns.get(columns.size() - 1).setHidden(b); + return this; + } + + public TablePrinter setHeaderCell(boolean b) { + columns.get(columns.size() - 1).setHeaderCell(b); + return this; + } + + public TablePrinter setRepeatDivider(boolean b) { + columns.get(columns.size() - 1).setDivider(b); + return this; + } +} diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/AlternateIterator.java b/UnicodeJsps/src/main/java/org/unicode/jsp/AlternateIterator.java index a0cc26aa4..333ced207 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/AlternateIterator.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/AlternateIterator.java @@ -8,92 +8,92 @@ import java.util.TreeSet; public class AlternateIterator implements Iterator, Iterable { - final String[][] sources; - final int[] position; - // optimize later - final int length; - boolean notDone = true; - StringBuilder result = new StringBuilder(); - - public static class Builder { - List> sources = new ArrayList>(); + final String[][] sources; + final int[] position; + // optimize later + final int length; + boolean notDone = true; + StringBuilder result = new StringBuilder(); - Builder add(Collection items) { - if (items.size() == 0) { - throw new IllegalArgumentException(); - } - ArrayList copy = new ArrayList(items); - sources.add(copy); - return this; - } - - public Builder add(String... items) { - return add(Arrays.asList(items)); - } - - public AlternateIterator build() { - return new AlternateIterator(sources); - } - } + public static class Builder { + List> sources = new ArrayList>(); + + Builder add(Collection items) { + if (items.size() == 0) { + throw new IllegalArgumentException(); + } + ArrayList copy = new ArrayList(items); + sources.add(copy); + return this; + } + + public Builder add(String... 
items) { + return add(Arrays.asList(items)); + } - public static Builder start() { - return new Builder(); - } - - private AlternateIterator(List> inSources) { - length = inSources.size(); - sources = new String[length][]; - for (int i = 0; i < length; ++i) { - List list = inSources.get(i); - sources[i] = list.toArray(new String[list.size()]); + public AlternateIterator build() { + return new AlternateIterator(sources); + } } - position = new int[length]; - } - public boolean hasNext() { - return notDone; - } + public static Builder start() { + return new Builder(); + } - public String next() { - result.setLength(0); - for (int i = 0; i < length; ++i) { - result.append(sources[i][position[i]]); + private AlternateIterator(List> inSources) { + length = inSources.size(); + sources = new String[length][]; + for (int i = 0; i < length; ++i) { + List list = inSources.get(i); + sources[i] = list.toArray(new String[list.size()]); + } + position = new int[length]; } - int i; - for (i = length-1; i >= 0; --i) { - ++position[i]; - if (position[i] < sources[i].length) { - break; - } - position[i] = 0; + + public boolean hasNext() { + return notDone; } - if (i < 0) { - notDone = false; + + public String next() { + result.setLength(0); + for (int i = 0; i < length; ++i) { + result.append(sources[i][position[i]]); + } + int i; + for (i = length - 1; i >= 0; --i) { + ++position[i]; + if (position[i] < sources[i].length) { + break; + } + position[i] = 0; + } + if (i < 0) { + notDone = false; + } + return result.toString(); } - return result.toString(); - } - public void remove() { - throw new UnsupportedOperationException(); - } + public void remove() { + throw new UnsupportedOperationException(); + } - public Iterator iterator() { - return this; - } + public Iterator iterator() { + return this; + } - public double getMaxSize() { - double result = 1; - for (int i = 0; i < length; ++i) { - result *= sources[i].length; + public double getMaxSize() { + double result = 1; + for (int i 
= 0; i < length; ++i) { + result *= sources[i].length; + } + return result; } - return result; - } - public List> getAlternates() { - List> result = new ArrayList>(); - for (int i = 0; i < length; ++i) { - result.add(new TreeSet(Arrays.asList(sources[i]))); + public List> getAlternates() { + List> result = new ArrayList>(); + for (int i = 0; i < length; ++i) { + result.add(new TreeSet(Arrays.asList(sources[i]))); + } + return result; } - return result; - } } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/Annotations.java b/UnicodeJsps/src/main/java/org/unicode/jsp/Annotations.java index dc10b22a7..7f1dd0d29 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/Annotations.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/Annotations.java @@ -1,5 +1,8 @@ package org.unicode.jsp; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.UnicodeSet; import java.util.Collections; import java.util.Locale; import java.util.Map; @@ -7,13 +10,8 @@ import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.UnicodeSet; - public class Annotations { private static final UnicodeMap> data = new UnicodeMap(); @@ -30,14 +28,13 @@ public static UnicodeSet keys() { } static { - Map idToCodepoint = new TreeMap(); - for (String line : FileUtilities.in(Annotations.class,"annotations.txt")) { + Map idToCodepoint = new TreeMap(); + for (String line : FileUtilities.in(Annotations.class, "annotations.txt")) { String[] parts = line.split("\t"); int codepoint = parts[1].codePointAt(0); idToCodepoint.put(parts[2], codepoint); String annotation = parts[3]; data.put(codepoint, Collections.singleton(annotation)); - } // resolve references UnicodeSet copy = new UnicodeSet(keys()); @@ -65,10 +62,9 @@ public static UnicodeSet keys() { throw new 
IllegalArgumentException("Can't replace " + original); } else { b.append(aString, lastEnd, m.start()) - .append("{") - .append(replacement.iterator().next()) - .append("}") - ; + .append("{") + .append(replacement.iterator().next()) + .append("}"); lastEnd = m.end(); ++count; if (count > 10) { diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/BiMultimap.java b/UnicodeJsps/src/main/java/org/unicode/jsp/BiMultimap.java index 105d18fca..cb9b36d16 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/BiMultimap.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/BiMultimap.java @@ -1,49 +1,56 @@ package org.unicode.jsp; +import com.google.common.collect.LinkedHashMultimap; +import com.google.common.collect.Multimap; import java.util.Collection; import java.util.Collections; import java.util.Set; -import com.google.common.collect.LinkedHashMultimap; -import com.google.common.collect.Multimap; +class BiMultimap { + private final Multimap keyToValues = LinkedHashMultimap.create(); + private final Multimap valueToKeys = LinkedHashMultimap.create(); + private final Collection defaultKeys; + private final Collection defaultValues; -class BiMultimap { - final private Multimap keyToValues = LinkedHashMultimap.create(); - final private Multimap valueToKeys = LinkedHashMultimap.create(); - final private Collection defaultKeys; - final private Collection defaultValues; - - BiMultimap(CollectiondefaultKeys, Collection defaultValues) { + BiMultimap(Collection defaultKeys, Collection defaultValues) { this.defaultKeys = defaultKeys; this.defaultValues = defaultValues; } + public void putAll(K key, Collection values) { keyToValues.putAll(key, values); putAll(valueToKeys, values, key); } - public static void putAll(Multimap kToVs, Collection keys, V value) { + + public static void putAll(Multimap kToVs, Collection keys, V value) { for (K key : keys) { kToVs.put(key, value); } } + public Collection getKeys(V value) { Collection result = valueToKeys.get(value); return 
result.isEmpty() ? defaultKeys : result; } + public Collection getValues(K key) { Collection result = keyToValues.get(key); return result.isEmpty() ? defaultValues : result; } + public Multimap getKeyToValues() { return keyToValues; } + @Override public String toString() { return keyToValues.toString(); } + public Set keySet() { return Collections.unmodifiableSet(keyToValues.keySet()); } + public Set valueSet() { return Collections.unmodifiableSet(valueToKeys.keySet()); } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/BidiCharMap.java b/UnicodeJsps/src/main/java/org/unicode/jsp/BidiCharMap.java index 0a31642b0..64de10b69 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/BidiCharMap.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/BidiCharMap.java @@ -1,6 +1,4 @@ -/** - * - */ +/** */ package org.unicode.jsp; import com.ibm.icu.dev.util.UnicodeMap; @@ -10,81 +8,89 @@ import com.ibm.icu.text.UnicodeSetIterator; class BidiCharMap { - private static final byte L = BidiReference.L; - private static final byte LRE = BidiReference.LRE; - private static final byte LRO = BidiReference.LRO; - private static final byte R = BidiReference.R; - private static final byte AL = BidiReference.AL; - private static final byte RLE = BidiReference.RLE; - private static final byte RLO = BidiReference.RLO; - private static final byte PDF = BidiReference.PDF; - private static final byte EN = BidiReference.EN; - private static final byte ES = BidiReference.ES; - private static final byte ET = BidiReference.ET; - private static final byte AN = BidiReference.AN; - private static final byte CS = BidiReference.CS; - private static final byte NSM = BidiReference.NSM; - private static final byte BN = BidiReference.BN; - private static final byte B = BidiReference.B; - private static final byte S = BidiReference.S; - private static final byte WS = BidiReference.WS; - private static final byte ON = BidiReference.ON; + private static final byte L = BidiReference.L; + private static 
final byte LRE = BidiReference.LRE; + private static final byte LRO = BidiReference.LRO; + private static final byte R = BidiReference.R; + private static final byte AL = BidiReference.AL; + private static final byte RLE = BidiReference.RLE; + private static final byte RLO = BidiReference.RLO; + private static final byte PDF = BidiReference.PDF; + private static final byte EN = BidiReference.EN; + private static final byte ES = BidiReference.ES; + private static final byte ET = BidiReference.ET; + private static final byte AN = BidiReference.AN; + private static final byte CS = BidiReference.CS; + private static final byte NSM = BidiReference.NSM; + private static final byte BN = BidiReference.BN; + private static final byte B = BidiReference.B; + private static final byte S = BidiReference.S; + private static final byte WS = BidiReference.WS; + private static final byte ON = BidiReference.ON; - static byte mapIcuToRefNum[] = null; - static UnicodeSet[] umap = new UnicodeSet[BidiReference.typenames.length]; - static UnicodeMap asciiHackMap = new UnicodeMap(); + static byte mapIcuToRefNum[] = null; + static UnicodeSet[] umap = new UnicodeSet[BidiReference.typenames.length]; + static UnicodeMap asciiHackMap = new UnicodeMap(); - static { - mapIcuToRefNum = new byte[BidiReference.typenames.length]; - // generate permutation from names - for (byte i = 0; i < mapIcuToRefNum.length; ++i) { - int icuValue = UCharacter.getPropertyValueEnum(UProperty.BIDI_CLASS, BidiReference.typenames[i]); - mapIcuToRefNum[icuValue] = i; - } + static { + mapIcuToRefNum = new byte[BidiReference.typenames.length]; + // generate permutation from names + for (byte i = 0; i < mapIcuToRefNum.length; ++i) { + int icuValue = + UCharacter.getPropertyValueEnum( + UProperty.BIDI_CLASS, BidiReference.typenames[i]); + mapIcuToRefNum[icuValue] = i; + } - for (int i = 0; i < BidiReference.typenames.length; ++i) { - umap[i] = new UnicodeSet(); - } + for (int i = 0; i < BidiReference.typenames.length; ++i) 
{ + umap[i] = new UnicodeSet(); + } - for (UnicodeSetIterator it = new UnicodeSetIterator(new UnicodeSet("[[:ascii:]-[[:cc:]-[:whitespace:]]]")); it.next();) { - asciiHackMap.put(it.codepoint, mapIcuToRefNum[UCharacter.getIntPropertyValue(it.codepoint, UProperty.BIDI_CLASS)]); + for (UnicodeSetIterator it = + new UnicodeSetIterator( + new UnicodeSet("[[:ascii:]-[[:cc:]-[:whitespace:]]]")); + it.next(); ) { + asciiHackMap.put( + it.codepoint, + mapIcuToRefNum[ + UCharacter.getIntPropertyValue(it.codepoint, UProperty.BIDI_CLASS)]); + } + // override + asciiHackMap.put(']', LRE); + asciiHackMap.put('[', RLE); + asciiHackMap.put('}', LRO); + asciiHackMap.put('{', RLO); + asciiHackMap.put('|', PDF); + asciiHackMap.putAll(new UnicodeSet("[A-M]"), R); + asciiHackMap.putAll(new UnicodeSet("[N-Z]"), AL); + asciiHackMap.putAll(new UnicodeSet("[5-9]"), AN); + asciiHackMap.put('>', L); + asciiHackMap.put('<', R); + asciiHackMap.put('"', NSM); + asciiHackMap.put('_', BN); } - // override - asciiHackMap.put(']', LRE); - asciiHackMap.put('[', RLE); - asciiHackMap.put('}', LRO); - asciiHackMap.put('{', RLO); - asciiHackMap.put('|', PDF); - asciiHackMap.putAll(new UnicodeSet("[A-M]"), R); - asciiHackMap.putAll(new UnicodeSet("[N-Z]"), AL); - asciiHackMap.putAll(new UnicodeSet("[5-9]"), AN); - asciiHackMap.put('>', L); - asciiHackMap.put('<',R); - asciiHackMap.put('"',NSM); - asciiHackMap.put('_',BN); - } - boolean asciiHack; + boolean asciiHack; - public BidiCharMap (boolean asciiHack) { - this.asciiHack = asciiHack; - } + public BidiCharMap(boolean asciiHack) { + this.asciiHack = asciiHack; + } - public static UnicodeSet getAsciiHack(byte i) { - return asciiHackMap.keySet(i); - } + public static UnicodeSet getAsciiHack(byte i) { + return asciiHackMap.keySet(i); + } - public static byte getBidiClass(int codepoint, boolean asciiHack2) { - if (asciiHack2) { - Byte result = (Byte) asciiHackMap.getValue(codepoint); - if (result != null) { - return result; - } + public static byte 
getBidiClass(int codepoint, boolean asciiHack2) { + if (asciiHack2) { + Byte result = (Byte) asciiHackMap.getValue(codepoint); + if (result != null) { + return result; + } + } + return mapIcuToRefNum[UCharacter.getIntPropertyValue(codepoint, UProperty.BIDI_CLASS)]; } - return mapIcuToRefNum[UCharacter.getIntPropertyValue(codepoint, UProperty.BIDI_CLASS)]; - } - public byte getBidiClass(int codepoint) { - return getBidiClass(codepoint, asciiHack); - } -} \ No newline at end of file + public byte getBidiClass(int codepoint) { + return getBidiClass(codepoint, asciiHack); + } +} diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/BidiReference.java b/UnicodeJsps/src/main/java/org/unicode/jsp/BidiReference.java index 8b3ad9e52..a669d61d1 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/BidiReference.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/BidiReference.java @@ -9,1246 +9,1229 @@ /** * Reference implementation of the Unicode 3.0 Bidi algorithm. * - *

- * This implementation is not optimized for performance. It is intended - * as a reference implementation that closely follows the specification - * of the Bidirectional Algorithm in The Unicode Standard version 3.0. - *

- * Input:
- * There are two levels of input to the algorithm, since clients may prefer - * to supply some information from out-of-band sources rather than relying on - * the default behavior. + *

This implementation is not optimized for performance. It is intended as a reference + * implementation that closely follows the specification of the Bidirectional Algorithm in The + * Unicode Standard version 3.0. + * + *

Input:
+ * There are two levels of input to the algorithm, since clients may prefer to supply some + * information from out-of-band sources rather than relying on the default behavior. + * *

    - *
  1. unicode type array - *
  2. unicode type array, with externally supplied base line direction + *
  3. unicode type array + *
  4. unicode type array, with externally supplied base line direction *
+ * *

Output:
- * Output is separated into several stages as well, to better enable clients - * to evaluate various aspects of implementation conformance. + * Output is separated into several stages as well, to better enable clients to evaluate various + * aspects of implementation conformance. + * *

    - *
  1. levels array over entire paragraph - *
  2. reordering array over entire paragraph - *
  3. levels array over line - *
  4. reordering array over line + *
  5. levels array over entire paragraph + *
  6. reordering array over entire paragraph + *
  7. levels array over line + *
  8. reordering array over line *
- * Note that for conformance, algorithms are only required to generate correct - * reordering and character directionality (odd or even levels) over a line. - * Generating identical level arrays over a line is not required. Bidi - * explicit format codes (LRE, RLE, LRO, RLO, PDF) and BN can be assigned - * arbitrary levels and positions as long as the other text matches. - *

- * As the algorithm is defined to operate on a single paragraph at a time, - * this implementation is written to handle single paragraphs. Thus - * rule P1 is presumed by this implementation-- the data provided to the - * implementation is assumed to be a single paragraph, and either contains no - * 'B' codes, or a single 'B' code at the end of the input. 'B' is allowed - * as input to illustrate how the algorithm assigns it a level. - *

- * Also note that rules L3 and L4 depend on the rendering engine that uses - * the result of the bidi algorithm. This implementation assumes that the - * rendering engine expects combining marks in visual order (e.g. to the - * left of their base character in RTL runs) and that it adjust the glyphs - * used to render mirrored characters that are in RTL runs so that they - * render appropriately. + * + * Note that for conformance, algorithms are only required to generate correct reordering and + * character directionality (odd or even levels) over a line. Generating identical level arrays over + * a line is not required. Bidi explicit format codes (LRE, RLE, LRO, RLO, PDF) and BN can be + * assigned arbitrary levels and positions as long as the other text matches. + * + *

As the algorithm is defined to operate on a single paragraph at a time, this implementation is + * written to handle single paragraphs. Thus rule P1 is presumed by this implementation-- the data + * provided to the implementation is assumed to be a single paragraph, and either contains no 'B' + * codes, or a single 'B' code at the end of the input. 'B' is allowed as input to illustrate how + * the algorithm assigns it a level. + * + *

Also note that rules L3 and L4 depend on the rendering engine that uses the result of the bidi + * algorithm. This implementation assumes that the rendering engine expects combining marks in + * visual order (e.g. to the left of their base character in RTL runs) and that it adjust the glyphs + * used to render mirrored characters that are in RTL runs so that they render appropriately. * * @author Doug Felt */ - public final class BidiReference { - private byte[] initialTypes; - private byte[] embeddings; // generated from processing format codes - private byte paragraphEmbeddingLevel = -1; // undefined + private byte[] initialTypes; + private byte[] embeddings; // generated from processing format codes + private byte paragraphEmbeddingLevel = -1; // undefined - private int textLength; // for convenience - private byte[] resultTypes; // for paragraph, not lines - private byte[] resultLevels; // for paragraph, not lines - private StringBuffer[] record; - private String rule; - private int[] mapToOriginal; - - // The bidi types - - /** Left-to-right*/ - public static final byte L = 0; - - /** Left-to-Right Embedding */ - public static final byte LRE = 1; + private int textLength; // for convenience + private byte[] resultTypes; // for paragraph, not lines + private byte[] resultLevels; // for paragraph, not lines + private StringBuffer[] record; + private String rule; + private int[] mapToOriginal; - /** Left-to-Right Override */ - public static final byte LRO = 2; - - /** Right-to-Left */ - public static final byte R = 3; - - /** Right-to-Left Arabic */ - public static final byte AL = 4; - - /** Right-to-Left Embedding */ - public static final byte RLE = 5; - - /** Right-to-Left Override */ - public static final byte RLO = 6; - - /** Pop Directional Format */ - public static final byte PDF = 7; - - /** European Number */ - public static final byte EN = 8; - - /** European Number Separator */ - public static final byte ES = 9; - - /** European Number Terminator */ - 
public static final byte ET = 10; - - /** Arabic Number */ - public static final byte AN = 11; - - /** Common Number Separator */ - public static final byte CS = 12; - - /** Non-Spacing Mark */ - public static final byte NSM = 13; - - /** Boundary Neutral */ - public static final byte BN = 14; - - /** Paragraph Separator */ - public static final byte B = 15; - - /** Segment Separator */ - public static final byte S = 16; - - /** Whitespace */ - public static final byte WS = 17; - - /** Other Neutrals */ - public static final byte ON = 18; - - /** Minimum bidi type value. */ - public static final byte TYPE_MIN = 0; - - /** Maximum bidi type value. */ - public static final byte TYPE_MAX = 18; - - /** Shorthand names of bidi type values, for error reporting. */ - public static final String[] typenames = { - "L", - "LRE", - "LRO", - "R", - "AL", - "RLE", - "RLO", - "PDF", - "EN", - "ES", - "ET", - "AN", - "CS", - "NSM", - "BN", - "B", - "S", - "WS", - "ON", - }; - - // - // Input - // - - /** - * Initialize using an array of direction types. Types range from TYPE_MIN to TYPE_MAX inclusive - * and represent the direction codes of the characters in the text. - * - * @param types the types array - */ - public BidiReference(byte[] types) { - validateTypes(types); - - initialTypes = types.clone(); // client type array remains unchanged - - runAlgorithm(); - } - - /** - * Initialize using an array of direction types and an externally supplied paragraph embedding level. - * The embedding level may be -1, 0, or 1. -1 means to apply the default algorithm (rules P2 and P3), - * 0 is for LTR paragraphs, and 1 is for RTL paragraphs. - * - * @param types the types array - * @param paragraphEmbeddingLevel the externally supplied paragraph embedding level. 
- */ - public BidiReference(byte[] types, byte paragraphEmbeddingLevel) { - validateTypes(types); - validateParagraphEmbeddingLevel(paragraphEmbeddingLevel); - - initialTypes = types.clone(); // client type array remains unchanged - this.paragraphEmbeddingLevel = paragraphEmbeddingLevel; - - runAlgorithm(); - } - - /** - * The algorithm. - * Does not include line-based processing (Rules L1, L2). - * These are applied later in the line-based phase of the algorithm. - */ - private void runAlgorithm() { - // Ensure trace hook does not change while running algorithm. - // Trace hook is a shared class resource. - synchronized (BidiReference.class) { - textLength = initialTypes.length; - - // Initialize output types. - // Result types initialized to input types. - resultTypes = initialTypes.clone(); - record = new StringBuffer[resultTypes.length]; - for (int i = 0; i < resultTypes.length; ++i) { - record[i] = new StringBuffer(); - } - - trace(BidiTraceHook.PHASE_INIT, 0, textLength); - - // 1) determining the paragraph level - // Rule P1 is the requirement for entering this algorithm. - // Rules P2, P3. - // If no externally supplied paragraph embedding level, use default. - setRule("P1"); - if (paragraphEmbeddingLevel == -1) { - determineParagraphEmbeddingLevel(); - } - - // Initialize result levels to paragraph embedding level. - setRule("P1"); - resultLevels = new byte[textLength]; - setLevels(0, textLength, paragraphEmbeddingLevel); - trace(BidiTraceHook.PHASE_BASELEVEL, 0, textLength); - - // 2) Explicit levels and directions - // Rules X1-X8.\ - setRule("X1-8"); - determineExplicitEmbeddingLevels(); - trace(BidiTraceHook.PHASE_EXPLICIT, 0, textLength); - - // Rule X9. - setRule("X9"); - textLength = removeExplicitCodes(); - trace(BidiTraceHook.PHASE_EXPLICIT_REMOVED, 0, textLength); - - // Rule X10. 
- // Run remainder of algorithm one level run at a time - setRule("X10"); - byte prevLevel = paragraphEmbeddingLevel; - int start = 0; - while (start < textLength) { - byte level = resultLevels[start]; - byte prevType = typeForLevel(Math.max(prevLevel, level)); - - int limit = start + 1; - while (limit < textLength && resultLevels[limit] == level) { - ++limit; - } + // The bidi types - byte succLevel = limit < textLength ? resultLevels[limit] : paragraphEmbeddingLevel; - byte succType = typeForLevel(Math.max(succLevel, level)); - - // 3) resolving weak types - // Rules W1-W7. - setRule("W1-7"); - resolveWeakTypes(start, limit, level, prevType, succType); - trace(BidiTraceHook.PHASE_WEAK, start, limit); - - // 4) resolving neutral types - // Rules N1-N3. - setRule("N1-2"); - resolveNeutralTypes(start, limit, level, prevType, succType); - trace(BidiTraceHook.PHASE_NEUTRAL, start, limit); - - // 5) resolving implicit embedding levels - // Rules I1, I2. - setRule("I1-2"); - resolveImplicitLevels(start, limit, level, prevType, succType); - trace(BidiTraceHook.PHASE_IMPLICIT, start, limit); - - prevLevel = level; - start = limit; - } - } + /** Left-to-right */ + public static final byte L = 0; - // Reinsert explicit codes and assign appropriate levels to 'hide' them. - // This is for convenience, so the resulting level array maps 1-1 - // with the initial array. - // See the implementation suggestions section of TR#9 for guidelines on - // how to implement the algorithm without removing and reinserting the codes. - textLength = reinsertExplicitCodes(textLength); - } - - /** - * 1) determining the paragraph level. - *

- * Rules P2, P3. - *

- * At the end of this function, the member variable paragraphEmbeddingLevel is set to either 0 or 1. - */ - private void determineParagraphEmbeddingLevel() { - byte strongType = -1; // unknown - - // Rule P2. - for (int i = 0; i < textLength; ++i) { - byte t = resultTypes[i]; - if (t == L || t == AL || t == R) { - strongType = t; - break; - } - } + /** Left-to-Right Embedding */ + public static final byte LRE = 1; - // Rule P3. - if (strongType == -1) { // none found - // default embedding level when no strong types found is 0. - paragraphEmbeddingLevel = 0; - } else if (strongType == L) { - paragraphEmbeddingLevel = 0; - } else { // AL, R - paragraphEmbeddingLevel = 1; - } - } - - /** - * Process embedding format codes. - *

- * Calls processEmbeddings to generate an embedding array from the explicit format codes. The - * embedding overrides in the array are then applied to the result types, and the result levels are - * initialized. - * @see #processEmbeddings - */ - private void determineExplicitEmbeddingLevels() { - embeddings = processEmbeddings(resultTypes, paragraphEmbeddingLevel); - - for (int i = 0; i < textLength; ++i) { - byte level = embeddings[i]; - if ((level & 0x80) != 0) { - level &= 0x7f; - setType(i, typeForLevel(level)); - } - resultLevels[i] = level; - } - } + /** Left-to-Right Override */ + public static final byte LRO = 2; - private void setType(int i, byte value) { - if (value != resultTypes[i]) { - record[i].append(getRule() + "\u2192"+getHtmlTypename(value) + "\n"); - } - resultTypes[i] = value; - } - - public String getChanges(int i) { - return record[i].toString(); - } - - /** - * Rules X9. - * Remove explicit codes so that they may be ignored during the remainder - * of the main portion of the algorithm. The length of the resulting text - * is returned. - * @return the length of the data excluding explicit codes and BN. - */ - private int removeExplicitCodes() { - int w = 0; - mapToOriginal = new int[initialTypes.length]; - for (int i = 0; i < textLength; ++i) { - byte t = initialTypes[i]; - if (!(t == LRE || t == RLE || t == LRO || t == RLO || t == PDF || t == BN)) { - mapToOriginal[w] = i; - embeddings[w] = embeddings[i]; - resultTypes[w] = resultTypes[i]; - resultLevels[w] = resultLevels[i]; - w++; - } - } - return w; // new textLength while explicit levels are removed - } - - /** - * Reinsert levels information for explicit codes. - * This is for ease of relating the level information - * to the original input data. Note that the levels - * assigned to these codes are arbitrary, they're - * chosen so as to avoid breaking level runs. 
- * @param textLength the length of the data after compression - * @return the length of the data (original length of - * types array supplied to constructor) - */ - private int reinsertExplicitCodes(int textLength) { - for (int i = initialTypes.length; --i >= 0;) { - byte t = initialTypes[i]; - if (t == LRE || t == RLE || t == LRO || t == RLO || t == PDF || t == BN) { - embeddings[i] = 0; - setType(i, t); - resultLevels[i] = -1; - } else { - --textLength; - embeddings[i] = embeddings[textLength]; - setType(i, resultTypes[textLength]); - resultLevels[i] = resultLevels[textLength]; - } - } - mapToOriginal = null; + /** Right-to-Left */ + public static final byte R = 3; + + /** Right-to-Left Arabic */ + public static final byte AL = 4; + + /** Right-to-Left Embedding */ + public static final byte RLE = 5; + + /** Right-to-Left Override */ + public static final byte RLO = 6; + + /** Pop Directional Format */ + public static final byte PDF = 7; + + /** European Number */ + public static final byte EN = 8; + + /** European Number Separator */ + public static final byte ES = 9; + + /** European Number Terminator */ + public static final byte ET = 10; + + /** Arabic Number */ + public static final byte AN = 11; + + /** Common Number Separator */ + public static final byte CS = 12; + + /** Non-Spacing Mark */ + public static final byte NSM = 13; + + /** Boundary Neutral */ + public static final byte BN = 14; + + /** Paragraph Separator */ + public static final byte B = 15; + + /** Segment Separator */ + public static final byte S = 16; - // now propagate forward the levels information (could have - // propagated backward, the main thing is not to introduce a level - // break where one doesn't already exist). + /** Whitespace */ + public static final byte WS = 17; - if (resultLevels[0] == -1) { - resultLevels[0] = paragraphEmbeddingLevel; + /** Other Neutrals */ + public static final byte ON = 18; + + /** Minimum bidi type value. 
*/ + public static final byte TYPE_MIN = 0; + + /** Maximum bidi type value. */ + public static final byte TYPE_MAX = 18; + + /** Shorthand names of bidi type values, for error reporting. */ + public static final String[] typenames = { + "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", + "BN", "B", "S", "WS", "ON", + }; + + // + // Input + // + + /** + * Initialize using an array of direction types. Types range from TYPE_MIN to TYPE_MAX inclusive + * and represent the direction codes of the characters in the text. + * + * @param types the types array + */ + public BidiReference(byte[] types) { + validateTypes(types); + + initialTypes = types.clone(); // client type array remains unchanged + + runAlgorithm(); } - for (int i = 1; i < initialTypes.length; ++i) { - if (resultLevels[i] == -1) { - resultLevels[i] = resultLevels[i-1]; - } + + /** + * Initialize using an array of direction types and an externally supplied paragraph embedding + * level. The embedding level may be -1, 0, or 1. -1 means to apply the default algorithm (rules + * P2 and P3), 0 is for LTR paragraphs, and 1 is for RTL paragraphs. + * + * @param types the types array + * @param paragraphEmbeddingLevel the externally supplied paragraph embedding level. + */ + public BidiReference(byte[] types, byte paragraphEmbeddingLevel) { + validateTypes(types); + validateParagraphEmbeddingLevel(paragraphEmbeddingLevel); + + initialTypes = types.clone(); // client type array remains unchanged + this.paragraphEmbeddingLevel = paragraphEmbeddingLevel; + + runAlgorithm(); } - // Embedding information is for informational purposes only - // so need not be adjusted. - - return initialTypes.length; - } - - /** - * 2) determining explicit levels - * Rules X1 - X8 - * - * The interaction of these rules makes handling them a bit complex. - * This examines resultTypes but does not modify it. It returns embedding and - * override information in the result array. 
The low 7 bits are the level, the high - * bit is set if the level is an override, and clear if it is an embedding. - */ - private static byte[] processEmbeddings(byte[] resultTypes, byte paragraphEmbeddingLevel) { - final int EXPLICIT_LEVEL_LIMIT = 62; - - int textLength = resultTypes.length; - byte[] embeddings = new byte[textLength]; - - // This stack will store the embedding levels and override status in a single byte - // as described above. - byte[] embeddingValueStack = new byte[EXPLICIT_LEVEL_LIMIT]; - int stackCounter = 0; - - // An LRE or LRO at level 60 is invalid, since the new level 62 is invalid. But - // an RLE at level 60 is valid, since the new level 61 is valid. The current wording - // of the rules requires that the RLE remain valid even if a previous LRE is invalid. - // This keeps track of ignored LRE or LRO codes at level 60, so that the matching PDFs - // will not try to pop the stack. - int overflowAlmostCounter = 0; - - // This keeps track of ignored pushes at level 61 or higher, so that matching PDFs will - // not try to pop the stack. - int overflowCounter = 0; - - // Rule X1. - - // Keep the level separate from the value (level | override status flag) for ease of access. 
- byte currentEmbeddingLevel = paragraphEmbeddingLevel; - byte currentEmbeddingValue = paragraphEmbeddingLevel; - - // Loop through types, handling all remaining rules - for (int i = 0; i < textLength; ++i) { - - embeddings[i] = currentEmbeddingValue; - - byte t = resultTypes[i]; - - // Rules X2, X3, X4, X5 - switch (t) { - case RLE: - case LRE: - case RLO: - case LRO: - // Only need to compute new level if current level is valid - if (overflowCounter == 0) { - byte newLevel; - if (t == RLE || t == RLO) { - newLevel = (byte)((currentEmbeddingLevel + 1) | 1); // least greater odd - } else { // t == LRE || t == LRO - newLevel = (byte)((currentEmbeddingLevel + 2) & ~1); // least greater even - } - - // If the new level is valid, push old embedding level and override status - // No check for valid stack counter, since the level check suffices. - if (newLevel < EXPLICIT_LEVEL_LIMIT) { - embeddingValueStack[stackCounter] = currentEmbeddingValue; - stackCounter++; - - currentEmbeddingLevel = newLevel; - if (t == LRO || t == RLO) { // override - currentEmbeddingValue = (byte)(newLevel | 0x80); - } else { - currentEmbeddingValue = newLevel; + /** + * The algorithm. Does not include line-based processing (Rules L1, L2). These are applied later + * in the line-based phase of the algorithm. + */ + private void runAlgorithm() { + // Ensure trace hook does not change while running algorithm. + // Trace hook is a shared class resource. + synchronized (BidiReference.class) { + textLength = initialTypes.length; + + // Initialize output types. + // Result types initialized to input types. + resultTypes = initialTypes.clone(); + record = new StringBuffer[resultTypes.length]; + for (int i = 0; i < resultTypes.length; ++i) { + record[i] = new StringBuffer(); } - // Adjust level of format mark (for expositional purposes only, this gets - // removed later). 
- embeddings[i] = currentEmbeddingValue; - break; - } - - // Otherwise new level is invalid, but a valid level can still be achieved if this - // level is 60 and we encounter an RLE or RLO further on. So record that we - // 'almost' overflowed. - if (currentEmbeddingLevel == 60) { - overflowAlmostCounter++; - break; - } - } + trace(BidiTraceHook.PHASE_INIT, 0, textLength); - // Otherwise old or new level is invalid. - overflowCounter++; - break; + // 1) determining the paragraph level + // Rule P1 is the requirement for entering this algorithm. + // Rules P2, P3. + // If no externally supplied paragraph embedding level, use default. + setRule("P1"); + if (paragraphEmbeddingLevel == -1) { + determineParagraphEmbeddingLevel(); + } - case PDF: - // The only case where this did not actually overflow but may have almost overflowed - // is when there was an RLE or RLO on level 60, which would result in level 61. So we - // only test the almost overflow condition in that case. - // - // Also note that there may be a PDF without any pushes at all. - - if (overflowCounter > 0) { - --overflowCounter; - } else if (overflowAlmostCounter > 0 && currentEmbeddingLevel != 61) { - --overflowAlmostCounter; - } else if (stackCounter > 0) { - --stackCounter; - currentEmbeddingValue = embeddingValueStack[stackCounter]; - currentEmbeddingLevel = (byte)(currentEmbeddingValue & 0x7f); + // Initialize result levels to paragraph embedding level. + setRule("P1"); + resultLevels = new byte[textLength]; + setLevels(0, textLength, paragraphEmbeddingLevel); + trace(BidiTraceHook.PHASE_BASELEVEL, 0, textLength); + + // 2) Explicit levels and directions + // Rules X1-X8.\ + setRule("X1-8"); + determineExplicitEmbeddingLevels(); + trace(BidiTraceHook.PHASE_EXPLICIT, 0, textLength); + + // Rule X9. + setRule("X9"); + textLength = removeExplicitCodes(); + trace(BidiTraceHook.PHASE_EXPLICIT_REMOVED, 0, textLength); + + // Rule X10. 
+ // Run remainder of algorithm one level run at a time + setRule("X10"); + byte prevLevel = paragraphEmbeddingLevel; + int start = 0; + while (start < textLength) { + byte level = resultLevels[start]; + byte prevType = typeForLevel(Math.max(prevLevel, level)); + + int limit = start + 1; + while (limit < textLength && resultLevels[limit] == level) { + ++limit; + } + + byte succLevel = limit < textLength ? resultLevels[limit] : paragraphEmbeddingLevel; + byte succType = typeForLevel(Math.max(succLevel, level)); + + // 3) resolving weak types + // Rules W1-W7. + setRule("W1-7"); + resolveWeakTypes(start, limit, level, prevType, succType); + trace(BidiTraceHook.PHASE_WEAK, start, limit); + + // 4) resolving neutral types + // Rules N1-N3. + setRule("N1-2"); + resolveNeutralTypes(start, limit, level, prevType, succType); + trace(BidiTraceHook.PHASE_NEUTRAL, start, limit); + + // 5) resolving implicit embedding levels + // Rules I1, I2. + setRule("I1-2"); + resolveImplicitLevels(start, limit, level, prevType, succType); + trace(BidiTraceHook.PHASE_IMPLICIT, start, limit); + + prevLevel = level; + start = limit; + } } - break; - - case B: - // Rule X8. - // These values are reset for clarity, in this implementation B can only - // occur as the last code in the array. - stackCounter = 0; - overflowCounter = 0; - overflowAlmostCounter = 0; - currentEmbeddingLevel = paragraphEmbeddingLevel; - currentEmbeddingValue = paragraphEmbeddingLevel; + // Reinsert explicit codes and assign appropriate levels to 'hide' them. + // This is for convenience, so the resulting level array maps 1-1 + // with the initial array. + // See the implementation suggestions section of TR#9 for guidelines on + // how to implement the algorithm without removing and reinserting the codes. + textLength = reinsertExplicitCodes(textLength); + } - embeddings[i] = paragraphEmbeddingLevel; - break; + /** + * 1) determining the paragraph level. + * + *

Rules P2, P3. + * + *

At the end of this function, the member variable paragraphEmbeddingLevel is set to either + * 0 or 1. + */ + private void determineParagraphEmbeddingLevel() { + byte strongType = -1; // unknown + + // Rule P2. + for (int i = 0; i < textLength; ++i) { + byte t = resultTypes[i]; + if (t == L || t == AL || t == R) { + strongType = t; + break; + } + } - default: - break; - } + // Rule P3. + if (strongType == -1) { // none found + // default embedding level when no strong types found is 0. + paragraphEmbeddingLevel = 0; + } else if (strongType == L) { + paragraphEmbeddingLevel = 0; + } else { // AL, R + paragraphEmbeddingLevel = 1; + } } - return embeddings; - } - - - /** - * 3) resolving weak types - * Rules W1-W7. - * - * Note that some weak types (EN, AN) remain after this processing is complete. - */ - private void resolveWeakTypes(int start, int limit, byte level, byte sor, byte eor) { - - // on entry, only these types remain - assertOnly(start, limit, new byte[] {L, R, AL, EN, ES, ET, AN, CS, B, S, WS, ON, NSM }); - - // Rule W1. - // Changes all NSMs. - setRule("W1"); - byte preceedingCharacterType = sor; - for (int i = start; i < limit; ++i) { - byte t = resultTypes[i]; - if (t == NSM) { - setType(i, preceedingCharacterType); - } else { - preceedingCharacterType = t; - } + /** + * Process embedding format codes. + * + *

Calls processEmbeddings to generate an embedding array from the explicit format codes. The + * embedding overrides in the array are then applied to the result types, and the result levels + * are initialized. + * + * @see #processEmbeddings + */ + private void determineExplicitEmbeddingLevels() { + embeddings = processEmbeddings(resultTypes, paragraphEmbeddingLevel); + + for (int i = 0; i < textLength; ++i) { + byte level = embeddings[i]; + if ((level & 0x80) != 0) { + level &= 0x7f; + setType(i, typeForLevel(level)); + } + resultLevels[i] = level; + } } - // Rule W2. - // EN does not change at the start of the run, because sor != AL. - setRule("W2"); - for (int i = start; i < limit; ++i) { - if (resultTypes[i] == EN) { - for (int j = i - 1; j >= start; --j) { - byte t = resultTypes[j]; - if (t == L || t == R || t == AL) { - if (t == AL) { - setType(i, AN); - } - break; - } + private void setType(int i, byte value) { + if (value != resultTypes[i]) { + record[i].append(getRule() + "\u2192" + getHtmlTypename(value) + "\n"); } - } + resultTypes[i] = value; } - // Rule W3. - setRule("W3"); - for (int i = start; i < limit; ++i) { - if (resultTypes[i] == AL) { - setType(i, R); - } + public String getChanges(int i) { + return record[i].toString(); } - // Rule W4. - // Since there must be values on both sides for this rule to have an - // effect, the scan skips the first and last value. - // - // Although the scan proceeds left to right, and changes the type values - // in a way that would appear to affect the computations later in the scan, - // there is actually no problem. A change in the current value can only - // affect the value to its immediate right, and only affect it if it is - // ES or CS. But the current value can only change if the value to its - // right is not ES or CS. Thus either the current value will not change, - // or its change will have no effect on the remainder of the analysis. 
- - setRule("W4"); - for (int i = start + 1; i < limit - 1; ++i) { - if (resultTypes[i] == ES || resultTypes[i] == CS) { - byte prevSepType = resultTypes[i-1]; - byte succSepType = resultTypes[i+1]; - if (prevSepType == EN && succSepType == EN) { - setType(i, EN); - } else if (resultTypes[i] == CS && prevSepType == AN && succSepType == AN) { - setType(i, AN); + /** + * Rules X9. Remove explicit codes so that they may be ignored during the remainder of the main + * portion of the algorithm. The length of the resulting text is returned. + * + * @return the length of the data excluding explicit codes and BN. + */ + private int removeExplicitCodes() { + int w = 0; + mapToOriginal = new int[initialTypes.length]; + for (int i = 0; i < textLength; ++i) { + byte t = initialTypes[i]; + if (!(t == LRE || t == RLE || t == LRO || t == RLO || t == PDF || t == BN)) { + mapToOriginal[w] = i; + embeddings[w] = embeddings[i]; + resultTypes[w] = resultTypes[i]; + resultLevels[w] = resultLevels[i]; + w++; + } } - } + return w; // new textLength while explicit levels are removed } - // Rule W5. - setRule("W5"); - for (int i = start; i < limit; ++i) { - if (resultTypes[i] == ET) { - // locate end of sequence - int runstart = i; - int runlimit = findRunLimit(runstart, limit, new byte[] { ET }); + /** + * Reinsert levels information for explicit codes. This is for ease of relating the level + * information to the original input data. Note that the levels assigned to these codes are + * arbitrary, they're chosen so as to avoid breaking level runs. 
+ * + * @param textLength the length of the data after compression + * @return the length of the data (original length of types array supplied to constructor) + */ + private int reinsertExplicitCodes(int textLength) { + for (int i = initialTypes.length; --i >= 0; ) { + byte t = initialTypes[i]; + if (t == LRE || t == RLE || t == LRO || t == RLO || t == PDF || t == BN) { + embeddings[i] = 0; + setType(i, t); + resultLevels[i] = -1; + } else { + --textLength; + embeddings[i] = embeddings[textLength]; + setType(i, resultTypes[textLength]); + resultLevels[i] = resultLevels[textLength]; + } + } + mapToOriginal = null; - // check values at ends of sequence - byte t = runstart == start ? sor : resultTypes[runstart - 1]; + // now propagate forward the levels information (could have + // propagated backward, the main thing is not to introduce a level + // break where one doesn't already exist). - if (t != EN) { - t = runlimit == limit ? eor : resultTypes[runlimit]; + if (resultLevels[0] == -1) { + resultLevels[0] = paragraphEmbeddingLevel; } - - if (t == EN) { - setTypes(runstart, runlimit, EN); + for (int i = 1; i < initialTypes.length; ++i) { + if (resultLevels[i] == -1) { + resultLevels[i] = resultLevels[i - 1]; + } } - // continue at end of sequence - i = runlimit; - } + // Embedding information is for informational purposes only + // so need not be adjusted. + + return initialTypes.length; } - // Rule W6. - setRule("W6"); - for (int i = start; i < limit; ++i) { - byte t = resultTypes[i]; - if (t == ES || t == ET || t == CS) { - setType(i, ON); - } + /** + * 2) determining explicit levels Rules X1 - X8 + * + *

The interaction of these rules makes handling them a bit complex. This examines + * resultTypes but does not modify it. It returns embedding and override information in the + * result array. The low 7 bits are the level, the high bit is set if the level is an override, + * and clear if it is an embedding. + */ + private static byte[] processEmbeddings(byte[] resultTypes, byte paragraphEmbeddingLevel) { + final int EXPLICIT_LEVEL_LIMIT = 62; + + int textLength = resultTypes.length; + byte[] embeddings = new byte[textLength]; + + // This stack will store the embedding levels and override status in a single byte + // as described above. + byte[] embeddingValueStack = new byte[EXPLICIT_LEVEL_LIMIT]; + int stackCounter = 0; + + // An LRE or LRO at level 60 is invalid, since the new level 62 is invalid. But + // an RLE at level 60 is valid, since the new level 61 is valid. The current wording + // of the rules requires that the RLE remain valid even if a previous LRE is invalid. + // This keeps track of ignored LRE or LRO codes at level 60, so that the matching PDFs + // will not try to pop the stack. + int overflowAlmostCounter = 0; + + // This keeps track of ignored pushes at level 61 or higher, so that matching PDFs will + // not try to pop the stack. + int overflowCounter = 0; + + // Rule X1. + + // Keep the level separate from the value (level | override status flag) for ease of access. 
+ byte currentEmbeddingLevel = paragraphEmbeddingLevel; + byte currentEmbeddingValue = paragraphEmbeddingLevel; + + // Loop through types, handling all remaining rules + for (int i = 0; i < textLength; ++i) { + + embeddings[i] = currentEmbeddingValue; + + byte t = resultTypes[i]; + + // Rules X2, X3, X4, X5 + switch (t) { + case RLE: + case LRE: + case RLO: + case LRO: + // Only need to compute new level if current level is valid + if (overflowCounter == 0) { + byte newLevel; + if (t == RLE || t == RLO) { + newLevel = + (byte) ((currentEmbeddingLevel + 1) | 1); // least greater odd + } else { // t == LRE || t == LRO + newLevel = + (byte) ((currentEmbeddingLevel + 2) & ~1); // least greater even + } + + // If the new level is valid, push old embedding level and override status + // No check for valid stack counter, since the level check suffices. + if (newLevel < EXPLICIT_LEVEL_LIMIT) { + embeddingValueStack[stackCounter] = currentEmbeddingValue; + stackCounter++; + + currentEmbeddingLevel = newLevel; + if (t == LRO || t == RLO) { // override + currentEmbeddingValue = (byte) (newLevel | 0x80); + } else { + currentEmbeddingValue = newLevel; + } + + // Adjust level of format mark (for expositional purposes only, this + // gets + // removed later). + embeddings[i] = currentEmbeddingValue; + break; + } + + // Otherwise new level is invalid, but a valid level can still be achieved + // if this + // level is 60 and we encounter an RLE or RLO further on. So record that we + // 'almost' overflowed. + if (currentEmbeddingLevel == 60) { + overflowAlmostCounter++; + break; + } + } + + // Otherwise old or new level is invalid. + overflowCounter++; + break; + + case PDF: + // The only case where this did not actually overflow but may have almost + // overflowed + // is when there was an RLE or RLO on level 60, which would result in level 61. + // So we + // only test the almost overflow condition in that case. 
+ // + // Also note that there may be a PDF without any pushes at all. + + if (overflowCounter > 0) { + --overflowCounter; + } else if (overflowAlmostCounter > 0 && currentEmbeddingLevel != 61) { + --overflowAlmostCounter; + } else if (stackCounter > 0) { + --stackCounter; + currentEmbeddingValue = embeddingValueStack[stackCounter]; + currentEmbeddingLevel = (byte) (currentEmbeddingValue & 0x7f); + } + break; + + case B: + // Rule X8. + + // These values are reset for clarity, in this implementation B can only + // occur as the last code in the array. + stackCounter = 0; + overflowCounter = 0; + overflowAlmostCounter = 0; + currentEmbeddingLevel = paragraphEmbeddingLevel; + currentEmbeddingValue = paragraphEmbeddingLevel; + + embeddings[i] = paragraphEmbeddingLevel; + break; + + default: + break; + } + } + + return embeddings; } - // Rule W7. - setRule("W7"); - for (int i = start; i < limit; ++i) { - if (resultTypes[i] == EN) { - // set default if we reach start of run - byte prevStrongType = sor; - for (int j = i - 1; j >= start; --j) { - byte t = resultTypes[j]; - if (t == L || t == R) { // AL's have been removed - prevStrongType = t; - break; - } + /** + * 3) resolving weak types Rules W1-W7. + * + *

Note that some weak types (EN, AN) remain after this processing is complete. + */ + private void resolveWeakTypes(int start, int limit, byte level, byte sor, byte eor) { + + // on entry, only these types remain + assertOnly(start, limit, new byte[] {L, R, AL, EN, ES, ET, AN, CS, B, S, WS, ON, NSM}); + + // Rule W1. + // Changes all NSMs. + setRule("W1"); + byte preceedingCharacterType = sor; + for (int i = start; i < limit; ++i) { + byte t = resultTypes[i]; + if (t == NSM) { + setType(i, preceedingCharacterType); + } else { + preceedingCharacterType = t; + } } - if (prevStrongType == L) { - setType(i, L); + + // Rule W2. + // EN does not change at the start of the run, because sor != AL. + setRule("W2"); + for (int i = start; i < limit; ++i) { + if (resultTypes[i] == EN) { + for (int j = i - 1; j >= start; --j) { + byte t = resultTypes[j]; + if (t == L || t == R || t == AL) { + if (t == AL) { + setType(i, AN); + } + break; + } + } + } } - } - } - } - - /** - * 6) resolving neutral types - * Rules N1-N2. - */ - private void resolveNeutralTypes(int start, int limit, byte level, byte sor, byte eor) { - - // on entry, only these types can be in resultTypes - assertOnly(start, limit, new byte[] {L, R, EN, AN, B, S, WS, ON}); - - for (int i = start; i < limit; ++i) { - byte t = resultTypes[i]; - if (t == WS || t == ON || t == B || t == S) { - // find bounds of run of neutrals - int runstart = i; - int runlimit = findRunLimit(runstart, limit, new byte[] {B, S, WS, ON}); - - // determine effective types at ends of run - byte leadingType; - byte trailingType; - - if (runstart == start) { - leadingType = sor; - } else { - leadingType = resultTypes[runstart - 1]; - if (leadingType == L || leadingType == R) { - // found the strong type - } else if (leadingType == AN) { - leadingType = R; - } else if (leadingType == EN) { - // Since EN's with previous strong L types have been changed - // to L in W7, the leadingType must be R. - leadingType = R; - } + + // Rule W3. 
+ setRule("W3"); + for (int i = start; i < limit; ++i) { + if (resultTypes[i] == AL) { + setType(i, R); + } } - if (runlimit == limit) { - trailingType = eor; - } else { - trailingType = resultTypes[runlimit]; - if (trailingType == L || trailingType == R) { - // found the strong type - } else if (trailingType == AN) { - trailingType = R; - } else if (trailingType == EN) { - trailingType = R; - } + // Rule W4. + // Since there must be values on both sides for this rule to have an + // effect, the scan skips the first and last value. + // + // Although the scan proceeds left to right, and changes the type values + // in a way that would appear to affect the computations later in the scan, + // there is actually no problem. A change in the current value can only + // affect the value to its immediate right, and only affect it if it is + // ES or CS. But the current value can only change if the value to its + // right is not ES or CS. Thus either the current value will not change, + // or its change will have no effect on the remainder of the analysis. + + setRule("W4"); + for (int i = start + 1; i < limit - 1; ++i) { + if (resultTypes[i] == ES || resultTypes[i] == CS) { + byte prevSepType = resultTypes[i - 1]; + byte succSepType = resultTypes[i + 1]; + if (prevSepType == EN && succSepType == EN) { + setType(i, EN); + } else if (resultTypes[i] == CS && prevSepType == AN && succSepType == AN) { + setType(i, AN); + } + } } - byte resolvedType; - if (leadingType == trailingType) { - // Rule N1. - setRule("N1"); - resolvedType = leadingType; - } else { - // Rule N2. - // Notice the embedding level of the run is used, not - // the paragraph embedding level. - setRule("N2"); - resolvedType = typeForLevel(level); + // Rule W5. 
+ setRule("W5"); + for (int i = start; i < limit; ++i) { + if (resultTypes[i] == ET) { + // locate end of sequence + int runstart = i; + int runlimit = findRunLimit(runstart, limit, new byte[] {ET}); + + // check values at ends of sequence + byte t = runstart == start ? sor : resultTypes[runstart - 1]; + + if (t != EN) { + t = runlimit == limit ? eor : resultTypes[runlimit]; + } + + if (t == EN) { + setTypes(runstart, runlimit, EN); + } + + // continue at end of sequence + i = runlimit; + } } - setTypes(runstart, runlimit, resolvedType); + // Rule W6. + setRule("W6"); + for (int i = start; i < limit; ++i) { + byte t = resultTypes[i]; + if (t == ES || t == ET || t == CS) { + setType(i, ON); + } + } - // skip over run of (former) neutrals - i = runlimit; - } + // Rule W7. + setRule("W7"); + for (int i = start; i < limit; ++i) { + if (resultTypes[i] == EN) { + // set default if we reach start of run + byte prevStrongType = sor; + for (int j = i - 1; j >= start; --j) { + byte t = resultTypes[j]; + if (t == L || t == R) { // AL's have been removed + prevStrongType = t; + break; + } + } + if (prevStrongType == L) { + setType(i, L); + } + } + } } - } - - /** - * 7) resolving implicit embedding levels - * Rules I1, I2. - */ - private void resolveImplicitLevels(int start, int limit, byte level, byte sor, byte eor) { - - // on entry, only these types can be in resultTypes - assertOnly(start, limit, new byte[] {L, R, EN, AN}); - - if ((level & 1) == 0) { // even level - for (int i = start; i < limit; ++i) { - byte t = resultTypes[i]; - // Rule I1. - setRule("I1"); - if (t == L ) { - // no change - } else if (t == R) { - resultLevels[i] += 1; - } else { // t == AN || t == EN - resultLevels[i] += 2; + + /** 6) resolving neutral types Rules N1-N2. 
*/ + private void resolveNeutralTypes(int start, int limit, byte level, byte sor, byte eor) { + + // on entry, only these types can be in resultTypes + assertOnly(start, limit, new byte[] {L, R, EN, AN, B, S, WS, ON}); + + for (int i = start; i < limit; ++i) { + byte t = resultTypes[i]; + if (t == WS || t == ON || t == B || t == S) { + // find bounds of run of neutrals + int runstart = i; + int runlimit = findRunLimit(runstart, limit, new byte[] {B, S, WS, ON}); + + // determine effective types at ends of run + byte leadingType; + byte trailingType; + + if (runstart == start) { + leadingType = sor; + } else { + leadingType = resultTypes[runstart - 1]; + if (leadingType == L || leadingType == R) { + // found the strong type + } else if (leadingType == AN) { + leadingType = R; + } else if (leadingType == EN) { + // Since EN's with previous strong L types have been changed + // to L in W7, the leadingType must be R. + leadingType = R; + } + } + + if (runlimit == limit) { + trailingType = eor; + } else { + trailingType = resultTypes[runlimit]; + if (trailingType == L || trailingType == R) { + // found the strong type + } else if (trailingType == AN) { + trailingType = R; + } else if (trailingType == EN) { + trailingType = R; + } + } + + byte resolvedType; + if (leadingType == trailingType) { + // Rule N1. + setRule("N1"); + resolvedType = leadingType; + } else { + // Rule N2. + // Notice the embedding level of the run is used, not + // the paragraph embedding level. + setRule("N2"); + resolvedType = typeForLevel(level); + } + + setTypes(runstart, runlimit, resolvedType); + + // skip over run of (former) neutrals + i = runlimit; + } } - } - } else { // odd level - for (int i = start; i < limit; ++i) { - byte t = resultTypes[i]; - // Rule I2. - setRule("I2"); - if (t == R) { - // no change - } else { // t == L || t == AN || t == EN - resultLevels[i] += 1; + } + + /** 7) resolving implicit embedding levels Rules I1, I2. 
*/ + private void resolveImplicitLevels(int start, int limit, byte level, byte sor, byte eor) { + + // on entry, only these types can be in resultTypes + assertOnly(start, limit, new byte[] {L, R, EN, AN}); + + if ((level & 1) == 0) { // even level + for (int i = start; i < limit; ++i) { + byte t = resultTypes[i]; + // Rule I1. + setRule("I1"); + if (t == L) { + // no change + } else if (t == R) { + resultLevels[i] += 1; + } else { // t == AN || t == EN + resultLevels[i] += 2; + } + } + } else { // odd level + for (int i = start; i < limit; ++i) { + byte t = resultTypes[i]; + // Rule I2. + setRule("I2"); + if (t == R) { + // no change + } else { // t == L || t == AN || t == EN + resultLevels[i] += 1; + } + } } - } } - } - - // - // Output - // - - /** - * Return levels array breaking lines at offsets in linebreaks.
- * Rule L1. - *

- * The returned levels array contains the resolved level for each - * bidi code passed to the constructor. - *

- * The linebreaks array must include at least one value. - * The values must be in strictly increasing order (no duplicates) - * between 1 and the length of the text, inclusive. The last value - * must be the length of the text. - * - * @param linebreaks the offsets at which to break the paragraph - * @return the resolved levels of the text - */ - public byte[] getLevels(int[] linebreaks) { - - // Note that since the previous processing has removed all - // P, S, and WS values from resultTypes, the values referred to - // in these rules are the initial types, before any processing - // has been applied (including processing of overrides). + // - // This example implementation has reinserted explicit format codes - // and BN, in order that the levels array correspond to the - // initial text. Their final placement is not normative. - // These codes are treated like WS in this implementation, - // so they don't interrupt sequences of WS. - - validateLineBreaks(linebreaks, textLength); - - byte[] result = resultLevels.clone(); // will be returned to caller - - // don't worry about linebreaks since if there is a break within - // a series of WS values preceeding S, the linebreak itself - // causes the reset. - for (int i = 0; i < result.length; ++i) { - byte t = initialTypes[i]; - if (t == B || t == S) { - // Rule L1, clauses one and two. - result[i] = paragraphEmbeddingLevel; - - // Rule L1, clause three. - for (int j = i - 1; j >= 0; --j) { - if (isWhitespace(initialTypes[j])) { // including format codes - result[j] = paragraphEmbeddingLevel; - } else { - break; - } + // Output + // + + /** + * Return levels array breaking lines at offsets in linebreaks.
+ * Rule L1. + * + *

The returned levels array contains the resolved level for each bidi code passed to the + * constructor. + * + *

The linebreaks array must include at least one value. The values must be in strictly + * increasing order (no duplicates) between 1 and the length of the text, inclusive. The last + * value must be the length of the text. + * + * @param linebreaks the offsets at which to break the paragraph + * @return the resolved levels of the text + */ + public byte[] getLevels(int[] linebreaks) { + + // Note that since the previous processing has removed all + // P, S, and WS values from resultTypes, the values referred to + // in these rules are the initial types, before any processing + // has been applied (including processing of overrides). + // + // This example implementation has reinserted explicit format codes + // and BN, in order that the levels array correspond to the + // initial text. Their final placement is not normative. + // These codes are treated like WS in this implementation, + // so they don't interrupt sequences of WS. + + validateLineBreaks(linebreaks, textLength); + + byte[] result = resultLevels.clone(); // will be returned to caller + + // don't worry about linebreaks since if there is a break within + // a series of WS values preceeding S, the linebreak itself + // causes the reset. + for (int i = 0; i < result.length; ++i) { + byte t = initialTypes[i]; + if (t == B || t == S) { + // Rule L1, clauses one and two. + result[i] = paragraphEmbeddingLevel; + + // Rule L1, clause three. + for (int j = i - 1; j >= 0; --j) { + if (isWhitespace(initialTypes[j])) { // including format codes + result[j] = paragraphEmbeddingLevel; + } else { + break; + } + } + } } - } - } - // Rule L1, clause four. - int start = 0; - for (int i = 0; i < linebreaks.length; ++i) { - int limit = linebreaks[i]; - for (int j = limit - 1; j >= start; --j) { - if (isWhitespace(initialTypes[j])) { // including format codes - result[j] = paragraphEmbeddingLevel; - } else { - break; + // Rule L1, clause four. 
+ int start = 0; + for (int i = 0; i < linebreaks.length; ++i) { + int limit = linebreaks[i]; + for (int j = limit - 1; j >= start; --j) { + if (isWhitespace(initialTypes[j])) { // including format codes + result[j] = paragraphEmbeddingLevel; + } else { + break; + } + } + + start = limit; } - } - start = limit; - } + traceLineLevels(linebreaks, result); - traceLineLevels(linebreaks, result); - - return result; - } - - /** - * Return reordering array breaking lines at offsets in linebreaks. - *

- * The reordering array maps from a visual index to a logical index. - * Lines are concatenated from left to right. So for example, the - * fifth character from the left on the third line is - *

 getReordering(linebreaks)[linebreaks[1] + 4]
- * (linebreaks[1] is the position after the last character of the - * second line, which is also the index of the first character on the - * third line, and adding four gets the fifth character from the left). - *

- * The linebreaks array must include at least one value. - * The values must be in strictly increasing order (no duplicates) - * between 1 and the length of the text, inclusive. The last value - * must be the length of the text. - * - * @param linebreaks the offsets at which to break the paragraph. - */ - public int[] getReordering(int[] linebreaks) { - validateLineBreaks(linebreaks, textLength); - - byte[] levels = getLevels(linebreaks); - - return computeMultilineReordering(levels, linebreaks); - } - - /** - * Return multiline reordering array for a given level array. - * Reordering does not occur across a line break. - */ - private static int[] computeMultilineReordering(byte[] levels, int[] linebreaks) { - int[] result = new int[levels.length]; - - int start = 0; - for (int i = 0; i < linebreaks.length; ++i) { - int limit = linebreaks[i]; - - byte[] templevels = new byte[limit - start]; - System.arraycopy(levels, start, templevels, 0, templevels.length); - - int[] temporder = computeReordering(templevels); - for (int j = 0; j < temporder.length; ++j) { - result[start + j] = temporder[j] + start; - } - - start = limit; + return result; } - return result; - } - - /** - * Return reordering array for a given level array. This reorders a single line. - * The reordering is a visual to logical map. For example, - * the leftmost char is string.charAt(order[0]). - * Rule L2. - */ - private static int[] computeReordering(byte[] levels) { - int lineLength = levels.length; + /** + * Return reordering array breaking lines at offsets in linebreaks. + * + *

The reordering array maps from a visual index to a logical index. Lines are concatenated + * from left to right. So for example, the fifth character from the left on the third line is + * + *

 getReordering(linebreaks)[linebreaks[1] + 4]
+ * + * (linebreaks[1] is the position after the last character of the second line, which is also the + * index of the first character on the third line, and adding four gets the fifth character from + * the left). + * + *

The linebreaks array must include at least one value. The values must be in strictly + * increasing order (no duplicates) between 1 and the length of the text, inclusive. The last + * value must be the length of the text. + * + * @param linebreaks the offsets at which to break the paragraph. + */ + public int[] getReordering(int[] linebreaks) { + validateLineBreaks(linebreaks, textLength); - int[] result = new int[lineLength]; + byte[] levels = getLevels(linebreaks); - // initialize order - for (int i = 0; i < lineLength; ++i) { - result[i] = i; + return computeMultilineReordering(levels, linebreaks); } - // locate highest level found on line. - // Note the rules say text, but no reordering across line bounds is performed, - // so this is sufficient. - byte highestLevel = 0; - byte lowestOddLevel = 63; - for (int i = 0; i < lineLength; ++i) { - byte level = levels[i]; - if (level > highestLevel) { - highestLevel = level; - } - if (((level & 1) != 0) && level < lowestOddLevel) { - lowestOddLevel = level; - } - } + /** + * Return multiline reordering array for a given level array. Reordering does not occur across a + * line break. + */ + private static int[] computeMultilineReordering(byte[] levels, int[] linebreaks) { + int[] result = new int[levels.length]; - for (int level = highestLevel; level >= lowestOddLevel; --level) { - for (int i = 0; i < lineLength; ++i) { - if (levels[i] >= level) { - // find range of text at or above this level - int start = i; - int limit = i + 1; - while (limit < lineLength && levels[limit] >= level) { - ++limit; - } - - // reverse run - for (int j = start, k = limit - 1; j < k; ++j, --k) { - int temp = result[j]; - result[j] = result[k]; - result[k] = temp; - } - - // skip to end of level run - i = limit; - } - } - } + int start = 0; + for (int i = 0; i < linebreaks.length; ++i) { + int limit = linebreaks[i]; - return result; - } - - /** - * Return the base level of the paragraph. 
- */ - public byte getBaseLevel() { - return paragraphEmbeddingLevel; - } - - // --- internal utilities ------------------------------------------------- - - /** - * Return true if the type is considered a whitespace type for the line break rules. - */ - private static boolean isWhitespace(byte biditype) { - switch (biditype) { - case LRE: - case RLE: - case LRO: - case RLO: - case PDF: - case BN: - case WS: - return true; - default: - return false; - } - } - - /** - * Return the strong type (L or R) corresponding to the level. - */ - private static byte typeForLevel(int level) { - return ((level & 0x1) == 0) ? L : R; - } - - /** - * Return the limit of the run starting at index that includes only resultTypes in validSet. - * This checks the value at index, and will return index if that value is not in validSet. - */ - private int findRunLimit(int index, int limit, byte[] validSet) { - --index; - loop: - while (++index < limit) { - byte t = resultTypes[index]; - for (int i = 0; i < validSet.length; ++i) { - if (t == validSet[i]) { - continue loop; - } - } - // didn't find a match in validSet - return index; - } - return limit; - } - - /** - * Return the start of the run including index that includes only resultTypes in validSet. - * This assumes the value at index is valid, and does not check it. - */ - private int findRunStart(int index, byte[] validSet) { - loop: - while (--index >= 0) { - byte t = resultTypes[index]; - for (int i = 0; i < validSet.length; ++i) { - if (t == validSet[i]) { - continue loop; - } + byte[] templevels = new byte[limit - start]; + System.arraycopy(levels, start, templevels, 0, templevels.length); + + int[] temporder = computeReordering(templevels); + for (int j = 0; j < temporder.length; ++j) { + result[start + j] = temporder[j] + start; + } + + start = limit; } - return index + 1; - } - return 0; - } - - /** - * Set resultTypes from start up to (but not including) limit to newType. 
- */ - private void setTypes(int start, int limit, byte newType) { - for (int i = start; i < limit; ++i) { - setType(i, newType); - } - } - - /** - * Set resultLevels from start up to (but not including) limit to newLevel. - */ - private void setLevels(int start, int limit, byte newLevel) { - for (int i = start; i < limit; ++i) { - resultLevels[i] = newLevel; + + return result; } - } - - // --- algorithm internal validation -------------------------------------- - - /** - * Algorithm validation. - * Assert that all values in resultTypes are in the provided set. - */ - private void assertOnly(int start, int limit, byte[] codes) { - loop: - for (int i = start; i < limit; ++i) { - byte t = resultTypes[i]; - for (int j = 0; j < codes.length; ++j) { - if (t == codes[j]) { - continue loop; - } + + /** + * Return reordering array for a given level array. This reorders a single line. The reordering + * is a visual to logical map. For example, the leftmost char is string.charAt(order[0]). Rule + * L2. + */ + private static int[] computeReordering(byte[] levels) { + int lineLength = levels.length; + + int[] result = new int[lineLength]; + + // initialize order + for (int i = 0; i < lineLength; ++i) { + result[i] = i; } - throw new Error("invalid bidi code " + getHtmlTypename(t) + " present in assertOnly at position " + i); - } - } + // locate highest level found on line. + // Note the rules say text, but no reordering across line bounds is performed, + // so this is sufficient. 
+ byte highestLevel = 0; + byte lowestOddLevel = 63; + for (int i = 0; i < lineLength; ++i) { + byte level = levels[i]; + if (level > highestLevel) { + highestLevel = level; + } + if (((level & 1) != 0) && level < lowestOddLevel) { + lowestOddLevel = level; + } + } - // --- input validation --------------------------------------------------- + for (int level = highestLevel; level >= lowestOddLevel; --level) { + for (int i = 0; i < lineLength; ++i) { + if (levels[i] >= level) { + // find range of text at or above this level + int start = i; + int limit = i + 1; + while (limit < lineLength && levels[limit] >= level) { + ++limit; + } + + // reverse run + for (int j = start, k = limit - 1; j < k; ++j, --k) { + int temp = result[j]; + result[j] = result[k]; + result[k] = temp; + } + + // skip to end of level run + i = limit; + } + } + } - /** - * Throw exception if type array is invalid. - */ - private static void validateTypes(byte[] types) { - if (types == null) { - throw new IllegalArgumentException("types is null"); + return result; } - for (int i = 0; i < types.length; ++i) { - if (types[i] < TYPE_MIN || types[i] > TYPE_MAX) { - throw new IllegalArgumentException("illegal type value at " + i + ": " + types[i]); - } - } - for (int i = 0; i < types.length - 1; ++i) { - if (types[i] == B) { - throw new IllegalArgumentException("B type before end of paragraph at index: " + i); - } - } - } - - /** - * Throw exception if paragraph embedding level is invalid. Special allowance for -1 so that - * default processing can still be performed when using this API. - */ - private static void validateParagraphEmbeddingLevel(byte paragraphEmbeddingLevel) { - if (paragraphEmbeddingLevel != -1 && - paragraphEmbeddingLevel != 0 && - paragraphEmbeddingLevel != 1) { - throw new IllegalArgumentException("illegal paragraph embedding level: " + paragraphEmbeddingLevel); - } - } - - /** - * Throw exception if line breaks array is invalid. 
- */ - private static void validateLineBreaks(int[] linebreaks, int textLength) { - int prev = 0; - for (int i = 0; i < linebreaks.length; ++i) { - int next = linebreaks[i]; - if (next <= prev) { - throw new IllegalArgumentException("bad linebreak: " + next + " at index: " + i); - } - prev = next; - } - if (prev != textLength) { - throw new IllegalArgumentException("last linebreak must be at " + textLength); + + /** Return the base level of the paragraph. */ + public byte getBaseLevel() { + return paragraphEmbeddingLevel; } - } - // --- debug utilities ---------------------------------------------------- + // --- internal utilities ------------------------------------------------- + + /** Return true if the type is considered a whitespace type for the line break rules. */ + private static boolean isWhitespace(byte biditype) { + switch (biditype) { + case LRE: + case RLE: + case LRO: + case RLO: + case PDF: + case BN: + case WS: + return true; + default: + return false; + } + } - /** - * An interface for tracing the progress of the Bidi reference implementation. - */ - public static interface BidiTraceHook { + /** Return the strong type (L or R) corresponding to the level. */ + private static byte typeForLevel(int level) { + return ((level & 0x1) == 0) ? L : R; + } /** - * Display the current state of the implementation. - *

- * The data supplied to the display method represents the current internal state of the implementation. Note - * that some phases of the algorithm operate on the data as it appears when the explicit formatting codes and - * BN have been removed. When this is the case, start and limit do not correspond directly to the original - * direction type codes that were passed to the constructor. However, the values in embeddings, resultTypes, - * and resultLevels are consistent. - *

- * @param phase the current phase of the algorithm - * @param start the start of the run of text being worked on - * @param limit the limit of the run of text being worked on - * @param paragraphEmbeddingLevel the paragraph embedding level - * @param initialTypes the original bidi types provided to the constructor - * @param embeddings the embeddings and override information resulting from explicit formatting codes - * @param resultTypes the current resolved bidi types - * @param resultLevels the current resolved levels (assuming the paragraph is a single line) + * Return the limit of the run starting at index that includes only resultTypes in validSet. + * This checks the value at index, and will return index if that value is not in validSet. */ - public abstract void display(int phase, - int start, int limit, - byte paragraphEmbeddingLevel, - byte[] initialTypes, - byte[] embeddings, - byte[] resultTypes, - byte[] resultLevels); + private int findRunLimit(int index, int limit, byte[] validSet) { + --index; + loop: + while (++index < limit) { + byte t = resultTypes[index]; + for (int i = 0; i < validSet.length; ++i) { + if (t == validSet[i]) { + continue loop; + } + } + // didn't find a match in validSet + return index; + } + return limit; + } /** - * Display the results of processing line break information to generate line levels. - *

- * @param paragraphEmbeddingLevel the paragraph embedding level - * @param initialTypes the original bidi types provided to the constructor - * @param embeddings the embeddings and override information resulting from explicit formatting codes - * @param linebreaks the array of positions where line breaks occur - * @param resolvedLevels the resolved levels before line processing is performed - * @param lineLevels the levels after line processing was performed + * Return the start of the run including index that includes only resultTypes in validSet. This + * assumes the value at index is valid, and does not check it. */ - public abstract void displayLineLevels(byte paragraphEmbeddingLevel, - byte[] initialTypes, - byte[] embeddings, - int[] linebreaks, - byte[] resolvedLevels, - byte[] lineLevels); + private int findRunStart(int index, byte[] validSet) { + loop: + while (--index >= 0) { + byte t = resultTypes[index]; + for (int i = 0; i < validSet.length; ++i) { + if (t == validSet[i]) { + continue loop; + } + } + return index + 1; + } + return 0; + } - /** - * Display a message. - * - * @param msg the message text - */ - public abstract void message(String msg); + /** Set resultTypes from start up to (but not including) limit to newType. */ + private void setTypes(int start, int limit, byte newType) { + for (int i = start; i < limit; ++i) { + setType(i, newType); + } + } + /** Set resultLevels from start up to (but not including) limit to newLevel. */ + private void setLevels(int start, int limit, byte newLevel) { + for (int i = start; i < limit; ++i) { + resultLevels[i] = newLevel; + } + } - /** The phase before any processing on the data bas been performed. */ - public static int PHASE_INIT = 0; + // --- algorithm internal validation -------------------------------------- + + /** Algorithm validation. Assert that all values in resultTypes are in the provided set. 
*/ + private void assertOnly(int start, int limit, byte[] codes) { + loop: + for (int i = start; i < limit; ++i) { + byte t = resultTypes[i]; + for (int j = 0; j < codes.length; ++j) { + if (t == codes[j]) { + continue loop; + } + } - /** The phase after the base paragraph level has been determined. */ - public static int PHASE_BASELEVEL = 1; + throw new Error( + "invalid bidi code " + + getHtmlTypename(t) + + " present in assertOnly at position " + + i); + } + } - /** The phase after explicit codes have been processed to generate the embedding information. */ - public static int PHASE_EXPLICIT = 2; + // --- input validation --------------------------------------------------- - /** The phase after explicit codes and BN have been removed from the internal data. */ - public static int PHASE_EXPLICIT_REMOVED = 3; + /** Throw exception if type array is invalid. */ + private static void validateTypes(byte[] types) { + if (types == null) { + throw new IllegalArgumentException("types is null"); + } + for (int i = 0; i < types.length; ++i) { + if (types[i] < TYPE_MIN || types[i] > TYPE_MAX) { + throw new IllegalArgumentException("illegal type value at " + i + ": " + types[i]); + } + } + for (int i = 0; i < types.length - 1; ++i) { + if (types[i] == B) { + throw new IllegalArgumentException("B type before end of paragraph at index: " + i); + } + } + } - /** The phase after the weak rule processing has been performed. */ - public static int PHASE_WEAK = 4; + /** + * Throw exception if paragraph embedding level is invalid. Special allowance for -1 so that + * default processing can still be performed when using this API. 
+ */ + private static void validateParagraphEmbeddingLevel(byte paragraphEmbeddingLevel) { + if (paragraphEmbeddingLevel != -1 + && paragraphEmbeddingLevel != 0 + && paragraphEmbeddingLevel != 1) { + throw new IllegalArgumentException( + "illegal paragraph embedding level: " + paragraphEmbeddingLevel); + } + } - /** The phase after the neutral rule processing has been performed. */ - public static int PHASE_NEUTRAL = 5; + /** Throw exception if line breaks array is invalid. */ + private static void validateLineBreaks(int[] linebreaks, int textLength) { + int prev = 0; + for (int i = 0; i < linebreaks.length; ++i) { + int next = linebreaks[i]; + if (next <= prev) { + throw new IllegalArgumentException("bad linebreak: " + next + " at index: " + i); + } + prev = next; + } + if (prev != textLength) { + throw new IllegalArgumentException("last linebreak must be at " + textLength); + } + } - /** The phase after the implicit rule processing has been performed. */ - public static int PHASE_IMPLICIT = 6; - } + // --- debug utilities ---------------------------------------------------- + + /** An interface for tracing the progress of the Bidi reference implementation. */ + public static interface BidiTraceHook { + + /** + * Display the current state of the implementation. + * + *

The data supplied to the display method represents the current internal state of the + * implementation. Note that some phases of the algorithm operate on the data as it appears + * when the explicit formatting codes and BN have been removed. When this is the case, start + * and limit do not correspond directly to the original direction type codes that were + * passed to the constructor. However, the values in embeddings, resultTypes, and + * resultLevels are consistent. + * + *

+ * + * @param phase the current phase of the algorithm + * @param start the start of the run of text being worked on + * @param limit the limit of the run of text being worked on + * @param paragraphEmbeddingLevel the paragraph embedding level + * @param initialTypes the original bidi types provided to the constructor + * @param embeddings the embeddings and override information resulting from explicit + * formatting codes + * @param resultTypes the current resolved bidi types + * @param resultLevels the current resolved levels (assuming the paragraph is a single line) + */ + public abstract void display( + int phase, + int start, + int limit, + byte paragraphEmbeddingLevel, + byte[] initialTypes, + byte[] embeddings, + byte[] resultTypes, + byte[] resultLevels); + + /** + * Display the results of processing line break information to generate line levels. + * + *

+ * + * @param paragraphEmbeddingLevel the paragraph embedding level + * @param initialTypes the original bidi types provided to the constructor + * @param embeddings the embeddings and override information resulting from explicit + * formatting codes + * @param linebreaks the array of positions where line breaks occur + * @param resolvedLevels the resolved levels before line processing is performed + * @param lineLevels the levels after line processing was performed + */ + public abstract void displayLineLevels( + byte paragraphEmbeddingLevel, + byte[] initialTypes, + byte[] embeddings, + int[] linebreaks, + byte[] resolvedLevels, + byte[] lineLevels); + + /** + * Display a message. + * + * @param msg the message text + */ + public abstract void message(String msg); + + /** The phase before any processing on the data bas been performed. */ + public static int PHASE_INIT = 0; + + /** The phase after the base paragraph level has been determined. */ + public static int PHASE_BASELEVEL = 1; + + /** + * The phase after explicit codes have been processed to generate the embedding information. + */ + public static int PHASE_EXPLICIT = 2; + + /** The phase after explicit codes and BN have been removed from the internal data. */ + public static int PHASE_EXPLICIT_REMOVED = 3; + + /** The phase after the weak rule processing has been performed. */ + public static int PHASE_WEAK = 4; + + /** The phase after the neutral rule processing has been performed. */ + public static int PHASE_NEUTRAL = 5; + + /** The phase after the implicit rule processing has been performed. */ + public static int PHASE_IMPLICIT = 6; + } - private static BidiTraceHook hook = null; // for tracking the algorithm + private static BidiTraceHook hook = null; // for tracking the algorithm - /** - * Set a trace hook so the progress of the algorithm can be monitored. 
- */ - public static synchronized void setTraceHook(BidiTraceHook hook) { - BidiReference.hook = hook; - } + /** Set a trace hook so the progress of the algorithm can be monitored. */ + public static synchronized void setTraceHook(BidiTraceHook hook) { + BidiReference.hook = hook; + } - /** - * Return the trace hook. - */ - public static BidiTraceHook getTraceHook() { - return hook; - } + /** Return the trace hook. */ + public static BidiTraceHook getTraceHook() { + return hook; + } - /** - * Call trace hook during major phases of algorithm. - */ - private void trace(int phase, int start, int limit) { - if (hook != null) { - hook.display(phase, start, limit, paragraphEmbeddingLevel, - initialTypes, embeddings, resultTypes, resultLevels); + /** Call trace hook during major phases of algorithm. */ + private void trace(int phase, int start, int limit) { + if (hook != null) { + hook.display( + phase, + start, + limit, + paragraphEmbeddingLevel, + initialTypes, + embeddings, + resultTypes, + resultLevels); + } } - } - - /** - * Call trace hook when computing line levels based on linebreaks. - */ - private void traceLineLevels(int[] linebreaks, byte[] lineLevels) { - if (hook != null) { - hook.displayLineLevels(paragraphEmbeddingLevel, initialTypes, embeddings, linebreaks, resultLevels, lineLevels); + + /** Call trace hook when computing line levels based on linebreaks. 
*/ + private void traceLineLevels(int[] linebreaks, byte[] lineLevels) { + if (hook != null) { + hook.displayLineLevels( + paragraphEmbeddingLevel, + initialTypes, + embeddings, + linebreaks, + resultLevels, + lineLevels); + } } - } - private void setRule(String rule) { - String[] anchor = rule.split("-"); - this.rule = "" + rule + ""; - } + private void setRule(String rule) { + String[] anchor = rule.split("-"); + this.rule = + "" + + rule + + ""; + } - public static String getHtmlTypename(int value) { - return "" + typenames[value] + ""; - } + public static String getHtmlTypename(int value) { + return "" + + typenames[value] + + ""; + } - private String getRule() { - return rule; - } + private String getRule() { + return rule; + } } - diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/BranchStringPrepData.java b/UnicodeJsps/src/main/java/org/unicode/jsp/BranchStringPrepData.java index a311bc835..97b45012a 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/BranchStringPrepData.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/BranchStringPrepData.java @@ -1,384 +1,402 @@ package org.unicode.jsp; -/** - * - */ - - +/** */ public class BranchStringPrepData { -// -// private static final boolean DEBUG = true; -// public static UnicodeSet U32 = new UnicodeSet("[:age=3.2:]").freeze(); -// public static UnicodeSet VALID_ASCII = new UnicodeSet("[\\u002Da-zA-Z0-9]").freeze(); -// -// -// /** -//3. Mapping -// This profile specifies mapping using the following tables from -// [STRINGPREP]: -// Table B.1 -// Table B.2 -//4. Normalization -// This profile specifies using Unicode normalization form KC, as -// described in [STRINGPREP]. -//5. 
Prohibited Output -// Table C.1.2 -// Table C.2.2 -// Table C.3 -// Table C.4 -// Table C.5 -// Table C.6 -// Table C.7 -// Table C.8 -// Table C.9 -// */ -// -// public static void getIdna2003Tables(UnicodeMap mappings, UnicodeMap types) { -// EnumSet allowed = EnumSet.of( -// Idna2003Table.B_1, -// Idna2003Table.B_2, -// Idna2003Table.C_1_2 -// , Idna2003Table.C_2_2 -// , Idna2003Table.C_3 -// , Idna2003Table.C_4 -// , Idna2003Table.C_5 -// , Idna2003Table.C_6 -// , Idna2003Table.C_7 -// , Idna2003Table.C_8 -// , Idna2003Table.C_9 -// ); -// for (int i = 0; i <= 0x10FFFF; ++i) { -// String mapping = getMapping(i, allowed); -// boolean isProhibited = mapping == null ? isProhibited(i, allowed) : isProhibited(mapping,allowed); -// IdnaType status; -// if (isProhibited || !U32.contains(i)) { -// status = IdnaType.disallowed; -// mapping = null; -// } else if (mapping == null) { -// status = IdnaType.valid; -// } else if (mapping.length() == 0) { -// status = IdnaType.ignored; -// } else { -// status = IdnaType.mapped; -// } -// mappings.put(i, mapping); -// types.put(i, status); -// } -// // special handling for separators -// mappings.putAll(IdnaTypes.OTHER_DOT_SET,"."); -// types.putAll(IdnaTypes.OTHER_DOT_SET,IdnaType.mapped); -// types.put('.',IdnaType.valid); -// -// mappings.freeze(); -// types.freeze(); -// } -// -// private static String getMapping(int cp, EnumSet allowed) { -// DataSet items = data.get(cp); -// String mapping = items == null ? null : items.mapping; -// String normalizedMapping = mapping != null ? 
Normalizer.normalize(mapping, Normalizer.NFKC) : Normalizer.normalize(cp, Normalizer.NFKC); -// if (UnicodeProperty.equals(cp, normalizedMapping)) { -// return null; -// } -// return normalizedMapping; -// } -// -// private static boolean isProhibited(int cp, EnumSet allowed) { -// DataSet items = data.get(cp); -// if (items != null) { -// return items.isProhibited; -// } -// return false; -// } -// -// private static boolean isProhibited(String string, EnumSet allowed) { -// int cp; -// for (int i = 0; i < string.length(); i += Character.charCount(cp)) { -// cp = string.codePointAt(i); -// if (isProhibited(cp, allowed)) { -// return true; -// } -// } -// return false; -// } -// -// enum Idna2003Table {none, A_1, B_1, B_2, B_3, C_1_1, C_1_2, C_2_1, C_2_2, C_3, C_4, C_5, C_6, C_7, C_8, C_9, D_1, D_2} -// -// static EnumSet PROHIBITED = EnumSet.range(Idna2003Table.C_1_1, Idna2003Table.C_9); -// static EnumSet MAPPING = EnumSet.range(Idna2003Table.B_1, Idna2003Table.B_3); -// -// -// /** -// A.1 Unassigned code points in Unicode 3.2 -// ----- Start Table A.1 ----- -// 0221 -// B.1 Commonly mapped to nothing -// ----- Start Table B.1 ----- -// 00AD; ; Map to nothing -// B.2 Mapping for case-folding used with NFKC -// ----- Start Table B.2 ----- -// 0041; 0061; Case map -// B.3 Mapping for case-folding used with no normalization -// ----- Start Table B.3 ----- -// 0041; 0061; Case map -// C.1.1 ASCII space characters -// ----- Start Table C.1.1 ----- -// 0020; SPACE -// C.1.2 Non-ASCII space characters -// ----- Start Table C.1.2 ----- -// 00A0; NO-BREAK SPACE -// C.2.1 ASCII control characters -// ----- Start Table C.2.1 ----- -// 0000-001F; [CONTROL CHARACTERS] -// C.2.2 Non-ASCII control characters -// ----- Start Table C.2.2 ----- -// 0080-009F; [CONTROL CHARACTERS] -// C.2.2 Non-ASCII control characters -// ----- Start Table C.2.2 ----- -// 0080-009F; [CONTROL CHARACTERS] -// C.3 Private use -// ----- Start Table C.3 ----- -// E000-F8FF; [PRIVATE USE, PLANE 0] -// 
C.4 Non-character code points -// ----- Start Table C.4 ----- -// FDD0-FDEF; [NONCHARACTER CODE POINTS] -// C.5 Surrogate codes -// ----- Start Table C.5 ----- -// D800-DFFF; [SURROGATE CODES] -// C.6 Inappropriate for plain text -// ----- Start Table C.6 ----- -// FFF9; INTERLINEAR ANNOTATION ANCHOR -// C.7 Inappropriate for canonical representation -// ----- Start Table C.7 ----- -// 2FF0-2FFB; [IDEOGRAPHIC DESCRIPTION CHARACTERS] -// C.8 Change display properties or are deprecated -// ----- Start Table C.8 ----- -// 0340; COMBINING GRAVE TONE MARK -// C.9 Tagging characters -// ----- Start Table C.9 ----- -// E0001; LANGUAGE TAG -// D.1 Characters with bidirectional property "R" or "AL" -// ----- Start Table D.1 ----- -// 05BE -// D.2 Characters with bidirectional property "L" -// ----- Start Table D.2 ----- -// 0041-005A -// */ -// -// static Pattern TABLE_DELIMITER = Pattern.compile("\\Q-----\\E\\s*(Start|End)\\s*Table\\s*(\\S+)\\s*\\Q-----\\E"); -// static Pattern MAP_LINE = Pattern.compile("([A-Z0-9]{4,6})" + -// "(?:-([A-Z0-9]{4,6}))?" + -// "(?:\\s*;\\s*((?:[A-Z0-9]{4,6}\\s*)*))?" + -// "(?:\\s*;\\s*.*)?"); -// static Pattern SET_LINE = Pattern.compile("([A-Z0-9]{4,6})" + -// "(?:-([A-Z0-9]{4,6}))?" 
+ -// "(?:\\s*;\\s*.*)?"); -// -// static class DataSet { -// final boolean isProhibited; -// final String mapping; -// final String comment; -// -// private DataSet(boolean isProhibited2, String mapping2, String comment2) { -// isProhibited = isProhibited2; -// mapping = mapping2; -// comment = comment2; -// } -// -// public DataSet add(boolean myisProhibited, String mymapping, String mycomment) { -// // now merge -// if (isProhibited) { -// myisProhibited = true; -// } -// if (mymapping == null) { -// mymapping = mapping; -// } else if (mapping != null && !mymapping.equals(mapping)) { -// throw new IllegalArgumentException("Conflicting mapping " + Utility.hex(mapping) + ", " + Utility.hex(mymapping)); -// } -// if (mycomment == null) { -// mycomment = comment; -// } else if (comment != null) { -// mycomment = comment + "\n" + mycomment; -// } -// return new DataSet(myisProhibited, mymapping, mycomment); -// } -// /** -// * If there is a mapping, use the mapping to set the prohibited bit. 
-// * @param codepoint -// * @param data -// * @return stuff -// */ -// public DataSet fix(int codepoint, UnicodeMap data) { -// if (mapping != null) { -// boolean newIsProhibited = false; -// int cp; -// for (int i = 0; i < mapping.length(); i += Character.charCount(cp)) { -// cp = mapping.codePointAt(i); -// DataSet other = data.get(i); -// if (other.mapping != null) { -// throw new IllegalArgumentException("Recursive Mapping"); -// } -// if (other.isProhibited) { -// newIsProhibited = true; -// } -// } -// DataSet newDataSet = new DataSet(newIsProhibited, mapping, comment); -// if (DEBUG) System.out.println("Changing value for " + Utility.hex(codepoint) + ":\t[" + this + "] => [" + newDataSet + "]"); -// return newDataSet; -// } -// return null; -// } -// -// public boolean equals(Object other) { -// DataSet that = (DataSet) other; -// return isProhibited == that.isProhibited -// && UnicodeProperty.equals(mapping, that.mapping) -// && UnicodeProperty.equals(comment, that.comment); -// } -// public int hashCode() { -// return (isProhibited ? 1 : 0) ^ (mapping == null ? 
0 : mapping.hashCode()); -// } -// public String toString() { -// return isProhibited + ", " + Utility.hex(mapping) + ", " + comment; -// } -// } -// -// -// private static final UnicodeMap data; -// -// static { -// data = new UnicodeMap(); -// try { -// //UnicodeMap> rawMapping = new UnicodeMap>(); -// -// Matcher tableDelimiter = TABLE_DELIMITER.matcher(""); -// Matcher mapLine = MAP_LINE.matcher(""); -// Matcher setLine = SET_LINE.matcher(""); -// BufferedReader in = FileUtilities.openFile(StringPrepData.class, "nameprep.txt"); -// //BufferedReader in = BagFormatter.openUTF8Reader(UCD_Types.BASE_DIR + "idna/", "nameprep.txt"); -// StringPrepData.Idna2003Table table = null; -// boolean inTable = false; -// boolean isMapping = false; -// for (int count = 1; ; ++count) { -// String line = in.readLine(); -// if (line == null) break; -// line = line.trim(); -// if (line.length() == 0 || line.startsWith("Hoffman") || line.startsWith("RFC")) continue; -// if (line.startsWith("-----")) { -// if (!tableDelimiter.reset(line).matches()) { -// throw new IllegalArgumentException("Bad syntax: " + line); -// } -// inTable = tableDelimiter.group(1).equals("Start"); -// StringPrepData.Idna2003Table newTable = Idna2003Table.valueOf(tableDelimiter.group(2).replace(".","_")); -// if (inTable) { -// if (table != null) { -// throw new IllegalArgumentException("Table not terminated: " + table + "; " + line); -// } -// table = newTable; -// if (DEBUG) System.out.println(count + ")\t*** New Table: " + table); -// isMapping = newTable.toString().startsWith("B"); -// } else { -// if (newTable != table) { -// throw new IllegalArgumentException("Bad table end: " + newTable + " != " + table + "; " + line); -// } -// table = null; -// isMapping = false; -// } -// continue; -// } -// if (!inTable) { -// if (DEBUG) System.out.println(count + ")\tIgnoring: " + line); -// continue; -// } -// // if (!allowed.contains(table)) { -// // if (DEBUG) System.out.println(count + ")\t" + table + 
"\tSKIPPING line:\t" + line); -// // continue; -// // } else { -// // if (DEBUG) System.out.println(count + ")\t" + table + "\tDoing line:\t" + line); -// // } -// Matcher lineMatcher = isMapping ? mapLine : setLine; -// if (!lineMatcher.reset(line).matches()) { -// throw new IllegalArgumentException("Illegal range-value syntax: " + line); -// } -// int startCode = Utility.fromHex(lineMatcher.group(1),4," ").codePointAt(0); -// String endCodeString = lineMatcher.groupCount() < 2 ? null : lineMatcher.group(2); -// String group3 = lineMatcher.groupCount() < 3 ? null : lineMatcher.group(3); -// String group4 = lineMatcher.groupCount() < 4 ? null : lineMatcher.group(4); -// int endCode = endCodeString == null ? startCode : Utility.fromHex(endCodeString,4," ").codePointAt(0); -// String comment, mapValueString; -// if (isMapping) { -// comment = group4; -// try { -// mapValueString = group3.length() == 0 ? "" : Utility.fromHex(group3,4," "); -// } catch (RuntimeException e) { -// throw e; -// } -// } else { -// comment = group3; -// mapValueString = null; -// } -// if (DEBUG) System.out.println(count + ")\t" + line + ":\t" + Utility.hex(startCode) -// + (startCode == endCode ? "" : ".." + Utility.hex(endCode)) -// + ",\t" + table -// + ",\t" + (mapValueString == null ? 
"null" : Utility.hex(mapValueString)) -// ); -// -// addMapping(startCode, endCode, table, (String)mapValueString, (String)comment); -// } -// in.close(); -// } catch (IOException e) { -// throw new IllegalArgumentException(e); -// } -// -// // fix ASCII -// -// addMapping(0, 0x7F, Idna2003Table.C_9, (String)null, (String)null); -// for (UnicodeSetIterator it = new UnicodeSetIterator(VALID_ASCII); it.next();) { -// addMapping(0, 0x7F, null, null, null); -// } -// -// //rawMapping.putAll(VALID_ASCII, null); -// -// for (int i = 'A'; i <= 'Z'; ++i) { -// R3 alphaMap = Row.of(Idna2003Table.B_1, UTF16.valueOf(i-'A'+'a'), (String)null); -// DataSet tableSet = data.get(i); -// if (tableSet == null) { -// tableSet = new DataSet(PROHIBITED.contains(Idna2003Table.B_1), UTF16.valueOf(i-'A'+'a'), (String)null); -// } else { -// tableSet = tableSet.add(PROHIBITED.contains(Idna2003Table.B_1), UTF16.valueOf(i-'A'+'a'), (String)null); -// } -// data.put(i, tableSet); -// } -// for (String i : data.keySet()) { -// DataSet dataSet = data.get(i); -// DataSet fixed = dataSet.fix(i.codePointAt(0), data); -// if (fixed != null) { -// data.put(i, fixed); -// } -// } -// data.freeze(); -// } -// -// private static void addMapping(int startCode, int endCode, StringPrepData.Idna2003Table type, String mapping, String comment) { -// for (int i = startCode; i <= endCode; ++i) { -// addData(i, type, mapping, comment); -// } -// } -// -// private static void addData(int i, StringPrepData.Idna2003Table type, String mapping, String comment) { -// try { -// if (i == 0x200c) { -// System.out.print(""); -// } -// DataSet tableSet = data.get(i); -// if (tableSet == null) { -// tableSet = new DataSet(PROHIBITED.contains(type), mapping, comment); -// } else { -// tableSet = tableSet.add(PROHIBITED.contains(type), mapping, comment); -// } -// data.put(i, tableSet); -// } catch (RuntimeException e) { -// throw new IllegalArgumentException("Failure with " + Utility.hex(i), e); -// } -// } -} \ No newline 
at end of file + // + // private static final boolean DEBUG = true; + // public static UnicodeSet U32 = new UnicodeSet("[:age=3.2:]").freeze(); + // public static UnicodeSet VALID_ASCII = new UnicodeSet("[\\u002Da-zA-Z0-9]").freeze(); + // + // + // /** + // 3. Mapping + // This profile specifies mapping using the following tables from + // [STRINGPREP]: + // Table B.1 + // Table B.2 + // 4. Normalization + // This profile specifies using Unicode normalization form KC, as + // described in [STRINGPREP]. + // 5. Prohibited Output + // Table C.1.2 + // Table C.2.2 + // Table C.3 + // Table C.4 + // Table C.5 + // Table C.6 + // Table C.7 + // Table C.8 + // Table C.9 + // */ + // + // public static void getIdna2003Tables(UnicodeMap mappings, UnicodeMap + // types) { + // EnumSet allowed = EnumSet.of( + // Idna2003Table.B_1, + // Idna2003Table.B_2, + // Idna2003Table.C_1_2 + // , Idna2003Table.C_2_2 + // , Idna2003Table.C_3 + // , Idna2003Table.C_4 + // , Idna2003Table.C_5 + // , Idna2003Table.C_6 + // , Idna2003Table.C_7 + // , Idna2003Table.C_8 + // , Idna2003Table.C_9 + // ); + // for (int i = 0; i <= 0x10FFFF; ++i) { + // String mapping = getMapping(i, allowed); + // boolean isProhibited = mapping == null ? 
isProhibited(i, allowed) : + // isProhibited(mapping,allowed); + // IdnaType status; + // if (isProhibited || !U32.contains(i)) { + // status = IdnaType.disallowed; + // mapping = null; + // } else if (mapping == null) { + // status = IdnaType.valid; + // } else if (mapping.length() == 0) { + // status = IdnaType.ignored; + // } else { + // status = IdnaType.mapped; + // } + // mappings.put(i, mapping); + // types.put(i, status); + // } + // // special handling for separators + // mappings.putAll(IdnaTypes.OTHER_DOT_SET,"."); + // types.putAll(IdnaTypes.OTHER_DOT_SET,IdnaType.mapped); + // types.put('.',IdnaType.valid); + // + // mappings.freeze(); + // types.freeze(); + // } + // + // private static String getMapping(int cp, EnumSet allowed) { + // DataSet items = data.get(cp); + // String mapping = items == null ? null : items.mapping; + // String normalizedMapping = mapping != null ? Normalizer.normalize(mapping, + // Normalizer.NFKC) : Normalizer.normalize(cp, Normalizer.NFKC); + // if (UnicodeProperty.equals(cp, normalizedMapping)) { + // return null; + // } + // return normalizedMapping; + // } + // + // private static boolean isProhibited(int cp, EnumSet allowed) { + // DataSet items = data.get(cp); + // if (items != null) { + // return items.isProhibited; + // } + // return false; + // } + // + // private static boolean isProhibited(String string, EnumSet allowed) { + // int cp; + // for (int i = 0; i < string.length(); i += Character.charCount(cp)) { + // cp = string.codePointAt(i); + // if (isProhibited(cp, allowed)) { + // return true; + // } + // } + // return false; + // } + // + // enum Idna2003Table {none, A_1, B_1, B_2, B_3, C_1_1, C_1_2, C_2_1, C_2_2, C_3, C_4, C_5, + // C_6, C_7, C_8, C_9, D_1, D_2} + // + // static EnumSet PROHIBITED = EnumSet.range(Idna2003Table.C_1_1, + // Idna2003Table.C_9); + // static EnumSet MAPPING = EnumSet.range(Idna2003Table.B_1, Idna2003Table.B_3); + // + // + // /** + // A.1 Unassigned code points in Unicode 3.2 + // 
----- Start Table A.1 ----- + // 0221 + // B.1 Commonly mapped to nothing + // ----- Start Table B.1 ----- + // 00AD; ; Map to nothing + // B.2 Mapping for case-folding used with NFKC + // ----- Start Table B.2 ----- + // 0041; 0061; Case map + // B.3 Mapping for case-folding used with no normalization + // ----- Start Table B.3 ----- + // 0041; 0061; Case map + // C.1.1 ASCII space characters + // ----- Start Table C.1.1 ----- + // 0020; SPACE + // C.1.2 Non-ASCII space characters + // ----- Start Table C.1.2 ----- + // 00A0; NO-BREAK SPACE + // C.2.1 ASCII control characters + // ----- Start Table C.2.1 ----- + // 0000-001F; [CONTROL CHARACTERS] + // C.2.2 Non-ASCII control characters + // ----- Start Table C.2.2 ----- + // 0080-009F; [CONTROL CHARACTERS] + // C.2.2 Non-ASCII control characters + // ----- Start Table C.2.2 ----- + // 0080-009F; [CONTROL CHARACTERS] + // C.3 Private use + // ----- Start Table C.3 ----- + // E000-F8FF; [PRIVATE USE, PLANE 0] + // C.4 Non-character code points + // ----- Start Table C.4 ----- + // FDD0-FDEF; [NONCHARACTER CODE POINTS] + // C.5 Surrogate codes + // ----- Start Table C.5 ----- + // D800-DFFF; [SURROGATE CODES] + // C.6 Inappropriate for plain text + // ----- Start Table C.6 ----- + // FFF9; INTERLINEAR ANNOTATION ANCHOR + // C.7 Inappropriate for canonical representation + // ----- Start Table C.7 ----- + // 2FF0-2FFB; [IDEOGRAPHIC DESCRIPTION CHARACTERS] + // C.8 Change display properties or are deprecated + // ----- Start Table C.8 ----- + // 0340; COMBINING GRAVE TONE MARK + // C.9 Tagging characters + // ----- Start Table C.9 ----- + // E0001; LANGUAGE TAG + // D.1 Characters with bidirectional property "R" or "AL" + // ----- Start Table D.1 ----- + // 05BE + // D.2 Characters with bidirectional property "L" + // ----- Start Table D.2 ----- + // 0041-005A + // */ + // + // static Pattern TABLE_DELIMITER = + // Pattern.compile("\\Q-----\\E\\s*(Start|End)\\s*Table\\s*(\\S+)\\s*\\Q-----\\E"); + // static Pattern 
MAP_LINE = Pattern.compile("([A-Z0-9]{4,6})" + + // "(?:-([A-Z0-9]{4,6}))?" + + // "(?:\\s*;\\s*((?:[A-Z0-9]{4,6}\\s*)*))?" + + // "(?:\\s*;\\s*.*)?"); + // static Pattern SET_LINE = Pattern.compile("([A-Z0-9]{4,6})" + + // "(?:-([A-Z0-9]{4,6}))?" + + // "(?:\\s*;\\s*.*)?"); + // + // static class DataSet { + // final boolean isProhibited; + // final String mapping; + // final String comment; + // + // private DataSet(boolean isProhibited2, String mapping2, String comment2) { + // isProhibited = isProhibited2; + // mapping = mapping2; + // comment = comment2; + // } + // + // public DataSet add(boolean myisProhibited, String mymapping, String mycomment) { + // // now merge + // if (isProhibited) { + // myisProhibited = true; + // } + // if (mymapping == null) { + // mymapping = mapping; + // } else if (mapping != null && !mymapping.equals(mapping)) { + // throw new IllegalArgumentException("Conflicting mapping " + Utility.hex(mapping) + ", + // " + Utility.hex(mymapping)); + // } + // if (mycomment == null) { + // mycomment = comment; + // } else if (comment != null) { + // mycomment = comment + "\n" + mycomment; + // } + // return new DataSet(myisProhibited, mymapping, mycomment); + // } + // /** + // * If there is a mapping, use the mapping to set the prohibited bit. 
+ // * @param codepoint + // * @param data + // * @return stuff + // */ + // public DataSet fix(int codepoint, UnicodeMap data) { + // if (mapping != null) { + // boolean newIsProhibited = false; + // int cp; + // for (int i = 0; i < mapping.length(); i += Character.charCount(cp)) { + // cp = mapping.codePointAt(i); + // DataSet other = data.get(i); + // if (other.mapping != null) { + // throw new IllegalArgumentException("Recursive Mapping"); + // } + // if (other.isProhibited) { + // newIsProhibited = true; + // } + // } + // DataSet newDataSet = new DataSet(newIsProhibited, mapping, comment); + // if (DEBUG) System.out.println("Changing value for " + Utility.hex(codepoint) + ":\t[" + // + this + "] => [" + newDataSet + "]"); + // return newDataSet; + // } + // return null; + // } + // + // public boolean equals(Object other) { + // DataSet that = (DataSet) other; + // return isProhibited == that.isProhibited + // && UnicodeProperty.equals(mapping, that.mapping) + // && UnicodeProperty.equals(comment, that.comment); + // } + // public int hashCode() { + // return (isProhibited ? 1 : 0) ^ (mapping == null ? 
0 : mapping.hashCode()); + // } + // public String toString() { + // return isProhibited + ", " + Utility.hex(mapping) + ", " + comment; + // } + // } + // + // + // private static final UnicodeMap data; + // + // static { + // data = new UnicodeMap(); + // try { + // //UnicodeMap> rawMapping = new + // UnicodeMap>(); + // + // Matcher tableDelimiter = TABLE_DELIMITER.matcher(""); + // Matcher mapLine = MAP_LINE.matcher(""); + // Matcher setLine = SET_LINE.matcher(""); + // BufferedReader in = FileUtilities.openFile(StringPrepData.class, "nameprep.txt"); + // //BufferedReader in = BagFormatter.openUTF8Reader(UCD_Types.BASE_DIR + "idna/", + // "nameprep.txt"); + // StringPrepData.Idna2003Table table = null; + // boolean inTable = false; + // boolean isMapping = false; + // for (int count = 1; ; ++count) { + // String line = in.readLine(); + // if (line == null) break; + // line = line.trim(); + // if (line.length() == 0 || line.startsWith("Hoffman") || line.startsWith("RFC")) + // continue; + // if (line.startsWith("-----")) { + // if (!tableDelimiter.reset(line).matches()) { + // throw new IllegalArgumentException("Bad syntax: " + line); + // } + // inTable = tableDelimiter.group(1).equals("Start"); + // StringPrepData.Idna2003Table newTable = + // Idna2003Table.valueOf(tableDelimiter.group(2).replace(".","_")); + // if (inTable) { + // if (table != null) { + // throw new IllegalArgumentException("Table not terminated: " + table + "; " + + // line); + // } + // table = newTable; + // if (DEBUG) System.out.println(count + ")\t*** New Table: " + table); + // isMapping = newTable.toString().startsWith("B"); + // } else { + // if (newTable != table) { + // throw new IllegalArgumentException("Bad table end: " + newTable + " != " + + // table + "; " + line); + // } + // table = null; + // isMapping = false; + // } + // continue; + // } + // if (!inTable) { + // if (DEBUG) System.out.println(count + ")\tIgnoring: " + line); + // continue; + // } + // // if 
(!allowed.contains(table)) { + // // if (DEBUG) System.out.println(count + ")\t" + table + "\tSKIPPING line:\t" + // + line); + // // continue; + // // } else { + // // if (DEBUG) System.out.println(count + ")\t" + table + "\tDoing line:\t" + + // line); + // // } + // Matcher lineMatcher = isMapping ? mapLine : setLine; + // if (!lineMatcher.reset(line).matches()) { + // throw new IllegalArgumentException("Illegal range-value syntax: " + line); + // } + // int startCode = Utility.fromHex(lineMatcher.group(1),4," ").codePointAt(0); + // String endCodeString = lineMatcher.groupCount() < 2 ? null : lineMatcher.group(2); + // String group3 = lineMatcher.groupCount() < 3 ? null : lineMatcher.group(3); + // String group4 = lineMatcher.groupCount() < 4 ? null : lineMatcher.group(4); + // int endCode = endCodeString == null ? startCode : Utility.fromHex(endCodeString,4," + // ").codePointAt(0); + // String comment, mapValueString; + // if (isMapping) { + // comment = group4; + // try { + // mapValueString = group3.length() == 0 ? "" : Utility.fromHex(group3,4," "); + // } catch (RuntimeException e) { + // throw e; + // } + // } else { + // comment = group3; + // mapValueString = null; + // } + // if (DEBUG) System.out.println(count + ")\t" + line + ":\t" + Utility.hex(startCode) + // + (startCode == endCode ? "" : ".." + Utility.hex(endCode)) + // + ",\t" + table + // + ",\t" + (mapValueString == null ? 
"null" : Utility.hex(mapValueString)) + // ); + // + // addMapping(startCode, endCode, table, (String)mapValueString, (String)comment); + // } + // in.close(); + // } catch (IOException e) { + // throw new IllegalArgumentException(e); + // } + // + // // fix ASCII + // + // addMapping(0, 0x7F, Idna2003Table.C_9, (String)null, (String)null); + // for (UnicodeSetIterator it = new UnicodeSetIterator(VALID_ASCII); it.next();) { + // addMapping(0, 0x7F, null, null, null); + // } + // + // //rawMapping.putAll(VALID_ASCII, null); + // + // for (int i = 'A'; i <= 'Z'; ++i) { + // R3 alphaMap = Row.of(Idna2003Table.B_1, + // UTF16.valueOf(i-'A'+'a'), (String)null); + // DataSet tableSet = data.get(i); + // if (tableSet == null) { + // tableSet = new DataSet(PROHIBITED.contains(Idna2003Table.B_1), + // UTF16.valueOf(i-'A'+'a'), (String)null); + // } else { + // tableSet = tableSet.add(PROHIBITED.contains(Idna2003Table.B_1), + // UTF16.valueOf(i-'A'+'a'), (String)null); + // } + // data.put(i, tableSet); + // } + // for (String i : data.keySet()) { + // DataSet dataSet = data.get(i); + // DataSet fixed = dataSet.fix(i.codePointAt(0), data); + // if (fixed != null) { + // data.put(i, fixed); + // } + // } + // data.freeze(); + // } + // + // private static void addMapping(int startCode, int endCode, StringPrepData.Idna2003Table + // type, String mapping, String comment) { + // for (int i = startCode; i <= endCode; ++i) { + // addData(i, type, mapping, comment); + // } + // } + // + // private static void addData(int i, StringPrepData.Idna2003Table type, String mapping, String + // comment) { + // try { + // if (i == 0x200c) { + // System.out.print(""); + // } + // DataSet tableSet = data.get(i); + // if (tableSet == null) { + // tableSet = new DataSet(PROHIBITED.contains(type), mapping, comment); + // } else { + // tableSet = tableSet.add(PROHIBITED.contains(type), mapping, comment); + // } + // data.put(i, tableSet); + // } catch (RuntimeException e) { + // throw new 
IllegalArgumentException("Failure with " + Utility.hex(i), e); + // } + // } +} diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/Builder.java b/UnicodeJsps/src/main/java/org/unicode/jsp/Builder.java index a52b36286..6349b030d 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/Builder.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/Builder.java @@ -12,9 +12,10 @@ import java.util.SortedSet; /** - * Convenience class for building collections and maps. Allows them to be built by chaining, making it simpler to - * set as parameters and fields. Also supplies some operations that are missing on the JDK maps and collections, - * and provides finer control for what happens with equal elements. + * Convenience class for building collections and maps. Allows them to be built by chaining, making + * it simpler to set as parameters and fields. Also supplies some operations that are missing on the + * JDK maps and collections, and provides finer control for what happens with equal elements. + * *

  * Operations: A is current contents, B is new collection, x indicates the results
  * A-B   A&B    B-A   Name
@@ -25,29 +26,35 @@
  * x      x           
  *        x      x    clear().addAll(B)
  * x             x    xor(B)
- * x      x      x    addAll(B)          
+ * x      x      x    addAll(B)
  * 
+ * * @author markdavis */ // TODO add other Iterable public final class Builder { - public enum EqualAction {NATIVE, REPLACE, RETAIN, THROW} + public enum EqualAction { + NATIVE, + REPLACE, + RETAIN, + THROW + } - public static > CBuilder with(C collection, EqualAction ea) { - return new CBuilder(collection, ea); + public static > CBuilder with(C collection, EqualAction ea) { + return new CBuilder(collection, ea); } - public static > CBuilder with(C collection) { - return new CBuilder(collection, EqualAction.NATIVE); + public static > CBuilder with(C collection) { + return new CBuilder(collection, EqualAction.NATIVE); } - public static > MBuilder with(M map, EqualAction ea) { - return new MBuilder(map, ea); + public static > MBuilder with(M map, EqualAction ea) { + return new MBuilder(map, ea); } - public static > MBuilder with(M map) { - return new MBuilder(map, EqualAction.NATIVE); + public static > MBuilder with(M map) { + return new MBuilder(map, EqualAction.NATIVE); } // ===== Collections ====== @@ -56,6 +63,7 @@ public static final class CBuilder> { public EqualAction getEqualAction() { return equalAction; } + public CBuilder setEqualAction(EqualAction equalAction) { this.equalAction = equalAction; return this; @@ -66,28 +74,28 @@ public CBuilder clear() { return this; } - public CBuilder add(E e) { + public CBuilder add(E e) { switch (equalAction) { - case NATIVE: - break; - case REPLACE: - collection.remove(e); - break; - case RETAIN: - if (collection.contains(e)) { - return this; - } - break; - case THROW: - if (collection.contains(e)) { - throw new IllegalArgumentException("Map already contains " + e); - } + case NATIVE: + break; + case REPLACE: + collection.remove(e); + break; + case RETAIN: + if (collection.contains(e)) { + return this; + } + break; + case THROW: + if (collection.contains(e)) { + throw new IllegalArgumentException("Map already contains " + e); + } } collection.add(e); return this; } - public CBuilder addAll(Collection c) { + public 
CBuilder addAll(Collection c) { if (equalAction == EqualAction.REPLACE) { collection.addAll(c); } else { @@ -117,36 +125,36 @@ public CBuilder remove(E o) { return this; } - public CBuilder removeAll(Collection c) { + public CBuilder removeAll(Collection c) { collection.removeAll(c); return this; } - public CBuilder removeAll(E... items) { + public CBuilder removeAll(E... items) { for (E item : items) { collection.remove(item); } return this; } - public CBuilder removeAll(Iterable items) { + public CBuilder removeAll(Iterable items) { for (E item : items) { collection.remove(item); } return this; } - public CBuilder retainAll(Collection c) { + public CBuilder retainAll(Collection c) { collection.retainAll(c); return this; } - public CBuilder retainAll(E... items) { + public CBuilder retainAll(E... items) { collection.retainAll(Arrays.asList(items)); return this; } - public CBuilder xor(Collection c) { + public CBuilder xor(Collection c) { for (E item : c) { boolean changed = collection.remove(item); if (!changed) { @@ -156,11 +164,11 @@ public CBuilder xor(Collection c) { return this; } - public CBuilder xor(E... items) { + public CBuilder xor(E... items) { return xor(Arrays.asList(items)); } - public CBuilder keepNew(Collection c) { + public CBuilder keepNew(Collection c) { HashSet extras = new HashSet(c); extras.removeAll(collection); collection.clear(); @@ -168,7 +176,7 @@ public CBuilder keepNew(Collection c) { return this; } - public CBuilder keepNew(E... items) { + public CBuilder keepNew(E... 
items) { return keepNew(Arrays.asList(items)); } @@ -182,13 +190,13 @@ public U get() { public U freeze() { U temp; if (collection instanceof SortedSet) { - temp = (U)Collections.unmodifiableSortedSet((SortedSet) collection); + temp = (U) Collections.unmodifiableSortedSet((SortedSet) collection); } else if (collection instanceof Set) { - temp = (U)Collections.unmodifiableSet((Set) collection); + temp = (U) Collections.unmodifiableSet((Set) collection); } else if (collection instanceof List) { - temp = (U)Collections.unmodifiableList((List) collection); + temp = (U) Collections.unmodifiableList((List) collection); } else { - temp = (U)Collections.unmodifiableCollection(collection); + temp = (U) Collections.unmodifiableCollection(collection); } collection = null; return temp; @@ -204,17 +212,19 @@ private CBuilder(U set2, EqualAction ea) { this.collection = set2; equalAction = ea; } + private U collection; private EqualAction equalAction; } // ===== Maps ====== - public static final class MBuilder> { + public static final class MBuilder> { public EqualAction getEqualAction() { return equalAction; } + public MBuilder setEqualAction(EqualAction equalAction) { this.equalAction = equalAction; return this; @@ -224,22 +234,23 @@ public MBuilder clear() { map.clear(); return this; } + public MBuilder put(K key, V value) { switch (equalAction) { - case NATIVE: - break; - case REPLACE: - map.remove(key); - break; - case RETAIN: - if (map.containsKey(key)) { - return this; - } - break; - case THROW: - if (map.containsKey(key)) { - throw new IllegalArgumentException("Map already contains " + key); - } + case NATIVE: + break; + case REPLACE: + map.remove(key); + break; + case RETAIN: + if (map.containsKey(key)) { + return this; + } + break; + case THROW: + if (map.containsKey(key)) { + throw new IllegalArgumentException("Map already contains " + key); + } } map.put(key, value); return this; @@ -300,7 +311,7 @@ public MBuilder putAll(Map m) { public MBuilder putAll(Object[][] 
data) { for (Object[] key : data) { - put((K)key[0], (V)key[1]); + put((K) key[0], (V) key[1]); } keys = null; return this; @@ -315,6 +326,7 @@ public MBuilder removeAll(Collection keys) { map.keySet().removeAll(keys); return this; } + public MBuilder removeAll(K... keys) { return removeAll(Arrays.asList(keys)); } @@ -323,11 +335,12 @@ public MBuilder retainAll(Collection keys) { map.keySet().retainAll(keys); return this; } + public MBuilder retainAll(K... keys) { return retainAll(Arrays.asList(keys)); } - public > MBuilder xor(N c) { + public > MBuilder xor(N c) { for (K item : c.keySet()) { if (map.containsKey(item)) { map.remove(item); @@ -338,7 +351,7 @@ public > MBuilder xor(N c) { return this; } - public > MBuilder keepNew(N c) { + public > MBuilder keepNew(N c) { HashSet extras = new HashSet(c.keySet()); extras.removeAll(map.keySet()); map.clear(); @@ -357,10 +370,10 @@ public M get() { @SuppressWarnings("unchecked") public M freeze() { M temp; - if (map instanceof SortedMap) { - temp = (M)Collections.unmodifiableSortedMap((SortedMap) map); + if (map instanceof SortedMap) { + temp = (M) Collections.unmodifiableSortedMap((SortedMap) map); } else { - temp = (M)Collections.unmodifiableMap((Map) map); + temp = (M) Collections.unmodifiableMap((Map) map); } map = null; return temp; diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/CachedProps.java b/UnicodeJsps/src/main/java/org/unicode/jsp/CachedProps.java index f3bc4ce28..ced9fe206 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/CachedProps.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/CachedProps.java @@ -1,5 +1,12 @@ package org.unicode.jsp; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMultimap; +import com.google.common.collect.Multimap; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.util.ICUUncheckedIOException; +import com.ibm.icu.util.VersionInfo; import java.io.DataInputStream; import 
java.io.File; import java.io.IOException; @@ -9,7 +16,6 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; @@ -17,19 +23,8 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.zip.GZIPInputStream; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.jsp.UnicodeDataInput.ItemReader; - -import com.google.common.base.Splitter; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableMultimap; -import com.google.common.collect.Multimap; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.util.ICUUncheckedIOException; -import com.ibm.icu.util.VersionInfo; - import org.unicode.props.UnicodeProperty; public class CachedProps { @@ -38,13 +33,15 @@ public class CachedProps { public static final Splitter HASH_SPLITTER = Splitter.on('#').trimResults(); public static final Splitter SEMI_SPLITTER = Splitter.on(';').trimResults(); - static ConcurrentHashMap versionToCachedProps = new ConcurrentHashMap(); + static ConcurrentHashMap versionToCachedProps = + new ConcurrentHashMap(); public final VersionInfo version; final Set propNames; - final ConcurrentHashMap propertyCache = new ConcurrentHashMap(); - final BiMultimap nameToAliases = new BiMultimap(null,null); - final Map> nameToValueToAliases = new LinkedHashMap(); + final ConcurrentHashMap propertyCache = + new ConcurrentHashMap(); + final BiMultimap nameToAliases = new BiMultimap(null, null); + final Map> nameToValueToAliases = new LinkedHashMap(); static CachedProps CACHED_PROPS = getInstance(VersionInfo.getInstance(14)); @@ -56,7 +53,7 @@ private CachedProps(VersionInfo version2) { LinkedHashSet temp = new LinkedHashSet(); for (String filename : dir.list()) { if (filename.endsWith(".bin")) { - temp.add(filename.substring(0, 
filename.length()-4)); + temp.add(filename.substring(0, filename.length() - 4)); } } @@ -73,9 +70,11 @@ private CachedProps(VersionInfo version2) { nameToAliases.putAll(name, nameAliases); } } - // AHex; Y ; Yes ; T ; True + // AHex; Y ; Yes ; T + // ; True // ccc; 0; NR ; Not_Reordered - for (String fileName : Arrays.asList("PropertyValueAliases.txt", "ExtraPropertyValueAliases.txt")) { + for (String fileName : + Arrays.asList("PropertyValueAliases.txt", "ExtraPropertyValueAliases.txt")) { for (String line : FileUtilities.in(CachedProps.class, "data/" + fileName)) { List splitLine = breakLine(line); if (splitLine == null) { @@ -86,7 +85,8 @@ private CachedProps(VersionInfo version2) { String longName = names.iterator().next(); BiMultimap valueToAliases = nameToValueToAliases.get(longName); if (valueToAliases == null) { - nameToValueToAliases.put(longName, valueToAliases = new BiMultimap(null,null)); + nameToValueToAliases.put( + longName, valueToAliases = new BiMultimap(null, null)); } List aliases = splitLine.subList(1, splitLine.size()); for (String item : aliases) { @@ -110,7 +110,6 @@ private List breakLine(String line) { return splitLine; } - public static CachedProps getInstance(VersionInfo version) { CachedProps result = versionToCachedProps.get(version); if (result == null) { @@ -130,7 +129,11 @@ public UnicodeProperty getProperty(String propName) { result = null; } else { try { - return new DelayedUnicodeProperty(version, propName, nameToAliases.getValues(propName), nameToValueToAliases.get(propName)); + return new DelayedUnicodeProperty( + version, + propName, + nameToAliases.getValues(propName), + nameToValueToAliases.get(propName)); } catch (Exception e) { throw new IllegalArgumentException(propName, e); } @@ -144,9 +147,11 @@ class DelayedUnicodeProperty extends UnicodeProperty { private final VersionInfo version; private UnicodeMap map; private List nameAliases; - private Multimap valueToAliases; + private Multimap valueToAliases; - public 
DelayedUnicodeProperty(VersionInfo version, String propName, + public DelayedUnicodeProperty( + VersionInfo version, + String propName, Collection nameAliases, BiMultimap biMultimap) { this.version = version; @@ -161,7 +166,10 @@ public DelayedUnicodeProperty(VersionInfo version, String propName, temp = nameAliases; } this.nameAliases = ImmutableList.copyOf(temp); - this.valueToAliases = biMultimap == null ? null : ImmutableMultimap.copyOf(biMultimap.getKeyToValues()); + this.valueToAliases = + biMultimap == null + ? null + : ImmutableMultimap.copyOf(biMultimap.getKeyToValues()); setName(propName); } @@ -212,7 +220,7 @@ private UnicodeMap getMap() { try { String baseName = getName(); if (baseName.endsWith("β")) { - baseName = baseName.substring(0, baseName.length()-1); + baseName = baseName.substring(0, baseName.length() - 1); } fis = CachedProps.class.getResourceAsStream("props/" + baseName + ".bin"); gs = new GZIPInputStream(fis); @@ -222,7 +230,8 @@ private UnicodeMap getMap() { final UnicodeDataInput unicodeDataInput = new UnicodeDataInput(); newItem = unicodeDataInput.set(in, true).readUnicodeMap(stringReader); map = newItem.freeze(); - } catch (Exception e) { } + } catch (Exception e) { + } try { if (fis != null) { fis.close(); @@ -233,7 +242,8 @@ private UnicodeMap getMap() { } } } - } catch (IOException e) {} + } catch (IOException e) { + } } return map; } @@ -257,18 +267,19 @@ public static void main(String[] args) { System.out.println(available); for (String name : available) { UnicodeProperty p = cp.getProperty(name); - System.out.println(p.getName() + "\t" + p.getNameAliases() + "\t" + clip(p.getAvailableValues())); + System.out.println( + p.getName() + "\t" + p.getNameAliases() + "\t" + clip(p.getAvailableValues())); String value = p.getValue('a'); System.out.println("value('a'): " + value + "\t" + p.getValueAliases(value)); } } - private static String clip(Collection availableValues) { - return availableValues.size() > 24 ? 
new ArrayList(availableValues).subList(0, 23) + ", …" : availableValues.toString(); + return availableValues.size() > 24 + ? new ArrayList(availableValues).subList(0, 23) + ", …" + : availableValues.toString(); } - public Set getPropertyNames() { return nameToAliases.keySet(); } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/Common.java b/UnicodeJsps/src/main/java/org/unicode/jsp/Common.java index 7816f3e57..66a537c0d 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/Common.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/Common.java @@ -1,10 +1,5 @@ package org.unicode.jsp; -import java.util.Arrays; -import java.util.List; - -import org.unicode.jsp.XPropertyFactory.HanType.HanTypeValues; - import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import com.ibm.icu.text.Normalizer; @@ -15,19 +10,24 @@ import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.ULocale; +import java.util.Arrays; +import java.util.List; +import org.unicode.jsp.XPropertyFactory.HanType.HanTypeValues; public class Common { - static final Normalizer2 NFKC_CF_ = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE); + static final Normalizer2 NFKC_CF_ = + Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE); // public static class NFKC_CF implements StringTransform { - // private static final UnicodeSet DEFAULT_IGNORABLES = new UnicodeSet("[:di:]").freeze(); + // private static final UnicodeSet DEFAULT_IGNORABLES = new + // UnicodeSet("[:di:]").freeze(); // // // static Matcher DI = // // Pattern.compile(UnicodeRegex.fix("[:di:]")).matcher(""); // // UnicodeMap DI2 = new // // UnicodeMap().putAll(DEFAULT_IGNORABLES, ""); // public String transform(String source) { - // + // // // String di = DI2.transform(source); // String di = DEFAULT_IGNORABLES.stripFrom(source, true); // String cf = Common.myFoldCase(di); @@ -39,39 +39,49 @@ public class Common { // } // } - public static Transform NFKC_CF = new 
StringTransform() { - public String transform(String source) { - return NFKC_CF_.normalize(source); - } - }; - - static List XPROPERTY_NAMES = Arrays.asList(new String[] { "toNFC", "toNFD", "toNFKC", "toNFKD", - "toCasefold", "toLowercase", "toUppercase", - "toTitlecase", - "subhead" }); - static final int XSTRING_START = UProperty.STRING_LIMIT; + public static Transform NFKC_CF = + new StringTransform() { + public String transform(String source) { + return NFKC_CF_.normalize(source); + } + }; + + static List XPROPERTY_NAMES = + Arrays.asList( + new String[] { + "toNFC", + "toNFD", + "toNFKC", + "toNFKD", + "toCasefold", + "toLowercase", + "toUppercase", + "toTitlecase", + "subhead" + }); + static final int XSTRING_START = UProperty.STRING_LIMIT; public static String getXStringPropertyValue(int propertyEnum, int codepoint, int nameChoice) { switch (propertyEnum) { - case Common.TO_NFC: - return Common.MyNormalize(codepoint, Normalizer.NFC); - case Common.TO_NFD: - return Common.MyNormalize(codepoint, Normalizer.NFD); - case Common.TO_NFKC: - return Common.MyNormalize(codepoint, Normalizer.NFKC); - case Common.TO_NFKD: - return Common.MyNormalize(codepoint, Normalizer.NFKD); - case Common.TO_CASEFOLD: - return UCharacter.foldCase(UTF16.valueOf(codepoint), true); - case Common.TO_LOWERCASE: - return UCharacter.toLowerCase(ULocale.ROOT, UTF16.valueOf(codepoint)); - case Common.TO_UPPERCASE: - return UCharacter.toUpperCase(ULocale.ROOT, UTF16.valueOf(codepoint)); - case Common.TO_TITLECASE: - return UCharacter.toTitleCase(ULocale.ROOT, UTF16.valueOf(codepoint), null); - case Common.SUBHEAD: - return UnicodeUtilities.getSubheader().getSubheader(codepoint); + case Common.TO_NFC: + return Common.MyNormalize(codepoint, Normalizer.NFC); + case Common.TO_NFD: + return Common.MyNormalize(codepoint, Normalizer.NFD); + case Common.TO_NFKC: + return Common.MyNormalize(codepoint, Normalizer.NFKC); + case Common.TO_NFKD: + return Common.MyNormalize(codepoint, Normalizer.NFKD); + 
case Common.TO_CASEFOLD: + return UCharacter.foldCase(UTF16.valueOf(codepoint), true); + case Common.TO_LOWERCASE: + return UCharacter.toLowerCase(ULocale.ROOT, UTF16.valueOf(codepoint)); + case Common.TO_UPPERCASE: + return UCharacter.toUpperCase(ULocale.ROOT, UTF16.valueOf(codepoint)); + case Common.TO_TITLECASE: + return UCharacter.toTitleCase(ULocale.ROOT, UTF16.valueOf(codepoint), null); + case Common.SUBHEAD: + return UnicodeUtilities.getSubheader().getSubheader(codepoint); } return UCharacter.getStringPropertyValue(propertyEnum, codepoint, nameChoice); } @@ -81,12 +91,9 @@ static String myFoldCase(String source) { } public static HanTypeValues getValue(int codepoint) { - if (Common.simpOnly.contains(codepoint)) - return HanTypeValues.Hans; - if (Common.tradOnly.contains(codepoint)) - return HanTypeValues.Hant; - if (Common.bothSimpTrad.contains(codepoint)) - return HanTypeValues.Han; + if (Common.simpOnly.contains(codepoint)) return HanTypeValues.Hans; + if (Common.tradOnly.contains(codepoint)) return HanTypeValues.Hant; + if (Common.bothSimpTrad.contains(codepoint)) return HanTypeValues.Han; return HanTypeValues.na; } @@ -94,78 +101,83 @@ static String MyNormalize(int codepoint, Mode mode) { return Normalizer.normalize(codepoint, mode); } - public static final UnicodeSet simpOnly = new UnicodeSet( - 
"[㑩㓥㔉㖊㖞㛟㛠㛿㟆㧑㧟㨫㱩㱮㲿㶉㶶㶽㺍㻏㻘䁖䅉䇲䌶-䌺䌼-䌾䍀䍁䓕䗖䘛䙊䙓䜣䜥䜧䝙䞌䞍䞐䢂䥿-䦁䩄䯃-䯅䲝䲞䴓-䴙万与丑专业-丝丢两严丧个丰临为丽举么义乌乐乔习乡书买乱争于亏云亚产亩亲亵亸亿仅仆从仑仓仪们价众优会伛伞-传伣-伧伪伫体佣佥侠侣侥-侪侬俣俦俨-俫俭债倾偬偻偾偿傥傧-傩儿克兑兖党兰关兴兹养兽冁内冈册写军农冯冲决况冻净准凉减凑凛几凤凫凭凯击凿刍划刘-创删别-刮制刹刽刿剀剂剐剑剥剧劝办务劢动励-劳势勋勚匀匦匮区医华协单卖卜卢卤卫却厂厅历厉压-厍厐厕厘厢厣厦厨厩厮县叁参双发变叙叠只叶号叹叽同向吓吕吗吣吨听启吴呐呒呓呕-呙呛呜咏咙咛咝咤咸响哑-哕哗哙哜哝哟唛唝唠-唢唤啧啬-啮啴啸喷喽喾嗫嗳嘘嘤嘱噜嚣团园困囱围囵国图圆圣圹场坂坏块坚-坠垄-垆垒垦垩垫垭垱垲垴埘-埚埯堑堕墙壮声壳壶壸处备复够头夸-夺奁奂奋奖奥奸妆-妈妩-妫姗姹娄-娈娱娲娴婳-婶媪嫒嫔嫱嬷孙学孪宁宝实宠审宪宫宽宾寝对寻导寿将尔尘尝尧尴尸尽层屃屉届属屡屦屿岁岂岖-岛岭岽岿峄峡峣-峦崂-崄崭嵘嵚嵝巅巩巯币帅师帏帐帘帜带帧帮帱帻帼幂干并广庄庆庐庑库应庙庞废廪开异弃弑张弥弪弯弹强归当录彝彦彻征径徕御忆忏忧忾怀-怆怜总怼怿恋恒恳恶恸-恽悦悫-悯惊惧-惩惫-惯愠愤愦愿慑懑懒懔戆戋戏戗战戬戯户扑执扩-扬扰抚抛抟-抢护报担拟拢拣拥-择挂挚-挦挽捝-损捡-捣据掳掴掷掸掺掼揽-搂搅携摄-摈摊撄撑撵撷撸撺擞攒敌敛数斋斓斗斩断无旧时-旸昙昼-显晋晒-晖暂暧术朴机杀杂权杆条来杨杩杰松板构枞枢枣枥枧枨枪枫枭柜柠柽栀栅标-栌栎栏树栖栗样栾桠-桩梦梼梾-棂椁椟椠椤椭楼榄榅榇-榉槚槛槟槠横樯樱橥橱橹橼檩欢欤欧歼殁殇残殒殓殚殡殴毁毂毕毙毡毵氇气氢氩氲汇汉汤汹沈沟没沣-沧沩沪泞注泪泶-泸泺-泾洁洒洼浃浅-浈浊测浍-浔涂涛涝-涡涣涤润-涩淀渊渌-渎渐渑渔渖渗温湾湿溃溅溆滗滚滞-滢滤-滦滨-滪漓漤潆潇潋潍潜潴澜濑濒灏灭灯灵灶灾-炀炉炖炜炝点炼炽烁-烃烛烟烦-烩烫-热焕焖焘煴爱爷牍牦牵牺犊状-犹狈狝狞独-狲猃猎猕猡猪-猬献獭玑玚玛玮-玱玺珐珑珰珲琏琐琼瑶瑷璎瓒瓯电画畅畴疖疗疟-疡疬-疯疱疴症-痉痒痖痨痪痫瘅瘆瘗瘘瘪瘫瘾瘿癞癣癫皑皱皲盏-监盖-盘眍眦眬着睁睐睑瞆瞒瞩矫矶矾-码砖砗砚砜砺砻砾础硁硕-硗硙确硷碍碛碜碱礼祃祎祢祯祷祸禀禄禅离秃秆种积称秽秾稆税稣稳穑穷窃窍窎窑窜窝窥窦窭竖竞笃笋笔笕笺笼笾筑筚-筝筹筼签简箓箦-箫篑篓篮篱簖籁籴类籼粜粝粤粪粮糁糇系紧累絷纟-缏缑-缵罂网罗罚罢罴羁羟翘耢耧耸耻聂聋-聍联聩聪肃肠肤肮肴肾-胁胆胜胡胧胨胪胫胶脉脍脏-脑脓脔脚脱脶脸腊腭腻-腾膑臜致舆舍舣舰舱舻艰艳艺节芈芗芜芦芸苁苇苈苋-苏苹范茎茏茑茔茕茧荆荐荙-荜荞-荡荣-药莅莱-莴莶-莺莼萝萤-萨葱蒇蒉蒋蒌蓝蓟蓠蓣蓥蓦蔂蔷蔹蔺蔼蕰蕲蕴薮藓蘖虏虑虚虫虬虮虽-蚂蚕蚬蛊蛎蛏蛮蛰-蛴蜕蜗蝇-蝉蝼蝾螀螨蟏衅衔补表衬衮袄-袆袜袭袯装裆裈裢-裥褛褴见-觑觞触觯訚誉誊讠-谈谊-谷豮贝-赣赪赵赶趋趱趸跃跄跞践-跹跻踊踌踪踬踯蹑蹒蹰蹿躏躜躯车-辚辞辩辫边辽达迁过迈运还这进-迟迩迳迹适选逊递逦逻遗遥邓邝邬邮邹-邻郁郏-郑郓郦郧郸酂酝酦酱酽-酿采释里鉴銮錾钅-镶长门-阛队阳-阶际-陉陕陧-险随隐隶隽难雏雠雳雾霁霡霭靓静面靥鞑鞒鞯韦-韬韵页-颢颤-颧风-飚飞飨餍饣-馕马-骧髅髋髌鬓魇魉鱼-鳣鸟-鹭鹯-鹴鹾麦麸黄黉黡黩黪黾鼋鼍鼗鼹齐齑齿-龌龙-龛龟𡒄𨱏]") - .freeze(); - public static final UnicodeSet tradOnly = new UnicodeSet( - 
"[㠏㩜䊷䋙䋻䝼䯀䰾䱽䲁丟並乾亂亞佇併來侖侶俁係俔俠倀倆倈倉個們倫偉側偵偽傑傖傘備傭傯傳-債傷傾僂僅僉僑僕僞僥僨價儀儂億儈儉儐儔儕儘償優儲儷儸儺-儼兌兒兗內兩冊冪凈凍凜凱別刪剄則剋剎剗剛剝剮剴創劃劇劉劊劌劍劏劑劚勁動務勛勝勞勢勩勱勵勸勻匭匯匱區協卻厙厠厭厲厴參叄叢吒吳吶呂呆咼員唄唚問啓啞啟啢喎喚喪喬單喲嗆嗇嗊嗎嗚嗩嗶嘆嘍嘔嘖嘗嘜嘩嘮-嘰嘵嘸嘽噓噚噝噠噥噦噯噲噴噸噹嚀嚇嚌嚕嚙嚦嚨嚲-嚴嚶囀-囂囅囈囑囪圇國圍園圓圖團垵埡埰執堅堊堖堝堯報場塊塋塏塒塗塢塤塵塹墊墜墮墳墻墾壇壈壋壓壘-壚壞-壠壢壩壯壺壼壽夠夢夾奐奧奩奪奬奮奼妝姍姦娛婁婦婭媧媯媼媽嫗嫵嫻嫿嬀嬈嬋嬌嬙嬡嬤嬪嬰嬸孌孫學孿宮寢實寧審寫寬寵寶將專尋對導尷屆屍屓屜屢層屨屬岡峴島峽崍崗崢崬嵐嶁嶄嶇嶔嶗嶠嶢嶧嶮嶴嶸嶺嶼巋巒巔巰帥師帳帶幀幃幗幘幟幣幫幬幹幺幾庫廁廂廄廈廚廝廟-廣廩廬廳弒弳張強彈彌彎彙彞彥後徑從徠復徵徹恆恥悅悞悵悶惡惱惲惻愛愜愨愴愷愾慄態慍慘慚慟慣慤慪慫慮慳慶憂憊憐-憒憚憤憫憮憲憶懇應懌懍懟懣懨懲懶-懸懺懼懾戀戇戔戧戩戰-戲戶拋挩挾捨捫掃掄掗掙掛採揀揚換揮損搖搗搵搶摑摜摟摯摳摶摻撈撏撐撓撝撟撣撥撫撲撳撻撾撿擁擄擇擊擋擓擔據擠擬擯-擲擴擷擺-擼擾攄攆攏攔攖攙攛-攝攢-攤攪攬敗敘敵數斂斃斕斬斷於時晉晝暈暉暘暢暫曄曆曇曉曏曖曠曨曬書會朧東杴柵桿梔梘條梟梲棄棖棗棟棧棲棶椏楊楓楨業極榪榮榲榿構槍槤槧槨槳樁樂樅樓標樞樣樸-樺橈橋機橢橫檁檉檔檜檟檢檣檮檯檳檸檻櫃櫓櫚櫛櫝-櫟櫥櫧櫨櫪-櫬櫱櫳櫸櫻欄權欏欒欖欞欽歐歟歡歲歷歸歿殘殞殤殨殫殮-殰殲殺-殼毀毆毿氂氈氌氣氫氬氳決沒沖況洶浹涇涼淚淥淪淵淶淺渙減渦測渾湊湞湯溈準溝溫滄滅滌滎滬滯滲滷滸滻滾滿漁漚漢漣漬漲漵漸漿潁潑潔潙潛潤潯潰潷潿澀澆澇澗澠澤澦澩澮澱濁濃濕濘濟濤濫濰濱濺濼濾瀅-瀇瀉瀋瀏瀕瀘瀝瀟瀠瀦-瀨瀲瀾灃灄灑灕灘灝灠灣灤灧災為烏烴無煉煒煙煢煥煩煬煱熅熒熗熱熲熾燁燈燉燒燙燜營燦燭燴燶燼燾爍爐爛爭爲爺爾牆牘牽犖犢犧狀狹狽猙猶猻獁獃-獅獎獨獪獫獮獰-獲獵獷獸獺-獼玀現琺琿瑋瑒瑣瑤瑩瑪瑲璉璣璦璫環璽瓊瓏瓔瓚甌產産畝畢畫異當疇疊痙痾瘂瘋瘍瘓瘞瘡瘧瘮瘲瘺瘻療癆癇癉癘癟癢癤癥癧癩癬-癮癰-癲發皚皰皸皺盜盞盡監盤盧盪眥眾睏睜睞瞘瞜瞞瞶瞼矓矚矯硜硤硨硯碩碭碸確碼磑磚磣磧磯磽礆礎礙礦礪-礬礱祿禍禎禕禡禦禪禮禰禱禿秈稅稈稏稟種稱穀穌-穎穠-穢穩穫穭窩窪窮窯窵窶窺竄竅竇竈竊竪競筆筍筧筴箋箏節範築篋篔篤篩篳簀簍簞簡簣簫簹簽簾籃籌籙籜籟籠籩籪籬籮粵糝糞糧糲糴糶糹糾紀紂約-紉紋納紐紓-紝紡紬細-紳紵紹紺紼紿絀終組-絆絎結絕絛絝絞絡絢給絨絰-絳絶絹綁綃綆綈綉綌綏綐經綜綞綠綢綣綫-維綯-綵綸-綻綽-綿緄緇緊緋緑-緔緗-線緝緞締緡緣緦編緩緬緯緱緲練緶緹緻縈-縋縐縑縕縗縛縝-縟縣縧縫縭縮縱-縳縵-縷縹總績繃繅繆繒織繕繚繞繡繢繩-繫繭-繰繳繸繹繼-繿纈纊續纍纏纓纖纘纜缽罈罌罰罵罷羅羆羈羋羥義習翹耬耮聖聞聯聰聲聳聵-職聹聽聾肅脅脈脛脫脹腎腖腡腦腫腳腸膃膚膠膩膽-膿臉臍臏臘臚臟臠臢臨臺與-舊艙艤艦艫艱艷芻茲荊莊莖莢莧華萇萊萬萵葉葒著葤葦葯葷蒓蒔蒞蒼蓀蓋蓮蓯蓴蓽蔔蔞蔣蔥蔦蔭蕁蕆蕎蕒蕓蕕蕘蕢蕩蕪蕭蕷薀薈薊薌薔薘薟薦薩薳薴薺藍藎藝藥藪藴藶藹藺蘄蘆蘇蘊蘋蘚蘞蘢蘭蘺蘿虆處虛虜號虧虯蛺蛻蜆蝕蝟蝦蝸螄螞螢螮螻螿蟄蟈蟎蟣蟬蟯蟲蟶蟻蠅蠆蠐蠑蠟蠣蠨蠱蠶蠻衆術衕衚衛衝衹袞裊裏補裝裡製複褌褘褲褳褸褻襇襏襖襝襠襤襪襬襯襲覆見覎規覓視覘覡覥覦親覬覯覲覷覺覽覿觀觴觶觸訁-訃計訊訌討訐訒訓訕-記訛訝訟訢訣訥訩訪設許訴訶診註詁詆詎詐詒詔-詘詛詞詠-詣試詩詫-詮詰-詳詵詼詿誄-誇誌認誑誒誕誘誚語誠誡誣-誦誨說説誰課誶誹誼誾調諂諄談諉請諍諏諑諒論諗諛-諞諢諤諦諧諫諭諮諱諳諶-諸諺諼諾謀-謂謄謅謊謎謐謔謖謗謙-講謝謠謡謨謫-謭謳謹謾譅證譎譏譖識-譚譜譫譯議譴護譸譽譾讀變讎讒讓讕讖讜讞豈豎豐豬豶貓貙貝-貢貧-責貯貰貲-貴貶-貸貺-貽貿-賅資賈賊賑-賓賕賙賚賜賞賠-賤賦賧質-賭賰賴賵賺-賾贄贅贇贈贊贋贍贏贐贓贔贖贗贛贜赬趕趙趨趲跡踐踴蹌蹕蹣蹤蹺躂躉-躋躍躑-躓躕躚躡躥躦躪軀車-軍軑軒軔軛軟軤軫軲軸-軼軾較輅輇-輊輒-輕輛-輟輥輦輩輪輬輯輳輸輻輾-轀轂轄-轆轉轍轎轔轟轡轢轤辦辭-辯農逕這連進運過達違遙遜遞遠適遲遷選遺遼邁還邇邊邏邐郟郵鄆鄉鄒鄔鄖鄧鄭鄰鄲鄴鄶鄺酇酈醖醜醞醫醬醱釀釁釃釅釋釐釒-釕釗-釙針釣釤釧釩釵釷釹釺鈀鈁鈃鈄鈈鈉鈍鈎鈐-鈒鈔鈕鈞鈣鈥-鈧鈮鈰鈳鈴鈷-鈺鈽-鉀鉅鉈鉉鉋鉍鉑鉕鉗鉚鉛鉞鉢鉤鉦鉬鉭鉶鉸鉺鉻鉿銀銃銅銍銑銓銖銘銚-銜銠銣銥銦銨-銬銱銳銷銹銻銼鋁鋃鋅鋇鋌鋏鋒鋙鋝鋟鋣-鋦鋨-鋪鋭-鋱鋶鋸鋼錁錄錆-錈錏錐錒錕錘-錛錟-錢錦錨錩錫錮錯録錳錶錸鍀鍁鍃鍆-鍈鍋鍍鍔鍘鍚鍛鍠鍤鍥鍩鍬鍰鍵鍶鍺鍾鎂鎄鎇鎊鎔鎖鎘鎚鎛鎝鎡-鎣鎦鎧鎩鎪鎬鎮鎰鎲鎳鎵鎸鎿鏃鏇鏈鏌鏍鏐鏑鏗鏘鏜-鏟鏡鏢鏤鏨鏰鏵鏷鏹鏽鐃鐋鐐鐒-鐔鐘鐙鐝鐠鐦-鐨鐫鐮鐲鐳鐵鐶鐸鐺鐿鑄
鑊鑌鑒鑔鑕鑞鑠鑣鑥鑭鑰-鑲鑷鑹鑼-鑿钁長門閂閃閆閈閉開閌閎閏閑間閔閘閡閣閥閨閩閫-閭閱閲閶閹閻-閿闃闆闈闊-闍闐闒-闖關闞闠闡闤闥阪陘陝陣陰陳陸陽隉隊階隕際隨險隱隴隸隻雋雖雙雛雜雞離難雲電霢霧霽靂靄靈靚靜靦靨鞀鞏鞝鞽韁韃韉韋-韍韓韙韜韞韻響頁-頃項-須頊頌頎-頓頗領頜頡頤頦頭頮頰頲頴頷-頹頻頽顆題-顏顒-顔願顙顛類顢顥顧顫顬顯-顱顳顴風颭-颯颱颳颶颸颺-颼飀飄飆飈飛飠飢飣飥飩-飫飭飯飲飴飼-飿餃-餅餉養餌餎餏餑-餓餕餖餘餚-餜餞餡館餱餳餶餷餺餼餾餿饁饃饅饈-饌饑饒饗饜饞饢馬-馮馱馳馴馹駁駐-駒駔駕駘駙駛駝駟駡駢駭駰駱駸駿騁騂騅騌-騏騖騙騤騧騫騭騮騰騶-騸騾驀-驅驊驌驍驏驕驗驚驛驟驢驤-驦驪驫骯髏髒體-髖髮鬆鬍鬚鬢鬥鬧鬩鬮鬱魎魘魚魛魢魨魯魴魷魺鮁鮃鮊鮋鮍鮎鮐-鮓鮚鮜-鮞鮦鮪鮫鮭鮮鮳鮶鮺鯀鯁鯇鯉鯊鯒鯔-鯗鯛鯝鯡鯢鯤鯧鯨鯪鯫鯰鯴鯷鯽鯿鰁-鰃鰈鰉鰍鰏鰐鰒鰓鰜鰟鰠鰣鰥鰨鰩鰭鰮鰱-鰳鰵鰷鰹-鰼鰾鱂鱅鱈鱉鱒鱔鱖-鱘鱝鱟鱠鱣鱤鱧鱨鱭鱯鱷鱸鱺鳥鳧鳩鳬鳲-鳴鳶鳾鴆鴇鴉鴒鴕鴛鴝-鴟鴣鴦鴨鴯鴰鴴鴷鴻鴿鵁-鵃鵐-鵓鵜鵝鵠鵡鵪鵬鵮鵯鵲鵷鵾鶄鶇鶉鶊鶓鶖鶘鶚鶡鶥鶩鶪鶬鶯鶲鶴鶹-鶼鶿-鷂鷄鷈鷊鷓鷖鷗鷙鷚鷥鷦鷫鷯鷲鷳鷸-鷺鷽鷿鸂鸇鸌鸏鸕鸘鸚鸛鸝鸞鹵鹹鹺鹼鹽麗麥麩麵麼麽黃黌點黨黲黶黷黽黿鼉鼴齊齋齎齏齒齔齕齗齙齜齟-齡齦齪齬齲齶齷龍龎龐龔龕龜𡞵𡠹𡢃𤪺𤫩𧜵𧝞𧩙𧵳𨋢𨦫𨧜𨯅𩣑𩶘]") - .freeze(); - public static final UnicodeSet bothSimpTrad = new UnicodeSet("[:sc=han:]").removeAll(simpOnly).removeAll(tradOnly).freeze(); + public static final UnicodeSet simpOnly = + new UnicodeSet( + "[㑩㓥㔉㖊㖞㛟㛠㛿㟆㧑㧟㨫㱩㱮㲿㶉㶶㶽㺍㻏㻘䁖䅉䇲䌶-䌺䌼-䌾䍀䍁䓕䗖䘛䙊䙓䜣䜥䜧䝙䞌䞍䞐䢂䥿-䦁䩄䯃-䯅䲝䲞䴓-䴙万与丑专业-丝丢两严丧个丰临为丽举么义乌乐乔习乡书买乱争于亏云亚产亩亲亵亸亿仅仆从仑仓仪们价众优会伛伞-传伣-伧伪伫体佣佥侠侣侥-侪侬俣俦俨-俫俭债倾偬偻偾偿傥傧-傩儿克兑兖党兰关兴兹养兽冁内冈册写军农冯冲决况冻净准凉减凑凛几凤凫凭凯击凿刍划刘-创删别-刮制刹刽刿剀剂剐剑剥剧劝办务劢动励-劳势勋勚匀匦匮区医华协单卖卜卢卤卫却厂厅历厉压-厍厐厕厘厢厣厦厨厩厮县叁参双发变叙叠只叶号叹叽同向吓吕吗吣吨听启吴呐呒呓呕-呙呛呜咏咙咛咝咤咸响哑-哕哗哙哜哝哟唛唝唠-唢唤啧啬-啮啴啸喷喽喾嗫嗳嘘嘤嘱噜嚣团园困囱围囵国图圆圣圹场坂坏块坚-坠垄-垆垒垦垩垫垭垱垲垴埘-埚埯堑堕墙壮声壳壶壸处备复够头夸-夺奁奂奋奖奥奸妆-妈妩-妫姗姹娄-娈娱娲娴婳-婶媪嫒嫔嫱嬷孙学孪宁宝实宠审宪宫宽宾寝对寻导寿将尔尘尝尧尴尸尽层屃屉届属屡屦屿岁岂岖-岛岭岽岿峄峡峣-峦崂-崄崭嵘嵚嵝巅巩巯币帅师帏帐帘帜带帧帮帱帻帼幂干并广庄庆庐庑库应庙庞废廪开异弃弑张弥弪弯弹强归当录彝彦彻征径徕御忆忏忧忾怀-怆怜总怼怿恋恒恳恶恸-恽悦悫-悯惊惧-惩惫-惯愠愤愦愿慑懑懒懔戆戋戏戗战戬戯户扑执扩-扬扰抚抛抟-抢护报担拟拢拣拥-择挂挚-挦挽捝-损捡-捣据掳掴掷掸掺掼揽-搂搅携摄-摈摊撄撑撵撷撸撺擞攒敌敛数斋斓斗斩断无旧时-旸昙昼-显晋晒-晖暂暧术朴机杀杂权杆条来杨杩杰松板构枞枢枣枥枧枨枪枫枭柜柠柽栀栅标-栌栎栏树栖栗样栾桠-桩梦梼梾-棂椁椟椠椤椭楼榄榅榇-榉槚槛槟槠横樯樱橥橱橹橼檩欢欤欧歼殁殇残殒殓殚殡殴毁毂毕毙毡毵氇气氢氩氲汇汉汤汹沈沟没沣-沧沩沪泞注泪泶-泸泺-泾洁洒洼浃浅-浈浊测浍-浔涂涛涝-涡涣涤润-涩淀渊渌-渎渐渑渔渖渗温湾湿溃溅溆滗滚滞-滢滤-滦滨-滪漓漤潆潇潋潍潜潴澜濑濒灏灭灯灵灶灾-炀炉炖炜炝点炼炽烁-烃烛烟烦-烩烫-热焕焖焘煴爱爷牍牦牵牺犊状-犹狈狝狞独-狲猃猎猕猡猪-猬献獭玑玚玛玮-玱玺珐珑珰珲琏琐琼瑶瑷璎瓒瓯电画畅畴疖疗疟-疡疬-疯疱疴症-痉痒痖痨痪痫瘅瘆瘗瘘瘪瘫瘾瘿癞癣癫皑皱皲盏-监盖-盘眍眦眬着睁睐睑瞆瞒瞩矫矶矾-码砖砗砚砜砺砻砾础硁硕-硗硙确硷碍碛碜碱礼祃祎祢祯祷祸禀禄禅离秃秆种积称秽秾稆税稣稳穑穷窃窍窎窑窜窝窥窦窭竖竞笃笋笔笕笺笼笾筑筚-筝筹筼签简箓箦-箫篑篓篮篱簖籁籴类籼粜粝粤粪粮糁糇系紧累絷纟-缏缑-缵罂网罗罚罢罴羁羟翘耢耧耸耻聂聋-聍联聩聪肃肠肤肮肴肾-胁胆胜胡胧胨胪胫胶脉脍脏-脑脓脔脚脱脶脸腊腭腻-腾膑臜致舆舍舣舰舱舻艰艳艺节芈芗芜芦芸苁苇苈苋-苏苹范茎茏茑茔茕茧荆荐荙-荜荞-荡荣-药莅莱-莴莶-莺莼萝萤-萨葱蒇蒉蒋蒌蓝蓟蓠蓣蓥蓦蔂蔷蔹蔺蔼蕰蕲蕴薮藓蘖虏虑虚虫虬虮虽-蚂蚕蚬蛊蛎蛏蛮蛰-蛴蜕蜗蝇-蝉蝼蝾螀螨蟏衅衔补表衬衮袄-袆袜袭袯装裆裈裢-裥褛褴见-觑觞触觯訚誉誊讠-谈谊-谷豮贝-赣赪赵赶趋趱趸跃跄
跞践-跹跻踊踌踪踬踯蹑蹒蹰蹿躏躜躯车-辚辞辩辫边辽达迁过迈运还这进-迟迩迳迹适选逊递逦逻遗遥邓邝邬邮邹-邻郁郏-郑郓郦郧郸酂酝酦酱酽-酿采释里鉴銮錾钅-镶长门-阛队阳-阶际-陉陕陧-险随隐隶隽难雏雠雳雾霁霡霭靓静面靥鞑鞒鞯韦-韬韵页-颢颤-颧风-飚飞飨餍饣-馕马-骧髅髋髌鬓魇魉鱼-鳣鸟-鹭鹯-鹴鹾麦麸黄黉黡黩黪黾鼋鼍鼗鼹齐齑齿-龌龙-龛龟𡒄𨱏]") + .freeze(); + public static final UnicodeSet tradOnly = + new UnicodeSet( + "[㠏㩜䊷䋙䋻䝼䯀䰾䱽䲁丟並乾亂亞佇併來侖侶俁係俔俠倀倆倈倉個們倫偉側偵偽傑傖傘備傭傯傳-債傷傾僂僅僉僑僕僞僥僨價儀儂億儈儉儐儔儕儘償優儲儷儸儺-儼兌兒兗內兩冊冪凈凍凜凱別刪剄則剋剎剗剛剝剮剴創劃劇劉劊劌劍劏劑劚勁動務勛勝勞勢勩勱勵勸勻匭匯匱區協卻厙厠厭厲厴參叄叢吒吳吶呂呆咼員唄唚問啓啞啟啢喎喚喪喬單喲嗆嗇嗊嗎嗚嗩嗶嘆嘍嘔嘖嘗嘜嘩嘮-嘰嘵嘸嘽噓噚噝噠噥噦噯噲噴噸噹嚀嚇嚌嚕嚙嚦嚨嚲-嚴嚶囀-囂囅囈囑囪圇國圍園圓圖團垵埡埰執堅堊堖堝堯報場塊塋塏塒塗塢塤塵塹墊墜墮墳墻墾壇壈壋壓壘-壚壞-壠壢壩壯壺壼壽夠夢夾奐奧奩奪奬奮奼妝姍姦娛婁婦婭媧媯媼媽嫗嫵嫻嫿嬀嬈嬋嬌嬙嬡嬤嬪嬰嬸孌孫學孿宮寢實寧審寫寬寵寶將專尋對導尷屆屍屓屜屢層屨屬岡峴島峽崍崗崢崬嵐嶁嶄嶇嶔嶗嶠嶢嶧嶮嶴嶸嶺嶼巋巒巔巰帥師帳帶幀幃幗幘幟幣幫幬幹幺幾庫廁廂廄廈廚廝廟-廣廩廬廳弒弳張強彈彌彎彙彞彥後徑從徠復徵徹恆恥悅悞悵悶惡惱惲惻愛愜愨愴愷愾慄態慍慘慚慟慣慤慪慫慮慳慶憂憊憐-憒憚憤憫憮憲憶懇應懌懍懟懣懨懲懶-懸懺懼懾戀戇戔戧戩戰-戲戶拋挩挾捨捫掃掄掗掙掛採揀揚換揮損搖搗搵搶摑摜摟摯摳摶摻撈撏撐撓撝撟撣撥撫撲撳撻撾撿擁擄擇擊擋擓擔據擠擬擯-擲擴擷擺-擼擾攄攆攏攔攖攙攛-攝攢-攤攪攬敗敘敵數斂斃斕斬斷於時晉晝暈暉暘暢暫曄曆曇曉曏曖曠曨曬書會朧東杴柵桿梔梘條梟梲棄棖棗棟棧棲棶椏楊楓楨業極榪榮榲榿構槍槤槧槨槳樁樂樅樓標樞樣樸-樺橈橋機橢橫檁檉檔檜檟檢檣檮檯檳檸檻櫃櫓櫚櫛櫝-櫟櫥櫧櫨櫪-櫬櫱櫳櫸櫻欄權欏欒欖欞欽歐歟歡歲歷歸歿殘殞殤殨殫殮-殰殲殺-殼毀毆毿氂氈氌氣氫氬氳決沒沖況洶浹涇涼淚淥淪淵淶淺渙減渦測渾湊湞湯溈準溝溫滄滅滌滎滬滯滲滷滸滻滾滿漁漚漢漣漬漲漵漸漿潁潑潔潙潛潤潯潰潷潿澀澆澇澗澠澤澦澩澮澱濁濃濕濘濟濤濫濰濱濺濼濾瀅-瀇瀉瀋瀏瀕瀘瀝瀟瀠瀦-瀨瀲瀾灃灄灑灕灘灝灠灣灤灧災為烏烴無煉煒煙煢煥煩煬煱熅熒熗熱熲熾燁燈燉燒燙燜營燦燭燴燶燼燾爍爐爛爭爲爺爾牆牘牽犖犢犧狀狹狽猙猶猻獁獃-獅獎獨獪獫獮獰-獲獵獷獸獺-獼玀現琺琿瑋瑒瑣瑤瑩瑪瑲璉璣璦璫環璽瓊瓏瓔瓚甌產産畝畢畫異當疇疊痙痾瘂瘋瘍瘓瘞瘡瘧瘮瘲瘺瘻療癆癇癉癘癟癢癤癥癧癩癬-癮癰-癲發皚皰皸皺盜盞盡監盤盧盪眥眾睏睜睞瞘瞜瞞瞶瞼矓矚矯硜硤硨硯碩碭碸確碼磑磚磣磧磯磽礆礎礙礦礪-礬礱祿禍禎禕禡禦禪禮禰禱禿秈稅稈稏稟種稱穀穌-穎穠-穢穩穫穭窩窪窮窯窵窶窺竄竅竇竈竊竪競筆筍筧筴箋箏節範築篋篔篤篩篳簀簍簞簡簣簫簹簽簾籃籌籙籜籟籠籩籪籬籮粵糝糞糧糲糴糶糹糾紀紂約-紉紋納紐紓-紝紡紬細-紳紵紹紺紼紿絀終組-絆絎結絕絛絝絞絡絢給絨絰-絳絶絹綁綃綆綈綉綌綏綐經綜綞綠綢綣綫-維綯-綵綸-綻綽-綿緄緇緊緋緑-緔緗-線緝緞締緡緣緦編緩緬緯緱緲練緶緹緻縈-縋縐縑縕縗縛縝-縟縣縧縫縭縮縱-縳縵-縷縹總績繃繅繆繒織繕繚繞繡繢繩-繫繭-繰繳繸繹繼-繿纈纊續纍纏纓纖纘纜缽罈罌罰罵罷羅羆羈羋羥義習翹耬耮聖聞聯聰聲聳聵-職聹聽聾肅脅脈脛脫脹腎腖腡腦腫腳腸膃膚膠膩膽-膿臉臍臏臘臚臟臠臢臨臺與-舊艙艤艦艫艱艷芻茲荊莊莖莢莧華萇萊萬萵葉葒著葤葦葯葷蒓蒔蒞蒼蓀蓋蓮蓯蓴蓽蔔蔞蔣蔥蔦蔭蕁蕆蕎蕒蕓蕕蕘蕢蕩蕪蕭蕷薀薈薊薌薔薘薟薦薩薳薴薺藍藎藝藥藪藴藶藹藺蘄蘆蘇蘊蘋蘚蘞蘢蘭蘺蘿虆處虛虜號虧虯蛺蛻蜆蝕蝟蝦蝸螄螞螢螮螻螿蟄蟈蟎蟣蟬蟯蟲蟶蟻蠅蠆蠐蠑蠟蠣蠨蠱蠶蠻衆術衕衚衛衝衹袞裊裏補裝裡製複褌褘褲褳褸褻襇襏襖襝襠襤襪襬襯襲覆見覎規覓視覘覡覥覦親覬覯覲覷覺覽覿觀觴觶觸訁-訃計訊訌討訐訒訓訕-記訛訝訟訢訣訥訩訪設許訴訶診註詁詆詎詐詒詔-詘詛詞詠-詣試詩詫-詮詰-詳詵詼詿誄-誇誌認誑誒誕誘誚語誠誡誣-誦誨說説誰課誶誹誼誾調諂諄談諉請諍諏諑諒論諗諛-諞諢諤諦諧諫諭諮諱諳諶-諸諺諼諾謀-謂謄謅謊謎謐謔謖謗謙-講謝謠謡謨謫-謭謳謹謾譅證譎譏譖識-譚譜譫譯議譴護譸譽譾讀變讎讒讓讕讖讜讞豈豎豐豬豶貓貙貝-貢貧-責貯貰貲-貴貶-貸貺-貽貿-賅資賈賊賑-賓賕賙賚賜賞賠-賤賦賧質-賭賰賴賵賺-賾贄贅贇贈贊贋贍贏贐贓贔贖贗贛贜赬趕趙趨趲跡踐踴蹌蹕蹣蹤蹺躂躉-躋躍躑-躓躕躚躡躥躦躪軀車-軍軑軒軔軛軟軤軫軲軸-軼軾較輅輇-輊輒-輕輛-輟輥輦輩輪輬輯輳輸輻輾-轀轂轄-轆轉轍轎轔轟轡轢轤辦辭-辯農逕這連進運過達違遙遜遞遠適遲遷選遺遼邁還邇邊邏邐郟
郵鄆鄉鄒鄔鄖鄧鄭鄰鄲鄴鄶鄺酇酈醖醜醞醫醬醱釀釁釃釅釋釐釒-釕釗-釙針釣釤釧釩釵釷釹釺鈀鈁鈃鈄鈈鈉鈍鈎鈐-鈒鈔鈕鈞鈣鈥-鈧鈮鈰鈳鈴鈷-鈺鈽-鉀鉅鉈鉉鉋鉍鉑鉕鉗鉚鉛鉞鉢鉤鉦鉬鉭鉶鉸鉺鉻鉿銀銃銅銍銑銓銖銘銚-銜銠銣銥銦銨-銬銱銳銷銹銻銼鋁鋃鋅鋇鋌鋏鋒鋙鋝鋟鋣-鋦鋨-鋪鋭-鋱鋶鋸鋼錁錄錆-錈錏錐錒錕錘-錛錟-錢錦錨錩錫錮錯録錳錶錸鍀鍁鍃鍆-鍈鍋鍍鍔鍘鍚鍛鍠鍤鍥鍩鍬鍰鍵鍶鍺鍾鎂鎄鎇鎊鎔鎖鎘鎚鎛鎝鎡-鎣鎦鎧鎩鎪鎬鎮鎰鎲鎳鎵鎸鎿鏃鏇鏈鏌鏍鏐鏑鏗鏘鏜-鏟鏡鏢鏤鏨鏰鏵鏷鏹鏽鐃鐋鐐鐒-鐔鐘鐙鐝鐠鐦-鐨鐫鐮鐲鐳鐵鐶鐸鐺鐿鑄鑊鑌鑒鑔鑕鑞鑠鑣鑥鑭鑰-鑲鑷鑹鑼-鑿钁長門閂閃閆閈閉開閌閎閏閑間閔閘閡閣閥閨閩閫-閭閱閲閶閹閻-閿闃闆闈闊-闍闐闒-闖關闞闠闡闤闥阪陘陝陣陰陳陸陽隉隊階隕際隨險隱隴隸隻雋雖雙雛雜雞離難雲電霢霧霽靂靄靈靚靜靦靨鞀鞏鞝鞽韁韃韉韋-韍韓韙韜韞韻響頁-頃項-須頊頌頎-頓頗領頜頡頤頦頭頮頰頲頴頷-頹頻頽顆題-顏顒-顔願顙顛類顢顥顧顫顬顯-顱顳顴風颭-颯颱颳颶颸颺-颼飀飄飆飈飛飠飢飣飥飩-飫飭飯飲飴飼-飿餃-餅餉養餌餎餏餑-餓餕餖餘餚-餜餞餡館餱餳餶餷餺餼餾餿饁饃饅饈-饌饑饒饗饜饞饢馬-馮馱馳馴馹駁駐-駒駔駕駘駙駛駝駟駡駢駭駰駱駸駿騁騂騅騌-騏騖騙騤騧騫騭騮騰騶-騸騾驀-驅驊驌驍驏驕驗驚驛驟驢驤-驦驪驫骯髏髒體-髖髮鬆鬍鬚鬢鬥鬧鬩鬮鬱魎魘魚魛魢魨魯魴魷魺鮁鮃鮊鮋鮍鮎鮐-鮓鮚鮜-鮞鮦鮪鮫鮭鮮鮳鮶鮺鯀鯁鯇鯉鯊鯒鯔-鯗鯛鯝鯡鯢鯤鯧鯨鯪鯫鯰鯴鯷鯽鯿鰁-鰃鰈鰉鰍鰏鰐鰒鰓鰜鰟鰠鰣鰥鰨鰩鰭鰮鰱-鰳鰵鰷鰹-鰼鰾鱂鱅鱈鱉鱒鱔鱖-鱘鱝鱟鱠鱣鱤鱧鱨鱭鱯鱷鱸鱺鳥鳧鳩鳬鳲-鳴鳶鳾鴆鴇鴉鴒鴕鴛鴝-鴟鴣鴦鴨鴯鴰鴴鴷鴻鴿鵁-鵃鵐-鵓鵜鵝鵠鵡鵪鵬鵮鵯鵲鵷鵾鶄鶇鶉鶊鶓鶖鶘鶚鶡鶥鶩鶪鶬鶯鶲鶴鶹-鶼鶿-鷂鷄鷈鷊鷓鷖鷗鷙鷚鷥鷦鷫鷯鷲鷳鷸-鷺鷽鷿鸂鸇鸌鸏鸕鸘鸚鸛鸝鸞鹵鹹鹺鹼鹽麗麥麩麵麼麽黃黌點黨黲黶黷黽黿鼉鼴齊齋齎齏齒齔齕齗齙齜齟-齡齦齪齬齲齶齷龍龎龐龔龕龜𡞵𡠹𡢃𤪺𤫩𧜵𧝞𧩙𧵳𨋢𨦫𨧜𨯅𩣑𩶘]") + .freeze(); + public static final UnicodeSet bothSimpTrad = + new UnicodeSet("[:sc=han:]").removeAll(simpOnly).removeAll(tradOnly).freeze(); static String MyNormalize(String string, Mode mode) { return Normalizer.normalize(string, mode); } - static final int TO_NFC = UProperty.STRING_LIMIT; - static final int TO_NFD = UProperty.STRING_LIMIT + 1; - static final int TO_NFKC = UProperty.STRING_LIMIT + 2; - static final int TO_NFKD = UProperty.STRING_LIMIT + 3; - static final int TO_CASEFOLD = UProperty.STRING_LIMIT + 4; - static final int TO_LOWERCASE = UProperty.STRING_LIMIT + 5; - static final int TO_UPPERCASE = UProperty.STRING_LIMIT + 6; - static final int TO_TITLECASE = UProperty.STRING_LIMIT + 7; - public static final int SUBHEAD = TO_TITLECASE + 1; - static final int XSTRING_LIMIT = SUBHEAD + 1; -// static UnicodeSet isCaseFolded = new UnicodeSet(); -// static UnicodeSet isLowercase = new UnicodeSet(); -// static UnicodeSet isUppercase = new UnicodeSet(); -// static UnicodeSet isTitlecase = new UnicodeSet(); -// static UnicodeSet isCased = new UnicodeSet(); -// static UnicodeSet isNFKC_CF = new UnicodeSet(); - -// static { -// for (int cp = 0; 
cp <= 0x10FFFF; ++cp) { -// -// int cat = UCharacter.getType(cp); -// if (cat == UCharacter.UNASSIGNED || cat == UCharacter.PRIVATE_USE || cat == UCharacter.SURROGATE) { -// // idnaTypeSet.get(IdnaType.disallowed).add(cp); // faster -// Common.isNFKC_CF.add(cp); -// Common.isCaseFolded.add(cp); -// Common.isLowercase.add(cp); -// Common.isTitlecase.add(cp); -// Common.isUppercase.add(cp); -// continue; -// } -// -// // IdnaType idnaType = Idna2003.getIDNA2003Type(cp); -// // idnaTypeSet.get(idnaType).add(cp); -// -// String s = UTF16.valueOf(cp); -// if (UCharacter.foldCase(s, true).equals(s)) { -// Common.isCaseFolded.add(cp); -// } -// if (NFKC_CF_.normalize(s).equals(s)) { -// Common.isNFKC_CF.add(cp); -// } -// if (UCharacter.toLowerCase(ULocale.ROOT, s).equals(s)) { -// Common.isLowercase.add(cp); -// } -// if (UCharacter.toUpperCase(ULocale.ROOT, s).equals(s)) { -// Common.isUppercase.add(cp); -// } -// if (UCharacter.toTitleCase(ULocale.ROOT, s, null).equals(s)) { -// Common.isTitlecase.add(cp); -// } -// } -// isCaseFolded.freeze(); -// isNFKC_CF.freeze(); -// isLowercase.freeze(); -// isUppercase.freeze(); -// isTitlecase.freeze(); -// // isCased if isLowercase=false OR isUppercase=false OR -// // isTitlecase=false -// // or := ! 
(isLowercase && isUppercase && isTitlecase) -// Common.isCased = new UnicodeSet(Common.isLowercase).retainAll(Common.isUppercase).retainAll( -// Common.isTitlecase).complement(); -// } + static final int TO_NFC = UProperty.STRING_LIMIT; + static final int TO_NFD = UProperty.STRING_LIMIT + 1; + static final int TO_NFKC = UProperty.STRING_LIMIT + 2; + static final int TO_NFKD = UProperty.STRING_LIMIT + 3; + static final int TO_CASEFOLD = UProperty.STRING_LIMIT + 4; + static final int TO_LOWERCASE = UProperty.STRING_LIMIT + 5; + static final int TO_UPPERCASE = UProperty.STRING_LIMIT + 6; + static final int TO_TITLECASE = UProperty.STRING_LIMIT + 7; + public static final int SUBHEAD = TO_TITLECASE + 1; + static final int XSTRING_LIMIT = SUBHEAD + 1; + // static UnicodeSet isCaseFolded = new UnicodeSet(); + // static UnicodeSet isLowercase = new UnicodeSet(); + // static UnicodeSet isUppercase = new UnicodeSet(); + // static UnicodeSet isTitlecase = new UnicodeSet(); + // static UnicodeSet isCased = new UnicodeSet(); + // static UnicodeSet isNFKC_CF = new UnicodeSet(); + + // static { + // for (int cp = 0; cp <= 0x10FFFF; ++cp) { + // + // int cat = UCharacter.getType(cp); + // if (cat == UCharacter.UNASSIGNED || cat == UCharacter.PRIVATE_USE || cat == + // UCharacter.SURROGATE) { + // // idnaTypeSet.get(IdnaType.disallowed).add(cp); // faster + // Common.isNFKC_CF.add(cp); + // Common.isCaseFolded.add(cp); + // Common.isLowercase.add(cp); + // Common.isTitlecase.add(cp); + // Common.isUppercase.add(cp); + // continue; + // } + // + // // IdnaType idnaType = Idna2003.getIDNA2003Type(cp); + // // idnaTypeSet.get(idnaType).add(cp); + // + // String s = UTF16.valueOf(cp); + // if (UCharacter.foldCase(s, true).equals(s)) { + // Common.isCaseFolded.add(cp); + // } + // if (NFKC_CF_.normalize(s).equals(s)) { + // Common.isNFKC_CF.add(cp); + // } + // if (UCharacter.toLowerCase(ULocale.ROOT, s).equals(s)) { + // Common.isLowercase.add(cp); + // } + // if 
(UCharacter.toUpperCase(ULocale.ROOT, s).equals(s)) { + // Common.isUppercase.add(cp); + // } + // if (UCharacter.toTitleCase(ULocale.ROOT, s, null).equals(s)) { + // Common.isTitlecase.add(cp); + // } + // } + // isCaseFolded.freeze(); + // isNFKC_CF.freeze(); + // isLowercase.freeze(); + // isUppercase.freeze(); + // isTitlecase.freeze(); + // // isCased if isLowercase=false OR isUppercase=false OR + // // isTitlecase=false + // // or := ! (isLowercase && isUppercase && isTitlecase) + // Common.isCased = new + // UnicodeSet(Common.isLowercase).retainAll(Common.isUppercase).retainAll( + // Common.isTitlecase).complement(); + // } } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/CompressedDataInput.java b/UnicodeJsps/src/main/java/org/unicode/jsp/CompressedDataInput.java index 8f69d6928..b53a0c859 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/CompressedDataInput.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/CompressedDataInput.java @@ -1,6 +1,4 @@ -/** - * - */ +/** */ package org.unicode.jsp; import java.io.DataInput; @@ -16,7 +14,8 @@ public CompressedDataInput set(DataInput in) { } /** - * Read long using readUnsignedLong. The bottom bit is the sign. If the number was negative, the value is inverted (~). + * Read long using readUnsignedLong. The bottom bit is the sign. If the number was negative, the + * value is inverted (~). */ @Override public long readLong() throws IOException { @@ -32,7 +31,7 @@ public long readLong() throws IOException { /** * Read a long as a series of 7-bits, with the last one having the top bit on. 
- * + * * @throws IOException */ public long readUnsignedLong() throws IOException { @@ -41,11 +40,11 @@ public long readUnsignedLong() throws IOException { while (true) { int byteValue = in.readByte(); if ((byte) byteValue >= 0) { - result |= ((long)byteValue << shift); + result |= ((long) byteValue << shift); shift += 7; } else { byteValue &= 0x7F; - result |= ((long)byteValue << shift); + result |= ((long) byteValue << shift); return result; } } @@ -158,4 +157,4 @@ public int readUnsignedByte() throws IOException { public String readLine() throws IOException { return in.readLine(); } -} \ No newline at end of file +} diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/Confusables.java b/UnicodeJsps/src/main/java/org/unicode/jsp/Confusables.java index 70b3f382d..3f6f16a9c 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/Confusables.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/Confusables.java @@ -1,5 +1,12 @@ package org.unicode.jsp; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.Normalizer.Mode; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -8,244 +15,239 @@ import java.util.List; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.util.XEquivalenceClass; import org.unicode.jsp.AlternateIterator.Builder; import org.unicode.jsp.ScriptTester.CompatibilityLevel; import org.unicode.jsp.ScriptTester.ScriptSpecials; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.Normalizer.Mode; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - -public class Confusables implements Iterable{ - public enum ScriptCheck {same, none}; - - 
private static final XEquivalenceClass equivalents = new XEquivalenceClass(); - private String source; - private Mode normalizationCheck; - private ScriptCheck scriptCheck = ScriptCheck.none; - private UnicodeSet allowedCharacters = null; - - public UnicodeSet getAllowedCharacters() { - return allowedCharacters; - } - - public static UnicodeMap getMap() { - UnicodeMap result = new UnicodeMap(); - for (String s : equivalents) { - Set others = new TreeSet(equivalents.getEquivalences(s)); - String list = "\u2051" + CollectionUtilities.join(others, "\u2051") + "\u2051"; - for (String other : others) { - result.put(other, list); - } - } - result.freeze(); - return result; - } - - public static Set getEquivalents(String string) { - Set result = equivalents.getEquivalences(string); - return Collections.unmodifiableSet(result); - } - - public Confusables setAllowedCharacters(UnicodeSet allowedCharacters) { - this.allowedCharacters = allowedCharacters; - return this; - } - - public Mode getNormalizationCheck() { - return normalizationCheck; - } - - public Confusables setNormalizationCheck(Mode normalizationCheck) { - this.normalizationCheck = normalizationCheck; - return this; - } - - static class MyReader extends FileUtilities.SemiFileReader { - @Override - protected boolean handleLine(int start, int end, String[] items) { - String type = items[2]; - if (!type.equals("MA")) return true; - String result = Utility.fromHex(items[1], 4, " "); - for (int i = start; i <= end; ++i) { - equivalents.add(UTF16.valueOf(i), result); - } - return true; - } - } - - static { - new MyReader().process(Confusables.class, "confusables.txt"); - } - - public Confusables(String source) { - this.source = Normalizer.normalize(source,Normalizer.NFD); - } - - public double getMaxSize() { - AlternateIterator build = buildIterator(); - if (build == null) { - return 0; - } - return build.getMaxSize(); - } - - public Iterator iterator() { - AlternateIterator build = buildIterator(); - if (build == 
null) { - Set empty = Collections.emptySet(); - return empty.iterator(); - } - return new MyFilteredIterator(build); - } - - private AlternateIterator buildIterator() { - Builder builder = AlternateIterator.start(); - List> table = new ArrayList>(); - int cp; - for (int i = 0; i < source.length(); i += Character.charCount(cp)) { - cp = source.codePointAt(i); - String cps = UTF16.valueOf(cp); - Set confusables = equivalents.getEquivalences(cps); - Set items = new HashSet(); - for (String confusable : confusables) { - if (normalizationCheck != null && !Normalizer.isNormalized(confusable, normalizationCheck, 0)) { - continue; - } - if (allowedCharacters != null && !allowedCharacters.containsAll(confusable)) { - continue; - } - items.add(confusable); - } - if (items.size() == 0) { - return null; - } - table.add(items); - } - - // now filter for multiple scripts, if set - if (scriptCheck != ScriptCheck.none) { - if (!scriptTester.filterTable(table)) { - return null; - } - } - for (Set items : table) { - builder.add(items); - } - AlternateIterator build = builder.build(); - return build; - } - +public class Confusables implements Iterable { + public enum ScriptCheck { + same, + none + }; + + private static final XEquivalenceClass equivalents = + new XEquivalenceClass(); + private String source; + private Mode normalizationCheck; + private ScriptCheck scriptCheck = ScriptCheck.none; + private UnicodeSet allowedCharacters = null; + + public UnicodeSet getAllowedCharacters() { + return allowedCharacters; + } + + public static UnicodeMap getMap() { + UnicodeMap result = new UnicodeMap(); + for (String s : equivalents) { + Set others = new TreeSet(equivalents.getEquivalences(s)); + String list = "\u2051" + CollectionUtilities.join(others, "\u2051") + "\u2051"; + for (String other : others) { + result.put(other, list); + } + } + result.freeze(); + return result; + } - public List> getAlternates() { - AlternateIterator build = buildIterator(); - if (build == null) { - return 
Collections.emptyList(); + public static Set getEquivalents(String string) { + Set result = equivalents.getEquivalences(string); + return Collections.unmodifiableSet(result); } - return build.getAlternates(); - } - public ScriptCheck getScriptCheck() { - return scriptCheck; - } + public Confusables setAllowedCharacters(UnicodeSet allowedCharacters) { + this.allowedCharacters = allowedCharacters; + return this; + } - public Confusables setScriptCheck(ScriptCheck scriptCheck) { - this.scriptCheck = scriptCheck; - return this; - } + public Mode getNormalizationCheck() { + return normalizationCheck; + } - public static boolean scriptOk(String confusable, ScriptCheck scriptCheck) { - return scriptCheck == ScriptCheck.none - || scriptTester.isOk(confusable); - } + public Confusables setNormalizationCheck(Mode normalizationCheck) { + this.normalizationCheck = normalizationCheck; + return this; + } - public static ScriptTester scriptTester = ScriptTester.start(CompatibilityLevel.Highly_Restrictive, ScriptSpecials.on).get(); + static class MyReader extends FileUtilities.SemiFileReader { + @Override + protected boolean handleLine(int start, int end, String[] items) { + String type = items[2]; + if (!type.equals("MA")) return true; + String result = Utility.fromHex(items[1], 4, " "); + for (int i = start; i <= end; ++i) { + equivalents.add(UTF16.valueOf(i), result); + } + return true; + } + } - class MyFilteredIterator extends FilteredIterator{ - Set alreadySeen;; + static { + new MyReader().process(Confusables.class, "confusables.txt"); + } - public MyFilteredIterator(Iterator base) { - super(base); + public Confusables(String source) { + this.source = Normalizer.normalize(source, Normalizer.NFD); } - @Override - public String allow(String confusable) { - if (alreadySeen == null) { - alreadySeen = new HashSet(); - } - if (alreadySeen.contains(confusable)) { - return null; - } - alreadySeen.add(confusable); + public double getMaxSize() { + AlternateIterator build = 
buildIterator(); + if (build == null) { + return 0; + } + return build.getMaxSize(); + } - String nfcConfusable = Normalizer.normalize(confusable, Normalizer.NFC); - if (!nfcConfusable.equals(confusable)) { - if (alreadySeen.contains(nfcConfusable)) { - return null; + public Iterator iterator() { + AlternateIterator build = buildIterator(); + if (build == null) { + Set empty = Collections.emptySet(); + return empty.iterator(); + } + return new MyFilteredIterator(build); + } + + private AlternateIterator buildIterator() { + Builder builder = AlternateIterator.start(); + List> table = new ArrayList>(); + int cp; + for (int i = 0; i < source.length(); i += Character.charCount(cp)) { + cp = source.codePointAt(i); + String cps = UTF16.valueOf(cp); + Set confusables = equivalents.getEquivalences(cps); + Set items = new HashSet(); + for (String confusable : confusables) { + if (normalizationCheck != null + && !Normalizer.isNormalized(confusable, normalizationCheck, 0)) { + continue; + } + if (allowedCharacters != null && !allowedCharacters.containsAll(confusable)) { + continue; + } + items.add(confusable); + } + if (items.size() == 0) { + return null; + } + table.add(items); } - alreadySeen.add(nfcConfusable); - } - if (allowedCharacters != null && !allowedCharacters.containsAll(nfcConfusable)) { - return null; - } - if (!scriptOk(nfcConfusable, scriptCheck)) { - return null; - } - if (normalizationCheck != null && !Normalizer.isNormalized(nfcConfusable, normalizationCheck, 0)) { - return null; - } - return nfcConfusable; + // now filter for multiple scripts, if set + if (scriptCheck != ScriptCheck.none) { + if (!scriptTester.filterTable(table)) { + return null; + } + } + for (Set items : table) { + builder.add(items); + } + AlternateIterator build = builder.build(); + return build; } - } - - public static class FilteredIterator implements Iterator { - Iterator base; - T nextItem = null; + public List> getAlternates() { + AlternateIterator build = buildIterator(); + if 
(build == null) { + return Collections.emptyList(); + } + return build.getAlternates(); + } - public FilteredIterator(Iterator base) { - this.base = base; - load(); + public ScriptCheck getScriptCheck() { + return scriptCheck; } - public boolean hasNext() { - return nextItem != null; + public Confusables setScriptCheck(ScriptCheck scriptCheck) { + this.scriptCheck = scriptCheck; + return this; } - public T next() { - T temp = nextItem; - load(); - return temp; + public static boolean scriptOk(String confusable, ScriptCheck scriptCheck) { + return scriptCheck == ScriptCheck.none || scriptTester.isOk(confusable); } - private void load() { - while (base.hasNext()) { - nextItem = allow(base.next()); - if (nextItem != null) { - return; + public static ScriptTester scriptTester = + ScriptTester.start(CompatibilityLevel.Highly_Restrictive, ScriptSpecials.on).get(); + + class MyFilteredIterator extends FilteredIterator { + Set alreadySeen; + ; + + public MyFilteredIterator(Iterator base) { + super(base); } - } - nextItem = null; - } - public T allow(T item) { - return item; + @Override + public String allow(String confusable) { + if (alreadySeen == null) { + alreadySeen = new HashSet(); + } + if (alreadySeen.contains(confusable)) { + return null; + } + alreadySeen.add(confusable); + + String nfcConfusable = Normalizer.normalize(confusable, Normalizer.NFC); + if (!nfcConfusable.equals(confusable)) { + if (alreadySeen.contains(nfcConfusable)) { + return null; + } + alreadySeen.add(nfcConfusable); + } + + if (allowedCharacters != null && !allowedCharacters.containsAll(nfcConfusable)) { + return null; + } + if (!scriptOk(nfcConfusable, scriptCheck)) { + return null; + } + if (normalizationCheck != null + && !Normalizer.isNormalized(nfcConfusable, normalizationCheck, 0)) { + return null; + } + return nfcConfusable; + } } - public void remove() { - throw new UnsupportedOperationException(); - } + public static class FilteredIterator implements Iterator { + Iterator base; + T 
nextItem = null; - } + public FilteredIterator(Iterator base) { + this.base = base; + load(); + } - public String getOriginal() { - return source; - } + public boolean hasNext() { + return nextItem != null; + } + + public T next() { + T temp = nextItem; + load(); + return temp; + } + + private void load() { + while (base.hasNext()) { + nextItem = allow(base.next()); + if (nextItem != null) { + return; + } + } + nextItem = null; + } + + public T allow(T item) { + return item; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + public String getOriginal() { + return source; + } } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/CreateInversions.java b/UnicodeJsps/src/main/java/org/unicode/jsp/CreateInversions.java index 978b8600b..0e100f25b 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/CreateInversions.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/CreateInversions.java @@ -1,239 +1,249 @@ package org.unicode.jsp; -import java.io.IOException; - import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.dev.util.UnicodeMapIterator; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; +import java.io.IOException; public class CreateInversions { - // testing + // testing - public static void main(String[] args) { - UnicodeSet ignorables = new UnicodeSet("[[:Cn:][:Cs:][:Co:]]").freeze(); // exclude unassigned, surrogates, and private use - CreateInversions createInversions = new CreateInversions().setIgnorables(ignorables).setDelta(true); + public static void main(String[] args) { + UnicodeSet ignorables = + new UnicodeSet("[[:Cn:][:Cs:][:Co:]]") + .freeze(); // exclude unassigned, surrogates, and private use + CreateInversions createInversions = + new CreateInversions().setIgnorables(ignorables).setDelta(true); - // check the code (by inspection) to make sure it works - // later do unit test - UnicodeSet[] tests = { - new UnicodeSet("[abcxyz]"), + // 
check the code (by inspection) to make sure it works + // later do unit test + UnicodeSet[] tests = { + new UnicodeSet("[abcxyz]"), new UnicodeSet("[:whitespace:]"), new UnicodeSet("[:deprecated:]"), - }; - for (UnicodeSet test : tests) { - showSet(createInversions, test); - } - - UnicodeMap testMap = new UnicodeMap(); - testMap.putAll(new UnicodeSet("[abcxyz]"), "foo"); - showMap(createInversions, testMap); - - // check with names - for (UnicodeSet test : tests) { - testMap.clear(); - for (UnicodeSetIterator it = new UnicodeSetIterator(test); it.next();) { - testMap.put(it.codepoint, UCharacter.getName(it.codepoint)); - } - showMap(createInversions, testMap); - } - - // check with properties - ICUPropertyFactory propFactory = ICUPropertyFactory.make(); - UnicodeMap[] testProperties = { + }; + for (UnicodeSet test : tests) { + showSet(createInversions, test); + } + + UnicodeMap testMap = new UnicodeMap(); + testMap.putAll(new UnicodeSet("[abcxyz]"), "foo"); + showMap(createInversions, testMap); + + // check with names + for (UnicodeSet test : tests) { + testMap.clear(); + for (UnicodeSetIterator it = new UnicodeSetIterator(test); it.next(); ) { + testMap.put(it.codepoint, UCharacter.getName(it.codepoint)); + } + showMap(createInversions, testMap); + } + + // check with properties + ICUPropertyFactory propFactory = ICUPropertyFactory.make(); + UnicodeMap[] testProperties = { propFactory.getProperty("numeric_type").getUnicodeMap(), propFactory.getProperty("block").getUnicodeMap(), propFactory.getProperty("word_break").getUnicodeMap(), - propFactory.getProperty("grapheme_cluster_break").getUnicodeMap().putAll(new UnicodeSet(0xAC00,0xD7A3), "LVT"), + propFactory + .getProperty("grapheme_cluster_break") + .getUnicodeMap() + .putAll(new UnicodeSet(0xAC00, 0xD7A3), "LVT"), // note: separating out the LV from LVT can be done more compactly with an algorithm. // it is periodic: AC00, AC1C, AC38... 
- }; - for (UnicodeMap test : testProperties) { - showMap(createInversions, test); + }; + for (UnicodeMap test : testProperties) { + showMap(createInversions, test); + } + + // further compaction can be done by assigning each property value to a number, and using + // that instead. + UnicodeMap source = + propFactory + .getProperty("grapheme_cluster_break") + .getUnicodeMap() + .putAll(new UnicodeSet(0xAC00, 0xD7A3), "LVT"); + UnicodeMap target = new UnicodeMap(); + int numberForValue = 0; + // iterate through the values, assigning each a number + for (Object value : source.getAvailableValues()) { + target.putAll(source.keySet(value), numberForValue++); + } + showMap(createInversions, target); } - // further compaction can be done by assigning each property value to a number, and using that instead. - UnicodeMap source = propFactory.getProperty("grapheme_cluster_break").getUnicodeMap().putAll(new UnicodeSet(0xAC00,0xD7A3), "LVT"); - UnicodeMap target = new UnicodeMap(); - int numberForValue = 0; - // iterate through the values, assigning each a number - for (Object value : source.getAvailableValues()) { - target.putAll(source.keySet(value), numberForValue++); + private static void showSet(CreateInversions createInversions, UnicodeSet test) { + System.out.println("** Source:"); + System.out.println(test); + System.out.println("** Result:"); + System.out.println(createInversions.create("testName", test)); + System.out.println("Inversions: " + createInversions.getInversions()); + System.out.println(); } - showMap(createInversions, target); - } - - private static void showSet(CreateInversions createInversions, UnicodeSet test) { - System.out.println("** Source:"); - System.out.println(test); - System.out.println("** Result:"); - System.out.println(createInversions.create("testName", test)); - System.out.println("Inversions: " + createInversions.getInversions()); - System.out.println(); - } - - private static void showMap(CreateInversions createInversions, UnicodeMap 
testMap) { - System.out.println("** Source:"); - System.out.println(testMap); - System.out.println("** Result:"); - System.out.println(createInversions.create("testName", testMap)); - System.out.println("Inversions: " + createInversions.getInversions()); - System.out.println(); - } - - // guts - - private UnicodeSet ignorables; - - private boolean delta; - - private int inversions; - - private int getInversions() { - return inversions; - } - - private CreateInversions setDelta(boolean b) { - delta = b; - return this; - } - - private CreateInversions setIgnorables(UnicodeSet ignorables) { - this.ignorables = ignorables; - return this; - } - - public String create(String name, UnicodeSet source) { - try { - return create(name, source, new StringBuilder()).toString(); - } catch (IOException e) { - throw (RuntimeException) new IllegalArgumentException("Should not happen").initCause(e); + + private static void showMap(CreateInversions createInversions, UnicodeMap testMap) { + System.out.println("** Source:"); + System.out.println(testMap); + System.out.println("** Result:"); + System.out.println(createInversions.create("testName", testMap)); + System.out.println("Inversions: " + createInversions.getInversions()); + System.out.println(); } - } - public String create(String name, UnicodeMap source) { - try { - return create(name, source, new StringBuilder()).toString(); - } catch (IOException e) { - throw (RuntimeException) new IllegalArgumentException("Should not happen").initCause(e); + // guts + + private UnicodeSet ignorables; + + private boolean delta; + + private int inversions; + + private int getInversions() { + return inversions; } - } - - // public String createInversions(UnicodeSet source, String name, String - // filename) throws IOException { - // return createInversions(source, name, new StringBuilder()).close(); - // } - // - // public String createInversions(UnicodeMap source, String name, String - // filename) throws IOException { - // return 
createInversions(source, name, new StringBuilder()).toString(); - // } - - public Appendable create(String name, UnicodeSet source, Appendable target) - throws IOException { - initShortestForm(); - target.append("var " + name + " = new Inversion([\n"); - boolean first = true; - for (UnicodeSetIterator it = new UnicodeSetIterator(source); it.nextRange();) { - if (first) { - first = false; - } else { - target.append(",\n"); // the linebreak is not needed, but easier to read - } - target.append(shortestForm(it.codepoint, delta)); - if (it.codepointEnd != 0x10FFFF) { - target.append(",").append(shortestForm(it.codepointEnd + 1, delta)); - } + + private CreateInversions setDelta(boolean b) { + delta = b; + return this; } - target.append("\n]"); - if (delta) { - target.append(",true"); + + private CreateInversions setIgnorables(UnicodeSet ignorables) { + this.ignorables = ignorables; + return this; + } + + public String create(String name, UnicodeSet source) { + try { + return create(name, source, new StringBuilder()).toString(); + } catch (IOException e) { + throw (RuntimeException) new IllegalArgumentException("Should not happen").initCause(e); + } + } + + public String create(String name, UnicodeMap source) { + try { + return create(name, source, new StringBuilder()).toString(); + } catch (IOException e) { + throw (RuntimeException) new IllegalArgumentException("Should not happen").initCause(e); + } + } + + // public String createInversions(UnicodeSet source, String name, String + // filename) throws IOException { + // return createInversions(source, name, new StringBuilder()).close(); + // } + // + // public String createInversions(UnicodeMap source, String name, String + // filename) throws IOException { + // return createInversions(source, name, new StringBuilder()).toString(); + // } + + public Appendable create(String name, UnicodeSet source, Appendable target) throws IOException { + initShortestForm(); + target.append("var " + name + " = new Inversion([\n"); + 
boolean first = true; + for (UnicodeSetIterator it = new UnicodeSetIterator(source); it.nextRange(); ) { + if (first) { + first = false; + } else { + target.append(",\n"); // the linebreak is not needed, but easier to read + } + target.append(shortestForm(it.codepoint, delta)); + if (it.codepointEnd != 0x10FFFF) { + target.append(",").append(shortestForm(it.codepointEnd + 1, delta)); + } + } + target.append("\n]"); + if (delta) { + target.append(",true"); + } + target.append(");"); + return target; } - target.append(");"); - return target; - } - - public Appendable create(String name, UnicodeMap source, Appendable target) - throws IOException { - initShortestForm(); - target.append("var " + name + " = new Inversion([\n"); - StringBuilder valueArray = new StringBuilder(); - boolean first = true; - for (UnicodeMapIterator it = new UnicodeMapIterator(source); it.nextRange();) { - // skip ignorable range - if (ignorables.contains(it.codepoint, it.codepointEnd)) { - continue; - } - // also skip adjacent rows with same value - final String valueString = shortestForm(source.getValue(it.codepoint)); - if (lastValue == valueString || lastValue != null && lastValue.equals(valueString)) { - continue; - } - lastValue = valueString; - if (first) { - first = false; - } else { - target.append(",\n"); // the linebreak is not needed, but easier to read - valueArray.append(",\n"); // the linebreak is not needed, but easier to - // read - } - target.append(shortestForm(it.codepoint, delta)); - valueArray.append(valueString); + + public Appendable create(String name, UnicodeMap source, Appendable target) throws IOException { + initShortestForm(); + target.append("var " + name + " = new Inversion([\n"); + StringBuilder valueArray = new StringBuilder(); + boolean first = true; + for (UnicodeMapIterator it = new UnicodeMapIterator(source); it.nextRange(); ) { + // skip ignorable range + if (ignorables.contains(it.codepoint, it.codepointEnd)) { + continue; + } + // also skip adjacent rows 
with same value + final String valueString = shortestForm(source.getValue(it.codepoint)); + if (lastValue == valueString || lastValue != null && lastValue.equals(valueString)) { + continue; + } + lastValue = valueString; + if (first) { + first = false; + } else { + target.append(",\n"); // the linebreak is not needed, but easier to read + valueArray.append(",\n"); // the linebreak is not needed, but easier to + // read + } + target.append(shortestForm(it.codepoint, delta)); + valueArray.append(valueString); + } + target.append("\n],[\n").append(valueArray).append("\n]"); + if (delta) { + target.append(",true"); + } + target.append(");"); + return target; } - target.append("\n],[\n").append(valueArray).append("\n]"); - if (delta) { - target.append(",true"); + + long lastNumber; + String lastValue; + + private void initShortestForm() { + lastNumber = 0; + inversions = 0; + lastValue = null; } - target.append(");"); - return target; - } - - long lastNumber; - String lastValue; - - private void initShortestForm() { - lastNumber = 0; - inversions = 0; - lastValue = null; - } - - private String shortestForm(Object value) { - String result; - if (value == null) { - result = "null"; - } else if (value instanceof Byte || value instanceof Short || value instanceof Integer - || value instanceof Long) { - --inversions; // don't add inversion in this case - result = shortestForm(((Number) value).longValue(), false); - } else if (value instanceof Float || value instanceof Double) { - result = value.toString(); - } else { - result = value.toString(); - // TODO optimize this - result.replace("\b", "\\\b"); // quote - result.replace("\t", "\\\t"); // quote - result.replace("\n", "\\\n"); // quote - result.replace("\u000B", "\\v"); // quote - result.replace("\f", "\\\f"); // quote - result.replace("\r", "\\\r"); // quote - result.replace("\"", "\\\""); // quote - result.replace("\\", "\\\\"); // quote - result = "\"" + result + "\""; + + private String shortestForm(Object value) { + 
String result; + if (value == null) { + result = "null"; + } else if (value instanceof Byte + || value instanceof Short + || value instanceof Integer + || value instanceof Long) { + --inversions; // don't add inversion in this case + result = shortestForm(((Number) value).longValue(), false); + } else if (value instanceof Float || value instanceof Double) { + result = value.toString(); + } else { + result = value.toString(); + // TODO optimize this + result.replace("\b", "\\\b"); // quote + result.replace("\t", "\\\t"); // quote + result.replace("\n", "\\\n"); // quote + result.replace("\u000B", "\\v"); // quote + result.replace("\f", "\\\f"); // quote + result.replace("\r", "\\\r"); // quote + result.replace("\"", "\\\""); // quote + result.replace("\\", "\\\\"); // quote + result = "\"" + result + "\""; + } + return result; } - return result; - } - - private String shortestForm(long number, boolean useDelta) { - if (useDelta) { - long temp = number; - number -= lastNumber; - lastNumber = temp; + + private String shortestForm(long number, boolean useDelta) { + if (useDelta) { + long temp = number; + number -= lastNumber; + lastNumber = temp; + } + ++inversions; + String decimal = String.valueOf(number); + String hex = "0x" + Long.toHexString(number); + return decimal.length() < hex.length() ? decimal : hex; } - ++inversions; - String decimal = String.valueOf(number); - String hex = "0x" + Long.toHexString(number); - return decimal.length() < hex.length() ? 
decimal : hex; - } } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/Dummy.java b/UnicodeJsps/src/main/java/org/unicode/jsp/Dummy.java index 033a118a2..35e248d0c 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/Dummy.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/Dummy.java @@ -3,27 +3,36 @@ import org.unicode.cldr.tool.TablePrinter; public class Dummy { - public static String getTest() { - TablePrinter tablePrinter = new TablePrinter() - .setTableAttributes("style='border-collapse: collapse' border='1'") - .addColumn("Language").setSpanRows(true).setSortPriority(0).setBreakSpans(true) - .addColumn("Junk").setSpanRows(true) - .addColumn("Territory").setHeaderAttributes("bgcolor='green'").setCellAttributes("align='right'").setSpanRows(true) - .setSortPriority(1).setSortAscending(false); - Comparable[][] data = { - {"German", 1.3d, 3}, - {"French", 1.3d, 2}, - {"English", 1.3d, 2}, - {"English", 1.3d, 4}, - {"English", 1.3d, 6}, - {"English", 1.3d, 8}, - {"Arabic", 1.3d, 5}, - {"Zebra", 1.3d, 10} - }; - tablePrinter.addRows(data); - tablePrinter.addRow().addCell("Foo").addCell(1.5d).addCell(99).finishRow(); - - String s = tablePrinter.toTable(); - return s; - } + public static String getTest() { + TablePrinter tablePrinter = + new TablePrinter() + .setTableAttributes("style='border-collapse: collapse' border='1'") + .addColumn("Language") + .setSpanRows(true) + .setSortPriority(0) + .setBreakSpans(true) + .addColumn("Junk") + .setSpanRows(true) + .addColumn("Territory") + .setHeaderAttributes("bgcolor='green'") + .setCellAttributes("align='right'") + .setSpanRows(true) + .setSortPriority(1) + .setSortAscending(false); + Comparable[][] data = { + {"German", 1.3d, 3}, + {"French", 1.3d, 2}, + {"English", 1.3d, 2}, + {"English", 1.3d, 4}, + {"English", 1.3d, 6}, + {"English", 1.3d, 8}, + {"Arabic", 1.3d, 5}, + {"Zebra", 1.3d, 10} + }; + tablePrinter.addRows(data); + tablePrinter.addRow().addCell("Foo").addCell(1.5d).addCell(99).finishRow(); + + String 
s = tablePrinter.toTable(); + return s; + } } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/GeneralUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/GeneralUtilities.java index 82e9c1841..f4df6dd5d 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/GeneralUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/GeneralUtilities.java @@ -5,35 +5,45 @@ public class GeneralUtilities { /** - * Used to set a static debug flag from an environment variable. Allows static final flags to be set for debugging information even in environments - * where the source cannot be altered. For a given class StringPrepData, the debug flag is -Dstringprepdata_ (that is, all lowercase). - *

Example: + * Used to set a static debug flag from an environment variable. Allows static final flags to be + * set for debugging information even in environments where the source cannot be altered. For a + * given class StringPrepData, the debug flag is -Dstringprepdata_ (that is, all + * lowercase). + * + *

Example: + * *

      * private static final boolean DEBUG_SHOW_DETAILS = GeneralUtilities.getDebugFlag(StringPrepData.class, "show_details");
      * 
+ * * @param class1 Typically the class where the boolean is defined. - * @param flagName a specialized name, such as show_details. + * @param flagName a specialized name, such as show_details. * @return whether flag was present. */ public static boolean getDebugFlag(Class class1, String flagName) { String className = class1.getName(); int lastPart = className.lastIndexOf('.'); if (lastPart >= 0) { - className = className.substring(lastPart+1); + className = className.substring(lastPart + 1); } - return System.getProperty((className+"_" + flagName).toLowerCase(Locale.ROOT)) != null; + return System.getProperty((className + "_" + flagName).toLowerCase(Locale.ROOT)) != null; } /** - * Used to set a static debug flag from an environment variable. Allows static final flags to be set for debugging information even in environments - * where the source cannot be altered. For a given class StringPrepData, the debug flag is -Dstringprepdata_ (that is, all lowercase). - *

Example: + * Used to set a static debug flag from an environment variable. Allows static final flags to be + * set for debugging information even in environments where the source cannot be altered. For a + * given class StringPrepData, the debug flag is -Dstringprepdata_ (that is, all + * lowercase). + * + *

Example: + * *

      * private static final boolean DEBUG_SHOW_DETAILS = GeneralUtilities.getDebugFlag(StringPrepData.class, "show_details", DEBUG);
      * 
+ * * @param class1 Typically the class where the boolean is defined. - * @param flagName a specialized name, such as show_details. - * @param onlyif allows the test to be subject to a general flag. + * @param flagName a specialized name, such as show_details. + * @param onlyif allows the test to be subject to a general flag. * @return whether flag was present. */ public static boolean getDebugFlag(Class class1, String flagName, boolean onlyif) { @@ -42,11 +52,11 @@ public static boolean getDebugFlag(Class class1, String flagName, boolean onl /** * Convenience method, where the flagname is "debug". + * * @param class1 Typically the class where the boolean is defined. * @return whether flag was present. */ public static boolean getDebugFlag(Class class1) { return getDebugFlag(class1, "debug"); } - } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/GenerateSubheader.java b/UnicodeJsps/src/main/java/org/unicode/jsp/GenerateSubheader.java index d0fba7e34..94782b9d2 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/GenerateSubheader.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/GenerateSubheader.java @@ -1,16 +1,16 @@ package org.unicode.jsp; -import java.io.IOException; - import com.ibm.icu.text.UnicodeSet; +import java.io.IOException; public class GenerateSubheader { - public static void main(String[] args) throws IOException { - final String unicodeDataDirectory = "./jsp/"; - Subheader subheader = new Subheader(unicodeDataDirectory); - for (String subhead : subheader) { - UnicodeSet result = subheader.getUnicodeSet(subhead); - System.out.println("{\"" + subhead + "\",\"" + result.toString().replace("\\", "\\\\") + "\"},"); + public static void main(String[] args) throws IOException { + final String unicodeDataDirectory = "./jsp/"; + Subheader subheader = new Subheader(unicodeDataDirectory); + for (String subhead : subheader) { + UnicodeSet result = subheader.getUnicodeSet(subhead); + System.out.println( + "{\"" + subhead + "\",\"" + 
result.toString().replace("\\", "\\\\") + "\"},"); + } } - } } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/Globe.java b/UnicodeJsps/src/main/java/org/unicode/jsp/Globe.java index e98ed29c3..8315b123d 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/Globe.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/Globe.java @@ -1,6 +1,5 @@ package org.unicode.jsp; - import java.awt.BasicStroke; import java.awt.BorderLayout; import java.awt.Color; @@ -43,7 +42,6 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import javax.imageio.ImageIO; import javax.swing.ImageIcon; import javax.swing.JButton; @@ -55,2103 +53,2258 @@ import javax.swing.JScrollPane; public class Globe { - public static final boolean DEBUG = false; - static int style = 0; - static int degreeInterval = 15; - - static String myPictures0 = "bin/jsp/images/"; - static String SOURCE_DIR = myPictures0; - static String TARGET_DIR = "Pictures/Earth/"; - - static JFileChooser loadFileChooser = new JFileChooser(); - static JFileChooser saveFileChooser = new JFileChooser(); - static { - saveFileChooser.setCurrentDirectory(new File(TARGET_DIR)); - loadFileChooser.setCurrentDirectory(new File(SOURCE_DIR)); - } - - static int QUALITY = 80; - - static double lightness = 0; - static boolean rotate = false; - static boolean doLabels = false; - - static int originChoice = 0; - static String[] originList = new String[] { - "?", - "North Pole", - "San Francisco (SFO)", - "Zürich (ZRH)", - "Tokyo (NRT)", - "Wellington (WLG)", - "Honolulu", - "Melbourne (MEL)", - "Caen (CFR)", - "Cochin (COK)", - "Cochin (COK) - centering", - "Moscow, ID" - }; - // Melbourne, Australia 37 47 S 144 58 E - // Caen — 49° 10' 59" N 00° 22' 10" W - // sundance latitude 44.406 and longitude -104.376 - // San Diego, Calif. 32 42 117 10 9:00 a.m. 
- // Moscow, ID Latitude: 46.73 N, Longitude: 117.00 W - - static double[][] origins = { // lat, long - {-Math.PI/2 + 0.0001, 0.0001}, // - {-Math.PI/2 + 0.0001, 0.0001}, // - {Navigator.toRadians(37.0, 37.0, 8.3, false), Navigator.toRadians(122.0, 22.0, 29.6, false)}, // sf // - {Navigator.toRadians(47, 27, 0, false), Navigator.toRadians(8.0, 33.0, 0, true)}, // zurich // - {Navigator.toRadians(35, 45, 50, false), Navigator.toRadians(140.0, 23.0, 30, true)}, // Narita 35°45´50"N 140°23´30"E - {Navigator.toRadians(41, 20, 0, true), Navigator.toRadians(174.0, 48.0, 0, true)}, // Wellington 41° 20' 0" S 174° 48' 0" E - {Navigator.toRadians(21, 18, 0, false), Navigator.toRadians(157, 50, 0, false)}, - {Navigator.toRadians(37, 39, 42, true), Navigator.toRadians(144, 50, 0, true)}, - {Navigator.toRadians(49, 10, 24, false), Navigator.toRadians(0, 26, 53, false)}, - {Navigator.toRadians(10, 9, 7, false), Navigator.toRadians(76, 24, 7, true)}, // Cochin - {Navigator.toRadians(0, 0, 0, false), Navigator.toRadians(70, 0, 0, true)}, // Cochin - {Navigator.toRadians(46.743978, 0, 0, false), Navigator.toRadians(116.904176, 0, 0, false)}, // Moscow - //,-116.904176 - /* - * Airport Code : COK + public static final boolean DEBUG = false; + static int style = 0; + static int degreeInterval = 15; + + static String myPictures0 = "bin/jsp/images/"; + static String SOURCE_DIR = myPictures0; + static String TARGET_DIR = "Pictures/Earth/"; + + static JFileChooser loadFileChooser = new JFileChooser(); + static JFileChooser saveFileChooser = new JFileChooser(); + + static { + saveFileChooser.setCurrentDirectory(new File(TARGET_DIR)); + loadFileChooser.setCurrentDirectory(new File(SOURCE_DIR)); + } + + static int QUALITY = 80; + + static double lightness = 0; + static boolean rotate = false; + static boolean doLabels = false; + + static int originChoice = 0; + static String[] originList = + new String[] { + "?", + "North Pole", + "San Francisco (SFO)", + "Zürich (ZRH)", + "Tokyo (NRT)", + 
"Wellington (WLG)", + "Honolulu", + "Melbourne (MEL)", + "Caen (CFR)", + "Cochin (COK)", + "Cochin (COK) - centering", + "Moscow, ID" + }; + // Melbourne, Australia 37 47 S 144 58 E + // Caen — 49° 10' 59" N 00° 22' 10" W + // sundance latitude 44.406 and longitude -104.376 + // San Diego, Calif. 32 42 117 10 9:00 a.m. + // Moscow, ID Latitude: 46.73 N, Longitude: 117.00 W + + static double[][] origins = { // lat, long + {-Math.PI / 2 + 0.0001, 0.0001}, // + {-Math.PI / 2 + 0.0001, 0.0001}, // + { + Navigator.toRadians(37.0, 37.0, 8.3, false), + Navigator.toRadians(122.0, 22.0, 29.6, false) + }, // sf // + { + Navigator.toRadians(47, 27, 0, false), Navigator.toRadians(8.0, 33.0, 0, true) + }, // zurich // + { + Navigator.toRadians(35, 45, 50, false), Navigator.toRadians(140.0, 23.0, 30, true) + }, // Narita 35°45´50"N 140°23´30"E + { + Navigator.toRadians(41, 20, 0, true), Navigator.toRadians(174.0, 48.0, 0, true) + }, // Wellington 41° 20' 0" S 174° 48' 0" E + {Navigator.toRadians(21, 18, 0, false), Navigator.toRadians(157, 50, 0, false)}, + {Navigator.toRadians(37, 39, 42, true), Navigator.toRadians(144, 50, 0, true)}, + {Navigator.toRadians(49, 10, 24, false), Navigator.toRadians(0, 26, 53, false)}, + {Navigator.toRadians(10, 9, 7, false), Navigator.toRadians(76, 24, 7, true)}, // Cochin + {Navigator.toRadians(0, 0, 0, false), Navigator.toRadians(70, 0, 0, true)}, // Cochin + { + Navigator.toRadians(46.743978, 0, 0, false), + Navigator.toRadians(116.904176, 0, 0, false) + }, // Moscow + // ,-116.904176 + /* + * Airport Code : COK + + Longitude : 76° 24’ 7” E (?) + Latitude : 10° 9’ 7” N (?) 
+ */ + }; + + static int[][] sizeValues = { + {640, 320}, + {1024, 512}, + {1280, 640}, + {1280, 1024}, + {1400, 700}, + {1400, 1050}, + {1440, 720}, + {1600, 800}, + {1920, 960}, + {1920, 1200}, + {2400, 1200}, + }; + + static String[] sizeList = + new String[] { + "640×320", + "1024×512", + "1280×640", + "1280×1024", + "1400×700", + "1400×1050", + "1440×720", + "1600×800", + "1920×960", + "1920×1200", + "2400×1200" + }; + static int sizeChoice = 0; + + static String[] gridList = new String[] {"5°", "10°", "15°"}; + static int gridChoice = 0; + + static String[] labelList = new String[] {"no labels", "labels"}; + + static String[] localeList = new String[] {"en", "de", "fr", "el", "ru", "ja", "zh"}; + static String[] translatedLocaleList; + + static String[] projectionList = + new String[] { + "Plate Carrée", + "Equal Area Rectangular (Gall)", + "Equal Area Sinusoidal", + "Equal Area Ellipse", + "Equidistant Conic", + "3D Isometric" + }; + static int projectionChoice = 0; + + static Transform[] projectionValues = + new Transform[] { + new TransformPlateCarree(), + new TransformGallOrthographic(), + new TransformSinusoidal(), + new TransformEqualAreaEllipse(), + new TransformEquidistantConic(), + new Transform3DIsometric(), + }; + + static double originLat = origins[0][0]; // N = + + static double originLong = origins[0][1]; // W = - -Longitude : 76° 24’ 7” E (?) -Latitude : 10° 9’ 7” N (?) + /** + * Create the GUI and show it. For thread safety, this method should be invoked from the + * event-dispatching thread. 
+ * + * @throws IOException */ - }; - - static int [][] sizeValues = { - {640, 320}, - {1024, 512}, - {1280, 640}, - {1280, 1024}, - {1400, 700}, - {1400, 1050}, - {1440, 720}, - {1600, 800}, - {1920, 960}, - {1920, 1200}, - {2400, 1200}, - }; - - static String[] sizeList = new String[] {"640×320", "1024×512", "1280×640", "1280×1024", "1400×700", "1400×1050", "1440×720", "1600×800", "1920×960", "1920×1200", "2400×1200"}; - static int sizeChoice = 0; - - static String[] gridList = new String[] {"5°", "10°", "15°"}; - static int gridChoice = 0; - - static String[] labelList = new String[] {"no labels", "labels"}; - - static String[] localeList = new String[] {"en", "de", "fr", "el", "ru", "ja", "zh"}; - static String[] translatedLocaleList; - - static String[] projectionList = new String[] { - "Plate Carrée", - "Equal Area Rectangular (Gall)", - "Equal Area Sinusoidal", - "Equal Area Ellipse", - "Equidistant Conic", - "3D Isometric"}; - static int projectionChoice = 0; - - static Transform[] projectionValues = new Transform[] { - new TransformPlateCarree(), - new TransformGallOrthographic(), - new TransformSinusoidal(), - new TransformEqualAreaEllipse(), - new TransformEquidistantConic(), - new Transform3DIsometric(), - }; - - static double originLat = origins[0][0]; // N = + - static double originLong = origins[0][1]; // W = - - - /** - * Create the GUI and show it. For thread safety, - * this method should be invoked from the - * event-dispatching thread. 
- * @throws IOException - */ - static JLabel mainPicture = new JLabel(); - - //static ImageIcon sourceIcon, resultIcon; - static JFrame frame; - - static BufferedImage sourceImage, griddedImage; - static Image transformedImage; - - static int gradations = 10; - - private static void createAndShowGUI() { - if (false) { - Mapper m = new Mapper(3,7,100,140); - System.out.println(m.map(3) + ", " + m.map(7)); - new Transform.Tester().test(); - return; - } + static JLabel mainPicture = new JLabel(); - //cldrFactory = CLDRFile.Factory.make(Utility.MAIN_DIRECTORY,".*"); - - //Make sure we have nice window decorations. - JFrame.setDefaultLookAndFeelDecorated(true); - - //Create and set up the window. - frame = new JFrame("HelloWorldSwing"); - frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); - - //Add the ubiquitous "Hello World" label. - //JLabel label = new JLabel("Hello World"); - //frame.getContentPane().add(label); - String sname = SOURCE_DIR + "earth-living.jpg"; - //"ev11656_land_shallow_topo_8192.tiff"; - //"ev11656_land_shallow_topo_8192.PNG"; - //earthmap1k.jpg"; - //"C:/Documents and Settings/Administrator/Desktop/macchiato-backup/distance/worldmap.jpg" - loadSourceMap(sname); - - JPanel topPanel = new JPanel(); - topPanel.setLayout(new FlowLayout(FlowLayout.LEADING)); - - JButton but = new JButton("Save"); - but.addActionListener(new ActionListener() { - public void actionPerformed(ActionEvent e) { - saveFileChooser.setSelectedFile(new File("Earth " - + projectionList[projectionChoice] - + ", " + sizeList[sizeChoice] - + ", " + gridList[gridChoice] - + ", " + originList[originChoice] - + ".jpg")); - int returnVal = saveFileChooser.showSaveDialog(frame); - if (returnVal == JFileChooser.APPROVE_OPTION) { - try { - String filename = saveFileChooser.getSelectedFile().getCanonicalPath(); - System.out.println("You chose to save this file: " - + filename); - writeImage(griddedImage, filename, QUALITY); - //myPictures + "new-earth-living" + style + ".jpg" - } catch 
(IOException e1) { - System.out.println("Couldn't save file."); - } - } - - } - }); - topPanel.add(but); - - JButton but2 = new JButton("Load"); - but2.addActionListener(new ActionListener() { - public void actionPerformed(ActionEvent e) { - - // File file = new File(SOURCE_DIR + "earth-living.jpg"); - // try { - // System.out.println("Source Dir: " + file.getCanonicalPath()); - // } catch (IOException e2) { - // } - // loadFileChooser.setSelectedFile(file); - int returnVal = loadFileChooser.showOpenDialog(frame); - if (returnVal == JFileChooser.APPROVE_OPTION) { - try { - String filename = loadFileChooser.getSelectedFile().getCanonicalPath(); - if (!filename.toLowerCase().endsWith(".jpg")) { - filename += ".jpg"; - } - System.out.println("You chose to open this file: " - + filename); - loadSourceMap(filename); - //myPictures + "new-earth-living" + style + ".jpg" - } catch (IOException e1) { - System.out.println("Couldn't save file."); - } - } - - } - }); - topPanel.add(but2); - - JComboBox box = new JComboBox(projectionList); - box.addActionListener(new ActionListener() { - public void actionPerformed(ActionEvent e) { - int index = ((JComboBox)e.getSource()).getSelectedIndex(); - if (index != projectionChoice) { - style = projectionChoice = index; - changeImage(frame); - } - } - }); - topPanel.add(box); - - JComboBox box2 = new JComboBox(sizeList); - box2.addActionListener(new ActionListener() { - public void actionPerformed(ActionEvent e) { - int index = ((JComboBox)e.getSource()).getSelectedIndex(); - if (index != sizeChoice) { - sizeChoice = index; - changeImage(frame); - } - } - }); - topPanel.add(box2); - String[] gradationNames = new String[gradations * 2 - 1]; - for (int i = 0; i < gradationNames.length; ++i) { - gradationNames[i] = i < gradations-1 ? "Lighten to " + ((i+1)*100/gradations) - : i == gradations - 1 ? 
"Neutral" - : "Darken to " + ((gradations*2 - i - 1)*100/gradations); - } - JComboBox box3 = new JComboBox(gradationNames); - box3.setSelectedIndex(gradations-1); - box3.addActionListener(new ActionListener() { - int lastIndex = gradations-1; - public void actionPerformed(ActionEvent e) { - int index = ((JComboBox)e.getSource()).getSelectedIndex(); - if (index != lastIndex) { - lastIndex = index; - lightness = (gradations - 1 -index)/(double)gradations; - changeImage(frame); - } - } - }); - topPanel.add(box3); - - JComboBox box4 = new JComboBox(originList); - //box4.setSelectedIndex(1); - box4.addActionListener(new ActionListener() { - public void actionPerformed(ActionEvent e) { - int index = ((JComboBox)e.getSource()).getSelectedIndex(); - if (index != originChoice) { - originChoice = index; - originLat = origins[index][0]; - originLong = origins[index][1]; - if (projectionValues[projectionChoice].usesOrigin()) { - changeImage(frame); - } else { - addGrid(transformedImage, projectionValues[projectionChoice]); - // - } - } - } - }); - topPanel.add(box4); - - JComboBox box5 = new JComboBox(gridList); - //box4.setSelectedIndex(1); - box5.addActionListener(new ActionListener() { - public void actionPerformed(ActionEvent e) { - int index = ((JComboBox)e.getSource()).getSelectedIndex(); - if (index != gridChoice) { - gridChoice = index; - degreeInterval = (index + 1)*5; - addGrid(transformedImage, projectionValues[projectionChoice]); - // changeImage(frame); - } - } - }); - topPanel.add(box5); - - JComboBox box6 = new JComboBox(labelList); - //box4.setSelectedIndex(1); - box6.addActionListener(new ActionListener() { - public void actionPerformed(ActionEvent e) { - int index = ((JComboBox)e.getSource()).getSelectedIndex(); - if ((index == 1) != doLabels) { - doLabels = index == 1; - addGrid(transformedImage, projectionValues[projectionChoice]); - // changeImage(frame); - } - } - }); - topPanel.add(box6); - - box7 = new JComboBox(localeList); - box7.setFont(font); - 
//setLocale(0); - //box4.setSelectedIndex(1); - box7.addActionListener(new ActionListener() { - public void actionPerformed(ActionEvent e) { - int index = ((JComboBox)e.getSource()).getSelectedIndex(); - if (index != currentLocaleIndex) { - //setLocale(index); - ((JComboBox)e.getSource()).setSelectedIndex(index); - changeImage(frame); - } - } - }); - topPanel.add(box7); - - JPanel panel = new JPanel(); - panel.setLayout(new BorderLayout()); - panel.add(topPanel, BorderLayout.PAGE_START); - panel.add(mainPicture, BorderLayout.CENTER); - panel.add(new JLabel("See http://www.3dsoftware.com/Cartography/USGS/MapProjections/"), BorderLayout.PAGE_END); - - JScrollPane scrollPane = new JScrollPane(panel); - scrollPane.setPreferredSize(new Dimension(660, 540)); - //add(scrollPane, BorderLayout.CENTER); - - frame.getContentPane().add(scrollPane); - - //Display the window. - frame.pack(); - frame.setVisible(true); - //writeImage(i, myPictures + "new-earth-living" + style + ".jpg", QUALITY); - } - - - //private static void setLocale(int newLocaleIndex) { - //cldrFile = cldrFactory.make(localeList[newLocaleIndex], true); - //tzf = new TimezoneFormatter(cldrFactory, localeList[newLocaleIndex], true); - //currentLocaleIndex = newLocaleIndex; - //MutableComboBoxModel model = (MutableComboBoxModel) box7.getModel(); - //for (int i = 0; i < localeList.length; ++i) { - //model.removeElementAt(i); - //model.insertElementAt(cldrFile.getName(localeList[i], true), i); - //} - //} - static Font font = Font.decode("Arial Unicode MS-9"); - //static CLDRFile.Factory cldrFactory; - //static CLDRFile cldrFile; - static int currentLocaleIndex = -1; - //static TimezoneFormatter tzf; - static JComboBox box7; - - /* - public static class LightingImageFilter implements RGBImageFilter { - /** - * Subclasses must specify a method to convert a single input pixel - * in the default RGB ColorModel to a single output pixel. 
- * @param x, y the coordinates of the pixel - * @param rgb the integer pixel representation in the default RGB - * color model - * @return a filtered pixel in the default RGB color model. - * @see ColorModel#getRGBdefault - * @see #filterRGBPixels - * / - public int filterRGB(int x, int y, int rgb) { - int a = (rgb >>> 24) & 0xFF; - int r = (rgb >> 16) & 0xFF; - int g = (rgb >> 8) & 0xFF; - int b = rgb & 0xFF; - return (rgb & 0xFF) + // static ImageIcon sourceIcon, resultIcon; + static JFrame frame; - } - } - */ - - /** - * @param sname - */ - private static void loadSourceMap(String sname) { - try { - System.out.println("Check: " + new File(sname).getAbsolutePath()); - ImageIcon sourceIcon = new ImageIcon(sname); - sourceImage = convertToBuffered(sourceIcon.getImage()); - System.out.println("Loaded " + new File(sname).getCanonicalPath()); - } catch (IOException e) { - e.printStackTrace(); - throw new RuntimeException("Can't load"); - } - changeImage(frame); - } - /** - * @param frame - */ - private static final boolean DEBUG_ICON = false; - - private static void changeImage(JFrame frame) { - if (DEBUG_ICON) { - System.out.println("Changing Icon1"); - } - // System.out.println("Width " + ii.getIconWidth() + ", Height: " + ii.getIconHeight()); - DeformFilter filter = new DeformFilter(sourceImage.getWidth(), - sourceImage.getHeight(), sizeValues[sizeChoice][0], sizeValues[sizeChoice][1], - projectionValues[projectionChoice]); - //ImageFilter filter = new RotateFilter(Math.PI / 4); - - if (DEBUG_ICON) { - System.out.println("Changing Icon2"); - } - ImageProducer ip = new FilteredImageSource(sourceImage.getSource(), filter); // modifies filter - if (DEBUG_ICON) { - System.out.println("Changing Icon3"); - } - transformedImage = frame.createImage(ip); - if (DEBUG_ICON) { - System.out.println("Changing Icon4"); - //Icon junk = new ImageIcon(transformedImage); // load image (HACK) - } + static BufferedImage sourceImage, griddedImage; + static Image transformedImage; - if 
(DEBUG_ICON) { - System.out.println("Changing Icon5"); - } - addGrid(transformedImage, projectionValues[projectionChoice]); - } - - - public static void main(String[] args) throws IOException { - readData(); - //Schedule a job for the event-dispatching thread: - //creating and showing this application's GUI. - javax.swing.SwingUtilities.invokeLater(new Runnable() { - public void run() { - createAndShowGUI(); - } - }); - } - - static void readData() throws IOException { - File file = new File("classes/jsp/"); - System.out.println(file.getAbsolutePath()); - - BufferedReader br = new BufferedReader( - new InputStreamReader( - new FileInputStream("bin/jsp/Globe.txt"), - "UTF-8")); - //BagFormatter.openUTF8Reader("classes/jsp/", "Globe.txt"); - String pat = "([^;]+) \\s* [;] \\s* " - + "([0-9.]+) [°]* \\s* ([0-9.]+)? [']* \\s* ([0-9.]+)? [\"]* \\s* " - + "([NS]) \\s* " - + "([0-9.]+) [°]* \\s* ([0-9.]+)? [']* \\s* ([0-9.]+)? [\"]* \\s* " - + "([EW]) \\s*"; - Matcher m = Pattern.compile(pat, Pattern.CASE_INSENSITIVE | Pattern.COMMENTS).matcher(""); - System.out.println("Pattern: " + pat); - List nameData = new ArrayList(); - List posData = new ArrayList(); - String[] pieces = new String[3]; - while (true) { - String line = br.readLine(); - if (line == null) { - break; - } - if (!m.reset(line).matches()) { - System.out.println("Error in data: " + line); - continue; - } - nameData.add(m.group(1)); - double latitude = Navigator.toRadians(Double.parseDouble(m.group(2)), - m.group(3) != null ? Double.parseDouble(m.group(3)) : 0, - m.group(4) != null ? Double.parseDouble(m.group(4)) : 0, - m.group(5).equalsIgnoreCase("S")); - double longitude = Navigator.toRadians(Double.parseDouble(m.group(6)), - m.group(7) != null ? Double.parseDouble(m.group(7)) : 0, - m.group(8) != null ? 
Double.parseDouble(m.group(8)) : 0, - m.group(9).equalsIgnoreCase("E")); - posData.add(new double[] {latitude, longitude}); - System.out.println(m.group(1) + ", " + latitude + ", " + longitude); - } - originList = (String[])nameData.toArray(originList); - origins = (double[][])posData.toArray(origins); - br.close(); - } - - public static final NumberFormat nf = NumberFormat.getInstance(); - - public static void getAntipode(DPoint in) { - if (in.x > 0) { - in.x -= Math.PI; - } else { - in.x += Math.PI; - } - in.y = -in.y; - } + static int gradations = 10; - public static class DPoint implements Comparable { - double x, y; - DPoint() { - this(0,0); - } - DPoint(double x, double y) { - set(x,y); - } - DPoint set(double x, double y) { - this.x = x; - this.y = y; - return this; - } - public int compareTo(Object o) { - DPoint that = (DPoint)o; - if (y < that.y) { - return -1; - } - if (y > that.y) { - return 1; - } - if (x < that.x) { - return -1; - } - if (x > that.x) { - return 1; - } - return 0; - } - public String toString() { - return "[" + nf.format(x) + "," + nf.format(y) + "]"; - } - } - - public static class DRectangle { - double x0, y0, x1, y1; - } - - public static class Quad { - DRectangle containing = new DRectangle(); - DPoint[] p = new DPoint[4]; - - // returns the amount (0..1) that the square from x,y to x+1, y+1 - // overlaps the quadralateral - void set(DPoint a, DPoint b, DPoint c, DPoint d) { - p[0] = a; - p[1] = b; - p[2] = c; - p[3] = d; - // sort; so y's are now in sorted order - Arrays.sort(p); - // integer bounding rectangle - // is easy for y's - containing.y0 = (int)Math.floor(p[0].y); - containing.y1 = (int)Math.ceil(p[3].y); - // but for x's we have to compute - containing.x0 = (int)Math.floor(Math.min(p[0].x, Math.min(p[1].x, Math.min(p[2].x, p[3].x)))); - containing.x1 = (int)Math.ceil(Math.max(p[0].x, Math.max(p[1].x, Math.max(p[2].x, p[3].x)))); - } - double getWeight(double x, double y) { - // return the percentage overlap between this 
quadralateral, - // and the rectangle from x,y to x+1,y+1 - // simple implementation for now. return 1 if center is in containing, otherwise 0 - if (containing.x0 <= x && x < containing.x1 - && containing.y0 <= y && y < containing.y1) { - return 1.0; - } - return 0; - } - } + private static void createAndShowGUI() { + if (false) { + Mapper m = new Mapper(3, 7, 100, 140); + System.out.println(m.map(3) + ", " + m.map(7)); + new Transform.Tester().test(); + return; + } - static public abstract class Transform { - static final boolean debug = false; - protected double srcW, srcH, dstW, dstH; - Mapper srcW_long, srcH_lat, long_dstW, lat_dstH; - Navigator navigator; - boolean allowRotation = true; - Shape clip = null; + // cldrFactory = CLDRFile.Factory.make(Utility.MAIN_DIRECTORY,".*"); + + // Make sure we have nice window decorations. + JFrame.setDefaultLookAndFeelDecorated(true); + + // Create and set up the window. + frame = new JFrame("HelloWorldSwing"); + frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); + + // Add the ubiquitous "Hello World" label. 
+ // JLabel label = new JLabel("Hello World"); + // frame.getContentPane().add(label); + String sname = SOURCE_DIR + "earth-living.jpg"; + // "ev11656_land_shallow_topo_8192.tiff"; + // "ev11656_land_shallow_topo_8192.PNG"; + // earthmap1k.jpg"; + // "C:/Documents and Settings/Administrator/Desktop/macchiato-backup/distance/worldmap.jpg" + loadSourceMap(sname); + + JPanel topPanel = new JPanel(); + topPanel.setLayout(new FlowLayout(FlowLayout.LEADING)); + + JButton but = new JButton("Save"); + but.addActionListener( + new ActionListener() { + public void actionPerformed(ActionEvent e) { + saveFileChooser.setSelectedFile( + new File( + "Earth " + + projectionList[projectionChoice] + + ", " + + sizeList[sizeChoice] + + ", " + + gridList[gridChoice] + + ", " + + originList[originChoice] + + ".jpg")); + int returnVal = saveFileChooser.showSaveDialog(frame); + if (returnVal == JFileChooser.APPROVE_OPTION) { + try { + String filename = + saveFileChooser.getSelectedFile().getCanonicalPath(); + System.out.println("You chose to save this file: " + filename); + writeImage(griddedImage, filename, QUALITY); + // myPictures + "new-earth-living" + style + ".jpg" + } catch (IOException e1) { + System.out.println("Couldn't save file."); + } + } + } + }); + topPanel.add(but); + + JButton but2 = new JButton("Load"); + but2.addActionListener( + new ActionListener() { + public void actionPerformed(ActionEvent e) { + + // File file = new File(SOURCE_DIR + "earth-living.jpg"); + // try { + // System.out.println("Source Dir: " + file.getCanonicalPath()); + // } catch (IOException e2) { + // } + // loadFileChooser.setSelectedFile(file); + int returnVal = loadFileChooser.showOpenDialog(frame); + if (returnVal == JFileChooser.APPROVE_OPTION) { + try { + String filename = + loadFileChooser.getSelectedFile().getCanonicalPath(); + if (!filename.toLowerCase().endsWith(".jpg")) { + filename += ".jpg"; + } + System.out.println("You chose to open this file: " + filename); + 
loadSourceMap(filename); + // myPictures + "new-earth-living" + style + ".jpg" + } catch (IOException e1) { + System.out.println("Couldn't save file."); + } + } + } + }); + topPanel.add(but2); + + JComboBox box = new JComboBox(projectionList); + box.addActionListener( + new ActionListener() { + public void actionPerformed(ActionEvent e) { + int index = ((JComboBox) e.getSource()).getSelectedIndex(); + if (index != projectionChoice) { + style = projectionChoice = index; + changeImage(frame); + } + } + }); + topPanel.add(box); + + JComboBox box2 = new JComboBox(sizeList); + box2.addActionListener( + new ActionListener() { + public void actionPerformed(ActionEvent e) { + int index = ((JComboBox) e.getSource()).getSelectedIndex(); + if (index != sizeChoice) { + sizeChoice = index; + changeImage(frame); + } + } + }); + topPanel.add(box2); + String[] gradationNames = new String[gradations * 2 - 1]; + for (int i = 0; i < gradationNames.length; ++i) { + gradationNames[i] = + i < gradations - 1 + ? "Lighten to " + ((i + 1) * 100 / gradations) + : i == gradations - 1 + ? 
"Neutral" + : "Darken to " + ((gradations * 2 - i - 1) * 100 / gradations); + } + JComboBox box3 = new JComboBox(gradationNames); + box3.setSelectedIndex(gradations - 1); + box3.addActionListener( + new ActionListener() { + int lastIndex = gradations - 1; + + public void actionPerformed(ActionEvent e) { + int index = ((JComboBox) e.getSource()).getSelectedIndex(); + if (index != lastIndex) { + lastIndex = index; + lightness = (gradations - 1 - index) / (double) gradations; + changeImage(frame); + } + } + }); + topPanel.add(box3); + + JComboBox box4 = new JComboBox(originList); + // box4.setSelectedIndex(1); + box4.addActionListener( + new ActionListener() { + public void actionPerformed(ActionEvent e) { + int index = ((JComboBox) e.getSource()).getSelectedIndex(); + if (index != originChoice) { + originChoice = index; + originLat = origins[index][0]; + originLong = origins[index][1]; + if (projectionValues[projectionChoice].usesOrigin()) { + changeImage(frame); + } else { + addGrid(transformedImage, projectionValues[projectionChoice]); + // + } + } + } + }); + topPanel.add(box4); + + JComboBox box5 = new JComboBox(gridList); + // box4.setSelectedIndex(1); + box5.addActionListener( + new ActionListener() { + public void actionPerformed(ActionEvent e) { + int index = ((JComboBox) e.getSource()).getSelectedIndex(); + if (index != gridChoice) { + gridChoice = index; + degreeInterval = (index + 1) * 5; + addGrid(transformedImage, projectionValues[projectionChoice]); + // changeImage(frame); + } + } + }); + topPanel.add(box5); + + JComboBox box6 = new JComboBox(labelList); + // box4.setSelectedIndex(1); + box6.addActionListener( + new ActionListener() { + public void actionPerformed(ActionEvent e) { + int index = ((JComboBox) e.getSource()).getSelectedIndex(); + if ((index == 1) != doLabels) { + doLabels = index == 1; + addGrid(transformedImage, projectionValues[projectionChoice]); + // changeImage(frame); + } + } + }); + topPanel.add(box6); + + box7 = new 
JComboBox(localeList); + box7.setFont(font); + // setLocale(0); + // box4.setSelectedIndex(1); + box7.addActionListener( + new ActionListener() { + public void actionPerformed(ActionEvent e) { + int index = ((JComboBox) e.getSource()).getSelectedIndex(); + if (index != currentLocaleIndex) { + // setLocale(index); + ((JComboBox) e.getSource()).setSelectedIndex(index); + changeImage(frame); + } + } + }); + topPanel.add(box7); + + JPanel panel = new JPanel(); + panel.setLayout(new BorderLayout()); + panel.add(topPanel, BorderLayout.PAGE_START); + panel.add(mainPicture, BorderLayout.CENTER); + panel.add( + new JLabel("See http://www.3dsoftware.com/Cartography/USGS/MapProjections/"), + BorderLayout.PAGE_END); + + JScrollPane scrollPane = new JScrollPane(panel); + scrollPane.setPreferredSize(new Dimension(660, 540)); + // add(scrollPane, BorderLayout.CENTER); + + frame.getContentPane().add(scrollPane); + + // Display the window. + frame.pack(); + frame.setVisible(true); + // writeImage(i, myPictures + "new-earth-living" + style + ".jpg", QUALITY); + } + + // private static void setLocale(int newLocaleIndex) { + // cldrFile = cldrFactory.make(localeList[newLocaleIndex], true); + // tzf = new TimezoneFormatter(cldrFactory, localeList[newLocaleIndex], true); + // currentLocaleIndex = newLocaleIndex; + // MutableComboBoxModel model = (MutableComboBoxModel) box7.getModel(); + // for (int i = 0; i < localeList.length; ++i) { + // model.removeElementAt(i); + // model.insertElementAt(cldrFile.getName(localeList[i], true), i); + // } + // } + static Font font = Font.decode("Arial Unicode MS-9"); + // static CLDRFile.Factory cldrFactory; + // static CLDRFile cldrFile; + static int currentLocaleIndex = -1; + // static TimezoneFormatter tzf; + static JComboBox box7; + + /* + public static class LightingImageFilter implements RGBImageFilter { + /** + * Subclasses must specify a method to convert a single input pixel + * in the default RGB ColorModel to a single output pixel. 
+ * @param x, y the coordinates of the pixel + * @param rgb the integer pixel representation in the default RGB + * color model + * @return a filtered pixel in the default RGB color model. + * @see ColorModel#getRGBdefault + * @see #filterRGBPixels + * / + public int filterRGB(int x, int y, int rgb) { + int a = (rgb >>> 24) & 0xFF; + int r = (rgb >> 16) & 0xFF; + int g = (rgb >> 8) & 0xFF; + int b = rgb & 0xFF; + return (rgb & 0xFF) + + } + } + */ /** - * @return Returns the clip. + * @param sname */ - public Shape getClip() { - if (clip == null) { - clip = _getClip(); - } - return clip; + private static void loadSourceMap(String sname) { + try { + System.out.println("Check: " + new File(sname).getAbsolutePath()); + ImageIcon sourceIcon = new ImageIcon(sname); + sourceImage = convertToBuffered(sourceIcon.getImage()); + System.out.println("Loaded " + new File(sname).getCanonicalPath()); + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException("Can't load"); + } + changeImage(frame); } /** - * @return + * @param frame */ - public boolean usesOrigin() { - return false; + private static final boolean DEBUG_ICON = false; + + private static void changeImage(JFrame frame) { + if (DEBUG_ICON) { + System.out.println("Changing Icon1"); + } + // System.out.println("Width " + ii.getIconWidth() + ", Height: " + ii.getIconHeight()); + DeformFilter filter = + new DeformFilter( + sourceImage.getWidth(), + sourceImage.getHeight(), + sizeValues[sizeChoice][0], + sizeValues[sizeChoice][1], + projectionValues[projectionChoice]); + // ImageFilter filter = new RotateFilter(Math.PI / 4); + + if (DEBUG_ICON) { + System.out.println("Changing Icon2"); + } + ImageProducer ip = + new FilteredImageSource(sourceImage.getSource(), filter); // modifies filter + if (DEBUG_ICON) { + System.out.println("Changing Icon3"); + } + transformedImage = frame.createImage(ip); + if (DEBUG_ICON) { + System.out.println("Changing Icon4"); + // Icon junk = new ImageIcon(transformedImage); 
// load image (HACK) + } + + if (DEBUG_ICON) { + System.out.println("Changing Icon5"); + } + addGrid(transformedImage, projectionValues[projectionChoice]); } - // must set before use - Transform set(double srcW, double srcH, double dstW, double dstH) { - this.srcW = srcW; - this.srcH = srcH; - this.dstW = dstW; - this.dstH = dstH; - srcW_long = new Mapper(0, srcW, -Math.PI, Math.PI); - srcH_lat = new Mapper(0, srcH, -Math.PI/2, Math.PI/2); - long_dstW = new Mapper(-Math.PI, Math.PI, 0, dstW); - lat_dstH = new Mapper(-Math.PI/2, Math.PI/2, 0, dstH); - navigator = new Navigator().setLat1Lon1(originLat, originLong); - clip = null; - return this; - } - // Remember that the coordinate system is upside down so apply - // the transform as if the angle were negated. - // cos(-angle) = cos(angle) - // sin(-angle) = -sin(angle) - public final boolean transform(double x, double y, DPoint retcoord) { - retcoord.x = srcW_long.map(x); - retcoord.y = srcH_lat.map(y); - if (allowRotation && rotate) { - navigator.setLat2Lon2(retcoord.y, retcoord.x); - double dist = navigator.getDistance(); - double course = -navigator.getCourse(); - double offset = Math.PI/2; - if (dist > Math.PI/2) { - dist = Math.PI - dist; - offset = -offset; - course = -course; - } - navigator.setLat2Lon2(retcoord.y, retcoord.x); - retcoord.x = dist; - retcoord.y = course; - } - _transform(retcoord); - retcoord.x = long_dstW.map(retcoord.x); - retcoord.y = lat_dstH.map(retcoord.y); - return retcoord.x >= 0.0 && retcoord.x <= dstW && retcoord.y >= 0 && retcoord.y <= dstH; + + public static void main(String[] args) throws IOException { + readData(); + // Schedule a job for the event-dispatching thread: + // creating and showing this application's GUI. 
+ javax.swing.SwingUtilities.invokeLater( + new Runnable() { + public void run() { + createAndShowGUI(); + } + }); + } + + static void readData() throws IOException { + File file = new File("classes/jsp/"); + System.out.println(file.getAbsolutePath()); + + BufferedReader br = + new BufferedReader( + new InputStreamReader(new FileInputStream("bin/jsp/Globe.txt"), "UTF-8")); + // BagFormatter.openUTF8Reader("classes/jsp/", "Globe.txt"); + String pat = + "([^;]+) \\s* [;] \\s* " + + "([0-9.]+) [°]* \\s* ([0-9.]+)? [']* \\s* ([0-9.]+)? [\"]* \\s* " + + "([NS]) \\s* " + + "([0-9.]+) [°]* \\s* ([0-9.]+)? [']* \\s* ([0-9.]+)? [\"]* \\s* " + + "([EW]) \\s*"; + Matcher m = Pattern.compile(pat, Pattern.CASE_INSENSITIVE | Pattern.COMMENTS).matcher(""); + System.out.println("Pattern: " + pat); + List nameData = new ArrayList(); + List posData = new ArrayList(); + String[] pieces = new String[3]; + while (true) { + String line = br.readLine(); + if (line == null) { + break; + } + if (!m.reset(line).matches()) { + System.out.println("Error in data: " + line); + continue; + } + nameData.add(m.group(1)); + double latitude = + Navigator.toRadians( + Double.parseDouble(m.group(2)), + m.group(3) != null ? Double.parseDouble(m.group(3)) : 0, + m.group(4) != null ? Double.parseDouble(m.group(4)) : 0, + m.group(5).equalsIgnoreCase("S")); + double longitude = + Navigator.toRadians( + Double.parseDouble(m.group(6)), + m.group(7) != null ? Double.parseDouble(m.group(7)) : 0, + m.group(8) != null ? Double.parseDouble(m.group(8)) : 0, + m.group(9).equalsIgnoreCase("E")); + posData.add(new double[] {latitude, longitude}); + System.out.println(m.group(1) + ", " + latitude + ", " + longitude); + } + originList = (String[]) nameData.toArray(originList); + origins = (double[][]) posData.toArray(origins); + br.close(); } - // Remember that the coordinate system is upside down so apply - // the transform as if the angle were negated. 
Since inverting - // the transform is also the same as negating the angle, itransform - // is calculated the way you would expect to calculate transform. - public final boolean itransform(double x, double y, DPoint retcoord) { - retcoord.x = long_dstW.back(x); - retcoord.y = lat_dstH.back(y); - _itransform(retcoord); - if (allowRotation && rotate) { - //retcoord.x = Navigator.wrap(retcoord.x + originLong, -Math.PI, Math.PI); - //retcoord.x = Navigator.wrap(retcoord.x, -Math.PI, Math.PI); - //System.out.println(); - //System.out.println("lat: " + Navigator.degrees(retcoord.y) + ", lon:" + Navigator.degrees(retcoord.x)); - navigator.setDistanceCourse(retcoord.y, retcoord.x); - retcoord.y = navigator.getLat2(); - retcoord.x = -navigator.getLon2(); - //System.out.println("lat: " + Navigator.degrees(retcoord.y) + ", lon:" + Navigator.degrees(retcoord.x)); - } - retcoord.x = srcW_long.back(retcoord.x); - retcoord.y = srcH_lat.back(retcoord.y); - return retcoord.x >= 0.0 && retcoord.x <= srcW && retcoord.y >= 0 && retcoord.y <= srcH; + + public static final NumberFormat nf = NumberFormat.getInstance(); + + public static void getAntipode(DPoint in) { + if (in.x > 0) { + in.x -= Math.PI; + } else { + in.x += Math.PI; + } + in.y = -in.y; } - /** - * @param input and output: latitude in y (radians from -pi/2 to pi/2) and longitude in x (radians from -pi to pi) - */ - abstract protected void _transform(DPoint retcoord); - /** - * @param input and output: latitude in y (radians from -pi/2 to pi/2) and longitude in x (radians from -pi to pi) - */ - abstract protected void _itransform(DPoint retcoord); - abstract protected Shape _getClip(); - /** - * @param style - * @return - */ - public static class Tester { - Transform trans; - DPoint retcoord = new DPoint(); - DPoint minX, minY, maxX, maxY; - void test() { - for (int k = 0; k < projectionValues.length; ++k) { - test(projectionValues[k]); - } - } - private void test(Transform trans) { - System.out.println("Testing: " + 
trans.getClass().getName()); - // check that points in the source rectangle map to the target rectangle - trans.set(10, 10, 100, 150); - int counter = 0; - minX = new DPoint(Double.MAX_VALUE, Double.MAX_VALUE); - minY = new DPoint(Double.MAX_VALUE, Double.MAX_VALUE); - maxX = new DPoint(Double.MIN_VALUE, Double.MIN_VALUE); - maxY = new DPoint(Double.MIN_VALUE, Double.MIN_VALUE); - double pLong = trans.srcW_long.back(originLong); - double pLat = trans.srcH_lat.back(originLat); - - trans.transform(pLong, pLat, retcoord); - trans.transform(trans.srcW/2, trans.srcH/2, retcoord); - for (double x = 0; x < trans.srcW; ++x) { - for (double y = 0; y < trans.srcH; ++y) { - counter++; - trans.transform(x, y, retcoord); - double x2 = retcoord.x; - double y2 = retcoord.y; - if (x2 < minX.x) { - minX.set(x2,y2); - } - if (x2 > maxX.x) { - maxX.set(x2,y2); + public static class DPoint implements Comparable { + double x, y; + + DPoint() { + this(0, 0); + } + + DPoint(double x, double y) { + set(x, y); + } + + DPoint set(double x, double y) { + this.x = x; + this.y = y; + return this; + } + + public int compareTo(Object o) { + DPoint that = (DPoint) o; + if (y < that.y) { + return -1; } - if (y2 < minY.y) { - minY.set(x2,y2); + if (y > that.y) { + return 1; } - if (y2 > maxY.y) { - maxY.set(x2,y2); + if (x < that.x) { + return -1; } - if (0 <= x2 && x2 < trans.dstW && 0 <= y2 && y2 < trans.dstH) { - trans.itransform(x2, y2, retcoord); - double x3 = retcoord.x; - double y3 = retcoord.y; - if (Math.abs(x - x3) > 0.001 || Math.abs(y - y3) > 0.001) { - System.out.println("Error: " + counter + "\t" + x + ", " + y - + " => " + x2 + ", " + y2 - + " => " + x3 + ", " + y3 - ); - } + if (x > that.x) { + return 1; } - } - } - System.out.println("\t minX " + minX - + ",\t maxX " + maxX - + ",\t minY " + minY - + ",\t maxY " + maxY - ); - } - } - } - - static public class TransformPlateCarree extends Transform { - public void _transform(DPoint retcoord) { - // nothing - } - public void 
_itransform(DPoint retcoord) { - // nothing - } - /* (non-Javadoc) - * @see Globe.Transform#_getClip() - */ - protected Shape _getClip() { - return new Rectangle.Double(0,0,dstW,dstH); - } - } + return 0; + } - static public class TransformSinusoidal extends Transform { - public void _transform(DPoint retcoord) { - retcoord.x = retcoord.x * Math.cos(retcoord.y); - } - public void _itransform(DPoint retcoord) { - if (!(-Math.PI <= retcoord.x && retcoord.x <= Math.PI)) { - retcoord.x = Double.NaN; - return; - } - retcoord.x = retcoord.x / Math.cos(retcoord.y); - } - protected Shape _getClip() { - GeneralPath p = new GeneralPath(); - p.moveTo((float)(dstW/2),0); - double limitx = srcW_long.map(0); - for (int i = 1; i <= dstH; ++i) { - double y = lat_dstH.back(i); - //System.out.println(i + ", " + y + ", " + Math.cos(y)); - double x = limitx * Math.cos(y); - x = long_dstW.map(x); - p.lineTo((float)x,i); - } - limitx = srcW_long.map(srcW); - for (int i = (int)dstH - 1; i >= 0; --i) { - double y = lat_dstH.back(i); - double x = limitx * Math.cos(y); - x = long_dstW.map(x); - p.lineTo((float)x,i); - } - return p; - } - } - - static public class Transform3DIsometric extends Transform { - static double SHIFT = Math.PI/3; - public void _transform(DPoint retcoord) { - // special shift - retcoord.x = Navigator.wrap(retcoord.x - SHIFT, -Math.PI, Math.PI); - // regular stuff - boolean shift = retcoord.x < -Math.PI/2 || retcoord.x > Math.PI/2; - double offset = shift ? 
Math.PI/2 : -Math.PI/2; - double cosy = Math.cos(retcoord.y); - retcoord.y = Math.sin(retcoord.y) * (Math.PI/2); - retcoord.x = Math.sin(retcoord.x) * cosy * Math.PI / 2 + offset; - if (shift) { - retcoord.x = Math.PI - retcoord.x; - } - } - public void _itransform(DPoint retcoord) { - retcoord.x *= 2; - if (retcoord.x < 0) { - retcoord.x += Math.PI; - retcoord.y = Math.asin(retcoord.y / (Math.PI/2)) ; - retcoord.x = Math.asin(retcoord.x / Math.cos(retcoord.y) / Math.PI); - } else { - retcoord.x -= Math.PI; - retcoord.y = Math.asin(retcoord.y / (Math.PI/2)) ; - retcoord.x = Math.asin(retcoord.x / Math.cos(retcoord.y) / Math.PI); - retcoord.x += Math.PI; - if (retcoord.x > Math.PI) { - retcoord.x -= 2*Math.PI; - } - } - retcoord.x = Navigator.wrap(retcoord.x + SHIFT, -Math.PI, Math.PI); - } - protected Shape _getClip() { - GeneralPath p = new GeneralPath(new Ellipse2D.Double(0,0,dstW/2,dstH)); - p.append(new Ellipse2D.Double(dstW/2,0,dstW/2,dstH), false); - return p; - } - } - - static public class TransformEqualAreaEllipse extends Transform { - boolean debugTemp = false; - TransformEqualAreaEllipse() { - if (debugTemp) { - double[][] tests = {{0, -Math.PI/2}, {0, -Math.PI/4}, {0, -Math.PI/8}, {0, 0}, {0, Math.PI/8}, {0, Math.PI/4}, {0, Math.PI/2}}; - for (int i = 0; i < tests.length; ++i) { - DPoint p = new DPoint(tests[i][0], tests[i][1]); - System.out.println(p); - _itransform(p); - System.out.println(" => " + p); - } - for (double x = -1; x <= 1; x += 0.1) { - double y = oddFunction(x); - double xx = inverseOddFunction(y); - System.out.println("x: " + x + "\ty: " + y + "\txx: " + xx); - } - debugTemp = false; - } - } - // Area of a spherical cap is 2 pi r^2 (1-sin(lat)) - // Area of a circular segment is r^2 ( acos(p) - p sqrt(1-p^2)), where p = dist to chord/r - // Thus we get the itransform easily: - // asin(2/pi (acos p - p sqrt(1-p^2)) - public void _transform(DPoint retcoord) { - double temp2 = Math.PI/2 *(1 - Math.sin(retcoord.y)); - double p = 
inverseOddFunction(temp2); - retcoord.y = (Math.PI/2) * p; - double temp = Math.sqrt(1-p*p); - retcoord.x = temp * retcoord.x; - } - public void _itransform(DPoint retcoord) { - double p = retcoord.y / (Math.PI/2); - if (debugTemp) { - System.out.println("\tp:\t" + p); - } - double temp = Math.sqrt(1-p*p); - if (debugTemp) { - System.out.println("\ttemp:\t" + temp); - } - double temp2 = (Math.acos(p) - p * temp); - if (debugTemp) { - System.out.println("\ttemp2:\t" + temp2); - } - double newy = Math.asin(1- (2/Math.PI) * temp2); - if (debugTemp) { - System.out.println("\tnewy:\t" + newy); - } - double newx = retcoord.x / temp; - retcoord.y = newy; - retcoord.x = newx; - } - protected Shape _getClip() { - return new Ellipse2D.Double(0,0,dstW,dstH); - } - /** - * @param in -1..1 - * @return value in 0..PI - */ - public double oddFunction(double p) { - double temp = Math.sqrt(1-p*p); - return (Math.acos(p) - p * temp); - } - public double oddFunctionDerivative(double p) { - double temp = Math.sqrt(1-p*p); - return (-2-p+p*p)/temp; - } - static final double epsilon = 0.0001; - final double lowValue = oddFunction(-1); - final double highValue = oddFunction(1); - public double inverseOddFunction(double pp) { - // ugly, have to approximate. Use newton's method. 
- double pLow = pp - epsilon; - double pHigh = pp + epsilon; - // for the first guess, use high and low bounds - // (guess - low) / (high - low) = (pp - lowV) / (highV - lowV); - double guess = -1 + (1 - -1) * (pp - lowValue) / (highValue - lowValue); - while (true) { - double p = oddFunction(guess); - if (pLow < p && p < pHigh) { - return guess; - } - guess = guess - (p - pp) / oddFunctionDerivative(guess); - if (debugTemp) { - System.out.println("newGuess: " + guess); - } - } + public String toString() { + return "[" + nf.format(x) + "," + nf.format(y) + "]"; + } } - } - static public class TransformGallOrthographic extends Transform { - public void _transform(DPoint retcoord) { - retcoord.y = Math.sin(retcoord.y) * (Math.PI/2) ; // transform to -1..1, then -PI/2..PI/2 - } + public static class DRectangle { + double x0, y0, x1, y1; + } + + public static class Quad { + DRectangle containing = new DRectangle(); + DPoint[] p = new DPoint[4]; + + // returns the amount (0..1) that the square from x,y to x+1, y+1 + // overlaps the quadralateral + void set(DPoint a, DPoint b, DPoint c, DPoint d) { + p[0] = a; + p[1] = b; + p[2] = c; + p[3] = d; + // sort; so y's are now in sorted order + Arrays.sort(p); + // integer bounding rectangle + // is easy for y's + containing.y0 = (int) Math.floor(p[0].y); + containing.y1 = (int) Math.ceil(p[3].y); + // but for x's we have to compute + containing.x0 = + (int) Math.floor(Math.min(p[0].x, Math.min(p[1].x, Math.min(p[2].x, p[3].x)))); + containing.x1 = + (int) Math.ceil(Math.max(p[0].x, Math.max(p[1].x, Math.max(p[2].x, p[3].x)))); + } - public void _itransform(DPoint retcoord) { - retcoord.y = Math.asin(retcoord.y / (Math.PI/2)); // transform to -1..1 - } - protected Shape _getClip() { - return new Rectangle.Double(0,0,dstW,dstH); - } - } - - static public class TransformEquidistantConic extends Transform { - {allowRotation = false;} - public void _transform(DPoint retcoord) { - // divide into two cases - 
navigator.setLat2Lon2(retcoord.y, retcoord.x); - double dist = navigator.getDistance(); - double course = -navigator.getCourse(); - double offset = Math.PI/2; - if (dist > Math.PI/2) { - dist = Math.PI - dist; - offset = -offset; - course = -course; - } - retcoord.x = Math.sin(course) * dist - offset; - retcoord.y = Math.cos(course) * dist; + double getWeight(double x, double y) { + // return the percentage overlap between this quadralateral, + // and the rectangle from x,y to x+1,y+1 + // simple implementation for now. return 1 if center is in containing, otherwise 0 + if (containing.x0 <= x + && x < containing.x1 + && containing.y0 <= y + && y < containing.y1) { + return 1.0; + } + return 0; + } } - public void _itransform(DPoint retcoord) { - double x2 = retcoord.x; - double y2 = retcoord.y; - double dist, course; - if (x2 < 0) { - x2 += Math.PI/2; // re-center - dist = Math.sqrt(x2 * x2 + y2 * y2); - if (dist > Math.PI/2) { - retcoord.x = Double.NaN; - return; - } - course = -Math.atan2(x2, y2); - } else { - x2 -= Math.PI/2; // re-center - dist = Math.sqrt(x2 * x2 + y2 * y2); - dist = Math.PI - dist; - if (dist < Math.PI/2) { - retcoord.x = Double.NaN; - return; - } - course = Math.atan2(x2, y2); - } - navigator.setDistanceCourse(dist, course); - retcoord.y = navigator.getLat2(); - retcoord.x = navigator.getLon2(); + public abstract static class Transform { + static final boolean debug = false; + protected double srcW, srcH, dstW, dstH; + Mapper srcW_long, srcH_lat, long_dstW, lat_dstH; + Navigator navigator; + boolean allowRotation = true; + Shape clip = null; + + /** + * @return Returns the clip. 
+ */ + public Shape getClip() { + if (clip == null) { + clip = _getClip(); + } + return clip; + } + /** + * @return + */ + public boolean usesOrigin() { + return false; + } + // must set before use + Transform set(double srcW, double srcH, double dstW, double dstH) { + this.srcW = srcW; + this.srcH = srcH; + this.dstW = dstW; + this.dstH = dstH; + srcW_long = new Mapper(0, srcW, -Math.PI, Math.PI); + srcH_lat = new Mapper(0, srcH, -Math.PI / 2, Math.PI / 2); + long_dstW = new Mapper(-Math.PI, Math.PI, 0, dstW); + lat_dstH = new Mapper(-Math.PI / 2, Math.PI / 2, 0, dstH); + navigator = new Navigator().setLat1Lon1(originLat, originLong); + clip = null; + return this; + } + // Remember that the coordinate system is upside down so apply + // the transform as if the angle were negated. + // cos(-angle) = cos(angle) + // sin(-angle) = -sin(angle) + public final boolean transform(double x, double y, DPoint retcoord) { + retcoord.x = srcW_long.map(x); + retcoord.y = srcH_lat.map(y); + if (allowRotation && rotate) { + navigator.setLat2Lon2(retcoord.y, retcoord.x); + double dist = navigator.getDistance(); + double course = -navigator.getCourse(); + double offset = Math.PI / 2; + if (dist > Math.PI / 2) { + dist = Math.PI - dist; + offset = -offset; + course = -course; + } + navigator.setLat2Lon2(retcoord.y, retcoord.x); + retcoord.x = dist; + retcoord.y = course; + } + _transform(retcoord); + retcoord.x = long_dstW.map(retcoord.x); + retcoord.y = lat_dstH.map(retcoord.y); + return retcoord.x >= 0.0 && retcoord.x <= dstW && retcoord.y >= 0 && retcoord.y <= dstH; + } + // Remember that the coordinate system is upside down so apply + // the transform as if the angle were negated. Since inverting + // the transform is also the same as negating the angle, itransform + // is calculated the way you would expect to calculate transform. 
+ public final boolean itransform(double x, double y, DPoint retcoord) { + retcoord.x = long_dstW.back(x); + retcoord.y = lat_dstH.back(y); + _itransform(retcoord); + if (allowRotation && rotate) { + // retcoord.x = Navigator.wrap(retcoord.x + originLong, -Math.PI, Math.PI); + // retcoord.x = Navigator.wrap(retcoord.x, -Math.PI, Math.PI); + // System.out.println(); + // System.out.println("lat: " + Navigator.degrees(retcoord.y) + ", lon:" + + // Navigator.degrees(retcoord.x)); + navigator.setDistanceCourse(retcoord.y, retcoord.x); + retcoord.y = navigator.getLat2(); + retcoord.x = -navigator.getLon2(); + // System.out.println("lat: " + Navigator.degrees(retcoord.y) + ", lon:" + + // Navigator.degrees(retcoord.x)); + } + retcoord.x = srcW_long.back(retcoord.x); + retcoord.y = srcH_lat.back(retcoord.y); + return retcoord.x >= 0.0 && retcoord.x <= srcW && retcoord.y >= 0 && retcoord.y <= srcH; + } + /** + * @param input and output: latitude in y (radians from -pi/2 to pi/2) and longitude in x + * (radians from -pi to pi) + */ + protected abstract void _transform(DPoint retcoord); + /** + * @param input and output: latitude in y (radians from -pi/2 to pi/2) and longitude in x + * (radians from -pi to pi) + */ + protected abstract void _itransform(DPoint retcoord); + + protected abstract Shape _getClip(); + /** + * @param style + * @return + */ + public static class Tester { + Transform trans; + DPoint retcoord = new DPoint(); + DPoint minX, minY, maxX, maxY; + + void test() { + for (int k = 0; k < projectionValues.length; ++k) { + test(projectionValues[k]); + } + } + + private void test(Transform trans) { + System.out.println("Testing: " + trans.getClass().getName()); + // check that points in the source rectangle map to the target rectangle + trans.set(10, 10, 100, 150); + int counter = 0; + minX = new DPoint(Double.MAX_VALUE, Double.MAX_VALUE); + minY = new DPoint(Double.MAX_VALUE, Double.MAX_VALUE); + maxX = new DPoint(Double.MIN_VALUE, Double.MIN_VALUE); + maxY = 
new DPoint(Double.MIN_VALUE, Double.MIN_VALUE); + double pLong = trans.srcW_long.back(originLong); + double pLat = trans.srcH_lat.back(originLat); + + trans.transform(pLong, pLat, retcoord); + trans.transform(trans.srcW / 2, trans.srcH / 2, retcoord); + for (double x = 0; x < trans.srcW; ++x) { + for (double y = 0; y < trans.srcH; ++y) { + counter++; + trans.transform(x, y, retcoord); + double x2 = retcoord.x; + double y2 = retcoord.y; + if (x2 < minX.x) { + minX.set(x2, y2); + } + if (x2 > maxX.x) { + maxX.set(x2, y2); + } + if (y2 < minY.y) { + minY.set(x2, y2); + } + if (y2 > maxY.y) { + maxY.set(x2, y2); + } + if (0 <= x2 && x2 < trans.dstW && 0 <= y2 && y2 < trans.dstH) { + trans.itransform(x2, y2, retcoord); + double x3 = retcoord.x; + double y3 = retcoord.y; + if (Math.abs(x - x3) > 0.001 || Math.abs(y - y3) > 0.001) { + System.out.println( + "Error: " + counter + "\t" + x + ", " + y + " => " + x2 + + ", " + y2 + " => " + x3 + ", " + y3); + } + } + } + } + System.out.println( + "\t minX " + + minX + + ",\t maxX " + + maxX + + ",\t minY " + + minY + + ",\t maxY " + + maxY); + } + } } - protected Shape _getClip() { - GeneralPath p = new GeneralPath(new Ellipse2D.Double(0,0,dstW/2,dstH)); - p.append(new Ellipse2D.Double(dstW/2,0,dstW/2,dstH), false); - return p; + + public static class TransformPlateCarree extends Transform { + public void _transform(DPoint retcoord) { + // nothing + } + + public void _itransform(DPoint retcoord) { + // nothing + } + /* (non-Javadoc) + * @see Globe.Transform#_getClip() + */ + protected Shape _getClip() { + return new Rectangle.Double(0, 0, dstW, dstH); + } } - public boolean usesOrigin() { - return true; + + public static class TransformSinusoidal extends Transform { + public void _transform(DPoint retcoord) { + retcoord.x = retcoord.x * Math.cos(retcoord.y); + } + + public void _itransform(DPoint retcoord) { + if (!(-Math.PI <= retcoord.x && retcoord.x <= Math.PI)) { + retcoord.x = Double.NaN; + return; + } + retcoord.x = 
retcoord.x / Math.cos(retcoord.y); + } + + protected Shape _getClip() { + GeneralPath p = new GeneralPath(); + p.moveTo((float) (dstW / 2), 0); + double limitx = srcW_long.map(0); + for (int i = 1; i <= dstH; ++i) { + double y = lat_dstH.back(i); + // System.out.println(i + ", " + y + ", " + Math.cos(y)); + double x = limitx * Math.cos(y); + x = long_dstW.map(x); + p.lineTo((float) x, i); + } + limitx = srcW_long.map(srcW); + for (int i = (int) dstH - 1; i >= 0; --i) { + double y = lat_dstH.back(i); + double x = limitx * Math.cos(y); + x = long_dstW.map(x); + p.lineTo((float) x, i); + } + return p; + } } - } - static class Mapper { - private double slope, offset; - Mapper(double sourceMin, double sourceMax, double targetMin, double targetMax) { - slope = (targetMax - targetMin) / (sourceMax - sourceMin); - offset = targetMin - slope * sourceMin; + public static class Transform3DIsometric extends Transform { + static double SHIFT = Math.PI / 3; + + public void _transform(DPoint retcoord) { + // special shift + retcoord.x = Navigator.wrap(retcoord.x - SHIFT, -Math.PI, Math.PI); + // regular stuff + boolean shift = retcoord.x < -Math.PI / 2 || retcoord.x > Math.PI / 2; + double offset = shift ? 
Math.PI / 2 : -Math.PI / 2; + double cosy = Math.cos(retcoord.y); + retcoord.y = Math.sin(retcoord.y) * (Math.PI / 2); + retcoord.x = Math.sin(retcoord.x) * cosy * Math.PI / 2 + offset; + if (shift) { + retcoord.x = Math.PI - retcoord.x; + } + } + + public void _itransform(DPoint retcoord) { + retcoord.x *= 2; + if (retcoord.x < 0) { + retcoord.x += Math.PI; + retcoord.y = Math.asin(retcoord.y / (Math.PI / 2)); + retcoord.x = Math.asin(retcoord.x / Math.cos(retcoord.y) / Math.PI); + } else { + retcoord.x -= Math.PI; + retcoord.y = Math.asin(retcoord.y / (Math.PI / 2)); + retcoord.x = Math.asin(retcoord.x / Math.cos(retcoord.y) / Math.PI); + retcoord.x += Math.PI; + if (retcoord.x > Math.PI) { + retcoord.x -= 2 * Math.PI; + } + } + retcoord.x = Navigator.wrap(retcoord.x + SHIFT, -Math.PI, Math.PI); + } + + protected Shape _getClip() { + GeneralPath p = new GeneralPath(new Ellipse2D.Double(0, 0, dstW / 2, dstH)); + p.append(new Ellipse2D.Double(dstW / 2, 0, dstW / 2, dstH), false); + return p; + } } - double map(double in) { - return in * slope + offset; + + public static class TransformEqualAreaEllipse extends Transform { + boolean debugTemp = false; + + TransformEqualAreaEllipse() { + if (debugTemp) { + double[][] tests = { + {0, -Math.PI / 2}, + {0, -Math.PI / 4}, + {0, -Math.PI / 8}, + {0, 0}, + {0, Math.PI / 8}, + {0, Math.PI / 4}, + {0, Math.PI / 2} + }; + for (int i = 0; i < tests.length; ++i) { + DPoint p = new DPoint(tests[i][0], tests[i][1]); + System.out.println(p); + _itransform(p); + System.out.println(" => " + p); + } + for (double x = -1; x <= 1; x += 0.1) { + double y = oddFunction(x); + double xx = inverseOddFunction(y); + System.out.println("x: " + x + "\ty: " + y + "\txx: " + xx); + } + debugTemp = false; + } + } + // Area of a spherical cap is 2 pi r^2 (1-sin(lat)) + // Area of a circular segment is r^2 ( acos(p) - p sqrt(1-p^2)), where p = dist to chord/r + // Thus we get the itransform easily: + // asin(2/pi (acos p - p sqrt(1-p^2)) + public 
void _transform(DPoint retcoord) { + double temp2 = Math.PI / 2 * (1 - Math.sin(retcoord.y)); + double p = inverseOddFunction(temp2); + retcoord.y = (Math.PI / 2) * p; + double temp = Math.sqrt(1 - p * p); + retcoord.x = temp * retcoord.x; + } + + public void _itransform(DPoint retcoord) { + double p = retcoord.y / (Math.PI / 2); + if (debugTemp) { + System.out.println("\tp:\t" + p); + } + double temp = Math.sqrt(1 - p * p); + if (debugTemp) { + System.out.println("\ttemp:\t" + temp); + } + double temp2 = (Math.acos(p) - p * temp); + if (debugTemp) { + System.out.println("\ttemp2:\t" + temp2); + } + double newy = Math.asin(1 - (2 / Math.PI) * temp2); + if (debugTemp) { + System.out.println("\tnewy:\t" + newy); + } + double newx = retcoord.x / temp; + retcoord.y = newy; + retcoord.x = newx; + } + + protected Shape _getClip() { + return new Ellipse2D.Double(0, 0, dstW, dstH); + } + /** + * @param in -1..1 + * @return value in 0..PI + */ + public double oddFunction(double p) { + double temp = Math.sqrt(1 - p * p); + return (Math.acos(p) - p * temp); + } + + public double oddFunctionDerivative(double p) { + double temp = Math.sqrt(1 - p * p); + return (-2 - p + p * p) / temp; + } + + static final double epsilon = 0.0001; + final double lowValue = oddFunction(-1); + final double highValue = oddFunction(1); + + public double inverseOddFunction(double pp) { + // ugly, have to approximate. Use newton's method. 
+ double pLow = pp - epsilon; + double pHigh = pp + epsilon; + // for the first guess, use high and low bounds + // (guess - low) / (high - low) = (pp - lowV) / (highV - lowV); + double guess = -1 + (1 - -1) * (pp - lowValue) / (highValue - lowValue); + while (true) { + double p = oddFunction(guess); + if (pLow < p && p < pHigh) { + return guess; + } + guess = guess - (p - pp) / oddFunctionDerivative(guess); + if (debugTemp) { + System.out.println("newGuess: " + guess); + } + } + } } - double back(double out) { - return (out - offset)/slope; + + public static class TransformGallOrthographic extends Transform { + public void _transform(DPoint retcoord) { + retcoord.y = + Math.sin(retcoord.y) * (Math.PI / 2); // transform to -1..1, then -PI/2..PI/2 + } + + public void _itransform(DPoint retcoord) { + retcoord.y = Math.asin(retcoord.y / (Math.PI / 2)); // transform to -1..1 + } + + protected Shape _getClip() { + return new Rectangle.Double(0, 0, dstW, dstH); + } } - } - static public class DeformFilter extends ImageFilter { + public static class TransformEquidistantConic extends Transform { + { + allowRotation = false; + } - private static ColorModel defaultRGB = ColorModel.getRGBdefault(); + public void _transform(DPoint retcoord) { + // divide into two cases + navigator.setLat2Lon2(retcoord.y, retcoord.x); + double dist = navigator.getDistance(); + double course = -navigator.getCourse(); + double offset = Math.PI / 2; + if (dist > Math.PI / 2) { + dist = Math.PI - dist; + offset = -offset; + course = -course; + } + retcoord.x = Math.sin(course) * dist - offset; + retcoord.y = Math.cos(course) * dist; + } - private DPoint coord = new DPoint(); + public void _itransform(DPoint retcoord) { + double x2 = retcoord.x; + double y2 = retcoord.y; + double dist, course; + if (x2 < 0) { + x2 += Math.PI / 2; // re-center + dist = Math.sqrt(x2 * x2 + y2 * y2); + if (dist > Math.PI / 2) { + retcoord.x = Double.NaN; + return; + } + course = -Math.atan2(x2, y2); + } else { + x2 -= 
Math.PI / 2; // re-center + dist = Math.sqrt(x2 * x2 + y2 * y2); + dist = Math.PI - dist; + if (dist < Math.PI / 2) { + retcoord.x = Double.NaN; + return; + } + course = Math.atan2(x2, y2); + } + navigator.setDistanceCourse(dist, course); + retcoord.y = navigator.getLat2(); + retcoord.x = navigator.getLon2(); + } - private int raster[]; + protected Shape _getClip() { + GeneralPath p = new GeneralPath(new Ellipse2D.Double(0, 0, dstW / 2, dstH)); + p.append(new Ellipse2D.Double(dstW / 2, 0, dstW / 2, dstH), false); + return p; + } - private int xoffset, yoffset; + public boolean usesOrigin() { + return true; + } + } - private int srcW, srcH; + static class Mapper { + private double slope, offset; - private int dstW, dstH; + Mapper(double sourceMin, double sourceMax, double targetMin, double targetMax) { + slope = (targetMax - targetMin) / (sourceMax - sourceMin); + offset = targetMin - slope * sourceMin; + } - int style; + double map(double in) { + return in * slope + offset; + } - DeformFilter(int srcW, int srcH, int width, int height, Transform trans) { - dstW = width; - dstH = height; - this.trans = trans; - trans.set(srcW, srcH, dstW, dstH); - //this.style = style; + double back(double out) { + return (out - offset) / slope; + } } - Transform trans; - - public void transformBBox(Rectangle rect) { - double minx = Double.POSITIVE_INFINITY; - double miny = Double.POSITIVE_INFINITY; - double maxx = Double.NEGATIVE_INFINITY; - double maxy = Double.NEGATIVE_INFINITY; - for (int y = 0; y <= 1; y++) { - for (int x = 0; x <= 1; x++) { - trans.transform(rect.x + x * rect.width, - rect.y + y * rect.height, coord); - minx = Math.min(minx, coord.x); - miny = Math.min(miny, coord.y); - maxx = Math.max(maxx, coord.x); - maxy = Math.max(maxy, coord.y); - } - } - rect.x = (int) Math.floor(minx); - rect.y = (int) Math.floor(miny); - rect.width = (int) Math.ceil(maxx) - rect.x + 1; - rect.height = (int) Math.ceil(maxy) - rect.y + 1; - } + public static class DeformFilter extends 
ImageFilter { - public void setDimensions(int width, int height) { - srcW = width; - srcH = height; - Rectangle rect = new Rectangle(0, 0, dstW, dstH); - xoffset = -rect.x; - yoffset = -rect.y; - raster = new int[srcW * srcH]; - consumer.setDimensions(dstW, dstH); - - // for debugging - debug = false; - for (int i = 0; i <= rect.width; i += rect.width / 4) { - for (int j = 0; j <= rect.height; j += rect.height / 4) { - trans.transform(i, j, coord); - double i2 = coord.x; - double j2 = coord.y; - trans.itransform(i2, j2, coord); - if (debug) { - System.out.println(i + ", " + j + "\t=> " + i2 + ", " + j2 - + "\t=> " + coord.x + ", " + coord.y); - } - } - } - debug = false; + private static ColorModel defaultRGB = ColorModel.getRGBdefault(); - } - static boolean debug = false; + private DPoint coord = new DPoint(); - public void setColorModel(ColorModel model) { - consumer.setColorModel(defaultRGB); - } + private int raster[]; - public void setHints(int hintflags) { - consumer.setHints(TOPDOWNLEFTRIGHT | COMPLETESCANLINES | SINGLEPASS - | (hintflags & SINGLEFRAME)); - } + private int xoffset, yoffset; + private int srcW, srcH; + private int dstW, dstH; - public void setPixels(int x, int y, int w, int h, ColorModel model, - byte pixels[], int off, int scansize) { - int srcoff = off; - int dstoff = y * srcW + x; - for (int yc = 0; yc < h; yc++) { - for (int xc = 0; xc < w; xc++) { - raster[dstoff++] = model.getRGB(pixels[srcoff++] & 0xff); - } - srcoff += (scansize - w); - dstoff += (srcW - w); - } - } + int style; - public void setPixels(int x, int y, int w, int h, ColorModel model, - int pixels[], int off, int scansize) { - int srcoff = off; - int dstoff = y * srcW + x; - if (model == defaultRGB) { - for (int yc = 0; yc < h; yc++) { - System.arraycopy(pixels, srcoff, raster, dstoff, w); - srcoff += scansize; - dstoff += srcW; - } - } else { - for (int yc = 0; yc < h; yc++) { - for (int xc = 0; xc < w; xc++) { - raster[dstoff++] = model.getRGB(pixels[srcoff++]); - } - 
srcoff += (scansize - w); - dstoff += (srcW - w); - } - } - } + DeformFilter(int srcW, int srcH, int width, int height, Transform trans) { + dstW = width; + dstH = height; + this.trans = trans; + trans.set(srcW, srcH, dstW, dstH); + // this.style = style; + } - public void imageComplete(int status) { - if (status == IMAGEERROR || status == IMAGEABORTED) { - consumer.imageComplete(status); - return; - } - int pixels[] = new int[dstW]; - Quad q = new Quad(); - DPoint coord00 = new DPoint(), coord10 = new DPoint(), coord11 = new DPoint(), coord01 = new DPoint(); - double r, g, b, a, w; - boolean changeLightness = false; - double mainProportion = 0, otherProportion = 0; - if (lightness != 0) { - changeLightness = true; - if (lightness < 0) { - mainProportion = (1 + lightness); // 0 = 1, -1 = 0 - // other is zero - } else { - mainProportion = (1 - lightness); // 0 = 1, 1 = 0 - otherProportion = 0xFF * (1 - mainProportion); - } - } - boolean[] topOk = new boolean[dstW]; - double[] topRowX = new double[dstW]; - double[] topRowY = new double[dstW]; - boolean[] bottomOk = new boolean[dstW]; - double[] bottomRowX = new double[dstW]; - double[] bottomRowY = new double[dstW]; - - fillRow(dstW, 0, bottomOk, bottomRowX, bottomRowY); - - for (int dy = 0; dy < dstH; dy++) { - // exchange rows - boolean[] temp = bottomOk; - bottomOk = topOk; - topOk = temp; - double[] temp2 = bottomRowX; - bottomRowX = topRowX; - topRowX = temp2; - temp2 = bottomRowY; - bottomRowY = topRowY; - topRowY = temp2; - // and fill - fillRow(dstW, dy+1, bottomOk, bottomRowX, bottomRowY); - for (int dx = 0; dx < dstW-1; dx++) { - // optimize later - - // find the corners of the destination pixel in source space - pixels[dx] = 0; - /* - if (false) { - int i = (int)Math.round(coord00.x); - int j = (int)Math.round(coord00.y); - if (i < 0 || j < 0 || i >= srcW || j >= srcH) { - pixels[dx] = 0; - } else { - pixels[dx] = raster[j * srcW + i]; - } + Transform trans; + + public void transformBBox(Rectangle rect) { 
+ double minx = Double.POSITIVE_INFINITY; + double miny = Double.POSITIVE_INFINITY; + double maxx = Double.NEGATIVE_INFINITY; + double maxy = Double.NEGATIVE_INFINITY; + for (int y = 0; y <= 1; y++) { + for (int x = 0; x <= 1; x++) { + trans.transform(rect.x + x * rect.width, rect.y + y * rect.height, coord); + minx = Math.min(minx, coord.x); + miny = Math.min(miny, coord.y); + maxx = Math.max(maxx, coord.x); + maxy = Math.max(maxy, coord.y); + } + } + rect.x = (int) Math.floor(minx); + rect.y = (int) Math.floor(miny); + rect.width = (int) Math.ceil(maxx) - rect.x + 1; + rect.height = (int) Math.ceil(maxy) - rect.y + 1; + } + + public void setDimensions(int width, int height) { + srcW = width; + srcH = height; + Rectangle rect = new Rectangle(0, 0, dstW, dstH); + xoffset = -rect.x; + yoffset = -rect.y; + raster = new int[srcW * srcH]; + consumer.setDimensions(dstW, dstH); + + // for debugging + debug = false; + for (int i = 0; i <= rect.width; i += rect.width / 4) { + for (int j = 0; j <= rect.height; j += rect.height / 4) { + trans.transform(i, j, coord); + double i2 = coord.x; + double j2 = coord.y; + trans.itransform(i2, j2, coord); + if (debug) { + System.out.println( + i + ", " + j + "\t=> " + i2 + ", " + j2 + "\t=> " + coord.x + ", " + + coord.y); + } + } + } + debug = false; + } + + static boolean debug = false; + + public void setColorModel(ColorModel model) { + consumer.setColorModel(defaultRGB); + } + + public void setHints(int hintflags) { + consumer.setHints( + TOPDOWNLEFTRIGHT | COMPLETESCANLINES | SINGLEPASS | (hintflags & SINGLEFRAME)); + } + + public void setPixels( + int x, + int y, + int w, + int h, + ColorModel model, + byte pixels[], + int off, + int scansize) { + int srcoff = off; + int dstoff = y * srcW + x; + for (int yc = 0; yc < h; yc++) { + for (int xc = 0; xc < w; xc++) { + raster[dstoff++] = model.getRGB(pixels[srcoff++] & 0xff); + } + srcoff += (scansize - w); + dstoff += (srcW - w); + } + } + + public void setPixels( + int x, int y, 
int w, int h, ColorModel model, int pixels[], int off, int scansize) { + int srcoff = off; + int dstoff = y * srcW + x; + if (model == defaultRGB) { + for (int yc = 0; yc < h; yc++) { + System.arraycopy(pixels, srcoff, raster, dstoff, w); + srcoff += scansize; + dstoff += srcW; + } + } else { + for (int yc = 0; yc < h; yc++) { + for (int xc = 0; xc < w; xc++) { + raster[dstoff++] = model.getRGB(pixels[srcoff++]); + } + srcoff += (scansize - w); + dstoff += (srcW - w); + } + } + } + + public void imageComplete(int status) { + if (status == IMAGEERROR || status == IMAGEABORTED) { + consumer.imageComplete(status); + return; + } + int pixels[] = new int[dstW]; + Quad q = new Quad(); + DPoint coord00 = new DPoint(), + coord10 = new DPoint(), + coord11 = new DPoint(), + coord01 = new DPoint(); + double r, g, b, a, w; + boolean changeLightness = false; + double mainProportion = 0, otherProportion = 0; + if (lightness != 0) { + changeLightness = true; + if (lightness < 0) { + mainProportion = (1 + lightness); // 0 = 1, -1 = 0 + // other is zero + } else { + mainProportion = (1 - lightness); // 0 = 1, 1 = 0 + otherProportion = 0xFF * (1 - mainProportion); + } + } + boolean[] topOk = new boolean[dstW]; + double[] topRowX = new double[dstW]; + double[] topRowY = new double[dstW]; + boolean[] bottomOk = new boolean[dstW]; + double[] bottomRowX = new double[dstW]; + double[] bottomRowY = new double[dstW]; + + fillRow(dstW, 0, bottomOk, bottomRowX, bottomRowY); + + for (int dy = 0; dy < dstH; dy++) { + // exchange rows + boolean[] temp = bottomOk; + bottomOk = topOk; + topOk = temp; + double[] temp2 = bottomRowX; + bottomRowX = topRowX; + topRowX = temp2; + temp2 = bottomRowY; + bottomRowY = topRowY; + topRowY = temp2; + // and fill + fillRow(dstW, dy + 1, bottomOk, bottomRowX, bottomRowY); + for (int dx = 0; dx < dstW - 1; dx++) { + // optimize later + + // find the corners of the destination pixel in source space + pixels[dx] = 0; + /* + if (false) { + int i = 
(int)Math.round(coord00.x); + int j = (int)Math.round(coord00.y); + if (i < 0 || j < 0 || i >= srcW || j >= srcH) { + pixels[dx] = 0; + } else { + pixels[dx] = raster[j * srcW + i]; + } + continue; + } + if (!toptrans.itransform(dx+1, dy, coord10)) continue; + if (!trans.itransform(dx+1, dy+1, coord11)) continue; + if (!trans.itransform(dx, dy+1, coord01)) continue; + */ + if (!topOk[dx] || !topOk[dx + 1] || !bottomOk[dx] || !bottomOk[dx + 1]) { + // pixels[dx] = 0xFFFFFFFF; continue; } - if (!toptrans.itransform(dx+1, dy, coord10)) continue; - if (!trans.itransform(dx+1, dy+1, coord11)) continue; - if (!trans.itransform(dx, dy+1, coord01)) continue; - */ - if (!topOk[dx] || !topOk[dx+1] || !bottomOk[dx] || !bottomOk[dx+1]) { - //pixels[dx] = 0xFFFFFFFF; - continue; - } - coord00.x = topRowX[dx]; - coord00.y = topRowY[dx]; - coord10.x = topRowX[dx+1]; - coord10.y = topRowY[dx+1]; - coord01.x = bottomRowX[dx]; - coord01.y = bottomRowY[dx]; - coord11.x = bottomRowX[dx+1]; - coord11.y = bottomRowY[dx+1]; - - q.set(coord00, coord10, coord11, coord01); - - // add up the weighted colors - r = g = b = a = w = 0; - int xx0 = (int)q.containing.x0; - int xx1 = (int)q.containing.x1; - int yy0 = (int)q.containing.y0; - int yy1 = (int)q.containing.y1; - for (int x0 = xx0; x0 < xx1; ++x0) { - for (int y0 = yy0; y0 < yy1; ++y0) { - double weight; - //weight = q.getWeight(x0, y0); - weight = 1; - if (weight == 0.0) { - continue; - } - w += weight; - if (x0 < 0 || y0 < 0 || x0 >= srcW || y0 >= srcH) { - continue; - } - int color = raster[y0 * srcW + x0]; - a += ((color>>24)&0xFF)*weight; - r += ((color>>16)&0xFF)*weight; - g += ((color>>8)&0xFF)*weight; - b += ((color)&0xFF)*weight; + coord00.x = topRowX[dx]; + coord00.y = topRowY[dx]; + coord10.x = topRowX[dx + 1]; + coord10.y = topRowY[dx + 1]; + coord01.x = bottomRowX[dx]; + coord01.y = bottomRowY[dx]; + coord11.x = bottomRowX[dx + 1]; + coord11.y = bottomRowY[dx + 1]; + + q.set(coord00, coord10, coord11, coord01); + + // add up 
the weighted colors + r = g = b = a = w = 0; + int xx0 = (int) q.containing.x0; + int xx1 = (int) q.containing.x1; + int yy0 = (int) q.containing.y0; + int yy1 = (int) q.containing.y1; + for (int x0 = xx0; x0 < xx1; ++x0) { + for (int y0 = yy0; y0 < yy1; ++y0) { + double weight; + // weight = q.getWeight(x0, y0); + weight = 1; + if (weight == 0.0) { + continue; + } + w += weight; + if (x0 < 0 || y0 < 0 || x0 >= srcW || y0 >= srcH) { + continue; + } + int color = raster[y0 * srcW + x0]; + a += ((color >> 24) & 0xFF) * weight; + r += ((color >> 16) & 0xFF) * weight; + g += ((color >> 8) & 0xFF) * weight; + b += ((color) & 0xFF) * weight; + } + } + // average: + r /= w; + g /= w; + b /= w; + a /= w; + + if (changeLightness) { + r = mainProportion * r + otherProportion; + g = mainProportion * g + otherProportion; + b = mainProportion * b + otherProportion; + a = mainProportion * a + otherProportion; + } + + pixels[dx] = + ((int) Math.max(0, Math.min(0xFF, Math.round(a))) << 24) + | ((int) Math.max(0, Math.min(0xFF, Math.round(r))) << 16) + | ((int) Math.max(0, Math.min(0xFF, Math.round(g))) << 8) + | ((int) Math.max(0, Math.min(0xFF, Math.round(b)))); + } + consumer.setPixels(0, dy, dstW, 1, defaultRGB, pixels, 0, dstW); + if ((dy % 50) == 0) { + System.out.println(dy); + } } - } - // average: - r /= w; - g /= w; - b /= w; - a /= w; - - if (changeLightness) { - r = mainProportion*r + otherProportion; - g = mainProportion*g + otherProportion; - b = mainProportion*b + otherProportion; - a = mainProportion*a + otherProportion; - } - - pixels[dx] = - ((int)Math.max(0, Math.min(0xFF, Math.round(a))) << 24) | - ((int)Math.max(0, Math.min(0xFF, Math.round(r))) << 16) | - ((int)Math.max(0, Math.min(0xFF, Math.round(g))) << 8) | - ((int)Math.max(0, Math.min(0xFF, Math.round(b)))); - } - consumer.setPixels(0, dy, dstW, 1, defaultRGB, pixels, 0, dstW); - if ((dy % 50) == 0) { - System.out.println(dy); - } - } - consumer.imageComplete(status); - } + consumer.imageComplete(status); 
+ } - /** - * @param i - * @param dstW2 - * @param j - * @param rowX - * @param rowY - */ - private void fillRow(int xLimit, int dy, boolean[] ok, double[] rowX, double[] rowY) { - DPoint coord00 = new DPoint(); - for (int dx = 0; dx < xLimit; dx++) { - ok[dx] = trans.itransform(dx, dy, coord00); - rowX[dx] = coord00.x; - rowY[dx] = coord00.y; - } + /** + * @param i + * @param dstW2 + * @param j + * @param rowX + * @param rowY + */ + private void fillRow(int xLimit, int dy, boolean[] ok, double[] rowX, double[] rowY) { + DPoint coord00 = new DPoint(); + for (int dx = 0; dx < xLimit; dx++) { + ok[dx] = trans.itransform(dx, dy, coord00); + rowX[dx] = coord00.x; + rowY[dx] = coord00.y; + } + } } - } - - static public class RotateFilter extends ImageFilter { - private static ColorModel defaultRGB = ColorModel.getRGBdefault(); + public static class RotateFilter extends ImageFilter { - private double angle; + private static ColorModel defaultRGB = ColorModel.getRGBdefault(); - private double sin; + private double angle; - private double cos; + private double sin; - private double coord[] = new double[2]; + private double cos; - private int raster[]; + private double coord[] = new double[2]; - private int xoffset, yoffset; + private int raster[]; - private int srcW, srcH; + private int xoffset, yoffset; + private int srcW, srcH; + private int dstW, dstH; - private int dstW, dstH; + public RotateFilter(double angle) { + this.angle = angle; + sin = Math.sin(angle); + cos = Math.cos(angle); + } - public RotateFilter(double angle) { - this.angle = angle; - sin = Math.sin(angle); - cos = Math.cos(angle); - } + public void transform(double x, double y, double[] retcoord) { + // Remember that the coordinate system is upside down so apply + // the transform as if the angle were negated. 
+ // cos(-angle) = cos(angle) + // sin(-angle) = -sin(angle) + retcoord[0] = cos * x + sin * y; + retcoord[1] = cos * y - sin * x; + } - public void transform(double x, double y, double[] retcoord) { - // Remember that the coordinate system is upside down so apply - // the transform as if the angle were negated. - // cos(-angle) = cos(angle) - // sin(-angle) = -sin(angle) - retcoord[0] = cos * x + sin * y; - retcoord[1] = cos * y - sin * x; - } + public void itransform(double x, double y, double[] retcoord) { + // Remember that the coordinate system is upside down so apply + // the transform as if the angle were negated. Since inverting + // the transform is also the same as negating the angle, itransform + // is calculated the way you would expect to calculate transform. + retcoord[0] = cos * x - sin * y; + retcoord[1] = cos * y + sin * x; + } - public void itransform(double x, double y, double[] retcoord) { - // Remember that the coordinate system is upside down so apply - // the transform as if the angle were negated. Since inverting - // the transform is also the same as negating the angle, itransform - // is calculated the way you would expect to calculate transform. 
- retcoord[0] = cos * x - sin * y; - retcoord[1] = cos * y + sin * x; - } + public void transformBBox(Rectangle rect) { + double minx = Double.POSITIVE_INFINITY; + double miny = Double.POSITIVE_INFINITY; + double maxx = Double.NEGATIVE_INFINITY; + double maxy = Double.NEGATIVE_INFINITY; + for (int y = 0; y <= 1; y++) { + for (int x = 0; x <= 1; x++) { + transform(rect.x + x * rect.width, rect.y + y * rect.height, coord); + minx = Math.min(minx, coord[0]); + miny = Math.min(miny, coord[1]); + maxx = Math.max(maxx, coord[0]); + maxy = Math.max(maxy, coord[1]); + } + } + rect.x = (int) Math.floor(minx); + rect.y = (int) Math.floor(miny); + rect.width = (int) Math.ceil(maxx) - rect.x + 1; + rect.height = (int) Math.ceil(maxy) - rect.y + 1; + } - public void transformBBox(Rectangle rect) { - double minx = Double.POSITIVE_INFINITY; - double miny = Double.POSITIVE_INFINITY; - double maxx = Double.NEGATIVE_INFINITY; - double maxy = Double.NEGATIVE_INFINITY; - for (int y = 0; y <= 1; y++) { - for (int x = 0; x <= 1; x++) { - transform(rect.x + x * rect.width, - rect.y + y * rect.height, coord); - minx = Math.min(minx, coord[0]); - miny = Math.min(miny, coord[1]); - maxx = Math.max(maxx, coord[0]); - maxy = Math.max(maxy, coord[1]); - } - } - rect.x = (int) Math.floor(minx); - rect.y = (int) Math.floor(miny); - rect.width = (int) Math.ceil(maxx) - rect.x + 1; - rect.height = (int) Math.ceil(maxy) - rect.y + 1; - } + public void setDimensions(int width, int height) { + Rectangle rect = new Rectangle(0, 0, width, height); + transformBBox(rect); + xoffset = -rect.x; + yoffset = -rect.y; + srcW = width; + srcH = height; + dstW = rect.width; + dstH = rect.height; + raster = new int[srcW * srcH]; + consumer.setDimensions(dstW, dstH); + } - public void setDimensions(int width, int height) { - Rectangle rect = new Rectangle(0, 0, width, height); - transformBBox(rect); - xoffset = -rect.x; - yoffset = -rect.y; - srcW = width; - srcH = height; - dstW = rect.width; - dstH = 
rect.height; - raster = new int[srcW * srcH]; - consumer.setDimensions(dstW, dstH); - } + public void setColorModel(ColorModel model) { + consumer.setColorModel(defaultRGB); + } - public void setColorModel(ColorModel model) { - consumer.setColorModel(defaultRGB); - } + public void setHints(int hintflags) { + consumer.setHints( + TOPDOWNLEFTRIGHT | COMPLETESCANLINES | SINGLEPASS | (hintflags & SINGLEFRAME)); + } - public void setHints(int hintflags) { - consumer.setHints(TOPDOWNLEFTRIGHT | COMPLETESCANLINES | SINGLEPASS - | (hintflags & SINGLEFRAME)); - } + public void setPixels( + int x, + int y, + int w, + int h, + ColorModel model, + byte pixels[], + int off, + int scansize) { + int srcoff = off; + int dstoff = y * srcW + x; + for (int yc = 0; yc < h; yc++) { + for (int xc = 0; xc < w; xc++) { + raster[dstoff++] = model.getRGB(pixels[srcoff++] & 0xff); + } + srcoff += (scansize - w); + dstoff += (srcW - w); + } + } - public void setPixels(int x, int y, int w, int h, ColorModel model, - byte pixels[], int off, int scansize) { - int srcoff = off; - int dstoff = y * srcW + x; - for (int yc = 0; yc < h; yc++) { - for (int xc = 0; xc < w; xc++) { - raster[dstoff++] = model.getRGB(pixels[srcoff++] & 0xff); - } - srcoff += (scansize - w); - dstoff += (srcW - w); - } - } + public void setPixels( + int x, int y, int w, int h, ColorModel model, int pixels[], int off, int scansize) { + int srcoff = off; + int dstoff = y * srcW + x; + if (model == defaultRGB) { + for (int yc = 0; yc < h; yc++) { + System.arraycopy(pixels, srcoff, raster, dstoff, w); + srcoff += scansize; + dstoff += srcW; + } + } else { + for (int yc = 0; yc < h; yc++) { + for (int xc = 0; xc < w; xc++) { + raster[dstoff++] = model.getRGB(pixels[srcoff++]); + } + srcoff += (scansize - w); + dstoff += (srcW - w); + } + } + } - public void setPixels(int x, int y, int w, int h, ColorModel model, - int pixels[], int off, int scansize) { - int srcoff = off; - int dstoff = y * srcW + x; - if (model == defaultRGB) 
{ - for (int yc = 0; yc < h; yc++) { - System.arraycopy(pixels, srcoff, raster, dstoff, w); - srcoff += scansize; - dstoff += srcW; - } - } else { - for (int yc = 0; yc < h; yc++) { - for (int xc = 0; xc < w; xc++) { - raster[dstoff++] = model.getRGB(pixels[srcoff++]); - } - srcoff += (scansize - w); - dstoff += (srcW - w); - } - } + public void imageComplete(int status) { + if (status == IMAGEERROR || status == IMAGEABORTED) { + consumer.imageComplete(status); + return; + } + int pixels[] = new int[dstW]; + for (int dy = 0; dy < dstH; dy++) { + itransform(0 - xoffset, dy - yoffset, coord); + double x1 = coord[0]; + double y1 = coord[1]; + itransform(dstW - xoffset, dy - yoffset, coord); + double x2 = coord[0]; + double y2 = coord[1]; + double xinc = (x2 - x1) / dstW; + double yinc = (y2 - y1) / dstW; + for (int dx = 0; dx < dstW; dx++) { + int sx = (int) Math.round(x1); + int sy = (int) Math.round(y1); + if (sx < 0 || sy < 0 || sx >= srcW || sy >= srcH) { + pixels[dx] = 0; + } else { + pixels[dx] = raster[sy * srcW + sx]; + } + x1 += xinc; + y1 += yinc; + } + consumer.setPixels(0, dy, dstW, 1, defaultRGB, pixels, 0, dstW); + } + consumer.imageComplete(status); + } } + /* + public static double convertDegreesToDecimal(double degrees, double minutes, double seconds, boolean NorthOrEast) { + double result = (degrees + minutes / 60 + seconds / 3600); + if (!NorthOrEast) result = -result; + return result; + } + */ + /* + public static void convertLongitudeLatitudeToWidthHeight(double longitude, double latitude, double width, double height, DPoint output) { + output.x = (longitude + 180)/360 * width; + output.y = (90 - latitude)/180 * height; + } + */ + /* + public static void convertPolarRadiansToWidthHeight(double longitudeR, double colatitudeR, double width, double height, DPoint output) { + // get in range + longitudeR += Math.PI; // origin on left + while (longitudeR < 0) longitudeR += Math.PI * 2; + while (longitudeR > Math.PI * 2) longitudeR -= Math.PI * 2; + 
output.x = longitudeR/(Math.PI * 2) * width; + output.y = colatitudeR/Math.PI * height; + } + */ + /* + public static void convertLongitudeLatitudeToPolarRadians(double longitude, double latitude, DPoint output) { + output.x = longitude/180 * Math.PI; + output.y = (90 - latitude)/180 * Math.PI; + } + */ + + public static BufferedImage convertToBuffered(Image image) { + int thumbWidth = image.getWidth(null); + int thumbHeight = image.getHeight(null); + BufferedImage thumbImage = + new BufferedImage(thumbWidth, thumbHeight, BufferedImage.TYPE_INT_RGB); + Graphics2D graphics2D = thumbImage.createGraphics(); + graphics2D.setRenderingHint( + RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BILINEAR); + graphics2D.drawImage(image, 0, 0, thumbWidth, thumbHeight, null); + return thumbImage; + } + + public static void addGrid(Image image, Transform trans) { + int thumbWidth = image.getWidth(null); + int thumbHeight = image.getHeight(null); + BufferedImage thumbImage = + new BufferedImage(thumbWidth, thumbHeight, BufferedImage.TYPE_INT_RGB); + Graphics2D graphics2D = thumbImage.createGraphics(); + graphics2D.setRenderingHint( + RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BILINEAR); + Color meridian = Color.red; + Color everyOtherLine = Color.orange; + if (lightness > 0) { + graphics2D.setClip(0, 0, thumbWidth, thumbHeight); + graphics2D.setColor( + new Color( + (int) (0xFF * lightness), + (int) (0xFF * lightness), + (int) (0xFF * lightness))); + graphics2D.fillRect(0, 0, thumbWidth, thumbHeight); + meridian = new Color(0xFF, (int) (0xFF * lightness), (int) (0xFF * lightness)); + } + graphics2D.setClip(trans.getClip()); + graphics2D.drawImage(image, 0, 0, thumbWidth, thumbHeight, null); + // Menlo Park 37? 28' 48" N 122? 
08' 39" W + + graphics2D.setRenderingHint( + RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON); + // double latitude = convertDegreesToDecimal(37.0, 28.0, 48.0, true); // N = + + // double longitude = convertDegreesToDecimal(122.0, 8.0, 39.0, false); // W = - + DPoint retCoord = new DPoint(); + + // drawPoint(graphics2D, trans, longitude, latitude); + + drawPoint(graphics2D, trans, Color.green, Color.white, originLong, originLat); + retCoord.x = originLong; + retCoord.y = originLat; + getAntipode(retCoord); + drawPoint(graphics2D, trans, Color.red, Color.white, retCoord.x, retCoord.y); + + graphics2D.setFont(font); + FontMetrics fm = graphics2D.getFontMetrics(); + + BasicStroke normal = new BasicStroke(1.0f / 3); + BasicStroke thick = new BasicStroke(2.0f / 3); + + if (true) { + // hack to draw circles + // convertLongitudeLatitudeToPolarRadians(originLong, originLat, retCoord); + // double longR = retCoord.x; + // double colatR = retCoord.y; + + // SphericalTriangle stri = new SphericalTriangle(); + Navigator navigator = new Navigator().setLat1Lon1(originLat, originLong); + int increment = 180 / degreeInterval; + int grain = 3; + int labelPosition = increment / 2; + + // circles of equal distance + double dInc = Math.PI / increment; + // double dInc2 = dInc/grain; + int distLimit = increment - 1; + int angleLimit = 2 * increment - 1; + int halfAngle = increment; + + /* + for (int distanceI = 1; distanceI <= distLimit; ++distanceI) { + if (distanceI == labelPosition) graphics2D.setColor(Color.black); + else graphics2D.setColor(Color.yellow); + double distance = dInc * distanceI; + double lat1 = 0, lon1 = 0; + for (int angleI = 0; angleI <= (angleLimit + 1) * grain; ++angleI) { + double angle = dInc2 * angleI; + //System.out.println("Distance: " + distance + "\tAngle: " + angle); + navigator.setDistanceCourse(distance, angle); + double lat2 = trans.srcH_lat.back(navigator.getLat2()); + double lon2 = trans.srcW_long.back(navigator.getLon2()); + 
//System.out.println("Distance: " + distance + "\tAngle: " + angle); + if (angleI != 0) drawLine(graphics2D, trans, lon2, lat2, lon1, lat1); + lat1 = lat2; + lon1 = lon2; + } + } + + // lines to antipode + for (int angleI = 0; angleI <= angleLimit; ++angleI) { + double angle = dInc * angleI; + double lat1 = 0, lon1 = 0; + if (angleI == 0) graphics2D.setColor(Color.black); + else graphics2D.setColor(Color.white); + for (int distanceI = grain; distanceI <= distLimit * grain; ++distanceI) { + double distance = dInc2 * distanceI; + //System.out.println("Distance: " + distance + "\tAngle: " + angle); + navigator.setDistanceCourse(distance, angle); + double lat2 = trans.srcH_lat.back(navigator.getLat2()); + double lon2 = trans.srcW_long.back(navigator.getLon2()); + //System.out.println("Distance: " + distance + "\tAngle: " + angle); + if (distanceI != grain) drawLine(graphics2D, trans, lon2, lat2, lon1, lat1); + lat1 = lat2; + lon1 = lon2; + } + */ + + // lines to the antipode + double gap = 0.02; + PathTransform pathTransform = new PathTransform(navigator, trans); + LineDrawer ld = new LineDrawer(graphics2D, pathTransform); + for (int angleI = 0; angleI <= angleLimit; ++angleI) { + double angle = dInc * angleI; + if (angleI == 0 || angleI == halfAngle) { + graphics2D.setColor(meridian); + graphics2D.setStroke(thick); + } else if ((angleI % 3) == 0) { + graphics2D.setColor(everyOtherLine); + graphics2D.setStroke(thick); + } else { + graphics2D.setColor(Color.white); + graphics2D.setStroke(normal); + } + pathTransform.setAngle(angle); + ld.draw(gap, 1 - gap); + } - public void imageComplete(int status) { - if (status == IMAGEERROR || status == IMAGEABORTED) { - consumer.imageComplete(status); - return; - } - int pixels[] = new int[dstW]; - for (int dy = 0; dy < dstH; dy++) { - itransform(0 - xoffset, dy - yoffset, coord); - double x1 = coord[0]; - double y1 = coord[1]; - itransform(dstW - xoffset, dy - yoffset, coord); - double x2 = coord[0]; - double y2 = coord[1]; - 
double xinc = (x2 - x1) / dstW; - double yinc = (y2 - y1) / dstW; - for (int dx = 0; dx < dstW; dx++) { - int sx = (int) Math.round(x1); - int sy = (int) Math.round(y1); - if (sx < 0 || sy < 0 || sx >= srcW || sy >= srcH) { - pixels[dx] = 0; - } else { - pixels[dx] = raster[sy * srcW + sx]; - } - x1 += xinc; - y1 += yinc; - } - consumer.setPixels(0, dy, dstW, 1, defaultRGB, pixels, 0, dstW); - } - consumer.imageComplete(status); - } - } - /* - public static double convertDegreesToDecimal(double degrees, double minutes, double seconds, boolean NorthOrEast) { - double result = (degrees + minutes / 60 + seconds / 3600); - if (!NorthOrEast) result = -result; - return result; - } - */ - /* - public static void convertLongitudeLatitudeToWidthHeight(double longitude, double latitude, double width, double height, DPoint output) { - output.x = (longitude + 180)/360 * width; - output.y = (90 - latitude)/180 * height; - } - */ - /* - public static void convertPolarRadiansToWidthHeight(double longitudeR, double colatitudeR, double width, double height, DPoint output) { - // get in range - longitudeR += Math.PI; // origin on left - while (longitudeR < 0) longitudeR += Math.PI * 2; - while (longitudeR > Math.PI * 2) longitudeR -= Math.PI * 2; - output.x = longitudeR/(Math.PI * 2) * width; - output.y = colatitudeR/Math.PI * height; - } - */ - /* - public static void convertLongitudeLatitudeToPolarRadians(double longitude, double latitude, DPoint output) { - output.x = longitude/180 * Math.PI; - output.y = (90 - latitude)/180 * Math.PI; - } - */ - - public static BufferedImage convertToBuffered(Image image) { - int thumbWidth = image.getWidth(null); - int thumbHeight = image.getHeight(null); - BufferedImage thumbImage = new BufferedImage(thumbWidth, thumbHeight, - BufferedImage.TYPE_INT_RGB); - Graphics2D graphics2D = thumbImage.createGraphics(); - graphics2D.setRenderingHint(RenderingHints.KEY_INTERPOLATION, - RenderingHints.VALUE_INTERPOLATION_BILINEAR); - 
graphics2D.drawImage(image, 0, 0, thumbWidth, thumbHeight, null); - return thumbImage; - } - - public static void addGrid(Image image, Transform trans) { - int thumbWidth = image.getWidth(null); - int thumbHeight = image.getHeight(null); - BufferedImage thumbImage = new BufferedImage(thumbWidth, thumbHeight, - BufferedImage.TYPE_INT_RGB); - Graphics2D graphics2D = thumbImage.createGraphics(); - graphics2D.setRenderingHint(RenderingHints.KEY_INTERPOLATION, - RenderingHints.VALUE_INTERPOLATION_BILINEAR); - Color meridian = Color.red; - Color everyOtherLine = Color.orange; - if (lightness > 0) { - graphics2D.setClip(0,0,thumbWidth, thumbHeight); - graphics2D.setColor(new Color((int)(0xFF * lightness), (int)(0xFF * lightness), (int)(0xFF * lightness))); - graphics2D.fillRect(0,0,thumbWidth, thumbHeight); - meridian = new Color(0xFF, (int)(0xFF * lightness), (int)(0xFF * lightness)); - } - graphics2D.setClip(trans.getClip()); - graphics2D.drawImage(image, 0, 0, thumbWidth, thumbHeight, null); - // Menlo Park 37? 28' 48" N 122? 
08' 39" W - - graphics2D.setRenderingHint(RenderingHints.KEY_ANTIALIASING,RenderingHints.VALUE_ANTIALIAS_ON); - //double latitude = convertDegreesToDecimal(37.0, 28.0, 48.0, true); // N = + - //double longitude = convertDegreesToDecimal(122.0, 8.0, 39.0, false); // W = - - DPoint retCoord = new DPoint(); - - //drawPoint(graphics2D, trans, longitude, latitude); - - - drawPoint(graphics2D, trans, Color.green, Color.white, originLong, originLat); - retCoord.x = originLong; - retCoord.y = originLat; - getAntipode(retCoord); - drawPoint(graphics2D, trans, Color.red, Color.white, retCoord.x, retCoord.y); - - graphics2D.setFont(font); - FontMetrics fm = graphics2D.getFontMetrics(); - - BasicStroke normal = new BasicStroke(1.0f/3); - BasicStroke thick = new BasicStroke(2.0f/3); - - if (true) { - // hack to draw circles - //convertLongitudeLatitudeToPolarRadians(originLong, originLat, retCoord); - //double longR = retCoord.x; - //double colatR = retCoord.y; - - //SphericalTriangle stri = new SphericalTriangle(); - Navigator navigator = new Navigator().setLat1Lon1(originLat, originLong); - int increment = 180/degreeInterval; - int grain = 3; - int labelPosition = increment / 2; - - // circles of equal distance - double dInc = Math.PI / increment; - //double dInc2 = dInc/grain; - int distLimit = increment - 1; - int angleLimit = 2 * increment - 1; - int halfAngle = increment; - - /* + AngleCircleTransform angleTransform = new AngleCircleTransform(navigator, trans); + ld = new LineDrawer(graphics2D, angleTransform); for (int distanceI = 1; distanceI <= distLimit; ++distanceI) { - if (distanceI == labelPosition) graphics2D.setColor(Color.black); - else graphics2D.setColor(Color.yellow); + if (distanceI == labelPosition) { + graphics2D.setColor(meridian); + graphics2D.setStroke(thick); + } else { + graphics2D.setColor(Color.white); + graphics2D.setStroke(normal); + } double distance = dInc * distanceI; - double lat1 = 0, lon1 = 0; - for (int angleI = 0; angleI <= (angleLimit + 
1) * grain; ++angleI) { - double angle = dInc2 * angleI; - //System.out.println("Distance: " + distance + "\tAngle: " + angle); - navigator.setDistanceCourse(distance, angle); - double lat2 = trans.srcH_lat.back(navigator.getLat2()); - double lon2 = trans.srcW_long.back(navigator.getLon2()); - //System.out.println("Distance: " + distance + "\tAngle: " + angle); - if (angleI != 0) drawLine(graphics2D, trans, lon2, lat2, lon1, lat1); - lat1 = lat2; - lon1 = lon2; + angleTransform.setDistance(distance); + ld.draw(0, 1); + } + + // if (doLabels) { + // graphics2D.setClip(null); + // StandardCodes sc = StandardCodes.make(); + // LabelPosition lp = new LabelPosition(graphics2D, trans.dstW, trans.dstH); + // Map zones = sc.getZoneData(); + // Set zkeys = sc.getGoodAvailableCodes("tzid"); + // Date now = new Date(); + // for (Iterator it = zkeys.iterator(); it.hasNext();) { + // String fullkey = (String) it.next(); + // List data = (List) zones.get(fullkey); + // //String key = tzf.getFormattedZone(fullkey,"vvvv", now.getTime(), false); // + // key.substring(key.lastIndexOf('/')+1); + // retCoord.y = ((Double)data.get(0)).doubleValue(); + // retCoord.y = Navigator.toRadians(retCoord.y, 0, 0, false); + // retCoord.x = ((Double)data.get(1)).doubleValue(); + // retCoord.x = Navigator.toRadians(retCoord.x, 0, 0, true); + // drawPoint(graphics2D, trans, Color.white, Color.red, retCoord.x, retCoord.y, + // null); + // lp.add(trans, retCoord.x, retCoord.y, key); + // } + // lp.draw(); + // } + /* + graphics2D.setColor(Color.red); + graphics2D.setStroke(new BasicStroke(2f)); + pathTransform = new PathTransform(navigator, trans); + pathTransform.setAngle(20*Navigator.DEGREE); + ld = new LineDrawer(graphics2D,pathTransform); + ld.draw(0,1); + */ + + /* + for (int distanceI = 1; distanceI < coord.length; ++distanceI) { + boolean doLabel = distanceI == labelPosition; + double[][] distanceR = coord[distanceI]; + for (int angleI = 0; angleI < distanceR.length-1; ++angleI) { + double[] 
angleR00 = distanceR[angleI]; + double[] angleR01 = distanceR[angleI+1]; + graphics2D.setColor(Color.white); + if (doLabel) { + if (AverageWithColor) graphics2D.setColor(Color.gray); + else graphics2D.setColor(Color.black); + } + drawLine(graphics2D, trans, angleR00[0], angleR00[1], angleR01[0], angleR01[1]); + if (doLabel) { + drawDegrees(graphics2D, trans, fm, 180 * angleI / increment, angleR00[0], angleR00[1]); + } + } + + double[] angleR00 = distanceR[distanceR.length-1]; + double[] angleR01 = distanceR[0]; + graphics2D.setColor(Color.white); + if (doLabel) { + if (AverageWithColor) graphics2D.setColor(Color.gray); + else graphics2D.setColor(Color.black); + } + drawLine(graphics2D, trans, angleR00[0], angleR00[1], angleR01[0], angleR01[1]); + if (doLabel) { + drawDegrees(graphics2D, trans, fm, 180 * (distanceR.length-1) / increment, angleR00[0], angleR00[1]); + } + + } + */ + + } + // save thumbnail image to OUTFILE + griddedImage = thumbImage; + if (DEBUG_ICON) { + System.out.println("Changing Icon6"); + } + ImageIcon resultIcon = new ImageIcon(thumbImage); // recreate with buffered version + if (DEBUG_ICON) { + System.out.println("Changing Icon7"); + } + mainPicture.setIcon(resultIcon); + } + + static class LabelPosition { + class Chunk implements Comparable { + double x, y; + int xStart, yStart, width; + String s; + + public Chunk(double x2, double y2, String s2) { + x = x2; + y = y2; + s = s2; + width = 1; + xStart = (int) (x2 / tileWidth); + yStart = (int) (y2 / tileHeight); + if (s == null) { + return; + } + xStart += 2; + + Rectangle2D r = metrics.getStringBounds(s, graphics2D); + width = 1 + (int) ((r.getWidth() - 1.0) / tileWidth); + + if (xStart + width >= tileWidthCount) { + xStart = tileWidthCount - width; } } - // lines to antipode - for (int angleI = 0; angleI <= angleLimit; ++angleI) { - double angle = dInc * angleI; - double lat1 = 0, lon1 = 0; - if (angleI == 0) graphics2D.setColor(Color.black); - else graphics2D.setColor(Color.white); - for 
(int distanceI = grain; distanceI <= distLimit * grain; ++distanceI) { - double distance = dInc2 * distanceI; - //System.out.println("Distance: " + distance + "\tAngle: " + angle); - navigator.setDistanceCourse(distance, angle); - double lat2 = trans.srcH_lat.back(navigator.getLat2()); - double lon2 = trans.srcW_long.back(navigator.getLon2()); - //System.out.println("Distance: " + distance + "\tAngle: " + angle); - if (distanceI != grain) drawLine(graphics2D, trans, lon2, lat2, lon1, lat1); - lat1 = lat2; - lon1 = lon2; + public int compareTo(Object o) { + Chunk that = (Chunk) o; + if (x != that.x) { + return x < that.x ? -1 : 1; } - */ - - // lines to the antipode - double gap = 0.02; - PathTransform pathTransform = new PathTransform(navigator, trans); - LineDrawer ld = new LineDrawer(graphics2D,pathTransform); - for (int angleI = 0; angleI <= angleLimit; ++angleI) { - double angle = dInc * angleI; - if (angleI == 0 || angleI == halfAngle) { - graphics2D.setColor(meridian); - graphics2D.setStroke(thick); - } else if ((angleI % 3) == 0) { - graphics2D.setColor(everyOtherLine); - graphics2D.setStroke(thick); - } else { - graphics2D.setColor(Color.white); - graphics2D.setStroke(normal); - } - pathTransform.setAngle(angle); - ld.draw(gap,1-gap); - } - - AngleCircleTransform angleTransform = new AngleCircleTransform(navigator, trans); - ld = new LineDrawer(graphics2D,angleTransform); - for (int distanceI = 1; distanceI <= distLimit; ++distanceI) { - if (distanceI == labelPosition) { - graphics2D.setColor(meridian); - graphics2D.setStroke(thick); - } else { - graphics2D.setColor(Color.white); - graphics2D.setStroke(normal); - } - double distance = dInc * distanceI; - angleTransform.setDistance(distance); - ld.draw(0,1); - } - - // if (doLabels) { - //graphics2D.setClip(null); - //StandardCodes sc = StandardCodes.make(); - //LabelPosition lp = new LabelPosition(graphics2D, trans.dstW, trans.dstH); - //Map zones = sc.getZoneData(); - //Set zkeys = 
sc.getGoodAvailableCodes("tzid"); - // Date now = new Date(); - // for (Iterator it = zkeys.iterator(); it.hasNext();) { - // String fullkey = (String) it.next(); - // List data = (List) zones.get(fullkey); - // //String key = tzf.getFormattedZone(fullkey,"vvvv", now.getTime(), false); // key.substring(key.lastIndexOf('/')+1); - // retCoord.y = ((Double)data.get(0)).doubleValue(); - // retCoord.y = Navigator.toRadians(retCoord.y, 0, 0, false); - // retCoord.x = ((Double)data.get(1)).doubleValue(); - // retCoord.x = Navigator.toRadians(retCoord.x, 0, 0, true); - // drawPoint(graphics2D, trans, Color.white, Color.red, retCoord.x, retCoord.y, null); - // lp.add(trans, retCoord.x, retCoord.y, key); - // } - // lp.draw(); - // } - /* - graphics2D.setColor(Color.red); - graphics2D.setStroke(new BasicStroke(2f)); - pathTransform = new PathTransform(navigator, trans); - pathTransform.setAngle(20*Navigator.DEGREE); - ld = new LineDrawer(graphics2D,pathTransform); - ld.draw(0,1); - */ - - - /* - for (int distanceI = 1; distanceI < coord.length; ++distanceI) { - boolean doLabel = distanceI == labelPosition; - double[][] distanceR = coord[distanceI]; - for (int angleI = 0; angleI < distanceR.length-1; ++angleI) { - double[] angleR00 = distanceR[angleI]; - double[] angleR01 = distanceR[angleI+1]; - graphics2D.setColor(Color.white); - if (doLabel) { - if (AverageWithColor) graphics2D.setColor(Color.gray); - else graphics2D.setColor(Color.black); - } - drawLine(graphics2D, trans, angleR00[0], angleR00[1], angleR01[0], angleR01[1]); - if (doLabel) { - drawDegrees(graphics2D, trans, fm, 180 * angleI / increment, angleR00[0], angleR00[1]); + if (width != that.width) { + return width > that.width ? -1 : 1; // largest first + } + if (y != that.y) { + return y < that.y ? 
-1 : 1; + } + if (s == null) { + if (that.s == null) { + return 0; } + return -1; + } + if (that.s == null) { + return 1; } + return s.compareTo(that.s); + } - double[] angleR00 = distanceR[distanceR.length-1]; - double[] angleR01 = distanceR[0]; - graphics2D.setColor(Color.white); - if (doLabel) { - if (AverageWithColor) graphics2D.setColor(Color.gray); - else graphics2D.setColor(Color.black); + public boolean overlaps(Chunk that) { + if (yStart != that.yStart) { + return false; } - drawLine(graphics2D, trans, angleR00[0], angleR00[1], angleR01[0], angleR01[1]); - if (doLabel) { - drawDegrees(graphics2D, trans, fm, 180 * (distanceR.length-1) / increment, angleR00[0], angleR00[1]); + if (xStart > that.xStart + that.width) { + return false; } - + if (that.xStart > xStart + width) { + return false; + } + return true; } - */ + } - } - // save thumbnail image to OUTFILE - griddedImage = thumbImage; - if (DEBUG_ICON) { - System.out.println("Changing Icon6"); - } - ImageIcon resultIcon = new ImageIcon(thumbImage); // recreate with buffered version - if (DEBUG_ICON) { - System.out.println("Changing Icon7"); - } - mainPicture.setIcon(resultIcon); - - } - - static class LabelPosition { - class Chunk implements Comparable { - double x, y; - int xStart, yStart, width; - String s; - public Chunk(double x2, double y2, String s2) { - x = x2; - y = y2; - s = s2; - width = 1; - xStart = (int)(x2 / tileWidth); - yStart = (int)(y2 / tileHeight); - if (s == null) { - return; - } - xStart += 2; - - Rectangle2D r = metrics.getStringBounds(s, graphics2D); - width = 1 + (int)((r.getWidth() - 1.0) / tileWidth); - - if (xStart + width >= tileWidthCount) { - xStart = tileWidthCount - width; - } - } - public int compareTo(Object o) { - Chunk that = (Chunk)o; - if (x != that.x) { - return x < that.x ? -1 : 1; - } - if (width != that.width) { - return width > that.width ? -1 : 1; // largest first - } - if (y != that.y) { - return y < that.y ? 
-1 : 1; - } - if (s == null) { - if (that.s == null) { - return 0; - } - return -1; + Graphics2D graphics2D; + FontMetrics metrics; + Set[] lineContents; + double tileWidth, tileHeight; + int tileWidthCount, tileHeightCount; + double ascent; + Set initialContents = new TreeSet(); + + LabelPosition(Graphics2D graphics2D, double width, double height) { + this.graphics2D = graphics2D; + metrics = graphics2D.getFontMetrics(); + Rectangle2D r = metrics.getStringBounds("n", graphics2D); + ascent = metrics.getAscent(); + // tile the map into a grid + tileWidthCount = (int) (width / r.getWidth()); + tileWidth = width / tileWidthCount + 0.0000001; + tileHeightCount = (int) (height / r.getHeight()); + tileHeight = height / tileHeightCount + 0.0000001; + lineContents = new Set[tileHeightCount]; + for (int i = 0; i < lineContents.length; ++i) { + lineContents[i] = new TreeSet(); + } } - if (that.s == null) { - return 1; + + void add(Transform trans, double longitude, double latitude, String s) { + double xx = trans.srcW_long.back(longitude); + double yy = trans.srcH_lat.back(latitude); + trans.transform(xx, yy, drawLineP1); + Chunk c = new Chunk(drawLineP1.x, drawLineP1.y, s); + initialContents.add(c); + c = new Chunk(drawLineP1.x, drawLineP1.y, null); // point only + lineContents[c.yStart].add(c); } - return s.compareTo(that.s); - } - public boolean overlaps(Chunk that) { - if (yStart != that.yStart) { - return false; + + void fixContents() { + for (Iterator it2 = initialContents.iterator(); it2.hasNext(); ) { + Chunk c = (Chunk) it2.next(); + findFittingLine(c); + lineContents[c.yStart].add(c); + } } - if (xStart > that.xStart + that.width) { - return false; + /** + * @param c + * @return + */ + private void findFittingLine(Chunk c) { + int pos = c.yStart; + boolean positive = false; + boolean lastOutOfBounds = false; + main: + for (int ii = 0; ; ++ii, positive = !positive) { + pos += (positive ? 
ii : -ii); + if (pos < 0 || pos >= lineContents.length) { + if (lastOutOfBounds) { + c.yStart = 0; + return; + } + lastOutOfBounds = true; + continue; + } + lastOutOfBounds = false; + c.yStart = pos; // assume ok. + // go x, x+1, x-1, +2, -2, ... + for (Iterator it = lineContents[pos].iterator(); it.hasNext(); ) { + Chunk that = (Chunk) it.next(); + if (c.overlaps(that)) { + if (DEBUG) { + System.out.println( + pos + " pushing " + c.s + " (collision with " + that.s + ")"); + } + continue main; + } + } + return; // yStart now set right. + } } - if (that.xStart > xStart + width) { - return false; + + void draw() { + fixContents(); + graphics2D.setColor(Color.pink); + for (int i = 0; i < lineContents.length; ++i) { + for (Iterator it = lineContents[i].iterator(); it.hasNext(); ) { + Chunk c = (Chunk) it.next(); + if (c.s == null) { + continue; // point + } + double x2 = tileWidth * c.xStart; + double y2 = tileHeight * c.yStart; + Line2D.Double line2 = new Line2D.Double(c.x, c.y, x2, y2 + tileHeight / 2); + graphics2D.draw(line2); + graphics2D.drawString(c.s, (int) x2, (int) (y2 + ascent)); + } + } } - return true; - } - } - Graphics2D graphics2D; - FontMetrics metrics; - Set[] lineContents; - double tileWidth, tileHeight; - int tileWidthCount, tileHeightCount; - double ascent; - Set initialContents = new TreeSet(); - - LabelPosition(Graphics2D graphics2D, double width, double height) { - this.graphics2D = graphics2D; - metrics = graphics2D.getFontMetrics(); - Rectangle2D r = metrics.getStringBounds("n", graphics2D); - ascent = metrics.getAscent(); - // tile the map into a grid - tileWidthCount = (int)(width / r.getWidth()); - tileWidth = width / tileWidthCount + 0.0000001; - tileHeightCount = (int)(height / r.getHeight()); - tileHeight = height / tileHeightCount + 0.0000001; - lineContents = new Set[tileHeightCount]; - for (int i = 0; i < lineContents.length; ++i) { - lineContents[i] = new TreeSet(); - } } - void add(Transform trans, double longitude, double latitude, 
String s) { - double xx = trans.srcW_long.back(longitude); - double yy = trans.srcH_lat.back(latitude); - trans.transform(xx, yy, drawLineP1); - Chunk c = new Chunk(drawLineP1.x, drawLineP1.y, s); - initialContents.add(c); - c = new Chunk(drawLineP1.x, drawLineP1.y, null); // point only - lineContents[c.yStart].add(c); + /** + * @param graphics2D + * @param trans + * @param fill TODO + * @param line TODO + * @param longitude + * @param latitude + * @return + */ + private static void drawPoint( + Graphics2D graphics2D, + Transform trans, + Color fill, + Color line, + double longitude, + double latitude, + String label) { + double xx = trans.srcW_long.back(longitude); + double yy = trans.srcH_lat.back(latitude); + // convertLongitudeLatitudeToWidthHeight(longitude, latitude, trans.srcW, trans.srcH, + // drawLineP1); + // double xx = drawLineP1.x; + // double yy = drawLineP1.y; + // System.out.println(" xx: " + xx + ", yy: " + yy); + double radius = 1; + trans.transform(xx, yy, drawLineP1); + + Ellipse2D.Double ellipse = new Ellipse2D.Double(); + ellipse.x = drawLineP1.x - radius; + ellipse.y = drawLineP1.y - radius; + ellipse.height = ellipse.width = radius * 2; + graphics2D.setColor(fill); + graphics2D.fill(ellipse); + graphics2D.setColor(line); + graphics2D.draw(ellipse); + /* + if (label == null) return; + if (label != null) { + Line2D.Double line2 = new Line2D.Double(drawLineP1.x, drawLineP1.y, drawLineP1.x + 5, drawLineP1.y + 5); + graphics2D.draw(line2); + } + + if (label != null) graphics2D.drawString(label, (int)drawLineP1.x + 5, (int)drawLineP1.y + 5); + */ + } + + private static void drawPoint( + Graphics2D graphics2D, + Transform trans, + Color fill, + Color line, + double longitude, + double latitude) { + drawPoint(graphics2D, trans, fill, line, longitude, latitude, null); } - void fixContents() { - for (Iterator it2 = initialContents.iterator(); it2.hasNext();) { - Chunk c = (Chunk) it2.next(); - findFittingLine(c); - lineContents[c.yStart].add(c); - } - 
} /** - * @param c - * @return + * @param graphics2D + * @param trans + * @param fm + * @param retCoord + * @param increment + * @param angleI + * @param angleR00 */ - private void findFittingLine(Chunk c) { - int pos = c.yStart; - boolean positive = false; - boolean lastOutOfBounds = false; - main: - for (int ii = 0; ; ++ii, positive = !positive) { - pos += (positive ? ii : -ii); - if (pos < 0 || pos >= lineContents.length) { - if (lastOutOfBounds) { - c.yStart = 0; - return; - } - lastOutOfBounds = true; - continue; - } - lastOutOfBounds = false; - c.yStart = pos; // assume ok. - // go x, x+1, x-1, +2, -2, ... - for (Iterator it = lineContents[pos].iterator(); it.hasNext();) { - Chunk that = (Chunk)it.next(); - if (c.overlaps(that)) { - if (DEBUG) { - System.out.println(pos + " pushing " + c.s + " (collision with " + that.s + ")"); - } - continue main; - } - } - return; // yStart now set right. + private static void drawDegrees( + Graphics2D graphics2D, + Transform trans, + FontMetrics fm, + double degrees, + double x, + double y) { + String degreesStr = nf.format(degrees) + "°"; + Rectangle2D r = fm.getStringBounds(degreesStr, graphics2D); + trans.transform(x - r.getWidth() / 2, y, drawLineP1); + graphics2D.drawString(degreesStr, (int) drawLineP1.x, (int) drawLineP1.y); + } + + private static DPoint drawLineP1 = new DPoint(); + /* + + private static void drawLine(Graphics2D graphics2D, Transform trans, double x1, double y1, double x2, double y2) { + // check for cases where it crosses a boundary + double xDist = Math.abs(x1 - x2); + double yDist = Math.abs(y1 - y2); + if (xDist > trans.srcW/2) { + if (yDist > trans.srcH/2) { + // skip, don't care about opposite corners + System.out.println("Skipping opposite corners"); + } else { + if (x1 < x2) { + drawLine2(graphics2D, trans, x1, y1, x2 - trans.srcW, y2); + drawLine2(graphics2D, trans, x1 + trans.srcW, y1, x2, y2); + } else { + drawLine2(graphics2D, trans, x1, y1, x2 + trans.srcW, y2); + drawLine2(graphics2D, 
trans, x1 - trans.srcW, y1, x2, y2); + } + } + } else if (yDist > trans.srcH/2) { + if (y1 < y2) { + drawLine2(graphics2D, trans, x1, y1, x2, y2 - trans.srcH); + drawLine2(graphics2D, trans, x1, y1 + trans.srcH, x2, y2); + } else { + drawLine2(graphics2D, trans, x1, y1, x2, y2 + trans.srcH); + drawLine2(graphics2D, trans, x1, y1 - trans.srcH, x2, y2); + } + } else { + drawLine2(graphics2D, trans, x1, y1, x2, y2); + } + } + + private static void drawLine2(Graphics2D graphics2D, Transform trans, double x, double y, double x2, double y2) { + trans.transform(x, y, drawLineP1); + int ix = (int) Math.round(drawLineP1.x); + int iy = (int) Math.round(drawLineP1.y); + trans.transform(x2, y2, drawLineP1); + int ix2 = (int) Math.round(drawLineP1.x); + int iy2 = (int) Math.round(drawLineP1.y); + graphics2D.drawLine(ix, iy, ix2, iy2); + } + */ + + abstract static class TTransform { + double x, y; + // t is 0..1 + abstract void transform(double t); + } + + static class PathTransform extends TTransform { + private Navigator navigator; + private Transform trans; + private double angle; + + PathTransform(Navigator navigator, Transform trans) { + this.navigator = navigator; + this.trans = trans; + } + + void setAngle(double angle) { + this.angle = angle; + } + + transient DPoint temp = new DPoint(); + + void transform(double t) { + navigator.setDistanceCourse(t * Math.PI, angle); + y = trans.srcH_lat.back(navigator.getLat2()); + x = trans.srcW_long.back(navigator.getLon2()); + trans.transform(x, y, temp); + x = temp.x; + y = temp.y; } } - void draw() { - fixContents(); - graphics2D.setColor(Color.pink); - for (int i = 0; i < lineContents.length; ++i) { - for (Iterator it = lineContents[i].iterator(); it.hasNext();) { - Chunk c = (Chunk) it.next(); - if (c.s == null) { - continue; // point - } - double x2 = tileWidth * c.xStart; - double y2 = tileHeight * c.yStart; - Line2D.Double line2 = new Line2D.Double(c.x, c.y, x2, y2 + tileHeight / 2); - graphics2D.draw(line2); - 
graphics2D.drawString(c.s, (int)x2, (int)(y2+ascent)); - } - } + static class AngleCircleTransform extends TTransform { + private Navigator navigator; + private Transform trans; + private double distance; + + AngleCircleTransform(Navigator navigator, Transform trans) { + this.navigator = navigator; + this.trans = trans; + } + + void setDistance(double distance) { + this.distance = distance; + } + + transient DPoint temp = new DPoint(); + + void transform(double t) { + navigator.setDistanceCourse(distance, t * (2 * Math.PI)); + y = trans.srcH_lat.back(navigator.getLat2()); + x = trans.srcW_long.back(navigator.getLon2()); + trans.transform(x, y, temp); + x = temp.x; + y = temp.y; + } } - } - - /** - * @param graphics2D - * @param trans - * @param fill TODO - * @param line TODO - * @param longitude - * @param latitude - * @return - */ - private static void drawPoint(Graphics2D graphics2D, Transform trans, Color fill, Color line, - double longitude, double latitude, String label) { - double xx = trans.srcW_long.back(longitude); - double yy = trans.srcH_lat.back(latitude); - //convertLongitudeLatitudeToWidthHeight(longitude, latitude, trans.srcW, trans.srcH, drawLineP1); - //double xx = drawLineP1.x; - //double yy = drawLineP1.y; - //System.out.println(" xx: " + xx + ", yy: " + yy); - double radius = 1; - trans.transform(xx, yy, drawLineP1); - - Ellipse2D.Double ellipse = new Ellipse2D.Double(); - ellipse.x = drawLineP1.x - radius; - ellipse.y = drawLineP1.y - radius; - ellipse.height = ellipse.width = radius * 2; - graphics2D.setColor(fill); - graphics2D.fill(ellipse); - graphics2D.setColor(line); - graphics2D.draw(ellipse); - /* - if (label == null) return; - if (label != null) { - Line2D.Double line2 = new Line2D.Double(drawLineP1.x, drawLineP1.y, drawLineP1.x + 5, drawLineP1.y + 5); - graphics2D.draw(line2); + + static class LineDrawer { + double distanceSquaredLimit = 10 * 10; + Graphics2D graphics2D; + Line2D.Double line = new Line2D.Double(); + transient double 
startX, startY, startT; + // transient double endX, endY, endT; + TTransform ttransform; + // int segments = 0; + LineDrawer(Graphics2D graphics2D, TTransform ttransform) { + this.graphics2D = graphics2D; + this.ttransform = ttransform; + } + // t is 0..1 + void draw(double startT, double endT) { + this.startT = startT; + // this.endT = endT; + ttransform.transform(startT); + startX = ttransform.x; + startY = ttransform.y; + ttransform.transform(endT); + double endX = ttransform.x; + double endY = ttransform.y; + draw(3, 10, endT, endX, endY); + // System.out.println("segments: " + segments); } - if (label != null) graphics2D.drawString(label, (int)drawLineP1.x + 5, (int)drawLineP1.y + 5); - */ - } - - private static void drawPoint(Graphics2D graphics2D, Transform trans, Color fill, Color line, - double longitude, double latitude) { - drawPoint(graphics2D, trans, fill, line, - longitude, latitude, null); - } - - /** - * @param graphics2D - * @param trans - * @param fm - * @param retCoord - * @param increment - * @param angleI - * @param angleR00 - */ - private static void drawDegrees(Graphics2D graphics2D, Transform trans, FontMetrics fm, double degrees, double x, double y) { - String degreesStr = nf.format(degrees) + "°"; - Rectangle2D r = fm.getStringBounds(degreesStr, graphics2D); - trans.transform(x - r.getWidth()/2, y, drawLineP1); - graphics2D.drawString(degreesStr, (int)drawLineP1.x, (int)drawLineP1.y); - } - - private static DPoint drawLineP1 = new DPoint(); - /* - - private static void drawLine(Graphics2D graphics2D, Transform trans, double x1, double y1, double x2, double y2) { - // check for cases where it crosses a boundary - double xDist = Math.abs(x1 - x2); - double yDist = Math.abs(y1 - y2); - if (xDist > trans.srcW/2) { - if (yDist > trans.srcH/2) { - // skip, don't care about opposite corners - System.out.println("Skipping opposite corners"); + void draw(int minDepth, int maxDepth, double endT, double endX, double endY) { + // 
System.out.println(maxDepth + "\t" + startT + ", " + startX + ", " + startY + "\t" + + // endT + ", " + endX + ", " + endY); + // at the end of a draw, the startT is always moved up to the endT + boolean divide = false; + // if we've reached the limit, draw + if (minDepth > 0) { // if we are under the depth, divide and conquer + divide = true; } else { - if (x1 < x2) { - drawLine2(graphics2D, trans, x1, y1, x2 - trans.srcW, y2); - drawLine2(graphics2D, trans, x1 + trans.srcW, y1, x2, y2); - } else { - drawLine2(graphics2D, trans, x1, y1, x2 + trans.srcW, y2); - drawLine2(graphics2D, trans, x1 - trans.srcW, y1, x2, y2); + // if the distance is large, and still not too deep, divide and conquer + double dx = endX - startX; + double dy = endY - startY; + // System.out.println("dist: " + Math.sqrt(dx*dx + dy*dy)); + if ((dx * dx + dy * dy) > distanceSquaredLimit) { + if (maxDepth <= 0) { + return; // skip if too long + } + divide = true; } } - } else if (yDist > trans.srcH/2) { - if (y1 < y2) { - drawLine2(graphics2D, trans, x1, y1, x2, y2 - trans.srcH); - drawLine2(graphics2D, trans, x1, y1 + trans.srcH, x2, y2); + if (divide) { + double midT = (startT + endT) / 2; + ttransform.transform((startT + endT) / 2); + double midX = ttransform.x; // keep, since ttransform gets overridden + double midY = ttransform.y; + draw(minDepth - 1, maxDepth - 1, midT, midX, midY); + draw(minDepth - 1, maxDepth - 1, endT, endX, endY); } else { - drawLine2(graphics2D, trans, x1, y1, x2, y2 + trans.srcH); - drawLine2(graphics2D, trans, x1, y1 - trans.srcH, x2, y2); + // System.out.println("Drawing"); + // segments++; + line.x1 = startX; + line.y1 = startY; + line.x2 = endX; + line.y2 = endY; + graphics2D.draw(line); + // graphics2D.drawLine((int) Math.round(startX), (int) Math.round(startY), + // (int) Math.round(endX), (int) Math.round(endY)); } - } else { - drawLine2(graphics2D, trans, x1, y1, x2, y2); + startT = endT; + startX = endX; + startY = endY; } } - private static void 
drawLine2(Graphics2D graphics2D, Transform trans, double x, double y, double x2, double y2) { - trans.transform(x, y, drawLineP1); - int ix = (int) Math.round(drawLineP1.x); - int iy = (int) Math.round(drawLineP1.y); - trans.transform(x2, y2, drawLineP1); - int ix2 = (int) Math.round(drawLineP1.x); - int iy2 = (int) Math.round(drawLineP1.y); - graphics2D.drawLine(ix, iy, ix2, iy2); - } - */ - - abstract static class TTransform { - double x, y; - // t is 0..1 - abstract void transform(double t); - } - - static class PathTransform extends TTransform { - private Navigator navigator; - private Transform trans; - private double angle; - PathTransform (Navigator navigator, Transform trans) { - this.navigator = navigator; - this.trans = trans; - } - void setAngle (double angle) { - this.angle = angle; - } - transient DPoint temp = new DPoint(); - void transform(double t) { - navigator.setDistanceCourse(t * Math.PI, angle); - y = trans.srcH_lat.back(navigator.getLat2()); - x = trans.srcW_long.back(navigator.getLon2()); - trans.transform(x, y, temp); - x = temp.x; - y = temp.y; - } - } - - static class AngleCircleTransform extends TTransform { - private Navigator navigator; - private Transform trans; - private double distance; - AngleCircleTransform (Navigator navigator, Transform trans) { - this.navigator = navigator; - this.trans = trans; - } - void setDistance (double distance) { - this.distance = distance; - } - transient DPoint temp = new DPoint(); - void transform(double t) { - navigator.setDistanceCourse(distance, t * (2*Math.PI)); - y = trans.srcH_lat.back(navigator.getLat2()); - x = trans.srcW_long.back(navigator.getLon2()); - trans.transform(x, y, temp); - x = temp.x; - y = temp.y; - } - } - - static class LineDrawer { - double distanceSquaredLimit = 10*10; - Graphics2D graphics2D; - Line2D.Double line = new Line2D.Double(); - transient double startX, startY, startT; - //transient double endX, endY, endT; - TTransform ttransform; - //int segments = 0; - 
LineDrawer(Graphics2D graphics2D, TTransform ttransform) { - this.graphics2D = graphics2D; - this.ttransform = ttransform; - } - // t is 0..1 - void draw(double startT, double endT) { - this.startT = startT; - //this.endT = endT; - ttransform.transform(startT); - startX = ttransform.x; - startY = ttransform.y; - ttransform.transform(endT); - double endX = ttransform.x; - double endY = ttransform.y; - draw(3, 10, endT, endX, endY); - //System.out.println("segments: " + segments); - } - void draw(int minDepth, int maxDepth, double endT, double endX, double endY) { - //System.out.println(maxDepth + "\t" + startT + ", " + startX + ", " + startY + "\t" + endT + ", " + endX + ", " + endY); - // at the end of a draw, the startT is always moved up to the endT - boolean divide = false; - // if we've reached the limit, draw - if (minDepth > 0) { // if we are under the depth, divide and conquer - divide = true; - } else { - // if the distance is large, and still not too deep, divide and conquer - double dx = endX - startX; - double dy = endY - startY; - //System.out.println("dist: " + Math.sqrt(dx*dx + dy*dy)); - if ((dx*dx + dy*dy) > distanceSquaredLimit) { - if (maxDepth <= 0) { - return; // skip if too long - } - divide = true; - } - } - if (divide) { - double midT = (startT + endT)/2; - ttransform.transform((startT + endT)/2); - double midX = ttransform.x; // keep, since ttransform gets overridden - double midY = ttransform.y; - draw(minDepth - 1, maxDepth - 1, midT, midX, midY); - draw(minDepth - 1, maxDepth - 1, endT, endX, endY); - } else { - //System.out.println("Drawing"); - //segments++; - line.x1 = startX; line.y1 = startY; line.x2 = endX; line.y2 = endY; - graphics2D.draw(line); - //graphics2D.drawLine((int) Math.round(startX), (int) Math.round(startY), - // (int) Math.round(endX), (int) Math.round(endY)); - } - startT = endT; - startX = endX; - startY = endY; - } - } - - public static void writeImage(BufferedImage image, String filename, float quality) { - try { 
- BufferedOutputStream out = new BufferedOutputStream( - new FileOutputStream(filename)); -// JPEGImageEncoder encoder = JPEGCodec.createJPEGEncoder(out); -// JPEGEncodeParam param = encoder -// .getDefaultJPEGEncodeParam(image); -// quality = Math.max(0, Math.min(quality, 100)); -// param.setQuality(quality / 100.0f, false); -// encoder.setJPEGEncodeParam(param); -// encoder.encode(image); -// out.close(); - ImageIO.write(image, "jpg", new File(filename)); - System.out.println("Saving on: " + new File(filename).getCanonicalPath()); - } catch (Exception e) { - e.printStackTrace(); - throw new RuntimeException("Failed write of image"); + public static void writeImage(BufferedImage image, String filename, float quality) { + try { + BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(filename)); + // JPEGImageEncoder encoder = JPEGCodec.createJPEGEncoder(out); + // JPEGEncodeParam param = encoder + // .getDefaultJPEGEncodeParam(image); + // quality = Math.max(0, Math.min(quality, 100)); + // param.setQuality(quality / 100.0f, false); + // encoder.setJPEGEncodeParam(param); + // encoder.encode(image); + // out.close(); + ImageIO.write(image, "jpg", new File(filename)); + System.out.println("Saving on: " + new File(filename).getCanonicalPath()); + } catch (Exception e) { + e.printStackTrace(); + throw new RuntimeException("Failed write of image"); + } } - } - - public static class Thumbnail { - public static void main(String[] args) throws Exception { - if (args.length != 5) { - System.err.println("Usage: java Thumbnail INFILE " + - "OUTFILE WIDTH HEIGHT QUALITY"); - System.exit(1); - } - // load image from INFILE - Image image = Toolkit.getDefaultToolkit().getImage(args[0]); - MediaTracker mediaTracker = new MediaTracker(new Container()); - mediaTracker.addImage(image, 0); - mediaTracker.waitForID(0); - // determine thumbnail size from WIDTH and HEIGHT - int thumbWidth = Integer.parseInt(args[2]); - int thumbHeight = Integer.parseInt(args[3]); - 
double thumbRatio = (double)thumbWidth / (double)thumbHeight; - int imageWidth = image.getWidth(null); - int imageHeight = image.getHeight(null); - double imageRatio = (double)imageWidth / (double)imageHeight; - if (thumbRatio < imageRatio) { - thumbHeight = (int)(thumbWidth / imageRatio); - } else { - thumbWidth = (int)(thumbHeight * imageRatio); - } - // draw original image to thumbnail image object and - // scale it to the new size on-the-fly - BufferedImage thumbImage = new BufferedImage(thumbWidth, - thumbHeight, BufferedImage.TYPE_INT_RGB); - Graphics2D graphics2D = thumbImage.createGraphics(); - graphics2D.setRenderingHint(RenderingHints.KEY_INTERPOLATION, - RenderingHints.VALUE_INTERPOLATION_BILINEAR); - graphics2D.drawImage(image, 0, 0, thumbWidth, thumbHeight, null); - // save thumbnail image to OUTFILE -// BufferedOutputStream out = new BufferedOutputStream(new -// FileOutputStream(args[1])); -// JPEGImageEncoder encoder = JPEGCodec.createJPEGEncoder(out); -// JPEGEncodeParam param = encoder. -// getDefaultJPEGEncodeParam(thumbImage); -// int quality = Integer.parseInt(args[4]); -// quality = Math.max(0, Math.min(quality, 100)); -// param.setQuality((float)quality / 100.0f, false); -// encoder.setJPEGEncodeParam(param); -// encoder.encode(thumbImage); -// out.close(); - File out = new File(args[1]); - ImageIO.write(thumbImage, "jpg", out); - // docs say imageio can handle quality settings. 
- System.out.println("Done."); - System.exit(0); + + public static class Thumbnail { + public static void main(String[] args) throws Exception { + if (args.length != 5) { + System.err.println( + "Usage: java Thumbnail INFILE " + "OUTFILE WIDTH HEIGHT QUALITY"); + System.exit(1); + } + // load image from INFILE + Image image = Toolkit.getDefaultToolkit().getImage(args[0]); + MediaTracker mediaTracker = new MediaTracker(new Container()); + mediaTracker.addImage(image, 0); + mediaTracker.waitForID(0); + // determine thumbnail size from WIDTH and HEIGHT + int thumbWidth = Integer.parseInt(args[2]); + int thumbHeight = Integer.parseInt(args[3]); + double thumbRatio = (double) thumbWidth / (double) thumbHeight; + int imageWidth = image.getWidth(null); + int imageHeight = image.getHeight(null); + double imageRatio = (double) imageWidth / (double) imageHeight; + if (thumbRatio < imageRatio) { + thumbHeight = (int) (thumbWidth / imageRatio); + } else { + thumbWidth = (int) (thumbHeight * imageRatio); + } + // draw original image to thumbnail image object and + // scale it to the new size on-the-fly + BufferedImage thumbImage = + new BufferedImage(thumbWidth, thumbHeight, BufferedImage.TYPE_INT_RGB); + Graphics2D graphics2D = thumbImage.createGraphics(); + graphics2D.setRenderingHint( + RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BILINEAR); + graphics2D.drawImage(image, 0, 0, thumbWidth, thumbHeight, null); + // save thumbnail image to OUTFILE + // BufferedOutputStream out = new BufferedOutputStream(new + // FileOutputStream(args[1])); + // JPEGImageEncoder encoder = JPEGCodec.createJPEGEncoder(out); + // JPEGEncodeParam param = encoder. 
+ // getDefaultJPEGEncodeParam(thumbImage); + // int quality = Integer.parseInt(args[4]); + // quality = Math.max(0, Math.min(quality, 100)); + // param.setQuality((float)quality / 100.0f, false); + // encoder.setJPEGEncodeParam(param); + // encoder.encode(thumbImage); + // out.close(); + File out = new File(args[1]); + ImageIO.write(thumbImage, "jpg", out); + // docs say imageio can handle quality settings. + System.out.println("Done."); + System.exit(0); + } } - } } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/LanguageCode.java b/UnicodeJsps/src/main/java/org/unicode/jsp/LanguageCode.java index e28186d5d..502ed795d 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/LanguageCode.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/LanguageCode.java @@ -1,5 +1,8 @@ package org.unicode.jsp; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.Collator; +import com.ibm.icu.util.ULocale; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; @@ -11,25 +14,26 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.Collator; -import com.ibm.icu.util.ULocale; - public class LanguageCode { - static public final Pattern languageID = Pattern.compile( - " (?: ( [a-z A-Z]{2,8} | [a-z A-Z]{2,3} [-_] [a-z A-Z]{3} )" - + " (?: [-_] ( [a-z A-Z]{4} ) )? " - + " (?: [-_] ( [a-z A-Z]{2} | [0-9]{3} ) )?" - + " (?: [-_] ( (?: [0-9 a-z A-Z]{5,8} | [0-9] [0-9 a-z A-Z]{3} ) (?: [-_] (?: [0-9 a-z A-Z]{5,8} | [0-9] [0-9 a-z A-Z]{3} ) )* ) )?" - + " (?: [-_] ( [a-w y-z A-W Y-Z] (?: [-_] [0-9 a-z A-Z]{2,8} )+ (?: [-_] [a-w y-z A-W Y-Z] (?: [-_] [0-9 a-z A-Z]{2,8} )+ )* ) )?" - + " (?: [-_] ( [xX] (?: [-_] [0-9 a-z A-Z]{1,8} )+ ) )? ) " - + " | ( [xX] (?: [-_] [0-9 a-z A-Z]{1,8} )+ )", + public static final Pattern languageID = + Pattern.compile( + " (?: ( [a-z A-Z]{2,8} | [a-z A-Z]{2,3} [-_] [a-z A-Z]{3} )" + + " (?: [-_] ( [a-z A-Z]{4} ) )? 
" + + " (?: [-_] ( [a-z A-Z]{2} | [0-9]{3} ) )?" + + " (?: [-_] ( (?: [0-9 a-z A-Z]{5,8} | [0-9] [0-9 a-z A-Z]{3} ) (?: [-_] (?: [0-9 a-z A-Z]{5,8} | [0-9] [0-9 a-z A-Z]{3} ) )* ) )?" + + " (?: [-_] ( [a-w y-z A-W Y-Z] (?: [-_] [0-9 a-z A-Z]{2,8} )+ (?: [-_] [a-w y-z A-W Y-Z] (?: [-_] [0-9 a-z A-Z]{2,8} )+ )* ) )?" + + " (?: [-_] ( [xX] (?: [-_] [0-9 a-z A-Z]{1,8} )+ ) )? ) " + + " | ( [xX] (?: [-_] [0-9 a-z A-Z]{1,8} )+ )", Pattern.COMMENTS); - static final Pattern extensionID = Pattern.compile("[a-w y-z A-W Y-Z]([-_][0-9 a-z A-Z]{2,8})*"); - static final Collection QUALITY_EXCLUSIONS = new HashSet(Arrays.asList("ti fo so kok ps cy sw ur pa pa_Guru uz_Latn ii haw az_Cyrl bo as zu ha ha_Latn uz_Arab om pa_Arab kw kl kk kk_Cyrl gv si uz uz_Cyrl" - .split("\\s+"))); + static final Pattern extensionID = + Pattern.compile("[a-w y-z A-W Y-Z]([-_][0-9 a-z A-Z]{2,8})*"); + static final Collection QUALITY_EXCLUSIONS = + new HashSet( + Arrays.asList( + "ti fo so kok ps cy sw ur pa pa_Guru uz_Latn ii haw az_Cyrl bo as zu ha ha_Latn uz_Arab om pa_Arab kw kl kk kk_Cyrl gv si uz uz_Cyrl" + .split("\\s+"))); enum Subtag { language, @@ -39,25 +43,31 @@ enum Subtag { extensions, privateUse, privateUse2; + String get(Matcher m) { - return m.group(ordinal()+1); + return m.group(ordinal() + 1); } } static class MyHandler extends FileUtilities.SemiFileReader { - TreeMap map = new TreeMap(); + TreeMap map = new TreeMap(); + protected boolean isCodePoint() { return false; } + public boolean handleLine(int start, int end, String[] items) { map.put(items[0], items[1]); return true; } } - static final Map names = ((MyHandler) new MyHandler().process(LanguageCode.class, "subtagNames.txt")).map; - static final Map toAlpha3 = ((MyHandler) new MyHandler().process(LanguageCode.class, "alpha2_3.txt")).map; - static final Map fixCodes = ((MyHandler)new MyHandler().process(LanguageCode.class, "fixCodes.txt")).map; + static final Map names = + ((MyHandler) new 
MyHandler().process(LanguageCode.class, "subtagNames.txt")).map; + static final Map toAlpha3 = + ((MyHandler) new MyHandler().process(LanguageCode.class, "alpha2_3.txt")).map; + static final Map fixCodes = + ((MyHandler) new MyHandler().process(LanguageCode.class, "fixCodes.txt")).map; public static String validate(String input, ULocale ulocale) { StringBuilder builder = new StringBuilder(); @@ -79,25 +89,29 @@ private static void validate(String input, ULocale ulocale, StringBuilder builde if (!m.matches()) { int i = input.length(); for (; ; --i) { - final String fragment = input.substring(0,i); + final String fragment = input.substring(0, i); m.reset(fragment).matches(); - if(i == 0 || m.hitEnd()) { - int posBefore = input.lastIndexOf('-', i-1) + 1; + if (i == 0 || m.hitEnd()) { + int posBefore = input.lastIndexOf('-', i - 1) + 1; int posAfter = input.indexOf('-', i); if (posAfter < 0) { posAfter = input.length(); } - prefix = "

Ill-Formed Language Identifier: " + input.substring(0, posBefore) - + "" + input.substring(posBefore, i) - + "×" - + input.substring(i, posAfter) - + "" + input.substring(posAfter, input.length()) - + "
Couldn't parse past the point marked with ×.

\n"; + prefix = + "

Ill-Formed Language Identifier: " + + input.substring(0, posBefore) + + "" + + input.substring(posBefore, i) + + "×" + + input.substring(i, posAfter) + + "" + + input.substring(posAfter, input.length()) + + "
Couldn't parse past the point marked with ×.

\n"; if (posBefore <= 0) { builder.append(prefix); return; } - input = input.substring(0, posBefore-1); + input = input.substring(0, posBefore - 1); m.reset(input); if (!m.matches()) { builder.append(prefix); @@ -108,7 +122,8 @@ private static void validate(String input, ULocale ulocale, StringBuilder builde } } int start = builder.length(); - builder.append("\n").append(getLine("th", "Type", "2.1", "Code", "Name", "Replacement")); + builder.append("
\n") + .append(getLine("th", "Type", "2.1", "Code", "Name", "Replacement")); String languageCode = Subtag.language.get(m); if (languageCode != null) { @@ -130,7 +145,8 @@ private static void validate(String input, ULocale ulocale, StringBuilder builde } fixed = fixCodes.get(languageCode); } else { // must be 2 - // cases are the following. For the replacement, we use fix(extlang) if valid, otherwise fix(lang) if valid, otherwise fix(extlang) + // cases are the following. For the replacement, we use fix(extlang) if valid, + // otherwise fix(lang) if valid, otherwise fix(extlang) // zh-cmn - valid => cmn // en-cmn - valid => cmn // but shouldn't be; by canonicalization en-cmn = cmn // eng-cmn - invalid => cmn @@ -148,13 +164,15 @@ private static void validate(String input, ULocale ulocale, StringBuilder builde if (extLangName == null) { languageName = "invalid base and extlang codes"; } else { - languageName = "invalid base and extlang code - extlang would be valid base-lang code"; + languageName = + "invalid base and extlang code - extlang would be valid base-lang code"; } } else if (invalidExtlang) { if (extLangName == null) { languageName = "invalid extlang code"; } else { - languageName = "invalid extlang code - would be valid base-lang code"; + languageName = + "invalid extlang code - would be valid base-lang code"; } } else if (invalidLanguageCode) { languageName = "invalid base-lang code"; @@ -164,13 +182,20 @@ private static void validate(String input, ULocale ulocale, StringBuilder builde if (languageName.startsWith("@")) { languageName = languageName.substring(1); } - //languageAndLink = getLanguageAndLink(extlang); + // languageAndLink = getLanguageAndLink(extlang); languageCode = extlang; } fixed = fixCodes.get(languageCode); languageAndLink = originalCode; } - builder.append(getLine("td", "Language", "2.2.1", languageAndLink, languageName, getCodeAndLink(Subtag.language, fixed, ulocale))); + builder.append( + getLine( + "td", + "Language", + "2.2.1", + 
languageAndLink, + languageName, + getCodeAndLink(Subtag.language, fixed, ulocale))); addFixed(canonical, languageCode, fixed); } @@ -185,7 +210,14 @@ private static void validate(String input, ULocale ulocale, StringBuilder builde script = getCodeAndLink(Subtag.script, script, ulocale); } final String fixed = fixCodes.get(scriptCode); - builder.append(getLine("td", "Script", "2.2.3", script, scriptName, getCodeAndLink(Subtag.script, fixed, ulocale))); + builder.append( + getLine( + "td", + "Script", + "2.2.3", + script, + scriptName, + getCodeAndLink(Subtag.script, fixed, ulocale))); addFixed(canonical, scriptCode, fixed); } @@ -200,7 +232,14 @@ private static void validate(String input, ULocale ulocale, StringBuilder builde region = getCodeAndLink(Subtag.region, region, ulocale); } final String fixed = fixCodes.get(regionCode); - builder.append(getLine("td", "Region", "2.2.4", region, regionName, getCodeAndLink(Subtag.region, fixed, ulocale))); + builder.append( + getLine( + "td", + "Region", + "2.2.4", + region, + regionName, + getCodeAndLink(Subtag.region, fixed, ulocale))); addFixed(canonical, regionCode, fixed); } @@ -215,7 +254,10 @@ private static void validate(String input, ULocale ulocale, StringBuilder builde variantName = "invalid Code"; } else { variantName = getSubtagName(variant, ulocale, true); - variant = "" + variant + ""; + variant = + "" + + variant + + ""; } final String fixed = fixCodes.get(variantCode); builder.append(getLine("td", "Variant", "2.2.5", variant, variantName, fixed)); @@ -251,13 +293,24 @@ private static void validate(String input, ULocale ulocale, StringBuilder builde String canonicalString = canonical.toString(); String insert = ""; if (!canonicalString.equals(oldInput)) { - insert = "

Canonical Form: " + canonical + "

\n"; + insert = + "

Canonical Form: " + + canonical + + "

\n"; } ULocale minimized = ULocale.minimizeSubtags(new ULocale(canonicalString)); if (minimized != null) { String minimizedCode = minimized.toLanguageTag(); if (!minimizedCode.equals(canonicalString)) { - insert += "

Minimal Form: " + minimizedCode + "

\n";; + insert += + "

Minimal Form: " + + minimizedCode + + "

\n"; + ; } } if (insert != null) { @@ -306,30 +359,56 @@ private static String getCodeAndLink2(Subtag subtag, String code, ULocale ulocal name = ""; } switch (subtag) { - case region: { - if (code.compareTo("A") < 0) { - code = "" + code + ""; - } else { - code = "" + code + ""; - } - return code; - } - case script: { - code = "" + code + ""; - return code; - } - case language: { - String alpha3 = code; - if (code.length() == 2) { - alpha3 = toAlpha3.get(code); - if (alpha3 == null) { - alpha3 = code; + case region: + { + if (code.compareTo("A") < 0) { + code = + "" + + code + + ""; + } else { + code = + "" + + code + + ""; + } + return code; } - } - code = "" + code + ""; - return code; - } - default: throw new IllegalArgumentException(); + case script: + { + code = + "" + + code + + ""; + return code; + } + case language: + { + String alpha3 = code; + if (code.length() == 2) { + alpha3 = toAlpha3.get(code); + if (alpha3 == null) { + alpha3 = code; + } + } + code = + "" + + code + + ""; + return code; + } + default: + throw new IllegalArgumentException(); } } @@ -364,26 +443,33 @@ private static String getSubtagName(String code, ULocale ulocale, boolean html) private static String getIcuName(String code, ULocale ulocale) { String icuName = code; - switch(code.length()) { - case 2: - case 3: - icuName = code.compareTo("a") < 0 - ? ULocale.getDisplayCountry("und-" + code, ulocale) - : ULocale.getDisplayLanguage(code, ulocale); - break; - case 4: - if (code.compareTo("A") >= 0) { - icuName = ULocale.getDisplayScript("und-" + code, ulocale); + switch (code.length()) { + case 2: + case 3: + icuName = + code.compareTo("a") < 0 + ? ULocale.getDisplayCountry("und-" + code, ulocale) + : ULocale.getDisplayLanguage(code, ulocale); + break; + case 4: + if (code.compareTo("A") >= 0) { + icuName = ULocale.getDisplayScript("und-" + code, ulocale); + break; + } // otherwise fall through! 
+ default: + icuName = ULocale.getDisplayVariant("und-Latn-AQ-" + code, ulocale).toLowerCase(); break; - } // otherwise fall through! - default: - icuName = ULocale.getDisplayVariant("und-Latn-AQ-" + code, ulocale).toLowerCase(); - break; } return icuName; } - private static String getLine(String element, String type, String specSection, String subtag, String name, String replacement) { + private static String getLine( + String element, + String type, + String specSection, + String subtag, + String name, + String replacement) { if (name == null) { name = "invalid"; } @@ -392,9 +478,35 @@ private static String getLine(String element, String type, String specSection, S } else { replacement = ""; } - final String typeAndLink = specSection == null ? type : "" + type + ""; - return "<" + element + ">" + typeAndLink + "<" + element + ">" + subtag + "<" + element + ">" + name + "" + replacement + "\n"; + final String typeAndLink = + specSection == null + ? type + : "" + + type + + ""; + return "<" + + element + + ">" + + typeAndLink + + "<" + + element + + ">" + + subtag + + "<" + + element + + ">" + + name + + "" + + replacement + + "\n"; } public static String getLanguageOptions(ULocale toLocalizeInto) { @@ -403,7 +515,8 @@ public static String getLanguageOptions(ULocale toLocalizeInto) { toLocalizeInto = ULocale.ENGLISH; } ULocale[] list = ULocale.getAvailableLocales(); - Map sorted = new TreeMap(Collator.getInstance(toLocalizeInto)); + Map sorted = + new TreeMap(Collator.getInstance(toLocalizeInto)); for (ULocale ulocale : list) { String country = ulocale.getCountry(); if (country.length() != 0) { @@ -423,17 +536,17 @@ public static String getLanguageOptions(ULocale toLocalizeInto) { } return result.toString(); /* - - - - - - - - - - - */ + + + + + + + + + + + */ } public static String getLocaleName(ULocale toBeLocalized, ULocale toLocalizeInto) { diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/NFM.java b/UnicodeJsps/src/main/java/org/unicode/jsp/NFM.java index 
c7d1fb49f..b444de569 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/NFM.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/NFM.java @@ -1,9 +1,8 @@ package org.unicode.jsp; -import java.util.regex.Pattern; - import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.impl.Utility; +import java.util.regex.Pattern; public class NFM { public static final UnicodeMap nfm = new UnicodeMap(); @@ -12,21 +11,23 @@ public class NFM { new MySemiFileReader().process(NFM.class, "nfm.txt"); nfm.freeze(); } + static final class MySemiFileReader extends FileUtilities.SemiFileReader { Pattern spaces = Pattern.compile("\\s+"); + @Override protected boolean handleLine(int start, int end, String[] items) { String results; switch (items.length) { - default: - throw new IllegalArgumentException(); - case 2: - results = Utility.fromHex(items[1], 1, spaces); - nfm.putAll(start, end, results); - break; - case 1: - nfm.putAll(start, end, ""); - break; + default: + throw new IllegalArgumentException(); + case 2: + results = Utility.fromHex(items[1], 1, spaces); + nfm.putAll(start, end, results); + break; + case 1: + nfm.putAll(start, end, ""); + break; } return true; } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/Navigator.java b/UnicodeJsps/src/main/java/org/unicode/jsp/Navigator.java index 24fe8f4d7..1e65d41ab 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/Navigator.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/Navigator.java @@ -3,208 +3,239 @@ import java.text.DecimalFormat; import java.text.NumberFormat; -final public class Navigator { - private double lat1, lon1; - private transient double cosLat1, sinLat1; +public final class Navigator { + private double lat1, lon1; + private transient double cosLat1, sinLat1; + private double lat2, lon2; + private double distance, course; + private transient double sinDistance, cosDistance; + + public static double DEGREE = Math.PI / 180.0; + + private static double EPS = 0.000001; // EPS a small number ~ machine 
precision + + public static void main(String[] args) { + double latitude1 = 0; // toRadians(37.0, 28.0, 48.0, false); // N = + + double longitude1 = 0; // toRadians(180-122.0, 8.0, 39.0, true); // W = - + Tester tester = new Tester(latitude1, longitude1); + + double[][] tests = { + {90 * DEGREE, -180 * DEGREE}, + {-50 * DEGREE, -180 * DEGREE}, + {25 * DEGREE, 10 * DEGREE}, + {latitude1, longitude1}, + }; + for (int i = 0; i < tests.length; ++i) { + tester.testItem(0, tests[i][0], tests[i][1]); + } + // exhaustive + tester.test(0, 0, -90, 90, -180, 180, 1); + } - private double lat2, lon2; - private double distance, course; - private transient double sinDistance, cosDistance; + private static class Tester { + Navigator a = new Navigator(); + Navigator b = new Navigator(); + int counter = 0, failures = 0; - public static double DEGREE = Math.PI/180.0; + Tester(double latitude1, double longitude1) { + a.setLat1Lon1(latitude1, longitude1); + b.setLat1Lon1(latitude1, longitude1); + System.out.println( + "\tLatitude1 " + degrees(a.getLat1()) + "\tLongitude1 " + degrees(a.getLon1())); + } - private static double EPS = 0.000001; // EPS a small number ~ machine precision + private void test( + double lat0, + double lon0, + double latMin, + double latMax, + double lonMin, + double lonMax, + double inc) { + for (double dLat = latMin; dLat <= latMax; dLat += inc) { + for (double dLon = lonMin; dLon <= lonMax; dLon += inc) { + counter++; + double lat2a = lat0 + dLat * DEGREE; + double lon2a = lon0 + dLon * DEGREE; + if (!testItem(counter, lat2a, lon2a)) { + failures++; + } + } + } + System.out.println("Count: " + counter + "\tFailures: " + failures); + counter = failures = 0; + } - public static void main(String[] args) { - double latitude1 = 0; // toRadians(37.0, 28.0, 48.0, false); // N = + - double longitude1 = 0; // toRadians(180-122.0, 8.0, 39.0, true); // W = - - Tester tester = new Tester(latitude1, longitude1); + boolean testItem(int counter, double lat2a, double lon2a) 
{ + a.setLat2Lon2(lat2a, lon2a); + lat2a = a.getLat2(); + lon2a = a.getLon2(); + double distance2 = a.getDistance(); + double course2 = a.getCourse(); + b.setDistanceCourse(distance2, course2); + double lat2b = b.getLat2(); + double lon2b = b.getLon2(); + boolean success = areClose(lat2b, lat2a) && areClose(lon2a, lon2b); + if (success && (counter % 1023) != 1) { + return true; + } + System.out.println(); + System.out.println( + counter + + "\tLat " + + degrees(lat2a) + + "\tLong " + + degrees(lon2a) + + "\tDistance " + + degrees(distance2) + + "\tCourse " + + degrees(course2)); + System.out.println("\t\tLat2 " + degrees(lat2b) + "\tLong2 " + degrees(lon2b)); + return success; + } + } - double[][] tests = { - {90*DEGREE, -180*DEGREE}, - {-50*DEGREE, -180*DEGREE}, - {25*DEGREE, 10*DEGREE}, - {latitude1, longitude1}, - }; - for (int i = 0; i < tests.length; ++i) { - tester.testItem(0, tests[i][0], tests[i][1]); + private static boolean areClose(double a, double b) { + a -= b; + return (-EPS < a && a < EPS); } - // exhaustive - tester.test(0, 0, -90, 90, -180, 180, 1); - } - private static class Tester { - Navigator a = new Navigator(); - Navigator b = new Navigator(); - int counter = 0, failures = 0; - - Tester(double latitude1, double longitude1) { - a.setLat1Lon1(latitude1, longitude1); - b.setLat1Lon1(latitude1, longitude1); - System.out.println("\tLatitude1 " + degrees(a.getLat1()) + "\tLongitude1 " + degrees(a.getLon1())); + + private static final NumberFormat nf = new DecimalFormat("+000.000;-000.000"); + + public static String degrees(double in) { + return nf.format(in / DEGREE) + '°'; } - private void test(double lat0, double lon0, double latMin, double latMax, double lonMin, double lonMax, double inc) { - for (double dLat = latMin; dLat <= latMax; dLat += inc) { - for (double dLon = lonMin; dLon <= lonMax; dLon += inc) { - counter++; - double lat2a = lat0 + dLat*DEGREE; - double lon2a = lon0 + dLon*DEGREE; - if (!testItem(counter, lat2a, lon2a)) { - 
failures++; - } + public static double toRadians( + double degrees, double minutes, double seconds, boolean northOrWest) { + double result = (degrees + minutes / 60 + seconds / 3600); + if (!northOrWest) { + result = -result; } - } - System.out.println("Count: " + counter + "\tFailures: " + failures); - counter = failures = 0; + return result * DEGREE; } - boolean testItem(int counter, double lat2a, double lon2a) { - a.setLat2Lon2(lat2a, lon2a); - lat2a = a.getLat2(); - lon2a = a.getLon2(); - double distance2 = a.getDistance(); - double course2 = a.getCourse(); - b.setDistanceCourse(distance2, course2); - double lat2b = b.getLat2(); - double lon2b = b.getLon2(); - boolean success = areClose(lat2b, lat2a) && areClose(lon2a, lon2b); - if (success && (counter % 1023) != 1) { - return true; - } - System.out.println(); - System.out.println(counter + "\tLat " + degrees(lat2a) + "\tLong " + degrees(lon2a) - + "\tDistance " + degrees(distance2) + "\tCourse " + degrees(course2)); - System.out.println("\t\tLat2 " + degrees(lat2b) + "\tLong2 " + degrees(lon2b)); - return success; + public Navigator setLat1Lon1(double lat1, double lon1) { + if (lat1 < -Math.PI / 2 + EPS) { + lat1 = -Math.PI / 2; + lon1 = 0; // no point in distinguishing + } else if (lat1 > Math.PI / 2 + EPS) { + lat1 = Math.PI / 2; + lon1 = 0; // no point in distinguishing + } else { + lon1 = wrap(lon1, -Math.PI, Math.PI); + } + this.lat1 = lat1; + this.lon1 = lon1; + cosLat1 = Math.cos(lat1); + sinLat1 = Math.sin(lat1); + return this; } - } - private static boolean areClose(double a, double b) { - a -= b; - return (-EPS < a && a < EPS); - } - - private static final NumberFormat nf = new DecimalFormat("+000.000;-000.000"); - public static String degrees(double in) { - return nf.format(in/DEGREE) + '°'; - } - - public static double toRadians(double degrees, double minutes, double seconds, boolean northOrWest) { - double result = (degrees + minutes / 60 + seconds / 3600); - if (!northOrWest) { - result = 
-result; + + public Navigator setLat2Lon2(double lat2, double lon2) { + if (lat2 < -Math.PI / 2 + EPS) { + lat2 = -Math.PI / 2; + lon2 = 0; // no point in distinguishing + } else if (lat2 > Math.PI / 2 - EPS) { + lat2 = Math.PI / 2; + lon2 = 0; // no point in distinguishing + } else { + lon2 = wrap(lon2, -Math.PI, Math.PI); + } + this.lat2 = lat2; + this.lon2 = lon2; + + double cosLat2 = Math.cos(lat2); + double sinLat2 = Math.sin(lat2); + + // compute distance + double halfLatDiff = Math.sin((lat1 - lat2) / 2); + double halfLonDiff = Math.sin((lon1 - lon2) / 2); + + distance = + 2 + * Math.asin( + Math.sqrt( + halfLatDiff * halfLatDiff + + cosLat1 * cosLat2 * halfLonDiff * halfLonDiff)); + sinDistance = Math.sin(distance); + cosDistance = Math.cos(distance); + + // compute course + if (distance < EPS) { + course = 0; + } else if (cosLat1 < EPS) { + if (lat1 > 0) { + course = Math.PI; // starting from N pole + } else { + course = 2 * Math.PI; // starting from S pole + } + } else { + double cosCourse = (sinLat2 - sinLat1 * cosDistance) / (sinDistance * cosLat1); + if (cosCourse < -1.0) { + cosCourse = -1.0; + } + if (cosCourse > 1.0) { + cosCourse = 1.0; + } + course = Math.acos(cosCourse); + if (Math.sin(lon2 - lon1) >= 0) { + course = 2 * Math.PI - course; + } + } + return this; } - return result * DEGREE; - } - - public Navigator setLat1Lon1(double lat1, double lon1) { - if (lat1 < -Math.PI/2 + EPS) { - lat1 = -Math.PI/2; - lon1 = 0; // no point in distinguishing - } else if (lat1 > Math.PI/2 + EPS) { - lat1 = Math.PI/2; - lon1 = 0; // no point in distinguishing - } else { - lon1 = wrap(lon1, -Math.PI, Math.PI); + + public Navigator setDistanceCourse(double distance, double course) { + this.distance = distance; + this.course = course; + sinDistance = Math.sin(distance); + cosDistance = Math.cos(distance); + + lat2 = Math.asin(sinLat1 * cosDistance + cosLat1 * sinDistance * Math.cos(course)); + if (lat2 < -Math.PI / 2 + EPS || lat2 > Math.PI / 2 - EPS) { + lon2 = 
0; // no point in distinguishing + } else { + double dlon = + Math.atan2( + Math.sin(course) * sinDistance * cosLat1, + cosDistance - sinLat1 * Math.sin(lat2)); + lon2 = wrap(lon1 - dlon, -Math.PI, Math.PI); + } + return this; } - this.lat1 = lat1; - this.lon1 = lon1; - cosLat1 = Math.cos(lat1); - sinLat1 = Math.sin(lat1); - return this; - } - - public Navigator setLat2Lon2(double lat2, double lon2) { - if (lat2 < -Math.PI/2 + EPS) { - lat2 = -Math.PI/2; - lon2 = 0; // no point in distinguishing - } else if (lat2 > Math.PI/2 - EPS) { - lat2 = Math.PI/2; - lon2 = 0; // no point in distinguishing - } else { - lon2 = wrap(lon2, -Math.PI, Math.PI); + + public static double wrap(double aa, double low, double high) { + double a = aa - low; + double span = high - low; + if (a >= 0 && a < span) { + return aa; + } + double intQuotient = Math.floor(a / span); + return a - intQuotient * span + low; } - this.lat2 = lat2; - this.lon2 = lon2; - - double cosLat2 = Math.cos(lat2); - double sinLat2 = Math.sin(lat2); - - // compute distance - double halfLatDiff = Math.sin((lat1-lat2)/2); - double halfLonDiff = Math.sin((lon1-lon2)/2); - - distance = 2*Math.asin(Math.sqrt(halfLatDiff*halfLatDiff + cosLat1*cosLat2*halfLonDiff*halfLonDiff)); - sinDistance = Math.sin(distance); - cosDistance = Math.cos(distance); - - // compute course - if (distance < EPS) { - course = 0; - } else if (cosLat1 < EPS) { - if (lat1 > 0) { - course = Math.PI; // starting from N pole - } else { - course = 2*Math.PI; // starting from S pole - } - } else { - double cosCourse = (sinLat2-sinLat1*cosDistance)/(sinDistance*cosLat1); - if (cosCourse < -1.0) { - cosCourse = -1.0; - } - if (cosCourse > 1.0) { - cosCourse = 1.0; - } - course=Math.acos(cosCourse); - if (Math.sin(lon2-lon1) >= 0) { - course=2*Math.PI-course; - } + + public double getCourse() { + return course; } - return this; - } - - public Navigator setDistanceCourse(double distance, double course) { - this.distance = distance; - this.course = course; 
- sinDistance = Math.sin(distance); - cosDistance = Math.cos(distance); - - lat2 = Math.asin(sinLat1*cosDistance+cosLat1*sinDistance*Math.cos(course)); - if (lat2 < -Math.PI/2 + EPS || lat2 > Math.PI/2 - EPS) { - lon2 = 0; // no point in distinguishing - } else { - double dlon=Math.atan2( - Math.sin(course)*sinDistance*cosLat1, - cosDistance-sinLat1*Math.sin(lat2)); - lon2 = wrap(lon1-dlon, -Math.PI, Math.PI); + + public double getDistance() { + return distance; } - return this; - } - - public static double wrap(double aa, double low, double high) { - double a = aa - low; - double span = high - low; - if (a >= 0 && a < span) { - return aa; + + public double getLat1() { + return lat1; + } + + public double getLat2() { + return lat2; + } + + public double getLon1() { + return lon1; + } + + public double getLon2() { + return lon2; } - double intQuotient = Math.floor(a / span); - return a - intQuotient * span + low; - } - public double getCourse() { - return course; - } - public double getDistance() { - return distance; - } - public double getLat1() { - return lat1; - } - public double getLat2() { - return lat2; - } - public double getLon1() { - return lon1; - } - public double getLon2() { - return lon2; - } -} \ No newline at end of file +} diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/PropertyMetadata.java b/UnicodeJsps/src/main/java/org/unicode/jsp/PropertyMetadata.java index b895eacfa..43afbb37d 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/PropertyMetadata.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/PropertyMetadata.java @@ -1,16 +1,15 @@ package org.unicode.jsp; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R4; import java.util.Arrays; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; 
-import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R4; - public class PropertyMetadata { // #Property ; Source ; Datatype ; Category @@ -19,21 +18,28 @@ public static class PropertyMetaDatum { public final String source; public final String datatype; public final String category; - + public PropertyMetaDatum(String[] items) { source = items[1]; datatype = items[2]; category = items[3]; } + @Override public String toString() { - return "{source: " + source + ", datatype: " + datatype + ", category: " + category + "}"; + return "{source: " + + source + + ", datatype: " + + datatype + + ", category: " + + category + + "}"; } } private static final Map propToDatum; - private static final Set> CategoryDatatypeSourceProperty; + private static final Set> CategoryDatatypeSourceProperty; static { MyHandler myHandler = new MyHandler(); @@ -44,43 +50,50 @@ public String toString() { private static class MyHandler extends FileUtilities.SemiFileReader { - private Set> set = new TreeSet>(); - private Map _propToDatum = new TreeMap(); + private Set> set = + new TreeSet>(); + private Map _propToDatum = + new TreeMap(); protected boolean isCodePoint() { return false; } - + public boolean handleLine(int start, int end, String[] items) { if (items.length != 4) { - throw new IllegalArgumentException("Must have exactly 4 items: " + Arrays.asList(items)); + throw new IllegalArgumentException( + "Must have exactly 4 items: " + Arrays.asList(items)); } - PropertyMetaDatum datum = new PropertyMetaDatum(items); - _propToDatum.put(items[0], datum); - - set.add((R4) Row.of(items[3], items[2], items[1], items[0]).freeze()); - set.add((R4) Row.of(items[3], items[2], items[1], items[0]+"β").freeze()); + PropertyMetaDatum datum = new PropertyMetaDatum(items); + _propToDatum.put(items[0], datum); + + set.add( + (R4) + Row.of(items[3], items[2], items[1], items[0]).freeze()); + set.add( + (R4) + Row.of(items[3], items[2], items[1], items[0] + "β").freeze()); return true; } - + protected 
void handleEnd() { super.handleEnd(); } } - public static Set> getCategoryDatatypeSourceProperty() { + public static Set> getCategoryDatatypeSourceProperty() { return CategoryDatatypeSourceProperty; } public static Map getPropertyToData() { return propToDatum; } - + public static Set getPropertiesWithData() { return propToDatum.keySet(); } public static PropertyMetaDatum getData(String propName) { - return propToDatum.get(propName.replace("β","")); + return propToDatum.get(propName.replace("β", "")); } } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/QuickCheck.java b/UnicodeJsps/src/main/java/org/unicode/jsp/QuickCheck.java index cc9d138d8..d9cd8facb 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/QuickCheck.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/QuickCheck.java @@ -1,7 +1,7 @@ package org.unicode.jsp; public class QuickCheck { -public static void main(String[] args) { - System.out.println(UnicodeUtilities.toHTML("a character_compatibleScripts; - - - public enum CompatibilityLevel {Highly_Restrictive, Moderately_Restrictive} - public enum ScriptSpecials {on, off} - - - /** - * Space reserved for script codes not in ICU - */ - public static final int EXTRA_COUNT = 16; // should be enough, hard working as UTC is! - public static final Map extraScripts = new ConcurrentHashMap<>(EXTRA_COUNT); - /** - * Extended scripts; note that they do not have stable numbers, and should not be persisted. 
- */ - public static final int - //HANT = UScript.CODE_LIMIT, - //HANS = HANT + 1, - LIMIT = UScript.CODE_LIMIT + EXTRA_COUNT; // HANS + 1; - - private static String[][] EXTENDED_NAME = { - // Scripts without stable numbers - {"Hant", "Han Traditional"}, {"Hans", "Han Simplified"}, - }; - - static AtomicInteger scriptCounter = new AtomicInteger(UScript.CODE_LIMIT); - - static int getScriptCode(String script) { - try { - // If ICU has it, great - return UCharacter.getPropertyValueEnum(UProperty.SCRIPT, script); - } catch (com.ibm.icu.impl.IllegalIcuArgumentException iiae) { - // Make something up - int newCode = extraScripts.computeIfAbsent(script, script2 -> { - int i = scriptCounter.getAndIncrement(); - logger.warning("Synthesized scriptCode " + i + " for unrecognized script extension '"+script+"'"); - return i; - }); - // Verify we didn't run over - if (newCode >= LIMIT) { - throw new RuntimeException("computed script code of " + newCode + " for '"+script+"' overflows: have " + extraScripts.size() + - " scripts but EXTRA_COUNT=" + EXTRA_COUNT); - } - return newCode; - } - } - - public static String getScriptName(int extendedScriptCode, int choice) { - if (extendedScriptCode >= UScript.CODE_LIMIT) { - if (extendedScriptCode >= LIMIT) { - return EXTENDED_NAME[extendedScriptCode - LIMIT][choice]; - } else { - for (Map.Entry e : extraScripts.entrySet()) { - if(e.getValue() == extendedScriptCode) { - if(choice == 0) { - return e.getKey(); - } else { - return "New Script '"+ e.getKey() + "'"; - } - } - } - throw new IllegalArgumentException("Unknown extended script code " + extendedScriptCode); - } - } - return UCharacter.getPropertyValueName(UProperty.SCRIPT, extendedScriptCode, choice); - } - - - private static final BitSet ALL = new BitSet(LIMIT); // be careful when using this; can't freeze it! 
- static { - ALL.set(0, LIMIT, true); - } - - /** - * Build a ScriptTester - * @return - */ - public static Builder start(CompatibilityLevel level, ScriptSpecials specials) { - return new Builder(level, specials); - } - - public static Builder start() { - return new Builder(CompatibilityLevel.Highly_Restrictive, ScriptSpecials.on); - } - - public static Builder start(CompatibilityLevel level) { - return new Builder(level, ScriptSpecials.on); - } - - - /** - * If the scripts in the string are compatible, then returns a list of them. Otherwise returns an empty bitset. - * The input must be in NFD. - * @param input - * @return bitset of scripts found - */ - public boolean isOk(CharSequence input) { - input = Normalizer.normalize(input.toString(), Normalizer.NFD); - // We make one pass forward and one backward, finding if each characters scripts - // are compatible with the ones before and after - // We save the value that we collect on the first pass. - int cp; - int maxSize = input.length(); - int base = -1; - BitSet[] actual = new BitSet[maxSize]; - BitSet[] compat = new BitSet[maxSize]; - int codePointCount = 0; - BitSet compatBefore = new BitSet(LIMIT); - compatBefore.or(ALL); - int lastCp = -1; - for (int i = 0; i < maxSize; i += Character.charCount(cp)) { - cp = Character.codePointAt(input, i); - // check for mixed numbers - int type = UCharacter.getType(cp); - if (type == UCharacter.DECIMAL_DIGIT_NUMBER) { - int newBase = cp & 0xFFFFF0; - if (base < 0) { - base = newBase; - } else if (base != newBase){ - return false; - } - } - // check for multiple combining marks - if (type == UCharacter.NON_SPACING_MARK || type == UCharacter.ENCLOSING_MARK) { - if (lastCp == cp) { - return false; - } - } - // check scripts - compat[codePointCount] = character_compatibleScripts.get(cp); - actual[codePointCount] = getActualScripts(cp); - if (!actual[codePointCount].intersects(compatBefore)) { - return false; - } - compatBefore.and(compat[codePointCount]); - codePointCount++; - 
lastCp = cp; + static Logger logger = Logger.getLogger(ScriptTester.class.getName()); + private final UnicodeMap character_compatibleScripts; + + public enum CompatibilityLevel { + Highly_Restrictive, + Moderately_Restrictive } - compatBefore.or(ALL); - for (int i = codePointCount - 1; i >= 0; --i) { - if (!actual[i].intersects(compatBefore)) { - return false; - } - compatBefore.and(compat[i]); + + public enum ScriptSpecials { + on, + off } - // check numbers - return true; - } + /** Space reserved for script codes not in ICU */ + public static final int EXTRA_COUNT = 16; // should be enough, hard working as UTC is! + public static final Map extraScripts = new ConcurrentHashMap<>(EXTRA_COUNT); + /** Extended scripts; note that they do not have stable numbers, and should not be persisted. */ + public static final int + // HANT = UScript.CODE_LIMIT, + // HANS = HANT + 1, + LIMIT = UScript.CODE_LIMIT + EXTRA_COUNT; // HANS + 1; - // TODO, cache results - private BitSet getActualScripts(int cp) { - BitSet actualScripts = getScriptSpecials().get(cp); - if (actualScripts == null) { - actualScripts = new BitSet(LIMIT); - int script = UCharacter.getIntPropertyValue(cp, UProperty.SCRIPT); - actualScripts.set(script); - } - return actualScripts; - } - - public boolean filterTable(List> table) { - - // We make one pass forward and one backward, finding if each characters scripts - // are compatible with the ones before. - // We then make a second pass for the ones after. 
- // Could be optimized if needed - int maxSize = table.size(); - BitSet compatBefore = new BitSet(LIMIT); - compatBefore.or(ALL); - BitSet anyCompatAt = new BitSet(LIMIT); - - HashSet toRemove = new HashSet(); - for (int i = 0; i < maxSize; ++i) { - toRemove.clear(); - anyCompatAt.clear(); - Set column = table.get(i); - for (String item : column) { - BitSet compatibleScripts = getCompatibleScripts(item); // ANDed - anyCompatAt.or(compatibleScripts); - BitSet actualScripts = getActualScripts(item); // ORed - if (!actualScripts.intersects(compatBefore)) { - toRemove.add(item); + private static String[][] EXTENDED_NAME = { + // Scripts without stable numbers + {"Hant", "Han Traditional"}, {"Hans", "Han Simplified"}, + }; + + static AtomicInteger scriptCounter = new AtomicInteger(UScript.CODE_LIMIT); + + static int getScriptCode(String script) { + try { + // If ICU has it, great + return UCharacter.getPropertyValueEnum(UProperty.SCRIPT, script); + } catch (com.ibm.icu.impl.IllegalIcuArgumentException iiae) { + // Make something up + int newCode = + extraScripts.computeIfAbsent( + script, + script2 -> { + int i = scriptCounter.getAndIncrement(); + logger.warning( + "Synthesized scriptCode " + + i + + " for unrecognized script extension '" + + script + + "'"); + return i; + }); + // Verify we didn't run over + if (newCode >= LIMIT) { + throw new RuntimeException( + "computed script code of " + + newCode + + " for '" + + script + + "' overflows: have " + + extraScripts.size() + + " scripts but EXTRA_COUNT=" + + EXTRA_COUNT); + } + return newCode; } - } - column.removeAll(toRemove); - if (column.size() == 0) { - return false; - } - compatBefore.and(anyCompatAt); } - // now reverse order - compatBefore.or(ALL); - for (int i = maxSize - 1; i >= 0; --i) { - toRemove.clear(); - anyCompatAt.clear(); - Set column = table.get(i); - for (String item : column) { - BitSet compatibleScripts = getCompatibleScripts(item); // ANDed - anyCompatAt.or(compatibleScripts); - BitSet 
actualScripts = getActualScripts(item); // ORed - if (!actualScripts.intersects(compatBefore)) { - toRemove.add(item); + + public static String getScriptName(int extendedScriptCode, int choice) { + if (extendedScriptCode >= UScript.CODE_LIMIT) { + if (extendedScriptCode >= LIMIT) { + return EXTENDED_NAME[extendedScriptCode - LIMIT][choice]; + } else { + for (Map.Entry e : extraScripts.entrySet()) { + if (e.getValue() == extendedScriptCode) { + if (choice == 0) { + return e.getKey(); + } else { + return "New Script '" + e.getKey() + "'"; + } + } + } + throw new IllegalArgumentException( + "Unknown extended script code " + extendedScriptCode); + } } - } - column.removeAll(toRemove); - if (column.size() == 0) { - return false; - } - compatBefore.and(anyCompatAt); - } - return true; - } - - private BitSet getActualScripts(String item) { - BitSet toOrWith = new BitSet(LIMIT); - int cp; - for (int i = 0; i < item.length(); i += Character.charCount(cp)) { - cp = Character.codePointAt(item, i); - toOrWith.or(getActualScripts(cp)); - } - return toOrWith; - } - - private BitSet getCompatibleScripts(String item) { - BitSet toAndWith = new BitSet(LIMIT); - toAndWith.or(ALL); - int cp; - for (int i = 0; i < item.length(); i += Character.charCount(cp)) { - cp = Character.codePointAt(item, i); - toAndWith.and(character_compatibleScripts.get(cp)); - } - return toAndWith; - } - - /** - * Each character in item has a compatible set that intersects overall. 
- * @param item - * @param overallCompatible - * @return - */ - private boolean isCompatible(String input, BitSet overallCompatible) { - int cp; - for (int i = 0; i < input.length(); i += Character.charCount(cp)) { - cp = Character.codePointAt(input, i); - BitSet scripts = character_compatibleScripts.get(cp); // will never fail - if (!scripts.intersects(overallCompatible)) { - return false; - } + return UCharacter.getPropertyValueName(UProperty.SCRIPT, extendedScriptCode, choice); } - return true; - } - - // Ugly hack, because BitSet doesn't have the method. - private boolean contains(BitSet set1, BitSet set2) { - // quick check to verify intersecting - if (!set1.intersects(set2)) { - return false; - } - BitSet temp = new BitSet(); - temp.or(set2); - temp.and(set1); - // we now have the intersection. It must be equal to set2 - return temp.equals(set2); - } - - public static class ScriptExtensions { - - public static final Comparator COMPARATOR = new Comparator() { - - public int compare(BitSet o1, BitSet o2) { - int diff = o1.cardinality() - o2.cardinality(); - if (diff != 0) return diff; - if (o1.equals(o2)) return 0; - String n1 = getNames(o1, UProperty.NameChoice.LONG, " "); - String n2 = getNames(o2, UProperty.NameChoice.LONG, " "); - return n1.compareToIgnoreCase(n2); - } - }; - private UnicodeMap scriptSpecials; + private static final BitSet ALL = + new BitSet(LIMIT); // be careful when using this; can't freeze it! 
- public Collection getAvailableValues() { - return scriptSpecials.getAvailableValues(); + static { + ALL.set(0, LIMIT, true); } - public UnicodeSet getSet(BitSet value) { - return scriptSpecials.getSet(value); + /** + * Build a ScriptTester + * + * @return + */ + public static Builder start(CompatibilityLevel level, ScriptSpecials specials) { + return new Builder(level, specials); } - private static class MyHandler extends FileUtilities.SemiFileReader { - public final static Pattern SPACES = Pattern.compile("\\s+"); + public static Builder start() { + return new Builder(CompatibilityLevel.Highly_Restrictive, ScriptSpecials.on); + } - UnicodeMap map = new UnicodeMap(); + public static Builder start(CompatibilityLevel level) { + return new Builder(level, ScriptSpecials.on); + } - public boolean handleLine(int start, int end, String[] items) { - BitSet bitSet = new BitSet(LIMIT); - for (String script : SPACES.split(items[1])) { - int scriptCode = getScriptCode(script); - bitSet.set(scriptCode); + /** + * If the scripts in the string are compatible, then returns a list of them. Otherwise returns + * an empty bitset. The input must be in NFD. + * + * @param input + * @return bitset of scripts found + */ + public boolean isOk(CharSequence input) { + input = Normalizer.normalize(input.toString(), Normalizer.NFD); + // We make one pass forward and one backward, finding if each characters scripts + // are compatible with the ones before and after + // We save the value that we collect on the first pass. 
+ int cp; + int maxSize = input.length(); + int base = -1; + BitSet[] actual = new BitSet[maxSize]; + BitSet[] compat = new BitSet[maxSize]; + int codePointCount = 0; + BitSet compatBefore = new BitSet(LIMIT); + compatBefore.or(ALL); + int lastCp = -1; + for (int i = 0; i < maxSize; i += Character.charCount(cp)) { + cp = Character.codePointAt(input, i); + // check for mixed numbers + int type = UCharacter.getType(cp); + if (type == UCharacter.DECIMAL_DIGIT_NUMBER) { + int newBase = cp & 0xFFFFF0; + if (base < 0) { + base = newBase; + } else if (base != newBase) { + return false; + } + } + // check for multiple combining marks + if (type == UCharacter.NON_SPACING_MARK || type == UCharacter.ENCLOSING_MARK) { + if (lastCp == cp) { + return false; + } + } + // check scripts + compat[codePointCount] = character_compatibleScripts.get(cp); + actual[codePointCount] = getActualScripts(cp); + if (!actual[codePointCount].intersects(compatBefore)) { + return false; + } + compatBefore.and(compat[codePointCount]); + codePointCount++; + lastCp = cp; } - map.putAll(start, end, bitSet); + compatBefore.or(ALL); + for (int i = codePointCount - 1; i >= 0; --i) { + if (!actual[i].intersects(compatBefore)) { + return false; + } + compatBefore.and(compat[i]); + } + // check numbers return true; - } } - public static ScriptExtensions make(String directory, String filename) { - ScriptExtensions result = new ScriptExtensions(); - result.scriptSpecials = ((MyHandler) new MyHandler() - .process(directory, filename)).map.freeze(); - return result; + // TODO, cache results + private BitSet getActualScripts(int cp) { + BitSet actualScripts = getScriptSpecials().get(cp); + if (actualScripts == null) { + actualScripts = new BitSet(LIMIT); + int script = UCharacter.getIntPropertyValue(cp, UProperty.SCRIPT); + actualScripts.set(script); + } + return actualScripts; } - public static ScriptExtensions make(Class aClass, String filename) { - ScriptExtensions result = new ScriptExtensions(); - 
result.scriptSpecials = ((MyHandler) new MyHandler() - .process(aClass, filename)).map.freeze(); - return result; + public boolean filterTable(List> table) { + + // We make one pass forward and one backward, finding if each characters scripts + // are compatible with the ones before. + // We then make a second pass for the ones after. + // Could be optimized if needed + int maxSize = table.size(); + BitSet compatBefore = new BitSet(LIMIT); + compatBefore.or(ALL); + BitSet anyCompatAt = new BitSet(LIMIT); + + HashSet toRemove = new HashSet(); + for (int i = 0; i < maxSize; ++i) { + toRemove.clear(); + anyCompatAt.clear(); + Set column = table.get(i); + for (String item : column) { + BitSet compatibleScripts = getCompatibleScripts(item); // ANDed + anyCompatAt.or(compatibleScripts); + BitSet actualScripts = getActualScripts(item); // ORed + if (!actualScripts.intersects(compatBefore)) { + toRemove.add(item); + } + } + column.removeAll(toRemove); + if (column.size() == 0) { + return false; + } + compatBefore.and(anyCompatAt); + } + // now reverse order + compatBefore.or(ALL); + for (int i = maxSize - 1; i >= 0; --i) { + toRemove.clear(); + anyCompatAt.clear(); + Set column = table.get(i); + for (String item : column) { + BitSet compatibleScripts = getCompatibleScripts(item); // ANDed + anyCompatAt.or(compatibleScripts); + BitSet actualScripts = getActualScripts(item); // ORed + if (!actualScripts.intersects(compatBefore)) { + toRemove.add(item); + } + } + column.removeAll(toRemove); + if (column.size() == 0) { + return false; + } + compatBefore.and(anyCompatAt); + } + return true; } - public BitSet get(int codepoint) { - return scriptSpecials.get(codepoint); + private BitSet getActualScripts(String item) { + BitSet toOrWith = new BitSet(LIMIT); + int cp; + for (int i = 0; i < item.length(); i += Character.charCount(cp)) { + cp = Character.codePointAt(item, i); + toOrWith.or(getActualScripts(cp)); + } + return toOrWith; } - public void putAllInto(UnicodeMap 
char2scripts) { - char2scripts.putAll(scriptSpecials); + private BitSet getCompatibleScripts(String item) { + BitSet toAndWith = new BitSet(LIMIT); + toAndWith.or(ALL); + int cp; + for (int i = 0; i < item.length(); i += Character.charCount(cp)) { + cp = Character.codePointAt(item, i); + toAndWith.and(character_compatibleScripts.get(cp)); + } + return toAndWith; } - public static String getNames(BitSet value, int choice, String separator) { - return getNames(value, choice, separator, new TreeSet()); + /** + * Each character in item has a compatible set that intersects overall. + * + * @param item + * @param overallCompatible + * @return + */ + private boolean isCompatible(String input, BitSet overallCompatible) { + int cp; + for (int i = 0; i < input.length(); i += Character.charCount(cp)) { + cp = Character.codePointAt(input, i); + BitSet scripts = character_compatibleScripts.get(cp); // will never fail + if (!scripts.intersects(overallCompatible)) { + return false; + } + } + return true; } - public static String getNames(BitSet value, int choice, String separator, Set names) { - names.clear(); - for (int i = value.nextSetBit(0); i >= 0; i = value.nextSetBit(i+1)) { - names.add(ScriptTester.getScriptName(i, choice)); - } - return CollectionUtilities.join(names, separator).toString(); + // Ugly hack, because BitSet doesn't have the method. + private boolean contains(BitSet set1, BitSet set2) { + // quick check to verify intersecting + if (!set1.intersects(set2)) { + return false; + } + BitSet temp = new BitSet(); + temp.or(set2); + temp.and(set1); + // we now have the intersection. 
It must be equal to set2 + return temp.equals(set2); } - } - static final class ScriptExtensionsHelper { - ScriptExtensions scriptSpecials; + public static class ScriptExtensions { - ScriptExtensionsHelper() { - scriptSpecials = ScriptExtensions.make(ScriptExtensions.class, "ScriptExtensions.txt"); - } + public static final Comparator COMPARATOR = + new Comparator() { - static ScriptExtensionsHelper INSTANCE = new ScriptExtensionsHelper(); - } - - static final ScriptExtensions getScriptSpecials() { - return ScriptExtensionsHelper.INSTANCE.scriptSpecials; - } - - public static BitSet getScriptSpecials(int codepoint) { - BitSet output = new BitSet(LIMIT); - BitSet actualScripts = getScriptSpecials().get(codepoint); - if (actualScripts != null) { - output.or(actualScripts); - } else { - int script = UCharacter.getIntPropertyValue(codepoint, UProperty.SCRIPT); - output.set(script); - } - return output; - } + public int compare(BitSet o1, BitSet o2) { + int diff = o1.cardinality() - o2.cardinality(); + if (diff != 0) return diff; + if (o1.equals(o2)) return 0; + String n1 = getNames(o1, UProperty.NameChoice.LONG, " "); + String n2 = getNames(o2, UProperty.NameChoice.LONG, " "); + return n1.compareToIgnoreCase(n2); + } + }; - public static UnicodeMap getScriptSpecialsNames() { - UnicodeMap result = new UnicodeMap(); - Set names = new TreeSet(); // to alphabetize + private UnicodeMap scriptSpecials; - for (BitSet value : getScriptSpecials().getAvailableValues()) { - result.putAll(getScriptSpecials().getSet(value), ScriptExtensions.getNames(value, UProperty.NameChoice.LONG, ",", names)); - } - return result; - } - - public static String[][] getScriptSpecialsAlternates() { - Collection availableValues = getScriptSpecials().getAvailableValues(); - String[][] result = new String[availableValues.size()][]; - Set names = new TreeSet(); // to alphabetize - - int i = 0; - for (BitSet value : availableValues) { - String baseName = ScriptExtensions.getNames(value, 
UProperty.NameChoice.LONG, ",", names); - String altName = ScriptExtensions.getNames(value, UProperty.NameChoice.SHORT, ",", names); - String[] row = {baseName, altName}; - result[i++] = row; - } - return result; - } - - private ScriptTester(UnicodeMap character_scripts) { - this.character_compatibleScripts = character_scripts; - } - - public static class Builder { - - private final Map compatible = new TreeMap(); - private final UnicodeMap char2scripts = new UnicodeMap(); - - private Builder(CompatibilityLevel level, ScriptSpecials specials) { - // make everything compatible with itself - for (int i = 0; i < LIMIT; ++i) { - BitSet itself = new BitSet(LIMIT); - itself.set(i); - compatible.put(i, itself); - } - // first do levels - switch (level) { - case Moderately_Restrictive: - for (int i = 0; i < LIMIT; ++i) { - if (i == UScript.CYRILLIC || i == UScript.GREEK || i == UScript.CHEROKEE) { - continue; - } - addCompatible(UScript.LATIN, i); + public Collection getAvailableValues() { + return scriptSpecials.getAvailableValues(); + } + + public UnicodeSet getSet(BitSet value) { + return scriptSpecials.getSet(value); + } + + private static class MyHandler extends FileUtilities.SemiFileReader { + public static final Pattern SPACES = Pattern.compile("\\s+"); + + UnicodeMap map = new UnicodeMap(); + + public boolean handleLine(int start, int end, String[] items) { + BitSet bitSet = new BitSet(LIMIT); + for (String script : SPACES.split(items[1])) { + int scriptCode = getScriptCode(script); + bitSet.set(scriptCode); + } + map.putAll(start, end, bitSet); + return true; + } } - // FALL THRU! 
- case Highly_Restrictive: - addCompatible(UScript.LATIN, UScript.HAN, UScript.HIRAGANA, UScript.KATAKANA); - //addCompatible(UScript.LATIN, HANT, UScript.HIRAGANA, UScript.KATAKANA); - //addCompatible(UScript.LATIN, HANS, UScript.HIRAGANA, UScript.KATAKANA); - - addCompatible(UScript.LATIN, UScript.HAN, UScript.HANGUL); - //addCompatible(UScript.LATIN, HANT, UScript.HANGUL); - //addCompatible(UScript.LATIN, HANS, UScript.HANGUL); - - addCompatible(UScript.LATIN, UScript.HAN, UScript.BOPOMOFO); - addCompatible(UScript.LATIN, UScript.HAN); - // ?? Asomtavruli, Nuskhuri, and Mkhedruli (georgian) - // FALL THRU! - default: - //addCompatible(UScript.HAN, HANT); - //addCompatible(UScript.HAN, HANS); - // Common and Inherited are compatible with everything! - for (int i = 0; i < LIMIT; ++i) { - addCompatible(UScript.COMMON, i); - addCompatible(UScript.INHERITED, i); + + public static ScriptExtensions make(String directory, String filename) { + ScriptExtensions result = new ScriptExtensions(); + result.scriptSpecials = + ((MyHandler) new MyHandler().process(directory, filename)).map.freeze(); + return result; + } + + public static ScriptExtensions make(Class aClass, String filename) { + ScriptExtensions result = new ScriptExtensions(); + result.scriptSpecials = + ((MyHandler) new MyHandler().process(aClass, filename)).map.freeze(); + return result; + } + + public BitSet get(int codepoint) { + return scriptSpecials.get(codepoint); + } + + public void putAllInto(UnicodeMap char2scripts) { + char2scripts.putAll(scriptSpecials); } - } - // then specials - // fix the char2scripts mapping - if (specials == ScriptSpecials.on){ - getScriptSpecials().putAllInto(char2scripts); - } + public static String getNames(BitSet value, int choice, String separator) { + return getNames(value, choice, separator, new TreeSet()); + } + + public static String getNames( + BitSet value, int choice, String separator, Set names) { + names.clear(); + for (int i = value.nextSetBit(0); i >= 0; i = 
value.nextSetBit(i + 1)) { + names.add(ScriptTester.getScriptName(i, choice)); + } + return CollectionUtilities.join(names, separator).toString(); + } } - public ScriptTester get() { - UnicodeMap character_scripts = new UnicodeMap(); - // first set all the simple cases: character => script => scripts - for (int script = 0; script < UScript.CODE_LIMIT; ++script) { - UnicodeSet uset = new UnicodeSet(); - uset.applyIntPropertyValue(UProperty.SCRIPT, script); - if (uset.size() != 0) { - BitSet scripts = compatible.get(script); - character_scripts.putAll(uset, scripts); + static final class ScriptExtensionsHelper { + ScriptExtensions scriptSpecials; + + ScriptExtensionsHelper() { + scriptSpecials = ScriptExtensions.make(ScriptExtensions.class, "ScriptExtensions.txt"); } - } - // now override these (as necessary) with the charScriptMapping - for (BitSet scripts : char2scripts.values()) { - // The scripts need fluffing up according to the acceptableTogether sets - // We have to create new Bitsets! 
- BitSet fluffed = new BitSet(LIMIT); - fluffed.or(scripts); - for (int unfluffedScript = scripts.nextSetBit(0); unfluffedScript >= 0; unfluffedScript = scripts.nextSetBit(unfluffedScript+1)) { - BitSet acceptable = compatible.get(unfluffedScript); - fluffed.or(acceptable); + + static ScriptExtensionsHelper INSTANCE = new ScriptExtensionsHelper(); + } + + static final ScriptExtensions getScriptSpecials() { + return ScriptExtensionsHelper.INSTANCE.scriptSpecials; + } + + public static BitSet getScriptSpecials(int codepoint) { + BitSet output = new BitSet(LIMIT); + BitSet actualScripts = getScriptSpecials().get(codepoint); + if (actualScripts != null) { + output.or(actualScripts); + } else { + int script = UCharacter.getIntPropertyValue(codepoint, UProperty.SCRIPT); + output.set(script); } - UnicodeSet uset = char2scripts.getSet(scripts); - character_scripts.putAll(uset, fluffed); - } - return new ScriptTester(character_scripts); + return output; } - /** - * Add list of scripts that are acceptable in combination together. - *

Example: st.addAcceptable(UScript.LATIN, USCRIPT.HANGUL);

- * @param scripts - */ - public Builder addCompatible(int... scripts) { - // set all the scripts on each of the other scripts - for (int script : scripts) { - BitSet items = compatible.get(script); - for (int script2 : scripts) { - items.set(script2); + + public static UnicodeMap getScriptSpecialsNames() { + UnicodeMap result = new UnicodeMap(); + Set names = new TreeSet(); // to alphabetize + + for (BitSet value : getScriptSpecials().getAvailableValues()) { + result.putAll( + getScriptSpecials().getSet(value), + ScriptExtensions.getNames(value, UProperty.NameChoice.LONG, ",", names)); } - } - return this; + return result; } - /** - * Add mapping from code point to scripts - *

Example: st.addMapping(0x, USCRIPT.HIRAGANA, USCRIPT.KATAKANA); // U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK

- */ - public Builder addMapping(int codePoint, int... scripts) { - BitSet newScripts = new BitSet(LIMIT); - BitSet oldScripts = char2scripts.get(codePoint); - if (oldScripts != null) { - newScripts.or(oldScripts); - } - for (int script : scripts) { - newScripts.set(script); - } - char2scripts.put(codePoint, newScripts); - return this; + public static String[][] getScriptSpecialsAlternates() { + Collection availableValues = getScriptSpecials().getAvailableValues(); + String[][] result = new String[availableValues.size()][]; + Set names = new TreeSet(); // to alphabetize + + int i = 0; + for (BitSet value : availableValues) { + String baseName = + ScriptExtensions.getNames(value, UProperty.NameChoice.LONG, ",", names); + String altName = + ScriptExtensions.getNames(value, UProperty.NameChoice.SHORT, ",", names); + String[] row = {baseName, altName}; + result[i++] = row; + } + return result; + } + + private ScriptTester(UnicodeMap character_scripts) { + this.character_compatibleScripts = character_scripts; } - } + public static class Builder { + + private final Map compatible = new TreeMap(); + private final UnicodeMap char2scripts = new UnicodeMap(); + + private Builder(CompatibilityLevel level, ScriptSpecials specials) { + // make everything compatible with itself + for (int i = 0; i < LIMIT; ++i) { + BitSet itself = new BitSet(LIMIT); + itself.set(i); + compatible.put(i, itself); + } + // first do levels + switch (level) { + case Moderately_Restrictive: + for (int i = 0; i < LIMIT; ++i) { + if (i == UScript.CYRILLIC || i == UScript.GREEK || i == UScript.CHEROKEE) { + continue; + } + addCompatible(UScript.LATIN, i); + } + // FALL THRU! 
+ case Highly_Restrictive: + addCompatible(UScript.LATIN, UScript.HAN, UScript.HIRAGANA, UScript.KATAKANA); + // addCompatible(UScript.LATIN, HANT, UScript.HIRAGANA, UScript.KATAKANA); + // addCompatible(UScript.LATIN, HANS, UScript.HIRAGANA, UScript.KATAKANA); + + addCompatible(UScript.LATIN, UScript.HAN, UScript.HANGUL); + // addCompatible(UScript.LATIN, HANT, UScript.HANGUL); + // addCompatible(UScript.LATIN, HANS, UScript.HANGUL); + + addCompatible(UScript.LATIN, UScript.HAN, UScript.BOPOMOFO); + addCompatible(UScript.LATIN, UScript.HAN); + // ?? Asomtavruli, Nuskhuri, and Mkhedruli (georgian) + // FALL THRU! + default: + // addCompatible(UScript.HAN, HANT); + // addCompatible(UScript.HAN, HANS); + // Common and Inherited are compatible with everything! + for (int i = 0; i < LIMIT; ++i) { + addCompatible(UScript.COMMON, i); + addCompatible(UScript.INHERITED, i); + } + } + // then specials + // fix the char2scripts mapping + + if (specials == ScriptSpecials.on) { + getScriptSpecials().putAllInto(char2scripts); + } + } + + public ScriptTester get() { + UnicodeMap character_scripts = new UnicodeMap(); + // first set all the simple cases: character => script => scripts + for (int script = 0; script < UScript.CODE_LIMIT; ++script) { + UnicodeSet uset = new UnicodeSet(); + uset.applyIntPropertyValue(UProperty.SCRIPT, script); + if (uset.size() != 0) { + BitSet scripts = compatible.get(script); + character_scripts.putAll(uset, scripts); + } + } + // now override these (as necessary) with the charScriptMapping + for (BitSet scripts : char2scripts.values()) { + // The scripts need fluffing up according to the acceptableTogether sets + // We have to create new Bitsets! 
+ BitSet fluffed = new BitSet(LIMIT); + fluffed.or(scripts); + for (int unfluffedScript = scripts.nextSetBit(0); + unfluffedScript >= 0; + unfluffedScript = scripts.nextSetBit(unfluffedScript + 1)) { + BitSet acceptable = compatible.get(unfluffedScript); + fluffed.or(acceptable); + } + UnicodeSet uset = char2scripts.getSet(scripts); + character_scripts.putAll(uset, fluffed); + } + return new ScriptTester(character_scripts); + } + /** + * Add list of scripts that are acceptable in combination together. + * + *

Example: st.addAcceptable(UScript.LATIN, USCRIPT.HANGUL); + * + * @param scripts + */ + public Builder addCompatible(int... scripts) { + // set all the scripts on each of the other scripts + for (int script : scripts) { + BitSet items = compatible.get(script); + for (int script2 : scripts) { + items.set(script2); + } + } + return this; + } + + /** + * Add mapping from code point to scripts + * + *

Example: st.addMapping(0x, USCRIPT.HIRAGANA, USCRIPT.KATAKANA); // U+30FC + * KATAKANA-HIRAGANA PROLONGED SOUND MARK + */ + public Builder addMapping(int codePoint, int... scripts) { + BitSet newScripts = new BitSet(LIMIT); + BitSet oldScripts = char2scripts.get(codePoint); + if (oldScripts != null) { + newScripts.or(oldScripts); + } + for (int script : scripts) { + newScripts.set(script); + } + char2scripts.put(codePoint, newScripts); + return this; + } + } } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/ScriptTester2.java b/UnicodeJsps/src/main/java/org/unicode/jsp/ScriptTester2.java index 68d4b87d5..0815290f1 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/ScriptTester2.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/ScriptTester2.java @@ -1,23 +1,7 @@ package org.unicode.jsp; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.TreeSet; - import com.google.common.base.Splitter; -import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; -import com.google.common.collect.ImmutableSortedMap; import com.google.common.collect.Multimap; import com.google.common.collect.TreeMultimap; import com.ibm.icu.dev.util.UnicodeMap; @@ -27,6 +11,17 @@ import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.VersionInfo; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map.Entry; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.TreeSet; public class ScriptTester2 { private final UnicodeMap toEquivalents; @@ -36,7 
+31,13 @@ public class ScriptTester2 { private final UnicodeMap confusables; private final SortedMap multipleToSingleConfusable; - private ScriptTester2(Set all, UnicodeMap confusables, SortedMap multipleToSingleConfusable, UnicodeMap> scripts, UnicodeMap equiv2, UnicodeSet allowed) { + private ScriptTester2( + Set all, + UnicodeMap confusables, + SortedMap multipleToSingleConfusable, + UnicodeMap> scripts, + UnicodeMap equiv2, + UnicodeSet allowed) { this.allScripts = all; this.confusables = confusables; toEquivalents = equiv2; @@ -48,10 +49,11 @@ private ScriptTester2(Set all, UnicodeMap confusables, SortedMap public static ScriptTester2 getInstance(VersionInfo version, UnicodeSet allowed) { allowed = allowed.isFrozen() ? allowed : new UnicodeSet(allowed).freeze(); CachedProps props = CachedProps.getInstance(version); - //System.out.println(new TreeSet(props.getAvailable())); + // System.out.println(new TreeSet(props.getAvailable())); UnicodeMap confusables = props.getProperty("Confusable_MA").getUnicodeMap(); UnicodeMap equiv = new UnicodeMap(); - SortedMap multipleToSingle = new TreeMap(new UTF16.StringComparator(true,false,0)); + SortedMap multipleToSingle = + new TreeMap(new UTF16.StringComparator(true, false, 0)); for (String value : confusables.values()) { UnicodeSet us = new UnicodeSet(confusables.getSet(value)).add(value).retainAll(allowed); if (us.isEmpty()) { @@ -102,7 +104,13 @@ public static ScriptTester2 getInstance(VersionInfo version, UnicodeSet allowed) toScripts.putAll(us, ImmutableSet.copyOf(scriptSet)); } - return new ScriptTester2(all, confusables, Collections.unmodifiableSortedMap(multipleToSingle), toScripts, equiv, allowed); + return new ScriptTester2( + all, + confusables, + Collections.unmodifiableSortedMap(multipleToSingle), + toScripts, + equiv, + allowed); } public static UnicodeSet getAllowedStatus(VersionInfo version) { @@ -131,12 +139,17 @@ public Set getScripts(CharSequence value) { return intersection; } - public enum 
ScriptRestriction {any, wholeScript} + public enum ScriptRestriction { + any, + wholeScript + } - public List> getData(CharSequence value, ScriptRestriction scriptRestriction) { + public List> getData( + CharSequence value, ScriptRestriction scriptRestriction) { value = getSkeleton(value); List> result = new ArrayList(); - HashSet foundScripts = scriptRestriction == ScriptRestriction.any ? null : new HashSet(); + HashSet foundScripts = + scriptRestriction == ScriptRestriction.any ? null : new HashSet(); for (int cp : CharSequences.codePoints(value)) { UnicodeSet current = toEquivalents.get(cp); if (current == null) { @@ -170,7 +183,7 @@ public List> getData(CharSequence value, ScriptRestrict } private Multimap getScriptsToChars(UnicodeSet current) { - Multimap result = TreeMultimap.create(); + Multimap result = TreeMultimap.create(); for (String s : current) { Set scriptSet = toScripts.get(s); if (scriptSet.equals(allScripts)) { @@ -217,7 +230,7 @@ void checkData() { } } for (Entry entry : multipleToSingleConfusable.entrySet()) { - //System.out.println(entry.getKey() + "\t" + UTF16.valueOf(entry.getValue())); + // System.out.println(entry.getKey() + "\t" + UTF16.valueOf(entry.getValue())); // check for overlaps String source = entry.getKey(); String partial = source; @@ -232,20 +245,31 @@ public static void main(String[] args) { VersionInfo version = VersionInfo.getInstance(10); UnicodeSet allowedStatus = ScriptTester2.getAllowedStatus(version); UnicodeSet nfkd_Quick_CheckNo = ScriptTester2.getNFKD_Quick_CheckNo(version); - ScriptTester2 tester = ScriptTester2.getInstance(version, - new UnicodeSet(0,0x10ffff) - // .removeAll(nfkd_Quick_CheckNo) - // .removeAll(new UnicodeSet("[^[:scx=cyrl:][:scx=latn:][:scx=common:][:scx=inherited:]]")) -// .retainAll(allowedStatus) - ); - - + ScriptTester2 tester = + ScriptTester2.getInstance( + version, new UnicodeSet(0, 0x10ffff) + // .removeAll(nfkd_Quick_CheckNo) + // .removeAll(new + // 
UnicodeSet("[^[:scx=cyrl:][:scx=latn:][:scx=common:][:scx=inherited:]]")) + // .retainAll(allowedStatus) + ); + tester.checkData(); - - for (String s : Arrays.asList("came", "apple", "scope", "Circle", "СігсӀе", "Сirсlе", "Circ1e", "C𝗂𝗋𝖼𝗅𝖾", "〆切", "ねガ", - "abcdefghijklmnopqrstuvwxyz", - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - )) { + + for (String s : + Arrays.asList( + "came", + "apple", + "scope", + "Circle", + "СігсӀе", + "Сirсlе", + "Circ1e", + "C𝗂𝗋𝖼𝗅𝖾", + "〆切", + "ねガ", + "abcdefghijklmnopqrstuvwxyz", + "ABCDEFGHIJKLMNOPQRSTUVWXYZ")) { System.out.println(s + "\t" + tester.getScripts(s)); for (Multimap data : tester.getData(s, ScriptRestriction.any)) { System.out.println("\t" + data); @@ -254,7 +278,7 @@ public static void main(String[] args) { // for (Set s : tester.scripts.values()) { // String sample = tester.scripts.getSet(s).iterator().next(); // UnicodeSet equivs = tester.equivalents.get(sample); - // System.out.println(sample + // System.out.println(sample // + "\n\t" + (equivs == null ? "?" 
: equivs.toPattern(false)) // + "\n\t" + tester.scripts.get(sample)); // } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/SequenceData.java b/UnicodeJsps/src/main/java/org/unicode/jsp/SequenceData.java index 91bcc159b..e858b1e79 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/SequenceData.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/SequenceData.java @@ -1,22 +1,19 @@ package org.unicode.jsp; -import java.util.Arrays; -import java.util.List; -import java.util.regex.Pattern; - -import org.unicode.cldr.draft.FileUtilities; - -// import com.google.common.base.Splitter; import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.CharSequences; import com.ibm.icu.text.UnicodeSet; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; +import org.unicode.cldr.draft.FileUtilities; public class SequenceData { - static final UnicodeSet REGIONAL_INDICATORS = new UnicodeSet(0x1F1E6,0x1F1FF).freeze(); + static final UnicodeSet REGIONAL_INDICATORS = new UnicodeSet(0x1F1E6, 0x1F1FF).freeze(); static final UnicodeSet MARK = new UnicodeSet("[:M:]").freeze(); static final UnicodeSet MODIFIER = new UnicodeSet("[:emoji_modifier:]").freeze(); - static final UnicodeSet TAGS = new UnicodeSet(0xe0020,0xe007F).freeze(); + static final UnicodeSet TAGS = new UnicodeSet(0xe0020, 0xe007F).freeze(); // static final Splitter SEMI = Splitter.onPattern("[;#]").trimResults(); // static final Splitter COMMA = Splitter.on(",").trimResults(); @@ -53,16 +50,21 @@ public class SequenceData { EMOJI_KEYCAP_SEQUENCES.add(source); EMOJI_DEFECTIVES.add(source.codePointAt(0)); // TODO fix data - final StringBuilder sourceWithVs = new StringBuilder() - .appendCodePoint(first) - .append('\uFE0F') - .append(source, Character.charCount(first), source.length()); + final StringBuilder sourceWithVs = + new StringBuilder() + .appendCodePoint(first) + .append('\uFE0F') + .append( + source, + Character.charCount(first), + 
source.length()); EMOJI_KEYCAP_SEQUENCES.add(sourceWithVs); } else if (TAGS.containsSome(source)) { EMOJI_TAG_SEQUENCES.add(source); } else { int[] codepoints = CharSequences.codePoints(source); - throw new IllegalArgumentException("internal error" + Arrays.asList(codepoints)); + throw new IllegalArgumentException( + "internal error" + Arrays.asList(codepoints)); } } } @@ -78,7 +80,8 @@ public class SequenceData { if (selectors == null) { VARIATION_BASE_TO_SELECTORS.put(first, selectors = new UnicodeSet(second, second)); } else { - VARIATION_BASE_TO_SELECTORS.put(first, selectors = new UnicodeSet(selectors).add(second)); + VARIATION_BASE_TO_SELECTORS.put( + first, selectors = new UnicodeSet(selectors).add(second)); } } EMOJI_ZWJ_SEQUENCES.freeze(); @@ -90,11 +93,13 @@ public class SequenceData { VARIATION_BASE_TO_SELECTORS.freeze(); } + public static void main(String[] args) { System.out.println("EMOJI_ZWJ_SEQUENCES: " + EMOJI_ZWJ_SEQUENCES.toPattern(false)); System.out.println("EMOJI_FLAG_SEQUENCES: " + EMOJI_FLAG_SEQUENCES.toPattern(false)); System.out.println("EMOJI_KEYCAP_SEQUENCES: " + EMOJI_KEYCAP_SEQUENCES.toPattern(false)); - System.out.println("EMOJI_MODIFIER_SEQUENCES: " + EMOJI_MODIFIER_SEQUENCES.toPattern(false)); + System.out.println( + "EMOJI_MODIFIER_SEQUENCES: " + EMOJI_MODIFIER_SEQUENCES.toPattern(false)); System.out.println("EMOJI_DEFECTIVES: " + EMOJI_DEFECTIVES.toPattern(false)); System.out.println("EMOJI_TAG_SEQUENCES: " + EMOJI_TAG_SEQUENCES.toPattern(false)); System.out.println("VARIATION_BASE_TO_SELECTORS: " + VARIATION_BASE_TO_SELECTORS); diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/SimpleTransliterator.java b/UnicodeJsps/src/main/java/org/unicode/jsp/SimpleTransliterator.java index 14e11e325..5453ecc20 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/SimpleTransliterator.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/SimpleTransliterator.java @@ -9,30 +9,26 @@ */ package org.unicode.jsp; + import 
com.ibm.icu.text.Replaceable; import com.ibm.icu.text.Transform; import com.ibm.icu.text.Transliterator; final class SimpleTransliterator extends Transliterator { - private final Transform stringTransform; + private final Transform stringTransform; - /** - * Constructs a transliterator. - */ - public SimpleTransliterator(String id, Transform stringTransform) { + /** Constructs a transliterator. */ + public SimpleTransliterator(String id, Transform stringTransform) { super(id, null); this.stringTransform = stringTransform; } - /** - * Implements {@link Transliterator#handleTransliterate}. - */ - protected void handleTransliterate(Replaceable text, - Position offsets, boolean isIncremental) { + /** Implements {@link Transliterator#handleTransliterate}. */ + protected void handleTransliterate(Replaceable text, Position offsets, boolean isIncremental) { // start and limit of the input range int start = offsets.start; int limit = offsets.limit; - if(start >= limit) { + if (start >= limit) { return; } // should convert in small chunks, but for now... 
diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/Subheader.java b/UnicodeJsps/src/main/java/org/unicode/jsp/Subheader.java index 43cfd1953..6de137898 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/Subheader.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/Subheader.java @@ -1,6 +1,10 @@ package org.unicode.jsp; - +import com.ibm.icu.impl.Relation; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -18,23 +22,16 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.util.MultiComparator; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - -public class Subheader implements Iterable { +public class Subheader implements Iterable { static final boolean DEBUG = false; Matcher subheadMatcher = Pattern.compile("(@+)\\s+(.*)").matcher(""); Matcher hexMatcher = Pattern.compile("([A-Z0-9]+).*").matcher(""); Map codePoint2Subblock = new HashMap(); Map subblock2UnicodeSet = new TreeMap(); - Map> block2subblock = new TreeMap>(); - Map> subblock2block = new TreeMap>(); + Map> block2subblock = new TreeMap>(); + Map> subblock2block = new TreeMap>(); public Subheader(String unicodeDataDirectory) { try { @@ -62,18 +59,21 @@ public Subheader(InputStream resourceAsStream) { fillTables(); } - static final Comparator SHORTEST_FIRST = new Comparator() { - @Override - public int compare(CharSequence arg0, CharSequence arg1) { - return arg0.length() - arg1.length(); - } - }; + static final Comparator SHORTEST_FIRST = + new Comparator() { + @Override + public int compare(CharSequence arg0, CharSequence arg1) { + return arg0.length() - arg1.length(); + } + }; - static 
final MultiComparator SHORTEST = new MultiComparator(SHORTEST_FIRST, UnicodeSetUtilities.MAIN_COLLATOR); + static final MultiComparator SHORTEST = + new MultiComparator(SHORTEST_FIRST, UnicodeSetUtilities.MAIN_COLLATOR); private void fillTables() { // fix plurals & casing - Relation caseless = new Relation(new TreeMap(), TreeSet.class, SHORTEST); + Relation caseless = + new Relation(new TreeMap(), TreeSet.class, SHORTEST); for (String subhead : subblock2UnicodeSet.keySet()) { String norm = getSkeleton(subhead); @@ -85,8 +85,9 @@ private void fillTables() { if (set.size() == 1) { continue; } - if (DEBUG) System.out.println("***Merging similar names:\t" + set + "\tskeleton:" + norm); - + if (DEBUG) + System.out.println("***Merging similar names:\t" + set + "\tskeleton:" + norm); + UnicodeSet best = null; String bestName = null; for (String name : set) { @@ -110,10 +111,15 @@ private void fillTables() { for (String subblock : subblock2UnicodeSet.keySet()) { final UnicodeSet uset = subblock2UnicodeSet.get(subblock); - for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.next(); ) { codePoint2Subblock.put(it.codepoint, subblock); - String block = UCharacter.getStringPropertyValue(UProperty.BLOCK, it.codepoint, UProperty.NameChoice.LONG).toString().replace('_', ' ').intern(); + String block = + UCharacter.getStringPropertyValue( + UProperty.BLOCK, it.codepoint, UProperty.NameChoice.LONG) + .toString() + .replace('_', ' ') + .intern(); Set set = block2subblock.get(block); if (set == null) { @@ -130,11 +136,13 @@ private void fillTables() { } } - static final Pattern NON_ALPHANUM = Pattern.compile("[^" + - "\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Lm}" + - "\\p{Me}\\p{Mc}\\p{Mn}" + - "\\p{Nd}" + - "]+"); + static final Pattern NON_ALPHANUM = + Pattern.compile( + "[^" + + "\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Lm}" + + "\\p{Me}\\p{Mc}\\p{Mn}" + + "\\p{Nd}" + + "]+"); static final Pattern TERMINATION = 
Pattern.compile("(ies|es|s|y)_"); static final Pattern INITIAL_GORP = Pattern.compile("$[A-Z]\\."); @@ -167,12 +175,11 @@ private String getSkeleton(String input) { result = result.replace("_archaic_", "_historic_"); result = result.replace("_general_use_", "_general_"); - - return result; } - private Map getDataFromFile(String filename) throws FileNotFoundException, IOException { + private Map getDataFromFile(String filename) + throws FileNotFoundException, IOException { InputStream is = new FileInputStream(filename); return getDataFromStream(is); } @@ -215,4 +222,4 @@ public Iterator iterator() { public UnicodeSet getUnicodeSet(String subhead) { return subblock2UnicodeSet.get(subhead); } -} \ No newline at end of file +} diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/SubheaderSnapshot.java b/UnicodeJsps/src/main/java/org/unicode/jsp/SubheaderSnapshot.java index 1513f427f..bbced3e11 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/SubheaderSnapshot.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/SubheaderSnapshot.java @@ -1,688 +1,784 @@ package org.unicode.jsp; public class SubheaderSnapshot { - static final String[][] data = { - {"APL","[\\u2336-\\u237A\\u2395]"}, - {"ASCII digits","[0-9]"}, - {"ASCII punctuation and symbols","[\\ -/\\:-@\\[-`\\{-~]"}, - {"Abbreviation mark","[\\uA66F]"}, - {"Accidentals","[\\U0001D12A-\\U0001D133]"}, - {"Addition for German typography","[\\u1E9E]"}, - {"Addition for Sanskrit","[\\u0C3D\\u0D3D]"}, - {"Addition for Torwali","[\\u0772]"}, - {"Additional Suzhou numerals","[\\u3038-\\u303A]"}, - {"Additional archaic letters for Bactrian","[\\u03F7\\u03F8]"}, - {"Additional bold Greek symbols","[\\U0001D6DB-\\U0001D6E1\\U0001D7CA\\U0001D7CB]"}, - {"Additional bold italic Greek symbols","[\\U0001D74F-\\U0001D755]"}, - {"Additional circled number","[\\u24EA]"}, - {"Additional consonant","[\\u09CE]"}, - {"Additional 
consonants","[\\u0958-\\u095F\\u09DC\\u09DD\\u09DF\\u0A59-\\u0A5C\\u0A5E\\u0B5C\\u0B5D\\u0B5F\\u0CDE\\u1B45-\\u1B4B\\u1BAE\\u1BAF]"}, - {"Additional dependent vowel signs","[\\u0DF2\\u0DF3]"}, - {"Additional diacritical mark for symbols","[\\u20E1]"}, - {"Additional diacritical marks for symbols","[\\u20E5-\\u20F0]"}, - {"Additional enclosing diacritics","[\\u20E2-\\u20E4]"}, - {"Additional italic Greek symbols","[\\U0001D715-\\U0001D71B]"}, - {"Additional letter","[\\u03F3]"}, - {"Additional letterlike symbols","[\\u2139-\\u213F\\u2141-\\u2144\\u214A-\\u214D]"}, - {"Additional letters","[\\u10F9\\u10FA\\u1C4D-\\u1C4F]"}, - {"Additional letters for Mingrelian and Svan","[\\u10F7\\u10F8]"}, - {"Additional marks for UPA","[\\u1DFE\\u1DFF]"}, - {"Additional punctuation","[\\u05F3\\u05F4]"}, - {"Additional sans-serif bold Greek symbols","[\\U0001D789-\\U0001D78F]"}, - {"Additional sans-serif bold italic Greek symbols","[\\U0001D7C3-\\U0001D7C9]"}, - {"Additional vowel for Marathi","[\\u0972]"}, - {"Additional vowels for Sanskrit","[\\u0960-\\u0963\\u09E0-\\u09E3\\u0AE0-\\u0AE3\\u0B60\\u0B61\\u0C60\\u0C61\\u0CE0\\u0CE1\\u0D60\\u0D61]"}, - {"Additional white on black circled number","[\\u24FF]"}, - {"Additions","[\\u0339-\\u033F]"}, - {"Additions based on 1989 IPA","[\\u02DE-\\u02E4]"}, - {"Additions for Burushaski","[\\u0773-\\u077D]"}, - {"Additions for Greek","[\\u0342-\\u0345]"}, - {"Additions for IPA","[\\u0346-\\u034A]"}, - {"Additions for Khowar","[\\u076E-\\u0771]"}, - {"Additions for Livonian","[\\u022A-\\u0233]"}, - {"Additions for Nivkh","[\\u04FA-\\u04FF]"}, - {"Additions for Romanian","[\\u0218-\\u021B]"}, - {"Additions for Sinology","[\\u0234-\\u0236\\u02AE\\u02AF]"}, - {"Additions for Slovenian and Croatian","[\\u0200-\\u0217]"}, - {"Additions for UPA","[\\u2C77-\\u2C7D\\uA720\\uA721]"}, - {"Additions for Uighur","[\\u2C67-\\u2C6C]"}, - {"Additions for early Persian","[\\u077E\\u077F]"}, - {"Additions for early Persian and 
Azerbaijani","[\\u063B-\\u063F]"}, - {"Additions for the Uralic Phonetic Alphabet","[\\u0350-\\u0357]"}, - {"Afona or Ypostaseis (Mutes or Hypostases)","[\\U0001D057-\\U0001D07E]"}, - {"African letters for clicks","[\\u01C0-\\u01C3]"}, - {"Africanist tone letters","[\\uA71B-\\uA71F]"}, - {"Agogika (Conduits)","[\\U0001D09A-\\U0001D0A1]"}, - {"Aleut letter","[\\u051E\\u051F]"}, - {"Alloioseis (Differentiators)","[\\U0001D0CB-\\U0001D0D9]"}, - {"Alternate consonant forms for Chinese","[\\uA86D-\\uA870]"}, - {"Analytics","[\\U0001D1A6-\\U0001D1A9]"}, - {"Ancient Greek acrophonic numerals","[\\U00010140-\\U00010174]"}, - {"Ancient Greek instrumental notation","[\\U0001D21D-\\U0001D241]"}, - {"Ancient Greek papyrological numbers","[\\U00010175-\\U0001018A]"}, - {"Ancient Greek textual symbols","[\\u2E0E-\\u2E16]"}, - {"Ancient Greek vocalic notation","[\\U0001D200-\\U0001D21C]"}, - {"Ancient Near-Eastern linguistic symbol","[\\u2E17]"}, - {"Ancient Roman epigraphic letters","[\\uA7FB-\\uA7FF]"}, - {"Angle brackets","[\\u2329\\u232A]"}, - {"Angles","[\\u299B-\\u29AF]"}, - {"Arabic-Indic digits","[\\u0660-\\u0669]"}, - {"Archaic Roman numerals","[\\u2180-\\u2183\\u2185-\\u2188]"}, - {"Archaic letters","[\\u0370-\\u0373\\u0376\\u0377\\u03D8-\\u03E1\\u03FA\\u03FB\\u066E\\u066F\\u07E8-\\u07EA\\u10F1-\\u10F6\\u3165-\\u318E]"}, - {"Archaic punctuation","[\\u2056\\u2058-\\u205E]"}, - {"Argies (Retards)","[\\U0001D07F-\\U0001D089]"}, - {"Armenian ligatures","[\\uFB13-\\uFB17]"}, - {"Arrow tails","[\\u2919-\\u291C]"}, - {"Arrows","[\\u27F0-\\u27F4]"}, - {"Arrows combined with operators","[\\u2942-\\u2949]"}, - {"Arrows combined with relations","[\\u2971-\\u297B]"}, - {"Arrows with bent tips","[\\u21B0-\\u21B3\\u2B0E-\\u2B11]"}, - {"Arrows with modifications","[\\u219A-\\u21AF]"}, - {"Articulation","[\\U0001D17B-\\U0001D18E]"}, - {"Astrological signs","[\\u0F15-\\u0F1F\\u0F3E\\u0F3F\\u0FCE\\u0FCF\\u26B3-\\u26BC]"}, - {"Astrological symbols","[\\u263D-\\u2647]"}, - {"Augmentation 
dot","[\\U0001D16D]"}, - {"Bamboo suit tiles","[\\U0001F010-\\U0001F018]"}, - {"Bars","[\\U0001D100-\\U0001D105]"}, - {"Based on GB 2312","[\\u3105-\\u3129]"}, - {"Based on ISO 8859-6","[\\u0621-\\u063A\\u0640-\\u064A]"}, - {"Based on ISO 8859-8","[\\u05D0-\\u05EA]"}, - {"Based on JIS X 0208","[\\u3041-\\u3094\\u30A1-\\u30FA]"}, - {"Basic Russian alphabet","[\\u0410-\\u044F]"}, - {"Basic consonants","[\\u0780-\\u0797]"}, - {"Basic glyphs for Arabic language contextual forms","[\\uFE80-\\uFEFC]"}, - {"Basic letters","[\\u1820-\\u1842]"}, - {"Basic syllables","[\\U00010000-\\U0001000B\\U0001000D-\\U00010026\\U00010028-\\U0001003A\\U0001003C\\U0001003D\\U0001003F]"}, - {"Beams and slurs","[\\U0001D173-\\U0001D17A]"}, - {"Bengali-specific additions","[\\u09F0-\\u09FA]"}, - {"Biblical editorial symbol","[\\u214F]"}, - {"Block elements","[\\u2580-\\u2590\\u2594\\u2595]"}, - {"Bohairic Coptic letters","[\\u2C80-\\u2CB1]"}, - {"Bold Fraktur symbols","[\\U0001D56C-\\U0001D59F]"}, - {"Bold Greek symbols","[\\U0001D6A8-\\U0001D6DA]"}, - {"Bold digits","[\\U0001D7CE-\\U0001D7D7]"}, - {"Bold italic Greek symbols","[\\U0001D71C-\\U0001D74E]"}, - {"Bold italic symbols","[\\U0001D468-\\U0001D49B]"}, - {"Bold script symbols","[\\U0001D4D0-\\U0001D503]"}, - {"Bold symbols","[\\U0001D400-\\U0001D433]"}, - {"Bowtie symbols","[\\u29D1-\\u29D7]"}, - {"Bracket pieces","[\\u239B-\\u23AD\\u23B0\\u23B1]"}, - {"Brackets","[\\u2983-\\u2998\\u29FC\\u29FD\\u2E1C\\u2E1D\\u2E20\\u2E21\\u2E26-\\u2E29]"}, - {"Braille patterns","[\\u2800-\\u28FF]"}, - {"C0 controls","[\\u0000-\\u001F]"}, - {"C1 controls","[\\u0080-\\u009F]"}, - {"CJK angle brackets","[\\u3008-\\u300B]"}, - {"CJK brackets","[\\u3010\\u3011\\u3014-\\u301B]"}, - {"CJK corner brackets","[\\u300C-\\u300F]"}, - {"CJK radicals supplement","[\\u2E80-\\u2E99\\u2E9B-\\u2EF3]"}, - {"CJK strokes","[\\u31C0-\\u31E3]"}, - {"CJK symbols","[\\u3012\\u3013]"}, - {"CJK symbols and punctuation","[\\u3000-\\u3007\\u301C-\\u3020]"}, - 
{"Candrabindu","[\\uA873]"}, - {"Cantillation marks","[\\u0591-\\u05AF]"}, - {"Cantillation signs","[\\u0FC0-\\u0FC3]"}, - {"Capital letters","[\\u2C00-\\u2C2E]"}, - {"Capital letters (Khutsuri)","[\\u10A0-\\u10C5]"}, - {"Caucasian linguistics","[\\u1D77\\u1D78]"}, - {"Ceilings and floors","[\\u2308-\\u230B]"}, - {"Cereals and plants","[\\U0001008E-\\U00010094]"}, - {"Character suit tiles","[\\U0001F007-\\U0001F00F]"}, - {"Chemistry symbol","[\\u232C\\u23E3]"}, - {"Chess symbols","[\\u2654-\\u265F]"}, - {"Chillu letters","[\\u0D7A-\\u0D7F]"}, - {"Chinantec tone marks","[\\uA717-\\uA71A]"}, - {"Chuvash letters","[\\u0520-\\u0523]"}, - {"Circle","[\\u2B24]"}, - {"Circle suit tiles","[\\U0001F019-\\U0001F021]"}, - {"Circle symbols","[\\u29B5-\\u29C3]"}, - {"Circled Hangul elements","[\\u3260-\\u326D]"}, - {"Circled Hangul syllable","[\\u327E]"}, - {"Circled Hangul syllables","[\\u326E-\\u327B]"}, - {"Circled Katakana","[\\u32D0-\\u32FE]"}, - {"Circled Korean words","[\\u327C\\u327D]"}, - {"Circled Latin letters","[\\u24B6-\\u24E9]"}, - {"Circled ideographs","[\\u3280-\\u32B0]"}, - {"Circled numbers","[\\u2460-\\u2473\\u3251-\\u325F\\u32B1-\\u32BF]"}, - {"Circles","[\\u26AA-\\u26AC]"}, - {"Claudian letters","[\\u2C75\\u2C76]"}, - {"Clefs","[\\U0001D11E-\\U0001D126]"}, - {"Codas","[\\U0001D106-\\U0001D10C]"}, - {"Combining diacritical marks for symbols","[\\u20D0-\\u20DC]"}, - {"Combining half marks","[\\uFE20-\\uFE23]"}, - {"Combining maddah and hamza","[\\u0653-\\u0655]"}, - {"Combining mark","[\\u135F]"}, - {"Combining marks for Old Cyrillic","[\\uA67C\\uA67D]"}, - {"Combining numeric signs","[\\uA670-\\uA672]"}, - {"Combining stroke","[\\U000101FD]"}, - {"Conjunction and length marks","[\\u30FB\\u30FC]"}, - {"Consonant","[\\u103F]"}, - {"Consonant addition for Tibetan","[\\uA872]"}, - {"Consonant additions for Sanskrit","[\\uA869-\\uA86C]"}, - {"Consonant for Addu dialect","[\\u07B1]"}, - {"Consonant shifters","[\\u17C9\\u17CA]"}, - {"Consonant 
signs","[\\u1C2D-\\u1C35\\uA94F-\\uA952\\uAA33-\\uAA36]"}, - {"Consonants","[\\u0915-\\u0939\\u0995-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u0A15-\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A33\\u0A35\\u0A36\\u0A38\\u0A39\\u0A95-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0B15-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0C15-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C95-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0D15-\\u0D28\\u0D2A-\\u0D39\\u0D9A-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0E01-\\u0E2E\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD\\u0EAE\\u0F40-\\u0F47\\u0F49-\\u0F6A\\u1000-\\u1020\\u1703-\\u170C\\u170E-\\u1711\\u1723-\\u1731\\u1743-\\u1751\\u1763-\\u176C\\u176E-\\u1770\\u1780-\\u17A2\\u1900-\\u191C\\u1950-\\u1962\\u1980-\\u19A9\\u1A00-\\u1A16\\u1B13-\\u1B33\\u1B8A-\\u1BA0\\u1C00-\\u1C23\\uA840-\\uA85C\\uA862-\\uA865\\uA892-\\uA8B4\\uA90A-\\uA925\\uA930-\\uA946\\uAA06-\\uAA28\\U000103A3-\\U000103C3\\U00010450-\\U00010465\\U00010A10-\\U00010A13\\U00010A15-\\U00010A17\\U00010A19-\\U00010A33]"}, - {"Consonants and consonant signs","[\\uA807-\\uA822]"}, - {"Consonants signs","[\\u1BA1-\\u1BA3]"}, - {"Continuous macrons for Coptic","[\\uFE24-\\uFE26]"}, - {"Contour tone marks","[\\u1DC4-\\u1DC9\\u1DCB\\u1DCC]"}, - {"Control character","[\\u007F]"}, - {"Control code graphics","[\\u25F0-\\u25F7]"}, - {"Coptic letters derived from Demotic","[\\u03E2-\\u03EF]"}, - {"Corner tone marks for Chinese","[\\uA700-\\uA707]"}, - {"Counting rod units","[\\U0001D360-\\U0001D371]"}, - {"Croatian digraphs matching Serbian Cyrillic letters","[\\u01C4-\\u01CC]"}, - {"Crops","[\\u230C-\\u230F]"}, - {"Crosses","[\\u2719-\\u2720]"}, - {"Crossing arrows for knot theory","[\\u2927-\\u2932]"}, - {"Currency sign","[\\u060B\\u0AF1\\uFDFC]"}, - {"Currency symbol","[\\u0BF9\\u0E3F\\u17DB]"}, - 
{"Currency symbols","[\\u20A0-\\u20B5]"}, - {"Cyrillic extensions","[\\u0400-\\u040F\\u0450-\\u045F\\u0510-\\u0513]"}, - {"Cyrillic letter","[\\u1D2B]"}, - {"DPRK compatibility ideographs","[\\uFA70-\\uFAD9]"}, - {"Dashes","[\\u2010-\\u2015]"}, - {"Database theory operators","[\\u27D5-\\u27D7]"}, - {"Date mark","[\\u0D79]"}, - {"Dentistry notation symbols","[\\u23BE-\\u23CC]"}, - {"Dependent consonant signs","[\\u103B-\\u103E]"}, - {"Dependent vowel sign","[\\u0DDF]"}, - {"Dependent vowel signs","[\\u093E-\\u094C\\u09BE-\\u09C4\\u09C7\\u09C8\\u0A3E-\\u0A42\\u0A47\\u0A48\\u0A4B\\u0A4C\\u0ABE-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB\\u0ACC\\u0B3E-\\u0B44\\u0B47\\u0B48\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0C3E-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4C\\u0CBE-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCC\\u0D3E-\\u0D44\\u0D46-\\u0D48\\u0DCF-\\u0DD4\\u0DD6\\u0DD8-\\u0DDB\\u0F71-\\u0F7D\\u0F80\\u0F81\\u102B-\\u1035\\u1712\\u1713\\u1732\\u1733\\u1752\\u1753\\u1772\\u1773\\u17B6-\\u17BD\\u17C1-\\u17C3\\u1920-\\u1928\\u1B35-\\u1B43\\uA823-\\uA827\\uA8B5-\\uA8C3\\uAA29-\\uAA32]"}, - {"Dependent vowels","[\\u0B62\\u0B63\\u0C62\\u0C63\\u0CE2\\u0CE3\\u0D62\\u0D63\\u1C26-\\u1C2C]"}, - {"Deprecated","[\\u206A-\\u206F]"}, - {"Devanagari-specific additions","[\\u0970\\u0971]"}, - {"Diacritical marks for musical symbols","[\\u1B6B-\\u1B73]"}, - {"Diacritics","[\\u302A-\\u302F]"}, - {"Dialect (non-Mandarin) letters","[\\u312A-\\u312C]"}, - {"Diamonds","[\\u2B16-\\u2B19]"}, - {"Diamonds and lozenges","[\\u2B25-\\u2B2B]"}, - {"Dice","[\\u2680-\\u2685]"}, - {"Dictionary and map symbols","[\\u2690-\\u269B]"}, - {"Dictionary punctuation","[\\u2E1A\\u2E1B\\u2E1E\\u2E1F]"}, - 
{"Digits","[\\u07C0-\\u07C9\\u0966-\\u096F\\u09E6-\\u09EF\\u0A66-\\u0A6F\\u0AE6-\\u0AEF\\u0B66-\\u0B6F\\u0BE6-\\u0BEF\\u0C66-\\u0C6F\\u0CE6-\\u0CEF\\u0D66-\\u0D6F\\u0E50-\\u0E59\\u0ED0-\\u0ED9\\u0F20-\\u0F29\\u1040-\\u1049\\u1369-\\u1371\\u17E0-\\u17E9\\u1810-\\u1819\\u1946-\\u194F\\u19D0-\\u19D9\\u1B50-\\u1B59\\u1BB0-\\u1BB9\\u1C40-\\u1C49\\u1C50-\\u1C59\\uA620-\\uA629\\uA8D0-\\uA8D9\\uA900-\\uA909\\uAA50-\\uAA59\\U000104A0-\\U000104A9\\U00010A40-\\U00010A43]"}, - {"Digits minus half","[\\u0F2A-\\u0F33]"}, - {"Digrams","[\\U0001D301-\\U0001D305]"}, - {"Digraphs","[\\u0EDC\\u0EDD]"}, - {"Dingbat arrows","[\\u2794-\\u27AF\\u27B1-\\u27BE]"}, - {"Dingbat circled digits","[\\u2776-\\u2793]"}, - {"Division operator","[\\u27CC]"}, - {"Dotless symbols","[\\U0001D6A4\\U0001D6A5]"}, - {"Dotted tone letters","[\\uA708-\\uA711]"}, - {"Double arrows","[\\u21CD-\\u21D9]"}, - {"Double circled numbers","[\\u24F5-\\u24FE]"}, - {"Double diacritic","[\\u1DCD]"}, - {"Double diacritics","[\\u035C-\\u0362]"}, - {"Double punctuation for vertical text","[\\u203C\\u2047-\\u2049]"}, - {"Double-barbed harpoons","[\\u294A-\\u2951]"}, - {"Double-struck digits","[\\U0001D7D8-\\U0001D7E1]"}, - {"Double-struck italic math symbols","[\\u2145-\\u2149]"}, - {"Double-struck large operator","[\\u2140]"}, - {"Double-struck symbols","[\\U0001D538-\\U0001D56B]"}, - {"Drafting symbols","[\\u232D-\\u2335]"}, - {"Dragon tiles","[\\U0001F004-\\U0001F006]"}, - {"Duplicate characters from Big 5","[\\uFA0C\\uFA0D]"}, - {"Duplicate characters from CNS�11643-1992","[\\U0002F800-\\U0002FA1D]"}, - {"Dynamics","[\\U0001D18F-\\U0001D193]"}, - {"Eastern Arabic-Indic digits","[\\u06F0-\\u06F9]"}, - {"Editorial symbols","[\\u03FD-\\u03FF]"}, - {"Egyptological additions","[\\uA722-\\uA725]"}, - {"Ekfonetika","[\\U0001D003-\\U0001D014]"}, - {"Electrotechnical symbols","[\\u23DA\\u23DB]"}, - {"Electrotechnical symbols from IR 181","[\\u238D-\\u2394]"}, - {"Ellipses","[\\u2B2C-\\u2B2F]"}, - {"Empty 
sets","[\\u29B0-\\u29B4]"}, - {"Enclosing diacritics","[\\u20DD-\\u20E0]"}, - {"Error bar symbols","[\\u29EE-\\u29F3]"}, - {"European Latin","[\\u0100-\\u017F]"}, - {"Extended Arabic letter","[\\u06D5]"}, - {"Extended Arabic letter for Parkari","[\\u06FF]"}, - {"Extended Arabic letters","[\\u0671-\\u06D3\\u06FA-\\u06FC\\u0750-\\u076D]"}, - {"Extended Arabic letters for Parkari","[\\u06EE\\u06EF]"}, - {"Extended Bopomofo for Minnan and Hakka","[\\u31A0-\\u31B7]"}, - {"Extended Bopomofo tone marks","[\\u02EA\\u02EB]"}, - {"Extended Cyrillic","[\\u048A-\\u04F9]"}, - {"Extension for Geba Karen","[\\u1071]"}, - {"Extensions for Arabic","[\\u0798-\\u07A5]"}, - {"Extensions for Balti","[\\u0F6B\\u0F6C]"}, - {"Extensions for Eastern Pwo Karen","[\\u106E-\\u1070]"}, - {"Extensions for Kayah","[\\u1072-\\u1074]"}, - {"Extensions for Mon","[\\u105A-\\u1060]"}, - {"Extensions for Rumai Palaung","[\\u108E\\u108F]"}, - {"Extensions for S'gaw Karen","[\\u1061-\\u1064]"}, - {"Extensions for Sanskrit and Tibetan","[\\u1880-\\u18AA]"}, - {"Extensions for Shan","[\\u1075-\\u108D]"}, - {"Extensions for Western Pwo Karen","[\\u1065-\\u106D]"}, - {"Extracts","[\\U00010095-\\U00010099]"}, - {"Fences","[\\u2999\\u299A\\u29D8-\\u29DB]"}, - {"Figure repetitions","[\\U0001D10D-\\U0001D10F]"}, - {"Final consonants","[\\u11A8-\\u11F9\\u1930-\\u1938\\u19C1-\\u19C7]"}, - {"Final letters","[\\uAA40-\\uAA4D]"}, - {"Fish tails","[\\u297C-\\u297F]"}, - {"Fives","[\\U0001F054-\\U0001F05A\\U0001F086-\\U0001F08C]"}, - {"Fixed-form subjoined consonants","[\\u0FBA-\\u0FBC]"}, - {"Flags","[\\U0001D16E-\\U0001D172]"}, - {"Flower tiles","[\\U0001F022-\\U0001F025]"}, - {"Fonitika (Vocals)","[\\U0001D046-\\U0001D056]"}, - {"Forfeda (supplementary letters)","[\\u1695-\\u169A]"}, - {"Forks","[\\u2AD9-\\u2ADD]"}, - {"Form and chart components","[\\u2500-\\u257F]"}, - {"Format character","[\\u2060]"}, - {"Format characters","[\\u200C-\\u200F\\u2028-\\u202F]"}, - {"Format controls","[\\u180B-\\u180E]"}, - 
{"Fours","[\\U0001F04D-\\U0001F053\\U0001F07F-\\U0001F085]"}, - {"Fractions","[\\u0D73-\\u0D75\\u2153-\\u215F\\U0001245A-\\U00012462]"}, - {"Fraktur symbols","[\\U0001D504-\\U0001D537]"}, - {"Frown and smile","[\\u2322\\u2323]"}, - {"Fthores (Destroyers)","[\\U0001D0B6-\\U0001D0CA]"}, - {"Fullwidth ASCII variants","[\\uFF01-\\uFF5E]"}, - {"Fullwidth brackets","[\\uFF5F\\uFF60]"}, - {"Fullwidth symbol variants","[\\uFFE0-\\uFFE6]"}, - {"Further Greek musical notation symbols","[\\U0001D242-\\U0001D245]"}, - {"GUI icons","[\\u231A\\u231B]"}, - {"Gender symbol","[\\u26B2]"}, - {"Gender symbols","[\\u26A2-\\u26A9]"}, - {"Genealogical symbols","[\\u26AD-\\u26B1]"}, - {"General punctuation","[\\u2016-\\u2027\\u2030-\\u203B\\u203D-\\u2046\\u204A-\\u2055\\u2057\\u2E18\\u2E19]"}, - {"Generic punctuation for Philippine scripts","[\\u1735\\u1736]"}, - {"Generic punctuation for scripts of India","[\\u0964\\u0965]"}, - {"Geometric shapes","[\\u25A0-\\u25EF\\u25F8-\\u25FF]"}, - {"Glottal stop","[\\u097D]"}, - {"Glyph part","[\\uFE73]"}, - {"Glyphs for contextual forms of letters for Central Asian languages","[\\uFBD3-\\uFBE9]"}, - {"Glyphs for contextual forms of letters for Persian, Urdu, Sindhi, etc.","[\\uFB50-\\uFBB1]"}, - {"Glyphs for spacing forms of Arabic points","[\\uFE70-\\uFE72\\uFE74\\uFE76-\\uFE7F]"}, - {"Glyphs for vertical variants","[\\uFE10-\\uFE19\\uFE30-\\uFE44\\uFE47\\uFE48]"}, - {"Go markers","[\\u2686-\\u2689]"}, - {"Golden number runes","[\\u16EE-\\u16F0]"}, - {"Grammata (Letters)","[\\U0001D0E6-\\U0001D0EF]"}, - {"Grapheme joiner","[\\u034F]"}, - {"Graphic picture for control code","[\\u2424]"}, - {"Graphic pictures for control codes","[\\u2400-\\u2421]"}, - {"Graphics for control codes","[\\u237B\\u237D-\\u237F]"}, - {"Greek letters","[\\u1D26-\\u1D2A]"}, - {"Greek subscript modifier letters","[\\u1D66-\\u1D6A]"}, - {"Greek superscript modifier letters","[\\u1D5D-\\u1D61]"}, - {"Gregorian notation","[\\U0001D1D0-\\U0001D1DD]"}, - {"Gurmukhi-specific 
additions","[\\u0A70-\\u0A75]"}, - {"Half brackets","[\\u2E22-\\u2E25]"}, - {"Halfwidth CJK punctuation","[\\uFF61-\\uFF64]"}, - {"Halfwidth Hangul variants","[\\uFFA0-\\uFFBE\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC]"}, - {"Halfwidth Katakana variants","[\\uFF65-\\uFF9F]"}, - {"Halfwidth symbol variants","[\\uFFE8-\\uFFEE]"}, - {"Harpoons","[\\u21BC-\\u21C3]"}, - {"Head marks","[\\u0F01-\\u0F07\\u0FD3\\u0FD4]"}, - {"Head marks for Tibetan","[\\uA874\\uA875]"}, - {"Hebrew letterlike math symbols","[\\u2135-\\u2138]"}, - {"Hebrew presentation forms","[\\uFB1D-\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFB4F]"}, - {"Hexagons","[\\u2B21-\\u2B23]"}, - {"Historic letters","[\\u0460-\\u0481]"}, - {"Historic miscellaneous","[\\u0482-\\u0489]"}, - {"Historic phonetic variants","[\\u0C58\\u0C59]"}, - {"Historic syllables","[\\uA610-\\uA612\\uA62A\\uA62B]"}, - {"Holds and pauses","[\\U0001D110-\\U0001D113]"}, - {"Honorifics","[\\u0610-\\u0614]"}, - {"Horizontal brackets","[\\u23B4-\\u23B6\\u23DC-\\u23E1]"}, - {"Horizontal tiles","[\\U0001F030]"}, - {"IPA characters for disordered speech","[\\u02A9-\\u02AD]"}, - {"IPA diacritics for disordered speech","[\\u034B-\\u034E]"}, - {"IPA extensions","[\\u0250-\\u02A8]"}, - {"IPA modifiers","[\\u02EC\\u02ED]"}, - {"Ichimata and Martyrika (Ichimas and Evidentials)","[\\U0001D0A2-\\U0001D0B5]"}, - {"Ideographic description characters","[\\u2FF0-\\u2FFB]"}, - {"Independent vowel (deprecated)","[\\u17A3]"}, - {"Independent 
vowels","[\\u0904-\\u0914\\u0985-\\u098C\\u098F\\u0990\\u0993\\u0994\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13\\u0A14\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93\\u0A94\\u0B05-\\u0B0C\\u0B0F\\u0B10\\u0B13\\u0B14\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B94\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C14\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0C94\\u0D05-\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D14\\u0D85-\\u0D96\\u1021-\\u102A\\u1700-\\u1702\\u1720-\\u1722\\u1740-\\u1742\\u1760-\\u1762\\u17A4-\\u17B3\\u1B05-\\u1B12\\uA882-\\uA891\\uAA00-\\uAA05\\U000103A0-\\U000103A2]"}, - {"Independent vowels and dvisvara","[\\uA800-\\uA805]"}, - {"Inherent vowels","[\\u17B4\\u17B5]"}, - {"Initial consonants","[\\u1100-\\u1159\\u115F]"}, - {"Instrumentation","[\\U0001D1AA-\\U0001D1AD]"}, - {"Insular and Celticist letters","[\\uA779-\\uA787]"}, - {"Integral pieces","[\\u2320\\u2321]"}, - {"Integrals","[\\u222B-\\u2233]"}, - {"Interlinear annotation","[\\uFFF9-\\uFFFB]"}, - {"Intersections and unions","[\\u2A40-\\u2A50]"}, - {"Invisible operators","[\\u2061-\\u2064]"}, - {"Iota subscript","[\\u037A]"}, - {"Italic Greek symbols","[\\U0001D6E2-\\U0001D714]"}, - {"Italic symbols","[\\U0001D434-\\U0001D467]"}, - {"Iteration marks","[\\u309D\\u309E\\u30FD\\u30FE]"}, - {"JIS X 0213 compatibility ideographs","[\\uFA30-\\uFA6A]"}, - {"Japanese chess symbols","[\\u2616\\u2617]"}, - {"Japanese corporation","[\\u337F]"}, - {"Japanese era names","[\\u337B-\\u337E]"}, - {"Kanbun","[\\u3190-\\u319F]"}, - {"Kangxi radicals","[\\u2F00-\\u2FD5]"}, - {"Katakana punctuation","[\\u30A0]"}, - {"Keyboard and UI symbols","[\\u23CE\\u23CF]"}, - {"Keyboard symbol","[\\u232B\\u2425]"}, - {"Keyboard symbols","[\\u2324-\\u2328]"}, - {"Keyboard symbols and circle arrows","[\\u21B4-\\u21BB]"}, - {"Keyboard symbols from ISO 9995-7","[\\u2380-\\u238C\\u2396-\\u239A]"}, - {"Komi letters","[\\u0500-\\u050F]"}, - {"Koranic annotation signs","[\\u0615-\\u061A\\u06D6-\\u06ED]"}, - {"Kurdish letters","[\\u051A-\\u051D]"}, - {"Large 
operators","[\\u29F8\\u29F9]"}, - {"Latin extensions for Vietnamese","[\\u1EA0-\\u1EF1]"}, - {"Latin general extensions","[\\u1EF2-\\u1EF9]"}, - {"Latin general use extensions","[\\u1E00-\\u1E9B]"}, - {"Latin letter","[\\u1D6B]"}, - {"Latin letters","[\\u1D00-\\u1D25]"}, - {"Latin letters with middle tilde","[\\u1D6C-\\u1D76]"}, - {"Latin letters with palatal hook","[\\u1D80-\\u1D8E]"}, - {"Latin letters with retroflex hook","[\\u1D8F-\\u1D9A]"}, - {"Latin ligatures","[\\uFB00-\\uFB06]"}, - {"Latin subscript modifier letters","[\\u1D62-\\u1D65]"}, - {"Latin superscript modifier letters","[\\u02B0-\\u02B8\\u1D2C-\\u1D5C]"}, - {"Latin-1 punctuation and symbols","[\\u00A0-\\u00BF]"}, - {"Left-stem tone letters","[\\uA712-\\uA716]"}, - {"Leimmata or Siopes (Leimmas or Silencers)","[\\U0001D08A-\\U0001D08E]"}, - {"Length mark","[\\U00010A0C]"}, - {"Letter","[\\u0386]"}, - {"Letter A","[\\uA85D]"}, - {"Letter extender","[\\u07FA]"}, - {"Letterlike symbol","[\\u0608]"}, - {"Letterlike symbols","[\\u2100-\\u2134]"}, - {"Letters","[\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u00FF\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u07CA-\\u07E7\\u16A0-\\u16EA\\u1C5A-\\u1C77\\u2D30-\\u2D65\\U00010280-\\U0001029C\\U000102A0-\\U000102D0\\U00010300-\\U0001031E\\U00010330-\\U0001034A\\U00010380-\\U0001039D\\U00010480-\\U0001049D\\U00010900-\\U00010915\\U00010920-\\U00010939]"}, - {"Letters for Old Abkhasian orthography","[\\uA680-\\uA697]"}, - {"Letters for Old Cyrillic","[\\uA640-\\uA65F\\uA662-\\uA66E]"}, - {"Ligatures (three elements)","[\\uFD50-\\uFD8F\\uFD92-\\uFDC7]"}, - {"Ligatures (two elements)","[\\uFBEA-\\uFD3D]"}, - {"Logical and set operators","[\\u2227-\\u222A]"}, - {"Logical ands and ors","[\\u2A51-\\u2A63]"}, - {"Logical operators","[\\u22CE\\u22CF]"}, - {"Logograms","[\\uA613-\\uA61F]"}, - {"Long arrows","[\\u27F5-\\u27FF]"}, - {"Lowercase Claudian letter","[\\u214E\\u2184]"}, - {"Lowercase Latin alphabet","[a-z]"}, - {"Lowercase 
letters","[\\u0561-\\u0587\\U00010428-\\U0001044F]"}, - {"Lowercase of editorial symbols","[\\u037B-\\u037D]"}, - {"Lunar date sign (deprecated)","[\\u17D3]"}, - {"Lunar date symbols","[\\u19E0-\\u19FF]"}, - {"Malayalam numerics","[\\u0D70-\\u0D72]"}, - {"Manchu letters","[\\u1873-\\u1877]"}, - {"Marks","[\\u0FD0-\\u0FD2]"}, - {"Marks and signs","[\\u0F08-\\u0F14\\u0F34-\\u0F39\\u0F82-\\u0F87]"}, - {"Mathematical arrows","[\\u2B30-\\u2B4C]"}, - {"Mathematical brackets","[\\u27E6-\\u27EF]"}, - {"Mathematical operator","[\\u00D7\\u00F7]"}, - {"Mayanist additions","[\\uA726-\\uA72F]"}, - {"Measures","[\\U00010137-\\U0001013F]"}, - {"Medial vowels","[\\u1160-\\u11A2]"}, - {"Medical and healing symbols","[\\u2624\\u2625]"}, - {"Medieval superscript letter diacritics","[\\u0363-\\u036F\\u1DD3-\\u1DE6]"}, - {"Medievalist addition","[\\u1E9F]"}, - {"Medievalist additions","[\\u1DCE-\\u1DD2\\u1E9C\\u1E9D\\u1EFA-\\u1EFF\\uA730-\\uA778]"}, - {"Medievalist punctuation","[\\u2E2A-\\u2E30]"}, - {"Melodimata (Melodics)","[\\U0001D015-\\U0001D045]"}, - {"Mensural notation","[\\U0001D1B6-\\U0001D1C0]"}, - {"Mensural prolations","[\\U0001D1C7-\\U0001D1CE]"}, - {"Mensural rests","[\\U0001D1C1-\\U0001D1C6]"}, - {"Metals","[\\U0001009A-\\U0001009C]"}, - {"Metrical symbols","[\\u23D1-\\u23D9]"}, - {"Miscellaneous","[\\u2701-\\u2718\\u274C-\\u275A]"}, - {"Miscellaneous addition","[\\u312D]"}, - {"Miscellaneous additions","[\\u021C-\\u0229\\u0237-\\u024F\\u0358-\\u035B\\u2C6D-\\u2C6F\\u2C71-\\u2C74]"}, - {"Miscellaneous arrow","[\\u2970]"}, - {"Miscellaneous arrows","[\\u21F4-\\u21FF\\u2900-\\u2918\\u291D-\\u2926]"}, - {"Miscellaneous arrows and keyboard symbols","[\\u21DA-\\u21E5]"}, - {"Miscellaneous curved arrows","[\\u2933-\\u2941]"}, - {"Miscellaneous large operators","[\\u2A1D-\\u2A21]"}, - {"Miscellaneous mark","[\\u1DCA]"}, - {"Miscellaneous marks","[\\u1DC2\\u1DC3]"}, - {"Miscellaneous mathematical operator","[\\u2AF6]"}, - {"Miscellaneous mathematical 
operators","[\\u2A39-\\u2A3F\\u2A64\\u2A65]"}, - {"Miscellaneous mathematical symbol","[\\u220E\\u223F]"}, - {"Miscellaneous mathematical symbols","[\\u2200-\\u2207\\u221E-\\u2222\\u2234\\u2235\\u22A4\\u22A5\\u22BE\\u22BF\\u2980-\\u2982\\u29DC-\\u29E2\\u29E7-\\u29ED\\u29F4-\\u29F7\\u29FE\\u29FF]"}, - {"Miscellaneous phonetic modifiers","[\\u02B9-\\u02D7]"}, - {"Miscellaneous symbol","[\\u2615\\u2668\\u27D0\\U0001D1CF]"}, - {"Miscellaneous symbols","[\\u260E-\\u2613\\u2618\\u2619\\u2638-\\u263C\\u267E\\u267F\\u269C\\u269D\\u26A0\\u26A1\\u27C0-\\u27C9\\U0001D1B1-\\U0001D1B5]"}, - {"Miscellaneous technical","[\\u2300-\\u2307\\u2310-\\u2319\\u237C\\u23CD\\u23E2\\u23E4-\\u23E7]"}, - {"Miscellaneous tiles","[\\U0001F02A\\U0001F02B]"}, - {"Mkhedruli","[\\u10D0-\\u10F0]"}, - {"Modal logic operators","[\\u27E0-\\u27E5]"}, - {"Modern letters","[\\u3131-\\u3163]"}, - {"Modified harpoons","[\\u2952-\\u2961]"}, - {"Modifier letter","[\\u10FC\\u2D6F\\uA67F]"}, - {"Modifier letters","[\\u0559-\\u055F\\u1C78-\\u1C7D\\u1D9B-\\u1DBF\\uA788-\\uA78A]"}, - {"Monogram","[\\U0001D300]"}, - {"Monospace digits","[\\U0001D7F6-\\U0001D7FF]"}, - {"Monospace symbols","[\\U0001D670-\\U0001D6A3]"}, - {"Mordvin letters","[\\u0514-\\u0519]"}, - {"Multiplication and division sign operators","[\\u2A2F-\\u2A38]"}, - {"Musical symbols","[\\u1B74-\\u1B7C\\u2669-\\u266F]"}, - {"Musical symbols for notes","[\\u1B61-\\u1B6A]"}, - {"N-ary operators","[\\u220F-\\u2211\\u22C0-\\u22C3\\u2A00-\\u2A09]"}, - {"New Testament editorial symbols","[\\u2E00-\\u2E0D]"}, - {"Non-European and historic Latin","[\\u0180-\\u01BF]"}, - {"Noncharacters","[\\uFDD0-\\uFDEF\\uFFFE\\uFFFF\\U0002FFFE\\U0002FFFF\\U0003FFFE\\U0003FFFF\\U0004FFFE\\U0004FFFF\\U0005FFFE\\U0005FFFF\\U0006FFFE\\U0006FFFF\\U0007FFFE\\U0007FFFF\\U0008FFFE\\U0008FFFF\\U0009FFFE\\U0009FFFF\\U000AFFFE\\U000AFFFF\\U000BFFFE\\U000BFFFF\\U000CFFFE\\U000CFFFF\\U000DFFFE\\U000DFFFF\\U000EFFFE\\U000EFFFF\\U000FFFFE\\U000FFFFF\\U0010FFFE\\U0010FFFF]"}, - {"Not 
character codes","[\\U0001FFFE\\U0001FFFF]"}, - {"Noteheads","[\\U0001D143-\\U0001D15B]"}, - {"Notes","[\\U0001D15C-\\U0001D164]"}, - {"Numbers","[\\u1372-\\u137C\\U00010107-\\U00010133\\U000103D1-\\U000103D5\\U00010916-\\U00010919\\U00010A44-\\U00010A47]"}, - {"Numbers period","[\\u2488-\\u249B]"}, - {"Numeral signs","[\\u0374\\u0375]"}, - {"Numerals","[\\U00010320-\\U00010323]"}, - {"Numeric character","[\\u2CFD]"}, - {"Numeric signs","[\\U00012400-\\U00012459]"}, - {"Numeric symbols for divination lore","[\\u17F0-\\u17F9]"}, - {"OCR","[\\u2440-\\u244A]"}, - {"Octaves","[\\U0001D136-\\U0001D139]"}, - {"Old Church Slavonic combining letters","[\\u2DE0-\\u2DFF]"}, - {"Old Coptic and dialect letters","[\\u2CB2-\\u2CDB]"}, - {"Old Nubian letters","[\\u2CDC-\\u2CE3]"}, - {"Old Nubian punctuation","[\\u2CF9-\\u2CFC]"}, - {"Ones","[\\U0001F038-\\U0001F03E\\U0001F06A-\\U0001F070]"}, - {"Operator","[\\u2238\\u223A\\u2240]"}, - {"Operators","[\\u2212-\\u221D\\u2223-\\u2226\\u228C-\\u228E\\u2293-\\u22A3\\u22BA-\\u22BD\\u22C4-\\u22C7\\u22C9-\\u22CC\\u22D2\\u22D3\\u27D1-\\u27D4\\u2AFC-\\u2AFF]"}, - {"Ordinary diacritics","[\\u0300-\\u0333]"}, - {"Oriya-specific additions","[\\u0B70\\u0B71]"}, - {"Ornamental brackets","[\\u2768-\\u2775]"}, - {"Ornaments","[\\U0001D194-\\U0001D1A5]"}, - {"Orthographic Latin additions","[\\u2C60-\\u2C66]"}, - {"Orthographic letters for glottals","[\\uA78B\\uA78C]"}, - {"Other CJK punctuation","[\\u303B-\\u303D]"}, - {"Other CJK symbols","[\\u3030-\\u3037]"}, - {"Other combining marks","[\\u0656-\\u065E]"}, - {"Other materials","[\\U0001009D-\\U000100DD]"}, - {"Other modifier letter","[\\u02EE]"}, - {"Other phonetic symbols","[\\u1D79-\\u1D7F]"}, - {"Overscores and underscores","[\\uFE49-\\uFE4F]"}, - {"Overstruck diacritics","[\\u0334-\\u0338]"}, - {"Paired arrows and harpoons","[\\u21C4-\\u21CC]"}, - {"Paired harpoons","[\\u2962-\\u296F]"}, - {"Paired punctuation","[\\u0F3A-\\u0F3D]"}, - {"Pali and Sanskrit extensions","[\\u1050-\\u1059]"}, - 
{"Parenthesized Hangul elements","[\\u3200-\\u320D]"}, - {"Parenthesized Hangul syllables","[\\u320E-\\u321C]"}, - {"Parenthesized Korean words","[\\u321D\\u321E]"}, - {"Parenthesized Latin letters","[\\u249C-\\u24B5]"}, - {"Parenthesized ideographs","[\\u3220-\\u3243]"}, - {"Parenthesized numbers","[\\u2474-\\u2487]"}, - {"Pedals","[\\U0001D1AE-\\U0001D1B0]"}, - {"Pentagons","[\\u2B1F\\u2B20\\u2B53\\u2B54]"}, - {"People and animals","[\\U00010080-\\U0001008D]"}, - {"Persian letters","[\\u072D-\\u072F]"}, - {"Phonetic and historic letters","[\\u01DD-\\u01FF]"}, - {"Phonetic extensions for Ainu","[\\u31F0-\\u31FF]"}, - {"Pinyin diacritic-vowel combinations","[\\u01CD-\\u01DC]"}, - {"Playing card symbols","[\\u2660-\\u2667]"}, - {"Plus and minus sign operators","[\\u2A22-\\u2A2E]"}, - {"Poetic marks","[\\u060E\\u060F]"}, - {"Poetry marks","[\\uA828-\\uA82B]"}, - {"Point","[\\u0670]"}, - {"Pointing hand symbols","[\\u261A-\\u261F]"}, - {"Points and punctuation","[\\u05B0-\\u05C3\\u05C6\\u05C7]"}, - {"Points from ISO 8859-6","[\\u064B-\\u0652]"}, - {"Precomposed polytonic Greek","[\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-\\u1FDB\\u1FDD-\\u1FEF\\u1FF2-\\u1FF4\\u1FF6-\\u1FFE]"}, - {"Prevailing wind tiles","[\\U0001F000-\\U0001F003]"}, - {"Pronunciation variants from KS�X�1001:1998","[\\uF900-\\uFA0B]"}, - {"Prosodies (Prosodics)","[\\U0001D000-\\U0001D002]"}, - {"Puncta extraordinaria","[\\u05C4\\u05C5]"}, - 
{"Punctuation","[\\u037E\\u0387\\u0589\\u058A\\u0609\\u060A\\u060C\\u060D\\u061B\\u061E\\u061F\\u066A-\\u066D\\u06D4\\u07F7-\\u07F9\\u0DF4\\u104A\\u104B\\u10FB\\u1360-\\u1368\\u166E\\u1680\\u169B\\u169C\\u16EB-\\u16ED\\u1800-\\u180A\\u1B5A-\\u1B60\\u1C3B-\\u1C3F\\u1C7E\\u1C7F\\u2CFE\\u2CFF\\uA60D-\\uA60F\\uA8CE\\uA8CF\\uA92E\\uA92F\\uA95F\\uAA5C-\\uAA5F\\uFD3E\\uFD3F\\U00010100-\\U00010102\\U0001039F\\U000103D0\\U0001091F\\U0001093F\\U00010A50-\\U00010A58\\U00012470-\\U00012473]"}, - {"Punctuation for Tibetan","[\\uA876\\uA877]"}, - {"Punctuation mark","[\\uA673\\uA67E]"}, - {"Punctuation ornaments","[\\u275B-\\u275E\\u2761-\\u2767]"}, - {"Quine corners","[\\u231C-\\u231F]"}, - {"Radix symbols","[\\u0606\\u0607]"}, - {"Recycling symbols","[\\u2672-\\u267D]"}, - {"Relation","[\\u2239\\u22C8\\u22CD]"}, - {"Relational operators","[\\u2A66-\\u2ABC]"}, - {"Relations","[\\u2236\\u2237\\u223B-\\u223E\\u2241-\\u228B\\u228F-\\u2292\\u22A6-\\u22B9\\u22D0\\u22D1\\u22D4-\\u22FF\\u29E3-\\u29E6\\u2AF7-\\u2AFB]"}, - {"Religious and political symbols","[\\u2626-\\u262F]"}, - {"Replacement characters","[\\uFFFC\\uFFFD]"}, - {"Reserved","[\\u09E4\\u09E5\\u0A64\\u0A65\\u0AE4\\u0AE5\\u0B64\\u0B65\\u0BE4\\u0BE5\\u0C64\\u0C65\\u0CE4\\u0CE5\\u0D64\\u0D65]"}, - {"Rest","[\\U0001D129]"}, - {"Rests","[\\U0001D13A-\\U0001D142]"}, - {"Roman coin symbols","[\\U00010196-\\U0001019A]"}, - {"Roman military symbol","[\\U0001019B]"}, - {"Roman numerals","[\\u2160-\\u217F]"}, - {"Roman weights and measures","[\\U00010190-\\U00010195]"}, - {"Rythmika (Rhythmics)","[\\U0001D0DA-\\U0001D0E5]"}, - {"Sans-serif bold Greek symbols","[\\U0001D756-\\U0001D788]"}, - {"Sans-serif bold digits","[\\U0001D7EC-\\U0001D7F5]"}, - {"Sans-serif bold italic Greek symbols","[\\U0001D790-\\U0001D7C2]"}, - {"Sans-serif bold italic symbols","[\\U0001D63C-\\U0001D66F]"}, - {"Sans-serif bold symbols","[\\U0001D5D4-\\U0001D607]"}, - {"Sans-serif digits","[\\U0001D7E2-\\U0001D7EB]"}, - {"Sans-serif italic 
symbols","[\\U0001D608-\\U0001D63B]"}, - {"Sans-serif symbols","[\\U0001D5A0-\\U0001D5D3]"}, - {"Scan lines for terminal graphics","[\\u23BA-\\u23BD]"}, - {"Script symbols","[\\U0001D49C-\\U0001D4CF]"}, - {"Season tiles","[\\U0001F026-\\U0001F029]"}, - {"Set membership","[\\u2208-\\u220D]"}, - {"Shade characters","[\\u2591-\\u2593]"}, - {"Shan digits","[\\u1090-\\u1099]"}, - {"Shan symbols","[\\u109E\\u109F]"}, - {"Sibe letters","[\\u185D-\\u1872]"}, - {"Sidelining emphasis marks","[\\uFE45\\uFE46]"}, - {"Sign","[\\u09CD\\u09D7\\u0CCD\\u0DCA\\u0E2F\\u0E46\\u0EAF\\u0EC6\\u1B34\\u1B44\\uA806]"}, - {"Signs","[\\u0E4C-\\u0E4F\\u0E5A\\u0E5B\\u0EBC\\u0EBD\\u0ECC\\u0ECD\\u0FBE\\u0FBF\\U000101D0-\\U000101FC\\U00012000-\\U0001236E]"}, - {"Signs for Sindhi","[\\u06FD\\u06FE]"}, - {"Simple arrows","[\\u2190-\\u2199]"}, - {"Sindhi implosives","[\\u097B\\u097C\\u097E\\u097F]"}, - {"Sixes","[\\U0001F05B-\\U0001F061\\U0001F08D-\\U0001F093]"}, - {"Small form variants","[\\uFE50-\\uFE52\\uFE54-\\uFE66\\uFE68-\\uFE6B]"}, - {"Small letters","[\\u2C30-\\u2C5E\\u3095\\u3096]"}, - {"Small letters (Khutsuri)","[\\u2D00-\\u2D25]"}, - {"Sogdian letters","[\\u074D-\\u074F]"}, - {"Space","[\\u205F]"}, - {"Spaces","[\\u2000-\\u200B]"}, - {"Spacing accent marks","[\\u0384\\u0385]"}, - {"Spacing clones of diacritics","[\\u02D8-\\u02DD]"}, - {"Special","[\\uFEFF]"}, - {"Special CJK indicators","[\\u303E\\u303F]"}, - {"Special character","[\\u3164]"}, - {"Special character extension","[\\u23D0]"}, - {"Special character extensions","[\\u23AE\\u23AF]"}, - {"Specialized plus sign operators","[\\u29FA\\u29FB]"}, - {"Specials","[\\U0001D0F0-\\U0001D0F5]"}, - {"Specific symbol for control code","[\\u2426]"}, - {"Specific symbols for space","[\\u2422\\u2423]"}, - {"Square symbols","[\\u29C4-\\u29C9]"}, - {"Squared Katakana words","[\\u3300-\\u3357]"}, - {"Squared Latin abbreviation","[\\u3250\\u33FF]"}, - {"Squared Latin abbreviations","[\\u32CC-\\u32CF\\u3371-\\u337A\\u3380-\\u33DF]"}, - 
{"Squares","[\\u2B12-\\u2B15\\u2B1A-\\u2B1E]"}, - {"Staff brackets","[\\U0001D114\\U0001D115]"}, - {"Stars","[\\u2B50-\\u2B52]"}, - {"Stars, asterisks and snowflakes","[\\u2721-\\u274B]"}, - {"Staves","[\\U0001D116-\\U0001D11B]"}, - {"Stems","[\\U0001D165\\U0001D166]"}, - {"Subjoined Consonants","[\\uA867\\uA868]"}, - {"Subjoined consonant","[\\uA871]"}, - {"Subjoined consonants","[\\u0F90-\\u0F97\\u0F99-\\u0FB9\\u1929-\\u192B\\u1C24\\u1C25]"}, - {"Subscripts","[\\u2080-\\u208E\\u2090-\\u2094]"}, - {"Subset and superset relations","[\\u2ABD-\\u2AD8]"}, - {"Subtending marks","[\\u0600-\\u0603]"}, - {"Summation sign parts","[\\u23B2\\u23B3]"}, - {"Summations and integrals","[\\u2A0A-\\u2A1C]"}, - {"Superscripts","[\\u2070-\\u207F]"}, - {"Supplementary signs","[\\U00010040-\\U0001004D]"}, - {"Suzhou numerals","[\\u3021-\\u3029]"}, - {"Syllable","[\\u0F00]"}, - {"Syllable finals","[\\uA60B\\uA60C]"}, - {"Syllable iteration mark","[\\uA015]"}, - {"Syllables","[\\u1200-\\u1248\\u124A-\\u124D\\u1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1315\\u1318-\\u135A\\u13A0-\\u13F4\\u1401-\\u166C\\u166F-\\u1676\\uA000-\\uA014\\uA016-\\uA48C\\U00010800-\\U00010805\\U00010808\\U0001080A-\\U00010835\\U00010837\\U00010838\\U0001083C\\U0001083F]"}, - {"Syllables for Blin","[\\u2D93-\\u2D96]"}, - {"Syllables for Me'en","[\\u2D80-\\u2D92]"}, - {"Syllables for Sebatbeit","[\\u1380-\\u138F\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE]"}, - {"Syllables in -a","[\\uA549-\\uA570]"}, - {"Syllables in -e","[\\uA5E1-\\uA60A]"}, - {"Syllables in -ee","[\\uA500-\\uA514]"}, - {"Syllables in -i","[\\uA515-\\uA548]"}, - {"Syllables in -o","[\\uA5BA-\\uA5E0]"}, - {"Syllables in -oo","[\\uA571-\\uA594]"}, - {"Syllables in -u","[\\uA595-\\uA5B9]"}, - {"Symbol","[\\u03FC\\u07F6\\u166D\\u327F\\uFDFD]"}, - 
{"Symbols","[\\u0FC4-\\u0FCC\\u2CE4-\\u2CEA\\U00010050-\\U0001005D]"}, - {"Symbols for draughts and checkers","[\\u26C0-\\u26C3]"}, - {"Synagmata or Gorgotites (Synagmas or Quickeners)","[\\U0001D08F-\\U0001D099]"}, - {"Syriac cross symbols","[\\u2670\\u2671]"}, - {"Syriac format control character","[\\u070F]"}, - {"Syriac letters","[\\u0710-\\u072C]"}, - {"Syriac marks","[\\u0740-\\u074A]"}, - {"Syriac points (vowels)","[\\u0730-\\u073F]"}, - {"Syriac punctuation and signs","[\\u0700-\\u070D]"}, - {"Tablature","[\\U0001D11C\\U0001D11D]"}, - {"Tacks and turnstiles","[\\u27D8-\\u27DF\\u2ADE-\\u2AED]"}, - {"Tag components","[\\U000E0020-\\U000E007F]"}, - {"Tag identifiers","[\\U000E0001]"}, - {"Tamil numerics","[\\u0BF0-\\u0BF2]"}, - {"Tamil symbol","[\\u0BFA]"}, - {"Tamil symbols","[\\u0BF3-\\u0BF8]"}, - {"Telegraph symbols for days","[\\u33E0-\\u33FE]"}, - {"Telegraph symbols for hours","[\\u3358-\\u3370]"}, - {"Telegraph symbols for months","[\\u32C0-\\u32CB]"}, - {"Telugu fractions and weights","[\\u0C78-\\u0C7F]"}, - {"Terminal graphic characters","[\\u23B7-\\u23B9\\u2596-\\u259F]"}, - {"Tetragrams","[\\U0001D306-\\U0001D356]"}, - {"The IBM 32 compatibility ideographs","[\\uFA0E-\\uFA2D]"}, - {"Threes","[\\U0001F046-\\U0001F04C\\U0001F078-\\U0001F07E]"}, - {"Time signatures","[\\U0001D134\\U0001D135]"}, - {"Todo letters","[\\u1843-\\u185C]"}, - {"Tonal marks","[\\u1390-\\u1399]"}, - {"Tone letters","[\\u02E5-\\u02E9\\u1970-\\u1974]"}, - {"Tone marks","[\\u07EB-\\u07F5\\u0E48-\\u0E4B\\u0EC8-\\u0ECB\\u19C8\\u19C9\\uA92B-\\uA92D]"}, - {"Traditional letters","[\\u1681-\\u1694]"}, - {"Transliteration head letters","[\\u0F88-\\u0F8B]"}, - {"Tremolos","[\\U0001D167-\\U0001D16C]"}, - {"Triangle symbols","[\\u29CA-\\u29D0]"}, - {"Two-part dependent vowel signs","[\\u09CB\\u09CC\\u0B4B\\u0B4C\\u0BCA-\\u0BCC\\u0D4A-\\u0D4C\\u0DDC-\\u0DDE\\u17BE-\\u17C0\\u17C4\\u17C5]"}, - {"Twos","[\\U0001F03F-\\U0001F045\\U0001F071-\\U0001F077]"}, - {"UPA modifiers","[\\u02EF-\\u02FF]"}, 
- {"Uppercase Latin alphabet","[A-Z]"}, - {"Uppercase letters","[\\u0531-\\u0556\\U00010400-\\U00010427]"}, - {"Used for Ancient Greek","[\\u1DC0\\u1DC1]"}, - {"Variant letterform","[\\u03F9]"}, - {"Variant letterforms","[\\u03CF-\\u03D7\\u03F0-\\u03F2]"}, - {"Variant letterforms and symbols","[\\u03F4-\\u03F6]"}, - {"Variation selectors","[\\uFE00-\\uFE0F\\U000E0100-\\U000E01EF]"}, - {"Various signs","[\\u0901-\\u0903\\u093C\\u093D\\u094D\\u0950-\\u0954\\u0981-\\u0983\\u09BC\\u09BD\\u0A01-\\u0A03\\u0A3C\\u0A4D\\u0A51\\u0A81-\\u0A83\\u0ABC\\u0ABD\\u0ACD\\u0AD0\\u0B01-\\u0B03\\u0B3C\\u0B3D\\u0B4D\\u0B56\\u0B57\\u0B82\\u0B83\\u0BCD\\u0BD0\\u0BD7\\u0C01-\\u0C03\\u0C4D\\u0C55\\u0C56\\u0C82\\u0C83\\u0CBC\\u0CBD\\u0CD5\\u0CD6\\u0D02\\u0D03\\u0D4D\\u0D57\\u0D82\\u0D83\\u1036-\\u103A\\u104C-\\u104F\\u17C6-\\u17C8\\u17CB-\\u17D2\\u17D4-\\u17DA\\u17DC\\u17DD\\u1939-\\u193B\\u1940\\u1944\\u1945\\u19DE\\u19DF\\u1A1E\\u1A1F\\u1B00-\\u1B04\\u1B80-\\u1B82\\u1C36\\u1C37\\uA880\\uA881\\U000103C8-\\U000103CF\\U00010A0D-\\U00010A0F\\U00010A38-\\U00010A3A]"}, - {"Vedic signs","[\\u0CF1\\u0CF2]"}, - {"Vertical form digraph","[\\u309F\\u30FF]"}, - {"Vertical line operator","[\\u27CA]"}, - {"Vertical line operators","[\\u2AEE-\\u2AF5]"}, - {"Vertical tiles","[\\U0001F062]"}, - {"Vessels","[\\U000100DE-\\U000100FA]"}, - {"Vietnamese tone marks (deprecated)","[\\u0340\\u0341]"}, - {"Virama","[\\u1714\\u1734\\u1BAA\\uA8C4\\uA953\\U00010A3F]"}, - {"Vocalic modification","[\\u0F7E\\u0F7F]"}, - {"Voicing marks","[\\u3099-\\u309C]"}, - {"Vowel","[\\u0E47\\uA866]"}, - {"Vowel signs","[\\u19B0-\\u19C0\\u1BA4-\\u1BA9\\uA947-\\uA94E]"}, - {"Vowels","[\\u07A6-\\u07B0\\u0E30-\\u0E3A\\u0E40-\\u0E45\\u0EB0-\\u0EB9\\u0EBB\\u0EC0-\\u0EC4\\u1963-\\u196D\\u1A17-\\u1A1B\\u1B83-\\u1B89\\uA85E-\\uA861\\uA926-\\uA92A\\U00010466-\\U0001047F\\U00010A00-\\U00010A03\\U00010A05\\U00010A06]"}, - {"Warning signs","[\\u2620-\\u2623]"}, - {"Weather and astrological symbols","[\\u2600-\\u260D]"}, - {"Weather 
symbol","[\\u2614]"}, - {"White and black arrows","[\\u2B00-\\u2B0D]"}, - {"White arrows and keyboard symbols","[\\u21E6-\\u21F3]"}, - {"White on black circled numbers","[\\u24EB-\\u24F4]"}, - {"Word ligatures","[\\uFDF0-\\uFDFB]"}, - {"Yi radicals","[\\uA490-\\uA4C6]"}, - {"Yiddish digraphs","[\\u05F0-\\u05F2]"}, - {"Yijing hexagram symbols","[\\u4DC0-\\u4DFF]"}, - {"Yijing monogram and digram symbols","[\\u268A-\\u268F]"}, - {"Yijing trigram symbols","[\\u2630-\\u2637]"}, - {"Zeroes","[\\U0001F031-\\U0001F037\\U0001F063-\\U0001F069]"}, - {"Zodiacal symbols","[\\u2648-\\u2653]"}, - }; + static final String[][] data = { + {"APL", "[\\u2336-\\u237A\\u2395]"}, + {"ASCII digits", "[0-9]"}, + {"ASCII punctuation and symbols", "[\\ -/\\:-@\\[-`\\{-~]"}, + {"Abbreviation mark", "[\\uA66F]"}, + {"Accidentals", "[\\U0001D12A-\\U0001D133]"}, + {"Addition for German typography", "[\\u1E9E]"}, + {"Addition for Sanskrit", "[\\u0C3D\\u0D3D]"}, + {"Addition for Torwali", "[\\u0772]"}, + {"Additional Suzhou numerals", "[\\u3038-\\u303A]"}, + {"Additional archaic letters for Bactrian", "[\\u03F7\\u03F8]"}, + {"Additional bold Greek symbols", "[\\U0001D6DB-\\U0001D6E1\\U0001D7CA\\U0001D7CB]"}, + {"Additional bold italic Greek symbols", "[\\U0001D74F-\\U0001D755]"}, + {"Additional circled number", "[\\u24EA]"}, + {"Additional consonant", "[\\u09CE]"}, + { + "Additional consonants", + "[\\u0958-\\u095F\\u09DC\\u09DD\\u09DF\\u0A59-\\u0A5C\\u0A5E\\u0B5C\\u0B5D\\u0B5F\\u0CDE\\u1B45-\\u1B4B\\u1BAE\\u1BAF]" + }, + {"Additional dependent vowel signs", "[\\u0DF2\\u0DF3]"}, + {"Additional diacritical mark for symbols", "[\\u20E1]"}, + {"Additional diacritical marks for symbols", "[\\u20E5-\\u20F0]"}, + {"Additional enclosing diacritics", "[\\u20E2-\\u20E4]"}, + {"Additional italic Greek symbols", "[\\U0001D715-\\U0001D71B]"}, + {"Additional letter", "[\\u03F3]"}, + {"Additional letterlike symbols", "[\\u2139-\\u213F\\u2141-\\u2144\\u214A-\\u214D]"}, + {"Additional letters", 
"[\\u10F9\\u10FA\\u1C4D-\\u1C4F]"}, + {"Additional letters for Mingrelian and Svan", "[\\u10F7\\u10F8]"}, + {"Additional marks for UPA", "[\\u1DFE\\u1DFF]"}, + {"Additional punctuation", "[\\u05F3\\u05F4]"}, + {"Additional sans-serif bold Greek symbols", "[\\U0001D789-\\U0001D78F]"}, + {"Additional sans-serif bold italic Greek symbols", "[\\U0001D7C3-\\U0001D7C9]"}, + {"Additional vowel for Marathi", "[\\u0972]"}, + { + "Additional vowels for Sanskrit", + "[\\u0960-\\u0963\\u09E0-\\u09E3\\u0AE0-\\u0AE3\\u0B60\\u0B61\\u0C60\\u0C61\\u0CE0\\u0CE1\\u0D60\\u0D61]" + }, + {"Additional white on black circled number", "[\\u24FF]"}, + {"Additions", "[\\u0339-\\u033F]"}, + {"Additions based on 1989 IPA", "[\\u02DE-\\u02E4]"}, + {"Additions for Burushaski", "[\\u0773-\\u077D]"}, + {"Additions for Greek", "[\\u0342-\\u0345]"}, + {"Additions for IPA", "[\\u0346-\\u034A]"}, + {"Additions for Khowar", "[\\u076E-\\u0771]"}, + {"Additions for Livonian", "[\\u022A-\\u0233]"}, + {"Additions for Nivkh", "[\\u04FA-\\u04FF]"}, + {"Additions for Romanian", "[\\u0218-\\u021B]"}, + {"Additions for Sinology", "[\\u0234-\\u0236\\u02AE\\u02AF]"}, + {"Additions for Slovenian and Croatian", "[\\u0200-\\u0217]"}, + {"Additions for UPA", "[\\u2C77-\\u2C7D\\uA720\\uA721]"}, + {"Additions for Uighur", "[\\u2C67-\\u2C6C]"}, + {"Additions for early Persian", "[\\u077E\\u077F]"}, + {"Additions for early Persian and Azerbaijani", "[\\u063B-\\u063F]"}, + {"Additions for the Uralic Phonetic Alphabet", "[\\u0350-\\u0357]"}, + {"Afona or Ypostaseis (Mutes or Hypostases)", "[\\U0001D057-\\U0001D07E]"}, + {"African letters for clicks", "[\\u01C0-\\u01C3]"}, + {"Africanist tone letters", "[\\uA71B-\\uA71F]"}, + {"Agogika (Conduits)", "[\\U0001D09A-\\U0001D0A1]"}, + {"Aleut letter", "[\\u051E\\u051F]"}, + {"Alloioseis (Differentiators)", "[\\U0001D0CB-\\U0001D0D9]"}, + {"Alternate consonant forms for Chinese", "[\\uA86D-\\uA870]"}, + {"Analytics", "[\\U0001D1A6-\\U0001D1A9]"}, + {"Ancient Greek acrophonic 
numerals", "[\\U00010140-\\U00010174]"}, + {"Ancient Greek instrumental notation", "[\\U0001D21D-\\U0001D241]"}, + {"Ancient Greek papyrological numbers", "[\\U00010175-\\U0001018A]"}, + {"Ancient Greek textual symbols", "[\\u2E0E-\\u2E16]"}, + {"Ancient Greek vocalic notation", "[\\U0001D200-\\U0001D21C]"}, + {"Ancient Near-Eastern linguistic symbol", "[\\u2E17]"}, + {"Ancient Roman epigraphic letters", "[\\uA7FB-\\uA7FF]"}, + {"Angle brackets", "[\\u2329\\u232A]"}, + {"Angles", "[\\u299B-\\u29AF]"}, + {"Arabic-Indic digits", "[\\u0660-\\u0669]"}, + {"Archaic Roman numerals", "[\\u2180-\\u2183\\u2185-\\u2188]"}, + { + "Archaic letters", + "[\\u0370-\\u0373\\u0376\\u0377\\u03D8-\\u03E1\\u03FA\\u03FB\\u066E\\u066F\\u07E8-\\u07EA\\u10F1-\\u10F6\\u3165-\\u318E]" + }, + {"Archaic punctuation", "[\\u2056\\u2058-\\u205E]"}, + {"Argies (Retards)", "[\\U0001D07F-\\U0001D089]"}, + {"Armenian ligatures", "[\\uFB13-\\uFB17]"}, + {"Arrow tails", "[\\u2919-\\u291C]"}, + {"Arrows", "[\\u27F0-\\u27F4]"}, + {"Arrows combined with operators", "[\\u2942-\\u2949]"}, + {"Arrows combined with relations", "[\\u2971-\\u297B]"}, + {"Arrows with bent tips", "[\\u21B0-\\u21B3\\u2B0E-\\u2B11]"}, + {"Arrows with modifications", "[\\u219A-\\u21AF]"}, + {"Articulation", "[\\U0001D17B-\\U0001D18E]"}, + {"Astrological signs", "[\\u0F15-\\u0F1F\\u0F3E\\u0F3F\\u0FCE\\u0FCF\\u26B3-\\u26BC]"}, + {"Astrological symbols", "[\\u263D-\\u2647]"}, + {"Augmentation dot", "[\\U0001D16D]"}, + {"Bamboo suit tiles", "[\\U0001F010-\\U0001F018]"}, + {"Bars", "[\\U0001D100-\\U0001D105]"}, + {"Based on GB 2312", "[\\u3105-\\u3129]"}, + {"Based on ISO 8859-6", "[\\u0621-\\u063A\\u0640-\\u064A]"}, + {"Based on ISO 8859-8", "[\\u05D0-\\u05EA]"}, + {"Based on JIS X 0208", "[\\u3041-\\u3094\\u30A1-\\u30FA]"}, + {"Basic Russian alphabet", "[\\u0410-\\u044F]"}, + {"Basic consonants", "[\\u0780-\\u0797]"}, + {"Basic glyphs for Arabic language contextual forms", "[\\uFE80-\\uFEFC]"}, + {"Basic letters", 
"[\\u1820-\\u1842]"}, + { + "Basic syllables", + "[\\U00010000-\\U0001000B\\U0001000D-\\U00010026\\U00010028-\\U0001003A\\U0001003C\\U0001003D\\U0001003F]" + }, + {"Beams and slurs", "[\\U0001D173-\\U0001D17A]"}, + {"Bengali-specific additions", "[\\u09F0-\\u09FA]"}, + {"Biblical editorial symbol", "[\\u214F]"}, + {"Block elements", "[\\u2580-\\u2590\\u2594\\u2595]"}, + {"Bohairic Coptic letters", "[\\u2C80-\\u2CB1]"}, + {"Bold Fraktur symbols", "[\\U0001D56C-\\U0001D59F]"}, + {"Bold Greek symbols", "[\\U0001D6A8-\\U0001D6DA]"}, + {"Bold digits", "[\\U0001D7CE-\\U0001D7D7]"}, + {"Bold italic Greek symbols", "[\\U0001D71C-\\U0001D74E]"}, + {"Bold italic symbols", "[\\U0001D468-\\U0001D49B]"}, + {"Bold script symbols", "[\\U0001D4D0-\\U0001D503]"}, + {"Bold symbols", "[\\U0001D400-\\U0001D433]"}, + {"Bowtie symbols", "[\\u29D1-\\u29D7]"}, + {"Bracket pieces", "[\\u239B-\\u23AD\\u23B0\\u23B1]"}, + {"Brackets", "[\\u2983-\\u2998\\u29FC\\u29FD\\u2E1C\\u2E1D\\u2E20\\u2E21\\u2E26-\\u2E29]"}, + {"Braille patterns", "[\\u2800-\\u28FF]"}, + {"C0 controls", "[\\u0000-\\u001F]"}, + {"C1 controls", "[\\u0080-\\u009F]"}, + {"CJK angle brackets", "[\\u3008-\\u300B]"}, + {"CJK brackets", "[\\u3010\\u3011\\u3014-\\u301B]"}, + {"CJK corner brackets", "[\\u300C-\\u300F]"}, + {"CJK radicals supplement", "[\\u2E80-\\u2E99\\u2E9B-\\u2EF3]"}, + {"CJK strokes", "[\\u31C0-\\u31E3]"}, + {"CJK symbols", "[\\u3012\\u3013]"}, + {"CJK symbols and punctuation", "[\\u3000-\\u3007\\u301C-\\u3020]"}, + {"Candrabindu", "[\\uA873]"}, + {"Cantillation marks", "[\\u0591-\\u05AF]"}, + {"Cantillation signs", "[\\u0FC0-\\u0FC3]"}, + {"Capital letters", "[\\u2C00-\\u2C2E]"}, + {"Capital letters (Khutsuri)", "[\\u10A0-\\u10C5]"}, + {"Caucasian linguistics", "[\\u1D77\\u1D78]"}, + {"Ceilings and floors", "[\\u2308-\\u230B]"}, + {"Cereals and plants", "[\\U0001008E-\\U00010094]"}, + {"Character suit tiles", "[\\U0001F007-\\U0001F00F]"}, + {"Chemistry symbol", "[\\u232C\\u23E3]"}, + {"Chess symbols", 
"[\\u2654-\\u265F]"}, + {"Chillu letters", "[\\u0D7A-\\u0D7F]"}, + {"Chinantec tone marks", "[\\uA717-\\uA71A]"}, + {"Chuvash letters", "[\\u0520-\\u0523]"}, + {"Circle", "[\\u2B24]"}, + {"Circle suit tiles", "[\\U0001F019-\\U0001F021]"}, + {"Circle symbols", "[\\u29B5-\\u29C3]"}, + {"Circled Hangul elements", "[\\u3260-\\u326D]"}, + {"Circled Hangul syllable", "[\\u327E]"}, + {"Circled Hangul syllables", "[\\u326E-\\u327B]"}, + {"Circled Katakana", "[\\u32D0-\\u32FE]"}, + {"Circled Korean words", "[\\u327C\\u327D]"}, + {"Circled Latin letters", "[\\u24B6-\\u24E9]"}, + {"Circled ideographs", "[\\u3280-\\u32B0]"}, + {"Circled numbers", "[\\u2460-\\u2473\\u3251-\\u325F\\u32B1-\\u32BF]"}, + {"Circles", "[\\u26AA-\\u26AC]"}, + {"Claudian letters", "[\\u2C75\\u2C76]"}, + {"Clefs", "[\\U0001D11E-\\U0001D126]"}, + {"Codas", "[\\U0001D106-\\U0001D10C]"}, + {"Combining diacritical marks for symbols", "[\\u20D0-\\u20DC]"}, + {"Combining half marks", "[\\uFE20-\\uFE23]"}, + {"Combining maddah and hamza", "[\\u0653-\\u0655]"}, + {"Combining mark", "[\\u135F]"}, + {"Combining marks for Old Cyrillic", "[\\uA67C\\uA67D]"}, + {"Combining numeric signs", "[\\uA670-\\uA672]"}, + {"Combining stroke", "[\\U000101FD]"}, + {"Conjunction and length marks", "[\\u30FB\\u30FC]"}, + {"Consonant", "[\\u103F]"}, + {"Consonant addition for Tibetan", "[\\uA872]"}, + {"Consonant additions for Sanskrit", "[\\uA869-\\uA86C]"}, + {"Consonant for Addu dialect", "[\\u07B1]"}, + {"Consonant shifters", "[\\u17C9\\u17CA]"}, + {"Consonant signs", "[\\u1C2D-\\u1C35\\uA94F-\\uA952\\uAA33-\\uAA36]"}, + { + "Consonants", + 
"[\\u0915-\\u0939\\u0995-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u0A15-\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A33\\u0A35\\u0A36\\u0A38\\u0A39\\u0A95-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0B15-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0C15-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C95-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0D15-\\u0D28\\u0D2A-\\u0D39\\u0D9A-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0E01-\\u0E2E\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD\\u0EAE\\u0F40-\\u0F47\\u0F49-\\u0F6A\\u1000-\\u1020\\u1703-\\u170C\\u170E-\\u1711\\u1723-\\u1731\\u1743-\\u1751\\u1763-\\u176C\\u176E-\\u1770\\u1780-\\u17A2\\u1900-\\u191C\\u1950-\\u1962\\u1980-\\u19A9\\u1A00-\\u1A16\\u1B13-\\u1B33\\u1B8A-\\u1BA0\\u1C00-\\u1C23\\uA840-\\uA85C\\uA862-\\uA865\\uA892-\\uA8B4\\uA90A-\\uA925\\uA930-\\uA946\\uAA06-\\uAA28\\U000103A3-\\U000103C3\\U00010450-\\U00010465\\U00010A10-\\U00010A13\\U00010A15-\\U00010A17\\U00010A19-\\U00010A33]" + }, + {"Consonants and consonant signs", "[\\uA807-\\uA822]"}, + {"Consonants signs", "[\\u1BA1-\\u1BA3]"}, + {"Continuous macrons for Coptic", "[\\uFE24-\\uFE26]"}, + {"Contour tone marks", "[\\u1DC4-\\u1DC9\\u1DCB\\u1DCC]"}, + {"Control character", "[\\u007F]"}, + {"Control code graphics", "[\\u25F0-\\u25F7]"}, + {"Coptic letters derived from Demotic", "[\\u03E2-\\u03EF]"}, + {"Corner tone marks for Chinese", "[\\uA700-\\uA707]"}, + {"Counting rod units", "[\\U0001D360-\\U0001D371]"}, + {"Croatian digraphs matching Serbian Cyrillic letters", "[\\u01C4-\\u01CC]"}, + {"Crops", "[\\u230C-\\u230F]"}, + {"Crosses", "[\\u2719-\\u2720]"}, + {"Crossing arrows for knot theory", "[\\u2927-\\u2932]"}, + {"Currency sign", "[\\u060B\\u0AF1\\uFDFC]"}, + {"Currency symbol", "[\\u0BF9\\u0E3F\\u17DB]"}, + {"Currency symbols", "[\\u20A0-\\u20B5]"}, + {"Cyrillic 
extensions", "[\\u0400-\\u040F\\u0450-\\u045F\\u0510-\\u0513]"}, + {"Cyrillic letter", "[\\u1D2B]"}, + {"DPRK compatibility ideographs", "[\\uFA70-\\uFAD9]"}, + {"Dashes", "[\\u2010-\\u2015]"}, + {"Database theory operators", "[\\u27D5-\\u27D7]"}, + {"Date mark", "[\\u0D79]"}, + {"Dentistry notation symbols", "[\\u23BE-\\u23CC]"}, + {"Dependent consonant signs", "[\\u103B-\\u103E]"}, + {"Dependent vowel sign", "[\\u0DDF]"}, + { + "Dependent vowel signs", + "[\\u093E-\\u094C\\u09BE-\\u09C4\\u09C7\\u09C8\\u0A3E-\\u0A42\\u0A47\\u0A48\\u0A4B\\u0A4C\\u0ABE-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB\\u0ACC\\u0B3E-\\u0B44\\u0B47\\u0B48\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0C3E-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4C\\u0CBE-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCC\\u0D3E-\\u0D44\\u0D46-\\u0D48\\u0DCF-\\u0DD4\\u0DD6\\u0DD8-\\u0DDB\\u0F71-\\u0F7D\\u0F80\\u0F81\\u102B-\\u1035\\u1712\\u1713\\u1732\\u1733\\u1752\\u1753\\u1772\\u1773\\u17B6-\\u17BD\\u17C1-\\u17C3\\u1920-\\u1928\\u1B35-\\u1B43\\uA823-\\uA827\\uA8B5-\\uA8C3\\uAA29-\\uAA32]" + }, + { + "Dependent vowels", + "[\\u0B62\\u0B63\\u0C62\\u0C63\\u0CE2\\u0CE3\\u0D62\\u0D63\\u1C26-\\u1C2C]" + }, + {"Deprecated", "[\\u206A-\\u206F]"}, + {"Devanagari-specific additions", "[\\u0970\\u0971]"}, + {"Diacritical marks for musical symbols", "[\\u1B6B-\\u1B73]"}, + {"Diacritics", "[\\u302A-\\u302F]"}, + {"Dialect (non-Mandarin) letters", "[\\u312A-\\u312C]"}, + {"Diamonds", "[\\u2B16-\\u2B19]"}, + {"Diamonds and lozenges", "[\\u2B25-\\u2B2B]"}, + {"Dice", "[\\u2680-\\u2685]"}, + {"Dictionary and map symbols", "[\\u2690-\\u269B]"}, + {"Dictionary punctuation", "[\\u2E1A\\u2E1B\\u2E1E\\u2E1F]"}, + { + "Digits", + 
"[\\u07C0-\\u07C9\\u0966-\\u096F\\u09E6-\\u09EF\\u0A66-\\u0A6F\\u0AE6-\\u0AEF\\u0B66-\\u0B6F\\u0BE6-\\u0BEF\\u0C66-\\u0C6F\\u0CE6-\\u0CEF\\u0D66-\\u0D6F\\u0E50-\\u0E59\\u0ED0-\\u0ED9\\u0F20-\\u0F29\\u1040-\\u1049\\u1369-\\u1371\\u17E0-\\u17E9\\u1810-\\u1819\\u1946-\\u194F\\u19D0-\\u19D9\\u1B50-\\u1B59\\u1BB0-\\u1BB9\\u1C40-\\u1C49\\u1C50-\\u1C59\\uA620-\\uA629\\uA8D0-\\uA8D9\\uA900-\\uA909\\uAA50-\\uAA59\\U000104A0-\\U000104A9\\U00010A40-\\U00010A43]" + }, + {"Digits minus half", "[\\u0F2A-\\u0F33]"}, + {"Digrams", "[\\U0001D301-\\U0001D305]"}, + {"Digraphs", "[\\u0EDC\\u0EDD]"}, + {"Dingbat arrows", "[\\u2794-\\u27AF\\u27B1-\\u27BE]"}, + {"Dingbat circled digits", "[\\u2776-\\u2793]"}, + {"Division operator", "[\\u27CC]"}, + {"Dotless symbols", "[\\U0001D6A4\\U0001D6A5]"}, + {"Dotted tone letters", "[\\uA708-\\uA711]"}, + {"Double arrows", "[\\u21CD-\\u21D9]"}, + {"Double circled numbers", "[\\u24F5-\\u24FE]"}, + {"Double diacritic", "[\\u1DCD]"}, + {"Double diacritics", "[\\u035C-\\u0362]"}, + {"Double punctuation for vertical text", "[\\u203C\\u2047-\\u2049]"}, + {"Double-barbed harpoons", "[\\u294A-\\u2951]"}, + {"Double-struck digits", "[\\U0001D7D8-\\U0001D7E1]"}, + {"Double-struck italic math symbols", "[\\u2145-\\u2149]"}, + {"Double-struck large operator", "[\\u2140]"}, + {"Double-struck symbols", "[\\U0001D538-\\U0001D56B]"}, + {"Drafting symbols", "[\\u232D-\\u2335]"}, + {"Dragon tiles", "[\\U0001F004-\\U0001F006]"}, + {"Duplicate characters from Big 5", "[\\uFA0C\\uFA0D]"}, + {"Duplicate characters from CNS�11643-1992", "[\\U0002F800-\\U0002FA1D]"}, + {"Dynamics", "[\\U0001D18F-\\U0001D193]"}, + {"Eastern Arabic-Indic digits", "[\\u06F0-\\u06F9]"}, + {"Editorial symbols", "[\\u03FD-\\u03FF]"}, + {"Egyptological additions", "[\\uA722-\\uA725]"}, + {"Ekfonetika", "[\\U0001D003-\\U0001D014]"}, + {"Electrotechnical symbols", "[\\u23DA\\u23DB]"}, + {"Electrotechnical symbols from IR 181", "[\\u238D-\\u2394]"}, + {"Ellipses", "[\\u2B2C-\\u2B2F]"}, + {"Empty 
sets", "[\\u29B0-\\u29B4]"}, + {"Enclosing diacritics", "[\\u20DD-\\u20E0]"}, + {"Error bar symbols", "[\\u29EE-\\u29F3]"}, + {"European Latin", "[\\u0100-\\u017F]"}, + {"Extended Arabic letter", "[\\u06D5]"}, + {"Extended Arabic letter for Parkari", "[\\u06FF]"}, + {"Extended Arabic letters", "[\\u0671-\\u06D3\\u06FA-\\u06FC\\u0750-\\u076D]"}, + {"Extended Arabic letters for Parkari", "[\\u06EE\\u06EF]"}, + {"Extended Bopomofo for Minnan and Hakka", "[\\u31A0-\\u31B7]"}, + {"Extended Bopomofo tone marks", "[\\u02EA\\u02EB]"}, + {"Extended Cyrillic", "[\\u048A-\\u04F9]"}, + {"Extension for Geba Karen", "[\\u1071]"}, + {"Extensions for Arabic", "[\\u0798-\\u07A5]"}, + {"Extensions for Balti", "[\\u0F6B\\u0F6C]"}, + {"Extensions for Eastern Pwo Karen", "[\\u106E-\\u1070]"}, + {"Extensions for Kayah", "[\\u1072-\\u1074]"}, + {"Extensions for Mon", "[\\u105A-\\u1060]"}, + {"Extensions for Rumai Palaung", "[\\u108E\\u108F]"}, + {"Extensions for S'gaw Karen", "[\\u1061-\\u1064]"}, + {"Extensions for Sanskrit and Tibetan", "[\\u1880-\\u18AA]"}, + {"Extensions for Shan", "[\\u1075-\\u108D]"}, + {"Extensions for Western Pwo Karen", "[\\u1065-\\u106D]"}, + {"Extracts", "[\\U00010095-\\U00010099]"}, + {"Fences", "[\\u2999\\u299A\\u29D8-\\u29DB]"}, + {"Figure repetitions", "[\\U0001D10D-\\U0001D10F]"}, + {"Final consonants", "[\\u11A8-\\u11F9\\u1930-\\u1938\\u19C1-\\u19C7]"}, + {"Final letters", "[\\uAA40-\\uAA4D]"}, + {"Fish tails", "[\\u297C-\\u297F]"}, + {"Fives", "[\\U0001F054-\\U0001F05A\\U0001F086-\\U0001F08C]"}, + {"Fixed-form subjoined consonants", "[\\u0FBA-\\u0FBC]"}, + {"Flags", "[\\U0001D16E-\\U0001D172]"}, + {"Flower tiles", "[\\U0001F022-\\U0001F025]"}, + {"Fonitika (Vocals)", "[\\U0001D046-\\U0001D056]"}, + {"Forfeda (supplementary letters)", "[\\u1695-\\u169A]"}, + {"Forks", "[\\u2AD9-\\u2ADD]"}, + {"Form and chart components", "[\\u2500-\\u257F]"}, + {"Format character", "[\\u2060]"}, + {"Format characters", "[\\u200C-\\u200F\\u2028-\\u202F]"}, + {"Format 
controls", "[\\u180B-\\u180E]"}, + {"Fours", "[\\U0001F04D-\\U0001F053\\U0001F07F-\\U0001F085]"}, + {"Fractions", "[\\u0D73-\\u0D75\\u2153-\\u215F\\U0001245A-\\U00012462]"}, + {"Fraktur symbols", "[\\U0001D504-\\U0001D537]"}, + {"Frown and smile", "[\\u2322\\u2323]"}, + {"Fthores (Destroyers)", "[\\U0001D0B6-\\U0001D0CA]"}, + {"Fullwidth ASCII variants", "[\\uFF01-\\uFF5E]"}, + {"Fullwidth brackets", "[\\uFF5F\\uFF60]"}, + {"Fullwidth symbol variants", "[\\uFFE0-\\uFFE6]"}, + {"Further Greek musical notation symbols", "[\\U0001D242-\\U0001D245]"}, + {"GUI icons", "[\\u231A\\u231B]"}, + {"Gender symbol", "[\\u26B2]"}, + {"Gender symbols", "[\\u26A2-\\u26A9]"}, + {"Genealogical symbols", "[\\u26AD-\\u26B1]"}, + { + "General punctuation", + "[\\u2016-\\u2027\\u2030-\\u203B\\u203D-\\u2046\\u204A-\\u2055\\u2057\\u2E18\\u2E19]" + }, + {"Generic punctuation for Philippine scripts", "[\\u1735\\u1736]"}, + {"Generic punctuation for scripts of India", "[\\u0964\\u0965]"}, + {"Geometric shapes", "[\\u25A0-\\u25EF\\u25F8-\\u25FF]"}, + {"Glottal stop", "[\\u097D]"}, + {"Glyph part", "[\\uFE73]"}, + {"Glyphs for contextual forms of letters for Central Asian languages", "[\\uFBD3-\\uFBE9]"}, + { + "Glyphs for contextual forms of letters for Persian, Urdu, Sindhi, etc.", + "[\\uFB50-\\uFBB1]" + }, + {"Glyphs for spacing forms of Arabic points", "[\\uFE70-\\uFE72\\uFE74\\uFE76-\\uFE7F]"}, + {"Glyphs for vertical variants", "[\\uFE10-\\uFE19\\uFE30-\\uFE44\\uFE47\\uFE48]"}, + {"Go markers", "[\\u2686-\\u2689]"}, + {"Golden number runes", "[\\u16EE-\\u16F0]"}, + {"Grammata (Letters)", "[\\U0001D0E6-\\U0001D0EF]"}, + {"Grapheme joiner", "[\\u034F]"}, + {"Graphic picture for control code", "[\\u2424]"}, + {"Graphic pictures for control codes", "[\\u2400-\\u2421]"}, + {"Graphics for control codes", "[\\u237B\\u237D-\\u237F]"}, + {"Greek letters", "[\\u1D26-\\u1D2A]"}, + {"Greek subscript modifier letters", "[\\u1D66-\\u1D6A]"}, + {"Greek superscript modifier letters", 
"[\\u1D5D-\\u1D61]"}, + {"Gregorian notation", "[\\U0001D1D0-\\U0001D1DD]"}, + {"Gurmukhi-specific additions", "[\\u0A70-\\u0A75]"}, + {"Half brackets", "[\\u2E22-\\u2E25]"}, + {"Halfwidth CJK punctuation", "[\\uFF61-\\uFF64]"}, + { + "Halfwidth Hangul variants", + "[\\uFFA0-\\uFFBE\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC]" + }, + {"Halfwidth Katakana variants", "[\\uFF65-\\uFF9F]"}, + {"Halfwidth symbol variants", "[\\uFFE8-\\uFFEE]"}, + {"Harpoons", "[\\u21BC-\\u21C3]"}, + {"Head marks", "[\\u0F01-\\u0F07\\u0FD3\\u0FD4]"}, + {"Head marks for Tibetan", "[\\uA874\\uA875]"}, + {"Hebrew letterlike math symbols", "[\\u2135-\\u2138]"}, + { + "Hebrew presentation forms", + "[\\uFB1D-\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46-\\uFB4F]" + }, + {"Hexagons", "[\\u2B21-\\u2B23]"}, + {"Historic letters", "[\\u0460-\\u0481]"}, + {"Historic miscellaneous", "[\\u0482-\\u0489]"}, + {"Historic phonetic variants", "[\\u0C58\\u0C59]"}, + {"Historic syllables", "[\\uA610-\\uA612\\uA62A\\uA62B]"}, + {"Holds and pauses", "[\\U0001D110-\\U0001D113]"}, + {"Honorifics", "[\\u0610-\\u0614]"}, + {"Horizontal brackets", "[\\u23B4-\\u23B6\\u23DC-\\u23E1]"}, + {"Horizontal tiles", "[\\U0001F030]"}, + {"IPA characters for disordered speech", "[\\u02A9-\\u02AD]"}, + {"IPA diacritics for disordered speech", "[\\u034B-\\u034E]"}, + {"IPA extensions", "[\\u0250-\\u02A8]"}, + {"IPA modifiers", "[\\u02EC\\u02ED]"}, + {"Ichimata and Martyrika (Ichimas and Evidentials)", "[\\U0001D0A2-\\U0001D0B5]"}, + {"Ideographic description characters", "[\\u2FF0-\\u2FFB]"}, + {"Independent vowel (deprecated)", "[\\u17A3]"}, + { + "Independent vowels", + 
"[\\u0904-\\u0914\\u0985-\\u098C\\u098F\\u0990\\u0993\\u0994\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13\\u0A14\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93\\u0A94\\u0B05-\\u0B0C\\u0B0F\\u0B10\\u0B13\\u0B14\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B94\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C14\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0C94\\u0D05-\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D14\\u0D85-\\u0D96\\u1021-\\u102A\\u1700-\\u1702\\u1720-\\u1722\\u1740-\\u1742\\u1760-\\u1762\\u17A4-\\u17B3\\u1B05-\\u1B12\\uA882-\\uA891\\uAA00-\\uAA05\\U000103A0-\\U000103A2]" + }, + {"Independent vowels and dvisvara", "[\\uA800-\\uA805]"}, + {"Inherent vowels", "[\\u17B4\\u17B5]"}, + {"Initial consonants", "[\\u1100-\\u1159\\u115F]"}, + {"Instrumentation", "[\\U0001D1AA-\\U0001D1AD]"}, + {"Insular and Celticist letters", "[\\uA779-\\uA787]"}, + {"Integral pieces", "[\\u2320\\u2321]"}, + {"Integrals", "[\\u222B-\\u2233]"}, + {"Interlinear annotation", "[\\uFFF9-\\uFFFB]"}, + {"Intersections and unions", "[\\u2A40-\\u2A50]"}, + {"Invisible operators", "[\\u2061-\\u2064]"}, + {"Iota subscript", "[\\u037A]"}, + {"Italic Greek symbols", "[\\U0001D6E2-\\U0001D714]"}, + {"Italic symbols", "[\\U0001D434-\\U0001D467]"}, + {"Iteration marks", "[\\u309D\\u309E\\u30FD\\u30FE]"}, + {"JIS X 0213 compatibility ideographs", "[\\uFA30-\\uFA6A]"}, + {"Japanese chess symbols", "[\\u2616\\u2617]"}, + {"Japanese corporation", "[\\u337F]"}, + {"Japanese era names", "[\\u337B-\\u337E]"}, + {"Kanbun", "[\\u3190-\\u319F]"}, + {"Kangxi radicals", "[\\u2F00-\\u2FD5]"}, + {"Katakana punctuation", "[\\u30A0]"}, + {"Keyboard and UI symbols", "[\\u23CE\\u23CF]"}, + {"Keyboard symbol", "[\\u232B\\u2425]"}, + {"Keyboard symbols", "[\\u2324-\\u2328]"}, + {"Keyboard symbols and circle arrows", "[\\u21B4-\\u21BB]"}, + {"Keyboard symbols from ISO 9995-7", "[\\u2380-\\u238C\\u2396-\\u239A]"}, + {"Komi letters", "[\\u0500-\\u050F]"}, + {"Koranic annotation signs", "[\\u0615-\\u061A\\u06D6-\\u06ED]"}, + {"Kurdish letters", 
"[\\u051A-\\u051D]"}, + {"Large operators", "[\\u29F8\\u29F9]"}, + {"Latin extensions for Vietnamese", "[\\u1EA0-\\u1EF1]"}, + {"Latin general extensions", "[\\u1EF2-\\u1EF9]"}, + {"Latin general use extensions", "[\\u1E00-\\u1E9B]"}, + {"Latin letter", "[\\u1D6B]"}, + {"Latin letters", "[\\u1D00-\\u1D25]"}, + {"Latin letters with middle tilde", "[\\u1D6C-\\u1D76]"}, + {"Latin letters with palatal hook", "[\\u1D80-\\u1D8E]"}, + {"Latin letters with retroflex hook", "[\\u1D8F-\\u1D9A]"}, + {"Latin ligatures", "[\\uFB00-\\uFB06]"}, + {"Latin subscript modifier letters", "[\\u1D62-\\u1D65]"}, + {"Latin superscript modifier letters", "[\\u02B0-\\u02B8\\u1D2C-\\u1D5C]"}, + {"Latin-1 punctuation and symbols", "[\\u00A0-\\u00BF]"}, + {"Left-stem tone letters", "[\\uA712-\\uA716]"}, + {"Leimmata or Siopes (Leimmas or Silencers)", "[\\U0001D08A-\\U0001D08E]"}, + {"Length mark", "[\\U00010A0C]"}, + {"Letter", "[\\u0386]"}, + {"Letter A", "[\\uA85D]"}, + {"Letter extender", "[\\u07FA]"}, + {"Letterlike symbol", "[\\u0608]"}, + {"Letterlike symbols", "[\\u2100-\\u2134]"}, + { + "Letters", + "[\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u00FF\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u07CA-\\u07E7\\u16A0-\\u16EA\\u1C5A-\\u1C77\\u2D30-\\u2D65\\U00010280-\\U0001029C\\U000102A0-\\U000102D0\\U00010300-\\U0001031E\\U00010330-\\U0001034A\\U00010380-\\U0001039D\\U00010480-\\U0001049D\\U00010900-\\U00010915\\U00010920-\\U00010939]" + }, + {"Letters for Old Abkhasian orthography", "[\\uA680-\\uA697]"}, + {"Letters for Old Cyrillic", "[\\uA640-\\uA65F\\uA662-\\uA66E]"}, + {"Ligatures (three elements)", "[\\uFD50-\\uFD8F\\uFD92-\\uFDC7]"}, + {"Ligatures (two elements)", "[\\uFBEA-\\uFD3D]"}, + {"Logical and set operators", "[\\u2227-\\u222A]"}, + {"Logical ands and ors", "[\\u2A51-\\u2A63]"}, + {"Logical operators", "[\\u22CE\\u22CF]"}, + {"Logograms", "[\\uA613-\\uA61F]"}, + {"Long arrows", "[\\u27F5-\\u27FF]"}, + {"Lowercase Claudian letter", "[\\u214E\\u2184]"}, + {"Lowercase 
Latin alphabet", "[a-z]"}, + {"Lowercase letters", "[\\u0561-\\u0587\\U00010428-\\U0001044F]"}, + {"Lowercase of editorial symbols", "[\\u037B-\\u037D]"}, + {"Lunar date sign (deprecated)", "[\\u17D3]"}, + {"Lunar date symbols", "[\\u19E0-\\u19FF]"}, + {"Malayalam numerics", "[\\u0D70-\\u0D72]"}, + {"Manchu letters", "[\\u1873-\\u1877]"}, + {"Marks", "[\\u0FD0-\\u0FD2]"}, + {"Marks and signs", "[\\u0F08-\\u0F14\\u0F34-\\u0F39\\u0F82-\\u0F87]"}, + {"Mathematical arrows", "[\\u2B30-\\u2B4C]"}, + {"Mathematical brackets", "[\\u27E6-\\u27EF]"}, + {"Mathematical operator", "[\\u00D7\\u00F7]"}, + {"Mayanist additions", "[\\uA726-\\uA72F]"}, + {"Measures", "[\\U00010137-\\U0001013F]"}, + {"Medial vowels", "[\\u1160-\\u11A2]"}, + {"Medical and healing symbols", "[\\u2624\\u2625]"}, + {"Medieval superscript letter diacritics", "[\\u0363-\\u036F\\u1DD3-\\u1DE6]"}, + {"Medievalist addition", "[\\u1E9F]"}, + {"Medievalist additions", "[\\u1DCE-\\u1DD2\\u1E9C\\u1E9D\\u1EFA-\\u1EFF\\uA730-\\uA778]"}, + {"Medievalist punctuation", "[\\u2E2A-\\u2E30]"}, + {"Melodimata (Melodics)", "[\\U0001D015-\\U0001D045]"}, + {"Mensural notation", "[\\U0001D1B6-\\U0001D1C0]"}, + {"Mensural prolations", "[\\U0001D1C7-\\U0001D1CE]"}, + {"Mensural rests", "[\\U0001D1C1-\\U0001D1C6]"}, + {"Metals", "[\\U0001009A-\\U0001009C]"}, + {"Metrical symbols", "[\\u23D1-\\u23D9]"}, + {"Miscellaneous", "[\\u2701-\\u2718\\u274C-\\u275A]"}, + {"Miscellaneous addition", "[\\u312D]"}, + { + "Miscellaneous additions", + "[\\u021C-\\u0229\\u0237-\\u024F\\u0358-\\u035B\\u2C6D-\\u2C6F\\u2C71-\\u2C74]" + }, + {"Miscellaneous arrow", "[\\u2970]"}, + {"Miscellaneous arrows", "[\\u21F4-\\u21FF\\u2900-\\u2918\\u291D-\\u2926]"}, + {"Miscellaneous arrows and keyboard symbols", "[\\u21DA-\\u21E5]"}, + {"Miscellaneous curved arrows", "[\\u2933-\\u2941]"}, + {"Miscellaneous large operators", "[\\u2A1D-\\u2A21]"}, + {"Miscellaneous mark", "[\\u1DCA]"}, + {"Miscellaneous marks", "[\\u1DC2\\u1DC3]"}, + {"Miscellaneous 
mathematical operator", "[\\u2AF6]"}, + {"Miscellaneous mathematical operators", "[\\u2A39-\\u2A3F\\u2A64\\u2A65]"}, + {"Miscellaneous mathematical symbol", "[\\u220E\\u223F]"}, + { + "Miscellaneous mathematical symbols", + "[\\u2200-\\u2207\\u221E-\\u2222\\u2234\\u2235\\u22A4\\u22A5\\u22BE\\u22BF\\u2980-\\u2982\\u29DC-\\u29E2\\u29E7-\\u29ED\\u29F4-\\u29F7\\u29FE\\u29FF]" + }, + {"Miscellaneous phonetic modifiers", "[\\u02B9-\\u02D7]"}, + {"Miscellaneous symbol", "[\\u2615\\u2668\\u27D0\\U0001D1CF]"}, + { + "Miscellaneous symbols", + "[\\u260E-\\u2613\\u2618\\u2619\\u2638-\\u263C\\u267E\\u267F\\u269C\\u269D\\u26A0\\u26A1\\u27C0-\\u27C9\\U0001D1B1-\\U0001D1B5]" + }, + { + "Miscellaneous technical", + "[\\u2300-\\u2307\\u2310-\\u2319\\u237C\\u23CD\\u23E2\\u23E4-\\u23E7]" + }, + {"Miscellaneous tiles", "[\\U0001F02A\\U0001F02B]"}, + {"Mkhedruli", "[\\u10D0-\\u10F0]"}, + {"Modal logic operators", "[\\u27E0-\\u27E5]"}, + {"Modern letters", "[\\u3131-\\u3163]"}, + {"Modified harpoons", "[\\u2952-\\u2961]"}, + {"Modifier letter", "[\\u10FC\\u2D6F\\uA67F]"}, + {"Modifier letters", "[\\u0559-\\u055F\\u1C78-\\u1C7D\\u1D9B-\\u1DBF\\uA788-\\uA78A]"}, + {"Monogram", "[\\U0001D300]"}, + {"Monospace digits", "[\\U0001D7F6-\\U0001D7FF]"}, + {"Monospace symbols", "[\\U0001D670-\\U0001D6A3]"}, + {"Mordvin letters", "[\\u0514-\\u0519]"}, + {"Multiplication and division sign operators", "[\\u2A2F-\\u2A38]"}, + {"Musical symbols", "[\\u1B74-\\u1B7C\\u2669-\\u266F]"}, + {"Musical symbols for notes", "[\\u1B61-\\u1B6A]"}, + {"N-ary operators", "[\\u220F-\\u2211\\u22C0-\\u22C3\\u2A00-\\u2A09]"}, + {"New Testament editorial symbols", "[\\u2E00-\\u2E0D]"}, + {"Non-European and historic Latin", "[\\u0180-\\u01BF]"}, + { + "Noncharacters", + 
"[\\uFDD0-\\uFDEF\\uFFFE\\uFFFF\\U0002FFFE\\U0002FFFF\\U0003FFFE\\U0003FFFF\\U0004FFFE\\U0004FFFF\\U0005FFFE\\U0005FFFF\\U0006FFFE\\U0006FFFF\\U0007FFFE\\U0007FFFF\\U0008FFFE\\U0008FFFF\\U0009FFFE\\U0009FFFF\\U000AFFFE\\U000AFFFF\\U000BFFFE\\U000BFFFF\\U000CFFFE\\U000CFFFF\\U000DFFFE\\U000DFFFF\\U000EFFFE\\U000EFFFF\\U000FFFFE\\U000FFFFF\\U0010FFFE\\U0010FFFF]" + }, + {"Not character codes", "[\\U0001FFFE\\U0001FFFF]"}, + {"Noteheads", "[\\U0001D143-\\U0001D15B]"}, + {"Notes", "[\\U0001D15C-\\U0001D164]"}, + { + "Numbers", + "[\\u1372-\\u137C\\U00010107-\\U00010133\\U000103D1-\\U000103D5\\U00010916-\\U00010919\\U00010A44-\\U00010A47]" + }, + {"Numbers period", "[\\u2488-\\u249B]"}, + {"Numeral signs", "[\\u0374\\u0375]"}, + {"Numerals", "[\\U00010320-\\U00010323]"}, + {"Numeric character", "[\\u2CFD]"}, + {"Numeric signs", "[\\U00012400-\\U00012459]"}, + {"Numeric symbols for divination lore", "[\\u17F0-\\u17F9]"}, + {"OCR", "[\\u2440-\\u244A]"}, + {"Octaves", "[\\U0001D136-\\U0001D139]"}, + {"Old Church Slavonic combining letters", "[\\u2DE0-\\u2DFF]"}, + {"Old Coptic and dialect letters", "[\\u2CB2-\\u2CDB]"}, + {"Old Nubian letters", "[\\u2CDC-\\u2CE3]"}, + {"Old Nubian punctuation", "[\\u2CF9-\\u2CFC]"}, + {"Ones", "[\\U0001F038-\\U0001F03E\\U0001F06A-\\U0001F070]"}, + {"Operator", "[\\u2238\\u223A\\u2240]"}, + { + "Operators", + "[\\u2212-\\u221D\\u2223-\\u2226\\u228C-\\u228E\\u2293-\\u22A3\\u22BA-\\u22BD\\u22C4-\\u22C7\\u22C9-\\u22CC\\u22D2\\u22D3\\u27D1-\\u27D4\\u2AFC-\\u2AFF]" + }, + {"Ordinary diacritics", "[\\u0300-\\u0333]"}, + {"Oriya-specific additions", "[\\u0B70\\u0B71]"}, + {"Ornamental brackets", "[\\u2768-\\u2775]"}, + {"Ornaments", "[\\U0001D194-\\U0001D1A5]"}, + {"Orthographic Latin additions", "[\\u2C60-\\u2C66]"}, + {"Orthographic letters for glottals", "[\\uA78B\\uA78C]"}, + {"Other CJK punctuation", "[\\u303B-\\u303D]"}, + {"Other CJK symbols", "[\\u3030-\\u3037]"}, + {"Other combining marks", "[\\u0656-\\u065E]"}, + {"Other materials", 
"[\\U0001009D-\\U000100DD]"}, + {"Other modifier letter", "[\\u02EE]"}, + {"Other phonetic symbols", "[\\u1D79-\\u1D7F]"}, + {"Overscores and underscores", "[\\uFE49-\\uFE4F]"}, + {"Overstruck diacritics", "[\\u0334-\\u0338]"}, + {"Paired arrows and harpoons", "[\\u21C4-\\u21CC]"}, + {"Paired harpoons", "[\\u2962-\\u296F]"}, + {"Paired punctuation", "[\\u0F3A-\\u0F3D]"}, + {"Pali and Sanskrit extensions", "[\\u1050-\\u1059]"}, + {"Parenthesized Hangul elements", "[\\u3200-\\u320D]"}, + {"Parenthesized Hangul syllables", "[\\u320E-\\u321C]"}, + {"Parenthesized Korean words", "[\\u321D\\u321E]"}, + {"Parenthesized Latin letters", "[\\u249C-\\u24B5]"}, + {"Parenthesized ideographs", "[\\u3220-\\u3243]"}, + {"Parenthesized numbers", "[\\u2474-\\u2487]"}, + {"Pedals", "[\\U0001D1AE-\\U0001D1B0]"}, + {"Pentagons", "[\\u2B1F\\u2B20\\u2B53\\u2B54]"}, + {"People and animals", "[\\U00010080-\\U0001008D]"}, + {"Persian letters", "[\\u072D-\\u072F]"}, + {"Phonetic and historic letters", "[\\u01DD-\\u01FF]"}, + {"Phonetic extensions for Ainu", "[\\u31F0-\\u31FF]"}, + {"Pinyin diacritic-vowel combinations", "[\\u01CD-\\u01DC]"}, + {"Playing card symbols", "[\\u2660-\\u2667]"}, + {"Plus and minus sign operators", "[\\u2A22-\\u2A2E]"}, + {"Poetic marks", "[\\u060E\\u060F]"}, + {"Poetry marks", "[\\uA828-\\uA82B]"}, + {"Point", "[\\u0670]"}, + {"Pointing hand symbols", "[\\u261A-\\u261F]"}, + {"Points and punctuation", "[\\u05B0-\\u05C3\\u05C6\\u05C7]"}, + {"Points from ISO 8859-6", "[\\u064B-\\u0652]"}, + { + "Precomposed polytonic Greek", + "[\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FC4\\u1FC6-\\u1FD3\\u1FD6-\\u1FDB\\u1FDD-\\u1FEF\\u1FF2-\\u1FF4\\u1FF6-\\u1FFE]" + }, + {"Prevailing wind tiles", "[\\U0001F000-\\U0001F003]"}, + {"Pronunciation variants from KS�X�1001:1998", "[\\uF900-\\uFA0B]"}, + {"Prosodies (Prosodics)", "[\\U0001D000-\\U0001D002]"}, + {"Puncta extraordinaria", 
"[\\u05C4\\u05C5]"}, + { + "Punctuation", + "[\\u037E\\u0387\\u0589\\u058A\\u0609\\u060A\\u060C\\u060D\\u061B\\u061E\\u061F\\u066A-\\u066D\\u06D4\\u07F7-\\u07F9\\u0DF4\\u104A\\u104B\\u10FB\\u1360-\\u1368\\u166E\\u1680\\u169B\\u169C\\u16EB-\\u16ED\\u1800-\\u180A\\u1B5A-\\u1B60\\u1C3B-\\u1C3F\\u1C7E\\u1C7F\\u2CFE\\u2CFF\\uA60D-\\uA60F\\uA8CE\\uA8CF\\uA92E\\uA92F\\uA95F\\uAA5C-\\uAA5F\\uFD3E\\uFD3F\\U00010100-\\U00010102\\U0001039F\\U000103D0\\U0001091F\\U0001093F\\U00010A50-\\U00010A58\\U00012470-\\U00012473]" + }, + {"Punctuation for Tibetan", "[\\uA876\\uA877]"}, + {"Punctuation mark", "[\\uA673\\uA67E]"}, + {"Punctuation ornaments", "[\\u275B-\\u275E\\u2761-\\u2767]"}, + {"Quine corners", "[\\u231C-\\u231F]"}, + {"Radix symbols", "[\\u0606\\u0607]"}, + {"Recycling symbols", "[\\u2672-\\u267D]"}, + {"Relation", "[\\u2239\\u22C8\\u22CD]"}, + {"Relational operators", "[\\u2A66-\\u2ABC]"}, + { + "Relations", + "[\\u2236\\u2237\\u223B-\\u223E\\u2241-\\u228B\\u228F-\\u2292\\u22A6-\\u22B9\\u22D0\\u22D1\\u22D4-\\u22FF\\u29E3-\\u29E6\\u2AF7-\\u2AFB]" + }, + {"Religious and political symbols", "[\\u2626-\\u262F]"}, + {"Replacement characters", "[\\uFFFC\\uFFFD]"}, + { + "Reserved", + "[\\u09E4\\u09E5\\u0A64\\u0A65\\u0AE4\\u0AE5\\u0B64\\u0B65\\u0BE4\\u0BE5\\u0C64\\u0C65\\u0CE4\\u0CE5\\u0D64\\u0D65]" + }, + {"Rest", "[\\U0001D129]"}, + {"Rests", "[\\U0001D13A-\\U0001D142]"}, + {"Roman coin symbols", "[\\U00010196-\\U0001019A]"}, + {"Roman military symbol", "[\\U0001019B]"}, + {"Roman numerals", "[\\u2160-\\u217F]"}, + {"Roman weights and measures", "[\\U00010190-\\U00010195]"}, + {"Rythmika (Rhythmics)", "[\\U0001D0DA-\\U0001D0E5]"}, + {"Sans-serif bold Greek symbols", "[\\U0001D756-\\U0001D788]"}, + {"Sans-serif bold digits", "[\\U0001D7EC-\\U0001D7F5]"}, + {"Sans-serif bold italic Greek symbols", "[\\U0001D790-\\U0001D7C2]"}, + {"Sans-serif bold italic symbols", "[\\U0001D63C-\\U0001D66F]"}, + {"Sans-serif bold symbols", "[\\U0001D5D4-\\U0001D607]"}, + {"Sans-serif digits", 
"[\\U0001D7E2-\\U0001D7EB]"}, + {"Sans-serif italic symbols", "[\\U0001D608-\\U0001D63B]"}, + {"Sans-serif symbols", "[\\U0001D5A0-\\U0001D5D3]"}, + {"Scan lines for terminal graphics", "[\\u23BA-\\u23BD]"}, + {"Script symbols", "[\\U0001D49C-\\U0001D4CF]"}, + {"Season tiles", "[\\U0001F026-\\U0001F029]"}, + {"Set membership", "[\\u2208-\\u220D]"}, + {"Shade characters", "[\\u2591-\\u2593]"}, + {"Shan digits", "[\\u1090-\\u1099]"}, + {"Shan symbols", "[\\u109E\\u109F]"}, + {"Sibe letters", "[\\u185D-\\u1872]"}, + {"Sidelining emphasis marks", "[\\uFE45\\uFE46]"}, + {"Sign", "[\\u09CD\\u09D7\\u0CCD\\u0DCA\\u0E2F\\u0E46\\u0EAF\\u0EC6\\u1B34\\u1B44\\uA806]"}, + { + "Signs", + "[\\u0E4C-\\u0E4F\\u0E5A\\u0E5B\\u0EBC\\u0EBD\\u0ECC\\u0ECD\\u0FBE\\u0FBF\\U000101D0-\\U000101FC\\U00012000-\\U0001236E]" + }, + {"Signs for Sindhi", "[\\u06FD\\u06FE]"}, + {"Simple arrows", "[\\u2190-\\u2199]"}, + {"Sindhi implosives", "[\\u097B\\u097C\\u097E\\u097F]"}, + {"Sixes", "[\\U0001F05B-\\U0001F061\\U0001F08D-\\U0001F093]"}, + {"Small form variants", "[\\uFE50-\\uFE52\\uFE54-\\uFE66\\uFE68-\\uFE6B]"}, + {"Small letters", "[\\u2C30-\\u2C5E\\u3095\\u3096]"}, + {"Small letters (Khutsuri)", "[\\u2D00-\\u2D25]"}, + {"Sogdian letters", "[\\u074D-\\u074F]"}, + {"Space", "[\\u205F]"}, + {"Spaces", "[\\u2000-\\u200B]"}, + {"Spacing accent marks", "[\\u0384\\u0385]"}, + {"Spacing clones of diacritics", "[\\u02D8-\\u02DD]"}, + {"Special", "[\\uFEFF]"}, + {"Special CJK indicators", "[\\u303E\\u303F]"}, + {"Special character", "[\\u3164]"}, + {"Special character extension", "[\\u23D0]"}, + {"Special character extensions", "[\\u23AE\\u23AF]"}, + {"Specialized plus sign operators", "[\\u29FA\\u29FB]"}, + {"Specials", "[\\U0001D0F0-\\U0001D0F5]"}, + {"Specific symbol for control code", "[\\u2426]"}, + {"Specific symbols for space", "[\\u2422\\u2423]"}, + {"Square symbols", "[\\u29C4-\\u29C9]"}, + {"Squared Katakana words", "[\\u3300-\\u3357]"}, + {"Squared Latin abbreviation", "[\\u3250\\u33FF]"}, + 
{"Squared Latin abbreviations", "[\\u32CC-\\u32CF\\u3371-\\u337A\\u3380-\\u33DF]"}, + {"Squares", "[\\u2B12-\\u2B15\\u2B1A-\\u2B1E]"}, + {"Staff brackets", "[\\U0001D114\\U0001D115]"}, + {"Stars", "[\\u2B50-\\u2B52]"}, + {"Stars, asterisks and snowflakes", "[\\u2721-\\u274B]"}, + {"Staves", "[\\U0001D116-\\U0001D11B]"}, + {"Stems", "[\\U0001D165\\U0001D166]"}, + {"Subjoined Consonants", "[\\uA867\\uA868]"}, + {"Subjoined consonant", "[\\uA871]"}, + {"Subjoined consonants", "[\\u0F90-\\u0F97\\u0F99-\\u0FB9\\u1929-\\u192B\\u1C24\\u1C25]"}, + {"Subscripts", "[\\u2080-\\u208E\\u2090-\\u2094]"}, + {"Subset and superset relations", "[\\u2ABD-\\u2AD8]"}, + {"Subtending marks", "[\\u0600-\\u0603]"}, + {"Summation sign parts", "[\\u23B2\\u23B3]"}, + {"Summations and integrals", "[\\u2A0A-\\u2A1C]"}, + {"Superscripts", "[\\u2070-\\u207F]"}, + {"Supplementary signs", "[\\U00010040-\\U0001004D]"}, + {"Suzhou numerals", "[\\u3021-\\u3029]"}, + {"Syllable", "[\\u0F00]"}, + {"Syllable finals", "[\\uA60B\\uA60C]"}, + {"Syllable iteration mark", "[\\uA015]"}, + { + "Syllables", + "[\\u1200-\\u1248\\u124A-\\u124D\\u1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1315\\u1318-\\u135A\\u13A0-\\u13F4\\u1401-\\u166C\\u166F-\\u1676\\uA000-\\uA014\\uA016-\\uA48C\\U00010800-\\U00010805\\U00010808\\U0001080A-\\U00010835\\U00010837\\U00010838\\U0001083C\\U0001083F]" + }, + {"Syllables for Blin", "[\\u2D93-\\u2D96]"}, + {"Syllables for Me'en", "[\\u2D80-\\u2D92]"}, + { + "Syllables for Sebatbeit", + "[\\u1380-\\u138F\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE]" + }, + {"Syllables in -a", "[\\uA549-\\uA570]"}, + {"Syllables in -e", "[\\uA5E1-\\uA60A]"}, + {"Syllables in -ee", "[\\uA500-\\uA514]"}, + {"Syllables in -i", "[\\uA515-\\uA548]"}, + {"Syllables in -o", "[\\uA5BA-\\uA5E0]"}, + {"Syllables 
in -oo", "[\\uA571-\\uA594]"}, + {"Syllables in -u", "[\\uA595-\\uA5B9]"}, + {"Symbol", "[\\u03FC\\u07F6\\u166D\\u327F\\uFDFD]"}, + {"Symbols", "[\\u0FC4-\\u0FCC\\u2CE4-\\u2CEA\\U00010050-\\U0001005D]"}, + {"Symbols for draughts and checkers", "[\\u26C0-\\u26C3]"}, + {"Synagmata or Gorgotites (Synagmas or Quickeners)", "[\\U0001D08F-\\U0001D099]"}, + {"Syriac cross symbols", "[\\u2670\\u2671]"}, + {"Syriac format control character", "[\\u070F]"}, + {"Syriac letters", "[\\u0710-\\u072C]"}, + {"Syriac marks", "[\\u0740-\\u074A]"}, + {"Syriac points (vowels)", "[\\u0730-\\u073F]"}, + {"Syriac punctuation and signs", "[\\u0700-\\u070D]"}, + {"Tablature", "[\\U0001D11C\\U0001D11D]"}, + {"Tacks and turnstiles", "[\\u27D8-\\u27DF\\u2ADE-\\u2AED]"}, + {"Tag components", "[\\U000E0020-\\U000E007F]"}, + {"Tag identifiers", "[\\U000E0001]"}, + {"Tamil numerics", "[\\u0BF0-\\u0BF2]"}, + {"Tamil symbol", "[\\u0BFA]"}, + {"Tamil symbols", "[\\u0BF3-\\u0BF8]"}, + {"Telegraph symbols for days", "[\\u33E0-\\u33FE]"}, + {"Telegraph symbols for hours", "[\\u3358-\\u3370]"}, + {"Telegraph symbols for months", "[\\u32C0-\\u32CB]"}, + {"Telugu fractions and weights", "[\\u0C78-\\u0C7F]"}, + {"Terminal graphic characters", "[\\u23B7-\\u23B9\\u2596-\\u259F]"}, + {"Tetragrams", "[\\U0001D306-\\U0001D356]"}, + {"The IBM 32 compatibility ideographs", "[\\uFA0E-\\uFA2D]"}, + {"Threes", "[\\U0001F046-\\U0001F04C\\U0001F078-\\U0001F07E]"}, + {"Time signatures", "[\\U0001D134\\U0001D135]"}, + {"Todo letters", "[\\u1843-\\u185C]"}, + {"Tonal marks", "[\\u1390-\\u1399]"}, + {"Tone letters", "[\\u02E5-\\u02E9\\u1970-\\u1974]"}, + { + "Tone marks", + "[\\u07EB-\\u07F5\\u0E48-\\u0E4B\\u0EC8-\\u0ECB\\u19C8\\u19C9\\uA92B-\\uA92D]" + }, + {"Traditional letters", "[\\u1681-\\u1694]"}, + {"Transliteration head letters", "[\\u0F88-\\u0F8B]"}, + {"Tremolos", "[\\U0001D167-\\U0001D16C]"}, + {"Triangle symbols", "[\\u29CA-\\u29D0]"}, + { + "Two-part dependent vowel signs", + 
"[\\u09CB\\u09CC\\u0B4B\\u0B4C\\u0BCA-\\u0BCC\\u0D4A-\\u0D4C\\u0DDC-\\u0DDE\\u17BE-\\u17C0\\u17C4\\u17C5]" + }, + {"Twos", "[\\U0001F03F-\\U0001F045\\U0001F071-\\U0001F077]"}, + {"UPA modifiers", "[\\u02EF-\\u02FF]"}, + {"Uppercase Latin alphabet", "[A-Z]"}, + {"Uppercase letters", "[\\u0531-\\u0556\\U00010400-\\U00010427]"}, + {"Used for Ancient Greek", "[\\u1DC0\\u1DC1]"}, + {"Variant letterform", "[\\u03F9]"}, + {"Variant letterforms", "[\\u03CF-\\u03D7\\u03F0-\\u03F2]"}, + {"Variant letterforms and symbols", "[\\u03F4-\\u03F6]"}, + {"Variation selectors", "[\\uFE00-\\uFE0F\\U000E0100-\\U000E01EF]"}, + { + "Various signs", + "[\\u0901-\\u0903\\u093C\\u093D\\u094D\\u0950-\\u0954\\u0981-\\u0983\\u09BC\\u09BD\\u0A01-\\u0A03\\u0A3C\\u0A4D\\u0A51\\u0A81-\\u0A83\\u0ABC\\u0ABD\\u0ACD\\u0AD0\\u0B01-\\u0B03\\u0B3C\\u0B3D\\u0B4D\\u0B56\\u0B57\\u0B82\\u0B83\\u0BCD\\u0BD0\\u0BD7\\u0C01-\\u0C03\\u0C4D\\u0C55\\u0C56\\u0C82\\u0C83\\u0CBC\\u0CBD\\u0CD5\\u0CD6\\u0D02\\u0D03\\u0D4D\\u0D57\\u0D82\\u0D83\\u1036-\\u103A\\u104C-\\u104F\\u17C6-\\u17C8\\u17CB-\\u17D2\\u17D4-\\u17DA\\u17DC\\u17DD\\u1939-\\u193B\\u1940\\u1944\\u1945\\u19DE\\u19DF\\u1A1E\\u1A1F\\u1B00-\\u1B04\\u1B80-\\u1B82\\u1C36\\u1C37\\uA880\\uA881\\U000103C8-\\U000103CF\\U00010A0D-\\U00010A0F\\U00010A38-\\U00010A3A]" + }, + {"Vedic signs", "[\\u0CF1\\u0CF2]"}, + {"Vertical form digraph", "[\\u309F\\u30FF]"}, + {"Vertical line operator", "[\\u27CA]"}, + {"Vertical line operators", "[\\u2AEE-\\u2AF5]"}, + {"Vertical tiles", "[\\U0001F062]"}, + {"Vessels", "[\\U000100DE-\\U000100FA]"}, + {"Vietnamese tone marks (deprecated)", "[\\u0340\\u0341]"}, + {"Virama", "[\\u1714\\u1734\\u1BAA\\uA8C4\\uA953\\U00010A3F]"}, + {"Vocalic modification", "[\\u0F7E\\u0F7F]"}, + {"Voicing marks", "[\\u3099-\\u309C]"}, + {"Vowel", "[\\u0E47\\uA866]"}, + {"Vowel signs", "[\\u19B0-\\u19C0\\u1BA4-\\u1BA9\\uA947-\\uA94E]"}, + { + "Vowels", + 
"[\\u07A6-\\u07B0\\u0E30-\\u0E3A\\u0E40-\\u0E45\\u0EB0-\\u0EB9\\u0EBB\\u0EC0-\\u0EC4\\u1963-\\u196D\\u1A17-\\u1A1B\\u1B83-\\u1B89\\uA85E-\\uA861\\uA926-\\uA92A\\U00010466-\\U0001047F\\U00010A00-\\U00010A03\\U00010A05\\U00010A06]" + }, + {"Warning signs", "[\\u2620-\\u2623]"}, + {"Weather and astrological symbols", "[\\u2600-\\u260D]"}, + {"Weather symbol", "[\\u2614]"}, + {"White and black arrows", "[\\u2B00-\\u2B0D]"}, + {"White arrows and keyboard symbols", "[\\u21E6-\\u21F3]"}, + {"White on black circled numbers", "[\\u24EB-\\u24F4]"}, + {"Word ligatures", "[\\uFDF0-\\uFDFB]"}, + {"Yi radicals", "[\\uA490-\\uA4C6]"}, + {"Yiddish digraphs", "[\\u05F0-\\u05F2]"}, + {"Yijing hexagram symbols", "[\\u4DC0-\\u4DFF]"}, + {"Yijing monogram and digram symbols", "[\\u268A-\\u268F]"}, + {"Yijing trigram symbols", "[\\u2630-\\u2637]"}, + {"Zeroes", "[\\U0001F031-\\U0001F037\\U0001F063-\\U0001F069]"}, + {"Zodiacal symbols", "[\\u2648-\\u2653]"}, + }; } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/Typology.java b/UnicodeJsps/src/main/java/org/unicode/jsp/Typology.java index c9ff87d05..c6ed039c1 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/Typology.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/Typology.java @@ -1,5 +1,10 @@ package org.unicode.jsp; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UProperty.NameChoice; +import com.ibm.icu.text.UnicodeSet; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -9,15 +14,10 @@ import java.util.TreeSet; import java.util.regex.Pattern; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UProperty.NameChoice; -import com.ibm.icu.text.UnicodeSet; - public class Typology { - //static UnicodeMap reasons = new UnicodeMap(); - public static Map label_to_uset = new TreeMap(); + // static UnicodeMap reasons = new 
UnicodeMap(); + public static Map label_to_uset = new TreeMap(); + static { label_to_uset.put("S", new UnicodeSet("[:S:]").freeze()); label_to_uset.put("L", new UnicodeSet("[:L:]").freeze()); @@ -27,17 +27,20 @@ public class Typology { label_to_uset.put("Z", new UnicodeSet("[:Z:]").freeze()); label_to_uset.put("P", new UnicodeSet("[:P:]").freeze()); } - public static Map full_path_to_uset = new TreeMap(); - public static Map path_to_uset = new TreeMap(); - //static Map,UnicodeSet> path_to_uset = new TreeMap,UnicodeSet>(); - public static Relation labelToPaths = new Relation(new TreeMap(), TreeSet.class); - public static Map> label_parent_uset = new TreeMap(); - //public static Relation pathToList = new Relation(new TreeMap(), TreeSet.class); + + public static Map full_path_to_uset = new TreeMap(); + public static Map path_to_uset = new TreeMap(); + // static Map,UnicodeSet> path_to_uset = new TreeMap,UnicodeSet>(); + public static Relation labelToPaths = + new Relation(new TreeMap(), TreeSet.class); + public static Map> label_parent_uset = new TreeMap(); + // public static Relation pathToList = new Relation(new TreeMap(), + // TreeSet.class); static class MyReader extends FileUtilities.SemiFileReader { - //0000 Cc [Control] [X] [X] [X] - public final static Pattern SPLIT = Pattern.compile("\\s*\t\\s*"); - public final static Pattern NON_ALPHANUM = Pattern.compile("[^0-9A-Za-z]+"); + // 0000 Cc [Control] [X] [X] [X] + public static final Pattern SPLIT = Pattern.compile("\\s*\t\\s*"); + public static final Pattern NON_ALPHANUM = Pattern.compile("[^0-9A-Za-z]+"); protected String[] splitLine(String line) { return SPLIT.split(line); @@ -56,13 +59,13 @@ protected boolean handleLine(int startRaw, int endRaw, String[] items) { if (!item.startsWith("[") || !item.endsWith("]")) { throw new IllegalArgumentException(i + "\t" + item); } - item = item.substring(1, item.length()-1); + item = item.substring(1, item.length() - 1); if (item.length() == 0) continue; item = 
NON_ALPHANUM.matcher(item).replaceAll("_"); temp_path.append('/').append(item); } String fullPath = temp_path.toString(); - + // store { fullPath = fullPath.intern(); @@ -82,7 +85,7 @@ protected boolean handleLine(int startRaw, int endRaw, String[] items) { } uset.add(startRaw, endRaw); - //labelToPath.put(item, path); + // labelToPath.put(item, path); path = (path + "/" + item).intern(); @@ -95,13 +98,13 @@ protected boolean handleLine(int startRaw, int endRaw, String[] items) { return true; } - Map,List> listCache = new HashMap,List>(); - Map,Set> setCache = new HashMap,Set>(); + Map, List> listCache = new HashMap, List>(); + Map, Set> setCache = new HashMap, Set>(); - private T intern(Map cache, T list) { + private T intern(Map cache, T list) { T old = cache.get(list); if (old != null) return old; - cache.put(list,list); + cache.put(list, list); return list; } } @@ -110,13 +113,14 @@ private T intern(Map cache, T list) { new MyReader().process(Typology.class, "Categories.txt"); // "09421-u52m09xxxx.txt" // fix the paths - Map temp= new TreeMap(); + Map temp = new TreeMap(); for (int i = 0; i < UCharacter.CHAR_CATEGORY_COUNT; ++i) { - UnicodeSet same = new UnicodeSet() - .applyIntPropertyValue(UProperty.GENERAL_CATEGORY, i); - String gcName = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, i, NameChoice.SHORT); - //System.out.println("\n" + gcName); - String prefix = gcName.substring(0,1); + UnicodeSet same = new UnicodeSet().applyIntPropertyValue(UProperty.GENERAL_CATEGORY, i); + String gcName = + UCharacter.getPropertyValueName( + UProperty.GENERAL_CATEGORY, i, NameChoice.SHORT); + // System.out.println("\n" + gcName); + String prefix = gcName.substring(0, 1); for (String path : path_to_uset.keySet()) { UnicodeSet uset = path_to_uset.get(path); diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UBAVersion.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UBAVersion.java index 3d534680d..7c351c789 100644 --- 
a/UnicodeJsps/src/main/java/org/unicode/jsp/UBAVersion.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UBAVersion.java @@ -1,26 +1,24 @@ package org.unicode.jsp; import java.util.EnumSet; - import org.unicode.props.UcdPropertyValues.Age_Values; -/** - * A class to encapsulate the available C UBA versions. - */ +/** A class to encapsulate the available C UBA versions. */ public class UBAVersion { - private static EnumSet C_UBA_AGES = EnumSet.of( - Age_Values.V6_2, - Age_Values.V6_3, - Age_Values.V7_0, - Age_Values.V8_0, - Age_Values.V9_0, - Age_Values.V10_0, - Age_Values.V11_0, - Age_Values.V12_0, - Age_Values.V13_0, - Age_Values.V14_0 - /* Current version is always last */ - ); + private static EnumSet C_UBA_AGES = + EnumSet.of( + Age_Values.V6_2, + Age_Values.V6_3, + Age_Values.V7_0, + Age_Values.V8_0, + Age_Values.V9_0, + Age_Values.V10_0, + Age_Values.V11_0, + Age_Values.V12_0, + Age_Values.V13_0, + Age_Values.V14_0 + /* Current version is always last */ + ); public static EnumSet getVersions() { return C_UBA_AGES; @@ -28,6 +26,7 @@ public static EnumSet getVersions() { /** * As a select, such as "140" + * * @return */ public static final String toSelect(Age_Values age) { diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeDataInput.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeDataInput.java index 25b532733..0a84443ab 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeDataInput.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeDataInput.java @@ -1,10 +1,9 @@ package org.unicode.jsp; -import java.io.DataInput; -import java.io.IOException; - import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.text.UnicodeSet; +import java.io.DataInput; +import java.io.IOException; public class UnicodeDataInput { @@ -22,6 +21,7 @@ public DataInput get() { /** * Reads a UnicodeSet in the format of writeUnicodeSet. 
+ * * @param input * @return set read * @throws IOException @@ -43,7 +43,7 @@ public UnicodeSet readUnicodeSet() throws IOException { return result; } - public static abstract class ItemReader { + public abstract static class ItemReader { public abstract T read(DataInput in) throws IOException; public T[] readArray(DataInput input) throws IOException { @@ -74,14 +74,15 @@ public UnicodeMap readUnicodeMap(ItemReader reader) throws IOException return readUnicodeMap(reader, input); } - public static UnicodeMap readUnicodeMap(ItemReader reader, DataInput dataInput) throws IOException { + public static UnicodeMap readUnicodeMap(ItemReader reader, DataInput dataInput) + throws IOException { final UnicodeMap result = new UnicodeMap(); // values final T[] values = reader.readArray(dataInput); // transitions final int transitionCount = dataInput.readInt(); - final int[] transitions = new int[transitionCount+1]; + final int[] transitions = new int[transitionCount + 1]; int last = 0; for (int i = 0; i < transitionCount; ++i) { transitions[i] = last = dataInput.readInt() + last; @@ -90,12 +91,11 @@ public static UnicodeMap readUnicodeMap(ItemReader reader, DataInput d // values for (int i = 0; i < transitionCount; ++i) { final int valueIndex = dataInput.readInt(); - if (valueIndex < 0) - { + if (valueIndex < 0) { continue; // no value } final T value = values[valueIndex]; - result.putAll(transitions[i], transitions[i+1]-1, value); + result.putAll(transitions[i], transitions[i + 1] - 1, value); } // strings final int stringCount = dataInput.readInt(); @@ -107,5 +107,4 @@ public static UnicodeMap readUnicodeMap(ItemReader reader, DataInput d } return result; } - } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeJsp.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeJsp.java index 4216afeb5..956388544 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeJsp.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeJsp.java @@ -1,5 +1,15 @@ package 
org.unicode.jsp; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.RuleBasedBreakIterator; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.VersionInfo; import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; @@ -9,7 +19,6 @@ import java.util.Random; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.util.BNF; import org.unicode.cldr.util.Quoter; import org.unicode.idna.Idna2003; @@ -17,20 +26,10 @@ import org.unicode.idna.Uts46; import org.unicode.jsp.UnicodeUtilities.CodePointShower; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.BreakIterator; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.RuleBasedBreakIterator; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; -import com.ibm.icu.util.VersionInfo; - public class UnicodeJsp { public static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH); + static { nf.setGroupingUsed(true); nf.setMaximumFractionDigits(0); @@ -49,7 +48,10 @@ public static String showRegexFind(String regex, String test) { try { Matcher matcher = Pattern.compile(regex, Pattern.COMMENTS).matcher(test); String result = UnicodeUtilities.toHTML.transform(matcher.replaceAll("⇑⇑$0⇓⇓")); - result = result.replaceAll("⇑⇑", "").replaceAll("⇓⇓", "").replaceAll("\r?\n", "
"); + result = + result.replaceAll("⇑⇑", "") + .replaceAll("⇓⇓", "") + .replaceAll("\r?\n", "
"); return result; } catch (Exception e) { return "Error: " + e.getMessage(); @@ -58,22 +60,21 @@ public static String showRegexFind(String regex, String test) { /** * The regex doesn't have to have the UnicodeSets resolved. + * * @param regex * @param count * @param maxRepeat * @return */ public static String getBnf(String regexSource, int count, int maxRepeat) { - //String regex = new UnicodeRegex().compileBnf(rules); + // String regex = new UnicodeRegex().compileBnf(rules); String regex = regexSource.replace("(?:", "(").replace("(?i)", ""); BNF bnf = new BNF(new Random(), new Quoter.RuleQuoter()); if (maxRepeat > 20) { maxRepeat = 20; } - bnf.setMaxRepeat(maxRepeat) - .addRules("$root=" + regex + ";") - .complete(); + bnf.setMaxRepeat(maxRepeat).addRules("$root=" + regex + ";").complete(); StringBuffer output = new StringBuffer(); for (int i = 0; i < count; ++i) { String line = bnf.next(); @@ -86,8 +87,10 @@ public static String showBreaks(String text, String choice) { RuleBasedBreakIterator b; if (choice.equals("Word")) b = (RuleBasedBreakIterator) BreakIterator.getWordInstance(); - else if (choice.equals("Line")) b = (RuleBasedBreakIterator) BreakIterator.getLineInstance(); - else if (choice.equals("Sentence")) b = (RuleBasedBreakIterator) BreakIterator.getSentenceInstance(); + else if (choice.equals("Line")) + b = (RuleBasedBreakIterator) BreakIterator.getLineInstance(); + else if (choice.equals("Sentence")) + b = (RuleBasedBreakIterator) BreakIterator.getSentenceInstance(); else b = (RuleBasedBreakIterator) BreakIterator.getCharacterInstance(); Matcher decimalEscapes = Pattern.compile("&#(x)?([0-9]+);").matcher(text); @@ -97,7 +100,7 @@ public static String showBreaks(String text, String choice) { while (decimalEscapes.find(start)) { int radix = 10; int code = Integer.parseInt(decimalEscapes.group(2), radix); - result2.append(text.substring(start,decimalEscapes.start()) + UTF16.valueOf(code)); + result2.append(text.substring(start, decimalEscapes.start()) + 
UTF16.valueOf(code)); start = decimalEscapes.end(); } result2.append(text.substring(start)); @@ -110,36 +113,40 @@ public static String showBreaks(String text, String choice) { for (int nextBreak = b.next(); nextBreak != BreakIterator.DONE; nextBreak = b.next()) { int status = b.getRuleStatus(); String piece = text.substring(lastBreak, nextBreak); - //piece = toHTML.transliterate(piece); + // piece = toHTML.transliterate(piece); piece = UnicodeUtilities.toHTML(piece); - piece = piece.replaceAll(" ","
") - .replaceAll("\r\n", "
") - .replaceAll("\n", "
"); + piece = + piece.replaceAll(" ", "
") + .replaceAll("\r\n", "
") + .replaceAll("\n", "
"); result.append("").append(piece).append(""); lastBreak = nextBreak; } - return result.toString(); } + return result.toString(); + } public static void showProperties(int cp, Appendable out) throws IOException { UnicodeUtilities.showProperties(cp, out); } - static String defaultIdnaInput = "" - +"fass.de faß.de fäß.de xn--fa-hia.de" - + "\n₹.com 𑀓.com" - + "\n\u0080.com xn--a.com a\u200cb xn--ab-j1t" - +"\nöbb.at ÖBB.at ÖBB.at" - +"\nȡog.de ☕.de I♥NY.de" - +"\nABC・日本.co.jp 日本。co。jp 日本。co.jp 日本⒈co.jp" - +"\nx\\u0327\\u0301.de x\\u0301\\u0327.de" - +"\nσόλος.gr Σόλος.gr ΣΌΛΟΣ.gr" - +"\nﻋﺮﺑﻲ.de عربي.de نامهای.de نامه\\u200Cای.de".trim(); + static String defaultIdnaInput = + "" + + "fass.de faß.de fäß.de xn--fa-hia.de" + + "\n₹.com 𑀓.com" + + "\n\u0080.com xn--a.com a\u200cb xn--ab-j1t" + + "\nöbb.at ÖBB.at ÖBB.at" + + "\nȡog.de ☕.de I♥NY.de" + + "\nABC・日本.co.jp 日本。co。jp 日本。co.jp 日本⒈co.jp" + + "\nx\\u0327\\u0301.de x\\u0301\\u0327.de" + + "\nσόλος.gr Σόλος.gr ΣΌΛΟΣ.gr" + + "\nﻋﺮﺑﻲ.de عربي.de نامهای.de نامه\\u200Cای.de".trim(); public static String getDefaultIdnaInput() { return defaultIdnaInput; } + public static final Transliterator UNESCAPER = Transliterator.getInstance("hex-any"); public static String getLanguageOptions(String locale) { @@ -150,11 +157,12 @@ public static String getTrace(Exception e) { return Arrays.asList(e.getStackTrace()).toString().replace("\n", "<\br>"); } - public static String getSimpleSet(String setA, UnicodeSet a, boolean abbreviate, boolean escape) { + public static String getSimpleSet( + String setA, UnicodeSet a, boolean abbreviate, boolean escape) { String a_out; a.clear(); try { - //setA = UnicodeSetUtilities.MyNormalize(setA, Normalizer.NFC); + // setA = UnicodeSetUtilities.MyNormalize(setA, Normalizer.NFC); a.addAll(UnicodeSetUtilities.parseUnicodeSet(setA)); a_out = UnicodeUtilities.getPrettySet(a, abbreviate, escape); } catch (Exception e) { @@ -163,13 +171,25 @@ public static String getSimpleSet(String setA, UnicodeSet a, 
boolean abbreviate, return a_out; } - public static void showSet(String grouping, String info, UnicodeSet a, boolean abbreviate, boolean ucdFormat, boolean collate, Appendable out) throws IOException { - CodePointShower codePointShower = new CodePointShower(grouping, info, abbreviate, ucdFormat, collate); + public static void showSet( + String grouping, + String info, + UnicodeSet a, + boolean abbreviate, + boolean ucdFormat, + boolean collate, + Appendable out) + throws IOException { + CodePointShower codePointShower = + new CodePointShower(grouping, info, abbreviate, ucdFormat, collate); UnicodeUtilities.showSetMain(a, codePointShower, out); } - public static void showPropsTable(Appendable out, String propForValues, String myLink) throws IOException { + + public static void showPropsTable(Appendable out, String propForValues, String myLink) + throws IOException { UnicodeUtilities.showPropsTable(out, propForValues, myLink); } + public static String showTransform(String transform, String sample) { return UnicodeUtilities.showTransform(transform, sample); } @@ -178,18 +198,24 @@ public static String listTransforms() { return UnicodeUtilities.listTransforms(); } - public static void getDifferences(String setA, String setB, - boolean abbreviate, String[] abResults, int[] abSizes, String[] abLinks) { + public static void getDifferences( + String setA, + String setB, + boolean abbreviate, + String[] abResults, + int[] abSizes, + String[] abLinks) { UnicodeUtilities.getDifferences(setA, setB, abbreviate, abResults, abSizes, abLinks); } public static int parseCode(String text, String nextButton, String previousButton) { - //text = fromHTML.transliterate(text); + // text = fromHTML.transliterate(text); String trimmed = text.trim(); if (trimmed.length() > 1) { try { - text = UTF16.valueOf(Integer.parseInt(trimmed,16)); - } catch (Exception e) {} + text = UTF16.valueOf(Integer.parseInt(trimmed, 16)); + } catch (Exception e) { + } } int cp = UTF16.charAt(text, 0); if 
(nextButton != null) { @@ -211,21 +237,23 @@ public static String getConfusables(String test, int choice) { Confusables confusables = new Confusables(test); switch (choice) { - case 0: // none - break; - case 1: // IDNA2008 - confusables.setAllowedCharacters(Idna2003.SINGLETON.validSet_transitional); - confusables.setNormalizationCheck(Normalizer.NFC); - break; - case 2: // IDNA2008 - confusables.setAllowedCharacters(Idna2008.SINGLETON.validSet_transitional); - confusables.setNormalizationCheck(Normalizer.NFC); - break; - case 3: // UTS46/39 - confusables.setAllowedCharacters(new UnicodeSet(Uts46.SINGLETON.validSet_transitional).retainAll(XIDModifications.getAllowed())); - confusables.setNormalizationCheck(Normalizer.NFC); - confusables.setScriptCheck(Confusables.ScriptCheck.same); - break; + case 0: // none + break; + case 1: // IDNA2008 + confusables.setAllowedCharacters(Idna2003.SINGLETON.validSet_transitional); + confusables.setNormalizationCheck(Normalizer.NFC); + break; + case 2: // IDNA2008 + confusables.setAllowedCharacters(Idna2008.SINGLETON.validSet_transitional); + confusables.setNormalizationCheck(Normalizer.NFC); + break; + case 3: // UTS46/39 + confusables.setAllowedCharacters( + new UnicodeSet(Uts46.SINGLETON.validSet_transitional) + .retainAll(XIDModifications.getAllowed())); + confusables.setNormalizationCheck(Normalizer.NFC); + confusables.setScriptCheck(Confusables.ScriptCheck.same); + break; } return getConfusablesCore(test, confusables); } catch (Exception e) { @@ -242,8 +270,12 @@ private static String returnStackTrace(Exception e) { return str; } - - public static String getConfusables(String test, boolean nfkcCheck, boolean scriptCheck, boolean idCheck, boolean xidCheck) { + public static String getConfusables( + String test, + boolean nfkcCheck, + boolean scriptCheck, + boolean idCheck, + boolean xidCheck) { try { Confusables confusables = new Confusables(test); @@ -271,13 +303,13 @@ private static String getConfusablesCore(String test, 
Confusables confusables) { max = size; } } - String topCell = "

\n" + - "" + - "" + - "" + - "" + - "" + - ""); + "\n" + + "" + + "" + + "" + + "" + + "" + + ""); } } @@ -239,15 +270,27 @@ private static void showOrderedList2(String title, Collection"; + line = + ""; } LOG_WRITER.println(line); } @@ -257,33 +300,34 @@ private static void showOrderedList2(String title, Collection[]> DOUBLE_STRING_COMP = new Comparator[]>() { - // only handle the case where the lengths are equal - public int compare(Comparable[] o1, Comparable[] o2) { - for (int i = 0; i < o1.length; ++i) { - int result = o1[i].compareTo(o2[i]); - if (result != 0) { - return -result; + static Comparator[]> DOUBLE_STRING_COMP = + new Comparator[]>() { + // only handle the case where the lengths are equal + public int compare(Comparable[] o1, Comparable[] o2) { + for (int i = 0; i < o1.length; ++i) { + int result = o1[i].compareTo(o2[i]); + if (result != 0) { + return -result; + } + } + return 0; } - } - return 0; - } - }; + }; /** - * Generate new mapping file, based on exceptions file. See normalizeForMatchExceptions.txt for the format. - * - * @param specialMappingsFile - * TODO - * @param outputFile - * TODO + * Generate new mapping file, based on exceptions file. See normalizeForMatchExceptions.txt for + * the format. 
+ * + * @param specialMappingsFile TODO + * @param outputFile TODO * @param diffSource * @param frequencyData * @param frequencyFile * @throws IOException */ - static void generateMappings(String specialMappingsFile, String outputFile, String diffSource, String frequencyFile) - throws IOException { + static void generateMappings( + String specialMappingsFile, String outputFile, String diffSource, String frequencyFile) + throws IOException { PrintWriter out = openUTF8Writer(outputFile); out.println("# Generated from: " + new File(specialMappingsFile).getName()); out.println("# Date: " + ISO_DATE.format(new Date())); @@ -295,7 +339,7 @@ static void generateMappings(String specialMappingsFile, String outputFile, Stri loadMappings(specialMappingsFile, SPECIAL_MAPPINGS, true); UnicodeMap mappings = new UnicodeMap(); - for (UnicodeSetIterator it = new UnicodeSetIterator(ASSIGNED); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(ASSIGNED); it.next(); ) { if (LIST_STYLE == ListStyle.ONLY_OLD && !U50.contains(it.codepoint)) { continue; } @@ -309,7 +353,7 @@ static void generateMappings(String specialMappingsFile, String outputFile, Stri while (true) { UnicodeMap deltaMappings = new UnicodeMap(); UnicodeSet done = new UnicodeSet(); - for (UnicodeSetIterator it = new UnicodeSetIterator(mappings.keySet()); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(mappings.keySet()); it.next(); ) { String target = (String) mappings.getValue(it.codepoint); String recursed = replace(target, mappings); if (recursed != target) { @@ -326,7 +370,7 @@ static void generateMappings(String specialMappingsFile, String outputFile, Stri ; // print them - for (UnicodeSetIterator it = new UnicodeSetIterator(mappings.keySet()); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(mappings.keySet()); it.next(); ) { writeMapping(out, it.getString(), (String) mappings.getValue(it.codepoint), SIMPLE); } out.close(); @@ -361,16 +405,23 @@ private static 
String getRemapped(int codepoint, UnicodeMap mapping) { } enum RuleMappings { - __ok, __nfc, __caseonly, __bracket, __bracket_up, __bracket_circle, __bracket_down, __delete, __exclude + __ok, + __nfc, + __caseonly, + __bracket, + __bracket_up, + __bracket_circle, + __bracket_down, + __delete, + __exclude } /** - * Remap a string based on a special flag (usually gotten from the special_mappings, but - * broken out so that we can see the effects of new rules). - * + * Remap a string based on a special flag (usually gotten from the special_mappings, but broken + * out so that we can see the effects of new rules). + * * @param codepoint - * @param special - * either "exclude", or "caseonly", or null, or actual result. + * @param special either "exclude", or "caseonly", or null, or actual result. * @return */ private static String getRemapped(int codepoint, String special) { @@ -384,29 +435,29 @@ private static String getRemapped(int codepoint, String special) { other = special; } else { switch (RuleMappings.valueOf(special)) { - case __ok: - other = normalizeAndCaseFold(other, Normalizer.NFKC); - break; - case __nfc: - other = Normalizer.normalize(other, Normalizer.NFC, 0); - break; - case __caseonly: - other = normalizeAndCaseFold(other, Normalizer.NFC); - break; - case __bracket: - other = " " + normalizeAndCaseFold(other, Normalizer.NFKC) + " "; - break; - case __bracket_down: - other = "⌜" + normalizeAndCaseFold(other, Normalizer.NFKC) + "⌝"; - break; - case __bracket_up: - other = "⌞" + normalizeAndCaseFold(other, Normalizer.NFKC) + "⌟"; - break; - case __bracket_circle: - other = "(" + normalizeAndCaseFold(other, Normalizer.NFKC) + ")"; - break; - default: - throw new IllegalArgumentException("Missing rule"); + case __ok: + other = normalizeAndCaseFold(other, Normalizer.NFKC); + break; + case __nfc: + other = Normalizer.normalize(other, Normalizer.NFC, 0); + break; + case __caseonly: + other = normalizeAndCaseFold(other, Normalizer.NFC); + break; + case 
__bracket: + other = " " + normalizeAndCaseFold(other, Normalizer.NFKC) + " "; + break; + case __bracket_down: + other = "⌜" + normalizeAndCaseFold(other, Normalizer.NFKC) + "⌝"; + break; + case __bracket_up: + other = "⌞" + normalizeAndCaseFold(other, Normalizer.NFKC) + "⌟"; + break; + case __bracket_circle: + other = "(" + normalizeAndCaseFold(other, Normalizer.NFKC) + ")"; + break; + default: + throw new IllegalArgumentException("Missing rule"); } } return other; @@ -414,7 +465,7 @@ private static String getRemapped(int codepoint, String special) { /** * Do the standard normalization & casefold - * + * * @param other * @return */ @@ -427,44 +478,64 @@ private static String normalizeAndCaseFold(String other, Mode normalizerType) { /** * Write out a mapping line - * + * * @param out * @param source * @param target * @param simple TODO */ - private static void writeMapping(PrintWriter out, String source, String target, boolean simple) { + private static void writeMapping( + PrintWriter out, String source, String target, boolean simple) { if (simple) { - out.println(source + "; " + target - + " # " + hex(source, " ") + " → " + hex(target, " ") - + ", " + UCharacter.getName(source, " + ") + " → " + UCharacter.getName(target, " + ")); + out.println( + source + + "; " + + target + + " # " + + hex(source, " ") + + " → " + + hex(target, " ") + + ", " + + UCharacter.getName(source, " + ") + + " → " + + UCharacter.getName(target, " + ")); return; } String otherName = jimName(target); - String age = (LIST_STYLE == ListStyle.SHOW_AGE) && (!U50.containsAll(source) || !U50.containsAll(target)) - ? showVersion(getNewest(source + target)) + " " : ""; - out.println(hex(source, " ") - + " ; " + hex(target, " ").replace(",", " ") - + " ; # " + age + UCharacter.getName(source, " + ") - + " => " + otherName - ); + String age = + (LIST_STYLE == ListStyle.SHOW_AGE) + && (!U50.containsAll(source) || !U50.containsAll(target)) + ? 
showVersion(getNewest(source + target)) + " " + : ""; + out.println( + hex(source, " ") + + " ; " + + hex(target, " ").replace(",", " ") + + " ; # " + + age + + UCharacter.getName(source, " + ") + + " => " + + otherName); } /** * Show the version in a nice format - * + * * @param newest * @return */ private static String showVersion(VersionInfo newest) { - return "[" + newest.getMajor() + "." + newest.getMinor() - + (newest.getMilli() == 0 ? "" : "." + newest.getMilli()) - + "]"; + return "[" + + newest.getMajor() + + "." + + newest.getMinor() + + (newest.getMilli() == 0 ? "" : "." + newest.getMilli()) + + "]"; } /** * Get the age of the newest character in the string. - * + * * @param string * @return */ @@ -483,16 +554,15 @@ private static VersionInfo getNewest(String string) { /** * Load the special mappings from a file. - * - * @param filename - * TODO + * + * @param filename TODO * @param resultMappings - * @param printWriter - * TODO + * @param printWriter TODO * @throws IOException */ - private static void loadMappings(String filename, UnicodeMap resultMappings, boolean printWriter) - throws IOException { + private static void loadMappings( + String filename, UnicodeMap resultMappings, boolean printWriter) + throws IOException { // SPECIAL_MAPPINGS.putAll(0,0x10FFFF, "exclude"); BufferedReader in = openUTF8Reader(filename); int lineNumber = 0; @@ -507,17 +577,20 @@ private static void loadMappings(String filename, UnicodeMap resultMappi /** * Process each special mapping line from a file - * + * * @param lineNumber * @param line * @param skipIfIdentical * @param resultMappings - * @param printWriter - * TODO + * @param printWriter TODO * @param mappings */ - private static void getMappingFromSemiLine(int lineNumber, String line, boolean skipIfIdentical, - UnicodeMap resultMappings, boolean printWriter) { + private static void getMappingFromSemiLine( + int lineNumber, + String line, + boolean skipIfIdentical, + UnicodeMap resultMappings, + boolean 
printWriter) { line = line.trim(); if (line.startsWith("\uFEFF")) { line = line.substring(1); @@ -556,24 +629,29 @@ private static void getMappingFromSemiLine(int lineNumber, String line, boolean source.add(start, end); } - UnicodeSet targetFilter = pieces.length < 3 || pieces[2].length() == 0 ? new UnicodeSet(ASSIGNED) - : new UnicodeSet(pieces[2]); + UnicodeSet targetFilter = + pieces.length < 3 || pieces[2].length() == 0 + ? new UnicodeSet(ASSIGNED) + : new UnicodeSet(pieces[2]); addMappings(source, target, targetFilter, resultMappings, printWriter); } /** * Add exceptions based on a line from the special mapping files - * + * * @param source * @param target * @param targetFilter * @param resultMappings - * @param printWriter - * TODO + * @param printWriter TODO * @param mappings */ - private static void addMappings(UnicodeSet source, String target, UnicodeSet targetFilter, - UnicodeMap resultMappings, boolean printWriter) { + private static void addMappings( + UnicodeSet source, + String target, + UnicodeSet targetFilter, + UnicodeMap resultMappings, + boolean printWriter) { // remap options if (target.equalsIgnoreCase("delete")) { @@ -597,7 +675,7 @@ private static void addMappings(UnicodeSet source, String target, UnicodeSet tar UnicodeMap deltaMappings = new UnicodeMap(); UnicodeSet done = new UnicodeSet(); - for (UnicodeSetIterator it = new UnicodeSetIterator(affected); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(affected); it.next(); ) { String willGet = getRemapped(it.codepoint, target); if (!targetFilter.containsAll(willGet)) { continue; @@ -626,7 +704,8 @@ private static void addMappings(UnicodeSet source, String target, UnicodeSet tar // //final String caseFolded = UCharacter.foldCase(string, true); // // if (printWriter != null) { - // printWriter.println("\t" + codeAndName(string, Form.codeStringAndName) + "\t;\t" + oldTarget + // printWriter.println("\t" + codeAndName(string, Form.codeStringAndName) + "\t;\t" + + // oldTarget 
// + "\t; #\t" + showChanged(string, didGet, willGet, Form.string)); // } } @@ -638,30 +717,35 @@ private static void addMappings(UnicodeSet source, String target, UnicodeSet tar } enum Form { - string, codeStringAndName + string, + codeStringAndName } - private static String showChanged(final String string, String didGet, String willGet, Form form) { + private static String showChanged( + final String string, String didGet, String willGet, Form form) { final String didGetStr = string.equals(didGet) ? "[unchanged]" : codeAndName(didGet, form); return codeAndName(string, form) - + "\t ↛ " + didGetStr - + "\t → " + codeAndName(willGet, form); + + "\t ↛ " + + didGetStr + + "\t → " + + codeAndName(willGet, form); } static final UnicodeSet BIDI = new UnicodeSet("[[:bidiclass=R:][:bidiclass=AL:]]"); static final UnicodeSet DI = new UnicodeSet("[[:C:][:Default_Ignorable_Code_Point:]]"); - static final UnicodeSet FISHY = new UnicodeSet( - "[\\<\\&\\>\\\"[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]]"); + static final UnicodeSet FISHY = + new UnicodeSet( + "[\\<\\&\\>\\\"[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]]"); private static final String RULES = - "'<' > '<' ;" + - "'&' > '&' ;" + - "'>' > '>' ;" + - "'\"' > '"' ; " + - ":: [[:C:][:Default_Ignorable_Code_Point:]-[\\u0020\\u0009\\u000A\\u000D]] hex/java ; "; + "'<' > '<' ;" + + "'&' > '&' ;" + + "'>' > '>' ;" + + "'\"' > '"' ; " + + ":: [[:C:][:Default_Ignorable_Code_Point:]-[\\u0020\\u0009\\u000A\\u000D]] hex/java ; "; - public static final Transliterator toHTMLControl = Transliterator.createFromRules( - "any-html", RULES, Transliterator.FORWARD); + public static final Transliterator toHTMLControl = + Transliterator.createFromRules("any-html", RULES, Transliterator.FORWARD); private static FrequencyData2 frequencies; @@ -685,23 +769,28 @@ private static String quote(String input) { /** * Printing convenience function - * + * * @param defaultChange * @return */ private static String 
codeAndName(String defaultChange, Form form) { - final String quotedChar = DI.containsAll(defaultChange) ? "«»" : "«" + quote(defaultChange) + "»"; + final String quotedChar = + DI.containsAll(defaultChange) ? "«»" : "«" + quote(defaultChange) + "»"; switch (form) { - default: - return quotedChar; - case codeStringAndName: - return hex(defaultChange, " ") + " " + quotedChar + " " + UCharacter.getName(defaultChange, " + "); + default: + return quotedChar; + case codeStringAndName: + return hex(defaultChange, " ") + + " " + + quotedChar + + " " + + UCharacter.getName(defaultChange, " + "); } } /** * Printing convenience function - * + * * @param spaceDelimitedHex * @return */ @@ -720,13 +809,15 @@ static String fromHex(String spaceDelimitedHex) { /** * Printing convenience function - * + * * @param other * @return */ private static String jimName(String other) { - String otherName = UTF16.countCodePoint(other) != 1 ? null - : (String) JIM_NAMES.getValue(UTF16.charAt(other, 0)); + String otherName = + UTF16.countCodePoint(other) != 1 + ? null + : (String) JIM_NAMES.getValue(UTF16.charAt(other, 0)); if (otherName == null) { otherName = UCharacter.getName(other, " + "); } @@ -735,7 +826,7 @@ private static String jimName(String other) { /** * Printing convenience function - * + * * @param s * @param separator * @return @@ -760,12 +851,13 @@ private static PrintWriter openUTF8Writer(String filename) throws IOException { static BufferedReader openUTF8Reader(String filename) throws IOException { File file = new File(filename); System.out.println("Reading:\t" + file.getCanonicalPath()); - return new BufferedReader(new InputStreamReader(new FileInputStream(file), UTF8), 1024 * 64); + return new BufferedReader( + new InputStreamReader(new FileInputStream(file), UTF8), 1024 * 64); } /** * Used to reformat files into consistent form. 
- * + * * @param sourceFile * @param targetFile * @throws IOException @@ -781,7 +873,7 @@ private static void fixOld(String sourceFile, String targetFile) throws IOExcept } in.close(); PrintWriter out = openUTF8Writer(targetFile); - for (UnicodeSetIterator it = new UnicodeSetIterator(oldMap.keySet()); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(oldMap.keySet()); it.next(); ) { String str = it.getString(); String other = (String) oldMap.getValue(it.codepoint); writeMapping(out, str, other, false); diff --git a/unicodetools/src/main/java/org/unicode/draft/GeneratePickerData2.java b/unicodetools/src/main/java/org/unicode/draft/GeneratePickerData2.java index 42c5082c2..c756120d7 100644 --- a/unicodetools/src/main/java/org/unicode/draft/GeneratePickerData2.java +++ b/unicodetools/src/main/java/org/unicode/draft/GeneratePickerData2.java @@ -1,5 +1,17 @@ package org.unicode.draft; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.LocaleData; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -29,7 +41,6 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.CharacterListCompressor; import org.unicode.cldr.draft.Compacter; import org.unicode.cldr.draft.ScriptMetadata; @@ -41,20 +52,10 @@ import org.unicode.draft.GeneratePickerData2.CategoryTable.Separation; import org.unicode.text.utility.Settings; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.Collator; -import 
com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.LocaleData; -import com.ibm.icu.util.ULocale; - -@CLDRTool(alias = "generate-picker-data", description = "Generate draft.PickerData content", hidden = "generator for draft data") +@CLDRTool( + alias = "generate-picker-data", + description = "Generate draft.PickerData content", + hidden = "generator for draft data") class GeneratePickerData2 { static final boolean DEBUG = true; @@ -79,76 +80,104 @@ class GeneratePickerData2 { private static final String EAST_ASIAN = "Other East Asian Scripts"; - static final UnicodeSet COMPATIBILITY = (UnicodeSet) ScriptCategories2.parseUnicodeSet("[[:nfkcqc=n:]-[:Lm:]]") - .removeAll(ScriptCategories2.IPA).removeAll(ScriptCategories2.IPA_EXTENSIONS).freeze(); + static final UnicodeSet COMPATIBILITY = + (UnicodeSet) + ScriptCategories2.parseUnicodeSet("[[:nfkcqc=n:]-[:Lm:]]") + .removeAll(ScriptCategories2.IPA) + .removeAll(ScriptCategories2.IPA_EXTENSIONS) + .freeze(); - private static final UnicodeSet PRIVATE_USE = (UnicodeSet) new UnicodeSet("[:private use:]").freeze(); + private static final UnicodeSet PRIVATE_USE = + (UnicodeSet) new UnicodeSet("[:private use:]").freeze(); - static final UnicodeSet SKIP = (UnicodeSet) ScriptCategories2.parseUnicodeSet("[[:cn:][:cs:][:co:][:cc:]\uFFFC]") - .addAll(ScriptCategories2.DEPRECATED_NEW).freeze(); - private static final UnicodeSet KNOWN_DUPLICATES = (UnicodeSet) ScriptCategories2.parseUnicodeSet("[:Nd:]").freeze(); + static final UnicodeSet SKIP = + (UnicodeSet) + ScriptCategories2.parseUnicodeSet("[[:cn:][:cs:][:co:][:cc:]\uFFFC]") + .addAll(ScriptCategories2.DEPRECATED_NEW) + .freeze(); + private static final UnicodeSet KNOWN_DUPLICATES = + (UnicodeSet) ScriptCategories2.parseUnicodeSet("[:Nd:]").freeze(); public static final 
UnicodeSet HISTORIC = ScriptCategories2.ARCHAIC; // public static final UnicodeSet UNCOMMON = (UnicodeSet) new // UnicodeSet(ScriptCategories.ARCHAIC).addAll(COMPATIBILITY).freeze(); - private static final UnicodeSet NAMED_CHARACTERS = (UnicodeSet) new UnicodeSet( - "[[:Z:][:default_ignorable_code_point:][:Pd:][:cf:]]").removeAll(SKIP).freeze(); - private static final UnicodeSet MODERN_JAMO = (UnicodeSet) new UnicodeSet( - "[\u1100-\u1112 \u1161-\u1175 \u11A8-\u11C2]").removeAll(SKIP).freeze(); - - private static final UnicodeSet HST_L = (UnicodeSet) ScriptCategories2.parseUnicodeSet("[:HST=L:]").freeze(); - private static final UnicodeSet single = (UnicodeSet) ScriptCategories2.parseUnicodeSet( - "[[:HST=L:][:HST=V:][:HST=T:]]").freeze(); - private static final UnicodeSet syllable = (UnicodeSet) ScriptCategories2.parseUnicodeSet("[[:HST=LV:][:HST=LVT:]]") - .freeze(); - private static final UnicodeSet all = (UnicodeSet) new UnicodeSet(single).addAll(syllable).freeze(); + private static final UnicodeSet NAMED_CHARACTERS = + (UnicodeSet) + new UnicodeSet("[[:Z:][:default_ignorable_code_point:][:Pd:][:cf:]]") + .removeAll(SKIP) + .freeze(); + private static final UnicodeSet MODERN_JAMO = + (UnicodeSet) + new UnicodeSet("[\u1100-\u1112 \u1161-\u1175 \u11A8-\u11C2]") + .removeAll(SKIP) + .freeze(); + + private static final UnicodeSet HST_L = + (UnicodeSet) ScriptCategories2.parseUnicodeSet("[:HST=L:]").freeze(); + private static final UnicodeSet single = + (UnicodeSet) + ScriptCategories2.parseUnicodeSet("[[:HST=L:][:HST=V:][:HST=T:]]").freeze(); + private static final UnicodeSet syllable = + (UnicodeSet) ScriptCategories2.parseUnicodeSet("[[:HST=LV:][:HST=LVT:]]").freeze(); + private static final UnicodeSet all = + (UnicodeSet) new UnicodeSet(single).addAll(syllable).freeze(); static RuleBasedCollator UCA_BASE = (RuleBasedCollator) Collator.getInstance(Locale.ENGLISH); static { UCA_BASE.setNumericCollation(true); } - public static final Comparator CODE_POINT_ORDER = 
new UTF16.StringComparator(true, false, 0); + public static final Comparator CODE_POINT_ORDER = + new UTF16.StringComparator(true, false, 0); static Comparator UCA = new MultilevelComparator(UCA_BASE, CODE_POINT_ORDER); - static Comparator buttonComparator = new MultilevelComparator( - // new UnicodeSetInclusionFirst(ScriptCategories.parseUnicodeSet("[:ascii:]")), - // new UnicodeSetInclusionFirst(ScriptCategories.parseUnicodeSet("[[:Letter:]&[:^NFKC_QuickCheck=N:]]")), - new UnicodeSetInclusionFirst(ScriptCategories2.parseUnicodeSet("[[:Letter:]-[:Lm:]]")), - new UnicodeSetInclusionFirst(ScriptCategories2.parseUnicodeSet("[:Lm:]")), UCA_BASE, CODE_POINT_ORDER); - - static Comparator LinkedHashSetComparator = new Comparator() { - public int compare(String arg0, String arg1) { - throw new IllegalArgumentException(); // only used to signal usage - } - }; - - static Comparator ListComparator = new Comparator() { - public int compare(String arg0, String arg1) { - throw new IllegalArgumentException(); // only used to signal usage - } - }; - - public static final Comparator SORT_ALWAYS = CODE_POINT_ORDER; // null for piecemeal sorting, ENGLISH for - // UCA + static Comparator buttonComparator = + new MultilevelComparator( + // new UnicodeSetInclusionFirst(ScriptCategories.parseUnicodeSet("[:ascii:]")), + // new + // UnicodeSetInclusionFirst(ScriptCategories.parseUnicodeSet("[[:Letter:]&[:^NFKC_QuickCheck=N:]]")), + new UnicodeSetInclusionFirst( + ScriptCategories2.parseUnicodeSet("[[:Letter:]-[:Lm:]]")), + new UnicodeSetInclusionFirst(ScriptCategories2.parseUnicodeSet("[:Lm:]")), + UCA_BASE, + CODE_POINT_ORDER); + + static Comparator LinkedHashSetComparator = + new Comparator() { + public int compare(String arg0, String arg1) { + throw new IllegalArgumentException(); // only used to signal usage + } + }; - static Comparator subCategoryComparator = new Comparator() { - public int compare(String o1, String o2) { - boolean a = o1.startsWith(ARCHAIC_MARKER); - boolean b = 
o2.startsWith(ARCHAIC_MARKER); - if (a != b) { - return a ? 1 : -1; - } - a = o1.startsWith(COMPAT_MARKER); - b = o2.startsWith(COMPAT_MARKER); - if (a != b) { - return a ? 1 : -1; - } - return UCA.compare(o1, o2); - } - }; + static Comparator ListComparator = + new Comparator() { + public int compare(String arg0, String arg1) { + throw new IllegalArgumentException(); // only used to signal usage + } + }; + + public static final Comparator SORT_ALWAYS = + CODE_POINT_ORDER; // null for piecemeal sorting, ENGLISH for + // UCA + + static Comparator subCategoryComparator = + new Comparator() { + public int compare(String o1, String o2) { + boolean a = o1.startsWith(ARCHAIC_MARKER); + boolean b = o2.startsWith(ARCHAIC_MARKER); + if (a != b) { + return a ? 1 : -1; + } + a = o1.startsWith(COMPAT_MARKER); + b = o2.startsWith(COMPAT_MARKER); + if (a != b) { + return a ? 1 : -1; + } + return UCA.compare(o1, o2); + } + }; static CategoryTable CATEGORYTABLE = new CategoryTable(); static Subheader2 subheader; @@ -157,14 +186,17 @@ public int compare(String o1, String o2) { static Renamer renamer; private static PrintWriter renamingLog; - final static Options myOptions = new Options(); + static final Options myOptions = new Options(); enum MyOptions { - output(".*", CLDRPaths.BASE_DIRECTORY + "tools/java/org/unicode/cldr/draft/picker/", - "output data directory"), + output( + ".*", + CLDRPaths.BASE_DIRECTORY + "tools/java/org/unicode/cldr/draft/picker/", + "output data directory"), unicodedata(".*", Settings.CLDR.UCD_DATA_DIRECTORY, "Unicode Data directory"), verbose(null, null, "verbose debugging messages"), - korean(null, null, "generate korean hangul defectives instead"), ; + korean(null, null, "generate korean hangul defectives instead"), + ; // boilerplate final Option option; @@ -180,16 +212,21 @@ public static void main(String[] args) throws Exception { generateHangulDefectives(); return; } - outputDirectory = new 
File(MyOptions.output.option.getValue()).getCanonicalPath() + File.separator; - unicodeDataDirectory = new File(MyOptions.unicodedata.option.getValue()).getCanonicalPath() + File.separator; + outputDirectory = + new File(MyOptions.output.option.getValue()).getCanonicalPath() + File.separator; + unicodeDataDirectory = + new File(MyOptions.unicodedata.option.getValue()).getCanonicalPath() + + File.separator; renamingLog = getFileWriter(outputDirectory, "renamingLog.txt"); renamer = new Renamer("GeneratePickerData.txt"); if (DEBUG) - System.out.println("Whitespace? " - + ScriptCategories2.parseUnicodeSet("[:z:]").equals(ScriptCategories2.parseUnicodeSet("[:whitespace:]"))); + System.out.println( + "Whitespace? " + + ScriptCategories2.parseUnicodeSet("[:z:]") + .equals(ScriptCategories2.parseUnicodeSet("[:whitespace:]"))); buildMainTable(); addEmojiCharacters(); @@ -217,8 +254,13 @@ public static void main(String[] args) throws Exception { } throw new Exception(ERROR_COUNT.size() + " errors above!"); } - System.out.println("Compression\t" + Compacter.totalOld + ",\t" + Compacter.totalNew + ",\t" - + (Compacter.totalNew / Compacter.totalOld)); + System.out.println( + "Compression\t" + + Compacter.totalOld + + ",\t" + + Compacter.totalNew + + ",\t" + + (Compacter.totalNew / Compacter.totalOld)); System.out.println("DONE"); } @@ -239,7 +281,14 @@ public static void writeCategories() throws FileNotFoundException, IOException { final Collection strings = subData.getValue().strings; out.println(main + " ;\t" + sub + " ;\t" + simpleList(strings)); final UnicodeSet uset = new UnicodeSet().addAll(strings); - out2.println(main + " ;\t" + sub + " ;\t" + uset.size() + " ;\t" + uset.toPattern(false)); + out2.println( + main + + " ;\t" + + sub + + " ;\t" + + uset.size() + + " ;\t" + + uset.toPattern(false)); } } out.close(); @@ -270,18 +319,43 @@ private static void buildMainTable() throws IOException { addSymbols(); - addProperty("General_Category", "Category", buttonComparator, - 
ScriptCategories2.parseUnicodeSet("[[:script=common:][:script=inherited:][:N:]" + "-[:letter:]" - + "-[:default_ignorable_code_point:]" + "-[:cf:]" + "-[:whitespace:]" + "-[:So:]" + - // "-[[:M:]-[:script=common:]-[:script=inherited:]]" + - "]")); + addProperty( + "General_Category", + "Category", + buttonComparator, + ScriptCategories2.parseUnicodeSet( + "[[:script=common:][:script=inherited:][:N:]" + + "-[:letter:]" + + "-[:default_ignorable_code_point:]" + + "-[:cf:]" + + "-[:whitespace:]" + + "-[:So:]" + + + // "-[[:M:]-[:script=common:]-[:script=inherited:]]" + + "]")); - CATEGORYTABLE.add("Format & Whitespace", true, "Whitespace", buttonComparator, Separation.AUTOMATIC, - ScriptCategories2.parseUnicodeSet("[:whitespace:]")); - CATEGORYTABLE.add("Format & Whitespace", true, "Format", buttonComparator, Separation.AUTOMATIC, - ScriptCategories2.parseUnicodeSet("[:cf:]")); - CATEGORYTABLE.add("Format & Whitespace", true, "Other", buttonComparator, Separation.AUTOMATIC, - ScriptCategories2.parseUnicodeSet("[[:default_ignorable_code_point:]-[:cf:]-[:whitespace:]]")); + CATEGORYTABLE.add( + "Format & Whitespace", + true, + "Whitespace", + buttonComparator, + Separation.AUTOMATIC, + ScriptCategories2.parseUnicodeSet("[:whitespace:]")); + CATEGORYTABLE.add( + "Format & Whitespace", + true, + "Format", + buttonComparator, + Separation.AUTOMATIC, + ScriptCategories2.parseUnicodeSet("[:cf:]")); + CATEGORYTABLE.add( + "Format & Whitespace", + true, + "Other", + buttonComparator, + Separation.AUTOMATIC, + ScriptCategories2.parseUnicodeSet( + "[[:default_ignorable_code_point:]-[:cf:]-[:whitespace:]]")); addLatin(); Set EuropeanMinusLatin = new TreeSet(ScriptCategories2.EUROPEAN); @@ -304,26 +378,33 @@ private static void addHan() throws IOException { UnicodeSet others = ScriptCategories2.parseUnicodeSet("[:script=Han:]"); // find base values - for (int radicalStrokes : RadicalStroke2.SINGLETON.radStrokesToRadToRemainingStrokes.keySet()) { + for (int radicalStrokes : + 
RadicalStroke2.SINGLETON.radStrokesToRadToRemainingStrokes.keySet()) { // String mainCat = null; - Map> char2RemStrokes2Set = RadicalStroke2.SINGLETON.radStrokesToRadToRemainingStrokes - .get(radicalStrokes); + Map> char2RemStrokes2Set = + RadicalStroke2.SINGLETON.radStrokesToRadToRemainingStrokes.get(radicalStrokes); for (String radical : char2RemStrokes2Set.keySet()) { Map remStrokes2Set = char2RemStrokes2Set.get(radical); for (int remStrokes : remStrokes2Set.keySet()) { int radicalChar = ScriptCategories2.RADICAL_NUM2CHAR.get(radical); - String mainCat = "Han " + (radicalStrokes > 10 ? "11..17" : String.valueOf(radicalStrokes)) - + "-Stroke Radicals"; + String mainCat = + "Han " + + (radicalStrokes > 10 + ? "11..17" + : String.valueOf(radicalStrokes)) + + "-Stroke Radicals"; String subCat = UTF16.valueOf(radicalChar); // if (DEBUG) System.out.println(radical + " => " + radicalToChar.get(radical)); // String radChar = getRadicalName(radicalToChar, radical); // String subCat = radChar + " Han"; // try { - // String radical2 = radical.endsWith("'") ? radical.substring(0, radical.length() - 1) : radical; + // String radical2 = radical.endsWith("'") ? 
radical.substring(0, + // radical.length() - 1) : radical; // int x = Integer.parseInt(radical2); // int base = (x / 20) * 20; // int top = base + 19; - // mainCat = "CJK (Han) " + getRadicalName(radicalToChar, Math.max(base,1)) + " - " + + // mainCat = "CJK (Han) " + getRadicalName(radicalToChar, Math.max(base,1)) + " + // - " + // getRadicalName(radicalToChar, Math.min(top,214)); // } catch (Exception e) {} // if (mainCat == null) { @@ -334,7 +415,9 @@ private static void addHan() throws IOException { final UnicodeSet values = remStrokes2Set.get(remStrokes); // close over NFKC - for (UnicodeSetIterator it = new UnicodeSetIterator(RadicalStroke2.SINGLETON.remainder); it.next();) { + for (UnicodeSetIterator it = + new UnicodeSetIterator(RadicalStroke2.SINGLETON.remainder); + it.next(); ) { String nfkc = Normalizer.normalize(it.codepoint, Normalizer.NFKC); if (values.contains(nfkc)) { values.add(it.codepoint); @@ -342,24 +425,43 @@ private static void addHan() throws IOException { } others.removeAll(values); - UnicodeSet normal = new UnicodeSet(values).removeAll(GeneratePickerData2.HISTORIC) - .removeAll(GeneratePickerData2.COMPATIBILITY).removeAll(GeneratePickerData2.UNCOMMON_HAN); - GeneratePickerData2.CATEGORYTABLE.add(mainCat, true, subCat, - GeneratePickerData2.LinkedHashSetComparator, Separation.AUTOMATIC, normal); + UnicodeSet normal = + new UnicodeSet(values) + .removeAll(GeneratePickerData2.HISTORIC) + .removeAll(GeneratePickerData2.COMPATIBILITY) + .removeAll(GeneratePickerData2.UNCOMMON_HAN); + GeneratePickerData2.CATEGORYTABLE.add( + mainCat, + true, + subCat, + GeneratePickerData2.LinkedHashSetComparator, + Separation.AUTOMATIC, + normal); values.removeAll(normal); - GeneratePickerData2.CATEGORYTABLE.add(mainCat, true, "Other", - GeneratePickerData2.LinkedHashSetComparator, Separation.AUTOMATIC, values); + GeneratePickerData2.CATEGORYTABLE.add( + mainCat, + true, + "Other", + GeneratePickerData2.LinkedHashSetComparator, + Separation.AUTOMATIC, + 
values); } } } - GeneratePickerData2.CATEGORYTABLE.add("Han - Other", true, "Other", GeneratePickerData2.LinkedHashSetComparator, - Separation.AUTOMATIC, others); + GeneratePickerData2.CATEGORYTABLE.add( + "Han - Other", + true, + "Other", + GeneratePickerData2.LinkedHashSetComparator, + Separation.AUTOMATIC, + others); GeneratePickerData2.UNCOMMON_HAN.removeAll(RadicalStroke2.SINGLETON.iiCoreSet); - } - static UnicodeSet LATIN = (UnicodeSet) ScriptCategories2.parseUnicodeSet("[:script=Latin:]").freeze(); + static UnicodeSet LATIN = + (UnicodeSet) ScriptCategories2.parseUnicodeSet("[:script=Latin:]").freeze(); static Set SKIP_LOCALES = new HashSet(); + static { SKIP_LOCALES.add("kl"); SKIP_LOCALES.add("eo"); @@ -384,16 +486,26 @@ private static void addLatin() { } UnicodeSet diff = new UnicodeSet(exemplarSet).removeAll(LATIN); if (!diff.isEmpty()) { - System.out.println(loc + " Latin: " + new UnicodeSet(exemplarSet).retainAll(LATIN).toPattern(false)); + System.out.println( + loc + + " Latin: " + + new UnicodeSet(exemplarSet).retainAll(LATIN).toPattern(false)); while (!diff.isEmpty()) { String first = diff.iterator().next(); int script = UScript.getScript(first.codePointAt(0)); - UnicodeSet scriptSet = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script) - .retainAll(diff).add(first); + UnicodeSet scriptSet = + new UnicodeSet() + .applyIntPropertyValue(UProperty.SCRIPT, script) + .retainAll(diff) + .add(first); diff.removeAll(scriptSet).remove(first); if (script != UScript.INHERITED) { - System.out.println(loc + " Latin with : " + UScript.getName(script) + ", " - + scriptSet.toPattern(false)); + System.out.println( + loc + + " Latin with : " + + UScript.getName(script) + + ", " + + scriptSet.toPattern(false)); } } } @@ -403,31 +515,55 @@ private static void addLatin() { closeOver(closed).retainAll(ScriptCategories2.parseUnicodeSet("[[:L:][:M:]-[:nfkcqc=n:]]")); System.out.println("Exemplars: " + closed); - final UnicodeSet common = ScriptCategories2 - 
.parseUnicodeSet("[[aáàăâấầåäãąāảạậ æ b c ćčç dđð eéèêếềểěëėęēệ ə f ƒ gğ h iíìî ïįīị ı j-lľļł m nńňñņ oóòô ốồổöőõøơớờởợọộ œ p-rř s śšş tťţ uúùûůüűųūủưứữ ựụ v-yýÿ zźžż þ]]"); + final UnicodeSet common = + ScriptCategories2.parseUnicodeSet( + "[[aáàăâấầåäãąāảạậ æ b c ćčç dđð eéèêếềểěëėęēệ ə f ƒ gğ h iíìî ïįīị ı j-lľļł m nńňñņ oóòô ốồổöőõøơớờởợọộ œ p-rř s śšş tťţ uúùûůüűųūủưứữ ựụ v-yýÿ zźžż þ]]"); closeOver(common).retainAll(ScriptCategories2.parseUnicodeSet("[[:L:][:M:]-[:nfkcqc=n:]]")); System.out.println("Common: " + common); exemplars.retainAll(ScriptCategories2.parseUnicodeSet("[[:L:][:M:]-[:nfkcqc=n:]]")); - CATEGORYTABLE.add("Latin", true, "Common", buttonComparator, Separation.ALL_ORDINARY, exemplars); - CATEGORYTABLE.add("Latin", true, "Phonetics (IPA)", buttonComparator, Separation.ALL_ORDINARY, - ScriptCategories2.IPA); - CATEGORYTABLE.add("Latin", true, "Phonetics (X-IPA)", buttonComparator, Separation.ALL_ORDINARY, - ScriptCategories2.IPA_EXTENSIONS); - String flipped = "ɒdɔbɘᎸǫʜiꞁʞlmnoqpɿƨƚuvwxʏƹ؟" + "AᙠƆᗡƎꟻᎮHIႱᐴᏗMИOꟼϘЯƧTUVWXYƸ" + "ɐqɔpǝɟɓɥɪſʞ1ɯuodbɹsʇnʌʍxʎz¿" - + "∀ᙠƆᗡƎℲ⅁HIΓᐴ⅂ꟽNOԀÓᴚƧ⊥ȠɅM⅄Z"; - CATEGORYTABLE.add("Latin", true, "Flipped/Mirrored", ListComparator, Separation.ALL_ORDINARY, flipped); CATEGORYTABLE.add( - "Latin", - true, - "Other", - buttonComparator, - Separation.AUTOMATIC, - ScriptCategories2.parseUnicodeSet("[:script=Latin:]").removeAll(ScriptCategories2.SCRIPT_CHANGED) - .addAll(ScriptCategories2.SCRIPT_NEW.get("Latin")).removeAll(ScriptCategories2.IPA) - .removeAll(ScriptCategories2.IPA_EXTENSIONS).removeAll(exemplars)); + "Latin", true, "Common", buttonComparator, Separation.ALL_ORDINARY, exemplars); + CATEGORYTABLE.add( + "Latin", + true, + "Phonetics (IPA)", + buttonComparator, + Separation.ALL_ORDINARY, + ScriptCategories2.IPA); + CATEGORYTABLE.add( + "Latin", + true, + "Phonetics (X-IPA)", + buttonComparator, + Separation.ALL_ORDINARY, + ScriptCategories2.IPA_EXTENSIONS); + String flipped = + "ɒdɔbɘᎸǫʜiꞁʞlmnoqpɿƨƚuvwxʏƹ؟" + + 
"AᙠƆᗡƎꟻᎮHIႱᐴᏗMИOꟼϘЯƧTUVWXYƸ" + + "ɐqɔpǝɟɓɥɪſʞ1ɯuodbɹsʇnʌʍxʎz¿" + + "∀ᙠƆᗡƎℲ⅁HIΓᐴ⅂ꟽNOԀÓᴚƧ⊥ȠɅM⅄Z"; + CATEGORYTABLE.add( + "Latin", + true, + "Flipped/Mirrored", + ListComparator, + Separation.ALL_ORDINARY, + flipped); + CATEGORYTABLE.add( + "Latin", + true, + "Other", + buttonComparator, + Separation.AUTOMATIC, + ScriptCategories2.parseUnicodeSet("[:script=Latin:]") + .removeAll(ScriptCategories2.SCRIPT_CHANGED) + .addAll(ScriptCategories2.SCRIPT_NEW.get("Latin")) + .removeAll(ScriptCategories2.IPA) + .removeAll(ScriptCategories2.IPA_EXTENSIONS) + .removeAll(exemplars)); } private static UnicodeSet closeOver(UnicodeSet closed) { @@ -447,12 +583,18 @@ private static UnicodeSet closeOver(UnicodeSet closed) { private static void addAndNoteNew(ULocale title, UnicodeSet toAddTo, final UnicodeSet toAdd) { flatten(toAdd); if (toAddTo.containsAll(toAdd)) return; - System.out.println("Adding Common\t" + title.getDisplayName() + "\t" + title.toString() + "\t" - + new UnicodeSet(toAdd).removeAll(toAddTo).toPattern(false)); + System.out.println( + "Adding Common\t" + + title.getDisplayName() + + "\t" + + title.toString() + + "\t" + + new UnicodeSet(toAdd).removeAll(toAddTo).toPattern(false)); toAddTo.addAll(toAdd); } - private static void writeMainFile(String directory, String categoryTable) throws IOException, FileNotFoundException { + private static void writeMainFile(String directory, String categoryTable) + throws IOException, FileNotFoundException { PrintWriter out = getFileWriter(directory, "CharData.java"); out.println("package org.unicode.cldr.draft.picker;"); out.println("public class CharData {"); @@ -465,10 +607,13 @@ private static void writeMainFile(String directory, String categoryTable) throws out.close(); } - static PrintWriter getFileWriter(String directory, String filename) throws IOException, FileNotFoundException { + static PrintWriter getFileWriter(String directory, String filename) + throws IOException, FileNotFoundException { File f = new File(directory, 
filename); System.out.println("Writing: " + f.getCanonicalFile()); - PrintWriter out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(f), Charset.forName("UTF-8"))); + PrintWriter out = + new PrintWriter( + new OutputStreamWriter(new FileOutputStream(f), Charset.forName("UTF-8"))); return out; } @@ -485,51 +630,78 @@ static PrintWriter getFileWriter(String directory, String filename) throws IOExc // } private static void addSymbols() { - final UnicodeSet symbolsMinusScripts = ScriptCategories2 - .parseUnicodeSet("[[[:script=common:][:script=inherited:]]&[[:S:][:Letter:]]]"); + final UnicodeSet symbolsMinusScripts = + ScriptCategories2.parseUnicodeSet( + "[[[:script=common:][:script=inherited:]]&[[:S:][:Letter:]]]"); if (true) { System.out.println("***Contains:" + symbolsMinusScripts.contains(0x3192)); } final UnicodeSet math = ScriptCategories2.parseUnicodeSet("[:math:]"); - final UnicodeSet superscripts = ScriptCategories2.parseUnicodeSet("[[:dt=super:]-[:block=kanbun:]]"); + final UnicodeSet superscripts = + ScriptCategories2.parseUnicodeSet("[[:dt=super:]-[:block=kanbun:]]"); final UnicodeSet subscripts = ScriptCategories2.parseUnicodeSet("[:dt=sub:]"); - UnicodeSet skip = new UnicodeSet().addAll(math).addAll(superscripts).addAll(subscripts) - .retainAll(COMPATIBILITY); + UnicodeSet skip = + new UnicodeSet() + .addAll(math) + .addAll(superscripts) + .addAll(subscripts) + .retainAll(COMPATIBILITY); - for (int i = UCharacter.getIntPropertyMinValue(UProperty.GENERAL_CATEGORY); i <= UCharacter - .getIntPropertyMaxValue(UProperty.GENERAL_CATEGORY); ++i) { - String valueAlias = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, i, - UProperty.NameChoice.LONG); + for (int i = UCharacter.getIntPropertyMinValue(UProperty.GENERAL_CATEGORY); + i <= UCharacter.getIntPropertyMaxValue(UProperty.GENERAL_CATEGORY); + ++i) { + String valueAlias = + UCharacter.getPropertyValueName( + UProperty.GENERAL_CATEGORY, i, UProperty.NameChoice.LONG); UnicodeSet temp 
= new UnicodeSet(); ScriptCategories2.applyPropertyAlias("General Category", valueAlias, temp); - for (UnicodeSetIterator it = new UnicodeSetIterator(temp.retainAll(symbolsMinusScripts).removeAll(skip)); it - .next();) { - String block = UCharacter.getStringPropertyValue(UProperty.BLOCK, it.codepoint, - UProperty.NameChoice.LONG).toString(); - CATEGORYTABLE.add("Symbol", true, block + "@" + valueAlias, buttonComparator, Separation.AUTOMATIC, - it.codepoint, it.codepoint); + for (UnicodeSetIterator it = + new UnicodeSetIterator( + temp.retainAll(symbolsMinusScripts).removeAll(skip)); + it.next(); ) { + String block = + UCharacter.getStringPropertyValue( + UProperty.BLOCK, it.codepoint, UProperty.NameChoice.LONG) + .toString(); + CATEGORYTABLE.add( + "Symbol", + true, + block + "@" + valueAlias, + buttonComparator, + Separation.AUTOMATIC, + it.codepoint, + it.codepoint); } } CATEGORYTABLE.add("Symbol", true, "Math", CODE_POINT_ORDER, Separation.ALL_ORDINARY, math); - CATEGORYTABLE.add("Symbol", true, "Superscript", buttonComparator, Separation.ALL_ORDINARY, superscripts); - CATEGORYTABLE.add("Symbol", true, "Subscript", buttonComparator, Separation.ALL_ORDINARY, subscripts); - + CATEGORYTABLE.add( + "Symbol", + true, + "Superscript", + buttonComparator, + Separation.ALL_ORDINARY, + superscripts); + CATEGORYTABLE.add( + "Symbol", true, "Subscript", buttonComparator, Separation.ALL_ORDINARY, subscripts); } private static void generateHangulDefectives() { for (int atomic = 0; atomic < 2; ++atomic) { for (int modern = 0; modern < 2; ++modern) { - for (char c : new char[] { 'L', 'V', 'T' }) { + for (char c : new char[] {'L', 'V', 'T'}) { UnicodeSet uset = new UnicodeSet(); - for (UnicodeSetIterator it = new UnicodeSetIterator(ScriptCategories2.parseUnicodeSet("[:HST=" + c - + ":]")); it.next();) { + for (UnicodeSetIterator it = + new UnicodeSetIterator( + ScriptCategories2.parseUnicodeSet("[:HST=" + c + ":]")); + it.next(); ) { if 
(UCharacter.getName(it.codepoint).contains("FILLER")) continue; String s = it.getString(); String d = MKD.transform(s); - if (s.equals(d) == (atomic == 1) && MODERN_JAMO.contains(it.codepoint) == (modern == 1)) { + if (s.equals(d) == (atomic == 1) + && MODERN_JAMO.contains(it.codepoint) == (modern == 1)) { uset.add(it.codepoint); } } @@ -563,7 +735,7 @@ private static void generateHangulDefectives() { System.out.println("testing roundtrip"); // test roundtrip - for (UnicodeSetIterator it = new UnicodeSetIterator(all); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(all); it.next(); ) { final String a = it.getString(); String b = MKD.transform(a); String c = MKC.transform(b); @@ -576,8 +748,10 @@ private static void generateHangulDefectives() { Map decomp2comp = new HashMap(); System.out.println("find defectives"); - for (UnicodeSetIterator it = new UnicodeSetIterator(ScriptCategories2.parseUnicodeSet("[:script=Hangul:]")); it - .next();) { + for (UnicodeSetIterator it = + new UnicodeSetIterator( + ScriptCategories2.parseUnicodeSet("[:script=Hangul:]")); + it.next(); ) { final String comp = it.getString(); String decomp = MKD.transform(comp); decomp2comp.put(decomp, comp); @@ -596,19 +770,19 @@ private static void generateHangulDefectives() { System.out.println("testing single+all"); - for (UnicodeSetIterator it = new UnicodeSetIterator(single); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(single); it.next(); ) { final String a = it.getString(); System.out.println(a); - for (UnicodeSetIterator it2 = new UnicodeSetIterator(all); it2.next();) { + for (UnicodeSetIterator it2 = new UnicodeSetIterator(all); it2.next(); ) { final String b = it2.getString(); checkPair(a, b, count); } } System.out.println("testing syllable+single"); - for (UnicodeSetIterator it = new UnicodeSetIterator(syllable); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(syllable); it.next(); ) { final String a = it.getString(); 
System.out.println(a); - for (UnicodeSetIterator it2 = new UnicodeSetIterator(single); it2.next();) { + for (UnicodeSetIterator it2 = new UnicodeSetIterator(single); it2.next(); ) { final String b = it2.getString(); checkPair(a, b, count); } @@ -634,16 +808,29 @@ private static void checkPair(final String a, final String b, int[] count) { } public static String codeAndName(String comp) { - return toHex(comp, false) + "(" + comp + ")" + UCharacter.getExtendedName(comp.codePointAt(0)); + return toHex(comp, false) + + "(" + + comp + + ")" + + UCharacter.getExtendedName(comp.codePointAt(0)); } private static void addHangul() { - for (UnicodeSetIterator it = new UnicodeSetIterator(ScriptCategories2.parseUnicodeSet("[:script=Hangul:]") - .removeAll(SKIP)); it.next();) { + for (UnicodeSetIterator it = + new UnicodeSetIterator( + ScriptCategories2.parseUnicodeSet("[:script=Hangul:]") + .removeAll(SKIP)); + it.next(); ) { String str = it.getString(); if (ScriptCategories2.ARCHAIC.contains(it.codepoint)) { - CATEGORYTABLE.add("Hangul", true, "Archaic Hangul", buttonComparator, Separation.AUTOMATIC, - it.codepoint, it.codepoint); + CATEGORYTABLE.add( + "Hangul", + true, + "Archaic Hangul", + buttonComparator, + Separation.AUTOMATIC, + it.codepoint, + it.codepoint); continue; } String s = MKKD.transform(str); @@ -651,25 +838,49 @@ private static void addHangul() { if (decompCodePoint1 == '(') { decompCodePoint1 = s.codePointAt(1); } - if (!HST_L.contains(decompCodePoint1) || it.codepoint == 0x115F || it.codepoint == 0x1160) { - CATEGORYTABLE.add("Hangul", true, "Other", buttonComparator, Separation.AUTOMATIC, it.codepoint); + if (!HST_L.contains(decompCodePoint1) + || it.codepoint == 0x115F + || it.codepoint == 0x1160) { + CATEGORYTABLE.add( + "Hangul", + true, + "Other", + buttonComparator, + Separation.AUTOMATIC, + it.codepoint); continue; } if (COMPATIBILITY.contains(it.codepoint)) { - CATEGORYTABLE - .add("Hangul", true, "Compatibility", buttonComparator, 
Separation.AUTOMATIC, it.codepoint); + CATEGORYTABLE.add( + "Hangul", + true, + "Compatibility", + buttonComparator, + Separation.AUTOMATIC, + it.codepoint); continue; } - CATEGORYTABLE.add("Hangul", true, - UTF16.valueOf(decompCodePoint1) + " " + UCharacter.getExtendedName(decompCodePoint1), buttonComparator, - Separation.AUTOMATIC, it.codepoint); + CATEGORYTABLE.add( + "Hangul", + true, + UTF16.valueOf(decompCodePoint1) + + " " + + UCharacter.getExtendedName(decompCodePoint1), + buttonComparator, + Separation.AUTOMATIC, + it.codepoint); } } private static String buildNames() { StringBuilder result = new StringBuilder(); - for (UnicodeSetIterator it = new UnicodeSetIterator(NAMED_CHARACTERS); it.next();) { - result.append("{\"" + it.getString() + "\",\"" + UCharacter.getExtendedName(it.codepoint) + "\"},\n"); + for (UnicodeSetIterator it = new UnicodeSetIterator(NAMED_CHARACTERS); it.next(); ) { + result.append( + "{\"" + + it.getString() + + "\",\"" + + UCharacter.getExtendedName(it.codepoint) + + "\"},\n"); } return result.toString(); } @@ -711,16 +922,26 @@ public String toString() { static class CategoryTable { enum Separation { - AUTOMATIC, ALL_UNCOMMON, ALL_HISTORIC, ALL_COMPATIBILITY, ALL_ORDINARY - } - - static Map> categoryTable = // new TreeMap>(ENGLISH); // - new LinkedHashMap>(); - - public void add(String category, boolean sortSubcategory, String subcategory, Comparator sortValues, - Separation separateOld, UnicodeSet values) { - for (UnicodeSetIterator it = new UnicodeSetIterator(values); it.next();) { + AUTOMATIC, + ALL_UNCOMMON, + ALL_HISTORIC, + ALL_COMPATIBILITY, + ALL_ORDINARY + } + + static Map> + categoryTable = // new TreeMap>(ENGLISH); // + new LinkedHashMap>(); + + public void add( + String category, + boolean sortSubcategory, + String subcategory, + Comparator sortValues, + Separation separateOld, + UnicodeSet values) { + for (UnicodeSetIterator it = new UnicodeSetIterator(values); it.next(); ) { add(category, sortSubcategory, subcategory, 
sortValues, separateOld, it.codepoint); } } @@ -729,7 +950,9 @@ public Collection getLocalizations() { TreeSet result = new TreeSet(); result.add("variation selector-PLACEHOLDER"); for (String cp : new UnicodeSet(NAMED_CHARACTERS).removeAll(SKIPNAMES)) { - addNames(UCharacter.toLowerCase(UCharacter.getExtendedName(cp.codePointAt(0))), result); + addNames( + UCharacter.toLowerCase(UCharacter.getExtendedName(cp.codePointAt(0))), + result); } for (String category : categoryTable.keySet()) { addNames(category, result); @@ -771,23 +994,45 @@ private void addNames(String name, Collection result) { result.add(name); } - public void add(String category, boolean sortSubcategory, String subcategory, Comparator sortValues, - Separation separateOld, String values) { + public void add( + String category, + boolean sortSubcategory, + String subcategory, + Comparator sortValues, + Separation separateOld, + String values) { int cp; for (int i = 0; i < values.length(); i += UTF16.getCharCount(cp)) { - add(category, sortSubcategory, subcategory, sortValues, separateOld, cp = values.charAt(i)); - } - } - - public void add(String category, boolean sortSubcategory, String subcategory, Comparator sortValues, - Separation separateOld, int startCodePoint, int endCodePoint) { + add( + category, + sortSubcategory, + subcategory, + sortValues, + separateOld, + cp = values.charAt(i)); + } + } + + public void add( + String category, + boolean sortSubcategory, + String subcategory, + Comparator sortValues, + Separation separateOld, + int startCodePoint, + int endCodePoint) { for (int i = startCodePoint; i <= endCodePoint; ++i) { add(category, sortSubcategory, subcategory, sortValues, separateOld, i); } } - public void add(String category, boolean sortSubcategory, String subcategory, Comparator sortValues, - Separation separateOld, int codepoint) { + public void add( + String category, + boolean sortSubcategory, + String subcategory, + Comparator sortValues, + Separation separateOld, + int 
codepoint) { // if (ADD_SUBHEAD.contains(codepoint)) String subhead = subheader.getSubheader(codepoint); @@ -807,18 +1052,18 @@ public void add(String category, boolean sortSubcategory, String subcategory, Co } } switch (separateOld) { - case ALL_HISTORIC: - prefix = ARCHAIC_MARKER; - sortValues = CODE_POINT_ORDER; - break; - case ALL_COMPATIBILITY: - prefix = COMPAT_MARKER; - sortValues = CODE_POINT_ORDER; - break; - case ALL_UNCOMMON: - prefix = LESS_COMMON_MARKER; - sortValues = CODE_POINT_ORDER; - break; + case ALL_HISTORIC: + prefix = ARCHAIC_MARKER; + sortValues = CODE_POINT_ORDER; + break; + case ALL_COMPATIBILITY: + prefix = COMPAT_MARKER; + sortValues = CODE_POINT_ORDER; + break; + case ALL_UNCOMMON: + prefix = LESS_COMMON_MARKER; + sortValues = CODE_POINT_ORDER; + break; } SimplePair names = renamer.rename(category, prefix + subcategory); @@ -831,9 +1076,14 @@ public void add(String category, boolean sortSubcategory, String subcategory, Co CATEGORYTABLE.add2(mainCategory, sortSubcategory, subCategory, sortValues, codepoint); } - private void add2(String category, boolean sortSubcategory, String subcategory, Comparator sortValues, - int codePoint) { - GeneratePickerData2.USet oldValue = getValues(category, sortSubcategory, subcategory, sortValues); + private void add2( + String category, + boolean sortSubcategory, + String subcategory, + Comparator sortValues, + int codePoint) { + GeneratePickerData2.USet oldValue = + getValues(category, sortSubcategory, subcategory, sortValues); if (!SKIP.contains(codePoint)) { oldValue.strings.add(UTF16.valueOf(codePoint)); } @@ -843,7 +1093,9 @@ public void removeAll(String category, String subcategory, UnicodeSet values) { Map sub = addMainCategory(category); GeneratePickerData2.USet oldValue = sub.get(subcategory); if (oldValue != null) { - System.out.println(oldValue.strings.removeAll(addAllToCollection(values, new HashSet()))); + System.out.println( + oldValue.strings.removeAll( + addAllToCollection(values, new 
HashSet()))); } } @@ -868,20 +1120,25 @@ public void removeAll(UnicodeSet values) { // } // } - public void remove(String category, String subcategory, int startCodePoint, int endCodePoint) { + public void remove( + String category, String subcategory, int startCodePoint, int endCodePoint) { removeAll(category, subcategory, new UnicodeSet(startCodePoint, endCodePoint)); } public Map addMainCategory(String mainCategory) { Map sub = categoryTable.get(mainCategory); if (sub == null) { - categoryTable.put(mainCategory, sub = new TreeMap(UCA)); + categoryTable.put( + mainCategory, sub = new TreeMap(UCA)); } return sub; } - private GeneratePickerData2.USet getValues(String category, boolean sortSubcategory, String subcategory, - Comparator sortValues) { + private GeneratePickerData2.USet getValues( + String category, + boolean sortSubcategory, + String subcategory, + Comparator sortValues) { Map sub = addMainCategory(category); GeneratePickerData2.USet oldValue = sub.get(subcategory); if (oldValue == null) { @@ -899,14 +1156,20 @@ public String toString() { static final UnicodeSet DEPRECATED = new UnicodeSet("[:deprecated:]").freeze(); static final UnicodeSet CONTROLS = new UnicodeSet("[[:cc:]]").freeze(); - public String toString(boolean displayData, String localDataDirectory) throws FileNotFoundException, - IOException { - UnicodeSet missing = new UnicodeSet(0, 0x10FFFF).removeAll(Typology.SKIP) - .removeAll(DEPRECATED) - .removeAll(CONTROLS); + public String toString(boolean displayData, String localDataDirectory) + throws FileNotFoundException, IOException { + UnicodeSet missing = + new UnicodeSet(0, 0x10FFFF) + .removeAll(Typology.SKIP) + .removeAll(DEPRECATED) + .removeAll(CONTROLS); PrintWriter htmlChart = getFileWriter(localDataDirectory, "index.html"); - writeHtmlHeader(htmlChart, localDataDirectory, null, "main", - "p {font-size:100%; margin:0; margin-left:1em; text-indent:-1em;}"); + writeHtmlHeader( + htmlChart, + localDataDirectory, + null, + "main", + "p 
{font-size:100%; margin:0; margin-left:1em; text-indent:-1em;}"); writePageIndex(htmlChart, categoryTable.keySet()); int totalChars = 0, totalCompressed = 0; @@ -915,11 +1178,13 @@ public String toString(boolean displayData, String localDataDirectory) throws Fi StringBuilder result = new StringBuilder(); for (String category : categoryTable.keySet()) { Map sub = categoryTable.get(category); - htmlChart = openChart(htmlChart, localDataDirectory, category, categoryTable.keySet()); + htmlChart = + openChart(htmlChart, localDataDirectory, category, categoryTable.keySet()); result.append("{{\"" + category + "\"},\n"); // clean up results - for (Iterator subcategoryIterator = sub.keySet().iterator(); subcategoryIterator.hasNext();) { + for (Iterator subcategoryIterator = sub.keySet().iterator(); + subcategoryIterator.hasNext(); ) { String subcategory = subcategoryIterator.next(); GeneratePickerData2.USet valueChars = sub.get(subcategory); if (valueChars.strings.isEmpty()) { @@ -937,21 +1202,38 @@ public String toString(boolean displayData, String localDataDirectory) throws Fi labelString.append(" ‧ "); } UnicodeSet labelSet = Typology.getSet(s); - labelString.append(getUnicodeSetUrl(s, labelSet) + percentSuperscript(set, labelSet)); + labelString.append( + getUnicodeSetUrl(s, labelSet) + percentSuperscript(set, labelSet)); } - htmlChart.println("" - + - // "" + - "" + "" + "" - + "\n" + "" + "" + ""); - String valueCharsString = addResult(result, valueChars, category, subcategory, displayData); + htmlChart.println( + "" + + + // "" + + "" + + "" + + "" + + "\n" + + "" + + "" + + ""); + String valueCharsString = + addResult(result, valueChars, category, subcategory, displayData); totalChars += utf8Length(valueChars.strings); totalCompressed += utf8Length(valueCharsString); // if (valueChars.set.size() > 1000) { - // System.out.println("//Big class: " + category + MAIN_SUB_SEPARATOR + subcategory + + // System.out.println("//Big class: " + category + MAIN_SUB_SEPARATOR + + 
// subcategory + // MAIN_SUBSUB_SEPARATOR + valueChars.set.size()); // } UnicodeSet dups = new UnicodeSet(soFar); @@ -1008,13 +1290,18 @@ private String percentSuperscript(UnicodeSet set, UnicodeSet labelSet) { if (set.containsAll(labelSet)) return ""; UnicodeSet inSet = new UnicodeSet(labelSet).retainAll(set); UnicodeSet outSet = new UnicodeSet(labelSet).removeAll(set); - String result = " " + getUnicodeSetUrl(String.valueOf(inSet.size()), inSet) + ":" - + getUnicodeSetUrl(String.valueOf(outSet.size()), outSet) + ""; + String result = + " " + + getUnicodeSetUrl(String.valueOf(inSet.size()), inSet) + + ":" + + getUnicodeSetUrl(String.valueOf(outSet.size()), outSet) + + ""; return result; } private String getUnicodeSetUrl(UnicodeSet set) { - return "http://unicode.org/cldr/utility/list-unicodeset.jsp?a=" + fixURL(set.toPattern(false)); + return "http://unicode.org/cldr/utility/list-unicodeset.jsp?a=" + + fixURL(set.toPattern(false)); } private String fixURL(String string) { @@ -1038,17 +1325,22 @@ private String fixHtml(Collection strings) { return result.toString(); } - private PrintWriter openChart(PrintWriter htmlChart, String localDataDirectory, - String category, Set set) - throws IOException, FileNotFoundException { + private PrintWriter openChart( + PrintWriter htmlChart, String localDataDirectory, String category, Set set) + throws IOException, FileNotFoundException { if (htmlChart != null) { htmlChart = writeHtmlFooterAndClose(htmlChart); } if (category != null) { String fileNameFromCategory = fileNameFromCategory(category); htmlChart = getFileWriter(localDataDirectory, fileNameFromCategory); - htmlChart = writeHtmlHeader(htmlChart, localDataDirectory, category, "main", - "table, th, td {border-collapse:collapse; border:1px solid blue;}"); + htmlChart = + writeHtmlHeader( + htmlChart, + localDataDirectory, + category, + "main", + "table, th, td {border-collapse:collapse; border:1px solid blue;}"); writeCategoryH1(htmlChart, category); htmlChart.println("

Index

"); htmlChart.println("
"; + String topCell = ""; String underStart = " "; String underEnd = " "; UnicodeSet nsm = new UnicodeSet("[[:Mn:][:Me:]]"); - result.append("\n"); + result.append( + "

Confusable Characters

\n"); for (Collection items : alternates) { result.append(""); for (String item : items) { @@ -315,7 +347,7 @@ private static String getConfusablesCore(String test, Confusables confusables) { result.append("

Total raw values: " + nf.format(maxSize) + "

\n"); if (maxSize > 1000000) { - result.append( "

Too many raw items to process.

\n"); + result.append("

Too many raw items to process.

\n"); return result.toString(); } @@ -339,10 +371,12 @@ private static String getConfusablesCore(String test, Confusables confusables) { result.append("

Total filtered values: " + nf.format(count) + "

\n"); if (count > 1000) { - result.append("

Too many filtered items to display; truncating to 1,000.

\n"); + result.append( + "

Too many filtered items to display; truncating to 1,000.

\n"); } return result.toString(); } + public static String testIdnaLines(String lines, String filter) { return UnicodeUtilities.testIdnaLines(lines, filter); } @@ -351,23 +385,35 @@ public static String getIdentifier(String script) { return UnicodeUtilities.getIdentifier(script); } - static final String VERSIONS = "Version 3.9; " - + "ICU version: " + VersionInfo.ICU_VERSION.getVersionString(2, 2) + "; " - + "Unicode/Emoji version: " + UCharacter.getUnicodeVersion().getVersionString(2, 2) + "; " - + (CachedProps.IS_BETA ? "Unicodeβ version: " + CachedProps.CACHED_PROPS.version.getVersionString(2, 2) + "; " : ""); + static final String VERSIONS = + "Version 3.9; " + + "ICU version: " + + VersionInfo.ICU_VERSION.getVersionString(2, 2) + + "; " + + "Unicode/Emoji version: " + + UCharacter.getUnicodeVersion().getVersionString(2, 2) + + "; " + + (CachedProps.IS_BETA + ? "Unicodeβ version: " + + CachedProps.CACHED_PROPS.version.getVersionString(2, 2) + + "; " + : ""); public static String getVersions() { return VERSIONS; } - static final String SUBHEAD = !CachedProps.IS_BETA ? "" - : "

Properties use ICU for Unicode V" + UCharacter.getUnicodeVersion().getVersionString(2, 2) - + "; the beta properties support Unicode V" + CachedProps.CACHED_PROPS.version.getVersionString(2, 2) + "β. " - + "For more information, see Unicode Utilities Beta.

" - ; + static final String SUBHEAD = + !CachedProps.IS_BETA + ? "" + : "

Properties use ICU for Unicode V" + + UCharacter.getUnicodeVersion().getVersionString(2, 2) + + "; the beta properties support Unicode V" + + CachedProps.CACHED_PROPS.version.getVersionString(2, 2) + + "β. " + + "For more information, see Unicode Utilities Beta.

"; public static String getSubtitle() { return SUBHEAD; } - } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeProperty.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeProperty.java index e18f5b392..6defd8bc8 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeProperty.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeProperty.java @@ -7,6 +7,15 @@ ******************************************************************************* */ +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.SymbolTable; +import com.ibm.icu.text.UFormat; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeMatcher; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.PrintWriter; import java.io.StringWriter; import java.text.ParsePosition; @@ -21,59 +30,57 @@ import java.util.TreeMap; import java.util.function.Predicate; import java.util.regex.Pattern; - import org.unicode.cldr.util.props.UnicodeLabel; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.SymbolTable; -import com.ibm.icu.text.UFormat; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeMatcher; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - /** * @Deprecated use org.unicode.props.UnicodeProperty */ -@Deprecated( forRemoval = true ) +@Deprecated(forRemoval = true) public abstract class UnicodeProperty extends UnicodeLabel { public static final UnicodeSet UNASSIGNED = new UnicodeSet("[:gc=unassigned:]").freeze(); - public static final UnicodeSet NONCHARACTERS = new UnicodeSet("[:noncharactercodepoint:]").freeze(); + public static final UnicodeSet NONCHARACTERS = + new UnicodeSet("[:noncharactercodepoint:]").freeze(); public static final int SAMPLE_UNASSIGNED = UNASSIGNED.charAt(0); public static 
final UnicodeSet PRIVATE_USE = new UnicodeSet("[:gc=privateuse:]").freeze(); public static final UnicodeSet SURROGATE = new UnicodeSet("[:gc=surrogate:]").freeze(); public static final UnicodeSet HIGH_SURROGATES = new UnicodeSet("[\\uD800-\\uDB7F]").freeze(); public static final int SAMPLE_HIGH_SURROGATE = HIGH_SURROGATES.charAt(0); - public static final UnicodeSet HIGH_PRIVATE_USE_SURROGATES = new UnicodeSet("[\\uDB80-\\uDBFF]").freeze(); - public static final int SAMPLE_HIGH_PRIVATE_USE_SURROGATE = HIGH_PRIVATE_USE_SURROGATES.charAt(0); + public static final UnicodeSet HIGH_PRIVATE_USE_SURROGATES = + new UnicodeSet("[\\uDB80-\\uDBFF]").freeze(); + public static final int SAMPLE_HIGH_PRIVATE_USE_SURROGATE = + HIGH_PRIVATE_USE_SURROGATES.charAt(0); public static final UnicodeSet LOW_SURROGATES = new UnicodeSet("[\\uDC00-\\uDFFF]").freeze(); public static final int SAMPLE_LOW_SURROGATE = LOW_SURROGATES.charAt(0); public static final UnicodeSet PRIVATE_USE_AREA = new UnicodeSet("[\\uE000-\\uF8FF]").freeze(); public static final int SAMPLE_PRIVATE_USE_AREA = PRIVATE_USE_AREA.charAt(0); - public static final UnicodeSet PRIVATE_USE_AREA_A = new UnicodeSet("[\\U000F0000-\\U000FFFFD]").freeze(); + public static final UnicodeSet PRIVATE_USE_AREA_A = + new UnicodeSet("[\\U000F0000-\\U000FFFFD]").freeze(); public static final int SAMPLE_PRIVATE_USE_AREA_A = PRIVATE_USE_AREA_A.charAt(0); - public static final UnicodeSet PRIVATE_USE_AREA_B = new UnicodeSet("[\\U00100000-\\U0010FFFD]").freeze(); + public static final UnicodeSet PRIVATE_USE_AREA_B = + new UnicodeSet("[\\U00100000-\\U0010FFFD]").freeze(); public static final int SAMPLE_PRIVATE_USE_AREA_B = PRIVATE_USE_AREA_B.charAt(0); - public static final UnicodeSet SPECIALS = new UnicodeSet(UNASSIGNED).addAll(PRIVATE_USE).addAll(SURROGATE).freeze(); - - public static final UnicodeSet STUFF_TO_TEST = new UnicodeSet(SPECIALS).complement() - .addAll(NONCHARACTERS) - .add(SAMPLE_UNASSIGNED) - .add(SAMPLE_HIGH_SURROGATE) - 
.add(SAMPLE_HIGH_PRIVATE_USE_SURROGATE) - .add(SAMPLE_LOW_SURROGATE) - .add(SAMPLE_PRIVATE_USE_AREA) - .add(SAMPLE_PRIVATE_USE_AREA_A) - .add(SAMPLE_PRIVATE_USE_AREA_B) - .freeze(); - - public static final UnicodeSet STUFF_TO_TEST_WITH_UNASSIGNED = new UnicodeSet(STUFF_TO_TEST).addAll(UNASSIGNED).freeze(); + public static final UnicodeSet SPECIALS = + new UnicodeSet(UNASSIGNED).addAll(PRIVATE_USE).addAll(SURROGATE).freeze(); + + public static final UnicodeSet STUFF_TO_TEST = + new UnicodeSet(SPECIALS) + .complement() + .addAll(NONCHARACTERS) + .add(SAMPLE_UNASSIGNED) + .add(SAMPLE_HIGH_SURROGATE) + .add(SAMPLE_HIGH_PRIVATE_USE_SURROGATE) + .add(SAMPLE_LOW_SURROGATE) + .add(SAMPLE_PRIVATE_USE_AREA) + .add(SAMPLE_PRIVATE_USE_AREA_A) + .add(SAMPLE_PRIVATE_USE_AREA_B) + .freeze(); + + public static final UnicodeSet STUFF_TO_TEST_WITH_UNASSIGNED = + new UnicodeSet(STUFF_TO_TEST).addAll(UNASSIGNED).freeze(); public static boolean DEBUG = false; @@ -96,25 +103,49 @@ public abstract class UnicodeProperty extends UnicodeLabel { * */ - public static final int UNKNOWN = 0, BINARY = 2, EXTENDED_BINARY = 3, - ENUMERATED = 4, EXTENDED_ENUMERATED = 5, CATALOG = 6, - EXTENDED_CATALOG = 7, MISC = 8, EXTENDED_MISC = 9, STRING = 10, - EXTENDED_STRING = 11, NUMERIC = 12, EXTENDED_NUMERIC = 13, - START_TYPE = 2, LIMIT_TYPE = 14, EXTENDED_MASK = 1, - CORE_MASK = ~EXTENDED_MASK, BINARY_MASK = (1 << BINARY) - | (1 << EXTENDED_BINARY), STRING_MASK = (1 << STRING) - | (1 << EXTENDED_STRING), - STRING_OR_MISC_MASK = (1 << STRING) | (1 << EXTENDED_STRING) - | (1 << MISC) | (1 << EXTENDED_MISC), - ENUMERATED_OR_CATALOG_MASK = (1 << ENUMERATED) - | (1 << EXTENDED_ENUMERATED) | (1 << CATALOG) - | (1 << EXTENDED_CATALOG); - - private static final String[] TYPE_NAMES = { "Unknown", "Unknown", - "Binary", "Extended Binary", "Enumerated", "Extended Enumerated", - "Catalog", "Extended Catalog", "Miscellaneous", - "Extended Miscellaneous", "String", "Extended String", "Numeric", - "Extended Numeric", 
}; + public static final int UNKNOWN = 0, + BINARY = 2, + EXTENDED_BINARY = 3, + ENUMERATED = 4, + EXTENDED_ENUMERATED = 5, + CATALOG = 6, + EXTENDED_CATALOG = 7, + MISC = 8, + EXTENDED_MISC = 9, + STRING = 10, + EXTENDED_STRING = 11, + NUMERIC = 12, + EXTENDED_NUMERIC = 13, + START_TYPE = 2, + LIMIT_TYPE = 14, + EXTENDED_MASK = 1, + CORE_MASK = ~EXTENDED_MASK, + BINARY_MASK = (1 << BINARY) | (1 << EXTENDED_BINARY), + STRING_MASK = (1 << STRING) | (1 << EXTENDED_STRING), + STRING_OR_MISC_MASK = + (1 << STRING) | (1 << EXTENDED_STRING) | (1 << MISC) | (1 << EXTENDED_MISC), + ENUMERATED_OR_CATALOG_MASK = + (1 << ENUMERATED) + | (1 << EXTENDED_ENUMERATED) + | (1 << CATALOG) + | (1 << EXTENDED_CATALOG); + + private static final String[] TYPE_NAMES = { + "Unknown", + "Unknown", + "Binary", + "Extended Binary", + "Enumerated", + "Extended Enumerated", + "Catalog", + "Extended Catalog", + "Miscellaneous", + "Extended Miscellaneous", + "String", + "Extended String", + "Numeric", + "Extended Numeric", + }; public static String getTypeName(int propType) { return TYPE_NAMES[propType]; @@ -137,8 +168,7 @@ public final boolean isType(int mask) { } protected final void setName(String string) { - if (string == null) - throw new IllegalArgumentException("Name must not be null"); + if (string == null) throw new IllegalArgumentException("Name must not be null"); name = string; } @@ -153,8 +183,12 @@ public String getVersion() { public String getValue(int codepoint) { if (DEBUG && CHECK_VALUE == codepoint && CHECK_NAME.equals(getName())) { String value = _getValue(codepoint); - System.out.println(getName() + "(" + Utility.hex(codepoint) + "):" - + (getType() == STRING ? Utility.hex(value) : value)); + System.out.println( + getName() + + "(" + + Utility.hex(codepoint) + + "):" + + (getType() == STRING ? 
Utility.hex(value) : value)); return value; } return _getValue(codepoint); @@ -165,8 +199,7 @@ public String getValue(int codepoint) { // } public List getNameAliases(List result) { - if (result == null) - result = new ArrayList(1); + if (result == null) result = new ArrayList(1); return _getNameAliases(result); } @@ -177,16 +210,19 @@ public List getValueAliases(String valueAlias, List result) { result = _getValueAliases(valueAlias, result); if (!result.contains(valueAlias)) { // FIX && type < NUMERIC result = _getValueAliases(valueAlias, result); // for debugging - throw new IllegalArgumentException("Internal error: " + getName() - + " doesn't contain " + valueAlias + ": " - + CollectionUtilities.join(result, ", ")); + throw new IllegalArgumentException( + "Internal error: " + + getName() + + " doesn't contain " + + valueAlias + + ": " + + CollectionUtilities.join(result, ", ")); } return result; } public List getAvailableValues(List result) { - if (result == null) - result = new ArrayList(1); + if (result == null) result = new ArrayList(1); return _getAvailableValues(result); } @@ -215,8 +251,7 @@ public final List getAvailableValues() { public final String getValue(int codepoint, boolean getShortest) { String result = getValue(codepoint); - if (type >= MISC || result == null || !getShortest) - return result; + if (type >= MISC || result == null || !getShortest) return result; return getFirstValueAlias(result); } @@ -228,8 +263,7 @@ public final String getFirstNameAlias() { } public final String getFirstValueAlias(String value) { - if (valueToFirstValueAlias == null) - _getFirstValueAliasCache(); + if (valueToFirstValueAlias == null) _getFirstValueAliasCache(); return valueToFirstValueAlias.get(value).toString(); } @@ -242,13 +276,17 @@ private void _getFirstValueAliasCache() { String value = (String) it.next(); String first = (String) getValueAliases(value).get(0); if (first == null) { // internal error - throw new IllegalArgumentException( - "Value not in 
value aliases: " + value); + throw new IllegalArgumentException("Value not in value aliases: " + value); } if (DEBUG && CHECK_NAME.equals(getName())) { - System.out.println("First Alias: " + getName() + ": " + value - + " => " + first - + CollectionUtilities.join(getValueAliases(value), ", ")); + System.out.println( + "First Alias: " + + getName() + + ": " + + value + + " => " + + first + + CollectionUtilities.join(getValueAliases(value), ", ")); } valueToFirstValueAlias.put(value, first); if (value.length() > maxValueWidth) { @@ -265,17 +303,18 @@ private void _getFirstValueAliasCache() { private int maxFirstValueAliasWidth = -1; public int getMaxWidth(boolean getShortest) { - if (maxValueWidth < 0) - _getFirstValueAliasCache(); - if (getShortest) - return maxFirstValueAliasWidth; + if (maxValueWidth < 0) _getFirstValueAliasCache(); + if (getShortest) return maxFirstValueAliasWidth; return maxValueWidth; } public final UnicodeSet getTrueSet() { if (!isType(BINARY)) { - throw new IllegalArgumentException("Only applicable to binary (boolean) properties, not " + getName() + - " which is of type " + getTypeName()); + throw new IllegalArgumentException( + "Only applicable to binary (boolean) properties, not " + + getName() + + " which is of type " + + getTypeName()); } return getSet("Yes", null); } @@ -289,8 +328,9 @@ public final UnicodeSet getSet(PatternMatcher matcher) { } public final UnicodeSet getSet(String propertyValue, UnicodeSet result) { - return getSet(new SimpleMatcher(propertyValue, - isType(STRING_OR_MISC_MASK) ? null : PROPERTY_COMPARATOR), + return getSet( + new SimpleMatcher( + propertyValue, isType(STRING_OR_MISC_MASK) ? 
null : PROPERTY_COMPARATOR), result); } @@ -299,11 +339,11 @@ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) { public static final String UNUSED = "??"; public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) { - if (result == null) - result = new UnicodeSet(); + if (result == null) result = new UnicodeSet(); boolean uniformUnassigned = hasUniformUnassigned(); if (isType(STRING_OR_MISC_MASK)) { - for (UnicodeSetIterator usi = getStuffToTest(uniformUnassigned); usi.next();) { // int i = 0; i <= 0x10FFFF; ++i + for (UnicodeSetIterator usi = getStuffToTest(uniformUnassigned); + usi.next(); ) { // int i = 0; i <= 0x10FFFF; ++i int i = usi.codepoint; String value = getValue(i); if (value != null && matcher.test(value)) { @@ -315,15 +355,15 @@ public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) { List temp = new ArrayList(1); // to avoid reallocating... UnicodeMap um = getUnicodeMap_internal(); Iterator it = um.getAvailableValues(null).iterator(); - main: while (it.hasNext()) { + main: + while (it.hasNext()) { String value = (String) it.next(); temp.clear(); Iterator it2 = getValueAliases(value, temp).iterator(); while (it2.hasNext()) { String value2 = (String) it2.next(); // System.out.println("Values:" + value2); - if (matcher.test(value2) - || matcher.test(toSkeleton(value2))) { + if (matcher.test(value2) || matcher.test(toSkeleton(value2))) { um.keySet(value, result); continue main; } @@ -340,9 +380,7 @@ public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) { * UnicodeSetIterator(set); } */ - /** - * Utility for debugging - */ + /** Utility for debugging */ public static String getStack() { Exception e = new Exception(); StringWriter sw = new StringWriter(); @@ -361,8 +399,7 @@ public static class Name implements Comparable { public final int RAW = 0, TITLE = 1, NORMAL = 2; public Name(String name, int style) { - if (name == null) - name = ""; + if (name == null) name = ""; if (style == RAW) { 
skeleton = pretty = name; } else { @@ -405,7 +442,8 @@ public UnicodeMap getUnicodeMap(boolean getShortest) { UnicodeMap result = new UnicodeMap(); boolean uniformUnassigned = hasUniformUnassigned(); - for (UnicodeSetIterator usi = getStuffToTest(uniformUnassigned); usi.next();) { // int i = 0; i <= 0x10FFFF; ++i + for (UnicodeSetIterator usi = getStuffToTest(uniformUnassigned); + usi.next(); ) { // int i = 0; i <= 0x10FFFF; ++i int i = usi.codepoint; // if (DEBUG && i == 0x41) System.out.println(i + "\t" + // getValue(i)); @@ -419,8 +457,7 @@ public UnicodeMap getUnicodeMap(boolean getShortest) { * @return the unicode map */ protected UnicodeMap getUnicodeMap_internal() { - if (unicodeMap == null) - unicodeMap = _getUnicodeMap(); + if (unicodeMap == null) unicodeMap = _getUnicodeMap(); return unicodeMap; } @@ -429,34 +466,33 @@ protected UnicodeMap _getUnicodeMap() { HashMap myIntern = new HashMap(); boolean uniformUnassigned = hasUniformUnassigned(); - for (UnicodeSetIterator usi = getStuffToTest(uniformUnassigned); usi.next();) { // int i = 0; i <= 0x10FFFF; ++i + for (UnicodeSetIterator usi = getStuffToTest(uniformUnassigned); + usi.next(); ) { // int i = 0; i <= 0x10FFFF; ++i int i = usi.codepoint; // if (DEBUG && i == 0x41) System.out.println(i + "\t" + // getValue(i)); String value = getValue(i); String iValue = (String) myIntern.get(value); - if (iValue == null) - myIntern.put(value, iValue = value); + if (iValue == null) myIntern.put(value, iValue = value); result.put(i, iValue); } addUntested(result, uniformUnassigned); if (DEBUG) { - for (UnicodeSetIterator usi = getStuffToTest(uniformUnassigned); usi.next();) { // int i = 0; i <= 0x10FFFF; ++i + for (UnicodeSetIterator usi = getStuffToTest(uniformUnassigned); + usi.next(); ) { // int i = 0; i <= 0x10FFFF; ++i int i = usi.codepoint; // if (DEBUG && i == 0x41) System.out.println(i + "\t" + // getValue(i)); String value = getValue(i); String resultValue = (String) result.getValue(i); if 
(!value.equals(resultValue)) { - throw new RuntimeException("Value failure at: " - + Utility.hex(i)); + throw new RuntimeException("Value failure at: " + Utility.hex(i)); } } } if (DEBUG && CHECK_NAME.equals(getName())) { - System.out.println(getName() + ":\t" + getClass().getName() + "\t" - + getVersion()); + System.out.println(getName() + ":\t" + getClass().getName() + "\t" + getVersion()); System.out.println(getStack()); System.out.println(result); } @@ -474,7 +510,7 @@ private UnicodeSet getSameValueRanges(boolean uniformUnassigned) { String current = _getValue(i); if (!current.equals(lastValue)) { if (startRange >= 0) { - int last = i-1; + int last = i - 1; if (last - startRange > 1) { sameValueRanges.add(startRange, last); } @@ -495,62 +531,45 @@ private UnicodeSet getSameValueRanges(boolean uniformUnassigned) { } private static UnicodeSetIterator getStuffToTest(boolean uniformUnassigned) { - return new UnicodeSetIterator(uniformUnassigned ? STUFF_TO_TEST : STUFF_TO_TEST_WITH_UNASSIGNED); + return new UnicodeSetIterator( + uniformUnassigned ? STUFF_TO_TEST : STUFF_TO_TEST_WITH_UNASSIGNED); } - /** - * Really ought to create a Collection UniqueList, that forces uniqueness. - * But for now... - */ + /** Really ought to create a Collection UniqueList, that forces uniqueness. But for now... 
*/ public static Collection addUnique(Object obj, Collection result) { - if (obj != null && !result.contains(obj)) - result.add(obj); + if (obj != null && !result.contains(obj)) result.add(obj); return result; } - /** - * Utility for managing property & non-string value aliases - */ - public static final Comparator PROPERTY_COMPARATOR = new Comparator() { - public int compare(Object o1, Object o2) { - return compareNames((String) o1, (String) o2); - } - }; + /** Utility for managing property & non-string value aliases */ + public static final Comparator PROPERTY_COMPARATOR = + new Comparator() { + public int compare(Object o1, Object o2) { + return compareNames((String) o1, (String) o2); + } + }; - /** - * Utility for managing property & non-string value aliases - * - */ + /** Utility for managing property & non-string value aliases */ // TODO optimize public static boolean equalNames(String a, String b) { - if (a == b) - return true; - if (a == null) - return false; + if (a == b) return true; + if (a == null) return false; return toSkeleton(a).equals(toSkeleton(b)); } - /** - * Utility for managing property & non-string value aliases - */ + /** Utility for managing property & non-string value aliases */ // TODO optimize public static int compareNames(String a, String b) { - if (a == b) - return 0; - if (a == null) - return -1; - if (b == null) - return 1; + if (a == b) return 0; + if (a == null) return -1; + if (b == null) return 1; return toSkeleton(a).compareTo(toSkeleton(b)); } - /** - * Utility for managing property & non-string value aliases - */ + /** Utility for managing property & non-string value aliases */ // TODO account for special names, tibetan, hangul public static String toSkeleton(String source) { - if (source == null) - return null; + if (source == null) return null; StringBuffer skeletonBuffer = new StringBuffer(); boolean gotOne = false; // remove spaces, '_', '-' @@ -569,22 +588,19 @@ public static String toSkeleton(String source) { } } } - if 
(!gotOne) - return source; // avoid string creation + if (!gotOne) return source; // avoid string creation return skeletonBuffer.toString(); } // get the name skeleton public static String toNameSkeleton(String source) { - if (source == null) - return null; + if (source == null) return null; StringBuffer result = new StringBuffer(); // remove spaces, medial '-' // we can do this with char, since no surrogates are involved for (int i = 0; i < source.length(); ++i) { char ch = source.charAt(i); - if (('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'Z') - || ch == '<' || ch == '>') { + if (('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'Z') || ch == '<' || ch == '>') { result.append(ch); } else if (ch == ' ') { // don't copy ever @@ -593,29 +609,28 @@ public static String toNameSkeleton(String source) { if (0 == i || i == source.length() - 1 || source.charAt(i - 1) == ' ' - || source.charAt(i + 1) == ' ' - || (i == source.length() - 2 - && source.charAt(i - 1) == 'O' && source - .charAt(i + 1) == 'E')) { + || source.charAt(i + 1) == ' ' + || (i == source.length() - 2 + && source.charAt(i - 1) == 'O' + && source.charAt(i + 1) == 'E')) { System.out.println("****** EXCEPTION " + source); result.append(ch); } // otherwise don't copy } else { - throw new IllegalArgumentException("Illegal Name Char: U+" - + Utility.hex(ch) + ", " + ch); + throw new IllegalArgumentException( + "Illegal Name Char: U+" + Utility.hex(ch) + ", " + ch); } } return result.toString(); } /** - * These routines use the Java functions, because they only need to act on - * ASCII Changes space, - into _, inserts _ between lower and UPPER. + * These routines use the Java functions, because they only need to act on ASCII Changes space, + * - into _, inserts _ between lower and UPPER. 
*/ public static String regularize(String source, boolean titlecaseStart) { - if (source == null) - return source; + if (source == null) return source; /* * if (source.equals("noBreak")) { // HACK if (titlecaseStart) return * "NoBreak"; return source; } @@ -629,16 +644,15 @@ public static String regularize(String source, boolean titlecaseStart) { c = '_'; haveFirstCased = true; } - if (c == '=') - haveFirstCased = true; + if (c == '=') haveFirstCased = true; int cat = Character.getType(c); - if (lastCat == Character.LOWERCASE_LETTER - && cat == Character.UPPERCASE_LETTER) { + if (lastCat == Character.LOWERCASE_LETTER && cat == Character.UPPERCASE_LETTER) { result.append('_'); } if (haveFirstCased && (cat == Character.LOWERCASE_LETTER - || cat == Character.TITLECASE_LETTER || cat == Character.UPPERCASE_LETTER)) { + || cat == Character.TITLECASE_LETTER + || cat == Character.UPPERCASE_LETTER)) { if (titlecaseStart) { c = Character.toUpperCase(c); } @@ -651,8 +665,7 @@ public static String regularize(String source, boolean titlecaseStart) { } /** - * Utility function for comparing codepoint to string without generating new - * string. + * Utility function for comparing codepoint to string without generating new string. * * @param codepoint * @param other @@ -669,14 +682,9 @@ public static final boolean equals(int codepoint, String other) { return false; } - /** - * Utility function for comparing objects that may be null - * string. - */ + /** Utility function for comparing objects that may be null string. */ public static final boolean equals(T a, T b) { - return a == null ? b == null - : b == null ? false - : a.equals(b); + return a == null ? b == null : b == null ? 
false : a.equals(b); } /** @@ -685,7 +693,7 @@ public static final boolean equals(T a, T b) { * @param source * @param result */ - static public void addAll(UnicodeSetIterator source, UnicodeSet result) { + public static void addAll(UnicodeSetIterator source, UnicodeSet result) { while (source.nextRange()) { if (source.codepoint == UnicodeSetIterator.IS_STRING) { result.add(source.string); @@ -695,21 +703,15 @@ static public void addAll(UnicodeSetIterator source, UnicodeSet result) { } } - /** - * Really ought to create a Collection UniqueList, that forces uniqueness. - * But for now... - */ + /** Really ought to create a Collection UniqueList, that forces uniqueness. But for now... */ public static Collection addAllUnique(Collection source, Collection result) { - for (Iterator it = source.iterator(); it.hasNext();) { + for (Iterator it = source.iterator(); it.hasNext(); ) { addUnique(it.next(), result); } return result; } - /** - * Really ought to create a Collection UniqueList, that forces uniqueness. - * But for now... - */ + /** Really ought to create a Collection UniqueList, that forces uniqueness. But for now... 
*/ public static Collection addAllUnique(Object[] source, Collection result) { for (int i = 0; i < source.length; ++i) { addUnique(source[i], result); @@ -717,7 +719,7 @@ public static Collection addAllUnique(Object[] source, Collection result) { return result; } - static public class Factory { + public static class Factory { static boolean DEBUG = false; Map canonicalNames = new TreeMap(); @@ -739,8 +741,7 @@ public final Factory add(UnicodeProperty sp) { } public final UnicodeProperty getProperty(String propertyAlias) { - return (UnicodeProperty) skeletonNames - .get(toSkeleton(propertyAlias)); + return (UnicodeProperty) skeletonNames.get(toSkeleton(propertyAlias)); } public final List getAvailableNames() { @@ -748,8 +749,7 @@ public final List getAvailableNames() { } public final List getAvailableNames(List result) { - if (result == null) - result = new ArrayList(1); + if (result == null) result = new ArrayList(1); Iterator it = canonicalNames.keySet().iterator(); while (it.hasNext()) { addUnique(it.next(), result); @@ -762,15 +762,12 @@ public final List getAvailableNames(int propertyTypeMask) { } public final List getAvailableNames(int propertyTypeMask, List result) { - if (result == null) - result = new ArrayList(1); + if (result == null) result = new ArrayList(1); Iterator it = canonicalNames.keySet().iterator(); while (it.hasNext()) { String item = (String) it.next(); UnicodeProperty property = getProperty(item); - if (DEBUG) - System.out.println("Properties: " + item + "," - + property.getType()); + if (DEBUG) System.out.println("Properties: " + item + "," + property.getType()); if (!property.isType(propertyTypeMask)) { // System.out.println("Masking: " + property.getType() + "," // + propertyTypeMask); @@ -783,11 +780,9 @@ public final List getAvailableNames(int propertyTypeMask, List result) { InversePatternMatcher inverseMatcher = new InversePatternMatcher(); - /** - * Format is: propname ('=' | '!=') propvalue ( '|' propValue )* - */ - public final 
UnicodeSet getSet(String propAndValue, - PatternMatcher matcher, UnicodeSet result) { + /** Format is: propname ('=' | '!=') propvalue ( '|' propValue )* */ + public final UnicodeSet getSet( + String propAndValue, PatternMatcher matcher, UnicodeSet result) { int equalPos = propAndValue.indexOf('='); String prop = propAndValue.substring(0, equalPos); String value = propAndValue.substring(equalPos + 1); @@ -799,9 +794,9 @@ public final UnicodeSet getSet(String propAndValue, prop = prop.trim(); UnicodeProperty up = getProperty(prop); if (matcher == null) { - matcher = new SimpleMatcher(value, up - .isType(STRING_OR_MISC_MASK) ? null - : PROPERTY_COMPARATOR); + matcher = + new SimpleMatcher( + value, up.isType(STRING_OR_MISC_MASK) ? null : PROPERTY_COMPARATOR); } if (negative) { inverseMatcher.set(matcher); @@ -810,8 +805,7 @@ public final UnicodeSet getSet(String propAndValue, return up.getSet(matcher.set(value), result); } - public final UnicodeSet getSet(String propAndValue, - PatternMatcher matcher) { + public final UnicodeSet getSet(String propAndValue, PatternMatcher matcher) { return getSet(propAndValue, matcher, null); } @@ -824,13 +818,11 @@ public final SymbolTable getSymbolTable(String prefix) { } private class MyXSymbolTable extends UnicodeSet.XSymbolTable { - public boolean applyPropertyAlias(String propertyName, - String propertyValue, UnicodeSet result) { - if (false) - System.out.println(propertyName + "=" + propertyValue); + public boolean applyPropertyAlias( + String propertyName, String propertyValue, UnicodeSet result) { + if (false) System.out.println(propertyName + "=" + propertyValue); UnicodeProperty prop = getProperty(propertyName); - if (prop == null) - return false; + if (prop == null) return false; result.clear(); UnicodeSet x = prop.getSet(propertyValue, result); return x.size() != 0; @@ -853,50 +845,47 @@ private class PropertySymbolTable implements SymbolTable { } public char[] lookup(String s) { - if (DEBUG) - System.out.println("\t(" + 
prefix + ")Looking up " + s); + if (DEBUG) System.out.println("\t(" + prefix + ")Looking up " + s); // ensure, again, that prefix matches int start = prefix.length(); - if (!s.regionMatches(true, 0, prefix, 0, start)) - return null; + if (!s.regionMatches(true, 0, prefix, 0, start)) return null; int pos = s.indexOf(':', start); if (pos < 0) { // should never happen - throw new IllegalArgumentException( - "Internal Error: missing =: " + s + "\r\n"); + throw new IllegalArgumentException("Internal Error: missing =: " + s + "\r\n"); } UnicodeProperty prop = getProperty(s.substring(start, pos)); if (prop == null) { - throw new IllegalArgumentException("Invalid Property in: " - + s + "\r\nUse " + showSet(getAvailableNames())); + throw new IllegalArgumentException( + "Invalid Property in: " + + s + + "\r\nUse " + + showSet(getAvailableNames())); } String value = s.substring(pos + 1); UnicodeSet set; if (value.startsWith("\u00AB")) { // regex! - set = prop.getSet(regexMatcher.set(value.substring(1, value - .length() - 1))); + set = prop.getSet(regexMatcher.set(value.substring(1, value.length() - 1))); } else { set = prop.getSet(value); } if (set.size() == 0) { throw new IllegalArgumentException( - "Empty Property-Value in: " + s + "\r\nUse " - + showSet(prop.getAvailableValues())); + "Empty Property-Value in: " + + s + + "\r\nUse " + + showSet(prop.getAvailableValues())); } - if (DEBUG) - System.out.println("\t(" + prefix + ")Returning " - + set.toPattern(true)); + if (DEBUG) System.out.println("\t(" + prefix + ")Returning " + set.toPattern(true)); return set.toPattern(true).toCharArray(); // really ugly } private String showSet(List list) { StringBuffer result = new StringBuffer("["); boolean first = true; - for (Iterator it = list.iterator(); it.hasNext();) { - if (!first) - result.append(", "); - else - first = false; + for (Iterator it = list.iterator(); it.hasNext(); ) { + if (!first) result.append(", "); + else first = false; result.append(it.next().toString()); } 
result.append("]"); @@ -907,21 +896,21 @@ public UnicodeMatcher lookupMatcher(int ch) { return null; } - public String parseReference(String text, ParsePosition pos, - int limit) { + public String parseReference(String text, ParsePosition pos, int limit) { if (DEBUG) - System.out.println("\t(" + prefix + ")Parsing <" - + text.substring(pos.getIndex(), limit) + ">"); + System.out.println( + "\t(" + + prefix + + ")Parsing <" + + text.substring(pos.getIndex(), limit) + + ">"); int start = pos.getIndex(); // ensure that it starts with 'prefix' - if (!text - .regionMatches(true, start, prefix, 0, prefix.length())) - return null; + if (!text.regionMatches(true, start, prefix, 0, prefix.length())) return null; start += prefix.length(); // now see if it is of the form identifier:identifier int i = getIdentifier(text, start, limit); - if (i == start) - return null; + if (i == start) return null; String prop = text.substring(start, i); String value = "true"; if (i < limit) { @@ -932,8 +921,7 @@ public String parseReference(String text, ParsePosition pos, j = text.indexOf('\u00BB', i + 2) + 1; // include // last // character - if (j <= 0) - return null; + if (j <= 0) return null; } else { j = getIdentifier(text, i + 1, limit); } @@ -943,28 +931,21 @@ public String parseReference(String text, ParsePosition pos, } pos.setIndex(i); if (DEBUG) - System.out.println("\t(" + prefix + ")Parsed <" + prop - + ">=<" + value + ">"); + System.out.println("\t(" + prefix + ")Parsed <" + prop + ">=<" + value + ">"); return prefix + prop + ":" + value; } private int getIdentifier(String text, int start, int limit) { - if (DEBUG) - System.out.println("\tGetID <" - + text.substring(start, limit) + ">"); + if (DEBUG) System.out.println("\tGetID <" + text.substring(start, limit) + ">"); int cp = 0; int i; for (i = start; i < limit; i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(text, i); - if (!com.ibm.icu.lang.UCharacter - .isUnicodeIdentifierPart(cp) - && cp != '.') { + if 
(!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp) && cp != '.') { break; } } - if (DEBUG) - System.out.println("\tGotID <" + text.substring(start, i) - + ">"); + if (DEBUG) System.out.println("\tGotID <" + text.substring(start, i) + ">"); return i; } } @@ -975,8 +956,8 @@ public static class FilteredProperty extends UnicodeProperty { protected StringFilter filter; - protected UnicodeSetIterator matchIterator = new UnicodeSetIterator( - new UnicodeSet(0, 0x10FFFF)); + protected UnicodeSetIterator matchIterator = + new UnicodeSetIterator(new UnicodeSet(0, 0x10FFFF)); protected HashMap backmap; @@ -1020,19 +1001,16 @@ public List _getValueAliases(String valueAlias, List result) { while (it.hasNext()) { String item = (String) it.next(); String mappedItem = filter.remap(item); - if (backmap.get(mappedItem) != null - && !allowValueAliasCollisions) { + if (backmap.get(mappedItem) != null && !allowValueAliasCollisions) { throw new IllegalArgumentException( - "Filter makes values collide! " + item + ", " - + mappedItem); + "Filter makes values collide! 
" + item + ", " + mappedItem); } backmap.put(mappedItem, item); } } valueAlias = (String) backmap.get(valueAlias); temp.clear(); - return filter.addUnique(property.getValueAliases(valueAlias, temp), - result); + return filter.addUnique(property.getValueAliases(valueAlias, temp), result); } public String _getVersion() { @@ -1047,15 +1025,13 @@ public FilteredProperty setAllowValueAliasCollisions(boolean b) { allowValueAliasCollisions = b; return this; } - } - public static abstract class StringFilter implements Cloneable { + public abstract static class StringFilter implements Cloneable { public abstract String remap(String original); public final List addUnique(Collection source, List result) { - if (result == null) - result = new ArrayList(1); + if (result == null) result = new ArrayList(1); Iterator it = source.iterator(); while (it.hasNext()) { UnicodeProperty.addUnique(remap((String) it.next()), result); @@ -1119,8 +1095,7 @@ public SimpleMatcher(String pattern, Comparator comparator) { } public boolean test(String value) { - if (comparator == null) - return pattern.equals(value); + if (comparator == null) return pattern.equals(value); return comparator.compare(pattern, value) == 0; } @@ -1137,21 +1112,21 @@ public UnicodeProperty.PatternMatcher set(String pattern) { matcher = Pattern.compile(pattern).matcher(""); return this; } + UFormat foo; + public boolean test(String value) { matcher.reset(value.toString()); return matcher.find(); } } - public static abstract class BaseProperty extends UnicodeProperty { + public abstract static class BaseProperty extends UnicodeProperty { private static final String[] NO_VALUES = {"No", "N", "F", "False"}; private static final String[] YES_VALUES = {"Yes", "Y", "T", "True"}; - /** - * - */ + /** */ private static final String[][] YES_NO_ALIASES = new String[][] {YES_VALUES, NO_VALUES}; protected List propertyAliases = new ArrayList(1); @@ -1160,8 +1135,8 @@ public static abstract class BaseProperty extends UnicodeProperty 
{ protected String version; - public BaseProperty setMain(String alias, String shortAlias, - int propertyType, String version) { + public BaseProperty setMain( + String alias, String shortAlias, int propertyType, String version) { setName(alias); setType(propertyType); propertyAliases.add(shortAlias); @@ -1182,41 +1157,33 @@ public List _getNameAliases(List result) { return result; } - public BaseProperty addValueAliases(String[][] valueAndAlternates, - boolean errorIfCant) { - if (toValueAliases == null) - _fixValueAliases(); + public BaseProperty addValueAliases(String[][] valueAndAlternates, boolean errorIfCant) { + if (toValueAliases == null) _fixValueAliases(); for (int i = 0; i < valueAndAlternates.length; ++i) { for (int j = 1; j < valueAndAlternates[0].length; ++j) { - addValueAlias(valueAndAlternates[i][0], - valueAndAlternates[i][j], errorIfCant); + addValueAlias(valueAndAlternates[i][0], valueAndAlternates[i][j], errorIfCant); } } return this; } - public void addValueAlias(String value, String valueAlias, - boolean errorIfCant) { + public void addValueAlias(String value, String valueAlias, boolean errorIfCant) { List result = (List) toValueAliases.get(value); - if (result == null && !errorIfCant) - return; + if (result == null && !errorIfCant) return; addUnique(value, result); addUnique(valueAlias, result); } protected List _getValueAliases(String valueAlias, List result) { - if (toValueAliases == null) - _fixValueAliases(); + if (toValueAliases == null) _fixValueAliases(); List a = (List) toValueAliases.get(valueAlias); - if (a != null) - addAllUnique(a, result); + if (a != null) addAllUnique(a, result); return result; } protected void _fixValueAliases() { - if (toValueAliases == null) - toValueAliases = new HashMap(1); - for (Iterator it = getAvailableValues().iterator(); it.hasNext();) { + if (toValueAliases == null) toValueAliases = new HashMap(1); + for (Iterator it = getAvailableValues().iterator(); it.hasNext(); ) { Object value = it.next(); 
_ensureValueInAliases(value); } @@ -1224,16 +1191,14 @@ protected void _fixValueAliases() { protected void _ensureValueInAliases(Object value) { List result = (List) toValueAliases.get(value); - if (result == null) - toValueAliases.put(value, result = new ArrayList(1)); + if (result == null) toValueAliases.put(value, result = new ArrayList(1)); addUnique(value, result); } public BaseProperty swapFirst2ValueAliases() { - for (Iterator it = toValueAliases.keySet().iterator(); it.hasNext();) { + for (Iterator it = toValueAliases.keySet().iterator(); it.hasNext(); ) { List list = (List) toValueAliases.get(it.next()); - if (list.size() < 2) - continue; + if (list.size() < 2) continue; Object first = list.get(0); list.set(0, list.get(1)); list.set(1, first); @@ -1248,10 +1213,9 @@ public BaseProperty swapFirst2ValueAliases() { public UnicodeProperty addName(String string) { throw new UnsupportedOperationException(); } - } - public static abstract class SimpleProperty extends BaseProperty { + public abstract static class SimpleProperty extends BaseProperty { LinkedHashSet values; public UnicodeProperty addName(String alias) { @@ -1269,85 +1233,78 @@ public SimpleProperty addAliases(String valueAlias, String... aliases) { return this; } - public SimpleProperty setValues(String[] valueAliases, - String[] alternateValueAliases) { + public SimpleProperty setValues(String[] valueAliases, String[] alternateValueAliases) { for (int i = 0; i < valueAliases.length; ++i) { - if (valueAliases[i].equals(UNUSED)) - continue; + if (valueAliases[i].equals(UNUSED)) continue; _addToValues( valueAliases[i], - alternateValueAliases != null ? alternateValueAliases[i] - : null); + alternateValueAliases != null ? 
alternateValueAliases[i] : null); } return this; } public SimpleProperty setValues(List valueAliases) { this.values = new LinkedHashSet(valueAliases); - for (Iterator it = this.values.iterator(); it.hasNext();) { + for (Iterator it = this.values.iterator(); it.hasNext(); ) { _addToValues((String) it.next(), null); } return this; } public List _getAvailableValues(List result) { - if (values == null) - _fillValues(); + if (values == null) _fillValues(); result.addAll(values); return result; } protected void _fillValues() { - List newvalues = (List) getUnicodeMap_internal() - .getAvailableValues(new ArrayList()); - for (Iterator it = newvalues.iterator(); it.hasNext();) { + List newvalues = (List) getUnicodeMap_internal().getAvailableValues(new ArrayList()); + for (Iterator it = newvalues.iterator(); it.hasNext(); ) { _addToValues((String) it.next(), null); } } private void _addToValues(String item, String alias) { - if (values == null) - values = new LinkedHashSet(); - if (toValueAliases == null) - _fixValueAliases(); + if (values == null) values = new LinkedHashSet(); + if (toValueAliases == null) _fixValueAliases(); addUnique(item, values); _ensureValueInAliases(item); addValueAlias(item, alias, true); } /* public String _getVersion() { - return version; - } - */ + return version; + } + */ } public static class UnicodeMapProperty extends BaseProperty { /* - * Example of usage: - * new UnicodeProperty.UnicodeMapProperty() { - { - unicodeMap = new UnicodeMap(); - unicodeMap.setErrorOnReset(true); - unicodeMap.put(0xD, "CR"); - unicodeMap.put(0xA, "LF"); - UnicodeProperty cat = getProperty("General_Category"); - UnicodeSet temp = cat.getSet("Line_Separator") - .addAll(cat.getSet("Paragraph_Separator")) - .addAll(cat.getSet("Control")) - .addAll(cat.getSet("Format")) - .remove(0xD).remove(0xA).remove(0x200C).remove(0x200D); - unicodeMap.putAll(temp, "Control"); - UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true"); - 
unicodeMap.putAll(graphemeExtend,"Extend"); - UnicodeProperty hangul = getProperty("Hangul_Syllable_Type"); - unicodeMap.putAll(hangul.getSet("L"),"L"); - unicodeMap.putAll(hangul.getSet("V"),"V"); - unicodeMap.putAll(hangul.getSet("T"),"T"); - unicodeMap.putAll(hangul.getSet("LV"),"LV"); - unicodeMap.putAll(hangul.getSet("LVT"),"LVT"); - unicodeMap.setMissing("Other"); - } - }.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version) - */ + * Example of usage: + * new UnicodeProperty.UnicodeMapProperty() { + { + unicodeMap = new UnicodeMap(); + unicodeMap.setErrorOnReset(true); + unicodeMap.put(0xD, "CR"); + unicodeMap.put(0xA, "LF"); + UnicodeProperty cat = getProperty("General_Category"); + UnicodeSet temp = cat.getSet("Line_Separator") + .addAll(cat.getSet("Paragraph_Separator")) + .addAll(cat.getSet("Control")) + .addAll(cat.getSet("Format")) + .remove(0xD).remove(0xA).remove(0x200C).remove(0x200D); + unicodeMap.putAll(temp, "Control"); + UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true"); + unicodeMap.putAll(graphemeExtend,"Extend"); + UnicodeProperty hangul = getProperty("Hangul_Syllable_Type"); + unicodeMap.putAll(hangul.getSet("L"),"L"); + unicodeMap.putAll(hangul.getSet("V"),"V"); + unicodeMap.putAll(hangul.getSet("T"),"T"); + unicodeMap.putAll(hangul.getSet("LV"),"LV"); + unicodeMap.putAll(hangul.getSet("LVT"),"LVT"); + unicodeMap.setMissing("Other"); + } + }.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version) + */ protected UnicodeMap unicodeMap; public UnicodeMapProperty set(UnicodeMap map) { @@ -1365,13 +1322,13 @@ protected String _getValue(int codepoint) { } /* protected List _getValueAliases(String valueAlias, List result) { - if (!unicodeMap.getAvailableValues().contains(valueAlias)) return result; - result.add(valueAlias); - return result; // no other aliases - } - */protected List _getAvailableValues(List result) { - return (List) unicodeMap.getAvailableValues(result); - } + if 
(!unicodeMap.getAvailableValues().contains(valueAlias)) return result; + result.add(valueAlias); + return result; // no other aliases + } + */ protected List _getAvailableValues(List result) { + return (List) unicodeMap.getAvailableValues(result); + } } public boolean isValidValue(String propertyValue) { @@ -1405,7 +1362,6 @@ public List getValueAliases() { return result; } - public static UnicodeSet addUntested(UnicodeSet result, boolean uniformUnassigned) { if (uniformUnassigned && result.contains(UnicodeProperty.SAMPLE_UNASSIGNED)) { result.addAll(UnicodeProperty.UNASSIGNED); @@ -1474,6 +1430,7 @@ public boolean isDefault(int cp) { public boolean hasUniformUnassigned() { return hasUniformUnassigned; } + protected UnicodeProperty setUniformUnassigned(boolean hasUniformUnassigned) { this.hasUniformUnassigned = hasUniformUnassigned; return this; diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java index 0ed9b199c..9f7926b10 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeSetUtilities.java @@ -1,15 +1,5 @@ package org.unicode.jsp; -import java.text.ParsePosition; -import java.util.Comparator; -import java.util.List; -import java.util.regex.Pattern; - -import org.unicode.cldr.util.MultiComparator; -import org.unicode.props.UnicodeProperty; -import org.unicode.props.UnicodeProperty.PatternMatcher; -import org.unicode.jsp.UnicodeSetUtilities.ComparisonMatcher.Relation; - import com.ibm.icu.lang.CharSequences; import com.ibm.icu.text.Collator; import com.ibm.icu.text.RuleBasedCollator; @@ -17,6 +7,14 @@ import com.ibm.icu.text.UTF16.StringComparator; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.ULocale; +import java.text.ParsePosition; +import java.util.Comparator; +import java.util.List; +import java.util.regex.Pattern; +import org.unicode.cldr.util.MultiComparator; +import 
org.unicode.jsp.UnicodeSetUtilities.ComparisonMatcher.Relation; +import org.unicode.props.UnicodeProperty; +import org.unicode.props.UnicodeProperty.PatternMatcher; public class UnicodeSetUtilities { @@ -25,40 +23,66 @@ public class UnicodeSetUtilities { public static final UnicodeSet Emoji_Modifier = new UnicodeSet("[:Emoji_Modifier:]"); public static final UnicodeSet Emoji_Modifier_Base = new UnicodeSet("[:Emoji_Modifier_Base:]"); - public static final UnicodeSet SINGLETONS = new UnicodeSet("[©®‼⁉™ℹ↔-↙↩↪⌚⌛⌨⏏⏩-⏳⏸-⏺Ⓜ▪▫▶◀◻-◾☀-☄☎☑☔☕☘☝☠☢☣☦☪☮☯☸-☺♈-♓♠♣♥♦♨♻♿⚒-⚔⚖⚗⚙⚛⚜⚠⚡" - + "⚪⚫⚰⚱⚽⚾⛄⛅⛈⛎⛏⛑⛓⛔⛩⛪⛰-⛵⛷-⛺⛽✂✅✈-✍✏✒✔✖✝✡✨✳✴❄❇❌❎❓-❕❗❣❤➕-➗➡➰➿⤴⤵⬅-⬇⬛⬜⭐⭕〰〽㊗㊙🀄🃏🅰🅱🅾🅿🆎🆑-🆚🈁🈂🈚🈯🈲-🈺" - + "🉐🉑🌀-🌡🌤-🎓🎖🎗🎙-🎛🎞-🏰🏳-🏵🏷-📽📿-🔽🕉-🕎🕐-🕧🕯🕰🕳-🕹🖇🖊-🖍🖐🖕🖖🖥🖨🖱🖲🖼🗂-🗄🗑-🗓🗜-🗞🗡🗣🗯🗳🗺-🙏🚀-🛅🛋-🛐🛠-🛥🛩🛫🛬🛰🛳🤐-🤘🦀-🦄🧀]").freeze(); - public static final UnicodeSet KEYCAPS = new UnicodeSet("[{#⃣}{*⃣}{0⃣}{1⃣}{2⃣}{3⃣}{4⃣}{5⃣}{6⃣}{7⃣}{8⃣}{9⃣}]").freeze(); - public static final UnicodeSet FLAGS = new UnicodeSet("[{🇦🇨}" - + "{🇦🇩}{🇦🇪}{🇦🇫}{🇦🇬}{🇦🇮}{🇦🇱}{🇦🇲}{🇦🇴}{🇦🇶}{🇦🇷}{🇦🇸}{🇦🇹}{🇦🇺}{🇦🇼}{🇦🇽}{🇦🇿}{🇧🇦}{🇧🇧}{🇧🇩}{🇧🇪}{🇧🇫}{🇧🇬}{🇧🇭}{🇧🇮}{🇧🇯}{🇧🇱}{🇧🇲}{🇧🇳}{🇧🇴}{🇧🇶}{🇧🇷}{🇧🇸}" - + "{🇧🇹}{🇧🇻}{🇧🇼}{🇧🇾}{🇧🇿}{🇨🇦}{🇨🇨}{🇨🇩}{🇨🇫}{🇨🇬}{🇨🇭}{🇨🇮}{🇨🇰}{🇨🇱}{🇨🇲}{🇨🇳}{🇨🇴}{🇨🇵}{🇨🇷}{🇨🇺}{🇨🇻}{🇨🇼}{🇨🇽}{🇨🇾}{🇨🇿}{🇩🇪}{🇩🇬}{🇩🇯}{🇩🇰}{🇩🇲}{🇩🇴}" - + "{🇩🇿}{🇪🇦}{🇪🇨}{🇪🇪}{🇪🇬}{🇪🇭}{🇪🇷}{🇪🇸}{🇪🇹}{🇪🇺}{🇫🇮}{🇫🇯}{🇫🇰}{🇫🇲}{🇫🇴}{🇫🇷}{🇬🇦}{🇬🇧}{🇬🇩}{🇬🇪}{🇬🇫}{🇬🇬}{🇬🇭}{🇬🇮}{🇬🇱}{🇬🇲}{🇬🇳}{🇬🇵}{🇬🇶}{🇬🇷}" - + "{🇬🇸}{🇬🇹}{🇬🇺}{🇬🇼}{🇬🇾}{🇭🇰}{🇭🇲}{🇭🇳}{🇭🇷}{🇭🇹}{🇭🇺}{🇮🇨}{🇮🇩}{🇮🇪}{🇮🇱}{🇮🇲}{🇮🇳}{🇮🇴}{🇮🇶}{🇮🇷}{🇮🇸}{🇮🇹}{🇯🇪}{🇯🇲}{🇯🇴}{🇯🇵}{🇰🇪}{🇰🇬}{🇰🇭}{🇰🇮}{🇰🇲}" - + "{🇰🇳}{🇰🇵}{🇰🇷}{🇰🇼}{🇰🇾}{🇰🇿}{🇱🇦}{🇱🇧}{🇱🇨}{🇱🇮}{🇱🇰}{🇱🇷}{🇱🇸}{🇱🇹}{🇱🇺}{🇱🇻}{🇱🇾}{🇲🇦}{🇲🇨}{🇲🇩}{🇲🇪}{🇲🇫}{🇲🇬}{🇲🇭}{🇲🇰}{🇲🇱}{🇲🇲}{🇲🇳}{🇲🇴}{🇲🇵}{🇲🇶}{🇲🇷}{🇲🇸}" - + "{🇲🇹}{🇲🇺}{🇲🇻}{🇲🇼}{🇲🇽}{🇲🇾}{🇲🇿}{🇳🇦}{🇳🇨}{🇳🇪}{🇳🇫}{🇳🇬}{🇳🇮}{🇳🇱}{🇳🇴}{🇳🇵}{🇳🇷}{🇳🇺}{🇳🇿}{🇴🇲}{🇵🇦}{🇵🇪}{🇵🇫}{🇵🇬}{🇵🇭}{🇵🇰}{🇵🇱}{🇵🇲}{🇵🇳}{🇵🇷}{🇵🇸}" - + "{🇵🇹}{🇵🇼}{🇵🇾}{🇶🇦}{🇷🇪}{🇷🇴}{🇷🇸}{🇷🇺}{🇷🇼}{🇸🇦}{🇸🇧}{🇸🇨}{🇸🇩}{🇸🇪}{🇸🇬}{🇸🇭}{🇸🇮}{🇸🇯}{🇸🇰}{🇸🇱}{🇸🇲}{🇸🇳}{🇸🇴}{🇸🇷}{🇸🇸}{🇸🇹}{🇸🇻}{🇸🇽}{🇸🇾}{🇸🇿}{🇹🇦}{🇹🇨}" - + 
"{🇹🇩}{🇹🇫}{🇹🇬}{🇹🇭}{🇹🇯}{🇹🇰}{🇹🇱}{🇹🇲}{🇹🇳}{🇹🇴}{🇹🇷}{🇹🇹}{🇹🇻}{🇹🇼}{🇹🇿}{🇺🇦}{🇺🇬}{🇺🇲}{🇺🇸}{🇺🇾}{🇺🇿}{🇻🇦}{🇻🇨}{🇻🇪}{🇻🇬}{🇻🇮}{🇻🇳}{🇻🇺}{🇼🇫}" - + "{🇼🇸}{🇽🇰}{🇾🇪}{🇾🇹}{🇿🇦}{🇿🇲}{🇿🇼}]").freeze(); - public static final UnicodeSet GROUPS = new UnicodeSet("[💏 💑 👪 {👨‍❤️‍👨}{👨‍❤️‍💋‍👨}{👨‍👨‍👦}{👨‍👨‍👦‍👦}{👨‍👨‍👧}{👨‍👨‍👧‍👦}{👨‍👨‍👧‍👧}{👨‍👩‍👦}{👨‍👩‍👦‍👦}{👨‍👩‍👧}{👨‍👩‍👧‍👦}{👨‍👩‍👧‍👧}{👩‍❤️‍👩}{👩‍❤️‍💋‍👩}{👩‍👩‍👦}{👩‍👩‍👦‍👦}{👩‍👩‍👧}{👩‍👩‍👧‍👦}{👩‍👩‍👧‍👧}]").freeze(); - public static final UnicodeSet PRIMARY = new UnicodeSet("[🎅👦-👩👮👰-👸👼💁💂💆💇🙅-🙇🙋🙍🙎" - + "{🎅🏻}{🎅🏼}{🎅🏽}{🎅🏾}{🎅🏿}{👦🏻}{👦🏼}{👦🏽}{👦🏾}{👦🏿}{👧🏻}{👧🏼}{👧🏽}{👧🏾}{👧🏿}{👨🏻}{👨🏼}{👨🏽}{👨🏾}{👨🏿}{👩🏻}{👩🏼}{👩🏽}{👩🏾}{👩🏿}{👮🏻}{👮🏼}{👮🏽}{👮🏾}{👮🏿}{👰🏻}{👰🏼}{👰🏽}{👰🏾}{👰🏿}{👱🏻}{👱🏼}{👱🏽}{👱🏾}{👱🏿}{👲🏻}{👲🏼}{👲🏽}{👲🏾}{👲🏿}{👳🏻}{👳🏼}{👳🏽}{👳🏾}{👳🏿}{👴🏻}{👴🏼}{👴🏽}{👴🏾}{👴🏿}{👵🏻}{👵🏼}{👵🏽}{👵🏾}{👵🏿}{👶🏻}{👶🏼}{👶🏽}{👶🏾}{👶🏿}{👷🏻}{👷🏼}{👷🏽}{👷🏾}{👷🏿}{👸🏻}{👸🏼}{👸🏽}{👸🏾}{👸🏿}{👼🏻}{👼🏼}{👼🏽}{👼🏾}{👼🏿}{💁🏻}{💁🏼}{💁🏽}{💁🏾}{💁🏿}{💂🏻}{💂🏼}{💂🏽}{💂🏾}{💂🏿}{💆🏻}{💆🏼}{💆🏽}{💆🏾}{💆🏿}{💇🏻}{💇🏼}{💇🏽}{💇🏾}{💇🏿}{🙅🏻}{🙅🏼}{🙅🏽}{🙅🏾}{🙅🏿}{🙆🏻}{🙆🏼}{🙆🏽}{🙆🏾}{🙆🏿}{🙇🏻}{🙇🏼}{🙇🏽}{🙇🏾}{🙇🏿}{🙋🏻}{🙋🏼}{🙋🏽}{🙋🏾}{🙋🏿}{🙍🏻}{🙍🏼}{🙍🏽}{🙍🏾}{🙍🏿}{🙎🏻}{🙎🏼}{🙎🏽}{🙎🏾}{🙎🏿}]").freeze(); - public static final UnicodeSet FACE = new UnicodeSet("[☺ ☹ 🙁 🙂 😀-😆 😉-😷 😇 😈 👿 🙃 🙄 🤐-🤕 🤗]").freeze(); - // + 
"{☹🏻}{☹🏼}{☹🏽}{☹🏾}{☹🏿}{☺🏻}{☺🏼}{☺🏽}{☺🏾}{☺🏿}{👿🏻}{👿🏼}{👿🏽}{👿🏾}{👿🏿}{😀🏻}{😀🏼}{😀🏽}{😀🏾}{😀🏿}{😁🏻}{😁🏼}{😁🏽}{😁🏾}{😁🏿}{😂🏻}{😂🏼}{😂🏽}{😂🏾}{😂🏿}{😃🏻}{😃🏼}{😃🏽}{😃🏾}{😃🏿}{😄🏻}{😄🏼}{😄🏽}{😄🏾}{😄🏿}{😅🏻}{😅🏼}{😅🏽}{😅🏾}{😅🏿}{😆🏻}{😆🏼}{😆🏽}{😆🏾}{😆🏿}{😇🏻}{😇🏼}{😇🏽}{😇🏾}{😇🏿}{😈🏻}{😈🏼}{😈🏽}{😈🏾}{😈🏿}{😉🏻}{😉🏼}{😉🏽}{😉🏾}{😉🏿}{😊🏻}{😊🏼}{😊🏽}{😊🏾}{😊🏿}{😋🏻}{😋🏼}{😋🏽}{😋🏾}{😋🏿}{😌🏻}{😌🏼}{😌🏽}{😌🏾}{😌🏿}{😍🏻}{😍🏼}{😍🏽}{😍🏾}{😍🏿}{😎🏻}{😎🏼}{😎🏽}{😎🏾}{😎🏿}{😏🏻}{😏🏼}{😏🏽}{😏🏾}{😏🏿}{😐🏻}{😐🏼}{😐🏽}{😐🏾}{😐🏿}{😑🏻}{😑🏼}{😑🏽}{😑🏾}{😑🏿}{😒🏻}{😒🏼}{😒🏽}{😒🏾}{😒🏿}{😓🏻}{😓🏼}{😓🏽}{😓🏾}{😓🏿}{😔🏻}{😔🏼}{😔🏽}{😔🏾}{😔🏿}{😕🏻}{😕🏼}{😕🏽}{😕🏾}{😕🏿}{😖🏻}{😖🏼}{😖🏽}{😖🏾}{😖🏿}{😗🏻}{😗🏼}{😗🏽}{😗🏾}{😗🏿}{😘🏻}{😘🏼}{😘🏽}{😘🏾}{😘🏿}{😙🏻}{😙🏼}{😙🏽}{😙🏾}{😙🏿}{😚🏻}{😚🏼}{😚🏽}{😚🏾}{😚🏿}{😛🏻}{😛🏼}{😛🏽}{😛🏾}{😛🏿}{😜🏻}{😜🏼}{😜🏽}{😜🏾}{😜🏿}{😝🏻}{😝🏼}{😝🏽}{😝🏾}{😝🏿}{😞🏻}{😞🏼}{😞🏽}{😞🏾}{😞🏿}{😟🏻}{😟🏼}{😟🏽}{😟🏾}{😟🏿}{😠🏻}{😠🏼}{😠🏽}{😠🏾}{😠🏿}{😡🏻}{😡🏼}{😡🏽}{😡🏾}{😡🏿}{😢🏻}{😢🏼}{😢🏽}{😢🏾}{😢🏿}{😣🏻}{😣🏼}{😣🏽}{😣🏾}{😣🏿}{😤🏻}{😤🏼}{😤🏽}{😤🏾}{😤🏿}{😥🏻}{😥🏼}{😥🏽}{😥🏾}{😥🏿}{😦🏻}{😦🏼}{😦🏽}{😦🏾}{😦🏿}{😧🏻}{😧🏼}{😧🏽}{😧🏾}{😧🏿}{😨🏻}{😨🏼}{😨🏽}{😨🏾}{😨🏿}{😩🏻}{😩🏼}{😩🏽}{😩🏾}{😩🏿}{😪🏻}{😪🏼}{😪🏽}{😪🏾}{😪🏿}{😫🏻}{😫🏼}{😫🏽}{😫🏾}{😫🏿}{😬🏻}{😬🏼}{😬🏽}{😬🏾}{😬🏿}{😭🏻}{😭🏼}{😭🏽}{😭🏾}{😭🏿}{😮🏻}{😮🏼}{😮🏽}{😮🏾}{😮🏿}{😯🏻}{😯🏼}{😯🏽}{😯🏾}{😯🏿}{😰🏻}{😰🏼}{😰🏽}{😰🏾}{😰🏿}{😱🏻}{😱🏼}{😱🏽}{😱🏾}{😱🏿}{😲🏻}{😲🏼}{😲🏽}{😲🏾}{😲🏿}{😳🏻}{😳🏼}{😳🏽}{😳🏾}{😳🏿}{😴🏻}{😴🏼}{😴🏽}{😴🏾}{😴🏿}{😵🏻}{😵🏼}{😵🏽}{😵🏾}{😵🏿}{😶🏻}{😶🏼}{😶🏽}{😶🏾}{😶🏿}{😷🏻}{😷🏼}{😷🏽}{😷🏾}{😷🏿}{🙁🏻}{🙁🏼}{🙁🏽}{🙁🏾}{🙁🏿}{🙂🏻}{🙂🏼}{🙂🏽}{🙂🏾}{🙂🏿}{🙃🏻}{🙃🏼}{🙃🏽}{🙃🏾}{🙃🏿}{🙄🏻}{🙄🏼}{🙄🏽}{🙄🏾}{🙄🏿}{🤐🏻}{🤐🏼}{🤐🏽}{🤐🏾}{🤐🏿}{🤑🏻}{🤑🏼}{🤑🏽}{🤑🏾}{🤑🏿}{🤒🏻}{🤒🏼}{🤒🏽}{🤒🏾}{🤒🏿}{🤓🏻}{🤓🏼}{🤓🏽}{🤓🏾}{🤓🏿}{🤔🏻}{🤔🏼}{🤔🏽}{🤔🏾}{🤔🏿}{🤕🏻}{🤕🏼}{🤕🏽}{🤕🏾}{🤕🏿}{🤗🏻}{🤗🏼}{🤗🏽}{🤗🏾}{🤗🏿}]").freeze(); - public static final UnicodeSet SECONDARY = new UnicodeSet("[☝✊-✍🏂-🏄🏇🏊👂👃👆-👐💃💅💪🖐🖕 🖖🙌🙏🚣🚴-🚶🛀🤘" - + 
"{☝🏻}{☝🏼}{☝🏽}{☝🏾}{☝🏿}{✊🏻}{✊🏼}{✊🏽}{✊🏾}{✊🏿}{✋🏻}{✋🏼}{✋🏽}{✋🏾}{✋🏿}{✌🏻}{✌🏼}{✌🏽}{✌🏾}{✌🏿}{✍🏻}{✍🏼}{✍🏽}{✍🏾}{✍🏿}{🏂🏻}{🏂🏼}{🏂🏽}{🏂🏾}{🏂🏿}{🏃🏻}{🏃🏼}{🏃🏽}{🏃🏾}{🏃🏿}{🏄🏻}{🏄🏼}{🏄🏽}{🏄🏾}{🏄🏿}{🏇🏻}{🏇🏼}{🏇🏽}{🏇🏾}{🏇🏿}{🏊🏻}{🏊🏼}{🏊🏽}{🏊🏾}{🏊🏿}{👂🏻}{👂🏼}{👂🏽}{👂🏾}{👂🏿}{👃🏻}{👃🏼}{👃🏽}{👃🏾}{👃🏿}{👆🏻}{👆🏼}{👆🏽}{👆🏾}{👆🏿}{👇🏻}{👇🏼}{👇🏽}{👇🏾}{👇🏿}{👈🏻}{👈🏼}{👈🏽}{👈🏾}{👈🏿}{👉🏻}{👉🏼}{👉🏽}{👉🏾}{👉🏿}{👊🏻}{👊🏼}{👊🏽}{👊🏾}{👊🏿}{👋🏻}{👋🏼}{👋🏽}{👋🏾}{👋🏿}{👌🏻}{👌🏼}{👌🏽}{👌🏾}{👌🏿}{👍🏻}{👍🏼}{👍🏽}{👍🏾}{👍🏿}{👎🏻}{👎🏼}{👎🏽}{👎🏾}{👎🏿}{👏🏻}{👏🏼}{👏🏽}{👏🏾}{👏🏿}{👐🏻}{👐🏼}{👐🏽}{👐🏾}{👐🏿}{💃🏻}{💃🏼}{💃🏽}{💃🏾}{💃🏿}{💅🏻}{💅🏼}{💅🏽}{💅🏾}{💅🏿}{💪🏻}{💪🏼}{💪🏽}{💪🏾}{💪🏿}{🖐🏻}{🖐🏼}{🖐🏽}{🖐🏾}{🖐🏿}{🖕🏻}{🖕🏼}{🖕🏽}{🖕🏾}{🖕🏿}{🖖🏻}{🖖🏼}{🖖🏽}{🖖🏾}{🖖🏿}{🙌🏻}{🙌🏼}{🙌🏽}{🙌🏾}{🙌🏿}{🙏🏻}{🙏🏼}{🙏🏽}{🙏🏾}{🙏🏿}{🚣🏻}{🚣🏼}{🚣🏽}{🚣🏾}{🚣🏿}{🚴🏻}{🚴🏼}{🚴🏽}{🚴🏾}{🚴🏿}{🚵🏻}{🚵🏼}{🚵🏽}{🚵🏾}{🚵🏿}{🚶🏻}{🚶🏼}{🚶🏽}{🚶🏾}{🚶🏿}{🛀🏻}{🛀🏼}{🛀🏽}{🛀🏾}{🛀🏿}{🤘🏻}{🤘🏼}{🤘🏽}{🤘🏾}{🤘🏿}]").freeze(); - static final UnicodeSet MODIFIERS = new UnicodeSet(0x1F3FB,0x1F3FF).freeze(); - static final UnicodeSet REGIONALS = new UnicodeSet(0x1F1E6,0x1F1FF).freeze(); + public static final UnicodeSet SINGLETONS = + new UnicodeSet( + "[©®‼⁉™ℹ↔-↙↩↪⌚⌛⌨⏏⏩-⏳⏸-⏺Ⓜ▪▫▶◀◻-◾☀-☄☎☑☔☕☘☝☠☢☣☦☪☮☯☸-☺♈-♓♠♣♥♦♨♻♿⚒-⚔⚖⚗⚙⚛⚜⚠⚡" + + "⚪⚫⚰⚱⚽⚾⛄⛅⛈⛎⛏⛑⛓⛔⛩⛪⛰-⛵⛷-⛺⛽✂✅✈-✍✏✒✔✖✝✡✨✳✴❄❇❌❎❓-❕❗❣❤➕-➗➡➰➿⤴⤵⬅-⬇⬛⬜⭐⭕〰〽㊗㊙🀄🃏🅰🅱🅾🅿🆎🆑-🆚🈁🈂🈚🈯🈲-🈺" + + "🉐🉑🌀-🌡🌤-🎓🎖🎗🎙-🎛🎞-🏰🏳-🏵🏷-📽📿-🔽🕉-🕎🕐-🕧🕯🕰🕳-🕹🖇🖊-🖍🖐🖕🖖🖥🖨🖱🖲🖼🗂-🗄🗑-🗓🗜-🗞🗡🗣🗯🗳🗺-🙏🚀-🛅🛋-🛐🛠-🛥🛩🛫🛬🛰🛳🤐-🤘🦀-🦄🧀]") + .freeze(); + public static final UnicodeSet KEYCAPS = + new UnicodeSet("[{#⃣}{*⃣}{0⃣}{1⃣}{2⃣}{3⃣}{4⃣}{5⃣}{6⃣}{7⃣}{8⃣}{9⃣}]").freeze(); + public static final UnicodeSet FLAGS = + new UnicodeSet( + "[{🇦🇨}" + + "{🇦🇩}{🇦🇪}{🇦🇫}{🇦🇬}{🇦🇮}{🇦🇱}{🇦🇲}{🇦🇴}{🇦🇶}{🇦🇷}{🇦🇸}{🇦🇹}{🇦🇺}{🇦🇼}{🇦🇽}{🇦🇿}{🇧🇦}{🇧🇧}{🇧🇩}{🇧🇪}{🇧🇫}{🇧🇬}{🇧🇭}{🇧🇮}{🇧🇯}{🇧🇱}{🇧🇲}{🇧🇳}{🇧🇴}{🇧🇶}{🇧🇷}{🇧🇸}" + + "{🇧🇹}{🇧🇻}{🇧🇼}{🇧🇾}{🇧🇿}{🇨🇦}{🇨🇨}{🇨🇩}{🇨🇫}{🇨🇬}{🇨🇭}{🇨🇮}{🇨🇰}{🇨🇱}{🇨🇲}{🇨🇳}{🇨🇴}{🇨🇵}{🇨🇷}{🇨🇺}{🇨🇻}{🇨🇼}{🇨🇽}{🇨🇾}{🇨🇿}{🇩🇪}{🇩🇬}{🇩🇯}{🇩🇰}{🇩🇲}{🇩🇴}" + + "{🇩🇿}{🇪🇦}{🇪🇨}{🇪🇪}{🇪🇬}{🇪🇭}{🇪🇷}{🇪🇸}{🇪🇹}{🇪🇺}{🇫🇮}{🇫🇯}{🇫🇰}{🇫🇲}{🇫🇴}{🇫🇷}{🇬🇦}{🇬🇧}{🇬🇩}{🇬🇪}{🇬🇫}{🇬🇬}{🇬🇭}{🇬🇮}{🇬🇱}{🇬🇲}{🇬🇳}{🇬🇵}{🇬🇶}{🇬🇷}" + + "{🇬🇸}{🇬🇹}{🇬🇺}{🇬🇼}{🇬🇾}{🇭🇰}{🇭🇲}{🇭🇳}{🇭🇷}{🇭🇹}{🇭🇺}{🇮🇨}{🇮🇩}{🇮🇪}{🇮🇱}{🇮🇲}{🇮🇳}{🇮🇴}{🇮🇶}{🇮🇷}{🇮🇸}{🇮🇹}{🇯🇪}{🇯🇲}{🇯🇴}{🇯🇵}{🇰🇪}{🇰🇬}{🇰🇭}{🇰🇮}{🇰🇲}" + + 
"{🇰🇳}{🇰🇵}{🇰🇷}{🇰🇼}{🇰🇾}{🇰🇿}{🇱🇦}{🇱🇧}{🇱🇨}{🇱🇮}{🇱🇰}{🇱🇷}{🇱🇸}{🇱🇹}{🇱🇺}{🇱🇻}{🇱🇾}{🇲🇦}{🇲🇨}{🇲🇩}{🇲🇪}{🇲🇫}{🇲🇬}{🇲🇭}{🇲🇰}{🇲🇱}{🇲🇲}{🇲🇳}{🇲🇴}{🇲🇵}{🇲🇶}{🇲🇷}{🇲🇸}" + + "{🇲🇹}{🇲🇺}{🇲🇻}{🇲🇼}{🇲🇽}{🇲🇾}{🇲🇿}{🇳🇦}{🇳🇨}{🇳🇪}{🇳🇫}{🇳🇬}{🇳🇮}{🇳🇱}{🇳🇴}{🇳🇵}{🇳🇷}{🇳🇺}{🇳🇿}{🇴🇲}{🇵🇦}{🇵🇪}{🇵🇫}{🇵🇬}{🇵🇭}{🇵🇰}{🇵🇱}{🇵🇲}{🇵🇳}{🇵🇷}{🇵🇸}" + + "{🇵🇹}{🇵🇼}{🇵🇾}{🇶🇦}{🇷🇪}{🇷🇴}{🇷🇸}{🇷🇺}{🇷🇼}{🇸🇦}{🇸🇧}{🇸🇨}{🇸🇩}{🇸🇪}{🇸🇬}{🇸🇭}{🇸🇮}{🇸🇯}{🇸🇰}{🇸🇱}{🇸🇲}{🇸🇳}{🇸🇴}{🇸🇷}{🇸🇸}{🇸🇹}{🇸🇻}{🇸🇽}{🇸🇾}{🇸🇿}{🇹🇦}{🇹🇨}" + + "{🇹🇩}{🇹🇫}{🇹🇬}{🇹🇭}{🇹🇯}{🇹🇰}{🇹🇱}{🇹🇲}{🇹🇳}{🇹🇴}{🇹🇷}{🇹🇹}{🇹🇻}{🇹🇼}{🇹🇿}{🇺🇦}{🇺🇬}{🇺🇲}{🇺🇸}{🇺🇾}{🇺🇿}{🇻🇦}{🇻🇨}{🇻🇪}{🇻🇬}{🇻🇮}{🇻🇳}{🇻🇺}{🇼🇫}" + + "{🇼🇸}{🇽🇰}{🇾🇪}{🇾🇹}{🇿🇦}{🇿🇲}{🇿🇼}]") + .freeze(); + public static final UnicodeSet GROUPS = + new UnicodeSet( + "[💏 💑 👪 {👨‍❤️‍👨}{👨‍❤️‍💋‍👨}{👨‍👨‍👦}{👨‍👨‍👦‍👦}{👨‍👨‍👧}{👨‍👨‍👧‍👦}{👨‍👨‍👧‍👧}{👨‍👩‍👦}{👨‍👩‍👦‍👦}{👨‍👩‍👧}{👨‍👩‍👧‍👦}{👨‍👩‍👧‍👧}{👩‍❤️‍👩}{👩‍❤️‍💋‍👩}{👩‍👩‍👦}{👩‍👩‍👦‍👦}{👩‍👩‍👧}{👩‍👩‍👧‍👦}{👩‍👩‍👧‍👧}]") + .freeze(); + public static final UnicodeSet PRIMARY = + new UnicodeSet( + "[🎅👦-👩👮👰-👸👼💁💂💆💇🙅-🙇🙋🙍🙎" + + "{🎅🏻}{🎅🏼}{🎅🏽}{🎅🏾}{🎅🏿}{👦🏻}{👦🏼}{👦🏽}{👦🏾}{👦🏿}{👧🏻}{👧🏼}{👧🏽}{👧🏾}{👧🏿}{👨🏻}{👨🏼}{👨🏽}{👨🏾}{👨🏿}{👩🏻}{👩🏼}{👩🏽}{👩🏾}{👩🏿}{👮🏻}{👮🏼}{👮🏽}{👮🏾}{👮🏿}{👰🏻}{👰🏼}{👰🏽}{👰🏾}{👰🏿}{👱🏻}{👱🏼}{👱🏽}{👱🏾}{👱🏿}{👲🏻}{👲🏼}{👲🏽}{👲🏾}{👲🏿}{👳🏻}{👳🏼}{👳🏽}{👳🏾}{👳🏿}{👴🏻}{👴🏼}{👴🏽}{👴🏾}{👴🏿}{👵🏻}{👵🏼}{👵🏽}{👵🏾}{👵🏿}{👶🏻}{👶🏼}{👶🏽}{👶🏾}{👶🏿}{👷🏻}{👷🏼}{👷🏽}{👷🏾}{👷🏿}{👸🏻}{👸🏼}{👸🏽}{👸🏾}{👸🏿}{👼🏻}{👼🏼}{👼🏽}{👼🏾}{👼🏿}{💁🏻}{💁🏼}{💁🏽}{💁🏾}{💁🏿}{💂🏻}{💂🏼}{💂🏽}{💂🏾}{💂🏿}{💆🏻}{💆🏼}{💆🏽}{💆🏾}{💆🏿}{💇🏻}{💇🏼}{💇🏽}{💇🏾}{💇🏿}{🙅🏻}{🙅🏼}{🙅🏽}{🙅🏾}{🙅🏿}{🙆🏻}{🙆🏼}{🙆🏽}{🙆🏾}{🙆🏿}{🙇🏻}{🙇🏼}{🙇🏽}{🙇🏾}{🙇🏿}{🙋🏻}{🙋🏼}{🙋🏽}{🙋🏾}{🙋🏿}{🙍🏻}{🙍🏼}{🙍🏽}{🙍🏾}{🙍🏿}{🙎🏻}{🙎🏼}{🙎🏽}{🙎🏾}{🙎🏿}]") + .freeze(); + public static final UnicodeSet FACE = + new UnicodeSet("[☺ ☹ 🙁 🙂 😀-😆 😉-😷 😇 😈 👿 🙃 🙄 🤐-🤕 🤗]").freeze(); + // + + // 
"{☹🏻}{☹🏼}{☹🏽}{☹🏾}{☹🏿}{☺🏻}{☺🏼}{☺🏽}{☺🏾}{☺🏿}{👿🏻}{👿🏼}{👿🏽}{👿🏾}{👿🏿}{😀🏻}{😀🏼}{😀🏽}{😀🏾}{😀🏿}{😁🏻}{😁🏼}{😁🏽}{😁🏾}{😁🏿}{😂🏻}{😂🏼}{😂🏽}{😂🏾}{😂🏿}{😃🏻}{😃🏼}{😃🏽}{😃🏾}{😃🏿}{😄🏻}{😄🏼}{😄🏽}{😄🏾}{😄🏿}{😅🏻}{😅🏼}{😅🏽}{😅🏾}{😅🏿}{😆🏻}{😆🏼}{😆🏽}{😆🏾}{😆🏿}{😇🏻}{😇🏼}{😇🏽}{😇🏾}{😇🏿}{😈🏻}{😈🏼}{😈🏽}{😈🏾}{😈🏿}{😉🏻}{😉🏼}{😉🏽}{😉🏾}{😉🏿}{😊🏻}{😊🏼}{😊🏽}{😊🏾}{😊🏿}{😋🏻}{😋🏼}{😋🏽}{😋🏾}{😋🏿}{😌🏻}{😌🏼}{😌🏽}{😌🏾}{😌🏿}{😍🏻}{😍🏼}{😍🏽}{😍🏾}{😍🏿}{😎🏻}{😎🏼}{😎🏽}{😎🏾}{😎🏿}{😏🏻}{😏🏼}{😏🏽}{😏🏾}{😏🏿}{😐🏻}{😐🏼}{😐🏽}{😐🏾}{😐🏿}{😑🏻}{😑🏼}{😑🏽}{😑🏾}{😑🏿}{😒🏻}{😒🏼}{😒🏽}{😒🏾}{😒🏿}{😓🏻}{😓🏼}{😓🏽}{😓🏾}{😓🏿}{😔🏻}{😔🏼}{😔🏽}{😔🏾}{😔🏿}{😕🏻}{😕🏼}{😕🏽}{😕🏾}{😕🏿}{😖🏻}{😖🏼}{😖🏽}{😖🏾}{😖🏿}{😗🏻}{😗🏼}{😗🏽}{😗🏾}{😗🏿}{😘🏻}{😘🏼}{😘🏽}{😘🏾}{😘🏿}{😙🏻}{😙🏼}{😙🏽}{😙🏾}{😙🏿}{😚🏻}{😚🏼}{😚🏽}{😚🏾}{😚🏿}{😛🏻}{😛🏼}{😛🏽}{😛🏾}{😛🏿}{😜🏻}{😜🏼}{😜🏽}{😜🏾}{😜🏿}{😝🏻}{😝🏼}{😝🏽}{😝🏾}{😝🏿}{😞🏻}{😞🏼}{😞🏽}{😞🏾}{😞🏿}{😟🏻}{😟🏼}{😟🏽}{😟🏾}{😟🏿}{😠🏻}{😠🏼}{😠🏽}{😠🏾}{😠🏿}{😡🏻}{😡🏼}{😡🏽}{😡🏾}{😡🏿}{😢🏻}{😢🏼}{😢🏽}{😢🏾}{😢🏿}{😣🏻}{😣🏼}{😣🏽}{😣🏾}{😣🏿}{😤🏻}{😤🏼}{😤🏽}{😤🏾}{😤🏿}{😥🏻}{😥🏼}{😥🏽}{😥🏾}{😥🏿}{😦🏻}{😦🏼}{😦🏽}{😦🏾}{😦🏿}{😧🏻}{😧🏼}{😧🏽}{😧🏾}{😧🏿}{😨🏻}{😨🏼}{😨🏽}{😨🏾}{😨🏿}{😩🏻}{😩🏼}{😩🏽}{😩🏾}{😩🏿}{😪🏻}{😪🏼}{😪🏽}{😪🏾}{😪🏿}{😫🏻}{😫🏼}{😫🏽}{😫🏾}{😫🏿}{😬🏻}{😬🏼}{😬🏽}{😬🏾}{😬🏿}{😭🏻}{😭🏼}{😭🏽}{😭🏾}{😭🏿}{😮🏻}{😮🏼}{😮🏽}{😮🏾}{😮🏿}{😯🏻}{😯🏼}{😯🏽}{😯🏾}{😯🏿}{😰🏻}{😰🏼}{😰🏽}{😰🏾}{😰🏿}{😱🏻}{😱🏼}{😱🏽}{😱🏾}{😱🏿}{😲🏻}{😲🏼}{😲🏽}{😲🏾}{😲🏿}{😳🏻}{😳🏼}{😳🏽}{😳🏾}{😳🏿}{😴🏻}{😴🏼}{😴🏽}{😴🏾}{😴🏿}{😵🏻}{😵🏼}{😵🏽}{😵🏾}{😵🏿}{😶🏻}{😶🏼}{😶🏽}{😶🏾}{😶🏿}{😷🏻}{😷🏼}{😷🏽}{😷🏾}{😷🏿}{🙁🏻}{🙁🏼}{🙁🏽}{🙁🏾}{🙁🏿}{🙂🏻}{🙂🏼}{🙂🏽}{🙂🏾}{🙂🏿}{🙃🏻}{🙃🏼}{🙃🏽}{🙃🏾}{🙃🏿}{🙄🏻}{🙄🏼}{🙄🏽}{🙄🏾}{🙄🏿}{🤐🏻}{🤐🏼}{🤐🏽}{🤐🏾}{🤐🏿}{🤑🏻}{🤑🏼}{🤑🏽}{🤑🏾}{🤑🏿}{🤒🏻}{🤒🏼}{🤒🏽}{🤒🏾}{🤒🏿}{🤓🏻}{🤓🏼}{🤓🏽}{🤓🏾}{🤓🏿}{🤔🏻}{🤔🏼}{🤔🏽}{🤔🏾}{🤔🏿}{🤕🏻}{🤕🏼}{🤕🏽}{🤕🏾}{🤕🏿}{🤗🏻}{🤗🏼}{🤗🏽}{🤗🏾}{🤗🏿}]").freeze(); + public static final UnicodeSet SECONDARY = + new UnicodeSet( + "[☝✊-✍🏂-🏄🏇🏊👂👃👆-👐💃💅💪🖐🖕 🖖🙌🙏🚣🚴-🚶🛀🤘" + + 
"{☝🏻}{☝🏼}{☝🏽}{☝🏾}{☝🏿}{✊🏻}{✊🏼}{✊🏽}{✊🏾}{✊🏿}{✋🏻}{✋🏼}{✋🏽}{✋🏾}{✋🏿}{✌🏻}{✌🏼}{✌🏽}{✌🏾}{✌🏿}{✍🏻}{✍🏼}{✍🏽}{✍🏾}{✍🏿}{🏂🏻}{🏂🏼}{🏂🏽}{🏂🏾}{🏂🏿}{🏃🏻}{🏃🏼}{🏃🏽}{🏃🏾}{🏃🏿}{🏄🏻}{🏄🏼}{🏄🏽}{🏄🏾}{🏄🏿}{🏇🏻}{🏇🏼}{🏇🏽}{🏇🏾}{🏇🏿}{🏊🏻}{🏊🏼}{🏊🏽}{🏊🏾}{🏊🏿}{👂🏻}{👂🏼}{👂🏽}{👂🏾}{👂🏿}{👃🏻}{👃🏼}{👃🏽}{👃🏾}{👃🏿}{👆🏻}{👆🏼}{👆🏽}{👆🏾}{👆🏿}{👇🏻}{👇🏼}{👇🏽}{👇🏾}{👇🏿}{👈🏻}{👈🏼}{👈🏽}{👈🏾}{👈🏿}{👉🏻}{👉🏼}{👉🏽}{👉🏾}{👉🏿}{👊🏻}{👊🏼}{👊🏽}{👊🏾}{👊🏿}{👋🏻}{👋🏼}{👋🏽}{👋🏾}{👋🏿}{👌🏻}{👌🏼}{👌🏽}{👌🏾}{👌🏿}{👍🏻}{👍🏼}{👍🏽}{👍🏾}{👍🏿}{👎🏻}{👎🏼}{👎🏽}{👎🏾}{👎🏿}{👏🏻}{👏🏼}{👏🏽}{👏🏾}{👏🏿}{👐🏻}{👐🏼}{👐🏽}{👐🏾}{👐🏿}{💃🏻}{💃🏼}{💃🏽}{💃🏾}{💃🏿}{💅🏻}{💅🏼}{💅🏽}{💅🏾}{💅🏿}{💪🏻}{💪🏼}{💪🏽}{💪🏾}{💪🏿}{🖐🏻}{🖐🏼}{🖐🏽}{🖐🏾}{🖐🏿}{🖕🏻}{🖕🏼}{🖕🏽}{🖕🏾}{🖕🏿}{🖖🏻}{🖖🏼}{🖖🏽}{🖖🏾}{🖖🏿}{🙌🏻}{🙌🏼}{🙌🏽}{🙌🏾}{🙌🏿}{🙏🏻}{🙏🏼}{🙏🏽}{🙏🏾}{🙏🏿}{🚣🏻}{🚣🏼}{🚣🏽}{🚣🏾}{🚣🏿}{🚴🏻}{🚴🏼}{🚴🏽}{🚴🏾}{🚴🏿}{🚵🏻}{🚵🏼}{🚵🏽}{🚵🏾}{🚵🏿}{🚶🏻}{🚶🏼}{🚶🏽}{🚶🏾}{🚶🏿}{🛀🏻}{🛀🏼}{🛀🏽}{🛀🏾}{🛀🏿}{🤘🏻}{🤘🏼}{🤘🏽}{🤘🏾}{🤘🏿}]") + .freeze(); + static final UnicodeSet MODIFIERS = new UnicodeSet(0x1F3FB, 0x1F3FF).freeze(); + static final UnicodeSet REGIONALS = new UnicodeSet(0x1F1E6, 0x1F1FF).freeze(); + + public static final UnicodeSet TAKES_EMOJI_VS = + new UnicodeSet( + "[©®‼⁉™↔-↙↩↪⌚⌛Ⓜ▪▫▶◀◻-◾☀☁☎☑☔☕☝☺♈-♓♠♣♥♦♨♻♿⚓⚠⚡⚪⚫⚽⚾⛄⛅⛔⛪⛲⛳⛵⛺⛽✂✈✉✌✏✒✔✖✳✴❄❇❗❤➡⤴⤵⬅-⬇⬛⬜⭐⭕〰〽㊗㊙🀄🅰🅱🅾🅿🈂🈚🈯🈷]") + .freeze(); + + public static final StringComparator CODEPOINT_ORDER = + new UTF16.StringComparator(true, false, 0); + public static final RuleBasedCollator RAW_COLLATOR = + (RuleBasedCollator) Collator.getInstance(new ULocale("en-u-co-emoji")); - public static final UnicodeSet TAKES_EMOJI_VS = new UnicodeSet("[©®‼⁉™↔-↙↩↪⌚⌛Ⓜ▪▫▶◀◻-◾☀☁☎☑☔☕☝☺♈-♓♠♣♥♦♨♻♿⚓⚠⚡⚪⚫⚽⚾⛄⛅⛔⛪⛲⛳⛵⛺⛽✂✈✉✌✏✒✔✖✳✴❄❇❗❤➡⤴⤵⬅-⬇⬛⬜⭐⭕〰〽㊗㊙🀄🅰🅱🅾🅿🈂🈚🈯🈷]").freeze(); - - public static final StringComparator CODEPOINT_ORDER = new UTF16.StringComparator(true, false,0); - public static final RuleBasedCollator RAW_COLLATOR = (RuleBasedCollator) Collator.getInstance(new ULocale("en-u-co-emoji")); static { RAW_COLLATOR.setNumericCollation(true); RAW_COLLATOR.setCaseLevel(true); RAW_COLLATOR.freeze(); } - public static final Comparator MAIN_COLLATOR = new MultiComparator(RAW_COLLATOR, CODEPOINT_ORDER); + + public static final Comparator MAIN_COLLATOR = + 
new MultiComparator(RAW_COLLATOR, CODEPOINT_ORDER); public static String addEmojiVariation(String s) { StringBuilder b = new StringBuilder(); @@ -79,27 +103,28 @@ public static UnicodeSet parseUnicodeSet(String input) { input = UPLUS.matcher(input).replaceAll("\\\\x{$1}"); input = DOTDOT.matcher(input).replaceAll("-"); -// setA = setA.replace("..U+", "-\\u"); -// setA = setA.replace("U+", "\\u"); + // setA = setA.replace("..U+", "-\\u"); + // setA = setA.replace("U+", "\\u"); input = input.trim() + "]]]]]"; String parseInput = "[" + input + "]]]]]"; ParsePosition parsePosition = new ParsePosition(0); UnicodeSet result = new UnicodeSet(parseInput, parsePosition, fullSymbolTable); int parseEnd = parsePosition.getIndex(); - if (parseEnd != parseInput.length() && !UnicodeSetUtilities.OK_AT_END.containsAll(parseInput.substring(parseEnd))) { + if (parseEnd != parseInput.length() + && !UnicodeSetUtilities.OK_AT_END.containsAll(parseInput.substring(parseEnd))) { parseEnd--; // get input offset - throw new IllegalArgumentException("Additional characters past the end of the set, at " - + parseEnd + ", ..." - + input.substring(Math.max(0, parseEnd - 10), parseEnd) - + "|" - + input.substring(parseEnd, Math.min(input.length(), parseEnd + 10)) - ); + throw new IllegalArgumentException( + "Additional characters past the end of the set, at " + + parseEnd + + ", ..." 
+ + input.substring(Math.max(0, parseEnd - 10), parseEnd) + + "|" + + input.substring(parseEnd, Math.min(input.length(), parseEnd + 10))); } return result; } - static UnicodeSet.XSymbolTable fullSymbolTable = new MySymbolTable(); private static class MySymbolTable extends UnicodeSet.XSymbolTable { @@ -112,7 +137,6 @@ public MySymbolTable() { unicodeRegex = new UnicodeRegex().setSymbolTable(this); } - // public boolean applyPropertyAlias0(String propertyName, // String propertyValue, UnicodeSet result) { // if (!propertyName.contains("*")) { @@ -126,8 +150,8 @@ public MySymbolTable() { // return null; // } - public boolean applyPropertyAlias(String propertyName, - String propertyValue, UnicodeSet result) { + public boolean applyPropertyAlias( + String propertyName, String propertyValue, UnicodeSet result) { boolean status = false; boolean invert = false; if (factory == null) { @@ -141,9 +165,11 @@ public boolean applyPropertyAlias(String propertyName, if (posNotEqual < 0) posNotEqual = propertyName.length(); if (posColon < 0) posColon = propertyName.length(); int opPos = posNotEqual < posColon ? posNotEqual : posColon; - propertyValue = propertyValue.length() == 0 ? propertyName.substring(opPos+1) - : propertyName.substring(opPos+1) + "=" + propertyValue; - propertyName = propertyName.substring(0,opPos); + propertyValue = + propertyValue.length() == 0 + ? 
propertyName.substring(opPos + 1) + : propertyName.substring(opPos + 1) + "=" + propertyValue; + propertyName = propertyName.substring(0, opPos); if (posNotEqual < posColon) { invert = true; } @@ -165,15 +191,21 @@ public boolean applyPropertyAlias(String propertyName, } else { try { status = applyPropertyAlias0(gcProp, propertyName, result, invert); - } catch (Exception e) {}; + } catch (Exception e) { + } + ; if (!status) { try { status = applyPropertyAlias0(scProp, propertyName, result, invert); - } catch (Exception e) {}; + } catch (Exception e) { + } + ; if (!status) { try { status = applyPropertyAlias0(prop, "No", result, !invert); - } catch (Exception e) {}; + } catch (Exception e) { + } + ; if (!status) { status = applyPropertyAlias0(prop, "", result, invert); } @@ -183,20 +215,28 @@ public boolean applyPropertyAlias(String propertyName, return status; } - private boolean applyPropertyAlias0(UnicodeProperty prop, - String propertyValue, UnicodeSet result, boolean invert) { + private boolean applyPropertyAlias0( + UnicodeProperty prop, String propertyValue, UnicodeSet result, boolean invert) { result.clear(); String propertyName = prop.getName(); String trimmedPropertyValue = propertyValue.trim(); PatternMatcher patternMatcher = null; - if (trimmedPropertyValue.length() > 1 && trimmedPropertyValue.startsWith("/") && trimmedPropertyValue.endsWith("/")) { - String fixedRegex = unicodeRegex.transform(trimmedPropertyValue.substring(1, trimmedPropertyValue.length() - 1)); + if (trimmedPropertyValue.length() > 1 + && trimmedPropertyValue.startsWith("/") + && trimmedPropertyValue.endsWith("/")) { + String fixedRegex = + unicodeRegex.transform( + trimmedPropertyValue.substring( + 1, trimmedPropertyValue.length() - 1)); patternMatcher = new UnicodeProperty.RegexMatcher().set(fixedRegex); } UnicodeProperty otherProperty = null; boolean testCp = false; - if (trimmedPropertyValue.length() > 1 && trimmedPropertyValue.startsWith("@") && trimmedPropertyValue.endsWith("@")) 
{ - String otherPropName = trimmedPropertyValue.substring(1, trimmedPropertyValue.length() - 1).trim(); + if (trimmedPropertyValue.length() > 1 + && trimmedPropertyValue.startsWith("@") + && trimmedPropertyValue.endsWith("@")) { + String otherPropName = + trimmedPropertyValue.substring(1, trimmedPropertyValue.length() - 1).trim(); if ("cp".equalsIgnoreCase(otherPropName)) { testCp = true; } else { @@ -224,9 +264,15 @@ private boolean applyPropertyAlias0(UnicodeProperty prop, } } else if (patternMatcher == null) { if (!isValid(prop, propertyValue)) { - throw new IllegalArgumentException("The value '" + propertyValue + "' is illegal. Values for " + propertyName - + " must be in " - + prop.getAvailableValues() + " or in " + prop.getValueAliases()); + throw new IllegalArgumentException( + "The value '" + + propertyValue + + "' is illegal. Values for " + + propertyName + + " must be in " + + prop.getAvailableValues() + + " or in " + + prop.getValueAliases()); } if (isAge) { set = prop.getSet(new ComparisonMatcher(propertyValue, Relation.geq)); @@ -261,21 +307,27 @@ private boolean applyPropertyAlias0(UnicodeProperty prop, throw new IllegalArgumentException("Illegal property: " + propertyName); } - - private boolean isValid(UnicodeProperty prop, String propertyValue) { // if (prop.getName().equals("General_Category")) { // if (propertyValue) // } return prop.isValidValue(propertyValue); } - - }; + } + ; public static class ComparisonMatcher implements PatternMatcher { Relation relation; - enum Relation {less, leq, equal, geq, greater} - static Comparator comparator = new UTF16.StringComparator(true, false,0); + + enum Relation { + less, + leq, + equal, + geq, + greater + } + + static Comparator comparator = new UTF16.StringComparator(true, false, 0); String pattern; @@ -287,11 +339,16 @@ public ComparisonMatcher(String pattern, Relation comparator) { public boolean test(String value) { int comp = comparator.compare(pattern, value.toString()); switch (relation) { - case 
less: return comp < 0; - case leq: return comp <= 0; - default: return comp == 0; - case geq: return comp >= 0; - case greater: return comp > 0; + case less: + return comp < 0; + case leq: + return comp <= 0; + default: + return comp == 0; + case geq: + return comp >= 0; + case greater: + return comp > 0; } } @@ -300,7 +357,4 @@ public PatternMatcher set(String pattern) { return this; } } - - - } diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeUtilities.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeUtilities.java index cb7b288cd..2031372c7 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeUtilities.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UnicodeUtilities.java @@ -1,5 +1,24 @@ package org.unicode.jsp; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Row.R4; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.text.SpoofChecker; +import com.ibm.icu.text.StringTransform; +import com.ibm.icu.text.Transform; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.VersionInfo; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; @@ -21,7 +40,6 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.tool.TablePrinter; import org.unicode.cldr.util.Predicate; import org.unicode.cldr.util.UnicodeSetPrettyPrinter; @@ -31,37 +49,24 @@ import org.unicode.idna.IdnaTypes; import org.unicode.idna.Punycode; import org.unicode.idna.Uts46; -import org.unicode.props.UnicodeProperty.UnicodeMapProperty; import 
org.unicode.props.UnicodeProperty; - -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Row.R4; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.Normalizer2; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.text.SpoofChecker; -import com.ibm.icu.text.StringTransform; -import com.ibm.icu.text.Transform; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.ULocale; -import com.ibm.icu.util.VersionInfo; +import org.unicode.props.UnicodeProperty.UnicodeMapProperty; // For dependency management, it might be useful to split this omnibus class into // pieces by topic, such as collation utilities vs. IDNA utilities etc. public class UnicodeUtilities { - private static final Collator COLLATOR = Collator.getInstance(new ULocale("en-u-co-emoji")); - static final UnicodeSet OFF_LIMITS = new UnicodeSet(UnicodeProperty.getUNASSIGNED()).addAll(UnicodeProperty.PRIVATE_USE).addAll(UnicodeProperty.SURROGATE).freeze(); - static final UnicodeSet NONCHAR = new UnicodeSet(OFF_LIMITS).addAll(new UnicodeSet("[:Cc:]")).removeAll(new UnicodeSet("[:whitespace:]")).freeze(); + static final UnicodeSet OFF_LIMITS = + new UnicodeSet(UnicodeProperty.getUNASSIGNED()) + .addAll(UnicodeProperty.PRIVATE_USE) + .addAll(UnicodeProperty.SURROGATE) + .freeze(); + static final UnicodeSet NONCHAR = + new UnicodeSet(OFF_LIMITS) + .addAll(new UnicodeSet("[:Cc:]")) + .removeAll(new UnicodeSet("[:whitespace:]")) + .freeze(); static { CachedProps cp = CachedProps.CACHED_PROPS; // force load @@ -71,24 +76,33 @@ public class UnicodeUtilities { static Transliterator toHTML; static String HTML_RULES_CONTROLS; - static { - String BASE_RULES = "'<' > '<' ;" + "'<' < 
'&'[lL][Tt]';' ;" - + "'&' > '&' ;" + "'&' < '&'[aA][mM][pP]';' ;" - + "'>' < '&'[gG][tT]';' ;" + "'\"' < '&'[qQ][uU][oO][tT]';' ; " - + "'' < '&'[aA][pP][oO][sS]';' ; "; + static { + String BASE_RULES = + "'<' > '<' ;" + + "'<' < '&'[lL][Tt]';' ;" + + "'&' > '&' ;" + + "'&' < '&'[aA][mM][pP]';' ;" + + "'>' < '&'[gG][tT]';' ;" + + "'\"' < '&'[qQ][uU][oO][tT]';' ; " + + "'' < '&'[aA][pP][oO][sS]';' ; "; String CONTENT_RULES = "'>' > '>' ;"; String HTML_RULES = BASE_RULES + CONTENT_RULES + "'\"' > '"' ; "; - HTML_RULES_CONTROLS = HTML_RULES - + "[[:di:]-[:cc:]-[:cs:]-[\\u200c-\\u200F]] > ; " // remove, should ignore in rendering (but may not be in browser) - + "[[:nchar:][:cn:][:cs:][:co:][:cc:]-[:whitespace:]-[\\u200c-\\u200F]] > \\uFFFD ; "; // should be missing glyph (but may not be in browser) - // + "([[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]-[\\u0020]]) > &hex/xml($1) ; "; // [\\u0080-\\U0010FFFF] - - toHTML = Transliterator.createFromRules("any-xml", HTML_RULES_CONTROLS, - Transliterator.FORWARD); + HTML_RULES_CONTROLS = + HTML_RULES + + "[[:di:]-[:cc:]-[:cs:]-[\\u200c-\\u200F]] > ; " // remove, should ignore + // in rendering (but may + // not be in browser) + + "[[:nchar:][:cn:][:cs:][:co:][:cc:]-[:whitespace:]-[\\u200c-\\u200F]] > \\uFFFD ; "; // should be missing glyph (but may not be in browser) + // + "([[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]-[\\u0020]]) > + // &hex/xml($1) ; "; // [\\u0080-\\U0010FFFF] + + toHTML = + Transliterator.createFromRules( + "any-xml", HTML_RULES_CONTROLS, Transliterator.FORWARD); } public static String toHTML(String input) { @@ -106,9 +120,11 @@ public static String toHTML(String input) { // } // } - public static UnicodeSet IGNORE_IN_IDNA_DIFF = new UnicodeSet("[[\\u0000-\\u007F][:Cc:][:Cn:][:Co:][:Cs:]]").freeze(); + public static UnicodeSet IGNORE_IN_IDNA_DIFF = + new UnicodeSet("[[\\u0000-\\u007F][:Cc:][:Cn:][:Co:][:Cs:]]").freeze(); - public static UnicodeMap getIdnaDifferences(UnicodeSet 
remapped, UnicodeSet overallAllowed) { + public static UnicodeMap getIdnaDifferences( + UnicodeSet remapped, UnicodeSet overallAllowed) { UnicodeMap result = new UnicodeMap(); UnicodeSet valid2008 = Idna2008.getIdna2008Valid(); @@ -122,17 +138,22 @@ public static UnicodeMap getIdnaDifferences(UnicodeSet remapped, Unicode String age = isNew ? "v4.0-5.2" : "v3.2"; IdnaType idna2003 = Idna2003.getIDNA2003Type(i); IdnaType tr46 = Uts46.SINGLETON.getType(i); - if (isNew) {// skip - } else if ((tr46 == IdnaType.mapped || idna2003 == IdnaType.mapped) && tr46 != IdnaType.disallowed && idna2003 != IdnaType.disallowed) { + if (isNew) { // skip + } else if ((tr46 == IdnaType.mapped || idna2003 == IdnaType.mapped) + && tr46 != IdnaType.disallowed + && idna2003 != IdnaType.disallowed) { remapped.add(i); } - //TestStatus testResult = valid2008.contains(i); + // TestStatus testResult = valid2008.contains(i); IdnaType idna2008 = valid2008.contains(i) ? IdnaType.valid : IdnaType.disallowed; - String iClass = age - + "\t" + getShortName(idna2003) - + "\t" + getShortName(tr46) - + "\t" + getShortName(idna2008) - ; + String iClass = + age + + "\t" + + getShortName(idna2003) + + "\t" + + getShortName(tr46) + + "\t" + + getShortName(idna2008); result.put(i, iClass); } return result.freeze(); @@ -141,17 +162,18 @@ public static UnicodeMap getIdnaDifferences(UnicodeSet remapped, Unicode static String getShortName(IdnaType tr46) { // TODO Auto-generated method stub return UCharacter.toTitleCase( - tr46==IdnaType.valid ? "Valid" - : tr46==IdnaType.ignored || tr46==IdnaType.mapped ? "Mapped/Ignored" - : tr46.toString() - , null); + tr46 == IdnaType.valid + ? "Valid" + : tr46 == IdnaType.ignored || tr46 == IdnaType.mapped + ? 
"Mapped/Ignored" + : tr46.toString(), + null); } - - static final UnicodeSet MARK = new UnicodeSet("[:M:]").freeze(); - static String getXStringPropertyValue(int propertyEnum, int codepoint, int nameChoice, Normalizer.Mode compat) { + static String getXStringPropertyValue( + int propertyEnum, int codepoint, int nameChoice, Normalizer.Mode compat) { if (compat == null || Normalizer.isNormalized(codepoint, compat, 0)) { return Common.getXStringPropertyValue(propertyEnum, codepoint, nameChoice); } @@ -173,20 +195,24 @@ static String getXStringPropertyValue(int propertyEnum, int codepoint, int nameC return lastPart; } - static UnicodeSet COMMON_USE_SCRIPTS = new UnicodeSet("[[:script=Zyyy:] [:script=Zinh:] [:script=Arab:] [:script=Armn:]" + - " [:script=Beng:] [:script=Bopo:] [:script=Cans:] [:script=Cyrl:] [:script=Deva:] [:script=Ethi:]" + - " [:script=Geor:] [:script=Grek:] [:script=Gujr:] [:script=Guru:] [:script=Hani:] [:script=Hang:]" + - " [:script=Hebr:] [:script=Hira:] [:script=Knda:] [:script=Kana:] [:script=Khmr:] [:script=Laoo:]" + - " [:script=Latn:] [:script=Mlym:] [:script=Mong:] [:script=Mymr:] [:script=Orya:] [:script=Sinh:] " + - "[:script=Taml:] [:script=Telu:] [:script=Tfng:] [:script=Thaa:] [:script=Thai:] [:script=Tibt:] [:script=Yiii:]]").freeze(); - - static UnicodeSet LITURGICAL = new UnicodeSet("[\u0615\u0617-\u061A\u0671\u06D6-\u06ED\u08F0-\u08F3[:sc=coptic:]" + - "\u1CD0-\u1CF2\u214F]"); + static UnicodeSet COMMON_USE_SCRIPTS = + new UnicodeSet( + "[[:script=Zyyy:] [:script=Zinh:] [:script=Arab:] [:script=Armn:]" + + " [:script=Beng:] [:script=Bopo:] [:script=Cans:] [:script=Cyrl:] [:script=Deva:] [:script=Ethi:]" + + " [:script=Geor:] [:script=Grek:] [:script=Gujr:] [:script=Guru:] [:script=Hani:] [:script=Hang:]" + + " [:script=Hebr:] [:script=Hira:] [:script=Knda:] [:script=Kana:] [:script=Khmr:] [:script=Laoo:]" + + " [:script=Latn:] [:script=Mlym:] [:script=Mong:] [:script=Mymr:] [:script=Orya:] [:script=Sinh:] " + + "[:script=Taml:] 
[:script=Telu:] [:script=Tfng:] [:script=Thaa:] [:script=Thai:] [:script=Tibt:] [:script=Yiii:]]") + .freeze(); + + static UnicodeSet LITURGICAL = + new UnicodeSet( + "[\u0615\u0617-\u061A\u0671\u06D6-\u06ED\u08F0-\u08F3[:sc=coptic:]" + + "\u1CD0-\u1CF2\u214F]"); static UnicodeSet DEPRECATED = new UnicodeSet("[:deprecated:]").freeze(); static int getXPropertyEnum(String propertyAlias) { - int extra = Common.XPROPERTY_NAMES.indexOf(propertyAlias - .toLowerCase(Locale.ENGLISH)); + int extra = Common.XPROPERTY_NAMES.indexOf(propertyAlias.toLowerCase(Locale.ENGLISH)); if (extra != -1) { return UProperty.STRING_LIMIT + extra; } @@ -209,7 +235,8 @@ static int getXPropertyEnum(String propertyAlias) { static boolean getBinaryValue(String propertyValue) { boolean invert; - if (propertyValue.length() == 0 || propertyValue.equalsIgnoreCase("true") + if (propertyValue.length() == 0 + || propertyValue.equalsIgnoreCase("true") || propertyValue.equalsIgnoreCase("t") || propertyValue.equalsIgnoreCase("yes") || propertyValue.equalsIgnoreCase("y")) { @@ -244,12 +271,15 @@ static XPropertyFactory getFactory() { return XPropertyFactory.make(); } - static NumberFormat numberFormat = NumberFormat.getInstance(ULocale.ENGLISH, NumberFormat.NUMBERSTYLE); + static NumberFormat numberFormat = + NumberFormat.getInstance(ULocale.ENGLISH, NumberFormat.NUMBERSTYLE); + static { numberFormat.setGroupingUsed(true); } - public static void showSetMain(UnicodeSet a, CodePointShower codePointShower, Appendable out) throws IOException { + public static void showSetMain(UnicodeSet a, CodePointShower codePointShower, Appendable out) + throws IOException { if (codePointShower.groupingProps.isEmpty()) { showSet(a, codePointShower, out); return; @@ -271,7 +301,7 @@ public static void showSetMain(UnicodeSet a, CodePointShower codePointShower, Ap for (String s : sorted) { String[] props2 = s.split("; "); int level = getFirstDiff(propsOld, props2); - //out.append("// level: " + level + ", lastLevel: " + 
lastLevel + "\n"); + // out.append("// level: " + level + ", lastLevel: " + lastLevel + "\n"); // if higher, back off if (lastLevel >= 0) { for (int i = level; i < length; ++i) { @@ -281,9 +311,16 @@ public static void showSetMain(UnicodeSet a, CodePointShower codePointShower, Ap lastLevel = level; UnicodeSet items = map.getSet(s); for (int i = lastLevel; i < length; ++i) { - out.append("

" + props2[i] + - (i == length - 1 ? "
items: " + numberFormat.format(items.size()) : "
") + - "

\n"); + out.append( + "

" + + props2[i] + + (i == length - 1 + ? "
items: " + + numberFormat.format(items.size()) + : "
") + + "

\n"); } showSet(items, codePointShower, out); for (int i = 0; i < propsOld.length; ++i) { @@ -308,9 +345,11 @@ static int getFirstDiff(String[] a, String[] b) { // return getFactory().getAvailableNames(); // } - public static String getStringProperties(UnicodeProperty prop, String s, String separator, boolean getShortest) { + public static String getStringProperties( + UnicodeProperty prop, String s, String separator, boolean getShortest) { // check for single code point, later - if (prop instanceof UnicodeMapProperty || prop instanceof CachedProps.DelayedUnicodeProperty) { + if (prop instanceof UnicodeMapProperty + || prop instanceof CachedProps.DelayedUnicodeProperty) { Object value = prop.getUnicodeMap().get(s); if (value != null) { return (String) value; @@ -335,7 +374,9 @@ public static String getStringProperties(UnicodeProperty prop, String s, String } /*jsp*/ - public static void showSet(UnicodeSet inputSetRaw, CodePointShower codePointShower, Appendable out) throws IOException { + public static void showSet( + UnicodeSet inputSetRaw, CodePointShower codePointShower, Appendable out) + throws IOException { if (codePointShower.doTable) { out.append("

Confusable Characters

"); } @@ -350,15 +391,19 @@ public static void showSet(UnicodeSet inputSetRaw, CodePointShower codePointShow } else if (codePointShower.abbreviate) { codePointShower.showAbbreviated(inputSetRaw, out); } else { - LinkedHashMap items = new LinkedHashMap(); + LinkedHashMap items = new LinkedHashMap(); String specials = "Unassigned, Private use, or Surrogates"; - UnicodeSet specialSet = new UnicodeSet(inputSetRaw).retainAll(UnicodeProperty.getSPECIALS()); - UnicodeSet inputSet = specialSet.size() == 0 ? inputSetRaw : new UnicodeSet(inputSetRaw).removeAll(UnicodeProperty.getSPECIALS()); + UnicodeSet specialSet = + new UnicodeSet(inputSetRaw).retainAll(UnicodeProperty.getSPECIALS()); + UnicodeSet inputSet = + specialSet.size() == 0 + ? inputSetRaw + : new UnicodeSet(inputSetRaw).removeAll(UnicodeProperty.getSPECIALS()); if (specialSet.size() != 0) { items.put(specials, specialSet); } - for (UnicodeSetIterator it = new UnicodeSetIterator(inputSet); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(inputSet); it.next(); ) { int s = it.codepoint; if (s == UnicodeSetIterator.IS_STRING) { String newBlock = "Strings"; @@ -366,13 +411,26 @@ public static void showSet(UnicodeSet inputSetRaw, CodePointShower codePointShow if (set == null) items.put(newBlock, set = new UnicodeSet()); set.add(it.string); } else { - String block = UCharacter.getStringPropertyValue(BLOCK_ENUM, s, UProperty.NameChoice.LONG).replace('_', ' '); - String newBlock = "" + block + ""; + String block = + UCharacter.getStringPropertyValue( + BLOCK_ENUM, s, UProperty.NameChoice.LONG) + .replace('_', ' '); + String newBlock = + "" + + block + + ""; String newSubhead = getSubheader().getSubheader(s); if (newSubhead == null) { newSubhead = "no subhead"; } else { - newSubhead = "" + newSubhead + ""; + newSubhead = + "" + + newSubhead + + ""; } newBlock = newBlock + " \u2014 " + newSubhead + ""; UnicodeSet set = items.get(newBlock); @@ -386,7 +444,12 @@ public static void showSet(UnicodeSet 
inputSetRaw, CodePointShower codePointShow if (codePointShower.doTable) { out.append(""); } @@ -394,12 +457,13 @@ public static void showSet(UnicodeSet inputSetRaw, CodePointShower codePointShow if (set.size() > 1000 || newBlock == specials) { codePointShower.showAbbreviated(set, out); } else if (codePointShower.collate) { - TreeSet sorted = set.addAllTo(new TreeSet(UnicodeSetUtilities.MAIN_COLLATOR)); + TreeSet sorted = + set.addAllTo(new TreeSet(UnicodeSetUtilities.MAIN_COLLATOR)); for (String s : sorted) { codePointShower.showString(s, ", ", out); } } else { - for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next(); ) { int s = it.codepoint; if (s == UnicodeSetIterator.IS_STRING) { codePointShower.showString(it.string, ", ", out); @@ -422,7 +486,10 @@ public static String getIdentifier(String script) { scriptSet = scriptProp.getSet(script); scriptSet.removeAll(NONCHAR); if (scriptSet.size() == 0) { - result.append("

Illegal script: " + toHTML(script) + ". Please pick one of the following:

\n

"); + result.append( + "

Illegal script: " + + toHTML(script) + + ". Please pick one of the following:

\n

"); String last = null; TreeSet sorted = new TreeSet(col); sorted.addAll(scriptProp.getAvailableValues()); @@ -440,7 +507,8 @@ public static String getIdentifier(String script) { } else { result.append("

"); } - result.append("" + name + ""); + result.append( + "" + name + ""); last = s; } result.append("

\n"); @@ -448,7 +516,8 @@ public static String getIdentifier(String script) { } try { UnicodeSet allowed = new UnicodeSet(scriptSet).retainAll(XIDModifications.getAllowed()); - UnicodeSet restricted = new UnicodeSet(scriptSet).removeAll(XIDModifications.getAllowed()); + UnicodeSet restricted = + new UnicodeSet(scriptSet).removeAll(XIDModifications.getAllowed()); result.append("

Allowed

"); if (allowed.size() == 0) { result.append("none"); @@ -465,8 +534,14 @@ public static String getIdentifier(String script) { UnicodeSet shard = types.getSet(reason); UnicodeSet items = new UnicodeSet(restricted).retainAll(shard); if (items.size() != 0) { - result.append("

Restricted - " + reason + "

"); - showSet(items, new CodePointShower("", "", true, false, false).setRestricted(true), result); + result.append( + "

Restricted - " + + reason + + "

"); + showSet( + items, + new CodePointShower("", "", true, false, false).setRestricted(true), + result); } } } @@ -476,7 +551,7 @@ public static String getIdentifier(String script) { } } - static private UnicodeSet RTL= new UnicodeSet("[[:bc=R:][:bc=AL:]]"); + private static UnicodeSet RTL = new UnicodeSet("[[:bc=R:][:bc=AL:]]"); private static String showCodePoint(int codepoint) { return showCodePoint(UTF16.valueOf(codepoint)); @@ -484,7 +559,11 @@ private static String showCodePoint(int codepoint) { private static String showCodePoint(String s) { String literal = getLiteral(s); - return "\u00a0" + literal + "\u00a0"; + return "\u00a0" + + literal + + "\u00a0"; } private static String getLiteral(int codepoint) { @@ -515,7 +594,12 @@ public CodePointShower setRestricted(boolean restricted) { return this; } - public CodePointShower(String grouping, String info, boolean abbreviate, boolean ucdFormat, boolean collate) { + public CodePointShower( + String grouping, + String info, + boolean abbreviate, + boolean ucdFormat, + boolean collate) { this.groupingProps = getProps(grouping); this.infoProps = getProps(info); this.doTable = true; // !infoProps.isEmpty(); @@ -530,7 +614,8 @@ void showCodePoint(int codePoint, Appendable out) throws IOException { showString(string, separator, out); } - private void showString(final String string, String separator, Appendable out) throws IOException { + private void showString(final String string, String separator, Appendable out) + throws IOException { if (doTable) { out.append("
"); } @@ -558,14 +643,29 @@ private void showString(final String string, String separator, Appendable out) t literal = UnicodeSetUtilities.addEmojiVariation(literal); if (doTable) { out.append( - "" - + "" - + "" + name + ""); + "" + + "" + + "" + + name + + ""); } else if (ucdFormat) { out.append(UnicodeUtilities.getHex(string, separator, ucdFormat) + " ;\t" + name); } else { - //out.append("
\u00A0" + literal + "\u00A0
" + UnicodeUtilities.getHex(string, separator, ucdFormat) + " \t" + name); - out.append("\u00A0" + literal + "\u00A0\t" + UnicodeUtilities.getHex(string, separator, ucdFormat) + " \t" + name); + // out.append("
\u00A0" + literal + "\u00A0
" + + // UnicodeUtilities.getHex(string, separator, ucdFormat) + " \t" + name); + out.append( + "\u00A0" + + literal + + "\u00A0\t" + + UnicodeUtilities.getHex(string, separator, ucdFormat) + + " \t" + + name); if (hasJoiner) { boolean hasJoiner2 = literal.contains("\u200D"); if (hasJoiner2 != hasJoiner) { @@ -575,7 +675,7 @@ private void showString(final String string, String separator, Appendable out) t } if (!infoProps.isEmpty()) { int cp = string.codePointAt(0); - //StringBuilder confusableString = displayConfusables(cp); + // StringBuilder confusableString = displayConfusables(cp); if (doTable) { out.append("
"); } - out.append("

" + newBlock + "
items: " + numberFormat.format(set.size()) + "

\n"); + out.append( + "

" + + newBlock + + "
items: " + + numberFormat.format(set.size()) + + "

\n"); if (codePointShower.doTable) { out.append("
\u00A0" + literal + "\u00A0" + UnicodeUtilities.getHex(string, separator, ucdFormat) + "\u00A0" + + literal + + "\u00A0" + + UnicodeUtilities.getHex(string, separator, ucdFormat) + + ""); } else { @@ -620,7 +720,7 @@ private void showAbbreviated(UnicodeSet a, Appendable out) throws IOException { UnicodeUtilities.CodePointShower codePointShower = this; boolean haveStrings = false; - for (UnicodeSetIterator it = new UnicodeSetIterator(a); it.nextRange();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(a); it.nextRange(); ) { int s = it.codepoint; if (s == UnicodeSetIterator.IS_STRING) { if (!haveStrings && codePointShower.doTable) { @@ -643,9 +743,13 @@ private void showAbbreviated(UnicodeSet a, Appendable out) throws IOException { } else { codePointShower.showCodePoint(s, out); if (doTable) { - out.append("
" + "\u2026{" + (end-s-1) + "}\u2026"); + out.append( + "
" + + "\u2026{" + + (end - s - 1) + + "}\u2026"); } else { - out.append("\u2026{" + (end-s-1) + "}\u2026"); + out.append("\u2026{" + (end - s - 1) + "}\u2026"); } codePointShower.showCodePoint(end, out); } @@ -670,7 +774,9 @@ String getPropString(List props, String codePoints, boolean sho name = aliases.get(0); } } - builder.append(name).append("=").append(getStringProperties(prop, codePoints, ", ", shortName)); + builder.append(name) + .append("=") + .append(getStringProperties(prop, codePoints, ", ", shortName)); } return builder.toString(); } @@ -681,7 +787,8 @@ String getPropString(List props, String codePoints, boolean sho // if (builder.length() != 0) { // builder.append("; "); // } - // builder.append(prop.getName()).append("=").append(prop.getValue(codePoint)); + // + // builder.append(prop.getName()).append("=").append(prop.getValue(codePoint)); // } // return builder.toString(); // } @@ -705,10 +812,12 @@ private static String getName(String string, String separator, boolean andCode) private static String getHex(int codePoint, boolean ucdFormat) { String hex = com.ibm.icu.impl.Utility.hex(codePoint, 4); - final String string = "" + - ("") - + (ucdFormat ? "" : "U+") - + hex + ""; + final String string = + "" + + ("") + + (ucdFormat ? 
"" : "U+") + + hex + + ""; return string; } @@ -724,7 +833,8 @@ private static String getHex(String string, String separator, boolean ucdFormat) return result.toString(); } - // private static void showString(String s, String separator, boolean ucdFormat, Writer out) throws IOException { + // private static void showString(String s, String separator, boolean ucdFormat, Writer out) + // throws IOException { // int cp; // for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { // if (i != 0) { @@ -739,36 +849,42 @@ private static String getHex(String string, String separator, boolean ucdFormat) static final SpoofChecker sc = new SpoofChecker.Builder().build(); static { - Transliterator.registerInstance(getTransliteratorFromFile("en-IPA", "en-IPA.txt", Transliterator.FORWARD)); - Transliterator.registerInstance(getTransliteratorFromFile("IPA-en", "en-IPA.txt", Transliterator.REVERSE)); - - Transliterator.registerInstance(getTransliteratorFromFile("deva-ipa", "Deva-IPA.txt", Transliterator.FORWARD)); - Transliterator.registerInstance(getTransliteratorFromFile("ipa-deva", "Deva-IPA.txt", Transliterator.REVERSE)); - - Transform confusable = new Transform() { - public String transform(String source) { - return sc.getSkeleton(SpoofChecker.ANY_CASE, source); // doc wrong - } - }; + Transliterator.registerInstance( + getTransliteratorFromFile("en-IPA", "en-IPA.txt", Transliterator.FORWARD)); + Transliterator.registerInstance( + getTransliteratorFromFile("IPA-en", "en-IPA.txt", Transliterator.REVERSE)); + + Transliterator.registerInstance( + getTransliteratorFromFile("deva-ipa", "Deva-IPA.txt", Transliterator.FORWARD)); + Transliterator.registerInstance( + getTransliteratorFromFile("ipa-deva", "Deva-IPA.txt", Transliterator.REVERSE)); + + Transform confusable = + new Transform() { + public String transform(String source) { + return sc.getSkeleton(SpoofChecker.ANY_CASE, source); // doc wrong + } + }; Transliterator.registerInstance(new SimpleTransliterator("confusable", 
confusable)); - Transform confusableLower = new Transform() { - public String transform(String source) { - return sc.getSkeleton(0, source); // doc wrong - } - }; - Transliterator.registerInstance(new SimpleTransliterator("confusableLower", confusableLower)); - - - Transform nfkccf = new Transform() { - public String transform(String source) { - return NFKCCF.normalize(source); // doc wrong - } - }; + Transform confusableLower = + new Transform() { + public String transform(String source) { + return sc.getSkeleton(0, source); // doc wrong + } + }; + Transliterator.registerInstance( + new SimpleTransliterator("confusableLower", confusableLower)); + + Transform nfkccf = + new Transform() { + public String transform(String source) { + return NFKCCF.normalize(source); // doc wrong + } + }; Transliterator.registerInstance(new SimpleTransliterator("NFKCCF", confusable)); } - public static Transliterator getTransliteratorFromFile(String ID, String file, int direction) { try { BufferedReader br = FileUtilities.openFile(UnicodeUtilities.class, file); @@ -786,13 +902,14 @@ public static Transliterator getTransliteratorFromFile(String ID, String file, i } return Transliterator.createFromRules(ID, input.toString(), direction); } catch (IOException e) { - throw (IllegalArgumentException) new IllegalArgumentException("Can't open transliterator file " + file).initCause(e); + throw (IllegalArgumentException) + new IllegalArgumentException("Can't open transliterator file " + file) + .initCause(e); } } public static final Transliterator UNESCAPER = Transliterator.getInstance("hex-any"); - /*jsp*/ public static String showTransform(String transform, String sample) { // if (!haveCaseFold) { @@ -814,24 +931,28 @@ public static String showTransform(String transform, String sample) { if (UnicodeSet.resemblesPattern(sample, 0)) { try { set = UnicodeSetUtilities.parseUnicodeSet(sample); - } catch (Exception e) {} + } catch (Exception e) { + } } if (set == null) { sample = 
UNESCAPER.transform(sample); return getLiteral(trans.transform(sample)).replace("\n", "
"); } - UnicodeSetPrettyPrinter pp = new UnicodeSetPrettyPrinter().setOrdering(UnicodeSetUtilities.MAIN_COLLATOR) - //.setSpaceComparator(Collator.getInstance(ULocale.ROOT).setStrength2(RuleBasedCollator.PRIMARY)) - .setSpaceComparator(new Comparator() { - public int compare(String o1, String o2) { - return 1; - } - }); + UnicodeSetPrettyPrinter pp = + new UnicodeSetPrettyPrinter() + .setOrdering(UnicodeSetUtilities.MAIN_COLLATOR) + // .setSpaceComparator(Collator.getInstance(ULocale.ROOT).setStrength2(RuleBasedCollator.PRIMARY)) + .setSpaceComparator( + new Comparator() { + public int compare(String o1, String o2) { + return 1; + } + }); - Map mapping = new TreeMap(pp.getOrdering()); + Map mapping = new TreeMap(pp.getOrdering()); - for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next(); ) { String s = it.getString(); String mapped = trans.transform(s); if (!mapped.equals(s)) { @@ -862,10 +983,12 @@ public int compare(String o1, String o2) { public static class StringPair implements Comparable { String first; String second; + public StringPair(String first, String second) { this.first = first; this.second = second; } + public int compareTo(StringPair o) { int result = first.compareTo(o.first); if (result != 0) { @@ -881,13 +1004,28 @@ public static String listTransforms() { if (TRANSFORMLIST == null) { StringBuilder result = new StringBuilder(); Set pairs = new TreeSet(); - Set sources = append(new TreeSet(col), (Enumeration) Transliterator.getAvailableSources()); + Set sources = + append( + new TreeSet(col), + (Enumeration) Transliterator.getAvailableSources()); for (String source : sources) { - Set targets = append(new TreeSet(col), (Enumeration) Transliterator.getAvailableTargets(source)); + Set targets = + append( + new TreeSet(col), + (Enumeration) Transliterator.getAvailableTargets(source)); for (String target : targets) { - Set variants = append(new TreeSet(col), 
(Enumeration) Transliterator.getAvailableVariants(source, target)); + Set variants = + append( + new TreeSet(col), + (Enumeration) + Transliterator.getAvailableVariants(source, target)); for (String variant : variants) { - final String id = toHTML.transform(source + "-" + target + (variant.length() == 0 ? "" : "/" + variant)); + final String id = + toHTML.transform( + source + + "-" + + target + + (variant.length() == 0 ? "" : "/" + variant)); pairs.add(new StringPair(target, id)); } } @@ -904,7 +1042,8 @@ public static String listTransforms() { } result.append("
" + pair.first + ""); } - result.append("" + pair.second + "\n"); + result.append( + "" + pair.second + "\n"); last = pair.first; } result.append("\t\t\n\t\n"); @@ -934,26 +1073,29 @@ private static > U append(U result, Enumeration so // } // } // rules.append("::Lower;"); - // Transliterator.registerInstance(Transliterator.createFromRules("Any-CaseFold", rules.toString(), Transliterator.FORWARD)); + // Transliterator.registerInstance(Transliterator.createFromRules("Any-CaseFold", + // rules.toString(), Transliterator.FORWARD)); // haveCaseFold = true; // } static class FilteredStringTransform implements StringTransform { final UnicodeSet toExclude; final StringTransform trans; + public FilteredStringTransform(UnicodeSet toExclude, StringTransform trans) { this.toExclude = toExclude; this.trans = trans; } + public String transform(String source) { StringBuilder result = new StringBuilder(); int start = 0; while (start < source.length()) { int end = toExclude.findIn(source, start, false); - result.append(trans.transform(source.substring(start,end))); + result.append(trans.transform(source.substring(start, end))); if (end == source.length()) break; start = toExclude.findIn(source, end, true); - result.append(source.substring(end,start)); + result.append(source.substring(end, start)); } return result.toString(); } @@ -961,26 +1103,34 @@ public String transform(String source) { public static final char JOINER = '\u200D'; public static final UnicodeSet NON_ASCII = new UnicodeSet("[^\\u0021-\\u007E]").freeze(); - public static final UnicodeSet WHITESPACE_IGNORABLES_C = new UnicodeSet("[" - + "[:C:]" - + "[:Default_Ignorable_Code_Point:]" - + "[:patternwhitespace:]" - + "[:whitespace:]" - + "]").remove(JOINER).remove(0xFF0F).freeze(); + public static final UnicodeSet WHITESPACE_IGNORABLES_C = + new UnicodeSet( + "[" + + "[:C:]" + + "[:Default_Ignorable_Code_Point:]" + + "[:patternwhitespace:]" + + "[:whitespace:]" + + "]") + .remove(JOINER) + .remove(0xFF0F) + .freeze(); 
public static final UnicodeSet CombiningMarks = new UnicodeSet("[:M:]").freeze(); - public static final UnicodeSet NOBREAKBEFORE = new UnicodeSet(CombiningMarks) - .addAll(UnicodeSetUtilities.MODIFIERS) - .addAll(UnicodeSetUtilities.REGIONALS) - .add(JOINER) - .add('\uFE0F') - .add('\uFE0E') - .freeze(); + public static final UnicodeSet NOBREAKBEFORE = + new UnicodeSet(CombiningMarks) + .addAll(UnicodeSetUtilities.MODIFIERS) + .addAll(UnicodeSetUtilities.REGIONALS) + .add(JOINER) + .add('\uFE0F') + .add('\uFE0E') + .freeze(); public static String getPrettySet(UnicodeSet a, boolean abbreviate, boolean escape) { String a_out; if (a.size() < 10000 && !abbreviate) { - UnicodeSetPrettyPrinter pp = new UnicodeSetPrettyPrinter().setOrdering(UnicodeSetUtilities.MAIN_COLLATOR) - .setSpaceComparator(COLLATOR.setStrength2(RuleBasedCollator.PRIMARY)); + UnicodeSetPrettyPrinter pp = + new UnicodeSetPrettyPrinter() + .setOrdering(UnicodeSetUtilities.MAIN_COLLATOR) + .setSpaceComparator(COLLATOR.setStrength2(RuleBasedCollator.PRIMARY)); if (escape) { pp.setToQuote(NON_ASCII); @@ -1001,62 +1151,67 @@ public static String getPrettySet(UnicodeSet a, boolean abbreviate, boolean esca StringBuffer out = new StringBuffer(); int charCount = 0; Status status = Status.NORMAL; - for (int i = 0; i < a_out.length(); i+= UTF16.getCharCount(cp)) { + for (int i = 0; i < a_out.length(); i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(a_out, i); ++charCount; switch (status) { - case AFTERSLASH: - status = Status.NORMAL; - break; - case INSTRINGAFTERSLASH: - status = Status.INSTRING; - break; - case INSTRING: - if (cp == '\\') { - status = Status.INSTRINGAFTERSLASH; - } else if (cp == '}') { + case AFTERSLASH: status = Status.NORMAL; - } - break; - case NORMAL: - if (cp == '\\') { - status = Status.AFTERSLASH; - } else if (cp == '{') { + break; + case INSTRINGAFTERSLASH: status = Status.INSTRING; - } else if (cp == ' ') { - charCount = 0; - } else if (charCount > 20) { - // add a space, but not in 
x-y, or \\uXXXX - // TODO, don't change {...} - if ( - // no break before character - cp < 0x80 - || cp == '-' - || cp == '}' - || NOBREAKBEFORE.contains(cp) - // no break after character - || oldCp == '-' - || oldCp == '\\' - || oldCp == '{' - ) { - // do nothing + break; + case INSTRING: + if (cp == '\\') { + status = Status.INSTRINGAFTERSLASH; + } else if (cp == '}') { + status = Status.NORMAL; + } + break; + case NORMAL: + if (cp == '\\') { + status = Status.AFTERSLASH; + } else if (cp == '{') { + status = Status.INSTRING; } else if (cp == ' ') { charCount = 0; - } else { - out.append(' '); - charCount = 0; + } else if (charCount > 20) { + // add a space, but not in x-y, or \\uXXXX + // TODO, don't change {...} + if ( + // no break before character + cp < 0x80 + || cp == '-' + || cp == '}' + || NOBREAKBEFORE.contains(cp) + // no break after character + || oldCp == '-' + || oldCp == '\\' + || oldCp == '{') { + // do nothing + } else if (cp == ' ') { + charCount = 0; + } else { + out.append(' '); + charCount = 0; + } } - } - break; + break; } UTF16.append(out, cp); oldCp = cp; } return out.toString(); } - enum Status {NORMAL, AFTERSLASH, INSTRING, INSTRINGAFTERSLASH} - public static UnicodeSet parseSimpleSet(String setA, String[] exceptionMessage) { + enum Status { + NORMAL, + AFTERSLASH, + INSTRING, + INSTRINGAFTERSLASH + } + + public static UnicodeSet parseSimpleSet(String setA, String[] exceptionMessage) { try { exceptionMessage[0] = null; // setA = setA.replace("..U+", "-\\u"); @@ -1068,8 +1223,13 @@ public static UnicodeSet parseSimpleSet(String setA, String[] exceptionMessage) return null; } - public static void getDifferences(String setA, String setB, - boolean abbreviate, String[] abResults, int[] abSizes, String[] abLinks) { + public static void getDifferences( + String setA, + String setB, + boolean abbreviate, + String[] abResults, + int[] abSizes, + String[] abLinks) { boolean escape = false; String setAr = 
toHTML.transliterate(UtfParameters.fixQuery(setA)); @@ -1102,10 +1262,10 @@ public static void getDifferences(String setA, String setB, // } int a_bSize = 0, b_aSize = 0, abSize = 0; if (a == null || b == null) { - a_b = a == null ? aMessage[0] : "error" ; - b_a = b == null ? bMessage[0] : "error" ; + a_b = a == null ? aMessage[0] : "error"; + b_a = b == null ? bMessage[0] : "error"; ab = "error"; - } else { + } else { UnicodeSet temp = new UnicodeSet(a).removeAll(b); a_bSize = temp.size(); a_b = getPrettySet(temp, abbreviate, escape); @@ -1126,10 +1286,12 @@ public static void getDifferences(String setA, String setB, abSizes[2] = abSize; } - static int[][] ranges = { { UProperty.BINARY_START, UProperty.BINARY_LIMIT }, - { UProperty.INT_START, UProperty.INT_LIMIT }, - { UProperty.DOUBLE_START, UProperty.DOUBLE_LIMIT }, - { UProperty.STRING_START, UProperty.STRING_LIMIT }, }; + static int[][] ranges = { + {UProperty.BINARY_START, UProperty.BINARY_LIMIT}, + {UProperty.INT_START, UProperty.INT_LIMIT}, + {UProperty.DOUBLE_START, UProperty.DOUBLE_LIMIT}, + {UProperty.STRING_START, UProperty.STRING_LIMIT}, + }; static Comparator col = UnicodeSetUtilities.MAIN_COLLATOR; // Collator.getInstance(ULocale.ROOT); @@ -1158,7 +1320,10 @@ public static void showProperties(int cp, Appendable out) throws IOException { String hex = com.ibm.icu.impl.Utility.hex(cp, 4); out.append("
\n"); - out.append("\n"); + out.append( + "\n"); out.append("\n"); out.append("\n"); out.append("\n"); @@ -1166,11 +1331,15 @@ public static void showProperties(int cp, Appendable out) throws IOException { if (allowed) { out.append("allowed"); } else { - out.append("restricted"); + out.append( + "restricted"); } out.append("\n"); StringBuilder confusableString = displayConfusables(cp); - out.append("\n"); out.append("
\u00A0" + toHTML.transliterate(text) + "\u00A0
\u00A0" + + toHTML.transliterate(text) + + "\u00A0
" + hex + "
" + name + "
" + scriptCat + "
confuse: "); + out.append( + "
confuse: "); if (confusableString.length() == 0) { out.append("none"); } else { @@ -1179,17 +1348,17 @@ public static void showProperties(int cp, Appendable out) throws IOException { out.append("
\n"); - List availableNames = (List)getFactory().getAvailableNames(); - TreeSet sortedProps = Builder - .with(new TreeSet(col)) - .addAll(availableNames) - .remove("Name") - .get(); - - out.append("" - + "" - + "" + - "\n"); } - static void addCell(StringBuilder resultLines, Transliterator hex, String tr46, String attributes, String confusableChoice) { + static void addCell( + StringBuilder resultLines, + Transliterator hex, + String tr46, + String attributes, + String confusableChoice) { if (tr46 == null) { - resultLines.append("\n"); + resultLines.append("\n"); } else { String escaped = showEscaped(tr46); String linkStart = "", linkEnd = ""; if (confusableChoice != null) { - linkStart = ""; + linkStart = + ""; linkEnd = ""; } - resultLines.append(""); for (int i = 0; i < str.length(); ++i) { - final String s = str.substring(i,i+1); + final String s = str.substring(i, i + 1); String title = toHTML.transform(getName(s, "", true)); - writer.println(""); + writer.println( + ""); } writer.println(""); for (int i = 0; i < str.length(); ++i) { - writer.println(""); + writer.println( + ""); } writer.println(""); for (int i = 0; i < str.length(); ++i) { - writer.println(""); + writer.println( + ""); } writer.println(""); for (int i = 0; i < str.length(); ++i) { @@ -1683,20 +1972,36 @@ private static void showBidiLine(String str, int baseDirection, PrintWriter writ for (int k = 0; k < str.length(); ++k) { final int i = reorder[k]; final String bidiChar = getBidiChar(str, i, codes[i]); - String title = bidiChar.length() == 0 ? "deleted" : toHTML.transform(getName(bidiChar, "", true)); + String title = + bidiChar.length() == 0 + ? "deleted" + : toHTML.transform(getName(bidiChar, "", true)); String td = bidiChar.length() == 0 ? "bxcell" : "bccell"; - writer.println(""); + writer.println( + ""); } writer.println("
Properties for U+" + hex + "
With Non-Default ValuesWith Default Values
\n"); + List availableNames = (List) getFactory().getAvailableNames(); + TreeSet sortedProps = + Builder.with(new TreeSet(col)).addAll(availableNames).remove("Name").get(); + + out.append( + "" + + "" + + "" + + "
Properties for U+" + + hex + + "
With Non-Default ValuesWith Default Values
\n"); out.append("\n"); for (String propName : sortedProps) { @@ -1232,7 +1401,7 @@ private static StringBuilder displayConfusables(int codepoint) { // get basic confusables Set list = Confusables.getEquivalents(same); if (list != null) { - for (String s: list) { + for (String s : list) { if (same.equals(s)) { continue; } @@ -1246,7 +1415,6 @@ private static StringBuilder displayConfusables(int codepoint) { } } - // Now, get the combinations if (nfd.codePointCount(0, nfd.length()) > 1) { if (confusableString.length() != 0) { @@ -1261,7 +1429,8 @@ private static StringBuilder displayConfusables(int codepoint) { confusableString.append("+"); } cp = nfd.codePointAt(i); - Confusables currentCombos = new Confusables(UTF16.valueOf(cp)).setNormalizationCheck(Normalizer.NFKC); + Confusables currentCombos = + new Confusables(UTF16.valueOf(cp)).setNormalizationCheck(Normalizer.NFKC); combos.add(currentCombos); confusableString.append("
"); for (String s : currentCombos) { @@ -1274,7 +1443,7 @@ private static StringBuilder displayConfusables(int codepoint) { } Confusables confusables = new Confusables(same).setNormalizationCheck(Normalizer.NFKC); - for (String s: confusables) { + for (String s : confusables) { if (skip.contains(s)) { continue; } @@ -1308,19 +1477,20 @@ private static StringBuilder displayConfusables(int codepoint) { } // add recursively, for simplicity - private static void addToSkip(String prefix, int i, List combos, Set skip) { + private static void addToSkip( + String prefix, int i, List combos, Set skip) { if (i >= combos.size()) { skip.add(prefix); } else { for (String s : combos.get(i)) { - addToSkip(prefix + s, i+1, combos, skip); + addToSkip(prefix + s, i + 1, combos, skip); } } } private static void getBoxedCharacters(String s, StringBuilder confusableString) { - confusableString - .append("
"); + confusableString.append( + "
"); int cp; for (int i = 0; i < s.length(); i += Character.charCount(cp)) { cp = s.codePointAt(i); @@ -1328,59 +1498,107 @@ private static void getBoxedCharacters(String s, StringBuilder confusableString) confusableString.append("+"); } confusableString - .append("" + " ") - .append(toHTML(UTF16.valueOf(cp))) - .append(" "); + .append( + "" + + " ") + .append(toHTML(UTF16.valueOf(cp))) + .append(" "); } confusableString.append("
"); } - private static void showPropertyValue(String propName, String propValue, boolean isDefault, Appendable out) throws IOException { + private static void showPropertyValue( + String propName, String propValue, boolean isDefault, Appendable out) + throws IOException { String defaultClass = isDefault ? " class='default'" : ""; if (propValue == null) { - out.append("
null\n"); + out.append( + "null\n"); return; } String hValue = toHTML.transliterate(propValue); - hValue = "" + hValue + ""; - - out.append("" + hValue + "\n"); + hValue = + "" + + hValue + + ""; + + out.append( + "" + + hValue + + "\n"); } /*jsp*/ - public static void showPropsTable(Appendable out, String propForValues, String myLink) throws IOException { + public static void showPropsTable(Appendable out, String propForValues, String myLink) + throws IOException { // ((RuleBasedCollator)col).setNumericCollation(true); Map> alpha = new TreeMap>(col); Map longToShort = new HashMap(); Set showLink = new HashSet(); - TablePrinter tablePrinter = new TablePrinter() - .setTableAttributes("style='border-collapse: collapse' border='1'") - .addColumn("Category").setSpanRows(true).setBreakSpans(true).setCellAttributes("class='propCategory'").setSortPriority(0) - .addColumn("Datatype").setSpanRows(true).setCellAttributes("class='propDatatype'").setSortPriority(1) - .addColumn("Source").setSpanRows(true).setCellAttributes("class='propSource'").setSortPriority(2) - .addColumn("Property").setSpanRows(false).setCellAttributes("class='propTitle'") - // .addColumn("Abbr. Prop").setSpanRows(false).setCellAttributes("class='propTitle'") - .addColumn("Values").setSpanRows(false).setCellAttributes("class='propValues'") - ; - //tablePrinter.addRows(data); - //tablePrinter.addRow().addCell("Foo").addCell(1.5d).addCell(99).finishRow(); - - //out.append("
" + propName + "
" + + propName + + "
" + propName + "
" + + propName + + "
\n"); + TablePrinter tablePrinter = + new TablePrinter() + .setTableAttributes("style='border-collapse: collapse' border='1'") + .addColumn("Category") + .setSpanRows(true) + .setBreakSpans(true) + .setCellAttributes("class='propCategory'") + .setSortPriority(0) + .addColumn("Datatype") + .setSpanRows(true) + .setCellAttributes("class='propDatatype'") + .setSortPriority(1) + .addColumn("Source") + .setSpanRows(true) + .setCellAttributes("class='propSource'") + .setSortPriority(2) + .addColumn("Property") + .setSpanRows(false) + .setCellAttributes("class='propTitle'") + // .addColumn("Abbr. + // Prop").setSpanRows(false).setCellAttributes("class='propTitle'") + .addColumn("Values") + .setSpanRows(false) + .setCellAttributes("class='propValues'"); + // tablePrinter.addRows(data); + // tablePrinter.addRow().addCell("Foo").addCell(1.5d).addCell(99).finishRow(); + + // out.append("
\n"); // out.append("\n") // .append("\n") // .append("\n") // .append("\n") // .append("\n"); - //for (String propName : Builder.with(new TreeSet(col)).addAll((List)getFactory().getAvailableNames()).get()) { + // for (String propName : Builder.with(new + // TreeSet(col)).addAll((List)getFactory().getAvailableNames()).get()) { Set missing = new TreeSet(COLLATOR); missing.addAll(getFactory().getAvailableNames()); - for (R4 propData : PropertyMetadata.getCategoryDatatypeSourceProperty()) { + for (R4 propData : + PropertyMetadata.getCategoryDatatypeSourceProperty()) { String propName = propData.get3(); UnicodeProperty prop = getFactory().getProperty(propName); if (prop == null) continue; @@ -1393,49 +1611,64 @@ public static void showPropsTable(Appendable out, String propForValues, String m throw new IllegalArgumentException(propData.toString(), e); } String propHtml = toHTML.transform(propName); - String shortHtml = shortName == null || shortName.equalsIgnoreCase(propName) ? propHtml : toHTML(shortName); - // String title = shortName == null || shortName.equals(propName) ? "" : " title='" + shortHtml + "'"; - // String propHtml = toHTML.transform(propName + (shortName.equalsIgnoreCase(propName) ? "" : " (" + shortName + ")")); - String propInfo = propHtml ; // "" + propHtml + ""; + String shortHtml = + shortName == null || shortName.equalsIgnoreCase(propName) + ? propHtml + : toHTML(shortName); + // String title = shortName == null || shortName.equals(propName) ? "" : " + // title='" + shortHtml + "'"; + // String propHtml = toHTML.transform(propName + + // (shortName.equalsIgnoreCase(propName) ? 
"" : " (" + shortName + ")")); + String propInfo = propHtml; // "" + propHtml + ""; StringBuilder propValues = new StringBuilder(); String dataType = propData.get1(); if (propName.equals(propForValues) || (dataType.equals("Binary") || dataType.equals("Enumerated")) - && prop.getAvailableValues().size() < 10) { + && prop.getAvailableValues().size() < 10) { getHtmlPropValues(prop, propHtml, shortHtml, propValues); } else { - propValues.append("Show Values"); - } - tablePrinter.addRow() - .addCell(propData.get0()) - .addCell(dataType) - .addCell(propData.get2()) - .addCell(propInfo) - //.addCell(shortHtml) - .addCell(propValues.toString()) - .finishRow(); - //out.append("\n"); + propValues.append( + "Show Values"); + } + tablePrinter + .addRow() + .addCell(propData.get0()) + .addCell(dataType) + .addCell(propData.get2()) + .addCell(propInfo) + // .addCell(shortHtml) + .addCell(propValues.toString()) + .finishRow(); + // out.append("\n"); } for (String name : missing) { String propHtml = toHTML.transform(name); - tablePrinter.addRow() - .addCell("Z-Other") - .addCell("Other") - .addCell("Other") - .addCell(propHtml) - //.addCell(shortHtml) - .addCell("Other") - .finishRow(); - } - //out.append("
SourceCategoryDatatypePropertyValues
\n"); + tablePrinter + .addRow() + .addCell("Z-Other") + .addCell("Other") + .addCell("Other") + .addCell(propHtml) + // .addCell(shortHtml) + .addCell("Other") + .finishRow(); + } + // out.append("
\n"); out.append(tablePrinter.toTable()); } - private static void getHtmlPropValues(UnicodeProperty prop, String propHtml, - String shortPropHtml, StringBuilder propValues) { - List availableValues = (List)prop.getAvailableValues(); - TreeSet sortedList = Builder.with(new TreeSet(col)).addAll(availableValues).get(); + private static void getHtmlPropValues( + UnicodeProperty prop, String propHtml, String shortPropHtml, StringBuilder propValues) { + List availableValues = (List) prop.getAvailableValues(); + TreeSet sortedList = + Builder.with(new TreeSet(col)).addAll(availableValues).get(); int count = 255; int lastFirstChar = 0; for (String valueName : sortedList) { @@ -1462,25 +1695,39 @@ private static void getHtmlPropValues(UnicodeProperty prop, String propHtml, } } - private static String getPropLink(String propHtml, String shortPropHtml, String valueHtml, String shortValueHtml) { + private static String getPropLink( + String propHtml, String shortPropHtml, String valueHtml, String shortValueHtml) { String propValue = valueHtml; final String propExp = propHtml + "=" + propValue; - //String title = shortName == null ? "" : " title='" + toHTML(shortName) + "'"; - String result = "" + valueHtml + ""; + // String title = shortName == null ? "" : " title='" + toHTML(shortName) + "'"; + String result = + "" + + valueHtml + + ""; if (propHtml.isEmpty()) { shortPropHtml = propHtml; } if (shortValueHtml == null) { shortValueHtml = valueHtml; } - if (!propHtml.equalsIgnoreCase(shortPropHtml) || !shortValueHtml.equalsIgnoreCase(valueHtml)) { + if (!propHtml.equalsIgnoreCase(shortPropHtml) + || !shortValueHtml.equalsIgnoreCase(valueHtml)) { String shortPropExp = - propValue.equals("Yes") ? shortPropHtml - : propValue.equals("No") ? "^" + shortPropHtml + propValue.equals("Yes") + ? shortPropHtml + : propValue.equals("No") + ? 
"^" + shortPropHtml : shortPropHtml + "=" + shortValueHtml; - result += "\u00A0(" + shortValueHtml + ")"; + result += + "\u00A0(" + + shortValueHtml + + ")"; } return result; } @@ -1500,20 +1747,22 @@ static Subheader getSubheader() { // subheader = new Subheader(unicodeDataDirectory); // } catch (IOException e2) { // final String[] list = new File("home").list(); - // String currentDirectory = list == null ? null : new TreeSet(Arrays.asList(list)).toString(); - // throw (RuntimeException) new IllegalArgumentException("Can't find file starting from: <" + currentDirectory + ">").initCause(e); + // String currentDirectory = list == null ? null : new + // TreeSet(Arrays.asList(list)).toString(); + // throw (RuntimeException) new IllegalArgumentException("Can't find file + // starting from: <" + currentDirectory + ">").initCause(e); // } // } } return subheader; } - //static IdnaLabelTester tester = null; - static String removals = new UnicodeSet("[\u1806[:di:]-[:cn:]]").complement().complement().toPattern(false); + // static IdnaLabelTester tester = null; + static String removals = + new UnicodeSet("[\u1806[:di:]-[:cn:]]").complement().complement().toPattern(false); static Matcher rem = Pattern.compile(removals).matcher(""); // TODO use UnicodeRegex - // static IdnaLabelTester getIdna2008Tester() { // if (tester == null) { // try { @@ -1534,24 +1783,28 @@ static void addBlank(StringBuilder resultLines) { resultLines.append("
 
failsfails") + resultLines + .append("") .append(linkStart) .append(escaped) .append(linkEnd) @@ -1559,11 +1812,17 @@ static void addCell(StringBuilder resultLines, Transliterator hex, String tr46, } } - public static final UnicodeSet TO_QUOTE = new UnicodeSet("[[:z:][:me:][:mn:][:di:][:c:]-[\u0020]]"); + public static final UnicodeSet TO_QUOTE = + new UnicodeSet("[[:z:][:me:][:mn:][:di:][:c:]-[\u0020]]"); - static final Transliterator ESCAPER = Transliterator.createFromRules("escaper", - "(" + TO_QUOTE + ") > ''&any-hex($1)'';" - + HTML_RULES_CONTROLS, Transliterator.FORWARD); + static final Transliterator ESCAPER = + Transliterator.createFromRules( + "escaper", + "(" + + TO_QUOTE + + ") > ''&any-hex($1)'';" + + HTML_RULES_CONTROLS, + Transliterator.FORWARD); public static final UnicodeSet SYMBOL = new UnicodeSet("[:s:]").freeze(); public static final UnicodeSet PUNCTUATION = new UnicodeSet("[:p:]").freeze(); @@ -1586,7 +1845,7 @@ public static String showBidi(String str, int baseDirection, boolean asciiHack) String[] parts = str.split("\\r\\n?|\\n"); for (int i = 0; i < parts.length; ++i) { - writer.println("

Paragraph " + (i+1) + "

"); + writer.println("

Paragraph " + (i + 1) + "

"); if (parts[i] == null || parts[i].length() == 0) { continue; } @@ -1595,11 +1854,17 @@ public static String showBidi(String str, int baseDirection, boolean asciiHack) if (asciiHack) { writer.println("

ASCII Hack

"); - writer.println("

For testing the UBA with only ASCII characters, the following property values are used (<,> are RLM and LRM):

"); + writer.println( + "

For testing the UBA with only ASCII characters, the following property values are used (<,> are RLM and LRM):

"); writer.println(""); for (byte i = 0; i < BidiReference.typenames.length; ++i) { final UnicodeSet modifiedClass = bidiCharMap.getAsciiHack(i); - writer.println(""); + writer.println( + ""); } writer.println("
" + BidiReference.getHtmlTypename(i) + "" + getList(modifiedClass) + "
" + + BidiReference.getHtmlTypename(i) + + "" + + getList(modifiedClass) + + "
"); } @@ -1610,17 +1875,23 @@ public static String showBidi(String str, int baseDirection, boolean asciiHack) private static String getList(final UnicodeSet uset) { StringBuffer codePointString = new StringBuffer(); - for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.next(); ) { if (codePointString.length() != 0) { codePointString.append(" "); } - final String literal = it.codepoint <= 0x20 ? "\u00AB" + getLiteral(UCharacter.getExtendedName(it.codepoint)) + "\u00BB" : getLiteral(it.codepoint); + final String literal = + it.codepoint <= 0x20 + ? "\u00AB" + + getLiteral(UCharacter.getExtendedName(it.codepoint)) + + "\u00BB" + : getLiteral(it.codepoint); codePointString.append(literal); } return codePointString.toString(); } - private static void showBidiLine(String str, int baseDirection, PrintWriter writer, BidiCharMap bidiCharMap) { + private static void showBidiLine( + String str, int baseDirection, PrintWriter writer, BidiCharMap bidiCharMap) { byte[] codes = new byte[str.length()]; for (int i = 0; i < str.length(); ++i) { codes[i] = bidiCharMap.getBidiClass(str.charAt(i)); @@ -1628,13 +1899,20 @@ private static void showBidiLine(String str, int baseDirection, PrintWriter writ int[] linebreaks = new int[1]; linebreaks[0] = str.length(); - BidiReference bidi = new BidiReference(codes, (byte)baseDirection); - int[] reorder = bidi.getReordering(new int[] { codes.length }); + BidiReference bidi = new BidiReference(codes, (byte) baseDirection); + int[] reorder = bidi.getReordering(new int[] {codes.length}); byte[] levels = bidi.getLevels(linebreaks); writer.println(""); final byte baseLevel = bidi.getBaseLevel(); - writer.println(""); + writer.println( + ""); writer.println("
Base Level" + baseLevel + " = " + (baseLevel == 0 ? "LTR" : "RTL") + "" + (baseDirection >= 0 ? "explicit" : "heuristic") + "" + + baseLevel + + " = " + + (baseLevel == 0 ? "LTR" : "RTL") + + "" + + (baseDirection >= 0 ? "explicit" : "heuristic") + + "
"); // output original text @@ -1645,17 +1923,28 @@ private static void showBidiLine(String str, int baseDirection, PrintWriter writ } writer.println("
Character " + getLiteral(getBidiChar(str, i, codes[i])) + " " + + getLiteral(getBidiChar(str, i, codes[i])) + + "
Bidi Class" + BidiReference.getHtmlTypename(codes[i]) + "" + + BidiReference.getHtmlTypename(codes[i]) + + "
Rules Applied" + bidi.getChanges(i).replace("\n", "
") + "
" + + bidi.getChanges(i).replace("\n", "
") + + "
Resulting Level" + " " + getLiteral(bidiChar) +"" + + " " + + getLiteral(bidiChar) + + "
"); - } private static String getBidiChar(String str, int i, byte b) { - if (b == BidiReference.PDF || b == BidiReference.RLE || b == BidiReference.LRE || b == BidiReference.LRO || b == BidiReference.RLO || b == BidiReference.BN) { + if (b == BidiReference.PDF + || b == BidiReference.RLE + || b == BidiReference.LRE + || b == BidiReference.LRO + || b == BidiReference.RLO + || b == BidiReference.BN) { return ""; } - String substring = str.substring(i,i+1); - if ((substring.equals("<") || substring.equals(">")) && (b == BidiReference.L || b == BidiReference.R)) { + String substring = str.substring(i, i + 1); + if ((substring.equals("<") || substring.equals(">")) + && (b == BidiReference.L || b == BidiReference.R)) { return ""; } return substring; @@ -1717,16 +2022,18 @@ public static String testIdnaLines(String lines, String filter) { lines = UnicodeJsp.UNESCAPER.transform(lines.trim()); StringBuilder resultLines = new StringBuilder(); - //UnicodeUtilities.getIdna2008Tester(); + // UnicodeUtilities.getIdna2008Tester(); - Predicate verifier2008 = new Predicate() { - public boolean is(String item) { - return Idna2008.SINGLETON.isValid(item); - } - }; + Predicate verifier2008 = + new Predicate() { + public boolean is(String item) { + return Idna2008.SINGLETON.isValid(item); + } + }; resultLines.append("\n"); - resultLines.append("\n"); + resultLines.append( + "\n"); boolean first = true; boolean[] errorOut = new boolean[1]; @@ -1738,57 +2045,100 @@ public boolean is(String item) { addBlank(resultLines); } - String rawPunycode = UnicodeUtilities.processLabels(line, IdnaTypes.DOTS, true, new Predicate() { - public boolean is(Object item) { - return true; - }}); - - - // String tr46 = UnicodeUtilities.processLabels(tr46back, UnicodeUtilities.DOTS, true, new Predicate() { + String rawPunycode = + UnicodeUtilities.processLabels( + line, + IdnaTypes.DOTS, + true, + new Predicate() { + public boolean is(Object item) { + return true; + } + }); + + // String tr46 = 
UnicodeUtilities.processLabels(tr46back, + // UnicodeUtilities.DOTS, true, new Predicate() { // public boolean is(String item) { - // return Uts46.SINGLETON.transform(item).indexOf('\uFFFD') < 0; // Uts46.SINGLETON.Uts46Chars.containsAll(item); + // return Uts46.SINGLETON.transform(item).indexOf('\uFFFD') < 0; // + // Uts46.SINGLETON.Uts46Chars.containsAll(item); // } // }); // String tr46display = Uts46.SINGLETON.toUnicode(line, errorOut); - // tr46display = UnicodeUtilities.processLabels(tr46display, UnicodeUtilities.DOTS, false, new Predicate() { + // tr46display = UnicodeUtilities.processLabels(tr46display, + // UnicodeUtilities.DOTS, false, new Predicate() { // public boolean is(String item) { - // return Uts46.SINGLETON.toUnicode(item).indexOf('\uFFFD') < 0; // Uts46.SINGLETON.Uts46Chars.containsAll(item); + // return Uts46.SINGLETON.toUnicode(item).indexOf('\uFFFD') < 0; // + // Uts46.SINGLETON.Uts46Chars.containsAll(item); // //return Uts46.SINGLETON.Uts46CharsDisplay.containsAll(item); // } // }); - // first lines resultLines.append(""); resultLines.append(""); addCell(resultLines, hex, line, "class='cn ltgreen'", "None"); String idna2003unic = Idna2003.SINGLETON.toUnicode(line, errorOut, true); - addCell(resultLines, hex, idna2003unic, getIdnaClass("cn i2003", errorOut[0]), "IDNA2003"); + addCell( + resultLines, + hex, + idna2003unic, + getIdnaClass("cn i2003", errorOut[0]), + "IDNA2003"); String uts46unic = Uts46.SINGLETON.toUnicode(line, errorOut, true); - addCell(resultLines, hex, uts46unic, getIdnaClass("cn i46", errorOut[0]), "UTS46%2BUTS39"); - - String idna2008unic = UnicodeUtilities.processLabels(line, IdnaTypes.DOT, false, verifier2008); - addCell(resultLines, hex, idna2008unic, getIdnaClass("cn i2008", idna2008unic.contains("\uFFFD")), "IDNA2003"); + addCell( + resultLines, + hex, + uts46unic, + getIdnaClass("cn i46", errorOut[0]), + "UTS46%2BUTS39"); + + String idna2008unic = + UnicodeUtilities.processLabels(line, IdnaTypes.DOT, false, 
verifier2008); + addCell( + resultLines, + hex, + idna2008unic, + getIdnaClass("cn i2008", idna2008unic.contains("\uFFFD")), + "IDNA2003"); resultLines.append(""); resultLines.append(""); addCell(resultLines, hex, rawPunycode, "class='cn ltgreen mono'", null); String idna2003puny = Idna2003.SINGLETON.toPunyCode(line, errorOut); - addCell(resultLines, hex, idna2003puny, getIdnaClass("cn mono i2003", errorOut[0]), null); + addCell( + resultLines, + hex, + idna2003puny, + getIdnaClass("cn mono i2003", errorOut[0]), + null); String uts46puny = Uts46.SINGLETON.toPunyCode(line, errorOut); - addCell(resultLines, hex, uts46puny, getIdnaClass("cn mono i46", errorOut[0]), null); - - String idna2008puny = UnicodeUtilities.processLabels(line, IdnaTypes.DOT, true, verifier2008); - addCell(resultLines, hex, idna2008puny, getIdnaClass("cn mono i2008", idna2008puny.contains("\uFFFD")), null); + addCell( + resultLines, + hex, + uts46puny, + getIdnaClass("cn mono i46", errorOut[0]), + null); + + String idna2008puny = + UnicodeUtilities.processLabels(line, IdnaTypes.DOT, true, verifier2008); + addCell( + resultLines, + hex, + idna2008puny, + getIdnaClass("cn mono i2008", idna2008puny.contains("\uFFFD")), + null); // if (result == null) { - // resultLines.append(""); + // resultLines.append(""); // } else { // resultLines.append(""); @@ -1804,11 +2154,11 @@ public boolean is(Object item) { } private static String getIdnaClass(String classItems, boolean error) { - return "class='" + - classItems + (error ? " error" : "") + "'"; + return "class='" + classItems + (error ? 
" error" : "") + "'"; } - static String processLabels(String inputLabels, Pattern dotPattern, boolean punycode, Predicate verifier) { + static String processLabels( + String inputLabels, Pattern dotPattern, boolean punycode, Predicate verifier) { StringBuilder result = new StringBuilder(); for (String label : dotPattern.split(inputLabels)) { if (result.length() != 0) { @@ -1833,8 +2183,6 @@ static String processLabels(String inputLabels, Pattern dotPattern, boolean puny } return result.toString(); } - - } /* diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/UtfParameters.java b/UnicodeJsps/src/main/java/org/unicode/jsp/UtfParameters.java index 53ec18089..f360887ed 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/UtfParameters.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/UtfParameters.java @@ -1,8 +1,7 @@ -/** - * - */ +/** */ package org.unicode.jsp; +import com.ibm.icu.text.UnicodeSet; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.util.Collections; @@ -10,76 +9,80 @@ import java.util.LinkedHashMap; import java.util.Map; -import com.ibm.icu.text.UnicodeSet; - public class UtfParameters implements Iterable { - private Map map = new LinkedHashMap(); + private Map map = new LinkedHashMap(); - public UtfParameters(String query) { - if (query != null) { - String[] queries = query.split("&"); - for (String s : queries) { - int pos = s.indexOf('='); - String key = pos == -1 ? s : s.substring(0,pos); - try { - key = URLDecoder.decode(key, "UTF-8"); - } catch (Exception e) {} - String value = pos == -1 ? "" : s.substring(pos+1); - try { - value = URLDecoder.decode(value, "UTF-8"); - } catch (Exception e) {} - map.put(key, value); - } + public UtfParameters(String query) { + if (query != null) { + String[] queries = query.split("&"); + for (String s : queries) { + int pos = s.indexOf('='); + String key = pos == -1 ? 
s : s.substring(0, pos); + try { + key = URLDecoder.decode(key, "UTF-8"); + } catch (Exception e) { + } + String value = pos == -1 ? "" : s.substring(pos + 1); + try { + value = URLDecoder.decode(value, "UTF-8"); + } catch (Exception e) { + } + map.put(key, value); + } + } + map = Collections.unmodifiableMap(map); } - map = Collections.unmodifiableMap(map); - } - public String getParameter(String key) { - return map.get(key); - } - public String getParameter(String key, String nullReplacement) { - String result = map.get(key); - if (result == null) { - return nullReplacement; + + public String getParameter(String key) { + return map.get(key); } - return result; - } - public String getParameter(String key, String nullReplacement, String emptyReplacement) { - String result = map.get(key); - if (result == null) { - return nullReplacement; + + public String getParameter(String key, String nullReplacement) { + String result = map.get(key); + if (result == null) { + return nullReplacement; + } + return result; } - if (result.length() == 0) { - return emptyReplacement; + + public String getParameter(String key, String nullReplacement, String emptyReplacement) { + String result = map.get(key); + if (result == null) { + return nullReplacement; + } + if (result.length() == 0) { + return emptyReplacement; + } + return result; } - return result; - } - public Iterator iterator() { - return map.keySet().iterator(); - } - private static UnicodeSet okByte = new UnicodeSet("[A-Za-z0-9]"); + public Iterator iterator() { + return map.keySet().iterator(); + } + + private static UnicodeSet okByte = new UnicodeSet("[A-Za-z0-9]"); - public static String fixQuery(String input) { - try { - StringBuilder result = new StringBuilder(); - byte[] bytes = input.getBytes("utf-8"); - for (int i = 0; i < bytes.length; ++i) { - int ch = bytes[i] & 0xFF; - if (okByte.contains(ch)) { - result.append((char)ch); - } else { - result.append('%'); - String hex = Integer.toHexString(ch); - if (hex.length() 
== 1) { - result.append('0'); - } - result.append(hex); + public static String fixQuery(String input) { + try { + StringBuilder result = new StringBuilder(); + byte[] bytes = input.getBytes("utf-8"); + for (int i = 0; i < bytes.length; ++i) { + int ch = bytes[i] & 0xFF; + if (okByte.contains(ch)) { + result.append((char) ch); + } else { + result.append('%'); + String hex = Integer.toHexString(ch); + if (hex.length() == 1) { + result.append('0'); + } + result.append(hex); + } + } + return result.toString(); + } catch (UnsupportedEncodingException e) { + return null; } - } - return result.toString(); - } catch (UnsupportedEncodingException e) { - return null; } - } -} \ No newline at end of file +} diff --git a/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java b/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java index aaba813f8..e5c8268b9 100644 --- a/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java +++ b/UnicodeJsps/src/main/java/org/unicode/jsp/XPropertyFactory.java @@ -1,21 +1,5 @@ package org.unicode.jsp; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Locale; - -import org.unicode.idna.Idna.IdnaType; -import org.unicode.idna.Idna2003; -import org.unicode.idna.Idna2008; -import org.unicode.idna.Uts46; -import org.unicode.props.UnicodeProperty; -import org.unicode.props.UnicodeProperty.AliasAddAction; -import org.unicode.props.UnicodeProperty.BaseProperty; -import org.unicode.props.UnicodeProperty.Factory; -import org.unicode.props.UnicodeProperty.SimpleProperty; - import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.lang.CharSequences; import com.ibm.icu.lang.UCharacter; @@ -30,26 +14,43 @@ import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.ULocale; import com.ibm.icu.util.VersionInfo; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; 
+import org.unicode.idna.Idna.IdnaType; +import org.unicode.idna.Idna2003; +import org.unicode.idna.Idna2008; +import org.unicode.idna.Uts46; +import org.unicode.props.UnicodeProperty; +import org.unicode.props.UnicodeProperty.AliasAddAction; +import org.unicode.props.UnicodeProperty.BaseProperty; +import org.unicode.props.UnicodeProperty.Factory; +import org.unicode.props.UnicodeProperty.SimpleProperty; public class XPropertyFactory extends UnicodeProperty.Factory { - static final UnicodeSet ALL = new UnicodeSet("[[:^C:][:Cc:][:Cf:][:noncharactercodepoint:]]").freeze(); + static final UnicodeSet ALL = + new UnicodeSet("[[:^C:][:Cc:][:Cf:][:noncharactercodepoint:]]").freeze(); static final class XPropertyFactoryHelper { XPropertyFactory factory = null; + XPropertyFactoryHelper() { factory = new XPropertyFactory(); } + static XPropertyFactoryHelper INSTANCE = new XPropertyFactoryHelper(); } - public static XPropertyFactory make() { + public static XPropertyFactory make() { return XPropertyFactoryHelper.INSTANCE.factory; } public final Factory add2(UnicodeProperty sp) { UnicodeProperty already = getProperty(sp.getName()); - if (already== null) { + if (already == null) { return add(sp); } else { System.err.println("Duplicate property:" + sp.getName()); @@ -59,7 +60,8 @@ public final Factory add2(UnicodeProperty sp) { { ICUPropertyFactory base = ICUPropertyFactory.make(); - for (String propertyAlias : (List)base.getInternalAvailablePropertyAliases(new ArrayList())) { + for (String propertyAlias : + (List) base.getInternalAvailablePropertyAliases(new ArrayList())) { add(base.getProperty(propertyAlias)); } for (int i = Common.XSTRING_START; i < Common.XSTRING_LIMIT; ++i) { @@ -71,79 +73,170 @@ public final Factory add2(UnicodeProperty sp) { add(new UTS46()); add(new IDNA2008()); add(new IDNA2008c()); - //add(new Usage()); + // add(new Usage()); add(new HanType()); - add(new UnicodeProperty.UnicodeMapProperty().set(Idna2003.SINGLETON.mappings).setMain("toIdna2003", 
"toIdna2003", UnicodeProperty.STRING, "1.1")); - add(new UnicodeProperty.UnicodeMapProperty().set(Uts46.SINGLETON.mappings).setMain("toUts46t", "toUts46t", UnicodeProperty.STRING, "1.1")); - add(new UnicodeProperty.UnicodeMapProperty().set(Uts46.SINGLETON.getMappingsDisplay()).setMain("toUts46n", "toUts46n", UnicodeProperty.STRING, "1.1")); - - add(new StringTransformProperty(Common.NFKC_CF, false).setMain("NFKC_Casefold", "NFKC_CF", UnicodeProperty.STRING, "1.1").addName("toNFKC_CF")); - - add(new CodepointTransformProperty(new Transform() { - public String transform(Integer source) { - return Normalizer.normalize(source, Normalizer.NFC); - }}, false).setMain("toNFC", "toNFC", UnicodeProperty.STRING, "1.1")); - add(new CodepointTransformProperty(new Transform() { - public String transform(Integer source) { - return Normalizer.normalize(source, Normalizer.NFD); - }}, false).setMain("toNFD", "toNFD", UnicodeProperty.STRING, "1.1")); - add(new CodepointTransformProperty(new Transform() { - public String transform(Integer source) { - return Normalizer.normalize(source, Normalizer.NFKC); - }}, false).setMain("toNFKC", "toNFKC", UnicodeProperty.STRING, "1.1")); - add(new CodepointTransformProperty(new Transform() { - public String transform(Integer source) { - return Normalizer.normalize(source, Normalizer.NFKD); - }}, false).setMain("toNFKD", "toNFKD", UnicodeProperty.STRING, "1.1")); - - add(new StringTransformProperty(new StringTransform() { - public String transform(String source) { - return UCharacter.foldCase(source, true); - }}, false).setMain("toCasefold", "toCF", UnicodeProperty.STRING, "1.1")); - add(new StringTransformProperty(new StringTransform() { - public String transform(String source) { - return UCharacter.toLowerCase(ULocale.ROOT, source); - }}, false).setMain("toLowercase", "toLC", UnicodeProperty.STRING, "1.1")); - add(new StringTransformProperty(new StringTransform() { - public String transform(String source) { - return 
UCharacter.toUpperCase(ULocale.ROOT, source); - }}, false).setMain("toUppercase", "toUC", UnicodeProperty.STRING, "1.1")); - add(new StringTransformProperty(new StringTransform() { - public String transform(String source) { - return UCharacter.toTitleCase(ULocale.ROOT, source, null); - }}, false).setMain("toTitlecase", "toTC", UnicodeProperty.STRING, "1.1")); - - add(new StringTransformProperty(new StringTransform() { - public String transform(String source) { - StringBuilder b = new StringBuilder(); - for (int cp : CharSequences.codePoints(source)) { - b.appendCodePoint(UCharacter.getBidiPairedBracket(cp)); - } - return b.toString(); - }}, false).setMain("Bidi_Paired_Bracket", "bpb", UnicodeProperty.STRING, "7.0")); - - add(new StringTransformProperty(new StringTransform() { - public String transform(String source) { - String result = NFM.nfm.get(source); - return result == null ? source : result; - }}, false).setMain("toNFM", "toNFM", UnicodeProperty.STRING, "1.1")); - //add(new UnicodeProperty.UnicodeMapProperty().set(NFM.nfm).setMain("toNFM", "toNFM", UnicodeProperty.STRING, "1.1")); - add(new UnicodeSetProperty().set(NFM.nfm.getSet(null)).setMain("isNFM", "isNFM", UnicodeProperty.BINARY, "1.1")); - - add(new CodepointTransformProperty(new Transform() { - public String transform(Integer source) { - return UnicodeUtilities.getSubheader().getSubheader(source); - }}, false).setMain("subhead", "subhead", UnicodeProperty.STRING, "1.1")); - - add(new UnicodeSetProperty().set("[:^nfcqc=n:]").setMain("isNFC", "isNFC", UnicodeProperty.BINARY, "1.1")); - add(new UnicodeSetProperty().set("[:^nfdqc=n:]").setMain("isNFD", "isNFD", UnicodeProperty.BINARY, "1.1")); - add(new UnicodeSetProperty().set("[:^nfkcqc=n:]").setMain("isNFKC", "isNFKC", UnicodeProperty.BINARY, "1.1")); - add(new UnicodeSetProperty().set("[:^nfkdqc=n:]").setMain("isNFKD", "isNFKD", UnicodeProperty.BINARY, "1.1")); - add(new UnicodeSetProperty().set("[\\u0000-\\u007F]").setMain("ASCII", "ASCII", 
UnicodeProperty.BINARY, "1.1")); - add(new UnicodeSetProperty().set("[\\u0000-\\U0010FFFF]").setMain("ANY", "ANY", UnicodeProperty.BINARY, "1.1")); - - add(new UnicodeSetProperty().set(new UnicodeSet("[\\u0000-\\uFFFF]")) - .setMain("bmp", "bmp", UnicodeProperty.BINARY, "6.0")); + add( + new UnicodeProperty.UnicodeMapProperty() + .set(Idna2003.SINGLETON.mappings) + .setMain("toIdna2003", "toIdna2003", UnicodeProperty.STRING, "1.1")); + add( + new UnicodeProperty.UnicodeMapProperty() + .set(Uts46.SINGLETON.mappings) + .setMain("toUts46t", "toUts46t", UnicodeProperty.STRING, "1.1")); + add( + new UnicodeProperty.UnicodeMapProperty() + .set(Uts46.SINGLETON.getMappingsDisplay()) + .setMain("toUts46n", "toUts46n", UnicodeProperty.STRING, "1.1")); + + add( + new StringTransformProperty(Common.NFKC_CF, false) + .setMain("NFKC_Casefold", "NFKC_CF", UnicodeProperty.STRING, "1.1") + .addName("toNFKC_CF")); + + add( + new CodepointTransformProperty( + new Transform() { + public String transform(Integer source) { + return Normalizer.normalize(source, Normalizer.NFC); + } + }, + false) + .setMain("toNFC", "toNFC", UnicodeProperty.STRING, "1.1")); + add( + new CodepointTransformProperty( + new Transform() { + public String transform(Integer source) { + return Normalizer.normalize(source, Normalizer.NFD); + } + }, + false) + .setMain("toNFD", "toNFD", UnicodeProperty.STRING, "1.1")); + add( + new CodepointTransformProperty( + new Transform() { + public String transform(Integer source) { + return Normalizer.normalize(source, Normalizer.NFKC); + } + }, + false) + .setMain("toNFKC", "toNFKC", UnicodeProperty.STRING, "1.1")); + add( + new CodepointTransformProperty( + new Transform() { + public String transform(Integer source) { + return Normalizer.normalize(source, Normalizer.NFKD); + } + }, + false) + .setMain("toNFKD", "toNFKD", UnicodeProperty.STRING, "1.1")); + + add( + new StringTransformProperty( + new StringTransform() { + public String transform(String source) { + return 
UCharacter.foldCase(source, true); + } + }, + false) + .setMain("toCasefold", "toCF", UnicodeProperty.STRING, "1.1")); + add( + new StringTransformProperty( + new StringTransform() { + public String transform(String source) { + return UCharacter.toLowerCase(ULocale.ROOT, source); + } + }, + false) + .setMain("toLowercase", "toLC", UnicodeProperty.STRING, "1.1")); + add( + new StringTransformProperty( + new StringTransform() { + public String transform(String source) { + return UCharacter.toUpperCase(ULocale.ROOT, source); + } + }, + false) + .setMain("toUppercase", "toUC", UnicodeProperty.STRING, "1.1")); + add( + new StringTransformProperty( + new StringTransform() { + public String transform(String source) { + return UCharacter.toTitleCase(ULocale.ROOT, source, null); + } + }, + false) + .setMain("toTitlecase", "toTC", UnicodeProperty.STRING, "1.1")); + + add( + new StringTransformProperty( + new StringTransform() { + public String transform(String source) { + StringBuilder b = new StringBuilder(); + for (int cp : CharSequences.codePoints(source)) { + b.appendCodePoint(UCharacter.getBidiPairedBracket(cp)); + } + return b.toString(); + } + }, + false) + .setMain("Bidi_Paired_Bracket", "bpb", UnicodeProperty.STRING, "7.0")); + + add( + new StringTransformProperty( + new StringTransform() { + public String transform(String source) { + String result = NFM.nfm.get(source); + return result == null ? 
source : result; + } + }, + false) + .setMain("toNFM", "toNFM", UnicodeProperty.STRING, "1.1")); + // add(new UnicodeProperty.UnicodeMapProperty().set(NFM.nfm).setMain("toNFM", "toNFM", + // UnicodeProperty.STRING, "1.1")); + add( + new UnicodeSetProperty() + .set(NFM.nfm.getSet(null)) + .setMain("isNFM", "isNFM", UnicodeProperty.BINARY, "1.1")); + + add( + new CodepointTransformProperty( + new Transform() { + public String transform(Integer source) { + return UnicodeUtilities.getSubheader().getSubheader(source); + } + }, + false) + .setMain("subhead", "subhead", UnicodeProperty.STRING, "1.1")); + + add( + new UnicodeSetProperty() + .set("[:^nfcqc=n:]") + .setMain("isNFC", "isNFC", UnicodeProperty.BINARY, "1.1")); + add( + new UnicodeSetProperty() + .set("[:^nfdqc=n:]") + .setMain("isNFD", "isNFD", UnicodeProperty.BINARY, "1.1")); + add( + new UnicodeSetProperty() + .set("[:^nfkcqc=n:]") + .setMain("isNFKC", "isNFKC", UnicodeProperty.BINARY, "1.1")); + add( + new UnicodeSetProperty() + .set("[:^nfkdqc=n:]") + .setMain("isNFKD", "isNFKD", UnicodeProperty.BINARY, "1.1")); + add( + new UnicodeSetProperty() + .set("[\\u0000-\\u007F]") + .setMain("ASCII", "ASCII", UnicodeProperty.BINARY, "1.1")); + add( + new UnicodeSetProperty() + .set("[\\u0000-\\U0010FFFF]") + .setMain("ANY", "ANY", UnicodeProperty.BINARY, "1.1")); + + add( + new UnicodeSetProperty() + .set(new UnicodeSet("[\\u0000-\\uFFFF]")) + .setMain("bmp", "bmp", UnicodeProperty.BINARY, "6.0")); addCollationProperty(); @@ -152,44 +245,63 @@ public String transform(Integer source) { UnicodeMap specialMap = new UnicodeMap(); specialMap.putAll(scriptProp.getUnicodeMap()); specialMap.putAll(ScriptTester.getScriptSpecialsNames()); - add(new UnicodeProperty.UnicodeMapProperty() - .set(specialMap) - .setMain("Script_Extensions", "scx", UnicodeProperty.ENUMERATED, "1.1") - .addValueAliases(ScriptTester.getScriptSpecialsAlternates(), AliasAddAction.IGNORE_IF_MISSING) - ); + add( + new UnicodeProperty.UnicodeMapProperty() 
+ .set(specialMap) + .setMain("Script_Extensions", "scx", UnicodeProperty.ENUMERATED, "1.1") + .addValueAliases( + ScriptTester.getScriptSpecialsAlternates(), + AliasAddAction.IGNORE_IF_MISSING)); CachedProps cp = CachedProps.CACHED_PROPS; for (String prop : cp.getAvailable()) { add2(cp.getProperty(prop)); } - UnicodeSet Basic_Emoji = cp.getProperty("Basic_Emoji").getSet("Yes", null); // TODO: was .getTrueSet(); - UnicodeSet Emoji_Keycap_Sequence = cp.getProperty("RGI_Emoji_Keycap_Sequence").getSet("Yes", null); // TODO: was .getTrueSet(); - UnicodeSet RGI_Emoji_Modifier_Sequence = cp.getProperty("RGI_Emoji_Modifier_Sequence").getSet("Yes", null); // TODO: was .getTrueSet(); - UnicodeSet RGI_Emoji_Tag_Sequence = cp.getProperty("RGI_Emoji_Tag_Sequence").getSet("Yes", null); // TODO: was .getTrueSet(); - UnicodeSet RGI_Emoji_Flag_Sequence = cp.getProperty("RGI_Emoji_Flag_Sequence").getSet("Yes", null); // TODO: was .getTrueSet(); - UnicodeSet RGI_Emoji_Zwj_Sequence = cp.getProperty("RGI_Emoji_Zwj_Sequence").getSet("Yes", null); // TODO: was .getTrueSet(); - UnicodeSet RGI_Emoji = new UnicodeSet() - .add(Basic_Emoji) - .add(Emoji_Keycap_Sequence) - .add(RGI_Emoji_Modifier_Sequence) - .add(RGI_Emoji_Flag_Sequence) - .add(RGI_Emoji_Tag_Sequence) - .add(RGI_Emoji_Zwj_Sequence) - .freeze(); - add(new UnicodeSetProperty().set(RGI_Emoji).setMain("RGI_Emoji", "RGI_Emoji", UnicodeProperty.BINARY, "13.0")); + UnicodeSet Basic_Emoji = + cp.getProperty("Basic_Emoji").getSet("Yes", null); // TODO: was .getTrueSet(); + UnicodeSet Emoji_Keycap_Sequence = + cp.getProperty("RGI_Emoji_Keycap_Sequence") + .getSet("Yes", null); // TODO: was .getTrueSet(); + UnicodeSet RGI_Emoji_Modifier_Sequence = + cp.getProperty("RGI_Emoji_Modifier_Sequence") + .getSet("Yes", null); // TODO: was .getTrueSet(); + UnicodeSet RGI_Emoji_Tag_Sequence = + cp.getProperty("RGI_Emoji_Tag_Sequence") + .getSet("Yes", null); // TODO: was .getTrueSet(); + UnicodeSet RGI_Emoji_Flag_Sequence = + 
cp.getProperty("RGI_Emoji_Flag_Sequence") + .getSet("Yes", null); // TODO: was .getTrueSet(); + UnicodeSet RGI_Emoji_Zwj_Sequence = + cp.getProperty("RGI_Emoji_Zwj_Sequence") + .getSet("Yes", null); // TODO: was .getTrueSet(); + UnicodeSet RGI_Emoji = + new UnicodeSet() + .add(Basic_Emoji) + .add(Emoji_Keycap_Sequence) + .add(RGI_Emoji_Modifier_Sequence) + .add(RGI_Emoji_Flag_Sequence) + .add(RGI_Emoji_Tag_Sequence) + .add(RGI_Emoji_Zwj_Sequence) + .freeze(); + add( + new UnicodeSetProperty() + .set(RGI_Emoji) + .setMain("RGI_Emoji", "RGI_Emoji", UnicodeProperty.BINARY, "13.0")); } private void addCollationProperty() { RuleBasedCollator c = UnicodeSetUtilities.RAW_COLLATOR; - //(RuleBasedCollator) Collator.getInstance(ULocale.ROOT); - //c.setCaseLevel(true); + // (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); + // c.setCaseLevel(true); UnicodeMap collationMap0 = new UnicodeMap(); UnicodeMap collationMap1 = new UnicodeMap(); UnicodeMap collationMap2 = new UnicodeMap(); UnicodeMap collationMap3 = new UnicodeMap(); RawCollationKey key = new RawCollationKey(); - StringBuilder[] builder = {new StringBuilder(), new StringBuilder(), new StringBuilder(), new StringBuilder()}; + StringBuilder[] builder = { + new StringBuilder(), new StringBuilder(), new StringBuilder(), new StringBuilder() + }; UnicodeSet contractions = new UnicodeSet(); UnicodeSet expansions = new UnicodeSet(); try { @@ -197,9 +309,13 @@ private void addCollationProperty() { } catch (Exception e) { throw new IllegalArgumentException(e); } - UnicodeSet stuff = new UnicodeSet(ALL).addAll(contractions).addAll(expansions).removeAll(new UnicodeSet("[:unified_ideograph:]")); + UnicodeSet stuff = + new UnicodeSet(ALL) + .addAll(contractions) + .addAll(expansions) + .removeAll(new UnicodeSet("[:unified_ideograph:]")); for (String s : stuff) { - //c.getRawCollationKey(s, key); + // c.getRawCollationKey(s, key); builder[0].setLength(0); builder[1].setLength(0); builder[2].setLength(0); @@ -228,12 +344,14 @@ 
private void addCollationProperty() { tertiary ^= caseLevel; caseLevel |= 1; // fake 1 bit - while (nextCe != CollationElementIterator.NULLORDER && (nextCe & 0xC0) == 0xC0) { // Continuation!! + while (nextCe != CollationElementIterator.NULLORDER + && (nextCe & 0xC0) == 0xC0) { // Continuation!! ce = nextCe; nextCe = it.next(); primary = (primary << 16) | CollationElementIterator.primaryOrder(ce); secondary = (secondary << 8) | CollationElementIterator.secondaryOrder(ce); - tertiary = (tertiary << 8) | (CollationElementIterator.tertiaryOrder(ce) & 0x3F); + tertiary = + (tertiary << 8) | (CollationElementIterator.tertiaryOrder(ce) & 0x3F); } addBytes(builder[0], primary); addBytes(builder[1], secondary); @@ -249,14 +367,22 @@ private void addCollationProperty() { // System.out.println(collationMap1.values().size()); // System.out.println(collationMap2.values().size()); // System.out.println(collationMap3.values().size()); - add(new UnicodeProperty.UnicodeMapProperty() - .set(collationMap0).setMain("uca", "uca1", UnicodeProperty.ENUMERATED, "1.1")); - add(new UnicodeProperty.UnicodeMapProperty() - .set(collationMap1).setMain("uca2", "uca2", UnicodeProperty.ENUMERATED, "1.1")); - add(new UnicodeProperty.UnicodeMapProperty() - .set(collationMap2).setMain("uca2.5", "uca2.5", UnicodeProperty.ENUMERATED, "1.1")); - add(new UnicodeProperty.UnicodeMapProperty() - .set(collationMap3).setMain("uca3", "uca3", UnicodeProperty.ENUMERATED, "1.1")); + add( + new UnicodeProperty.UnicodeMapProperty() + .set(collationMap0) + .setMain("uca", "uca1", UnicodeProperty.ENUMERATED, "1.1")); + add( + new UnicodeProperty.UnicodeMapProperty() + .set(collationMap1) + .setMain("uca2", "uca2", UnicodeProperty.ENUMERATED, "1.1")); + add( + new UnicodeProperty.UnicodeMapProperty() + .set(collationMap2) + .setMain("uca2.5", "uca2.5", UnicodeProperty.ENUMERATED, "1.1")); + add( + new UnicodeProperty.UnicodeMapProperty() + .set(collationMap3) + .setMain("uca3", "uca3", UnicodeProperty.ENUMERATED, 
"1.1")); } private void addBytes(StringBuilder builder, int bytes) { @@ -325,10 +451,9 @@ protected List _getValueAliases(String valueAlias, List result) { protected String _getVersion() { return VersionInfo.ICU_VERSION.toString(); } - } - private static abstract class XEnumUnicodeProperty extends UnicodeProperty { + private abstract static class XEnumUnicodeProperty extends UnicodeProperty { List values = new ArrayList(); public XEnumUnicodeProperty(String name, Object[] values) { @@ -363,7 +488,6 @@ protected List _getValueAliases(String valueAlias, List result) { protected String _getVersion() { return VersionInfo.ICU_VERSION.toString(); } - } private static class IDNA2003 extends XEnumUnicodeProperty { @@ -375,6 +499,7 @@ public IDNA2003() { protected String _getValue(int codepoint) { return Idna2003.SINGLETON.getType(codepoint).toString(); } + @Override protected List _getNameAliases(List result) { super._getNameAliases(result); @@ -418,14 +543,19 @@ protected String _getValue(int codepoint) { private static class IcuEnumProperty extends XEnumUnicodeProperty { final int propNum; + public IcuEnumProperty(int propNum) { - super(UCharacter.getPropertyName(propNum, NameChoice.LONG), getValues(propNum).toArray()); + super( + UCharacter.getPropertyName(propNum, NameChoice.LONG), + getValues(propNum).toArray()); this.propNum = propNum; } private static List getValues(int propNum) { List valueList = new ArrayList(); - for (int i = UCharacter.getIntPropertyMinValue(propNum); i <= UCharacter.getIntPropertyMaxValue(propNum); ++i) { + for (int i = UCharacter.getIntPropertyMinValue(propNum); + i <= UCharacter.getIntPropertyMaxValue(propNum); + ++i) { valueList.add(UCharacter.getPropertyValueName(propNum, i, NameChoice.LONG)); } return valueList; @@ -445,12 +575,14 @@ protected String _getValue(int codepoint) { // private static class IcuBidiPairedBracket extends SimpleProperty { // final int propNum; // public IcuBidiPairedBracket() { - // 
setName(UCharacter.getPropertyName(UProperty.BIDI_PAIRED_BRACKET, NameChoice.LONG)); + // setName(UCharacter.getPropertyName(UProperty.BIDI_PAIRED_BRACKET, + // NameChoice.LONG)); // this.propNum = UProperty.BIDI_PAIRED_BRACKET; // } // @Override // public List _getNameAliases(List result) { - // return Arrays.asList(UCharacter.getPropertyName(propNum, NameChoice.LONG), UCharacter.getPropertyName(propNum, NameChoice.SHORT)); + // return Arrays.asList(UCharacter.getPropertyName(propNum, NameChoice.LONG), + // UCharacter.getPropertyName(propNum, NameChoice.SHORT)); // } // // @Override @@ -465,7 +597,8 @@ protected String _getValue(int codepoint) { // } // private static class Usage extends XEnumUnicodeProperty { - // enum UsageValues {common, historic, deprecated, liturgical, limited, symbol, punctuation, na; + // enum UsageValues {common, historic, deprecated, liturgical, limited, symbol, + // punctuation, na; // public static UsageValues getValue(int codepoint) { // if (UnicodeProperty.SPECIALS.contains(codepoint)) return na; // if (UnicodeUtilities.DEPRECATED.contains(codepoint)) return deprecated; @@ -492,7 +625,13 @@ protected String _getValue(int codepoint) { // } static class HanType extends XEnumUnicodeProperty { - enum HanTypeValues {na, Hans, Hant, Han} + enum HanTypeValues { + na, + Hans, + Hant, + Han + } + public HanType() { super("HanType", HanTypeValues.values()); setType(UnicodeProperty.EXTENDED_ENUMERATED); @@ -505,24 +644,28 @@ protected String _getValue(int codepoint) { } private static class StringTransformProperty extends SimpleProperty { - Transform transform; + Transform transform; - public StringTransformProperty(Transform transform, boolean hasUniformUnassigned) { + public StringTransformProperty( + Transform transform, boolean hasUniformUnassigned) { this.transform = transform; setUniformUnassigned(hasUniformUnassigned); } + protected String _getValue(int codepoint) { return transform.transform(UTF16.valueOf(codepoint)); } } private static 
class CodepointTransformProperty extends SimpleProperty { - Transform transform; + Transform transform; - public CodepointTransformProperty(Transform transform, boolean hasUniformUnassigned) { + public CodepointTransformProperty( + Transform transform, boolean hasUniformUnassigned) { this.transform = transform; setUniformUnassigned(hasUniformUnassigned); } + protected String _getValue(int codepoint) { return transform.transform(codepoint); } @@ -560,7 +703,7 @@ public boolean isDefault(int codepoint) { } private Object hex(byte b) { - String result = Integer.toHexString(0xFF&b).toUpperCase(Locale.ENGLISH); + String result = Integer.toHexString(0xFF & b).toUpperCase(Locale.ENGLISH); return result.length() == 2 ? result : "0" + result; } } @@ -578,10 +721,9 @@ protected String _getValue(int codepoint) { } } - public static class UnicodeSetProperty extends BaseProperty { protected UnicodeSet unicodeSet; - private static final String[] YESNO_ARRAY = new String[]{"Yes", "No"}; + private static final String[] YESNO_ARRAY = new String[] {"Yes", "No"}; private static final List YESNO = Arrays.asList(YESNO_ARRAY); public XPropertyFactory.UnicodeSetProperty set(UnicodeSet set) { @@ -609,5 +751,4 @@ protected List _getAvailableValues(List result) { return YESNO; } } - } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsp/TestUBAVersion.java b/UnicodeJsps/src/test/java/org/unicode/jsp/TestUBAVersion.java index 3c96df434..b83a38fe8 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsp/TestUBAVersion.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsp/TestUBAVersion.java @@ -5,10 +5,8 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; -import java.util.EnumSet; - import com.ibm.icu.text.UnicodeSet; - +import java.util.EnumSet; import org.junit.jupiter.api.Test; import org.unicode.props.UcdPropertyValues.Age_Values; @@ -34,7 +32,7 @@ void UBAVersionTest() { assertNotNull(versions); // Current is the 
last item - assertTrue(current.equals(versions.toArray()[versions.size()-1])); + assertTrue(current.equals(versions.toArray()[versions.size() - 1])); // First is 6.2 final Age_Values first = versions.iterator().next(); diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/QuickCheck.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/QuickCheck.java index 33233dc1d..6c0969f7a 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/QuickCheck.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/QuickCheck.java @@ -1,24 +1,24 @@ package org.unicode.jsptest; +import com.ibm.icu.text.UnicodeSet; import java.io.IOException; - import org.unicode.jsp.UnicodeJsp; -import com.ibm.icu.text.UnicodeSet; - public class QuickCheck { -public static void main(String[] args) throws IOException { - // public static void showSet(String grouping, UnicodeSet a, boolean abbreviate, boolean ucdFormat, Appendable out) throws IOException { -// public static String getSimpleSet(String setA, UnicodeSet a, boolean abbreviate, boolean escape) { + public static void main(String[] args) throws IOException { + // public static void showSet(String grouping, UnicodeSet a, boolean abbreviate, boolean + // ucdFormat, Appendable out) throws IOException { + // public static String getSimpleSet(String setA, UnicodeSet a, boolean abbreviate, + // boolean escape) { - StringBuilder out = new StringBuilder(); - UnicodeSet a = new UnicodeSet(); - String a_out = UnicodeJsp.getSimpleSet("[:confusables:]", a, true, true); - System.out.println(a_out); + StringBuilder out = new StringBuilder(); + UnicodeSet a = new UnicodeSet(); + String a_out = UnicodeJsp.getSimpleSet("[:confusables:]", a, true, true); + System.out.println(a_out); - String outer = UnicodeJsp.getSimpleSet("[:emoji=yes:]", a, false, false); - //UnicodeJsp.showSet("", a, true, false, out); - //String outer = out.toString(); - System.out.println(outer); -} + String outer = UnicodeJsp.getSimpleSet("[:emoji=yes:]", a, false, false); + // 
UnicodeJsp.showSet("", a, true, false, out); + // String outer = out.toString(); + System.out.println(outer); + } } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestAll.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestAll.java index 401bc49c7..fa9c3516f 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestAll.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestAll.java @@ -1,12 +1,10 @@ -//##header J2SE15 +// ##header J2SE15 package org.unicode.jsptest; import com.ibm.icu.dev.test.TestFmwk.TestGroup; -/** - * Top level test used to run all other tests as a batch. - */ +/** Top level test used to run all other tests as a batch. */ public class TestAll extends TestGroup { public static void main(String[] args) { @@ -16,20 +14,20 @@ public static void main(String[] args) { public TestAll() { super( new String[] { - "org.unicode.jsptest.TestAlternateIterator", - "org.unicode.jsptest.TestBasicProperties", - //"org.unicode.jsptest.TestBuilder", // not really a test, move - "org.unicode.jsptest.TestEmoji", - //"org.unicode.jsptest.TestGenerate", // not really a test, move - //"org.unicode.jsptest.TestIcuProperties", // not really a test, move - "org.unicode.jsptest.TestIdna", - "org.unicode.jsptest.TestJsp", - "org.unicode.jsptest.TestLanguageid", - "org.unicode.jsptest.TestProperties", - "org.unicode.jsptest.TestScriptTester", - // "org.unicode.jsptest.TestTypology", - "org.unicode.jsptest.TestUnicodeSet", - "org.unicode.jsptest.TestUts46", + "org.unicode.jsptest.TestAlternateIterator", + "org.unicode.jsptest.TestBasicProperties", + // "org.unicode.jsptest.TestBuilder", // not really a test, move + "org.unicode.jsptest.TestEmoji", + // "org.unicode.jsptest.TestGenerate", // not really a test, move + // "org.unicode.jsptest.TestIcuProperties", // not really a test, move + "org.unicode.jsptest.TestIdna", + "org.unicode.jsptest.TestJsp", + "org.unicode.jsptest.TestLanguageid", + "org.unicode.jsptest.TestProperties", + 
"org.unicode.jsptest.TestScriptTester", + // "org.unicode.jsptest.TestTypology", + "org.unicode.jsptest.TestUnicodeSet", + "org.unicode.jsptest.TestUts46", }, "All tests in jsptest"); } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestAlternateIterator.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestAlternateIterator.java index 99740213c..5f1cfb5fc 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestAlternateIterator.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestAlternateIterator.java @@ -1,5 +1,7 @@ package org.unicode.jsptest; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.UnicodeSet; import org.junit.jupiter.api.Test; import org.unicode.jsp.AlternateIterator; import org.unicode.jsp.Confusables; @@ -7,58 +9,54 @@ import org.unicode.jsp.XIDModifications; import org.unicode.unittest.TestFmwkMinusMinus; -import com.ibm.icu.dev.test.TestFmwk; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.UnicodeSet; - public class TestAlternateIterator extends TestFmwkMinusMinus { - @Test - public void TestUI() { - logln(XIDModifications.getAllowed().toPattern(false)); - logln(UnicodeJsp.getConfusables("\u00c5w", true, false, false, false)); - logln(UnicodeJsp.getConfusables("mark-davis", false, false, false, false)); - logln(UnicodeJsp.getConfusables("mark-davis", true, false, false, false)); - logln(UnicodeJsp.getConfusables("mark-davis", false, true, true, true)); - logln(UnicodeJsp.getConfusables("mark-davis", true, true, true, true)); - logln(UnicodeJsp.getConfusables("mark davis", false, true, true, true)); - } + @Test + public void TestUI() { + logln(XIDModifications.getAllowed().toPattern(false)); + logln(UnicodeJsp.getConfusables("\u00c5w", true, false, false, false)); + logln(UnicodeJsp.getConfusables("mark-davis", false, false, false, false)); + logln(UnicodeJsp.getConfusables("mark-davis", true, false, false, false)); + logln(UnicodeJsp.getConfusables("mark-davis", false, true, true, true)); + 
logln(UnicodeJsp.getConfusables("mark-davis", true, true, true, true)); + logln(UnicodeJsp.getConfusables("mark davis", false, true, true, true)); + } - @Test - public void TestBasic() { - AlternateIterator foo = AlternateIterator.start().add("a", "b", "c").add("d", "e").build(); - int count = 0; - for (String items : foo) { - logln(++count + "\t" + items); + @Test + public void TestBasic() { + AlternateIterator foo = AlternateIterator.start().add("a", "b", "c").add("d", "e").build(); + int count = 0; + for (String items : foo) { + logln(++count + "\t" + items); + } } - } - @Test - public void TestConfusables() { - String test = "mark-davis"; - Confusables confusables = new Confusables(test).setNormalizationCheck(Normalizer.NFKC); - confusables.setAllowedCharacters(new UnicodeSet("[\\-[:L:][:M:][:N:]]")); - confusables.setScriptCheck(Confusables.ScriptCheck.same); - check(confusables); - confusables.setAllowedCharacters(null); - check(confusables); - //confusables.setScriptCheck(Confusables.ScriptCheck.none); - //check(confusables); - } + @Test + public void TestConfusables() { + String test = "mark-davis"; + Confusables confusables = new Confusables(test).setNormalizationCheck(Normalizer.NFKC); + confusables.setAllowedCharacters(new UnicodeSet("[\\-[:L:][:M:][:N:]]")); + confusables.setScriptCheck(Confusables.ScriptCheck.same); + check(confusables); + confusables.setAllowedCharacters(null); + check(confusables); + // confusables.setScriptCheck(Confusables.ScriptCheck.none); + // check(confusables); + } - @Test - private void check(Confusables confusables) { - if (isVerbose()) { - logln("Confusables for: " + confusables.getOriginal()); - logln("\tNormalizationCheck:\t" + confusables.getNormalizationCheck()); - logln("\tScriptCheck:\t" + confusables.getScriptCheck()); - logln("\tAllowedCharacters:\t" + confusables.getAllowedCharacters()); - int count = 0; -// for (String item : confusables) { -// logln(++count + "\t" + item + "\t" + Utility.hex(item)); -// } - } 
else { - assertNotEquals("Confusable count", 0, confusables.iterator().hasNext()); - } - } + @Test + private void check(Confusables confusables) { + if (isVerbose()) { + logln("Confusables for: " + confusables.getOriginal()); + logln("\tNormalizationCheck:\t" + confusables.getNormalizationCheck()); + logln("\tScriptCheck:\t" + confusables.getScriptCheck()); + logln("\tAllowedCharacters:\t" + confusables.getAllowedCharacters()); + int count = 0; + // for (String item : confusables) { + // logln(++count + "\t" + item + "\t" + Utility.hex(item)); + // } + } else { + assertNotEquals("Confusable count", 0, confusables.iterator().hasNext()); + } + } } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestBasicProperties.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestBasicProperties.java index 45b427576..482be8bd6 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestBasicProperties.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestBasicProperties.java @@ -1,16 +1,14 @@ package org.unicode.jsptest; -import java.util.Map.Entry; - import com.ibm.icu.text.Collator; import com.ibm.icu.util.ULocale; - +import java.util.Map.Entry; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIf; import org.unicode.jsp.PropertyMetadata; import org.unicode.jsp.PropertyMetadata.PropertyMetaDatum; -import org.unicode.props.UnicodeProperty; import org.unicode.jsp.XPropertyFactory; +import org.unicode.props.UnicodeProperty; public class TestBasicProperties extends TestFmwk2 { @@ -18,17 +16,20 @@ public class TestBasicProperties extends TestFmwk2 { static Collator col = Collator.getInstance(ULocale.ROOT); static String sample = "क"; - @EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + @EnabledIf( + value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", + disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") @Test 
public void TestListing() { - for (Entry propInfo : PropertyMetadata.getPropertyToData().entrySet()) { + for (Entry propInfo : + PropertyMetadata.getPropertyToData().entrySet()) { if (isVerbose()) { logln(propInfo.toString()); } String propName = propInfo.getKey(); UnicodeProperty prop = factory.getProperty(propName); if (prop == null) { - prop = factory.getProperty(propName+"β"); + prop = factory.getProperty(propName + "β"); } if (!assertNotNull("PropertyMetadata has property: " + propName, prop)) { String realName = prop.getName(); @@ -45,12 +46,14 @@ public void TestListing() { // public void TestPropertyMetadata() { // Set hasMetadata = new TreeSet(); - // for (R4 propData : PropertyMetadata.getCategoryDatatypeSourceProperty()) { + // for (R4 propData : + // PropertyMetadata.getCategoryDatatypeSourceProperty()) { // String propName = propData.get3(); // hasMetadata.add(propName); // } // CachedProps cp = CachedProps.getInstance(VersionInfo.getInstance(10)); - // Set propsMissingMetadata = new LinkedHashSet(cp.getPropertyNames()); + // Set propsMissingMetadata = new + // LinkedHashSet(cp.getPropertyNames()); // propsMissingMetadata.removeAll(hasMetadata); // assertEquals("PropertyMetadata", Collections.EMPTY_SET, propsMissingMetadata); // } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestBuilder.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestBuilder.java index 98d3a891f..277783b6e 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestBuilder.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestBuilder.java @@ -14,190 +14,279 @@ import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; - import org.junit.jupiter.api.Test; import org.unicode.jsp.Builder; import org.unicode.jsp.Builder.EqualAction; import org.unicode.unittest.TestFmwkMinusMinus; -/** - * "No, really, a test!" - * Tests org.unicode.jsp.Builder - */ +/** "No, really, a test!" 
Tests org.unicode.jsp.Builder */ public class TestBuilder extends TestFmwkMinusMinus { - enum Foo {foo1, foo2}; - Dummy one = new Dummy(1); - Dummy two = new Dummy(2); - Dummy two2 = new Dummy(2); - Dummy three = new Dummy(3); - - @Test - public void TestCollection() { - SortedSet x = Builder.with(new TreeSet()).addAll(1, 2, 3).freeze(); - assertTrue("1,2,3", x.size()==3); - TreeSet x12 = Builder.with(new TreeSet()).addAll(1,2).get(); - SortedSet y = Builder.with(new TreeSet()).addAll(x).removeAll(3,4).freeze(); - assertTrue("1,2", y.equals(x12)); - SortedSet z = Builder.with(new TreeSet()).addAll(x).retainAll(3,4).freeze(); - assertTrue("3", z.size()==1 && z.contains(3)); - Set w = Builder.with(EnumSet.noneOf(Foo.class)).add(Foo.foo1).freeze(); - assertTrue("w", w.size()==1 && w.contains(Foo.foo1)); - } - /** - *
- * Operations: A is current contents, B is new collection, x indicates the results
-   * A-B   A&B    B-A   Name
-   *                    clear()
-   * x                  removeAll(B)
-   *        x           retainAll(B) -- option 1: keep A, option 2: substitute B
-   *               x    keepNew(B)
-   * x      x           
-   *        x      x    clear().addAll(B)
-   * x             x    xor(B)
-   * x      x      x    addAll(B)
- * 
- */ - @Test - public void TestCombos() { - Set dummyNone = Collections.emptySet(); - Set dummy1 = Builder.with(new TreeSet()).addAll(one).freeze(); - Set dummy2 = Builder.with(new TreeSet()).addAll(two).freeze(); - Set dummy3 = Builder.with(new TreeSet()).addAll(three).freeze(); - Set dummy12 = Builder.with(new TreeSet()).addAll(one, two).freeze(); - Set dummy23 = Builder.with(new TreeSet()).addAll(two, three).freeze(); - Set dummy13 = Builder.with(new TreeSet()).addAll(one, three).freeze(); - Set dummy123 = Builder.with(new TreeSet()).addAll(one, two, three).freeze(); - - assertEquals("none", dummyNone, Builder.with(new TreeSet(dummy12)).clear().get()); - assertEquals("removeAll", dummy1, Builder.with(new TreeSet(dummy12)).removeAll(dummy23).get()); - assertEquals("retainAll", dummy2, Builder.with(new TreeSet(dummy12)).retainAll(dummy23).get()); - assertEquals("keepNew", dummy3, Builder.with(new TreeSet(dummy12)).keepNew(dummy23).get()); - assertEquals("xor", dummy13, Builder.with(new TreeSet(dummy12)).xor(dummy23).get()); - assertEquals("addAll", dummy123, Builder.with(new TreeSet(dummy12)).addAll(dummy23).get()); - } - - @Test - public void TestMapCombos() { - Map dummyNone = Collections.emptyMap(); - Map dummy1 = Builder.with(new TreeMap()).put(one,2).freeze(); - Map dummy2 = Builder.with(new TreeMap()).put(two,2).freeze(); - Map dummy3 = Builder.with(new TreeMap()).put(three,2).freeze(); - Map dummy12 = Builder.with(new TreeMap()).on(one, two).put(2).freeze(); - Map dummy23 = Builder.with(new TreeMap()).on(two, three).put(2).freeze(); - Map dummy13 = Builder.with(new TreeMap()).on(one, three).put(2).freeze(); - Map dummy123 = Builder.with(new TreeMap()).on(one, two, three).put(2).freeze(); - - assertEquals("none", dummyNone, Builder.with(new TreeMap()).putAll(dummy12).clear().get()); - assertEquals("removeAll", dummy1, Builder.with(new TreeMap()).putAll(dummy12).removeAll(dummy23.keySet()).get()); - assertEquals("retainAll", dummy2, Builder.with(new 
TreeMap()).putAll(dummy12).retainAll(dummy23.keySet()).get()); - assertEquals("keepNew", dummy3, Builder.with(new TreeMap()).putAll(dummy12).keepNew(dummy23).get()); - assertEquals("xor", dummy13, Builder.with(new TreeMap()).putAll(dummy12).xor(dummy23).get()); - assertEquals("addAll", dummy123, Builder.with(new TreeMap()).putAll(dummy12).putAll(dummy23).get()); - } - - @Test - public void TestMap() { - Map x = Builder.with(new TreeMap()).put(1, "a").put(2,"b").put(3,"c").freeze(); - assertTrue("1,2,3", x.size()==3); - Map x2 = Builder.with(new TreeMap()).on(1,2,3).put("a","b","c").freeze(); - assertEquals("1,2,3 either way", x, x2); - Map x3 = Builder.with(new TreeMap()).on(1,2,3).put("a,b,c".split(",")).freeze(); - assertEquals("1,2,3 either way", x, x3); - - Map x12 = Builder.with(new TreeMap()).put(1, "a").put(2,"b").get(); - Map y = Builder.with(new TreeMap()).putAll(x).removeAll(3,4).freeze(); - assertTrue("1,2", y.equals(x12)); - Map z = Builder.with(new TreeMap()).putAll(x).retainAll(3,4).freeze(); - assertTrue("3", z.size()==1 && z.keySet().contains(3)); - Map z2 = Builder.with(new TreeMap()).on(1,2,3).put("a").freeze(); - assertTrue("3", z2.size()==3 && z2.containsKey(2)); - } - - - @Test - public void TestOptions() throws InstantiationException, IllegalAccessException { - checkOptions(TreeSet.class); - checkOptions(HashSet.class); - checkOptions(ArrayList.class); - checkOptions(LinkedHashSet.class); - checkOptions(ArrayDeque.class); - - checkMapOptions(LinkedHashMap.class); - checkMapOptions(HashMap.class); - checkMapOptions(TreeMap.class); - } - - public void checkOptions(Class class1) throws InstantiationException, IllegalAccessException { - logln(class1.getName()); + enum Foo { + foo1, + foo2 + }; + Dummy one = new Dummy(1); - Dummy one1 = new Dummy(1); - Collection set = Builder.with((Collection) class1.newInstance(), EqualAction.RETAIN).add(one).add(one1).freeze(); - assertTrue("size", set.size() == 1); - assertTrue("RETAIN", one == 
set.iterator().next()); - - set = Builder.with((Collection) class1.newInstance(), EqualAction.REPLACE).add(one).add(one1).freeze(); - assertTrue("REPLACE", one1 == set.iterator().next()); - - boolean ok; - try { - set = Builder.with((Collection) class1.newInstance(), EqualAction.THROW).add(one).add(one1).freeze(); - ok = false; - } catch (Exception e) { - ok = true; + Dummy two = new Dummy(2); + Dummy two2 = new Dummy(2); + Dummy three = new Dummy(3); + + @Test + public void TestCollection() { + SortedSet x = Builder.with(new TreeSet()).addAll(1, 2, 3).freeze(); + assertTrue("1,2,3", x.size() == 3); + TreeSet x12 = Builder.with(new TreeSet()).addAll(1, 2).get(); + SortedSet y = + Builder.with(new TreeSet()).addAll(x).removeAll(3, 4).freeze(); + assertTrue("1,2", y.equals(x12)); + SortedSet z = + Builder.with(new TreeSet()).addAll(x).retainAll(3, 4).freeze(); + assertTrue("3", z.size() == 1 && z.contains(3)); + Set w = Builder.with(EnumSet.noneOf(Foo.class)).add(Foo.foo1).freeze(); + assertTrue("w", w.size() == 1 && w.contains(Foo.foo1)); } - assertTrue("throw", ok); - } + /** + * + * + *
+     * Operations: A is current contents, B is new collection, x indicates the results
+     * A-B   A&B    B-A   Name
+     *                    clear()
+     * x                  removeAll(B)
+     *        x           retainAll(B) -- option 1: keep A, option 2: substitute B
+     *               x    keepNew(B)
+     * x      x           
+     *        x      x    clear().addAll(B)
+     * x             x    xor(B)
+     * x      x      x    addAll(B)
+     * 
+ */ + @Test + public void TestCombos() { + Set dummyNone = Collections.emptySet(); + Set dummy1 = Builder.with(new TreeSet()).addAll(one).freeze(); + Set dummy2 = Builder.with(new TreeSet()).addAll(two).freeze(); + Set dummy3 = Builder.with(new TreeSet()).addAll(three).freeze(); + Set dummy12 = Builder.with(new TreeSet()).addAll(one, two).freeze(); + Set dummy23 = Builder.with(new TreeSet()).addAll(two, three).freeze(); + Set dummy13 = Builder.with(new TreeSet()).addAll(one, three).freeze(); + Set dummy123 = Builder.with(new TreeSet()).addAll(one, two, three).freeze(); - public void checkMapOptions(Class class1) throws InstantiationException, IllegalAccessException { - logln(class1.getName()); - Dummy one = new Dummy(1); - Dummy one1 = new Dummy(1); - Map set = Builder.with((Map) class1.newInstance(), EqualAction.RETAIN).put(one, 1).put(one1, 2).freeze(); - assertTrue("size", set.size() == 1); - assertTrue("RETAIN", one == set.keySet().iterator().next()); - assertTrue("RETAIN-get", 1 == set.get(one)); - - set = Builder.with((Map) class1.newInstance(), EqualAction.REPLACE).put(one, 1).put(one1, 2).freeze(); - assertTrue("REPLACE", one1 == set.keySet().iterator().next()); - assertTrue("REPLACE-get", 2 == set.get(one)); - - boolean ok; - try { - set = Builder.with((Map) class1.newInstance(), EqualAction.THROW).put(one, 1).put(one1, 2).freeze(); - ok = false; - } catch (Exception e) { - ok = true; + assertEquals("none", dummyNone, Builder.with(new TreeSet(dummy12)).clear().get()); + assertEquals( + "removeAll", + dummy1, + Builder.with(new TreeSet(dummy12)).removeAll(dummy23).get()); + assertEquals( + "retainAll", + dummy2, + Builder.with(new TreeSet(dummy12)).retainAll(dummy23).get()); + assertEquals( + "keepNew", + dummy3, + Builder.with(new TreeSet(dummy12)).keepNew(dummy23).get()); + assertEquals("xor", dummy13, Builder.with(new TreeSet(dummy12)).xor(dummy23).get()); + assertEquals( + "addAll", + dummy123, + Builder.with(new 
TreeSet(dummy12)).addAll(dummy23).get()); } - assertTrue("throw", ok); - } - static class Dummy implements Comparable, Cloneable { - int item; + @Test + public void TestMapCombos() { + Map dummyNone = Collections.emptyMap(); + Map dummy1 = + Builder.with(new TreeMap()).put(one, 2).freeze(); + Map dummy2 = + Builder.with(new TreeMap()).put(two, 2).freeze(); + Map dummy3 = + Builder.with(new TreeMap()).put(three, 2).freeze(); + Map dummy12 = + Builder.with(new TreeMap()).on(one, two).put(2).freeze(); + Map dummy23 = + Builder.with(new TreeMap()).on(two, three).put(2).freeze(); + Map dummy13 = + Builder.with(new TreeMap()).on(one, three).put(2).freeze(); + Map dummy123 = + Builder.with(new TreeMap()).on(one, two, three).put(2).freeze(); - public Dummy(int item) { - this.item = item; + assertEquals( + "none", + dummyNone, + Builder.with(new TreeMap()).putAll(dummy12).clear().get()); + assertEquals( + "removeAll", + dummy1, + Builder.with(new TreeMap()) + .putAll(dummy12) + .removeAll(dummy23.keySet()) + .get()); + assertEquals( + "retainAll", + dummy2, + Builder.with(new TreeMap()) + .putAll(dummy12) + .retainAll(dummy23.keySet()) + .get()); + assertEquals( + "keepNew", + dummy3, + Builder.with(new TreeMap()).putAll(dummy12).keepNew(dummy23).get()); + assertEquals( + "xor", + dummy13, + Builder.with(new TreeMap()).putAll(dummy12).xor(dummy23).get()); + assertEquals( + "addAll", + dummy123, + Builder.with(new TreeMap()).putAll(dummy12).putAll(dummy23).get()); } - public boolean equals(Object obj) { - return item == ((Dummy) obj).item; + @Test + public void TestMap() { + Map x = + Builder.with(new TreeMap()) + .put(1, "a") + .put(2, "b") + .put(3, "c") + .freeze(); + assertTrue("1,2,3", x.size() == 3); + Map x2 = + Builder.with(new TreeMap()) + .on(1, 2, 3) + .put("a", "b", "c") + .freeze(); + assertEquals("1,2,3 either way", x, x2); + Map x3 = + Builder.with(new TreeMap()) + .on(1, 2, 3) + .put("a,b,c".split(",")) + .freeze(); + assertEquals("1,2,3 either way", x, x3); 
+ + Map x12 = + Builder.with(new TreeMap()).put(1, "a").put(2, "b").get(); + Map y = + Builder.with(new TreeMap()).putAll(x).removeAll(3, 4).freeze(); + assertTrue("1,2", y.equals(x12)); + Map z = + Builder.with(new TreeMap()).putAll(x).retainAll(3, 4).freeze(); + assertTrue("3", z.size() == 1 && z.keySet().contains(3)); + Map z2 = + Builder.with(new TreeMap()).on(1, 2, 3).put("a").freeze(); + assertTrue("3", z2.size() == 3 && z2.containsKey(2)); } - public int hashCode() { - return item; + @Test + public void TestOptions() throws InstantiationException, IllegalAccessException { + checkOptions(TreeSet.class); + checkOptions(HashSet.class); + checkOptions(ArrayList.class); + checkOptions(LinkedHashSet.class); + checkOptions(ArrayDeque.class); + + checkMapOptions(LinkedHashMap.class); + checkMapOptions(HashMap.class); + checkMapOptions(TreeMap.class); } - public int compareTo(Dummy o) { - int item2 = ((Dummy) o).item; - return item < item2 ? -1 : item > item2 ? 1 : 0; + public void checkOptions(Class class1) + throws InstantiationException, IllegalAccessException { + logln(class1.getName()); + Dummy one = new Dummy(1); + Dummy one1 = new Dummy(1); + Collection set = + Builder.with((Collection) class1.newInstance(), EqualAction.RETAIN) + .add(one) + .add(one1) + .freeze(); + assertTrue("size", set.size() == 1); + assertTrue("RETAIN", one == set.iterator().next()); + + set = + Builder.with((Collection) class1.newInstance(), EqualAction.REPLACE) + .add(one) + .add(one1) + .freeze(); + assertTrue("REPLACE", one1 == set.iterator().next()); + + boolean ok; + try { + set = + Builder.with((Collection) class1.newInstance(), EqualAction.THROW) + .add(one) + .add(one1) + .freeze(); + ok = false; + } catch (Exception e) { + ok = true; + } + assertTrue("throw", ok); } - public Object clone() { - return clone(); + public void checkMapOptions(Class class1) + throws InstantiationException, IllegalAccessException { + logln(class1.getName()); + Dummy one = new Dummy(1); + Dummy one1 = 
new Dummy(1); + Map set = + Builder.with((Map) class1.newInstance(), EqualAction.RETAIN) + .put(one, 1) + .put(one1, 2) + .freeze(); + assertTrue("size", set.size() == 1); + assertTrue("RETAIN", one == set.keySet().iterator().next()); + assertTrue("RETAIN-get", 1 == set.get(one)); + + set = + Builder.with((Map) class1.newInstance(), EqualAction.REPLACE) + .put(one, 1) + .put(one1, 2) + .freeze(); + assertTrue("REPLACE", one1 == set.keySet().iterator().next()); + assertTrue("REPLACE-get", 2 == set.get(one)); + + boolean ok; + try { + set = + Builder.with((Map) class1.newInstance(), EqualAction.THROW) + .put(one, 1) + .put(one1, 2) + .freeze(); + ok = false; + } catch (Exception e) { + ok = true; + } + assertTrue("throw", ok); } - public String toString() { - return "<"+item+">"; + static class Dummy implements Comparable, Cloneable { + int item; + + public Dummy(int item) { + this.item = item; + } + + public boolean equals(Object obj) { + return item == ((Dummy) obj).item; + } + + public int hashCode() { + return item; + } + + public int compareTo(Dummy o) { + int item2 = ((Dummy) o).item; + return item < item2 ? -1 : item > item2 ? 
1 : 0; + } + + public Object clone() { + return clone(); + } + + public String toString() { + return "<" + item + ">"; + } } - } } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestEmoji.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestEmoji.java index 77889b9bd..9dc4ce793 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestEmoji.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestEmoji.java @@ -1,9 +1,7 @@ package org.unicode.jsptest; -import java.io.IOException; - import com.ibm.icu.text.UnicodeSet; - +import java.io.IOException; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIf; import org.unicode.jsp.UnicodeJsp; @@ -13,7 +11,9 @@ public class TestEmoji extends TestFmwk2 { static XPropertyFactory factory = XPropertyFactory.make(); - @EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + @EnabledIf( + value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", + disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") @Test public void TestBasic() throws IOException { String[] message = {""}; @@ -54,6 +54,4 @@ public void TestBasic() throws IOException { checkContained("[:Emoji_Tag_Sequenceβ:]", "[{🏴󠁧󠁢󠁳󠁣󠁴󠁿}]"); checkContained("[:Emoji_Tag_Sequenceβ:]", "[☝]", false); } - - } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestFmwk2.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestFmwk2.java index 70a45056e..7d1e7d1c1 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestFmwk2.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestFmwk2.java @@ -1,7 +1,6 @@ package org.unicode.jsptest; import com.ibm.icu.text.UnicodeSet; - import org.unicode.jsp.UnicodeUtilities; import org.unicode.unittest.TestFmwkMinusMinus; @@ -11,7 +10,8 @@ public void checkContained(final String setPattern, final String containedPatter checkContained(setPattern, containedPattern, true); 
} - public void checkContained(final String setPattern, final String containedPattern, boolean expected) { + public void checkContained( + final String setPattern, final String containedPattern, boolean expected) { String[] message = {""}; UnicodeSet container = UnicodeUtilities.parseSimpleSet(setPattern, message); UnicodeSet contained = UnicodeUtilities.parseSimpleSet(containedPattern, message); @@ -20,15 +20,24 @@ public void checkContained(final String setPattern, final String containedPatter } else if (contained == null) { errln(containedPattern + " fails to parse"); } else if (container.containsAll(contained) != expected) { - errln(toPattern(setPattern, container) + " doesn't contain " + toPattern(containedPattern, contained)); + errln( + toPattern(setPattern, container) + + " doesn't contain " + + toPattern(containedPattern, contained)); } else { - logln(toPattern(setPattern, container) + " contains " + toPattern(containedPattern, contained)); + logln( + toPattern(setPattern, container) + + " contains " + + toPattern(containedPattern, contained)); } } @Override public void msg(String message, int level, boolean incCount, boolean newln) { - super.msg(message.length() > 200 ? message.substring(0,200) + "…" : message, level, incCount, newln); + super.msg( + message.length() > 200 ? 
message.substring(0, 200) + "…" : message, + level, + incCount, + newln); } - } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestGenerate.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestGenerate.java index 3e4f68d6b..ae6a3bc1e 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestGenerate.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestGenerate.java @@ -1,17 +1,5 @@ package org.unicode.jsptest; -import java.io.IOException; -import java.util.Comparator; -import java.util.TreeSet; - -import org.unicode.cldr.util.props.BagFormatter; -import org.unicode.cldr.util.props.UnicodeLabel; -import org.unicode.idna.Idna; -import org.unicode.idna.Idna.IdnaType; -import org.unicode.idna.Idna2003; -import org.unicode.idna.Uts46; -import org.unicode.jsp.UnicodeUtilities; - import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.impl.Utility; @@ -24,167 +12,181 @@ import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.TimeZone; import com.ibm.icu.util.ULocale; +import java.io.IOException; +import java.util.Comparator; +import java.util.TreeSet; +import org.unicode.cldr.util.props.BagFormatter; +import org.unicode.cldr.util.props.UnicodeLabel; +import org.unicode.idna.Idna; +import org.unicode.idna.Idna.IdnaType; +import org.unicode.idna.Idna2003; +import org.unicode.idna.Uts46; +import org.unicode.jsp.UnicodeUtilities; +/** "Not really a test, move" */ +public class TestGenerate extends TestFmwk { -/** - * "Not really a test, move" - */ -public class TestGenerate extends TestFmwk{ - - static final String AGE = System.getProperty("age"); - static final UnicodeSet OVERALL_ALLOWED = new UnicodeSet().applyPropertyAlias("age", AGE == null ? 
"5.2" : AGE).freeze(); - - public static void main(String[] args) throws Exception { - new TestGenerate().run(args); - - - } - + static final String AGE = System.getProperty("age"); + static final UnicodeSet OVERALL_ALLOWED = + new UnicodeSet().applyPropertyAlias("age", AGE == null ? "5.2" : AGE).freeze(); - public void TestIdnaDifferences() { - UnicodeSet remapped = new UnicodeSet(); - UnicodeMap map = UnicodeUtilities.getIdnaDifferences(remapped, OVERALL_ALLOWED); - TreeSet ordered = new TreeSet(new InverseComparator()); - ordered.addAll(map.values()); - int max = 200; - for (String value : ordered) { - UnicodeSet set = map.getSet(value); - String prettySet = TestJsp.prettyTruncate(max, set); - System.out.println(value + "\t" + set.size() + "\t" + set); // prettySet - } - Transliterator name = Transliterator.getInstance("name"); - System.out.println("Code\tUts46\tidna2003\tCode\tUts46\tidna2003"); - - for (String s : remapped) { - String uts46 = Uts46.SINGLETON.transform(s); - String idna2003 = Idna2003.toIdna2003(s); - if (!uts46.equals(idna2003)) { - System.out.println(Utility.hex(s) + "\t" + Utility.hex(uts46) + "\t" + Utility.hex(idna2003) - + "\t" + name.transform(s) + "\t" + name.transform(uts46) + "\t" + name.transform(idna2003) - ); - } + public static void main(String[] args) throws Exception { + new TestGenerate().run(args); } - } - - public void TestGenerateDataFile() throws IOException { - //final UnicodeMap results = new UnicodeMap(); - final UnicodeMap hex_results = new UnicodeMap(); - final UnicodeMap hex_results_requiring_nfkc = new UnicodeMap(); - //hex_results.putAll(0,0x10FFFF,"valid"); - //hex_results.putAll(new UnicodeSet("[:cn:]"), "disallowed"); - //hex_results.putAll(new UnicodeSet("[:noncharactercodepoint:]"), "disallowed"); - for (int cp = 0; cp <= 0x10FFFF; ++cp) { - String s = UTF16.valueOf(cp); - String nfc = toNfc(s); - String nfkc = Normalizer.normalize(s, Normalizer.NFKC); - String uts46 = Uts46.SINGLETON.transform(s); - IdnaType 
statusInt = Uts46.SINGLETON.getType(cp); - String status = statusInt.toString(); - if (Uts46.SINGLETON.getType(cp) == IdnaType.deviation) { // Uts46.SINGLETON.DEVIATIONS.contains(cp) - status = "deviation"; - } - if (statusInt == Idna.IdnaType.mapped) { - status += Utility.repeat(" ", 10-status.length()) + " ; " + Utility.hex(uts46); - } - hex_results.put(cp, status); - // hex_results.put(cp, status==UnicodeUtilities.IGNORED ? "ignored" - // : UnicodeUtilities. ? "disallowed" - // : s.equals(uts46) ? "valid" - // //: nfc.equals(uts46) ? "needs_nfc" - // : Utility.hex(uts46, " ")); - // - // hex_results_requiring_nfkc.put(cp, Uts46.SINGLETON.length() == 0 ? "ignored" - // : !Uts46.SINGLETON.Uts46Chars.containsAll(uts46) ? "disallowed" - // : s.equals(uts46) ? "valid" - // : nfkc.equals(uts46) ? "needs_nfkc" - // : Utility.hex(uts46, " ")); + + public void TestIdnaDifferences() { + UnicodeSet remapped = new UnicodeSet(); + UnicodeMap map = UnicodeUtilities.getIdnaDifferences(remapped, OVERALL_ALLOWED); + TreeSet ordered = new TreeSet(new InverseComparator()); + ordered.addAll(map.values()); + int max = 200; + for (String value : ordered) { + UnicodeSet set = map.getSet(value); + String prettySet = TestJsp.prettyTruncate(max, set); + System.out.println(value + "\t" + set.size() + "\t" + set); // prettySet + } + Transliterator name = Transliterator.getInstance("name"); + System.out.println("Code\tUts46\tidna2003\tCode\tUts46\tidna2003"); + + for (String s : remapped) { + String uts46 = Uts46.SINGLETON.transform(s); + String idna2003 = Idna2003.toIdna2003(s); + if (!uts46.equals(idna2003)) { + System.out.println( + Utility.hex(s) + + "\t" + + Utility.hex(uts46) + + "\t" + + Utility.hex(idna2003) + + "\t" + + name.transform(s) + + "\t" + + name.transform(uts46) + + "\t" + + name.transform(idna2003)); + } + } } - BagFormatter bf = new BagFormatter(); - bf.setLabelSource(null); - bf.setRangeBreakSource(null); - bf.setShowCount(false); - bf.setNameSource(new UnicodeLabel() 
{ - - @Override - public String getValue(int codepoint, boolean isShort) { - //String target = results.get(codepoint); - return UCharacter.getExtendedName(codepoint); - } - - }); - - // String sourceName = UCharacter.getName(cp); - // String targetName = UCharacter.getName(uts46, " + "); - // String names = (sourceName != null && targetName != null) ? "#\t" + sourceName + " \u2192 " + targetName : ""; - // System.out.println(Utility.hex(s) + ";\t" + Utility.hex(uts46) - // + ";\t" + names - // ); - //writeIdnaDataFile(hex_results, bf, "NFC", "IdnaMappingTable"); - //writeIdnaDataFile(hex_results_requiring_nfkc, bf, "NFKC", "uts46-data-pre-nfkc-5.1.txt"); - } - - - - private String toNfc(String s) { - return Normalizer.normalize(s, Normalizer.NFC); - } - - static DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss 'GMT'", ULocale.US); - static { - dateFormat.setTimeZone(TimeZone.getTimeZone("GMT")); - } - - -// private void writeIdnaDataFile(final UnicodeMap hex_results, BagFormatter bf, String normalizationForm2, String filenameStem) throws IOException { -// String filename = filenameStem + "-" + AGE + ".0.txt"; -// PrintWriter writer = BagFormatter.openUTF8Writer("/Users/markdavis/Documents/workspace/draft/reports/tr46/", filename); -// String normalizationForm = normalizationForm2; -// writer.println("# " + filename + "- DRAFT\n" + -// "# Date: " + dateFormat.format(new Date()) + " [MD]\n" + -// "#\n" + -// "# Unicode IDNA Compatible Preprocessing (UTS #46)\n" + -// "# Copyright (c) 1991-2009 Unicode, Inc.\n" + -// "# For terms of use, see http://www.unicode.org/terms_of_use.html\n" + -// "# For documentation, see http://www.unicode.org/reports/tr46/\n"); -// -// // # IdnaMappingTable-5.1.0.txt - DRAFT -// // # Date: 2009-11-14 08:10:42 GMT [MD] -// // # -// // # Unicode IDNA Compatible Preprocessing (UTS #46) -// // # Copyright (c) 1991-2009 Unicode, Inc. 
-// // # For terms of use, see http://www.unicode.org/terms_of_use.html -// // # For documentation, see http://www.unicode.org/reports/tr46/ -// -// bf.setValueSource(new UnicodeProperty.UnicodeMapProperty().set(hex_results)); -// final UnicodeLabel oldLabel = bf.getNameSource(); -// bf.setNameSource(new UnicodeLabel() { -// public String getValue(int codepoint, boolean isShort) { -// if (OVERALL_ALLOWED.contains(codepoint)) { -// return oldLabel.getValue(codepoint, isShort); -// } -// return ""; -// } -// }); -// writer.println(bf.showSetNames(hex_results.keySet())); -// writer.close(); -// } - - public static class InverseComparator implements Comparator { - private Comparator other; - - public InverseComparator() { - this.other = null; + + public void TestGenerateDataFile() throws IOException { + // final UnicodeMap results = new UnicodeMap(); + final UnicodeMap hex_results = new UnicodeMap(); + final UnicodeMap hex_results_requiring_nfkc = new UnicodeMap(); + // hex_results.putAll(0,0x10FFFF,"valid"); + // hex_results.putAll(new UnicodeSet("[:cn:]"), "disallowed"); + // hex_results.putAll(new UnicodeSet("[:noncharactercodepoint:]"), "disallowed"); + for (int cp = 0; cp <= 0x10FFFF; ++cp) { + String s = UTF16.valueOf(cp); + String nfc = toNfc(s); + String nfkc = Normalizer.normalize(s, Normalizer.NFKC); + String uts46 = Uts46.SINGLETON.transform(s); + IdnaType statusInt = Uts46.SINGLETON.getType(cp); + String status = statusInt.toString(); + if (Uts46.SINGLETON.getType(cp) + == IdnaType.deviation) { // Uts46.SINGLETON.DEVIATIONS.contains(cp) + status = "deviation"; + } + if (statusInt == Idna.IdnaType.mapped) { + status += Utility.repeat(" ", 10 - status.length()) + " ; " + Utility.hex(uts46); + } + hex_results.put(cp, status); + // hex_results.put(cp, status==UnicodeUtilities.IGNORED ? "ignored" + // : UnicodeUtilities. ? "disallowed" + // : s.equals(uts46) ? "valid" + // //: nfc.equals(uts46) ? 
"needs_nfc" + // : Utility.hex(uts46, " ")); + // + // hex_results_requiring_nfkc.put(cp, Uts46.SINGLETON.length() == 0 ? "ignored" + // : !Uts46.SINGLETON.Uts46Chars.containsAll(uts46) ? "disallowed" + // : s.equals(uts46) ? "valid" + // : nfkc.equals(uts46) ? "needs_nfkc" + // : Utility.hex(uts46, " ")); + } + BagFormatter bf = new BagFormatter(); + bf.setLabelSource(null); + bf.setRangeBreakSource(null); + bf.setShowCount(false); + bf.setNameSource( + new UnicodeLabel() { + + @Override + public String getValue(int codepoint, boolean isShort) { + // String target = results.get(codepoint); + return UCharacter.getExtendedName(codepoint); + } + }); + + // String sourceName = UCharacter.getName(cp); + // String targetName = UCharacter.getName(uts46, " + "); + // String names = (sourceName != null && targetName != null) ? "#\t" + sourceName + " + // \u2192 " + targetName : ""; + // System.out.println(Utility.hex(s) + ";\t" + Utility.hex(uts46) + // + ";\t" + names + // ); + // writeIdnaDataFile(hex_results, bf, "NFC", "IdnaMappingTable"); + // writeIdnaDataFile(hex_results_requiring_nfkc, bf, "NFKC", "uts46-data-pre-nfkc-5.1.txt"); } - public InverseComparator(Comparator other) { - this.other = other; + private String toNfc(String s) { + return Normalizer.normalize(s, Normalizer.NFC); } - public int compare(Object a, Object b) { - return other == null - ? 
((Comparable)b).compareTo(a) - : other.compare(b, a); + static DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss 'GMT'", ULocale.US); + + static { + dateFormat.setTimeZone(TimeZone.getTimeZone("GMT")); } - } + // private void writeIdnaDataFile(final UnicodeMap hex_results, BagFormatter bf, String + // normalizationForm2, String filenameStem) throws IOException { + // String filename = filenameStem + "-" + AGE + ".0.txt"; + // PrintWriter writer = + // BagFormatter.openUTF8Writer("/Users/markdavis/Documents/workspace/draft/reports/tr46/", + // filename); + // String normalizationForm = normalizationForm2; + // writer.println("# " + filename + "- DRAFT\n" + + // "# Date: " + dateFormat.format(new Date()) + " [MD]\n" + + // "#\n" + + // "# Unicode IDNA Compatible Preprocessing (UTS #46)\n" + + // "# Copyright (c) 1991-2009 Unicode, Inc.\n" + + // "# For terms of use, see http://www.unicode.org/terms_of_use.html\n" + + // "# For documentation, see http://www.unicode.org/reports/tr46/\n"); + // + // // # IdnaMappingTable-5.1.0.txt - DRAFT + // // # Date: 2009-11-14 08:10:42 GMT [MD] + // // # + // // # Unicode IDNA Compatible Preprocessing (UTS #46) + // // # Copyright (c) 1991-2009 Unicode, Inc. 
+ // // # For terms of use, see http://www.unicode.org/terms_of_use.html + // // # For documentation, see http://www.unicode.org/reports/tr46/ + // + // bf.setValueSource(new UnicodeProperty.UnicodeMapProperty().set(hex_results)); + // final UnicodeLabel oldLabel = bf.getNameSource(); + // bf.setNameSource(new UnicodeLabel() { + // public String getValue(int codepoint, boolean isShort) { + // if (OVERALL_ALLOWED.contains(codepoint)) { + // return oldLabel.getValue(codepoint, isShort); + // } + // return ""; + // } + // }); + // writer.println(bf.showSetNames(hex_results.keySet())); + // writer.close(); + // } + + public static class InverseComparator implements Comparator { + private Comparator other; + + public InverseComparator() { + this.other = null; + } + + public InverseComparator(Comparator other) { + this.other = other; + } + + public int compare(Object a, Object b) { + return other == null ? ((Comparable) b).compareTo(a) : other.compare(b, a); + } + } } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestIcuProperties.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestIcuProperties.java index c36e471f6..215bdb8be 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestIcuProperties.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestIcuProperties.java @@ -1,29 +1,26 @@ package org.unicode.jsptest; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UProperty.NameChoice; import java.util.Arrays; import java.util.List; - import org.unicode.jsp.ICUPropertyFactory; import org.unicode.props.UnicodeProperty; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UProperty.NameChoice; - -/** - * "Not really a test, move" - */ +/** "Not really a test, move" */ public class TestIcuProperties extends TestFmwk2 { public void testProps() { ICUPropertyFactory factory = ICUPropertyFactory.make(); String sample = "🤩"; int nameChoice = NameChoice.LONG; 
- List propRanges = Arrays.asList( - UProperty.BINARY_START, UProperty.BINARY_LIMIT, - UProperty.INT_START, UProperty.INT_LIMIT, - UProperty.DOUBLE_START, UProperty.DOUBLE_LIMIT, - UProperty.STRING_START, UProperty.STRING_LIMIT); + List propRanges = + Arrays.asList( + UProperty.BINARY_START, UProperty.BINARY_LIMIT, + UProperty.INT_START, UProperty.INT_LIMIT, + UProperty.DOUBLE_START, UProperty.DOUBLE_LIMIT, + UProperty.STRING_START, UProperty.STRING_LIMIT); for (int range = 0; range < propRanges.size(); range += 2) { final int rangeStart = propRanges.get(range); final int rangeLimit = propRanges.get(range + 1); @@ -36,9 +33,11 @@ public void testProps() { System.out.println(property + "\t" + name + "\tvalue('" + sample + "'): " + value); if (rangeStart == UProperty.INT_START) { String gap = "\t "; - for (int i = UCharacter.getIntPropertyMinValue(property); i <= UCharacter - .getIntPropertyMaxValue(property); ++i) { - String propertyValueName = UCharacter.getPropertyValueName(property, i, nameChoice); + for (int i = UCharacter.getIntPropertyMinValue(property); + i <= UCharacter.getIntPropertyMaxValue(property); + ++i) { + String propertyValueName = + UCharacter.getPropertyValueName(property, i, nameChoice); System.out.print(gap + propertyValueName); gap = ", "; } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestJsp.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestJsp.java index 97236f3f2..108d7bd3e 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestJsp.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestJsp.java @@ -1,5 +1,18 @@ package org.unicode.jsptest; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UProperty.NameChoice; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.IDNA; +import com.ibm.icu.text.StringPrepParseException; +import com.ibm.icu.text.Transliterator; +import 
com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.LocaleData; +import com.ibm.icu.util.ULocale; import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; @@ -14,7 +27,6 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIf; import org.unicode.cldr.util.BNF; @@ -36,40 +48,38 @@ import org.unicode.props.UnicodeProperty; import org.unicode.unittest.TestFmwkMinusMinus; -import com.ibm.icu.dev.test.TestFmwk; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UProperty.NameChoice; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.IDNA; -import com.ibm.icu.text.StringPrepParseException; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.LocaleData; -import com.ibm.icu.util.ULocale; - -public class TestJsp extends TestFmwkMinusMinus { +public class TestJsp extends TestFmwkMinusMinus { - private static final String enSample = "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V, W, X, Y, Z"; + private static final String enSample = + "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V, W, X, Y, Z"; static final UnicodeSet U5_2 = new UnicodeSet().applyPropertyAlias("age", "5.2").freeze(); - public static final UnicodeSet U5_1 = new UnicodeSet().applyPropertyAlias("age", "5.1").freeze(); - static UnicodeSet BREAKING_WHITESPACE = new UnicodeSet("[\\p{whitespace=true}-\\p{linebreak=glue}]").freeze(); - - - static UnicodeSet IPA = new UnicodeSet("[a-zæçðøħŋœǀ-ǃɐ-ɨɪ-ɶ ɸ-ɻɽɾʀ-ʄʈ-ʒʔʕʘʙʛ-ʝʟʡʢ 
ʤʧʰ-ʲʴʷʼˈˌːˑ˞ˠˤ̀́̃̄̆̈ ̘̊̋̏-̜̚-̴̠̤̥̩̪̬̯̰̹-̽͜ ͡βθχ↑-↓↗↘]").freeze(); - static String IPA_SAMPLE = "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, æ, ç, ð, ø, ħ, ŋ, œ, ǀ, ǁ, ǂ, ǃ, ɐ, ɑ, ɒ, ɓ, ɔ, ɕ, ɖ, ɗ, ɘ, ə, ɚ, ɛ, ɜ, ɝ, ɞ, ɟ, ɠ, ɡ, ɢ, ɣ, ɤ, ɥ, ɦ, ɧ, ɨ, ɪ, ɫ, ɬ, ɭ, ɮ, ɯ, ɰ, ɱ, ɲ, ɳ, ɴ, ɵ, ɶ, ɸ, ɹ, ɺ, ɻ, ɽ, ɾ, ʀ, ʁ, ʂ, ʃ, ʄ, ʈ, ʉ, ʊ, ʋ, ʌ, ʍ, ʎ, ʏ, ʐ, ʑ, ʒ, ʔ, ʕ, ʘ, ʙ, ʛ, ʜ, ʝ, ʟ, ʡ, ʢ, ʤ, ʧ, ʰ, ʱ, ʲ, ʴ, ʷ, ʼ, ˈ, ˌ, ː, ˑ, ˞, ˠ, ˤ, ̀, ́, ̃, ̄, ̆, ̈, ̊, ̋, ̏, ̐, ̑, ̒, ̓, ̔, ̕, ̖, ̗, ̘, ̙, ̚, ̛, ̜, ̝, ̞, ̟, ̠, ̡, ̢, ̣, ̤, ̥, ̦, ̧, ̨, ̩, ̪, ̫, ̬, ̭, ̮, ̯, ̰, ̱, ̲, ̳, ̴, ̹, ̺, ̻, ̼, ̽, ͜, ͡, β, θ, χ, ↑, →, ↓, ↗, ↘"; - - enum Subtag {language, script, region, mixed, fail} + public static final UnicodeSet U5_1 = + new UnicodeSet().applyPropertyAlias("age", "5.1").freeze(); + static UnicodeSet BREAKING_WHITESPACE = + new UnicodeSet("[\\p{whitespace=true}-\\p{linebreak=glue}]").freeze(); + + static UnicodeSet IPA = + new UnicodeSet( + "[a-zæçðøħŋœǀ-ǃɐ-ɨɪ-ɶ ɸ-ɻɽɾʀ-ʄʈ-ʒʔʕʘʙʛ-ʝʟʡʢ ʤʧʰ-ʲʴʷʼˈˌːˑ˞ˠˤ̀́̃̄̆̈ ̘̊̋̏-̜̚-̴̠̤̥̩̪̬̯̰̹-̽͜ ͡βθχ↑-↓↗↘]") + .freeze(); + static String IPA_SAMPLE = + "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, æ, ç, ð, ø, ħ, ŋ, œ, ǀ, ǁ, ǂ, ǃ, ɐ, ɑ, ɒ, ɓ, ɔ, ɕ, ɖ, ɗ, ɘ, ə, ɚ, ɛ, ɜ, ɝ, ɞ, ɟ, ɠ, ɡ, ɢ, ɣ, ɤ, ɥ, ɦ, ɧ, ɨ, ɪ, ɫ, ɬ, ɭ, ɮ, ɯ, ɰ, ɱ, ɲ, ɳ, ɴ, ɵ, ɶ, ɸ, ɹ, ɺ, ɻ, ɽ, ɾ, ʀ, ʁ, ʂ, ʃ, ʄ, ʈ, ʉ, ʊ, ʋ, ʌ, ʍ, ʎ, ʏ, ʐ, ʑ, ʒ, ʔ, ʕ, ʘ, ʙ, ʛ, ʜ, ʝ, ʟ, ʡ, ʢ, ʤ, ʧ, ʰ, ʱ, ʲ, ʴ, ʷ, ʼ, ˈ, ˌ, ː, ˑ, ˞, ˠ, ˤ, ̀, ́, ̃, ̄, ̆, ̈, ̊, ̋, ̏, ̐, ̑, ̒, ̓, ̔, ̕, ̖, ̗, ̘, ̙, ̚, ̛, ̜, ̝, ̞, ̟, ̠, ̡, ̢, ̣, ̤, ̥, ̦, ̧, ̨, ̩, ̪, ̫, ̬, ̭, ̮, ̯, ̰, ̱, ̲, ̳, ̴, ̹, ̺, ̻, ̼, ̽, ͜, ͡, β, θ, χ, ↑, →, ↓, ↗, ↘"; + + enum Subtag { + language, + script, + region, + mixed, + fail + } - static UnicodeSetPrettyPrinter pretty = new UnicodeSetPrettyPrinter().setOrdering(Collator.getInstance(ULocale.ENGLISH)); + static UnicodeSetPrettyPrinter pretty = + new UnicodeSetPrettyPrinter().setOrdering(Collator.getInstance(ULocale.ENGLISH)); static String 
prettyTruncate(int max, UnicodeSet set) { String prettySet = pretty.format(set); if (prettySet.length() > max) { - prettySet = prettySet.substring(0,max) + "..."; + prettySet = prettySet.substring(0, max) + "..."; } return prettySet; } @@ -82,7 +92,9 @@ public void TestLanguage() { assertContains(fii, "draft-ietf-ltru-4646bis"); } - @EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + @EnabledIf( + value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", + disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") @Test public void TestJoiner() { @@ -118,15 +130,14 @@ public void TestJoiner() { checkValidity(Idna2008.SINGLETON, "xn--fa-hia.de", true, true); checkValidity(Idna2008.SINGLETON, "\u0080.de", false, false); checkValidity(Idna2008.SINGLETON, "xn--a.de", true, true); - } private void checkValidity(Idna uts46, String url, boolean expectedPuny, boolean expectedUni) { boolean[] error = new boolean[1]; String fii = uts46.toPunyCode(url, error); - assertEquals(uts46.getName() + "\ttoPunyCode(" + url + ")="+fii, !expectedPuny, error[0]); + assertEquals(uts46.getName() + "\ttoPunyCode(" + url + ")=" + fii, !expectedPuny, error[0]); fii = uts46.toUnicode(url, error, true); - assertEquals(uts46.getName() + "\ttoUnicode(" + url + ")="+fii, !expectedUni, error[0]); + assertEquals(uts46.getName() + "\ttoUnicode(" + url + ")=" + fii, !expectedUni, error[0]); } // public void Test2003vsUts46() { @@ -169,10 +180,15 @@ private String codeAndName(int i) { } private String codeAndName(String i) { - return i == null ? null : (Utility.hex(i, 4, ",", true, new StringBuilder()) + " ( " + i + " ) " + UCharacter.getName(i, "+")); + return i == null + ? 
null + : (Utility.hex(i, 4, ",", true, new StringBuilder()) + + " ( " + + i + + " ) " + + UCharacter.getName(i, "+")); } - static class TypeAndMap { IdnaType type; String mapping; @@ -190,17 +206,23 @@ public void oldTestIdnaAndIcu() { IdnaType type = Uts46.SINGLETON.getType(cp); // used to be Idna2003. String mapping = Uts46.SINGLETON.mappings.get(cp); // used to be Idna2003. - if (type != typeAndMapIcu.type || !UnicodeProperty.equals(mapping, typeAndMapIcu.mapping)) { + if (type != typeAndMapIcu.type + || !UnicodeProperty.equals(mapping, typeAndMapIcu.mapping)) { inbuffer.setLength(0); inbuffer.appendCodePoint(cp); getIcuIdna(inbuffer, typeAndMapIcu); String typeDiff = type + "\tvs ICU\t" + typeAndMapIcu.type; String mapDiff = "[" + mapping + "]\tvs ICU\t[" + typeAndMapIcu.mapping + "]"; - errors.put(cp, (type != typeAndMapIcu.type ? "\ttype:\t" + typeDiff : "") - + (!UnicodeProperty.equals(mapping, typeAndMapIcu.mapping) ? "\tmap:\t" + mapDiff : "")); + errors.put( + cp, + (type != typeAndMapIcu.type ? "\ttype:\t" + typeDiff : "") + + (!UnicodeProperty.equals(mapping, typeAndMapIcu.mapping) + ? "\tmap:\t" + mapDiff + : "")); // errln(Utility.hex(cp) + "\t( " + UTF16.valueOf(cp) + " )\tdifference:" // + (type != typeAndMapIcu.type ? "\ttype:\t" + typeDiff : "") - // + (!UnicodeProperty.equals(mapping, typeAndMapIcu.mapping) ? "\tmap:\t" + mapDiff : "")); + // + (!UnicodeProperty.equals(mapping, typeAndMapIcu.mapping) ? 
+ // "\tmap:\t" + mapDiff : "")); if (++count > 50) { break; } @@ -249,7 +271,8 @@ private void getIcuIdna(StringBuffer inbuffer, TypeAndMap typeAndMapIcu) { } } - private static StringBuffer convertWithHack(StringBuffer inbuffer) throws StringPrepParseException { + private static StringBuffer convertWithHack(StringBuffer inbuffer) + throws StringPrepParseException { StringBuffer intermediate; try { intermediate = IDNA.convertToASCII(inbuffer, IDNA.USE_STD3_RULES); // USE_STD3_RULES, @@ -263,7 +286,6 @@ private static StringBuffer convertWithHack(StringBuffer inbuffer) throws String return intermediate; } - private void getIcuIdnaUts(StringBuilder inbuffer, TypeAndMap typeAndMapIcu) { IDNA icuIdna = IDNA.getUTS46Instance(0); IDNA.Info info = new IDNA.Info(); @@ -277,7 +299,8 @@ private void getIcuIdnaUts(StringBuilder inbuffer, TypeAndMap typeAndMapIcu) { typeAndMapIcu.type = IdnaType.ignored; typeAndMapIcu.mapping = ""; } else { - StringBuilder outbuffer = icuIdna.nameToUnicode(intermediate.toString(), intermediate, info); + StringBuilder outbuffer = + icuIdna.nameToUnicode(intermediate.toString(), intermediate, info); if (!UnicodeUtilities.equals(inbuffer, outbuffer)) { typeAndMapIcu.type = IdnaType.mapped; typeAndMapIcu.mapping = outbuffer.toString(); @@ -301,23 +324,26 @@ private void getIcuIdnaUts(StringBuilder inbuffer, TypeAndMap typeAndMapIcu) { } } - private static StringBuilder convertWithHackUts(StringBuilder inbuffer, IDNA icuIdna) throws StringPrepParseException { + private static StringBuilder convertWithHackUts(StringBuilder inbuffer, IDNA icuIdna) + throws StringPrepParseException { StringBuilder intermediate; try { - intermediate = icuIdna.nameToASCII(inbuffer.toString(), inbuffer, null); // USE_STD3_RULES, + intermediate = + icuIdna.nameToASCII(inbuffer.toString(), inbuffer, null); // USE_STD3_RULES, } catch (RuntimeException e) { if (!e.getMessage().contains("BIDI")) { throw e; } inbuffer.append("\\u05D9"); - intermediate = 
icuIdna.nameToASCII(inbuffer.toString(), inbuffer, null); // USE_STD3_RULES, + intermediate = + icuIdna.nameToASCII(inbuffer.toString(), inbuffer, null); // USE_STD3_RULES, } return intermediate; } - - - @EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + @EnabledIf( + value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", + disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") @Test public void TestIdnaProps() { String map = Idna2003.SINGLETON.mappings.get(0x200c); @@ -345,7 +371,8 @@ public void TestIdnaProps() { showPropValues(XPropertyFactory.make().getProperty("uts46")); } - private void checkNullOrEqual(String title, int cp, IdnaType t1, String m1, IdnaType t2, String m2) { + private void checkNullOrEqual( + String title, int cp, IdnaType t1, String m1, IdnaType t2, String m2) { if (t1 == IdnaType.disallowed || t2 == IdnaType.disallowed) return; if (t1 == IdnaType.valid && t2 == IdnaType.valid) return; m1 = m1 == null ? 
UTF16.valueOf(cp) : m1; @@ -362,12 +389,11 @@ public void TestConfusables() { logln("***TRIAL1 : " + trial); trial = UnicodeJsp.getConfusables("sox", 1); logln("***TRIAL2 : " + trial); - //showPropValues( + // showPropValues( XPropertyFactory.make().getProperty("confusable"); XPropertyFactory.make().getProperty("idr"); } - private void showIcuEnums() { for (int prop = UProperty.BINARY_START; prop < UProperty.BINARY_LIMIT; ++prop) { showEnumPropValues(prop); @@ -390,7 +416,9 @@ private void showEnumPropValues(int prop) { break; } } - for (int i = UCharacter.getIntPropertyMinValue(prop); i <= UCharacter.getIntPropertyMaxValue(prop); ++i) { + for (int i = UCharacter.getIntPropertyMinValue(prop); + i <= UCharacter.getIntPropertyMaxValue(prop); + ++i) { logln("\tProperty value number:\t" + i); for (int nameChoice = 0; ; ++nameChoice) { String propertyValueName; @@ -399,7 +427,7 @@ private void showEnumPropValues(int prop) { if (propertyValueName == null && nameChoice > NameChoice.LONG) { break; } - logln("\t\t"+ nameChoice + "\t" + propertyValueName); + logln("\t\t" + nameChoice + "\t" + propertyValueName); } catch (Exception e) { break; } @@ -425,11 +453,13 @@ public void checkLanguageLocalizations() { addIfNotEmpty(scripts, displayLanguage.getScript()); addIfNotEmpty(countries, displayLanguage.getCountry()); } - Map> canDisplay = new TreeMap>(new Comparator() { - public int compare(ULocale o1, ULocale o2) { - return o1.toLanguageTag().compareTo(o2.toString()); - } - }); + Map> canDisplay = + new TreeMap>( + new Comparator() { + public int compare(ULocale o1, ULocale o2) { + return o1.toLanguageTag().compareTo(o2.toString()); + } + }); for (ULocale displayLanguage : ULocale.getAvailableLocales()) { if (displayLanguage.getCountry().length() != 0) { @@ -439,8 +469,11 @@ public int compare(ULocale o1, ULocale o2) { canDisplay.put(displayLanguage, counter); final LocaleData localeData = LocaleData.getInstance(displayLanguage); - final UnicodeSet exemplarSet = new 
UnicodeSet() - .addAll(localeData.getExemplarSet(UnicodeSet.CASE, LocaleData.ES_STANDARD)); + final UnicodeSet exemplarSet = + new UnicodeSet() + .addAll( + localeData.getExemplarSet( + UnicodeSet.CASE, LocaleData.ES_STANDARD)); final String language = displayLanguage.getLanguage(); final String script = displayLanguage.getScript(); if (language.equals("zh")) { @@ -450,12 +483,17 @@ public int compare(ULocale o1, ULocale o2) { exemplarSet.removeAll(Common.tradOnly); } } else { - exemplarSet.addAll(localeData.getExemplarSet(UnicodeSet.CASE, LocaleData.ES_AUXILIARY)); + exemplarSet.addAll( + localeData.getExemplarSet(UnicodeSet.CASE, LocaleData.ES_AUXILIARY)); if (language.equals("ja")) { exemplarSet.add('ー'); } } - final UnicodeSet okChars = (UnicodeSet) new UnicodeSet("[[:P:][:S:][:Cf:][:m:][:whitespace:]]").addAll(exemplarSet).freeze(); + final UnicodeSet okChars = + (UnicodeSet) + new UnicodeSet("[[:P:][:S:][:Cf:][:m:][:whitespace:]]") + .addAll(exemplarSet) + .freeze(); Set mixedSamples = new TreeSet(); @@ -474,50 +512,81 @@ public int compare(ULocale o1, ULocale o2) { } missing.removeAll(okChars); - final long total = counter.getTotal() - counter.getCount(Subtag.mixed) - counter.getCount(Subtag.fail); - final String missingDisplay = mixedSamples.size() == 0 ? "" : "\t" + missing.toPattern(false) + "\t" + mixedSamples; - logln(displayLanguage + "\t" + displayLanguage.getDisplayName(ULocale.ENGLISH) - + "\t" + (total/(double)counter.getTotal()) - + "\t" + total - + "\t" + counter.getCount(Subtag.language) - + "\t" + counter.getCount(Subtag.script) - + "\t" + counter.getCount(Subtag.region) - + "\t" + counter.getCount(Subtag.mixed) - + "\t" + counter.getCount(Subtag.fail) - + missingDisplay - ); + final long total = + counter.getTotal() + - counter.getCount(Subtag.mixed) + - counter.getCount(Subtag.fail); + final String missingDisplay = + mixedSamples.size() == 0 + ? 
"" + : "\t" + missing.toPattern(false) + "\t" + mixedSamples; + logln( + displayLanguage + + "\t" + + displayLanguage.getDisplayName(ULocale.ENGLISH) + + "\t" + + (total / (double) counter.getTotal()) + + "\t" + + total + + "\t" + + counter.getCount(Subtag.language) + + "\t" + + counter.getCount(Subtag.script) + + "\t" + + counter.getCount(Subtag.region) + + "\t" + + counter.getCount(Subtag.mixed) + + "\t" + + counter.getCount(Subtag.fail) + + missingDisplay); } } - private void add(ULocale displayLanguage, Subtag subtag, String code, Counter counter, UnicodeSet okChars, Set mixedSamples) { + private void add( + ULocale displayLanguage, + Subtag subtag, + String code, + Counter counter, + UnicodeSet okChars, + Set mixedSamples) { switch (canDisplay(displayLanguage, subtag, code, okChars, mixedSamples)) { - case code: - counter.add(Subtag.fail, 1); - break; - case localized: - counter.add(subtag, 1); - break; - case badLocalization: - counter.add(Subtag.mixed, 1); - break; + case code: + counter.add(Subtag.fail, 1); + break; + case localized: + counter.add(subtag, 1); + break; + case badLocalization: + counter.add(Subtag.mixed, 1); + break; } } - enum Display {code, localized, badLocalization} + enum Display { + code, + localized, + badLocalization + } - private Display canDisplay(ULocale displayLanguage, Subtag subtag, String code, UnicodeSet okChars, Set mixedSamples) { + private Display canDisplay( + ULocale displayLanguage, + Subtag subtag, + String code, + UnicodeSet okChars, + Set mixedSamples) { String display; switch (subtag) { - case language: - display = ULocale.getDisplayLanguage(code, displayLanguage); - break; - case script: - display = ULocale.getDisplayScript("und-" + code, displayLanguage); - break; - case region: - display = ULocale.getDisplayCountry("und-" + code, displayLanguage); - break; - default: throw new IllegalArgumentException(); + case language: + display = ULocale.getDisplayLanguage(code, displayLanguage); + break; + case script: + 
display = ULocale.getDisplayScript("und-" + code, displayLanguage); + break; + case region: + display = ULocale.getDisplayCountry("und-" + code, displayLanguage); + break; + default: + throw new IllegalArgumentException(); } if (display.equals(code)) { return Display.code; @@ -551,38 +620,65 @@ public void TestLanguageTag() { assertNoMatch(null, "Ill-Formed", UnicodeJsp.validateLanguageID("eng-eng", ulocale)); assertNoMatch(null, "Ill-Formed", UnicodeJsp.validateLanguageID("eng-yyy", ulocale)); - assertNoMatch(null, "Ill-Formed", UnicodeJsp.validateLanguageID("gsw-Hrkt-AQ-pinyin-AbCdE-1901-b-fo-fjdklkfj-23-a-foobar-x-1", ulocale)); + assertNoMatch( + null, + "Ill-Formed", + UnicodeJsp.validateLanguageID( + "gsw-Hrkt-AQ-pinyin-AbCdE-1901-b-fo-fjdklkfj-23-a-foobar-x-1", ulocale)); assertNoMatch(null, "Ill-Formed", UnicodeJsp.validateLanguageID("fi-Latn-US", ulocale)); assertNoMatch(null, "Ill-Formed", UnicodeJsp.validateLanguageID("fil-Latn-US", ulocale)); - assertNoMatch(null, "Ill-Formed", UnicodeJsp.validateLanguageID("aaa-Latn-003-FOOBAR-ALPHA-A-xyzw", ulocale)); + assertNoMatch( + null, + "Ill-Formed", + UnicodeJsp.validateLanguageID("aaa-Latn-003-FOOBAR-ALPHA-A-xyzw", ulocale)); assertNoMatch(null, "Ill-Formed", UnicodeJsp.validateLanguageID("aaa-A-xyzw", ulocale)); - assertNoMatch(null, "Ill-Formed", UnicodeJsp.validateLanguageID("x-aaa-Latn-003-FOOBAR-ALPHA-A-xyzw", ulocale)); - assertNoMatch(null, "Ill-Formed", UnicodeJsp.validateLanguageID("aaa-x-Latn-003-FOOBAR-ALPHA-A-xyzw", ulocale)); + assertNoMatch( + null, + "Ill-Formed", + UnicodeJsp.validateLanguageID("x-aaa-Latn-003-FOOBAR-ALPHA-A-xyzw", ulocale)); + assertNoMatch( + null, + "Ill-Formed", + UnicodeJsp.validateLanguageID("aaa-x-Latn-003-FOOBAR-ALPHA-A-xyzw", ulocale)); assertMatch(null, "invalid\\scode", UnicodeJsp.validateLanguageID("zho-Xxxx-248", ulocale)); - assertMatch(null, "invalid\\sextlang\\scode", UnicodeJsp.validateLanguageID("aaa-bbb", ulocale)); + assertMatch( + null, + 
"invalid\\sextlang\\scode", + UnicodeJsp.validateLanguageID("aaa-bbb", ulocale)); assertMatch(null, "Ill-Formed", UnicodeJsp.validateLanguageID("aaa--bbb", ulocale)); - assertMatch(null, "Ill-Formed", UnicodeJsp.validateLanguageID("aaa-bbb-abcdefghihkl", ulocale)); - assertMatch(null, "Ill-Formed", UnicodeJsp.validateLanguageID("1aaa-bbb-abcdefghihkl", ulocale)); + assertMatch( + null, "Ill-Formed", UnicodeJsp.validateLanguageID("aaa-bbb-abcdefghihkl", ulocale)); + assertMatch( + null, + "Ill-Formed", + UnicodeJsp.validateLanguageID("1aaa-bbb-abcdefghihkl", ulocale)); } public void assertMatch(String message, String pattern, Object actual) { - assertMatches(message, Pattern.compile(pattern, Pattern.COMMENTS | Pattern.DOTALL), true, actual); + assertMatches( + message, Pattern.compile(pattern, Pattern.COMMENTS | Pattern.DOTALL), true, actual); } public void assertNoMatch(String message, String pattern, Object actual) { - assertMatches(message, Pattern.compile(pattern, Pattern.COMMENTS | Pattern.DOTALL), false, actual); + assertMatches( + message, + Pattern.compile(pattern, Pattern.COMMENTS | Pattern.DOTALL), + false, + actual); } - // return handleAssert(expected == actual, message, stringFor(expected), stringFor(actual), "==", false); + // return handleAssert(expected == actual, message, stringFor(expected), + // stringFor(actual), "==", false); private void assertMatches(String message, Pattern pattern, boolean expected, Object actual) { final String actualString = actual == null ? "null" : actual.toString(); final boolean result = pattern.matcher(actualString).find() == expected; - handleAssert(result, + handleAssert( + result, message, "/" + pattern.toString() + "/", actualString, expected ? 
"matches" : "doesn't match", - true); + true); } @Test @@ -591,32 +687,36 @@ public void TestATransform() { checkCompleteness(IPA_SAMPLE, "ipa-en", new UnicodeSet("[a-z]")); String sample; sample = UnicodeJsp.showTransform("en-IPA; IPA-en", enSample); - //logln(sample); + // logln(sample); sample = UnicodeJsp.showTransform("en-IPA; IPA-deva", "The quick brown fox."); - //logln(sample); - String deva = "कँ, कं, कः, ऄ, अ, आ, इ, ई, उ, ऊ, ऋ, ऌ, ऍ, ऎ, ए, ऐ, ऑ, ऒ, ओ, औ, क, ख, ग, घ, ङ, च, छ, ज, झ, ञ, ट, ठ, ड, ढ, ण, त, थ, द, ध, न, ऩ, प, फ, ब, भ, म, य, र, ऱ, ल, ळ, ऴ, व, श, ष, स, ह, ़, ऽ, क्, का, कि, की, कु, कू, कृ, कॄ, कॅ, कॆ, के, कै, कॉ, कॊ, को, कौ, क्, क़, ख़, ग़, ज़, ड़, ढ़, फ़, य़, ॠ, ॡ, कॢ, कॣ, ०, १, २, ३, ४, ५, ६, ७, ८, ९, ।"; + // logln(sample); + String deva = + "कँ, कं, कः, ऄ, अ, आ, इ, ई, उ, ऊ, ऋ, ऌ, ऍ, ऎ, ए, ऐ, ऑ, ऒ, ओ, औ, क, ख, ग, घ, ङ, च, छ, ज, झ, ञ, ट, ठ, ड, ढ, ण, त, थ, द, ध, न, ऩ, प, फ, ब, भ, म, य, र, ऱ, ल, ळ, ऴ, व, श, ष, स, ह, ़, ऽ, क्, का, कि, की, कु, कू, कृ, कॄ, कॅ, कॆ, के, कै, कॉ, कॊ, को, कौ, क्, क़, ख़, ग़, ज़, ड़, ढ़, फ़, य़, ॠ, ॡ, कॢ, कॣ, ०, १, २, ३, ४, ५, ६, ७, ८, ९, ।"; checkCompleteness(IPA_SAMPLE, "ipa-deva", null); checkCompleteness(deva, "deva-ipa", null); } - private void checkCompleteness(String testString, String transId, UnicodeSet exceptionsAllowed) { + private void checkCompleteness( + String testString, String transId, UnicodeSet exceptionsAllowed) { String pieces[] = testString.split(",\\s*"); - UnicodeSet shouldNotBeLeftOver = new UnicodeSet().addAll(testString).remove(' ').remove(','); + UnicodeSet shouldNotBeLeftOver = + new UnicodeSet().addAll(testString).remove(' ').remove(','); if (exceptionsAllowed != null) { shouldNotBeLeftOver.removeAll(exceptionsAllowed); } UnicodeSet allProblems = new UnicodeSet(); for (String piece : pieces) { String sample = UnicodeJsp.showTransform(transId, piece); - //logln(piece + " => " + sample); + // logln(piece + " => " + sample); if (shouldNotBeLeftOver.containsSome(sample)) { - final UnicodeSet 
missing = new UnicodeSet().addAll(sample).retainAll(shouldNotBeLeftOver); + final UnicodeSet missing = + new UnicodeSet().addAll(sample).retainAll(shouldNotBeLeftOver); allProblems.addAll(missing); warnln("Leftover from " + transId + ": " + missing.toPattern(false)); Transliterator foo = Transliterator.getInstance(transId, Transliterator.FORWARD); - //Transliterator.DEBUG = true; + // Transliterator.DEBUG = true; sample = UnicodeJsp.showTransform(transId, piece); - //Transliterator.DEBUG = false; + // Transliterator.DEBUG = false; } } if (allProblems.size() != 0) { @@ -648,38 +748,70 @@ public void TestMapping() { assertContains(sample, "\u00A0ACBd\u00A0"); sample = UnicodeJsp.showTransform("casefold", "[\\u0000-\\u00FF]"); assertContains(sample, "\u00A0\u00E1\u00A0"); - } @Test public void TestGrouping() throws IOException { StringWriter printWriter = new StringWriter(); - UnicodeJsp.showSet("sc gc", "", UnicodeSetUtilities.parseUnicodeSet("[:subhead=/Syllables/:]"), true, true, true, printWriter); + UnicodeJsp.showSet( + "sc gc", + "", + UnicodeSetUtilities.parseUnicodeSet("[:subhead=/Syllables/:]"), + true, + true, + true, + printWriter); assertContains(printWriter.toString(), "General_Category=Letter_Number"); printWriter.getBuffer().setLength(0); - UnicodeJsp.showSet("subhead", "", UnicodeSetUtilities.parseUnicodeSet("[:subhead=/Syllables/:]"), true, true, true, printWriter); + UnicodeJsp.showSet( + "subhead", + "", + UnicodeSetUtilities.parseUnicodeSet("[:subhead=/Syllables/:]"), + true, + true, + true, + printWriter); assertContains(printWriter.toString(), "a=A595"); } @Test public void TestStuff() throws IOException { - //int script = UScript.getScript(0xA6E6); - //int script2 = UCharacter.getIntPropertyValue(0xA6E6, UProperty.SCRIPT); + // int script = UScript.getScript(0xA6E6); + // int script2 = UCharacter.getIntPropertyValue(0xA6E6, UProperty.SCRIPT); String propValue = Common.getXStringPropertyValue(Common.SUBHEAD, 0xA6E6, NameChoice.LONG); - 
//logln(propValue); - + // logln(propValue); - //logln("Script for A6E6: " + script + ", " + UScript.getName(script) + ", " + script2); + // logln("Script for A6E6: " + script + ", " + UScript.getName(script) + ", " + script2); try (final PrintWriter printWriter = new PrintWriter(System.out)) { - //if (true) return; - UnicodeJsp.showSet("sc gc", "", new UnicodeSet("[[:ascii:]{123}{ab}{456}]"), true, true, true, printWriter); - - UnicodeJsp.showSet("", "", new UnicodeSet("[\\u0080\\U0010FFFF]"), true, true, true, printWriter); - UnicodeJsp.showSet("", "", new UnicodeSet("[\\u0080\\U0010FFFF{abc}]"), true, true, true, printWriter); - UnicodeJsp.showSet("", "", new UnicodeSet("[\\u0080-\\U0010FFFF{abc}]"), true, true, true, printWriter); - - + // if (true) return; + UnicodeJsp.showSet( + "sc gc", + "", + new UnicodeSet("[[:ascii:]{123}{ab}{456}]"), + true, + true, + true, + printWriter); + + UnicodeJsp.showSet( + "", "", new UnicodeSet("[\\u0080\\U0010FFFF]"), true, true, true, printWriter); + UnicodeJsp.showSet( + "", + "", + new UnicodeSet("[\\u0080\\U0010FFFF{abc}]"), + true, + true, + true, + printWriter); + UnicodeJsp.showSet( + "", + "", + new UnicodeSet("[\\u0080-\\U0010FFFF{abc}]"), + true, + true, + true, + printWriter); String[] abResults = new String[3]; String[] abLinks = new String[3]; @@ -693,13 +825,23 @@ public void TestStuff() throws IOException { logln("simple: " + UnicodeJsp.getSimpleSet("[a-bm-p\uAc00]", unicodeSet, true, false)); UnicodeJsp.showSet("", "", unicodeSet, true, true, true, printWriter); - - // String archaic = "[[\u018D\u01AA\u01AB\u01B9-\u01BB\u01BE\u01BF\u021C\u021D\u025F\u0277\u027C\u029E\u0343\u03D0\u03D1\u03D5-\u03E1\u03F7-\u03FB\u0483-\u0486\u05A2\u05C5-\u05C7\u066E\u066F\u068E\u0CDE\u10F1-\u10F6\u1100-\u115E\u1161-\u11FF\u17A8\u17D1\u17DD\u1DC0-\u1DC3\u3165-\u318E\uA700-\uA707\\U00010140-\\U00010174]" + - // 
"[\u02EF-\u02FF\u0363-\u0373\u0376\u0377\u07E8-\u07EA\u1DCE-\u1DE6\u1DFE\u1DFF\u1E9C\u1E9D\u1E9F\u1EFA-\u1EFF\u2056\u2058-\u205E\u2180-\u2183\u2185-\u2188\u2C77-\u2C7D\u2E00-\u2E17\u2E2A-\u2E30\uA720\uA721\uA730-\uA778\uA7FB-\uA7FF]" + - // "[\u0269\u027F\u0285-\u0287\u0293\u0296\u0297\u029A\u02A0\u02A3\u02A5\u02A6\u02A8-\u02AF\u0313\u037B-\u037D\u03CF\u03FD-\u03FF]" + - //""; - //UnicodeJsp.showSet("",UnicodeSetUtilities.parseUnicodeSet("[:usage=/.+/:]"), false, false, printWriter); - UnicodeJsp.showSet("","", UnicodeSetUtilities.parseUnicodeSet("[:hantype=/simp/:]"), false, false, true, printWriter); + // String archaic = + // "[[\u018D\u01AA\u01AB\u01B9-\u01BB\u01BE\u01BF\u021C\u021D\u025F\u0277\u027C\u029E\u0343\u03D0\u03D1\u03D5-\u03E1\u03F7-\u03FB\u0483-\u0486\u05A2\u05C5-\u05C7\u066E\u066F\u068E\u0CDE\u10F1-\u10F6\u1100-\u115E\u1161-\u11FF\u17A8\u17D1\u17DD\u1DC0-\u1DC3\u3165-\u318E\uA700-\uA707\\U00010140-\\U00010174]" + + // + // "[\u02EF-\u02FF\u0363-\u0373\u0376\u0377\u07E8-\u07EA\u1DCE-\u1DE6\u1DFE\u1DFF\u1E9C\u1E9D\u1E9F\u1EFA-\u1EFF\u2056\u2058-\u205E\u2180-\u2183\u2185-\u2188\u2C77-\u2C7D\u2E00-\u2E17\u2E2A-\u2E30\uA720\uA721\uA730-\uA778\uA7FB-\uA7FF]" + + // + // "[\u0269\u027F\u0285-\u0287\u0293\u0296\u0297\u029A\u02A0\u02A3\u02A5\u02A6\u02A8-\u02AF\u0313\u037B-\u037D\u03CF\u03FD-\u03FF]" + + // ""; + // UnicodeJsp.showSet("",UnicodeSetUtilities.parseUnicodeSet("[:usage=/.+/:]"), false, + // false, printWriter); + UnicodeJsp.showSet( + "", + "", + UnicodeSetUtilities.parseUnicodeSet("[:hantype=/simp/:]"), + false, + false, + true, + printWriter); } } @@ -709,34 +851,42 @@ public void TestShowProperties() throws IOException { UnicodeJsp.showProperties(0x00C5, out); assertTrue("props for character", out.toString().contains("Line_Break")); logln(out.toString()); - //logln(out); + // logln(out); } public void TestIdentifiers() throws IOException { String out = UnicodeUtilities.getIdentifier("Latin"); assertTrue("identifier info", 
out.toString().contains("U+016F")); logln(out.toString()); - //logln(out); + // logln(out); } @Test public void TestShowSet() throws IOException { StringWriter out = new StringWriter(); - // UnicodeJsp.showSet("sc gc", UnicodeSetUtilities.parseUnicodeSet("[:Hangul_Syllable_Type=LVT_Syllable:]", TableStyle.extras), false, true, out); + // UnicodeJsp.showSet("sc gc", + // UnicodeSetUtilities.parseUnicodeSet("[:Hangul_Syllable_Type=LVT_Syllable:]", + // TableStyle.extras), false, true, out); // assertTrue("props table", out.toString().contains("Hangul")); // logln(out); // // out.getBuffer().setLength(0); - // UnicodeJsp.showSet("sc gc", UnicodeSetUtilities.parseUnicodeSet("[:cn:]", TableStyle.extras), false, true, out); + // UnicodeJsp.showSet("sc gc", UnicodeSetUtilities.parseUnicodeSet("[:cn:]", + // TableStyle.extras), false, true, out); // assertTrue("props table", out.toString().contains("unassigned")); // logln(out); out.getBuffer().setLength(0); - UnicodeJsp.showSet("sc", "", UnicodeSetUtilities.parseUnicodeSet("[:script=/Han/:]"), false, true,true, out); + UnicodeJsp.showSet( + "sc", + "", + UnicodeSetUtilities.parseUnicodeSet("[:script=/Han/:]"), + false, + true, + true, + out); assertFalse("props table", out.toString().contains("unassigned")); logln(out.toString()); - - } @Test @@ -749,7 +899,10 @@ public void TestParameters() { public void TestRegex() { final String fix = UnicodeRegex.fix("ab[[:ascii:]&[:Ll:]]*c"); assertEquals("", "ab[a-z]*c", fix); - assertEquals("", "abcc abxyzc ab$c", UnicodeJsp.showRegexFind(fix, "abcc abxyzc ab$c")); + assertEquals( + "", + "abcc abxyzc ab$c", + UnicodeJsp.showRegexFind(fix, "abcc abxyzc ab$c")); } @Test @@ -763,7 +916,7 @@ public void TestIdna() { checkInvalidIdna(Uts46.SINGLETON, "≠"); checkInvalidIdna(Uts46.SINGLETON, "\u0001"); checkToUnicode(Uts46.SINGLETON, "ß。ab", "ß.ab"); - //checkToPunyCode(Uts46.SINGLETON, "\u0002", "xn---"); + // checkToPunyCode(Uts46.SINGLETON, "\u0002", "xn---"); 
checkToPunyCode(Uts46.SINGLETON, "ß。ab", "ss.ab"); checkToUnicodeAndPunyCode(Uts46.SINGLETON, "faß.de", "faß.de", "fass.de"); @@ -773,9 +926,8 @@ public void TestIdna() { checkValidIdna(Idna2003.SINGLETON, "À÷"); checkValidIdna(Idna2003.SINGLETON, "≠"); - checkToUnicodeAndPunyCode(Idna2003.SINGLETON, "نامه\u200Cای.de", "نامهای.de", "xn--mgba3gch31f.de"); - - + checkToUnicodeAndPunyCode( + Idna2003.SINGLETON, "نامه\u200Cای.de", "نامهای.de", "xn--mgba3gch31f.de"); checkValues(error, Idna2008.SINGLETON); checkToUnicode(Idna2008.SINGLETON, "ß", "ß"); @@ -785,7 +937,6 @@ public void TestIdna() { checkInvalidIdna(Idna2008.SINGLETON, "≠"); checkInvalidIdna(Idna2008.SINGLETON, "ß。"); - Uts46.SINGLETON.isValid("≠"); assertTrue("uts46 a", Uts46.SINGLETON.isValid("a")); assertFalse("uts46 not equals", Uts46.SINGLETON.isValid("≠")); @@ -795,12 +946,13 @@ public void TestIdna() { testLines = UnicodeJsp.testIdnaLines(UnicodeJsp.getDefaultIdnaInput(), "[]"); assertContains(testLines, "xn--bb-eka.at"); - - //showIDNARemapDifferences(printWriter); + // showIDNARemapDifferences(printWriter); expectError("][:idna=valid:][abc]"); - assertTrue("contains hyphen", UnicodeSetUtilities.parseUnicodeSet("[:idna=valid:]").contains('-')); + assertTrue( + "contains hyphen", + UnicodeSetUtilities.parseUnicodeSet("[:idna=valid:]").contains('-')); } private void checkValues(boolean[] error, Idna idna) { @@ -809,7 +961,8 @@ private void checkValues(boolean[] error, Idna idna) { checkInvalidIdna(idna, "="); } - private void checkToUnicodeAndPunyCode(Idna idna, String source, String toUnicode, String toPunycode) { + private void checkToUnicodeAndPunyCode( + Idna idna, String source, String toUnicode, String toPunycode) { checkToUnicode(idna, source, toUnicode); checkToPunyCode(idna, source, toPunycode); } @@ -851,32 +1004,26 @@ public void expectError(String input) { public void TestBnf() { UnicodeRegex regex = new UnicodeRegex(); final String[][] tests = { - { - "c = a* wq;\n" + - "a = xyz;\n" + - 
"b = a{2} c;\n" - }, - { - "c = a* b;\n" + - "a = xyz;\n" + - "b = a{2} c;\n", - "Exception" - }, - { - "uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\n" + - "scheme = reserved+;\n" + - "host = \\/\\/ reserved+;\n" + - "query = [\\=reserved]+;\n" + - "fragment = reserved+;\n" + - "reserved = [[:ascii:][:sc=grek:]&[:alphabetic:]];\n", - "http://αβγ?huh=hi#there"}, -// { -// "/Users/markdavis/Documents/workspace/cldr/tools/java/org/unicode/cldr/util/data/langtagRegex.txt" -// } + {"c = a* wq;\n" + "a = xyz;\n" + "b = a{2} c;\n"}, + {"c = a* b;\n" + "a = xyz;\n" + "b = a{2} c;\n", "Exception"}, + { + "uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;\n" + + "scheme = reserved+;\n" + + "host = \\/\\/ reserved+;\n" + + "query = [\\=reserved]+;\n" + + "fragment = reserved+;\n" + + "reserved = [[:ascii:][:sc=grek:]&[:alphabetic:]];\n", + "http://αβγ?huh=hi#there" + }, + // { + // + // "/Users/markdavis/Documents/workspace/cldr/tools/java/org/unicode/cldr/util/data/langtagRegex.txt" + // } }; for (int i = 0; i < tests.length; ++i) { String test = tests[i][0]; - final boolean expectException = tests[i].length < 2 ? false : tests[i][1].equals("Exception"); + final boolean expectException = + tests[i].length < 2 ? 
false : tests[i][1].equals("Exception"); try { String result; if (test.endsWith(".txt")) { @@ -889,9 +1036,11 @@ public void TestBnf() { errln("Expected exception for " + test); continue; } - String result2 = result.replaceAll("[0-9]+%", ""); // just so we can use the language subtag stuff + String result2 = + result.replaceAll( + "[0-9]+%", ""); // just so we can use the language subtag stuff String resolved = regex.transform(result2); - //logln(resolved); + // logln(resolved); Matcher m = Pattern.compile(resolved, Pattern.COMMENTS).matcher(""); String checks = ""; for (int j = 1; j < tests[i].length; ++j) { @@ -905,9 +1054,9 @@ public void TestBnf() { checks += "\n"; } } - //logln("Result: " + result + "\n" + checks + "\n" + test); + // logln("Result: " + result + "\n" + checks + "\n" + test); String randomBnf = UnicodeJsp.getBnf(result, 10, 10); - //logln(randomBnf); + // logln(randomBnf); } catch (Exception e) { if (!expectException) { errln(e.getClass().getName() + ": " + e.getMessage()); @@ -920,12 +1069,12 @@ public void TestBnf() { @Test public void TestBnfMax() { BNF bnf = new BNF(new Random(), new Quoter.RuleQuoter()); - bnf.setMaxRepeat(10) - .addRules("$root=[0-9]+;") - .complete(); + bnf.setMaxRepeat(10).addRules("$root=[0-9]+;").complete(); for (int i = 0; i < 100; ++i) { String s = bnf.next(); - assertTrue("Max too large? " + i + ", " + s.length() + ", " + s, 1 <= s.length() && s.length() < 11); + assertTrue( + "Max too large? " + i + ", " + s.length() + ", " + s, + 1 <= s.length() && s.length() < 11); } } @@ -938,17 +1087,18 @@ public void TestBnfGen() { assertContains(stuff, "

\\U0001D7E8"); stuff = UnicodeJsp.getBnf("[0-9]+ ([[:WB=MB:][:WB=MN:]] [0-9]+)?", 100, 10); assertContains(stuff, "726283663"); - String bnf = "item = word | number;\n" + - "word = $alpha+;\n" + - "number = (digits (separator digits)?);\n" + - "digits = [:Pd:]+;\n" + - "separator = [[:WB=MB:][:WB=MN:]];\n" + - "$alpha = [:alphabetic:];"; + String bnf = + "item = word | number;\n" + + "word = $alpha+;\n" + + "number = (digits (separator digits)?);\n" + + "digits = [:Pd:]+;\n" + + "separator = [[:WB=MB:][:WB=MN:]];\n" + + "$alpha = [:alphabetic:];"; String fixedbnf = new UnicodeRegex().compileBnf(bnf); String fixedbnf2 = UnicodeRegex.fix(fixedbnf); - //String fixedbnfNoPercent = fixedbnf2.replaceAll("[0-9]+%", ""); + // String fixedbnfNoPercent = fixedbnf2.replaceAll("[0-9]+%", ""); String random = UnicodeJsp.getBnf(fixedbnf2, 100, 10); - //assertContains(random, "\\U0002A089"); + // assertContains(random, "\\U0002A089"); } @Test @@ -964,13 +1114,14 @@ public void TestSimpleSet() { private void checkUnicodeSetParse(String expected1, String test) { UnicodeSet actual = new UnicodeSet(); UnicodeSet expected = new UnicodeSet(expected1); - UnicodeJsp.getSimpleSet(test, actual , true, false); + UnicodeJsp.getSimpleSet(test, actual, true, false); assertEquals(test, expected, actual); } + private void checkUnicodeSetParseContains(String expected1, String test) { UnicodeSet actual = new UnicodeSet(); UnicodeSet expectedSubset = new UnicodeSet(expected1); - UnicodeJsp.getSimpleSet(test, actual , true, false); + UnicodeJsp.getSimpleSet(test, actual, true, false); assertContains(test, expectedSubset, actual); } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestLanguageid.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestLanguageid.java index fd840e0c9..7bd449fa7 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestLanguageid.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestLanguageid.java @@ -1,7 +1,6 @@ package org.unicode.jsptest; 
import com.ibm.icu.util.ULocale; - import org.junit.jupiter.api.Test; import org.unicode.jsp.LanguageCode; import org.unicode.unittest.TestFmwkMinusMinus; @@ -11,12 +10,12 @@ public class TestLanguageid extends TestFmwkMinusMinus { @Test public void TestParse() { String results; - results = LanguageCode.validate("pap-CW",new ULocale("en")); + results = LanguageCode.validate("pap-CW", new ULocale("en")); if (!assertTrue("", results.contains("Curaçao"))) { errln(results); } - results = LanguageCode.validate("$, eng-840, fr-fr",new ULocale("en")); + results = LanguageCode.validate("$, eng-840, fr-fr", new ULocale("en")); assertContains(results, "target='languageid'>fr-FR"); } } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestProperties.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestProperties.java index 18ee15634..c9a810386 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestProperties.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestProperties.java @@ -1,5 +1,18 @@ package org.unicode.jsptest; +import com.google.common.base.Splitter; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.impl.Row.R4; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UProperty.NameChoice; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; import java.io.IOException; import java.io.StringWriter; import java.util.Arrays; @@ -13,7 +26,6 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIf; import org.unicode.cldr.draft.FileUtilities; @@ -21,28 +33,15 @@ import org.unicode.jsp.NFM; import org.unicode.jsp.PropertyMetadata; import org.unicode.jsp.UnicodeJsp; -import 
org.unicode.props.UnicodeProperty; import org.unicode.jsp.UnicodeSetUtilities; import org.unicode.jsp.UnicodeUtilities; import org.unicode.jsp.XPropertyFactory; - -import com.google.common.base.Splitter; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.impl.Row.R4; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UProperty.NameChoice; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; +import org.unicode.props.UnicodeProperty; public class TestProperties extends TestFmwk2 { static XPropertyFactory factory = XPropertyFactory.make(); static Collator col = Collator.getInstance(ULocale.ROOT); + static { ((RuleBasedCollator) col).setNumericCollation(true); } @@ -58,31 +57,36 @@ public void checkContained(final String setPattern, final String containedPatter } } - public static final Set SKIP_CJK = Collections.unmodifiableSet(new HashSet(Arrays.asList( - "kAccountingNumeric", - "kCompatibilityVariant", - "kIICore", - "kIRG_GSource", - "kIRG_HSource", - "kIRG_JSource", - "kIRG_KPSource", - "kIRG_KSource", - "kIRG_MSource", - "kIRG_TSource", - "kIRG_USource", - "kIRG_VSource", - "kOtherNumeric", - "kPrimaryNumeric", - "kRSUnicode", - "Unicode_Radical_Stroke"))); - - public static final Set SKIP_FOR_NOW = Collections.unmodifiableSet(new HashSet(Arrays.asList( - "Unicode_Radical_Stroke"))); + public static final Set SKIP_CJK = + Collections.unmodifiableSet( + new HashSet( + Arrays.asList( + "kAccountingNumeric", + "kCompatibilityVariant", + "kIICore", + "kIRG_GSource", + "kIRG_HSource", + "kIRG_JSource", + "kIRG_KPSource", + "kIRG_KSource", + "kIRG_MSource", + "kIRG_TSource", + "kIRG_USource", + "kIRG_VSource", + "kOtherNumeric", + "kPrimaryNumeric", + "kRSUnicode", + "Unicode_Radical_Stroke"))); + + public static 
final Set SKIP_FOR_NOW = + Collections.unmodifiableSet( + new HashSet(Arrays.asList("Unicode_Radical_Stroke"))); @Test public void TestScope() { Set metaprops = new TreeSet(); - for (R4 propData : PropertyMetadata.getCategoryDatatypeSourceProperty()) { + for (R4 propData : + PropertyMetadata.getCategoryDatatypeSourceProperty()) { String category = propData.get0(); if (category.startsWith("X-")) { continue; @@ -95,14 +99,26 @@ public void TestScope() { } } for (String propName : ScopeOfUse.getProperties()) { - if (ScopeOfUse.isContributory(propName) || ScopeOfUse.isDeprecated(propName) || SKIP_CJK.contains(propName)) { + if (ScopeOfUse.isContributory(propName) + || ScopeOfUse.isDeprecated(propName) + || SKIP_CJK.contains(propName)) { continue; } if (!metaprops.contains(propName)) { if (SKIP_FOR_NOW.contains(propName)) { - warnln(propName + "\tCat != scope: " + null + "\t" + ScopeOfUse.getScope(propName)); + warnln( + propName + + "\tCat != scope: " + + null + + "\t" + + ScopeOfUse.getScope(propName)); } else { - warnln(propName + "\tCat != scope: " + null + "\t" + ScopeOfUse.getScope(propName)); + warnln( + propName + + "\tCat != scope: " + + null + + "\t" + + ScopeOfUse.getScope(propName)); } } } @@ -113,36 +129,48 @@ public void TestScopeForPropertyAliases() { for (String propName : PropertyAliases.names) { String scope = ScopeOfUse.getScope(propName); if (scope == null) { - msg(propName + " in PropertyAliases, but not in http://unicode.org/reports/tr44/proposed.html#Property_Index_Table", - SKIP_CJK.contains(propName) ? LOG - : SKIP_FOR_NOW.contains(propName) ? WARN - : ERR, - true, true); + msg( + propName + + " in PropertyAliases, but not in http://unicode.org/reports/tr44/proposed.html#Property_Index_Table", + SKIP_CJK.contains(propName) + ? LOG + : SKIP_FOR_NOW.contains(propName) ? 
WARN : ERR, + true, + true); } } } - public enum Source {METADATA, ICU, FACTORY, PROPERTY_ALIASES} - private static final Comparator LC = new Comparator() { - @Override - public int compare(String o1, String o2) { - return o1.compareToIgnoreCase(o2); - } - }; + public enum Source { + METADATA, + ICU, + FACTORY, + PROPERTY_ALIASES + } + + private static final Comparator LC = + new Comparator() { + @Override + public int compare(String o1, String o2) { + return o1.compareToIgnoreCase(o2); + } + }; @Test public void TestPropertySupport() { - Relation source = Relation.of(new TreeMap>(LC), TreeSet.class); - for (R4 propData : PropertyMetadata.getCategoryDatatypeSourceProperty()) { + Relation source = + Relation.of(new TreeMap>(LC), TreeSet.class); + for (R4 propData : + PropertyMetadata.getCategoryDatatypeSourceProperty()) { String propName = propData.get3(); put(source, propName, Source.METADATA); } int[][] ranges = { - {UProperty.BINARY_START, UProperty.BINARY_LIMIT}, - {UProperty.INT_START, UProperty.INT_LIMIT}, - {UProperty.DOUBLE_START, UProperty.DOUBLE_LIMIT}, - {UProperty.STRING_START, UProperty.STRING_LIMIT}, - {UProperty.OTHER_PROPERTY_START, UProperty.OTHER_PROPERTY_LIMIT}, + {UProperty.BINARY_START, UProperty.BINARY_LIMIT}, + {UProperty.INT_START, UProperty.INT_LIMIT}, + {UProperty.DOUBLE_START, UProperty.DOUBLE_LIMIT}, + {UProperty.STRING_START, UProperty.STRING_LIMIT}, + {UProperty.OTHER_PROPERTY_START, UProperty.OTHER_PROPERTY_LIMIT}, }; for (int[] range : ranges) { for (int property = range[0]; property < range[1]; ++property) { @@ -202,14 +230,12 @@ public void TestNFM() { @Test public void TestInstantiateProps() { - Set> propInfo = new TreeSet>(); - //Relation typeToProp = new Relation(new TreeMap(), TreeSet.class, col); - List availableNames = (List)factory.getAvailableNames(); - TreeSet sortedProps = Builder - .with(new TreeSet(col)) - .addAll(availableNames) - .remove("Name") - .get(); + Set> propInfo = + new TreeSet>(); + // Relation typeToProp = 
new Relation(new TreeMap(), TreeSet.class, col); + List availableNames = (List) factory.getAvailableNames(); + TreeSet sortedProps = + Builder.with(new TreeSet(col)).addAll(availableNames).remove("Name").get(); int cp = 'a'; logln("Properties for " + UTF16.valueOf(cp)); @@ -219,8 +245,8 @@ public void TestInstantiateProps() { boolean isDefault; try { prop = factory.getProperty(propName); - //int type = prop.getType(); - //typeToProp.put(type, propName); + // int type = prop.getType(); + // typeToProp.put(type, propName); isDefault = prop.isDefault(cp); } catch (Exception e) { errln(propName + "\t" + Arrays.asList(e.getStackTrace()).toString()); @@ -237,7 +263,8 @@ public void TestInstantiateProps() { // } Set notCovered = new HashSet(availableNames); - for (R4 propData : PropertyMetadata.getCategoryDatatypeSourceProperty()) { + for (R4 propData : + PropertyMetadata.getCategoryDatatypeSourceProperty()) { logln(propData.toString()); notCovered.remove(propData.get3()); } @@ -252,7 +279,7 @@ public void TestPropsTable() throws IOException { UnicodeJsp.showPropsTable(out, "Block", "properties.jsp"); assertTrue("props table", out.toString().contains("Cherokee")); logln(out.toString()); - //System.out.println(out); + // System.out.println(out); } @Test @@ -279,7 +306,7 @@ private void checkProperty(XPropertyFactory factory, String prop) { System.out.println("Testing " + prop + "\t\t" + property.getTypeName()); List values = property.getAvailableValues(); for (String value : values) { - //HashSet seen = new HashSet(); + // HashSet seen = new HashSet(); // for (int i = 0; i <= 0x10FFFF; ++i) { // String value = property.getValue(i); // if (seen.contains(value)) { @@ -302,7 +329,8 @@ private void checkProperty(XPropertyFactory factory, String prop) { if (expectedRegex.contains("}")) { // int debug = 0; continue; - }; + } + ; List alts = property.getValueAliases(value); if (!alts.contains(value)) { errln(value + " not in " + alts + " for " + prop); @@ -332,41 +360,44 @@ 
private void checkProperty(XPropertyFactory factory, String prop) { } } - @EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + @EnabledIf( + value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", + disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") @Test public void TestAllProperties() { UnicodeProperty foo; XPropertyFactory factory = XPropertyFactory.make(); -// checkProperty(factory, "NFKC_Casefold"); -// checkProperty(factory, "Age"); + // checkProperty(factory, "NFKC_Casefold"); + // checkProperty(factory, "Age"); ////// checkProperty(factory, "Lead_Canonical_Combining_Class"); ////// checkProperty(factory, "Joining_Group"); // if (true) return; long start = System.currentTimeMillis(); - for (String prop : (Collection)factory.getAvailableNames()) { + for (String prop : (Collection) factory.getAvailableNames()) { try { checkProperty(factory, prop); } catch (Throwable e) { - errln (prop + "\t" + maxLen(150, e.getMessage())); + errln(prop + "\t" + maxLen(150, e.getMessage())); break; } long current = System.currentTimeMillis(); - logln("Time: " + prop + "\t\t" + (current-start) + "ms"); + logln("Time: " + prop + "\t\t" + (current - start) + "ms"); start = current; } } private String maxLen(int max, String message) { - return message.length() <= max ? message : message.substring(0,max)+"…"; + return message.length() <= max ? 
message : message.substring(0, max) + "…"; } static final class PropertyAliases { static Set names = new TreeSet(); + static { Splitter SEMI = Splitter.on(';').trimResults(); - for (String line : FileUtilities.in(PropertyAliases.class,"PropertyAliases.txt")) { + for (String line : FileUtilities.in(PropertyAliases.class, "PropertyAliases.txt")) { // bc ; Bidi_Class if (line.isEmpty() || line.startsWith("#")) { continue; @@ -380,48 +411,53 @@ static final class PropertyAliases { static final class ScopeOfUse { public ScopeOfUse(List parts) { scope = parts.get(1); - contributory = parts.get(1).equals("Contributory Properties") - || parts.get(0).equals("Composition_Exclusion") - || parts.get(0).equals("Decomposition_Mapping"); + contributory = + parts.get(1).equals("Contributory Properties") + || parts.get(0).equals("Composition_Exclusion") + || parts.get(0).equals("Decomposition_Mapping"); deprecated = !parts.get(2).isEmpty(); } + public final String scope; public final boolean deprecated; public final boolean contributory; - private static final Map data; + private static final Map data; public static ScopeOfUse get(String propName) { return data.get(propName); } + public static boolean isDeprecated(String prop) { ScopeOfUse item = get(prop); return item == null ? false : item.contributory; } + public static boolean isContributory(String prop) { ScopeOfUse item = get(prop); return item == null ? false : item.deprecated; } + public static String getScope(String prop) { ScopeOfUse item = get(prop); return item == null ? 
null : item.scope; } + public static Set getProperties() { return data.keySet(); } static { Splitter SEMI = Splitter.on(';').trimResults(); - TreeMap _data = new TreeMap(); - for (String line : FileUtilities.in(PropertyAliases.class,"ScopeOfUse.txt")) { + TreeMap _data = new TreeMap(); + for (String line : FileUtilities.in(PropertyAliases.class, "ScopeOfUse.txt")) { // bc ; Bidi_Class if (line.isEmpty() || line.startsWith("#")) { continue; } List parts = SEMI.splitToList(line); - _data.put(parts.get(0),new ScopeOfUse(parts)); + _data.put(parts.get(0), new ScopeOfUse(parts)); } data = Collections.unmodifiableMap(_data); } } - } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestScriptTester.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestScriptTester.java index 8d9655b79..644120f09 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestScriptTester.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestScriptTester.java @@ -1,14 +1,12 @@ package org.unicode.jsptest; +import com.ibm.icu.text.Normalizer; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.List; import java.util.Set; import java.util.TreeSet; - -import com.ibm.icu.text.Normalizer; - import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIf; import org.unicode.jsp.Builder; @@ -20,89 +18,91 @@ public class TestScriptTester extends TestFmwkMinusMinus { - @EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") - @Test - public void TestBasic() { - ScriptTester scriptTester = ScriptTester.start().get(); + @EnabledIf( + value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", + disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + @Test + public void TestBasic() { + ScriptTester scriptTester = ScriptTester.start().get(); - String[] bad = { - "gー", - "ä\u0308", - "1٦", - "٦۶", - "1aᎠ", - "aᎠ1", 
- "aᎠ", // Cherokee + String[] bad = { + "gー", "ä\u0308", "1٦", "٦۶", "1aᎠ", "aᎠ1", "aᎠ", // Cherokee "aα", // simplified and traditional "万丟", // simplified and traditional - }; - for (String testCase : bad) { - boolean result = scriptTester.isOk(testCase); - assertFalse(testCase, result); + }; + for (String testCase : bad) { + boolean result = scriptTester.isOk(testCase); + assertFalse(testCase, result); + } + + String[] cases = {"abc", "1abc", "abc1", "ab가", "一가", "一万", "一丟", "一\u4E07"}; + for (String testCase : cases) { + boolean result = scriptTester.isOk(testCase); + assertTrue(testCase + " should be ok: ", result); + } } - String[] cases = {"abc", "1abc", "abc1", "ab가", "一가", "一万", "一丟", "一\u4E07"}; - for (String testCase : cases) { - boolean result = scriptTester.isOk(testCase); - assertTrue(testCase + " should be ok: ", result); + @Test + public void TestFilter() { + checkFilter("万"); + checkFilter("丟"); + checkFilter("\u4e01"); } - } - - @Test - public void TestFilter() { - checkFilter("万"); - checkFilter("丟"); - checkFilter("\u4e01"); - } - - private void checkFilter(String testChar) { - List> listTrial = Builder.with(new ArrayList>()) - .add(Builder.with(new LinkedHashSet()).addAll("\u30FC", "-", "\u4e00").get()) - .add(Builder.with(new LinkedHashSet()).addAll(testChar).get()) - .get(); - ScriptTester scriptTester = ScriptTester.start().get(); - String before = listTrial.toString(); - scriptTester.filterTable(listTrial); - String after = listTrial.toString(); - assertEquals("filterTable", before, after); - } - - @EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") - @Test - public void TestConfusables() { - TreeSet expected = Builder.with(new TreeSet()).addAll("google", "goog1e", "googIe").get(); - checkScriptCheck("google", expected); - checkScriptCheck("mark", null); - checkScriptCheck("scope", null); - checkScriptCheck("pop", null); - - //g໐໐g1e - } - - private 
void checkScriptCheck(String string, TreeSet expected) { - Confusables confusables = new Confusables(string) - .setNormalizationCheck(Normalizer.NFKC) - .setScriptCheck(ScriptCheck.same) - .setAllowedCharacters(XIDModifications.getAllowed()); - - TreeSet items = Builder.with(new TreeSet()).addAll(confusables).get(); - if (expected != null) { - assertEquals("Confusables for '" + string + - "'", expected, items); + private void checkFilter(String testChar) { + List> listTrial = + Builder.with(new ArrayList>()) + .add( + Builder.with(new LinkedHashSet()) + .addAll("\u30FC", "-", "\u4e00") + .get()) + .add(Builder.with(new LinkedHashSet()).addAll(testChar).get()) + .get(); + ScriptTester scriptTester = ScriptTester.start().get(); + String before = listTrial.toString(); + scriptTester.filterTable(listTrial); + String after = listTrial.toString(); + assertEquals("filterTable", before, after); } - Confusables confusables2 = new Confusables(string) - .setNormalizationCheck(Normalizer.NFKC) - .setAllowedCharacters(XIDModifications.getAllowed()); + @EnabledIf( + value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", + disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + @Test + public void TestConfusables() { + TreeSet expected = + Builder.with(new TreeSet()).addAll("google", "goog1e", "googIe").get(); + checkScriptCheck("google", expected); + checkScriptCheck("mark", null); + checkScriptCheck("scope", null); + checkScriptCheck("pop", null); + + // g໐໐g1e + } - HashSet filteredDifferently = new HashSet(); - for (String s : confusables2) { - if (Confusables.scriptTester.isOk(s)) { - filteredDifferently.add(s); - } + private void checkScriptCheck(String string, TreeSet expected) { + Confusables confusables = + new Confusables(string) + .setNormalizationCheck(Normalizer.NFKC) + .setScriptCheck(ScriptCheck.same) + .setAllowedCharacters(XIDModifications.getAllowed()); + + TreeSet items = Builder.with(new TreeSet()).addAll(confusables).get(); + if (expected 
!= null) { + assertEquals("Confusables for '" + string + "'", expected, items); + } + + Confusables confusables2 = + new Confusables(string) + .setNormalizationCheck(Normalizer.NFKC) + .setAllowedCharacters(XIDModifications.getAllowed()); + + HashSet filteredDifferently = new HashSet(); + for (String s : confusables2) { + if (Confusables.scriptTester.isOk(s)) { + filteredDifferently.add(s); + } + } + assertEquals("Confusables for '" + string + "'", items, filteredDifferently); } - assertEquals("Confusables for '" + string + - "'", items, filteredDifferently); - } } diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestTypology.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestTypology.java index 5e4423e71..ad4e6fd95 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestTypology.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestTypology.java @@ -1,5 +1,15 @@ package org.unicode.jsptest; +import com.ibm.icu.dev.test.TestFmwk; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.impl.Row.R3; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; @@ -13,41 +23,31 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.MultiComparator; import org.unicode.cldr.util.UnicodeSetPrettyPrinter; import org.unicode.jsp.Subheader; import org.unicode.jsp.Typology; -import com.ibm.icu.dev.test.TestFmwk; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.impl.Row.R3; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.NumberFormat; -import 
com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - -/** - * Was commented out of TestAll. Status? - */ +/** Was commented out of TestAll. Status? */ public class TestTypology extends TestFmwk { public static void main(String[] args) { new TestTypology().run(args); } - MultiComparator col = new MultiComparator( - Collator.getInstance(), - new UTF16.StringComparator(true, false, 0)); + MultiComparator col = + new MultiComparator(Collator.getInstance(), new UTF16.StringComparator(true, false, 0)); Collator primaryOnly = Collator.getInstance(); - UnicodeSetPrettyPrinter pp = new UnicodeSetPrettyPrinter().setOrdering(Collator.getInstance()).setSpaceComparator(primaryOnly); + UnicodeSetPrettyPrinter pp = + new UnicodeSetPrettyPrinter() + .setOrdering(Collator.getInstance()) + .setSpaceComparator(primaryOnly); public void TestSimple() throws IOException { - Set archaicLabels = new HashSet(Arrays.asList("Archaic Ancient Biblical Historic".split("\\s"))); + Set archaicLabels = + new HashSet( + Arrays.asList("Archaic Ancient Biblical Historic".split("\\s"))); UnicodeSet archaic = new UnicodeSet(); // System.out.println(); // System.out.println("Label\tSet"); @@ -68,35 +68,43 @@ public void TestSimple() throws IOException { logln(list.toString()); String GEN_DIR = "/Users/markdavis/Documents/workspace/Generated/"; - PrintWriter out = FileUtilities.openUTF8Writer(GEN_DIR + "/categories", "CategoryLabels.txt"); - PrintWriter html = FileUtilities.openUTF8Writer(GEN_DIR + "/categories", "CategoryLabels.html"); - String fontList = "Georgia, 'Times New Roman', Times, Symbola, Aegyptus, Aegean, Akkadian, Analecta, Musica, Code2000, Code2001, Code2002, serif"; + PrintWriter out = + FileUtilities.openUTF8Writer(GEN_DIR + "/categories", "CategoryLabels.txt"); + PrintWriter html = + FileUtilities.openUTF8Writer(GEN_DIR + "/categories", "CategoryLabels.html"); + String fontList = + "Georgia, 'Times New Roman', Times, Symbola, Aegyptus, Aegean, Akkadian, Analecta, Musica, 
Code2000, Code2001, Code2002, serif"; html.println( - "\n" + - "\n" + - "\n" + - " " + - "\n" + - "

L2/10-450R4

\n" + - "

Subject: Labels and UTR#49

\n" + - "

From: Mark Davis

\n" + - "

Date: " + new Date() + "

\n" + - "

The following provides a breakdown of the data file for UTR#49. I'll explain more about the" + - " format during the meeting. Body fonts include: " + fontList + ".

" - ); + "\n" + + "\n" + + "\n" + + " " + + "\n" + + "

L2/10-450R4

\n" + + "

Subject: Labels and UTR#49

\n" + + "

From: Mark Davis

\n" + + "

Date: " + + new Date() + + "

\n" + + "

The following provides a breakdown of the data file for UTR#49. I'll explain more about the" + + " format during the meeting. Body fonts include: " + + fontList + + ".

"); // startTable(html, out, "Labels"); // showLabel(html, out, pp, "Label", null, false); @@ -108,19 +116,23 @@ public void TestSimple() throws IOException { // // closeTable(html, out); - startTable(html, out, "Labels, Other Labels in Path, and Subheads"); - showLabel(html, out, pp, "Label\tSize\tContents/Other-Labels/Subheads", LabelStyle.title, LabelRowStyle.normal); + showLabel( + html, + out, + pp, + "Label\tSize\tContents/Other-Labels/Subheads", + LabelStyle.title, + LabelRowStyle.normal); for (String label : list) { UnicodeSet uset = Typology.label_to_uset.get(label); - UnicodeSet[] usets = { uset }; + UnicodeSet[] usets = {uset}; showLabel(html, out, pp, label, LabelStyle.subhead, LabelRowStyle.normal, usets); } closeTable(html, out); - // startTable(html, out, "Label in single path"); // showLabel(html, out, pp, "Label\tPath\tSize\tContents", LabelStyle.title); // @@ -147,16 +159,22 @@ public void TestSimple() throws IOException { for (Entry parentAndSet : lists.entrySet()) { String parent = parentAndSet.getKey(); UnicodeSet uset = parentAndSet.getValue(); - showLabel(html, out, pp, (label + "\t" + parent), LabelStyle.normal, + showLabel( + html, + out, + pp, + (label + "\t" + parent), + LabelStyle.normal, !label.equals(oldLabel) ? 
LabelRowStyle.normal : LabelRowStyle.sub, - uset); + uset); oldLabel = label; } } closeTable(html, out); Set props = getProps(); - Set>>> matches = new TreeSet>>>(); + Set>>> matches = + new TreeSet>>>(); for (String label : list) { UnicodeSet uset = Typology.label_to_uset.get(label); @@ -185,12 +203,19 @@ public void TestSimple() throws IOException { closestValue = row.get0(); } - R3>> match = Row.of(closestValue, label, close); + R3>> match = + Row.of(closestValue, label, close); matches.add(match); } startTable(html, out, "Labels compared to Properties"); - showLabel(html, out, pp, "Overlap\tLabel\tProp/Subhead\tCount Shared\tShared\tLabel-Prop\tProp-Label", LabelStyle.title, LabelRowStyle.normal); + showLabel( + html, + out, + pp, + "Overlap\tLabel\tProp/Subhead\tCount Shared\tShared\tLabel-Prop\tProp-Label", + LabelStyle.title, + LabelRowStyle.normal); for (R3>> match : matches) { String label = match.get1(); @@ -206,15 +231,24 @@ public void TestSimple() throws IOException { UnicodeSet label_AND_propset = new UnicodeSet(uset).retainAll(propSet); UnicodeSet label_propset = new UnicodeSet(uset).removeAll(propSet); UnicodeSet propset_label = new UnicodeSet(propSet).removeAll(uset); - //showLabel(html, out, pp, label + "\t" + path, Typology.path_to_uset.get(path), false); - showLabel(html, out, pp, pf.format(closeness) - + "\t" + label - + "\t" + propData.getName() + "=" + propData.getValue(), + // showLabel(html, out, pp, label + "\t" + path, Typology.path_to_uset.get(path), + // false); + showLabel( + html, + out, + pp, + pf.format(closeness) + + "\t" + + label + + "\t" + + propData.getName() + + "=" + + propData.getValue(), LabelStyle.normal, first ? 
LabelRowStyle.normal : LabelRowStyle.sub, - label_AND_propset, - label_propset, propset_label - ); + label_AND_propset, + label_propset, + propset_label); first = false; } } @@ -229,6 +263,7 @@ public void TestSimple() throws IOException { static class InverseIterator implements Iterator, Iterable { private ArrayList items; private int position; + private InverseIterator(ArrayList items) { this.items = items; this.position = items.size(); @@ -275,9 +310,11 @@ public PropData(UnicodeSet values, String name, String value) { public UnicodeSet getSet() { return get0(); } + public String getName() { return get1(); } + public String getValue() { return get2(); } @@ -288,31 +325,40 @@ private double getCloseness(UnicodeSet get2, UnicodeSet uset) { return 0d; } UnicodeSet intersect = new UnicodeSet(get2).retainAll(uset); - return (2d * intersect.size())/(get2.size() + uset.size()); + return (2d * intersect.size()) / (get2.size() + uset.size()); } - static final Set SKIP_PROPS = new HashSet(Arrays.asList("Trail_Canonical_Combining_Class Lead_Canonical_Combining_Class".split("\\s"))); + static final Set SKIP_PROPS = + new HashSet( + Arrays.asList( + "Trail_Canonical_Combining_Class Lead_Canonical_Combining_Class" + .split("\\s"))); private Set getProps() { Set props = new HashSet(); int[][] ranges = { - {UProperty.BINARY_START, UProperty.BINARY_LIMIT}, - {UProperty.INT_START, UProperty.INT_LIMIT}, - {UProperty.DOUBLE_START, UProperty.DOUBLE_LIMIT}, - {UProperty.STRING_START, UProperty.STRING_LIMIT}, + {UProperty.BINARY_START, UProperty.BINARY_LIMIT}, + {UProperty.INT_START, UProperty.INT_LIMIT}, + {UProperty.DOUBLE_START, UProperty.DOUBLE_LIMIT}, + {UProperty.STRING_START, UProperty.STRING_LIMIT}, }; UnicodeSet skip = new UnicodeSet("[[:cn:][:cs:][:co:]]"); for (int[] range : ranges) { - for (int propEnum = range[0]; propEnum < range[1]; ++propEnum) { + for (int propEnum = range[0]; propEnum < range[1]; ++propEnum) { String alias = UCharacter.getPropertyName(propEnum, 
UProperty.NameChoice.LONG); if (SKIP_PROPS.contains(alias)) { continue; } int max = UCharacter.getIntPropertyMaxValue(propEnum); - for (int valueEnum = UCharacter.getIntPropertyMinValue(propEnum); valueEnum <= max; ++valueEnum) { + for (int valueEnum = UCharacter.getIntPropertyMinValue(propEnum); + valueEnum <= max; + ++valueEnum) { try { - UnicodeSet foo = new UnicodeSet().applyIntPropertyValue(propEnum, valueEnum); - String valueAlias = UCharacter.getPropertyValueName(propEnum, valueEnum, UProperty.NameChoice.LONG); + UnicodeSet foo = + new UnicodeSet().applyIntPropertyValue(propEnum, valueEnum); + String valueAlias = + UCharacter.getPropertyValueName( + propEnum, valueEnum, UProperty.NameChoice.LONG); // foo.removeAll(skip); // if (foo.size() == 0) { @@ -338,13 +384,14 @@ private Set getProps() { return props; } - private void addProps(Set props, UnicodeSet skip, String subhead, UnicodeSet uset, String alias) { + private void addProps( + Set props, UnicodeSet skip, String subhead, UnicodeSet uset, String alias) { UnicodeSet foo = new UnicodeSet(uset); foo.removeAll(skip); if (foo.size() != 0) { PropData r = new PropData(foo, alias, subhead); props.add(r); - logln(alias + "=" + subhead); + logln(alias + "=" + subhead); } } @@ -364,11 +411,24 @@ private void startTable(PrintWriter html, PrintWriter out, String title) { final String unicodeDataDirectory = "../jsp/"; Subheader subheader = new Subheader(Typology.class.getResourceAsStream("NamesList.txt")); - enum LabelStyle {title, normal, subhead} - enum LabelRowStyle {normal, sub} + enum LabelStyle { + title, + normal, + subhead + } + + enum LabelRowStyle { + normal, + sub + } - private void showLabel(PrintWriter html, PrintWriter printStream, UnicodeSetPrettyPrinter ppx, - String labelName, LabelStyle subhead, LabelRowStyle labelRowStyle, + private void showLabel( + PrintWriter html, + PrintWriter printStream, + UnicodeSetPrettyPrinter ppx, + String labelName, + LabelStyle subhead, + LabelRowStyle labelRowStyle, 
UnicodeSet... usets) { String cell; String setString; @@ -377,26 +437,35 @@ private void showLabel(PrintWriter html, PrintWriter printStream, UnicodeSetPret cell = "th"; } else if (subhead == LabelStyle.subhead) { cell = "td"; - Map subheads = getSubheadInfo(usets[0]); - setString = formatUnicodeSet(usets) - + "

" + Typology.label_parent_uset.get(labelName).keySet() - + "

" + join(subheads, labelName); - - sizeString = usets.length == 0 ? "" : usets[0].size()+""; + Map subheads = getSubheadInfo(usets[0]); + setString = + formatUnicodeSet(usets) + + "

" + + Typology.label_parent_uset.get(labelName).keySet() + + "

" + + join(subheads, labelName); + + sizeString = usets.length == 0 ? "" : usets[0].size() + ""; labelName = BREAK_AFTER.matcher(labelName).replaceAll("$1\u200B"); labelName += "\t" + sizeString + "\t" + setString; } else { cell = "td"; setString = formatUnicodeSet(usets); - sizeString = usets.length == 0 ? "" : usets[0].size()+""; + sizeString = usets.length == 0 ? "" : usets[0].size() + ""; labelName = BREAK_AFTER.matcher(labelName).replaceAll("$1\u200B"); labelName += "\t" + sizeString + "\t" + setString; } printStream.println(labelName); - html.println("<" + cell + ">" - + labelName.replace("\t", "<" + cell + ">") - + "

"); + html.println( + "<" + + cell + + ">" + + labelName.replace("\t", "<" + cell + ">") + + ""); } private String formatUnicodeSet(UnicodeSet... usets) { @@ -411,7 +480,7 @@ private String formatUnicodeSet(UnicodeSet... usets) { } private String formatUnicodeSet(UnicodeSet uset) { - if (uset.size() ==0) return "∅"; + if (uset.size() == 0) return "∅"; String setString; setString = pp.format(uset); if (setString.length() > LIMIT) { @@ -425,13 +494,13 @@ private String formatUnicodeSet(UnicodeSet uset) { setString = setString.substring(0, limit) + "…"; } String uset2 = uset.toPattern(false); - String href = ""; + String href = + ""; return "" + href + setString + ""; } - - private Map getSubheadInfo(UnicodeSet uset) { - Map subheads = new TreeMap(); + private Map getSubheadInfo(UnicodeSet uset) { + Map subheads = new TreeMap(); for (String s : uset) { String subheadString = subheader.getSubheader(s.codePointAt(0)); double percent; @@ -441,14 +510,14 @@ private Map getSubheadInfo(UnicodeSet uset) { } else { UnicodeSet other = subheader.getUnicodeSet(subheadString); UnicodeSet overlap = new UnicodeSet(other).retainAll(uset); - percent = overlap.size()/(double)other.size(); + percent = overlap.size() / (double) other.size(); } subheads.put(subheadString, percent); } return subheads; } - public static String join(Map map, String label) { + public static String join(Map map, String label) { StringBuffer result = new StringBuffer("OL: "); Set lists = Typology.labelToPaths.getAll(label); TreeSet otherLabels = new TreeSet(); @@ -516,7 +585,7 @@ private static void joinItems(StringBuffer result, String label, TreeSet } catch (Exception e) { throw new IllegalArgumentException(); } - appendWithCoverage(result, otherLabel, overlap.size()/(double)other.size()); + appendWithCoverage(result, otherLabel, overlap.size() / (double) other.size()); } } @@ -527,11 +596,11 @@ private static void appendWithCoverage(StringBuffer result, String item, Double } private static String 
coverageToClass(double coverage) { - String pc = coverage >= 0.999d ? "p100" - : coverage > 0.666d ? "p66" - : coverage > 0.333 ? "p33" - : "p0"; - return pc; + String pc = + coverage >= 0.999d + ? "p100" + : coverage > 0.666d ? "p66" : coverage > 0.333 ? "p33" : "p0"; + return pc; } static NumberFormat pf = NumberFormat.getPercentInstance(); diff --git a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java index bbe1a543d..3398bbb8f 100644 --- a/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java +++ b/UnicodeJsps/src/test/java/org/unicode/jsptest/TestUnicodeSet.java @@ -1,19 +1,34 @@ package org.unicode.jsptest; +import static org.junit.jupiter.api.Assumptions.assumeFalse; +import static org.junit.jupiter.api.Assumptions.assumeTrue; +import static org.junit.jupiter.params.provider.Arguments.arguments; + +import com.google.common.base.Objects; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UProperty.NameChoice; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.LocaleData; +import com.ibm.icu.util.TimeZone; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.VersionInfo; import java.io.IOException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashMap; -import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedMap; import java.util.TreeSet; import java.util.stream.Stream; - import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIf; import org.junit.jupiter.params.ParameterizedTest; @@ -22,39 +37,20 @@ import org.unicode.jsp.CharEncoder; import org.unicode.jsp.Common; import 
org.unicode.jsp.UnicodeJsp; -import org.unicode.props.UnicodeProperty; import org.unicode.jsp.UnicodeSetUtilities; import org.unicode.jsp.UnicodeUtilities; import org.unicode.jsp.XPropertyFactory; +import org.unicode.props.UnicodeProperty; -import com.google.common.base.Objects; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UProperty.NameChoice; -import com.ibm.icu.text.BreakIterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.LocaleData; -import com.ibm.icu.util.TimeZone; -import com.ibm.icu.util.ULocale; -import com.ibm.icu.util.VersionInfo; - -import static org.junit.jupiter.api.Assumptions.assumeFalse; -import static org.junit.jupiter.api.Assumptions.assumeTrue; -import static org.junit.jupiter.params.provider.Arguments.arguments; - - -public class TestUnicodeSet extends TestFmwk2 { +public class TestUnicodeSet extends TestFmwk2 { @Test public void TestInput() { String[][] tests = { - // loose, strict - {"U+00A0", "[\u00A0]"}, - {"U+10FFFE..U+10FFFF", "[\\U0010FFFE\\U0010FFFF]"}, - {"a..z", "[a-z]"}, + // loose, strict + {"U+00A0", "[\u00A0]"}, + {"U+10FFFE..U+10FFFF", "[\\U0010FFFE\\U0010FFFF]"}, + {"a..z", "[a-z]"}, }; for (String[] test : tests) { UnicodeSet source = UnicodeSetUtilities.parseUnicodeSet(test[0]); @@ -65,10 +61,10 @@ public void TestInput() { @Test public void TestOutput() { String[][] tests = { - // loose, strict - {"[\u00A0]", "[\\u00A0]", "abb", "esc"}, - {"[{👨\u200D👨\u200D👦}]", "[{👨‍👨‍👦}]"}, - {"[{👨\u200D❤\uFE0F\u200D👨}]", "[{👨‍❤️‍👨}]"}, + // loose, strict + {"[\u00A0]", "[\\u00A0]", "abb", "esc"}, + {"[{👨\u200D👨\u200D👦}]", "[{👨‍👨‍👦}]"}, + {"[{👨\u200D❤\uFE0F\u200D👨}]", "[{👨‍❤️‍👨}]"}, }; assertFalse("", UnicodeUtilities.WHITESPACE_IGNORABLES_C.contains(UnicodeUtilities.JOINER)); @@ -80,13 +76,16 @@ public void TestOutput() { if (test.length > 3) { abbreviate = 
test[2].startsWith("abb"); } - String a_out = UnicodeUtilities.getPrettySet(new UnicodeSet(test[0]), abbreviate, escape); + String a_out = + UnicodeUtilities.getPrettySet(new UnicodeSet(test[0]), abbreviate, escape); - assertEquals("input unicode set " + test[0] + ", " + abbreviate + ", " + escape, test[1], a_out); + assertEquals( + "input unicode set " + test[0] + ", " + abbreviate + ", " + escape, + test[1], + a_out); } } - @Test public void TestEmoji() throws IOException { StringBuilder b = new StringBuilder(); @@ -97,21 +96,24 @@ public void TestEmoji() throws IOException { } } - - @EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + @EnabledIf( + value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", + disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") @Test public void TestPretty() { String[] tests = { - "00A0", - "0000", + "00A0", "0000", }; for (String test : tests) { UnicodeSet source = UnicodeSetUtilities.parseUnicodeSet("[\\u" + test + "]"); - for (boolean abbreviate : new boolean[]{false}) { - for (boolean escape : new boolean[]{false, true}) { + for (boolean abbreviate : new boolean[] {false}) { + for (boolean escape : new boolean[] {false, true}) { String derived = UnicodeUtilities.getPrettySet(source, abbreviate, escape); UnicodeSet reparsed = new UnicodeSet(derived); - if (!assertEquals("UnicodeSet " + source + ", " + abbreviate + ", " + escape, source, reparsed)) { + if (!assertEquals( + "UnicodeSet " + source + ", " + abbreviate + ", " + escape, + source, + reparsed)) { logln(derived); } else if (!assertTrue("Contains", derived.contains(test))) { logln(derived); @@ -119,24 +121,25 @@ public void TestPretty() { } } } - String test = "[[[:age=4.1:]&" + - "[:toNFM!=@toNFKC_CF@:]]-[[:age=4.1:]&" + - "[[:dt=circle:]" + - "[:dt=sub:]" + - "[:dt=super:]" + - "[:dt=small:]" + - "[:dt=square:]" + - "[:dt=vertical:]" + - 
"[[:block=Kangxi_Radicals:]-[:cn:]]" + - "[[:toNFKC=/[ ().0-9/°]/:]-[:toNFKC=/^.$/:]]" + - "[[:defaultignorablecodepoint:]&[:cn:]]" + - "[:block=Hangul Compatibility Jamo:]" + - "[[:block=Halfwidth_And_Fullwidth_Forms:]&[:sc=Hang:]]" + - "[:block=tags:]" + - "]]]"; + String test = + "[[[:age=4.1:]&" + + "[:toNFM!=@toNFKC_CF@:]]-[[:age=4.1:]&" + + "[[:dt=circle:]" + + "[:dt=sub:]" + + "[:dt=super:]" + + "[:dt=small:]" + + "[:dt=square:]" + + "[:dt=vertical:]" + + "[[:block=Kangxi_Radicals:]-[:cn:]]" + + "[[:toNFKC=/[ ().0-9/°]/:]-[:toNFKC=/^.$/:]]" + + "[[:defaultignorablecodepoint:]&[:cn:]]" + + "[:block=Hangul Compatibility Jamo:]" + + "[[:block=Halfwidth_And_Fullwidth_Forms:]&[:sc=Hang:]]" + + "[:block=tags:]" + + "]]]"; UnicodeSet source = UnicodeSetUtilities.parseUnicodeSet(test); String derived = UnicodeUtilities.getPrettySet(source, false, false); - assertTrue ("contains 00A0", derived.contains("00A0")); + assertTrue("contains 00A0", derived.contains("00A0")); logln(derived); } @@ -149,9 +152,11 @@ public void TestPretty() { // checkProperties("[:isEncEUCKR:]", "[\\u00B0]", "[\u0350]"); // } - @EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + @EnabledIf( + value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", + disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") @Test - public void TestU60 () { + public void TestU60() { logln("ICU Version: " + VersionInfo.ICU_VERSION.toString()); logln("Unicode Data Version: " + UCharacter.getUnicodeVersion().toString()); logln("Java Version: " + System.getProperty("java.version")); @@ -173,7 +178,7 @@ public void TestU60 () { } @Test - public void TestUCA () { + public void TestUCA() { checkUca("[:uca=0304:]", "[\t]"); checkUca("[:uca2=05 9E:]", "[Øø]"); checkUca("[:uca2.5=81 81 01:]", "[DŽǢ]"); @@ -190,11 +195,16 @@ private void checkUca(String ucaPropValue, String containedItemsString) { } } - 
@EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + @EnabledIf( + value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", + disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") @Test public void TestICUEnums() { UnicodeSet nonchars = UnicodeSetUtilities.parseUnicodeSet("\\p{noncharactercodepoint}"); - assertEquals("Nonchars",new UnicodeSet("[:noncharactercodepoint:]").complement().complement(), nonchars.complement().complement()); + assertEquals( + "Nonchars", + new UnicodeSet("[:noncharactercodepoint:]").complement().complement(), + nonchars.complement().complement()); XPropertyFactory factory = XPropertyFactory.make(); for (int propEnum = UProperty.INT_START; propEnum < UProperty.INT_LIMIT; ++propEnum) { @@ -203,7 +213,6 @@ public void TestICUEnums() { for (int propEnum = UProperty.BINARY_START; propEnum < UProperty.BINARY_LIMIT; ++propEnum) { checkProperty(factory, propEnum); } - } @Test @@ -223,42 +232,49 @@ public void TestICUStringProps() { String icuValue; try { switch (propEnum) { - case UProperty.BIDI_PAIRED_BRACKET: - icuValue = UTF16.valueOf(UCharacter.getBidiPairedBracket(i)); - break; - case UProperty.CASE_FOLDING: - icuValue = UCharacter.foldCase(UTF16.valueOf(i), true); - break; - case UProperty.LOWERCASE_MAPPING: - icuValue = UCharacter.toLowerCase(UTF16.valueOf(i)); - break; - case UProperty.TITLECASE_MAPPING: - icuValue = UCharacter.toTitleCase(UTF16.valueOf(i), titleIter); - break; - case UProperty.UPPERCASE_MAPPING: - icuValue = UCharacter.toUpperCase(UTF16.valueOf(i)); - break; - default: - icuValue = UCharacter.getStringPropertyValue(propEnum, i, NameChoice.SHORT); - if (propEnum == UProperty.AGE) { - icuValue = icuValue.equals("0.0.0.0") ? 
"unassigned" - : VersionInfo.getInstance(icuValue).getVersionString(2, 2); - } + case UProperty.BIDI_PAIRED_BRACKET: + icuValue = UTF16.valueOf(UCharacter.getBidiPairedBracket(i)); + break; + case UProperty.CASE_FOLDING: + icuValue = UCharacter.foldCase(UTF16.valueOf(i), true); + break; + case UProperty.LOWERCASE_MAPPING: + icuValue = UCharacter.toLowerCase(UTF16.valueOf(i)); + break; + case UProperty.TITLECASE_MAPPING: + icuValue = UCharacter.toTitleCase(UTF16.valueOf(i), titleIter); + break; + case UProperty.UPPERCASE_MAPPING: + icuValue = UCharacter.toUpperCase(UTF16.valueOf(i)); + break; + default: + icuValue = + UCharacter.getStringPropertyValue( + propEnum, i, NameChoice.SHORT); + if (propEnum == UProperty.AGE) { + icuValue = + icuValue.equals("0.0.0.0") + ? "unassigned" + : VersionInfo.getInstance(icuValue) + .getVersionString(2, 2); + } } } catch (Exception e) { errln(propNameLong + "\t" + e.getMessage()); - if (++errorCount > 5) break; else continue; + if (++errorCount > 5) break; + else continue; } String propValue = prop3.getValue(i); - if (!Objects.equal(icuValue, propValue)) { // do to avoid verbose mode being every character + if (!Objects.equal( + icuValue, propValue)) { // do to avoid verbose mode being every character assertEquals("string value", icuValue, propValue); - if (++errorCount > 5) break; else continue; + if (++errorCount > 5) break; + else continue; } } } } - @Test public void TestICUDoubleProps() { XPropertyFactory factory = XPropertyFactory.make(); @@ -293,7 +309,8 @@ private void checkProperty(XPropertyFactory factory, int propEnum) { logln(propName); for (int value = min; value <= max; ++value) { UnicodeSet icuSet = new UnicodeSet().applyIntPropertyValue(propEnum, value); - String valueName = UCharacter.getPropertyValueName(propEnum, value, NameChoice.SHORT); + String valueName = + UCharacter.getPropertyValueName(propEnum, value, NameChoice.SHORT); if (valueName == null) { valueName = UCharacter.getPropertyValueName(propEnum, value, 
NameChoice.LONG); } @@ -313,9 +330,15 @@ private void checkProperty(XPropertyFactory factory, int propEnum) { assertEquals(propName + "=" + valueName, icuSet, toolSet); } if (propName.equals("gc")) { - toolValues.removeAll(Arrays.asList("Cased_Letter, Letter, Mark, Number, Other, Punctuation, Separator, Symbol".split(", "))); + toolValues.removeAll( + Arrays.asList( + "Cased_Letter, Letter, Mark, Number, Other, Punctuation, Separator, Symbol" + .split(", "))); } - if (!assertEquals(propName + " should have no extra values: ", Collections.EMPTY_SET, toolValues)) { + if (!assertEquals( + propName + " should have no extra values: ", + Collections.EMPTY_SET, + toolValues)) { int debug = 0; } } catch (Exception e) { @@ -330,7 +353,8 @@ private void checkProperty(XPropertyFactory factory, int propEnum) { // UnicodeProperty prop2 = factory.getProperty("enc_Latin2"); // UnicodeMap map = prop.getUnicodeMap(); // UnicodeMap map2 = prop2.getUnicodeMap(); - // for (String value : Builder.with(new TreeSet()).addAll(map.values()).addAll(map2.values()).get()) { + // for (String value : Builder.with(new + // TreeSet()).addAll(map.values()).addAll(map2.values()).get()) { // logln(value + "\t" + map.getSet(value) + "\t" + map2.getSet(value)); // } // UnicodeSet set = UnicodeSetUtilities.parseUnicodeSet("[:enc_Latin1=/61/:]"); @@ -340,7 +364,7 @@ private void checkProperty(XPropertyFactory factory, int propEnum) { public static Stream charsetProvider() { final SortedMap charsets = Charset.availableCharsets(); final List args = new ArrayList(charsets.size()); - int count = (int)(5 + charsets.size()*getInclusion()/10.0); + int count = (int) (5 + charsets.size() * getInclusion() / 10.0); for (final Map.Entry e : Charset.availableCharsets().entrySet()) { if (--count < 0) break; args.add(arguments(e.getKey(), e.getValue())); @@ -391,37 +415,37 @@ public void TestScriptSpecials() { // assertNotEquals("Hant", 0, set.size()); UnicodeSet set2 = 
UnicodeSetUtilities.parseUnicodeSet("[:scx=Arab,Syrc:]"); assertNotEquals("Arab Syrc", 0, set2.size()); - } @Test public void TestGC() { - Map> SPECIAL_GC = new LinkedHashMap>(); + Map> SPECIAL_GC = + new LinkedHashMap>(); String[][] extras = { - {"Other", "C", "[[:Cc:][:Cf:][:Cn:][:Co:][:Cs:]]"}, - {"Letter", "L", "[[:Ll:][:Lm:][:Lo:][:Lt:][:Lu:]]"}, - {"Cased_Letter", "LC", "[[:Ll:][:Lt:][:Lu:]]"}, - {"Mark", "M", "[[:Mc:][:Me:][:Mn:]]"}, - {"Number", "N", "[[:Nd:][:Nl:][:No:]]"}, - {"Punctuation", "P", "[[:Pc:][:Pd:][:Pe:][:Pf:][:Pi:][:Po:][:Ps:]]"}, - {"Symbol", "S", "[[:Sc:][:Sk:][:Sm:][:So:]]"}, - {"Separator", "Z", "[[:Zl:][:Zp:][:Zs:]]"}, + {"Other", "C", "[[:Cc:][:Cf:][:Cn:][:Co:][:Cs:]]"}, + {"Letter", "L", "[[:Ll:][:Lm:][:Lo:][:Lt:][:Lu:]]"}, + {"Cased_Letter", "LC", "[[:Ll:][:Lt:][:Lu:]]"}, + {"Mark", "M", "[[:Mc:][:Me:][:Mn:]]"}, + {"Number", "N", "[[:Nd:][:Nl:][:No:]]"}, + {"Punctuation", "P", "[[:Pc:][:Pd:][:Pe:][:Pf:][:Pi:][:Po:][:Ps:]]"}, + {"Symbol", "S", "[[:Sc:][:Sk:][:Sm:][:So:]]"}, + {"Separator", "Z", "[[:Zl:][:Zp:][:Zs:]]"}, }; String[] gcs = {"General_Category=", "", "gc="}; /* -gc ; C ; Other # Cc | Cf | Cn | Co | Cs -gc ; Cc ; Control ; cntrl -gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu -gc ; LC ; Cased_Letter # Ll | Lt | Lu -gc ; M ; Mark # Mc | Me | Mn -gc ; N ; Number # Nd | Nl | No -gc ; Nd ; Decimal_Number ; digit -gc ; P ; Punctuation ; punct # Pc | Pd | Pe | Pf | Pi | Po | Ps -gc ; S ; Symbol # Sc | Sk | Sm | So -gc ; Z ; Separator # Zl | Zp | Zs - */ + gc ; C ; Other # Cc | Cf | Cn | Co | Cs + gc ; Cc ; Control ; cntrl + gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu + gc ; LC ; Cased_Letter # Ll | Lt | Lu + gc ; M ; Mark # Mc | Me | Mn + gc ; N ; Number # Nd | Nl | No + gc ; Nd ; Decimal_Number ; digit + gc ; P ; Punctuation ; punct # Pc | Pd | Pe | Pf | Pi | Po | Ps + gc ; S ; Symbol # Sc | Sk | Sm | So + gc ; Z ; Separator # Zl | Zp | Zs + */ for (String[] extra : extras) { UnicodeSet expected = new UnicodeSet(extra[2]).freeze(); for 
(String test : extra) { @@ -432,19 +456,23 @@ public void TestGC() { } } } - assertEquals("Coverage:\t", new UnicodeSet("[:any:]"), UnicodeSetUtilities.parseUnicodeSet("[[:C:][:L:][:M:][:N:][:P:][:S:][:Z:]]")); + assertEquals( + "Coverage:\t", + new UnicodeSet("[:any:]"), + UnicodeSetUtilities.parseUnicodeSet("[[:C:][:L:][:M:][:N:][:P:][:S:][:Z:]]")); } @Test public void TestNF() { - for (String nf : new String[]{"d", "c", "kd", "kc"}) { + for (String nf : new String[] {"d", "c", "kd", "kc"}) { checkSetsEqual("[:isnf" + nf + ":]", "[:nf" + nf + "qc!=N:]"); checkSetsEqual("[:isnf" + nf + ":]", "[:tonf" + nf + "=@cp@:]"); } } - - @EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + @EnabledIf( + value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", + disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") @Test public void TestSets() { @@ -452,8 +480,8 @@ public void TestSets() { checkProperties("[:toLowercase=a:]", "[Aa]", "[b]"); checkProperties("[:subhead=/Mayanist/:]", "[\uA726]"); - //checkProperties("[[:script=*latin:]-[:script=latin:]]"); - //checkProperties("[[:script=**latin:]-[:script=latin:]]"); + // checkProperties("[[:script=*latin:]-[:script=latin:]]"); + // checkProperties("[[:script=**latin:]-[:script=latin:]]"); checkProperties("abc-m", "[d]"); // checkProperties("[:usage=common:]", "[9]"); @@ -490,14 +518,24 @@ void checkProperties(String testString, String containsSet, String doesntContain UnicodeSet contains = new UnicodeSet(containsSet); if (!tc1.containsAll(contains)) { UnicodeSet missing = new UnicodeSet(contains).removeAll(tc1); - errln(tc1 + "\t=\t" + tc1.complement().complement() + "\t\nDoesn't contain " + missing); + errln( + tc1 + + "\t=\t" + + tc1.complement().complement() + + "\t\nDoesn't contain " + + missing); } } if (doesntContainSet != null) { UnicodeSet doesntContain = new UnicodeSet(doesntContainSet); if 
(!tc1.containsNone(doesntContain)) { UnicodeSet extra = new UnicodeSet(doesntContain).retainAll(tc1); - errln(tc1 + "\t=\t" + tc1.complement().complement() + "\t\nContains some of" + extra); + errln( + tc1 + + "\t=\t" + + tc1.complement().complement() + + "\t\nContains some of" + + extra); } } } @@ -514,12 +552,15 @@ private void checkSetsEqual(String... unicodeSetPatterns) { } } - @EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + @EnabledIf( + value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", + disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") @Test public void TestSetSyntax() { - //System.out.println("Script for A6E6: " + script + ", " + UScript.getName(script) + ", " + script2); + // System.out.println("Script for A6E6: " + script + ", " + UScript.getName(script) + ", " + + // script2); checkProperties("[:subhead=/Syllables/:]", "[\u1200]"); - //showIcuEnums(); + // showIcuEnums(); checkProperties("\\p{ccc:0}", "\\p{ccc=0}", "[\u0308]"); checkProperties("\\p{isNFC}", "[:ASCII:]", "[\u212B]"); checkProperties("[:isNFC=no:]", "[\u212B]", "[:ASCII:]"); @@ -532,7 +573,13 @@ public void TestSetSyntax() { UnicodeProperty prop = factory.getProperty("tonfkccf"); String trans2 = prop.getValue('\u2065'); if (!trans1.equals(trans2)) { - errln("mapping of \u2065 " + UCharacter.getName('\u2065') + "," + trans1 + "," + trans2); + errln( + "mapping of \u2065 " + + UCharacter.getName('\u2065') + + "," + + trans1 + + "," + + trans2); } checkProperties("[:tonfkccf=/^$/:]", "[:di:]", "[abc]"); checkProperties("[:ccc=/3/:]", "[\u0308]"); @@ -544,7 +591,8 @@ public void TestSetSyntax() { checkProperties("[:^sc:Latn:]", "[\u0308]"); checkProperties("[:sc≠Latn:]", "[\u0308]"); checkSetsEqual("[:sc≠Latn:]", "[:^sc:Latn:]", "[:^sc=Latn:]", "[:sc!=Latn:]"); - checkSetsEqual("[:sc=Latn:]", "[:sc:Latn:]", "[:^sc≠Latn:]", "[:^sc!=Latn:]", "[:^sc!:Latn:]"); + 
checkSetsEqual( + "[:sc=Latn:]", "[:sc:Latn:]", "[:^sc≠Latn:]", "[:^sc!=Latn:]", "[:^sc!:Latn:]"); try { checkProperties("[:linebreak:]", "[\u0308]"); @@ -571,13 +619,9 @@ public void TestSetSyntax() { checkProperties("[:alphabetic=f:]", "[\u0308]"); checkProperties("[:alphabetic=n:]", "[\u0308]"); - checkProperties("\\p{idna2003=disallowed}", "[\\u0001]"); checkProperties("\\p{idna=valid}", "[\u0308]"); checkProperties("\\p{uts46=valid}", "[\u0308]"); checkProperties("\\p{idna2008=disallowed}", "[A]"); } - - - } diff --git a/pom.xml b/pom.xml index 3887b0e33..9ef97a70a 100644 --- a/pom.xml +++ b/pom.xml @@ -1,136 +1,164 @@ - 4.0.0 - - org.unicode.unicodetools - unicodetools-parent - 1.0.0 - Parent of Unicode Tools - pom - - - Unicode-DFS-2016 - - - - - https://unicode.org/cldr - - - scm:git:https://github.com/unicode-org/unicodetools.git - - - - - 70.0.1-SNAPSHOT-cldr-2021-09-15 - - - 0.0.0-SNAPSHOT-e849e51d51 - - - - 11 - 11 - - - UTF-8 - UTF-8 - - - 5.7.2 - - 3.0.0-M5 - - - - unicodetools-testutils - unicodetools - UnicodeJsps - - - - - - - org.unicode.unicodetools - unicodetools - ${project.version} - - - - - com.ibm.icu - icu4j-for-cldr - ${icu.version} - - - - com.ibm.icu - utilities-for-cldr - ${icu.version} - - - - org.unicode.cldr - cldr-code - ${cldr.version} - - - - org.junit.jupiter - junit-jupiter - ${junit-version} - - - - org.unicode.unicodetools - unicodetools-testutils - ${project.version} - - - - - - - - maven-surefire-plugin - ${maven-surefire-plugin-version} - - - - 13.0.0 - UNITTEST - true - ${project.basedir}/.. 
- - ${project.basedir}/../Generated - - -Xmx10g -enableassertions - - - - maven-failsafe-plugin - ${maven-surefire-plugin-version} - - - - - - - githubicu - GitHub unicode-org/icu Apache Maven Packages - https://maven.pkg.github.com/unicode-org/icu - - - githubcldr - GitHub unicode-org/cldr Apache Maven Packages - https://maven.pkg.github.com/unicode-org/cldr - - + 4.0.0 + + org.unicode.unicodetools + unicodetools-parent + 1.0.0 + Parent of Unicode Tools + pom + + + Unicode-DFS-2016 + + + + + https://unicode-org.github.io/unicodetools/ + + + scm:git:https://github.com/unicode-org/unicodetools.git + + + + + 70.0.1-SNAPSHOT-cldr-2021-09-15 + + + 0.0.0-SNAPSHOT-e849e51d51 + + + + 11 + 11 + + + UTF-8 + UTF-8 + + + 5.7.2 + + 3.0.0-M5 + + 2.22.5 + + + + unicodetools-testutils + unicodetools + UnicodeJsps + + + + + + + org.unicode.unicodetools + unicodetools + ${project.version} + + + + + com.ibm.icu + icu4j-for-cldr + ${icu.version} + + + + com.ibm.icu + utilities-for-cldr + ${icu.version} + + + + org.unicode.cldr + cldr-code + ${cldr.version} + + + + org.junit.jupiter + junit-jupiter + ${junit-version} + + + + org.unicode.unicodetools + unicodetools-testutils + ${project.version} + + + + + + + + maven-surefire-plugin + ${maven-surefire-plugin-version} + + + + 13.0.0 + UNITTEST + true + ${project.basedir}/.. 
+ + ${project.basedir}/../Generated + + -Xmx10g -enableassertions + + + + maven-failsafe-plugin + ${maven-surefire-plugin-version} + + + + com.diffplug.spotless + spotless-maven-plugin + ${spotless.version} + + + + NONE + + + + + + + 1.15.0 + + false + + + + + + + + + + githubicu + GitHub unicode-org/icu Apache Maven Packages + https://maven.pkg.github.com/unicode-org/icu + + + githubcldr + GitHub unicode-org/cldr Apache Maven Packages + https://maven.pkg.github.com/unicode-org/cldr + + diff --git a/unicodetools-testutils/src/main/java/org/unicode/unittest/TestFmwkMinusMinus.java b/unicodetools-testutils/src/main/java/org/unicode/unittest/TestFmwkMinusMinus.java index 452ea5263..9cc215190 100644 --- a/unicodetools-testutils/src/main/java/org/unicode/unittest/TestFmwkMinusMinus.java +++ b/unicodetools-testutils/src/main/java/org/unicode/unittest/TestFmwkMinusMinus.java @@ -1,18 +1,14 @@ package org.unicode.unittest; -import static org.junit.jupiter.api.Assumptions.assumeFalse; - +import com.ibm.icu.text.UnicodeSet; import java.util.LinkedList; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; - import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; -import com.ibm.icu.text.UnicodeSet; - public class TestFmwkMinusMinus { public List errLines = new LinkedList<>(); public Logger logger = Logger.getLogger(getClass().getSimpleName()); @@ -27,9 +23,9 @@ public void tearDown() { Assertions.assertEquals(0, errLines.size(), "errln()\n" + String.join("\n", errLines)); } - /** * Collect an error and keep going + * * @param s */ public void errln(String s) { @@ -64,6 +60,7 @@ public boolean assertNotEquals(String msg, Object a, Object b) { Assertions.assertNotEquals(a, b, msg); return true; } + public boolean assertNotNull(String msg, Object a) { Assertions.assertNotNull(a, msg); return true; @@ -90,18 +87,19 @@ public boolean isVerbose() { public void msg(String message, 
int level, boolean incCount, boolean newln) { if (level == ERR) { errln(message); - } else if(level == WARN) { + } else if (level == WARN) { warnln(message); } else { logln(message); } } - static final private boolean LOG_KNOWN_ISSUE = Boolean.parseBoolean(System.getProperty("LOG_KNOWN_ISSUE", "true")); + private static final boolean LOG_KNOWN_ISSUE = + Boolean.parseBoolean(System.getProperty("LOG_KNOWN_ISSUE", "true")); protected boolean logKnownIssue(String a, String b) { if (LOG_KNOWN_ISSUE == true) { - System.err.println("-DLOG_KNOWN_ISSUE=true (set to false to fail): " + a + ", "+ b); + System.err.println("-DLOG_KNOWN_ISSUE=true (set to false to fail): " + a + ", " + b); return true; } else { return false; @@ -120,7 +118,12 @@ public boolean assertContains(String test, UnicodeSet expectedSubset, UnicodeSet if (!actual.containsAll(expectedSubset)) { UnicodeSet inExpected = new UnicodeSet(expectedSubset).removeAll(actual); UnicodeSet has = new UnicodeSet(actual).removeAll(expectedSubset); - errln(test + " missing: " + toPattern("?", inExpected) + ", has: " + toPattern("*", has)); + errln( + test + + " missing: " + + toPattern("?", inExpected) + + ", has: " + + toPattern("*", has)); return false; } else { logln("OK\t\t" + test); @@ -170,11 +173,14 @@ public static String toPattern(String title, UnicodeSet primary) { return primary == null ? title : primary.toPattern(false); } - /** - * Copied from TestFmwk. Low level assertion. - */ - public boolean handleAssert(boolean result, String message, - Object expected, Object actual, String relation, boolean flip) { + /** Copied from TestFmwk. Low level assertion. */ + public boolean handleAssert( + boolean result, + String message, + Object expected, + Object actual, + String relation, + boolean flip) { if (!result || isVerbose()) { if (message == null) { message = ""; @@ -184,53 +190,55 @@ public boolean handleAssert(boolean result, String message, } relation = relation == null ? 
", got " : " " + relation + " "; if (result) { - logln("OK " + message + ": " - + (flip ? expected + relation + actual : expected)); + logln("OK " + message + ": " + (flip ? expected + relation + actual : expected)); } else { // assert must assume errors are true errors and not just warnings // so cannot warnln here - errln( message - + ": expected" - + (flip ? relation + expected : " " + expected - + (actual != null ? relation + actual : ""))); + errln( + message + + ": expected" + + (flip + ? relation + expected + : " " + + expected + + (actual != null ? relation + actual : ""))); } } return result; } - private final static Integer inclusion = Integer.parseInt(System.getProperty("UNICODETOOLS_INCLUSION", "5")); - private final static Boolean verbose = Boolean.parseBoolean(System.getProperty("UNICODETOOLS_VERBOSE", "false")); - private final static Boolean runBroken = Boolean.parseBoolean(System.getProperty("UNICODETOOLS_RUN_BROKEN_TEST", "false")); + private static final Integer inclusion = + Integer.parseInt(System.getProperty("UNICODETOOLS_INCLUSION", "5")); + private static final Boolean verbose = + Boolean.parseBoolean(System.getProperty("UNICODETOOLS_VERBOSE", "false")); + private static final Boolean runBroken = + Boolean.parseBoolean(System.getProperty("UNICODETOOLS_RUN_BROKEN_TEST", "false")); + static { System.err.println("UNICODETOOLS_INCLUSION=" + inclusion); System.err.println("UNICODETOOLS_VERBOSE=" + verbose); System.err.println("UNICODETOOLS_RUN_BROKEN_TEST=" + runBroken); } /** - * set property: UNICODETOOLS_INCLUSION - * 0 = fewest tests, 5 is normal build (default), 10 is most tests + * set property: UNICODETOOLS_INCLUSION 0 = fewest tests, 5 is normal build (default), 10 is + * most tests */ public static int getInclusion() { return inclusion; } - /** - * Set property: UNICODETOOLS_VERBOSE - * Defalt false - */ + /** Set property: UNICODETOOLS_VERBOSE Defalt false */ public boolean getVerbose() { return verbose; } /** - * Set property: 
UNICODETOOLS_RUN_BROKEN_TEST - * Default false - * Set true to run known-broken tests - * To use: add this annotation: - * @EnabledIf(value = "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless UNICODETOOLS_RUN_BROKEN_TEST=true") + * Set property: UNICODETOOLS_RUN_BROKEN_TEST Default false Set true to run known-broken tests + * To use: add this annotation: @EnabledIf(value = + * "org.unicode.unittest.TestFmwkMinusMinus#getRunBroken", disabledReason = "Skip unless + * UNICODETOOLS_RUN_BROKEN_TEST=true") */ public static boolean getRunBroken() { return runBroken; } - } diff --git a/unicodetools/src/main/java/com/ibm/icu/dev/tool/UOption.java b/unicodetools/src/main/java/com/ibm/icu/dev/tool/UOption.java index 45a12694f..d6f2f2fd4 100644 --- a/unicodetools/src/main/java/com/ibm/icu/dev/tool/UOption.java +++ b/unicodetools/src/main/java/com/ibm/icu/dev/tool/UOption.java @@ -1,40 +1,38 @@ /* -********************************************************************** -* Copyright (c) 2002-2004, International Business Machines -* Corporation and others. All Rights Reserved. -********************************************************************** -* Author: Alan Liu -* Created: November 15 2002 -* Since: ICU 2.4 -********************************************************************** -*/ + ********************************************************************** + * Copyright (c) 2002-2004, International Business Machines + * Corporation and others. All Rights Reserved. + ********************************************************************** + * Author: Alan Liu + * Created: November 15 2002 + * Since: ICU 2.4 + ********************************************************************** + */ package com.ibm.icu.dev.tool; /** - * A command-line option. A UOption specifies the name of an option - * and whether or not it takes an argument. It is a mutable object - * that later contains the option argument, if any, and a boolean + * A command-line option. 
A UOption specifies the name of an option and whether or not it takes an + * argument. It is a mutable object that later contains the option argument, if any, and a boolean * flag stating whether the option was seen or not. * - * The static method parseArgs() takes an array of command-line - * arguments and an array of UOptions and parses the command-line - * arguments. + *

The static method parseArgs() takes an array of command-line arguments and an array of + * UOptions and parses the command-line arguments. * - * This deliberately resembles the icu4c file uoption.[ch]. + *

This deliberately resembles the icu4c file uoption.[ch]. */ public class UOption { // Deliberated public data members - public String longName; - public String value; - public Fn optionFn; - public Object context; - public char shortName; - public int hasArg; + public String longName; + public String value; + public Fn optionFn; + public Object context; + public char shortName; + public int hasArg; public boolean doesOccur; // Values of hasArg - public static final int NO_ARG = 0; + public static final int NO_ARG = 0; public static final int REQUIRES_ARG = 1; public static final int OPTIONAL_ARG = 2; @@ -44,137 +42,154 @@ public interface Fn { int handle(UOption option); } - /** - * Create a UOption with the given attributes. - */ - public static UOption create(String aLongName, - char aShortName, - int hasArgument) { + /** Create a UOption with the given attributes. */ + public static UOption create(String aLongName, char aShortName, int hasArgument) { return new UOption(aLongName, aShortName, hasArgument); } - /** - * Create a UOption with the given attributes. - * Synonym for create(), for C compatibility. - */ - public static UOption DEF(String aLongName, - char aShortName, - int hasArgument) { + /** Create a UOption with the given attributes. Synonym for create(), for C compatibility. */ + public static UOption DEF(String aLongName, char aShortName, int hasArgument) { return create(aLongName, aShortName, hasArgument); } // Standard canned options. These create a new object when // called. Since the UOption object is mutable, we cannot use // static final instances. 
- public static UOption HELP_H() { return create("help", 'h', NO_ARG); } - public static UOption HELP_QUESTION_MARK() { return create("help", '?', NO_ARG); } - public static UOption VERBOSE() { return create("verbose", 'v', NO_ARG); } - public static UOption QUIET() { return create("quiet", 'q', NO_ARG); } - public static UOption VERSION() { return create("version", 'V', NO_ARG); } - public static UOption COPYRIGHT() { return create("copyright", 'c', NO_ARG); } + public static UOption HELP_H() { + return create("help", 'h', NO_ARG); + } + + public static UOption HELP_QUESTION_MARK() { + return create("help", '?', NO_ARG); + } + + public static UOption VERBOSE() { + return create("verbose", 'v', NO_ARG); + } - public static UOption DESTDIR() { return create("destdir", 'd', REQUIRES_ARG); } - public static UOption SOURCEDIR() { return create("sourcedir", 's', REQUIRES_ARG); } - public static UOption ENCODING() { return create("encoding", 'e', REQUIRES_ARG); } - public static UOption ICUDATADIR() { return create("icudatadir", 'i', REQUIRES_ARG); } - public static UOption PACKAGE_NAME() { return create("package-name", 'p', REQUIRES_ARG); } - public static UOption BUNDLE_NAME() { return create("bundle-name", 'b', REQUIRES_ARG); } + public static UOption QUIET() { + return create("quiet", 'q', NO_ARG); + } + + public static UOption VERSION() { + return create("version", 'V', NO_ARG); + } + + public static UOption COPYRIGHT() { + return create("copyright", 'c', NO_ARG); + } + + public static UOption DESTDIR() { + return create("destdir", 'd', REQUIRES_ARG); + } + + public static UOption SOURCEDIR() { + return create("sourcedir", 's', REQUIRES_ARG); + } + + public static UOption ENCODING() { + return create("encoding", 'e', REQUIRES_ARG); + } + + public static UOption ICUDATADIR() { + return create("icudatadir", 'i', REQUIRES_ARG); + } + + public static UOption PACKAGE_NAME() { + return create("package-name", 'p', REQUIRES_ARG); + } + + public static UOption BUNDLE_NAME() 
{ + return create("bundle-name", 'b', REQUIRES_ARG); + } /** * Java Command line argument parser. * - * This function takes the argv[] command line and a description of - * the program's options in form of an array of UOption structures. - * Each UOption defines a long and a short name (a string and a character) - * for options like "--foo" and "-f". + *

This function takes the argv[] command line and a description of the program's options in + * form of an array of UOption structures. Each UOption defines a long and a short name (a + * string and a character) for options like "--foo" and "-f". * - * Each option is marked with whether it does not take an argument, - * requires one, or optionally takes one. The argument may follow in - * the same argv[] entry for short options, or it may always follow - * in the next argv[] entry. + *

Each option is marked with whether it does not take an argument, requires one, or + * optionally takes one. The argument may follow in the same argv[] entry for short options, or + * it may always follow in the next argv[] entry. * - * An argument is in the next argv[] entry for both long and short name - * options, except it is taken from directly behind the short name in - * its own argv[] entry if there are characters following the option letter. - * An argument in its own argv[] entry must not begin with a '-' - * unless it is only the '-' itself. There is no restriction of the - * argument format if it is part of the short name options's argv[] entry. + *

An argument is in the next argv[] entry for both long and short name options, except it is + * taken from directly behind the short name in its own argv[] entry if there are characters + * following the option letter. An argument in its own argv[] entry must not begin with a '-' + * unless it is only the '-' itself. There is no restriction of the argument format if it is + * part of the short name options's argv[] entry. * - * The argument is stored in the value field of the corresponding - * UOption entry, and the doesOccur field is set to 1 if the option - * is found at all. + *

The argument is stored in the value field of the corresponding UOption entry, and the + * doesOccur field is set to 1 if the option is found at all. * - * Short name options without arguments can be collapsed into a single - * argv[] entry. After an option letter takes an argument, following - * letters will be taken as its argument. + *

Short name options without arguments can be collapsed into a single argv[] entry. After an + * option letter takes an argument, following letters will be taken as its argument. * - * If the same option is found several times, then the last - * argument value will be stored in the value field. + *

If the same option is found several times, then the last argument value will be stored in + * the value field. * - * For each option, a function can be called. This could be used - * for options that occur multiple times and all arguments are to - * be collected. + *

For each option, a function can be called. This could be used for options that occur + * multiple times and all arguments are to be collected. * - * All options are removed from the argv[] array itself. If the parser - * is successful, then it returns the number of remaining non-option - * strings. (Unlike C, the Java argv[] array does NOT contain - * the program name in argv[0].) + *

All options are removed from the argv[] array itself. If the parser is successful, then it + * returns the number of remaining non-option strings. (Unlike C, the Java argv[] array does NOT + * contain the program name in argv[0].) * - * An option "--" ends option processing; everything after this - * remains in the argv[] array. + *

An option "--" ends option processing; everything after this remains in the argv[] array. * - * An option string "-" alone is treated as a non-option. + *

An option string "-" alone is treated as a non-option. * - * If an option is not recognized or an argument missing, then - * the parser returns with the negative index of the argv[] entry - * where the error was detected. + *

If an option is not recognized or an argument missing, then the parser returns with the + * negative index of the argv[] entry where the error was detected. * * @param argv this parameter is modified - * @param start the first argument in argv[] to examine. Must be - * 0..argv.length-1. Arguments from 0..start-1 are ignored. + * @param start the first argument in argv[] to examine. Must be 0..argv.length-1. Arguments + * from 0..start-1 are ignored. * @param options this parameter is modified - * @return the number of unprocessed arguments in argv[], including - * arguments 0..start-1. + * @return the number of unprocessed arguments in argv[], including arguments 0..start-1. */ public static int parseArgs(String argv[], int start, UOption options[]) { String arg; - int i=start, remaining=start; + int i = start, remaining = start; char c; - boolean stopOptions=false; + boolean stopOptions = false; - while(i1 && arg.charAt(0)=='-') { + while (i < argv.length) { + arg = argv[i]; + if (!stopOptions && arg.length() > 1 && arg.charAt(0) == '-') { /* process an option */ - c=arg.charAt(1); - UOption option=null; - arg=arg.substring(2); - if(c=='-') { + c = arg.charAt(1); + UOption option = null; + arg = arg.substring(2); + if (c == '-') { /* process a long option */ - if(arg.length()==0) { + if (arg.length() == 0) { /* stop processing options after "--" */ - stopOptions=true; + stopOptions = true; } else { /* search for the option string */ int j; - for(j=0; j1 && argv[i+1].charAt(0)=='-')) { + if (i + 1 < argv.length + && !(argv[i + 1].length() > 1 + && argv[i + 1].charAt(0) == '-')) { /* argument in the next argv[], and there is not an option in there */ - option.value=argv[++i]; - } else if(option.hasArg==REQUIRES_ARG) { + option.value = argv[++i]; + } else if (option.hasArg == REQUIRES_ARG) { /* there is no argument, but one is required: return with error */ syntaxError("Option " + argv[i] + " lacks required argument"); } @@ -182,48 +197,52 @@ public static int 
parseArgs(String argv[], int start, UOption options[]) { } } else { /* process one or more short options */ - for (;;) { + for (; ; ) { /* search for the option letter */ int j; - for(j=0; j1 && argv[i+1].charAt(0)=='-')) { + } else if (i + 1 < argv.length + && !(argv[i + 1].length() > 1 + && argv[i + 1].charAt(0) == '-')) { /* argument in the next argv[], and there is not an option in there */ - option.value=argv[++i]; + option.value = argv[++i]; /* this break is redundant because we know that *arg==0 */ break; - } else if(option.hasArg==REQUIRES_ARG) { + } else if (option.hasArg == REQUIRES_ARG) { /* there is no argument, but one is required: return with error */ syntaxError("Option -" + c + " lacks required argument"); } } /* get the next option letter */ - option=null; - if (arg.length()==0) break; - c=arg.charAt(0); - arg=arg.substring(1); + option = null; + if (arg.length() == 0) break; + c = arg.charAt(0); + arg = arg.substring(1); } } - if(option!=null && option.optionFn!=null && option.optionFn.handle(option)<0) { + if (option != null + && option.optionFn != null + && option.optionFn.handle(option) < 0) { /* the option function was called and returned an error */ syntaxError("Option handler failed for " + argv[i]); } @@ -232,7 +251,7 @@ public static int parseArgs(String argv[], int start, UOption options[]) { ++i; } else { /* move a non-option up in argv[] */ - argv[remaining++]=arg; + argv[remaining++] = arg; ++i; } } @@ -241,34 +260,28 @@ public static int parseArgs(String argv[], int start, UOption options[]) { /** * Allows the default to be set in an option list. + * * @param s * @return this - */public UOption setDefault(String s) { + */ + public UOption setDefault(String s) { value = s; return this; } - /** - * Convenient method. - */ + /** Convenient method. */ public static int parseArgs(String argv[], UOption options[]) { return parseArgs(argv, 0, options); } - /** - * Constructor. 
- */ - private UOption(String aLongName, - char aShortName, - int hasArgument) { + /** Constructor. */ + private UOption(String aLongName, char aShortName, int hasArgument) { longName = aLongName; shortName = aShortName; hasArg = hasArgument; } - /** - * Throw an exception indicating a syntax error. - */ + /** Throw an exception indicating a syntax error. */ private static void syntaxError(String message) { throw new IllegalArgumentException("Error in argument list: " + message); } diff --git a/unicodetools/src/main/java/org/unicode/bidi/BidiConformanceTestBuilder.java b/unicodetools/src/main/java/org/unicode/bidi/BidiConformanceTestBuilder.java index 606183bce..73ccb5e97 100644 --- a/unicodetools/src/main/java/org/unicode/bidi/BidiConformanceTestBuilder.java +++ b/unicodetools/src/main/java/org/unicode/bidi/BidiConformanceTestBuilder.java @@ -46,261 +46,309 @@ public class BidiConformanceTestBuilder { private static final byte PDI = BidiReference.PDI; private static byte[][] extraTests = { - // Test data provided by Behdad Esfahbod, Dov Grobgeld, Aharon Lanin, and Roozbeh Pournader - { AL, AL, R, WS, R, R, WS, L, L, L, WS, L, L, L, WS, R, R, WS, AL, R, R, R, R, R, R }, - { AL, L, AL, WS, L, R, L, LRO, WS, AL, L, R, L, R, CS, WS, EN, EN, EN, CS, AN, AN, AN, WS, L, R, L, R, PDF }, - { AL, L, L, WS, RLE, WS, EN, EN, EN, CS, AN, AN, AN, LRO, R, R, AL, RLO, WS, L, L, L, L, PDF, WS, L, R, AL, CS }, - { AL, ON, FSI, L, PDI, LRI, L, PDI, RLI, R, PDI, ON, ET, EN }, - { AL, R, AL, R, R, AL, WS, ON, EN, EN, ON }, - { AL, R, R, R, R, AL, R, WS, AL, R, R, WS, R, AL, AL, R, WS, R, R, R, AL, R, CS, WS, AL, EN, EN, EN, CS, EN, EN, AN }, - { AL, R, WS, AL, R, AL, AL, R, AL, WS, LRE, PDF, WS, EN, EN, EN, ON, EN, EN, AN, WS, R, R, AL, AL, WS, R, R, AL, ON }, - { AL, R, WS, AL, R, AL, AL, R, AL, WS, LRE, WS, PDF, WS, EN, EN, EN, ON, EN, EN, AN, WS, R, AL, R, R }, - { AL, WS, R, AL, AL, R, WS, AL, R, R, LRE, PDF, WS, AL, R, R, ON }, - { AL, WS, R, AL, R, R, WS, AL, R, R, WS, R, AL, 
AL, R, WS, R, R, R, AL, R, CS, WS, EN, EN, EN, ET, CS, EN, EN, AN }, - { AN, ON, FSI, L, PDI, LRI, L, PDI, RLI, L, PDI, ON, AL }, - { EN, ES, LRI, PDI, EN, ES, RLI, PDI, EN, ES, FSI, PDI, EN }, - { ET, LRI, PDI, EN, RLI, PDI, ET, FSI, PDI, EN }, - { FSI, ON, AN, AN, PDI }, - { FSI, ON, AN, L, R, PDI }, - { FSI, ON, EN, EN, PDI }, - { FSI, ON, EN, R, L, PDI }, - { FSI, ON, FSI, L, PDI, LRI, L, PDI, R, PDI }, - { FSI, ON, FSI, R, PDI, RLI, R, PDI, L, PDI }, - { FSI, ON, LRE, R, PDF, L, PDI }, - { FSI, ON, LRO, R, PDF, L, PDI }, - { FSI, ON, RLE, L, PDF, R, PDI }, - { FSI, ON, RLO, L, PDF, R, PDI }, - { L, FSI, R, WS, PDI, WS, RLI, WS, LRI, WS, PDI, PDI, WS }, - { L, L, L, L, LRO, WS, L, L, PDF, PDF, PDF, WS, RLO, WS, L, WS, L, L, L, L, WS, L, L, L, PDF }, - { L, L, L, L, WS, L, L, L, WS, AN, AN, RLE, PDF, AN, AN, WS, L, L, L, L, L, L, L, ON }, - { L, L, L, L, WS, L, L, WS, LRO, R, R, R, R, PDF }, - { L, L, L, WS, L, L, WS, R, R, AL, WS, AL, AL, R, WS, L, L, WS, L, L, L, L, L, L }, - { L, L, L, WS, R, AL, R, R, S, R, EN, WS, S, S, L, L, L }, - { L, LRI, R, PDI, FSI, R, PDI, RLI, R, PDI, R }, - { L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, EN, EN, EN, CS, EN, EN, AN, CS, WS, R, R, ON }, - { L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, EN, EN, EN, CS, WS, EN, EN, AN, CS, WS, R, R, ON }, - { L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, AN, AN, AN, CS, AN, AN, AN, CS, WS, R, R, ON }, - { L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, AN, AN, AN, CS, WS, AN, AN, AN, CS, WS, R, R, ON }, - { L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, ON, EN, EN, EN, CS, EN, EN, AN, ON, CS, WS, R, R, ON }, - { L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, ON, EN, EN, EN, CS, WS, EN, EN, AN, ON, CS, WS, R, R, ON }, - { L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, ON, AN, AN, AN, CS, AN, AN, AN, ON, CS, WS, R, R, ON }, - { L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, ON, AN, AN, AN, CS, WS, AN, AN, AN, ON, CS, WS, R, R, ON }, - { L, ON, FSI, R, PDI, RLI, 
R, PDI, LRI, AL, PDI, ON, L }, - { LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - ON, RLO, L, LRE, RLI, LRE, RLE, LRO, RLO, PDI, PDF, L, PDF, ON }, - { LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - ON, RLO, L, LRI, L, RLE, LRE, RLO, LRO, L, PDI, L, PDF, ON }, - { LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - ON, RLO, LRI, RLE, LRE, RLO, LRO, ON, PDI, L, PDI, L, PDF, ON }, - { LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - RLE, ON, LRO, R, LRI, R, LRE, RLE, LRO, RLO, R, PDI, R, PDF, ON }, - { LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - RLE, ON, LRO, R, RLI, ON, LRO, RLE, RLO, LRE, ON, PDI, R, PDF, ON }, - { LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, - ON, RLO, L, LRE, RLI, LRE, RLE, LRO, RLO, PDI, PDF, L, PDF, ON }, - { LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, - ON, RLO, L, LRI, L, RLE, LRE, RLO, LRO, L, PDI, L, PDF, ON }, - { LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, 
LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, - ON, RLO, LRI, RLE, LRE, RLO, LRO, ON, PDI, L, PDI, L, PDF, ON }, - { LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, - RLE, ON, LRO, R, LRI, R, LRE, RLE, LRO, RLO, R, PDI, R, PDF, ON }, - { LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, - LRE, - RLE, ON, LRO, R, RLI, ON, LRO, RLE, RLO, LRE, ON, PDI, R, PDF, ON }, - { LRE, RLE, LRO, RLO, LRI, ON, RLO, L, PDF, ON, PDF, ON, PDI, L }, - { L, WS, LRE, L, L, L, L, L, L, WS, RLO, L, L, R, R, PDF, WS, L, L, PDF, L, L }, - { L, WS, - LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, - LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, - LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, - LRO, - RLO, L, L, L }, - { L, WS, - LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, - LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, - LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, - LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, - LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, - LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, - LRO, LRO, LRO, - RLO, L, L, L }, - { L, WS, R, R, R, AL, WS, ON, WS, LRO, PDF, R, AL, R, R, ES }, - { ON, EN, EN, ON, WS, AL, R, AL, R, R, AL }, - { ON, EN, WS, AL, AL, R, R, R, R, R, WS, R, R, WS, AL, R, R, AL }, - { ON, FSI, L, 
PDI, LRI, L, PDI, R }, - { ON, FSI, R, AL, PDI, RLI, R, AL, PDI }, - { ON, L, EN, EN, EN, ON, R, AL, R, AL, AL, R, ON, ES, L, EN, EN, EN, ON }, - { ON, LRE, ON, LRI, ON, LRO, R, PDF, ON, PDI, ON, PDF, ON }, - { ON, LRE, ON, RLE, ON, LRO, R, RLO, L, PDI, L, PDF, R }, - { ON, LRE, ON, RLI, ON, LRO, R, PDF, ON, PDI, ON, PDF, ON }, - { ON, LRI, ON, RLI, ON, FSI, ON, PDI, ON, PDI, ON, PDI, ON }, - { ON, R, EN, EN, EN, ON, L, L, L, L, L, L, ON, ES, R, EN, EN, EN, ON }, - { ON, RLE, ON, FSI, ON, L, RLO, L, PDF, ON, PDI, ON, PDF, ON }, - { ON, RLE, ON, FSI, ON, R, RLO, L, PDF, ON, PDI, ON, PDF, ON }, - { ON, RLE, ON, LRI, ON, RLO, L, PDF, ON, PDI, ON, PDF, ON }, - { ON, RLE, ON, RLI, ON, RLO, L, PDF, ON, PDI, ON, PDF, ON }, - { ON, RLI, ON, FSI, ON, R, LRI, ON, PDI, ON, PDI, ON, PDI, ON }, - { R, AL, R, WS, AL, WS, LRO, R, AL, AL, WS, L, L, L, L, L, L, L }, - { R, AL, WS, R, AL, R, AL, WS, ON, L, L, WS, L, L, WS, AN, AN, AN, CS, WS, AN, AN, AN, CS, WS, L, L, ON }, - { R, AL, WS, R, AL, R, AL, WS, ON, L, L, WS, L, L, WS, EN, EN, EN, CS, WS, EN, EN, AN, CS, WS, L, L, ON }, - { R, AL, WS, R, AL, R, AL, WS, ON, L, L, WS, L, L, WS, L, WS, L, L, L, ON, L, ON, WS, AL, R, AL, WS, R, AL, R }, - { R, AL, WS, R, AL, R, AL, WS, ON, L, L, WS, L, L, WS, L, WS, L, L, L, ON, ON, WS, AL, R, AL, WS, R, AL, R }, - { R, FSI, L, PDI, LRI, L, PDI, RLI, L, PDI, L }, - { R, FSI, L, WS, PDI, WS, LRI, WS, RLI, WS, PDI, PDI, WS }, - { RLE, LRE, RLO, LRO, RLI, ON, LRO, R, PDF, ON, PDF, ON, PDI, R }, - { RLE, LRE, RLO, LRO, FSI, R, LRO, R, PDF, ON, PDF, ON, PDI, R }, - { RLO, RLE, WS, L, L, L, WS, L, L, L, WS, L, L, L, L, WS, LRO, R, R, AL, PDF, WS, R, R, WS }, - { R, ON, FSI, L, PDI, LRI, L, PDI, RLI, L, PDI, ON, R }, - { R, ON, LRI, L, PDI, FSI, L, PDI, RLI, L, PDI, ON, EN }, - { R, R, AL, WS, R, AL, R, R, AL, WS, R, R, WS, EN, ES, EN, ES, ES, EN }, - { R, R, AL, WS, RLE, L, L, L, L, WS, LRE, R, R, AL, WS, L, L, L }, - { R, R, R, AL, WS, R, AL, AL, R, WS, R, R, R, AL, R, CS, WS, L, L, L, L, L, L, 
EN, EN, EN, CS, EN, EN, AN }, - { R, R, R, R, AL, WS, EN, ON, EN, WS, EN, ON, EN, WS, EN, ES, EN, WS, EN, ET, EN }, - { R, R, R, WS, ET, EN, EN }, - - // Test data provided by Laurențiu Iancu - { R, LRI, RLE, PDI, R }, - { R, LRI, RLE, R, PDI }, - { R, LRI, R, RLE, PDI }, - { R, LRI, RLE, PDI, AL }, - { R, LRI, RLE, AL, PDI }, - { R, LRI, AL, RLE, PDI }, - { R, LRI, RLE, PDI, EN }, - { R, LRI, RLE, EN, PDI }, - { R, LRI, EN, RLE, PDI }, - { R, LRI, RLE, PDI, AN }, - { R, LRI, RLE, AN, PDI }, - { R, LRI, AN, RLE, PDI }, - - { R, LRI, RLO, PDI, R }, - { R, LRI, RLO, R, PDI }, - { R, LRI, R, RLO, PDI }, - { R, LRI, RLO, PDI, AL }, - { R, LRI, RLO, AL, PDI }, - { R, LRI, AL, RLO, PDI }, - { R, LRI, RLO, PDI, EN }, - { R, LRI, RLO, EN, PDI }, - { R, LRI, EN, RLO, PDI }, - { R, LRI, RLO, PDI, AN }, - { R, LRI, RLO, AN, PDI }, - { R, LRI, AN, RLO, PDI }, - - { R, LRI, LRE, PDI, R }, - { R, LRI, LRE, R, PDI }, - { R, LRI, R, LRE, PDI }, - { R, LRI, LRE, PDI, AL }, - { R, LRI, LRE, AL, PDI }, - { R, LRI, AL, LRE, PDI }, - { R, LRI, LRE, PDI, EN }, - { R, LRI, LRE, EN, PDI }, - { R, LRI, EN, LRE, PDI }, - { R, LRI, LRE, PDI, AN }, - { R, LRI, LRE, AN, PDI }, - { R, LRI, AN, LRE, PDI }, - - { R, LRI, LRO, PDI, R }, - { R, LRI, LRO, R, PDI }, - { R, LRI, R, LRO, PDI }, - { R, LRI, LRO, PDI, AL }, - { R, LRI, LRO, AL, PDI }, - { R, LRI, AL, LRO, PDI }, - { R, LRI, LRO, PDI, EN }, - { R, LRI, LRO, EN, PDI }, - { R, LRI, EN, LRO, PDI }, - { R, LRI, LRO, PDI, AN }, - { R, LRI, LRO, AN, PDI }, - { R, LRI, AN, LRO, PDI }, - - { L, RLI, LRE, PDI, L }, - { L, RLI, LRE, L, PDI }, - { L, RLI, L, LRE, PDI }, - { L, RLI, LRE, PDI, AL }, - { L, RLI, LRE, AL, PDI }, - { L, RLI, AL, LRE, PDI }, - { L, RLI, LRE, PDI, EN }, - { L, RLI, LRE, EN, PDI }, - { L, RLI, EN, LRE, PDI }, - { L, RLI, LRE, PDI, AN }, - { L, RLI, LRE, AN, PDI }, - { L, RLI, AN, LRE, PDI }, - - { L, RLI, LRO, PDI, L }, - { L, RLI, LRO, L, PDI }, - { L, RLI, L, LRO, PDI }, - { L, RLI, LRO, PDI, AL }, - { L, RLI, LRO, AL, 
PDI }, - { L, RLI, AL, LRO, PDI }, - { L, RLI, LRO, PDI, EN }, - { L, RLI, LRO, EN, PDI }, - { L, RLI, EN, LRO, PDI }, - { L, RLI, LRO, PDI, AN }, - { L, RLI, LRO, AN, PDI }, - { L, RLI, AN, LRO, PDI }, - - { L, RLI, RLE, PDI, L }, - { L, RLI, RLE, L, PDI }, - { L, RLI, L, RLE, PDI }, - { L, RLI, RLE, PDI, AL }, - { L, RLI, RLE, AL, PDI }, - { L, RLI, AL, RLE, PDI }, - { L, RLI, RLE, PDI, EN }, - { L, RLI, RLE, EN, PDI }, - { L, RLI, EN, RLE, PDI }, - { L, RLI, RLE, PDI, AN }, - { L, RLI, RLE, AN, PDI }, - { L, RLI, AN, RLE, PDI }, - - { L, RLI, RLO, PDI, L }, - { L, RLI, RLO, L, PDI }, - { L, RLI, L, RLO, PDI }, - { L, RLI, RLO, PDI, AL }, - { L, RLI, RLO, AL, PDI }, - { L, RLI, AL, RLO, PDI }, - { L, RLI, RLO, PDI, EN }, - { L, RLI, RLO, EN, PDI }, - { L, RLI, EN, RLO, PDI }, - { L, RLI, RLO, PDI, AN }, - { L, RLI, RLO, AN, PDI }, - { L, RLI, AN, RLO, PDI }, + // Test data provided by Behdad Esfahbod, Dov Grobgeld, Aharon Lanin, and Roozbeh Pournader + {AL, AL, R, WS, R, R, WS, L, L, L, WS, L, L, L, WS, R, R, WS, AL, R, R, R, R, R, R}, + { + AL, L, AL, WS, L, R, L, LRO, WS, AL, L, R, L, R, CS, WS, EN, EN, EN, CS, AN, AN, AN, WS, + L, R, L, R, PDF + }, + { + AL, L, L, WS, RLE, WS, EN, EN, EN, CS, AN, AN, AN, LRO, R, R, AL, RLO, WS, L, L, L, L, + PDF, WS, L, R, AL, CS + }, + {AL, ON, FSI, L, PDI, LRI, L, PDI, RLI, R, PDI, ON, ET, EN}, + {AL, R, AL, R, R, AL, WS, ON, EN, EN, ON}, + { + AL, R, R, R, R, AL, R, WS, AL, R, R, WS, R, AL, AL, R, WS, R, R, R, AL, R, CS, WS, AL, + EN, EN, EN, CS, EN, EN, AN + }, + { + AL, R, WS, AL, R, AL, AL, R, AL, WS, LRE, PDF, WS, EN, EN, EN, ON, EN, EN, AN, WS, R, R, + AL, AL, WS, R, R, AL, ON + }, + { + AL, R, WS, AL, R, AL, AL, R, AL, WS, LRE, WS, PDF, WS, EN, EN, EN, ON, EN, EN, AN, WS, + R, AL, R, R + }, + {AL, WS, R, AL, AL, R, WS, AL, R, R, LRE, PDF, WS, AL, R, R, ON}, + { + AL, WS, R, AL, R, R, WS, AL, R, R, WS, R, AL, AL, R, WS, R, R, R, AL, R, CS, WS, EN, EN, + EN, ET, CS, EN, EN, AN + }, + {AN, ON, FSI, L, PDI, LRI, L, PDI, 
RLI, L, PDI, ON, AL}, + {EN, ES, LRI, PDI, EN, ES, RLI, PDI, EN, ES, FSI, PDI, EN}, + {ET, LRI, PDI, EN, RLI, PDI, ET, FSI, PDI, EN}, + {FSI, ON, AN, AN, PDI}, + {FSI, ON, AN, L, R, PDI}, + {FSI, ON, EN, EN, PDI}, + {FSI, ON, EN, R, L, PDI}, + {FSI, ON, FSI, L, PDI, LRI, L, PDI, R, PDI}, + {FSI, ON, FSI, R, PDI, RLI, R, PDI, L, PDI}, + {FSI, ON, LRE, R, PDF, L, PDI}, + {FSI, ON, LRO, R, PDF, L, PDI}, + {FSI, ON, RLE, L, PDF, R, PDI}, + {FSI, ON, RLO, L, PDF, R, PDI}, + {L, FSI, R, WS, PDI, WS, RLI, WS, LRI, WS, PDI, PDI, WS}, + { + L, L, L, L, LRO, WS, L, L, PDF, PDF, PDF, WS, RLO, WS, L, WS, L, L, L, L, WS, L, L, L, + PDF + }, + {L, L, L, L, WS, L, L, L, WS, AN, AN, RLE, PDF, AN, AN, WS, L, L, L, L, L, L, L, ON}, + {L, L, L, L, WS, L, L, WS, LRO, R, R, R, R, PDF}, + {L, L, L, WS, L, L, WS, R, R, AL, WS, AL, AL, R, WS, L, L, WS, L, L, L, L, L, L}, + {L, L, L, WS, R, AL, R, R, S, R, EN, WS, S, S, L, L, L}, + {L, LRI, R, PDI, FSI, R, PDI, RLI, R, PDI, R}, + { + L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, EN, EN, EN, CS, EN, EN, AN, CS, WS, R, + R, ON + }, + { + L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, EN, EN, EN, CS, WS, EN, EN, AN, CS, + WS, R, R, ON + }, + { + L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, AN, AN, AN, CS, AN, AN, AN, CS, WS, R, + R, ON + }, + { + L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, AN, AN, AN, CS, WS, AN, AN, AN, CS, + WS, R, R, ON + }, + { + L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, ON, EN, EN, EN, CS, EN, EN, AN, ON, + CS, WS, R, R, ON + }, + { + L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, ON, EN, EN, EN, CS, WS, EN, EN, AN, + ON, CS, WS, R, R, ON + }, + { + L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, ON, AN, AN, AN, CS, AN, AN, AN, ON, + CS, WS, R, R, ON + }, + { + L, L, WS, L, L, L, L, WS, ON, R, R, WS, R, R, WS, ON, AN, AN, AN, CS, WS, AN, AN, AN, + ON, CS, WS, R, R, ON + }, + {L, ON, FSI, R, PDI, RLI, R, PDI, LRI, AL, PDI, ON, L}, + { + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, 
LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, ON, RLO, L, LRE, RLI, + LRE, RLE, LRO, RLO, PDI, PDF, L, PDF, ON + }, + { + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, ON, RLO, L, LRI, L, + RLE, LRE, RLO, LRO, L, PDI, L, PDF, ON + }, + { + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, ON, RLO, LRI, RLE, LRE, RLO, + LRO, ON, PDI, L, PDI, L, PDF, ON + }, + { + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, RLE, ON, LRO, R, LRI, R, + LRE, RLE, LRO, RLO, R, PDI, R, PDF, ON + }, + { + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, RLE, ON, LRO, R, RLI, ON, + LRO, RLE, RLO, LRE, ON, PDI, R, PDF, ON + }, + { + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, ON, RLO, L, LRE, RLI, LRE, RLE, + LRO, RLO, PDI, PDF, L, PDF, ON + }, + { + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, ON, RLO, L, LRI, L, RLE, LRE, + RLO, LRO, L, PDI, L, PDF, ON + }, + { + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, 
LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, ON, RLO, LRI, RLE, LRE, RLO, LRO, ON, + PDI, L, PDI, L, PDF, ON + }, + { + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, RLE, ON, LRO, R, LRI, R, LRE, RLE, + LRO, RLO, R, PDI, R, PDF, ON + }, + { + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, + LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, LRE, RLE, ON, LRO, R, RLI, ON, LRO, RLE, + RLO, LRE, ON, PDI, R, PDF, ON + }, + {LRE, RLE, LRO, RLO, LRI, ON, RLO, L, PDF, ON, PDF, ON, PDI, L}, + {L, WS, LRE, L, L, L, L, L, L, WS, RLO, L, L, R, R, PDF, WS, L, L, PDF, L, L}, + { + L, WS, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, + LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, RLO, L, L, L + }, + { + L, WS, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, + LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, + LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, + LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, LRO, RLO, L, L, L + }, + {L, WS, R, R, R, AL, WS, ON, WS, LRO, PDF, R, AL, R, R, ES}, + {ON, EN, EN, ON, WS, AL, R, AL, R, R, AL}, + {ON, EN, WS, AL, AL, R, R, R, R, R, WS, R, R, WS, AL, R, R, AL}, + {ON, FSI, L, PDI, LRI, L, PDI, R}, + {ON, FSI, R, AL, PDI, RLI, R, AL, PDI}, + {ON, L, EN, EN, EN, ON, R, AL, R, AL, AL, R, 
ON, ES, L, EN, EN, EN, ON}, + {ON, LRE, ON, LRI, ON, LRO, R, PDF, ON, PDI, ON, PDF, ON}, + {ON, LRE, ON, RLE, ON, LRO, R, RLO, L, PDI, L, PDF, R}, + {ON, LRE, ON, RLI, ON, LRO, R, PDF, ON, PDI, ON, PDF, ON}, + {ON, LRI, ON, RLI, ON, FSI, ON, PDI, ON, PDI, ON, PDI, ON}, + {ON, R, EN, EN, EN, ON, L, L, L, L, L, L, ON, ES, R, EN, EN, EN, ON}, + {ON, RLE, ON, FSI, ON, L, RLO, L, PDF, ON, PDI, ON, PDF, ON}, + {ON, RLE, ON, FSI, ON, R, RLO, L, PDF, ON, PDI, ON, PDF, ON}, + {ON, RLE, ON, LRI, ON, RLO, L, PDF, ON, PDI, ON, PDF, ON}, + {ON, RLE, ON, RLI, ON, RLO, L, PDF, ON, PDI, ON, PDF, ON}, + {ON, RLI, ON, FSI, ON, R, LRI, ON, PDI, ON, PDI, ON, PDI, ON}, + {R, AL, R, WS, AL, WS, LRO, R, AL, AL, WS, L, L, L, L, L, L, L}, + { + R, AL, WS, R, AL, R, AL, WS, ON, L, L, WS, L, L, WS, AN, AN, AN, CS, WS, AN, AN, AN, CS, + WS, L, L, ON + }, + { + R, AL, WS, R, AL, R, AL, WS, ON, L, L, WS, L, L, WS, EN, EN, EN, CS, WS, EN, EN, AN, CS, + WS, L, L, ON + }, + { + R, AL, WS, R, AL, R, AL, WS, ON, L, L, WS, L, L, WS, L, WS, L, L, L, ON, L, ON, WS, AL, + R, AL, WS, R, AL, R + }, + { + R, AL, WS, R, AL, R, AL, WS, ON, L, L, WS, L, L, WS, L, WS, L, L, L, ON, ON, WS, AL, R, + AL, WS, R, AL, R + }, + {R, FSI, L, PDI, LRI, L, PDI, RLI, L, PDI, L}, + {R, FSI, L, WS, PDI, WS, LRI, WS, RLI, WS, PDI, PDI, WS}, + {RLE, LRE, RLO, LRO, RLI, ON, LRO, R, PDF, ON, PDF, ON, PDI, R}, + {RLE, LRE, RLO, LRO, FSI, R, LRO, R, PDF, ON, PDF, ON, PDI, R}, + {RLO, RLE, WS, L, L, L, WS, L, L, L, WS, L, L, L, L, WS, LRO, R, R, AL, PDF, WS, R, R, WS}, + {R, ON, FSI, L, PDI, LRI, L, PDI, RLI, L, PDI, ON, R}, + {R, ON, LRI, L, PDI, FSI, L, PDI, RLI, L, PDI, ON, EN}, + {R, R, AL, WS, R, AL, R, R, AL, WS, R, R, WS, EN, ES, EN, ES, ES, EN}, + {R, R, AL, WS, RLE, L, L, L, L, WS, LRE, R, R, AL, WS, L, L, L}, + { + R, R, R, AL, WS, R, AL, AL, R, WS, R, R, R, AL, R, CS, WS, L, L, L, L, L, L, EN, EN, EN, + CS, EN, EN, AN + }, + {R, R, R, R, AL, WS, EN, ON, EN, WS, EN, ON, EN, WS, EN, ES, EN, WS, EN, ET, EN}, + {R, R, R, 
WS, ET, EN, EN}, + + // Test data provided by Laurențiu Iancu + {R, LRI, RLE, PDI, R}, + {R, LRI, RLE, R, PDI}, + {R, LRI, R, RLE, PDI}, + {R, LRI, RLE, PDI, AL}, + {R, LRI, RLE, AL, PDI}, + {R, LRI, AL, RLE, PDI}, + {R, LRI, RLE, PDI, EN}, + {R, LRI, RLE, EN, PDI}, + {R, LRI, EN, RLE, PDI}, + {R, LRI, RLE, PDI, AN}, + {R, LRI, RLE, AN, PDI}, + {R, LRI, AN, RLE, PDI}, + {R, LRI, RLO, PDI, R}, + {R, LRI, RLO, R, PDI}, + {R, LRI, R, RLO, PDI}, + {R, LRI, RLO, PDI, AL}, + {R, LRI, RLO, AL, PDI}, + {R, LRI, AL, RLO, PDI}, + {R, LRI, RLO, PDI, EN}, + {R, LRI, RLO, EN, PDI}, + {R, LRI, EN, RLO, PDI}, + {R, LRI, RLO, PDI, AN}, + {R, LRI, RLO, AN, PDI}, + {R, LRI, AN, RLO, PDI}, + {R, LRI, LRE, PDI, R}, + {R, LRI, LRE, R, PDI}, + {R, LRI, R, LRE, PDI}, + {R, LRI, LRE, PDI, AL}, + {R, LRI, LRE, AL, PDI}, + {R, LRI, AL, LRE, PDI}, + {R, LRI, LRE, PDI, EN}, + {R, LRI, LRE, EN, PDI}, + {R, LRI, EN, LRE, PDI}, + {R, LRI, LRE, PDI, AN}, + {R, LRI, LRE, AN, PDI}, + {R, LRI, AN, LRE, PDI}, + {R, LRI, LRO, PDI, R}, + {R, LRI, LRO, R, PDI}, + {R, LRI, R, LRO, PDI}, + {R, LRI, LRO, PDI, AL}, + {R, LRI, LRO, AL, PDI}, + {R, LRI, AL, LRO, PDI}, + {R, LRI, LRO, PDI, EN}, + {R, LRI, LRO, EN, PDI}, + {R, LRI, EN, LRO, PDI}, + {R, LRI, LRO, PDI, AN}, + {R, LRI, LRO, AN, PDI}, + {R, LRI, AN, LRO, PDI}, + {L, RLI, LRE, PDI, L}, + {L, RLI, LRE, L, PDI}, + {L, RLI, L, LRE, PDI}, + {L, RLI, LRE, PDI, AL}, + {L, RLI, LRE, AL, PDI}, + {L, RLI, AL, LRE, PDI}, + {L, RLI, LRE, PDI, EN}, + {L, RLI, LRE, EN, PDI}, + {L, RLI, EN, LRE, PDI}, + {L, RLI, LRE, PDI, AN}, + {L, RLI, LRE, AN, PDI}, + {L, RLI, AN, LRE, PDI}, + {L, RLI, LRO, PDI, L}, + {L, RLI, LRO, L, PDI}, + {L, RLI, L, LRO, PDI}, + {L, RLI, LRO, PDI, AL}, + {L, RLI, LRO, AL, PDI}, + {L, RLI, AL, LRO, PDI}, + {L, RLI, LRO, PDI, EN}, + {L, RLI, LRO, EN, PDI}, + {L, RLI, EN, LRO, PDI}, + {L, RLI, LRO, PDI, AN}, + {L, RLI, LRO, AN, PDI}, + {L, RLI, AN, LRO, PDI}, + {L, RLI, RLE, PDI, L}, + {L, RLI, RLE, L, PDI}, + {L, RLI, L, RLE, PDI}, + {L, 
RLI, RLE, PDI, AL}, + {L, RLI, RLE, AL, PDI}, + {L, RLI, AL, RLE, PDI}, + {L, RLI, RLE, PDI, EN}, + {L, RLI, RLE, EN, PDI}, + {L, RLI, EN, RLE, PDI}, + {L, RLI, RLE, PDI, AN}, + {L, RLI, RLE, AN, PDI}, + {L, RLI, AN, RLE, PDI}, + {L, RLI, RLO, PDI, L}, + {L, RLI, RLO, L, PDI}, + {L, RLI, L, RLO, PDI}, + {L, RLI, RLO, PDI, AL}, + {L, RLI, RLO, AL, PDI}, + {L, RLI, AL, RLO, PDI}, + {L, RLI, RLO, PDI, EN}, + {L, RLI, RLO, EN, PDI}, + {L, RLI, EN, RLO, PDI}, + {L, RLI, RLO, PDI, AN}, + {L, RLI, RLO, AN, PDI}, + {L, RLI, AN, RLO, PDI}, }; private static final int R_DEFAULT = -2; @@ -310,6 +358,7 @@ public class BidiConformanceTestBuilder { public static int MAX_SIZE = 4; private static BitSet SKIPS = new BitSet(); + static { // skip RLE, LRE, RLO, LRO, PDF, and BN SKIPS.set(BidiReference.RLE); @@ -392,11 +441,13 @@ public byte[] getArray() { public static void write(PrintWriter out) throws FileNotFoundException { final int[] linebreaks = new int[1]; - final Map> resultToSource = new TreeMap>(SHORTEST_FIRST); + final Map> resultToSource = + new TreeMap>(SHORTEST_FIRST); final Map condensed = new HashMap(); final Sample sample = new Sample(MAX_SIZE); - main: while (sample.next()) { + main: + while (sample.next()) { // make sure B doesn't occur in any but the last for (int i = 0; i < sample.items.size() - 1; ++i) { if (sample.items.get(i) == BidiReference.B) { @@ -408,31 +459,38 @@ public static void write(PrintWriter out) throws FileNotFoundException { final byte[] TYPELIST = sample.getArray(); linebreaks[0] = TYPELIST.length; condensed.clear(); - for (byte paragraphEmbeddingLevel = BIDI_START_LEVEL; paragraphEmbeddingLevel <= 1; ++paragraphEmbeddingLevel) { + for (byte paragraphEmbeddingLevel = BIDI_START_LEVEL; + paragraphEmbeddingLevel <= 1; + ++paragraphEmbeddingLevel) { - final String reorderedIndexes = reorderedIndexes(TYPELIST, paragraphEmbeddingLevel, linebreaks); + final String reorderedIndexes = + reorderedIndexes(TYPELIST, paragraphEmbeddingLevel, linebreaks); 
Integer bitmask = condensed.get(reorderedIndexes); if (bitmask == null) { bitmask = 0; } - final int reordered = paragraphEmbeddingLevel == R_DEFAULT ? 3 : paragraphEmbeddingLevel + 1; + final int reordered = + paragraphEmbeddingLevel == R_DEFAULT ? 3 : paragraphEmbeddingLevel + 1; bitmask |= 1 << (reordered); condensed.put(reorderedIndexes, bitmask); } for (final String reorderedIndexes : condensed.keySet()) { final Integer bitset = condensed.get(reorderedIndexes); - addResult(resultToSource, typeString + "; " + Integer.toHexString(bitset).toUpperCase(Locale.ENGLISH), reorderedIndexes); + addResult( + resultToSource, + typeString + "; " + Integer.toHexString(bitset).toUpperCase(Locale.ENGLISH), + reorderedIndexes); } } -/* - for (int i = BidiReference.TYPE_MIN; i < BidiReference.TYPE_MAX; ++i) - { - UnicodeSet data = new UnicodeSet("[:bidi_class=" + - BidiReference.typenames[i] + ":]"); - data.complement().complement(); - out.println("@Type:\t" + BidiReference.typenames[i] + ":\t" + data); - } -*/ + /* + for (int i = BidiReference.TYPE_MIN; i < BidiReference.TYPE_MAX; ++i) + { + UnicodeSet data = new UnicodeSet("[:bidi_class=" + + BidiReference.typenames[i] + ":]"); + data.complement().complement(); + out.println("@Type:\t" + BidiReference.typenames[i] + ":\t" + data); + } + */ int totalCount = 0; for (final String reorderedIndexes : resultToSource.keySet()) { out.println(); @@ -458,7 +516,9 @@ public static void write(PrintWriter out) throws FileNotFoundException { System.out.println("Done"); } - private static void addResult(Map> resultToSource, final String source, + private static void addResult( + Map> resultToSource, + final String source, final String reorderedIndexes) { Set sources = resultToSource.get(reorderedIndexes); if (sources == null) { @@ -467,7 +527,8 @@ private static void addResult(Map> resultToSource, final Str sources.add(source); } - private static String reorderedIndexes(byte[] types, byte paragraphEmbeddingLevel, int[] linebreaks) { + 
private static String reorderedIndexes( + byte[] types, byte paragraphEmbeddingLevel, int[] linebreaks) { final StringBuilder result = new StringBuilder(); final BidiReference bidi = new BidiReference(types, paragraphEmbeddingLevel); @@ -503,16 +564,16 @@ private static String reorderedIndexes(byte[] types, byte paragraphEmbeddingLeve return result.toString(); } - static Comparator SHORTEST_FIRST = new Comparator() { + static Comparator SHORTEST_FIRST = + new Comparator() { - @Override - public int compare(String o1, String o2) { - final int result = o1.length() - o2.length(); - if (result != 0) { - return result; - } - return o1.compareTo(o2); - } - - }; + @Override + public int compare(String o1, String o2) { + final int result = o1.length() - o2.length(); + if (result != 0) { + return result; + } + return o1.compareTo(o2); + } + }; } diff --git a/unicodetools/src/main/java/org/unicode/bidi/BidiReference.java b/unicodetools/src/main/java/org/unicode/bidi/BidiReference.java index 328aba9e3..da39a8548 100644 --- a/unicodetools/src/main/java/org/unicode/bidi/BidiReference.java +++ b/unicodetools/src/main/java/org/unicode/bidi/BidiReference.java @@ -14,58 +14,55 @@ * (C) Copyright Google Inc. 2013, All Rights Reserved * * Distributed under the Terms of Use in http://www.unicode.org/copyright.html. -*/ + */ /** * Reference implementation of the Unicode Bidirectional Algorithm (UAX #9). * - *

- * This implementation is not optimized for performance. It is intended as a - * reference implementation that closely follows the specification of the - * Bidirectional Algorithm in The Unicode Standard version 6.3. - *

- * Input:
- * There are two levels of input to the algorithm, since clients may prefer to - * supply some information from out-of-band sources rather than relying on the - * default behavior. + *

This implementation is not optimized for performance. It is intended as a reference + * implementation that closely follows the specification of the Bidirectional Algorithm in The + * Unicode Standard version 6.3. + * + *

Input:
+ * There are two levels of input to the algorithm, since clients may prefer to supply some + * information from out-of-band sources rather than relying on the default behavior. + * *

    - *
  1. Bidi class array - *
  2. Bidi class array, with externally supplied base line direction + *
  3. Bidi class array + *
  4. Bidi class array, with externally supplied base line direction *
- *

- * Output:
- * Output is separated into several stages as well, to better enable clients to - * evaluate various aspects of implementation conformance. + * + *

Output:
+ * Output is separated into several stages as well, to better enable clients to evaluate various + * aspects of implementation conformance. + * *

    - *
  1. levels array over entire paragraph - *
  2. reordering array over entire paragraph - *
  3. levels array over line - *
  4. reordering array over line + *
  5. levels array over entire paragraph + *
  6. reordering array over entire paragraph + *
  7. levels array over line + *
  8. reordering array over line *
- * Note that for conformance to the Unicode Bidirectional Algorithm, - * implementations are only required to generate correct reordering and - * character directionality (odd or even levels) over a line. Generating - * identical level arrays over a line is not required. Bidi explicit format - * codes (LRE, RLE, LRO, RLO, PDF) and BN can be assigned arbitrary levels and - * positions as long as the rest of the input is properly reordered. - *

- * As the algorithm is defined to operate on a single paragraph at a time, this - * implementation is written to handle single paragraphs. Thus rule P1 is - * presumed by this implementation-- the data provided to the implementation is - * assumed to be a single paragraph, and either contains no 'B' codes, or a - * single 'B' code at the end of the input. 'B' is allowed as input to - * illustrate how the algorithm assigns it a level. - *

- * Also note that rules L3 and L4 depend on the rendering engine that uses the - * result of the bidi algorithm. This implementation assumes that the rendering - * engine expects combining marks in visual order (e.g. to the left of their - * base character in RTL runs) and that it adjusts the glyphs used to render - * mirrored characters that are in RTL runs so that they render appropriately. + * + * Note that for conformance to the Unicode Bidirectional Algorithm, implementations are only + * required to generate correct reordering and character directionality (odd or even levels) over a + * line. Generating identical level arrays over a line is not required. Bidi explicit format codes + * (LRE, RLE, LRO, RLO, PDF) and BN can be assigned arbitrary levels and positions as long as the + * rest of the input is properly reordered. + * + *

As the algorithm is defined to operate on a single paragraph at a time, this implementation is + * written to handle single paragraphs. Thus rule P1 is presumed by this implementation-- the data + * provided to the implementation is assumed to be a single paragraph, and either contains no 'B' + * codes, or a single 'B' code at the end of the input. 'B' is allowed as input to illustrate how + * the algorithm assigns it a level. + * + *

Also note that rules L3 and L4 depend on the rendering engine that uses the result of the bidi + * algorithm. This implementation assumes that the rendering engine expects combining marks in + * visual order (e.g. to the left of their base character in RTL runs) and that it adjusts the + * glyphs used to render mirrored characters that are in RTL runs so that they render appropriately. * * @author Doug Felt * @author Roozbeh Pournader */ - public final class BidiReference { private final byte[] initialTypes; private byte paragraphEmbeddingLevel = -1; // undefined @@ -168,29 +165,8 @@ public final class BidiReference { /** Shorthand names of bidi type values, for error reporting. */ public static final String[] typenames = { - "L", - "LRE", - "LRO", - "R", - "AL", - "RLE", - "RLO", - "PDF", - "EN", - "ES", - "ET", - "AN", - "CS", - "NSM", - "BN", - "B", - "S", - "WS", - "ON", - "LRI", - "RLI", - "FSI", - "PDI" + "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", + "BN", "B", "S", "WS", "ON", "LRI", "RLI", "FSI", "PDI" }; // @@ -198,12 +174,10 @@ public final class BidiReference { // /** - * Initialize using an array of direction types. Types range from TYPE_MIN - * to TYPE_MAX inclusive and represent the direction codes of the characters - * in the text. + * Initialize using an array of direction types. Types range from TYPE_MIN to TYPE_MAX inclusive + * and represent the direction codes of the characters in the text. * - * @param types - * the types array + * @param types the types array */ public BidiReference(byte[] types) { validateTypes(types); @@ -214,16 +188,14 @@ public BidiReference(byte[] types) { } /** - * Initialize using an array of direction types and an externally supplied - * paragraph embedding level. The embedding level may be -1, 0, or 1. - *

- * -1 means to apply the default algorithm (rules P2 and P3), 0 is for LTR - * paragraphs, and 1 is for RTL paragraphs. + * Initialize using an array of direction types and an externally supplied paragraph embedding + * level. The embedding level may be -1, 0, or 1. + * + *

-1 means to apply the default algorithm (rules P2 and P3), 0 is for LTR paragraphs, and 1 + * is for RTL paragraphs. * - * @param types - * the types array - * @param paragraphEmbeddingLevel - * the externally supplied paragraph embedding level. + * @param types the types array + * @param paragraphEmbeddingLevel the externally supplied paragraph embedding level. */ public BidiReference(byte[] types, byte paragraphEmbeddingLevel) { validateTypes(types); @@ -236,8 +208,8 @@ public BidiReference(byte[] types, byte paragraphEmbeddingLevel) { } /** - * The algorithm. Does not include line-based processing (Rules L1, L2). - * These are applied later in the line-based phase of the algorithm. + * The algorithm. Does not include line-based processing (Rules L1, L2). These are applied later + * in the line-based phase of the algorithm. */ private void runAlgorithm() { textLength = initialTypes.length; @@ -302,19 +274,18 @@ private void runAlgorithm() { /** * Determine the matching PDI for each isolate initiator and vice versa. - *

- * Definition BD9. - *

- * At the end of this function: + * + *

Definition BD9. + * + *

At the end of this function: + * *

    - *
  • The member variable matchingPDI is set to point to the index of the - * matching PDI character for each isolate initiator character. If there is - * no matching PDI, it is set to the length of the input text. For other - * characters, it is set to -1. - *
  • The member variable matchingIsolateInitiator is set to point to the - * index of the matching isolate initiator character for each PDI character. - * If there is no matching isolate initiator, or the character is not a PDI, - * it is set to -1. + *
  • The member variable matchingPDI is set to point to the index of the matching PDI + * character for each isolate initiator character. If there is no matching PDI, it is set + * to the length of the input text. For other characters, it is set to -1. + *
  • The member variable matchingIsolateInitiator is set to point to the index of the + * matching isolate initiator character for each PDI character. If there is no matching + * isolate initiator, or the character is not a PDI, it is set to -1. *
*/ private void determineMatchingIsolates() { @@ -352,16 +323,12 @@ private void determineMatchingIsolates() { } /** - * Determines the paragraph level based on rules P2, P3. This is also used - * in rule X5c to find if an FSI should resolve to LRI or RLI. - * - * @param startIndex - * the index of the beginning of the substring - * @param endIndex - * the index of the character after the end of the string + * Determines the paragraph level based on rules P2, P3. This is also used in rule X5c to find + * if an FSI should resolve to LRI or RLI. * - * @return the resolved paragraph direction of the substring limited by - * startIndex and endIndex + * @param startIndex the index of the beginning of the substring + * @param endIndex the index of the character after the end of the string + * @return the resolved paragraph direction of the substring limited by startIndex and endIndex */ private byte determineParagraphEmbeddingLevel(int startIndex, int endIndex) { byte strongType = -1; // unknown @@ -431,9 +398,7 @@ public boolean lastDirectionalIsolateStatus() { } } - /** - * Determine explicit levels using rules X1 - X8 - */ + /** Determine explicit levels using rules X1 - X8 */ private void determineExplicitEmbeddingLevels() { directionalStatusStack stack = new directionalStatusStack(); int overflowIsolateCount, overflowEmbeddingCount, validIsolateCount; @@ -449,114 +414,113 @@ private void determineExplicitEmbeddingLevels() { // Rules X2, X3, X4, X5, X5a, X5b, X5c switch (t) { - case RLE: - case LRE: - case RLO: - case LRO: - case RLI: - case LRI: - case FSI: - boolean isIsolate = (t == RLI || t == LRI || t == FSI); - boolean isRTL = (t == RLE || t == RLO || t == RLI); - // override if this is an FSI that resolves to RLI - if (t == FSI) { - isRTL = (determineParagraphEmbeddingLevel(i + 1, matchingPDI[i]) == 1); - } - - if (isIsolate) { - resultLevels[i] = stack.lastEmbeddingLevel(); - } - - byte newLevel; - if (isRTL) { - // least greater odd - newLevel = (byte) 
((stack.lastEmbeddingLevel() + 1) | 1); - } else { - // least greater even - newLevel = (byte) ((stack.lastEmbeddingLevel() + 2) & ~1); - } + case RLE: + case LRE: + case RLO: + case LRO: + case RLI: + case LRI: + case FSI: + boolean isIsolate = (t == RLI || t == LRI || t == FSI); + boolean isRTL = (t == RLE || t == RLO || t == RLI); + // override if this is an FSI that resolves to RLI + if (t == FSI) { + isRTL = (determineParagraphEmbeddingLevel(i + 1, matchingPDI[i]) == 1); + } - if (newLevel <= MAX_DEPTH && overflowIsolateCount == 0 && overflowEmbeddingCount == 0) { if (isIsolate) { - ++validIsolateCount; + resultLevels[i] = stack.lastEmbeddingLevel(); } - // Push new embedding level, override status, and isolated - // status. - // No check for valid stack counter, since the level check - // suffices. - stack.push( - newLevel, - t == LRO ? L : t == RLO ? R : ON, - isIsolate); - // Not really part of the spec - if (!isIsolate) { - resultLevels[i] = newLevel; + byte newLevel; + if (isRTL) { + // least greater odd + newLevel = (byte) ((stack.lastEmbeddingLevel() + 1) | 1); + } else { + // least greater even + newLevel = (byte) ((stack.lastEmbeddingLevel() + 2) & ~1); } - } else { - // This is an invalid explicit formatting character, - // so apply the "Otherwise" part of rules X2-X5b. - if (isIsolate) { - ++overflowIsolateCount; - } else { // !isIsolate - if (overflowIsolateCount == 0) { - ++overflowEmbeddingCount; + + if (newLevel <= MAX_DEPTH + && overflowIsolateCount == 0 + && overflowEmbeddingCount == 0) { + if (isIsolate) { + ++validIsolateCount; + } + // Push new embedding level, override status, and isolated + // status. + // No check for valid stack counter, since the level check + // suffices. + stack.push(newLevel, t == LRO ? L : t == RLO ? 
R : ON, isIsolate); + + // Not really part of the spec + if (!isIsolate) { + resultLevels[i] = newLevel; + } + } else { + // This is an invalid explicit formatting character, + // so apply the "Otherwise" part of rules X2-X5b. + if (isIsolate) { + ++overflowIsolateCount; + } else { // !isIsolate + if (overflowIsolateCount == 0) { + ++overflowEmbeddingCount; + } } } - } - break; + break; - // Rule X6a - case PDI: - if (overflowIsolateCount > 0) { - --overflowIsolateCount; - } else if (validIsolateCount == 0) { - // do nothing - } else { - overflowEmbeddingCount = 0; - while (!stack.lastDirectionalIsolateStatus()) { + // Rule X6a + case PDI: + if (overflowIsolateCount > 0) { + --overflowIsolateCount; + } else if (validIsolateCount == 0) { + // do nothing + } else { + overflowEmbeddingCount = 0; + while (!stack.lastDirectionalIsolateStatus()) { + stack.pop(); + } stack.pop(); + --validIsolateCount; } - stack.pop(); - --validIsolateCount; - } - resultLevels[i] = stack.lastEmbeddingLevel(); - break; + resultLevels[i] = stack.lastEmbeddingLevel(); + break; - // Rule X7 - case PDF: - // Not really part of the spec - resultLevels[i] = stack.lastEmbeddingLevel(); - - if (overflowIsolateCount > 0) { - // do nothing - } else if (overflowEmbeddingCount > 0) { - --overflowEmbeddingCount; - } else if (!stack.lastDirectionalIsolateStatus() && stack.depth() >= 2) { - stack.pop(); - } else { - // do nothing - } - break; + // Rule X7 + case PDF: + // Not really part of the spec + resultLevels[i] = stack.lastEmbeddingLevel(); - case B: - // Rule X8. + if (overflowIsolateCount > 0) { + // do nothing + } else if (overflowEmbeddingCount > 0) { + --overflowEmbeddingCount; + } else if (!stack.lastDirectionalIsolateStatus() && stack.depth() >= 2) { + stack.pop(); + } else { + // do nothing + } + break; - // These values are reset for clarity, in this implementation B - // can only occur as the last code in the array. 
- stack.empty(); - overflowIsolateCount = 0; - overflowEmbeddingCount = 0; - validIsolateCount = 0; - resultLevels[i] = paragraphEmbeddingLevel; - break; + case B: + // Rule X8. - default: - resultLevels[i] = stack.lastEmbeddingLevel(); - if (stack.lastDirectionalOverrideStatus() != ON) { - resultTypes[i] = stack.lastDirectionalOverrideStatus(); - } - break; + // These values are reset for clarity, in this implementation B + // can only occur as the last code in the array. + stack.empty(); + overflowIsolateCount = 0; + overflowEmbeddingCount = 0; + validIsolateCount = 0; + resultLevels[i] = paragraphEmbeddingLevel; + break; + + default: + resultLevels[i] = stack.lastEmbeddingLevel(); + if (stack.lastDirectionalOverrideStatus() != ON) { + resultTypes[i] = stack.lastDirectionalOverrideStatus(); + } + break; } } } @@ -565,9 +529,9 @@ private class IsolatingRunSequence { private final int[] indexes; // indexes to the original string private final byte[] types; // type of each character using the index private byte[] resolvedLevels; // resolved levels after application of - // rules + // rules private final int length; // length of isolating run sequence in - // characters + // characters private final byte level; private final byte sos, eos; @@ -595,8 +559,8 @@ public IsolatingRunSequence(int[] inputIndexes) { succLevel = paragraphEmbeddingLevel; } else { int limit = indexes[length - 1] + 1; // the first character - // after the end of - // run sequence + // after the end of + // run sequence while (limit < textLength && isRemovedByX9(initialTypes[limit])) { ++limit; } @@ -608,13 +572,15 @@ public IsolatingRunSequence(int[] inputIndexes) { /** * 3) resolving weak types Rules W1-W7. * - * Note that some weak types (EN, AN) remain after this processing is - * complete. + *

Note that some weak types (EN, AN) remain after this processing is complete. */ public void resolveWeakTypes() { // on entry, only these types remain - assertOnly(new byte[] { L, R, AL, EN, ES, ET, AN, CS, B, S, WS, ON, NSM, LRI, RLI, FSI, PDI }); + assertOnly( + new byte[] { + L, R, AL, EN, ES, ET, AN, CS, B, S, WS, ON, NSM, LRI, RLI, FSI, PDI + }); // Rule W1. // Changes all NSMs. @@ -684,7 +650,7 @@ public void resolveWeakTypes() { if (types[i] == ET) { // locate end of sequence int runstart = i; - int runlimit = findRunLimit(runstart, length, new byte[] { ET }); + int runlimit = findRunLimit(runstart, length, new byte[] {ET}); // check values at ends of sequence byte t = runstart == 0 ? sos : types[runstart - 1]; @@ -729,20 +695,23 @@ public void resolveWeakTypes() { } } - /** - * 6) resolving neutral types Rules N1-N2. - */ + /** 6) resolving neutral types Rules N1-N2. */ public void resolveNeutralTypes() { // on entry, only these types can be in resultTypes - assertOnly(new byte[] { L, R, EN, AN, B, S, WS, ON, RLI, LRI, FSI, PDI }); + assertOnly(new byte[] {L, R, EN, AN, B, S, WS, ON, RLI, LRI, FSI, PDI}); for (int i = 0; i < length; ++i) { byte t = types[i]; - if (t == WS || t == ON || t == B || t == S || t == RLI || t == LRI || t == FSI || t == PDI) { + if (t == WS || t == ON || t == B || t == S || t == RLI || t == LRI || t == FSI + || t == PDI) { // find bounds of run of neutrals int runstart = i; - int runlimit = findRunLimit(runstart, length, new byte[] { B, S, WS, ON, RLI, LRI, FSI, PDI }); + int runlimit = + findRunLimit( + runstart, + length, + new byte[] {B, S, WS, ON, RLI, LRI, FSI, PDI}); // determine effective types at ends of run byte leadingType; @@ -787,13 +756,11 @@ public void resolveNeutralTypes() { } } - /** - * 7) resolving implicit embedding levels Rules I1, I2. - */ + /** 7) resolving implicit embedding levels Rules I1, I2. 
*/ public void resolveImplicitLevels() { // on entry, only these types can be in resultTypes - assertOnly(new byte[] { L, R, EN, AN }); + assertOnly(new byte[] {L, R, EN, AN}); resolvedLevels = new byte[length]; setLevels(resolvedLevels, 0, length, level); @@ -836,12 +803,12 @@ public void applyLevelsAndTypes() { } /** - * Return the limit of the run consisting only of the types in validSet - * starting at index. This checks the value at index, and will return - * index if that value is not in validSet. + * Return the limit of the run consisting only of the types in validSet starting at index. + * This checks the value at index, and will return index if that value is not in validSet. */ private int findRunLimit(int index, int limit, byte[] validSet) { - loop: while (index < limit) { + loop: + while (index < limit) { byte t = types[index]; for (int i = 0; i < validSet.length; ++i) { if (t == validSet[i]) { @@ -855,21 +822,17 @@ private int findRunLimit(int index, int limit, byte[] validSet) { return limit; } - /** - * Set types from start up to (but not including) limit to newType. - */ + /** Set types from start up to (but not including) limit to newType. */ private void setTypes(int start, int limit, byte newType) { for (int i = start; i < limit; ++i) { types[i] = newType; } } - /** - * Algorithm validation. Assert that all values in types are in the - * provided set. - */ + /** Algorithm validation. Assert that all values in types are in the provided set. 
*/ private void assertOnly(byte[] codes) { - loop: for (int i = 0; i < length; ++i) { + loop: + for (int i = 0; i < length; ++i) { byte t = types[i]; for (int j = 0; j < codes.length; ++j) { if (t == codes[j]) { @@ -877,7 +840,11 @@ private void assertOnly(byte[] codes) { } } - throw new Error("invalid bidi code " + typenames[t] + " present in assertOnly at position " + indexes[i]); + throw new Error( + "invalid bidi code " + + typenames[t] + + " present in assertOnly at position " + + indexes[i]); } } } @@ -905,26 +872,33 @@ private IsolatingRunSequence[] determineIsolatingRunSequences() { int[] currentRunSequence = new int[textLength]; for (int i = 0; i < levelRuns.length; ++i) { int firstCharacter = levelRuns[i][0]; - if (initialTypes[firstCharacter] != PDI || matchingIsolateInitiator[firstCharacter] == -1) { + if (initialTypes[firstCharacter] != PDI + || matchingIsolateInitiator[firstCharacter] == -1) { int currentRunSequenceLength = 0; int run = i; do { // Copy this level run into currentRunSequence - System.arraycopy(levelRuns[run], 0, currentRunSequence, currentRunSequenceLength, levelRuns[run].length); + System.arraycopy( + levelRuns[run], + 0, + currentRunSequence, + currentRunSequenceLength, + levelRuns[run].length); currentRunSequenceLength += levelRuns[run].length; int lastCharacter = currentRunSequence[currentRunSequenceLength - 1]; byte lastType = initialTypes[lastCharacter]; - if ((lastType == LRI || lastType == RLI || lastType == FSI) && - matchingPDI[lastCharacter] != textLength) { + if ((lastType == LRI || lastType == RLI || lastType == FSI) + && matchingPDI[lastCharacter] != textLength) { run = runForCharacter[matchingPDI[lastCharacter]]; } else { break; } } while (true); - sequences[numSequences] = new IsolatingRunSequence( - Arrays.copyOf(currentRunSequence, currentRunSequenceLength)); + sequences[numSequences] = + new IsolatingRunSequence( + Arrays.copyOf(currentRunSequence, currentRunSequenceLength)); ++numSequences; } } @@ -932,12 +906,11 @@ 
private IsolatingRunSequence[] determineIsolatingRunSequences() { } /** - * Determines the level runs. Rule X9 will be applied in determining the - * runs, in the way that makes sure the characters that are supposed to be - * removed are not included in the runs. + * Determines the level runs. Rule X9 will be applied in determining the runs, in the way that + * makes sure the characters that are supposed to be removed are not included in the runs. * - * @return an array of level runs. Each level run is described as an array - * of indexes into the input string. + * @return an array of level runs. Each level run is described as an array of indexes into the + * input string. */ private int[][] determineLevelRuns() { // temporary array to hold the run @@ -951,7 +924,7 @@ private int[][] determineLevelRuns() { for (int i = 0; i < textLength; ++i) { if (!isRemovedByX9(initialTypes[i])) { if (resultLevels[i] != currentLevel) { // we just encountered a - // new run + // new run // Wrap up last run if (currentLevel >= 0) { // only wrap it up if there was a run int[] run = Arrays.copyOf(temporaryRun, runLength); @@ -977,15 +950,12 @@ private int[][] determineLevelRuns() { } /** - * Assign level information to characters removed by rule X9. This is for - * ease of relating the level information to the original input data. Note - * that the levels assigned to these codes are arbitrary, they're chosen so - * as to avoid breaking level runs. + * Assign level information to characters removed by rule X9. This is for ease of relating the + * level information to the original input data. Note that the levels assigned to these codes + * are arbitrary, they're chosen so as to avoid breaking level runs. 
* - * @param textLength - * the length of the data after compression - * @return the length of the data (original length of types array supplied - * to constructor) + * @param textLength the length of the data after compression + * @return the length of the data (original length of types array supplied to constructor) */ private int assignLevelsToCharactersRemovedByX9() { for (int i = 0; i < initialTypes.length; ++i) { @@ -1022,16 +992,15 @@ private int assignLevelsToCharactersRemovedByX9() { /** * Return levels array breaking lines at offsets in linebreaks.
* Rule L1. - *

- * The returned levels array contains the resolved level for each bidi code - * passed to the constructor. - *

- * The linebreaks array must include at least one value. The values must be - * in strictly increasing order (no duplicates) between 1 and the length of - * the text, inclusive. The last value must be the length of the text. * - * @param linebreaks - * the offsets at which to break the paragraph + *

The returned levels array contains the resolved level for each bidi code passed to the + * constructor. + * + *

The linebreaks array must include at least one value. The values must be in strictly + * increasing order (no duplicates) between 1 and the length of the text, inclusive. The last + * value must be the length of the text. + * + * @param linebreaks the offsets at which to break the paragraph * @return the resolved levels of the text */ public byte[] getLevels(int[] linebreaks) { @@ -1050,7 +1019,7 @@ public byte[] getLevels(int[] linebreaks) { validateLineBreaks(linebreaks, textLength); byte[] result = resultLevels.clone(); // will be returned to - // caller + // caller // don't worry about linebreaks since if there is a break within // a series of WS values preceding S, the linebreak itself @@ -1064,7 +1033,7 @@ public byte[] getLevels(int[] linebreaks) { // Rule L1, clause three. for (int j = i - 1; j >= 0; --j) { if (isWhitespace(initialTypes[j])) { // including format - // codes + // codes result[j] = paragraphEmbeddingLevel; } else { break; @@ -1093,25 +1062,23 @@ public byte[] getLevels(int[] linebreaks) { /** * Return reordering array breaking lines at offsets in linebreaks. - *

- * The reordering array maps from a visual index to a logical index. Lines - * are concatenated from left to right. So for example, the fifth character - * from the left on the third line is + * + *

The reordering array maps from a visual index to a logical index. Lines are concatenated + * from left to right. So for example, the fifth character from the left on the third line is * *

      * getReordering(linebreaks)[linebreaks[1] + 4]
      * 
* - * (linebreaks[1] is the position after the last character of the second - * line, which is also the index of the first character on the third line, - * and adding four gets the fifth character from the left). - *

- * The linebreaks array must include at least one value. The values must be - * in strictly increasing order (no duplicates) between 1 and the length of - * the text, inclusive. The last value must be the length of the text. + * (linebreaks[1] is the position after the last character of the second line, which is also the + * index of the first character on the third line, and adding four gets the fifth character from + * the left). * - * @param linebreaks - * the offsets at which to break the paragraph. + *

The linebreaks array must include at least one value. The values must be in strictly + * increasing order (no duplicates) between 1 and the length of the text, inclusive. The last + * value must be the length of the text. + * + * @param linebreaks the offsets at which to break the paragraph. */ public int[] getReordering(int[] linebreaks) { validateLineBreaks(linebreaks, textLength); @@ -1122,8 +1089,8 @@ public int[] getReordering(int[] linebreaks) { } /** - * Return multiline reordering array for a given level array. Reordering - * does not occur across a line break. + * Return multiline reordering array for a given level array. Reordering does not occur across a + * line break. */ private static int[] computeMultilineReordering(byte[] levels, int[] linebreaks) { int[] result = new int[levels.length]; @@ -1147,9 +1114,9 @@ private static int[] computeMultilineReordering(byte[] levels, int[] linebreaks) } /** - * Return reordering array for a given level array. This reorders a single - * line. The reordering is a visual to logical map. For example, the - * leftmost char is string.charAt(order[0]). Rule L2. + * Return reordering array for a given level array. This reorders a single line. The reordering + * is a visual to logical map. For example, the leftmost char is string.charAt(order[0]). Rule + * L2. */ private static int[] computeReordering(byte[] levels) { int lineLength = levels.length; @@ -1203,65 +1170,54 @@ private static int[] computeReordering(byte[] levels) { return result; } - /** - * Return the base level of the paragraph. - */ + /** Return the base level of the paragraph. */ public byte getBaseLevel() { return paragraphEmbeddingLevel; } // --- internal utilities ------------------------------------------------- - /** - * Return true if the type is considered a whitespace type for the line - * break rules. - */ + /** Return true if the type is considered a whitespace type for the line break rules. 
*/ private static boolean isWhitespace(byte biditype) { switch (biditype) { - case LRE: - case RLE: - case LRO: - case RLO: - case PDF: - case LRI: - case RLI: - case FSI: - case PDI: - case BN: - case WS: - return true; - default: - return false; + case LRE: + case RLE: + case LRO: + case RLO: + case PDF: + case LRI: + case RLI: + case FSI: + case PDI: + case BN: + case WS: + return true; + default: + return false; } } - /** - * Return true if the type is one of the types removed in X9. - */ + /** Return true if the type is one of the types removed in X9. */ private static boolean isRemovedByX9(byte biditype) { switch (biditype) { - case LRE: - case RLE: - case LRO: - case RLO: - case PDF: - case BN: - return true; - default: - return false; + case LRE: + case RLE: + case LRO: + case RLO: + case PDF: + case BN: + return true; + default: + return false; } } - /** - * Return the strong type (L or R) corresponding to the level. - */ + /** Return the strong type (L or R) corresponding to the level. */ private static byte typeForLevel(int level) { return ((level & 0x1) == 0) ? L : R; } - /** - * Set levels from start up to (but not including) limit to newLevel. - */ + /** Set levels from start up to (but not including) limit to newLevel. */ private void setLevels(byte[] levels, int start, int limit, byte newLevel) { for (int i = start; i < limit; ++i) { levels[i] = newLevel; @@ -1270,9 +1226,7 @@ private void setLevels(byte[] levels, int start, int limit, byte newLevel) { // --- input validation --------------------------------------------------- - /** - * Throw exception if type array is invalid. - */ + /** Throw exception if type array is invalid. */ private static void validateTypes(byte[] types) { if (types == null) { throw new IllegalArgumentException("types is null"); @@ -1290,21 +1244,19 @@ private static void validateTypes(byte[] types) { } /** - * Throw exception if paragraph embedding level is invalid. 
Special - * allowance for -1 so that default processing can still be performed when - * using this API. + * Throw exception if paragraph embedding level is invalid. Special allowance for -1 so that + * default processing can still be performed when using this API. */ private static void validateParagraphEmbeddingLevel(byte paragraphEmbeddingLevel) { - if (paragraphEmbeddingLevel != -1 && - paragraphEmbeddingLevel != 0 && - paragraphEmbeddingLevel != 1) { - throw new IllegalArgumentException("illegal paragraph embedding level: " + paragraphEmbeddingLevel); + if (paragraphEmbeddingLevel != -1 + && paragraphEmbeddingLevel != 0 + && paragraphEmbeddingLevel != 1) { + throw new IllegalArgumentException( + "illegal paragraph embedding level: " + paragraphEmbeddingLevel); } } - /** - * Throw exception if line breaks array is invalid. - */ + /** Throw exception if line breaks array is invalid. */ private static void validateLineBreaks(int[] linebreaks, int textLength) { int prev = 0; for (int i = 0; i < linebreaks.length; ++i) { diff --git a/unicodetools/src/main/java/org/unicode/bidi/BidiReferenceTest.java b/unicodetools/src/main/java/org/unicode/bidi/BidiReferenceTest.java index e19210bd6..91113f4f8 100644 --- a/unicodetools/src/main/java/org/unicode/bidi/BidiReferenceTest.java +++ b/unicodetools/src/main/java/org/unicode/bidi/BidiReferenceTest.java @@ -13,15 +13,13 @@ /** * A simple command-line interface to the BidiReference class. - *

- * This prompts the user for an ASCII string, runs the reference - * algorithm on the string, and displays the results to the terminal. - * An empty return to the prompt exits the program. - *

- * ASCII characters are preassigned various bidi direction types. - * These types can be displayed by the user for reference by - * typing -display at the prompt. More help can be - * obtained by typing -help at the prompt. + * + *

This prompts the user for an ASCII string, runs the reference algorithm on the string, and + * displays the results to the terminal. An empty return to the prompt exits the program. + * + *

ASCII characters are preassigned various bidi direction types. These types can be displayed by + * the user for reference by typing -display at the prompt. More help can be obtained + * by typing -help at the prompt. */ public class BidiReferenceTest { BufferedReader reader = new BufferedReader(new InputStreamReader(System.in)); @@ -29,9 +27,7 @@ public class BidiReferenceTest { BidiReferenceTestCharmap charmap = BidiReferenceTestCharmap.TEST_ARABIC; byte baseDirection = -1; - /** - * Run the interactive test. - */ + /** Run the interactive test. */ public static void main(String args[]) { new BidiReferenceTest().run(); } @@ -45,8 +41,7 @@ void run() { String input; try { input = reader.readLine(); - } - catch (final Exception e) { + } catch (final Exception e) { writer.println(e); continue; } @@ -92,39 +87,45 @@ void run() { } } - /** - * Display instructions to the user. - */ + /** Display instructions to the user. */ void printHelp() { writer.println("Bidi Reference Interactive Test"); writer.println(); - writer.println("To exit the program, hit return or enter at the prompt without typing any text"); + writer.println( + "To exit the program, hit return or enter at the prompt without typing any text"); writer.println("To run the bidi algorithm, just enter some text (without a leading '-')"); writer.println(); writer.println("To see the current mapping of characters to Bidi types, enter '-display'"); writer.println("To switch the mapping to english, enter '-english'"); writer.println("To switch the mapping to hebrew for upper case, enter '-hebrew'"); - writer.println("To switch the mapping to arabic for upper case and numbers, enter '-arabic'"); - writer.println("To switch the mapping to mixed hebrew and arabic for upper case and numbers, enter '-mixed'"); + writer.println( + "To switch the mapping to arabic for upper case and numbers, enter '-arabic'"); + writer.println( + "To switch the mapping to mixed hebrew and arabic for upper case and numbers, enter 
'-mixed'"); writer.println(); writer.println("To force an LTR base direction, enter '-baseLTR'"); writer.println("To force an RTL base direction, enter '-baseRTL'"); - writer.println("To compute the default base direction using the algorithm, enter '-baseDefault'"); + writer.println( + "To compute the default base direction using the algorithm, enter '-baseDefault'"); writer.println(); writer.println("To display this help message, enter '-help'"); } /** - * Run the BidiReference algorithm over the string using the current character to direction code mapping. + * Run the BidiReference algorithm over the string using the current character to direction code + * mapping. */ void runSample(String str) { try { final byte[] codes = charmap.getCodes(str); final BidiReference bidi = new BidiReference(codes, baseDirection); - final int[] reorder = bidi.getReordering(new int[] { codes.length }); + final int[] reorder = bidi.getReordering(new int[] {codes.length}); - writer.println("base level: " + bidi.getBaseLevel() + (baseDirection != -1 ? " (forced)" : "")); + writer.println( + "base level: " + + bidi.getBaseLevel() + + (baseDirection != -1 ? 
" (forced)" : "")); // output original text for (int i = 0; i < str.length(); ++i) { @@ -137,8 +138,7 @@ void runSample(String str) { displayChar(str.charAt(reorder[i])); } writer.println(); - } - catch (final Exception e) { + } catch (final Exception e) { writer.println(e); } writer.println(); @@ -154,4 +154,3 @@ void displayChar(char c) { } } } - diff --git a/unicodetools/src/main/java/org/unicode/bidi/BidiReferenceTestCharmap.java b/unicodetools/src/main/java/org/unicode/bidi/BidiReferenceTestCharmap.java index 6ca1170d0..ba02f050d 100644 --- a/unicodetools/src/main/java/org/unicode/bidi/BidiReferenceTestCharmap.java +++ b/unicodetools/src/main/java/org/unicode/bidi/BidiReferenceTestCharmap.java @@ -10,8 +10,8 @@ import java.io.PrintWriter; /** - * A class that maps ASCII characters to bidi direction types, used for testing purposes. - * This class should not be used as a model for access to or storage of this information. + * A class that maps ASCII characters to bidi direction types, used for testing purposes. This class + * should not be used as a model for access to or storage of this information. * * @author Doug Felt */ @@ -58,13 +58,12 @@ public abstract class BidiReferenceTestCharmap { private static final String[] typenames = BidiReference.typenames; - /** - * Return the name of this mapping. - */ + /** Return the name of this mapping. */ public abstract String getName(); /** * Return the bidi direction codes corresponding to the ASCII characters in the string. + * * @param str the string * @return an array of bidi direction codes */ @@ -74,6 +73,7 @@ public final byte[] getCodes(String str) { /** * Return the bidi direction codes corresponding to the ASCII characters in the array. + * * @param chars the array of ASCII characters * @return an array of bidi direction codes */ @@ -82,8 +82,9 @@ public final byte[] getCodes(char[] chars) { } /** - * Return the bidi direction codes corresponding to the ASCII characters in the subrange - * of the array. 
+ * Return the bidi direction codes corresponding to the ASCII characters in the subrange of the + * array. + * * @param chars the array of ASCII characters * @param charstart the start of the subrange to use * @param count the number of characters in the subrange to use @@ -95,9 +96,7 @@ public final byte[] getCodes(char[] chars, int charstart, int count) { return result; } - /** - * Display the mapping from ASCII to bidi direction codes using the provided PrintWriter. - */ + /** Display the mapping from ASCII to bidi direction codes using the provided PrintWriter. */ public abstract void dumpInfo(PrintWriter w); /** @@ -109,11 +108,10 @@ public final byte[] getCodes(char[] chars, int charstart, int count) { * @param codeStart the start position in the codes array * @param count the number of characters to convert to direction codes */ - public abstract void convert(char[] chars, int charStart, byte[] codes, int codeStart, int count); + public abstract void convert( + char[] chars, int charStart, byte[] codes, int codeStart, int count); - /** - * Constructor for subclass use. - */ + /** Constructor for subclass use. */ protected BidiReferenceTestCharmap() { // don't know why the compiler default constructor isn't acceptable } @@ -125,15 +123,13 @@ protected BidiReferenceTestCharmap() { /** * Default implementation that maps ASCII to all bidi types. * - * This is the base class for TestArabic, TestHebrew, and TestMixed mappings. + *

This is the base class for TestArabic, TestHebrew, and TestMixed mappings. */ public static class DefaultCharmap extends BidiReferenceTestCharmap { protected String name; protected byte[] map; - /** - * Initialize to default mapping, and define name. - */ + /** Initialize to default mapping, and define name. */ public DefaultCharmap(String name) { this.name = name; @@ -152,14 +148,12 @@ public DefaultCharmap(String name) { setMap(FSI, "?"); setMap(PDI, "="); setMap(NSM, "~"); - setMap( BN, "`"); - setMap( B, "|"); // visible character for convenience - setMap( S, "_"); // visible character for convenience + setMap(BN, "`"); + setMap(B, "|"); // visible character for convenience + setMap(S, "_"); // visible character for convenience } - /** - * Utility used to change the mapping. - */ + /** Utility used to change the mapping. */ protected void setMap(byte value, String chars) { for (int i = 0; i < chars.length(); ++i) { map[chars.charAt(i)] = value; @@ -167,39 +161,149 @@ protected void setMap(byte value, String chars) { } /** - * Standard character mapping for Latin-1. Protected so that it can be - * directly accessed by subclasses. + * Standard character mapping for Latin-1. Protected so that it can be directly accessed by + * subclasses. */ protected static final byte[] baseMap = { - ON, ON, ON, ON, ON, ON, ON, ON, // 00-07 c0 c0 c0 c0 c0 c0 c0 c0 - ON, S, B, S, B, B, ON, ON, // 08-0f c0 HT LF VT FF CR c0 c0 - ON, ON, ON, ON, ON, ON, ON, ON, // 10-17 c0 c0 c0 c0 c0 c0 c0 c0 - ON, ON, ON, ON, B, B, B, S, // 18-1f c0 c0 c0 c0 FS GS RS US - WS, ON, ON, ET, ET, ET, ON, ON, // 20-27 ! " # $ % & ' - ON, ON, ON, ET, CS, ET, CS, ES, // 28-2f ( ) * + , - . / - EN, EN, EN, EN, EN, EN, EN, EN, // 30-37 0 1 2 3 4 5 6 7 - EN, EN, CS, ON, ON, ON, ON, ON, // 38-3f 8 9 : ; < = > ? 
- ON, L, L, L, L, L, L, L, // 40-47 @ A B C D E F G - L, L, L, L, L, L, L, L, // 48-4f H I J K L M N O - L, L, L, L, L, L, L, L, // 50-57 P Q R S T U V W - L, L, L, ON, ON, ON, ON, S, // 58-5f X Y Z [ \ ] ^ _ - ON, L, L, L, L, L, L, L, // 60-67 ` a b c d e f g - L, L, L, L, L, L, L, L, // 68-6f h i j k l m n o - L, L, L, L, L, L, L, L, // 70-77 p q r s t u v w - L, L, L, ON, ON, ON, ON, ON // 78-7f x y z { | } ~ DEL + ON, + ON, + ON, + ON, + ON, + ON, + ON, + ON, // 00-07 c0 c0 c0 c0 c0 c0 c0 c0 + ON, + S, + B, + S, + B, + B, + ON, + ON, // 08-0f c0 HT LF VT FF CR c0 c0 + ON, + ON, + ON, + ON, + ON, + ON, + ON, + ON, // 10-17 c0 c0 c0 c0 c0 c0 c0 c0 + ON, + ON, + ON, + ON, + B, + B, + B, + S, // 18-1f c0 c0 c0 c0 FS GS RS US + WS, + ON, + ON, + ET, + ET, + ET, + ON, + ON, // 20-27 ! " # $ % & ' + ON, + ON, + ON, + ET, + CS, + ET, + CS, + ES, // 28-2f ( ) * + , - . / + EN, + EN, + EN, + EN, + EN, + EN, + EN, + EN, // 30-37 0 1 2 3 4 5 6 7 + EN, + EN, + CS, + ON, + ON, + ON, + ON, + ON, // 38-3f 8 9 : ; < = > ? + ON, + L, + L, + L, + L, + L, + L, + L, // 40-47 @ A B C D E F G + L, + L, + L, + L, + L, + L, + L, + L, // 48-4f H I J K L M N O + L, + L, + L, + L, + L, + L, + L, + L, // 50-57 P Q R S T U V W + L, + L, + L, + ON, + ON, + ON, + ON, + S, // 58-5f X Y Z [ \ ] ^ _ + ON, + L, + L, + L, + L, + L, + L, + L, // 60-67 ` a b c d e f g + L, + L, + L, + L, + L, + L, + L, + L, // 68-6f h i j k l m n o + L, + L, + L, + L, + L, + L, + L, + L, // 70-77 p q r s t u v w + L, + L, + L, + ON, + ON, + ON, + ON, + ON // 78-7f x y z { | } ~ DEL }; - /** - * Return the name. - */ + /** Return the name. */ @Override public String getName() { return name; } /** - * Standard implementation of dumpInfo that displays, for each bidi - * direction type, the characters that are mapped to that type. + * Standard implementation of dumpInfo that displays, for each bidi direction type, the + * characters that are mapped to that type. 
*/ @Override public void dumpInfo(PrintWriter w) { @@ -227,29 +331,29 @@ public void dumpInfo(PrintWriter w) { w.print(","); } switch (runEnd - runStart) { - case 1: - dumpChar(runStart, w); - break; - case 2: - dumpChar(runStart, w); - w.print(","); - dumpChar(runEnd - 1, w); - break; - default: - // only use ranges for a-z, A-Z, 0-9, c0 (hex display) - if ((runStart >= 'a' && (runEnd - 1 <= 'z')) || - (runStart >= 'A' && (runEnd - 1 <= 'Z')) || - (runStart >= '0' && (runEnd - 1 <= '9')) || - (runStart >= 0x0 && (runEnd - 1 <= 0x1f))) { - + case 1: dumpChar(runStart, w); - w.print("-"); - dumpChar(runEnd - 1, w); - } else { + break; + case 2: dumpChar(runStart, w); - runEnd = runStart + 1; - } - break; + w.print(","); + dumpChar(runEnd - 1, w); + break; + default: + // only use ranges for a-z, A-Z, 0-9, c0 (hex display) + if ((runStart >= 'a' && (runEnd - 1 <= 'z')) + || (runStart >= 'A' && (runEnd - 1 <= 'Z')) + || (runStart >= '0' && (runEnd - 1 <= '9')) + || (runStart >= 0x0 && (runEnd - 1 <= 0x1f))) { + + dumpChar(runStart, w); + w.print("-"); + dumpChar(runEnd - 1, w); + } else { + dumpChar(runStart, w); + runEnd = runStart + 1; + } + break; } runStart = runEnd; @@ -261,19 +365,15 @@ public void dumpInfo(PrintWriter w) { } /** - * Utility used to output a 'name' of single character, passed as an - * integer. Printable characters are represented as themselves, - * non-printable characters as hex values. Comma, hyphen, and space are - * represented as strings surrounded by square brackets. + * Utility used to output a 'name' of single character, passed as an integer. Printable + * characters are represented as themselves, non-printable characters as hex values. Comma, + * hyphen, and space are represented as strings surrounded by square brackets. 
* - * @param i - * the integer value of the character - * @param w - * the PrintWriter on which to output the representation of - * the character + * @param i the integer value of the character + * @param w the PrintWriter on which to output the representation of the character */ protected static void dumpChar(int i, PrintWriter w) { - final char c = (char)i; + final char c = (char) i; if (c == ',') { w.print("[comma]"); @@ -288,9 +388,7 @@ protected static void dumpChar(int i, PrintWriter w) { } } - /** - * Standard implementation of convert. - */ + /** Standard implementation of convert. */ @Override public void convert(char[] chars, int charStart, byte[] codes, int codeStart, int count) { for (int i = 0; i < count; ++i) { @@ -329,7 +427,7 @@ private TestMixed() { private static class TestHebrew extends DefaultCharmap { private TestHebrew() { - super ("Test Hebrew"); + super("Test Hebrew"); setMap(R, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"); } diff --git a/unicodetools/src/main/java/org/unicode/bidi/BidiTestIcu4jConformance.java b/unicodetools/src/main/java/org/unicode/bidi/BidiTestIcu4jConformance.java index 7629659fc..21ad6693d 100644 --- a/unicodetools/src/main/java/org/unicode/bidi/BidiTestIcu4jConformance.java +++ b/unicodetools/src/main/java/org/unicode/bidi/BidiTestIcu4jConformance.java @@ -1,12 +1,10 @@ package org.unicode.bidi; +import com.ibm.icu.lang.UCharacterEnums.ECharacterDirection; import java.io.BufferedReader; import java.io.FileReader; - import org.unicode.text.utility.Settings; -import com.ibm.icu.lang.UCharacterEnums.ECharacterDirection; - public class BidiTestIcu4jConformance { static com.ibm.icu.text.Bidi bidi; diff --git a/unicodetools/src/main/java/org/unicode/bidi/GenerateN1Tests.java b/unicodetools/src/main/java/org/unicode/bidi/GenerateN1Tests.java index 11ef977d3..405a994dd 100644 --- a/unicodetools/src/main/java/org/unicode/bidi/GenerateN1Tests.java +++ b/unicodetools/src/main/java/org/unicode/bidi/GenerateN1Tests.java @@ -6,15 +6,23 @@ import 
java.util.List; public class GenerateN1Tests { - enum Sample {AL("\u0627\u062F\u0630\u0631"), R("\u05D0\u05D1\u05d2\u05d3"), L("abcd"), AN("\u0660\u0661\u0662\u0663"), EN("6789"), N("!&?@"); - static EnumSet STRONG = EnumSet.of(Sample.R, Sample.AL, Sample.L); - static EnumSet NUMERIC = EnumSet.of(Sample.AN, Sample.EN); - private final String[] str = new String[2]; - Sample(String instr) { - str[0] = instr.substring(0,2); - str[1] = instr.substring(2); - } + enum Sample { + AL("\u0627\u062F\u0630\u0631"), + R("\u05D0\u05D1\u05d2\u05d3"), + L("abcd"), + AN("\u0660\u0661\u0662\u0663"), + EN("6789"), + N("!&?@"); + static EnumSet STRONG = EnumSet.of(Sample.R, Sample.AL, Sample.L); + static EnumSet NUMERIC = EnumSet.of(Sample.AN, Sample.EN); + private final String[] str = new String[2]; + + Sample(String instr) { + str[0] = instr.substring(0, 2); + str[1] = instr.substring(2); + } } + static class SampleEnumerator { private final List items = new ArrayList(); private final List readOnlyItems = Collections.unmodifiableList(items); @@ -25,9 +33,12 @@ public SampleEnumerator(int count) { } boolean next() { - for (int i = items.size()-1; i >= 0; --i) { + for (int i = items.size() - 1; i >= 0; --i) { final Sample oldValue = items.get(i); - final Sample newValue = oldValue.ordinal() < Sample.values().length - 1 ? Sample.values()[oldValue.ordinal()+1] : null; // next value + final Sample newValue = + oldValue.ordinal() < Sample.values().length - 1 + ? 
Sample.values()[oldValue.ordinal() + 1] + : null; // next value if (newValue != null) { items.set(i, newValue); return true; @@ -79,9 +90,7 @@ public static void main(String[] args) { if (samples.getItems().get(2) != Sample.N) { continue; } - if (samples.getItems().get(1) == Sample.N - || samples.getItems().get(3) == Sample.N - ) { + if (samples.getItems().get(1) == Sample.N || samples.getItems().get(3) == Sample.N) { continue; } if (!Sample.STRONG.contains(samples.getItems().get(0))) { diff --git a/unicodetools/src/main/java/org/unicode/com/ibm/icu/text/StringTransform.java b/unicodetools/src/main/java/org/unicode/com/ibm/icu/text/StringTransform.java index 78787157b..f7efa62ea 100644 --- a/unicodetools/src/main/java/org/unicode/com/ibm/icu/text/StringTransform.java +++ b/unicodetools/src/main/java/org/unicode/com/ibm/icu/text/StringTransform.java @@ -9,19 +9,21 @@ import com.ibm.icu.text.Transform; /** - * Provide a base class for Transforms that focuses just on the transformation of the text. APIs that take Transliterator, but only depend on the text transformation should use this interface in the API instead. + * Provide a base class for Transforms that focuses just on the transformation of the text. APIs + * that take Transliterator, but only depend on the text transformation should use this interface in + * the API instead. * * @stable ICU 3.8 * @author markdavis - * */ -public interface StringTransform extends Transform { +public interface StringTransform extends Transform { /** * Transform the text in some way, to be determined by the subclass. 
+ * * @param source text to be transformed (eg lowercased) * @return result * @stable ICU 3.8 */ @Override public String transform(String source); -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/draft/AcceptLanguage.java b/unicodetools/src/main/java/org/unicode/draft/AcceptLanguage.java index 1014ab1be..415c2c877 100644 --- a/unicodetools/src/main/java/org/unicode/draft/AcceptLanguage.java +++ b/unicodetools/src/main/java/org/unicode/draft/AcceptLanguage.java @@ -16,22 +16,19 @@ * limitations under the License. */ - - +import com.ibm.icu.lang.UCharacter; import java.util.Enumeration; import java.util.Hashtable; import java.util.Locale; import java.util.StringTokenizer; import java.util.Vector; -import com.ibm.icu.lang.UCharacter; - /** - * Util to process the "Accept-Language" header. Used by facade to implement - * getLocale() and by StaticInterceptor. + * Util to process the "Accept-Language" header. Used by facade to implement getLocale() and by + * StaticInterceptor. + * + *

Not optimized - it's very slow. * - * Not optimized - it's very slow. - * * @author James Duncan Davidson [duncan@eng.sun.com] * @author James Todd [gonzo@eng.sun.com] * @author Jason Hunter [jch@eng.sun.com] @@ -42,22 +39,22 @@ public class AcceptLanguage { UCharacter foo; public static Locale getLocale(String acceptLanguage) { - if( acceptLanguage == null ) { + if (acceptLanguage == null) { return Locale.getDefault(); } final Hashtable languages = new Hashtable(); - final Vector quality=new Vector(); - processAcceptLanguage(acceptLanguage, languages,quality); + final Vector quality = new Vector(); + processAcceptLanguage(acceptLanguage, languages, quality); if (languages.size() == 0) { return Locale.getDefault(); } final Vector l = new Vector(); - extractLocales( languages,quality, l); + extractLocales(languages, quality, l); - return (Locale)l.elementAt(0); + return (Locale) l.elementAt(0); } public static Enumeration getLocales(String acceptLanguage) { @@ -69,8 +66,8 @@ public static Enumeration getLocales(String acceptLanguage) { } final Hashtable languages = new Hashtable(); - final Vector quality=new Vector(); - processAcceptLanguage(acceptLanguage, languages , quality); + final Vector quality = new Vector(); + processAcceptLanguage(acceptLanguage, languages, quality); if (languages.size() == 0) { final Vector v = new Vector(); @@ -78,15 +75,13 @@ public static Enumeration getLocales(String acceptLanguage) { return v.elements(); } final Vector l = new Vector(); - extractLocales( languages, quality , l); + extractLocales(languages, quality, l); return l.elements(); } - private static void processAcceptLanguage( String acceptLanguage, - Hashtable languages, Vector q) - { - final StringTokenizer languageTokenizer = - new StringTokenizer(acceptLanguage, ","); + private static void processAcceptLanguage( + String acceptLanguage, Hashtable languages, Vector q) { + final StringTokenizer languageTokenizer = new StringTokenizer(acceptLanguage, ","); while 
(languageTokenizer.hasMoreTokens()) { String language = languageTokenizer.nextToken().trim(); @@ -95,16 +90,13 @@ private static void processAcceptLanguage( String acceptLanguage, final int equalIndex = language.indexOf('='); Double qValue = new Double(1); - if (qValueIndex > -1 && - qValueIndex < qIndex && - qIndex < equalIndex) { + if (qValueIndex > -1 && qValueIndex < qIndex && qIndex < equalIndex) { String qValueStr = language.substring(qValueIndex + 1); language = language.substring(0, qValueIndex); qValueStr = qValueStr.trim().toLowerCase(); qValueIndex = qValueStr.indexOf('='); qValue = new Double(0); - if (qValueStr.startsWith("q") && - qValueIndex > -1) { + if (qValueStr.startsWith("q") && qValueIndex > -1) { qValueStr = qValueStr.substring(qValueIndex + 1); try { qValue = new Double(qValueStr.trim()); @@ -116,13 +108,13 @@ private static void processAcceptLanguage( String acceptLanguage, // XXX // may need to handle "*" at some point in time - if (! language.equals("*")) { + if (!language.equals("*")) { final String key = qValue.toString(); Vector v; if (languages.containsKey(key)) { - v = (Vector)languages.get(key) ; + v = (Vector) languages.get(key); } else { - v= new Vector(); + v = new Vector(); q.addElement(qValue); } v.addElement(language); @@ -131,16 +123,14 @@ private static void processAcceptLanguage( String acceptLanguage, } } - private static void extractLocales(Hashtable languages, Vector q,Vector l) - { + private static void extractLocales(Hashtable languages, Vector q, Vector l) { // XXX We will need to order by q value Vector in the Future ? 
final Enumeration e = q.elements(); while (e.hasMoreElements()) { - final Vector v = - (Vector)languages.get(((Double)e.nextElement()).toString()); + final Vector v = (Vector) languages.get(((Double) e.nextElement()).toString()); final Enumeration le = v.elements(); while (le.hasMoreElements()) { - String language = (String)le.nextElement(); + String language = (String) le.nextElement(); String country = ""; final int countryIndex = language.indexOf("-"); if (countryIndex > -1) { @@ -151,6 +141,7 @@ private static void extractLocales(Hashtable languages, Vector q,Vector l) } } } + public static void main(String[] args) { final Hashtable languages = new Hashtable(); final Vector quality = new Vector(); @@ -161,5 +152,4 @@ public static void main(String[] args) { extractLocales(languages, quality, locales); System.out.println("locales: " + locales); } - } diff --git a/unicodetools/src/main/java/org/unicode/draft/Alphagram.java b/unicodetools/src/main/java/org/unicode/draft/Alphagram.java index 0024ee076..5c00c98d9 100644 --- a/unicodetools/src/main/java/org/unicode/draft/Alphagram.java +++ b/unicodetools/src/main/java/org/unicode/draft/Alphagram.java @@ -13,8 +13,7 @@ public Alphagram(String a) { } } - public Alphagram() { - } + public Alphagram() {} public Alphagram intersection(Alphagram other) { final Alphagram result = new Alphagram(); @@ -38,7 +37,7 @@ public String toString() { for (int i = 0; i < letters.length; ++i) { final int count = letters[i]; while (count > 0) { - result.append((char)(i + 'a')); + result.append((char) (i + 'a')); } } return result.toString(); diff --git a/unicodetools/src/main/java/org/unicode/draft/CharacterFrequency.java b/unicodetools/src/main/java/org/unicode/draft/CharacterFrequency.java index 36ea276a5..a67df684d 100644 --- a/unicodetools/src/main/java/org/unicode/draft/CharacterFrequency.java +++ b/unicodetools/src/main/java/org/unicode/draft/CharacterFrequency.java @@ -1,4 +1,5 @@ package org.unicode.draft; + import java.io.File; 
import java.util.Arrays; import java.util.Collections; @@ -6,23 +7,19 @@ import java.util.Map; import java.util.Set; import java.util.TreeMap; - import org.unicode.cldr.util.Counter; import org.unicode.jsp.FileUtilities; import org.unicode.jsp.FileUtilities.SemiFileReader; import org.unicode.text.utility.Settings; - /** - * Run WebpageCharacterData first. - * Note that the data is post-html conversion, and whitespace is thus not represented. - * For a program that uses this, see ScriptPopulation + * Run WebpageCharacterData first. Note that the data is post-html conversion, and whitespace is + * thus not represented. For a program that uses this, see ScriptPopulation */ public class CharacterFrequency { - public static final String DATA_DIR = Settings.Output.GEN_DIR + - "frequency/languages/"; - private static final String DATA_DIR_RANK = Settings.Output.GEN_DIR + - "frequency/languages-rank/"; + public static final String DATA_DIR = Settings.Output.GEN_DIR + "frequency/languages/"; + private static final String DATA_DIR_RANK = + Settings.Output.GEN_DIR + "frequency/languages-rank/"; public static final boolean DEBUG = false; // static final int MAX_LINE_COUNT = Integer.MAX_VALUE; // 10000; // static final int MAX_SEQUENCE_CHARS = 15; @@ -31,11 +28,13 @@ public class CharacterFrequency { // private static Map languageToPopulation = new HashMap(); // private static Map languagesFound = new TreeMap(); //// private static Map languageNameToTag = new HashMap(); - // private static Map> languageToCharsCounter = new TreeMap>(); - private static Map> languageToCodePointCounter = new TreeMap>(); - + // private static Map> languageToCharsCounter = new TreeMap>(); + private static Map> languageToCodePointCounter = + new TreeMap>(); - // public static SupplementalDataInfo supplementalInfo = SupplementalDataInfo.getInstance(CldrUtility.SUPPLEMENTAL_DIRECTORY); + // public static SupplementalDataInfo supplementalInfo = + // 
SupplementalDataInfo.getInstance(CldrUtility.SUPPLEMENTAL_DIRECTORY); // // static { // languageToPopulation.put("eo", 100000d); @@ -50,7 +49,8 @@ public class CharacterFrequency { //// .and(transform, ULocale.getISOLanguages()) //// .and(ULocale.getAvailableLocales())) { //// if (!locale.getCountry().isEmpty()) continue; - //// String name = locale.getDisplayName(ULocale.ENGLISH).toUpperCase(Locale.ENGLISH); + //// String name = + // locale.getDisplayName(ULocale.ENGLISH).toUpperCase(Locale.ENGLISH); //// String languageTag = locale.toLanguageTag(); //// languageNameToTag.put(name, languageTag); //// } @@ -66,17 +66,22 @@ public class CharacterFrequency { //// languageNameToTag.put("BIHARI", "bho"); //// languageNameToTag.put("UNKNOWN", "und"); // - // Map> rawLanguageToSequencesCounter = new TreeMap>(); - // //Map> rawLanguageToCharsCounter = new TreeMap>(); + // Map> rawLanguageToSequencesCounter = new TreeMap>(); + // //Map> rawLanguageToCharsCounter = new TreeMap>(); // // // System.out.println("loading stats.characters.txt"); - // // SemiFileReader handler = new SequenceHandler(rawLanguageToCharsCounter).process(Utility.DATA_DIRECTORY + "/frequency/", "stats.short_sequences.txt"); + // // SemiFileReader handler = new + // SequenceHandler(rawLanguageToCharsCounter).process(Utility.DATA_DIRECTORY + "/frequency/", + // "stats.short_sequences.txt"); // // System.out.println("read lines:\t" + handler.getLineCount()); // // Counter mulValue = new Counter(); // // System.out.println("loading stats.lang_sequences.txt"); - // SemiFileReader handler = new SequenceHandler(rawLanguageToSequencesCounter).process(DATA_DIR, + // SemiFileReader handler = new + // SequenceHandler(rawLanguageToSequencesCounter).process(DATA_DIR, // "mul.txt"); // System.out.println("read lines:\t" + handler.getLineCount()); // @@ -116,7 +121,8 @@ public class CharacterFrequency { // // String cpStr = UTF16.valueOf(cp); // // long charCount = charCounter.get(cpStr); // // if (sequenceCount > 
charCount) { // debug - // // System.out.println(language + "\tsequence:\t" + sequenceCount + "\tchar:\t" + charCount); + // // System.out.println(language + "\tsequence:\t" + sequenceCount + + // "\tchar:\t" + charCount); // // } // // charCounter.add(cpStr, -sequenceCount); // // } @@ -126,14 +132,17 @@ public class CharacterFrequency { // // put all of the normalized marks into a combined list // // for (String sequence : sequenceCounter.keySet()) { - // addNormalizedCount(sequence, sequenceCounter.get(sequence), locale, combinedCounter); + // addNormalizedCount(sequence, sequenceCounter.get(sequence), locale, + // combinedCounter); // } // // for (String sequence : charCounter.keySet()) { - // // addNormalizedCount(sequence, charCounter.get(sequence), locale, combinedCounter); + // // addNormalizedCount(sequence, charCounter.get(sequence), locale, + // combinedCounter); // // } // // - // // at this point, the chars contain all the NFC'd characters, and the sequences contain all the sequences + // // at this point, the chars contain all the NFC'd characters, and the sequences + // contain all the sequences // // sequenceCounter.freeze(); // // charCounter.freeze(); // @@ -154,7 +163,8 @@ public class CharacterFrequency { // languageToPopulation.put("qsu", 7000000000d * 0.82d); // } - // private static void addNormalizedCount(String sequence, long countValue, ULocale locale, Counter combinedCounter) { + // private static void addNormalizedCount(String sequence, long countValue, ULocale locale, + // Counter combinedCounter) { // String nfcSequence = ExemplarInfo.specialNormalize(sequence, locale); // int cp; // for (int i = 0; i < nfcSequence.length(); i+=Character.charCount(cp)) { @@ -163,8 +173,6 @@ public class CharacterFrequency { // } // } - - // public static String getLanguageCode(String string) { // String result = LanguageCodeConverter.getCodeForName(string); // // string = string.toUpperCase(Locale.ENGLISH); @@ -187,14 +195,17 @@ public class 
CharacterFrequency { // } // double pop2; // String cldrLanguage = ExemplarInfo.getCldrLanguage(language); - // PopulationData popData = CharacterFrequency.supplementalInfo.getLanguagePopulationData(cldrLanguage); + // PopulationData popData = + // CharacterFrequency.supplementalInfo.getLanguagePopulationData(cldrLanguage); // if (popData != null) { // pop2 = popData.getLiteratePopulation(); // } else { // pop2 = 0; - // for (String child : CharacterFrequency.supplementalInfo.getLanguagesForTerritoriesPopulationData()) { + // for (String child : + // CharacterFrequency.supplementalInfo.getLanguagesForTerritoriesPopulationData()) { // if (child.startsWith(cldrLanguage + "_")) { - // popData = CharacterFrequency.supplementalInfo.getLanguagePopulationData(child); + // popData = + // CharacterFrequency.supplementalInfo.getLanguagePopulationData(child); // if (popData != null) { // pop2 += popData.getLiteratePopulation(); // } @@ -236,7 +247,9 @@ public static Counter getCodePointCounter(String language, boolean rank Counter result = languageToCodePointCounter.get(language); if (result == null) { result = new Counter(); - final SemiFileReader handler = new SequenceHandler(result).process(ranked ? DATA_DIR_RANK : DATA_DIR, language + ".txt"); + final SemiFileReader handler = + new SequenceHandler(result) + .process(ranked ? 
DATA_DIR_RANK : DATA_DIR, language + ".txt"); languageToCodePointCounter.put(language, result); } return result; @@ -249,20 +262,22 @@ static class SequenceHandler extends FileUtilities.SemiFileReader { public SequenceHandler(Counter counter) { this.counter = counter; } + @Override public boolean handleLine(int start, int end, String[] items) { // if (getLineCount() > CharacterFrequency.MAX_LINE_COUNT) { // return false; // } if (DEBUG && ((++lineCounter % 1000) == 0 || lineCounter < 100)) { - System.out.println(lineCounter + "\t" + Arrays.asList(items) + "\t" + counter.getItemCount()); + System.out.println( + lineCounter + "\t" + Arrays.asList(items) + "\t" + counter.getItemCount()); } if (items.length != 2) { throw new IllegalArgumentException(Arrays.asList(items).toString()); } - final int cp = Integer.parseInt(items[0],16); + final int cp = Integer.parseInt(items[0], 16); final long count = Long.parseLong(items[1]); // if (count < CharacterFrequency.MIN_COUNT) { // return true; @@ -273,6 +288,7 @@ public boolean handleLine(int start, int end, String[] items) { } static final Set LANGUAGES; + static { final HashSet result = new HashSet(); final File dir = new File(DATA_DIR); @@ -280,7 +296,7 @@ public boolean handleLine(int start, int end, String[] items) { if (!file.endsWith(".txt")) { continue; } - result.add(file.substring(0,file.length()-4)); + result.add(file.substring(0, file.length() - 4)); } LANGUAGES = Collections.unmodifiableSet(result); } diff --git a/unicodetools/src/main/java/org/unicode/draft/CheckCollator.java b/unicodetools/src/main/java/org/unicode/draft/CheckCollator.java index 6c7b703ec..6cfa9da00 100644 --- a/unicodetools/src/main/java/org/unicode/draft/CheckCollator.java +++ b/unicodetools/src/main/java/org/unicode/draft/CheckCollator.java @@ -1,11 +1,10 @@ package org.unicode.draft; -import java.util.Arrays; -import java.util.Set; -import java.util.TreeSet; import com.ibm.icu.text.Collator; import com.ibm.icu.util.ULocale; - +import 
java.util.Arrays; +import java.util.Set; +import java.util.TreeSet; public class CheckCollator { public static void main(String[] args) { @@ -19,15 +18,20 @@ public static void main(String[] args) { } } - final String functionalLocale = Collator.getFunctionalEquivalent("collation", collator.getLocale(ULocale.ACTUAL_LOCALE)).toString(); + final String functionalLocale = + Collator.getFunctionalEquivalent( + "collation", collator.getLocale(ULocale.ACTUAL_LOCALE)) + .toString(); System.out.println(functionalLocale); final String[] values = Collator.getKeywordValues("collation"); - //System.out.println("collation" + ":\t" + Arrays.asList(values)); + // System.out.println("collation" + ":\t" + Arrays.asList(values)); final ULocale[] locales = Collator.getAvailableULocales(); for (final ULocale locale : locales) { - final String[] localeValues = Collator.getKeywordValuesForLocale("collation", locale, true); - //System.out.println(locale + "\t" + "collation" + ":\t" + Arrays.asList(localeValues)); + final String[] localeValues = + Collator.getKeywordValuesForLocale("collation", locale, true); + // System.out.println(locale + "\t" + "collation" + ":\t" + + // Arrays.asList(localeValues)); final ULocale functionalLocale2 = Collator.getFunctionalEquivalent("collation", locale); if (!functionalLocale2.equals(locale)) { System.out.println(locale + "\t=>\t" + functionalLocale2); diff --git a/unicodetools/src/main/java/org/unicode/draft/CheckComparison.java b/unicodetools/src/main/java/org/unicode/draft/CheckComparison.java index 43931a07e..a3d4f786c 100644 --- a/unicodetools/src/main/java/org/unicode/draft/CheckComparison.java +++ b/unicodetools/src/main/java/org/unicode/draft/CheckComparison.java @@ -1,4 +1,11 @@ package org.unicode.draft; + +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.DecimalFormat; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.RawCollationKey; +import com.ibm.icu.util.ULocale; import 
java.text.ParseException; import java.text.ParsePosition; import java.util.ArrayList; @@ -6,21 +13,14 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.util.Timer; -import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.DecimalFormat; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.RawCollationKey; -import com.ibm.icu.util.ULocale; - public class CheckComparison { - static final Timer t = new Timer(); - static ArrayList indata = new ArrayList(); - static ArrayList mixedData = new ArrayList(); - static final DecimalFormat percent = (DecimalFormat) NumberFormat.getPercentInstance(); + static final Timer t = new Timer(); + static ArrayList indata = new ArrayList(); + static ArrayList mixedData = new ArrayList(); + static final DecimalFormat percent = (DecimalFormat) NumberFormat.getPercentInstance(); + static { percent.setPositivePrefix("+"); } @@ -70,16 +70,24 @@ private static void showTimes(final ULocale myLocale, final int dataSize) { System.out.println("Direct Comparison (JDK)"); timeDirectComparison2(collator2, 1, 1, directTimes2); // warm up timeDirectComparison2(collator2, constructionIterations, queryIterations, directTimes2); - System.out.println("Construction Time: " + directTimes2[0] + showPercent(directTimes2[0], directTimes[0])); - System.out.println("Query Time: " + directTimes2[1] + showPercent(directTimes2[1], directTimes[1])); + System.out.println( + "Construction Time: " + + directTimes2[0] + + showPercent(directTimes2[0], directTimes[0])); + System.out.println( + "Query Time: " + directTimes2[1] + showPercent(directTimes2[1], directTimes[1])); final long[] sortKeyTimes = new long[2]; System.gc(); System.out.println("Sortkey Comparison"); timeSortkeyComparison(collator, 1, 1, sortKeyTimes); // warm up timeSortkeyComparison(collator, constructionIterations, queryIterations, sortKeyTimes); - System.out.println("Construction Time: " + 
sortKeyTimes[0] + showPercent(sortKeyTimes[0], directTimes[0])); - System.out.println("Query Time: " + sortKeyTimes[1] + showPercent(sortKeyTimes[1], directTimes[1])); + System.out.println( + "Construction Time: " + + sortKeyTimes[0] + + showPercent(sortKeyTimes[0], directTimes[0])); + System.out.println( + "Query Time: " + sortKeyTimes[1] + showPercent(sortKeyTimes[1], directTimes[1])); } private static String showPercent(long l, long m) { @@ -87,9 +95,13 @@ private static String showPercent(long l, long m) { return ",\t" + percent.format((l / (double) m) - 1.0d); } - private static void timeSortkeyComparison(Collator collator, final int constructionIterations, - final int queryIterations, long[] times) { - final Map sortKeyComparison = new TreeMap(); + private static void timeSortkeyComparison( + Collator collator, + final int constructionIterations, + final int queryIterations, + long[] times) { + final Map sortKeyComparison = + new TreeMap(); t.start(); for (int i = constructionIterations; i >= 0; --i) { @@ -115,8 +127,11 @@ private static void timeSortkeyComparison(Collator collator, final int construct times[1] = t.getDuration(); } - private static void timeDirectComparison(Collator collator, final int constructionIterations, - final int queryIterations, long[] times) { + private static void timeDirectComparison( + Collator collator, + final int constructionIterations, + final int queryIterations, + long[] times) { final Set plainComparison = new TreeSet(collator); t.start(); @@ -140,8 +155,11 @@ private static void timeDirectComparison(Collator collator, final int constructi times[1] = t.getDuration(); } - private static void timeDirectComparison2(java.text.Collator collator, final int constructionIterations, - final int queryIterations, long[] times) { + private static void timeDirectComparison2( + java.text.Collator collator, + final int constructionIterations, + final int queryIterations, + long[] times) { final Set plainComparison = new 
TreeSet(collator); t.start(); @@ -165,7 +183,6 @@ private static void timeDirectComparison2(java.text.Collator collator, final int times[1] = t.getDuration(); } - private static Collator prepareData(final ULocale myLocale, int maxCount) { final Collator collator = Collator.getInstance(myLocale); int count = 0; @@ -173,15 +190,18 @@ private static Collator prepareData(final ULocale myLocale, int maxCount) { indata = new ArrayList(maxCount); mixedData = new ArrayList(maxCount); - main: for (final String languageCode : ULocale.getISOLanguages()) { + main: + for (final String languageCode : ULocale.getISOLanguages()) { final String languageName = ULocale.getDisplayLanguage(languageCode, myLocale); for (int i = UScript.COMMON; i < UScript.CODE_LIMIT; ++i) { final String scriptCode = UScript.getShortName(i); final String scriptName = ULocale.getDisplayScript("und-" + scriptCode, myLocale); for (final String countryCode : ULocale.getISOCountries()) { - final String countryName = ULocale.getDisplayCountry("und-" + countryCode, myLocale); + final String countryName = + ULocale.getDisplayCountry("und-" + countryCode, myLocale); final String someString = languageName + ", " + scriptName + ", " + countryName; - final String someString2 = scriptName + ", " + countryName + ", " + languageName; + final String someString2 = + scriptName + ", " + countryName + ", " + languageName; indata.add(someString2); mixedData.add(someString2); mixedData.add(someString); diff --git a/unicodetools/src/main/java/org/unicode/draft/CheckPunycode.java b/unicodetools/src/main/java/org/unicode/draft/CheckPunycode.java index 723f8761b..07c4f6003 100644 --- a/unicodetools/src/main/java/org/unicode/draft/CheckPunycode.java +++ b/unicodetools/src/main/java/org/unicode/draft/CheckPunycode.java @@ -1,16 +1,16 @@ package org.unicode.draft; -import java.util.Random; import com.ibm.icu.impl.Utility; import com.ibm.icu.text.StringPrepParseException; import com.ibm.icu.text.UnicodeSet; import 
com.ibm.icu.text.UnicodeSetIterator; - +import java.util.Random; public class CheckPunycode { private static final UnicodeSet LDH = new UnicodeSet("[-0-9a-z]"); - private static final UnicodeSet APPROX_STRINGPREP = new UnicodeSet("[[-0-9a-z][:l:][:m:][:nd:]-[:nfkcqc=n:]-[:Lu:]-[:Lt:]]"); + private static final UnicodeSet APPROX_STRINGPREP = + new UnicodeSet("[[-0-9a-z][:l:][:m:][:nd:]-[:nfkcqc=n:]-[:Lu:]-[:Lt:]]"); static Random r = new Random(0); public static void main(String[] args) throws StringPrepParseException { @@ -22,19 +22,19 @@ public static void main(String[] args) throws StringPrepParseException { } System.out.println("singles"); - for (final UnicodeSetIterator it = new UnicodeSetIterator(LDH); it.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(LDH); it.next(); ) { checkPunycode(it.getString()); } System.out.println("doubles"); - for (final UnicodeSetIterator it = new UnicodeSetIterator(LDH); it.next();) { - for (final UnicodeSetIterator it2 = new UnicodeSetIterator(LDH); it2.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(LDH); it.next(); ) { + for (final UnicodeSetIterator it2 = new UnicodeSetIterator(LDH); it2.next(); ) { checkPunycode(it.getString() + it2.getString()); } } System.out.println("triples"); - for (final UnicodeSetIterator it = new UnicodeSetIterator(LDH); it.next();) { - for (final UnicodeSetIterator it2 = new UnicodeSetIterator(LDH); it2.next();) { - for (final UnicodeSetIterator it3 = new UnicodeSetIterator(LDH); it3.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(LDH); it.next(); ) { + for (final UnicodeSetIterator it2 = new UnicodeSetIterator(LDH); it2.next(); ) { + for (final UnicodeSetIterator it3 = new UnicodeSetIterator(LDH); it3.next(); ) { checkPunycode(it.getString() + it2.getString() + it3.getString()); } } @@ -65,8 +65,19 @@ private static void checkPunycode(String source) { status = "FAIL"; return; } - System.out.println(status + " " + source + " => <" + 
x + "> " + Utility.hex(x) + " " + - " => <" + y + "> " + Utility.hex(y)); + System.out.println( + status + + " " + + source + + " => <" + + x + + "> " + + Utility.hex(x) + + " " + + " => <" + + y + + "> " + + Utility.hex(y)); } private static void checkNewVsOld(int count) throws StringPrepParseException { @@ -77,18 +88,29 @@ private static void checkNewVsOld(int count) throws StringPrepParseException { OldPunycode.showProgress = puny.showProgress = (j == 37); try { - final String result = OldPunycode.encode(new StringBuffer(unicode), null).toString(); + final String result = + OldPunycode.encode(new StringBuffer(unicode), null).toString(); final String result2 = puny.encode(unicode, new StringBuilder()).toString(); if (!result.equals(result2)) { - System.out.println("Encode Failure at: " + unicode + ", " + result + ", " + result2); + System.out.println( + "Encode Failure at: " + unicode + ", " + result + ", " + result2); } final String back = OldPunycode.decode(new StringBuffer(result), null).toString(); final String back2 = puny.decode(result, new StringBuffer()).toString(); if (!back.equals(back2)) { - System.out.println("Decode Failure at: " + unicode + ", " + result + ", " + back + ", " + back2); + System.out.println( + "Decode Failure at: " + + unicode + + ", " + + result + + ", " + + back + + ", " + + back2); } } catch (final Exception e) { - throw (RuntimeException) new IllegalArgumentException(j + " Error " + unicode).initCause(e); + throw (RuntimeException) + new IllegalArgumentException(j + " Error " + unicode).initCause(e); } } } @@ -103,7 +125,7 @@ private static String randomString() { int c; while (true) { final double nextDouble = r.nextDouble(); - final int index = (int)(ASSIGNED_SIZE*nextDouble*nextDouble*nextDouble); + final int index = (int) (ASSIGNED_SIZE * nextDouble * nextDouble * nextDouble); System.out.println(index); if (index > ASSIGNED_SIZE) { continue; diff --git a/unicodetools/src/main/java/org/unicode/draft/CheckResources.java 
b/unicodetools/src/main/java/org/unicode/draft/CheckResources.java index 15312d7ed..9d8e284a4 100644 --- a/unicodetools/src/main/java/org/unicode/draft/CheckResources.java +++ b/unicodetools/src/main/java/org/unicode/draft/CheckResources.java @@ -1,13 +1,4 @@ package org.unicode.draft; -import java.io.UnsupportedEncodingException; -import java.util.Arrays; -import java.util.Collection; -import java.util.Comparator; -import java.util.LinkedHashSet; -import java.util.Set; -import java.util.TreeSet; - -import org.unicode.cldr.util.Counter; import com.ibm.icu.impl.ICUData; import com.ibm.icu.impl.ICUResourceBundle; @@ -17,7 +8,14 @@ import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.ULocale; import com.ibm.icu.util.UResourceBundle; - +import java.io.UnsupportedEncodingException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.LinkedHashSet; +import java.util.Set; +import java.util.TreeSet; +import org.unicode.cldr.util.Counter; public class CheckResources { @@ -37,20 +35,23 @@ public static void main(String[] args) throws UnsupportedEncodingException { // Ugly hack to get base names static Collection getBaseNames() { - return new LinkedHashSet(Arrays.asList(new String[] { - ICUData.ICU_BASE_NAME, - ICUData.ICU_BRKITR_BASE_NAME, - ICUData.ICU_COLLATION_BASE_NAME, - ICUData.ICU_RBNF_BASE_NAME, - ICUData.ICU_TRANSLIT_BASE_NAME - })); + return new LinkedHashSet( + Arrays.asList( + new String[] { + ICUData.ICU_BASE_NAME, + ICUData.ICU_BRKITR_BASE_NAME, + ICUData.ICU_COLLATION_BASE_NAME, + ICUData.ICU_RBNF_BASE_NAME, + ICUData.ICU_TRANSLIT_BASE_NAME + })); } - private static void gatherData(String baseName) { ULocale[] availableULocales; try { - availableULocales = ICUResourceBundle.getAvailableULocales(baseName, ICUResourceBundle.ICU_DATA_CLASS_LOADER); + availableULocales = + ICUResourceBundle.getAvailableULocales( + baseName, ICUResourceBundle.ICU_DATA_CLASS_LOADER); } catch (final Exception e) { 
e.printStackTrace(); System.out.println("*** Unable to load " + baseName); @@ -69,26 +70,27 @@ private static void addStrings(UResourceBundle rs) { keyCounter.add(key, 1); } switch (rs.getType()) { - case UResourceBundle.STRING: - counter.add(rs.getString(), 1); - break; - case UResourceBundle.ARRAY: - case UResourceBundle.TABLE: - for (int i = 0; i < rs.getSize(); ++i) { - final UResourceBundle rs2 = rs.get(i); - addStrings(rs2); - } - break; - case UResourceBundle.BINARY: - case UResourceBundle.INT: - case UResourceBundle.INT_VECTOR: // skip - break; - default: - throw new IllegalArgumentException("Unknown Option: " + rs.getType()); + case UResourceBundle.STRING: + counter.add(rs.getString(), 1); + break; + case UResourceBundle.ARRAY: + case UResourceBundle.TABLE: + for (int i = 0; i < rs.getSize(); ++i) { + final UResourceBundle rs2 = rs.get(i); + addStrings(rs2); + } + break; + case UResourceBundle.BINARY: + case UResourceBundle.INT: + case UResourceBundle.INT_VECTOR: // skip + break; + default: + throw new IllegalArgumentException("Unknown Option: " + rs.getType()); } } - private static void printData(Counter counter2, boolean showKeys) throws UnsupportedEncodingException { + private static void printData(Counter counter2, boolean showKeys) + throws UnsupportedEncodingException { long totalUtf16Size = 0; long totalUtf8Size = 0; long totalScsuSize = 0; @@ -99,10 +101,10 @@ private static void printData(Counter counter2, boolean showKeys) throws for (final String key : counter2.getKeysetSortedByCount(false)) { final long count = counter2.getCount(key); if (showKeys) { - final String trunc = key.length() < 20 ? key : key.substring(0,19) + "..."; + final String trunc = key.length() < 20 ? 
key : key.substring(0, 19) + "..."; System.out.println(count + "\t" + trunc); } - final long utf16Length = (key.length()+1) * 2; + final long utf16Length = (key.length() + 1) * 2; uniqueUtf16Size += utf16Length; totalUtf16Size += utf16Length * count; @@ -126,7 +128,8 @@ private static void printData(Counter counter2, boolean showKeys) throws System.out.println("Total Unique Size (SCSU):\t" + nf.format(uniqueScsuSize) + " bytes"); } - private static void printDataCompressed(Counter counter2, boolean show) throws UnsupportedEncodingException { + private static void printDataCompressed(Counter counter2, boolean show) + throws UnsupportedEncodingException { long uniqueUtf8Size = 0; long savedUtf8Size = 0; long savedUtf8SingleSize = 0; @@ -138,7 +141,7 @@ private static void printDataCompressed(Counter counter2, boolean show) String lastKey = ""; for (final String key : count_key) { final long count = counter2.getCount(key); - final String trunc = key.length() < 20 ? key : key.substring(0,19) + "..."; + final String trunc = key.length() < 20 ? 
key : key.substring(0, 19) + "..."; if (show) { System.out.print(count + "\t" + trunc); } @@ -152,7 +155,8 @@ private static void printDataCompressed(Counter counter2, boolean show) if (show) { System.out.print("\tSKIP SINGLE"); } - } else if (key.regionMatches(0, lastKey, lastKey.length() - key.length(), key.length())) { + } else if (key.regionMatches( + 0, lastKey, lastKey.length() - key.length(), key.length())) { savedUtf8Size += utf8Length; if (show) { System.out.print("\tSKIP"); @@ -165,16 +169,30 @@ private static void printDataCompressed(Counter counter2, boolean show) } } System.out.println("Total Unique Size:\t" + nf.format(uniqueUtf8Size) + " bytes"); - System.out.println("Total Saved Shared Suffix Size:\t" + nf.format(savedUtf8Size) + " bytes\t" - + savedUtf8Size/(double)(uniqueUtf8Size)); - System.out.println("Total Saved Singleton Size:\t" + nf.format(savedUtf8SingleSize) + " bytes\t" - + savedUtf8SingleSize/(double)(uniqueUtf8Size)); + System.out.println( + "Total Saved Shared Suffix Size:\t" + + nf.format(savedUtf8Size) + + " bytes\t" + + savedUtf8Size / (double) (uniqueUtf8Size)); + System.out.println( + "Total Saved Singleton Size:\t" + + nf.format(savedUtf8SingleSize) + + " bytes\t" + + savedUtf8SingleSize / (double) (uniqueUtf8Size)); System.out.println("Character Frequencies"); int i = 0; for (final Character key : charCount.getKeysetSortedByCount(false)) { final long count = charCount.getCount(key); - System.out.println(++i + "\t" + count + "\t" + key + "\t" + Utility.hex(key) + (alphanum.contains(key) ? "" : "\t!Alphanum")); + System.out.println( + ++i + + "\t" + + count + + "\t" + + key + + "\t" + + Utility.hex(key) + + (alphanum.contains(key) ? 
"" : "\t!Alphanum")); if (i > 99) { break; } @@ -192,6 +210,7 @@ private static void countChars(Counter charCount, String key, long va static Counter counter = new Counter(); static Counter keyCounter = new Counter(); static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH); + static { nf.setGroupingUsed(true); } @@ -214,5 +233,6 @@ public int compare(String o1, String o2) { } } } + static Comparator REVERSE = new SortReverseLengthFirst(); } diff --git a/unicodetools/src/main/java/org/unicode/draft/CldrUtility.java b/unicodetools/src/main/java/org/unicode/draft/CldrUtility.java index d8c925a0f..54cf0c622 100644 --- a/unicodetools/src/main/java/org/unicode/draft/CldrUtility.java +++ b/unicodetools/src/main/java/org/unicode/draft/CldrUtility.java @@ -8,7 +8,15 @@ ********************************************************************** */ - +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.DateFormat; +import com.ibm.icu.text.SimpleDateFormat; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.Freezable; +import com.ibm.icu.util.TimeZone; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; @@ -35,40 +43,34 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.TransliteratorUtilities; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.DateFormat; -import com.ibm.icu.text.SimpleDateFormat; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.Freezable; -import com.ibm.icu.util.TimeZone; - public class CldrUtility { - public static final String LINE_SEPARATOR = "\n"; //System.getProperty("line.separator"); + public static final String 
LINE_SEPARATOR = "\n"; // System.getProperty("line.separator"); /** - * Very simple class, used to replace variables in a string. For example

-

static VariableReplacer langTag = new VariableReplacer()
-            .add("$alpha", "[a-zA-Z]")
-            .add("$digit", "[0-9]")
-            .add("$alphanum", "[a-zA-Z0-9]")
-            .add("$x", "[xX]");
-            ...
-            String langTagPattern = langTag.replace(...);
-    
+ * Very simple class, used to replace variables in a string. For example + * + *

+ * + *

static VariableReplacer langTag = new VariableReplacer()
+     * .add("$alpha", "[a-zA-Z]")
+     * .add("$digit", "[0-9]")
+     * .add("$alphanum", "[a-zA-Z0-9]")
+     * .add("$x", "[xX]");
+     * ...
+     * String langTagPattern = langTag.replace(...);
+     * 
*/ public static class VariableReplacer { // simple implementation for now - private final Map m = new TreeMap(Collections.reverseOrder()); + private final Map m = + new TreeMap(Collections.reverseOrder()); /** * Add a new variable + * * @param variable * @param value * @return @@ -79,6 +81,7 @@ public VariableReplacer add(String variable, String value) { } /** * Replace all of the variables in the source, recursively. + * * @param source * @return */ @@ -92,6 +95,7 @@ public String replace(String source) { } while (!source.equals(oldSource)); return source; } + public static String replaceAll(String source, String key, String value) { return source.replace(key, value); } @@ -101,8 +105,7 @@ static String getPath(String path, String filename) { if (path == null) { return null; } - final File file = filename == null ? new File(path) - : new File(path, filename); + final File file = filename == null ? new File(path) : new File(path, filename); try { return file.getCanonicalPath() + File.separatorChar; } catch (final IOException e) { @@ -115,40 +118,63 @@ static String getPath(String path) { } static final boolean DEBUG_SHOW_BAT = false; - /** default working directory for Eclipse is . = ${workspace_loc:cldr}, which is /tools/java/ */ + /** + * default working directory for Eclipse is . 
= ${workspace_loc:cldr}, which is + * /tools/java/ + */ // set the base directory with -Dcldrdata= // if the main is different, use -Dcldrmain= - public static final String BASE_DIRECTORY = getPath(CldrUtility.getProperty("CLDR_DIR", null)); // new File(Utility.getProperty("CLDR_DIR", null)).getPath(); // get up to - public static final String UTIL_DATA_DIR = getPath(BASE_DIRECTORY, "tools/java/org/unicode/cldr/util/data/"); // "C:/ICU4C/locale/tools/java/org/unicode/cldr/util/"; + public static final String BASE_DIRECTORY = + getPath( + CldrUtility.getProperty( + "CLDR_DIR", + null)); // new File(Utility.getProperty("CLDR_DIR", null)).getPath(); + // // get up to + + public static final String UTIL_DATA_DIR = + getPath( + BASE_DIRECTORY, + "tools/java/org/unicode/cldr/util/data/"); // "C:/ICU4C/locale/tools/java/org/unicode/cldr/util/"; public static final String UTIL_CLASS_DIR = "org.unicode.cldr.util"; - public static final String COMMON_DIRECTORY = getPath(BASE_DIRECTORY , "common/"); - public static final String MAIN_DIRECTORY = CldrUtility.getProperty("CLDR_MAIN", getPath(CldrUtility.COMMON_DIRECTORY, "main")); - public static final String GEN_DIRECTORY = getPath(CldrUtility.getProperty("CLDR_GEN_DIR", getPath(BASE_DIRECTORY , "../Generated/cldr/"))); + public static final String COMMON_DIRECTORY = getPath(BASE_DIRECTORY, "common/"); + public static final String MAIN_DIRECTORY = + CldrUtility.getProperty("CLDR_MAIN", getPath(CldrUtility.COMMON_DIRECTORY, "main")); + public static final String GEN_DIRECTORY = + getPath( + CldrUtility.getProperty( + "CLDR_GEN_DIR", getPath(BASE_DIRECTORY, "../Generated/cldr/"))); /** * @deprecated please use XMLFile and CLDRFILE getSupplementalDirectory() * @see DEFAULT_SUPPLEMENTAL_DIRECTORY */ @Deprecated - public static final String SUPPLEMENTAL_DIRECTORY = getPath(COMMON_DIRECTORY , "supplemental/"); - /** - * Only the default, if no other directory is specified. 
- */ - public static final String DEFAULT_SUPPLEMENTAL_DIRECTORY = getPath(COMMON_DIRECTORY , "supplemental/"); - public static final String CHART_DIRECTORY = getPath(GEN_DIRECTORY , "diff/"); - public static final String TEST_DIR = getPath(CldrUtility.BASE_DIRECTORY, "test/"); + public static final String SUPPLEMENTAL_DIRECTORY = getPath(COMMON_DIRECTORY, "supplemental/"); + /** Only the default, if no other directory is specified. */ + public static final String DEFAULT_SUPPLEMENTAL_DIRECTORY = + getPath(COMMON_DIRECTORY, "supplemental/"); + public static final String CHART_DIRECTORY = getPath(GEN_DIRECTORY, "diff/"); + public static final String TEST_DIR = getPath(CldrUtility.BASE_DIRECTORY, "test/"); /** If the generated BAT files are to work, this needs to be set right */ public static final String COMPARE_PROGRAM = "\"C:\\Program Files\\Compare It!\\wincmp3.exe\""; - public static final List MINIMUM_LANGUAGES = Arrays.asList(new String[] {"ar", "en", "de", "fr", "hi", "it", "es", "pt", "ru", "zh", "ja"}); // plus language itself - public static final List MINIMUM_TERRITORIES = Arrays.asList(new String[] {"US", "GB", "DE", "FR", "IT", "JP", "CN", "IN", "RU", "BR"}); + public static final List MINIMUM_LANGUAGES = + Arrays.asList( + new String[] { + "ar", "en", "de", "fr", "hi", "it", "es", "pt", "ru", "zh", "ja" + }); // plus language itself + public static final List MINIMUM_TERRITORIES = + Arrays.asList( + new String[] {"US", "GB", "DE", "FR", "IT", "JP", "CN", "IN", "RU", "BR"}); public interface LineComparer { static final int LINES_DIFFERENT = -1, LINES_SAME = 0, SKIP_FIRST = 1, SKIP_SECOND = 2; /** - * Returns LINES_DIFFERENT, LINES_SAME, or if one of the lines is ignorable, SKIP_FIRST or SKIP_SECOND + * Returns LINES_DIFFERENT, LINES_SAME, or if one of the lines is ignorable, SKIP_FIRST or + * SKIP_SECOND + * * @param line1 * @param line2 * @return @@ -161,9 +187,11 @@ public static class SimpleLineComparator implements LineComparer { StringIterator si1 
= new StringIterator(); StringIterator si2 = new StringIterator(); int flags; + public SimpleLineComparator(int flags) { this.flags = flags; } + @Override public int compare(String line1, String line2) { // first, see if we want to skip one or the other lines @@ -171,26 +199,25 @@ public int compare(String line1, String line2) { if (line1 == null) { skipper = SKIP_FIRST; } else { - if ((flags & TRIM)!= 0) { + if ((flags & TRIM) != 0) { line1 = line1.trim(); } - if ((flags & SKIP_EMPTY)!= 0 && line1.length() == 0) { + if ((flags & SKIP_EMPTY) != 0 && line1.length() == 0) { skipper = SKIP_FIRST; } } if (line2 == null) { skipper = SKIP_SECOND; } else { - if ((flags & TRIM)!= 0) { + if ((flags & TRIM) != 0) { line2 = line2.trim(); } - if ((flags & SKIP_EMPTY)!= 0 && line2.length() == 0) { + if ((flags & SKIP_EMPTY) != 0 && line2.length() == 0) { skipper += SKIP_SECOND; } } if (skipper != 0) { - if (skipper == SKIP_FIRST + SKIP_SECOND) - { + if (skipper == SKIP_FIRST + SKIP_SECOND) { return LINES_SAME; // ok, don't skip both } return skipper; @@ -232,14 +259,18 @@ public int compare(String line1, String line2) { } // private Matcher dtdMatcher = Pattern.compile( - // "\\Q\\E").matcher(""); + // "\\Q\\E").matcher(""); - //private String[] CVS_TAGS = {"Revision", "Date"}; - private final Matcher[] CVS_TAGS = {Pattern.compile("[$](Revision)[^$]*[$]").matcher(""), Pattern.compile("[$](Date)[^$]*[$]").matcher("")}; + // private String[] CVS_TAGS = {"Revision", "Date"}; + private final Matcher[] CVS_TAGS = { + Pattern.compile("[$](Revision)[^$]*[$]").matcher(""), + Pattern.compile("[$](Date)[^$]*[$]").matcher("") + }; private String stripTags(String line) { - //$ Revision: 1.1 $ - //$ Date: 2010-05-08 01:06:12 $ + // $ Revision: 1.1 $ + // $ Date: 2010-05-08 01:06:12 $ for (final Matcher element : CVS_TAGS) { element.reset(line); final boolean foo = element.find(); @@ -247,11 +278,9 @@ private String stripTags(String line) { } return line; } - } /** - * * @param file1 * 
@param file2 * @param failureLines on input, String[2], on output, failing lines @@ -259,11 +288,12 @@ private String stripTags(String line) { * @return * @throws IOException */ - public static boolean areFileIdentical(String file1, String file2, String[] failureLines, - LineComparer lineComparer) throws IOException { - final BufferedReader br1 = new BufferedReader(new FileReader(file1), 32*1024); + public static boolean areFileIdentical( + String file1, String file2, String[] failureLines, LineComparer lineComparer) + throws IOException { + final BufferedReader br1 = new BufferedReader(new FileReader(file1), 32 * 1024); try { - final BufferedReader br2 = new BufferedReader(new FileReader(file2), 32*1024); + final BufferedReader br2 = new BufferedReader(new FileReader(file2), 32 * 1024); try { String line1 = ""; String line2 = ""; @@ -299,10 +329,11 @@ public static boolean areFileIdentical(String file1, String file2, String[] fail } public static void registerExtraTransliterators() { - final String tzadir = UTIL_DATA_DIR + File.separatorChar; // work around bad pattern (dir+filename) + final String tzadir = + UTIL_DATA_DIR + File.separatorChar; // work around bad pattern (dir+filename) // HACK around lack of Armenian, Ethiopic TransliteratorUtilities.registerTransliteratorFromFile(tzadir, "Latin-Armenian"); - //TransliteratorUtilities.registerTransliteratorFromFile(tzadir, "Latin-Ethiopic"); + // TransliteratorUtilities.registerTransliteratorFromFile(tzadir, "Latin-Ethiopic"); TransliteratorUtilities.registerTransliteratorFromFile(tzadir, "Cyrillic-Latin"); TransliteratorUtilities.registerTransliteratorFromFile(tzadir, "Arabic-Latin"); // needed @@ -317,8 +348,6 @@ public static void registerExtraTransliterators() { TransliteratorUtilities.registerTransliteratorFromFile(tzadir, "Lao-Latin"); } - - /* static String getLineWithoutFluff(BufferedReader br1, boolean first, int flags) throws IOException { while (true) { @@ -331,9 +360,10 @@ static String 
getLineWithoutFluff(BufferedReader br1, boolean first, int flags) } */ - public final static class StringIterator { + public static final class StringIterator { String string; int position = 0; + char next() { while (true) { if (position >= string.length()) { @@ -345,15 +375,18 @@ char next() { } } } + StringIterator reset() { position = 0; return this; } + StringIterator set(String string) { this.string = string; position = 0; return this; } + boolean matches(StringIterator other) { while (true) { final char c1 = next(); @@ -374,11 +407,22 @@ public int getPosition() { } } - static public void generateBat(String sourceDir, String sourceFile, String targetDir, String targetFile) { - generateBat( sourceDir, sourceFile, targetDir, targetFile, new CldrUtility.SimpleLineComparator(0)); + public static void generateBat( + String sourceDir, String sourceFile, String targetDir, String targetFile) { + generateBat( + sourceDir, + sourceFile, + targetDir, + targetFile, + new CldrUtility.SimpleLineComparator(0)); } - static public void generateBat(String sourceDir, String sourceFile, String targetDir, String targetFile, LineComparer lineComparer) { + public static void generateBat( + String sourceDir, + String sourceFile, + String targetDir, + String targetFile, + LineComparer lineComparer) { try { final String batDir = targetDir + "diff" + File.separator; final String batName = targetFile + ".bat"; @@ -398,9 +442,12 @@ static public void generateBat(String sourceDir, String sourceFile, String targe } else if (!areFileIdentical(fullSource, fullTarget, failureLines, lineComparer)) { final PrintWriter bat = FileUtilities.openUTF8Writer(batDir, batName); try { - bat.println(COMPARE_PROGRAM + " " + - new File(fullSource).getCanonicalPath() + " " + - new File(fullTarget).getCanonicalPath()); + bat.println( + COMPARE_PROGRAM + + " " + + new File(fullSource).getCanonicalPath() + + " " + + new File(fullTarget).getCanonicalPath()); } finally { bat.close(); } @@ -443,7 +490,8 @@ public 
static List splitList(String source, char separator, boolean trim return splitList(source, separator, trim, null); } - public static List splitList(String source, char separator, boolean trim, List output) { + public static List splitList( + String source, char separator, boolean trim, List output) { if (output == null) { output = new ArrayList(); } @@ -461,35 +509,34 @@ public static List splitList(String source, char separator, boolean trim piece = piece.trim(); } output.add(piece); - pos = npos+1; + pos = npos + 1; } while (pos < source.length()); return output; } /** - * Protect a collection (as much as Java lets us!) from modification. - * Really, really ugly code, since Java doesn't let us do better. + * Protect a collection (as much as Java lets us!) from modification. Really, really ugly code, + * since Java doesn't let us do better. */ public static T protectCollection(T source) { // TODO - exclude UnmodifiableMap, Set, ... if (source instanceof Map) { final Map sourceMap = (Map) source; final Map resultMap = clone(sourceMap); - if (resultMap == null) - { + if (resultMap == null) { return (T) sourceMap; // failed } resultMap.clear(); for (final Object key : sourceMap.keySet()) { resultMap.put(protectCollection(key), protectCollection(sourceMap.get(key))); } - return resultMap instanceof SortedMap ? (T) Collections.unmodifiableSortedMap((SortedMap)resultMap) + return resultMap instanceof SortedMap + ? (T) Collections.unmodifiableSortedMap((SortedMap) resultMap) : (T) Collections.unmodifiableMap(resultMap); } else if (source instanceof Collection) { final Collection sourceCollection = (Collection) source; final Collection resultCollection = clone(sourceCollection); - if (resultCollection == null) - { + if (resultCollection == null) { return (T) sourceCollection; // failed } resultCollection.clear(); @@ -498,16 +545,19 @@ public static T protectCollection(T source) { resultCollection.add(protectCollection(item)); } - return sourceCollection instanceof List ? 
(T)Collections.unmodifiableList((List)sourceCollection) - : sourceCollection instanceof SortedSet ? (T)Collections.unmodifiableSortedSet((SortedSet)sourceCollection) - : sourceCollection instanceof Set ? (T)Collections.unmodifiableSet((Set)sourceCollection) - : (T)Collections.unmodifiableCollection(sourceCollection); + return sourceCollection instanceof List + ? (T) Collections.unmodifiableList((List) sourceCollection) + : sourceCollection instanceof SortedSet + ? (T) Collections.unmodifiableSortedSet((SortedSet) sourceCollection) + : sourceCollection instanceof Set + ? (T) Collections.unmodifiableSet((Set) sourceCollection) + : (T) Collections.unmodifiableCollection(sourceCollection); } else if (source instanceof Freezable) { final Freezable freezableSource = (Freezable) source; if (freezableSource.isFrozen()) { return source; } - return (T)((Freezable)(freezableSource.cloneAsThawed())).freeze(); + return (T) ((Freezable) (freezableSource.cloneAsThawed())).freeze(); } else { return source; // can't protect } @@ -515,6 +565,7 @@ public static T protectCollection(T source) { /** * Clones T if we can; otherwise returns null. 
+ * * @param * @param source * @return @@ -522,15 +573,14 @@ public static T protectCollection(T source) { public static T clone(T source) { try { final Class class1 = source.getClass(); - final Method declaredMethod = class1.getDeclaredMethod("clone", (Class)null); - return (T) declaredMethod.invoke(source, (Object)null); + final Method declaredMethod = class1.getDeclaredMethod("clone", (Class) null); + return (T) declaredMethod.invoke(source, (Object) null); } catch (final Exception e) { return null; // uncloneable } } - /** Appends two strings, inserting separator if either is empty - */ + /** Appends two strings, inserting separator if either is empty */ public static String joinWithSeparation(String a, String separator, String b) { if (a.length() == 0) { return b; @@ -541,10 +591,10 @@ public static String joinWithSeparation(String a, String separator, String b) { return a + separator + b; } - /** Appends two strings, inserting separator if either is empty. Modifies first map - */ - public static Map joinWithSeparation(Map a, String separator, Map b) { - for (final Iterator it = b.keySet().iterator(); it.hasNext();) { + /** Appends two strings, inserting separator if either is empty. 
Modifies first map */ + public static Map joinWithSeparation( + Map a, String separator, Map b) { + for (final Iterator it = b.keySet().iterator(); it.hasNext(); ) { final Object key = it.next(); String bvalue = (String) b.get(key); final String avalue = a.get(key); @@ -587,18 +637,17 @@ public static String join(Object[] c, String separator) { return output.toString(); } - - /** - * Utility like Arrays.asList() - */ + /** Utility like Arrays.asList() */ public static Map asMap(Object[][] source, Map target, boolean reverse) { int from = 0, to = 1; if (reverse) { - from = 1; to = 0; + from = 1; + to = 0; } for (final Object[] element : source) { if (element.length != 2) { - throw new IllegalArgumentException("Source must be array of pairs of strings: " + Arrays.asList(element)); + throw new IllegalArgumentException( + "Source must be array of pairs of strings: " + Arrays.asList(element)); } target.put(element[from], element[to]); } @@ -609,20 +658,16 @@ public static Map asMap(Object[][] source) { return asMap(source, new HashMap(), false); } - /** - * Utility that ought to be on Map - */ + /** Utility that ought to be on Map */ public static Map removeAll(Map m, Collection itemsToRemove) { - for (final Iterator it = itemsToRemove.iterator(); it.hasNext();) { + for (final Iterator it = itemsToRemove.iterator(); it.hasNext(); ) { final Object item = it.next(); m.remove(item); } return m; } - /** - * Returns the canonical name for a file. - */ + /** Returns the canonical name for a file. */ public static String getCanonicalName(String file) { try { return new File(file).getCanonicalPath(); @@ -632,62 +677,59 @@ public static String getCanonicalName(String file) { } /** - * Convert a UnicodeSet into a string that can be embedded into a Regex. Handles strings that are in the UnicodeSet, Supplementary ranges, and escaping - * @param source The source set - * @param escaper A transliterator that is used to escape the characters according to the requirements of the regex. 
+ * Convert a UnicodeSet into a string that can be embedded into a Regex. Handles strings that + * are in the UnicodeSet, Supplementary ranges, and escaping + * + * @param source The source set + * @param escaper A transliterator that is used to escape the characters according to the + * requirements of the regex. * @return */ public static String toRegex(UnicodeSet source) { return toRegex(source, null, false); } - private static final Transliterator DEFAULT_REGEX_ESCAPER = Transliterator.createFromRules( - "foo", - "([ \\- \\\\ \\[ \\] ]) > '\\' $1 ;" - // + " ([:c:]) > &hex($1);" - + " ([[:control:][[:z:]&[:ascii:]]]) > &hex($1);", - Transliterator.FORWARD); + private static final Transliterator DEFAULT_REGEX_ESCAPER = + Transliterator.createFromRules( + "foo", + "([ \\- \\\\ \\[ \\] ]) > '\\' $1 ;" + // + " ([:c:]) > &hex($1);" + + " ([[:control:][[:z:]&[:ascii:]]]) > &hex($1);", + Transliterator.FORWARD); /** - * Convert a UnicodeSet into a string that can be embedded into a Regex. - * Handles strings that are in the UnicodeSet, Supplementary ranges, and - * escaping - * - * @param source - * The source set - * @param escaper - * A transliterator that is used to escape the characters according - * to the requirements of the regex. The default puts a \\ before [, -, - * \, and ], and converts controls and Ascii whitespace to hex. - * Alternatives can be supplied. Note that some Regex engines, - * including Java 1.5, don't really deal with escaped supplementaries - * well. - * @param onlyBmp - * Set to true if the Regex only accepts BMP characters. In that - * case, ranges of supplementary characters are converted to lists of - * ranges. For example, [\uFFF0-\U0010000F \U0010100F-\U0010300F] - * converts into: - *
+     * Convert a UnicodeSet into a string that can be embedded into a Regex. Handles strings that
+     * are in the UnicodeSet, Supplementary ranges, and escaping
+     *
+     * @param source The source set
+     * @param escaper A transliterator that is used to escape the characters according to the
+     *     requirements of the regex. The default puts a \\ before [, -, \, and ], and converts
+     *     controls and Ascii whitespace to hex. Alternatives can be supplied. Note that some Regex
+     *     engines, including Java 1.5, don't really deal with escaped supplementaries well.
+     * @param onlyBmp Set to true if the Regex only accepts BMP characters. In that case, ranges of
+     *     supplementary characters are converted to lists of ranges. For example,
+     *     [\uFFF0-\U0010000F \U0010100F-\U0010300F] converts into:
+     *     
      *          [\uD800][\uDC00-\uDFFF]
      *          [\uD801-\uDBBF][\uDC00-\uDFFF]
      *          [\uDBC0][\uDC00-\uDC0F]
- * and
+     *     and
+     *     
      *          [\uDBC4][\uDC0F-\uDFFF]
      *          [\uDBC5-\uDBCB][\uDC00-\uDFFF]
      *          [\uDBCC][\uDC00-\uDC0F]
      *          
- * These are then coalesced into a list of alternatives by sharing - * parts where feasible. For example, the above turns into 3 pairs of ranges: - *
+     *     These are then coalesced into a list of alternatives by sharing parts where feasible. For
+     *     example, the above turns into 3 pairs of ranges:
+     *     
      *          [\uDBC0\uDBCC][\uDC00-\uDC0F]|\uDBC4[\uDC0F-\uDFFF]|[\uD800-\uDBBF\uDBC5-\uDBCB][\uDC00-\uDFFF]
      *          
- * - * @return escaped string. Something like [a-z] or (?:[a-m]|{zh}) if there is - * a string zh in the set, or a more complicated case for - * supplementaries.
- * Special cases: [] returns "", single item returns a string - * (escaped), like [a] => "a", or [{abc}] => "abc"
- * Supplementaries are handled specially, as described under onlyBmp. + * + * @return escaped string. Something like [a-z] or (?:[a-m]|{zh}) if there is a string zh in the + * set, or a more complicated case for supplementaries.
+ * Special cases: [] returns "", single item returns a string (escaped), like [a] => "a", or + * [{abc}] => "abc"
+ * Supplementaries are handled specially, as described under onlyBmp. */ public static String toRegex(UnicodeSet source, Transliterator escaper, boolean onlyBmp) { if (escaper == null) { @@ -705,9 +747,10 @@ public static String toRegex(UnicodeSet source, Transliterator escaper, boolean // otherwise, we figure out what is in the set, and will return final StringBuilder base = new StringBuilder("["); final StringBuilder alternates = new StringBuilder(); - final Map lastToFirst = new TreeMap(new UnicodeSetComparator()); + final Map lastToFirst = + new TreeMap(new UnicodeSetComparator()); int alternateCount = 0; - while(it.nextRange()) { + while (it.nextRange()) { if (it.codepoint == UnicodeSetIterator.IS_STRING) { ++alternateCount; alternates.append('|').append(escaper.transliterate(it.string)); @@ -718,7 +761,8 @@ public static String toRegex(UnicodeSet source, Transliterator escaper, boolean addBmpRange(it.codepoint, 0xFFFF, escaper, base); it.codepoint = 0x10000; // reset the range } - // this gets a bit ugly; we are trying to minimize the extra ranges for supplementaries + // this gets a bit ugly; we are trying to minimize the extra ranges for + // supplementaries // we do this by breaking up X-Y based on the Lead and Trail values for X and Y // Lx [Tx - Ty]) (if Lx == Ly) // Lx [Tx - DFFF] | Ly [DC00-Ty] (if Lx == Ly - 1) @@ -732,7 +776,8 @@ public static String toRegex(UnicodeSet source, Transliterator escaper, boolean } else { addSupplementalRange(leadX, leadX, trailX, 0xDFFF, escaper, lastToFirst); if (leadX != leadY - 1) { - addSupplementalRange(leadX+1, leadY-1, 0xDC00, 0xDFFF, escaper, lastToFirst); + addSupplementalRange( + leadX + 1, leadY - 1, 0xDC00, 0xDFFF, escaper, lastToFirst); } addSupplementalRange(leadY, leadY, 0xDC00, trailY, escaper, lastToFirst); } @@ -742,7 +787,10 @@ public static String toRegex(UnicodeSet source, Transliterator escaper, boolean if (lastToFirst.size() != 0) { for (final UnicodeSet last : lastToFirst.keySet()) { 
++alternateCount; - alternates.append('|').append(toRegex(lastToFirst.get(last), escaper, onlyBmp)).append(toRegex(last, escaper, onlyBmp)); + alternates + .append('|') + .append(toRegex(lastToFirst.get(last), escaper, onlyBmp)) + .append(toRegex(last, escaper, onlyBmp)); } } // Return the output. We separate cases in order to get the minimal extra apparatus @@ -753,13 +801,23 @@ public static String toRegex(UnicodeSet source, Transliterator escaper, boolean return "(?:" + base + "|" + alternates.substring(1) + ")"; } else if (alternateCount == 1) { return alternates.substring(1); - }else { + } else { return "(?:" + alternates.substring(1) + ")"; } } - private static void addSupplementalRange(int leadX, int leadY, int trailX, int trailY, Transliterator escaper, Map lastToFirst) { - System.out.println("\tadding: " + new UnicodeSet(leadX, leadY) + "\t" + new UnicodeSet(trailX, trailY) ); + private static void addSupplementalRange( + int leadX, + int leadY, + int trailX, + int trailY, + Transliterator escaper, + Map lastToFirst) { + System.out.println( + "\tadding: " + + new UnicodeSet(leadX, leadY) + + "\t" + + new UnicodeSet(trailX, trailY)); final UnicodeSet last = new UnicodeSet(trailX, trailY); UnicodeSet first = lastToFirst.get(last); if (first == null) { @@ -768,7 +826,8 @@ private static void addSupplementalRange(int leadX, int leadY, int trailX, int t first.add(leadX, leadY); } - private static void addBmpRange(int start, int limit, Transliterator escaper, StringBuilder base) { + private static void addBmpRange( + int start, int limit, Transliterator escaper, StringBuilder base) { base.append(escaper.transliterate(UTF16.valueOf(start))); if (start != limit) { base.append("-").append(escaper.transliterate(UTF16.valueOf(limit))); @@ -782,7 +841,8 @@ public int compare(UnicodeSet o1, UnicodeSet o2) { } } - public static class CollectionComparator> implements Comparator> { + public static class CollectionComparator> + implements Comparator> { @Override public int 
compare(Collection o1, Collection o2) { return UnicodeSet.compare(o1, o2, UnicodeSet.ComparisonStyle.SHORTER_FIRST); @@ -798,20 +858,21 @@ public int compare(T arg0, T arg1) { public static void addTreeMapChain(Map coverageData, Object... objects) { Map base = coverageData; - for (int i = 0; i < objects.length-2; ++i) { + for (int i = 0; i < objects.length - 2; ++i) { Map nextOne = (Map) base.get(objects[i]); if (nextOne == null) { base.put(objects[i], nextOne = new TreeMap()); } base = nextOne; } - base.put(objects[objects.length-2], objects[objects.length-1]); + base.put(objects[objects.length - 2], objects[objects.length - 1]); } - public static abstract class Transform { + public abstract static class Transform { public abstract Object transform(Object source); + public Collection transform(Collection input, Collection output) { - for (final Iterator it = input.iterator(); it.hasNext();) { + for (final Iterator it = input.iterator(); it.hasNext(); ) { final Object result = transform(it.next()); if (result != null) { output.add(result); @@ -819,13 +880,15 @@ public Collection transform(Collection input, Collection output) } return output; } + public Collection transform(Collection input) { return transform(input, new ArrayList()); } } - public static abstract class Apply { + public abstract static class Apply { public abstract void apply(T item); + public > void applyTo(U collection) { for (final T item : collection) { apply(item); @@ -833,12 +896,12 @@ public > void applyTo(U collection) { } } - public static abstract class Filter { + public abstract static class Filter { public abstract boolean contains(T item); public > U retainAll(U c) { - for (final Iterator it = c.iterator(); it.hasNext();) { + for (final Iterator it = c.iterator(); it.hasNext(); ) { if (!contains(it.next())) { it.remove(); } @@ -856,7 +919,7 @@ public > U extractMatches(U c, U target) { } public > U removeAll(U c) { - for (final Iterator it = c.iterator(); it.hasNext();) { + for (final 
Iterator it = c.iterator(); it.hasNext(); ) { if (contains(it.next())) { it.remove(); } @@ -876,20 +939,25 @@ public > U extractNonMatches(U c, U target) { public static class MatcherFilter extends Filter { private Matcher matcher; + public MatcherFilter(String pattern) { this.matcher = Pattern.compile(pattern).matcher(""); } + public MatcherFilter(Matcher matcher) { this.matcher = matcher; } + public MatcherFilter set(Matcher matcher) { this.matcher = matcher; return this; } + public MatcherFilter set(String pattern) { this.matcher = Pattern.compile(pattern).matcher(""); return this; } + @Override public boolean contains(T o) { return matcher.reset(o.toString()).matches(); @@ -898,34 +966,44 @@ public boolean contains(T o) { /** * Fetch data from jar + * * @param name name of thing to load (org.unicode.cldr.util.name) */ - static public BufferedReader getUTF8Data(String name) throws java.io.IOException { + public static BufferedReader getUTF8Data(String name) throws java.io.IOException { java.io.InputStream is = null; try { is = - com.ibm.icu.impl.ICUData.getRequiredStream(Class.forName(CldrUtility.UTIL_CLASS_DIR+".CldrUtility"), "data/" + name); + com.ibm.icu.impl.ICUData.getRequiredStream( + Class.forName(CldrUtility.UTIL_CLASS_DIR + ".CldrUtility"), + "data/" + name); } catch (final ClassNotFoundException cnf) { - throw new FileNotFoundException("Couldn't load " + CldrUtility.UTIL_CLASS_DIR + "." + name + " - ClassNotFoundException." + cnf.toString()); + throw new FileNotFoundException( + "Couldn't load " + + CldrUtility.UTIL_CLASS_DIR + + "." + + name + + " - ClassNotFoundException." 
+ + cnf.toString()); // .initCause(cnf); } catch (final java.util.MissingResourceException mre) { // try file return FileUtilities.openUTF8Reader(CldrUtility.UTIL_DATA_DIR + File.separator, name); } - return new java.io.BufferedReader( new java.io.InputStreamReader(is,"UTF-8") ); + return new java.io.BufferedReader(new java.io.InputStreamReader(is, "UTF-8")); } /** * Takes a Map that goes from Object to Set, and fills in the transpose + * * @param source_key_valueSet * @param output_value_key */ public static void putAllTransposed(Map source_key_valueSet, Map output_value_key) { - for (final Iterator it = source_key_valueSet.keySet().iterator(); it.hasNext();) { + for (final Iterator it = source_key_valueSet.keySet().iterator(); it.hasNext(); ) { final Object key = it.next(); final Set values = (Set) source_key_valueSet.get(key); - for (final Iterator it2 = values.iterator(); it2.hasNext();) { + for (final Iterator it2 = values.iterator(); it2.hasNext(); ) { final Object value = it2.next(); output_value_key.put(value, key); } @@ -950,7 +1028,8 @@ public static void registerTransliteratorFromFile(String id, String dir, String registerTransliteratorFromFile(id, dir, filename, Transliterator.REVERSE, true); } - public static void registerTransliteratorFromFile(String id, String dir, String filename, int direction, boolean reverseID) { + public static void registerTransliteratorFromFile( + String id, String dir, String filename, int direction, boolean reverseID) { if (filename == null) { filename = id.replace('-', '_'); filename = filename.replace('/', '_'); @@ -964,7 +1043,7 @@ public static void registerTransliteratorFromFile(String id, String dir, String rid = id + "-Any"; id = "Any-" + id; } else { - rid = id.substring(pos+1) + "-" + id.substring(0, pos); + rid = id.substring(pos + 1) + "-" + id.substring(0, pos); } if (!reverseID) { rid = id; @@ -978,10 +1057,10 @@ public static void registerTransliteratorFromFile(String id, String dir, String } /*String test = 
"\u049A\u0430\u0437\u0430\u049B"; - System.out.println(t.transliterate(test)); - t = Transliterator.getInstance(id); - System.out.println(t.transliterate(test)); - */ + System.out.println(t.transliterate(test)); + t = Transliterator.getInstance(id); + System.out.println(t.transliterate(test)); + */ if (direction == Transliterator.REVERSE) { Transliterator.unregister(rid); @@ -1012,7 +1091,9 @@ public static String getText(String dir, String filename) { final String rules = buffer.toString(); return rules; } catch (final IOException e) { - throw (IllegalArgumentException) new IllegalArgumentException("Can't open " + dir + ", " + filename).initCause(e); + throw (IllegalArgumentException) + new IllegalArgumentException("Can't open " + dir + ", " + filename) + .initCause(e); } } @@ -1046,7 +1127,7 @@ public static void showMethods(Class cls) throws ClassNotFoundException { continue; } final int mods = method.getModifiers(); - //if (!Modifier.isStatic(mods)) continue; + // if (!Modifier.isStatic(mods)) continue; final String name = method.getName(); names.add(name); } @@ -1056,13 +1137,17 @@ public static void showMethods(Class cls) throws ClassNotFoundException { } /** - * Breaks lines if they are too long, or if matcher.group(1) != last. Only breaks just before matcher. + * Breaks lines if they are too long, or if matcher.group(1) != last. Only breaks just before + * matcher. + * * @param input * @param separator - * @param matcher must match each possible item. The first group is significant; if different, will cause break + * @param matcher must match each possible item. 
The first group is significant; if different, + * will cause break * @return */ - static public String breakLines(CharSequence input, String separator, Matcher matcher, int width) { + public static String breakLines( + CharSequence input, String separator, Matcher matcher, int width) { final StringBuffer output = new StringBuffer(); String lastPrefix = ""; int lastEnd = 0; @@ -1075,10 +1160,11 @@ static public String breakLines(CharSequence input, String separator, Matcher ma break; } final String prefix = matcher.group(1); - if (!prefix.equals(lastPrefix) || matcher.end() - lastBreakPos > width) { // break before? + if (!prefix.equals(lastPrefix) + || matcher.end() - lastBreakPos > width) { // break before? output.append(separator); lastBreakPos = lastEnd; - } else if (lastEnd != 0){ + } else if (lastEnd != 0) { output.append(' '); } output.append(input.subSequence(lastEnd, matcher.end()).toString().trim()); @@ -1089,20 +1175,21 @@ static public String breakLines(CharSequence input, String separator, Matcher ma } public static void showOptions(String[] args) { - //Properties props = System.getProperties(); - System.out.println("Arguments: " + join(args," ")); // + (props == null ? "" : " " + props)); + // Properties props = System.getProperties(); + System.out.println( + "Arguments: " + join(args, " ")); // + (props == null ? 
"" : " " + props)); } public static double roundToDecimals(double input, int places) { final boolean negative = input < 0.0; if (negative) { - input = - input; + input = -input; } final double log10 = Math.log10(input); // 15000 => 4.xxx final double intLog10 = Math.floor(log10); final double scale = Math.pow(10, intLog10 - places + 1); double factored = Math.round(input / scale) * scale; - //System.out.println("###\t" +input + "\t" + factored); + // System.out.println("###\t" +input + "\t" + factored); if (negative) { factored = -factored; } @@ -1110,9 +1197,9 @@ public static double roundToDecimals(double input, int places) { } /** - * Get a property value, returning the value if there is one (eg -Dkey=value), - * otherwise the default value (for either empty or null). - * + * Get a property value, returning the value if there is one (eg -Dkey=value), otherwise the + * default value (for either empty or null). + * * @param key * @param valueIfNull * @param valueIfEmpty @@ -1122,18 +1209,15 @@ public static String getProperty(String key, String defaultValue) { return getProperty(key, defaultValue, defaultValue); } - /** - * Get a property value, returning the value if there is one, otherwise null. - */ + /** Get a property value, returning the value if there is one, otherwise null. */ public static String getProperty(String key) { return getProperty(key, null, null); } /** - * Get a property value, returning the value if there is one (eg -Dkey=value), - * the valueIfEmpty if there is one with no value (eg -Dkey) and the valueIfNull - * if there is no property. - * + * Get a property value, returning the value if there is one (eg -Dkey=value), the valueIfEmpty + * if there is one with no value (eg -Dkey) and the valueIfNull if there is no property. 
+ * * @param key * @param valueIfNull * @param valueIfEmpty @@ -1165,7 +1249,7 @@ public static String hex(byte[] bytes, int start, int end, String separator) { if (result.length() != 0) { result.append(separator); } - result.append(Utility.hex(bytes[i]&0xFF,2)); + result.append(Utility.hex(bytes[i] & 0xFF, 2)); } return result.toString(); } @@ -1182,7 +1266,8 @@ public static String checkValidDirectory(String sourceDirectory, String correcti return checkValidFile(sourceDirectory, true, correction); } - public static String checkValidFile(String sourceDirectory, boolean checkForDirectory, String correction) { + public static String checkValidFile( + String sourceDirectory, boolean checkForDirectory, String correction) { File file = null; String canonicalPath = null; try { @@ -1191,37 +1276,53 @@ public static String checkValidFile(String sourceDirectory, boolean checkForDire } catch (final Exception e) { } if (file == null || canonicalPath == null || checkForDirectory && !file.isDirectory()) { - throw new RuntimeException("Directory not found: " + sourceDirectory + (canonicalPath == null ? "" : " => " + canonicalPath) - + (correction == null ? "" : CldrUtility.LINE_SEPARATOR + correction)); + throw new RuntimeException( + "Directory not found: " + + sourceDirectory + + (canonicalPath == null ? "" : " => " + canonicalPath) + + (correction == null ? "" : CldrUtility.LINE_SEPARATOR + correction)); } return canonicalPath; } /** * Copy up to matching line (or rest of file, or skip). + * * @param oldFile file to copy from - * @param readUntilPattern pattern to stop at (or null to copy to end). Uses matches(), not find() + * @param readUntilPattern pattern to stop at (or null to copy to end). 
Uses matches(), not + * find() * @param output output, or null to just skip * @param includeMatchingLine if true, the matching line is copied also * @return the matching line if there is a match, null if the end of the file is reached * @throws IOException */ - public static String copyUpTo(BufferedReader oldFile, final Pattern readUntilPattern, - final PrintWriter output, boolean includeMatchingLine) throws IOException { + public static String copyUpTo( + BufferedReader oldFile, + final Pattern readUntilPattern, + final PrintWriter output, + boolean includeMatchingLine) + throws IOException { final Matcher readUntil = readUntilPattern == null ? null : readUntilPattern.matcher(""); return copyUpTo(oldFile, readUntil, output, includeMatchingLine); } /** * Copy up to matching line (or rest of file, or skip). + * * @param oldFile file to copy from - * @param readUntil Matcher for line to stop at (or null to copy to end). Uses matches(), not find(). After returning, the matcher has the groupings set. + * @param readUntil Matcher for line to stop at (or null to copy to end). Uses matches(), not + * find(). After returning, the matcher has the groupings set. 
* @param output output, or null to just skip * @param includeMatchingLine if true, the matching line is copied also * @return the matching line if there is a match, null if the end of the file is reached * @throws IOException */ - public static String copyUpTo(BufferedReader oldFile, Matcher readUntil, final PrintWriter output, boolean includeMatchingLine) throws IOException { + public static String copyUpTo( + BufferedReader oldFile, + Matcher readUntil, + final PrintWriter output, + boolean includeMatchingLine) + throws IOException { while (true) { String line = oldFile.readLine(); if (line == null) { @@ -1243,12 +1344,13 @@ public static String copyUpTo(BufferedReader oldFile, Matcher readUntil, final P } private static DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss 'GMT'"); + static { df.setTimeZone(TimeZone.getTimeZone("GMT")); } public static String isoFormat(Date date) { - synchronized(df) { + synchronized (df) { return df.format(date); } } diff --git a/unicodetools/src/main/java/org/unicode/draft/Cmudict.java b/unicodetools/src/main/java/org/unicode/draft/Cmudict.java index d15418313..4a17d2bcd 100644 --- a/unicodetools/src/main/java/org/unicode/draft/Cmudict.java +++ b/unicodetools/src/main/java/org/unicode/draft/Cmudict.java @@ -1,5 +1,11 @@ package org.unicode.draft; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; @@ -12,24 +18,18 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.text.utility.Settings; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Transliterator; -import 
com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; - public class Cmudict { static final String BASE_DIR = Settings.UnicodeTools.DATA_DIR + "translit/"; private static final String GEN_TRANSLIT_DIR = Settings.Output.GEN_DIR + "translit/"; static final Collator col = Collator.getInstance(ULocale.ROOT); - //static final StressFixer stressFixer = new StressFixer(); - static final Transliterator arpabet = getTransliteratorFromFile("arpabet-ipa", BASE_DIR, "arpabet-ipa.txt"); - static final Transliterator respell = getTransliteratorFromFile("ipa-en", BASE_DIR, "respell.txt"); + // static final StressFixer stressFixer = new StressFixer(); + static final Transliterator arpabet = + getTransliteratorFromFile("arpabet-ipa", BASE_DIR, "arpabet-ipa.txt"); + static final Transliterator respell = + getTransliteratorFromFile("ipa-en", BASE_DIR, "respell.txt"); public static void main(String[] args) throws IOException { @@ -37,10 +37,12 @@ public static void main(String[] args) throws IOException { final UnicodeSet WORD_OK = new UnicodeSet("[-.a-z’¹²³\\u0020]").freeze(); final Set funnyWords = new TreeSet(); - final Relation toIPA = new Relation(new TreeMap(col), LinkedHashSet.class); - final Relation fromIpa = new Relation(new TreeMap(col), TreeSet.class); + final Relation toIPA = + new Relation(new TreeMap(col), LinkedHashSet.class); + final Relation fromIpa = new Relation(new TreeMap(col), TreeSet.class); final Set ipaWithoutStress = new TreeSet(col); - final Relation ipaDifferingByStress = new Relation(new TreeMap(col), TreeSet.class); + final Relation ipaDifferingByStress = + new Relation(new TreeMap(col), TreeSet.class); final BufferedReader in = FileUtilities.openUTF8Reader(BASE_DIR, "cmudict.0.7a.txt"); while (true) { @@ -59,19 +61,22 @@ public static void main(String[] args) throws IOException { // continue; // } final int wordEnd = line.indexOf(' '); - final String word = line.substring(0,wordEnd).toLowerCase(Locale.ENGLISH) - .replace('\'', '’') - 
.replace('_', ' ') - .replace("(1)","") // ¹ - .replace("(2)","") // ² - .replace("(3)","") // ³ + final String word = + line.substring(0, wordEnd) + .toLowerCase(Locale.ENGLISH) + .replace('\'', '’') + .replace('_', ' ') + .replace("(1)", "") // ¹ + .replace("(2)", "") // ² + .replace("(3)", "") // ³ ; - if (!WORD_OK.containsAll(word) || SKIP_START.contains(word.codePointAt(0)) + if (!WORD_OK.containsAll(word) + || SKIP_START.contains(word.codePointAt(0)) || word.startsWith("’") && word.contains("quote")) { funnyWords.add(word); continue; } - final String pronunciation = line.substring(wordEnd+1); + final String pronunciation = line.substring(wordEnd + 1); final String ipa = getIpa(pronunciation); fromIpa.put(ipa, word); toIPA.put(word, ipa); @@ -81,7 +86,7 @@ public static void main(String[] args) throws IOException { if (!ipa.contains("ˈ")) { ipaWithoutStress.add(ipa); } - final String stresslessIpa = ipa.replace("ˈ","").replace("ˌ",""); + final String stresslessIpa = ipa.replace("ˈ", "").replace("ˌ", ""); ipaDifferingByStress.put(stresslessIpa, ipa); } in.close(); @@ -114,10 +119,10 @@ public static void main(String[] args) throws IOException { // } // } - System.out.println("Post-processing"); final Set removals = new HashSet(); - final Relation specials = new Relation(new TreeMap(col), LinkedHashSet.class); + final Relation specials = + new Relation(new TreeMap(col), LinkedHashSet.class); for (final Entry> entry : toIPA.keyValuesSet()) { final String word = entry.getKey(); @@ -132,7 +137,7 @@ public static void main(String[] args) throws IOException { newWord = newWord.substring(1); } if (endsWith) { - newWord = newWord.substring(0, newWord.length()-1); + newWord = newWord.substring(0, newWord.length() - 1); } final Collection values2 = toIPA.get(newWord); if (values2 == null) { @@ -141,7 +146,15 @@ public static void main(String[] args) throws IOException { // System.out.println("Values Match:\t" + word + "\t" + values + "\t" + values2); removals.add(word); 
} else { - System.out.println("Values Differ:\t" + word + "\t" + values + "\t" + newWord + "\t" + values2); + System.out.println( + "Values Differ:\t" + + word + + "\t" + + values + + "\t" + + newWord + + "\t" + + values2); } } toIPA.removeAll(removals); @@ -149,7 +162,7 @@ public static void main(String[] args) throws IOException { System.out.println("Missing?\t" + entry); } - PrintWriter out = FileUtilities.openUTF8Writer(GEN_TRANSLIT_DIR, "cmudict.txt") ; + PrintWriter out = FileUtilities.openUTF8Writer(GEN_TRANSLIT_DIR, "cmudict.txt"); for (final Entry> entry : toIPA.keyValuesSet()) { final String word = entry.getKey(); final Set values = entry.getValue(); @@ -157,7 +170,7 @@ public static void main(String[] args) throws IOException { } out.close(); - out = FileUtilities.openUTF8Writer(GEN_TRANSLIT_DIR, "homonyms.txt") ; + out = FileUtilities.openUTF8Writer(GEN_TRANSLIT_DIR, "homonyms.txt"); final Set temp = new TreeSet(col); for (final Entry> entry : fromIpa.keyValuesSet()) { final Set values = entry.getValue(); @@ -166,7 +179,7 @@ public static void main(String[] args) throws IOException { } temp.clear(); for (String value : values) { - value = value.replace("’",""); + value = value.replace("’", ""); temp.add(value); } if (temp.size() == 1) { @@ -193,14 +206,20 @@ public static void main(String[] args) throws IOException { final String otherIpa = reverseIpa.get(reversedRespelledKey); final String respelledOtherIpa = respell.transform(otherIpa); final String reversedRespelledOtherIpa = respell.transform(respelledOtherIpa); - System.out.println("Collision:" - + "\t" + ipa - + "\t" + respelledKey - + "\t" + reversedRespelledKey - + "\t" + otherIpa - + "\t" + respelledOtherIpa - + "\t" + reversedRespelledOtherIpa - ); + System.out.println( + "Collision:" + + "\t" + + ipa + + "\t" + + respelledKey + + "\t" + + reversedRespelledKey + + "\t" + + otherIpa + + "\t" + + respelledOtherIpa + + "\t" + + reversedRespelledOtherIpa); } reverseIpa.put(reversedRespelledKey, 
ipa); } @@ -208,16 +227,19 @@ public static void main(String[] args) throws IOException { out = FileUtilities.openUTF8Writer(GEN_TRANSLIT_DIR, "reversed.txt"); for (final Entry reversed_normal : reverseIpa.entrySet()) { final String original = reversed_normal.getValue(); - out.println(CollectionUtilities.join(fromIpa.get(original), ", ") + "\t{" - // + reversed_normal.getKey() + ", " - + original + "}"); + out.println( + CollectionUtilities.join(fromIpa.get(original), ", ") + + "\t{" + // + reversed_normal.getKey() + ", " + + original + + "}"); } out.close(); - } static UnicodeSet IPA_UNITS = new UnicodeSet("[{aɪ}{aʊ}{ɔɪ}{tʃ}{dʒ}]").freeze(); - static UnicodeSet RESPELL_UNITS = new UnicodeSet("[{ùr}{òu}{òi}{cħ}{dʒ}{tħ}{ţħ}{sħ}{nġ}]").freeze(); + static UnicodeSet RESPELL_UNITS = + new UnicodeSet("[{ùr}{òu}{òi}{cħ}{dʒ}{tħ}{ţħ}{sħ}{nġ}]").freeze(); private static String reverse(String sourceString, UnicodeSet units) { final StringBuilder result = new StringBuilder(); @@ -226,9 +248,9 @@ private static String reverse(String sourceString, UnicodeSet units) { final int matchValue = units.matchesAt(temp, i); if (matchValue > i) { final char ch1 = temp.charAt(i); - final char ch2 = temp.charAt(i+1); + final char ch2 = temp.charAt(i + 1); temp.setCharAt(i, ch2); - temp.setCharAt(i+1, ch1); + temp.setCharAt(i + 1, ch1); } } // pass through and reverse duals @@ -242,7 +264,7 @@ private static String reverse(String sourceString, UnicodeSet units) { private static String getIpa(String pronunciation) { String ipa = arpabet.transform(pronunciation); - ipa = ipa.replace(""+PRIMARY_STRESS, "").replace(""+SECONDARY_STRESS, ""); + ipa = ipa.replace("" + PRIMARY_STRESS, "").replace("" + SECONDARY_STRESS, ""); return ipa; } @@ -272,8 +294,7 @@ public static Transliterator getTransliteratorFromFile(String ID, String dir, St if (line == null) { break; } - if (line.startsWith("\uFEFF")) - { + if (line.startsWith("\uFEFF")) { line = line.substring(1); // remove BOM } input.append(line); 
@@ -281,7 +302,9 @@ public static Transliterator getTransliteratorFromFile(String ID, String dir, St } return Transliterator.createFromRules(ID, input.toString(), Transliterator.FORWARD); } catch (final IOException e) { - throw (IllegalArgumentException) new IllegalArgumentException("Can't open transliterator file " + file).initCause(e); + throw (IllegalArgumentException) + new IllegalArgumentException("Can't open transliterator file " + file) + .initCause(e); } } @@ -379,7 +402,8 @@ public static Transliterator getTransliteratorFromFile(String ID, String dir, St // System.out.println("* Too few primaries\t" + input); // if (firstSecondary >= 0) { // --secondaryCount; -// input = input.substring(0,firstSecondary) + 'ˌ' + input.substring(firstSecondary+1); +// input = input.substring(0,firstSecondary) + 'ˌ' + +// input.substring(firstSecondary+1); // } else { // input = input.substring(0,firstVowel) + 'ˌ' + input.substring(firstVowel); // } diff --git a/unicodetools/src/main/java/org/unicode/draft/CodePoint.java b/unicodetools/src/main/java/org/unicode/draft/CodePoint.java index 711927cfc..6efd14b46 100644 --- a/unicodetools/src/main/java/org/unicode/draft/CodePoint.java +++ b/unicodetools/src/main/java/org/unicode/draft/CodePoint.java @@ -1,16 +1,15 @@ package org.unicode.draft; -import java.util.Iterator; - -import org.unicode.cldr.util.Timer; import com.ibm.icu.text.NumberFormat; import com.ibm.icu.util.ULocale; - +import java.util.Iterator; +import org.unicode.cldr.util.Timer; public class CodePoint implements Iterator, Iterable { private static final int SUPPLEMENTAL_OFFSET = - (Character.MIN_HIGH_SURROGATE << 10) + Character.MIN_LOW_SURROGATE - - Character.MIN_SUPPLEMENTARY_CODE_POINT; + (Character.MIN_HIGH_SURROGATE << 10) + + Character.MIN_LOW_SURROGATE + - Character.MIN_SUPPLEMENTARY_CODE_POINT; private final CharSequence charSequence; private int position; private StringBuilder builder; @@ -42,22 +41,22 @@ public Iterator iterator() { return this; } - 
static CodePoint with (CharSequence s) { + static CodePoint with(CharSequence s) { return new CodePoint(s); } - static int[] full (CharSequence s) { + static int[] full(CharSequence s) { final int len = s.length(); int[] result = new int[len]; int pos = 0; - for (int i = 0; i < len;) { + for (int i = 0; i < len; ) { int cp = s.charAt(i++); // The key to performance is that surrogate pairs are very rare. // Test for a trail (low) surrogate. if (cp >= Character.MIN_LOW_SURROGATE && cp < Character.MAX_LOW_SURROGATE && pos > 0) { // If we get a trail, and if the last code point was a lead (high) surrogate, // we need to backup and set the correct value - final int last = result[pos-1]; + final int last = result[pos - 1]; if (last >= Character.MIN_HIGH_SURROGATE && last <= Character.MAX_HIGH_SURROGATE) { --pos; cp += (last << 10) - SUPPLEMENTAL_OFFSET; @@ -95,7 +94,9 @@ public boolean next() { return false; } int cp = buffer.charAt(position++); - if (cp >= Character.MIN_HIGH_SURROGATE && cp <= Character.MAX_HIGH_SURROGATE && position < length) { + if (cp >= Character.MIN_HIGH_SURROGATE + && cp <= Character.MAX_HIGH_SURROGATE + && position < length) { final int trail = buffer.charAt(position); if (trail >= Character.MIN_LOW_SURROGATE && trail <= Character.MAX_LOW_SURROGATE) { cp = (cp << 10) + trail - SUPPLEMENTAL_OFFSET; @@ -111,16 +112,18 @@ public static void main(String[] args) { System.out.print("Warmup\t"); timeMethods("a\uD800\uDC00", 100001); // warmup - final String[] tests = {"In a hole in the ground there lived a hobbit.", - "In a hole in the ground there lived a hobbit.\uD800\uDC00", - "In a hole in the ground there lived a hobbit.\uD800", - "\uDC00In a hole in the ground there lived a hobbit."}; + final String[] tests = { + "In a hole in the ground there lived a hobbit.", + "In a hole in the ground there lived a hobbit.\uD800\uDC00", + "In a hole in the ground there lived a hobbit.\uD800", + "\uDC00In a hole in the ground there lived a hobbit." 
+ }; for (final String test : tests) { timeMethods(test, 10000001); } } - private static NumberFormat nf = NumberFormat.getNumberInstance(ULocale.ENGLISH); + private static NumberFormat nf = NumberFormat.getNumberInstance(ULocale.ENGLISH); private static void timeMethods(CharSequence s, int ITERATIONS) { System.out.println("Testing <" + s + "> for " + nf.format(ITERATIONS) + " iterations"); @@ -156,11 +159,14 @@ private static void timeMethods(CharSequence s, int ITERATIONS) { timer.start(); for (int iteration = ITERATIONS; iteration > 0; --iteration) { final int len = s.length(); - for (int i = 0; i < len;) { + for (int i = 0; i < len; ) { int cp = s.charAt(i++); - if (cp >= Character.MIN_HIGH_SURROGATE && cp <= Character.MAX_HIGH_SURROGATE && i < len) { + if (cp >= Character.MIN_HIGH_SURROGATE + && cp <= Character.MAX_HIGH_SURROGATE + && i < len) { final int trail = s.charAt(i); - if (trail >= Character.MIN_LOW_SURROGATE && trail < Character.MAX_LOW_SURROGATE) { + if (trail >= Character.MIN_LOW_SURROGATE + && trail < Character.MAX_LOW_SURROGATE) { cp = (cp << 10) + trail - SUPPLEMENTAL_OFFSET; ++i; } @@ -178,7 +184,7 @@ private static void timeMethods(CharSequence s, int ITERATIONS) { doSomethingWith2 = 0; timer.start(); for (int iteration = ITERATIONS; iteration > 0; --iteration) { - for (final CodePointIterator it = new CodePointIterator(s); it.next();) { + for (final CodePointIterator it = new CodePointIterator(s); it.next(); ) { doSomethingWith2 ^= it.codePoint; } } diff --git a/unicodetools/src/main/java/org/unicode/draft/Compare.java b/unicodetools/src/main/java/org/unicode/draft/Compare.java index 386be9152..45aca9a27 100644 --- a/unicodetools/src/main/java/org/unicode/draft/Compare.java +++ b/unicodetools/src/main/java/org/unicode/draft/Compare.java @@ -1,15 +1,12 @@ package org.unicode.draft; -import java.util.Comparator; +import java.util.Comparator; public class Compare { - public static Compare - START = new Compare(); + public static Compare START = 
new Compare(); - private static Compare - GREATER = new CompareDone(1), - LESS = new CompareDone(-1); + private static Compare GREATER = new CompareDone(1), LESS = new CompareDone(-1); public > Compare compare(T a, T b) { if (a == null) { @@ -43,7 +40,6 @@ public Compare compare(T a, T b, Comparator comparator) { return GREATER; } - public Compare compare(int a, int b) { if (a == b) { return this; @@ -60,24 +56,28 @@ public int done() { private Compare() {} // hide constructor - static private class CompareDone extends Compare { + private static class CompareDone extends Compare { private final int result; private CompareDone(int result) { this.result = result; } + @Override public int done() { return result; } + @Override public Compare compare(int a, int b) { return this; } + @Override public > Compare compare(T a, T b) { return this; } + public > Compare compare(T a, T b, Comparator comparator) { return this; } @@ -95,18 +95,14 @@ public Foo(int a, int b, String c) { @Override public int compareTo(Foo other) { - return Compare.START - .compare(a, other.a) - .compare(b, other.b) - .compare(c, other.c) - .done(); + return Compare.START.compare(a, other.a).compare(b, other.b).compare(c, other.c).done(); } } public static void main(String[] args) { - final Foo a = new Foo(1,2,"ab"); - final Foo b = new Foo(0,2,"ab"); - final Foo c = new Foo(1,2,"ab"); + final Foo a = new Foo(1, 2, "ab"); + final Foo b = new Foo(0, 2, "ab"); + final Foo c = new Foo(1, 2, "ab"); System.out.println(a.compareTo(b)); System.out.println(a.compareTo(c)); } diff --git a/unicodetools/src/main/java/org/unicode/draft/CompareCldrUnihanData.java b/unicodetools/src/main/java/org/unicode/draft/CompareCldrUnihanData.java index 1ef9d0eb1..5b29c3889 100644 --- a/unicodetools/src/main/java/org/unicode/draft/CompareCldrUnihanData.java +++ b/unicodetools/src/main/java/org/unicode/draft/CompareCldrUnihanData.java @@ -1,8 +1,11 @@ package org.unicode.draft; +import com.google.common.base.Objects; +import 
com.google.common.base.Splitter; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; import java.util.List; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.CldrUtility; @@ -13,14 +16,10 @@ import org.unicode.props.UcdPropertyValues.General_Category_Values; import org.unicode.text.utility.Settings; -import com.google.common.base.Objects; -import com.google.common.base.Splitter; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; - public class CompareCldrUnihanData { - private static final IndexUnicodeProperties IUP = IndexUnicodeProperties.make(Settings.latestVersion); + private static final IndexUnicodeProperties IUP = + IndexUnicodeProperties.make(Settings.latestVersion); static final Splitter ONBAR = Splitter.on('|').trimResults(); static final Splitter ONSPACE = Splitter.on(' ').trimResults(); @@ -52,11 +51,13 @@ public class CompareCldrUnihanData { UNIHAN_MANDARIN_T.putAll(uset, tValue); } - UnicodeMap gc = IUP.loadEnum(UcdProperty.General_Category, General_Category_Values.class); + UnicodeMap gc = + IUP.loadEnum(UcdProperty.General_Category, General_Category_Values.class); UnicodeMap blocks = IUP.loadEnum(UcdProperty.Block, Block_Values.class); - UnicodeSet radicals = new UnicodeSet(blocks.getSet(Block_Values.CJK_Radicals_Supplement)) - .addAll(blocks.getSet(Block_Values.Kangxi_Radicals)) - .removeAll(gc.getSet(General_Category_Values.Unassigned)); + UnicodeSet radicals = + new UnicodeSet(blocks.getSet(Block_Values.CJK_Radicals_Supplement)) + .addAll(blocks.getSet(Block_Values.Kangxi_Radicals)) + .removeAll(gc.getSet(General_Category_Values.Unassigned)); UnicodeMap cjkRadical = IUP.load(UcdProperty.CJK_Radical); for (String value : cjkRadical.values()) { @@ -66,7 +67,7 @@ public class CompareCldrUnihanData { copyValues(withRadical, UNIHAN_TOTALSTROKES_S); copyValues(withRadical, UNIHAN_TOTALSTROKES_T); } - + 
UNIHAN_TOTALSTROKES_S.freeze(); UNIHAN_TOTALSTROKES_T.freeze(); System.out.println("UNIHAN_STROKES_T: " + UNIHAN_TOTALSTROKES_T.size()); @@ -91,9 +92,7 @@ private static boolean copyValues(UnicodeSet withRadical, UnicodeMap unic return false; } - /** - * Read these in from the files as text. Can do because format is generated. - */ + /** Read these in from the files as text. Can do because format is generated. */ private static UnicodeMap extractCldrPinyin() { UnicodeMap result = new UnicodeMap<>(); @@ -102,10 +101,11 @@ private static UnicodeMap extractCldrPinyin() { Splitter onarrow = Splitter.on('→').trimResults(); boolean reading = false; - for (String line : FileUtilities.in(CLDRPaths.COMMON_DIRECTORY+"/transforms/", "Han-Latin.xml")) { + for (String line : + FileUtilities.in(CLDRPaths.COMMON_DIRECTORY + "/transforms/", "Han-Latin.xml")) { // # START AUTOGENERATED Han-Latin.xml ( Unihan kMandarin) // [呵锕阿𠼞𥥩𨉚]→ā; - //... + // ... // # END AUTOGENERATED Han-Latin.xml (Unihan kMandarin) if (!reading) { if (line.contains("START AUTOGENERATED")) { @@ -118,11 +118,10 @@ private static UnicodeMap extractCldrPinyin() { } line = line.trim(); // [⺅䌾䛘人亻仁壬忈忎朲秂芢鈓銋魜鵀𡰥𢇦𦏀𧥷]→rén; - if (!line.startsWith(prefix) - || !line.endsWith(suffix)) { + if (!line.startsWith(prefix) || !line.endsWith(suffix)) { throw new IllegalArgumentException("Internal error: " + line); } - String value = line.substring(prefix.length(),line.length()-suffix.length()); + String value = line.substring(prefix.length(), line.length() - suffix.length()); List list = onarrow.splitToList(value); if (list.size() != 2) { throw new IllegalArgumentException("Internal error: " + line); @@ -153,7 +152,7 @@ private static UnicodeMap extractCldrTotalStrokeT() { // extractCldrTotalStrokeT() { continue; } } else { - index = Integer.parseInt(line.substring(indexPos+5).trim()); + index = Integer.parseInt(line.substring(indexPos + 5).trim()); suffix = "# " + index; continue; } @@ -176,8 +175,8 @@ private static UnicodeMap 
extractCldrTotalStrokeT() { } int beginIndex = line.startsWith("<*") ? 2 : 1; uset.clear(); - uset.addAll(line.substring(beginIndex,hashPos).trim()); - result.putAll(uset,index); + uset.addAll(line.substring(beginIndex, hashPos).trim()); + result.putAll(uset, index); } if (line.contains("<<<")) { break; @@ -198,10 +197,15 @@ public static void main(String[] args) { // diff(CLDR_STROKE_COLLATION, UNIHAN_TOTALSTROKES_S); } - enum Change {removed, changed, added} + enum Change { + removed, + changed, + added + } - private static > UnicodeMap> diff(UnicodeMap a, UnicodeMap b) { - UnicodeMap> result = new UnicodeMap<>(); + private static > UnicodeMap> diff( + UnicodeMap a, UnicodeMap b) { + UnicodeMap> result = new UnicodeMap<>(); UnicodeSet sources = new UnicodeSet(a.keySet()).addAll(b.keySet()); for (UnicodeSet.EntryRange range : sources.ranges()) { for (int i = range.codepoint; i <= range.codepointEnd; ++i) { @@ -219,7 +223,7 @@ private static > UnicodeMap> diff(UnicodeMap> sorted = new TreeSet<>(result.values()); + TreeSet> sorted = new TreeSet<>(result.values()); for (Change type : Change.values()) { boolean header = false; int count = 0; @@ -238,15 +242,24 @@ private static > UnicodeMap> diff(UnicodeMap> String show(Pair pair) { - return CldrUtility.ifNull(pair.getFirst(), "∅") + " → " + CldrUtility.ifNull(pair.getSecond(), "∅"); + static > String show(Pair pair) { + return CldrUtility.ifNull(pair.getFirst(), "∅") + + " → " + + CldrUtility.ifNull(pair.getSecond(), "∅"); } } diff --git a/unicodetools/src/main/java/org/unicode/draft/ComparePinyin.java b/unicodetools/src/main/java/org/unicode/draft/ComparePinyin.java index 452ff2168..dfd2b96e3 100644 --- a/unicodetools/src/main/java/org/unicode/draft/ComparePinyin.java +++ b/unicodetools/src/main/java/org/unicode/draft/ComparePinyin.java @@ -1,5 +1,15 @@ package org.unicode.draft; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.IterableComparator; +import com.ibm.icu.impl.Relation; +import 
com.ibm.icu.text.Collator; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.Transform; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; @@ -15,44 +25,33 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.util.Tabber; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.text.UCD.Default; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.IterableComparator; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.Transform; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; - - public class ComparePinyin { private static final UnicodeSet UNIHAN = new UnicodeSet("[:script=han:]").freeze(); static Collator pinyinSort = Collator.getInstance(new ULocale("zh@collator=pinyin")); static Collator radicalStrokeSort = Collator.getInstance(new ULocale("zh@collator=unihan")); static Transliterator toPinyin = Transliterator.getInstance("Han-Latin;nfc"); - static final Comparator codepointComparator = new UTF16.StringComparator(true, false,0); - + static final Comparator codepointComparator = + new UTF16.StringComparator(true, false, 0); public static void main(String[] args) throws IOException { - showElements("", toPinyin); final UnihanPinyin unihanPinyin = new UnihanPinyin(); + final Relation notUnihan = + new Relation(new TreeMap(pinyinSort), TreeSet.class); - final Relation notUnihan = new Relation(new TreeMap(pinyinSort), TreeSet.class); - - for (final String s : new UnicodeSet(HAN).removeAll(unihanPinyin.keySet()).removeAll(new UnicodeSet("[:nfkcqc=n:]"))) { + 
for (final String s : + new UnicodeSet(HAN) + .removeAll(unihanPinyin.keySet()) + .removeAll(new UnicodeSet("[:nfkcqc=n:]"))) { final String pinyin = toPinyin.transform(s); if (!s.equals(pinyin)) { notUnihan.put(pinyin, s); @@ -65,12 +64,14 @@ public static void main(String[] args) throws IOException { System.out.println(pinyin + "\t" + showSet(s)); } - final ArrayList foo; final IterableComparator arrayComp = new IterableComparator(pinyinSort); - final Relation, String> notFirst = new Relation(new TreeMap(arrayComp), TreeSet.class); - final Relation, String> notIn = new Relation(new TreeMap(arrayComp), TreeSet.class); - final Relation, String> noTranslit = new Relation(new TreeMap(arrayComp), TreeSet.class); + final Relation, String> notFirst = + new Relation(new TreeMap(arrayComp), TreeSet.class); + final Relation, String> notIn = + new Relation(new TreeMap(arrayComp), TreeSet.class); + final Relation, String> noTranslit = + new Relation(new TreeMap(arrayComp), TreeSet.class); for (final String s : unihanPinyin.keySet()) { final String pinyin = toPinyin.transform(s); final Set unihanPinyinSet = unihanPinyin.getPinyinSet(s); @@ -79,12 +80,12 @@ public static void main(String[] args) throws IOException { final ArrayList val = new ArrayList(); val.add(pinyin); val.addAll(unihanPinyinSet); - notIn.put(val,s); + notIn.put(val, s); } else if (!pinyin.equals(unihanPinyinSet.iterator().next())) { final ArrayList val = new ArrayList(); val.add(pinyin); val.add(unihanPinyinSet.iterator().next()); - notFirst.put(val,s); + notFirst.put(val, s); } continue; } @@ -97,7 +98,8 @@ public static void main(String[] args) throws IOException { System.out.println(unihanPinyinSet + "\t" + showSet(s)); } - System.out.println("Characters with Unihan Pinyin and CLDR, but CLDR not in Unihan: " + notIn.size()); + System.out.println( + "Characters with Unihan Pinyin and CLDR, but CLDR not in Unihan: " + notIn.size()); for (final Collection unihanPinyinSet : notIn.keySet()) { final Set s = 
notIn.getAll(unihanPinyinSet); final ArrayList val = new ArrayList(unihanPinyinSet); @@ -106,7 +108,9 @@ public static void main(String[] args) throws IOException { System.out.println(pinyin + "\t" + val + "\t" + showSet(s)); } - System.out.println("Characters with Unihan Pinyin and CLDR, but CLDR not first in Unihan: " + notFirst.size()); + System.out.println( + "Characters with Unihan Pinyin and CLDR, but CLDR not first in Unihan: " + + notFirst.size()); for (final Collection unihanPinyinSet : notFirst.keySet()) { final Set s = notFirst.getAll(unihanPinyinSet); final ArrayList val = new ArrayList(unihanPinyinSet); @@ -121,11 +125,22 @@ public static void main(String[] args) throws IOException { final int bad = 0; final UnicodeSet tailored = pinyinSort.getTailoredSet().retainAll(UNIHAN); - final UnicodeSet inSortNotUnihan = new UnicodeSet(tailored).removeAll(unihanPinyin.keySet()); - final UnicodeSet inUnihanNotSort = new UnicodeSet(unihanPinyin.keySet()).removeAll(tailored); - final UnicodeSet inUnihanAndSort = new UnicodeSet(unihanPinyin.keySet()).retainAll(tailored); - System.out.println("Extras in pinyinSort - Unihan: " + inSortNotUnihan.size() + "\t" + inSortNotUnihan.toPattern(false)); - System.out.println("Extras in Unihan - pinyinSort: " + inUnihanNotSort.size() + "\t" + inUnihanNotSort.toPattern(false)); + final UnicodeSet inSortNotUnihan = + new UnicodeSet(tailored).removeAll(unihanPinyin.keySet()); + final UnicodeSet inUnihanNotSort = + new UnicodeSet(unihanPinyin.keySet()).removeAll(tailored); + final UnicodeSet inUnihanAndSort = + new UnicodeSet(unihanPinyin.keySet()).retainAll(tailored); + System.out.println( + "Extras in pinyinSort - Unihan: " + + inSortNotUnihan.size() + + "\t" + + inSortNotUnihan.toPattern(false)); + System.out.println( + "Extras in Unihan - pinyinSort: " + + inUnihanNotSort.size() + + "\t" + + inUnihanNotSort.toPattern(false)); System.out.println("In both Unihan and pinyinSort: " + inUnihanAndSort.size()); final Set sorted1 = new 
TreeSet(pinyinSort); @@ -161,35 +176,40 @@ public static void main(String[] args) throws IOException { printItems(buckets, unihanPinyin); - final Relation sorted = new Relation(new TreeMap(pinyinSort), TreeSet.class, radicalStrokeSort); + final Relation sorted = + new Relation(new TreeMap(pinyinSort), TreeSet.class, radicalStrokeSort); for (final String han : unihanPinyin.keySet()) { sorted.put(unihanPinyin.getPinyin(han), han); } - final Relation sorted2 = new Relation(new TreeMap(), TreeSet.class, radicalStrokeSort); + final Relation sorted2 = + new Relation(new TreeMap(), TreeSet.class, radicalStrokeSort); final Tabber tabber = new Tabber.HTMLTabber(); final PrintWriter out = Utility.openPrintWriterGenDir("pinyinTable.html", null); - final PrintWriter pinyinCollation = Utility.openPrintWriterGenDir("pinyinCollation.txt", null); - pinyinCollation.println("\uFEFF# Unihan Pinyin Collation\n" + - "&[last regular]"); - - final PrintWriter pinyinCollationInterleaved = Utility.openPrintWriterGenDir("pinyinCollationInterleaved.txt", null); - pinyinCollationInterleaved.println("\uFEFF# Unihan Pinyin Interleaved Collation\n" + - "&[last regular]"); - - out.println("\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
InputIDNA2003UTS46IDNA2008InputIDNA2003UTS46IDNA2008
Display
Punycode\u00A0\u00A0\u00A0\u00A0") - // .append(toHTML.transform(IdnaLabelTester.ESCAPER.transform(normalized.substring(0, result.position))) - // + "\u2639" + toHTML.transform(IdnaLabelTester.ESCAPER.transform(normalized.substring(result.position))) + // + // .append(toHTML.transform(IdnaLabelTester.ESCAPER.transform(normalized.substring(0, result.position))) + // + "\u2639" + + // toHTML.transform(IdnaLabelTester.ESCAPER.transform(normalized.substring(result.position))) // + "" + result.title // //+ "" + result.ruleLine // + "
"); + final PrintWriter pinyinCollation = + Utility.openPrintWriterGenDir("pinyinCollation.txt", null); + pinyinCollation.println("\uFEFF# Unihan Pinyin Collation\n" + "&[last regular]"); + + final PrintWriter pinyinCollationInterleaved = + Utility.openPrintWriterGenDir("pinyinCollationInterleaved.txt", null); + pinyinCollationInterleaved.println( + "\uFEFF# Unihan Pinyin Interleaved Collation\n" + "&[last regular]"); + + out.println( + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
"); for (final String pinyin : sorted.keySet()) { final Set hanSet = sorted.getAll(pinyin); pinyinCollationInterleaved.print("<" + pinyin + "\t"); sorted2.clear(); for (final String han : hanSet) { - final Map> pinyinToSource = unihanPinyin.getPinyinMap(han); + final Map> pinyinToSource = + unihanPinyin.getPinyinMap(han); sorted2.put("" + showPinyinToSource(pinyinToSource), han); pinyinCollation.print("<" + han); pinyinCollationInterleaved.print("<" + han); @@ -199,7 +219,7 @@ public static void main(String[] args) throws IOException { for (final String line : sorted2.keySet()) { final Set set = sorted2.getAll(line); String setStr = set.toString().replace(",", ""); - setStr = setStr.substring(1,setStr.length()-1); + setStr = setStr.substring(1, setStr.length() - 1); out.println(tabber.process(pinyin + "\t" + setStr + "\t" + line)); } } @@ -228,13 +248,21 @@ private static String showPinyinToSource(Map> piny } private static void showElements(String indent, Transliterator toPinyin2) { - System.out.println(indent + toPinyin2.getID() + "\t" + toPinyin2.getClass().getName() + "\tFilter: " + toPinyin2.getFilter() + "\tSource: " + toPinyin2.getSourceSet().toPattern(false)); + System.out.println( + indent + + toPinyin2.getID() + + "\t" + + toPinyin2.getClass().getName() + + "\tFilter: " + + toPinyin2.getFilter() + + "\tSource: " + + toPinyin2.getSourceSet().toPattern(false)); final Transliterator[] elements = toPinyin2.getElements(); for (final Transliterator element : elements) { if (element == toPinyin2) { continue; } - showElements(indent+"\t", element); + showElements(indent + "\t", element); } } @@ -250,7 +278,6 @@ private static UnicodeSet getSource(Transliterator toPinyin2, UnicodeSet target) return target; } - private static void excludeItems2(List buckets, int threshold) { for (int i = 0; i < buckets.size(); ++i) { final HanInfo row = buckets.get(i); @@ -276,9 +303,13 @@ private static void printItems(List buckets, UnihanPinyin unihanPinyin) } else { final int 
before = findNonexcludedRank(buckets, i, -1); final int after = findNonexcludedRank(buckets, i, 1); - if (before >= 0 && buckets.get(before).rank > row.rank && getLocalDistance(buckets, before) <= localDistance) { + if (before >= 0 + && buckets.get(before).rank > row.rank + && getLocalDistance(buckets, before) <= localDistance) { ok = false; - } else if (after >= 0 && buckets.get(after).rank < row.rank && getLocalDistance(buckets, after) <= localDistance) { + } else if (after >= 0 + && buckets.get(after).rank < row.rank + && getLocalDistance(buckets, after) <= localDistance) { ok = false; } } @@ -289,14 +320,20 @@ private static void printItems(List buckets, UnihanPinyin unihanPinyin) if (!ok) { ++bad; } - System.out.println(status + "\t" + localDistance + "\t" + (ok? "": "[" + bestPinyin + "]") + row); - if (!ok && row.hanList.size() > 1){ + System.out.println( + status + + "\t" + + localDistance + + "\t" + + (ok ? "" : "[" + bestPinyin + "]") + + row); + if (!ok && row.hanList.size() > 1) { for (final String han : row.hanList) { System.out.println("\t--\t" + han + "\t" + unihanPinyin.getPinyinMap(han)); } } for (final String han : row.hanList) { - unihanPinyin.addAll(han,"?", PinyinSource.s, bestPinyin); + unihanPinyin.addAll(han, "?", PinyinSource.s, bestPinyin); } } System.out.println("Bad:\t" + bad); @@ -321,6 +358,7 @@ static class HanInfo { int rank; String pinyin; List hanList = new ArrayList(); + @Override public String toString() { return pinyin + " (" + rank + ")\t" + showSet(hanList); // (exclude ? 
"*" : "") + "\t" + @@ -332,7 +370,7 @@ private static int getLocalDistance(List buckets, int i) { final int core = myBucket.rank; double distance = 0; int count = 0; - for (int j = i-1; j >= 0 && count < 10; --j) { + for (int j = i - 1; j >= 0 && count < 10; --j) { final HanInfo bucket = buckets.get(j); if (bucket.exclude) { continue; @@ -342,7 +380,7 @@ private static int getLocalDistance(List buckets, int i) { } int count2 = 0; - for (int j = i+1; j < buckets.size() && count2 < 10; ++j) { + for (int j = i + 1; j < buckets.size() && count2 < 10; ++j) { final HanInfo bucket = buckets.get(j); if (bucket.exclude) { continue; @@ -369,20 +407,29 @@ private static String showSet(Iterable set) { private static final UnicodeSet HAN = new UnicodeSet("[:script=han:]"); - enum PinyinSource {l, x, p, m, t, s}; + enum PinyinSource { + l, + x, + p, + m, + t, + s + }; static class UnihanPinyin { - // kHanyuPinyin, space, 10297.260: qÄ«n,qìn,qÇ�n, [a-z\x{FC}\x{300}-\x{302}\x{304}\x{308}\x{30C}]+(,[a-z\x{FC}\x{300}-\x{302}\x{304}\x{308}\x{30C}] + // kHanyuPinyin, space, 10297.260: qÄ«n,qìn,qÇ�n, + // [a-z\x{FC}\x{300}-\x{302}\x{304}\x{308}\x{30C}]+(,[a-z\x{FC}\x{300}-\x{302}\x{304}\x{308}\x{30C}] // kMandarin, space, [A-Z\x{308}]+[1-5] // 3475=HAN4 JI2 JIE2 ZHA3 ZI2 // kHanyuPinlu, space, [a-z\x{308}]+[1-5]\([0-9]+\) 4E0A=shang4(12308) shang5(392) - UnicodeMap>> unihanPinyin = new UnicodeMap<>(); - Map pinyinToOrder = new HashMap<>(); + UnicodeMap>> unihanPinyin = new UnicodeMap<>(); + Map pinyinToOrder = new HashMap<>(); TreeSet pinyinSet = new TreeSet(pinyinSort); IndexUnicodeProperties iup = IndexUnicodeProperties.make(Default.ucd().getVersionInfo()); { - final Transform pinyinNumeric = Transliterator.getInstance("NumericPinyin-Latin;nfc"); + final Transform pinyinNumeric = + Transliterator.getInstance("NumericPinyin-Latin;nfc"); // all kHanyuPinlu readings first; then take all kXHC1983; then kHanyuPinyin. 
@@ -394,8 +441,8 @@ static class UnihanPinyin { addAll(s, original, PinyinSource.l, source.split(" ")); } - //kXHC1983 - //^[0-9,.*]+:*[a-zx{FC}x{300}x{301}x{304}x{308}x{30C}]+$ + // kXHC1983 + // ^[0-9,.*]+:*[a-zx{FC}x{300}x{301}x{304}x{308}x{30C}]+$ final UnicodeMap kXHC1983 = iup.load(UcdProperty.kXHC1983); for (final String s : kXHC1983.keySet()) { final String original = kXHC1983.get(s); @@ -408,7 +455,10 @@ static class UnihanPinyin { for (final String s : kHanyuPinyin.keySet()) { final String original = kHanyuPinyin.get(s); String source = Normalizer.normalize(original, Normalizer.NFC); - source = source.replaceAll("^\\s*(\\d{5}\\.\\d{2}0,)*\\d{5}\\.\\d{2}0:", ""); // , only for medial + source = + source.replaceAll( + "^\\s*(\\d{5}\\.\\d{2}0,)*\\d{5}\\.\\d{2}0:", + ""); // , only for medial source = source.replaceAll("\\s*(\\d{5}\\.\\d{2}0,)*\\d{5}\\.\\d{2}0:", ","); addAll(s, original, PinyinSource.p, source.split(",")); } @@ -432,7 +482,7 @@ static class UnihanPinyin { int i = 0; for (final String s : pinyinSet) { - pinyinToOrder.put(s,i++); + pinyinToOrder.put(s, i++); } pinyinToOrder.put("#", i + 1000); pinyinSet = null; @@ -452,7 +502,7 @@ static class UnihanPinyin { private void collectPinyin() { final PrintWriter out = Utility.openPrintWriterGenDir("pinyin/pinyins.txt", null); final String[] line = {"", "", "", "", "", "", "", ""}; - final Map groupToIndex = new HashMap(); + final Map groupToIndex = new HashMap(); int k = 3; for (final String item : "zuÅ� zuó zuÇ’ zuò zuo".split("\\s+")) { groupToIndex.put(accents.transform(item), k++); @@ -473,7 +523,7 @@ private void collectPinyin() { } line[0] = oldBase = base; final int initialEnd = INITIALS.findIn(base, 0, true); - final String initialSegment = base.substring(0,initialEnd); + final String initialSegment = base.substring(0, initialEnd); final String finalSegment = base.substring(initialEnd); line[1] = initialSegment; initials.add(initialSegment); @@ -487,7 +537,7 @@ private void collectPinyin() { 
try { final int groupIndex = groupToIndex.get(group); - collectedAccents.set(groupIndex-3); + collectedAccents.set(groupIndex - 3); if (line[groupIndex].length() != 0) { System.out.println("***Multiple pinyins: " + s + "\t" + line[groupIndex]); } @@ -504,8 +554,14 @@ private void collectPinyin() { } private String showPinyinLine(String[] line, BitSet collectedAccents) { - return Arrays.asList(line).toString().replaceAll(",\\s*","\t").replaceAll("\\[|\\]", "") - + "\t" + collectedAccents.cardinality() + "\t" + collectedAccents; + return Arrays.asList(line) + .toString() + .replaceAll(",\\s*", "\t") + .replaceAll("\\[|\\]", "") + + "\t" + + collectedAccents.cardinality() + + "\t" + + collectedAccents; } public Integer getPinyinOrder(String pinyin) { @@ -516,11 +572,17 @@ public Integer getPinyinOrder(String pinyin) { return result; } - static Transform noaccents = Transliterator.getInstance("nfkd; [[:m:]-[\u0308]] remove; nfc"); - static Transform accents = Transliterator.getInstance("nfkd; [^[:m:]-[\u0308]] remove; nfc"); + static Transform noaccents = + Transliterator.getInstance("nfkd; [[:m:]-[\u0308]] remove; nfc"); + static Transform accents = + Transliterator.getInstance("nfkd; [^[:m:]-[\u0308]] remove; nfc"); - static UnicodeSet INITIALS = new UnicodeSet("[b c {ch} d f g h j k l m n p q r s {sh} t w x y z {zh}]").freeze(); - static UnicodeSet FINALS = new UnicodeSet("[a {ai} {an} {ang} {ao} e {ei} {en} {eng} {er} i {ia} {ian} {iang} {iao} {ie} {in} {ing} {iong} {iu} o {ong} {ou} u {ua} {uai} {uan} {uang} {ue} {ui} {un} {uo} ü {üe}]").freeze(); + static UnicodeSet INITIALS = + new UnicodeSet("[b c {ch} d f g h j k l m n p q r s {sh} t w x y z {zh}]").freeze(); + static UnicodeSet FINALS = + new UnicodeSet( + "[a {ai} {an} {ang} {ao} e {ei} {en} {eng} {er} i {ia} {ian} {iang} {iao} {ie} {in} {ing} {iong} {iu} o {ong} {ou} u {ua} {uai} {uan} {uang} {ue} {ui} {un} {uo} ü {üe}]") + .freeze(); boolean validPinyin(String pinyin) { final String base = 
noaccents.transform(pinyin); @@ -529,7 +591,8 @@ boolean validPinyin(String pinyin) { return false; } final String finalSegment = base.substring(initialEnd); - final boolean result = finalSegment.length() == 0 ? true : FINALS.contains(finalSegment); + final boolean result = + finalSegment.length() == 0 ? true : FINALS.contains(finalSegment); return result; } @@ -538,13 +601,24 @@ void addAll(String han, String original, PinyinSource pinyin, String... pinyinLi if (pinyinList.length == 0) { throw new IllegalArgumentException(); } - final Map> pinyinToSources = getPinyinToSources(han, true); + final Map> pinyinToSources = + getPinyinToSources(han, true); for (final String source : pinyinList) { if (source.length() == 0) { throw new IllegalArgumentException(); } if (!validPinyin(source)) { - System.out.println("***Invalid Pinyin: " + han + "\t" + pinyin + "\t" + source + "\t" + Utility.hex(han) + "\t" + original); + System.out.println( + "***Invalid Pinyin: " + + han + + "\t" + + pinyin + + "\t" + + source + + "\t" + + Utility.hex(han) + + "\t" + + original); } if (pinyinSet != null) { pinyinSet.add(source); @@ -559,7 +633,11 @@ void addAll(String han, String original, PinyinSource pinyin, String... 
pinyinLi throw new IllegalArgumentException(); } } - private EnumSet getEnumSet(Map> pinyinToSources, String pinyin, boolean createSet) { + + private EnumSet getEnumSet( + Map> pinyinToSources, + String pinyin, + boolean createSet) { EnumSet set = pinyinToSources.get(pinyin); if (createSet && set == null) { set = EnumSet.noneOf(PinyinSource.class); @@ -568,10 +646,11 @@ private EnumSet getEnumSet(Map> piny return set; } - private Map> getPinyinToSources(String han, boolean createSet) { - Map> set = unihanPinyin.get(han); + private Map> getPinyinToSources( + String han, boolean createSet) { + Map> set = unihanPinyin.get(han); if (createSet && set == null) { - set = new LinkedHashMap>(); + set = new LinkedHashMap>(); unihanPinyin.put(han, set); } return set; @@ -600,8 +679,5 @@ private String getPinyin(String han) { } return set.iterator().next(); } - } - - } diff --git a/unicodetools/src/main/java/org/unicode/draft/CompressedDataInput.java b/unicodetools/src/main/java/org/unicode/draft/CompressedDataInput.java index b19943614..a73be6b23 100644 --- a/unicodetools/src/main/java/org/unicode/draft/CompressedDataInput.java +++ b/unicodetools/src/main/java/org/unicode/draft/CompressedDataInput.java @@ -1,6 +1,4 @@ -/** - * - */ +/** */ package org.unicode.draft; import java.io.DataInput; @@ -16,7 +14,8 @@ public CompressedDataInput set(DataInput in) { } /** - * Read long using readUnsignedLong. The bottom bit is the sign. If the number was negative, the value is inverted (~). + * Read long using readUnsignedLong. The bottom bit is the sign. If the number was negative, the + * value is inverted (~). */ @Override public long readLong() throws IOException { @@ -32,7 +31,7 @@ public long readLong() throws IOException { /** * Read a long as a series of 7-bits, with the last one having the top bit on. 
- * + * * @throws IOException */ public long readUnsignedLong() throws IOException { @@ -41,11 +40,11 @@ public long readUnsignedLong() throws IOException { while (true) { int byteValue = in.readByte(); if ((byte) byteValue >= 0) { - result |= ((long)byteValue << shift); + result |= ((long) byteValue << shift); shift += 7; } else { byteValue &= 0x7F; - result |= ((long)byteValue << shift); + result |= ((long) byteValue << shift); return result; } } @@ -158,4 +157,4 @@ public int readUnsignedByte() throws IOException { public String readLine() throws IOException { return in.readLine(); } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/draft/CompressedDataOutput.java b/unicodetools/src/main/java/org/unicode/draft/CompressedDataOutput.java index 040e13c43..25a06d38e 100644 --- a/unicodetools/src/main/java/org/unicode/draft/CompressedDataOutput.java +++ b/unicodetools/src/main/java/org/unicode/draft/CompressedDataOutput.java @@ -1,6 +1,4 @@ -/** - * - */ +/** */ package org.unicode.draft; import java.io.DataOutput; @@ -15,7 +13,6 @@ public CompressedDataOutput set(DataOutput out) { return this; } - @Override public void writeLong(long longValue) throws IOException { final boolean negative = longValue < 0; @@ -30,9 +27,8 @@ public void writeLong(long longValue) throws IOException { } /** - * Write a long as a series of 7-bits, with the last one having the top bit - * on. - * + * Write a long as a series of 7-bits, with the last one having the top bit on. 
+ * * @param longValue * @throws IOException */ @@ -40,12 +36,12 @@ public void writeUnsignedLong(long longValue) throws IOException { while (true) { final int bottom = 0x7F & (int) longValue; longValue >>>= 7; - if (longValue == 0) { - out.write(bottom | 0x80); // write byte - return; - } else { - out.write(bottom); // write byte - } + if (longValue == 0) { + out.write(bottom | 0x80); // write byte + return; + } else { + out.write(bottom); // write byte + } } } @@ -108,7 +104,8 @@ public final void writeChar(int v) throws IOException { } /** - * Doesn't do the intracharacter compression that writeUTF does, because the caller may assume that it is equivalent to calling writeChar multiple times. + * Doesn't do the intracharacter compression that writeUTF does, because the caller may assume + * that it is equivalent to calling writeChar multiple times. */ @Override public final void writeChars(String s) throws IOException { @@ -134,33 +131,28 @@ public void write(byte[] b, int off, int len) throws IOException { out.write(b, off, len); } - @Override public void write(byte[] b) throws IOException { out.write(b); } - @Override public void write(int b) throws IOException { out.write(b); } - @Override public void writeBoolean(boolean v) throws IOException { out.writeBoolean(v); } - @Override public void writeByte(int v) throws IOException { out.writeByte(v); } - @Override public void writeBytes(String s) throws IOException { out.writeBytes(s); } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/draft/CountryPopulationByCode.java b/unicodetools/src/main/java/org/unicode/draft/CountryPopulationByCode.java index 1b41cb5da..644c66e2e 100644 --- a/unicodetools/src/main/java/org/unicode/draft/CountryPopulationByCode.java +++ b/unicodetools/src/main/java/org/unicode/draft/CountryPopulationByCode.java @@ -1,4 +1,10 @@ package org.unicode.draft; + +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.impl.Row.R4; 
+import com.ibm.icu.impl.Row.R5; +import com.ibm.icu.util.ULocale; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashSet; @@ -7,7 +13,6 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; - import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.Counter; @@ -17,12 +22,6 @@ import org.unicode.jsp.FileUtilities; import org.unicode.jsp.FileUtilities.SemiFileReader; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.impl.Row.R4; -import com.ibm.icu.impl.Row.R5; -import com.ibm.icu.util.ULocale; - public class CountryPopulationByCode { private static final StandardCodes STANDARD_CODES = StandardCodes.make(); private static final boolean SHOW_INTERNET = false; @@ -44,14 +43,16 @@ public static void main(String[] args) { System.out.println(alt + "\t" + territory + "\t" + name); } } - //countryPopulation(); + // countryPopulation(); countryLanguagePopulation(); } private static void countryLanguagePopulation() { final Map> country2internetUsers = new TreeMap(); - final SemiFileReader handler = new SequenceHandler(country2internetUsers).process(CountryPopulationByCode.class, "internetUsers.txt"); + final SemiFileReader handler = + new SequenceHandler(country2internetUsers) + .process(CountryPopulationByCode.class, "internetUsers.txt"); final Counter gdp = new Counter(); final Counter language2InternetLatest = new Counter(); @@ -60,7 +61,9 @@ private static void countryLanguagePopulation() { for (final String territoryCode : STANDARD_CODES.getGoodAvailableCodes("territory")) { Set languages; try { - languages = testInfo.getSupplementalDataInfo().getLanguagesForTerritoryWithPopulationData(territoryCode); + languages = + testInfo.getSupplementalDataInfo() + .getLanguagesForTerritoryWithPopulationData(territoryCode); if (languages == null) { continue; } @@ -74,34 +77,50 @@ private static void countryLanguagePopulation() { double 
totalWeighted = 0; double total = 0; - final PopulationData territoryData = testInfo.getSupplementalDataInfo().getPopulationDataForTerritory(territoryCode); + final PopulationData territoryData = + testInfo.getSupplementalDataInfo().getPopulationDataForTerritory(territoryCode); final double territoryGdp = territoryData.getGdp(); double maxLiteratePopulation = 0; if (SHOW_SOURCE || territoryCode.equals("CN")) { System.out.println( - territoryCode + "\t" - + testInfo.getEnglish().getName(CLDRFile.TERRITORY_NAME,territoryCode) + territoryCode + "\t" + + testInfo.getEnglish() + .getName(CLDRFile.TERRITORY_NAME, territoryCode) + "\t" - + "\t" + territoryData.getPopulation() - + "\t" + territoryData.getLiteratePopulation() // getWeightedLiteratePopulation(data) - + "\t" + territoryData.getOfficialStatus() - + "\t" + territoryData.getGdp() - ); - + + "\t" + + "\t" + + territoryData.getPopulation() + + "\t" + + territoryData + .getLiteratePopulation() // getWeightedLiteratePopulation(data) + + "\t" + + territoryData.getOfficialStatus() + + "\t" + + territoryData.getGdp()); } for (final String languageCode : languages) { - final PopulationData data = testInfo.getSupplementalDataInfo().getLanguageAndTerritoryPopulationData(languageCode, territoryCode); + final PopulationData data = + testInfo.getSupplementalDataInfo() + .getLanguageAndTerritoryPopulationData(languageCode, territoryCode); if (SHOW_SOURCE || territoryCode.equals("CN")) { System.out.println( - territoryCode + "\t" - + testInfo.getEnglish().getName(CLDRFile.TERRITORY_NAME,territoryCode) - + "\t" + languageCode - + "\t" + getBaseName(languageCode) - + "\t" + data.getPopulation() - + "\t" + data.getLiteratePopulation() // getWeightedLiteratePopulation(data) - + "\t" + data.getOfficialStatus()); + territoryCode + + "\t" + + testInfo.getEnglish() + .getName(CLDRFile.TERRITORY_NAME, territoryCode) + + "\t" + + languageCode + + "\t" + + getBaseName(languageCode) + + "\t" + + data.getPopulation() + + "\t" + + data + 
.getLiteratePopulation() // getWeightedLiteratePopulation(data) + + "\t" + + data.getOfficialStatus()); } totalWeighted += getWeightedLiteratePopulation(data, languageCode); final double literatePopulation = data.getLiteratePopulation(); @@ -111,37 +130,73 @@ private static void countryLanguagePopulation() { total += literatePopulation; } final double literatePopulationInTerritory = territoryData.getLiteratePopulation(); - addRow(rowSet, territoryCode, "und", 1-total/literatePopulationInTerritory, 1-total/literatePopulationInTerritory, OfficialStatus.unknown); - addRow(rowSet, territoryCode, "mul", 1-maxLiteratePopulation/literatePopulationInTerritory, 1-maxLiteratePopulation/literatePopulationInTerritory, OfficialStatus.unknown); + addRow( + rowSet, + territoryCode, + "und", + 1 - total / literatePopulationInTerritory, + 1 - total / literatePopulationInTerritory, + OfficialStatus.unknown); + addRow( + rowSet, + territoryCode, + "mul", + 1 - maxLiteratePopulation / literatePopulationInTerritory, + 1 - maxLiteratePopulation / literatePopulationInTerritory, + OfficialStatus.unknown); for (final String languageCode : languages) { - final PopulationData data = testInfo.getSupplementalDataInfo().getLanguageAndTerritoryPopulationData(languageCode, territoryCode); + final PopulationData data = + testInfo.getSupplementalDataInfo() + .getLanguageAndTerritoryPopulationData(languageCode, territoryCode); final String languageName = getBaseName(languageCode); if (territoryCode.equals("IL")) { - System.out.println("$$\t" + data.getLiteratePopulation() + "\t" + testInfo.getEnglish().getName(languageCode)); + System.out.println( + "$$\t" + + data.getLiteratePopulation() + + "\t" + + testInfo.getEnglish().getName(languageCode)); } - final double ratioWeighted = getWeightedLiteratePopulation(data, languageCode)/totalWeighted; - final double ratio = data.getLiteratePopulation()/literatePopulationInTerritory; + final double ratioWeighted = + getWeightedLiteratePopulation(data, 
languageCode) / totalWeighted; + final double ratio = data.getLiteratePopulation() / literatePopulationInTerritory; - addRow(rowSet, territoryCode, languageCode, ratioWeighted, ratio, data.getOfficialStatus()); + addRow( + rowSet, + territoryCode, + languageCode, + ratioWeighted, + ratio, + data.getOfficialStatus()); - gdp.add(languageName, (int)(ratioWeighted * territoryGdp)); - language2InternetLatest.add(languageName, (int)(ratioWeighted * internetData.get1())); + gdp.add(languageName, (int) (ratioWeighted * territoryGdp)); + language2InternetLatest.add( + languageName, (int) (ratioWeighted * internetData.get1())); } } if (SHOW_WEIGHTS) { System.out.println("*** Factors"); - System.out.println("und = 1-total/literatePopulationInTerritory, 1-total/literatePopulationInTerritory"); - System.out.println("mul = 1-maxLiteratePopulation/literatePopulationInTerritory, 1-maxLiteratePopulation/literatePopulationInTerritory"); - System.out.println("region" + "\t" + "code" - //+ "\t" + "rank" - + "\t" + "ratio" - + "\t" + "weighted-ratio" - + "\t" + "language" - + "\t" + "code" - + "\t" + "status" - //+ "\t" + "K-if-Key" + System.out.println( + "und = 1-total/literatePopulationInTerritory, 1-total/literatePopulationInTerritory"); + System.out.println( + "mul = 1-maxLiteratePopulation/literatePopulationInTerritory, 1-maxLiteratePopulation/literatePopulationInTerritory"); + System.out.println( + "region" + + "\t" + + "code" + // + "\t" + "rank" + + "\t" + + "ratio" + + "\t" + + "weighted-ratio" + + "\t" + + "language" + + "\t" + + "code" + + "\t" + + "status" + // + "\t" + "K-if-Key" ); Object oldRegion = ""; @@ -151,19 +206,23 @@ private static void countryLanguagePopulation() { final double ratio = -row.get1(); final double weightedRatio = -row.get2(); final String languageCodeStatus = row.get3(); - if (FILTER && ( - weightedRatio < 0.01 - || languageCodeStatus.contains("\tund\t") - || languageCodeStatus.contains("\tmul\t"))) { + if (FILTER + && (weightedRatio < 0.01 + || 
languageCodeStatus.contains("\tund\t") + || languageCodeStatus.contains("\tmul\t"))) { continue; } counter = region.equals(oldRegion) ? counter + 1 : 1; - System.out.println(region - //+ "\t" + counter - + "\t" + ratio - + "\t" + weightedRatio - + "\t" + languageCodeStatus - //+ "\t" + (row.get4() ? "K" : "") + System.out.println( + region + // + "\t" + counter + + "\t" + + ratio + + "\t" + + weightedRatio + + "\t" + + languageCodeStatus + // + "\t" + (row.get4() ? "K" : "") ); oldRegion = region; } @@ -171,39 +230,68 @@ private static void countryLanguagePopulation() { if (SHOW_INTERNET) { // for (String languageName : language2Internet2000.keySet()) { - // System.out.println(languageName + "\t2000\t" + language2Internet2000.get(languageName)); + // System.out.println(languageName + "\t2000\t" + + // language2Internet2000.get(languageName)); // } System.out.println("*** internet/gdp"); for (final String languageName : gdp.getKeysetSortedByCount(false)) { - System.out.println(languageName + "\t" + language2InternetLatest.get(languageName) + "\t" + gdp.get(languageName)); + System.out.println( + languageName + + "\t" + + language2InternetLatest.get(languageName) + + "\t" + + gdp.get(languageName)); } } } - private static void addRow(TreeSet> rowSet, String territoryCode, String languageCode, double ratioWeighted, double ratio, OfficialStatus officialStatus) { - final R5 row = Row.of( - testInfo.getEnglish().getName("territory", territoryCode) + "\t" + territoryCode, - -ratio, - -ratioWeighted, - testInfo.getEnglish().getName(languageCode) + "\t" + languageCode + "\t" + (officialStatus == OfficialStatus.unknown ? 
"" : officialStatus.toString()), - KEY_LANGUAGES.contains(languageCode)); + private static void addRow( + TreeSet> rowSet, + String territoryCode, + String languageCode, + double ratioWeighted, + double ratio, + OfficialStatus officialStatus) { + final R5 row = + Row.of( + testInfo.getEnglish().getName("territory", territoryCode) + + "\t" + + territoryCode, + -ratio, + -ratioWeighted, + testInfo.getEnglish().getName(languageCode) + + "\t" + + languageCode + + "\t" + + (officialStatus == OfficialStatus.unknown + ? "" + : officialStatus.toString()), + KEY_LANGUAGES.contains(languageCode)); rowSet.add(row); } - static final Set KEY_LANGUAGES = new LinkedHashSet(Arrays.asList( - "en", "es", "de", "fr", "ja", "it", "tr", "pt", "zh", "nl", - "pl", "ar", "ru", "zh_Hant", "ko", "th", "sv", "fi", "da", - "he", "nb", "el", "hr", "bg", "sk", "lt", "vi", "lv", "sr", - "pt_PT", "ro", "hu", "cs", "id", "sl", "fil", "fa", "uk", - "ca", "hi", "et", "eu", "is", "sw", "ms", "bn", "am", "ta", - "te", "mr", "ur", "ml", "kn", "gu", "or")); + static final Set KEY_LANGUAGES = + new LinkedHashSet( + Arrays.asList( + "en", "es", "de", "fr", "ja", "it", "tr", "pt", "zh", "nl", "pl", "ar", + "ru", "zh_Hant", "ko", "th", "sv", "fi", "da", "he", "nb", "el", "hr", + "bg", "sk", "lt", "vi", "lv", "sr", "pt_PT", "ro", "hu", "cs", "id", + "sl", "fil", "fa", "uk", "ca", "hi", "et", "eu", "is", "sw", "ms", "bn", + "am", "ta", "te", "mr", "ur", "ml", "kn", "gu", "or")); private static double getWeightedLiteratePopulation(PopulationData data, String languageCode) { - return data.getLiteratePopulation() * (languageCode.equals("nn") ? OfficialStatus.official_minority : data.getOfficialStatus()).getWeight(); + return data.getLiteratePopulation() + * (languageCode.equals("nn") + ? OfficialStatus.official_minority + : data.getOfficialStatus()) + .getWeight(); } private static String getBaseName(String languageCode) { - final String baseLanguage = languageCode.contains("Hant") ? 
languageCode : new ULocale(languageCode).getLanguage(); + final String baseLanguage = + languageCode.contains("Hant") + ? languageCode + : new ULocale(languageCode).getLanguage(); final String languageName = testInfo.getEnglish().getName(baseLanguage); return languageName; @@ -211,12 +299,15 @@ private static String getBaseName(String languageCode) { static class SequenceHandler extends FileUtilities.SemiFileReader { Map> country2internetUsers = new TreeMap(); - public final static Pattern TABS = Pattern.compile("\\t+"); - public static final Map name2code = new HashMap(); - public static final Map remapName = new HashMap(); + public static final Pattern TABS = Pattern.compile("\\t+"); + public static final Map name2code = new HashMap(); + public static final Map remapName = new HashMap(); + static { for (final String territory : STANDARD_CODES.getGoodAvailableCodes("territory")) { - name2code.put(testInfo.getEnglish().getName(CLDRFile.TERRITORY_NAME, territory), territory); + name2code.put( + testInfo.getEnglish().getName(CLDRFile.TERRITORY_NAME, territory), + territory); } remapName.put("Korea, South", "South Korea"); remapName.put("Hong Kong�*", "Hong Kong SAR China"); @@ -261,6 +352,7 @@ public SequenceHandler(Map> rawLanguageToSequencesC protected boolean isCodePoint() { return false; } + @Override protected boolean handleLine(int start, int end, String[] items) { String code = name2code.get(items[0]); @@ -269,52 +361,67 @@ protected boolean handleLine(int start, int end, String[] items) { code = name2code.get(rename); } if (code == null) { - //System.out.println("remapName.put(\"" + items[0] + "\", \"XX\");"); + // System.out.println("remapName.put(\"" + items[0] + "\", \"XX\");"); code = items[0]; } - country2internetUsers.put(code, - Row.of(Integer.parseInt(items[1].replace(",","")), Integer.parseInt(items[2].replace(",","")))); + country2internetUsers.put( + code, + Row.of( + Integer.parseInt(items[1].replace(",", "")), + 
Integer.parseInt(items[2].replace(",", "")))); return true; } + @Override protected void handleEnd() { - final Set missing = new TreeSet(STANDARD_CODES.getGoodAvailableCodes("territory")); + final Set missing = + new TreeSet(STANDARD_CODES.getGoodAvailableCodes("territory")); missing.removeAll(country2internetUsers.keySet()); for (final String s : missing) { - System.out.println("//missing\t" + s + "\t" + testInfo.getEnglish().getName(CLDRFile.TERRITORY_NAME, s)); + System.out.println( + "//missing\t" + + s + + "\t" + + testInfo.getEnglish().getName(CLDRFile.TERRITORY_NAME, s)); } } } private static void countryPopulation() { - final Set> byPopulation = new TreeSet>(); + final Set> byPopulation = + new TreeSet>(); for (final String code : STANDARD_CODES.getGoodAvailableCodes("territory")) { - final Set numbers = testInfo.getSupplementalDataInfo().numericTerritoryMapping - .getAll(code); + final Set numbers = + testInfo.getSupplementalDataInfo().numericTerritoryMapping.getAll(code); if (numbers == null) { - //System.out.println("Skipping " + code); + // System.out.println("Skipping " + code); continue; } for (final Integer regionNumber : numbers) { - final PopulationData population = testInfo.getSupplementalDataInfo() - .getPopulationDataForTerritory(code); + final PopulationData population = + testInfo.getSupplementalDataInfo().getPopulationDataForTerritory(code); if (population == null) { System.out.println("Skipping " + code + ", " + regionNumber); continue; } - final R4 items = Row - .of(population.getPopulation(), population.getGdp(), code, regionNumber); + final R4 items = + Row.of(population.getPopulation(), population.getGdp(), code, regionNumber); byPopulation.add(items); } } for (final R4 row : byPopulation) { final String name = row.get2(); - System.out.println(testInfo.getEnglish().getName(CLDRFile.TERRITORY_NAME, name) - + "\t" + row.get0() - + "\t" + row.get1() - + "\t" + name - + "\t" + row.get3()); + System.out.println( + 
testInfo.getEnglish().getName(CLDRFile.TERRITORY_NAME, name) + + "\t" + + row.get0() + + "\t" + + row.get1() + + "\t" + + name + + "\t" + + row.get3()); } } } diff --git a/unicodetools/src/main/java/org/unicode/draft/FindHanSizes.java b/unicodetools/src/main/java/org/unicode/draft/FindHanSizes.java index aaff157ff..ea5ec74ab 100644 --- a/unicodetools/src/main/java/org/unicode/draft/FindHanSizes.java +++ b/unicodetools/src/main/java/org/unicode/draft/FindHanSizes.java @@ -1,5 +1,13 @@ package org.unicode.draft; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.Normalizer2.Mode; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.IOException; import java.nio.charset.Charset; @@ -14,25 +22,16 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.jsp.CharEncoder; import org.unicode.jsp.FileUtilities; import org.unicode.jsp.FileUtilities.SemiFileReader; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Normalizer2; -import com.ibm.icu.text.Normalizer2.Mode; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.ULocale; - public class FindHanSizes { - static final int SHOW_LIMIT = 100; - static Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkc", Mode.DECOMPOSE); - static UnicodeSet NONCANONICAL = new UnicodeSet("[:nfd_qc=n:]"); + static final int SHOW_LIMIT = 100; + static Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkc", Mode.DECOMPOSE); + static UnicodeSet NONCANONICAL = new UnicodeSet("[:nfd_qc=n:]"); static final UnicodeSet HAN; + static { // make sure we include the characters that contain HAN HAN = new 
UnicodeSet("[[:sc=han:][:ideographic:]]"); @@ -46,7 +45,16 @@ public class FindHanSizes { } enum NamedHanSet { - zh, zh_Hant, GB2312, GBK, Big5, Big5_HKSCS, Stroke, Pinyin, NewStroke, NewPinyin; + zh, + zh_Hant, + GB2312, + GBK, + Big5, + Big5_HKSCS, + Stroke, + Pinyin, + NewStroke, + NewPinyin; public static String toString(EnumSet set) { final StringBuilder result = new StringBuilder(); @@ -61,7 +69,7 @@ public static String toString(EnumSet set) { } static final SetComparator SINGLETON = new SetComparator(); - static Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Mode.COMPOSE); + static Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Mode.COMPOSE); static class EncodingInfo { Map> status = new HashMap(); @@ -98,7 +106,7 @@ public UnicodeMap> getUnicodeMap() { } public void addAll(UnicodeSet tailored, NamedHanSet e) { - for (final UnicodeSetIterator it = new UnicodeSetIterator(tailored); it.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(tailored); it.next(); ) { add(it.codepoint, e); } } @@ -111,16 +119,25 @@ public UnicodeMap> showContents() { for (final EnumSet value : set) { final UnicodeSet keys = unicodeMap.getSet(value); - System.out.println(NamedHanSet.toString(value) + "\t" + keys.size() + "\t" + toAbbreviated(SHOW_LIMIT, keys)); + System.out.println( + NamedHanSet.toString(value) + + "\t" + + keys.size() + + "\t" + + toAbbreviated(SHOW_LIMIT, keys)); } - System.out.println("Total:\t" + unicodeMap.size() + "\t" + toAbbreviated(SHOW_LIMIT, unicodeMap.keySet())); + System.out.println( + "Total:\t" + + unicodeMap.size() + + "\t" + + toAbbreviated(SHOW_LIMIT, unicodeMap.keySet())); return unicodeMap; } } /** * Shows only chars, and just to the limit - * + * * @param input * @param limit * @return @@ -132,8 +149,11 @@ static String toAbbreviated(int limit, UnicodeSet input) { } else { final UnicodeSet smaller = new UnicodeSet(); int count = 0; - for (final UnicodeSetIterator it = new UnicodeSetIterator(input); 
it.nextRange();) { - count += it.codepoint == it.codepointEnd ? 1 : it.codepoint + 1 == it.codepointEnd ? 2 : 3; + for (final UnicodeSetIterator it = new UnicodeSetIterator(input); it.nextRange(); ) { + count += + it.codepoint == it.codepointEnd + ? 1 + : it.codepoint + 1 == it.codepointEnd ? 2 : 3; if (count > limit) { break; } @@ -147,7 +167,9 @@ public static void main(String[] args) { System.out.println("Use GenerateHanCollators for data"); System.out.println("All Han:\t" + HAN.size() + "\t" + HAN.toPattern(false)); final Set collators = new TreeSet(); - collators.addAll(Arrays.asList(Collator.getKeywordValuesForLocale("collation", ULocale.CHINESE, false))); + collators.addAll( + Arrays.asList( + Collator.getKeywordValuesForLocale("collation", ULocale.CHINESE, false))); System.out.println("Collators:\t" + collators); for (final String collatorType : collators) { final UnicodeSet set = getTailoredHan(collatorType); @@ -184,7 +206,7 @@ public static void main(String[] args) { } final UnicodeSet result = getCharsetRepertoire(name); final NamedHanSet e = NamedHanSet.valueOf(name.replace("-", "_")); - for (final String s: result) { + for (final String s : result) { final int cp = s.codePointAt(0); info.add(cp, e); } @@ -210,7 +232,8 @@ static UnicodeSet getCharsetRepertoire(String name) { private static void addNewCollator(EncodingInfo info, NamedHanSet e) { try { - final BufferedReader in = FileUtilities.openFile(FindHanSizes.class, e + "_repertoire.txt"); + final BufferedReader in = + FileUtilities.openFile(FindHanSizes.class, e + "_repertoire.txt"); final String contents = FileUtilities.getFileAsString(in); final UnicodeSet items = new UnicodeSet(contents); items.retainAll(HAN); @@ -228,7 +251,8 @@ private static void addTailoredHan(EncodingInfo info, NamedHanSet e) { private static UnicodeSet getTailoredHan(String type) { final Collator collator = Collator.getInstance(new ULocale("zh_co_" + type)); - final UnicodeSet tailored = new 
UnicodeSet(collator.getTailoredSet()).retainAll(HAN).removeAll(NONCANONICAL); + final UnicodeSet tailored = + new UnicodeSet(collator.getTailoredSet()).retainAll(HAN).removeAll(NONCANONICAL); return tailored; } @@ -247,7 +271,7 @@ public static UnicodeSet getMostFrequent(String e, double limit) { // 1000 0.8716829002327223 [一七三-下不且世並中主久之九也了事二五 static class MyReader extends SemiFileReader { UnicodeSet results = new UnicodeSet(); - double limit; + double limit; MyReader(double limit) { this.limit = limit; @@ -274,6 +298,6 @@ protected boolean handleLine(int start, int end, String[] items) { } return true; } - - }; + } + ; } diff --git a/unicodetools/src/main/java/org/unicode/draft/FormatRegistry.java b/unicodetools/src/main/java/org/unicode/draft/FormatRegistry.java index b87775849..cc8f4feff 100644 --- a/unicodetools/src/main/java/org/unicode/draft/FormatRegistry.java +++ b/unicodetools/src/main/java/org/unicode/draft/FormatRegistry.java @@ -1,24 +1,25 @@ package org.unicode.draft; -import java.text.Format; import com.ibm.icu.util.ULocale; +import java.text.Format; public interface FormatRegistry { /** * Return a default Format for a given type of object. - * - * @param obj the input Object. For example, for an object of type Number, a NumberFormat would be appropriate to return. + * + * @param obj the input Object. For example, for an object of type Number, a NumberFormat would + * be appropriate to return. * @param ulocale * @return default format, or null if none available for that type of object. */ public abstract Format getFormatForObject(Class classType, ULocale ulocale); /** - * Return a key, like "number", or "number,currency", or "number,#0,0#". If - * that key were passed into getFormat (with the same uLocale), then a - * format would be generated that would be equal to this one. - * + * Return a key, like "number", or "number,currency", or "number,#0,0#". 
If that key were passed + * into getFormat (with the same uLocale), then a format would be generated that would be equal + * to this one. + * * @param format The format to generate a key for. * @param ulocale * @return @@ -27,17 +28,13 @@ public interface FormatRegistry { /** * From a key of the form mainType, subType, return a format. Either one may be a pattern. - * - * @param mainType - * Guaranteed to be non-empty. - * @param subType - * May be empty or not. An empty subtype always works (if the mainType is valid). + * + * @param mainType Guaranteed to be non-empty. + * @param subType May be empty or not. An empty subtype always works (if the mainType is valid). * @param ulocale - * @exception IllegalArgumentException - * thrown if the mainType is not valid, or or the subType - * invalid for the mainType. + * @exception IllegalArgumentException thrown if the mainType is not valid, or or the subType + * invalid for the mainType. * @return */ public abstract Format getFormat(String mainType, String subType, ULocale ulocale); - -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/draft/FormatSpecialData2.java b/unicodetools/src/main/java/org/unicode/draft/FormatSpecialData2.java index c6cfab8ac..d4a7c2d5f 100644 --- a/unicodetools/src/main/java/org/unicode/draft/FormatSpecialData2.java +++ b/unicodetools/src/main/java/org/unicode/draft/FormatSpecialData2.java @@ -1,5 +1,10 @@ package org.unicode.draft; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.UnicodeSet; import java.io.BufferedReader; import java.io.File; import java.io.IOException; @@ -11,35 +16,31 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.CldrUtility.CollectionComparator; import 
org.unicode.cldr.util.FileReaders; -import org.unicode.props.BagFormatter; import org.unicode.cldr.util.props.UnicodeLabel; import org.unicode.draft.ScriptCategories2.RemapType; - -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.text.UnicodeSet; +import org.unicode.props.BagFormatter; public class FormatSpecialData2 { private static final UnicodeSet UNASSIGNED = new UnicodeSet("[:cn:]").freeze(); - public static final UnicodeSet simpOnly = new UnicodeSet( - "[㑩㓥㔉㖊㖞㛟㛠㛿㟆㧑㧟㨫㱩㱮㲿㶉㶶㶽㺍㻏㻘䁖䅉䇲䌶-䌺䌼-䌾䍀䍁䓕䗖䘛䙊䙓䜣䜥䜧䝙䞌䞍䞐䢂䥿-䦁䩄䯃-䯅䲝䲞䴓-䴙万与丑专业-丝丢两严丧个丰临为丽举么义乌乐乔习乡书买乱争于亏云亚产亩亲亵亸亿仅仆从仑仓仪们价众优会伛伞-传伣-伧伪伫体佣佥侠侣侥-侪侬俣俦俨-俫俭债倾偬偻偾偿傥傧-傩儿克兑兖党兰关兴兹养兽冁内冈册写军农冯冲决况冻净准凉减凑凛几凤凫凭凯击凿刍划刘-创删别-刮制刹刽刿剀剂剐剑剥剧劝办务劢动励-劳势勋勚匀匦匮区医华协单卖卜卢卤卫却厂厅历厉压-厍厐厕厘厢厣厦厨厩厮县叁参双发变叙叠只叶号叹叽同向吓吕吗吣吨听启吴呐呒呓呕-呙呛呜咏咙咛咝咤咸响哑-哕哗哙哜哝哟唛唝唠-唢唤啧啬-啮啴啸喷喽喾嗫嗳嘘嘤嘱噜嚣团园困囱围囵国图圆圣圹场坂坏块坚-坠垄-垆垒垦垩垫垭垱垲垴埘-埚埯堑堕墙壮声壳壶壸处备复够头夸-夺奁奂奋奖奥奸妆-妈妩-妫姗姹娄-娈娱娲娴婳-婶媪嫒嫔嫱嬷孙学孪宁宝实宠审宪宫宽宾寝对寻导寿将尔尘尝尧尴尸尽层屃屉届属屡屦屿岁岂岖-岛岭岽岿峄峡峣-峦崂-崄崭嵘嵚嵝巅巩巯币帅师帏帐帘帜带帧帮帱帻帼幂干并广庄庆庐庑库应庙庞废廪开异弃弑张弥弪弯弹强归当录彝彦彻征径徕御忆忏忧忾怀-怆怜总怼怿恋恒恳恶恸-恽悦悫-悯惊惧-惩惫-惯愠愤愦愿慑懑懒懔戆戋戏戗战戬戯户扑执扩-扬扰抚抛抟-抢护报担拟拢拣拥-择挂挚-挦挽捝-损捡-捣据掳掴掷掸掺掼揽-搂搅携摄-摈摊撄撑撵撷撸撺擞攒敌敛数斋斓斗斩断无旧时-旸昙昼-显晋晒-晖暂暧术朴机杀杂权杆条来杨杩杰松板构枞枢枣枥枧枨枪枫枭柜柠柽栀栅标-栌栎栏树栖栗样栾桠-桩梦梼梾-棂椁椟椠椤椭楼榄榅榇-榉槚槛槟槠横樯樱橥橱橹橼檩欢欤欧歼殁殇残殒殓殚殡殴毁毂毕毙毡毵氇气氢氩氲汇汉汤汹沈沟没沣-沧沩沪泞注泪泶-泸泺-泾洁洒洼浃浅-浈浊测浍-浔涂涛涝-涡涣涤润-涩淀渊渌-渎渐渑渔渖渗温湾湿溃溅溆滗滚滞-滢滤-滦滨-滪漓漤潆潇潋潍潜潴澜濑濒灏灭灯灵灶灾-炀炉炖炜炝点炼炽烁-烃烛烟烦-烩烫-热焕焖焘煴爱爷牍牦牵牺犊状-犹狈狝狞独-狲猃猎猕猡猪-猬献獭玑玚玛玮-玱玺珐珑珰珲琏琐琼瑶瑷璎瓒瓯电画畅畴疖疗疟-疡疬-疯疱疴症-痉痒痖痨痪痫瘅瘆瘗瘘瘪瘫瘾瘿癞癣癫皑皱皲盏-监盖-盘眍眦眬着睁睐睑瞆瞒瞩矫矶矾-码砖砗砚砜砺砻砾础硁硕-硗硙确硷碍碛碜碱礼祃祎祢祯祷祸禀禄禅离秃秆种积称秽秾稆税稣稳穑穷窃窍窎窑窜窝窥窦窭竖竞笃笋笔笕笺笼笾筑筚-筝筹筼签简箓箦-箫篑篓篮篱簖籁籴类籼粜粝粤粪粮糁糇系紧累絷纟-缏缑-缵罂网罗罚罢罴羁羟翘耢耧耸耻聂聋-聍联聩聪肃肠肤肮肴肾-胁胆胜胡胧胨胪胫胶脉脍脏-脑脓脔脚脱脶脸腊腭腻-腾膑臜致舆舍舣舰舱舻艰艳艺节芈芗芜芦芸苁苇苈苋-苏苹范茎茏茑茔茕茧荆荐荙-荜荞-荡荣-药莅莱-莴莶-莺莼萝萤-萨葱蒇蒉蒋蒌蓝蓟蓠蓣蓥蓦蔂蔷蔹蔺蔼蕰蕲蕴薮藓蘖虏虑虚虫虬虮虽-蚂蚕蚬蛊蛎蛏蛮蛰-蛴蜕蜗蝇-蝉蝼蝾螀螨蟏衅衔补表衬衮袄-袆袜袭袯装裆裈裢-裥褛褴见-觑觞触觯訚誉誊讠-谈谊-谷豮贝-赣赪赵赶趋趱趸跃跄跞践-跹跻踊踌踪踬踯蹑蹒蹰蹿躏躜躯车-辚辞辩辫边辽达迁过迈运还这进-迟迩迳迹适选逊递逦逻遗遥邓邝邬邮邹-邻郁郏-郑郓郦郧郸酂酝酦酱酽-酿采释里鉴銮錾钅-镶长门-阛队阳-阶际-陉陕陧-险随隐隶隽难雏雠雳雾霁霡霭靓静面靥鞑鞒鞯韦-韬韵页-颢颤-颧风-飚飞飨餍饣-馕马-骧髅髋髌鬓魇魉鱼-鳣鸟-鹭鹯-鹴鹾麦麸黄黉黡黩黪黾鼋鼍鼗鼹齐齑齿-龌龙-龛龟𡒄𨱏]") - 
.freeze(); - public static final UnicodeSet tradOnly = new UnicodeSet( - "[㠏㩜䊷䋙䋻䝼䯀䰾䱽䲁丟並乾亂亞佇併來侖侶俁係俔俠倀倆倈倉個們倫偉側偵偽傑傖傘備傭傯傳-債傷傾僂僅僉僑僕僞僥僨價儀儂億儈儉儐儔儕儘償優儲儷儸儺-儼兌兒兗內兩冊冪凈凍凜凱別刪剄則剋剎剗剛剝剮剴創劃劇劉劊劌劍劏劑劚勁動務勛勝勞勢勩勱勵勸勻匭匯匱區協卻厙厠厭厲厴參叄叢吒吳吶呂呆咼員唄唚問啓啞啟啢喎喚喪喬單喲嗆嗇嗊嗎嗚嗩嗶嘆嘍嘔嘖嘗嘜嘩嘮-嘰嘵嘸嘽噓噚噝噠噥噦噯噲噴噸噹嚀嚇嚌嚕嚙嚦嚨嚲-嚴嚶囀-囂囅囈囑囪圇國圍園圓圖團垵埡埰執堅堊堖堝堯報場塊塋塏塒塗塢塤塵塹墊墜墮墳墻墾壇壈壋壓壘-壚壞-壠壢壩壯壺壼壽夠夢夾奐奧奩奪奬奮奼妝姍姦娛婁婦婭媧媯媼媽嫗嫵嫻嫿嬀嬈嬋嬌嬙嬡嬤嬪嬰嬸孌孫學孿宮寢實寧審寫寬寵寶將專尋對導尷屆屍屓屜屢層屨屬岡峴島峽崍崗崢崬嵐嶁嶄嶇嶔嶗嶠嶢嶧嶮嶴嶸嶺嶼巋巒巔巰帥師帳帶幀幃幗幘幟幣幫幬幹幺幾庫廁廂廄廈廚廝廟-廣廩廬廳弒弳張強彈彌彎彙彞彥後徑從徠復徵徹恆恥悅悞悵悶惡惱惲惻愛愜愨愴愷愾慄態慍慘慚慟慣慤慪慫慮慳慶憂憊憐-憒憚憤憫憮憲憶懇應懌懍懟懣懨懲懶-懸懺懼懾戀戇戔戧戩戰-戲戶拋挩挾捨捫掃掄掗掙掛採揀揚換揮損搖搗搵搶摑摜摟摯摳摶摻撈撏撐撓撝撟撣撥撫撲撳撻撾撿擁擄擇擊擋擓擔據擠擬擯-擲擴擷擺-擼擾攄攆攏攔攖攙攛-攝攢-攤攪攬敗敘敵數斂斃斕斬斷於時晉晝暈暉暘暢暫曄曆曇曉曏曖曠曨曬書會朧東杴柵桿梔梘條梟梲棄棖棗棟棧棲棶椏楊楓楨業極榪榮榲榿構槍槤槧槨槳樁樂樅樓標樞樣樸-樺橈橋機橢橫檁檉檔檜檟檢檣檮檯檳檸檻櫃櫓櫚櫛櫝-櫟櫥櫧櫨櫪-櫬櫱櫳櫸櫻欄權欏欒欖欞欽歐歟歡歲歷歸歿殘殞殤殨殫殮-殰殲殺-殼毀毆毿氂氈氌氣氫氬氳決沒沖況洶浹涇涼淚淥淪淵淶淺渙減渦測渾湊湞湯溈準溝溫滄滅滌滎滬滯滲滷滸滻滾滿漁漚漢漣漬漲漵漸漿潁潑潔潙潛潤潯潰潷潿澀澆澇澗澠澤澦澩澮澱濁濃濕濘濟濤濫濰濱濺濼濾瀅-瀇瀉瀋瀏瀕瀘瀝瀟瀠瀦-瀨瀲瀾灃灄灑灕灘灝灠灣灤灧災為烏烴無煉煒煙煢煥煩煬煱熅熒熗熱熲熾燁燈燉燒燙燜營燦燭燴燶燼燾爍爐爛爭爲爺爾牆牘牽犖犢犧狀狹狽猙猶猻獁獃-獅獎獨獪獫獮獰-獲獵獷獸獺-獼玀現琺琿瑋瑒瑣瑤瑩瑪瑲璉璣璦璫環璽瓊瓏瓔瓚甌產産畝畢畫異當疇疊痙痾瘂瘋瘍瘓瘞瘡瘧瘮瘲瘺瘻療癆癇癉癘癟癢癤癥癧癩癬-癮癰-癲發皚皰皸皺盜盞盡監盤盧盪眥眾睏睜睞瞘瞜瞞瞶瞼矓矚矯硜硤硨硯碩碭碸確碼磑磚磣磧磯磽礆礎礙礦礪-礬礱祿禍禎禕禡禦禪禮禰禱禿秈稅稈稏稟種稱穀穌-穎穠-穢穩穫穭窩窪窮窯窵窶窺竄竅竇竈竊竪競筆筍筧筴箋箏節範築篋篔篤篩篳簀簍簞簡簣簫簹簽簾籃籌籙籜籟籠籩籪籬籮粵糝糞糧糲糴糶糹糾紀紂約-紉紋納紐紓-紝紡紬細-紳紵紹紺紼紿絀終組-絆絎結絕絛絝絞絡絢給絨絰-絳絶絹綁綃綆綈綉綌綏綐經綜綞綠綢綣綫-維綯-綵綸-綻綽-綿緄緇緊緋緑-緔緗-線緝緞締緡緣緦編緩緬緯緱緲練緶緹緻縈-縋縐縑縕縗縛縝-縟縣縧縫縭縮縱-縳縵-縷縹總績繃繅繆繒織繕繚繞繡繢繩-繫繭-繰繳繸繹繼-繿纈纊續纍纏纓纖纘纜缽罈罌罰罵罷羅羆羈羋羥義習翹耬耮聖聞聯聰聲聳聵-職聹聽聾肅脅脈脛脫脹腎腖腡腦腫腳腸膃膚膠膩膽-膿臉臍臏臘臚臟臠臢臨臺與-舊艙艤艦艫艱艷芻茲荊莊莖莢莧華萇萊萬萵葉葒著葤葦葯葷蒓蒔蒞蒼蓀蓋蓮蓯蓴蓽蔔蔞蔣蔥蔦蔭蕁蕆蕎蕒蕓蕕蕘蕢蕩蕪蕭蕷薀薈薊薌薔薘薟薦薩薳薴薺藍藎藝藥藪藴藶藹藺蘄蘆蘇蘊蘋蘚蘞蘢蘭蘺蘿虆處虛虜號虧虯蛺蛻蜆蝕蝟蝦蝸螄螞螢螮螻螿蟄蟈蟎蟣蟬蟯蟲蟶蟻蠅蠆蠐蠑蠟蠣蠨蠱蠶蠻衆術衕衚衛衝衹袞裊裏補裝裡製複褌褘褲褳褸褻襇襏襖襝襠襤襪襬襯襲覆見覎規覓視覘覡覥覦親覬覯覲覷覺覽覿觀觴觶觸訁-訃計訊訌討訐訒訓訕-記訛訝訟訢訣訥訩訪設許訴訶診註詁詆詎詐詒詔-詘詛詞詠-詣試詩詫-詮詰-詳詵詼詿誄-誇誌認誑誒誕誘誚語誠誡誣-誦誨說説誰課誶誹誼誾調諂諄談諉請諍諏諑諒論諗諛-諞諢諤諦諧諫諭諮諱諳諶-諸諺諼諾謀-謂謄謅謊謎謐謔謖謗謙-講謝謠謡謨謫-謭謳謹謾譅證譎譏譖識-譚譜譫譯議譴護譸譽譾讀變讎讒讓讕讖讜讞豈豎豐豬豶貓貙貝-貢貧-責貯貰貲-貴貶-貸貺-貽貿-賅資賈賊賑-賓賕賙賚賜賞賠-賤賦賧質-賭賰賴賵賺-賾贄贅贇贈贊贋贍贏贐贓贔贖贗贛贜赬趕趙趨趲跡踐踴蹌蹕蹣蹤蹺躂躉-躋躍躑-躓躕躚躡躥躦躪軀車-軍軑軒軔軛軟軤軫軲軸-軼軾較輅輇-輊輒-輕輛-輟輥輦輩輪輬輯輳輸輻輾-轀轂轄-轆轉轍轎轔轟轡轢轤辦辭-辯農逕這連進運過達違遙遜遞遠適遲遷選遺遼邁還邇邊邏邐郟郵鄆鄉鄒鄔鄖鄧鄭鄰鄲鄴鄶鄺酇酈醖醜醞醫醬醱釀釁釃釅釋釐釒-釕釗-釙針釣釤釧釩釵釷釹釺鈀鈁鈃鈄鈈鈉鈍鈎鈐-鈒鈔鈕鈞鈣鈥-鈧鈮鈰鈳鈴鈷-鈺鈽-鉀鉅鉈鉉鉋鉍鉑鉕鉗鉚鉛鉞鉢鉤鉦鉬鉭鉶鉸鉺鉻鉿銀銃銅銍銑銓銖銘銚-銜銠銣銥銦銨-銬銱銳銷銹銻銼鋁鋃鋅鋇鋌鋏鋒鋙鋝鋟鋣-鋦鋨-鋪鋭-鋱鋶鋸鋼錁錄錆-錈錏錐錒錕錘-錛錟-錢錦錨錩錫錮錯録錳錶錸鍀鍁鍃鍆-鍈鍋鍍鍔鍘鍚鍛鍠鍤鍥鍩鍬鍰
鍵鍶鍺鍾鎂鎄鎇鎊鎔鎖鎘鎚鎛鎝鎡-鎣鎦鎧鎩鎪鎬鎮鎰鎲鎳鎵鎸鎿鏃鏇鏈鏌鏍鏐鏑鏗鏘鏜-鏟鏡鏢鏤鏨鏰鏵鏷鏹鏽鐃鐋鐐鐒-鐔鐘鐙鐝鐠鐦-鐨鐫鐮鐲鐳鐵鐶鐸鐺鐿鑄鑊鑌鑒鑔鑕鑞鑠鑣鑥鑭鑰-鑲鑷鑹鑼-鑿钁長門閂閃閆閈閉開閌閎閏閑間閔閘閡閣閥閨閩閫-閭閱閲閶閹閻-閿闃闆闈闊-闍闐闒-闖關闞闠闡闤闥阪陘陝陣陰陳陸陽隉隊階隕際隨險隱隴隸隻雋雖雙雛雜雞離難雲電霢霧霽靂靄靈靚靜靦靨鞀鞏鞝鞽韁韃韉韋-韍韓韙韜韞韻響頁-頃項-須頊頌頎-頓頗領頜頡頤頦頭頮頰頲頴頷-頹頻頽顆題-顏顒-顔願顙顛類顢顥顧顫顬顯-顱顳顴風颭-颯颱颳颶颸颺-颼飀飄飆飈飛飠飢飣飥飩-飫飭飯飲飴飼-飿餃-餅餉養餌餎餏餑-餓餕餖餘餚-餜餞餡館餱餳餶餷餺餼餾餿饁饃饅饈-饌饑饒饗饜饞饢馬-馮馱馳馴馹駁駐-駒駔駕駘駙駛駝駟駡駢駭駰駱駸駿騁騂騅騌-騏騖騙騤騧騫騭騮騰騶-騸騾驀-驅驊驌驍驏驕驗驚驛驟驢驤-驦驪驫骯髏髒體-髖髮鬆鬍鬚鬢鬥鬧鬩鬮鬱魎魘魚魛魢魨魯魴魷魺鮁鮃鮊鮋鮍鮎鮐-鮓鮚鮜-鮞鮦鮪鮫鮭鮮鮳鮶鮺鯀鯁鯇鯉鯊鯒鯔-鯗鯛鯝鯡鯢鯤鯧鯨鯪鯫鯰鯴鯷鯽鯿鰁-鰃鰈鰉鰍鰏鰐鰒鰓鰜鰟鰠鰣鰥鰨鰩鰭鰮鰱-鰳鰵鰷鰹-鰼鰾鱂鱅鱈鱉鱒鱔鱖-鱘鱝鱟鱠鱣鱤鱧鱨鱭鱯鱷鱸鱺鳥鳧鳩鳬鳲-鳴鳶鳾鴆鴇鴉鴒鴕鴛鴝-鴟鴣鴦鴨鴯鴰鴴鴷鴻鴿鵁-鵃鵐-鵓鵜鵝鵠鵡鵪鵬鵮鵯鵲鵷鵾鶄鶇鶉鶊鶓鶖鶘鶚鶡鶥鶩鶪鶬鶯鶲鶴鶹-鶼鶿-鷂鷄鷈鷊鷓鷖鷗鷙鷚鷥鷦鷫鷯鷲鷳鷸-鷺鷽鷿鸂鸇鸌鸏鸕鸘鸚鸛鸝鸞鹵鹹鹺鹼鹽麗麥麩麵麼麽黃黌點黨黲黶黷黽黿鼉鼴齊齋齎齏齒齔齕齗齙齜齟-齡齦齪齬齲齶齷龍龎龐龔龕龜𡞵𡠹𡢃𤪺𤫩𧜵𧝞𧩙𧵳𨋢𨦫𨧜𨯅𩣑𩶘]") - .freeze(); + public static final UnicodeSet simpOnly = + new UnicodeSet( + "[㑩㓥㔉㖊㖞㛟㛠㛿㟆㧑㧟㨫㱩㱮㲿㶉㶶㶽㺍㻏㻘䁖䅉䇲䌶-䌺䌼-䌾䍀䍁䓕䗖䘛䙊䙓䜣䜥䜧䝙䞌䞍䞐䢂䥿-䦁䩄䯃-䯅䲝䲞䴓-䴙万与丑专业-丝丢两严丧个丰临为丽举么义乌乐乔习乡书买乱争于亏云亚产亩亲亵亸亿仅仆从仑仓仪们价众优会伛伞-传伣-伧伪伫体佣佥侠侣侥-侪侬俣俦俨-俫俭债倾偬偻偾偿傥傧-傩儿克兑兖党兰关兴兹养兽冁内冈册写军农冯冲决况冻净准凉减凑凛几凤凫凭凯击凿刍划刘-创删别-刮制刹刽刿剀剂剐剑剥剧劝办务劢动励-劳势勋勚匀匦匮区医华协单卖卜卢卤卫却厂厅历厉压-厍厐厕厘厢厣厦厨厩厮县叁参双发变叙叠只叶号叹叽同向吓吕吗吣吨听启吴呐呒呓呕-呙呛呜咏咙咛咝咤咸响哑-哕哗哙哜哝哟唛唝唠-唢唤啧啬-啮啴啸喷喽喾嗫嗳嘘嘤嘱噜嚣团园困囱围囵国图圆圣圹场坂坏块坚-坠垄-垆垒垦垩垫垭垱垲垴埘-埚埯堑堕墙壮声壳壶壸处备复够头夸-夺奁奂奋奖奥奸妆-妈妩-妫姗姹娄-娈娱娲娴婳-婶媪嫒嫔嫱嬷孙学孪宁宝实宠审宪宫宽宾寝对寻导寿将尔尘尝尧尴尸尽层屃屉届属屡屦屿岁岂岖-岛岭岽岿峄峡峣-峦崂-崄崭嵘嵚嵝巅巩巯币帅师帏帐帘帜带帧帮帱帻帼幂干并广庄庆庐庑库应庙庞废廪开异弃弑张弥弪弯弹强归当录彝彦彻征径徕御忆忏忧忾怀-怆怜总怼怿恋恒恳恶恸-恽悦悫-悯惊惧-惩惫-惯愠愤愦愿慑懑懒懔戆戋戏戗战戬戯户扑执扩-扬扰抚抛抟-抢护报担拟拢拣拥-择挂挚-挦挽捝-损捡-捣据掳掴掷掸掺掼揽-搂搅携摄-摈摊撄撑撵撷撸撺擞攒敌敛数斋斓斗斩断无旧时-旸昙昼-显晋晒-晖暂暧术朴机杀杂权杆条来杨杩杰松板构枞枢枣枥枧枨枪枫枭柜柠柽栀栅标-栌栎栏树栖栗样栾桠-桩梦梼梾-棂椁椟椠椤椭楼榄榅榇-榉槚槛槟槠横樯樱橥橱橹橼檩欢欤欧歼殁殇残殒殓殚殡殴毁毂毕毙毡毵氇气氢氩氲汇汉汤汹沈沟没沣-沧沩沪泞注泪泶-泸泺-泾洁洒洼浃浅-浈浊测浍-浔涂涛涝-涡涣涤润-涩淀渊渌-渎渐渑渔渖渗温湾湿溃溅溆滗滚滞-滢滤-滦滨-滪漓漤潆潇潋潍潜潴澜濑濒灏灭灯灵灶灾-炀炉炖炜炝点炼炽烁-烃烛烟烦-烩烫-热焕焖焘煴爱爷牍牦牵牺犊状-犹狈狝狞独-狲猃猎猕猡猪-猬献獭玑玚玛玮-玱玺珐珑珰珲琏琐琼瑶瑷璎瓒瓯电画畅畴疖疗疟-疡疬-疯疱疴症-痉痒痖痨痪痫瘅瘆瘗瘘瘪瘫瘾瘿癞癣癫皑皱皲盏-监盖-盘眍眦眬着睁睐睑瞆瞒瞩矫矶矾-码砖砗砚砜砺砻砾础硁硕-硗硙确硷碍碛碜碱礼祃祎祢祯祷祸禀禄禅离秃秆种积称秽秾稆税稣稳穑穷窃窍窎窑窜窝窥窦窭竖竞笃笋笔笕笺笼笾筑筚-筝筹筼签简箓箦-箫篑篓篮篱簖籁籴类籼粜粝粤粪粮糁糇系紧累絷纟-缏缑-缵罂网罗罚罢罴羁羟翘耢耧耸耻聂聋-聍联聩聪肃肠肤肮肴肾-胁胆胜胡胧胨胪胫胶脉脍脏-脑脓脔脚脱脶脸腊腭腻-腾膑臜致舆舍舣舰舱舻艰艳艺节芈芗芜芦芸苁苇苈苋-苏苹范茎茏茑茔茕茧荆荐荙-荜荞-荡荣-药莅莱-莴莶-莺莼萝萤-萨葱蒇蒉蒋蒌蓝蓟蓠蓣蓥蓦蔂蔷蔹蔺蔼蕰蕲蕴薮藓蘖虏虑虚虫虬虮虽-蚂蚕蚬蛊蛎蛏蛮蛰-蛴蜕蜗蝇-蝉蝼蝾螀螨蟏衅衔补表衬衮袄-袆袜袭袯装裆裈裢-裥褛褴见-觑觞触觯訚誉誊讠-谈谊-谷豮贝-赣赪赵赶趋趱趸跃跄跞践-跹跻踊踌踪踬踯蹑蹒蹰蹿躏躜躯车-辚辞辩辫边辽达迁过迈运还这进-迟迩迳迹适选逊递逦逻遗遥邓邝邬邮邹-邻郁
郏-郑郓郦郧郸酂酝酦酱酽-酿采释里鉴銮錾钅-镶长门-阛队阳-阶际-陉陕陧-险随隐隶隽难雏雠雳雾霁霡霭靓静面靥鞑鞒鞯韦-韬韵页-颢颤-颧风-飚飞飨餍饣-馕马-骧髅髋髌鬓魇魉鱼-鳣鸟-鹭鹯-鹴鹾麦麸黄黉黡黩黪黾鼋鼍鼗鼹齐齑齿-龌龙-龛龟𡒄𨱏]") + .freeze(); + public static final UnicodeSet tradOnly = + new UnicodeSet( + "[㠏㩜䊷䋙䋻䝼䯀䰾䱽䲁丟並乾亂亞佇併來侖侶俁係俔俠倀倆倈倉個們倫偉側偵偽傑傖傘備傭傯傳-債傷傾僂僅僉僑僕僞僥僨價儀儂億儈儉儐儔儕儘償優儲儷儸儺-儼兌兒兗內兩冊冪凈凍凜凱別刪剄則剋剎剗剛剝剮剴創劃劇劉劊劌劍劏劑劚勁動務勛勝勞勢勩勱勵勸勻匭匯匱區協卻厙厠厭厲厴參叄叢吒吳吶呂呆咼員唄唚問啓啞啟啢喎喚喪喬單喲嗆嗇嗊嗎嗚嗩嗶嘆嘍嘔嘖嘗嘜嘩嘮-嘰嘵嘸嘽噓噚噝噠噥噦噯噲噴噸噹嚀嚇嚌嚕嚙嚦嚨嚲-嚴嚶囀-囂囅囈囑囪圇國圍園圓圖團垵埡埰執堅堊堖堝堯報場塊塋塏塒塗塢塤塵塹墊墜墮墳墻墾壇壈壋壓壘-壚壞-壠壢壩壯壺壼壽夠夢夾奐奧奩奪奬奮奼妝姍姦娛婁婦婭媧媯媼媽嫗嫵嫻嫿嬀嬈嬋嬌嬙嬡嬤嬪嬰嬸孌孫學孿宮寢實寧審寫寬寵寶將專尋對導尷屆屍屓屜屢層屨屬岡峴島峽崍崗崢崬嵐嶁嶄嶇嶔嶗嶠嶢嶧嶮嶴嶸嶺嶼巋巒巔巰帥師帳帶幀幃幗幘幟幣幫幬幹幺幾庫廁廂廄廈廚廝廟-廣廩廬廳弒弳張強彈彌彎彙彞彥後徑從徠復徵徹恆恥悅悞悵悶惡惱惲惻愛愜愨愴愷愾慄態慍慘慚慟慣慤慪慫慮慳慶憂憊憐-憒憚憤憫憮憲憶懇應懌懍懟懣懨懲懶-懸懺懼懾戀戇戔戧戩戰-戲戶拋挩挾捨捫掃掄掗掙掛採揀揚換揮損搖搗搵搶摑摜摟摯摳摶摻撈撏撐撓撝撟撣撥撫撲撳撻撾撿擁擄擇擊擋擓擔據擠擬擯-擲擴擷擺-擼擾攄攆攏攔攖攙攛-攝攢-攤攪攬敗敘敵數斂斃斕斬斷於時晉晝暈暉暘暢暫曄曆曇曉曏曖曠曨曬書會朧東杴柵桿梔梘條梟梲棄棖棗棟棧棲棶椏楊楓楨業極榪榮榲榿構槍槤槧槨槳樁樂樅樓標樞樣樸-樺橈橋機橢橫檁檉檔檜檟檢檣檮檯檳檸檻櫃櫓櫚櫛櫝-櫟櫥櫧櫨櫪-櫬櫱櫳櫸櫻欄權欏欒欖欞欽歐歟歡歲歷歸歿殘殞殤殨殫殮-殰殲殺-殼毀毆毿氂氈氌氣氫氬氳決沒沖況洶浹涇涼淚淥淪淵淶淺渙減渦測渾湊湞湯溈準溝溫滄滅滌滎滬滯滲滷滸滻滾滿漁漚漢漣漬漲漵漸漿潁潑潔潙潛潤潯潰潷潿澀澆澇澗澠澤澦澩澮澱濁濃濕濘濟濤濫濰濱濺濼濾瀅-瀇瀉瀋瀏瀕瀘瀝瀟瀠瀦-瀨瀲瀾灃灄灑灕灘灝灠灣灤灧災為烏烴無煉煒煙煢煥煩煬煱熅熒熗熱熲熾燁燈燉燒燙燜營燦燭燴燶燼燾爍爐爛爭爲爺爾牆牘牽犖犢犧狀狹狽猙猶猻獁獃-獅獎獨獪獫獮獰-獲獵獷獸獺-獼玀現琺琿瑋瑒瑣瑤瑩瑪瑲璉璣璦璫環璽瓊瓏瓔瓚甌產産畝畢畫異當疇疊痙痾瘂瘋瘍瘓瘞瘡瘧瘮瘲瘺瘻療癆癇癉癘癟癢癤癥癧癩癬-癮癰-癲發皚皰皸皺盜盞盡監盤盧盪眥眾睏睜睞瞘瞜瞞瞶瞼矓矚矯硜硤硨硯碩碭碸確碼磑磚磣磧磯磽礆礎礙礦礪-礬礱祿禍禎禕禡禦禪禮禰禱禿秈稅稈稏稟種稱穀穌-穎穠-穢穩穫穭窩窪窮窯窵窶窺竄竅竇竈竊竪競筆筍筧筴箋箏節範築篋篔篤篩篳簀簍簞簡簣簫簹簽簾籃籌籙籜籟籠籩籪籬籮粵糝糞糧糲糴糶糹糾紀紂約-紉紋納紐紓-紝紡紬細-紳紵紹紺紼紿絀終組-絆絎結絕絛絝絞絡絢給絨絰-絳絶絹綁綃綆綈綉綌綏綐經綜綞綠綢綣綫-維綯-綵綸-綻綽-綿緄緇緊緋緑-緔緗-線緝緞締緡緣緦編緩緬緯緱緲練緶緹緻縈-縋縐縑縕縗縛縝-縟縣縧縫縭縮縱-縳縵-縷縹總績繃繅繆繒織繕繚繞繡繢繩-繫繭-繰繳繸繹繼-繿纈纊續纍纏纓纖纘纜缽罈罌罰罵罷羅羆羈羋羥義習翹耬耮聖聞聯聰聲聳聵-職聹聽聾肅脅脈脛脫脹腎腖腡腦腫腳腸膃膚膠膩膽-膿臉臍臏臘臚臟臠臢臨臺與-舊艙艤艦艫艱艷芻茲荊莊莖莢莧華萇萊萬萵葉葒著葤葦葯葷蒓蒔蒞蒼蓀蓋蓮蓯蓴蓽蔔蔞蔣蔥蔦蔭蕁蕆蕎蕒蕓蕕蕘蕢蕩蕪蕭蕷薀薈薊薌薔薘薟薦薩薳薴薺藍藎藝藥藪藴藶藹藺蘄蘆蘇蘊蘋蘚蘞蘢蘭蘺蘿虆處虛虜號虧虯蛺蛻蜆蝕蝟蝦蝸螄螞螢螮螻螿蟄蟈蟎蟣蟬蟯蟲蟶蟻蠅蠆蠐蠑蠟蠣蠨蠱蠶蠻衆術衕衚衛衝衹袞裊裏補裝裡製複褌褘褲褳褸褻襇襏襖襝襠襤襪襬襯襲覆見覎規覓視覘覡覥覦親覬覯覲覷覺覽覿觀觴觶觸訁-訃計訊訌討訐訒訓訕-記訛訝訟訢訣訥訩訪設許訴訶診註詁詆詎詐詒詔-詘詛詞詠-詣試詩詫-詮詰-詳詵詼詿誄-誇誌認誑誒誕誘誚語誠誡誣-誦誨說説誰課誶誹誼誾調諂諄談諉請諍諏諑諒論諗諛-諞諢諤諦諧諫諭諮諱諳諶-諸諺諼諾謀-謂謄謅謊謎謐謔謖謗謙-講謝謠謡謨謫-謭謳謹謾譅證譎譏譖識-譚譜譫譯議譴護譸譽譾讀變讎讒讓讕讖讜讞豈豎豐豬豶貓貙貝-貢貧-責貯貰貲-貴貶-貸貺-貽貿-賅資賈賊賑-賓賕賙賚賜賞賠-賤賦賧質-賭賰賴賵賺-賾贄贅贇贈贊贋贍贏贐贓贔贖贗贛贜赬趕趙趨趲跡踐踴蹌蹕蹣蹤蹺躂躉-躋躍躑-躓躕躚躡躥躦躪軀車-軍軑軒軔軛軟軤軫軲軸-軼軾較輅輇-輊輒-輕輛-輟輥輦輩輪輬輯輳輸輻輾-轀轂轄-轆轉轍轎轔轟轡轢轤辦辭-辯農逕這連進運過達違遙遜遞遠適遲遷選遺遼邁還邇邊邏邐郟郵鄆鄉鄒鄔鄖鄧鄭鄰鄲鄴鄶鄺酇酈醖醜醞醫醬醱釀釁釃釅釋釐釒-釕釗-釙針釣釤釧釩釵釷釹釺鈀鈁鈃鈄鈈鈉鈍鈎鈐-鈒鈔
鈕鈞鈣鈥-鈧鈮鈰鈳鈴鈷-鈺鈽-鉀鉅鉈鉉鉋鉍鉑鉕鉗鉚鉛鉞鉢鉤鉦鉬鉭鉶鉸鉺鉻鉿銀銃銅銍銑銓銖銘銚-銜銠銣銥銦銨-銬銱銳銷銹銻銼鋁鋃鋅鋇鋌鋏鋒鋙鋝鋟鋣-鋦鋨-鋪鋭-鋱鋶鋸鋼錁錄錆-錈錏錐錒錕錘-錛錟-錢錦錨錩錫錮錯録錳錶錸鍀鍁鍃鍆-鍈鍋鍍鍔鍘鍚鍛鍠鍤鍥鍩鍬鍰鍵鍶鍺鍾鎂鎄鎇鎊鎔鎖鎘鎚鎛鎝鎡-鎣鎦鎧鎩鎪鎬鎮鎰鎲鎳鎵鎸鎿鏃鏇鏈鏌鏍鏐鏑鏗鏘鏜-鏟鏡鏢鏤鏨鏰鏵鏷鏹鏽鐃鐋鐐鐒-鐔鐘鐙鐝鐠鐦-鐨鐫鐮鐲鐳鐵鐶鐸鐺鐿鑄鑊鑌鑒鑔鑕鑞鑠鑣鑥鑭鑰-鑲鑷鑹鑼-鑿钁長門閂閃閆閈閉開閌閎閏閑間閔閘閡閣閥閨閩閫-閭閱閲閶閹閻-閿闃闆闈闊-闍闐闒-闖關闞闠闡闤闥阪陘陝陣陰陳陸陽隉隊階隕際隨險隱隴隸隻雋雖雙雛雜雞離難雲電霢霧霽靂靄靈靚靜靦靨鞀鞏鞝鞽韁韃韉韋-韍韓韙韜韞韻響頁-頃項-須頊頌頎-頓頗領頜頡頤頦頭頮頰頲頴頷-頹頻頽顆題-顏顒-顔願顙顛類顢顥顧顫顬顯-顱顳顴風颭-颯颱颳颶颸颺-颼飀飄飆飈飛飠飢飣飥飩-飫飭飯飲飴飼-飿餃-餅餉養餌餎餏餑-餓餕餖餘餚-餜餞餡館餱餳餶餷餺餼餾餿饁饃饅饈-饌饑饒饗饜饞饢馬-馮馱馳馴馹駁駐-駒駔駕駘駙駛駝駟駡駢駭駰駱駸駿騁騂騅騌-騏騖騙騤騧騫騭騮騰騶-騸騾驀-驅驊驌驍驏驕驗驚驛驟驢驤-驦驪驫骯髏髒體-髖髮鬆鬍鬚鬢鬥鬧鬩鬮鬱魎魘魚魛魢魨魯魴魷魺鮁鮃鮊鮋鮍鮎鮐-鮓鮚鮜-鮞鮦鮪鮫鮭鮮鮳鮶鮺鯀鯁鯇鯉鯊鯒鯔-鯗鯛鯝鯡鯢鯤鯧鯨鯪鯫鯰鯴鯷鯽鯿鰁-鰃鰈鰉鰍鰏鰐鰒鰓鰜鰟鰠鰣鰥鰨鰩鰭鰮鰱-鰳鰵鰷鰹-鰼鰾鱂鱅鱈鱉鱒鱔鱖-鱘鱝鱟鱠鱣鱤鱧鱨鱭鱯鱷鱸鱺鳥鳧鳩鳬鳲-鳴鳶鳾鴆鴇鴉鴒鴕鴛鴝-鴟鴣鴦鴨鴯鴰鴴鴷鴻鴿鵁-鵃鵐-鵓鵜鵝鵠鵡鵪鵬鵮鵯鵲鵷鵾鶄鶇鶉鶊鶓鶖鶘鶚鶡鶥鶩鶪鶬鶯鶲鶴鶹-鶼鶿-鷂鷄鷈鷊鷓鷖鷗鷙鷚鷥鷦鷫鷯鷲鷳鷸-鷺鷽鷿鸂鸇鸌鸏鸕鸘鸚鸛鸝鸞鹵鹹鹺鹼鹽麗麥麩麵麼麽黃黌點黨黲黶黷黽黿鼉鼴齊齋齎齏齒齔齕齗齙齜齟-齡齦齪齬齲齶齷龍龎龐龔龕龜𡞵𡠹𡢃𤪺𤫩𧜵𧝞𧩙𧵳𨋢𨦫𨧜𨯅𩣑𩶘]") + .freeze(); // public static final UnicodeSet bothSimpTrad = new // UnicodeSet("[:sc=han:]").removeAll(simpOnly).removeAll(tradOnly).freeze(); public static final UnicodeSet specialIPA = new UnicodeSet("[βΒ θΘ χΧ]"); public static void main(String[] args) throws IOException { - String resource = FileReaders.getRelativeFileName(FormatSpecialData2.class, "ScriptData.txt"); + String resource = + FileReaders.getRelativeFileName(FormatSpecialData2.class, "ScriptData.txt"); final UnicodeMap> pivot = new UnicodeMap>(); ; @@ -52,7 +53,9 @@ public static void main(String[] args) throws IOException { continue; } String specialName = type.toString().toLowerCase() + "Specials"; - PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY + "special/", specialName + ".txt"); + PrintWriter out = + FileUtilities.openUTF8Writer( + CLDRPaths.GEN_DIRECTORY + "special/", specialName + ".txt"); pivot.clear(); addDataToPivot(string2uset, pivot); if (type == RemapType.SCRIPT) { @@ -83,7 +86,8 @@ public static void main(String[] args) throws IOException { } } - private static void addDataToPivot(Map string2uset, final UnicodeMap> pivot) { + private static void addDataToPivot( + Map string2uset, final UnicodeMap> 
pivot) { for (String value : string2uset.keySet()) { UnicodeSet set = string2uset.get(value); // if (value.equals("Han")) { @@ -122,19 +126,21 @@ public OldUnicodeLabel(RemapType type) { public String getValue(int codepoint, boolean isShort) { int code = UCharacter.getIntPropertyValue(codepoint, ICUPropEnum); return UCharacter.getPropertyValueName(ICUPropEnum, code, UProperty.NameChoice.SHORT) - + " (" + UCharacter.getPropertyValueName(ICUPropEnum, code, UProperty.NameChoice.LONG) + ")"; + + " (" + + UCharacter.getPropertyValueName(ICUPropEnum, code, UProperty.NameChoice.LONG) + + ")"; } } private static int getIcuPropEnum(RemapType type) { int icuProp; switch (type) { - case SCRIPT: - icuProp = UProperty.SCRIPT; - break; - default: - icuProp = UProperty.GENERAL_CATEGORY; - break; + case SCRIPT: + icuProp = UProperty.SCRIPT; + break; + default: + icuProp = UProperty.GENERAL_CATEGORY; + break; } return icuProp; } @@ -188,11 +194,17 @@ public static BufferedReader openFile(Class class1, String file) throws IOExc return bufferedReader; } catch (Exception e) { File file1 = new File(file); - throw (RuntimeException) new IllegalArgumentException("Bad file name: " - // + path + "\t" + externalForm + "\t" + - + file1.getCanonicalPath() - + "\n" + new File(".").getCanonicalFile() + " => " - + Arrays.asList(new File(".").getCanonicalFile().list())).initCause(e); + throw (RuntimeException) + new IllegalArgumentException( + "Bad file name: " + // + path + "\t" + externalForm + "\t" + + + file1.getCanonicalPath() + + "\n" + + new File(".").getCanonicalFile() + + " => " + + Arrays.asList( + new File(".").getCanonicalFile().list())) + .initCause(e); } } } diff --git a/unicodetools/src/main/java/org/unicode/draft/FrequencyData2.java b/unicodetools/src/main/java/org/unicode/draft/FrequencyData2.java index cb29da11a..42047300a 100644 --- a/unicodetools/src/main/java/org/unicode/draft/FrequencyData2.java +++ b/unicodetools/src/main/java/org/unicode/draft/FrequencyData2.java @@ -1,5 
+1,16 @@ package org.unicode.draft; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.Normalizer.Mode; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; @@ -8,7 +19,6 @@ import java.util.Set; import java.util.TreeSet; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.Counter; @@ -17,30 +27,20 @@ import org.unicode.props.UnicodeProperty; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.Normalizer.Mode; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - public class FrequencyData2 { private static final boolean MARKUP = false; private static final boolean MAP_CASE = true; - private static final UnicodeSet NO_SCRIPT = new UnicodeSet( - "[[:script=common:][:script=inherited:][:script=unknown:]]"); + private static final UnicodeSet NO_SCRIPT = + new UnicodeSet("[[:script=common:][:script=inherited:][:script=unknown:]]"); static final UnicodeSet NfcNo = new UnicodeSet("[:nfcqc=no:]").freeze(); static final UnicodeSet NfcMaybe = new UnicodeSet("[:nfcqc=maybe:]").freeze(); - static final Transliterator fixOutput = Transliterator.createFromRules("fix", "" + - "([[:di:][:whitespace:][:co:]\"'']) > &any-hex/unicode($1) ;" + - "", Transliterator.FORWARD); + 
static final Transliterator fixOutput = + Transliterator.createFromRules( + "fix", + "" + "([[:di:][:whitespace:][:co:]\"'']) > &any-hex/unicode($1) ;" + "", + Transliterator.FORWARD); // private Counter langNfcNo = new Counter(); // private Counter langNfcMaybe = new Counter(); @@ -48,6 +48,7 @@ public class FrequencyData2 { // private Counter langUpper = new Counter(); private Map> langData = new HashMap>(); private Counter frequencies = new Counter(); + { langData.put("mul", frequencies); } @@ -55,89 +56,93 @@ public class FrequencyData2 { /** * The 1st column is the code point. * - * 2nd is detected language + *

2nd is detected language * - * Then there are 3 groups of 4 columns, where each group is: + *

Then there are 3 groups of 4 columns, where each group is: * - * pre-HTML code point count post-HTML code point count, document count, UTF-8 document count + *

pre-HTML code point count post-HTML code point count, document count, UTF-8 document count * - * The 1st group includes "bad" docs (error during input conversion or - * contains unassigned or high private use), 2nd group excludes "bad" - * docs, 3rd group is multiplied by pagerank (and excludes "bad" docs). + *

The 1st group includes "bad" docs (error during input conversion or contains unassigned or + * high private use), 2nd group excludes "bad" docs, 3rd group is multiplied by pagerank (and + * excludes "bad" docs). * - * Then there are up to 3 groups, where each group is: + *

Then there are up to 3 groups, where each group is: * - * navboost, pagerank, language, encoding, url + *

navboost, pagerank, language, encoding, url * * @param frequencyFile * @throws IOException */ static final int postFrequencyIndex = 2 + 4 + 1; + static final int preFrequencyIndex = 2 + 4 + 0; -// public FrequencyData2(String frequencyFile, boolean showProgress, boolean old) throws IOException { -// BufferedReader in = GenerateNormalizeForMatch2.openUTF8Reader(frequencyFile); -// for (int lineCount = 0;; ++lineCount) { -// String line = in.readLine(); -// if (line == null) break; -// int commentPos = line.indexOf("#"); -// if (commentPos >= 0) { -// line = line.substring(0, commentPos); -// } -// line = line.trim(); -// if (line.length() == 0) continue; -// String[] pieces = line.split("\\s+"); -// int code = Integer.parseInt(pieces[0], 16); -// -// if (showProgress && lineCount < 100 || (lineCount % 1000000) == 0 || code == 0x03C2) { -// System.out.println(lineCount + "\t" + line); -// } -// -// if (code < 0x20) code = 0x20; -// if (MAP_CASE) { -// code = UCharacter.toLowerCase(code); -// } -// long count = MARKUP -// ? 
Math.max(0, Long.parseLong(pieces[preFrequencyIndex]) - Long.parseLong(pieces[postFrequencyIndex])) -// : Long.parseLong(pieces[postFrequencyIndex]); -// String lang = pieces[1]; -// Counter langCounter = langData.get(lang); -// if (langCounter == null) { -// langData.put(lang, langCounter = new Counter()); -// } -// langCounter.add(code, count); -// // if (NfcNo.contains(code)) { -// // langNfcNo.add(lang, count); -// // } else if (NfcMaybe.contains(code)) { -// // langNfcMaybe.add(lang, count); -// // } -// // if (UCharacter.isUpperCase(code)) { -// // langUpper.add(lang, count); -// // } -// // langTotal.add(lang, count); -// frequencies.add(code, count); -// } -// in.close(); -// } + // public FrequencyData2(String frequencyFile, boolean showProgress, boolean old) throws + // IOException { + // BufferedReader in = GenerateNormalizeForMatch2.openUTF8Reader(frequencyFile); + // for (int lineCount = 0;; ++lineCount) { + // String line = in.readLine(); + // if (line == null) break; + // int commentPos = line.indexOf("#"); + // if (commentPos >= 0) { + // line = line.substring(0, commentPos); + // } + // line = line.trim(); + // if (line.length() == 0) continue; + // String[] pieces = line.split("\\s+"); + // int code = Integer.parseInt(pieces[0], 16); + // + // if (showProgress && lineCount < 100 || (lineCount % 1000000) == 0 || code == + // 0x03C2) { + // System.out.println(lineCount + "\t" + line); + // } + // + // if (code < 0x20) code = 0x20; + // if (MAP_CASE) { + // code = UCharacter.toLowerCase(code); + // } + // long count = MARKUP + // ? 
Math.max(0, Long.parseLong(pieces[preFrequencyIndex]) - + // Long.parseLong(pieces[postFrequencyIndex])) + // : Long.parseLong(pieces[postFrequencyIndex]); + // String lang = pieces[1]; + // Counter langCounter = langData.get(lang); + // if (langCounter == null) { + // langData.put(lang, langCounter = new Counter()); + // } + // langCounter.add(code, count); + // // if (NfcNo.contains(code)) { + // // langNfcNo.add(lang, count); + // // } else if (NfcMaybe.contains(code)) { + // // langNfcMaybe.add(lang, count); + // // } + // // if (UCharacter.isUpperCase(code)) { + // // langUpper.add(lang, count); + // // } + // // langTotal.add(lang, count); + // frequencies.add(code, count); + // } + // in.close(); + // } public FrequencyData2(String frequencyFile, boolean showProgress) throws IOException { - if (true) throw new IllegalArgumentException("old code: see CharacterFrequency"); + if (true) throw new IllegalArgumentException("old code: see CharacterFrequency"); BufferedReader in = GenerateNormalizeForMatch2.openUTF8Reader(frequencyFile); - for (int lineCount = 0;; ++lineCount) { + for (int lineCount = 0; ; ++lineCount) { String line = in.readLine(); if (line == null) break; -// int commentPos = line.indexOf("#"); -// if (commentPos >= 0) { -// line = line.substring(0, commentPos); -// } -// line = line.trim(); + // int commentPos = line.indexOf("#"); + // if (commentPos >= 0) { + // line = line.substring(0, commentPos); + // } + // line = line.trim(); if (line.length() == 0) continue; String[] pieces = line.split("\\t"); // -4.470007 n U+006E Ll Latn LATIN SMALL LETTER N double logFreq = Double.parseDouble(pieces[0]); double freq = Math.pow(10, logFreq); - long count = (int) Math.round(freq*Long.MAX_VALUE); + long count = (int) Math.round(freq * Long.MAX_VALUE); int code = Utility.fromHex(pieces[2]).codePointAt(0); if (showProgress && lineCount < 100 || (lineCount % 1000000) == 0 || code == 0x03C2) { @@ -211,7 +216,7 @@ public double getTotalRelative() { private 
RelativeFrequency(UnicodeSet withinSet, Mode compose) { Counter counter = new Counter(); - for (UnicodeSetIterator it = new UnicodeSetIterator(withinSet); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(withinSet); it.next(); ) { final long frequency = getCount(it.codepoint); if (frequency == 0) continue; if (compose == null) { @@ -275,30 +280,39 @@ private RelativeFrequency getRelativeFrequency(UnicodeSet withinSet, Mode compos } static NumberFormat nf = NumberFormat.getInstance(); + static { nf.setGroupingUsed(true); } private void showData(String category, int propEnum, UnicodeSet exclusions) { - for (int i = UCharacter.getIntPropertyMinValue(propEnum); i <= UCharacter.getIntPropertyMaxValue(propEnum); ++i) { - String valueAlias = UCharacter.getPropertyValueName(propEnum, i, UProperty.NameChoice.LONG); - String shortValueAlias = UCharacter.getPropertyValueName(propEnum, i, UProperty.NameChoice.SHORT); - // if (valueAlias.equalsIgnoreCase("common") || valueAlias.equalsIgnoreCase("inherited")) continue; + for (int i = UCharacter.getIntPropertyMinValue(propEnum); + i <= UCharacter.getIntPropertyMaxValue(propEnum); + ++i) { + String valueAlias = + UCharacter.getPropertyValueName(propEnum, i, UProperty.NameChoice.LONG); + String shortValueAlias = + UCharacter.getPropertyValueName(propEnum, i, UProperty.NameChoice.SHORT); + // if (valueAlias.equalsIgnoreCase("common") || + // valueAlias.equalsIgnoreCase("inherited")) continue; UnicodeSet valueChars = new UnicodeSet(); - valueChars.applyPropertyAlias(UCharacter.getPropertyName(propEnum, UProperty.NameChoice.SHORT), - shortValueAlias); + valueChars.applyPropertyAlias( + UCharacter.getPropertyName(propEnum, UProperty.NameChoice.SHORT), + shortValueAlias); valueChars.removeAll(exclusions); if (valueChars.size() == 0) continue; showData(category, shortValueAlias + " - " + valueAlias, valueChars); } } - private void showData2(String category, UnicodeProperty prop, UnicodeSet exclusions, boolean 
differences) { + private void showData2( + String category, UnicodeProperty prop, UnicodeSet exclusions, boolean differences) { UnicodeSet last = new UnicodeSet(); for (Object value : prop.getAvailableValues()) { String valueAlias = (String) value; - // if (valueAlias.equalsIgnoreCase("common") || valueAlias.equalsIgnoreCase("inherited")) continue; + // if (valueAlias.equalsIgnoreCase("common") || + // valueAlias.equalsIgnoreCase("inherited")) continue; UnicodeSet valueChars = new UnicodeSet(); valueChars.applyPropertyAlias(prop.getName(), valueAlias); @@ -322,7 +336,8 @@ private void showData(String category, String title, UnicodeSet valueChars) { sds.put(cp, sd); if (sd == standardDeviation.length) break; // boolean isNFKC = Normalizer.isNormalized(cp, Normalizer.COMPOSE_COMPAT, 0); - // System.out.println(new StringBuilder().appendCodePoint(cp) + "\t" + (totalFrequency*100) + "%\t" + sd + + // System.out.println(new StringBuilder().appendCodePoint(cp) + "\t" + + // (totalFrequency*100) + "%\t" + sd + // "\t" + (isNFKC ? 
"" : "K")); } @@ -366,37 +381,39 @@ private void showData(String category, String title, UnicodeSet valueChars) { static Pattern IICORE = PatternCache.get("U\\+([A-Z0-9]+)\\s+kIICore\\s+(.*)"); static UnicodeSet iiCoreSet; -// public static UnicodeSet getIICore() { -// if (iiCoreSet == null) { -// try { -// String unihanFile = CldrUtility.getProperty("unidata") + "/Unihan/Unihan_NormativeProperties.txt"; -// BufferedReader in = new BufferedReader(new FileReader(unihanFile)); -// Matcher iiCore = IICORE.matcher(""); -// iiCoreSet = new UnicodeSet(); -// while (true) { -// String line = in.readLine(); -// if (line == null) break; -// if (iiCore.reset(line).matches()) { -// int cp = Integer.parseInt(iiCore.group(1), 16); -// iiCoreSet.add(cp); -// } -// } -// in.close(); -// iiCoreSet.freeze(); -// } catch (IOException e) { -// throw new IllegalArgumentException(e); -// } -// } -// return iiCoreSet; -// } + // public static UnicodeSet getIICore() { + // if (iiCoreSet == null) { + // try { + // String unihanFile = CldrUtility.getProperty("unidata") + + // "/Unihan/Unihan_NormativeProperties.txt"; + // BufferedReader in = new BufferedReader(new FileReader(unihanFile)); + // Matcher iiCore = IICORE.matcher(""); + // iiCoreSet = new UnicodeSet(); + // while (true) { + // String line = in.readLine(); + // if (line == null) break; + // if (iiCore.reset(line).matches()) { + // int cp = Integer.parseInt(iiCore.group(1), 16); + // iiCoreSet.add(cp); + // } + // } + // in.close(); + // iiCoreSet.freeze(); + // } catch (IOException e) { + // throw new IllegalArgumentException(e); + // } + // } + // return iiCoreSet; + // } public static void main(String[] args) throws IOException { String frequencyFile = args[0]; FrequencyData2 data = new FrequencyData2(frequencyFile, true); -// System.out.println("IICoreSet\t" + getIICore().size() + "\t" + getIICore().toPattern(false)); + // System.out.println("IICoreSet\t" + getIICore().size() + "\t" + + // getIICore().toPattern(false)); -// 
showHan(data); + // showHan(data); writeSummary2(data); System.out.print("Category" + "\t"); @@ -407,12 +424,20 @@ public static void main(String[] args) throws IOException { System.out.print(1.0d + "\t"); System.out.println("Total"); - data.showData2("Age", ICUPropertyFactory.make().getProperty("age"), new UnicodeSet("[[:cn:][:co:]]"), true); + data.showData2( + "Age", + ICUPropertyFactory.make().getProperty("age"), + new UnicodeSet("[[:cn:][:co:]]"), + true); data.showData("Script/Cat", UCharacter.getPropertyEnum("script"), NO_SCRIPT); - data.showData("Script/Cat", UCharacter.getPropertyEnum("gc"), new UnicodeSet(NO_SCRIPT).complement()); + data.showData( + "Script/Cat", + UCharacter.getPropertyEnum("gc"), + new UnicodeSet(NO_SCRIPT).complement()); // data.showData("Private Use", PRIVATE_USE); - // RelativeFrequency relative = data.getRelativeFrequency(new UnicodeSet("[:script=unknown:]"), + // RelativeFrequency relative = data.getRelativeFrequency(new + // UnicodeSet("[:script=unknown:]"), // Normalizer.NFKC); // System.out.println(relative.getTotalRelative()); // for (int i = 0; i < 10; ++i) { @@ -422,65 +447,66 @@ public static void main(String[] args) throws IOException { // } } -private static void writeSummary2(FrequencyData2 data) { - long buckets[] = new long[4]; - for ( R2 entry : data.frequencies.getEntrySetSortedByCount(false, null)) { - int codepoint = entry.get1(); - long freq = entry.get0(); - int bucket; - if (codepoint <= 0x7F) { - bucket = 0; - } else if (codepoint <= 0x7FF) { - bucket = 1; - } else if (codepoint <= 0xFFFF) { - bucket = 2; - } else { - bucket = 3; - } - buckets[bucket] += freq; - if (buckets[bucket] < 0) { - throw new IllegalArgumentException(); + private static void writeSummary2(FrequencyData2 data) { + long buckets[] = new long[4]; + for (R2 entry : data.frequencies.getEntrySetSortedByCount(false, null)) { + int codepoint = entry.get1(); + long freq = entry.get0(); + int bucket; + if (codepoint <= 0x7F) { + bucket = 0; + } 
else if (codepoint <= 0x7FF) { + bucket = 1; + } else if (codepoint <= 0xFFFF) { + bucket = 2; + } else { + bucket = 3; + } + buckets[bucket] += freq; + if (buckets[bucket] < 0) { + throw new IllegalArgumentException(); + } } - } - long total = 0; - long counts[] = new long[]{0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; + long total = 0; + long counts[] = new long[] {0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; - for (int i = 0; i < 4; ++i) { - total += buckets[i]; - if (total < 0) { - throw new IllegalArgumentException(); - } - if (i > 0) { - counts[i] -= counts[i-1]; + for (int i = 0; i < 4; ++i) { + total += buckets[i]; + if (total < 0) { + throw new IllegalArgumentException(); + } + if (i > 0) { + counts[i] -= counts[i - 1]; + } } - } - for (int i = 0; i < 4; ++i) { - System.out.println((i+1) + "-byte:\t" - + 100*buckets[i]/(double) total + "%"); + for (int i = 0; i < 4; ++i) { + System.out.println((i + 1) + "-byte:\t" + 100 * buckets[i] / (double) total + "%"); + } } -} -// private static void showHan(FrequencyData2 data) { -// UnicodeSet han = new UnicodeSet("\\p{sc=han}").freeze(); -// UnicodeSet tranche = new UnicodeSet(); -// UnicodeSet iiCore2 = new UnicodeSet(getIICore()); -// int bucket = 0; -// for (int cp : data.frequencies.getKeysetSortedByCount(false)) { -// if (han.contains(cp)) { -// tranche.add(cp); -// if (tranche.size() >= 5000) { -// bucket += tranche.size(); -// UnicodeSet diff; -// diff = new UnicodeSet(tranche).removeAll(iiCore2); -// System.out.println(bucket + "\tNOT iiCore\t" + diff.size() + "\t" + diff.toPattern(false)); -// diff = iiCore2.removeAll(tranche); -// System.out.println(bucket + "\tiiCore\t" + diff.size() + "\t" + diff.toPattern(false)); -// tranche.clear(); -// } -// } -// } -// } + // private static void showHan(FrequencyData2 data) { + // UnicodeSet han = new UnicodeSet("\\p{sc=han}").freeze(); + // UnicodeSet tranche = new UnicodeSet(); + // UnicodeSet iiCore2 = new UnicodeSet(getIICore()); + // int bucket = 0; + // for (int cp : 
data.frequencies.getKeysetSortedByCount(false)) { + // if (han.contains(cp)) { + // tranche.add(cp); + // if (tranche.size() >= 5000) { + // bucket += tranche.size(); + // UnicodeSet diff; + // diff = new UnicodeSet(tranche).removeAll(iiCore2); + // System.out.println(bucket + "\tNOT iiCore\t" + diff.size() + "\t" + + // diff.toPattern(false)); + // diff = iiCore2.removeAll(tranche); + // System.out.println(bucket + "\tiiCore\t" + diff.size() + "\t" + + // diff.toPattern(false)); + // tranche.clear(); + // } + // } + // } + // } static class CountLang implements Comparable { long total; @@ -519,8 +545,10 @@ private static void writeSummary(FrequencyData2 data) throws IOException { long runningTotal = 0; double threshold = standardDeviation[4] * total; - PrintWriter out = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, "/char_frequencies/" + lang - + (MARKUP ? "_markup" : "") + ".txt"); + PrintWriter out = + FileUtilities.openUTF8Writer( + CLDRPaths.GEN_DIRECTORY, + "/char_frequencies/" + lang + (MARKUP ? 
"_markup" : "") + ".txt"); out.println("lang\trank\tcount\tlangPPB\tNFC\tcat\tscript\tcodepoint\tchar\tname"); writeLine(out, lang, 0, total, total, 0, null); @@ -543,13 +571,21 @@ private static void writeSummary(FrequencyData2 data) throws IOException { // nfcMaybeCount += langCount; // } rank++; - } out.close(); for (String s : normCounter.getKeysetSortedByKey()) { final long count2 = normCounter.getCount(s); - System.out.println("NFC:\t" + lang + "\t" + s + "\t" + count2 + "\t" + total + "\t" - + (count / (double) total)); + System.out.println( + "NFC:\t" + + lang + + "\t" + + s + + "\t" + + count2 + + "\t" + + total + + "\t" + + (count / (double) total)); } log.flush(); // System.out.println(s + "\t" + new ULocale(s).getDisplayName() @@ -561,59 +597,95 @@ private static void writeSummary(FrequencyData2 data) throws IOException { // + "\t" + data.langUpper.getCount(s) // + "\t" + data.langTotal.getCount(s)); } - } - private static void writeLine(PrintWriter out, String lang, int code, long langCount, long total2, int rank, - Counter normCounter) { + private static void writeLine( + PrintWriter out, + String lang, + int code, + long langCount, + long total2, + int rank, + Counter normCounter) { if (code == 0) { - out.println(lang - + "\t" + 0 - + "\t" + langCount - + "\t" + 1000000000 * langCount / total2 - + "\t" + "Total"); + out.println( + lang + + "\t" + + 0 + + "\t" + + langCount + + "\t" + + 1000000000 * langCount / total2 + + "\t" + + "Total"); } else { final String normalizationType = getNormalizationType(code); if (normCounter != null) { normCounter.add(normalizationType, langCount); } - out.println(lang - + "\t" + rank - + "\t" + langCount - + "\t" + 1000000000 * langCount / total2 - + "\t" + normalizationType - + "\t" + getValueAlias(code, UProperty.GENERAL_CATEGORY, UProperty.NameChoice.SHORT).charAt(0) - + "\t" + getValueAlias(code, UProperty.SCRIPT, UProperty.NameChoice.SHORT) - + "\t" + "U+" + com.ibm.icu.impl.Utility.hex(code, 4) - + "\t" + 
toChar(code) - + "\t" + UCharacter.getExtendedName(code)); + out.println( + lang + + "\t" + + rank + + "\t" + + langCount + + "\t" + + 1000000000 * langCount / total2 + + "\t" + + normalizationType + + "\t" + + getValueAlias( + code, + UProperty.GENERAL_CATEGORY, + UProperty.NameChoice.SHORT) + .charAt(0) + + "\t" + + getValueAlias(code, UProperty.SCRIPT, UProperty.NameChoice.SHORT) + + "\t" + + "U+" + + com.ibm.icu.impl.Utility.hex(code, 4) + + "\t" + + toChar(code) + + "\t" + + UCharacter.getExtendedName(code)); } - } private static String getValueAlias(int code, int propEnum, int nameChoice) { if (propEnum == UProperty.SCRIPT && code < 0x80) { return "ASCII"; } - return UCharacter.getPropertyValueName(propEnum, UCharacter.getIntPropertyValue(code, propEnum), nameChoice); + return UCharacter.getPropertyValueName( + propEnum, UCharacter.getIntPropertyValue(code, propEnum), nameChoice); } private static String getNormalizationType(Integer code) { - String nfd = UCharacter.getPropertyValueName(UProperty.NFD_QUICK_CHECK, - UCharacter.getIntPropertyValue(code, UProperty.NFD_QUICK_CHECK), UProperty.NameChoice.SHORT); - String nfc = UCharacter.getPropertyValueName(UProperty.NFC_QUICK_CHECK, - UCharacter.getIntPropertyValue(code, UProperty.NFC_QUICK_CHECK), UProperty.NameChoice.SHORT); - String nfkd = UCharacter.getPropertyValueName(UProperty.NFKD_QUICK_CHECK, - UCharacter.getIntPropertyValue(code, UProperty.NFKD_QUICK_CHECK), UProperty.NameChoice.SHORT); - String nfkc = UCharacter.getPropertyValueName(UProperty.NFKC_QUICK_CHECK, - UCharacter.getIntPropertyValue(code, UProperty.NFKC_QUICK_CHECK), UProperty.NameChoice.SHORT); + String nfd = + UCharacter.getPropertyValueName( + UProperty.NFD_QUICK_CHECK, + UCharacter.getIntPropertyValue(code, UProperty.NFD_QUICK_CHECK), + UProperty.NameChoice.SHORT); + String nfc = + UCharacter.getPropertyValueName( + UProperty.NFC_QUICK_CHECK, + UCharacter.getIntPropertyValue(code, UProperty.NFC_QUICK_CHECK), + 
UProperty.NameChoice.SHORT); + String nfkd = + UCharacter.getPropertyValueName( + UProperty.NFKD_QUICK_CHECK, + UCharacter.getIntPropertyValue(code, UProperty.NFKD_QUICK_CHECK), + UProperty.NameChoice.SHORT); + String nfkc = + UCharacter.getPropertyValueName( + UProperty.NFKC_QUICK_CHECK, + UCharacter.getIntPropertyValue(code, UProperty.NFKC_QUICK_CHECK), + UProperty.NameChoice.SHORT); String result = nfc + nfd + nfkc + nfkd; result = result.replace("Y", "+").replace("N", "-").replace("M", "?"); - if (result.equals("++++")) - result = "+"; - else if (result.equals("----")) - result = "-"; - else if (result.substring(0, 2).equals(result.substring(2, 4))) result = result.substring(0, 2); + if (result.equals("++++")) result = "+"; + else if (result.equals("----")) result = "-"; + else if (result.substring(0, 2).equals(result.substring(2, 4))) + result = result.substring(0, 2); return "'" + result; } diff --git a/unicodetools/src/main/java/org/unicode/draft/FuzzyNumber.java b/unicodetools/src/main/java/org/unicode/draft/FuzzyNumber.java index 16ddf7c67..116e12279 100644 --- a/unicodetools/src/main/java/org/unicode/draft/FuzzyNumber.java +++ b/unicodetools/src/main/java/org/unicode/draft/FuzzyNumber.java @@ -1,6 +1,5 @@ package org.unicode.draft; - import java.text.NumberFormat; public class FuzzyNumber { @@ -27,7 +26,10 @@ public static FuzzyNumber parse(String in) { in = in.trim(); final double value = Double.parseDouble(in); final int decimalPos = in.indexOf('.'); - final double increment = decimalPos < 0 || decimalPos == in.length() - 1 ? 0.5 : 0.5 / Math.pow(10, in.length() - decimalPos - 1); + final double increment = + decimalPos < 0 || decimalPos == in.length() - 1 + ? 
0.5 + : 0.5 / Math.pow(10, in.length() - decimalPos - 1); return new FuzzyNumber(value, increment); } @@ -62,7 +64,8 @@ public FuzzyNumber invert() { if (upper < 0) { return new FuzzyNumber(1.0D / value, 1.0D / upper, 1.0D / lower); } else { - return new FuzzyNumber(1.0D / value, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY); + return new FuzzyNumber( + 1.0D / value, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY); } } else { // == 0 or NaN return new FuzzyNumber(1.0D / value, 1.0D / upper, Double.POSITIVE_INFINITY); @@ -78,7 +81,11 @@ public FuzzyNumber divide(FuzzyNumber other) { @Override public String toString() { - return nf.format(value) + " {-" + pf.format(Math.abs((value - lower) / value)) + "+" + pf.format(Math.abs((upper - value) / value)) + "}"; + return nf.format(value) + + " {-" + + pf.format(Math.abs((value - lower) / value)) + + "+" + + pf.format(Math.abs((upper - value) / value)) + + "}"; } - } diff --git a/unicodetools/src/main/java/org/unicode/draft/FuzzyTest.java b/unicodetools/src/main/java/org/unicode/draft/FuzzyTest.java index 2d5126822..2b9f90e6d 100644 --- a/unicodetools/src/main/java/org/unicode/draft/FuzzyTest.java +++ b/unicodetools/src/main/java/org/unicode/draft/FuzzyTest.java @@ -1,4 +1,5 @@ package org.unicode.draft; + import java.util.Random; public class FuzzyTest { @@ -13,7 +14,6 @@ public static void main(String[] args) { expected = new FuzzyNumber(1.0, 0.95, 1.05); assertEquals("aa", expected, aa); - final Random r = new Random(0); for (int k = 0; k < 1000; ++k) { final FuzzyNumber a = new FuzzyNumber(r.nextInt(200) / 10.0D - 10, 0.5); @@ -24,11 +24,35 @@ public static void main(String[] args) { final FuzzyNumber multiplied = a.multiply(b); final FuzzyNumber divided = a.divide(b); if (false) { - System.out.println("1/" + a + " = " + invert - + "\t\t" + a + " + " + b + " = " + added - + "\t\t" + a + " - " + b + " = " + subtracted - + "\t\t" + a + " × " + b + " = " + multiplied - + "\t\t" + a + " ÷ " + b + " = " + 
divided); + System.out.println( + "1/" + + a + + " = " + + invert + + "\t\t" + + a + + " + " + + b + + " = " + + added + + "\t\t" + + a + + " - " + + b + + " = " + + subtracted + + "\t\t" + + a + + " × " + + b + + " = " + + multiplied + + "\t\t" + + a + + " ÷ " + + b + + " = " + + divided); } for (int i = 1; i < 10; ++i) { @@ -49,7 +73,7 @@ public static void main(String[] args) { } private static void assertEquals(String string, FuzzyNumber aa, FuzzyNumber bb) { - if (!equals(aa,bb)) { + if (!equals(aa, bb)) { System.out.println("FAILURE " + string + "\t" + aa + " != " + bb); } else { System.out.println(string + "\t" + aa + " = " + bb); @@ -58,9 +82,7 @@ private static void assertEquals(String string, FuzzyNumber aa, FuzzyNumber bb) private static boolean equals(FuzzyNumber aa, FuzzyNumber bb) { // TODO Auto-generated method stub - return aa.value == bb.value - && aa.lower == bb.lower - && aa.upper == bb.upper; + return aa.value == bb.value && aa.lower == bb.lower && aa.upper == bb.upper; } private static boolean assertCovers(String title, FuzzyNumber added, double d) { @@ -74,5 +96,4 @@ private static boolean assertCovers(String title, FuzzyNumber added, double d) { } return true; } - } diff --git a/unicodetools/src/main/java/org/unicode/draft/GenerateCasedPairs.java b/unicodetools/src/main/java/org/unicode/draft/GenerateCasedPairs.java index 933e6abae..2ffae82cc 100644 --- a/unicodetools/src/main/java/org/unicode/draft/GenerateCasedPairs.java +++ b/unicodetools/src/main/java/org/unicode/draft/GenerateCasedPairs.java @@ -1,6 +1,4 @@ package org.unicode.draft; -import java.util.Set; -import java.util.TreeSet; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; @@ -9,20 +7,26 @@ import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; import com.ibm.icu.util.ULocale; +import java.util.Set; +import java.util.TreeSet; public class GenerateCasedPairs { public static void main(String[] args) { - final UnicodeSet set = new 
UnicodeSet("[[:script=Cyrillic:]" + - "-[:block=Phonetic_Extensions:]" + - "-[:block=Cyrillic_Extended_A:]" + - "-[:block=Cyrillic_Extended_B:]" + - "-[:block=Cyrillic_Supplement:]" + - "]"); + final UnicodeSet set = + new UnicodeSet( + "[[:script=Cyrillic:]" + + "-[:block=Phonetic_Extensions:]" + + "-[:block=Cyrillic_Extended_A:]" + + "-[:block=Cyrillic_Extended_B:]" + + "-[:block=Cyrillic_Supplement:]" + + "]"); final Set lower = new TreeSet(Collator.getInstance(ULocale.ENGLISH)); final Set blocks = new TreeSet(); - for (final UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(set); it.next(); ) { lower.add(UCharacter.toLowerCase(ULocale.ENGLISH, it.getString())); - final String block = UCharacter.getStringPropertyValue(UProperty.BLOCK, it.codepoint, UProperty.NameChoice.LONG); + final String block = + UCharacter.getStringPropertyValue( + UProperty.BLOCK, it.codepoint, UProperty.NameChoice.LONG); blocks.add(block); } System.out.println("Blocks: " + blocks); diff --git a/unicodetools/src/main/java/org/unicode/draft/GenerateCharacterFrequencyCharts.java b/unicodetools/src/main/java/org/unicode/draft/GenerateCharacterFrequencyCharts.java index 7dd79ca7d..2beea662a 100644 --- a/unicodetools/src/main/java/org/unicode/draft/GenerateCharacterFrequencyCharts.java +++ b/unicodetools/src/main/java/org/unicode/draft/GenerateCharacterFrequencyCharts.java @@ -1,5 +1,15 @@ package org.unicode.draft; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; import java.io.DataOutputStream; import java.io.FileOutputStream; import java.io.IOException; @@ -10,7 +20,6 @@ import java.util.List; 
import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.draft.ExemplarInfo; import org.unicode.cldr.draft.ExemplarInfo.Status; import org.unicode.cldr.draft.FileUtilities; @@ -23,18 +32,6 @@ import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; - - public class GenerateCharacterFrequencyCharts { static UnicodeSet SKIP_NAME = new UnicodeSet("[:UnifiedIdeograph:]"); @@ -51,33 +48,66 @@ private void run() throws IOException { final NumberFormat nf = NumberFormat.getNumberInstance(ULocale.ENGLISH); nf.setGroupingUsed(true); - final TablePrinter indexTable = new TablePrinter() - .addColumn("Char Count").setCellAttributes("class='count'").setSortAscending(false).setSortPriority(0).setCellPattern("{0,number,#,##0}") - .addColumn("%/Total").setCellAttributes("class='pc'").setCellPattern("{0,number,#,##0.00%}") - .addColumn("Locale").setCellAttributes("class='locale'").setCellPattern("{0}") - .addColumn("Name") - .addColumn("Lit. 
Pop").setCellAttributes("class='count'").setCellPattern("{0,number,#,##0}") - .addColumn("%World").setCellAttributes("class='pc'").setCellPattern("{0,number,#,##0.00%}") - .addColumn("Ch/Lit").setCellAttributes("class='pc'").setCellPattern("{0,number,#,##0.00%}") - .addColumn("Chars") - ; - - final TablePrinter indexTable2 = new TablePrinter() - .addColumn("Char Count").setCellAttributes("class='count'").setSortAscending(false).setSortPriority(0).setCellPattern("{0,number,#,##0}") - .addColumn("%/Total").setCellAttributes("class='pc'").setCellPattern("{0,number,#,##0.0000%}") - .addColumn("Locale").setCellAttributes("class='locale'").setCellPattern("{0}") - .addColumn("Name") - .addColumn("Lit. Pop").setCellAttributes("class='count'").setCellPattern("{0,number,#,##0}") - .addColumn("%World").setCellAttributes("class='pc'").setCellPattern("{0,number,#,##0.0000%}") - .addColumn("Ch/Lit").setCellAttributes("class='pc'").setCellPattern("{0,number,#,##0.0000%}") - ; - - final TablePrinter summaryTable = new TablePrinter() - .addColumn("Locale").setCellAttributes("class='locale'") - .addColumn("Name") - .addColumn("Lit. Pop").setCellAttributes("class='count'").setCellPattern("{0,number,#,##0}").setSortAscending(false).setSortPriority(0) - .addColumn("Chars") - ; + final TablePrinter indexTable = + new TablePrinter() + .addColumn("Char Count") + .setCellAttributes("class='count'") + .setSortAscending(false) + .setSortPriority(0) + .setCellPattern("{0,number,#,##0}") + .addColumn("%/Total") + .setCellAttributes("class='pc'") + .setCellPattern("{0,number,#,##0.00%}") + .addColumn("Locale") + .setCellAttributes("class='locale'") + .setCellPattern("{0}") + .addColumn("Name") + .addColumn("Lit. 
Pop") + .setCellAttributes("class='count'") + .setCellPattern("{0,number,#,##0}") + .addColumn("%World") + .setCellAttributes("class='pc'") + .setCellPattern("{0,number,#,##0.00%}") + .addColumn("Ch/Lit") + .setCellAttributes("class='pc'") + .setCellPattern("{0,number,#,##0.00%}") + .addColumn("Chars"); + + final TablePrinter indexTable2 = + new TablePrinter() + .addColumn("Char Count") + .setCellAttributes("class='count'") + .setSortAscending(false) + .setSortPriority(0) + .setCellPattern("{0,number,#,##0}") + .addColumn("%/Total") + .setCellAttributes("class='pc'") + .setCellPattern("{0,number,#,##0.0000%}") + .addColumn("Locale") + .setCellAttributes("class='locale'") + .setCellPattern("{0}") + .addColumn("Name") + .addColumn("Lit. Pop") + .setCellAttributes("class='count'") + .setCellPattern("{0,number,#,##0}") + .addColumn("%World") + .setCellAttributes("class='pc'") + .setCellPattern("{0,number,#,##0.0000%}") + .addColumn("Ch/Lit") + .setCellAttributes("class='pc'") + .setCellPattern("{0,number,#,##0.0000%}"); + + final TablePrinter summaryTable = + new TablePrinter() + .addColumn("Locale") + .setCellAttributes("class='locale'") + .addColumn("Name") + .addColumn("Lit. 
Pop") + .setCellAttributes("class='count'") + .setCellPattern("{0,number,#,##0}") + .setSortAscending(false) + .setSortPriority(0) + .addColumn("Chars"); final Counter mulCounter = CharacterFrequency.getCodePointCounter("mul", false); // hack: print top 1000 supplemental characters int topSupp = 1000; @@ -91,8 +121,8 @@ private void run() throws IOException { } } final long totalTotal = mulCounter.getTotal(); - //double worldPop = CharacterFrequency.getLanguageToPopulation("mul"); - //final long worldPop = CharacterFrequency.getCodePointCounter("mul", true).getTotal(); + // double worldPop = CharacterFrequency.getLanguageToPopulation("mul"); + // final long worldPop = CharacterFrequency.getCodePointCounter("mul", true).getTotal(); CLDRConfig testInfo = CLDRConfig.getInstance(); SupplementalDataInfo supplemental = testInfo.getSupplementalDataInfo(); @@ -115,7 +145,8 @@ private void run() throws IOException { final CharacterSamples indexChars = new CharacterSamples(language); final String cldrLanguage = ExemplarInfo.getCldrLanguage(language); - final Counter counter = CharacterFrequency.getCodePointCounter(language, false); + final Counter counter = + CharacterFrequency.getCodePointCounter(language, false); final long total = counter.getTotal(); final String htmlFilename = language + ".html"; @@ -125,40 +156,65 @@ private void run() throws IOException { PopulationData popInfo = supplemental.getLanguagePopulationData(cldrLanguage); - final double pop = language.equals("mul") ? worldPop : getPopulation(supplemental, cldrLanguage); - + final double pop = + language.equals("mul") ? 
worldPop : getPopulation(supplemental, cldrLanguage); // get exemplars final ExemplarInfo exemplarInfo = ExemplarInfo.make(cldrLanguage, missingExemplars); // open files for writing, create table - final DataOutputStream dataOutputStream = new DataOutputStream(new FileOutputStream(Settings.Output.GEN_DIR + "/frequency/" + language + ".txt")); + final DataOutputStream dataOutputStream = + new DataOutputStream( + new FileOutputStream( + Settings.Output.GEN_DIR + "/frequency/" + language + ".txt")); final CompressedDataOutput out = new CompressedDataOutput().set(dataOutputStream); - final PrintWriter html = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "/frequency-html", htmlFilename); - html.println("\n" + - "\n" + - "\n" + - "

" + htmlTitle + "

"); - final TablePrinter table = new TablePrinter() - .addColumn("Rank").setCellAttributes("class='rank'") - .addColumn("Count").setCellAttributes("class='count'") - .addColumn("%").setCellAttributes("class='pc'") - .addColumn("Type").setCellAttributes("class='type'") - .addColumn("Ex").setCellAttributes("class=''{0}''").setCellPattern("·{0}·") - .addColumn("Level").setCellAttributes("class='chars'") - .addColumn("Chars").setCellAttributes("class='name'") - .addColumn("Hex").setCellAttributes("class='hex'") - .addColumn("Name").setCellAttributes("class='name'") - ; - final long enough = (long)(total*0.99999d); - System.out.println(total + "\t" + enough + "\t" + (total - enough) + "\t" + ((total - enough)/(double)total)); + final PrintWriter html = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_DIR + "/frequency-html", htmlFilename); + html.println( + "\n" + + "\n" + + "\n" + + "

" + + htmlTitle + + "

"); + final TablePrinter table = + new TablePrinter() + .addColumn("Rank") + .setCellAttributes("class='rank'") + .addColumn("Count") + .setCellAttributes("class='count'") + .addColumn("%") + .setCellAttributes("class='pc'") + .addColumn("Type") + .setCellAttributes("class='type'") + .addColumn("Ex") + .setCellAttributes("class=''{0}''") + .setCellPattern("·{0}·") + .addColumn("Level") + .setCellAttributes("class='chars'") + .addColumn("Chars") + .setCellAttributes("class='name'") + .addColumn("Hex") + .setCellAttributes("class='hex'") + .addColumn("Name") + .setCellAttributes("class='name'"); + final long enough = (long) (total * 0.99999d); + System.out.println( + total + + "\t" + + enough + + "\t" + + (total - enough) + + "\t" + + ((total - enough) / (double) total)); long soFar = 0; final Collection typeList = new LinkedHashSet(); long rank = 0; for (final int sequence : counter.getKeysetSortedByCount(false)) { final long count = counter.get(sequence); - //if (count < COUNT_LIMIT) break; // skip for now + // if (count < COUNT_LIMIT) break; // skip for now final String hex = Utility.hex(sequence); out.writeUnsignedLong(count); @@ -166,61 +222,68 @@ private void run() throws IOException { out.writeUTF(sequence2); final String type = getType(sequence2, typeList); - String name = SKIP_NAME.containsAll(sequence2) ? "" : UCharacter.getName(sequence2, "+"); + String name = + SKIP_NAME.containsAll(sequence2) ? 
"" : UCharacter.getName(sequence2, "+"); if (name == null) { name = "none"; } final ExemplarInfo.Status exemplar = exemplarInfo.getStatus(sequence2); indexChars.add(sequence2, exemplar); - final String percent = pf.format(count/(double)total); + final String percent = pf.format(count / (double) total); final String decimal = nf.format(count); final String rankStr = nf.format(++rank); final String level = exemplarInfo.getEducationLevel(sequence2); table.addRow() - .addCell(rankStr) - .addCell(decimal) - .addCell(percent) - .addCell(type) - .addCell(exemplar) - .addCell(level == null ? "-" : "·" + level + "·") - .addCell(sequence).addCell(hex).addCell(name).finishRow(); + .addCell(rankStr) + .addCell(decimal) + .addCell(percent) + .addCell(type) + .addCell(exemplar) + .addCell(level == null ? "-" : "·" + level + "·") + .addCell(sequence) + .addCell(hex) + .addCell(name) + .finishRow(); soFar += count; if (soFar >= enough) { break; } } - double charsOverTotal = total/(double)totalTotal; - double popOverTotal = pop/worldPop; - indexTable.addRow() - .addCell(total) - .addCell(charsOverTotal) - .addCell(language) - .addCell(englishLocaleName) - .addCell(pop) - .addCell(popOverTotal) - .addCell(charsOverTotal/popOverTotal) - .addCell(indexChars.toString()) - .finishRow(); - - indexTable2.addRow() - .addCell(total) - .addCell(charsOverTotal) - .addCell(language) - .addCell(englishLocaleName) - .addCell(pop) - .addCell(popOverTotal) - .addCell(charsOverTotal/popOverTotal) - .finishRow(); - - summaryTable.addRow() - .addCell(language) - .addCell(englishLocaleName) - .addCell(pop) - .addCell(indexChars.toString()) - .finishRow(); - - //out.close(); + double charsOverTotal = total / (double) totalTotal; + double popOverTotal = pop / worldPop; + indexTable + .addRow() + .addCell(total) + .addCell(charsOverTotal) + .addCell(language) + .addCell(englishLocaleName) + .addCell(pop) + .addCell(popOverTotal) + .addCell(charsOverTotal / popOverTotal) + .addCell(indexChars.toString()) 
+ .finishRow(); + + indexTable2 + .addRow() + .addCell(total) + .addCell(charsOverTotal) + .addCell(language) + .addCell(englishLocaleName) + .addCell(pop) + .addCell(popOverTotal) + .addCell(charsOverTotal / popOverTotal) + .finishRow(); + + summaryTable + .addRow() + .addCell(language) + .addCell(englishLocaleName) + .addCell(pop) + .addCell(indexChars.toString()) + .finishRow(); + + // out.close(); html.println(table); html.println(""); html.close(); @@ -232,13 +295,13 @@ private void run() throws IOException { System.out.println("Missing exemplars:\t" + missingExemplars); } - public double getPopulation(SupplementalDataInfo supplemental, - final String cldrLanguage) { + public double getPopulation(SupplementalDataInfo supplemental, final String cldrLanguage) { PopulationData popInfo = supplemental.getLanguagePopulationData(cldrLanguage); if (popInfo == null) { String defaultScript = supplemental.getDefaultScript(cldrLanguage); if (defaultScript != null) { - popInfo = supplemental.getLanguagePopulationData(cldrLanguage + "_" + defaultScript); + popInfo = + supplemental.getLanguagePopulationData(cldrLanguage + "_" + defaultScript); } if (popInfo == null) { System.out.println("Can't get pop data for: " + cldrLanguage); @@ -249,16 +312,17 @@ public double getPopulation(SupplementalDataInfo supplemental, } private void printIndex(TablePrinter indexTable, String file) throws IOException { - final PrintWriter index = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "/frequency-html", file); - index.println("\n" + - "\n" + - "\n"); + final PrintWriter index = + FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "/frequency-html", file); + index.println( + "\n" + + "\n" + + "\n"); index.println(indexTable); index.println(""); index.close(); } - private static class CharacterSamples { static UnicodeSetPrettyPrinter pp = new UnicodeSetPrettyPrinter(); static final UnicodeSet DIGITS = new UnicodeSet("[:nd:]").freeze(); @@ -271,13 +335,17 @@ private static class 
CharacterSamples { UnicodeSet exemplarsLeft; CharacterSamples(String language) { - exemplarsLeft = new UnicodeSet(ExemplarInfo.make(language, null).getExemplars()).removeAll(ExemplarInfo.IGNORE); + exemplarsLeft = + new UnicodeSet(ExemplarInfo.make(language, null).getExemplars()) + .removeAll(ExemplarInfo.IGNORE); isKey = language.equals("mul"); pp.setOrdering(Collator.getInstance(new ULocale(language))); - final RuleBasedCollator spaceComp = (RuleBasedCollator) Collator.getInstance(new ULocale(language)); + final RuleBasedCollator spaceComp = + (RuleBasedCollator) Collator.getInstance(new ULocale(language)); spaceComp.setStrength(Collator.PRIMARY); pp.setSpaceComparator(spaceComp); } + void add(String sequence, ExemplarInfo.Status exemplar) { if (DIGITS.containsAll(sequence)) { return; @@ -294,13 +362,15 @@ void add(String sequence, ExemplarInfo.Status exemplar) { indexSet.add(sequence); exemplarsLeft.removeAll(sequence); } + void finish(UnicodeSet unicodeSet, Status lastExemplarStatus2) { if (indexSet.size() != 0) { String setString = pp.format(unicodeSet); - setString = setString.substring(1,setString.length()-1); + setString = setString.substring(1, setString.length() - 1); indexChars.add("
"); } } + @Override public String toString() { if (isKey) { @@ -316,21 +386,25 @@ public String toString() { if (exemplarsLeft.size() < 1000) { finish(exemplarsLeft, Status.N); } - return "
" + setString + "
" + CollectionUtilities.join(indexChars,"") + "
"; + return "" + CollectionUtilities.join(indexChars, "") + "
"; } } private String getType(String sequence, Collection items) { items.clear(); int cp; - for (int i = 0; i < sequence.length(); i+=Character.charCount(cp)) { + for (int i = 0; i < sequence.length(); i += Character.charCount(cp)) { cp = sequence.codePointAt(i); final int script = UScript.getScript(cp); - if (script != UScript.UNKNOWN && script != UScript.COMMON && script != UScript.INHERITED) { + if (script != UScript.UNKNOWN + && script != UScript.COMMON + && script != UScript.INHERITED) { items.add(UScript.getShortName(script)); } else { final int type = UCharacter.getType(cp); - items.add(UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, type, UProperty.NameChoice.SHORT)); + items.add( + UCharacter.getPropertyValueName( + UProperty.GENERAL_CATEGORY, type, UProperty.NameChoice.SHORT)); } } if (items.size() > 1 && items.contains("Zinh")) { diff --git a/unicodetools/src/main/java/org/unicode/draft/GenerateLanguageNames.java b/unicodetools/src/main/java/org/unicode/draft/GenerateLanguageNames.java index b09b5810f..525ef5d63 100644 --- a/unicodetools/src/main/java/org/unicode/draft/GenerateLanguageNames.java +++ b/unicodetools/src/main/java/org/unicode/draft/GenerateLanguageNames.java @@ -1,22 +1,20 @@ package org.unicode.draft; +import com.ibm.icu.util.ULocale; import java.util.Arrays; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.util.StandardCodes; -import com.ibm.icu.util.ULocale; - public class GenerateLanguageNames { public static void main(String[] args) { getNames(); } - static String[][] langList = { + static String[][] langList = { {"Chinese (Traditional)", "Chinese (Traditional Han)"}, {"Bhojpuri", "Bihari"}, {"Divehi", "Dhivehi"}, @@ -27,7 +25,6 @@ public static void main(String[] args) { {"Scottish Gaelic", "Scots Gaelic"}, {"Sinhala", "Sinhalese"}, {"Western Frisian", "Frisian"}, - {"English", "English"}, {"Chinese", "Chinese (Simplified)"}, {"Spanish", "Spanish"}, 
@@ -141,9 +138,10 @@ public static void main(String[] args) { {"Laothian", "Lao"}, {"Luxembourgish", "lb"}, }; + static void getNames() { - final Map nameToLocale = new TreeMap(); - final Map localeToName = new TreeMap(); + final Map nameToLocale = new TreeMap(); + final Map localeToName = new TreeMap(); final StandardCodes sc = StandardCodes.make(); for (final String lang : sc.getAvailableCodes("language")) { final String[] names = sc.getData("language", lang).split("▪"); @@ -166,7 +164,13 @@ static void getNames() { nameToLocale.put(name, lang); localeToName.put(lang, name); } - final Set locales = new TreeSet(Arrays.asList("ak", "ba", "bh", "bs", "ha", "haw", "ia", "ig", "iw", "lg", "ln", "mfe", "mg", "mo", "nn", "no", "om", "pt-BR", "pt-PT", "rm", "rn", "rw", "sh", "sn", "so", "sr-ME", "st", "ti", "tk", "tl", "tn", "tw", "xh", "yi", "zh-CN", "zh-TW", "zu")); + final Set locales = + new TreeSet( + Arrays.asList( + "ak", "ba", "bh", "bs", "ha", "haw", "ia", "ig", "iw", "lg", "ln", + "mfe", "mg", "mo", "nn", "no", "om", "pt-BR", "pt-PT", "rm", "rn", + "rw", "sh", "sn", "so", "sr-ME", "st", "ti", "tk", "tl", "tn", "tw", + "xh", "yi", "zh-CN", "zh-TW", "zu")); final Set missing = new TreeSet(); for (final String[] pair : langList) { String locale = nameToLocale.get(pair[0]); diff --git a/unicodetools/src/main/java/org/unicode/draft/GenerateNormalizeForMatch2.java b/unicodetools/src/main/java/org/unicode/draft/GenerateNormalizeForMatch2.java index 0d8ea8d07..6c256a62f 100644 --- a/unicodetools/src/main/java/org/unicode/draft/GenerateNormalizeForMatch2.java +++ b/unicodetools/src/main/java/org/unicode/draft/GenerateNormalizeForMatch2.java @@ -5,6 +5,19 @@ */ package org.unicode.draft; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.DateFormat; +import com.ibm.icu.text.DecimalFormat; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.Normalizer.Mode; +import com.ibm.icu.text.NumberFormat; +import 
com.ibm.icu.text.SimpleDateFormat; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.VersionInfo; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -20,30 +33,17 @@ import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; - import org.unicode.cldr.util.PatternCache; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.DateFormat; -import com.ibm.icu.text.DecimalFormat; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.Normalizer.Mode; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.SimpleDateFormat; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.VersionInfo; - @Deprecated public class GenerateNormalizeForMatch2 { static boolean TABLE = true; // Command choices enum ListStyle { - ALL, SHOW_AGE, ONLY_OLD + ALL, + SHOW_AGE, + ONLY_OLD } private static ListStyle LIST_STYLE = ListStyle.ALL; @@ -57,7 +57,8 @@ enum ListStyle { private static int DEBUG_CODE_POINT = 0x0041; // eg 0xFDFA - private static final Matcher HEXFORM = PatternCache.get("[0-9A-Fa-f]{4,6}(\\s+[0-9A-Fa-f]{4,6})*").matcher(""); + private static final Matcher HEXFORM = + PatternCache.get("[0-9A-Fa-f]{4,6}(\\s+[0-9A-Fa-f]{4,6})*").matcher(""); private static final int DIFF_LIMIT = 10; @@ -65,10 +66,15 @@ enum ListStyle { // Fixes to match Jim's names private static UnicodeMap JIM_NAMES = new UnicodeMap(); + static { JIM_NAMES.putAll(new UnicodeSet("[:block=CJK Unified Ideographs:]"), ""); - JIM_NAMES.putAll(new UnicodeSet("[:block=CJK Unified Ideographs Extension A:]"), ""); - JIM_NAMES.putAll(new UnicodeSet("[:block=CJK Unified Ideographs Extension B:]"), ""); + JIM_NAMES.putAll( + new UnicodeSet("[:block=CJK 
Unified Ideographs Extension A:]"), + ""); + JIM_NAMES.putAll( + new UnicodeSet("[:block=CJK Unified Ideographs Extension B:]"), + ""); JIM_NAMES.freeze(); } @@ -76,7 +82,7 @@ enum ListStyle { /** * Generate new files or reformat old ones, depending on options - * + * * @param args * @throws IOException */ @@ -103,10 +109,13 @@ public static void main(String[] args) throws IOException { } else if (arg.equals("fix")) { fix = true; } else { - throw new IllegalArgumentException("Unknown option: " + arg + "\n" + - "-o \n" + - "-i \n" + - "-l "); + throw new IllegalArgumentException( + "Unknown option: " + + arg + + "\n" + + "-o \n" + + "-i \n" + + "-l "); } } if (logFile == null) { @@ -115,16 +124,17 @@ public static void main(String[] args) throws IOException { LOG_WRITER = openUTF8Writer(logFile); LOG_WRITER.write(0xFEFF); if (TABLE) { - LOG_WRITER.write("\n" + - "\n" + - "\n" + - "" + - "\n" + - "\n" + - "
\n");
+                LOG_WRITER.write(
+                        "\n"
+                                + "\n"
+                                + "\n"
+                                + ""
+                                + "\n"
+                                + "\n"
+                                + "
\n");
             }
         }
         if (fix) {
@@ -143,8 +153,9 @@ public static void main(String[] args) throws IOException {
         System.out.println("DONE");
     }
 
-    private static void logDiffs(UnicodeMap newMappings, String oldMappingFile, String frequencyFile)
-        throws IOException {
+    private static void logDiffs(
+            UnicodeMap newMappings, String oldMappingFile, String frequencyFile)
+            throws IOException {
         LOG_WRITER.println();
         LOG_WRITER.println("# *** Differences from " + new File(oldMappingFile).getName() + " ***");
         LOG_WRITER.println();
@@ -155,24 +166,40 @@ private static void logDiffs(UnicodeMap newMappings, String oldMappingFi
         UnicodeSet diffSourceSet = diffMappings.keySet();
         UnicodeSet union = new UnicodeSet(newSource).addAll(diffSourceSet);
 
-        UnicodeSet isLetter = new UnicodeSet(
-            "[[:L:][:M:][:N:]\\u002B\\u005F\\uFF06\\uFF0B\\uFF3F\\u309B\\u309C\\u30a0]");
-        showOrderedList("IsLetter", new UnicodeSet(union).retainAll(isLetter), diffMappings, newMappings,
-            Integer.MAX_VALUE);
-        showOrderedList("NOT IsLetter", new UnicodeSet(union).removeAll(isLetter), diffMappings, newMappings,
-            Integer.MAX_VALUE);
+        UnicodeSet isLetter =
+                new UnicodeSet(
+                        "[[:L:][:M:][:N:]\\u002B\\u005F\\uFF06\\uFF0B\\uFF3F\\u309B\\u309C\\u30a0]");
+        showOrderedList(
+                "IsLetter",
+                new UnicodeSet(union).retainAll(isLetter),
+                diffMappings,
+                newMappings,
+                Integer.MAX_VALUE);
+        showOrderedList(
+                "NOT IsLetter",
+                new UnicodeSet(union).removeAll(isLetter),
+                diffMappings,
+                newMappings,
+                Integer.MAX_VALUE);
     }
 
-    private static void showOrderedList(String title, UnicodeSet charsToShow,
-        UnicodeMap oldMappings, UnicodeMap newMappings, int limit) {
+    private static void showOrderedList(
+            String title,
+            UnicodeSet charsToShow,
+            UnicodeMap oldMappings,
+            UnicodeMap newMappings,
+            int limit) {
 
         Set[]> ordered = new TreeSet(DOUBLE_STRING_COMP);
-        for (UnicodeSetIterator it = new UnicodeSetIterator(charsToShow); it.next();) {
+        for (UnicodeSetIterator it = new UnicodeSetIterator(charsToShow); it.next(); ) {
             String oldMapped = getRemapped(it.codepoint, oldMappings);
             String newMapped = getRemapped(it.codepoint, newMappings);
             if (!newMapped.equals(oldMapped)) {
                 Long count = frequencies == null ? null : frequencies.getCount(it.codepoint);
-                ordered.add(new Comparable[] { count == null ? 0L : count, it.codepoint, oldMapped, newMapped });
+                ordered.add(
+                        new Comparable[] {
+                            count == null ? 0L : count, it.codepoint, oldMapped, newMapped
+                        });
             }
         }
         showOrderedList2(title, ordered, limit);
@@ -182,7 +209,8 @@ private static String anchorize(String name) {
         return name.replace(' ', '_').toLowerCase();
     }
 
-    private static void showOrderedList2(String title, Collection[]> ordered, int limit) {
+    private static void showOrderedList2(
+            String title, Collection[]> ordered, int limit) {
         if (ordered.size() == 0) {
             String msg = "NO CHARACTERS CHANGED";
             if (TABLE) {
@@ -210,7 +238,10 @@ private static void showOrderedList2(String title, Collection
") + "

"; + footer = + "
" + + footer.replace("\t", "") + + "

"; } LOG_WRITER.println(footer); } @@ -219,13 +250,13 @@ private static void showOrderedList2(String title, Collection"); if (longEnough) { LOG_WRITER.println( - "
CodeNew MapFreq.Name↛ Old Map Name→ New Map Name
CodeNew MapFreq.Name↛ Old Map Name→ New Map Name
").replace(" ; ", "") + "
" + + line.replace("\t", "").replace(" ; ", "") + + "
" + category + "" - + fixHtml(fixCategoryName(subcategory)) + "" - + valueChars.strings.size() + "" + fixHtml(valueChars.strings) + "
" + labelString + "
" + category + "" + + fixHtml(fixCategoryName(subcategory)) + + "" + + valueChars.strings.size() + + "" + + fixHtml(valueChars.strings) + + "
" + + labelString + + "
"); @@ -1062,21 +1354,33 @@ private PrintWriter writeHtmlFooterAndClose(PrintWriter htmlChart) { return null; } - private PrintWriter writeHtmlHeader(PrintWriter htmlChart, String localDataDirectory, String category, - String baseTarget, String styles) throws IOException { - htmlChart.println("\n\n" - + "\n" - + "Picker Data\n" + "\n" - + "\n" - + (styles == null ? "" : "\n") + "\n" - + "\n"); + private PrintWriter writeHtmlHeader( + PrintWriter htmlChart, + String localDataDirectory, + String category, + String baseTarget, + String styles) + throws IOException { + htmlChart.println( + "\n\n" + + "\n" + + "Picker Data\n" + + "\n" + + "\n" + + (styles == null + ? "" + : "\n") + + "\n" + + "\n"); return htmlChart; } private String fileNameFromCategory(String category) { - return "PickerData_" + fixCategoryName(category) - .replace(' ', '_') - .replace("&", "and") + ".html"; + return "PickerData_" + + fixCategoryName(category).replace(' ', '_').replace("&", "and") + + ".html"; } private void writePageIndex(PrintWriter htmlChart, Set set) { @@ -1084,7 +1388,12 @@ private void writePageIndex(PrintWriter htmlChart, Set set) { htmlChart.println("\n

 

(" + new Date() + ")

"); } @@ -1093,8 +1402,12 @@ private void writeCategoryH1(PrintWriter htmlChart, String category) { htmlChart.println("

" + fixCategoryName(category) + "

"); } - private String addResult(StringBuilder result, GeneratePickerData2.USet valueChars, String category, - String subcategory, boolean doDisplayData) { + private String addResult( + StringBuilder result, + GeneratePickerData2.USet valueChars, + String category, + String subcategory, + boolean doDisplayData) { subcategory = fixCategoryName(subcategory); category = fixCategoryName(category); @@ -1103,8 +1416,16 @@ private String addResult(StringBuilder result, GeneratePickerData2.USet valueCha try { valueCharsString = valueChars.toString(); } catch (IllegalArgumentException e) { - System.out.println("/*" + size + "*/" + " " + category + MAIN_SUB_SEPARATOR + subcategory + "\t" - + valueChars.strings); + System.out.println( + "/*" + + size + + "*/" + + " " + + category + + MAIN_SUB_SEPARATOR + + subcategory + + "\t" + + valueChars.strings); throw e; } final int length = valueCharsString.length(); @@ -1112,12 +1433,34 @@ private String addResult(StringBuilder result, GeneratePickerData2.USet valueCha for (String s : valueChars.strings) { valueSet.add(s); } - final String quoteFixedvalueCharsString = valueCharsString.replace("\\", "\\\\").replace("\"", "\\\""); - result.append("/*" + size + "," + length + "*/" + " {\"" + subcategory + "\",\"" - + quoteFixedvalueCharsString + "\"},\n"); + final String quoteFixedvalueCharsString = + valueCharsString.replace("\\", "\\\\").replace("\"", "\\\""); + result.append( + "/*" + + size + + "," + + length + + "*/" + + " {\"" + + subcategory + + "\",\"" + + quoteFixedvalueCharsString + + "\"},\n"); if (doDisplayData) { - System.out.println("/*" + size + "," + length + "*/" + " " + category + MAIN_SUB_SEPARATOR - + subcategory + "\t" + valueSet.toPattern(false) + ", " + toHex(valueCharsString, true)); + System.out.println( + "/*" + + size + + "," + + length + + "*/" + + " " + + category + + MAIN_SUB_SEPARATOR + + subcategory + + "\t" + + valueSet.toPattern(false) + + ", " + + toHex(valueCharsString, true)); } return 
valueCharsString; } @@ -1135,13 +1478,17 @@ private String fixCategoryName(String subcategory) { if (DEBUG) System.out.println("Skip: " + SKIP); } - private static void addProperty(String propertyAlias, String title, Comparator sort, UnicodeSet retain) { + private static void addProperty( + String propertyAlias, String title, Comparator sort, UnicodeSet retain) { int propEnum = UCharacter.getPropertyEnum(propertyAlias); // get all the value strings, sorted UnicodeSet valueChars = new UnicodeSet(); - for (int i = UCharacter.getIntPropertyMinValue(propEnum); i <= UCharacter.getIntPropertyMaxValue(propEnum); ++i) { - String valueAlias = UCharacter.getPropertyValueName(propEnum, i, UProperty.NameChoice.LONG); + for (int i = UCharacter.getIntPropertyMinValue(propEnum); + i <= UCharacter.getIntPropertyMaxValue(propEnum); + ++i) { + String valueAlias = + UCharacter.getPropertyValueName(propEnum, i, UProperty.NameChoice.LONG); if (valueAlias.contains("Symbol")) { System.out.println("Skipping " + valueAlias); continue; @@ -1151,24 +1498,35 @@ private static void addProperty(String propertyAlias, String title, Comparator sort, - Set propertyValues) { + private static void addProperty( + String propertyAlias, + String title, + Comparator sort, + Set propertyValues) { // get all the value strings, sorted UnicodeSet valueChars = new UnicodeSet(); for (String valueAlias : propertyValues) { @@ -1183,15 +1541,20 @@ private static void addProperty(String propertyAlias, String title, Comparator sortItems(Comparator sort, String propertyAlias, String valueAlias) { + private static Comparator sortItems( + Comparator sort, String propertyAlias, String valueAlias) { if (valueAlias.equals("Decimal_Number") && propertyAlias.equals("General_Category")) { return null; } @@ -1258,7 +1625,6 @@ public int compare(T arg0, T arg1) { } return 0; } - } static class UnicodeSetInclusionFirst> implements Comparator { @@ -1273,13 +1639,13 @@ public int compare(T arg0, T arg1) { boolean a1 = 
included.containsAll(arg1.toString()); return a0 == a1 ? arg0.compareTo(arg1) : a0 ? -1 : 1; } - } public static Set ERROR_COUNT = new LinkedHashSet(); /** * Provide a simple list of strings + * * @param source * @return */ @@ -1305,8 +1671,9 @@ static class USet { Collection strings; /** - * A few choices. As a plain list, as a LinkedHashSet, sorted by code point, or sorted by specific comparator - * + * A few choices. As a plain list, as a LinkedHashSet, sorted by code point, or sorted by + * specific comparator + * * @param sorted */ public USet(Comparator sorted) { @@ -1328,7 +1695,8 @@ public String toString() { // for (String s : set) { // set2.add(s); // } - // //if (DEBUG) System.out.println("Sorted " + value + ": " + valueChars.size() + ", " + valueChars); + // //if (DEBUG) System.out.println("Sorted " + value + ": " + valueChars.size() + ", " + + // valueChars); // if (set2.isEmpty()) { // return null; // } @@ -1347,30 +1715,43 @@ public String toString() { Set ba = new LinkedHashSet(reversal); ba.removeAll(original); System.out.println("FAILED!!!!"); - IllegalArgumentException e = new IllegalArgumentException("Failed with: " + original + "\n" - + "Range String: " + Compacter.getInternalRangeString(strings) + "\n" - + "In original but not restored: " + ab + "\n" + "In restored but not original: " + ba + "\n" - + "Returned range string: " + CharacterListCompressor.base88DecodeList(result.toString()) - // CharacterListCompressor.base88Decode(in); - ); + IllegalArgumentException e = + new IllegalArgumentException( + "Failed with: " + + original + + "\n" + + "Range String: " + + Compacter.getInternalRangeString(strings) + + "\n" + + "In original but not restored: " + + ab + + "\n" + + "In restored but not original: " + + ba + + "\n" + + "Returned range string: " + + CharacterListCompressor.base88DecodeList( + result.toString()) + // CharacterListCompressor.base88Decode(in); + ); e.printStackTrace(System.err); ERROR_COUNT.add(e); } return result.toString(); } 
- } /** - * Modifies Unicode set to flatten the strings. Eg [abc{da}] => [abcd] Returns the set for chaining. - * + * Modifies Unicode set to flatten the strings. Eg [abc{da}] => [abcd] Returns the set for + * chaining. + * * @param exemplar1 * @return */ public static UnicodeSet flatten(UnicodeSet exemplar1) { UnicodeSet result = new UnicodeSet(); boolean gotString = false; - for (UnicodeSetIterator it = new UnicodeSetIterator(exemplar1); it.nextRange();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(exemplar1); it.nextRange(); ) { if (it.codepoint == UnicodeSetIterator.IS_STRING) { result.addAll(it.string); gotString = true; @@ -1382,186 +1763,627 @@ public static UnicodeSet flatten(UnicodeSet exemplar1) { return exemplar1; } - static String MKD_RULES = "\u1101 > \u1100\u1100;" + "\u1104 > \u1103\u1103;" + "\u1108 > \u1107\u1107;" - + "\u110A > \u1109\u1109;" + "\u110D > \u110C\u110C;" + "\u1113 > \u1102\u1100;" + "\u1114 > \u1102\u1102;" - + "\u1115 > \u1102\u1103;" + "\u1116 > \u1102\u1107;" + "\u1117 > \u1103\u1100;" + "\u1118 > \u1105\u1102;" - + "\u1119 > \u1105\u1105;" + "\u111A > \u1105\u1112;" + "\u111B > \u1105\u110B;" + "\u111C > \u1106\u1107;" - + "\u111D > \u1106\u110B;" + "\u111E > \u1107\u1100;" + "\u111F > \u1107\u1102;" + "\u1120 > \u1107\u1103;" - + "\u1121 > \u1107\u1109;" + "\u1122 > \u1107\u1109\u1100;" + "\u1123 > \u1107\u1109\u1103;" - + "\u1124 > \u1107\u1109\u1107;" + "\u1125 > \u1107\u1109\u1109;" + "\u1126 > \u1107\u1109\u110C;" - + "\u1127 > \u1107\u110C;" + "\u1128 > \u1107\u110E;" + "\u1129 > \u1107\u1110;" + "\u112A > \u1107\u1111;" - + "\u112B > \u1107\u110B;" + "\u112C > \u1107\u1107\u110B;" + "\u112D > \u1109\u1100;" - + "\u112E > \u1109\u1102;" + "\u112F > \u1109\u1103;" + "\u1130 > \u1109\u1105;" + "\u1131 > \u1109\u1106;" - + "\u1132 > \u1109\u1107;" + "\u1133 > \u1109\u1107\u1100;" + "\u1134 > \u1109\u1109\u1109;" - + "\u1135 > \u1109\u110B;" + "\u1136 > \u1109\u110C;" + "\u1137 > \u1109\u110E;" + "\u1138 > 
\u1109\u110F;" - + "\u1139 > \u1109\u1110;" + "\u113A > \u1109\u1111;" + "\u113B > \u1109\u1112;" + "\u113D > \u113C\u113C;" - + "\u113F > \u113E\u113E;" + "\u1141 > \u110B\u1100;" + "\u1142 > \u110B\u1103;" + "\u1143 > \u110B\u1106;" - + "\u1144 > \u110B\u1107;" + "\u1145 > \u110B\u1109;" + "\u1146 > \u110B\u1140;" + "\u1147 > \u110B\u110B;" - + "\u1148 > \u110B\u110C;" + "\u1149 > \u110B\u110E;" + "\u114A > \u110B\u1110;" + "\u114B > \u110B\u1111;" - + "\u114D > \u110C\u110B;" + "\u114F > \u114E\u114E;" + "\u1151 > \u1150\u1150;" + "\u1152 > \u110E\u110F;" - + "\u1153 > \u110E\u1112;" + "\u1156 > \u1111\u1107;" + "\u1157 > \u1111\u110B;" + "\u1158 > \u1112\u1112;" - + "\u115A > \u1100\u1103;" + "\u115B > \u1102\u1109;" + "\u115C > \u1102\u110C;" + "\u115D > \u1102\u1112;" - + "\u115E > \u1103\u1105;" + "\uA960 > \u1103\u1106;" + "\uA961 > \u1103\u1107;" + "\uA962 > \u1103\u1109;" - + "\uA963 > \u1103\u110C;" + "\uA964 > \u1105\u1100;" + "\uA965 > \u1105\u1100\u1100;" - + "\uA966 > \u1105\u1103;" + "\uA967 > \u1105\u1103\u1103;" + "\uA968 > \u1105\u1106;" - + "\uA969 > \u1105\u1107;" + "\uA96A > \u1105\u1107\u1107;" + "\uA96B > \u1105\u1107\u110B;" - + "\uA96C > \u1105\u1109;" + "\uA96D > \u1105\u110C;" + "\uA96E > \u1105\u110F;" + "\uA96F > \u1106\u1100;" - + "\uA970 > \u1106\u1103;" + "\uA971 > \u1106\u1109;" + "\uA972 > \u1107\u1109\u1110;" - + "\uA973 > \u1107\u110F;" + "\uA974 > \u1107\u1112;" + "\uA975 > \u1109\u1109\u1107;" - + "\uA976 > \u110B\u1105;" + "\uA977 > \u110B\u1112;" + "\uA978 > \u110C\u110C\u1112;" - + "\uA979 > \u1110\u1110;" + "\uA97A > \u1111\u1112;" + "\uA97B > \u1112\u1109;" + "\uA97C > \u1159\u1159;" - + "\u1162 > \u1161\u1175;" + "\u1164 > \u1163\u1175;" + "\u1166 > \u1165\u1175;" + "\u1168 > \u1167\u1175;" - + "\u116A > \u1169\u1161;" + "\u116B > \u1169\u1161\u1175;" + "\u116C > \u1169\u1175;" - + "\u116F > \u116E\u1165;" + "\u1170 > \u116E\u1165\u1175;" + "\u1171 > \u116E\u1175;" - + "\u1174 > \u1173\u1175;" + "\u1176 > \u1161\u1169;" 
+ "\u1177 > \u1161\u116E;" + "\u1178 > \u1163\u1169;" - + "\u1179 > \u1163\u116D;" + "\u117A > \u1165\u1169;" + "\u117B > \u1165\u116E;" + "\u117C > \u1165\u1173;" - + "\u117D > \u1167\u1169;" + "\u117E > \u1167\u116E;" + "\u117F > \u1169\u1165;" - + "\u1180 > \u1169\u1165\u1175;" + "\u1181 > \u1169\u1167\u1175;" + "\u1182 > \u1169\u1169;" - + "\u1183 > \u1169\u116E;" + "\u1184 > \u116D\u1163;" + "\u1185 > \u116D\u1163\u1175;" - + "\u1186 > \u116D\u1167;" + "\u1187 > \u116D\u1169;" + "\u1188 > \u116D\u1175;" + "\u1189 > \u116E\u1161;" - + "\u118A > \u116E\u1161\u1175;" + "\u118B > \u116E\u1165\u1173;" + "\u118C > \u116E\u1167\u1175;" - + "\u118D > \u116E\u116E;" + "\u118E > \u1172\u1161;" + "\u118F > \u1172\u1165;" - + "\u1190 > \u1172\u1165\u1175;" + "\u1191 > \u1172\u1167;" + "\u1192 > \u1172\u1167\u1175;" - + "\u1193 > \u1172\u116E;" + "\u1194 > \u1172\u1175;" + "\u1195 > \u1173\u116E;" + "\u1196 > \u1173\u1173;" - + "\u1197 > \u1173\u1175\u116E;" + "\u1198 > \u1175\u1161;" + "\u1199 > \u1175\u1163;" - + "\u119A > \u1175\u1169;" + "\u119B > \u1175\u116E;" + "\u119C > \u1175\u1173;" + "\u119D > \u1175\u119E;" - + "\u119F > \u119E\u1165;" + "\u11A0 > \u119E\u116E;" + "\u11A1 > \u119E\u1175;" + "\u11A2 > \u119E\u119E;" - + "\u11A3 > \u1161\u1173;" + "\u11A4 > \u1163\u116E;" + "\u11A5 > \u1167\u1163;" + "\u11A6 > \u1169\u1163;" - + "\u11A7 > \u1169\u1163\u1175;" + "\uD7B0 > \u1169\u1167;" + "\uD7B1 > \u1169\u1169\u1175;" - + "\uD7B2 > \u116D\u1161;" + "\uD7B3 > \u116D\u1161\u1175;" + "\uD7B4 > \u116D\u1165;" - + "\uD7B5 > \u116E\u1167;" + "\uD7B6 > \u116E\u1175\u1175;" + "\uD7B7 > \u1172\u1161\u1175;" - + "\uD7B8 > \u1172\u1169;" + "\uD7B9 > \u1173\u1161;" + "\uD7BA > \u1173\u1165;" - + "\uD7BB > \u1173\u1165\u1175;" + "\uD7BC > \u1173\u1169;" + "\uD7BD > \u1175\u1163\u1169;" - + "\uD7BE > \u1175\u1163\u1175;" + "\uD7BF > \u1175\u1167;" + "\uD7C0 > \u1175\u1167\u1175;" - + "\uD7C1 > \u1175\u1169\u1175;" + "\uD7C2 > \u1175\u116D;" + "\uD7C3 > \u1175\u1172;" - + 
"\uD7C4 > \u1175\u1175;" + "\uD7C5 > \u119E\u1161;" + "\uD7C6 > \u119E\u1165\u1175;" - + "\u11A9 > \u11A8\u11A8;" + "\u11AA > \u11A8\u11BA;" + "\u11AC > \u11AB\u11BD;" + "\u11AD > \u11AB\u11C2;" - + "\u11B0 > \u11AF\u11A8;" + "\u11B1 > \u11AF\u11B7;" + "\u11B2 > \u11AF\u11B8;" + "\u11B3 > \u11AF\u11BA;" - + "\u11B4 > \u11AF\u11C0;" + "\u11B5 > \u11AF\u11C1;" + "\u11B6 > \u11AF\u11C2;" + "\u11B9 > \u11B8\u11BA;" - + "\u11BB > \u11BA\u11BA;" + "\u11C3 > \u11A8\u11AF;" + "\u11C4 > \u11A8\u11BA\u11A8;" - + "\u11C5 > \u11AB\u11A8;" + "\u11C6 > \u11AB\u11AE;" + "\u11C7 > \u11AB\u11BA;" + "\u11C8 > \u11AB\u11EB;" - + "\u11C9 > \u11AB\u11C0;" + "\u11CA > \u11AE\u11A8;" + "\u11CB > \u11AE\u11AF;" - + "\u11CC > \u11AF\u11A8\u11BA;" + "\u11CD > \u11AF\u11AB;" + "\u11CE > \u11AF\u11AE;" - + "\u11CF > \u11AF\u11AE\u11C2;" + "\u11D0 > \u11AF\u11AF;" + "\u11D1 > \u11AF\u11B7\u11A8;" - + "\u11D2 > \u11AF\u11B7\u11BA;" + "\u11D3 > \u11AF\u11B8\u11BA;" + "\u11D4 > \u11AF\u11B8\u11C2;" - + "\u11D5 > \u11AF\u11B8\u11BC;" + "\u11D6 > \u11AF\u11BA\u11BA;" + "\u11D7 > \u11AF\u11EB;" - + "\u11D8 > \u11AF\u11BF;" + "\u11D9 > \u11AF\u11F9;" + "\u11DA > \u11B7\u11A8;" + "\u11DB > \u11B7\u11AF;" - + "\u11DC > \u11B7\u11B8;" + "\u11DD > \u11B7\u11BA;" + "\u11DE > \u11B7\u11BA\u11BA;" - + "\u11DF > \u11B7\u11EB;" + "\u11E0 > \u11B7\u11BE;" + "\u11E1 > \u11B7\u11C2;" + "\u11E2 > \u11B7\u11BC;" - + "\u11E3 > \u11B8\u11AF;" + "\u11E4 > \u11B8\u11C1;" + "\u11E5 > \u11B8\u11C2;" + "\u11E6 > \u11B8\u11BC;" - + "\u11E7 > \u11BA\u11A8;" + "\u11E8 > \u11BA\u11AE;" + "\u11E9 > \u11BA\u11AF;" + "\u11EA > \u11BA\u11B8;" - + "\u11EC > \u11BC\u11A8;" + "\u11ED > \u11BC\u11A8\u11A8;" + "\u11EE > \u11BC\u11BC;" - + "\u11EF > \u11BC\u11BF;" + "\u11F1 > \u11F0\u11BA;" + "\u11F2 > \u11F0\u11EB;" + "\u11F3 > \u11C1\u11B8;" - + "\u11F4 > \u11C1\u11BC;" + "\u11F5 > \u11C2\u11AB;" + "\u11F6 > \u11C2\u11AF;" + "\u11F7 > \u11C2\u11B7;" - + "\u11F8 > \u11C2\u11B8;" + "\u11FA > \u11A8\u11AB;" + "\u11FB > \u11A8\u11B8;" + 
"\u11FC > \u11A8\u11BE;" - + "\u11FD > \u11A8\u11BF;" + "\u11FE > \u11A8\u11C2;" + "\u11FF > \u11AB\u11AB;" + "\uD7CB > \u11AB\u11AF;" - + "\uD7CC > \u11AB\u11BE;" + "\uD7CD > \u11AE\u11AE;" + "\uD7CE > \u11AE\u11AE\u11B8;" - + "\uD7CF > \u11AE\u11B8;" + "\uD7D0 > \u11AE\u11BA;" + "\uD7D1 > \u11AE\u11BA\u11A8;" - + "\uD7D2 > \u11AE\u11BD;" + "\uD7D3 > \u11AE\u11BE;" + "\uD7D4 > \u11AE\u11C0;" - + "\uD7D5 > \u11AF\u11A8\u11A8;" + "\uD7D6 > \u11AF\u11A8\u11C2;" + "\uD7D7 > \u11AF\u11AF\u11BF;" - + "\uD7D8 > \u11AF\u11B7\u11C2;" + "\uD7D9 > \u11AF\u11B8\u11AE;" + "\uD7DA > \u11AF\u11B8\u11C1;" - + "\uD7DB > \u11AF\u11F0;" + "\uD7DC > \u11AF\u11F9\u11C2;" + "\uD7DD > \u11AF\u11BC;" - + "\uD7DE > \u11B7\u11AB;" + "\uD7DF > \u11B7\u11AB\u11AB;" + "\uD7E0 > \u11B7\u11B7;" - + "\uD7E1 > \u11B7\u11B8\u11BA;" + "\uD7E2 > \u11B7\u11BD;" + "\uD7E3 > \u11B8\u11AE;" - + "\uD7E4 > \u11B8\u11AF\u11C1;" + "\uD7E5 > \u11B8\u11B7;" + "\uD7E6 > \u11B8\u11B8;" - + "\uD7E7 > \u11B8\u11BA\u11AE;" + "\uD7E8 > \u11B8\u11BD;" + "\uD7E9 > \u11B8\u11BE;" - + "\uD7EA > \u11BA\u11B7;" + "\uD7EB > \u11BA\u11B8\u11BC;" + "\uD7EC > \u11BA\u11BA\u11A8;" - + "\uD7ED > \u11BA\u11BA\u11AE;" + "\uD7EE > \u11BA\u11EB;" + "\uD7EF > \u11BA\u11BD;" - + "\uD7F0 > \u11BA\u11BE;" + "\uD7F1 > \u11BA\u11C0;" + "\uD7F2 > \u11BA\u11C2;" + "\uD7F3 > \u11EB\u11B8;" - + "\uD7F4 > \u11EB\u11B8\u11BC;" + "\uD7F5 > \u11F0\u11B7;" + "\uD7F6 > \u11F0\u11C2;" - + "\uD7F7 > \u11BD\u11B8;" + "\uD7F8 > \u11BD\u11B8\u11B8;" + "\uD7F9 > \u11BD\u11BD;" - + "\uD7FA > \u11C1\u11BA;" + "\uD7FB > \u11C1\u11C0;"; + static String MKD_RULES = + "\u1101 > \u1100\u1100;" + + "\u1104 > \u1103\u1103;" + + "\u1108 > \u1107\u1107;" + + "\u110A > \u1109\u1109;" + + "\u110D > \u110C\u110C;" + + "\u1113 > \u1102\u1100;" + + "\u1114 > \u1102\u1102;" + + "\u1115 > \u1102\u1103;" + + "\u1116 > \u1102\u1107;" + + "\u1117 > \u1103\u1100;" + + "\u1118 > \u1105\u1102;" + + "\u1119 > \u1105\u1105;" + + "\u111A > \u1105\u1112;" + + "\u111B > 
\u1105\u110B;" + + "\u111C > \u1106\u1107;" + + "\u111D > \u1106\u110B;" + + "\u111E > \u1107\u1100;" + + "\u111F > \u1107\u1102;" + + "\u1120 > \u1107\u1103;" + + "\u1121 > \u1107\u1109;" + + "\u1122 > \u1107\u1109\u1100;" + + "\u1123 > \u1107\u1109\u1103;" + + "\u1124 > \u1107\u1109\u1107;" + + "\u1125 > \u1107\u1109\u1109;" + + "\u1126 > \u1107\u1109\u110C;" + + "\u1127 > \u1107\u110C;" + + "\u1128 > \u1107\u110E;" + + "\u1129 > \u1107\u1110;" + + "\u112A > \u1107\u1111;" + + "\u112B > \u1107\u110B;" + + "\u112C > \u1107\u1107\u110B;" + + "\u112D > \u1109\u1100;" + + "\u112E > \u1109\u1102;" + + "\u112F > \u1109\u1103;" + + "\u1130 > \u1109\u1105;" + + "\u1131 > \u1109\u1106;" + + "\u1132 > \u1109\u1107;" + + "\u1133 > \u1109\u1107\u1100;" + + "\u1134 > \u1109\u1109\u1109;" + + "\u1135 > \u1109\u110B;" + + "\u1136 > \u1109\u110C;" + + "\u1137 > \u1109\u110E;" + + "\u1138 > \u1109\u110F;" + + "\u1139 > \u1109\u1110;" + + "\u113A > \u1109\u1111;" + + "\u113B > \u1109\u1112;" + + "\u113D > \u113C\u113C;" + + "\u113F > \u113E\u113E;" + + "\u1141 > \u110B\u1100;" + + "\u1142 > \u110B\u1103;" + + "\u1143 > \u110B\u1106;" + + "\u1144 > \u110B\u1107;" + + "\u1145 > \u110B\u1109;" + + "\u1146 > \u110B\u1140;" + + "\u1147 > \u110B\u110B;" + + "\u1148 > \u110B\u110C;" + + "\u1149 > \u110B\u110E;" + + "\u114A > \u110B\u1110;" + + "\u114B > \u110B\u1111;" + + "\u114D > \u110C\u110B;" + + "\u114F > \u114E\u114E;" + + "\u1151 > \u1150\u1150;" + + "\u1152 > \u110E\u110F;" + + "\u1153 > \u110E\u1112;" + + "\u1156 > \u1111\u1107;" + + "\u1157 > \u1111\u110B;" + + "\u1158 > \u1112\u1112;" + + "\u115A > \u1100\u1103;" + + "\u115B > \u1102\u1109;" + + "\u115C > \u1102\u110C;" + + "\u115D > \u1102\u1112;" + + "\u115E > \u1103\u1105;" + + "\uA960 > \u1103\u1106;" + + "\uA961 > \u1103\u1107;" + + "\uA962 > \u1103\u1109;" + + "\uA963 > \u1103\u110C;" + + "\uA964 > \u1105\u1100;" + + "\uA965 > \u1105\u1100\u1100;" + + "\uA966 > \u1105\u1103;" + + "\uA967 > \u1105\u1103\u1103;" + + 
"\uA968 > \u1105\u1106;" + + "\uA969 > \u1105\u1107;" + + "\uA96A > \u1105\u1107\u1107;" + + "\uA96B > \u1105\u1107\u110B;" + + "\uA96C > \u1105\u1109;" + + "\uA96D > \u1105\u110C;" + + "\uA96E > \u1105\u110F;" + + "\uA96F > \u1106\u1100;" + + "\uA970 > \u1106\u1103;" + + "\uA971 > \u1106\u1109;" + + "\uA972 > \u1107\u1109\u1110;" + + "\uA973 > \u1107\u110F;" + + "\uA974 > \u1107\u1112;" + + "\uA975 > \u1109\u1109\u1107;" + + "\uA976 > \u110B\u1105;" + + "\uA977 > \u110B\u1112;" + + "\uA978 > \u110C\u110C\u1112;" + + "\uA979 > \u1110\u1110;" + + "\uA97A > \u1111\u1112;" + + "\uA97B > \u1112\u1109;" + + "\uA97C > \u1159\u1159;" + + "\u1162 > \u1161\u1175;" + + "\u1164 > \u1163\u1175;" + + "\u1166 > \u1165\u1175;" + + "\u1168 > \u1167\u1175;" + + "\u116A > \u1169\u1161;" + + "\u116B > \u1169\u1161\u1175;" + + "\u116C > \u1169\u1175;" + + "\u116F > \u116E\u1165;" + + "\u1170 > \u116E\u1165\u1175;" + + "\u1171 > \u116E\u1175;" + + "\u1174 > \u1173\u1175;" + + "\u1176 > \u1161\u1169;" + + "\u1177 > \u1161\u116E;" + + "\u1178 > \u1163\u1169;" + + "\u1179 > \u1163\u116D;" + + "\u117A > \u1165\u1169;" + + "\u117B > \u1165\u116E;" + + "\u117C > \u1165\u1173;" + + "\u117D > \u1167\u1169;" + + "\u117E > \u1167\u116E;" + + "\u117F > \u1169\u1165;" + + "\u1180 > \u1169\u1165\u1175;" + + "\u1181 > \u1169\u1167\u1175;" + + "\u1182 > \u1169\u1169;" + + "\u1183 > \u1169\u116E;" + + "\u1184 > \u116D\u1163;" + + "\u1185 > \u116D\u1163\u1175;" + + "\u1186 > \u116D\u1167;" + + "\u1187 > \u116D\u1169;" + + "\u1188 > \u116D\u1175;" + + "\u1189 > \u116E\u1161;" + + "\u118A > \u116E\u1161\u1175;" + + "\u118B > \u116E\u1165\u1173;" + + "\u118C > \u116E\u1167\u1175;" + + "\u118D > \u116E\u116E;" + + "\u118E > \u1172\u1161;" + + "\u118F > \u1172\u1165;" + + "\u1190 > \u1172\u1165\u1175;" + + "\u1191 > \u1172\u1167;" + + "\u1192 > \u1172\u1167\u1175;" + + "\u1193 > \u1172\u116E;" + + "\u1194 > \u1172\u1175;" + + "\u1195 > \u1173\u116E;" + + "\u1196 > \u1173\u1173;" + + "\u1197 > 
\u1173\u1175\u116E;" + + "\u1198 > \u1175\u1161;" + + "\u1199 > \u1175\u1163;" + + "\u119A > \u1175\u1169;" + + "\u119B > \u1175\u116E;" + + "\u119C > \u1175\u1173;" + + "\u119D > \u1175\u119E;" + + "\u119F > \u119E\u1165;" + + "\u11A0 > \u119E\u116E;" + + "\u11A1 > \u119E\u1175;" + + "\u11A2 > \u119E\u119E;" + + "\u11A3 > \u1161\u1173;" + + "\u11A4 > \u1163\u116E;" + + "\u11A5 > \u1167\u1163;" + + "\u11A6 > \u1169\u1163;" + + "\u11A7 > \u1169\u1163\u1175;" + + "\uD7B0 > \u1169\u1167;" + + "\uD7B1 > \u1169\u1169\u1175;" + + "\uD7B2 > \u116D\u1161;" + + "\uD7B3 > \u116D\u1161\u1175;" + + "\uD7B4 > \u116D\u1165;" + + "\uD7B5 > \u116E\u1167;" + + "\uD7B6 > \u116E\u1175\u1175;" + + "\uD7B7 > \u1172\u1161\u1175;" + + "\uD7B8 > \u1172\u1169;" + + "\uD7B9 > \u1173\u1161;" + + "\uD7BA > \u1173\u1165;" + + "\uD7BB > \u1173\u1165\u1175;" + + "\uD7BC > \u1173\u1169;" + + "\uD7BD > \u1175\u1163\u1169;" + + "\uD7BE > \u1175\u1163\u1175;" + + "\uD7BF > \u1175\u1167;" + + "\uD7C0 > \u1175\u1167\u1175;" + + "\uD7C1 > \u1175\u1169\u1175;" + + "\uD7C2 > \u1175\u116D;" + + "\uD7C3 > \u1175\u1172;" + + "\uD7C4 > \u1175\u1175;" + + "\uD7C5 > \u119E\u1161;" + + "\uD7C6 > \u119E\u1165\u1175;" + + "\u11A9 > \u11A8\u11A8;" + + "\u11AA > \u11A8\u11BA;" + + "\u11AC > \u11AB\u11BD;" + + "\u11AD > \u11AB\u11C2;" + + "\u11B0 > \u11AF\u11A8;" + + "\u11B1 > \u11AF\u11B7;" + + "\u11B2 > \u11AF\u11B8;" + + "\u11B3 > \u11AF\u11BA;" + + "\u11B4 > \u11AF\u11C0;" + + "\u11B5 > \u11AF\u11C1;" + + "\u11B6 > \u11AF\u11C2;" + + "\u11B9 > \u11B8\u11BA;" + + "\u11BB > \u11BA\u11BA;" + + "\u11C3 > \u11A8\u11AF;" + + "\u11C4 > \u11A8\u11BA\u11A8;" + + "\u11C5 > \u11AB\u11A8;" + + "\u11C6 > \u11AB\u11AE;" + + "\u11C7 > \u11AB\u11BA;" + + "\u11C8 > \u11AB\u11EB;" + + "\u11C9 > \u11AB\u11C0;" + + "\u11CA > \u11AE\u11A8;" + + "\u11CB > \u11AE\u11AF;" + + "\u11CC > \u11AF\u11A8\u11BA;" + + "\u11CD > \u11AF\u11AB;" + + "\u11CE > \u11AF\u11AE;" + + "\u11CF > \u11AF\u11AE\u11C2;" + + "\u11D0 > \u11AF\u11AF;" + + 
"\u11D1 > \u11AF\u11B7\u11A8;" + + "\u11D2 > \u11AF\u11B7\u11BA;" + + "\u11D3 > \u11AF\u11B8\u11BA;" + + "\u11D4 > \u11AF\u11B8\u11C2;" + + "\u11D5 > \u11AF\u11B8\u11BC;" + + "\u11D6 > \u11AF\u11BA\u11BA;" + + "\u11D7 > \u11AF\u11EB;" + + "\u11D8 > \u11AF\u11BF;" + + "\u11D9 > \u11AF\u11F9;" + + "\u11DA > \u11B7\u11A8;" + + "\u11DB > \u11B7\u11AF;" + + "\u11DC > \u11B7\u11B8;" + + "\u11DD > \u11B7\u11BA;" + + "\u11DE > \u11B7\u11BA\u11BA;" + + "\u11DF > \u11B7\u11EB;" + + "\u11E0 > \u11B7\u11BE;" + + "\u11E1 > \u11B7\u11C2;" + + "\u11E2 > \u11B7\u11BC;" + + "\u11E3 > \u11B8\u11AF;" + + "\u11E4 > \u11B8\u11C1;" + + "\u11E5 > \u11B8\u11C2;" + + "\u11E6 > \u11B8\u11BC;" + + "\u11E7 > \u11BA\u11A8;" + + "\u11E8 > \u11BA\u11AE;" + + "\u11E9 > \u11BA\u11AF;" + + "\u11EA > \u11BA\u11B8;" + + "\u11EC > \u11BC\u11A8;" + + "\u11ED > \u11BC\u11A8\u11A8;" + + "\u11EE > \u11BC\u11BC;" + + "\u11EF > \u11BC\u11BF;" + + "\u11F1 > \u11F0\u11BA;" + + "\u11F2 > \u11F0\u11EB;" + + "\u11F3 > \u11C1\u11B8;" + + "\u11F4 > \u11C1\u11BC;" + + "\u11F5 > \u11C2\u11AB;" + + "\u11F6 > \u11C2\u11AF;" + + "\u11F7 > \u11C2\u11B7;" + + "\u11F8 > \u11C2\u11B8;" + + "\u11FA > \u11A8\u11AB;" + + "\u11FB > \u11A8\u11B8;" + + "\u11FC > \u11A8\u11BE;" + + "\u11FD > \u11A8\u11BF;" + + "\u11FE > \u11A8\u11C2;" + + "\u11FF > \u11AB\u11AB;" + + "\uD7CB > \u11AB\u11AF;" + + "\uD7CC > \u11AB\u11BE;" + + "\uD7CD > \u11AE\u11AE;" + + "\uD7CE > \u11AE\u11AE\u11B8;" + + "\uD7CF > \u11AE\u11B8;" + + "\uD7D0 > \u11AE\u11BA;" + + "\uD7D1 > \u11AE\u11BA\u11A8;" + + "\uD7D2 > \u11AE\u11BD;" + + "\uD7D3 > \u11AE\u11BE;" + + "\uD7D4 > \u11AE\u11C0;" + + "\uD7D5 > \u11AF\u11A8\u11A8;" + + "\uD7D6 > \u11AF\u11A8\u11C2;" + + "\uD7D7 > \u11AF\u11AF\u11BF;" + + "\uD7D8 > \u11AF\u11B7\u11C2;" + + "\uD7D9 > \u11AF\u11B8\u11AE;" + + "\uD7DA > \u11AF\u11B8\u11C1;" + + "\uD7DB > \u11AF\u11F0;" + + "\uD7DC > \u11AF\u11F9\u11C2;" + + "\uD7DD > \u11AF\u11BC;" + + "\uD7DE > \u11B7\u11AB;" + + "\uD7DF > \u11B7\u11AB\u11AB;" + + 
"\uD7E0 > \u11B7\u11B7;" + + "\uD7E1 > \u11B7\u11B8\u11BA;" + + "\uD7E2 > \u11B7\u11BD;" + + "\uD7E3 > \u11B8\u11AE;" + + "\uD7E4 > \u11B8\u11AF\u11C1;" + + "\uD7E5 > \u11B8\u11B7;" + + "\uD7E6 > \u11B8\u11B8;" + + "\uD7E7 > \u11B8\u11BA\u11AE;" + + "\uD7E8 > \u11B8\u11BD;" + + "\uD7E9 > \u11B8\u11BE;" + + "\uD7EA > \u11BA\u11B7;" + + "\uD7EB > \u11BA\u11B8\u11BC;" + + "\uD7EC > \u11BA\u11BA\u11A8;" + + "\uD7ED > \u11BA\u11BA\u11AE;" + + "\uD7EE > \u11BA\u11EB;" + + "\uD7EF > \u11BA\u11BD;" + + "\uD7F0 > \u11BA\u11BE;" + + "\uD7F1 > \u11BA\u11C0;" + + "\uD7F2 > \u11BA\u11C2;" + + "\uD7F3 > \u11EB\u11B8;" + + "\uD7F4 > \u11EB\u11B8\u11BC;" + + "\uD7F5 > \u11F0\u11B7;" + + "\uD7F6 > \u11F0\u11C2;" + + "\uD7F7 > \u11BD\u11B8;" + + "\uD7F8 > \u11BD\u11B8\u11B8;" + + "\uD7F9 > \u11BD\u11BD;" + + "\uD7FA > \u11C1\u11BA;" + + "\uD7FB > \u11C1\u11C0;"; static final String MKC_RULES = // "::MKD;"+ - "\u1107\u1109\u1100 > \u1122;" + "\u1107\u1109\u1103 > \u1123;" + "\u1107\u1109\u1107 > \u1124;" - + "\u1107\u1109\u1109 > \u1125;" + "\u1107\u1109\u110C > \u1126;" + "\u1107\u1107\u110B > \u112C;" - + "\u1109\u1107\u1100 > \u1133;" + "\u1109\u1109\u1109 > \u1134;" + "\u1169\u1161\u1175 > \u116B;" - + "\u116E\u1165\u1175 > \u1170;" + "\u1169\u1165\u1175 > \u1180;" + "\u1169\u1167\u1175 > \u1181;" - + "\u116D\u1163\u1175 > \u1185;" + "\u116E\u1161\u1175 > \u118A;" + "\u116E\u1165\u1173 > \u118B;" - + "\u116E\u1167\u1175 > \u118C;" + "\u1172\u1165\u1175 > \u1190;" + "\u1172\u1167\u1175 > \u1192;" - + "\u1173\u1175\u116E > \u1197;" + "\u1169\u1163\u1175 > \u11A7;" + "\u11A8\u11BA\u11A8 > \u11C4;" - + "\u11AF\u11A8\u11BA > \u11CC;" + "\u11AF\u11AE\u11C2 > \u11CF;" + "\u11AF\u11B7\u11A8 > \u11D1;" - + "\u11AF\u11B7\u11BA > \u11D2;" + "\u11AF\u11B8\u11BA > \u11D3;" + "\u11AF\u11B8\u11C2 > \u11D4;" - + "\u11AF\u11B8\u11BC > \u11D5;" + "\u11AF\u11BA\u11BA > \u11D6;" + "\u11B7\u11BA\u11BA > \u11DE;" - + "\u11BC\u11A8\u11A8 > \u11ED;" + "\u1105\u1100\u1100 > \uA965;" + "\u1105\u1103\u1103 
> \uA967;" - + "\u1105\u1107\u1107 > \uA96A;" + "\u1105\u1107\u110B > \uA96B;" + "\u1107\u1109\u1110 > \uA972;" - + "\u1109\u1109\u1107 > \uA975;" + "\u110C\u110C\u1112 > \uA978;" + "\u1169\u1169\u1175 > \uD7B1;" - + "\u116D\u1161\u1175 > \uD7B3;" + "\u116E\u1175\u1175 > \uD7B6;" + "\u1172\u1161\u1175 > \uD7B7;" - + "\u1173\u1165\u1175 > \uD7BB;" + "\u1175\u1163\u1169 > \uD7BD;" + "\u1175\u1163\u1175 > \uD7BE;" - + "\u1175\u1167\u1175 > \uD7C0;" + "\u1175\u1169\u1175 > \uD7C1;" + "\u119E\u1165\u1175 > \uD7C6;" - + "\u11AE\u11AE\u11B8 > \uD7CE;" + "\u11AE\u11BA\u11A8 > \uD7D1;" + "\u11AF\u11A8\u11A8 > \uD7D5;" - + "\u11AF\u11A8\u11C2 > \uD7D6;" + "\u11AF\u11AF\u11BF > \uD7D7;" + "\u11AF\u11B7\u11C2 > \uD7D8;" - + "\u11AF\u11B8\u11AE > \uD7D9;" + "\u11AF\u11B8\u11C1 > \uD7DA;" + "\u11AF\u11F9\u11C2 > \uD7DC;" - + "\u11B7\u11AB\u11AB > \uD7DF;" + "\u11B7\u11B8\u11BA > \uD7E1;" + "\u11B8\u11AF\u11C1 > \uD7E4;" - + "\u11B8\u11BA\u11AE > \uD7E7;" + "\u11BA\u11B8\u11BC > \uD7EB;" + "\u11BA\u11BA\u11A8 > \uD7EC;" - + "\u11BA\u11BA\u11AE > \uD7ED;" + "\u11EB\u11B8\u11BC > \uD7F4;" + "\u11BD\u11B8\u11B8 > \uD7F8;" - + "\u1100\u1100 > \u1101;" + "\u1103\u1103 > \u1104;" + "\u1107\u1107 > \u1108;" + "\u1109\u1109 > \u110A;" - + "\u110C\u110C > \u110D;" + "\u1102\u1100 > \u1113;" + "\u1102\u1102 > \u1114;" + "\u1102\u1103 > \u1115;" - + "\u1102\u1107 > \u1116;" + "\u1103\u1100 > \u1117;" + "\u1105\u1102 > \u1118;" + "\u1105\u1105 > \u1119;" - + "\u1105\u1112 > \u111A;" + "\u1105\u110B > \u111B;" + "\u1106\u1107 > \u111C;" + "\u1106\u110B > \u111D;" - + "\u1107\u1100 > \u111E;" + "\u1107\u1102 > \u111F;" + "\u1107\u1103 > \u1120;" + "\u1107\u1109 > \u1121;" - + "\u1107\u110C > \u1127;" + "\u1107\u110E > \u1128;" + "\u1107\u1110 > \u1129;" + "\u1107\u1111 > \u112A;" - + "\u1107\u110B > \u112B;" + "\u1109\u1100 > \u112D;" + "\u1109\u1102 > \u112E;" + "\u1109\u1103 > \u112F;" - + "\u1109\u1105 > \u1130;" + "\u1109\u1106 > \u1131;" + "\u1109\u1107 > \u1132;" + "\u1109\u110B > 
\u1135;" - + "\u1109\u110C > \u1136;" + "\u1109\u110E > \u1137;" + "\u1109\u110F > \u1138;" + "\u1109\u1110 > \u1139;" - + "\u1109\u1111 > \u113A;" + "\u1109\u1112 > \u113B;" + "\u113C\u113C > \u113D;" + "\u113E\u113E > \u113F;" - + "\u110B\u1100 > \u1141;" + "\u110B\u1103 > \u1142;" + "\u110B\u1106 > \u1143;" + "\u110B\u1107 > \u1144;" - + "\u110B\u1109 > \u1145;" + "\u110B\u1140 > \u1146;" + "\u110B\u110B > \u1147;" + "\u110B\u110C > \u1148;" - + "\u110B\u110E > \u1149;" + "\u110B\u1110 > \u114A;" + "\u110B\u1111 > \u114B;" + "\u110C\u110B > \u114D;" - + "\u114E\u114E > \u114F;" + "\u1150\u1150 > \u1151;" + "\u110E\u110F > \u1152;" + "\u110E\u1112 > \u1153;" - + "\u1111\u1107 > \u1156;" + "\u1111\u110B > \u1157;" + "\u1112\u1112 > \u1158;" + "\u1100\u1103 > \u115A;" - + "\u1102\u1109 > \u115B;" + "\u1102\u110C > \u115C;" + "\u1102\u1112 > \u115D;" + "\u1103\u1105 > \u115E;" - + "\u1161\u1175 > \u1162;" + "\u1163\u1175 > \u1164;" + "\u1165\u1175 > \u1166;" + "\u1167\u1175 > \u1168;" - + "\u1169\u1161 > \u116A;" + "\u1169\u1175 > \u116C;" + "\u116E\u1165 > \u116F;" + "\u116E\u1175 > \u1171;" - + "\u1173\u1175 > \u1174;" + "\u1161\u1169 > \u1176;" + "\u1161\u116E > \u1177;" + "\u1163\u1169 > \u1178;" - + "\u1163\u116D > \u1179;" + "\u1165\u1169 > \u117A;" + "\u1165\u116E > \u117B;" + "\u1165\u1173 > \u117C;" - + "\u1167\u1169 > \u117D;" + "\u1167\u116E > \u117E;" + "\u1169\u1165 > \u117F;" + "\u1169\u1169 > \u1182;" - + "\u1169\u116E > \u1183;" + "\u116D\u1163 > \u1184;" + "\u116D\u1167 > \u1186;" + "\u116D\u1169 > \u1187;" - + "\u116D\u1175 > \u1188;" + "\u116E\u1161 > \u1189;" + "\u116E\u116E > \u118D;" + "\u1172\u1161 > \u118E;" - + "\u1172\u1165 > \u118F;" + "\u1172\u1167 > \u1191;" + "\u1172\u116E > \u1193;" + "\u1172\u1175 > \u1194;" - + "\u1173\u116E > \u1195;" + "\u1173\u1173 > \u1196;" + "\u1175\u1161 > \u1198;" + "\u1175\u1163 > \u1199;" - + "\u1175\u1169 > \u119A;" + "\u1175\u116E > \u119B;" + "\u1175\u1173 > \u119C;" + "\u1175\u119E > \u119D;" - + 
"\u119E\u1165 > \u119F;" + "\u119E\u116E > \u11A0;" + "\u119E\u1175 > \u11A1;" + "\u119E\u119E > \u11A2;" - + "\u1161\u1173 > \u11A3;" + "\u1163\u116E > \u11A4;" + "\u1167\u1163 > \u11A5;" + "\u1169\u1163 > \u11A6;" - + "\u11A8\u11A8 > \u11A9;" + "\u11A8\u11BA > \u11AA;" + "\u11AB\u11BD > \u11AC;" + "\u11AB\u11C2 > \u11AD;" - + "\u11AF\u11A8 > \u11B0;" + "\u11AF\u11B7 > \u11B1;" + "\u11AF\u11B8 > \u11B2;" + "\u11AF\u11BA > \u11B3;" - + "\u11AF\u11C0 > \u11B4;" + "\u11AF\u11C1 > \u11B5;" + "\u11AF\u11C2 > \u11B6;" + "\u11B8\u11BA > \u11B9;" - + "\u11BA\u11BA > \u11BB;" + "\u11A8\u11AF > \u11C3;" + "\u11AB\u11A8 > \u11C5;" + "\u11AB\u11AE > \u11C6;" - + "\u11AB\u11BA > \u11C7;" + "\u11AB\u11EB > \u11C8;" + "\u11AB\u11C0 > \u11C9;" + "\u11AE\u11A8 > \u11CA;" - + "\u11AE\u11AF > \u11CB;" + "\u11AF\u11AB > \u11CD;" + "\u11AF\u11AE > \u11CE;" + "\u11AF\u11AF > \u11D0;" - + "\u11AF\u11EB > \u11D7;" + "\u11AF\u11BF > \u11D8;" + "\u11AF\u11F9 > \u11D9;" + "\u11B7\u11A8 > \u11DA;" - + "\u11B7\u11AF > \u11DB;" + "\u11B7\u11B8 > \u11DC;" + "\u11B7\u11BA > \u11DD;" + "\u11B7\u11EB > \u11DF;" - + "\u11B7\u11BE > \u11E0;" + "\u11B7\u11C2 > \u11E1;" + "\u11B7\u11BC > \u11E2;" + "\u11B8\u11AF > \u11E3;" - + "\u11B8\u11C1 > \u11E4;" + "\u11B8\u11C2 > \u11E5;" + "\u11B8\u11BC > \u11E6;" + "\u11BA\u11A8 > \u11E7;" - + "\u11BA\u11AE > \u11E8;" + "\u11BA\u11AF > \u11E9;" + "\u11BA\u11B8 > \u11EA;" + "\u11BC\u11A8 > \u11EC;" - + "\u11BC\u11BC > \u11EE;" + "\u11BC\u11BF > \u11EF;" + "\u11F0\u11BA > \u11F1;" + "\u11F0\u11EB > \u11F2;" - + "\u11C1\u11B8 > \u11F3;" + "\u11C1\u11BC > \u11F4;" + "\u11C2\u11AB > \u11F5;" + "\u11C2\u11AF > \u11F6;" - + "\u11C2\u11B7 > \u11F7;" + "\u11C2\u11B8 > \u11F8;" + "\u11A8\u11AB > \u11FA;" + "\u11A8\u11B8 > \u11FB;" - + "\u11A8\u11BE > \u11FC;" + "\u11A8\u11BF > \u11FD;" + "\u11A8\u11C2 > \u11FE;" + "\u11AB\u11AB > \u11FF;" - + "\u1103\u1106 > \uA960;" + "\u1103\u1107 > \uA961;" + "\u1103\u1109 > \uA962;" + "\u1103\u110C > \uA963;" - + "\u1105\u1100 > 
\uA964;" + "\u1105\u1103 > \uA966;" + "\u1105\u1106 > \uA968;" + "\u1105\u1107 > \uA969;" - + "\u1105\u1109 > \uA96C;" + "\u1105\u110C > \uA96D;" + "\u1105\u110F > \uA96E;" + "\u1106\u1100 > \uA96F;" - + "\u1106\u1103 > \uA970;" + "\u1106\u1109 > \uA971;" + "\u1107\u110F > \uA973;" + "\u1107\u1112 > \uA974;" - + "\u110B\u1105 > \uA976;" + "\u110B\u1112 > \uA977;" + "\u1110\u1110 > \uA979;" + "\u1111\u1112 > \uA97A;" - + "\u1112\u1109 > \uA97B;" + "\u1159\u1159 > \uA97C;" + "\u1169\u1167 > \uD7B0;" + "\u116D\u1161 > \uD7B2;" - + "\u116D\u1165 > \uD7B4;" + "\u116E\u1167 > \uD7B5;" + "\u1172\u1169 > \uD7B8;" + "\u1173\u1161 > \uD7B9;" - + "\u1173\u1165 > \uD7BA;" + "\u1173\u1169 > \uD7BC;" + "\u1175\u1167 > \uD7BF;" + "\u1175\u116D > \uD7C2;" - + "\u1175\u1172 > \uD7C3;" + "\u1175\u1175 > \uD7C4;" + "\u119E\u1161 > \uD7C5;" + "\u11AB\u11AF > \uD7CB;" - + "\u11AB\u11BE > \uD7CC;" + "\u11AE\u11AE > \uD7CD;" + "\u11AE\u11B8 > \uD7CF;" + "\u11AE\u11BA > \uD7D0;" - + "\u11AE\u11BD > \uD7D2;" + "\u11AE\u11BE > \uD7D3;" + "\u11AE\u11C0 > \uD7D4;" + "\u11AF\u11F0 > \uD7DB;" - + "\u11AF\u11BC > \uD7DD;" + "\u11B7\u11AB > \uD7DE;" + "\u11B7\u11B7 > \uD7E0;" + "\u11B7\u11BD > \uD7E2;" - + "\u11B8\u11AE > \uD7E3;" + "\u11B8\u11B7 > \uD7E5;" + "\u11B8\u11B8 > \uD7E6;" + "\u11B8\u11BD > \uD7E8;" - + "\u11B8\u11BE > \uD7E9;" + "\u11BA\u11B7 > \uD7EA;" + "\u11BA\u11EB > \uD7EE;" + "\u11BA\u11BD > \uD7EF;" - + "\u11BA\u11BE > \uD7F0;" + "\u11BA\u11C0 > \uD7F1;" + "\u11BA\u11C2 > \uD7F2;" + "\u11EB\u11B8 > \uD7F3;" - + "\u11F0\u11B7 > \uD7F5;" + "\u11F0\u11C2 > \uD7F6;" + "\u11BD\u11B8 > \uD7F7;" + "\u11BD\u11BD > \uD7F9;" - + "\u11C1\u11BA > \uD7FA;" + "\u11C1\u11C0 > \uD7FB;"; - - static final Transliterator MKD = Transliterator.createFromRules("MKD", "::NFD;" + MKD_RULES, - Transliterator.FORWARD); - static final Transliterator MKKD = Transliterator.createFromRules("MKD", "::NFKD;" + MKD_RULES, - Transliterator.FORWARD); - static final Transliterator MKC = 
Transliterator.createFromRules("MKC", "::NFD;" + MKD_RULES + "::null;" - + MKC_RULES + "::NFC;", Transliterator.FORWARD); + "\u1107\u1109\u1100 > \u1122;" + + "\u1107\u1109\u1103 > \u1123;" + + "\u1107\u1109\u1107 > \u1124;" + + "\u1107\u1109\u1109 > \u1125;" + + "\u1107\u1109\u110C > \u1126;" + + "\u1107\u1107\u110B > \u112C;" + + "\u1109\u1107\u1100 > \u1133;" + + "\u1109\u1109\u1109 > \u1134;" + + "\u1169\u1161\u1175 > \u116B;" + + "\u116E\u1165\u1175 > \u1170;" + + "\u1169\u1165\u1175 > \u1180;" + + "\u1169\u1167\u1175 > \u1181;" + + "\u116D\u1163\u1175 > \u1185;" + + "\u116E\u1161\u1175 > \u118A;" + + "\u116E\u1165\u1173 > \u118B;" + + "\u116E\u1167\u1175 > \u118C;" + + "\u1172\u1165\u1175 > \u1190;" + + "\u1172\u1167\u1175 > \u1192;" + + "\u1173\u1175\u116E > \u1197;" + + "\u1169\u1163\u1175 > \u11A7;" + + "\u11A8\u11BA\u11A8 > \u11C4;" + + "\u11AF\u11A8\u11BA > \u11CC;" + + "\u11AF\u11AE\u11C2 > \u11CF;" + + "\u11AF\u11B7\u11A8 > \u11D1;" + + "\u11AF\u11B7\u11BA > \u11D2;" + + "\u11AF\u11B8\u11BA > \u11D3;" + + "\u11AF\u11B8\u11C2 > \u11D4;" + + "\u11AF\u11B8\u11BC > \u11D5;" + + "\u11AF\u11BA\u11BA > \u11D6;" + + "\u11B7\u11BA\u11BA > \u11DE;" + + "\u11BC\u11A8\u11A8 > \u11ED;" + + "\u1105\u1100\u1100 > \uA965;" + + "\u1105\u1103\u1103 > \uA967;" + + "\u1105\u1107\u1107 > \uA96A;" + + "\u1105\u1107\u110B > \uA96B;" + + "\u1107\u1109\u1110 > \uA972;" + + "\u1109\u1109\u1107 > \uA975;" + + "\u110C\u110C\u1112 > \uA978;" + + "\u1169\u1169\u1175 > \uD7B1;" + + "\u116D\u1161\u1175 > \uD7B3;" + + "\u116E\u1175\u1175 > \uD7B6;" + + "\u1172\u1161\u1175 > \uD7B7;" + + "\u1173\u1165\u1175 > \uD7BB;" + + "\u1175\u1163\u1169 > \uD7BD;" + + "\u1175\u1163\u1175 > \uD7BE;" + + "\u1175\u1167\u1175 > \uD7C0;" + + "\u1175\u1169\u1175 > \uD7C1;" + + "\u119E\u1165\u1175 > \uD7C6;" + + "\u11AE\u11AE\u11B8 > \uD7CE;" + + "\u11AE\u11BA\u11A8 > \uD7D1;" + + "\u11AF\u11A8\u11A8 > \uD7D5;" + + "\u11AF\u11A8\u11C2 > \uD7D6;" + + "\u11AF\u11AF\u11BF > \uD7D7;" + + "\u11AF\u11B7\u11C2 
> \uD7D8;" + + "\u11AF\u11B8\u11AE > \uD7D9;" + + "\u11AF\u11B8\u11C1 > \uD7DA;" + + "\u11AF\u11F9\u11C2 > \uD7DC;" + + "\u11B7\u11AB\u11AB > \uD7DF;" + + "\u11B7\u11B8\u11BA > \uD7E1;" + + "\u11B8\u11AF\u11C1 > \uD7E4;" + + "\u11B8\u11BA\u11AE > \uD7E7;" + + "\u11BA\u11B8\u11BC > \uD7EB;" + + "\u11BA\u11BA\u11A8 > \uD7EC;" + + "\u11BA\u11BA\u11AE > \uD7ED;" + + "\u11EB\u11B8\u11BC > \uD7F4;" + + "\u11BD\u11B8\u11B8 > \uD7F8;" + + "\u1100\u1100 > \u1101;" + + "\u1103\u1103 > \u1104;" + + "\u1107\u1107 > \u1108;" + + "\u1109\u1109 > \u110A;" + + "\u110C\u110C > \u110D;" + + "\u1102\u1100 > \u1113;" + + "\u1102\u1102 > \u1114;" + + "\u1102\u1103 > \u1115;" + + "\u1102\u1107 > \u1116;" + + "\u1103\u1100 > \u1117;" + + "\u1105\u1102 > \u1118;" + + "\u1105\u1105 > \u1119;" + + "\u1105\u1112 > \u111A;" + + "\u1105\u110B > \u111B;" + + "\u1106\u1107 > \u111C;" + + "\u1106\u110B > \u111D;" + + "\u1107\u1100 > \u111E;" + + "\u1107\u1102 > \u111F;" + + "\u1107\u1103 > \u1120;" + + "\u1107\u1109 > \u1121;" + + "\u1107\u110C > \u1127;" + + "\u1107\u110E > \u1128;" + + "\u1107\u1110 > \u1129;" + + "\u1107\u1111 > \u112A;" + + "\u1107\u110B > \u112B;" + + "\u1109\u1100 > \u112D;" + + "\u1109\u1102 > \u112E;" + + "\u1109\u1103 > \u112F;" + + "\u1109\u1105 > \u1130;" + + "\u1109\u1106 > \u1131;" + + "\u1109\u1107 > \u1132;" + + "\u1109\u110B > \u1135;" + + "\u1109\u110C > \u1136;" + + "\u1109\u110E > \u1137;" + + "\u1109\u110F > \u1138;" + + "\u1109\u1110 > \u1139;" + + "\u1109\u1111 > \u113A;" + + "\u1109\u1112 > \u113B;" + + "\u113C\u113C > \u113D;" + + "\u113E\u113E > \u113F;" + + "\u110B\u1100 > \u1141;" + + "\u110B\u1103 > \u1142;" + + "\u110B\u1106 > \u1143;" + + "\u110B\u1107 > \u1144;" + + "\u110B\u1109 > \u1145;" + + "\u110B\u1140 > \u1146;" + + "\u110B\u110B > \u1147;" + + "\u110B\u110C > \u1148;" + + "\u110B\u110E > \u1149;" + + "\u110B\u1110 > \u114A;" + + "\u110B\u1111 > \u114B;" + + "\u110C\u110B > \u114D;" + + "\u114E\u114E > \u114F;" + + "\u1150\u1150 > \u1151;" + 
+ "\u110E\u110F > \u1152;" + + "\u110E\u1112 > \u1153;" + + "\u1111\u1107 > \u1156;" + + "\u1111\u110B > \u1157;" + + "\u1112\u1112 > \u1158;" + + "\u1100\u1103 > \u115A;" + + "\u1102\u1109 > \u115B;" + + "\u1102\u110C > \u115C;" + + "\u1102\u1112 > \u115D;" + + "\u1103\u1105 > \u115E;" + + "\u1161\u1175 > \u1162;" + + "\u1163\u1175 > \u1164;" + + "\u1165\u1175 > \u1166;" + + "\u1167\u1175 > \u1168;" + + "\u1169\u1161 > \u116A;" + + "\u1169\u1175 > \u116C;" + + "\u116E\u1165 > \u116F;" + + "\u116E\u1175 > \u1171;" + + "\u1173\u1175 > \u1174;" + + "\u1161\u1169 > \u1176;" + + "\u1161\u116E > \u1177;" + + "\u1163\u1169 > \u1178;" + + "\u1163\u116D > \u1179;" + + "\u1165\u1169 > \u117A;" + + "\u1165\u116E > \u117B;" + + "\u1165\u1173 > \u117C;" + + "\u1167\u1169 > \u117D;" + + "\u1167\u116E > \u117E;" + + "\u1169\u1165 > \u117F;" + + "\u1169\u1169 > \u1182;" + + "\u1169\u116E > \u1183;" + + "\u116D\u1163 > \u1184;" + + "\u116D\u1167 > \u1186;" + + "\u116D\u1169 > \u1187;" + + "\u116D\u1175 > \u1188;" + + "\u116E\u1161 > \u1189;" + + "\u116E\u116E > \u118D;" + + "\u1172\u1161 > \u118E;" + + "\u1172\u1165 > \u118F;" + + "\u1172\u1167 > \u1191;" + + "\u1172\u116E > \u1193;" + + "\u1172\u1175 > \u1194;" + + "\u1173\u116E > \u1195;" + + "\u1173\u1173 > \u1196;" + + "\u1175\u1161 > \u1198;" + + "\u1175\u1163 > \u1199;" + + "\u1175\u1169 > \u119A;" + + "\u1175\u116E > \u119B;" + + "\u1175\u1173 > \u119C;" + + "\u1175\u119E > \u119D;" + + "\u119E\u1165 > \u119F;" + + "\u119E\u116E > \u11A0;" + + "\u119E\u1175 > \u11A1;" + + "\u119E\u119E > \u11A2;" + + "\u1161\u1173 > \u11A3;" + + "\u1163\u116E > \u11A4;" + + "\u1167\u1163 > \u11A5;" + + "\u1169\u1163 > \u11A6;" + + "\u11A8\u11A8 > \u11A9;" + + "\u11A8\u11BA > \u11AA;" + + "\u11AB\u11BD > \u11AC;" + + "\u11AB\u11C2 > \u11AD;" + + "\u11AF\u11A8 > \u11B0;" + + "\u11AF\u11B7 > \u11B1;" + + "\u11AF\u11B8 > \u11B2;" + + "\u11AF\u11BA > \u11B3;" + + "\u11AF\u11C0 > \u11B4;" + + "\u11AF\u11C1 > \u11B5;" + + "\u11AF\u11C2 > \u11B6;" 
+ + "\u11B8\u11BA > \u11B9;" + + "\u11BA\u11BA > \u11BB;" + + "\u11A8\u11AF > \u11C3;" + + "\u11AB\u11A8 > \u11C5;" + + "\u11AB\u11AE > \u11C6;" + + "\u11AB\u11BA > \u11C7;" + + "\u11AB\u11EB > \u11C8;" + + "\u11AB\u11C0 > \u11C9;" + + "\u11AE\u11A8 > \u11CA;" + + "\u11AE\u11AF > \u11CB;" + + "\u11AF\u11AB > \u11CD;" + + "\u11AF\u11AE > \u11CE;" + + "\u11AF\u11AF > \u11D0;" + + "\u11AF\u11EB > \u11D7;" + + "\u11AF\u11BF > \u11D8;" + + "\u11AF\u11F9 > \u11D9;" + + "\u11B7\u11A8 > \u11DA;" + + "\u11B7\u11AF > \u11DB;" + + "\u11B7\u11B8 > \u11DC;" + + "\u11B7\u11BA > \u11DD;" + + "\u11B7\u11EB > \u11DF;" + + "\u11B7\u11BE > \u11E0;" + + "\u11B7\u11C2 > \u11E1;" + + "\u11B7\u11BC > \u11E2;" + + "\u11B8\u11AF > \u11E3;" + + "\u11B8\u11C1 > \u11E4;" + + "\u11B8\u11C2 > \u11E5;" + + "\u11B8\u11BC > \u11E6;" + + "\u11BA\u11A8 > \u11E7;" + + "\u11BA\u11AE > \u11E8;" + + "\u11BA\u11AF > \u11E9;" + + "\u11BA\u11B8 > \u11EA;" + + "\u11BC\u11A8 > \u11EC;" + + "\u11BC\u11BC > \u11EE;" + + "\u11BC\u11BF > \u11EF;" + + "\u11F0\u11BA > \u11F1;" + + "\u11F0\u11EB > \u11F2;" + + "\u11C1\u11B8 > \u11F3;" + + "\u11C1\u11BC > \u11F4;" + + "\u11C2\u11AB > \u11F5;" + + "\u11C2\u11AF > \u11F6;" + + "\u11C2\u11B7 > \u11F7;" + + "\u11C2\u11B8 > \u11F8;" + + "\u11A8\u11AB > \u11FA;" + + "\u11A8\u11B8 > \u11FB;" + + "\u11A8\u11BE > \u11FC;" + + "\u11A8\u11BF > \u11FD;" + + "\u11A8\u11C2 > \u11FE;" + + "\u11AB\u11AB > \u11FF;" + + "\u1103\u1106 > \uA960;" + + "\u1103\u1107 > \uA961;" + + "\u1103\u1109 > \uA962;" + + "\u1103\u110C > \uA963;" + + "\u1105\u1100 > \uA964;" + + "\u1105\u1103 > \uA966;" + + "\u1105\u1106 > \uA968;" + + "\u1105\u1107 > \uA969;" + + "\u1105\u1109 > \uA96C;" + + "\u1105\u110C > \uA96D;" + + "\u1105\u110F > \uA96E;" + + "\u1106\u1100 > \uA96F;" + + "\u1106\u1103 > \uA970;" + + "\u1106\u1109 > \uA971;" + + "\u1107\u110F > \uA973;" + + "\u1107\u1112 > \uA974;" + + "\u110B\u1105 > \uA976;" + + "\u110B\u1112 > \uA977;" + + "\u1110\u1110 > \uA979;" + + "\u1111\u1112 > 
\uA97A;" + + "\u1112\u1109 > \uA97B;" + + "\u1159\u1159 > \uA97C;" + + "\u1169\u1167 > \uD7B0;" + + "\u116D\u1161 > \uD7B2;" + + "\u116D\u1165 > \uD7B4;" + + "\u116E\u1167 > \uD7B5;" + + "\u1172\u1169 > \uD7B8;" + + "\u1173\u1161 > \uD7B9;" + + "\u1173\u1165 > \uD7BA;" + + "\u1173\u1169 > \uD7BC;" + + "\u1175\u1167 > \uD7BF;" + + "\u1175\u116D > \uD7C2;" + + "\u1175\u1172 > \uD7C3;" + + "\u1175\u1175 > \uD7C4;" + + "\u119E\u1161 > \uD7C5;" + + "\u11AB\u11AF > \uD7CB;" + + "\u11AB\u11BE > \uD7CC;" + + "\u11AE\u11AE > \uD7CD;" + + "\u11AE\u11B8 > \uD7CF;" + + "\u11AE\u11BA > \uD7D0;" + + "\u11AE\u11BD > \uD7D2;" + + "\u11AE\u11BE > \uD7D3;" + + "\u11AE\u11C0 > \uD7D4;" + + "\u11AF\u11F0 > \uD7DB;" + + "\u11AF\u11BC > \uD7DD;" + + "\u11B7\u11AB > \uD7DE;" + + "\u11B7\u11B7 > \uD7E0;" + + "\u11B7\u11BD > \uD7E2;" + + "\u11B8\u11AE > \uD7E3;" + + "\u11B8\u11B7 > \uD7E5;" + + "\u11B8\u11B8 > \uD7E6;" + + "\u11B8\u11BD > \uD7E8;" + + "\u11B8\u11BE > \uD7E9;" + + "\u11BA\u11B7 > \uD7EA;" + + "\u11BA\u11EB > \uD7EE;" + + "\u11BA\u11BD > \uD7EF;" + + "\u11BA\u11BE > \uD7F0;" + + "\u11BA\u11C0 > \uD7F1;" + + "\u11BA\u11C2 > \uD7F2;" + + "\u11EB\u11B8 > \uD7F3;" + + "\u11F0\u11B7 > \uD7F5;" + + "\u11F0\u11C2 > \uD7F6;" + + "\u11BD\u11B8 > \uD7F7;" + + "\u11BD\u11BD > \uD7F9;" + + "\u11C1\u11BA > \uD7FA;" + + "\u11C1\u11C0 > \uD7FB;"; + + static final Transliterator MKD = + Transliterator.createFromRules("MKD", "::NFD;" + MKD_RULES, Transliterator.FORWARD); + static final Transliterator MKKD = + Transliterator.createFromRules("MKD", "::NFKD;" + MKD_RULES, Transliterator.FORWARD); + static final Transliterator MKC = + Transliterator.createFromRules( + "MKC", + "::NFD;" + MKD_RULES + "::null;" + MKC_RULES + "::NFC;", + Transliterator.FORWARD); // static final String MKDP_RULES = // MKD_RULES + @@ -1580,18 +2402,32 @@ public static UnicodeSet flatten(UnicodeSet exemplar1) { // "::NFD;"+ MKDP_RULES + "::null;" + MKCP_RULES + "::NFC;", // Transliterator.FORWARD); - static Pattern 
IS_ARCHAIC = Pattern.compile("(Obsolete|Ancient|Archaic|Medieval|New Testament|\\bUPA\\b)", - Pattern.CASE_INSENSITIVE); - - public static final UnicodeSet ADD_SUBHEAD = (UnicodeSet) ScriptCategories2 - .parseUnicodeSet("[[:S:][:P:][:M:]&[[:script=common:][:script=inherited:]]-[:nfkdqc=n:]]") - .removeAll(ScriptCategories2.ARCHAIC).freeze(); - static UnicodeSet UNCOMMON_HAN = ScriptCategories2.parseUnicodeSet("[" + "[:script=han:]" - + "-[:block=CJK Unified Ideographs:]" + "-[:block=CJK Symbols And Punctuation:]" - + "-[:block=CJK Radicals Supplement:]" + "-[:block=Ideographic Description Characters:]" - + "-[:block=CJK Strokes:]" + "-[:script=hiragana:]" + "-[:script=katakana:]" + "-[〇]" + "]"); // we'll alter - // below to remove - // iicore + static Pattern IS_ARCHAIC = + Pattern.compile( + "(Obsolete|Ancient|Archaic|Medieval|New Testament|\\bUPA\\b)", + Pattern.CASE_INSENSITIVE); + + public static final UnicodeSet ADD_SUBHEAD = + (UnicodeSet) + ScriptCategories2.parseUnicodeSet( + "[[:S:][:P:][:M:]&[[:script=common:][:script=inherited:]]-[:nfkdqc=n:]]") + .removeAll(ScriptCategories2.ARCHAIC) + .freeze(); + static UnicodeSet UNCOMMON_HAN = + ScriptCategories2.parseUnicodeSet( + "[" + + "[:script=han:]" + + "-[:block=CJK Unified Ideographs:]" + + "-[:block=CJK Symbols And Punctuation:]" + + "-[:block=CJK Radicals Supplement:]" + + "-[:block=Ideographic Description Characters:]" + + "-[:block=CJK Strokes:]" + + "-[:script=hiragana:]" + + "-[:script=katakana:]" + + "-[〇]" + + "]"); // we'll alter + // below to remove + // iicore static class Renamer { static class MatchData { @@ -1634,14 +2470,15 @@ void getRenameData(String filename) throws IOException { int breaker = line.indexOf(">"); String source = line.substring(0, breaker).trim(); String target = line.substring(breaker + 1).trim(); - renameTable.put(Pattern.compile(source, Pattern.CASE_INSENSITIVE).matcher(""), new MatchData( - source, target)); + renameTable.put( + Pattern.compile(source, 
Pattern.CASE_INSENSITIVE).matcher(""), + new MatchData(source, target)); } catch (Exception e) { - throw (RuntimeException) new IllegalArgumentException("Problem with: " + line).initCause(e); + throw (RuntimeException) + new IllegalArgumentException("Problem with: " + line).initCause(e); } } in.close(); - } // static final String[] RENAME_TABLE = { @@ -1649,11 +2486,13 @@ void getRenameData(String filename) throws IOException { // ".*Category:(.*) - (.*)>$1:$2", // ".*Category:([^ ]*)[ ](.*)>$2:$1", // ".*Category:(.*)>$1:Miscellaneous", - // "Symbol:Latin 1 Supplement - Latin-1 punctuation and symbols > Symbol:Latin-1 punctuation and symbols", + // "Symbol:Latin 1 Supplement - Latin-1 punctuation and symbols > Symbol:Latin-1 punctuation + // and symbols", // "Mark:(.*) > General Diacritic:$1", // "Symbol:(.*) - (.*(arrows|harpoons).*) > Arrows:$2", // "Symbol:Control Pictures.*>Symbol:Control Pictures", - // "Symbol:(Box Drawing|Block Elements|Geometric Shapes|Miscellaneous Symbols And Arrows).*>Symbol:Geometric Shapes", + // "Symbol:(Box Drawing|Block Elements|Geometric Shapes|Miscellaneous Symbols And + // Arrows).*>Symbol:Geometric Shapes", // "Symbol:(.*) Tiles.*>Symbol:Tiles and Dominoes", // "Symbol:.*Musical.*>Symbol:Musical Symbols", // "Symbol:Tai Xuan Jing Symbols.*>Symbol:Tai Xuan Jing Symbols", @@ -1697,7 +2536,8 @@ void getRenameData(String filename) throws IOException { // String target = row.substring(breaker+1).trim(); // renameTable.put(Pattern.compile(source,Pattern.CASE_INSENSITIVE).matcher(""), target); // } catch (Exception e) { - // throw (RuntimeException) new IllegalArgumentException("Problem with: " + row).initCause(e); + // throw (RuntimeException) new IllegalArgumentException("Problem with: " + + // row).initCause(e); // } // } // } @@ -1716,7 +2556,7 @@ SimplePair rename(String maincategory, String subcategory) { if (true) System.out.println(); } String indent = ""; - for (int count = 0;; ++count) { + for (int count = 0; ; ++count) { 
boolean didMatch = false; for (Matcher m : renameTable.keySet()) { if (m.reset(lookup).matches()) { @@ -1728,8 +2568,15 @@ SimplePair rename(String maincategory, String subcategory) { if (lookup.equals(newName)) { continue; } - renamingLog.println(indent + lookup + "\t=>\t" + newName + "\t // " + newNames.source + " > " - + newNames.target); + renamingLog.println( + indent + + lookup + + "\t=>\t" + + newName + + "\t // " + + newNames.source + + " > " + + newNames.target); lookup = newName; indent += "\t"; newNames.used = true; @@ -1765,7 +2612,7 @@ public void showUnusedRules() { } public static > U addAllToCollection(UnicodeSet input, U output) { - for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next(); ) { output.add(it.getString()); } return output; @@ -1783,7 +2630,8 @@ private static void addManualCorrections(String fileName) throws IOException { } String components[] = line.split(";"); if (components.length != 4) { - throw new IOException("Invalid line: <" + line + "> - Expecting 4 ';' separated components"); + throw new IOException( + "Invalid line: <" + line + "> - Expecting 4 ';' separated components"); } UnicodeSet set = new UnicodeSet(components[3]); String subCategory = components[1]; @@ -1794,19 +2642,28 @@ private static void addManualCorrections(String fileName) throws IOException { } if (components[2].equals("Add")) { - CATEGORYTABLE.add(components[0], false, subCategory, buttonComparator, Separation.ALL_ORDINARY, set); - } - else if (components[2].equals("Remove")) { + CATEGORYTABLE.add( + components[0], + false, + subCategory, + buttonComparator, + Separation.ALL_ORDINARY, + set); + } else if (components[2].equals("Remove")) { CATEGORYTABLE.removeAll(components[0], subCategory, set); } else { - throw new IOException("Invalid operation: <" + components[2] + "> - Expecting one of {Add,Remove}"); + throw new IOException( + "Invalid operation: <" + + 
components[2] + + "> - Expecting one of {Add,Remove}"); } } } private static void addEmojiCharacters() throws IOException { - File emojiSources = new File(unicodeDataDirectory + "/EmojiSources.txt"); // Needs fixing for release vs - // non-released directory + File emojiSources = + new File(unicodeDataDirectory + "/EmojiSources.txt"); // Needs fixing for release vs + // non-released directory FileInputStream fis = new FileInputStream(emojiSources); BufferedReader in = new BufferedReader(new InputStreamReader(fis, "UTF-8")); UnicodeSet emojiCharacters = new UnicodeSet(); @@ -1827,11 +2684,18 @@ private static void addEmojiCharacters() throws IOException { emojiCharacters.add(codepoint); } in.close(); - CATEGORYTABLE.add("Symbol", false, "Emoji", buttonComparator, Separation.ALL_ORDINARY, emojiCharacters); + CATEGORYTABLE.add( + "Symbol", + false, + "Emoji", + buttonComparator, + Separation.ALL_ORDINARY, + emojiCharacters); } - public static > U removeAllFromCollection(UnicodeSet input, U output) { - for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next();) { + public static > U removeAllFromCollection( + UnicodeSet input, U output) { + for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next(); ) { output.remove(it.getString()); } return output; diff --git a/unicodetools/src/main/java/org/unicode/draft/GenerateUnihanCollatorFiles.java b/unicodetools/src/main/java/org/unicode/draft/GenerateUnihanCollatorFiles.java index 5e5437dc6..255f383a7 100644 --- a/unicodetools/src/main/java/org/unicode/draft/GenerateUnihanCollatorFiles.java +++ b/unicodetools/src/main/java/org/unicode/draft/GenerateUnihanCollatorFiles.java @@ -6,7 +6,6 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; public class GenerateUnihanCollatorFiles { @@ -14,24 +13,27 @@ public class GenerateUnihanCollatorFiles { static final String OUTPUT_DIRECTORY = CldrUtility.GEN_DIRECTORY + "/han"; 
static final String OUTPUT_DIRECTORY_REPLACE = CldrUtility.GEN_DIRECTORY + "/han/replace"; - static final Pattern START_AUTOGEN = Pattern.compile(".*#\\s*START\\s*AUTOGENERATED\\s*([^(]*).*"); + static final Pattern START_AUTOGEN = + Pattern.compile(".*#\\s*START\\s*AUTOGENERATED\\s*([^(]*).*"); static final Pattern END_AUTOGEN = Pattern.compile(".*#\\s*END\\s*AUTOGENERATED\\s*([^(]*).*"); // new format // # START AUTOGENERATED PINYIN LONG (sort by pinyin then kTotalStrokes then kRSUnicode) - public static void main(String[] args) throws IOException { composeFile(CldrUtility.COMMON_DIRECTORY + "/collation/", "zh.xml", true); composeFile(CldrUtility.COMMON_DIRECTORY + "/transforms/", "Han-Latin.xml", false); } - private static void composeFile(String inputDirectory, String fileName, boolean fixChoice) throws IOException { + private static void composeFile(String inputDirectory, String fileName, boolean fixChoice) + throws IOException { final Matcher start_autogen = START_AUTOGEN.matcher(""); final Matcher end_autogen = END_AUTOGEN.matcher(""); int count = 0; - try (final PrintWriter newFile = FileUtilities.openUTF8Writer(OUTPUT_DIRECTORY_REPLACE, fileName); - final BufferedReader oldFile = FileUtilities.openUTF8Reader(inputDirectory, fileName)) { + try (final PrintWriter newFile = + FileUtilities.openUTF8Writer(OUTPUT_DIRECTORY_REPLACE, fileName); + final BufferedReader oldFile = + FileUtilities.openUTF8Reader(inputDirectory, fileName)) { while (true) { // copy up to the first autogen comment, including the comment line @@ -39,29 +41,41 @@ private static void composeFile(String inputDirectory, String fileName, boolean newFile.flush(); if (matchingLine == null) { if (count == 0) { - throw new IllegalArgumentException("No START comments for autogeneration: " + matchingLine); + throw new IllegalArgumentException( + "No START comments for autogeneration: " + matchingLine); } break; } ++count; final String choice = start_autogen.group(1).trim(); - final String 
replacementFile = fixChoice - ? choice.toLowerCase(Locale.ENGLISH).replaceAll("\\s+", "_").replace("_long", "").replace("stroke", "strokeT") + ".txt" - : choice; + final String replacementFile = + fixChoice + ? choice.toLowerCase(Locale.ENGLISH) + .replaceAll("\\s+", "_") + .replace("_long", "") + .replace("stroke", "strokeT") + + ".txt" + : choice; // copy the file to be inserted - try (final BufferedReader insertFile = FileUtilities.openUTF8Reader(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, replacementFile)) { - CldrUtility.copyUpTo(insertFile, (Matcher)null, newFile, true); // copy to end + try (final BufferedReader insertFile = + FileUtilities.openUTF8Reader( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, replacementFile)) { + CldrUtility.copyUpTo(insertFile, (Matcher) null, newFile, true); // copy to end newFile.flush(); } - //insertFile.close(); + // insertFile.close(); // skip to the end of the matching autogen comment matchingLine = CldrUtility.copyUpTo(oldFile, end_autogen, null, true); // check for matching comment if (matchingLine == null || !choice.equals(end_autogen.group(1).trim())) { - throw new IllegalArgumentException("Mismatched comments for autogeneration: " + choice + ", " + matchingLine); + throw new IllegalArgumentException( + "Mismatched comments for autogeneration: " + + choice + + ", " + + matchingLine); } newFile.println(matchingLine); // copy comment line newFile.flush(); @@ -70,4 +84,3 @@ private static void composeFile(String inputDirectory, String fileName, boolean System.out.println(count + " segments replaced"); } } - diff --git a/unicodetools/src/main/java/org/unicode/draft/GenerateUnihanCollators.java b/unicodetools/src/main/java/org/unicode/draft/GenerateUnihanCollators.java index 0f2e58dda..1d813c1f5 100644 --- a/unicodetools/src/main/java/org/unicode/draft/GenerateUnihanCollators.java +++ b/unicodetools/src/main/java/org/unicode/draft/GenerateUnihanCollators.java @@ -1,5 +1,22 @@ package org.unicode.draft; +import 
com.google.common.base.Splitter; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.dev.util.UnicodeMap.EntryRange; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.impl.Row.R4; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.text.Transform; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ICUException; +import com.ibm.icu.util.Output; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -22,7 +39,6 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.Counter; import org.unicode.cldr.util.Differ; @@ -38,30 +54,14 @@ import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.google.common.base.Splitter; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.dev.util.UnicodeMap.EntryRange; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.impl.Row.R4; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.text.Transform; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ICUException; -import com.ibm.icu.util.Output; -import com.ibm.icu.util.ULocale; - public class GenerateUnihanCollators { private static final boolean DEBUG = false; private static String version = CldrUtility.getProperty("UVERSION"); + static { - System.out.println("To make files for a different version of unicode, use -DUVERSION=x.y.z"); + System.out.println( + "To make files 
for a different version of unicode, use -DUVERSION=x.y.z"); if (version == null) { version = Settings.latestVersion; } else { @@ -69,74 +69,93 @@ public class GenerateUnihanCollators { Default.setUCD(version); } } + private static final IndexUnicodeProperties IUP = IndexUnicodeProperties.make(version); private static final RadicalStroke radicalStroke = new RadicalStroke(version); private static final char INDEX_ITEM_BASE = '\u2800'; - private enum FileType {txt, xml} + private enum FileType { + txt, + xml + } + private enum InfoType { - radicalStroke("\uFDD2"), stroke("\uFDD1"), pinyin("\uFDD0"); + radicalStroke("\uFDD2"), + stroke("\uFDD1"), + pinyin("\uFDD0"); final String base = "\uFDD0"; + InfoType(String base) { - //this.base = base; + // this.base = base; } } - private enum OverrideItems {keepOld, keepNew} + private enum OverrideItems { + keepOld, + keepNew + } - private static final Transform fromNumericPinyin = Transliterator.getInstance("NumericPinyin-Latin;nfc"); - private static final Transliterator toNumericPinyin = Transliterator.getInstance("Latin-NumericPinyin;nfc"); + private static final Transform fromNumericPinyin = + Transliterator.getInstance("NumericPinyin-Latin;nfc"); + private static final Transliterator toNumericPinyin = + Transliterator.getInstance("Latin-NumericPinyin;nfc"); - private static final Normalizer nfkd = Default.nfkd(); - private static final Normalizer nfd = Default.nfd(); - private static final Normalizer nfc = Default.nfc(); + private static final Normalizer nfkd = Default.nfkd(); + private static final Normalizer nfd = Default.nfd(); + private static final Normalizer nfc = Default.nfc(); - private static final UnicodeSet PINYIN_LETTERS = new UnicodeSet("['a-uw-zàáèéìíòóùúüāēěīōūǎǐǒǔǖǘǚǜ]").freeze(); + private static final UnicodeSet PINYIN_LETTERS = + new UnicodeSet("['a-uw-zàáèéìíòóùúüāēěīōūǎǐǒǔǖǘǚǜ]").freeze(); // these should be ok, eve if we are not on an old version - private static final UnicodeSet NOT_NFC = new 
UnicodeSet("[:nfc_qc=no:]").freeze(); - private static final UnicodeSet NOT_NFD = new UnicodeSet("[:nfd_qc=no:]").freeze(); - private static final UnicodeSet NOT_NFKD = new UnicodeSet("[:nfkd_qc=no:]").freeze(); + private static final UnicodeSet NOT_NFC = new UnicodeSet("[:nfc_qc=no:]").freeze(); + private static final UnicodeSet NOT_NFD = new UnicodeSet("[:nfd_qc=no:]").freeze(); + private static final UnicodeSet NOT_NFKD = new UnicodeSet("[:nfkd_qc=no:]").freeze(); + + // specifically restrict this to the set version. Theoretically there could be some variance in + // ideographic, but it isn't worth worrying about - // specifically restrict this to the set version. Theoretically there could be some variance in ideographic, but it isn't worth worrying about + private static final UnicodeSet UNIHAN_LATEST = + new UnicodeSet("[[:ideographic:][:script=han:]]").removeAll(NOT_NFC).freeze(); + private static final UnicodeSet UNIHAN = + version == null + ? UNIHAN_LATEST + : new UnicodeSet("[:age=" + version + ":]").retainAll(UNIHAN_LATEST).freeze(); - private static final UnicodeSet UNIHAN_LATEST = new UnicodeSet("[[:ideographic:][:script=han:]]") - .removeAll(NOT_NFC) - .freeze(); - private static final UnicodeSet UNIHAN = version == null ? 
UNIHAN_LATEST - : new UnicodeSet("[:age=" + version + ":]") - .retainAll(UNIHAN_LATEST) - .freeze(); static { if (!UNIHAN.contains(0x2B820)) { throw new ICUException(Utility.hex(0x2B820) + " not supported"); } } - private static Matcher unicodeCp = Pattern.compile("^U\\+(2?[0-9A-F]{4})$").matcher(""); - private static final HashMap validPinyin = new HashMap(); - private static final Collator pinyinSort = Collator.getInstance(new ULocale("zh@collator=pinyin")); - private static final Collator strokeSort = Collator.getInstance(new ULocale("zh@collator=stroke")); - - private static final Comparator codepointComparator = new UTF16.StringComparator(true, false, 0); - private static final Comparator nfkdComparator = new Comparator() { - @Override - public int compare(String o1, String o2) { - if (!nfkd.isNormalized(o1) || !nfkd.isNormalized(o2)) { - final String s1 = nfkd.normalize(o1); - final String s2 = nfkd.normalize(o2); - final int result = codepointComparator.compare(s1, s2); - if (result != 0) { - return result; // otherwise fall through to codepoint comparator + private static Matcher unicodeCp = Pattern.compile("^U\\+(2?[0-9A-F]{4})$").matcher(""); + private static final HashMap validPinyin = new HashMap(); + private static final Collator pinyinSort = + Collator.getInstance(new ULocale("zh@collator=pinyin")); + private static final Collator strokeSort = + Collator.getInstance(new ULocale("zh@collator=stroke")); + + private static final Comparator codepointComparator = + new UTF16.StringComparator(true, false, 0); + private static final Comparator nfkdComparator = + new Comparator() { + @Override + public int compare(String o1, String o2) { + if (!nfkd.isNormalized(o1) || !nfkd.isNormalized(o2)) { + final String s1 = nfkd.normalize(o1); + final String s2 = nfkd.normalize(o2); + final int result = codepointComparator.compare(s1, s2); + if (result != 0) { + return result; // otherwise fall through to codepoint comparator + } + } + return 
codepointComparator.compare(o1, o2); } - } - return codepointComparator.compare(o1, o2); - } - }; + }; - private static final UnicodeMap bestStrokesS = new UnicodeMap(); - private static final UnicodeMap bestStrokesT = new UnicodeMap(); + private static final UnicodeMap bestStrokesS = new UnicodeMap(); + private static final UnicodeMap bestStrokesT = new UnicodeMap(); private static final UnicodeMap kTotalStrokes = IUP.load(UcdProperty.kTotalStrokes); private static final Splitter ONBAR = Splitter.on('|').trimResults(); private static final Splitter ONCOMMA = Splitter.on(',').trimResults(); @@ -156,39 +175,48 @@ public int compare(String o1, String o2) { } } - private static UnicodeMap> bihuaData = new UnicodeMap>(); + private static UnicodeMap> bihuaData = + new UnicodeMap>(); - private static final UnicodeMap kRSUnicode = IUP.load(UcdProperty.kRSUnicode).cloneAsThawed(); - private static final UnicodeMap kSimplifiedVariant = IUP.load(UcdProperty.kSimplifiedVariant); - private static final UnicodeMap kTraditionalVariant = IUP.load(UcdProperty.kTraditionalVariant); + private static final UnicodeMap kRSUnicode = + IUP.load(UcdProperty.kRSUnicode).cloneAsThawed(); + private static final UnicodeMap kSimplifiedVariant = + IUP.load(UcdProperty.kSimplifiedVariant); + private static final UnicodeMap kTraditionalVariant = + IUP.load(UcdProperty.kTraditionalVariant); - private static final UnicodeMap> mergedPinyin = new UnicodeMap>(); - private static final UnicodeSet originalPinyin; + private static final UnicodeMap> mergedPinyin = new UnicodeMap>(); + private static final UnicodeSet originalPinyin; private static final boolean only19 = System.getProperty("only19") != null; - private static UnicodeMap radicalMap = new UnicodeMap(); + private static UnicodeMap radicalMap = new UnicodeMap(); // kHanyuPinyin, space, 10297.260: qīn,qìn,qǐn, // [a-z\x{FC}\x{300}-\x{302}\x{304}\x{308}\x{30C}]+(,[a-z\x{FC}\x{300}-\x{302}\x{304}\x{308}\x{30C}] // kMandarin, space, 
[A-Z\x{308}]+[1-5] // 3475=HAN4 JI2 JIE2 ZHA3 ZI2 // kHanyuPinlu, space, [a-z\x{308}]+[1-5]\([0-9]+\) 4E0A=shang4(12308) // shang5(392) - private static UnicodeMap bestPinyin = new UnicodeMap<>(); + private static UnicodeMap bestPinyin = new UnicodeMap<>(); // while these use NFKD, for the repertoire they apply to it should work. - private static Transform noaccents = Transliterator.getInstance("nfkd; [[:m:]-[\u0308]] remove; nfc"); - - private static UnicodeSet INITIALS = new UnicodeSet("[b c {ch} d f g h j k l m n p q r s {sh} t w x y z {zh}]").freeze(); - private static UnicodeSet FINALS = new UnicodeSet( - "[a {ai} {an} {ang} {ao} e {ei} {en} {eng} {er} i {ia} {ian} {iang} {iao} {ie} {in} {ing} {iong} {iu} o {ong} {ou} u {ua} {uai} {uan} {uang} {ue} {ui} {un} {uo} ü {üe}]") - .freeze(); + private static Transform noaccents = + Transliterator.getInstance("nfkd; [[:m:]-[\u0308]] remove; nfc"); + + private static UnicodeSet INITIALS = + new UnicodeSet("[b c {ch} d f g h j k l m n p q r s {sh} t w x y z {zh}]").freeze(); + private static UnicodeSet FINALS = + new UnicodeSet( + "[a {ai} {an} {ang} {ao} e {ei} {en} {eng} {er} i {ia} {ian} {iang} {iao} {ie} {in} {ing} {iong} {iu} o {ong} {ou} u {ua} {uai} {uan} {uang} {ue} {ui} {un} {uo} ü {üe}]") + .freeze(); private static final int NO_STROKE_INFO = Integer.MAX_VALUE; // We need to quote at least the collation syntax characters, see // http://www.unicode.org/reports/tr35/tr35-collation.html#Rules - private static UnicodeSet NEEDSQUOTE = new UnicodeSet("[_[:pattern_syntax:][:pattern_whitespace:]]").freeze(); + private static UnicodeSet NEEDSQUOTE = + new UnicodeSet("[_[:pattern_syntax:][:pattern_whitespace:]]").freeze(); - private static final XEquivalenceClass variantEquivalents = new XEquivalenceClass(); + private static final XEquivalenceClass variantEquivalents = + new XEquivalenceClass(); private static final String INDENT = " "; private static final UnicodeMap kMandarin = IUP.load(UcdProperty.kMandarin); @@ 
-197,12 +225,14 @@ public int compare(String o1, String o2) { new BihuaReader().process(GenerateUnihanCollators.class, "bihua-chinese-sorting.txt"); getBestStrokes(); - new PatchStrokeReader(bestStrokesS, Pattern.compile("\\t\\s*")).process(GenerateUnihanCollators.class, "ucs-strokes-ext-e.txt"); + new PatchStrokeReader(bestStrokesS, Pattern.compile("\\t\\s*")) + .process(GenerateUnihanCollators.class, "ucs-strokes-ext-e.txt"); RsInfo.addToStrokeInfo(bestStrokesS, true); bestStrokesT.putAll(bestStrokesS); // patch the values for the T strokes - new PatchStrokeReader(bestStrokesT, SemiFileReader.SPLIT).process(GenerateUnihanCollators.class, "patchStrokeT.txt"); + new PatchStrokeReader(bestStrokesT, SemiFileReader.SPLIT) + .process(GenerateUnihanCollators.class, "patchStrokeT.txt"); RsInfo.addToStrokeInfo(bestStrokesT, false); new MyFileReader().process(GenerateUnihanCollators.class, "CJK_Radicals.csv"); @@ -255,10 +285,9 @@ public int compare(String o1, String o2) { // only // for // medial - //source = source.replaceAll("\\s*(\\d{5}\\.\\d{2}0,)*\\d{5}\\.\\d{2}0:", ","); + // source = source.replaceAll("\\s*(\\d{5}\\.\\d{2}0,)*\\d{5}\\.\\d{2}0:", ","); addAllKeepingOld(s, original, PinyinSource.p, ONCOMMA.split(source)); } - } originalPinyin = mergedPinyin.keySet().freeze(); @@ -275,10 +304,10 @@ public int compare(String o1, String o2) { addEquivalents(kSimplifiedVariant); count += addPinyinFromVariants("STVariants", count); - //count += showAdded("kTraditionalVariant", count); + // count += showAdded("kTraditionalVariant", count); - //addVariants("kSimplifiedVariant", kSimplifiedVariant); - //count += showAdded("kSimplifiedVariant", count); + // addVariants("kSimplifiedVariant", kSimplifiedVariant); + // count += showAdded("kSimplifiedVariant", count); new PatchPinyinReader().process(GenerateUnihanCollators.class, "patchPinyin.txt"); @@ -296,21 +325,21 @@ public static void main(String[] args) throws Exception { final UnicodeSet zh = 
FindHanSizes.getMostFrequent("zh", 0.999); final UnicodeSet zh_Hant = FindHanSizes.getMostFrequent("zh_Hant", 0.999); - //Matcher charsetMatcher = Pattern.compile("GB2312|GBK|Big5|Big5-HKSCS").matcher(""); + // Matcher charsetMatcher = Pattern.compile("GB2312|GBK|Big5|Big5-HKSCS").matcher(""); final UnicodeSet GB2312 = FindHanSizes.getCharsetRepertoire("GB2312"); final UnicodeSet GBK = FindHanSizes.getCharsetRepertoire("GBK"); final UnicodeSet Big5 = FindHanSizes.getCharsetRepertoire("Big5"); final UnicodeSet Big5_HKSCS = FindHanSizes.getCharsetRepertoire("Big5-HKSCS"); final UnicodeSet shortPinyin = new UnicodeSet(zh).addAll(GB2312).addAll(GBK); - final UnicodeSet shortStroke = new UnicodeSet(shortPinyin).addAll(zh_Hant).addAll(Big5).addAll(Big5_HKSCS); - + final UnicodeSet shortStroke = + new UnicodeSet(shortPinyin).addAll(zh_Hant).addAll(Big5).addAll(Big5_HKSCS); showSorting(RSComparator, kRSUnicode, "unihan", InfoType.radicalStroke); testSorting(RSComparator, kRSUnicode, "unihan"); writeAndTest(shortPinyin, PinyinComparator, bestPinyin, "pinyin", InfoType.pinyin); - //writeAndTest(shortStroke, SStrokeComparator, bestStrokesS, "stroke", InfoType.stroke); + // writeAndTest(shortStroke, SStrokeComparator, bestStrokesS, "stroke", InfoType.stroke); writeAndTest(shortStroke, TStrokeComparator, bestStrokesT, "strokeT", InfoType.stroke); for (final Entry> entry : indexValues.keyValuesSet()) { @@ -324,37 +353,36 @@ public static void main(String[] args) throws Exception { writeUnihanFields(bestStrokesS, bestStrokesT, null, SStrokeComparator, "kTotalStrokes"); // showSorting(PinyinComparator, bestPinyin, "pinyin"); - // UnicodeMap shortPinyinMap = new UnicodeMap().putAllFiltered(bestPinyin, shortPinyin); + // UnicodeMap shortPinyinMap = new + // UnicodeMap().putAllFiltered(bestPinyin, shortPinyin); // System.out.println("stroke_pinyin base size:\t" + shortPinyinMap.size()); // showSorting(PinyinComparator, shortPinyinMap, "pinyin_short"); // testSorting(PinyinComparator, 
bestPinyin, "pinyin"); // testSorting(PinyinComparator, shortPinyinMap, "pinyin_short"); - // showSorting(TStrokeComparator, bestStrokesT, "strokeT"); - // UnicodeMap shortStrokeMapT = new UnicodeMap().putAllFiltered(bestStrokesT, shortStroke); + // UnicodeMap shortStrokeMapT = new + // UnicodeMap().putAllFiltered(bestStrokesT, shortStroke); // System.out.println("Tstroke_stroke base size:\t" + shortStrokeMapT.size()); // showSorting(TStrokeComparator, shortStrokeMapT, "stroke_shortT"); // testSorting(TStrokeComparator, bestStrokesT, "strokeT"); // testSorting(TStrokeComparator, shortStrokeMapT, "stroke_shortT"); - - //showSorting(PinyinComparator, bestPinyin, "pinyinCollationInterleaved", true, FileType.TXT); - + // showSorting(PinyinComparator, bestPinyin, "pinyinCollationInterleaved", true, + // FileType.TXT); showTranslit("Han-Latin"); showBackgroundData(); - System.out.println("TODO: test the translit"); getIndexChars(); } /** - * U+3400 kMandarin QIU1 - * U+3400 kTotalStrokes 5 + * U+3400 kMandarin QIU1 U+3400 kTotalStrokes 5 + * * @param * @param * @param simplified @@ -363,8 +391,15 @@ public static void main(String[] args) throws Exception { * @param comp * @param filename */ - private static void writeUnihanFields(UnicodeMap simplified, UnicodeMap traditional, UnicodeMap other, Comparator comp, String filename) { - final PrintWriter out = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, filename + ".txt", null); + private static void writeUnihanFields( + UnicodeMap simplified, + UnicodeMap traditional, + UnicodeMap other, + Comparator comp, + String filename) { + final PrintWriter out = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, filename + ".txt", null); final UnicodeSet keys = new UnicodeSet(simplified.keySet()).addAll(traditional.keySet()); final Set sorted = new TreeSet(comp); UnicodeSet.addAllTo(keys, sorted); @@ -385,8 +420,8 @@ private static void writeUnihanFields(UnicodeMap simplified, UnicodeMa 
// do nothing } else if (commentSource instanceof Set) { @SuppressWarnings("unchecked") - final - LinkedHashSet temp = new LinkedHashSet((Set)commentSource); + final LinkedHashSet temp = + new LinkedHashSet((Set) commentSource); temp.remove(simp); temp.remove(trad); comments = CollectionUtilities.join(temp, " "); @@ -396,12 +431,27 @@ private static void writeUnihanFields(UnicodeMap simplified, UnicodeMa comments = ""; } } - out.println("U+" + Utility.hex(s) + "\t" + filename + "\t" + item + "\t# " + s + (comments.isEmpty() ? "" : "\t" + comments)); + out.println( + "U+" + + Utility.hex(s) + + "\t" + + filename + + "\t" + + item + + "\t# " + + s + + (comments.isEmpty() ? "" : "\t" + comments)); } out.close(); } - private static void writeAndTest(UnicodeSet shortStroke, Comparator comparator2, UnicodeMap unicodeMap2, String title2, InfoType infoType) throws Exception { + private static void writeAndTest( + UnicodeSet shortStroke, + Comparator comparator2, + UnicodeMap unicodeMap2, + String title2, + InfoType infoType) + throws Exception { showSorting(comparator2, unicodeMap2, title2, infoType); testSorting(comparator2, unicodeMap2, title2); final UnicodeMap shortMap = new UnicodeMap().putAllFiltered(unicodeMap2, shortStroke); @@ -411,24 +461,25 @@ private static void writeAndTest(UnicodeSet shortStroke, Comparator } private static void showOldData(Collator collator, String name, boolean japanese) { - final PrintWriter out = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, name, null); + final PrintWriter out = + Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, name, null); final UnicodeSet tailored = collator.getTailoredSet(); final TreeSet sorted = new TreeSet(collator); for (final String s : tailored) { sorted.add(nfc.normalize(s)); } - final UnicodeMap kJapaneseKun = IUP.load(UcdProperty.kJapaneseKun); - final UnicodeMap kJapaneseOn = IUP.load(UcdProperty.kJapaneseOn); + final UnicodeMap kJapaneseKun = 
IUP.load(UcdProperty.kJapaneseKun); + final UnicodeMap kJapaneseOn = IUP.load(UcdProperty.kJapaneseOn); final StringBuilder buffer = new StringBuilder(); out.println("#char; strokes; radical; rem-strokes; reading"); for (final String item : sorted) { buffer.append("<").append(item).append("\t#"); final String code = Utility.hex(item); - buffer.append(pad(code,6)).append(";\t"); + buffer.append(pad(code, 6)).append(";\t"); int strokes = CldrUtility.ifNull(bestStrokesS.get(item), 0); - buffer.append(pad(String.valueOf(strokes),3)).append(";\t"); + buffer.append(pad(String.valueOf(strokes), 3)).append(";\t"); int data = getRSShortData(item.codePointAt(0)); String radical = null; @@ -437,14 +488,14 @@ private static void showOldData(Collator collator, String name, boolean japanese radical = radicalStroke.getRadicalStringFromShortData(data); remainingStrokes = RadicalStroke.getResidualStrokesFromShortData(data) + ""; } - buffer.append(pad(radical,4)).append(";\t"); - buffer.append(pad(remainingStrokes,2)).append(";\t"); + buffer.append(pad(radical, 4)).append(";\t"); + buffer.append(pad(remainingStrokes, 2)).append(";\t"); if (japanese) { final String reading = kJapaneseKun.get(item); final String reading2 = kJapaneseOn.get(item); - buffer.append(pad(reading,1)).append(";\t"); - buffer.append(pad(reading2,1)).append(";\t"); + buffer.append(pad(reading, 1)).append(";\t"); + buffer.append(pad(reading2, 1)).append(";\t"); } else { final Set pinyins = mergedPinyin.get(item); if (pinyins != null) { @@ -503,19 +554,32 @@ private static void getIndexChars() { if (pinyin.equals(lastPinyin)) { count++; } else { - if (DEBUG) System.out.println("\t" + count + "\t" - + (progressive.get(lastPinyin) / (double) counter.get(lastPinyin))); + if (DEBUG) + System.out.println( + "\t" + + count + + "\t" + + (progressive.get(lastPinyin) + / (double) counter.get(lastPinyin))); count = 1; lastPinyin = pinyin; System.out.print(s + "\t" + pinyin + "\t"); } } - if (DEBUG) System.out.println("\t" 
+ count + "\t" - + (progressive.get(lastPinyin) / (double) counter.get(lastPinyin))); + if (DEBUG) + System.out.println( + "\t" + + count + + "\t" + + (progressive.get(lastPinyin) / (double) counter.get(lastPinyin))); } private static void getBestStrokes() { - final PrintWriter out = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, "kTotalStrokesReplacements.txt", null); + final PrintWriter out = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, + "kTotalStrokesReplacements.txt", + null); out.println("#Code\tkTotalStrokes\tValue\t#\tChar\tUnihan"); @@ -529,16 +593,26 @@ private static void getBestStrokes() { bestStrokesS.put(s, unihanStrokes); } if (bihuaStrokes != NO_STROKE_INFO && bihuaStrokes != unihanStrokes) { - out.println("U+" + Utility.hex(s) + "\tkTotalStrokes\t" + bihuaStrokes + "\t#\t" + s + "\t" + unihanStrokes); + out.println( + "U+" + + Utility.hex(s) + + "\tkTotalStrokes\t" + + bihuaStrokes + + "\t#\t" + + s + + "\t" + + unihanStrokes); } } out.close(); - new PatchStrokeReader(bestStrokesS, SemiFileReader.SPLIT).process(GenerateUnihanCollators.class, "patchStroke.txt"); + new PatchStrokeReader(bestStrokesS, SemiFileReader.SPLIT) + .process(GenerateUnihanCollators.class, "patchStroke.txt"); } private static void closeUnderNFKD(String title, UnicodeMap mapping) { - // UnicodeSet possibles = new UnicodeSet(NOT_NFKD).removeAll(NOT_NFD).removeAll(mapping.keySet()); + // UnicodeSet possibles = new + // UnicodeSet(NOT_NFKD).removeAll(NOT_NFD).removeAll(mapping.keySet()); // if (!possibles.contains("㊀")) { // System.out.println("??"); // } @@ -553,7 +627,8 @@ private static void closeUnderNFKD(String title, UnicodeMap mapping) { // continue; // } // mapping.put(s, value); - // System.out.println("*** " + title + " Closing " + s + " => " + kd + "; " + value); + // System.out.println("*** " + title + " Closing " + s + " => " + kd + "; " + + // value); // } final UnicodeSet extras = new 
UnicodeSet(NOT_NFKD).retainAll(mapping.keySet()); if (extras.size() != 0) { @@ -564,29 +639,39 @@ private static void closeUnderNFKD(String title, UnicodeMap mapping) { } private static void showBackgroundData() throws IOException { - final PrintWriter out = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, "backgroundCjkData.txt", null); - final UnicodeSet all = new UnicodeSet(bihuaData.keySet()); // .addAll(allPinyin.keySet()).addAll(kRSUnicode.keySet()); - final Comparator> comparator = new Comparator>() { - @Override - public int compare(R4 o1, R4 o2) { - int result = o1.get0().compareTo(o2.get0()); - if (result != 0) { - return result; - } - result = pinyinSort.compare(o1.get1(), o2.get1()); - if (result != 0) { - return result; - } - result = o1.get2().compareTo(o2.get2()); - if (result != 0) { - return result; - } - result = o1.get3().compareTo(o2.get3()); - return result; - } - - }; - final Set> items = new TreeSet>(comparator); + final PrintWriter out = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, + "backgroundCjkData.txt", + null); + final UnicodeSet all = + new UnicodeSet( + bihuaData + .keySet()); // .addAll(allPinyin.keySet()).addAll(kRSUnicode.keySet()); + final Comparator> comparator = + new Comparator>() { + @Override + public int compare( + R4 o1, + R4 o2) { + int result = o1.get0().compareTo(o2.get0()); + if (result != 0) { + return result; + } + result = pinyinSort.compare(o1.get1(), o2.get1()); + if (result != 0) { + return result; + } + result = o1.get2().compareTo(o2.get2()); + if (result != 0) { + return result; + } + result = o1.get3().compareTo(o2.get3()); + return result; + } + }; + final Set> items = + new TreeSet>(comparator); for (final String s : all) { final R2 bihua = bihuaData.get(s); final int bihuaStrokes = bihua == null ? 
0 : bihua.get1().length(); @@ -595,7 +680,8 @@ public int compare(R4 o1, R4 o1, R4 item : items) { out.println(item.get3()); @@ -617,14 +721,17 @@ public int compare(R4 o1, R4 and it cannot also be a Comparator. + // Note that Collator is a Comparator and it cannot also be a Comparator. private static final class CollatorWithTieBreaker implements Comparator { private final Collator coll; private final Comparator tieBreaker; + CollatorWithTieBreaker(Collator c, Comparator tb) { coll = c; tieBreaker = tb; } + public int compare(String left, String right) { int result = coll.compare(left, right); if (result != 0) { @@ -634,9 +741,16 @@ public int compare(String left, String right) { } } - private static void testSorting(Comparator oldComparator, UnicodeMap krsunicode2, String filename) throws Exception { + private static void testSorting( + Comparator oldComparator, UnicodeMap krsunicode2, String filename) + throws Exception { final List temp = krsunicode2.keySet().addAllTo(new ArrayList()); - final String rules = getFileAsString(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY + File.separatorChar + filename + ".txt"); + final String rules = + getFileAsString( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY + + File.separatorChar + + filename + + ".txt"); // The rules contain \uFDD0 and such and must be unescaped for the RuleBasedCollator. 
final Collator coll = new RuleBasedCollator(com.ibm.icu.impl.Utility.unescape(rules)); @@ -644,7 +758,8 @@ private static void testSorting(Comparator oldComparator, UnicodeMap final List ruleSorted = sortList(collator, temp); @SuppressWarnings("unchecked") - final Comparator oldCollator = new MultiComparator(oldComparator, codepointComparator); + final Comparator oldCollator = + new MultiComparator(oldComparator, codepointComparator); final List originalSorted = sortList(oldCollator, temp); int badItems = 0; final int min = Math.min(originalSorted.size(), ruleSorted.size()); @@ -663,7 +778,8 @@ private static void testSorting(Comparator oldComparator, UnicodeMap final int bCount = differ.getBCount(); if (aCount != 0 || bCount != 0) { badItems += aCount + bCount; - System.out.println(aline(krsunicode2, differ, -1) + "\t" + bline(krsunicode2, differ, -1)); + System.out.println( + aline(krsunicode2, differ, -1) + "\t" + bline(krsunicode2, differ, -1)); if (aCount != 0) { for (int i = 0; i < aCount; ++i) { @@ -675,7 +791,10 @@ private static void testSorting(Comparator oldComparator, UnicodeMap System.out.println("\t\t\t\t\t\t" + bline(krsunicode2, differ, i)); } } - System.out.println(aline(krsunicode2, differ, aCount) + "\t " + bline(krsunicode2, differ, bCount)); + System.out.println( + aline(krsunicode2, differ, aCount) + + "\t " + + bline(krsunicode2, differ, bCount)); System.out.println("-----"); } @@ -692,7 +811,15 @@ private static void testSorting(Comparator oldComparator, UnicodeMap private static String aline(UnicodeMap krsunicode2, Differ differ, int i) { final String item = differ.getA(i); try { - return "unihan: " + differ.getALine(i) + " " + item + " [" + Utility.hex(item) + "/" + krsunicode2.get(item) + "]"; + return "unihan: " + + differ.getALine(i) + + " " + + item + + " [" + + Utility.hex(item) + + "/" + + krsunicode2.get(item) + + "]"; } catch (final RuntimeException e) { throw e; } @@ -700,7 +827,15 @@ private static String aline(UnicodeMap 
krsunicode2, Differ differ private static String bline(UnicodeMap krsunicode2, Differ differ, int i) { final String item = differ.getB(i); - return "rules: " + differ.getBLine(i) + " " + item + " [" + Utility.hex(item) + "/" + krsunicode2.get(item) + "]"; + return "rules: " + + differ.getBLine(i) + + " " + + item + + " [" + + Utility.hex(item) + + "/" + + krsunicode2.get(item) + + "]"; } private static List sortList(Comparator collator, List temp) { @@ -709,13 +844,15 @@ private static List sortList(Comparator collator, List t return Arrays.asList(ruleSorted); } - // private static String getFileAsString(Class relativeToClass, String filename) throws IOException { + // private static String getFileAsString(Class relativeToClass, String + // filename) throws IOException { // final BufferedReader in = FileUtilities.openFile(relativeToClass, filename); // ... same as the version below // } private static String getFileAsString(String filename) throws IOException { - final InputStreamReader reader = new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8); + final InputStreamReader reader = + new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8); final BufferedReader in = new BufferedReader(reader, 1024 * 64); final StringBuilder builder = new StringBuilder(); while (true) { @@ -730,8 +867,12 @@ private static String getFileAsString(String filename) throws IOException { } private static void showTranslit(String filename) { - final PrintWriter out = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, filename + ".txt", null); - final PrintWriter out2 = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, filename + ".xml", null); + final PrintWriter out = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, filename + ".txt", null); + final PrintWriter out2 = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, filename + ".xml", null); final TreeSet 
s = new TreeSet(pinyinSort); s.addAll(bestPinyin.getAvailableValues()); @@ -747,7 +888,7 @@ private static void showTranslit(String filename) { private static class RsInfo { public static void addToStrokeInfo(UnicodeMap bestStrokesIn, boolean simplified) { - final int[] mainStrokes = new int[256]; + final int[] mainStrokes = new int[256]; final int[] alternateStrokes = new int[256]; final Counter mainStrokesTotal = new Counter(); @@ -762,7 +903,8 @@ public static void addToStrokeInfo(UnicodeMap bestStrokesIn, boolean si continue; } int radical = RadicalStroke.getRadicalNumberFromShortData(data); - final int radicalsStrokes = bestStrokeInfo - RadicalStroke.getResidualStrokesFromShortData(data); + final int radicalsStrokes = + bestStrokeInfo - RadicalStroke.getResidualStrokesFromShortData(data); if (!RadicalStroke.isSimplifiedFromShortData(data)) { mainStrokesTotal.add(radical, radicalsStrokes); mainCount.add(radical, 1); @@ -773,24 +915,41 @@ public static void addToStrokeInfo(UnicodeMap bestStrokesIn, boolean si } // compute averages. Lame, but the best we have for now. for (final int key : mainStrokesTotal.keySet()) { - mainStrokes[key] = (int) Math.round(mainStrokesTotal.get(key) / (double) mainCount.get(key)); + mainStrokes[key] = + (int) Math.round(mainStrokesTotal.get(key) / (double) mainCount.get(key)); if (DEBUG) System.out.println("radical " + key + "\t" + mainStrokes[key]); } for (final int key : alternateStrokesTotal.keySet()) { - alternateStrokes[key] = (int) Math.round(alternateStrokesTotal.get(key) / (double) alternateCount.get(key)); + alternateStrokes[key] = + (int) + Math.round( + alternateStrokesTotal.get(key) + / (double) alternateCount.get(key)); if (DEBUG) System.out.println("radical' " + key + "\t" + alternateStrokes[key]); } - final PrintWriter out = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, "imputedStrokes" + (simplified ? 
"" : "T") + - ".txt", null); - for (final String s : new UnicodeSet(kRSUnicode.keySet()).removeAll(bestStrokesIn.keySet())) { + final PrintWriter out = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, + "imputedStrokes" + (simplified ? "" : "T") + ".txt", + null); + for (final String s : + new UnicodeSet(kRSUnicode.keySet()).removeAll(bestStrokesIn.keySet())) { int c = s.codePointAt(0); int data = getRSShortData(c); int radical = RadicalStroke.getRadicalNumberFromShortData(data); - final int computedStrokes = RadicalStroke.getResidualStrokesFromShortData(data) + - (RadicalStroke.isSimplifiedFromShortData(data) ? - alternateStrokes[radical] : mainStrokes[radical]); + final int computedStrokes = + RadicalStroke.getResidualStrokesFromShortData(data) + + (RadicalStroke.isSimplifiedFromShortData(data) + ? alternateStrokes[radical] + : mainStrokes[radical]); bestStrokesIn.put(s, computedStrokes); - out.println("U+" + Utility.hex(s) + "\tkImputedStrokes\t" + computedStrokes + "\t#\t" + s); + out.println( + "U+" + + Utility.hex(s) + + "\tkImputedStrokes\t" + + computedStrokes + + "\t#\t" + + s); } closeUnderNFKD("Strokes", bestStrokesIn); bestStrokesIn.freeze(); @@ -809,7 +968,7 @@ private static int getRSShortData(int c) { return 0; } c = radical.codePointAt(0); - assert radical.length() == Character.charCount(c); // single code point + assert radical.length() == Character.charCount(c); // single code point data = radicalStroke.getShortData(c); assert data != 0; return data; @@ -829,10 +988,10 @@ private static long getRSLongOrder(int c) { String radical = radicalMap.get(c); if (radical == null) { // Not an ideograph, sort higher than any of them. 
- return ((long)Integer.MAX_VALUE << 32) | c; + return ((long) Integer.MAX_VALUE << 32) | c; } c = radical.codePointAt(0); - assert radical.length() == Character.charCount(c); // single code point + assert radical.length() == Character.charCount(c); // single code point order = radicalStroke.getLongOrder(c); assert order != 0; return order; @@ -842,19 +1001,27 @@ private static long getRSLongOrder(int c) { order = radicalStroke.getLongOrder(c); if (order == 0) { // Not an ideograph, sort higher than any of them. - order = ((long)Integer.MAX_VALUE << 32) | c; + order = ((long) Integer.MAX_VALUE << 32) | c; } return order; } - private static void showSorting(Comparator comparator, UnicodeMap unicodeMap, String filename, InfoType infoType) { + private static void showSorting( + Comparator comparator, + UnicodeMap unicodeMap, + String filename, + InfoType infoType) { showSorting(comparator, unicodeMap, filename, FileType.txt, infoType); showSorting(comparator, unicodeMap, filename, FileType.xml, infoType); } @SuppressWarnings("resource") - private static void showSorting(Comparator comparator, UnicodeMap unicodeMap, String filename, - FileType fileType, InfoType infoType) { + private static void showSorting( + Comparator comparator, + UnicodeMap unicodeMap, + String filename, + FileType fileType, + InfoType infoType) { // special capture for Pinyin buckets final boolean isPinyin = filename.startsWith("pinyin") && fileType == FileType.xml; int alpha = 'a'; @@ -862,7 +1029,11 @@ private static void showSorting(Comparator comparator, UnicodeMap final StringBuilder pinyinIndexBuffer = new StringBuilder("\"\u0101"); final UnicodeSet accumulated = new UnicodeSet(); - PrintWriter out = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, filename + "." + fileType, null); + PrintWriter out = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, + filename + "." 
+ fileType, + null); final TreeSet rsSorted = new TreeSet(comparator); final StringBuilder buffer = new StringBuilder(); for (final String s : unicodeMap) { @@ -889,7 +1060,8 @@ private static void showSorting(Comparator comparator, UnicodeMap // +" \n" // +" \n" // ); - // FileUtilities.appendFile(GenerateUnihanCollators.class, "pinyinHeader.txt", out); + // FileUtilities.appendFile(GenerateUnihanCollators.class, + // "pinyinHeader.txt", out); out.println("\t\t\t\t"); } S oldValue = null; @@ -906,18 +1078,43 @@ private static void showSorting(Comparator comparator, UnicodeMap // show other characters if (buffer.codePointCount(0, buffer.length()) < 128) { if (fileType == FileType.txt) { - out.println(INDENT + "<*" + sortingQuote(buffer.toString(), accumulated) + " # " + sortingQuote(oldValue, accumulated)); + out.println( + INDENT + + "<*" + + sortingQuote(buffer.toString(), accumulated) + + " # " + + sortingQuote(oldValue, accumulated)); } else { - out.println(" " + buffer + ""); + out.println( + " " + + buffer + + ""); } } else { int count = 1; while (buffer.length() > 0) { final String temp = extractFirst(buffer, 128); if (fileType == FileType.txt) { - out.println(INDENT + "<*" + sortingQuote(temp.toString(), accumulated) + " # " + sortingQuote(oldValue, accumulated) + " (p" + count++ + ")"); + out.println( + INDENT + + "<*" + + sortingQuote(temp.toString(), accumulated) + + " # " + + sortingQuote(oldValue, accumulated) + + " (p" + + count++ + + ")"); } else { - out.println(" " + temp + ""); + out.println( + " " + + temp + + ""); } } } @@ -940,9 +1137,18 @@ private static void showSorting(Comparator comparator, UnicodeMap alpha++; } // "\u516B", // B - pinyinBuffer.append("\"" + hexConstant(s) + "\", " + - "// " + UTF16.valueOf(alpha) + " : " + s + " [" + pinyinValue + "]\n"); - pinyinIndexBuffer.append(hexConstant(pinyinValue.substring(0,1))); + pinyinBuffer.append( + "\"" + + hexConstant(s) + + "\", " + + "// " + + UTF16.valueOf(alpha) + + " : " + + s + + " [" 
+ + pinyinValue + + "]\n"); + pinyinIndexBuffer.append(hexConstant(pinyinValue.substring(0, 1))); } } } @@ -951,7 +1157,12 @@ private static void showSorting(Comparator comparator, UnicodeMap if (oldValue != null) { if (fileType == FileType.txt) { - out.println(INDENT + "<*" + sortingQuote(buffer.toString(), accumulated) + " # " + sortingQuote(oldValue, accumulated)); + out.println( + INDENT + + "<*" + + sortingQuote(buffer.toString(), accumulated) + + " # " + + sortingQuote(oldValue, accumulated)); } else { out.println(" " + buffer + ""); } @@ -970,15 +1181,19 @@ private static void showSorting(Comparator comparator, UnicodeMap for (final String s : sorted) { // decomposable, but not tailored final String kd = nfkd.normalize(s.codePointAt(0)); - if (!tailored.containsSome(kd)) - { + if (!tailored.containsSome(kd)) { continue; // the decomp has to contain at least one tailored } // if (tailored.containsAll(kd)) // continue; //already have it // character if (fileType == FileType.txt) { - out.println(INDENT + "&" + sortingQuote(kd, accumulated) + "<<<" + sortingQuote(s, accumulated)); + out.println( + INDENT + + "&" + + sortingQuote(kd, accumulated) + + "<<<" + + sortingQuote(s, accumulated)); } else { out.println(" " + kd + ""); out.println(" " + s + ""); @@ -994,11 +1209,19 @@ private static void showSorting(Comparator comparator, UnicodeMap // } out.close(); - out = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, filename + "_repertoire.txt", null); + out = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, + filename + "_repertoire.txt", + null); out.println(accumulated.toPattern(false)); out.close(); if (isPinyin) { - out = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, filename + "_buckets.txt", null); + out = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, + filename + "_buckets.txt", + null); pinyinIndexBuffer.append('"'); out.println(pinyinIndexBuffer); 
out.println(pinyinBuffer); @@ -1006,7 +1229,8 @@ private static void showSorting(Comparator comparator, UnicodeMap } } - private static void showIndexValue(FileType fileType, PrintWriter out, Output comment, String indexValue) { + private static void showIndexValue( + FileType fileType, PrintWriter out, Output comment, String indexValue) { if (fileType == FileType.txt) { out.println(INDENT + "<'" + hexConstant(indexValue) + "' # INDEX " + comment); } else { @@ -1016,6 +1240,7 @@ private static void showIndexValue(FileType fileType, PrintWriter out, Outpu /** * Hex format by code unit. + * * @param s * @return */ @@ -1056,9 +1281,9 @@ private static String sortingQuote(T input, UnicodeSet accumulated) { } private static boolean equals(Object newValue, Object oldValue) { - return newValue == null ? oldValue == null - : oldValue == null ? false - : newValue.equals(oldValue); + return newValue == null + ? oldValue == null + : oldValue == null ? false : newValue.equals(oldValue); } private static int showAdded(String title, int difference) { @@ -1076,11 +1301,19 @@ private static void addBihua() { } private static void printExtraPinyinForUnihan() { - try ( - PrintWriter out = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, "kMandarinAdditions.txt", null); - PrintWriter overrideOut = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, "kMandarinOverride.txt", null);) { + try (PrintWriter out = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, + "kMandarinAdditions.txt", + null); + PrintWriter overrideOut = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, + "kMandarinOverride.txt", + null); ) { final String header = "#Code\t“Best”\tValue\t#\tChar"; - String description = "# the format is like Unihan, with “kMandarin” being the field, and the value being a possible replacement for what is there."; + String description = + "# the format is like Unihan, with “kMandarin” being the 
field, and the value being a possible replacement for what is there."; out.println(header + "\n" + description); overrideOut.println(header + "\tkMandarin\n" + description); @@ -1088,8 +1321,15 @@ private static void printExtraPinyinForUnihan() { final String bestValue = bestPinyin.get(s); final String kMandarinString = kMandarin.get(s); if (kMandarinString == null) { - final String bestValueNumeric = toNumericPinyin.transform(bestValue).toUpperCase(); - out.println("U+" + Utility.hex(s) + "\tkMandarin\t" + bestValueNumeric + "\t#\t" + s); + final String bestValueNumeric = + toNumericPinyin.transform(bestValue).toUpperCase(); + out.println( + "U+" + + Utility.hex(s) + + "\tkMandarin\t" + + bestValueNumeric + + "\t#\t" + + s); continue; } @@ -1098,42 +1338,76 @@ private static void printExtraPinyinForUnihan() { continue; } final String bestValueNumeric = toNumericPinyin.transform(bestValue).toUpperCase(); - overrideOut.println("U+" + Utility.hex(s) + "\tkMandarin\t" + bestValueNumeric + "\t#\t" + s + "\t" + kMandarinString); + overrideOut.println( + "U+" + + Utility.hex(s) + + "\tkMandarin\t" + + bestValueNumeric + + "\t#\t" + + s + + "\t" + + kMandarinString); } } } private static void printExtraStrokesForUnihan() { - try ( - PrintWriter out = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, "kTotalStrokesAdditions.txt", null); - PrintWriter overrideOut = Utility.openPrintWriter(GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, "kTotalStrokesOverride.txt", null);) { + try (PrintWriter out = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, + "kTotalStrokesAdditions.txt", + null); + PrintWriter overrideOut = + Utility.openPrintWriter( + GenerateUnihanCollatorFiles.OUTPUT_DIRECTORY, + "kTotalStrokesOverride.txt", + null); ) { final String header = "#Code\t“Best”\tValue\t#\tChar"; - String description = "# the format is like Unihan, with “kTotalStrokes” being the field, and the value being a possible replacement for what is 
there."; + String description = + "# the format is like Unihan, with “kTotalStrokes” being the field, and the value being a possible replacement for what is there."; out.println(header + "\n" + description); overrideOut.println(header + "\tkTotalStrokes\n" + description); - UnicodeSet keys = new UnicodeSet(bestStrokesS.keySet()).addAll(bestStrokesT.keySet()).freeze(); + UnicodeSet keys = + new UnicodeSet(bestStrokesS.keySet()).addAll(bestStrokesT.keySet()).freeze(); for (final String s : keys) { Integer bestS = bestStrokesS.get(s); Integer bestT = bestStrokesT.get(s); - String replacement = bestS == null ? bestT.toString() - : bestT == null ? bestS.toString() - : bestS.equals(bestT) ? bestS.toString() - : bestS + "|" + bestT; + String replacement = + bestS == null + ? bestT.toString() + : bestT == null + ? bestS.toString() + : bestS.equals(bestT) + ? bestS.toString() + : bestS + "|" + bestT; final String kTotalStrokesString = kTotalStrokes.get(s); if (kTotalStrokesString == null) { - out.println("U+" + Utility.hex(s) + "\tkTotalStrokes\t" + replacement.replace('|', ' ') + "\t#\t" + s); + out.println( + "U+" + + Utility.hex(s) + + "\tkTotalStrokes\t" + + replacement.replace('|', ' ') + + "\t#\t" + + s); continue; } if (kTotalStrokesString.equals(replacement)) { continue; } - overrideOut.println("U+" + Utility.hex(s) + "\tkTotalStrokes\t" + replacement.replace('|', ' ') + "\t#\t" + s + "\t" + kTotalStrokesString); + overrideOut.println( + "U+" + + Utility.hex(s) + + "\tkTotalStrokes\t" + + replacement.replace('|', ' ') + + "\t#\t" + + s + + "\t" + + kTotalStrokesString); } } } - private static int addPinyinFromVariants(String title, int count) { for (final Set s : variantEquivalents.getEquivalenceSets()) { String hasPinyin = null; @@ -1141,7 +1415,8 @@ private static int addPinyinFromVariants(String title, int count) { for (final Integer cp : s) { final String existing = bestPinyin.get(cp); if (existing != null) { - hasPinyin = existing; // take last one. 
Might be better algorithm, but for now... + hasPinyin = + existing; // take last one. Might be better algorithm, but for now... countHasPinyin++; } } @@ -1182,7 +1457,6 @@ private static void addEquivalents(UnicodeMap variantMap) { } } - private static void addRadicals() { for (final String s : radicalMap.keySet()) { final String main = radicalMap.get(s); @@ -1225,7 +1499,8 @@ private static boolean validPinyin(String pinyin) { return result; } - private static void addAllKeepingOld(String han, String original, PinyinSource pinyin, Iterable pinyinList) { + private static void addAllKeepingOld( + String han, String original, PinyinSource pinyin, Iterable pinyinList) { int count = 0; for (final String source : pinyinList) { if (source.length() == 0) { @@ -1244,7 +1519,15 @@ private static void addPinyin(String title, String han, String source, OverrideI int debug = 0; } if (!validPinyin(source)) { - System.out.println("***Invalid Pinyin - " + title + ": " + han + "\t" + source + "\t" + Utility.hex(han)); + System.out.println( + "***Invalid Pinyin - " + + title + + ": " + + han + + "\t" + + source + + "\t" + + Utility.hex(han)); return; } source = source.intern(); @@ -1252,7 +1535,8 @@ private static void addPinyin(String title, String han, String source, OverrideI if (item == null || override == OverrideItems.keepNew) { if (!source.equals(item)) { if (item != null) { - System.out.println("Overriding Pinyin " + han + "\told: " + item + "\tnew: " + source); + System.out.println( + "Overriding Pinyin " + han + "\told: " + item + "\tnew: " + source); } bestPinyin.put(han, source); } @@ -1266,7 +1550,7 @@ private static void addPinyin(String title, String han, String source, OverrideI private static final class MyFileReader extends SemiFileReader { public final Pattern SPLIT = Pattern.compile("\\s*,\\s*"); - String last = ""; + String last = ""; @Override protected String[] splitLine(String line) { @@ -1276,10 +1560,14 @@ protected String[] splitLine(String line) { 
@Override protected boolean isCodePoint() { return false; - }; + } + ; /** - *
;Radical Number,Status,Unified_Ideo,Hex,Radical,Hex,Name,Conf.Char,Hex,Unified Ideo. has NORemainingStrokes in Unihan
+         *
+         *
+         * 
+         * ;Radical Number,Status,Unified_Ideo,Hex,Radical,Hex,Name,Conf.Char,Hex,Unified Ideo. has NORemainingStrokes in Unihan
          * 
1,Main,一,U+4E00,⼀,U+2F00,ONE *
*/ @@ -1299,14 +1587,16 @@ protected boolean handleLine(int start, int end, String[] items) { radicalMap.put(radical, last); return true; } - }; + } + ; // 吖 ; a ; 1 ; 251432 ; 0x5416 private static final class BihuaReader extends SemiFileReader { @Override protected boolean isCodePoint() { return false; - }; + } + ; Set seen = new HashSet(); @@ -1337,7 +1627,8 @@ protected boolean handleLine(int start, int end, String[] items) { bihuaData.put(character, Row.of(source, charSequence)); return true; } - }; + } + ; private static final class PatchPinyinReader extends SemiFileReader { boolean skip = false; @@ -1345,7 +1636,8 @@ private static final class PatchPinyinReader extends SemiFileReader { @Override protected boolean isCodePoint() { return false; - }; + } + ; @Override protected void processComment(String line, int comment) { @@ -1364,8 +1656,11 @@ protected boolean handleLine(int start, int end, String[] items) { throw new IllegalArgumentException("Non-Unihan character: " + items[0]); } if (!PINYIN_LETTERS.containsAll(items[1])) { - throw new IllegalArgumentException("Non-Pinyin character: " + items[1] - + "; " + new UnicodeSet().addAll(items[1]).removeAll(PINYIN_LETTERS)); + throw new IllegalArgumentException( + "Non-Pinyin character: " + + items[1] + + "; " + + new UnicodeSet().addAll(items[1]).removeAll(PINYIN_LETTERS)); } addPinyin("patchPinyin", items[0], items[1], OverrideItems.keepNew); } @@ -1390,7 +1685,8 @@ protected String[] splitLine(String line) { @Override protected boolean isCodePoint() { return false; - }; + } + ; @Override protected void processComment(String line, int comment) { @@ -1409,13 +1705,14 @@ protected boolean handleLine(int start, int end, String[] items) { codepoint = UTF16.valueOf(Integer.parseInt(codepoint.substring(2), 16)); } if (!UNIHAN.contains(codepoint)) { - throw new IllegalArgumentException("Non-Unihan character: " + codepoint + ", " + Utility.hex(codepoint)); + throw new IllegalArgumentException( + "Non-Unihan character: 
" + codepoint + ", " + Utility.hex(codepoint)); } if (items.length > 1) { String strokeCount = items[1]; int comma = strokeCount.indexOf(','); if (comma >= 0) { - strokeCount = strokeCount.substring(0,comma); + strokeCount = strokeCount.substring(0, comma); } target.put(codepoint, Integer.parseInt(strokeCount)); } @@ -1423,27 +1720,30 @@ protected boolean handleLine(int start, int end, String[] items) { } } - private static Comparator RSComparator = new Comparator() { - @Override - public int compare(String s1, String s2) { - int c1 = s1.codePointAt(0); - assert Character.charCount(c1) == s1.length(); - int c2 = s2.codePointAt(0); - assert Character.charCount(c2) == s2.length(); - long order1 = getRSLongOrder(c1); - long order2 = getRSLongOrder(c2); - if (order1 != order2) { - return order1 < order2 ? -1 : 1; - } - return codepointComparator.compare(s1, s2); - } - }; + private static Comparator RSComparator = + new Comparator() { + @Override + public int compare(String s1, String s2) { + int c1 = s1.codePointAt(0); + assert Character.charCount(c1) == s1.length(); + int c2 = s2.codePointAt(0); + assert Character.charCount(c2) == s2.length(); + long order1 = getRSLongOrder(c1); + long order2 = getRSLongOrder(c2); + if (order1 != order2) { + return order1 < order2 ? 
-1 : 1; + } + return codepointComparator.compare(s1, s2); + } + }; private static class StrokeComparator implements Comparator { final UnicodeMap baseMap; + public StrokeComparator(UnicodeMap baseMap) { this.baseMap = baseMap; } + @Override public int compare(String o1, String o2) { final Integer n1 = getStrokeValue(o1, baseMap); @@ -1473,63 +1773,66 @@ private static Integer getStrokeValue(String o1, UnicodeMap baseMap) { private static Comparator SStrokeComparator = new StrokeComparator(bestStrokesS); private static Comparator TStrokeComparator = new StrokeComparator(bestStrokesT); - private static Comparator PinyinComparator = new Comparator() { + private static Comparator PinyinComparator = + new Comparator() { - @Override - public int compare(String o1, String o2) { - final String s1 = getPinyin(o1); - final String s2 = getPinyin(o2); - if (s1 == null) { - if (s2 != null) { - return 1; + @Override + public int compare(String o1, String o2) { + final String s1 = getPinyin(o1); + final String s2 = getPinyin(o2); + if (s1 == null) { + if (s2 != null) { + return 1; + } + } else if (s2 == null) { + return -1; + } + final int result = pinyinSort.compare(s1, s2); + if (result != 0) { + return result; + } + return SStrokeComparator.compare(o1, o2); } - } else if (s2 == null) { - return -1; - } - final int result = pinyinSort.compare(s1, s2); - if (result != 0) { - return result; - } - return SStrokeComparator.compare(o1, o2); - } - }; + }; public static String getPinyin(String o1) { final int cp1 = o1.codePointAt(0); return bestPinyin.get(cp1); } - private static final Relation indexValues = Relation.of(new EnumMap>(InfoType.class), HashSet.class); + private static final Relation indexValues = + Relation.of(new EnumMap>(InfoType.class), HashSet.class); private static String getIndexValue(InfoType infoType, String s, Output comment) { String rest; switch (infoType) { - case pinyin: - final String str = getPinyin(s).toUpperCase(Locale.ENGLISH); // TODO drop accents - 
final int first = str.charAt(0); - if (first < 0x7F) { - rest = str.substring(0,1); - } else { - rest = nfd.normalize(first).substring(0,1); - } - comment.value = rest; - break; - case radicalStroke: - final int codepoint = s.codePointAt(0); - int data = getRSShortData(codepoint); - if (data == 0) { - throw new IllegalArgumentException("Missing R-S data for U+" + Utility.hex(codepoint)); - } - rest = radicalStroke.getRadicalCharFromShortData(data); - comment.value = radicalStroke.getRadicalStringFromShortData(data); - break; - case stroke: - final Integer strokeCount = getStrokeValue(s, bestStrokesT); - rest = String.valueOf((char)(INDEX_ITEM_BASE + strokeCount)); - comment.value = String.valueOf(strokeCount); - break; - default: - throw new IllegalArgumentException(); + case pinyin: + final String str = getPinyin(s).toUpperCase(Locale.ENGLISH); // TODO drop accents + final int first = str.charAt(0); + if (first < 0x7F) { + rest = str.substring(0, 1); + } else { + rest = nfd.normalize(first).substring(0, 1); + } + comment.value = rest; + break; + case radicalStroke: + final int codepoint = s.codePointAt(0); + int data = getRSShortData(codepoint); + if (data == 0) { + throw new IllegalArgumentException( + "Missing R-S data for U+" + Utility.hex(codepoint)); + } + rest = radicalStroke.getRadicalCharFromShortData(data); + comment.value = radicalStroke.getRadicalStringFromShortData(data); + break; + case stroke: + final Integer strokeCount = getStrokeValue(s, bestStrokesT); + rest = String.valueOf((char) (INDEX_ITEM_BASE + strokeCount)); + comment.value = String.valueOf(strokeCount); + break; + default: + throw new IllegalArgumentException(); } final String result = infoType.base + rest; indexValues.put(infoType, result); diff --git a/unicodetools/src/main/java/org/unicode/draft/GetCurrencies.java b/unicodetools/src/main/java/org/unicode/draft/GetCurrencies.java index 106d360e8..136d48435 100644 --- a/unicodetools/src/main/java/org/unicode/draft/GetCurrencies.java +++ 
b/unicodetools/src/main/java/org/unicode/draft/GetCurrencies.java @@ -1,4 +1,7 @@ package org.unicode.draft; + +import com.ibm.icu.util.Currency; +import com.ibm.icu.util.ULocale; import java.util.Arrays; import java.util.Collections; import java.util.Date; @@ -8,34 +11,38 @@ import java.util.TreeMap; import java.util.TreeSet; -import com.ibm.icu.util.Currency; -import com.ibm.icu.util.ULocale; - - public class GetCurrencies { - enum Type {IN_COUNTRY, ALL} + enum Type { + IN_COUNTRY, + ALL + } public static void main(String[] args) { final Date today = new Date(); final CurrenciesLocalizations localizations = new CurrenciesLocalizations(); final Set modernCurrencies = new TreeSet(); for (final ULocale locale : ULocale.getAvailableLocales()) { - final String[] availableCurrencyCodes = Currency.getAvailableCurrencyCodes(locale, today); + final String[] availableCurrencyCodes = + Currency.getAvailableCurrencyCodes(locale, today); if (availableCurrencyCodes == null) { - //System.out.println(locale + "\t" + "none"); + // System.out.println(locale + "\t" + "none"); continue; } final List currencies = Arrays.asList(availableCurrencyCodes); - //System.out.println(locale + "\t" + currencies); + // System.out.println(locale + "\t" + currencies); for (final String currency : availableCurrencyCodes) { - localizations.add(currency, Currency.getInstance(currency).getSymbol(locale), Type.IN_COUNTRY); + localizations.add( + currency, + Currency.getInstance(currency).getSymbol(locale), + Type.IN_COUNTRY); } modernCurrencies.addAll(currencies); } for (final ULocale locale : ULocale.getAvailableLocales()) { for (final String currency : modernCurrencies) { - localizations.add(currency, Currency.getInstance(currency).getSymbol(locale), Type.ALL); + localizations.add( + currency, Currency.getInstance(currency).getSymbol(locale), Type.ALL); } } @@ -44,26 +51,40 @@ public static void main(String[] args) { for (final String currency : modernCurrencies) { final Set inCountry = 
localizations.getSymbols(currency, Type.IN_COUNTRY); final Set other = localizations.getSymbols(currency, Type.ALL); - showLine(currency, Currency.getInstance(currency).getSymbol(ULocale.ROOT), - inCountry.toString(), ""+inCountry.size(), - other.toString(), ""+other.size()); + showLine( + currency, + Currency.getInstance(currency).getSymbol(ULocale.ROOT), + inCountry.toString(), + "" + inCountry.size(), + other.toString(), + "" + other.size()); } } - private static void showLine(String currency, String symbolInRoot, String symbolInCountry, String count, - String symbolInOther, String countOther) { + private static void showLine( + String currency, + String symbolInRoot, + String symbolInCountry, + String count, + String symbolInOther, + String countOther) { System.out.println( currency - + "\t" + symbolInRoot - + "\t" + symbolInCountry - + "\t" + count - + "\t" + symbolInOther - + "\t" + countOther - ); + + "\t" + + symbolInRoot + + "\t" + + symbolInCountry + + "\t" + + count + + "\t" + + symbolInOther + + "\t" + + countOther); } static class CurrenciesLocalizations { - Map currencyToData = new TreeMap(); + Map currencyToData = + new TreeMap(); public void add(String currency, String symbol, Type type) { if (currency.equals(symbol)) { diff --git a/unicodetools/src/main/java/org/unicode/draft/GetNames.java b/unicodetools/src/main/java/org/unicode/draft/GetNames.java index 7e9186a0b..9306b75bc 100644 --- a/unicodetools/src/main/java/org/unicode/draft/GetNames.java +++ b/unicodetools/src/main/java/org/unicode/draft/GetNames.java @@ -1,14 +1,11 @@ package org.unicode.draft; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.File; import java.io.IOException; - import org.unicode.cldr.draft.FileUtilities; -import com.ibm.icu.util.ULocale; - - public class GetNames { public static void main(String[] args) throws IOException { System.out.println(new File(".").getCanonicalPath()); diff --git 
a/unicodetools/src/main/java/org/unicode/draft/HanFrequencies.java b/unicodetools/src/main/java/org/unicode/draft/HanFrequencies.java index 19e44702e..15f3b3a91 100644 --- a/unicodetools/src/main/java/org/unicode/draft/HanFrequencies.java +++ b/unicodetools/src/main/java/org/unicode/draft/HanFrequencies.java @@ -1,5 +1,10 @@ package org.unicode.draft; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; @@ -11,17 +16,10 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.Counter; import org.unicode.text.utility.Settings; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - public class HanFrequencies { private static final String GEN_HANFREQ_DIR = Settings.Output.GEN_DIR + "/hanfrequency"; @@ -51,8 +49,11 @@ private static void generateReadings() throws IOException { rank.put(parts[0], ++count); } freq.close(); - final BufferedReader readings = FileUtilities.openUTF8Reader(Settings.UnicodeTools.DATA_DIR + "/frequency", "han-reading-diff.txt"); - final Set>> ordered = new TreeSet>>(); + final BufferedReader readings = + FileUtilities.openUTF8Reader( + Settings.UnicodeTools.DATA_DIR + "/frequency", "han-reading-diff.txt"); + final Set>> ordered = + new TreeSet>>(); while (true) { String line = readings.readLine(); if (line == null) { @@ -63,17 +64,18 @@ private static void generateReadings() throws IOException { } line = line; // just to make spreadsheets happy with the hex. 
final String[] parts = line.split("\t"); - //4F3D 伽 jiā; gā; jiā x # std=old + // 4F3D 伽 jiā; gā; jiā x # std=old try { Integer rankValue = rank.get(parts[1]); if (rankValue == null) { rankValue = ++count; // next value System.out.println("Missing rank: " + line); } - if (parts[1].codePointAt(0) != Integer.parseInt(parts[0],16) || parts.length != 7) { + if (parts[1].codePointAt(0) != Integer.parseInt(parts[0], 16) + || parts.length != 7) { throw new IllegalArgumentException(); } - final Map values = new TreeMap(); + final Map values = new TreeMap(); values.put(ReadingRows.hex, "x" + parts[0]); values.put(ReadingRows.character, parts[1]); values.put(ReadingRows.oldVal, parts[2]); @@ -81,7 +83,8 @@ private static void generateReadings() throws IOException { values.put(ReadingRows.CN, parts[4]); final String[] alt = parts[5].split(":"); - final ReadingAlt type = alt[0].equals("?") ? ReadingAlt.x : ReadingAlt.valueOf(alt[0]); + final ReadingAlt type = + alt[0].equals("?") ? ReadingAlt.x : ReadingAlt.valueOf(alt[0]); values.put(ReadingRows.TW, type == ReadingAlt.tw ? alt[1] : ""); values.put(ReadingRows.name, type == ReadingAlt.nm ? alt[1] : ""); values.put(ReadingRows.combo, type == ReadingAlt.co ? alt[1] : ""); @@ -93,7 +96,7 @@ private static void generateReadings() throws IOException { } values.put(ReadingRows.comment, comment); - final R2> row = Row.of(rankValue, values); + final R2> row = Row.of(rankValue, values); ordered.add(row); } catch (final Exception e) { throw new IllegalArgumentException(line, e); @@ -101,17 +104,18 @@ private static void generateReadings() throws IOException { } readings.close(); - final PrintWriter out = FileUtilities.openUTF8Writer(GEN_HANFREQ_DIR, "han-reading-diff.html"); - //PrintStream out = System.out; + final PrintWriter out = + FileUtilities.openUTF8Writer(GEN_HANFREQ_DIR, "han-reading-diff.html"); + // PrintStream out = System.out; final ReadingRows[] values = ReadingRows.values(); out.println("
"); - for (final R2> entry : ordered) { + for (final R2> entry : ordered) { out.print(""); @@ -120,18 +124,33 @@ private static void generateReadings() throws IOException { out.close(); } - enum ReadingRows {hex, character, oldVal, newVal, CN, TW, name, combo, comment} - enum ReadingAlt {x, co, nm, tw} + enum ReadingRows { + hex, + character, + oldVal, + newVal, + CN, + TW, + name, + combo, + comment + } + + enum ReadingAlt { + x, + co, + nm, + tw + } // # tw: - alt reading for TW (not Hant; in HK,MO this reading is not relevant) // # nm: - alt reading for names (personal or geographic) // # co: - alt non-name reading only used in combinations - private static void generateFrequencies() { final Set languages = CharacterFrequency.getLanguagesWithCounter(); showInterleaved(); - //System.out.println(languages); + // System.out.println(languages); show("zh"); show("zh-Hant"); show("ja"); @@ -140,8 +159,11 @@ private static void generateFrequencies() { } private static void showInterleaved() { - final PrintWriter out = org.unicode.text.utility.Utility.openPrintWriter(GEN_HANFREQ_DIR, - "unifiedZh.txt", org.unicode.text.utility.Utility.UTF8_WINDOWS); + final PrintWriter out = + org.unicode.text.utility.Utility.openPrintWriter( + GEN_HANFREQ_DIR, + "unifiedZh.txt", + org.unicode.text.utility.Utility.UTF8_WINDOWS); final LinkedHashMap rank1 = getFilteredList("zh"); final Iterator> it1 = rank1.entrySet().iterator(); @@ -160,16 +182,28 @@ private static void showInterleaved() { out.close(); } - private static Entry writeItem(PrintWriter out, Iterator> it1, LinkedHashMap otherRank, HashSet alreadyDone, String titles) { + private static Entry writeItem( + PrintWriter out, + Iterator> it1, + LinkedHashMap otherRank, + HashSet alreadyDone, + String titles) { final Entry entry1 = it1.hasNext() ? 
it1.next() : null; if (entry1 != null) { final String item1 = entry1.getKey(); if (!alreadyDone.contains(item1)) { final Integer otherValue = otherRank.get(item1); - out.println(item1 - + "\t" + titles.charAt(0) + "\t" + entry1.getValue() - + "\t" + titles.charAt(1) + "\t" + (otherValue == null ? "-" : otherValue.toString())); + out.println( + item1 + + "\t" + + titles.charAt(0) + + "\t" + + entry1.getValue() + + "\t" + + titles.charAt(1) + + "\t" + + (otherValue == null ? "-" : otherValue.toString())); alreadyDone.add(item1); } } @@ -178,7 +212,7 @@ private static Entry writeItem(PrintWriter out, Iterator getFilteredList(String locale) { final Counter counter1 = CharacterFrequency.getCodePointCounter(locale, true); - final LinkedHashMap list1 = new LinkedHashMap(); + final LinkedHashMap list1 = new LinkedHashMap(); int rank = 0; for (final Integer item : counter1.getKeysetSortedByCount(false)) { if (HAN.contains(item)) { @@ -190,8 +224,11 @@ private static LinkedHashMap getFilteredList(String locale) { private static void show(String locale) { System.out.println("Writing:\t" + locale); - final PrintWriter out = org.unicode.text.utility.Utility.openPrintWriter(GEN_HANFREQ_DIR, - locale + ".txt", org.unicode.text.utility.Utility.UTF8_WINDOWS); + final PrintWriter out = + org.unicode.text.utility.Utility.openPrintWriter( + GEN_HANFREQ_DIR, + locale + ".txt", + org.unicode.text.utility.Utility.UTF8_WINDOWS); final Counter counter = CharacterFrequency.getCodePointCounter(locale, true); long total = 0; for (final Integer item : counter) { @@ -200,7 +237,7 @@ private static void show(String locale) { } total += counter.get(item); } - final long countLimit = (long)(total * 0.999995d); + final long countLimit = (long) (total * 0.999995d); final UnicodeSet currentSet = new UnicodeSet(); int setCount = 0; long runningTotal = 0; @@ -214,7 +251,12 @@ private static void show(String locale) { currentSet.add(item); if (currentSet.size() >= chunkLimit) { setCount += 
currentSet.size(); - out.println(setCount + "\t" + (runningTotal/(double)total) + "\t" + currentSet.toPattern(false)); + out.println( + setCount + + "\t" + + (runningTotal / (double) total) + + "\t" + + currentSet.toPattern(false)); out.flush(); System.out.print("."); currentSet.clear(); diff --git a/unicodetools/src/main/java/org/unicode/draft/Hello.java b/unicodetools/src/main/java/org/unicode/draft/Hello.java index f49909b19..30a3a32bd 100644 --- a/unicodetools/src/main/java/org/unicode/draft/Hello.java +++ b/unicodetools/src/main/java/org/unicode/draft/Hello.java @@ -1,7 +1,4 @@ package org.unicode.draft; -import java.util.Arrays; -import java.util.Date; -import java.util.Locale; import com.ibm.icu.lang.UScript; import com.ibm.icu.text.Collator; @@ -18,7 +15,9 @@ import com.ibm.icu.util.TimeUnit; import com.ibm.icu.util.TimeUnitAmount; import com.ibm.icu.util.ULocale; - +import java.util.Arrays; +import java.util.Date; +import java.util.Locale; public class Hello { @@ -27,11 +26,21 @@ public class Hello { */ public static void main(String[] args) { - for (final String test : new String [] {"en", "ja", "de", "da", "ru"}) { - for (final TimeUnit timeUnit : new TimeUnit [] {TimeUnit.YEAR, TimeUnit.MONTH, TimeUnit.WEEK, TimeUnit.DAY, TimeUnit.HOUR, TimeUnit.MINUTE, TimeUnit.SECOND}) { - for (final int style : new int[] {TimeUnitFormat.ABBREVIATED_NAME, TimeUnitFormat.FULL_NAME}) { + for (final String test : new String[] {"en", "ja", "de", "da", "ru"}) { + for (final TimeUnit timeUnit : + new TimeUnit[] { + TimeUnit.YEAR, + TimeUnit.MONTH, + TimeUnit.WEEK, + TimeUnit.DAY, + TimeUnit.HOUR, + TimeUnit.MINUTE, + TimeUnit.SECOND + }) { + for (final int style : + new int[] {TimeUnitFormat.ABBREVIATED_NAME, TimeUnitFormat.FULL_NAME}) { final TimeUnitFormat format = new TimeUnitFormat(new ULocale(test), style); - for (final double amount : new double[]{1d, 2d}) { + for (final double amount : new double[] {1d, 2d}) { // create time unit amount instance - a combination of 
Number and time unit final TimeUnitAmount source = new TimeUnitAmount(amount, timeUnit); System.out.print(format.format(source) + "\t\t"); @@ -49,15 +58,13 @@ public static void main(String[] args) { final UnicodeSet foo; final ULocale locale = new ULocale("fr"); final NumberFormat nf = NumberFormat.getCurrencyInstance(locale); - String pattern = ((DecimalFormat)nf).toPattern(); + String pattern = ((DecimalFormat) nf).toPattern(); pattern = pattern.replace("¤", "¤¤¤"); - ((DecimalFormat)nf).applyPattern(pattern); + ((DecimalFormat) nf).applyPattern(pattern); final CurrencyAmount ca = new CurrencyAmount(1.99, Currency.getInstance("USD")); final String formatted = nf.format(ca); System.out.println(formatted); - - final int foo1 = UScript.getCodeFromName("Grek"); final int foo2 = UScript.getCodeFromName("Greek"); checkCollator(ULocale.ENGLISH); @@ -66,32 +73,35 @@ public static void main(String[] args) { return; } // TODO Auto-generated method stub - final UnicodeSet junk = new UnicodeSet("[\\U0001F1FF {\\U0001F1E8 \\U0001F1F3} {\\U0001F1E9 \\U0001F1EA} {\\U0001F1EA \\U0001F1F8} {\\U0001F1EB \\U0001F1F7} {\\U0001F1EC \\U0001F1E7} {\\U0001F1EE \\U0001F1F9} {\\U0001F1EF \\U0001F1F5} {\\U0001F1F0 \\U0001F1F7} {\\U0001F1F7 \\U0001F1FA} {\\U0001F1FA \\U0001F1F8} ]"); + final UnicodeSet junk = + new UnicodeSet( + "[\\U0001F1FF {\\U0001F1E8 \\U0001F1F3} {\\U0001F1E9 \\U0001F1EA} {\\U0001F1EA \\U0001F1F8} {\\U0001F1EB \\U0001F1F7} {\\U0001F1EC \\U0001F1E7} {\\U0001F1EE \\U0001F1F9} {\\U0001F1EF \\U0001F1F5} {\\U0001F1F0 \\U0001F1F7} {\\U0001F1F7 \\U0001F1FA} {\\U0001F1FA \\U0001F1F8} ]"); junk.toString(); System.out.println(junk); - final UnicodeSet emoji = new UnicodeSet("[\\U00002600 \\U00002601 \\U0001F300 \\U0001F301 \\U0001F302 \\U0001F303 \\U0001F304 \\U0001F305 \\U0001F306 \\U0001F307 \\U0001F308 \\U0001F309 \\U0001F30A \\U0001F30B \\U0001F30C \\U000026C4 \\U000026C5 \\U00002614 \\U000026A1 \\U0001F30F \\U0001F311 \\U0001F314 \\U0001F313 \\U0001F319 \\U0001F315 
\\U0001F31B \\U0001F31F \\U0001F320 \\U0001F550 \\U0001F551 \\U0001F552 \\U0001F553 \\U0001F554 \\U0001F555 \\U0001F556 \\U0001F557 \\U0001F558 \\U0001F559 \\U0001F55A \\U0001F55B \\U0000231A \\U0000231B \\U000023F0 \\U000023F3 \\U00002648 \\U00002649 \\U0000264A \\U0000264B \\U0000264C \\U0000264D \\U0000264E \\U0000264F \\U00002650 \\U00002651 \\U00002652 \\U00002653 \\U000026CE \\U0001F340 \\U0001F337 \\U0001F331 \\U0001F341 \\U0001F338 \\U0001F339 \\U0001F342 \\U0001F343 \\U0001F33A \\U0001F33B \\U0001F334 \\U0001F335 \\U0001F33E \\U0001F33D \\U0001F344 \\U0001F330 \\U0001F33C \\U0001F33F \\U0001F352 \\U0001F34C \\U0001F34E \\U0001F34A \\U0001F353 \\U0001F349 \\U0001F345 \\U0001F346 \\U0001F348 \\U0001F34D \\U0001F347 \\U0001F351 \\U0001F34F \\U0001F440 \\U0001F442 \\U0001F443 \\U0001F444 \\U0001F445 \\U0001F484 \\U0001F485 \\U0001F486 \\U0001F487 \\U0001F488 \\U0001F464 \\U0001F466 \\U0001F467 \\U0001F468 \\U0001F469 \\U0001F46A \\U0001F46B \\U0001F46E \\U0001F46F \\U0001F470 \\U0001F471 \\U0001F472 \\U0001F473 \\U0001F474 \\U0001F475 \\U0001F476 \\U0001F477 \\U0001F478 \\U0001F479 \\U0001F47A \\U0001F47B \\U0001F47C \\U0001F47D \\U0001F47E \\U0001F47F \\U0001F480 \\U0001F481 \\U0001F482 \\U0001F483 \\U0001F40C \\U0001F40D \\U0001F40E \\U0001F414 \\U0001F417 \\U0001F42B \\U0001F418 \\U0001F428 \\U0001F412 \\U0001F411 \\U0001F419 \\U0001F41A \\U0001F41B \\U0001F41C \\U0001F41D \\U0001F41E \\U0001F420 \\U0001F421 \\U0001F422 \\U0001F424 \\U0001F425 \\U0001F426 \\U0001F423 \\U0001F427 \\U0001F429 \\U0001F41F \\U0001F42C \\U0001F42D \\U0001F42F \\U0001F431 \\U0001F433 \\U0001F434 \\U0001F435 \\U0001F436 \\U0001F437 \\U0001F43B \\U0001F439 \\U0001F43A \\U0001F42E \\U0001F430 \\U0001F438 \\U0001F43E \\U0001F432 \\U0001F43C \\U0001F43D \\U0000263A \\U0001F620 \\U0001F629 \\U0001F632 \\U0001F61E \\U0001F635 \\U0001F630 \\U0001F612 \\U0001F60D \\U0001F624 \\U0001F61C \\U0001F61D \\U0001F60B \\U0001F618 \\U0001F61A \\U0001F637 \\U0001F633 \\U0001F603 \\U0001F605 
\\U0001F606 \\U0001F601 \\U0001F602 \\U0001F60A \\U0001F604 \\U0001F622 \\U0001F62D \\U0001F628 \\U0001F623 \\U0001F621 \\U0001F60C \\U0001F616 \\U0001F614 \\U0001F631 \\U0001F62A \\U0001F60F \\U0001F613 \\U0001F625 \\U0001F62B \\U0001F609 \\U0001F63A \\U0001F638 \\U0001F639 \\U0001F63D \\U0001F63B \\U0001F63F \\U0001F63E \\U0001F63C \\U0001F640 \\U0001F645 \\U0001F646 \\U0001F647 \\U0001F648 \\U0001F64A \\U0001F649 \\U0001F64B \\U0001F64C \\U0001F64D \\U0001F64E \\U0001F64F \\U0001F3E0 \\U0001F3E1 \\U0001F3E2 \\U0001F3E3 \\U0001F3E5 \\U0001F3E6 \\U0001F3E7 \\U0001F3E8 \\U0001F3E9 \\U0001F3EA \\U0001F3EB \\U0001F3EC \\U0001F3EF \\U0001F3F0 \\U0001F3ED \\U0001F3EE \\U00002693 \\U000026EA \\U000026F2 \\U0001F5FB \\U0001F5FC \\U0001F5FD \\U0001F5FE \\U0001F5FF \\U0001F45E \\U0001F45F \\U0001F460 \\U0001F461 \\U0001F462 \\U0001F463 \\U0001F453 \\U0001F455 \\U0001F456 \\U0001F451 \\U0001F454 \\U0001F452 \\U0001F457 \\U0001F458 \\U0001F459 \\U0001F45A \\U0001F45B \\U0001F45C \\U0001F45D \\U0001F4B0 \\U0001F4B1 \\U0001F4B9 \\U0001F4B2 \\U0001F4B3 \\U0001F4B4 \\U0001F4B5 \\U0001F4B8 \\U0001F1E6 \\U0001F1E7 \\U0001F1E8 \\U0001F1E9 \\U0001F1EA \\U0001F1EB \\U0001F1EC \\U0001F1ED \\U0001F1EE \\U0001F1EF \\U0001F1F0 \\U0001F1F1 \\U0001F1F2 \\U0001F1F3 \\U0001F1F4 \\U0001F1F5 \\U0001F1F6 \\U0001F1F7 \\U0001F1F8 \\U0001F1F9 \\U0001F1FA \\U0001F1FB \\U0001F1FC \\U0001F1FD \\U0001F1FE \\U0001F1FF {\\U0001F1E8 \\U0001F1F3} {\\U0001F1E9 \\U0001F1EA} {\\U0001F1EA \\U0001F1F8} {\\U0001F1EB \\U0001F1F7} {\\U0001F1EC \\U0001F1E7} {\\U0001F1EE \\U0001F1F9} {\\U0001F1EF \\U0001F1F5} {\\U0001F1F0 \\U0001F1F7} {\\U0001F1F7 \\U0001F1FA} {\\U0001F1FA \\U0001F1F8} \\U0001F525 \\U0001F526 \\U0001F527 \\U0001F528 \\U0001F529 \\U0001F52A \\U0001F52B \\U0001F52E \\U0001F52F \\U0001F530 \\U0001F531 \\U0001F489 \\U0001F48A \\U0001F170 \\U0001F171 \\U0001F18E \\U0001F17E \\U0001F17F \\U0001F380 \\U0001F381 \\U0001F382 \\U0001F384 \\U0001F385 \\U0001F38C \\U0001F386 \\U0001F388 \\U0001F389 \\U0001F38D 
\\U0001F38E \\U0001F393 \\U0001F392 \\U0001F38F \\U0001F387 \\U0001F390 \\U0001F383 \\U0001F38A \\U0001F38B \\U0001F391 \\U0000260E \\U0001F4DF \\U0001F4DE \\U0001F4F1 \\U0001F4F2 \\U0001F4DD \\U0001F4E0 \\U0001F4E8 \\U0001F4E9 \\U0001F4EA \\U0001F4EB \\U0001F4EE \\U0001F4F0 \\U0001F4E2 \\U0001F4E3 \\U0001F4E1 \\U0001F4E4 \\U0001F4E5 \\U0001F4E6 \\U0001F4E7 \\U0001F520 \\U0001F521 \\U0001F522 \\U0001F523 \\U0001F524 \\U00002702 \\U00002709 \\U0000270F \\U00002712 \\U00002714 \\U00002716 \\U0001F4BA \\U0001F4BB \\U0001F4CE \\U0001F4BC \\U0001F4BD \\U0001F4BE \\U0001F4BF \\U0001F4C0 \\U0001F4CD \\U0001F4C3 \\U0001F4C4 \\U0001F4C5 \\U0001F4C1 \\U0001F4C2 \\U0001F4D3 \\U0001F4D6 \\U0001F4D4 \\U0001F4D5 \\U0001F4D7 \\U0001F4D8 \\U0001F4D9 \\U0001F4DA \\U0001F4DB \\U0001F4DC \\U0001F4CB \\U0001F4C6 \\U0001F4CA \\U0001F4C8 \\U0001F4C9 \\U0001F4C7 \\U0001F4CC \\U0001F4D2 \\U0001F4CF \\U0001F4D0 \\U0001F4D1 \\U000026F3 \\U000026F5 \\U000026FA \\U000026FD \\U0001F3BD \\U000026BE \\U0001F3BE \\U000026BD \\U0001F3BF \\U0001F3C0 \\U0001F3C1 \\U0001F3C2 \\U0001F3C3 \\U0001F3C4 \\U0001F3C6 \\U0001F3C8 \\U0001F3CA \\U000024C2 \\U0001F683 \\U0001F687 \\U0001F684 \\U0001F685 \\U0001F697 \\U0001F699 \\U0001F68C \\U0001F68F \\U0001F6A2 \\U0001F689 \\U0001F680 \\U0001F6A4 \\U0001F695 \\U0001F69A \\U0001F692 \\U0001F691 \\U0001F693 \\U0001F6A5 \\U0001F6A7 \\U0001F6A8 \\U00002668 \\U00002708 \\U0001F3A0 \\U0001F3A1 \\U0001F3A2 \\U0001F3A3 \\U0001F3A4 \\U0001F3A5 \\U0001F3A6 \\U0001F3A7 \\U0001F3A8 \\U0001F3A9 \\U0001F3AA \\U0001F3AB \\U0001F3AC \\U0001F3AD \\U0001F004 \\U0001F3AE \\U0001F3AF \\U0001F3B0 \\U0001F3B1 \\U0001F3B2 \\U0001F3B3 \\U0001F3B4 \\U0001F0CF \\U0001F3B5 \\U0001F3B6 \\U0001F3B7 \\U0001F3B8 \\U0001F3B9 \\U0001F3BA \\U0001F3BB \\U0001F3BC \\U0000303D \\U0001F4F7 \\U0001F4F9 \\U0001F4FA \\U0001F4FB \\U0001F4FC \\U0001F48B \\U0001F48C \\U0001F48D \\U0001F48E \\U0001F48F \\U0001F490 \\U0001F491 \\U0001F492 \\U000000A9 \\U000000AE \\U00002122 \\U00002139 \\U0001F51E 
{\\U00000023 \\U000020E3} {\\U00000031 \\U000020E3} {\\U00000032 \\U000020E3} {\\U00000033 \\U000020E3} {\\U00000034 \\U000020E3} {\\U00000035 \\U000020E3} {\\U00000036 \\U000020E3} {\\U00000037 \\U000020E3} {\\U00000038 \\U000020E3} {\\U00000039 \\U000020E3} {\\U00000030 \\U000020E3} \\U0001F51F \\U0001F4F6 \\U0001F4F3 \\U0001F4F4 \\U0001F354 \\U0001F359 \\U0001F370 \\U0001F35C \\U0001F35E \\U0001F373 \\U0001F366 \\U0001F35F \\U0001F361 \\U0001F358 \\U0001F35A \\U0001F35D \\U0001F35B \\U0001F362 \\U0001F363 \\U0001F371 \\U0001F372 \\U0001F367 \\U0001F356 \\U0001F365 \\U0001F360 \\U0001F355 \\U0001F357 \\U0001F368 \\U0001F369 \\U0001F36A \\U0001F36B \\U0001F36C \\U0001F36D \\U0001F36E \\U0001F36F \\U0001F364 \\U0001F374 \\U00002615 \\U0001F378 \\U0001F37A \\U0001F375 \\U0001F376 \\U0001F377 \\U0001F37B \\U0001F379 \\U00002194 \\U00002195 \\U00002197 \\U00002198 \\U00002196 \\U00002199 \\U00002B06 \\U00002B07 \\U00002B05 \\U000027A1 \\U00002934 \\U00002935 \\U000025B6 \\U000025C0 \\U000023E9 \\U000023EA \\U000023EB \\U000023EC \\U0001F53A \\U0001F53B \\U0001F53C \\U0001F53D \\U00002B55 \\U0000274C \\U0000274E \\U00002757 \\U00002753 \\U00002754 \\U00002755 \\U00003030 \\U0000203C \\U00002049 \\U000027B0 \\U000027BF \\U00002764 \\U0001F493 \\U0001F494 \\U0001F495 \\U0001F496 \\U0001F497 \\U0001F498 \\U0001F499 \\U0001F49A \\U0001F49B \\U0001F49C \\U0001F49D \\U0001F49E \\U0001F49F \\U00002665 \\U00002660 \\U00002666 \\U00002663 \\U0000267B \\U0000267F \\U000026A0 \\U000026D4 \\U0001F6AC \\U0001F6AD \\U0001F6A9 \\U0001F6B2 \\U0001F6B6 \\U0001F6B9 \\U0001F6BA \\U0001F6C0 \\U0001F6BB \\U0001F6BD \\U0001F6BE \\U0001F6BC \\U0001F6AA \\U0001F6AB \\U0001F191 \\U0001F192 \\U0001F193 \\U0001F194 \\U0001F195 \\U0001F196 \\U0001F197 \\U0001F198 \\U0001F199 \\U0001F19A \\U0001F201 \\U0001F202 \\U0001F21A \\U0001F22F \\U0001F232 \\U0001F233 \\U0001F234 \\U0001F235 \\U0001F236 \\U0001F237 \\U0001F238 \\U0001F239 \\U0001F23A \\U00003299 \\U00003297 \\U0001F250 \\U0001F251 
\\U00002795 \\U00002796 \\U00002797 \\U0001F4A0 \\U0001F4A1 \\U0001F4A2 \\U0001F4A3 \\U0001F4A4 \\U0001F4A5 \\U0001F4A6 \\U0001F4A7 \\U0001F4A8 \\U0001F4A9 \\U0001F4AA \\U0001F4AB \\U0001F4AC \\U00002728 \\U00002734 \\U00002733 \\U00002744 \\U00002747 \\U000026AA \\U000026AB \\U00002B50 \\U0001F534 \\U0001F535 \\U0001F532 \\U0001F533 \\U000025AB \\U000025AA \\U000025FD \\U000025FE \\U000025FB \\U000025FC \\U0001F536 \\U0001F537 \\U0001F538 \\U0001F539 \\U00002B1B \\U00002B1C \\U0001F4AE \\U0001F4AF \\U000021A9 \\U000021AA \\U0001F503 \\U0001F50A \\U0001F50B \\U0001F50C \\U0001F50D \\U0001F50E \\U0001F512 \\U0001F513 \\U0001F50F \\U0001F510 \\U0001F511 \\U0001F514 \\U0001F518 \\U0001F516 \\U0001F517 \\U00002611 \\U0001F519 \\U0001F51A \\U0001F51B \\U0001F51C \\U0001F51D \\U00002003 \\U00002002 \\U00002005 \\U00002705 \\U0000270A \\U0000270B \\U0000270C \\U0001F44A \\U0001F44D \\U0001F446 \\U0001F447 \\U0001F448 \\U0001F449 \\U0001F44B \\U0001F44F \\U0001F44C \\U0001F44E \\U0001F450 \\U0000261D ]"); + final UnicodeSet emoji = + new UnicodeSet( + "[\\U00002600 \\U00002601 \\U0001F300 \\U0001F301 \\U0001F302 \\U0001F303 \\U0001F304 \\U0001F305 \\U0001F306 \\U0001F307 \\U0001F308 \\U0001F309 \\U0001F30A \\U0001F30B \\U0001F30C \\U000026C4 \\U000026C5 \\U00002614 \\U000026A1 \\U0001F30F \\U0001F311 \\U0001F314 \\U0001F313 \\U0001F319 \\U0001F315 \\U0001F31B \\U0001F31F \\U0001F320 \\U0001F550 \\U0001F551 \\U0001F552 \\U0001F553 \\U0001F554 \\U0001F555 \\U0001F556 \\U0001F557 \\U0001F558 \\U0001F559 \\U0001F55A \\U0001F55B \\U0000231A \\U0000231B \\U000023F0 \\U000023F3 \\U00002648 \\U00002649 \\U0000264A \\U0000264B \\U0000264C \\U0000264D \\U0000264E \\U0000264F \\U00002650 \\U00002651 \\U00002652 \\U00002653 \\U000026CE \\U0001F340 \\U0001F337 \\U0001F331 \\U0001F341 \\U0001F338 \\U0001F339 \\U0001F342 \\U0001F343 \\U0001F33A \\U0001F33B \\U0001F334 \\U0001F335 \\U0001F33E \\U0001F33D \\U0001F344 \\U0001F330 \\U0001F33C \\U0001F33F \\U0001F352 \\U0001F34C \\U0001F34E 
\\U0001F34A \\U0001F353 \\U0001F349 \\U0001F345 \\U0001F346 \\U0001F348 \\U0001F34D \\U0001F347 \\U0001F351 \\U0001F34F \\U0001F440 \\U0001F442 \\U0001F443 \\U0001F444 \\U0001F445 \\U0001F484 \\U0001F485 \\U0001F486 \\U0001F487 \\U0001F488 \\U0001F464 \\U0001F466 \\U0001F467 \\U0001F468 \\U0001F469 \\U0001F46A \\U0001F46B \\U0001F46E \\U0001F46F \\U0001F470 \\U0001F471 \\U0001F472 \\U0001F473 \\U0001F474 \\U0001F475 \\U0001F476 \\U0001F477 \\U0001F478 \\U0001F479 \\U0001F47A \\U0001F47B \\U0001F47C \\U0001F47D \\U0001F47E \\U0001F47F \\U0001F480 \\U0001F481 \\U0001F482 \\U0001F483 \\U0001F40C \\U0001F40D \\U0001F40E \\U0001F414 \\U0001F417 \\U0001F42B \\U0001F418 \\U0001F428 \\U0001F412 \\U0001F411 \\U0001F419 \\U0001F41A \\U0001F41B \\U0001F41C \\U0001F41D \\U0001F41E \\U0001F420 \\U0001F421 \\U0001F422 \\U0001F424 \\U0001F425 \\U0001F426 \\U0001F423 \\U0001F427 \\U0001F429 \\U0001F41F \\U0001F42C \\U0001F42D \\U0001F42F \\U0001F431 \\U0001F433 \\U0001F434 \\U0001F435 \\U0001F436 \\U0001F437 \\U0001F43B \\U0001F439 \\U0001F43A \\U0001F42E \\U0001F430 \\U0001F438 \\U0001F43E \\U0001F432 \\U0001F43C \\U0001F43D \\U0000263A \\U0001F620 \\U0001F629 \\U0001F632 \\U0001F61E \\U0001F635 \\U0001F630 \\U0001F612 \\U0001F60D \\U0001F624 \\U0001F61C \\U0001F61D \\U0001F60B \\U0001F618 \\U0001F61A \\U0001F637 \\U0001F633 \\U0001F603 \\U0001F605 \\U0001F606 \\U0001F601 \\U0001F602 \\U0001F60A \\U0001F604 \\U0001F622 \\U0001F62D \\U0001F628 \\U0001F623 \\U0001F621 \\U0001F60C \\U0001F616 \\U0001F614 \\U0001F631 \\U0001F62A \\U0001F60F \\U0001F613 \\U0001F625 \\U0001F62B \\U0001F609 \\U0001F63A \\U0001F638 \\U0001F639 \\U0001F63D \\U0001F63B \\U0001F63F \\U0001F63E \\U0001F63C \\U0001F640 \\U0001F645 \\U0001F646 \\U0001F647 \\U0001F648 \\U0001F64A \\U0001F649 \\U0001F64B \\U0001F64C \\U0001F64D \\U0001F64E \\U0001F64F \\U0001F3E0 \\U0001F3E1 \\U0001F3E2 \\U0001F3E3 \\U0001F3E5 \\U0001F3E6 \\U0001F3E7 \\U0001F3E8 \\U0001F3E9 \\U0001F3EA \\U0001F3EB \\U0001F3EC \\U0001F3EF 
\\U0001F3F0 \\U0001F3ED \\U0001F3EE \\U00002693 \\U000026EA \\U000026F2 \\U0001F5FB \\U0001F5FC \\U0001F5FD \\U0001F5FE \\U0001F5FF \\U0001F45E \\U0001F45F \\U0001F460 \\U0001F461 \\U0001F462 \\U0001F463 \\U0001F453 \\U0001F455 \\U0001F456 \\U0001F451 \\U0001F454 \\U0001F452 \\U0001F457 \\U0001F458 \\U0001F459 \\U0001F45A \\U0001F45B \\U0001F45C \\U0001F45D \\U0001F4B0 \\U0001F4B1 \\U0001F4B9 \\U0001F4B2 \\U0001F4B3 \\U0001F4B4 \\U0001F4B5 \\U0001F4B8 \\U0001F1E6 \\U0001F1E7 \\U0001F1E8 \\U0001F1E9 \\U0001F1EA \\U0001F1EB \\U0001F1EC \\U0001F1ED \\U0001F1EE \\U0001F1EF \\U0001F1F0 \\U0001F1F1 \\U0001F1F2 \\U0001F1F3 \\U0001F1F4 \\U0001F1F5 \\U0001F1F6 \\U0001F1F7 \\U0001F1F8 \\U0001F1F9 \\U0001F1FA \\U0001F1FB \\U0001F1FC \\U0001F1FD \\U0001F1FE \\U0001F1FF {\\U0001F1E8 \\U0001F1F3} {\\U0001F1E9 \\U0001F1EA} {\\U0001F1EA \\U0001F1F8} {\\U0001F1EB \\U0001F1F7} {\\U0001F1EC \\U0001F1E7} {\\U0001F1EE \\U0001F1F9} {\\U0001F1EF \\U0001F1F5} {\\U0001F1F0 \\U0001F1F7} {\\U0001F1F7 \\U0001F1FA} {\\U0001F1FA \\U0001F1F8} \\U0001F525 \\U0001F526 \\U0001F527 \\U0001F528 \\U0001F529 \\U0001F52A \\U0001F52B \\U0001F52E \\U0001F52F \\U0001F530 \\U0001F531 \\U0001F489 \\U0001F48A \\U0001F170 \\U0001F171 \\U0001F18E \\U0001F17E \\U0001F17F \\U0001F380 \\U0001F381 \\U0001F382 \\U0001F384 \\U0001F385 \\U0001F38C \\U0001F386 \\U0001F388 \\U0001F389 \\U0001F38D \\U0001F38E \\U0001F393 \\U0001F392 \\U0001F38F \\U0001F387 \\U0001F390 \\U0001F383 \\U0001F38A \\U0001F38B \\U0001F391 \\U0000260E \\U0001F4DF \\U0001F4DE \\U0001F4F1 \\U0001F4F2 \\U0001F4DD \\U0001F4E0 \\U0001F4E8 \\U0001F4E9 \\U0001F4EA \\U0001F4EB \\U0001F4EE \\U0001F4F0 \\U0001F4E2 \\U0001F4E3 \\U0001F4E1 \\U0001F4E4 \\U0001F4E5 \\U0001F4E6 \\U0001F4E7 \\U0001F520 \\U0001F521 \\U0001F522 \\U0001F523 \\U0001F524 \\U00002702 \\U00002709 \\U0000270F \\U00002712 \\U00002714 \\U00002716 \\U0001F4BA \\U0001F4BB \\U0001F4CE \\U0001F4BC \\U0001F4BD \\U0001F4BE \\U0001F4BF \\U0001F4C0 \\U0001F4CD \\U0001F4C3 \\U0001F4C4 \\U0001F4C5 
\\U0001F4C1 \\U0001F4C2 \\U0001F4D3 \\U0001F4D6 \\U0001F4D4 \\U0001F4D5 \\U0001F4D7 \\U0001F4D8 \\U0001F4D9 \\U0001F4DA \\U0001F4DB \\U0001F4DC \\U0001F4CB \\U0001F4C6 \\U0001F4CA \\U0001F4C8 \\U0001F4C9 \\U0001F4C7 \\U0001F4CC \\U0001F4D2 \\U0001F4CF \\U0001F4D0 \\U0001F4D1 \\U000026F3 \\U000026F5 \\U000026FA \\U000026FD \\U0001F3BD \\U000026BE \\U0001F3BE \\U000026BD \\U0001F3BF \\U0001F3C0 \\U0001F3C1 \\U0001F3C2 \\U0001F3C3 \\U0001F3C4 \\U0001F3C6 \\U0001F3C8 \\U0001F3CA \\U000024C2 \\U0001F683 \\U0001F687 \\U0001F684 \\U0001F685 \\U0001F697 \\U0001F699 \\U0001F68C \\U0001F68F \\U0001F6A2 \\U0001F689 \\U0001F680 \\U0001F6A4 \\U0001F695 \\U0001F69A \\U0001F692 \\U0001F691 \\U0001F693 \\U0001F6A5 \\U0001F6A7 \\U0001F6A8 \\U00002668 \\U00002708 \\U0001F3A0 \\U0001F3A1 \\U0001F3A2 \\U0001F3A3 \\U0001F3A4 \\U0001F3A5 \\U0001F3A6 \\U0001F3A7 \\U0001F3A8 \\U0001F3A9 \\U0001F3AA \\U0001F3AB \\U0001F3AC \\U0001F3AD \\U0001F004 \\U0001F3AE \\U0001F3AF \\U0001F3B0 \\U0001F3B1 \\U0001F3B2 \\U0001F3B3 \\U0001F3B4 \\U0001F0CF \\U0001F3B5 \\U0001F3B6 \\U0001F3B7 \\U0001F3B8 \\U0001F3B9 \\U0001F3BA \\U0001F3BB \\U0001F3BC \\U0000303D \\U0001F4F7 \\U0001F4F9 \\U0001F4FA \\U0001F4FB \\U0001F4FC \\U0001F48B \\U0001F48C \\U0001F48D \\U0001F48E \\U0001F48F \\U0001F490 \\U0001F491 \\U0001F492 \\U000000A9 \\U000000AE \\U00002122 \\U00002139 \\U0001F51E {\\U00000023 \\U000020E3} {\\U00000031 \\U000020E3} {\\U00000032 \\U000020E3} {\\U00000033 \\U000020E3} {\\U00000034 \\U000020E3} {\\U00000035 \\U000020E3} {\\U00000036 \\U000020E3} {\\U00000037 \\U000020E3} {\\U00000038 \\U000020E3} {\\U00000039 \\U000020E3} {\\U00000030 \\U000020E3} \\U0001F51F \\U0001F4F6 \\U0001F4F3 \\U0001F4F4 \\U0001F354 \\U0001F359 \\U0001F370 \\U0001F35C \\U0001F35E \\U0001F373 \\U0001F366 \\U0001F35F \\U0001F361 \\U0001F358 \\U0001F35A \\U0001F35D \\U0001F35B \\U0001F362 \\U0001F363 \\U0001F371 \\U0001F372 \\U0001F367 \\U0001F356 \\U0001F365 \\U0001F360 \\U0001F355 \\U0001F357 \\U0001F368 \\U0001F369 
\\U0001F36A \\U0001F36B \\U0001F36C \\U0001F36D \\U0001F36E \\U0001F36F \\U0001F364 \\U0001F374 \\U00002615 \\U0001F378 \\U0001F37A \\U0001F375 \\U0001F376 \\U0001F377 \\U0001F37B \\U0001F379 \\U00002194 \\U00002195 \\U00002197 \\U00002198 \\U00002196 \\U00002199 \\U00002B06 \\U00002B07 \\U00002B05 \\U000027A1 \\U00002934 \\U00002935 \\U000025B6 \\U000025C0 \\U000023E9 \\U000023EA \\U000023EB \\U000023EC \\U0001F53A \\U0001F53B \\U0001F53C \\U0001F53D \\U00002B55 \\U0000274C \\U0000274E \\U00002757 \\U00002753 \\U00002754 \\U00002755 \\U00003030 \\U0000203C \\U00002049 \\U000027B0 \\U000027BF \\U00002764 \\U0001F493 \\U0001F494 \\U0001F495 \\U0001F496 \\U0001F497 \\U0001F498 \\U0001F499 \\U0001F49A \\U0001F49B \\U0001F49C \\U0001F49D \\U0001F49E \\U0001F49F \\U00002665 \\U00002660 \\U00002666 \\U00002663 \\U0000267B \\U0000267F \\U000026A0 \\U000026D4 \\U0001F6AC \\U0001F6AD \\U0001F6A9 \\U0001F6B2 \\U0001F6B6 \\U0001F6B9 \\U0001F6BA \\U0001F6C0 \\U0001F6BB \\U0001F6BD \\U0001F6BE \\U0001F6BC \\U0001F6AA \\U0001F6AB \\U0001F191 \\U0001F192 \\U0001F193 \\U0001F194 \\U0001F195 \\U0001F196 \\U0001F197 \\U0001F198 \\U0001F199 \\U0001F19A \\U0001F201 \\U0001F202 \\U0001F21A \\U0001F22F \\U0001F232 \\U0001F233 \\U0001F234 \\U0001F235 \\U0001F236 \\U0001F237 \\U0001F238 \\U0001F239 \\U0001F23A \\U00003299 \\U00003297 \\U0001F250 \\U0001F251 \\U00002795 \\U00002796 \\U00002797 \\U0001F4A0 \\U0001F4A1 \\U0001F4A2 \\U0001F4A3 \\U0001F4A4 \\U0001F4A5 \\U0001F4A6 \\U0001F4A7 \\U0001F4A8 \\U0001F4A9 \\U0001F4AA \\U0001F4AB \\U0001F4AC \\U00002728 \\U00002734 \\U00002733 \\U00002744 \\U00002747 \\U000026AA \\U000026AB \\U00002B50 \\U0001F534 \\U0001F535 \\U0001F532 \\U0001F533 \\U000025AB \\U000025AA \\U000025FD \\U000025FE \\U000025FB \\U000025FC \\U0001F536 \\U0001F537 \\U0001F538 \\U0001F539 \\U00002B1B \\U00002B1C \\U0001F4AE \\U0001F4AF \\U000021A9 \\U000021AA \\U0001F503 \\U0001F50A \\U0001F50B \\U0001F50C \\U0001F50D \\U0001F50E \\U0001F512 \\U0001F513 \\U0001F50F 
\\U0001F510 \\U0001F511 \\U0001F514 \\U0001F518 \\U0001F516 \\U0001F517 \\U00002611 \\U0001F519 \\U0001F51A \\U0001F51B \\U0001F51C \\U0001F51D \\U00002003 \\U00002002 \\U00002005 \\U00002705 \\U0000270A \\U0000270B \\U0000270C \\U0001F44A \\U0001F44D \\U0001F446 \\U0001F447 \\U0001F448 \\U0001F449 \\U0001F44B \\U0001F44F \\U0001F44C \\U0001F44E \\U0001F450 \\U0000261D ]"); System.out.println(emoji); final UnicodeSet s = new UnicodeSet("[:lb=SY:]").complement().complement(); System.out.println("hi " + Arrays.asList(args) + ", " + s); - final DateFormat df = DateFormat.getPatternInstance(DateFormat.HOUR_MINUTE_GENERIC_TZ, ULocale.FRANCE); + final DateFormat df = + DateFormat.getPatternInstance(DateFormat.HOUR_MINUTE_GENERIC_TZ, ULocale.FRANCE); System.out.println(df.format(new Date())); - } private static void checkTranslit() { final String[] rules = { - ":: NFKD;", - ":: [:Latin:] NFKD;", - ":: [[:Mn:][:Me:]] remove;", - ":: Latin-Greek;", - ":: NFKD;\n" + - ":: [[:Mn:][:Me:]] remove;\n" + - ":: NFC;" + ":: NFKD;", + ":: [:Latin:] NFKD;", + ":: [[:Mn:][:Me:]] remove;", + ":: Latin-Greek;", + ":: NFKD;\n" + ":: [[:Mn:][:Me:]] remove;\n" + ":: NFC;" }; for (final String rule : rules) { System.out.println("Rules:\n" + rule); - final Transliterator trans = Transliterator.createFromRules("temp", rule, Transliterator.FORWARD); + final Transliterator trans = + Transliterator.createFromRules("temp", rule, Transliterator.FORWARD); final UnicodeSet source = getSourceSet(trans); final UnicodeSet target = trans.getTargetSet(); System.out.println("Source:\t" + source.toPattern(false)); @@ -112,7 +122,7 @@ static UnicodeSet getSourceSet(Transliterator t) { // TODO: if s1 produces ABC, what about chaining? 
final UnicodeFilter filter = t.getFilter(); if (filter != null) { - sources.retainAll((UnicodeSet)filter); // TODO fix for arbitrary filters + sources.retainAll((UnicodeSet) filter); // TODO fix for arbitrary filters } return sources; } @@ -130,7 +140,8 @@ private static void checkCollator(ULocale locale) { c = Collator.getInstance(locale); } long end = System.nanoTime(); - System.out.println(".getInstance nanos: " + nf.format((end-start)/(double)iterations) + " ns"); + System.out.println( + ".getInstance nanos: " + nf.format((end - start) / (double) iterations) + " ns"); Collator d; try { @@ -148,7 +159,8 @@ private static void checkCollator(ULocale locale) { } } end = System.nanoTime(); - System.out.println(".clone nanos: " + nf.format((end-start)/(double)iterations) + " ns"); + System.out.println( + ".clone nanos: " + nf.format((end - start) / (double) iterations) + " ns"); try { start = System.nanoTime(); @@ -159,8 +171,8 @@ private static void checkCollator(ULocale locale) { } catch (final Exception e) { throw new RuntimeException(e); } - System.out.println(".clone (no try) nanos: " + nf.format((end-start)/(double)iterations) + " ns"); - + System.out.println( + ".clone (no try) nanos: " + nf.format((end - start) / (double) iterations) + " ns"); } DateFormat foo; @@ -174,49 +186,67 @@ private static void checkCollator(ULocale locale) { // {return null;} // public final static DateFormat getPatternInstance(String pattern, ULocale locale) // {return null;} - // public final static DateFormat getPatternInstance(Calendar calendar, String pattern, Locale locale) + // public final static DateFormat getPatternInstance(Calendar calendar, String pattern, + // Locale locale) // {return null;} - // public final static DateFormat getPatternInstance(Calendar calendar, pattern, ULocale locale) + // public final static DateFormat getPatternInstance(Calendar calendar, pattern, ULocale + // locale) // {return null;} - public static final String - MINUTE_SECOND = "m:ss", - 
HOUR_MINUTE = "H:mm", - HOUR_MINUTE_SECOND = "H:mm:ss", - HOUR12_MINUTE = "h:mm", - HOUR12_MINUTE_SECOND = "H:mm:ss", - - DAY = "d", - MONTH = "L", - ABBR_MONTH = "LLL", - YEAR = "yyyy", - - MONTH_DAY = "MMMM d", - ABBR_MONTH_DAY = "MMM d", - NUM_MONTH_DAY = "M/d", - WEEKDAY_MONTH_DAY = "E MMMM d", - WEEKDAY_ABBR_MONTH_DAY = "E MMM d", - WEEKDAY_NUM_MONTH_DAY ="E, M-d", - - MONTH_YEAR = "MMMM yyyy", - NUM_MONTH_YEAR = "M/yyyy", - ABBR_MONTH_YEAR = "MMM yyyy", - WEEKDAY_NUM_MONTH_DAY_YEAR = "EEE, M/d/yyyy", - WEEKDAY_ABBR_MONTH_DAY_YEAR = "EEE, MMM d yyyy", - - QUARTER_YEAR = "QQQ yyyy", - ABBR_QUARTER_YEAR = "Q yyyy"; + public static final String MINUTE_SECOND = "m:ss", + HOUR_MINUTE = "H:mm", + HOUR_MINUTE_SECOND = "H:mm:ss", + HOUR12_MINUTE = "h:mm", + HOUR12_MINUTE_SECOND = "H:mm:ss", + DAY = "d", + MONTH = "L", + ABBR_MONTH = "LLL", + YEAR = "yyyy", + MONTH_DAY = "MMMM d", + ABBR_MONTH_DAY = "MMM d", + NUM_MONTH_DAY = "M/d", + WEEKDAY_MONTH_DAY = "E MMMM d", + WEEKDAY_ABBR_MONTH_DAY = "E MMM d", + WEEKDAY_NUM_MONTH_DAY = "E, M-d", + MONTH_YEAR = "MMMM yyyy", + NUM_MONTH_YEAR = "M/yyyy", + ABBR_MONTH_YEAR = "MMM yyyy", + WEEKDAY_NUM_MONTH_DAY_YEAR = "EEE, M/d/yyyy", + WEEKDAY_ABBR_MONTH_DAY_YEAR = "EEE, MMM d yyyy", + QUARTER_YEAR = "QQQ yyyy", + ABBR_QUARTER_YEAR = "Q yyyy"; public static final class CurrencyFilter { - public static CurrencyFilter onRegion(String region) { return new CurrencyFilter(); } - public static CurrencyFilter onCurrency(String currency) { return new CurrencyFilter(); } - public static CurrencyFilter onFromDate(Date date) { return new CurrencyFilter(); } - public static CurrencyFilter onToDate(Date date) { return new CurrencyFilter(); } - - public CurrencyFilter withRegion(String region) { return this; } - public CurrencyFilter withCurrency(String currency) { return this; } - public CurrencyFilter withFromDate(Date date) { return this; } - public CurrencyFilter withToDate(Date date) { return this; } - } + public static CurrencyFilter 
onRegion(String region) { + return new CurrencyFilter(); + } + public static CurrencyFilter onCurrency(String currency) { + return new CurrencyFilter(); + } + + public static CurrencyFilter onFromDate(Date date) { + return new CurrencyFilter(); + } + + public static CurrencyFilter onToDate(Date date) { + return new CurrencyFilter(); + } + + public CurrencyFilter withRegion(String region) { + return this; + } + + public CurrencyFilter withCurrency(String currency) { + return this; + } + + public CurrencyFilter withFromDate(Date date) { + return this; + } + + public CurrencyFilter withToDate(Date date) { + return this; + } + } } diff --git a/unicodetools/src/main/java/org/unicode/draft/IcuCache.java b/unicodetools/src/main/java/org/unicode/draft/IcuCache.java index ef4da5574..48bffad2e 100644 --- a/unicodetools/src/main/java/org/unicode/draft/IcuCache.java +++ b/unicodetools/src/main/java/org/unicode/draft/IcuCache.java @@ -1,13 +1,15 @@ package org.unicode.draft; -import java.lang.ref.SoftReference; -import java.util.concurrent.ConcurrentHashMap; import com.ibm.icu.text.Collator; +import java.lang.ref.SoftReference; +import java.util.concurrent.ConcurrentHashMap; +public abstract class IcuCache { + private final ConcurrentHashMap> cache = + new ConcurrentHashMap>(); -public abstract class IcuCache { - private final ConcurrentHashMap> cache = new ConcurrentHashMap>(); protected abstract V getInstance(K key); + public V get(K key) { // get the value from the cache if possible; // otherwise, create from getInstance and add to cache @@ -15,12 +17,13 @@ public V get(K key) { } static { - final IcuCache SINGLETON = new IcuCache(){ - @Override - protected Collator getInstance(String key) { - // generate the collator corresponding to the string - return null; - } - }; + final IcuCache SINGLETON = + new IcuCache() { + @Override + protected Collator getInstance(String key) { + // generate the collator corresponding to the string + return null; + } + }; } } diff --git 
a/unicodetools/src/main/java/org/unicode/draft/IdnaFrequency.java b/unicodetools/src/main/java/org/unicode/draft/IdnaFrequency.java index cc1311bca..5829399dc 100644 --- a/unicodetools/src/main/java/org/unicode/draft/IdnaFrequency.java +++ b/unicodetools/src/main/java/org/unicode/draft/IdnaFrequency.java @@ -1,5 +1,12 @@ package org.unicode.draft; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UProperty.NameChoice; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -12,39 +19,20 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.Counter; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UProperty.NameChoice; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - public class IdnaFrequency { private static final Charset LATIN1 = Charset.forName("8859-1"); /** - * cm000162 3 481 31032 6 22 \xc5\xa2\xc4\x98\xc4\x82M\xc4\x8c\xc5\x96@\xc5\xb9\xc5\xb8.yoll.net 278 41991 0 29 - * home.\xc5\xa2replug.net - * 0 c = child, p=parent; m = mapped, u=unmapped; 000162 hex code point - * 1 3 - count - * 2 481 - navboost - * 3 31032 - page rank - * 4 6 - language - * 5 22 - encoding - * 6 url in utf-8 (c-style byte escapes) - * 7,8,9,10 - * 11 url - * 12,13,14,15 - * 16 url - * ... 
- * + * cm000162 3 481 31032 6 22 \xc5\xa2\xc4\x98\xc4\x82M\xc4\x8c\xc5\x96@\xc5\xb9\xc5\xb8.yoll.net + * 278 41991 0 29 home.\xc5\xa2replug.net 0 c = child, p=parent; m = mapped, u=unmapped; 000162 + * hex code point 1 3 - count 2 481 - navboost 3 31032 - page rank 4 6 - language 5 22 - + * encoding 6 url in utf-8 (c-style byte escapes) 7,8,9,10 11 url 12,13,14,15 16 url ... + * * @param args * @throws IOException */ @@ -55,10 +43,17 @@ public static void main(String[] args) throws IOException { String norm = normalize(cp); if (!norm.equals(UTF16.valueOf(cp))) { final int dt = UCharacter.getIntPropertyValue(cp, UProperty.DECOMPOSITION_TYPE); - final String tdName = UCharacter - .getPropertyValueName(UProperty.DECOMPOSITION_TYPE, dt, NameChoice.LONG); - System.out.println(charTotal.getCount(cp) + "\t" + tdName + "\t" + getCodeAndName(cp) + "\t=>\t" - + getCodeAndName(norm)); + final String tdName = + UCharacter.getPropertyValueName( + UProperty.DECOMPOSITION_TYPE, dt, NameChoice.LONG); + System.out.println( + charTotal.getCount(cp) + + "\t" + + tdName + + "\t" + + getCodeAndName(cp) + + "\t=>\t" + + getCodeAndName(norm)); } } } @@ -66,8 +61,12 @@ public static void main(String[] args) throws IOException { static UnicodeSet testchars = new UnicodeSet("[[:script=greek:]ÄäÖöÜüß]"); public static Counter getData(boolean writeOut) throws IOException { - BufferedReader in = FileUtilities.openUTF8Reader("", CldrUtility.getProperty("idnaFrequency")); - PrintWriter out = !writeOut ? null : FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, "idn41-data.txt"); + BufferedReader in = + FileUtilities.openUTF8Reader("", CldrUtility.getProperty("idnaFrequency")); + PrintWriter out = + !writeOut + ? 
null + : FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, "idn41-data.txt"); if (writeOut) { out.write((char) 0xFEFF); } @@ -75,7 +74,7 @@ public static Counter getData(boolean writeOut) throws IOException { // Mapper encodingMapper = new Mapper(Encoding.values()); Counter charTotal = new Counter(); - for (int counter = 0;; ++counter) { + for (int counter = 0; ; ++counter) { String line = in.readLine(); if (line == null) { break; @@ -121,9 +120,11 @@ private static void showCharsets() { } } - static UnicodeSet diSet = new UnicodeSet( - "[\\u034F \\u180B-\\u180D\\u200B-\\u200F\\u202A-\\u202E\\u2060-\\u2064\\u206A-\\u206F \\uFE00-\\uFE0F\\uFEFF\\U0001D173-\\U0001D17A\\U000E0001\\U000E0020-\\U000E007F \\U000E0100-\\U000E01EF \\u00AD \\u17B4 \\u17B5 \\u115F \\u1160\\u3164\\uFFA0 \\u2065-\\u2069 \\uFFF0-\\uFFF8]"); - static Matcher defaultIgnorables = Pattern.compile(diSet.toPattern(false), Pattern.COMMENTS).matcher(""); + static UnicodeSet diSet = + new UnicodeSet( + "[\\u034F \\u180B-\\u180D\\u200B-\\u200F\\u202A-\\u202E\\u2060-\\u2064\\u206A-\\u206F \\uFE00-\\uFE0F\\uFEFF\\U0001D173-\\U0001D17A\\U000E0001\\U000E0020-\\U000E007F \\U000E0100-\\U000E01EF \\u00AD \\u17B4 \\u17B5 \\u115F \\u1160\\u3164\\uFFA0 \\u2065-\\u2069 \\uFFF0-\\uFFF8]"); + static Matcher defaultIgnorables = + Pattern.compile(diSet.toPattern(false), Pattern.COMMENTS).matcher(""); private static String normalize(int cp) { String a = Normalizer.normalize(cp, Normalizer.NFKC); @@ -134,8 +135,12 @@ private static String normalize(int cp) { } private static String getCodeAndName(int cp) { - return "U+" + Utility.hex(cp, 4) + "\t( " + com.ibm.icu.text.UTF16.valueOf(cp) + " )\t" - + UCharacter.getName(cp); + return "U+" + + Utility.hex(cp, 4) + + "\t( " + + com.ibm.icu.text.UTF16.valueOf(cp) + + " )\t" + + UCharacter.getName(cp); } private static String getCodeAndName(String cp) { @@ -156,30 +161,30 @@ private static String unescape(String string, Encoding encoding) throws IOExcept for (int i = 0; i < 
string.length(); ++i) { char b = string.charAt(i); switch (state) { - case 0: - if (b == '\\') { - state = 1; - } else { - out.write(b); - } - break; - case 1: - if (b != 'x') { - out.write(b); + case 0: + if (b == '\\') { + state = 1; + } else { + out.write(b); + } + break; + case 1: + if (b != 'x') { + out.write(b); + state = 0; + } else { + state = 2; + } + break; + case 2: + chBuffer = getNybble(b) << 4; + state = 3; + break; + case 3: + chBuffer |= getNybble(b); + out.write((byte) chBuffer); state = 0; - } else { - state = 2; - } - break; - case 2: - chBuffer = getNybble(b) << 4; - state = 3; - break; - case 3: - chBuffer |= getNybble(b); - out.write((byte) chBuffer); - state = 0; - break; + break; } } out.close(); diff --git a/unicodetools/src/main/java/org/unicode/draft/IdnaLabelTester2.java b/unicodetools/src/main/java/org/unicode/draft/IdnaLabelTester2.java index 2f77e6b21..8820ef556 100644 --- a/unicodetools/src/main/java/org/unicode/draft/IdnaLabelTester2.java +++ b/unicodetools/src/main/java/org/unicode/draft/IdnaLabelTester2.java @@ -1,5 +1,26 @@ package org.unicode.draft; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.dev.util.UnicodeMapIterator; +import com.ibm.icu.impl.Punycode; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.impl.Row.R5; +import com.ibm.icu.impl.UnicodeRegex; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.IDNA; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.StringPrep; +import com.ibm.icu.text.StringPrepParseException; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.VersionInfo; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -18,7 +39,6 @@ import 
java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.Counter; @@ -30,34 +50,15 @@ import org.unicode.cldr.util.XEquivalenceClass; import org.unicode.text.utility.Settings; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.dev.util.UnicodeMapIterator; -import com.ibm.icu.impl.Punycode; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.impl.Row.R5; -import com.ibm.icu.impl.UnicodeRegex; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.IDNA; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.StringPrep; -import com.ibm.icu.text.StringPrepParseException; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; -import com.ibm.icu.util.VersionInfo; - public class IdnaLabelTester2 { private static boolean VERBOSE = false; enum Result { - none, next, next2, fail + none, + next, + next2, + fail }; static class Rule { @@ -73,22 +74,36 @@ static class Rule { public String toString() { return "{Rule " - + (before == null ? "" : "before: " + beforeString + ", ") - + "at: " + atString - + ", result: " + result - + ", line: " + lineNumber - + ", title: " + title + "}"; + + (before == null ? 
"" : "before: " + beforeString + ", ") + + "at: " + + atString + + ", result: " + + result + + ", line: " + + lineNumber + + ", title: " + + title + + "}"; } - public Rule(String before, String at, String result, String title, int lineNumber, VariableReplacer variables) { + public Rule( + String before, + String at, + String result, + String title, + int lineNumber, + VariableReplacer variables) { beforeString = before; if (before != null) { before = variables.replace(before.trim()); } - this.before = before == null || before == "" ? null - : Pattern.compile(".*" + UnicodeRegex.fix(before), Pattern.COMMENTS).matcher(""); // hack, because Java - // doesn't have - // lookingBefore + this.before = + before == null || before == "" + ? null + : Pattern.compile(".*" + UnicodeRegex.fix(before), Pattern.COMMENTS) + .matcher(""); // hack, because Java + // doesn't have + // lookingBefore atString = at; at = variables.replace(at.trim()); this.at = Pattern.compile(UnicodeRegex.fix(at), Pattern.COMMENTS).matcher(""); @@ -126,7 +141,8 @@ public void setLabel(String label) { private List rules = new ArrayList(); - private static final UnicodeSet GRAPHIC = new UnicodeSet("[^[:cn:][:co:][:cs:][:cc:]]").freeze(); + private static final UnicodeSet GRAPHIC = + new UnicodeSet("[^[:cn:][:co:][:cs:][:cc:]]").freeze(); private static final UnicodeSet NOT_NFKC_CASE_FOLD = computeNotNfkcCaseFold(); VariableReplacer variables = new VariableReplacer(); @@ -135,7 +151,7 @@ public IdnaLabelTester2(String file) throws IOException { BufferedReader in = openFile(file); String title = "???"; - for (int lineCount = 1;; ++lineCount) { + for (int lineCount = 1; ; ++lineCount) { String line = in.readLine(); try { if (line == null) break; @@ -172,12 +188,18 @@ public IdnaLabelTester2(String file) throws IOException { System.out.println("Warning: contains $ " + variable + "\t=\t" + value); } // small hack, because this property isn't in ICU until 5.2 - UnicodeSet s = value.equals("[:^nfkc_casefolded:]") 
- ? NOT_NFKC_CASE_FOLD - : new UnicodeSet(value).complement().complement(); + UnicodeSet s = + value.equals("[:^nfkc_casefolded:]") + ? NOT_NFKC_CASE_FOLD + : new UnicodeSet(value).complement().complement(); System.out.println(variable + "\tcontains 20000\t" + s.contains(0x20000)); if (VERBOSE) { - System.out.println("{Variable: " + variable + ", value: " + toPattern(s, true) + "}"); + System.out.println( + "{Variable: " + + variable + + ", value: " + + toPattern(s, true) + + "}"); } variables.add(variable, toPattern(s, false)); continue; @@ -192,18 +214,18 @@ public IdnaLabelTester2(String file) throws IOException { // } String before, at, result; switch (pieces.length) { - case 2: - before = null; - at = pieces[0]; - result = pieces[1]; - break; - case 3: - before = pieces[0]; - at = pieces[1]; - result = pieces[2]; - break; - default: - throw new IllegalArgumentException(line + " => " + Arrays.asList(pieces)); + case 2: + before = null; + at = pieces[0]; + result = pieces[1]; + break; + case 3: + before = pieces[0]; + at = pieces[1]; + result = pieces[2]; + break; + default: + throw new IllegalArgumentException(line + " => " + Arrays.asList(pieces)); } Rule rule = new Rule(before, at, result, title, lineCount, variables); if (VERBOSE) { @@ -211,18 +233,25 @@ public IdnaLabelTester2(String file) throws IOException { } rules.add(rule); } catch (Exception e) { - throw (RuntimeException) new IllegalArgumentException("Error on line: " + lineCount + ".\t" + line) - .initCause(e); + throw (RuntimeException) + new IllegalArgumentException("Error on line: " + lineCount + ".\t" + line) + .initCause(e); } } in.close(); } - // 248C ; 0035 002E ; MA #* ( ⒌ → 5. ) DIGIT FIVE FULL STOP → DIGIT FIVE, FULL STOP # {nfkc:9357} + // 248C ; 0035 002E ; MA #* ( ⒌ → 5. 
) DIGIT FIVE FULL STOP → DIGIT FIVE, FULL STOP # + // {nfkc:9357} public static XEquivalenceClass getConfusables() throws IOException { XEquivalenceClass result = new XEquivalenceClass(); - BufferedReader in = openFile(Settings.CLDR.UCD_DATA_DIRECTORY + "security/" + Settings.latestVersion + "/confusables.txt"); + BufferedReader in = + openFile( + Settings.CLDR.UCD_DATA_DIRECTORY + + "security/" + + Settings.latestVersion + + "/confusables.txt"); String original = null; try { while (true) { @@ -262,7 +291,9 @@ private static UnicodeSet computeNotNfkcCaseFold() { System.out.println("debug??"); } int type = UCharacter.getType(i); - if (type == UCharacter.UNASSIGNED || type == UCharacter.SURROGATE || type == UCharacter.PRIVATE_USE) { + if (type == UCharacter.UNASSIGNED + || type == UCharacter.SURROGATE + || type == UCharacter.PRIVATE_USE) { // result.add(i); continue; } @@ -276,14 +307,19 @@ private static UnicodeSet computeNotNfkcCaseFold() { return result.freeze(); } - static String removals = new UnicodeSet("[\u1806[:di:]-[:cn:]]").complement().complement().toPattern(false); + static String removals = + new UnicodeSet("[\u1806[:di:]-[:cn:]]").complement().complement().toPattern(false); static Matcher rem = Pattern.compile(removals).matcher(""); private static FrequencyData2 frequencies; - private static String NFKC_CaseFold(int i, Normalizer.Mode mode, boolean onlyLower, boolean keepDI) { + private static String NFKC_CaseFold( + int i, Normalizer.Mode mode, boolean onlyLower, boolean keepDI) { String nfkc = Normalizer.normalize(i, mode); - String case_nfkc = onlyLower ? UCharacter.toLowerCase(ULocale.ROOT, nfkc) : UCharacter.foldCase(nfkc, true); + String case_nfkc = + onlyLower + ? 
UCharacter.toLowerCase(ULocale.ROOT, nfkc) + : UCharacter.foldCase(nfkc, true); String nfkc_case_nfkc = Normalizer.normalize(case_nfkc, mode); if (keepDI) return nfkc_case_nfkc; return rem.reset(nfkc_case_nfkc).replaceAll(""); @@ -291,12 +327,12 @@ private static String NFKC_CaseFold(int i, Normalizer.Mode mode, boolean onlyLow private static boolean equals(String string, int codePoint) { switch (string.length()) { - case 1: - return codePoint == string.charAt(0); - case 2: - return codePoint >= 0x10000 && codePoint == string.codePointAt(0); - default: - return false; + case 1: + return codePoint == string.charAt(0); + case 2: + return codePoint >= 0x10000 && codePoint == string.codePointAt(0); + default: + return false; } } @@ -306,7 +342,8 @@ private static BufferedReader openFile(String file) throws IOException { try { File file1 = new File(file); // System.out.println("Reading:\t" + file1.getCanonicalPath()); - return new BufferedReader(new InputStreamReader(new FileInputStream(file1), UTF8), 1024 * 64); + return new BufferedReader( + new InputStreamReader(new FileInputStream(file1), UTF8), 1024 * 64); } catch (Exception e) { File f = new File(file); throw new IllegalArgumentException("Bad file name: " + f.getCanonicalPath()); @@ -318,25 +355,27 @@ private static boolean startsWithIgnoreCase(String line, final String string) { return line.toLowerCase(Locale.ENGLISH).startsWith(string.toLowerCase(Locale.ENGLISH)); } - public static final UnicodeSet TO_QUOTE = new UnicodeSet("[[:z:][:me:][:mn:][:di:][:c:]-[\u0020]]"); + public static final UnicodeSet TO_QUOTE = + new UnicodeSet("[[:z:][:me:][:mn:][:di:][:c:]-[\u0020]]"); public static final Transliterator UNESCAPER = Transliterator.getInstance("hex-any"); public static final Transliterator ESCAPER = Transliterator.getInstance("any-hex"); + static { ESCAPER.setFilter(TO_QUOTE); } - private static final UnicodeSetPrettyPrinter PRETTY_PRINTER = new UnicodeSetPrettyPrinter() - 
.setOrdering(Collator.getInstance(ULocale.ROOT)) - .setSpaceComparator(Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY)) - .setToQuote(TO_QUOTE) - .setOrdering(null) - .setSpaceComparator(null); + private static final UnicodeSetPrettyPrinter PRETTY_PRINTER = + new UnicodeSetPrettyPrinter() + .setOrdering(Collator.getInstance(ULocale.ROOT)) + .setSpaceComparator( + Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY)) + .setToQuote(TO_QUOTE) + .setOrdering(null) + .setSpaceComparator(null); private static String toPattern(UnicodeSet s, boolean escape) { - return !escape - ? s.toPattern(false) - : PRETTY_PRINTER.format(s); + return !escape ? s.toPattern(false) : PRETTY_PRINTER.format(s); } // ==================== Test Code ======================= @@ -354,13 +393,11 @@ public TestStatus(int position, String title, int lineNumber) { } /** - * Test a label; null for success. - * Later, return information. - * + * Test a label; null for success. Later, return information. 
+ * * @param label * @return */ - public TestStatus test(String label) { // initialize for (Rule rule : rules) { @@ -375,12 +412,12 @@ public TestStatus test(String label) { // handle the skipping switch (rule.result) { - case fail: - if (skipOverFail || skipOverFailAndNext2) continue; - break; - case next2: - if (skipOverFailAndNext2) continue; - break; + case fail: + if (skipOverFail || skipOverFailAndNext2) continue; + break; + case next2: + if (skipOverFailAndNext2) continue; + break; } skipOverFail = false; skipOverFailAndNext2 = false; @@ -389,28 +426,28 @@ public TestStatus test(String label) { Result result = rule.match(i); switch (result) { - case next: - if (VERBOSE) { - rule.match(i); - } - skipOverFailAndNext2 = true; - break; - case next2: - if (VERBOSE) { - rule.match(i); - } - skipOverFail = true; - break; - case fail: - if (VERBOSE) { - rule.match(i); - } - return new TestStatus(i, rule.title, rule.lineNumber); - default: - if (VERBOSE) { - rule.match(i); - } - break; + case next: + if (VERBOSE) { + rule.match(i); + } + skipOverFailAndNext2 = true; + break; + case next2: + if (VERBOSE) { + rule.match(i); + } + skipOverFail = true; + break; + case fail: + if (VERBOSE) { + rule.match(i); + } + return new TestStatus(i, rule.title, rule.lineNumber); + default: + if (VERBOSE) { + rule.match(i); + } + break; } } } @@ -427,7 +464,8 @@ public static void main(String[] args) throws Exception { showPunycode("weltfussball".toUpperCase()); showPunycode("weltfuẞball".toUpperCase()); System.out.println("γιατρός\t" + Punycode.encode(new StringBuffer("γιατρός"), null)); - System.out.println("weltfussball\t" + Punycode.encode(new StringBuffer("weltfussball"), null)); + System.out.println( + "weltfussball\t" + Punycode.encode(new StringBuffer("weltfussball"), null)); String dir = CLDRPaths.BASE_DIRECTORY + "tools/java/org/unicode/cldr/draft/"; IdnaLabelTester2 tester = new IdnaLabelTester2(dir + "idnaContextRules.txt"); @@ -439,7 +477,7 @@ public static void 
main(String[] args) throws Exception { int successes = 0; boolean firstTestLine = true; - for (int lineCount = 1;; ++lineCount) { + for (int lineCount = 1; ; ++lineCount) { String line = in.readLine(); if (line == null) break; int commentPos = line.indexOf("#"); @@ -478,7 +516,8 @@ public static void main(String[] args) throws Exception { if (firstTestLine) { if (VERBOSE) { - System.out.println("# Test lines are in the form . ;"); + System.out.println( + "# Test lines are in the form . ;"); } firstTestLine = false; } @@ -514,10 +553,14 @@ public static void main(String[] args) throws Exception { successes++; } if (showLine) { - System.out.println(ESCAPER.transform(line.substring(0, result.position)) - + "\u2639" + ESCAPER.transform(line.substring(result.position)) - + "\t\t" + result.title - + "; \tRuleLine: " + result.ruleLine); + System.out.println( + ESCAPER.transform(line.substring(0, result.position)) + + "\u2639" + + ESCAPER.transform(line.substring(result.position)) + + "\t\t" + + result.title + + "; \tRuleLine: " + + result.ruleLine); } } } @@ -527,8 +570,7 @@ public static void main(String[] args) throws Exception { } private static void showPunycode(String string) throws StringPrepParseException { - System.out.println(string + - "\t" + Punycode.encode(new StringBuffer(string), null)); + System.out.println(string + "\t" + Punycode.encode(new StringBuffer(string), null)); } private static void checkMapIterator() { @@ -536,7 +578,8 @@ private static void checkMapIterator() { // foo.putAll(new UnicodeSet("[:cc:]"), " control"); foo.putAll(new UnicodeSet("[:Lu:]"), " upper"); foo.putAll(new UnicodeSet("[:Ll:]"), " lower"); - for (UnicodeMapIterator it = new UnicodeMapIterator(foo); it.nextRange();) { + for (UnicodeMapIterator it = new UnicodeMapIterator(foo); + it.nextRange(); ) { String codepointsHex = Utility.hex(it.codepoint, 4); if (it.codepoint != it.codepointEnd) { codepointsHex += ".." 
+ Utility.hex(it.codepointEnd, 4); @@ -549,8 +592,10 @@ private void checkPatrik() throws IOException { UnicodeMap mine = new UnicodeMap(); UnicodeSet contextj = new UnicodeSet(variables.replace("$JoinControl")).freeze(); - UnicodeSet contexto = new UnicodeSet(variables.replace("$Context")).removeAll(contextj).freeze(); - UnicodeSet unassigned = new UnicodeSet(variables.replace("$Unassigned")).removeAll(contextj).freeze(); + UnicodeSet contexto = + new UnicodeSet(variables.replace("$Context")).removeAll(contextj).freeze(); + UnicodeSet unassigned = + new UnicodeSet(variables.replace("$Unassigned")).removeAll(contextj).freeze(); UnicodeSet valid = new UnicodeSet(variables.replace("$Valid")).freeze(); UnicodeSet valid2 = new UnicodeSet(variables.replace("$Valid2")).freeze(); boolean valid2ok = valid.equals(valid2); @@ -560,17 +605,22 @@ private void checkPatrik() throws IOException { System.out.println("valid2-valid:" + new UnicodeSet(valid2).removeAll(valid)); } UnicodeSet pvalid = new UnicodeSet(valid).removeAll(contexto).removeAll(contextj); - UnicodeMap> myLines = new UnicodeMap>(); + UnicodeMap> myLines = + new UnicodeMap>(); mine.putAll(contextj, IdnaStatus.CONTEXTJ); mine.putAll(contexto, IdnaStatus.CONTEXTO); - mine.putAll(new UnicodeSet(valid).removeAll(contexto).removeAll(contextj), IdnaStatus.PVALID); + mine.putAll( + new UnicodeSet(valid).removeAll(contexto).removeAll(contextj), IdnaStatus.PVALID); mine.putAll(unassigned, IdnaStatus.UNASSIGNED); - mine.putAll(new UnicodeSet(variables.replace("$Invalid")).removeAll(unassigned), IdnaStatus.DISALLOWED); + mine.putAll( + new UnicodeSet(variables.replace("$Invalid")).removeAll(unassigned), + IdnaStatus.DISALLOWED); // $Context = [$ExceptionContexto $BackwardCompatibleContexto $JoinControl] // $ValidAlways = [$ExceptionPvalid $BackwardCompatiblePvalid $LDH] - // $InvalidLetterDigits = [$ExceptionDisallowed $BackwardCompatibleDisallowed $Unassigned $Unstable + // $InvalidLetterDigits = [$ExceptionDisallowed 
$BackwardCompatibleDisallowed $Unassigned + // $Unstable // $IgnorableProperties $IgnorableBlocks $OldHangulJamo] // $Valid = [$ValidAlways $Context [$LetterDigits - $InvalidLetterDigits]] @@ -579,7 +629,7 @@ private void checkPatrik() throws IOException { BufferedReader in = openFile("../DATA/IDN/idna-calculation.txt"); /* * The table has the following format: - * + * * 000020; DISALLOWED # C : Zs : NOK : HOSTNAME # SPACE * Unicode codepoint * IDNA2008 property value @@ -607,10 +657,11 @@ private void checkPatrik() throws IOException { String s = UTF16.valueOf(cp); IdnaStatus idna2008 = IdnaStatus.valueOf(parts[1]); - //String rule2008 = parts[2]; + // String rule2008 = parts[2]; String gcPatrik = parts[3]; - //IdnaStatus idna2003out = parts[4].equals("OK") ? IdnaStatus.PVALID : IdnaStatus.DISALLOWED; - //String idna2003why = parts[5]; + // IdnaStatus idna2003out = parts[4].equals("OK") ? IdnaStatus.PVALID : + // IdnaStatus.DISALLOWED; + // String idna2003why = parts[5]; String cpName = parts[6]; String diff = ""; @@ -620,12 +671,15 @@ private void checkPatrik() throws IOException { } String myName = getName(cp); - if (gc == UCharacter.UNASSIGNED || gc == UCharacter.PRIVATE_USE || gc == UCharacter.SURROGATE - || gc == UCharacter.CONTROL) { + if (gc == UCharacter.UNASSIGNED + || gc == UCharacter.PRIVATE_USE + || gc == UCharacter.SURROGATE + || gc == UCharacter.CONTROL) { // do nothing } else { - if (!myName.equals(cpName) && !myName.startsWith("CJK UNIFIED IDEOGRAPH-") - && !myName.startsWith("HANGUL SYLLABLE ")) { + if (!myName.equals(cpName) + && !myName.startsWith("CJK UNIFIED IDEOGRAPH-") + && !myName.startsWith("HANGUL SYLLABLE ")) { diff += "name; "; } } @@ -643,7 +697,9 @@ private void checkPatrik() throws IOException { if (idna2008map.equals("")) { idna2008map = s; } - if (my2008 == IdnaStatus.DISALLOWED && valid.containsAll(idna2008map) && !idna2008map.equals(s)) { + if (my2008 == IdnaStatus.DISALLOWED + && valid.containsAll(idna2008map) + && 
!idna2008map.equals(s)) { my2008 = IdnaStatus.REMAP; } else { idna2008map = "\uE000"; @@ -654,17 +710,19 @@ private void checkPatrik() throws IOException { idna2003map = "\uE000"; } - // String predicate = ";\t" + my2008 + ";\t" + idna2008map + ";\t" + myIdna2003 + ";\t" + idna2003map; + // String predicate = ";\t" + my2008 + ";\t" + idna2008map + ";\t" + myIdna2003 + + // ";\t" + idna2003map; // String myLine = hex4 + predicate + ";\t" + myGc + ";\t" + myName; - R5 row = Row.of(my2008, idna2008map, myIdna2003, - idna2003map, UScript.getScript(cp)); + R5 row = + Row.of(my2008, idna2008map, myIdna2003, idna2003map, UScript.getScript(cp)); myLines.put(cp, row); // if (diff.length() != 0) { // System.out.println(line + "\n≠\t" + myLine + "\n#\tdiff:\t" + diff); // } } catch (Exception e) { - throw (RuntimeException) new IllegalArgumentException("EXCEPTION with:\t" + line).initCause(e); + throw (RuntimeException) + new IllegalArgumentException("EXCEPTION with:\t" + line).initCause(e); } } in.close(); @@ -688,9 +746,7 @@ public String toString() { return count + "\t" + countWeighted + "\t" + countWeightedIdna + "\t" + samples; } - /** - * Creates new - */ + /** Creates new */ ConfusableData minus(ConfusableData other) { ConfusableData result = new ConfusableData(); result.count = count - other.count; @@ -714,7 +770,8 @@ private void countConfusables(UnicodeSet pvalid, UnicodeSet contexto) throws IOE XEquivalenceClass equivs = getConfusables(); // int i = 0; // for (String sample : equivs) { - // System.out.println((i++) + "\t" + Utility.hex(sample) + "\t" + equivs.getEquivalences(sample)); + // System.out.println((i++) + "\t" + Utility.hex(sample) + "\t" + + // equivs.getEquivalences(sample)); // } ConfusableData valid = new ConfusableData(); @@ -728,8 +785,13 @@ private void countConfusables(UnicodeSet pvalid, UnicodeSet contexto) throws IOE pvalid = new UnicodeSet(pvalid).addAll(syntax).freeze(); contexto = new UnicodeSet(contexto).removeAll(syntax).freeze(); 
UnicodeSet pvalidWithContexto = new UnicodeSet(pvalid).addAll(contexto); - UnicodeSet allTest = new UnicodeSet(pvalid).addAll(contexto).addAll(idna2003Valid).addAll(syntax) - .removeAll(new UnicodeSet("[:ideographic:]")).freeze(); + UnicodeSet allTest = + new UnicodeSet(pvalid) + .addAll(contexto) + .addAll(idna2003Valid) + .addAll(syntax) + .removeAll(new UnicodeSet("[:ideographic:]")) + .freeze(); for (String item : allTest) { int codePoint = item.codePointAt(0); long weight = frequencies.getCount(codePoint); @@ -753,10 +815,12 @@ private void countConfusables(UnicodeSet pvalid, UnicodeSet contexto) throws IOE if ((pvalid.containsAll(item) || inSyntax) && pvalid.containsAll(item2)) { has_pvalid_pvalid = true; } - if ((pvalidWithContexto.containsAll(item) || inSyntax) && pvalidWithContexto.containsAll(item2)) { + if ((pvalidWithContexto.containsAll(item) || inSyntax) + && pvalidWithContexto.containsAll(item2)) { has_cvalid_cvalid = true; } - if ((idna2003Valid.containsAll(item) || inSyntax) && idna2003Valid.containsAll(item2)) { + if ((idna2003Valid.containsAll(item) || inSyntax) + && idna2003Valid.containsAll(item2)) { has_pvalid3_pvalid3 = true; } if ((pvalid.containsAll(item) || inSyntax) && idna2003Valid.containsAll(item2)) { @@ -788,38 +852,50 @@ private void countConfusables(UnicodeSet pvalid, UnicodeSet contexto) throws IOE } enum Diff { - same, warn, bad + same, + warn, + bad }; - private void printFullComparison(UnicodeMap> myLines) - throws IOException { - Tabber tabber = new Tabber.MonoTabber() - .add(12, Tabber.LEFT) // code - .add(12, Tabber.CENTER) // chars - .add(12, Tabber.LEFT) // idna2008 - .add(12, Tabber.LEFT) // map - .add(12, Tabber.LEFT) // idna2003 - .add(12, Tabber.LEFT) // map - .add(12, Tabber.LEFT) // gc - .add(7, Tabber.LEFT) // name - ; + private void printFullComparison( + UnicodeMap> myLines) + throws IOException { + Tabber tabber = + new Tabber.MonoTabber() + .add(12, Tabber.LEFT) // code + .add(12, Tabber.CENTER) // chars + 
.add(12, Tabber.LEFT) // idna2008 + .add(12, Tabber.LEFT) // map + .add(12, Tabber.LEFT) // idna2003 + .add(12, Tabber.LEFT) // map + .add(12, Tabber.LEFT) // gc + .add(7, Tabber.LEFT) // name + ; HTMLTabber htmlTabber = new Tabber.HTMLTabber(); // 003A..0040;DISALLOWED;;DISALLOWED;; Po Sm;COLON..COMMERCIAL AT // 0041;REMAP; 0061; REMAP; 0061; Lu; LATIN CAPITAL LETTER A - PrintWriter out = FileUtilities.openUTF8Writer(org.unicode.cldr.util.CldrUtility.getProperty("out"), "idna-info.txt"); - PrintWriter out2 = FileUtilities.openUTF8Writer(org.unicode.cldr.util.CldrUtility.getProperty("out"), "idna-info-tab.txt"); - PrintWriter out3 = FileUtilities.openUTF8Writer(org.unicode.cldr.util.CldrUtility.getProperty("out"), "idna-info.html"); - out3.println("\n" + - "\n" + - "\n" + - "IDNA Info\n" + - "\n" + - "\n" + - "\n" + - "

Key

" + - "
"); out.print(entry.get0()); for (final ReadingRows heading : values) { out.print(""); String column = entry.get1().get(heading); - column = column.replace(";",""); + column = column.replace(";", ""); out.print(column); } out.println("
"); - String title = "D\tCode Points\tChars\tIdna2008\tMap\tIdna2003\tMap\tScript\tGCs\tDescription"; + PrintWriter out = + FileUtilities.openUTF8Writer( + org.unicode.cldr.util.CldrUtility.getProperty("out"), "idna-info.txt"); + PrintWriter out2 = + FileUtilities.openUTF8Writer( + org.unicode.cldr.util.CldrUtility.getProperty("out"), "idna-info-tab.txt"); + PrintWriter out3 = + FileUtilities.openUTF8Writer( + org.unicode.cldr.util.CldrUtility.getProperty("out"), "idna-info.html"); + out3.println( + "\n" + + "\n" + + "\n" + + "IDNA Info\n" + + "\n" + + "\n" + + "\n" + + "

Key

" + + "
"); + String title = + "D\tCode Points\tChars\tIdna2008\tMap\tIdna2003\tMap\tScript\tGCs\tDescription"; out.println(tabber.process(title)); out2.println(title); htmlTabber.setElement("th"); @@ -828,8 +904,10 @@ private void printFullComparison(UnicodeMap gcs = new LinkedHashSet(); Set scripts = new LinkedHashSet(); - for (UnicodeMapIterator> it = new UnicodeMapIterator>( - myLines); it.nextRange();) { + for (UnicodeMapIterator> it = + new UnicodeMapIterator>( + myLines); + it.nextRange(); ) { String codepointsHex = Utility.hex(it.codepoint, 4); String myName = getName(it.codepoint); String myGc = getGc(it.codepoint); @@ -838,7 +916,10 @@ private void printFullComparison(UnicodeMap\n" + - "\n" + - ""); + out3.println("
\n" + "\n" + ""); out.close(); out2.close(); out3.close(); @@ -899,17 +1001,22 @@ private String shortStringForSet(Set gcs, String myGc, int limit) { if (myGc.length() != 0) myGc += " "; myGc += item; } - if (limit < gcs.size()) { + if (limit < gcs.size()) {} - } myGc = "{" + myGc + "}"; } return myGc; } - private String getline(Diff diff, String codepointsHex, String chars, - R5 value, - String myScript, String myGc, String myName, boolean fixHex) { + private String getline( + Diff diff, + String codepointsHex, + String chars, + R5 value, + String myScript, + String myGc, + String myName, + boolean fixHex) { IdnaStatus v8 = value.get0(); String m8 = value.get1(); IdnaStatus v3 = value.get2(); @@ -921,24 +1028,42 @@ private String getline(Diff diff, String codepointsHex, String chars, sep = "\t"; } - String m8a = v8 != IdnaStatus.REMAP ? "n/a" : - m8.length() == 0 ? "delete" : - hex(m8, fixHex); - String m3a = v3 != IdnaStatus.REMAP ? "n/a" : - v8 == IdnaStatus.REMAP && m8.equals(m3) ? "~" : - m3.length() == 0 ? "delete" : - hex(m3, fixHex); + String m8a = v8 != IdnaStatus.REMAP ? "n/a" : m8.length() == 0 ? "delete" : hex(m8, fixHex); + String m3a = + v3 != IdnaStatus.REMAP + ? "n/a" + : v8 == IdnaStatus.REMAP && m8.equals(m3) + ? "~" + : m3.length() == 0 ? "delete" : hex(m3, fixHex); String v8a = v8.toString(); String v3a = v8 == v3 ? "~" : v3.toString(); - String line = (diff == Diff.bad ? "X" : diff == Diff.warn ? "w" : "") + sep + codepointsHex + sep + chars + sep - + v8a + sep + m8a + sep + v3a + sep + m3a + sep + myScript + sep + myGc + sep + myName; + String line = + (diff == Diff.bad ? "X" : diff == Diff.warn ? 
"w" : "") + + sep + + codepointsHex + + sep + + chars + + sep + + v8a + + sep + + m8a + + sep + + v3a + + sep + + m3a + + sep + + myScript + + sep + + myGc + + sep + + myName; return line; } private String getGc(int cp) { - return UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(cp), - UProperty.NameChoice.SHORT); + return UCharacter.getPropertyValueName( + UProperty.GENERAL_CATEGORY, UCharacter.getType(cp), UProperty.NameChoice.SHORT); } private String getName(int cp) { @@ -961,12 +1086,15 @@ private void showMapping() { UnicodeSet valid = new UnicodeSet(variables.replace("$Valid")); UnicodeSet graphic = new UnicodeSet("[^[:cn:][:co:][:cs:][:cc:]]"); Counter counter = new Counter(); - Map>>> examples = new HashMap>>>(); + Map>>> examples = + new HashMap>>>(); double totalFrequency = 0; loadFrequencies(); for (String s : graphic) { - String idna2003 = getIDNAValue(s, StringPrep.ALLOW_UNASSIGNED, null); // StringPrep.ALLOW_UNASSIGNED + String idna2003 = + getIDNAValue( + s, StringPrep.ALLOW_UNASSIGNED, null); // StringPrep.ALLOW_UNASSIGNED String idna2008 = getIDNA2008Value(s, true); // String idna2008c = getIDNA2008Value(s, true); // a problem case is where they are not equal, and both mapped values are valid @@ -975,8 +1103,15 @@ private void showMapping() { String status3 = getIdnaStatus(valid, idna2003); String status8 = getIdnaStatus(valid, idna2008); // String status8c = getIdnaStatus(valid, idna2008c); - String key = eq + "\t" + sourceStatus + "\t" + status3 + "\t" + status8; // + (idna2008.equals(idna2008c) ? - // "" : " "+status8c); + String key = + eq + + "\t" + + sourceStatus + + "\t" + + status3 + + "\t" + + status8; // + (idna2008.equals(idna2008c) ? 
+ // "" : " "+status8c); counter.add(key, 1); long newFrequency = frequencies.getCount(s.codePointAt(0)); totalFrequency += newFrequency; @@ -990,12 +1125,32 @@ private void showMapping() { long oldFrequency = old.get0(); old.set0(newFrequency + oldFrequency); if (true) { - String example = getExample(s, idna2003, idna2008, eq, sourceStatus, status3, status8, null, null); + String example = + getExample( + s, + idna2003, + idna2008, + eq, + sourceStatus, + status3, + status8, + null, + null); Set> set = old.get1(); set.add(Row.of(new Long(-newFrequency), example)); } if (!idna2008.equals(idna2003)) { - System.out.println(getExample(s, idna2003, idna2008, eq, sourceStatus, status3, status8, null, null)); + System.out.println( + getExample( + s, + idna2003, + idna2008, + eq, + sourceStatus, + status3, + status8, + null, + null)); } } System.out.println("==== Char count for groups"); @@ -1003,7 +1158,8 @@ private void showMapping() { R2>> data = examples.get(s); double freq = data.get0(); R2 freqSample = data.get1().iterator().next(); - System.out.println(freqSample.get1() + "\t" + counter.get(s) + "\t" + (freq / totalFrequency)); + System.out.println( + freqSample.get1() + "\t" + counter.get(s) + "\t" + (freq / totalFrequency)); } System.out.println("==== Samples for groups"); @@ -1013,10 +1169,10 @@ private void showMapping() { int max = 10; for (R2 freqSample : data.get1()) { if (--max <= 0) break; - System.out.println(freqSample.get1() + "\t" + freq + "\t" + (-freqSample.get0() / freq)); + System.out.println( + freqSample.get1() + "\t" + freq + "\t" + (-freqSample.get0() / freq)); } } - } private void loadFrequencies() { @@ -1029,13 +1185,33 @@ private void loadFrequencies() { } } - private static String getExample(String s, String idna2003, String idna2008, String eq, String sourceStatus, - String status3, String status8, String idna2008c, String status8c) { - return version(s) + "\t" + eq - + "\t" + getCodeAndName(s) + "\t" + sourceStatus - + "\t" + 
getCodeAndName(idna2003, s) + "\t" + status3 - + "\t" + getCodeAndName(idna2008, s) + "\t" + status8 - // + (idna2008.equals(idna2008c) ? "" : "\t" + getCodeAndName(idna2008c, s) + "\t" + status8c) + private static String getExample( + String s, + String idna2003, + String idna2008, + String eq, + String sourceStatus, + String status3, + String status8, + String idna2008c, + String status8c) { + return version(s) + + "\t" + + eq + + "\t" + + getCodeAndName(s) + + "\t" + + sourceStatus + + "\t" + + getCodeAndName(idna2003, s) + + "\t" + + status3 + + "\t" + + getCodeAndName(idna2008, s) + + "\t" + + status8 + // + (idna2008.equals(idna2008c) ? "" : "\t" + getCodeAndName(idna2008c, s) + "\t" + + // status8c) ; } @@ -1055,8 +1231,11 @@ private static String version(String s) { } private static String getIdnaStatus(UnicodeSet valid, String s) { - return (s != null && getIDNAValue(s, StringPrep.DEFAULT, null) != null ? "V" : "x") + "3" - + "\t" + (s != null && valid.containsAll(s) ? "V" : "x") + 8; + return (s != null && getIDNAValue(s, StringPrep.DEFAULT, null) != null ? "V" : "x") + + "3" + + "\t" + + (s != null && valid.containsAll(s) ? 
"V" : "x") + + 8; } private static String getCodeAndName(String string) { @@ -1067,7 +1246,8 @@ private static String getCodeAndName(String string) { if (i > 0) { result.append(" + "); } - result.append(Utility.hex(cp) + " (" + UTF16.valueOf(cp) + ") " + UCharacter.getName(cp)); + result.append( + Utility.hex(cp) + " (" + UTF16.valueOf(cp) + ") " + UCharacter.getName(cp)); if (cp > 0xFFFF) ++i; } return result.toString(); @@ -1088,7 +1268,12 @@ private static void addMapping(Map mapping, int i, String ma static StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP); enum IdnaStatus { - PVALID, CONTEXTO, CONTEXTJ, REMAP, UNASSIGNED, DISALLOWED + PVALID, + CONTEXTO, + CONTEXTJ, + REMAP, + UNASSIGNED, + DISALLOWED }; IdnaStatus[] temp = new IdnaStatus[1]; @@ -1098,7 +1283,7 @@ private IdnaStatus getIdnaStatus(String s) { return temp[0]; } - static public String getIDNAValue(String s, int namePrepOptions, IdnaStatus[] status) { + public static String getIDNAValue(String s, int namePrepOptions, IdnaStatus[] status) { if (status != null) { status[0] = IdnaStatus.PVALID; } @@ -1121,7 +1306,8 @@ static public String getIDNAValue(String s, int namePrepOptions, IdnaStatus[] st return result; } try { - IDNA.convertIDNToASCII(s, IDNA.USE_STD3_RULES + IDNA.ALLOW_UNASSIGNED); // just catch exception + IDNA.convertIDNToASCII( + s, IDNA.USE_STD3_RULES + IDNA.ALLOW_UNASSIGNED); // just catch exception } catch (Exception e1) { if (status != null) { status[0] = IdnaStatus.DISALLOWED; @@ -1143,10 +1329,13 @@ static public String getIDNAValue(String s, int namePrepOptions, IdnaStatus[] st private static String frequencyFile; - static public String getIDNA2008Value(String original, boolean lowerCase) { + public static String getIDNA2008Value(String original, boolean lowerCase) { String source = original; for (int j = 0; j < 2; ++j) { - String lower = lowerCase ? 
UCharacter.toLowerCase(ULocale.ROOT, source) : UCharacter.foldCase(source, true); + String lower = + lowerCase + ? UCharacter.toLowerCase(ULocale.ROOT, source) + : UCharacter.foldCase(source, true); // String caseFoldNfc = Normalizer.normalize(caseFold, Normalizer.NFC); // String lowerNfc = Normalizer.normalize(lower, Normalizer.NFC); // if (!caseFoldNfc.equals(lowerNfc)) { diff --git a/unicodetools/src/main/java/org/unicode/draft/Ids2.java b/unicodetools/src/main/java/org/unicode/draft/Ids2.java index e7733bc87..e4493031f 100644 --- a/unicodetools/src/main/java/org/unicode/draft/Ids2.java +++ b/unicodetools/src/main/java/org/unicode/draft/Ids2.java @@ -1,5 +1,16 @@ package org.unicode.draft; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.File; import java.io.IOException; @@ -14,38 +25,30 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.draft.CodePoints; import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.Counter; import org.unicode.cldr.util.UnicodeSetPrettyPrinter; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Normalizer2; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.ULocale; - public abstract class Ids2 implements Comparable { - static final UnicodeSet ALLOWED = 
new UnicodeSet("[[:Unified_Ideograph:]]").freeze(); // [:Block=CJK Radicals - // Supplement:][:Block=Kangxi - // Radicals:] - static final UnicodeSet FULL_ALLOWED = new UnicodeSet("\\p{Block=Ideographic Description Characters}").addAll( - ALLOWED).freeze(); - static final UnicodeSet FULL_ALLOWED_AND_PU = new UnicodeSet("[:General_Category=Private_Use:]").addAll( - FULL_ALLOWED).freeze(); - static final UnicodeSet MAIN_CJK = new UnicodeSet("[[:block=CJK_Unified_Ideographs:]]").freeze(); + static final UnicodeSet ALLOWED = + new UnicodeSet("[[:Unified_Ideograph:]]").freeze(); // [:Block=CJK Radicals + // Supplement:][:Block=Kangxi + // Radicals:] + static final UnicodeSet FULL_ALLOWED = + new UnicodeSet("\\p{Block=Ideographic Description Characters}") + .addAll(ALLOWED) + .freeze(); + static final UnicodeSet FULL_ALLOWED_AND_PU = + new UnicodeSet("[:General_Category=Private_Use:]").addAll(FULL_ALLOWED).freeze(); + static final UnicodeSet MAIN_CJK = + new UnicodeSet("[[:block=CJK_Unified_Ideographs:]]").freeze(); static final Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE); static final char BAD_CHAR = '\uFFFF'; static NumberFormat nf = NumberFormat.getInstance(); + static { nf.setGroupingUsed(true); } @@ -58,7 +61,8 @@ static class Hacks { UnicodeMap remap = new UnicodeMap(); UnicodeSet radicals = new UnicodeSet(); String line = ""; - Relation problems = Relation.of(new LinkedHashMap>(), LinkedHashSet.class); + Relation problems = + Relation.of(new LinkedHashMap>(), LinkedHashSet.class); void addProblem(String problem) { if (problem != null) { @@ -97,14 +101,11 @@ public void addBadChar(int cp) { protected int codepoint; static class Visitor { - public void atCodePoint(int codepoint) { - } + public void atCodePoint(int codepoint) {} - public void atRelation(int codepoint) { - } + public void atRelation(int codepoint) {} - public void atComponent(Ids2 first) { - } + public void atComponent(Ids2 first) {} } public boolean equals(Object 
other) { @@ -165,44 +166,47 @@ protected String showCodepoint(UnicodeMap data, boolean replace, int maxLe if (result instanceof Leaf) { result = null; } - return result == null ? UTF16.valueOf(codepoint) - : !replace ? "*" + UTF16.valueOf(codepoint) - : maxLevel <= 0 ? "†" + UTF16.valueOf(codepoint) - : result.toString(data, replace, maxLevel - 1); + return result == null + ? UTF16.valueOf(codepoint) + : !replace + ? "*" + UTF16.valueOf(codepoint) + : maxLevel <= 0 + ? "†" + UTF16.valueOf(codepoint) + : result.toString(data, replace, maxLevel - 1); } protected static Ids2 parse(CodePoints codePoints) { codePoints.next(); int cp = codePoints.getCodePoint(); switch (cp) { - // double - case '\u2FF0': - case '\u2FF1': - case '\u2FF4': - case '\u2FF5': - case '\u2FF6': - case '\u2FF7': - case '\u2FF8': - case '\u2FF9': - case '\u2FFA': - case '\u2FFB': - return new Dual(cp, codePoints); - // triple - case '\u2FF2': - case '\u2FF3': - return new Trial(cp, codePoints); - case '&': - StringBuffer ncr = new StringBuffer(); - while (codePoints.next()) { - cp = codePoints.getCodePoint(); - if (cp == ';') { - return new Leaf(ncr.toString()); + // double + case '\u2FF0': + case '\u2FF1': + case '\u2FF4': + case '\u2FF5': + case '\u2FF6': + case '\u2FF7': + case '\u2FF8': + case '\u2FF9': + case '\u2FFA': + case '\u2FFB': + return new Dual(cp, codePoints); + // triple + case '\u2FF2': + case '\u2FF3': + return new Trial(cp, codePoints); + case '&': + StringBuffer ncr = new StringBuffer(); + while (codePoints.next()) { + cp = codePoints.getCodePoint(); + if (cp == ';') { + return new Leaf(ncr.toString()); + } + ncr.appendCodePoint(cp); } - ncr.appendCodePoint(cp); - } - throw new IllegalArgumentException("NCR too short: " + ncr); - default: - return new Leaf(cp); + throw new IllegalArgumentException("NCR too short: " + ncr); + default: + return new Leaf(cp); } } @@ -226,9 +230,7 @@ public void atCodePoint(int codepoint) { counter.add(codepoint, 1); } - public void 
atComponent(Ids2 first) { - - } + public void atComponent(Ids2 first) {} public void atRelation(int codepoint) { relationCounter.add(codepoint, 1); @@ -239,7 +241,8 @@ static final class Leaf extends Ids2 { Leaf(int cp) { String nfcForm = nfc.normalize(UTF16.valueOf(cp)); if (nfcForm.codePointCount(0, nfcForm.length()) != 1) { - throw new IllegalArgumentException("NFC form is too long:\t" + Utility.hex(nfcForm)); + throw new IllegalArgumentException( + "NFC form is too long:\t" + Utility.hex(nfcForm)); } cp = nfcForm.codePointAt(0); String revised = hacks.remap.get(cp); @@ -314,8 +317,11 @@ public String toString() { } public String toString(UnicodeMap data, boolean replace, int maxLevel) { - return "{" + showCodepoint(data, replace, maxLevel - 1) + first.toString(data, replace, maxLevel - 1) - + second.toString(data, replace, maxLevel - 1) + "}"; + return "{" + + showCodepoint(data, replace, maxLevel - 1) + + first.toString(data, replace, maxLevel - 1) + + second.toString(data, replace, maxLevel - 1) + + "}"; } public int compareTo(Ids2 o) { @@ -339,8 +345,9 @@ public UnicodeSet addChars(UnicodeSet results, UnicodeMap data) { } public boolean contains(int codepoint, UnicodeMap data) { - return super.contains(codepoint, data) || first.contains(codepoint, data) - || second.contains(codepoint, data); + return super.contains(codepoint, data) + || first.contains(codepoint, data) + || second.contains(codepoint, data); } public int size() { @@ -392,16 +399,26 @@ public void visit(Visitor visitor) { } public int hashCode() { - return codepoint ^ (37 * first.hashCode() ^ (37 * second.hashCode() ^ (37 * third.hashCode()))); + return codepoint + ^ (37 * first.hashCode() ^ (37 * second.hashCode() ^ (37 * third.hashCode()))); } public String toString() { - return "{" + UTF16.valueOf(codepoint) + first.toString() + second.toString() + third.toString() + "}"; + return "{" + + UTF16.valueOf(codepoint) + + first.toString() + + second.toString() + + third.toString() + + "}"; } 
public String toString(UnicodeMap data, boolean replace, int maxLevel) { - return "{" + showCodepoint(data, replace, maxLevel - 1) + first.toString(data, replace, maxLevel - 1) - + second.toString(data, replace, maxLevel - 1) + third.toString(data, replace, maxLevel - 1) + "}"; + return "{" + + showCodepoint(data, replace, maxLevel - 1) + + first.toString(data, replace, maxLevel - 1) + + second.toString(data, replace, maxLevel - 1) + + third.toString(data, replace, maxLevel - 1) + + "}"; } public int compareTo(Ids2 o) { @@ -446,13 +463,14 @@ boolean minimize(Map reverseData) { } } - static Comparator IdsComparator = new Comparator() { - public int compare(Ids2 o1, Ids2 o2) { - int diff = o1.size() - o2.size(); - if (diff != 0) return diff; - return o1.compareTo(o2); - } - }; + static Comparator IdsComparator = + new Comparator() { + public int compare(Ids2 o1, Ids2 o2) { + int diff = o1.size() - o2.size(); + if (diff != 0) return diff; + return o1.compareTo(o2); + } + }; /* * U+2FF0 ( ⿰ ) IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT * U+2FF1 ( ⿱ ) IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW @@ -468,12 +486,15 @@ public int compare(Ids2 o1, Ids2 o2) { * U+2FFB ( ⿻ ) IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID */ - static UnicodeSetPrettyPrinter pp = new UnicodeSetPrettyPrinter().setOrdering(RadicalStroke2.RadicalStrokeComparator).setCompressRanges( - false); + static UnicodeSetPrettyPrinter pp = + new UnicodeSetPrettyPrinter() + .setOrdering(RadicalStroke2.RadicalStrokeComparator) + .setCompressRanges(false); public static void main(String[] args) throws IOException { - TreeSet sortedByRadicalStroke = new TreeSet(RadicalStroke2.RadicalStrokeComparator); + TreeSet sortedByRadicalStroke = + new TreeSet(RadicalStroke2.RadicalStrokeComparator); for (String s : ALLOWED) { sortedByRadicalStroke.add(s); } @@ -502,20 +523,24 @@ public static void main(String[] args) throws IOException { if (radicals) { if (line.startsWith("#")) continue; if (line.startsWith(";")) 
continue; - // ;Radical Number,Status,Unified Ideo,Hex,Radical,Hex,Name,Conf. Char,Hex,Unified Ideo. has NO + // ;Radical Number,Status,Unified Ideo,Hex,Radical,Hex,Name,Conf. + // Char,Hex,Unified Ideo. has NO // Remaining Strokes in Unihan // 5,Variant,,#VALUE!,⺄,U+2E84,SECOND_THREE // 6,Main,亅,U+4E85,⼅,U+2F05,HOOK,,,𠄌 String[] split = line.split(","); - // we depend on the file being set up so that the first integer radical has a Main. Detect bad + // we depend on the file being set up so that the first integer radical has a + // Main. Detect bad // cases. double radical = Double.parseDouble(split[0]); int baseRadical = (int) radical; String base = radicalToBase.get(baseRadical); if (base == null) { base = split[2]; - if (!"Main".equals(split[1]) || radical != baseRadical || !ALLOWED.containsAll(base)) { + if (!"Main".equals(split[1]) + || radical != baseRadical + || !ALLOWED.containsAll(base)) { throw new IllegalArgumentException("Bad radical file"); } radicalToBase.put(baseRadical, base); @@ -564,22 +589,33 @@ public static void main(String[] args) throws IOException { } } - openFile("problems-parsing.txt", ";@ Parsing Problems:\t" + nf.format(hacks.problems.size())); + openFile( + "problems-parsing.txt", + ";@ Parsing Problems:\t" + nf.format(hacks.problems.size())); int counter = 0; for (String s : hacks.problems.keySet()) { out.println(nf.format(++counter) + ")\t" + s + "\t" + hacks.problems.getAll(s)); } - openFile("problems-self.txt", ";@ Character maps to self in IDS data:\t" + nf.format(mapsToSelf.size())); + openFile( + "problems-self.txt", + ";@ Character maps to self in IDS data:\t" + nf.format(mapsToSelf.size())); out.println(pp.format(mapsToSelf)); - openFile("problems-bad.txt", ";@ Bad Chars in IDS data:\t" + nf.format(hacks.badChars.getTotal())); + openFile( + "problems-bad.txt", + ";@ Bad Chars in IDS data:\t" + nf.format(hacks.badChars.getTotal())); out.println(";count ; char ; (hex) ; name\n"); for (Integer cp : 
hacks.badChars.getKeysetSortedByKey()) { if (cp == 0xFFFF) { out.println(hacks.badChars.getCount(cp) + "\t" + "TRUNCATED"); } else { - out.println(hacks.badChars.getCount(cp) + "\t" + charAndHex(cp) + "\t" + UCharacter.getName(cp)); + out.println( + hacks.badChars.getCount(cp) + + "\t" + + charAndHex(cp) + + "\t" + + UCharacter.getName(cp)); } } TreeSet sortedUCA = new TreeSet(Collator.getInstance(ULocale.ROOT)); @@ -588,8 +624,9 @@ public static void main(String[] args) throws IOException { sortedUCA.add(s); } } - openFile("problems-missing-components.txt", - ";@ Missing Components in IDS data:\t" + nf.format(sortedUCA.size()) + "\n"); + openFile( + "problems-missing-components.txt", + ";@ Missing Components in IDS data:\t" + nf.format(sortedUCA.size()) + "\n"); String tempStr = sortedUCA.toString(); out.println(tempStr); @@ -619,8 +656,12 @@ public static void main(String[] args) throws IOException { } } for (Integer value : sameRadical.values()) { - out.println(nf.format(++counter) + ")\t" - + charAndHex(value) + "\t" + pp.format(new UnicodeSet(sameRadical.getSet(value)).remove(value))); + out.println( + nf.format(++counter) + + ")\t" + + charAndHex(value) + + "\t" + + pp.format(new UnicodeSet(sameRadical.getSet(value)).remove(value))); } openFile("minimalization.txt", ";@ Minimized IDS Data"); @@ -632,8 +673,8 @@ public static void main(String[] args) throws IOException { Integer cp = reverseData2.get(ids); String old = ids.toString(); if (ids.minimize(reverseData2)) { - out.println(nf.format(++counter) + ")\t" - + charAndHex(cp) + "\t" + ids + "\t" + old); + out.println( + nf.format(++counter) + ")\t" + charAndHex(cp) + "\t" + ids + "\t" + old); } } @@ -643,7 +684,8 @@ public static void main(String[] args) throws IOException { // hacks.hackStringSample.get(s)); // } - Relation reverseData = Relation.of(new HashMap>(), LinkedHashSet.class); + Relation reverseData = + Relation.of(new HashMap>(), LinkedHashSet.class); UnicodeMap charsToContainingChars = new 
UnicodeMap(); // get other data @@ -655,7 +697,7 @@ public static void main(String[] args) throws IOException { value.addChars(charsInIds.clear(), null); charsInIds.remove(key); - for (UnicodeSetIterator it = new UnicodeSetIterator(charsInIds); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(charsInIds); it.next(); ) { UnicodeSet containingSet = charsToContainingChars.get(it.codepoint); if (containingSet == null) { charsToContainingChars.put(it.codepoint, containingSet = new UnicodeSet()); @@ -674,13 +716,13 @@ public static void main(String[] args) throws IOException { for (int i : set) { uset.add(i); } - out.println(nf.format(++counter) + ")\t" - + pp.format(uset) + "\t" + ids); + out.println(nf.format(++counter) + ")\t" + pp.format(uset) + "\t" + ids); } openFile("chars-to-ids.txt", ";@ Characters to IDS"); out.println("; Sorted by total-strokes, then radical (from Unihan data)"); - out.println(";line-number ; total-strokes/radical ; char ; (hex) ; ids-contains-radical? ; ids"); + out.println( + ";line-number ; total-strokes/radical ; char ; (hex) ; ids-contains-radical? ; ids"); counter = 0; UnicodeSet missingInfo = new UnicodeSet(); @@ -704,19 +746,28 @@ public static void main(String[] args) throws IOException { // if (ids instanceof Leaf && ids.codepoint == containingChar.codePointAt(0)) { // continue; // } - out.println(nf.format(++counter) + ")\t" - + strokes + UTF16.valueOf(radical) - + "\t" + charAndHex(containingChar) - + "\t" + (ids != null && ids.contains(radical, data) ? "" : "N") - + "\t" + ids); - } - - openFile("chars-without-ids.txt", ";@ Characters without Ids data\t" + nf.format(missingInfo.size())); + out.println( + nf.format(++counter) + + ")\t" + + strokes + + UTF16.valueOf(radical) + + "\t" + + charAndHex(containingChar) + + "\t" + + (ids != null && ids.contains(radical, data) ? 
"" : "N") + + "\t" + + ids); + } + + openFile( + "chars-without-ids.txt", + ";@ Characters without Ids data\t" + nf.format(missingInfo.size())); out.println(";\tcount ; total-strokes/radical ; set-without-ids\n"); counter = 0; oldStrokes = 0; oldRadical = 0; - TreeSet sortedByRadicalStroke2 = new TreeSet(RadicalStroke2.RadicalStrokeComparator); + TreeSet sortedByRadicalStroke2 = + new TreeSet(RadicalStroke2.RadicalStrokeComparator); for (String s : missingInfo) { sortedByRadicalStroke2.add(s); } @@ -731,8 +782,13 @@ public static void main(String[] args) throws IOException { if (strokes == null) strokes = 0; if (radical != oldRadical || strokes != oldStrokes) { if (temp.size() != 0) { - out.println(nf.format(++counter) + ")\t" + oldStrokes + UTF16.valueOf(oldRadical) + "\t" - + pp.format(temp)); + out.println( + nf.format(++counter) + + ")\t" + + oldStrokes + + UTF16.valueOf(oldRadical) + + "\t" + + pp.format(temp)); } oldRadical = radical; oldStrokes = strokes; @@ -743,7 +799,13 @@ public static void main(String[] args) throws IOException { // continue; // } } - out.println(nf.format(++counter) + ")\t" + oldStrokes + UTF16.valueOf(oldRadical) + "\t" + pp.format(temp)); + out.println( + nf.format(++counter) + + ")\t" + + oldStrokes + + UTF16.valueOf(oldRadical) + + "\t" + + pp.format(temp)); openFile("expanded-ids.txt", ";@ Characters to IDS-Expansion"); out.println("; Shows the recursive expansion of IDS, if different"); @@ -760,15 +822,19 @@ public static void main(String[] args) throws IOException { String idsSimple = ids.toString(); String idsString = ids.toString(data, true, 20); if (!idsSimple.equals(idsString)) { - out.println(nf.format(++counter) + ")\t" - + charAndHex(containingChar) - + "\t" + idsSimple - + "\t" + idsString - ); + out.println( + nf.format(++counter) + + ")\t" + + charAndHex(containingChar) + + "\t" + + idsSimple + + "\t" + + idsString); } } - openFile("chars-in-expanded-ids.txt", - ";@ Characters present in expanded IDS:\t" + 
nf.format(leafCounter.counter.size())); + openFile( + "chars-in-expanded-ids.txt", + ";@ Characters present in expanded IDS:\t" + nf.format(leafCounter.counter.size())); UnicodeSet allowedExpanded = new UnicodeSet(); UnicodeSet radicalsAllowed = new UnicodeSet(); for (Integer cp : leafCounter.counter.getKeysetSortedByCount(false)) { @@ -798,8 +864,12 @@ public static void main(String[] args) throws IOException { continue; } inOtherChar.add(containedChar); - out.println(nf.format(++counter) + ")\t" - + charAndHex(containedChar) + "\t" + pp.format(keyset)); + out.println( + nf.format(++counter) + + ")\t" + + charAndHex(containedChar) + + "\t" + + pp.format(keyset)); } int limitHack = 0xE000 + hacks.hackStrings.size(); @@ -808,11 +878,17 @@ public static void main(String[] args) throws IOException { if (keyset == null) { continue; } - out.println(nf.format(++counter) + ")\t" - + charAndHex(containedChar) + "\t" + pp.format(keyset)); + out.println( + nf.format(++counter) + + ")\t" + + charAndHex(containedChar) + + "\t" + + pp.format(keyset)); } - openFile("chars-in-ids.txt", ";@ Characters present in some IDS:\t" + nf.format(inOtherChar.size())); + openFile( + "chars-in-ids.txt", + ";@ Characters present in some IDS:\t" + nf.format(inOtherChar.size())); out.println(pp.format(inOtherChar)); out.close(); @@ -836,11 +912,13 @@ private static boolean betterThan(int cp1, int cp2) { private static String charAndHex(int codepoint) { String hack = hacks.getHackString(codepoint); - return UTF16.valueOf(codepoint) + "\t(" + (hack == null ? Utility.hex(codepoint) : hack) + ")"; + return UTF16.valueOf(codepoint) + + "\t(" + + (hack == null ? 
Utility.hex(codepoint) : hack) + + ")"; } private static String charAndHex(String codepoint) { return codepoint + "\t(" + Utility.hex(codepoint) + ")"; } - } diff --git a/unicodetools/src/main/java/org/unicode/draft/LanguageDetectionVsTags.java b/unicodetools/src/main/java/org/unicode/draft/LanguageDetectionVsTags.java index 3c21fac31..e7afe2299 100644 --- a/unicodetools/src/main/java/org/unicode/draft/LanguageDetectionVsTags.java +++ b/unicodetools/src/main/java/org/unicode/draft/LanguageDetectionVsTags.java @@ -1,4 +1,9 @@ package org.unicode.draft; + +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R3; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.IOException; import java.util.ArrayList; @@ -10,7 +15,6 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.tool.ConvertLanguageData.InverseComparator; import org.unicode.cldr.tool.LikelySubtags; @@ -20,21 +24,33 @@ import org.unicode.cldr.util.SupplementalDataInfo; import org.unicode.text.utility.Settings; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R3; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.util.ULocale; - - public class LanguageDetectionVsTags { - private static final EnumSet tag_range = EnumSet.range(LineFormat.http, LineFormat.xmllang); + private static final EnumSet tag_range = + EnumSet.range(LineFormat.http, LineFormat.xmllang); // http meta html-lang xml-lang detected count navboost pagerang lang encod - enum LineFormat {L1, http, meta, lang, xmllang, detected, occurrences, documents, navboost, pagerank, lang2, enc, url}; + enum LineFormat { + L1, + http, + meta, + lang, + xmllang, + detected, + occurrences, + documents, + navboost, + pagerank, + lang2, + enc, + url + }; public static void main(String[] args) throws IOException { - final BufferedReader in = 
FileUtilities.openUTF8Reader(Settings.CLDR.BASE_DIRECTORY + "Documents/Data/", "lang78.txt"); - final Map> detectedToCountAndTag = new TreeMap>(); + final BufferedReader in = + FileUtilities.openUTF8Reader( + Settings.CLDR.BASE_DIRECTORY + "Documents/Data/", "lang78.txt"); + final Map> detectedToCountAndTag = + new TreeMap>(); final Counter detectedToCount = new Counter(); final Counter taggedToCount = new Counter(); final Set tagSet = new HashSet(); @@ -46,7 +62,9 @@ public static void main(String[] args) throws IOException { } final String[] parts = line.split("\t"); // L1 html lang detected occurrences documents navboost pagerank lang enc url - // L1 html en en 110977145 110867809 48729828 60774 0 22 http://www.facebook.com/ 27961428 60188 0 0 http://www.mapquest.com/ 10651203 58229 0 22 http://free.grisoft.com/ + // L1 html en en 110977145 110867809 48729828 60774 0 22 http://www.facebook.com/ + // 27961428 60188 0 0 http://www.mapquest.com/ 10651203 58229 0 22 + // http://free.grisoft.com/ try { final String googleID = fixID(parts[LineFormat.detected.ordinal()]); final long count = Long.parseLong(parts[LineFormat.occurrences.ordinal()]); @@ -86,22 +104,27 @@ public static void main(String[] args) throws IOException { System.out.println(i + "\t" + tagCount2.getCount(i)); } for (final String detected : detectedToCount.getKeysetSortedByCount(false)) { - System.out.println(getLanguageName(detected) - + "\t" + detectedToCount.getCount(detected) - + "\t" + taggedToCount.getCount(detected)); + System.out.println( + getLanguageName(detected) + + "\t" + + detectedToCount.getCount(detected) + + "\t" + + taggedToCount.getCount(detected)); } if (true) { return; } - final InverseComparator>> inverseComparator = new InverseComparator>>(); - final TreeSet>> countLangTypes = new TreeSet>>(inverseComparator); + final InverseComparator>> inverseComparator = + new InverseComparator>>(); + final TreeSet>> countLangTypes = + new TreeSet>>(inverseComparator); for (final String lang 
: detectedToCountAndTag.keySet()) { final Counter counter = detectedToCountAndTag.get(lang); final long total = counter.getTotal(); final Counter typeCount = new Counter(); for (final String x : counter.keySet()) { - typeCount.add(getType(lang,x), counter.getCount(x)); + typeCount.add(getType(lang, x), counter.getCount(x)); } countLangTypes.add(new Row.R3>(total, lang, typeCount)); } @@ -118,7 +141,8 @@ public static void main(String[] args) throws IOException { final Counter typeCount = countAndLang.get2(); System.out.print(getLanguageName(lang)); for (final Type type : Type.values()) { - System.out.print("\t" + ((double)typeCount.getCount(type)/typeCount.getTotal() - 0.0001)); + System.out.print( + "\t" + ((double) typeCount.getCount(type) / typeCount.getTotal() - 0.0001)); } System.out.println(); } @@ -137,7 +161,7 @@ public static void main(String[] args) throws IOException { tags.add(tagged); counts.add(tagCount); remaining -= tagCount; - if (count++ > 3 && remaining*1000 < total) { + if (count++ > 3 && remaining * 1000 < total) { break; } } @@ -161,15 +185,23 @@ public static void main(String[] args) throws IOException { if (name.equals(tag)) { name = "??"; } - System.out.println("\t" + getName(tag) + "\t" + name + "\t" + counts.get(i) - + "\t" + getType(lang, tag)); + System.out.println( + "\t" + + getName(tag) + + "\t" + + name + + "\t" + + counts.get(i) + + "\t" + + getType(lang, tag)); } } } } static LanguageTagParser langTagParser = new LanguageTagParser(); - static SupplementalDataInfo supplementalData = SupplementalDataInfo.getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY); + static SupplementalDataInfo supplementalData = + SupplementalDataInfo.getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY); static Map likelySubtags = supplementalData.getLikelySubtags(); private static String fixID(String string) { @@ -193,7 +225,11 @@ private static String fixID(String string) { } } - enum Type {missing, mismatch, match}; + enum Type { + missing, + mismatch, + match + }; 
static Type getType(String detected, String tagged) { if (tagged.equals("---")) { @@ -229,12 +265,13 @@ private static String getName(String tag) { return "other"; } if (tag.length() > 12) { - tag = tag.substring(0,12) + "…"; + tag = tag.substring(0, 12) + "…"; } return '"' + tag + '"'; } - static Map remapping = new HashMap(); + static Map remapping = new HashMap(); + static { remapping.put("zh-CN", "Chinese (S)"); remapping.put("zh-TW", "Chinese (T)"); diff --git a/unicodetools/src/main/java/org/unicode/draft/LanguageQuadgrams.java b/unicodetools/src/main/java/org/unicode/draft/LanguageQuadgrams.java index 9f3b9fca1..654b0a643 100644 --- a/unicodetools/src/main/java/org/unicode/draft/LanguageQuadgrams.java +++ b/unicodetools/src/main/java/org/unicode/draft/LanguageQuadgrams.java @@ -1,5 +1,7 @@ package org.unicode.draft; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.impl.Row; import java.io.BufferedReader; import java.io.IOException; import java.util.HashMap; @@ -8,25 +10,22 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.text.utility.Settings; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.impl.Row; - - public class LanguageQuadgrams { public static void main(String[] args) throws IOException { - final BufferedReader in = FileUtilities.openUTF8Reader(Settings.CLDR.BASE_DIRECTORY + "Downloads/", "languageQuadgrams.txt"); + final BufferedReader in = + FileUtilities.openUTF8Reader( + Settings.CLDR.BASE_DIRECTORY + "Downloads/", "languageQuadgrams.txt"); while (true) { final String line = in.readLine(); if (line == null) { break; } final String[] parts = line.split("\\s+"); - add(parts[0], Integer.parseInt(parts[1],16), Byte.parseByte(parts[2])); + add(parts[0], Integer.parseInt(parts[1], 16), Byte.parseByte(parts[2])); } in.close(); @@ -36,14 +35,14 @@ public static void main(String[] args) throws IOException { final Map x = 
languageToLanguageInfo.get(lang); // get the sum - Blur sum = new Blur(0,0); + Blur sum = new Blur(0, 0); for (final Integer quad : x.keySet()) { sum = sum.add(x.get(quad)); } System.out.println(lang + "\t" + sum); // add the zero value - add(lang, 0, (byte)0); + add(lang, 0, (byte) 0); // normalize to 1.0 for (final Integer quad : x.keySet()) { @@ -56,7 +55,8 @@ public static void main(String[] args) throws IOException { // System.out.println("\t" + lang + "\t" + sum); } - final Relation> distanceToPair = new Relation(new TreeMap(), TreeSet.class); + final Relation> distanceToPair = + new Relation(new TreeMap(), TreeSet.class); // compare languages final Set languagesDone = new HashSet(); @@ -87,27 +87,39 @@ public static void main(String[] args) throws IOException { final Blur distanceCommon = getDistance(common, quad1, quad2, show); final Blur distance2 = getDistance(x2only, quad1, quad2, show); - final String message = "\tcommon:\6" + common.size() + "\t;\t" + distanceCommon - + "\tunique1:\t" + x1only.size() + "\t;\t" + distance1 - + "\tunique2:\t" + x2only.size() + "\t;\t" + distance2; - //message = ""; // remove for now. + final String message = + "\tcommon:\6" + + common.size() + + "\t;\t" + + distanceCommon + + "\tunique1:\t" + + x1only.size() + + "\t;\t" + + distance1 + + "\tunique2:\t" + + x2only.size() + + "\t;\t" + + distance2; + // message = ""; // remove for now. 
final Blur distance = distanceCommon.add(distance1).add(distance2).divideBy(2); - distanceToPair.put(distance, new Row.R3(lang,lang2,message)); + distanceToPair.put( + distance, new Row.R3(lang, lang2, message)); } languagesDone.add(lang); } for (final Blur distance : distanceToPair.keySet()) { for (final Row.R3 value : distanceToPair.getAll(distance)) { - System.out.println(distance + "\t" + value.get0() + "-" + value.get1() + "\t" + value.get2()); + System.out.println( + distance + "\t" + value.get0() + "-" + value.get1() + "\t" + value.get2()); } } } - private static Blur getDistance(Set quads, Map quad1, - Map quad2, boolean show) { - Blur distance = new Blur(0,0); + private static Blur getDistance( + Set quads, Map quad1, Map quad2, boolean show) { + Blur distance = new Blur(0, 0); final Blur aZero = quad1.get(0); final Blur bZero = quad2.get(0); for (final Integer quad : quads) { @@ -116,37 +128,43 @@ private static Blur getDistance(Set quads, Map quad1, return distance; } - static Map> languageToLanguageInfo = new TreeMap>(); + static Map> languageToLanguageInfo = + new TreeMap>(); - //static Blur one = Math.pow(2,12); + // static Blur one = Math.pow(2,12); private static void add(String string, int quad, byte parseByte) { Map x = languageToLanguageInfo.get(string); if (x == null) { languageToLanguageInfo.put(string, x = new HashMap()); } - x.put(quad, new Blur(Math.pow(2,parseByte-0.5), Math.pow(2,parseByte+0.5))); + x.put(quad, new Blur(Math.pow(2, parseByte - 0.5), Math.pow(2, parseByte + 0.5))); } public static class Blur implements Comparable { - public static Blur ZERO = new Blur(0,0); + public static Blur ZERO = new Blur(0, 0); private final double max; private final double min; - public Blur (double min, double max) { + + public Blur(double min, double max) { this.min = min; this.max = max; } + public Blur add(Blur other) { return new Blur(min + other.min, max + other.max); } + public Blur divideBy(Blur other) { // TODO fix for negatives - return new 
Blur(min/other.max, max/other.min); + return new Blur(min / other.max, max / other.min); } + public Blur divideBy(double other) { // TODO fix for negatives - return new Blur(min/other, max/other); + return new Blur(min / other, max / other); } + public Blur addDelta(Blur a, Blur b, Blur aZero, Blur bZero, boolean show) { if (a == null) { a = aZero; @@ -186,6 +204,7 @@ public Blur addDelta(Blur a, Blur b, Blur aZero, Blur bZero, boolean show) { public String toString() { return min + "\t" + max; } + @Override public int compareTo(Blur o) { if (min < o.min) { @@ -202,9 +221,11 @@ public int compareTo(Blur o) { } return 0; } + protected double getMax() { return max; } + protected double getMin() { return min; } diff --git a/unicodetools/src/main/java/org/unicode/draft/ListTopLanguages.java b/unicodetools/src/main/java/org/unicode/draft/ListTopLanguages.java index 0719f4841..35ac1c6b9 100644 --- a/unicodetools/src/main/java/org/unicode/draft/ListTopLanguages.java +++ b/unicodetools/src/main/java/org/unicode/draft/ListTopLanguages.java @@ -1,9 +1,14 @@ package org.unicode.draft; + +import com.google.common.collect.ImmutableSet; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.impl.Row.R3; +import com.ibm.icu.text.NumberFormat; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.Counter; @@ -13,34 +18,36 @@ import org.unicode.cldr.util.SupplementalDataInfo.OfficialStatus; import org.unicode.cldr.util.SupplementalDataInfo.PopulationData; -import com.google.common.collect.ImmutableSet; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.impl.Row.R3; -import com.ibm.icu.text.NumberFormat; - public class ListTopLanguages { - static SupplementalDataInfo sdata = SupplementalDataInfo.getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY); - static Map, String>>> 
localeAliasInfo = sdata.getLocaleAliasInfo(); + static SupplementalDataInfo sdata = + SupplementalDataInfo.getInstance(CLDRPaths.SUPPLEMENTAL_DIRECTORY); + static Map, String>>> localeAliasInfo = + sdata.getLocaleAliasInfo(); static Map likelySubtags = sdata.getLikelySubtags(); static Factory cldrFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, ".*"); static CLDRFile english = cldrFactory.make("en", true); - static Set SCHEDULED = ImmutableSet.of("hi", "mr", "bn", "ta", "te", "gu", "ur", "kn", "ml", "pa", "or", "as", "mai", "sat", "ks", "ne", "kok", "sd", "doi", "mni", "brx", "sa"); + static Set SCHEDULED = + ImmutableSet.of( + "hi", "mr", "bn", "ta", "te", "gu", "ur", "kn", "ml", "pa", "or", "as", "mai", + "sat", "ks", "ne", "kok", "sd", "doi", "mni", "brx", "sa"); public static void main(String[] args) { - final Counter> gathered = new Counter>(); + final Counter> gathered = + new Counter>(); Counter scheduledLitPop = new Counter<>(); Counter2 scheduledGDP = new Counter2<>(); - + for (final String territory : sdata.getTerritoriesWithPopulationData()) { PopulationData terrData = sdata.getPopulationDataForTerritory(territory); double terrGdpPerLitCapita = terrData.getGdp() / terrData.getLiteratePopulation(); - for (final String language : sdata.getLanguagesForTerritoryWithPopulationData(territory)) { + for (final String language : + sdata.getLanguagesForTerritoryWithPopulationData(territory)) { if (language.equals("und")) { continue; } - final PopulationData data = sdata.getLanguageAndTerritoryPopulationData(language, territory); + final PopulationData data = + sdata.getLanguageAndTerritoryPopulationData(language, territory); final long pop = (long) data.getPopulation(); if (!territory.equals("IN") && SCHEDULED.contains(language)) { double literatePopulation = data.getLiteratePopulation(); @@ -68,15 +75,29 @@ public static void main(String[] args) { } } for (String sched : SCHEDULED) { - System.out.println(sched + "\tlitPop%:\t" + scheduledLitPop.get(sched) + 
"\tgdp%:\t" + scheduledGDP.getCount(sched)); + System.out.println( + sched + + "\tlitPop%:\t" + + scheduledLitPop.get(sched) + + "\tgdp%:\t" + + scheduledGDP.getCount(sched)); } int rank = 0; final NumberFormat format = NumberFormat.getInstance(); format.setGroupingUsed(true); - for (final R3 row : gathered.getKeysetSortedByCount(false)) { + for (final R3 row : + gathered.getKeysetSortedByCount(false)) { final long pop = gathered.get(row); final OfficialStatus status = row.get0(); - System.out.println(++rank + "\t" + format.format(pop) + "\t" + row.get1() + "\t" + row.get2() + (status == OfficialStatus.unknown ? "" : "\t" + status)); + System.out.println( + ++rank + + "\t" + + format.format(pop) + + "\t" + + row.get1() + + "\t" + + row.get2() + + (status == OfficialStatus.unknown ? "" : "\t" + status)); } } @@ -93,7 +114,8 @@ private static Set getAlternates(String language, String script, String System.out.println("Unknown tag: " + tag); } } - final Set> alternates = new TreeSet>(); + final Set> alternates = + new TreeSet>(); for (final String language2 : languages) { for (final String territory2 : territories) { final R3 row = Row.of(language2, script, territory2); @@ -113,7 +135,10 @@ private static Set getAlternates(String language, String script, String return result; } - private static void addAlternates(String language, Map, String>> replacements, Set languages) { + private static void addAlternates( + String language, + Map, String>> replacements, + Set languages) { languages.add(language); for (final String source : replacements.keySet()) { final List set = replacements.get(source).get0(); diff --git a/unicodetools/src/main/java/org/unicode/draft/MessageFormat.java b/unicodetools/src/main/java/org/unicode/draft/MessageFormat.java index d840dc84a..f8d355c74 100644 --- a/unicodetools/src/main/java/org/unicode/draft/MessageFormat.java +++ b/unicodetools/src/main/java/org/unicode/draft/MessageFormat.java @@ -10,7 +10,10 @@ 
********************************************************************** */ - +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.UFormat; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; import java.io.IOException; import java.io.InvalidObjectException; import java.io.ObjectInputStream; @@ -32,11 +35,6 @@ import java.util.Objects; import java.util.Set; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.UFormat; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; - /** * MessageFormat provides a means to produce concatenated * messages in language-neutral way. Use this to construct messages @@ -84,7 +82,7 @@ * and {@link #parse(String)}. * These APIs all have corresponding new versions as listed above. *

- + * * The API {@link #format(Object, StringBuffer, FieldPosition)} has * been modified so that the Object argument can be * either an Object array or a Map. If this @@ -399,12 +397,10 @@ public class MessageFormat extends UFormat { static final long serialVersionUID = 7136212545847378651L; /** - * Constructs a MessageFormat for the default locale and the - * specified pattern. - * The constructor first sets the locale, then parses the pattern and - * creates a list of subformats for the format elements contained in it. - * Patterns and their interpretation are specified in the - * class description. + * Constructs a MessageFormat for the default locale and the specified pattern. The constructor + * first sets the locale, then parses the pattern and creates a list of subformats for the + * format elements contained in it. Patterns and their interpretation are specified in the class description. * * @param pattern the pattern for this message format * @exception IllegalArgumentException if the pattern is invalid @@ -416,12 +412,10 @@ public MessageFormat(String pattern) { } /** - * Constructs a MessageFormat for the specified locale and - * pattern. - * The constructor first sets the locale, then parses the pattern and - * creates a list of subformats for the format elements contained in it. - * Patterns and their interpretation are specified in the - * class description. + * Constructs a MessageFormat for the specified locale and pattern. The constructor first sets + * the locale, then parses the pattern and creates a list of subformats for the format elements + * contained in it. Patterns and their interpretation are specified in the class description. * * @param pattern the pattern for this message format * @param locale the locale for this message format @@ -433,12 +427,10 @@ public MessageFormat(String pattern, Locale locale) { } /** - * Constructs a MessageFormat for the specified locale and - * pattern. 
- * The constructor first sets the locale, then parses the pattern and - * creates a list of subformats for the format elements contained in it. - * Patterns and their interpretation are specified in the - * class description. + * Constructs a MessageFormat for the specified locale and pattern. The constructor first sets + * the locale, then parses the pattern and creates a list of subformats for the format elements + * contained in it. Patterns and their interpretation are specified in the class description. * * @param pattern the pattern for this message format * @param locale the locale for this message format @@ -451,11 +443,10 @@ public MessageFormat(String pattern, ULocale locale) { } /** - * Sets the locale to be used when creating or comparing subformats. - * This affects subsequent calls to the {@link #applyPattern applyPattern} - * and {@link #toPattern toPattern} methods as well as to the - * format and - * {@link #formatToCharacterIterator formatToCharacterIterator} methods. + * Sets the locale to be used when creating or comparing subformats. This affects subsequent + * calls to the {@link #applyPattern applyPattern} and {@link #toPattern toPattern} methods as + * well as to the format and {@link #formatToCharacterIterator + * formatToCharacterIterator} methods. * * @param locale the locale to be used when creating or comparing subformats * @stable ICU 3.0 @@ -465,11 +456,10 @@ public void setLocale(Locale locale) { } /** - * Sets the locale to be used when creating or comparing subformats. - * This affects subsequent calls to the {@link #applyPattern applyPattern} - * and {@link #toPattern toPattern} methods as well as to the - * format and - * {@link #formatToCharacterIterator formatToCharacterIterator} methods. + * Sets the locale to be used when creating or comparing subformats. 
This affects subsequent + * calls to the {@link #applyPattern applyPattern} and {@link #toPattern toPattern} methods as + * well as to the format and {@link #formatToCharacterIterator + * formatToCharacterIterator} methods. * * @param locale the locale to be used when creating or comparing subformats * @stable ICU 3.2 @@ -478,9 +468,9 @@ public void setLocale(ULocale locale) { /* Save the pattern, and then reapply so that */ /* we pick up any changes in locale specific */ /* elements */ - final String existingPattern = toPattern(); /*ibm.3550*/ + final String existingPattern = toPattern(); /*ibm.3550*/ ulocale = locale; - applyPattern(existingPattern); /*ibm.3550*/ + applyPattern(existingPattern); /*ibm.3550*/ } /** @@ -504,14 +494,11 @@ public ULocale getULocale() { } /** - * Sets the pattern used by this message format. - * The method parses the pattern and creates a list of subformats - * for the format elements contained in it. - * Patterns and their interpretation are specified in the - * class description. - *

- * The pattern must contain only named or only numeric arguments, - * mixing them is not allowed. + * Sets the pattern used by this message format. The method parses the pattern and creates a + * list of subformats for the format elements contained in it. Patterns and their interpretation + * are specified in the class description. + * + *

The pattern must contain only named or only numeric arguments, mixing them is not allowed. * * @param pttrn the pattern for this message format * @throws IllegalArgumentException if the pattern is invalid @@ -531,9 +518,8 @@ public void applyPattern(String pttrn) { final char ch = pttrn.charAt(i); if (part == 0) { if (ch == '\'') { - if (i + 1 < pttrn.length() - && pttrn.charAt(i+1) == '\'') { - segments[part].append(ch); // handle doubles + if (i + 1 < pttrn.length() && pttrn.charAt(i + 1) == '\'') { + segments[part].append(ch); // handle doubles ++i; } else { inQuote = !inQuote; @@ -543,40 +529,40 @@ public void applyPattern(String pttrn) { } else { segments[part].append(ch); } - } else if (inQuote) { // just copy quotes in parts + } else if (inQuote) { // just copy quotes in parts segments[part].append(ch); if (ch == '\'') { inQuote = false; } } else { switch (ch) { - case ',': - if (part < 3) { - part += 1; - } else { + case ',': + if (part < 3) { + part += 1; + } else { + segments[part].append(ch); + } + break; + case '{': + ++braceStack; segments[part].append(ch); - } - break; - case '{': - ++braceStack; - segments[part].append(ch); - break; - case '}': - if (braceStack == 0) { - part = 0; - makeFormat(i, formatNumber, segments); - formatNumber++; - } else { - --braceStack; + break; + case '}': + if (braceStack == 0) { + part = 0; + makeFormat(i, formatNumber, segments); + formatNumber++; + } else { + --braceStack; + segments[part].append(ch); + } + break; + case '\'': + inQuote = true; + // fall through, so we keep quotes in other parts + default: segments[part].append(ch); - } - break; - case '\'': - inQuote = true; - // fall through, so we keep quotes in other parts - default: - segments[part].append(ch); - break; + break; } } } @@ -587,11 +573,10 @@ public void applyPattern(String pttrn) { pattern = segments[0].toString(); } - /** - * Returns a pattern representing the current state of the message format. 
- * The string is constructed from internal information and therefore - * does not necessarily equal the previously applied pattern. + * Returns a pattern representing the current state of the message format. The string is + * constructed from internal information and therefore does not necessarily equal the previously + * applied pattern. * * @return a pattern representing the current state of the message format * @stable ICU 3.0 @@ -601,7 +586,7 @@ public String toPattern() { int lastOffset = 0; final StringBuffer result = new StringBuffer(); for (int i = 0; i <= maxOffset; ++i) { - copyAndFixQuotes(pattern, lastOffset, offsets[i],result); + copyAndFixQuotes(pattern, lastOffset, offsets[i], result); lastOffset = offsets[i]; result.append('{'); result.append(argumentNames[i]); @@ -612,27 +597,34 @@ public String toPattern() { return result.toString(); } - static final FormatRegistry formatRegistry = (FormatRegistry) newInstanceOrNull("SimpleFormatRegistry", new FormatRegistry() { - - @Override - public Format getFormat(String mainType, String subType, ULocale ulocale) { - throw new IllegalArgumentException("No available formats; only String arguments permitted"); - } - - @Override - public Format getFormatForObject(Class classType, ULocale ulocale) { - return null; - } - - @Override - public String getKey(Format format, ULocale ulocale) { - throw new IllegalArgumentException("No available formats; only String arguments permitted"); - } - - }); + static final FormatRegistry formatRegistry = + (FormatRegistry) + newInstanceOrNull( + "SimpleFormatRegistry", + new FormatRegistry() { + + @Override + public Format getFormat( + String mainType, String subType, ULocale ulocale) { + throw new IllegalArgumentException( + "No available formats; only String arguments permitted"); + } + + @Override + public Format getFormatForObject(Class classType, ULocale ulocale) { + return null; + } + + @Override + public String getKey(Format format, ULocale ulocale) { + throw new 
IllegalArgumentException( + "No available formats; only String arguments permitted"); + } + }); /** * Utility for refactoring + * * @param className * @return A new instance of that class, or fallback if none available. */ @@ -646,40 +638,32 @@ public static Object newInstanceOrNull(String className, Object fallback) { } /** - * Sets the formats to use for the values passed into - * format methods or returned from parse - * methods. The indices of elements in newFormats - * correspond to the argument indices used in the previously set - * pattern string. - * The order of formats in newFormats thus corresponds to - * the order of elements in the arguments array passed - * to the format methods or the result array returned + * Sets the formats to use for the values passed into format methods or returned + * from parse methods. The indices of elements in newFormats + * correspond to the argument indices used in the previously set pattern string. The order of + * formats in newFormats thus corresponds to the order of elements in the + * arguments array passed to the format methods or the result array returned * by the parse methods. - *

- * If an argument index is used for more than one format element - * in the pattern string, then the corresponding new format is used - * for all such format elements. If an argument index is not used - * for any format element in the pattern string, then the - * corresponding new format is ignored. If fewer formats are provided - * than needed, then only the formats for argument indices less - * than newFormats.length are replaced. - * - * This method is only supported if the format does not use - * named arguments, otherwise an IllegalArgumentException is thrown. - * - * @param newFormats - * the new formats to use - * @throws NullPointerException - * if newFormats is null - * @throws IllegalArgumentException - * if this formatter uses named arguments + * + *

If an argument index is used for more than one format element in the pattern string, then + * the corresponding new format is used for all such format elements. If an argument index is + * not used for any format element in the pattern string, then the corresponding new format is + * ignored. If fewer formats are provided than needed, then only the formats for argument + * indices less than newFormats.length are replaced. + * + *

This method is only supported if the format does not use named arguments, otherwise an + * IllegalArgumentException is thrown. + * + * @param newFormats the new formats to use + * @throws NullPointerException if newFormats is null + * @throws IllegalArgumentException if this formatter uses named arguments * @stable ICU 3.0 */ public void setFormatsByArgumentIndex(Format[] newFormats) { if (!argumentNamesAreNumeric) { throw new IllegalArgumentException( - "This method is not available in MessageFormat objects " + - "that use alphanumeric argument names."); + "This method is not available in MessageFormat objects " + + "that use alphanumeric argument names."); } for (int i = 0; i <= maxOffset; i++) { final int j = Integer.parseInt(argumentNames[i]); @@ -690,24 +674,18 @@ public void setFormatsByArgumentIndex(Format[] newFormats) { } /** - * Sets the formats to use for the values passed into - * format methods or returned from parse - * methods. The keys in newFormats are the argument - * names in the previously set pattern string, and the values - * are the formats. - *

- * Only argument names from the pattern string are considered. - * Extra keys in newFormats that do not correspond - * to an argument name are ignored. Similarly, if there is no - * format in newFormats for an argument name, the formatter - * for that argument remains unchanged. - *

- * This may be called on formats that do not use named arguments. - * In this case the map will be queried for key Strings that - * represent argument indices, e.g. "0", "1", "2" etc. + * Sets the formats to use for the values passed into format methods or returned + * from parse methods. The keys in newFormats are the argument names + * in the previously set pattern string, and the values are the formats. * - * @param newFormats a map from String to Format providing new - * formats for named arguments. + *

Only argument names from the pattern string are considered. Extra keys in newFormats + * that do not correspond to an argument name are ignored. Similarly, if there is no + * format in newFormats for an argument name, the formatter for that argument remains unchanged. + * + *

This may be called on formats that do not use named arguments. In this case the map will + * be queried for key Strings that represent argument indices, e.g. "0", "1", "2" etc. + * + * @param newFormats a map from String to Format providing new formats for named arguments. * @stable ICU 3.8 */ public void setFormatsByArgumentName(Map newFormats) { @@ -720,23 +698,19 @@ public void setFormatsByArgumentName(Map newFormats) { } /** - * Sets the formats to use for the format elements in the - * previously set pattern string. - * The order of formats in newFormats corresponds to - * the order of format elements in the pattern string. - *

- * If more formats are provided than needed by the pattern string, - * the remaining ones are ignored. If fewer formats are provided - * than needed, then only the first newFormats.length - * formats are replaced. - *

- * Since the order of format elements in a pattern string often - * changes during localization, it is generally better to use the - * {@link #setFormatsByArgumentIndex setFormatsByArgumentIndex} - * method, which assumes an order of formats corresponding to the - * order of elements in the arguments array passed to - * the format methods or the result array returned by - * the parse methods. + * Sets the formats to use for the format elements in the previously set pattern string. The + * order of formats in newFormats corresponds to the order of format elements in + * the pattern string. + * + *

If more formats are provided than needed by the pattern string, the remaining ones are + * ignored. If fewer formats are provided than needed, then only the first + * newFormats.length formats are replaced. + * + *

Since the order of format elements in a pattern string often changes during localization, + * it is generally better to use the {@link #setFormatsByArgumentIndex + * setFormatsByArgumentIndex} method, which assumes an order of formats corresponding to the + * order of elements in the arguments array passed to the format + * methods or the result array returned by the parse methods. * * @param newFormats the new formats to use * @exception NullPointerException if newFormats is null @@ -753,35 +727,28 @@ public void setFormats(Format[] newFormats) { } /** - * Sets the format to use for the format elements within the - * previously set pattern string that use the given argument - * index. - * The argument index is part of the format element definition and - * represents an index into the arguments array passed - * to the format methods or the result array returned - * by the parse methods. - *

- * If the argument index is used for more than one format element - * in the pattern string, then the new format is used for all such - * format elements. If the argument index is not used for any format - * element in the pattern string, then the new format is ignored. - * - * This method is only supported when exclusively numbers are used for - * argument names. Otherwise an IllegalArgumentException is thrown. - * - * @param argumentIndex - * the argument index for which to use the new format - * @param newFormat - * the new format to use - * @exception IllegalArgumentException - * if alphanumeric arguments where used in MessageFormat. + * Sets the format to use for the format elements within the previously set pattern string that + * use the given argument index. The argument index is part of the format element definition and + * represents an index into the arguments array passed to the format + * methods or the result array returned by the parse methods. + * + *

If the argument index is used for more than one format element in the pattern string, then + * the new format is used for all such format elements. If the argument index is not used for + * any format element in the pattern string, then the new format is ignored. + * + *

This method is only supported when exclusively numbers are used for argument names. + * Otherwise an IllegalArgumentException is thrown. + * + * @param argumentIndex the argument index for which to use the new format + * @param newFormat the new format to use + * @exception IllegalArgumentException if alphanumeric arguments were used in MessageFormat. * @stable ICU 3.0 */ public void setFormatByArgumentIndex(int argumentIndex, Format newFormat) { if (!argumentNamesAreNumeric) { throw new IllegalArgumentException( - "This method is not available in MessageFormat objects " + - "that use alphanumeric argument names."); + "This method is not available in MessageFormat objects " + + "that use alphanumeric argument names."); } for (int j = 0; j <= maxOffset; j++) { if (Integer.parseInt(argumentNames[j]) == argumentIndex) { @@ -791,19 +758,16 @@ public void setFormatByArgumentIndex(int argumentIndex, Format newFormat) { } /** - * Sets the format to use for the format elements within the - * previously set pattern string that use the given argument - * name. - *

- * If the argument name is used for more than one format element - * in the pattern string, then the new format is used for all such - * format elements. If the argument name is not used for any format - * element in the pattern string, then the new format is ignored. - *

- * This API may be used on formats that do not use named arguments. - * In this case argumentName should be a String that names - * an argument index, e.g. "0", "1", "2"... etc. If it does not name - * a valid index, the format will be ignored. No error is thrown. + * Sets the format to use for the format elements within the previously set pattern string that + * use the given argument name. + * + *

If the argument name is used for more than one format element in the pattern string, then + * the new format is used for all such format elements. If the argument name is not used for any + * format element in the pattern string, then the new format is ignored. + * + *

This API may be used on formats that do not use named arguments. In this case + * argumentName should be a String that names an argument index, e.g. "0", "1", "2"... + * etc. If it does not name a valid index, the format will be ignored. No error is thrown. * * @param argumentName the name of the argument to change * @param newFormat the new format to use @@ -818,21 +782,18 @@ public void setFormatByArgumentName(String argumentName, Format newFormat) { } /** - * Sets the format to use for the format element with the given - * format element index within the previously set pattern string. - * The format element index is the zero-based number of the format - * element counting from the start of the pattern string. - *

- * Since the order of format elements in a pattern string often - * changes during localization, it is generally better to use the - * {@link #setFormatByArgumentIndex setFormatByArgumentIndex} - * method, which accesses format elements based on the argument - * index they specify. + * Sets the format to use for the format element with the given format element index within the + * previously set pattern string. The format element index is the zero-based number of the + * format element counting from the start of the pattern string. + * + *

Since the order of format elements in a pattern string often changes during localization, + * it is generally better to use the {@link #setFormatByArgumentIndex setFormatByArgumentIndex} + * method, which accesses format elements based on the argument index they specify. * * @param formatElementIndex the index of a format element within the pattern * @param newFormat the format to use for the specified format element - * @exception ArrayIndexOutOfBoundsException if formatElementIndex is equal to or - * larger than the number of format elements in the pattern string + * @exception ArrayIndexOutOfBoundsException if formatElementIndex is equal to or larger than + * the number of format elements in the pattern string * @stable ICU 3.0 */ public void setFormat(int formatElementIndex, Format newFormat) { @@ -840,35 +801,30 @@ public void setFormat(int formatElementIndex, Format newFormat) { } /** - * Gets the formats used for the values passed into - * format methods or returned from parse - * methods. The indices of elements in the returned array - * correspond to the argument indices used in the previously set - * pattern string. - * The order of formats in the returned array thus corresponds to - * the order of elements in the arguments array passed - * to the format methods or the result array returned - * by the parse methods. - *

- * If an argument index is used for more than one format element - * in the pattern string, then the format used for the last such - * format element is returned in the array. If an argument index - * is not used for any format element in the pattern string, then - * null is returned in the array. + * Gets the formats used for the values passed into format methods or returned from + * parse methods. The indices of elements in the returned array correspond to the + * argument indices used in the previously set pattern string. The order of formats in the + * returned array thus corresponds to the order of elements in the arguments array + * passed to the format methods or the result array returned by the parse + * methods. * - * This method is only supported when exclusively numbers are used for - * argument names. Otherwise an IllegalArgumentException is thrown. + *

If an argument index is used for more than one format element in the pattern string, then + * the format used for the last such format element is returned in the array. If an argument + * index is not used for any format element in the pattern string, then null is returned in the + * array. + * + *

This method is only supported when exclusively numbers are used for argument names. + * Otherwise an IllegalArgumentException is thrown. * * @return the formats used for the arguments within the pattern - * @throws IllegalArgumentException - * if this format uses named arguments + * @throws IllegalArgumentException if this format uses named arguments * @stable ICU 3.0 */ public Format[] getFormatsByArgumentIndex() { if (!argumentNamesAreNumeric) { throw new IllegalArgumentException( - "This method is not available in MessageFormat objects " + - "that use alphanumeric argument names."); + "This method is not available in MessageFormat objects " + + "that use alphanumeric argument names."); } int maximumArgumentNumber = -1; for (int i = 0; i <= maxOffset; i++) { @@ -887,21 +843,18 @@ public Format[] getFormatsByArgumentIndex() { // Where Map is: String argumentName --> Format format. /** - * Gets the formats used for the format elements in the - * previously set pattern string. - * The order of formats in the returned array corresponds to - * the order of format elements in the pattern string. - *

- * Since the order of format elements in a pattern string often - * changes during localization, it's generally better to use the - * {@link #getFormatsByArgumentIndex()} - * method, which assumes an order of formats corresponding to the - * order of elements in the arguments array passed to - * the format methods or the result array returned by - * the parse methods. + * Gets the formats used for the format elements in the previously set pattern string. The order + * of formats in the returned array corresponds to the order of format elements in the pattern + * string. + * + *

Since the order of format elements in a pattern string often changes during localization, + * it's generally better to use the {@link #getFormatsByArgumentIndex()} method, which assumes + * an order of formats corresponding to the order of elements in the arguments + * array passed to the format methods or the result array returned by the + * parse methods. * - * This method is only supported when exclusively numbers are used for - * argument names. Otherwise an IllegalArgumentException is thrown. + *

This method is only supported when exclusively numbers are used for argument names. + * Otherwise an IllegalArgumentException is thrown. * * @return the formats used for the format elements in the pattern * @stable ICU 3.0 @@ -913,7 +866,9 @@ public Format[] getFormats() { } /** - * Get the format argument names. For more details, see {@link #setFormatByArgumentName(String, Format)}. + * Get the format argument names. For more details, see {@link #setFormatByArgumentName(String, + * Format)}. + * * @return List of names * @deprecated * @internal @@ -928,7 +883,9 @@ public Set getFormatArgumentNames() { } /** - * Get the formats according to their argument names. For more details, see {@link #setFormatByArgumentName(String, Format)}. + * Get the formats according to their argument names. For more details, see {@link + * #setFormatByArgumentName(String, Format)}. + * * @return format associated with the name, or null if there isn't one. * @deprecated * @internal @@ -944,18 +901,17 @@ public Format getFormatByArgumentName(String argumentName) { } /** - * Formats an array of objects and appends the MessageFormat's - * pattern, with format elements replaced by the formatted objects, to the - * provided StringBuffer. - *

- * The text substituted for the individual format elements is derived from - * the current subformat of the format element and the - * arguments element at the format element's argument index - * as indicated by the first matching line of the following table. An - * argument is unavailable if arguments is - * null or has fewer than argumentIndex+1 elements. When - * an argument is unavailable no substitution is performed. + * Formats an array of objects and appends the MessageFormat's pattern, with format + * elements replaced by the formatted objects, to the provided StringBuffer. + * + *

The text substituted for the individual format elements is derived from the current + * subformat of the format element and the arguments element at the format + * element's argument index as indicated by the first matching line of the following table. An + * argument is unavailable if arguments is null or has fewer + * than argumentIndex+1 elements. When an argument is unavailable no substitution is performed. + * *

+ * * * *
Subformat @@ -996,38 +952,34 @@ public Format getFormatByArgumentName(String argumentName) { * any * argument.toString() *
- *

- * If pos is non-null, and refers to - * Field.ARGUMENT, the location of the first formatted - * string will be returned. * - * This method is only supported when the format does not use named - * arguments, otherwise an IllegalArgumentException is thrown. + *

If pos is non-null, and refers to Field.ARGUMENT, the location + * of the first formatted string will be returned. + * + *

This method is only supported when the format does not use named arguments, otherwise an + * IllegalArgumentException is thrown. * * @param arguments an array of objects to be formatted and substituted. * @param result where text is appended. - * @param pos On input: an alignment field, if desired. - * On output: the offsets of the alignment field. - * @throws IllegalArgumentException if an argument in the - * arguments array is not of the type - * expected by the format element(s) that use it. - * @throws IllegalArgumentException - * if this format uses named arguments + * @param pos On input: an alignment field, if desired. On output: the offsets of the alignment + * field. + * @throws IllegalArgumentException if an argument in the arguments array is not of + * the type expected by the format element(s) that use it. + * @throws IllegalArgumentException if this format uses named arguments * @stable ICU 3.0 */ - public final StringBuffer format(Object[] arguments, StringBuffer result, - FieldPosition pos) - { + public final StringBuffer format(Object[] arguments, StringBuffer result, FieldPosition pos) { if (!argumentNamesAreNumeric) { throw new IllegalArgumentException( - "This method is not available in MessageFormat objects " + - "that use alphanumeric argument names."); + "This method is not available in MessageFormat objects " + + "that use alphanumeric argument names."); } return subformat(arguments, result, pos, null); } /** * Format a list of arguments + * * @param arguments * @return */ @@ -1036,52 +988,49 @@ public final String format(Object... arguments) { } /** - * Formats a map of objects and appends the MessageFormat's - * pattern, with format elements replaced by the formatted objects, to the - * provided StringBuffer. - *

- * The text substituted for the individual format elements is derived from - * the current subformat of the format element and the - * arguments value corresopnding to the format element's - * argument name. - *

- * This API may be called on formats that do not use named arguments. - * In this case the the keys in arguments must be numeric - * strings (e.g. "0", "1", "2"...). - *

- * An argument is unavailable if arguments is - * null or does not have a value corresponding to an argument - * name in the pattern. When an argument is unavailable no substitution - * is performed. + * Formats a map of objects and appends the MessageFormat's pattern, with format + * elements replaced by the formatted objects, to the provided StringBuffer. + * + *

The text substituted for the individual format elements is derived from the current + * subformat of the format element and the arguments value corresponding to the + * format element's argument name. + * + *

This API may be called on formats that do not use named arguments. In this case the + * keys in arguments must be numeric strings (e.g. "0", "1", "2"...). + * + *

An argument is unavailable if arguments is null or does + * not have a value corresponding to an argument name in the pattern. When an argument is + * unavailable no substitution is performed. * * @param arguments a map of objects to be formatted and substituted. * @param result where text is appended. - * @param pos On input: an alignment field, if desired. - * On output: the offsets of the alignment field. - * @throws IllegalArgumentException if an argument in the - * arguments array is not of the type - * expected by the format element(s) that use it. + * @param pos On input: an alignment field, if desired. On output: the offsets of the alignment + * field. + * @throws IllegalArgumentException if an argument in the arguments array is not of + * the type expected by the format element(s) that use it. * @return the passed-in StringBuffer * @stable ICU 3.8 */ - public final StringBuffer format(Map arguments, StringBuffer result, - FieldPosition pos) { + public final StringBuffer format( + Map arguments, StringBuffer result, FieldPosition pos) { return subformat(arguments, result, pos, null); } /** - * Creates a MessageFormat with the given pattern and uses it - * to format the given arguments. This is equivalent to + * Creates a MessageFormat with the given pattern and uses it to format the given arguments. + * This is equivalent to + * *

- * (new {@link #MessageFormat(String) MessageFormat}(pattern)).{@link #format(java.lang.Object[], java.lang.StringBuffer, java.text.FieldPosition) format}(arguments, new StringBuffer(), null).toString() + * + * + * (new {@link #MessageFormat(String) MessageFormat}(pattern)).{@link #format(java.lang.Object[], java.lang.StringBuffer, java.text.FieldPosition) format}(arguments, new StringBuffer(), null).toString() + * + * *
* - * @throws IllegalArgumentException if the pattern is invalid, - * or if an argument in the arguments array - * is not of the type expected by the format element(s) - * that use it. - * @throws IllegalArgumentException - * if this format uses named arguments + * @throws IllegalArgumentException if the pattern is invalid, or if an argument in the + * arguments array is not of the type expected by the format element(s) that use it. + * @throws IllegalArgumentException if this format uses named arguments * @stable ICU 3.0 */ public static String format(String pattern, Object... arguments) { @@ -1090,14 +1039,13 @@ public static String format(String pattern, Object... arguments) { } /** - * Creates a MessageFormat with the given pattern and uses it to - * format the given arguments. The pattern must identifyarguments - * by name instead of by number. + * Creates a MessageFormat with the given pattern and uses it to format the given arguments. The + * pattern must identify arguments by name instead of by number. + * *

- * @throws IllegalArgumentException if the pattern is invalid, - * or if an argument in the arguments map - * is not of the type expected by the format element(s) - * that use it. + * + * @throws IllegalArgumentException if the pattern is invalid, or if an argument in the + * arguments map is not of the type expected by the format element(s) that use it. * @see #format(Map, StringBuffer, FieldPosition) * @see #format(String, Object[]) * @stable ICU 3.8 @@ -1108,8 +1056,8 @@ public static String format(String pattern, Map arguments) { } /** - * Returns true if this MessageFormat uses named arguments, - * and false otherwise. See class description. + * Returns true if this MessageFormat uses named arguments, and false otherwise. See class + * description. * * @return true if named arguments are used. * @stable ICU 3.8 @@ -1143,68 +1091,67 @@ public boolean usesNamedArguments() { */ @Override @SuppressWarnings("unchecked") - public final StringBuffer format(Object arguments, StringBuffer result, - FieldPosition pos) - { + public final StringBuffer format(Object arguments, StringBuffer result, FieldPosition pos) { if ((arguments == null || arguments instanceof Map)) { - return subformat((Map)arguments, result, pos, null); + return subformat((Map) arguments, result, pos, null); } else { if (!argumentNamesAreNumeric) { throw new IllegalArgumentException( - "This method is not available in MessageFormat objects " + - "that use alphanumeric argument names."); + "This method is not available in MessageFormat objects " + + "that use alphanumeric argument names."); } return subformat((Object[]) arguments, result, pos, null); } } /** - * Formats an array of objects and inserts them into the - * MessageFormat's pattern, producing an - * AttributedCharacterIterator. - * You can use the returned AttributedCharacterIterator - * to build the resulting String, as well as to determine information - * about the resulting String. - *

- * The text of the returned AttributedCharacterIterator is - * the same that would be returned by + * Formats an array of objects and inserts them into the MessageFormat's pattern, + * producing an AttributedCharacterIterator. You can use the returned + * AttributedCharacterIterator to build the resulting String, as well as to determine + * information about the resulting String. + * + *

The text of the returned AttributedCharacterIterator is the same that would + * be returned by + * *

- * {@link #format(java.lang.Object[], java.lang.StringBuffer, java.text.FieldPosition) format}(arguments, new StringBuffer(), null).toString() + * + * + * {@link #format(java.lang.Object[], java.lang.StringBuffer, java.text.FieldPosition) format}(arguments, new StringBuffer(), null).toString() + * + * *
- *

- * In addition, the AttributedCharacterIterator contains at - * least attributes indicating where text was generated from an - * argument in the arguments array. The keys of these attributes are of - * type MessageFormat.Field, their values are - * Integer objects indicating the index in the arguments - * array of the argument from which the text was generated. - *

- * The attributes/value from the underlying Format - * instances that MessageFormat uses will also be - * placed in the resulting AttributedCharacterIterator. - * This allows you to not only find where an argument is placed in the - * resulting String, but also which fields it contains in turn. + * + *

In addition, the AttributedCharacterIterator contains at least attributes + * indicating where text was generated from an argument in the arguments array. The + * keys of these attributes are of type MessageFormat.Field, their values are + * Integer objects indicating the index in the arguments array of the + * argument from which the text was generated. + * + *

The attributes/value from the underlying Format instances that + * MessageFormat uses will also be placed in the resulting + * AttributedCharacterIterator. This allows you to not only find where an argument is + * placed in the resulting String, but also which fields it contains in turn. * * @param arguments an array of objects to be formatted and substituted. * @return AttributedCharacterIterator describing the formatted value. * @exception NullPointerException if arguments is null. - * @exception IllegalArgumentException if an argument in the - * arguments array is not of the type - * expected by the format element(s) that use it. + * @exception IllegalArgumentException if an argument in the arguments array is not + * of the type expected by the format element(s) that use it. * @stable ICU 3.8 */ @Override @SuppressWarnings("unchecked") public AttributedCharacterIterator formatToCharacterIterator(Object arguments) { final StringBuffer result = new StringBuffer(); - final ArrayList iterators = new ArrayList(); + final ArrayList iterators = + new ArrayList(); if (arguments == null) { throw new NullPointerException( "formatToCharacterIterator must be passed non-null object"); } if (arguments instanceof Map) { - subformat((Map)arguments, result, null, iterators); + subformat((Map) arguments, result, null, iterators); } else { subformat((Object[]) arguments, result, null, iterators); } @@ -1218,33 +1165,28 @@ public AttributedCharacterIterator formatToCharacterIterator(Object arguments) { /** * Parses the string. * - *

Caveats: The parse may fail in a number of circumstances. - * For example: + *

Caveats: The parse may fail in a number of circumstances. For example: + * *

    - *
  • If one of the arguments does not occur in the pattern. - *
  • If the format of an argument loses information, such as - * with a choice format where a large number formats to "many". - *
  • Does not yet handle recursion (where - * the substituted strings contain {n} references.) - *
  • Will not always find a match (or the correct match) - * if some part of the parse is ambiguous. - * For example, if the pattern "{1},{2}" is used with the - * string arguments {"a,b", "c"}, it will format as "a,b,c". - * When the result is parsed, it will return {"a", "b,c"}. - *
  • If a single argument is parsed more than once in the string, - * then the later parse wins. + *
  • If one of the arguments does not occur in the pattern. + *
  • If the format of an argument loses information, such as with a choice format where a + * large number formats to "many". + *
  • Does not yet handle recursion (where the substituted strings contain {n} references.) + *
  • Will not always find a match (or the correct match) if some part of the parse is + * ambiguous. For example, if the pattern "{1},{2}" is used with the string arguments + * {"a,b", "c"}, it will format as "a,b,c". When the result is parsed, it will return + * {"a", "b,c"}. + *
  • If a single argument is parsed more than once in the string, then the later parse wins. *
- * When the parse fails, use ParsePosition.getErrorIndex() to find out - * where in the string did the parsing failed. The returned error - * index is the starting offset of the sub-patterns that the string - * is comparing with. For example, if the parsing string "AAA {0} BBB" - * is comparing against the pattern "AAD {0} BBB", the error index is - * 0. When an error occurs, the call to this method will return null. - * If the source is null, return an empty array. - *

- * This method is only supported with numbered arguments. If - * the format pattern used named argument an - * IllegalArgumentException is thrown. + * + * When the parse fails, use ParsePosition.getErrorIndex() to find out where in the string did + * the parsing failed. The returned error index is the starting offset of the sub-patterns that + * the string is comparing with. For example, if the parsing string "AAA {0} BBB" is comparing + * against the pattern "AAD {0} BBB", the error index is 0. When an error occurs, the call to + * this method will return null. If the source is null, return an empty array. + * + *

This method is only supported with numbered arguments. If the format pattern used named + * argument an IllegalArgumentException is thrown. * * @throws IllegalArgumentException if this format uses named arguments * @stable ICU 3.0 @@ -1252,8 +1194,8 @@ public AttributedCharacterIterator formatToCharacterIterator(Object arguments) { public Object[] parse(String source, ParsePosition pos) { if (!argumentNamesAreNumeric) { throw new IllegalArgumentException( - "This method is not available in MessageFormat objects " + - "that use named argument."); + "This method is not available in MessageFormat objects " + + "that use named argument."); } final Map objectMap = parseToMap(source, pos); int maximumArgumentNumber = -1; @@ -1277,15 +1219,13 @@ public Object[] parse(String source, ParsePosition pos) { } /** - * Parses the string, returning the results in a Map. - * This is similar to the version that returns an array - * of Object. This supports both named and numbered - * arguments-- if numbered, the keys in the map are the - * corresponding Strings (e.g. "0", "1", "2"...). + * Parses the string, returning the results in a Map. This is similar to the version that + * returns an array of Object. This supports both named and numbered arguments-- if numbered, + * the keys in the map are the corresponding Strings (e.g. "0", "1", "2"...). * * @param source the text to parse - * @param pos the position at which to start parsing. on return, - * contains the result of the parse. + * @param pos the position at which to start parsing. on return, contains the result of the + * parse. * @return a Map containing key/value pairs for each parsed argument. 
* @stable ICU 3.8 */ @@ -1312,8 +1252,7 @@ public Map parseToMap(String source, ParsePosition pos) { for (int i = 0; i <= maxOffset; ++i) { // match up to format final int len = offsets[i] - patternOffset; - if (len == 0 || pattern.regionMatches(patternOffset, - source, sourceOffset, len)) { + if (len == 0 || pattern.regionMatches(patternOffset, source, sourceOffset, len)) { sourceOffset += len; patternOffset += len; } else { @@ -1322,17 +1261,19 @@ public Map parseToMap(String source, ParsePosition pos) { } // now use format - if (formats[i] == null) { // string format + if (formats[i] == null) { // string format // if at end, use longest possible match // otherwise uses first match to intervening string // does NOT recursively try all possibilities - final int tempLength = (i != maxOffset) ? offsets[i+1] : pattern.length(); + final int tempLength = (i != maxOffset) ? offsets[i + 1] : pattern.length(); int next; if (patternOffset >= tempLength) { next = source.length(); - }else{ - next = source.indexOf( pattern.substring(patternOffset,tempLength), sourceOffset); + } else { + next = + source.indexOf( + pattern.substring(patternOffset, tempLength), sourceOffset); } if (next < 0) { @@ -1360,8 +1301,7 @@ public Map parseToMap(String source, ParsePosition pos) { } } final int len = pattern.length() - patternOffset; - if (len == 0 || pattern.regionMatches(patternOffset, - source, sourceOffset, len)) { + if (len == 0 || pattern.regionMatches(patternOffset, source, sourceOffset, len)) { pos.setIndex(sourceOffset + len); } else { pos.setErrorIndex(sourceOffset); @@ -1371,43 +1311,38 @@ public Map parseToMap(String source, ParsePosition pos) { } /** - * Parses text from the beginning of the given string to produce an object - * array. - * The method may not use the entire text of the given string. - *

- * See the {@link #parse(String, ParsePosition)} method for more information - * on message parsing. + * Parses text from the beginning of the given string to produce an object array. The method may + * not use the entire text of the given string. + * + *

See the {@link #parse(String, ParsePosition)} method for more information on message + * parsing. * * @param source A String whose beginning should be parsed. * @return An Object array parsed from the string. - * @exception ParseException - * if the beginning of the specified string cannot be parsed. - * @exception IllegalArgumentException - * if this format uses named arguments + * @exception ParseException if the beginning of the specified string cannot be parsed. + * @exception IllegalArgumentException if this format uses named arguments * @stable ICU 3.0 */ public Object[] parse(String source) throws ParseException { final ParsePosition pos = new ParsePosition(0); final Object[] result = parse(source, pos); if (pos.getIndex() == 0) { - throw new ParseException("MessageFormat parse error!", - pos.getErrorIndex()); + throw new ParseException("MessageFormat parse error!", pos.getErrorIndex()); } return result; } /** - * Parses text from the beginning of the given string to produce a map from - * argument to values. The method may not use the entire text of the given string. - *

- * See the {@link #parse(String, ParsePosition)} method for more information - * on message parsing. + * Parses text from the beginning of the given string to produce a map from argument to values. + * The method may not use the entire text of the given string. + * + *

See the {@link #parse(String, ParsePosition)} method for more information on message + * parsing. * * @param source A String whose beginning should be parsed. * @return A Map parsed from the string. - * @throws ParseException if the beginning of the specified string cannot - * be parsed. + * @throws ParseException if the beginning of the specified string cannot be parsed. * @see #parseToMap(String, ParsePosition) * @stable ICU 3.8 */ @@ -1416,8 +1351,7 @@ public Map parseToMap(String source) throws ParseException { final ParsePosition pos = new ParsePosition(0); final Map result = parseToMap(source, pos); if (pos.getIndex() == 0) { - throw new ParseException("MessageFormat parse error!", - pos.getErrorIndex()); + throw new ParseException("MessageFormat parse error!", pos.getErrorIndex()); } return result; @@ -1425,28 +1359,24 @@ public Map parseToMap(String source) throws ParseException { /** * Parses text from a string to produce an object array or Map. - *

- * The method attempts to parse text starting at the index given by - * pos. - * If parsing succeeds, then the index of pos is updated - * to the index after the last character used (parsing does not necessarily - * use all characters up to the end of the string), and the parsed - * object array is returned. The updated pos can be used to - * indicate the starting point for the next call to this method. - * If an error occurs, then the index of pos is not - * changed, the error index of pos is set to the index of + * + *

The method attempts to parse text starting at the index given by pos. If + * parsing succeeds, then the index of pos is updated to the index after the last + * character used (parsing does not necessarily use all characters up to the end of the string), + * and the parsed object array is returned. The updated pos can be used to indicate + * the starting point for the next call to this method. If an error occurs, then the index of + * pos is not changed, the error index of pos is set to the index of * the character where the error occurred, and null is returned. - *

- * See the {@link #parse(String, ParsePosition)} method for more information - * on message parsing. + * + *

See the {@link #parse(String, ParsePosition)} method for more information on message + * parsing. * * @param source A String, part of which should be parsed. - * @param pos A ParsePosition object with index and error - * index information as described above. - * @return An Object parsed from the string, either an - * array of Object, or a Map, depending on whether named - * arguments are used. This can be queried using usesNamedArguments. - * In case of error, returns null. + * @param pos A ParsePosition object with index and error index information as + * described above. + * @return An Object parsed from the string, either an array of Object, or a Map, + * depending on whether named arguments are used. This can be queried using + * usesNamedArguments. In case of error, returns null. * @throws NullPointerException if pos is null. * @stable ICU 3.0 */ @@ -1486,6 +1416,7 @@ public Object clone() { /** * Equality comparison between two message format objects + * * @stable ICU 3.0 */ @Override @@ -1508,6 +1439,7 @@ public boolean equals(Object obj) { /** * Generates a hash code for the message format object. + * * @stable ICU 3.0 */ @Override @@ -1516,9 +1448,8 @@ public int hashCode() { } /** - * Defines constants that are used as attribute keys in the - * AttributedCharacterIterator returned - * from MessageFormat.formatToCharacterIterator. + * Defines constants that are used as attribute keys in the AttributedCharacterIterator + * returned from MessageFormat.formatToCharacterIterator. * * @stable ICU 3.8 */ @@ -1530,7 +1461,6 @@ public static class Field extends Format.Field { * Create a Field with the specified name. * * @param name The name of the attribute - * * @stable ICU 3.8 */ protected Field(String name) { @@ -1542,13 +1472,13 @@ protected Field(String name) { * * @return resolved MessageFormat.Field constant * @throws InvalidObjectException if the constant could not be resolved. 
- * * @stable ICU 3.8 */ @Override protected Object readResolve() throws InvalidObjectException { if (this.getClass() != MessageFormat.Field.class) { - throw new InvalidObjectException("A subclass of MessageFormat.Field must implement readResolve."); + throw new InvalidObjectException( + "A subclass of MessageFormat.Field must implement readResolve."); } if (getName().equals(ARGUMENT.getName())) { return ARGUMENT; @@ -1558,36 +1488,37 @@ protected Object readResolve() throws InvalidObjectException { } /** - * Constant identifying a portion of a message that was generated - * from an argument passed into formatToCharacterIterator. - * The value associated with the key will be an Integer - * indicating the index in the arguments array of the + * Constant identifying a portion of a message that was generated from an argument passed + * into formatToCharacterIterator. The value associated with the key will be an + * Integer indicating the index in the arguments array of the * argument from which the text was generated. * * @stable ICU 3.8 */ public static final Field ARGUMENT = new Field("message argument field"); - } // ===========================privates============================ /** - * The locale to use for formatting numbers and dates. - * This is no longer used, and here only for serialization compatibility. + * The locale to use for formatting numbers and dates. This is no longer used, and here only for + * serialization compatibility. + * * @serial */ private Locale locale; /** * The locale to use for formatting numbers and dates. + * * @serial */ private ULocale ulocale; /** - * The string that the formatted values are to be plugged into. In other words, this - * is the pattern supplied on construction with all of the {} expressions taken out. + * The string that the formatted values are to be plugged into. In other words, this is the + * pattern supplied on construction with all of the {} expressions taken out. 
+ * * @serial */ private String pattern = ""; @@ -1597,31 +1528,31 @@ protected Object readResolve() throws InvalidObjectException { /** * An array of formatters, which are used to format the arguments. + * * @serial */ private Format[] formats = new Format[INITIAL_FORMATS]; /** - * The positions where the results of formatting each argument are to be - * inserted into the pattern. + * The positions where the results of formatting each argument are to be inserted into the + * pattern. * * @serial */ private int[] offsets = new int[INITIAL_FORMATS]; /** - * The argument numbers corresponding to each formatter. (The formatters are stored - * in the order they occur in the pattern, not in the order in which the arguments - * are specified.) + * The argument numbers corresponding to each formatter. (The formatters are stored in the order + * they occur in the pattern, not in the order in which the arguments are specified.) + * * @serial */ // retained for backwards compatibility private final int[] argumentNumbers = new int[INITIAL_FORMATS]; /** - * The argument names corresponding to each formatter. (The formatters are - * stored in the order they occur in the pattern, not in the order in which - * the arguments are specified.) + * The argument names corresponding to each formatter. (The formatters are stored in the order + * they occur in the pattern, not in the order in which the arguments are specified.) * * @serial */ @@ -1635,40 +1566,43 @@ protected Object readResolve() throws InvalidObjectException { private boolean argumentNamesAreNumeric = true; /** - * One less than the number of entries in offsets. Can also be thought of - * as the index of the highest-numbered element in offsets that is being used. - * All of these arrays should have the same number of elements being used as offsets - * does, and so this variable suffices to tell us how many entries are in all of them. + * One less than the number of entries in offsets. 
Can also be thought of as the + * index of the highest-numbered element in offsets that is being used. All of + * these arrays should have the same number of elements being used as offsets does, + * and so this variable suffices to tell us how many entries are in all of them. + * * @serial */ private int maxOffset = -1; /** - * Internal routine used by format. If characterIterators is - * non-null, AttributedCharacterIterator will be created from the - * subformats as necessary. If characterIterators is null - * and fp is non-null and identifies - * Field.MESSAGE_ARGUMENT, the location of - * the first replaced argument will be set in it. - * - * @exception IllegalArgumentException if an argument in the - * arguments array is not of the type - * expected by the format element(s) that use it. + * Internal routine used by format. If characterIterators is non-null, + * AttributedCharacterIterator will be created from the subformats as necessary. If + * characterIterators is null and fp is non-null and identifies + * Field.MESSAGE_ARGUMENT, the location of the first replaced argument will be set in it. + * + * @exception IllegalArgumentException if an argument in the arguments array is not + * of the type expected by the format element(s) that use it. */ - private StringBuffer subformat(Object[] arguments, StringBuffer result, - FieldPosition fp, List characterIterators) { + private StringBuffer subformat( + Object[] arguments, + StringBuffer result, + FieldPosition fp, + List characterIterators) { return subformat(arrayToMap(arguments), result, fp, characterIterators); } /** * Internal routine used by format. * - * @throws IllegalArgumentException if an argument in the - * arguments map is not of the type - * expected by the format element(s) that use it. + * @throws IllegalArgumentException if an argument in the arguments map is not of + * the type expected by the format element(s) that use it. 
*/ - private StringBuffer subformat(Map arguments, StringBuffer result, - FieldPosition fp, List characterIterators) { + private StringBuffer subformat( + Map arguments, + StringBuffer result, + FieldPosition fp, + List characterIterators) { // note: this implementation assumes a fast substring & index. // if this is not true, would be better to append chars one by one. int lastOffset = 0; @@ -1718,8 +1652,7 @@ private StringBuffer subformat(Map arguments, StringBuffer resul // to get the CharacterIterator from the child formatter. if (last != result.length()) { characterIterators.add( - _createAttributedCharacterIterator(result.substring - (last))); + _createAttributedCharacterIterator(result.substring(last))); last = result.length(); } if (subFormatter != null) { @@ -1730,8 +1663,11 @@ private StringBuffer subformat(Map arguments, StringBuffer resul if (last != result.length()) { characterIterators.add( _createAttributedCharacterIterator( - subIterator, Field.ARGUMENT, - argumentNamesAreNumeric ? (Object)new Integer(argumentName) : (Object)argumentName)); + subIterator, + Field.ARGUMENT, + argumentNamesAreNumeric + ? (Object) new Integer(argumentName) + : (Object) argumentName)); last = result.length(); } arg = null; @@ -1740,8 +1676,11 @@ private StringBuffer subformat(Map arguments, StringBuffer resul result.append(arg); characterIterators.add( _createAttributedCharacterIterator( - arg, Field.ARGUMENT, - argumentNamesAreNumeric ? (Object)new Integer(argumentName) : (Object)argumentName)); + arg, + Field.ARGUMENT, + argumentNamesAreNumeric + ? 
(Object) new Integer(argumentName) + : (Object) argumentName)); last = result.length(); } } else { @@ -1750,8 +1689,7 @@ private StringBuffer subformat(Map arguments, StringBuffer resul } last = result.length(); result.append(arg); - if (i == 0 && fp != null && Field.ARGUMENT.equals( - fp.getFieldAttribute())) { + if (i == 0 && fp != null && Field.ARGUMENT.equals(fp.getFieldAttribute())) { fp.setBeginIndex(last); fp.setEndIndex(result.length()); } @@ -1761,15 +1699,14 @@ private StringBuffer subformat(Map arguments, StringBuffer resul } result.append(pattern.substring(lastOffset, pattern.length())); if (characterIterators != null && last != result.length()) { - characterIterators.add(_createAttributedCharacterIterator( - result.substring(last))); + characterIterators.add(_createAttributedCharacterIterator(result.substring(last))); } return result; } /** - * Convenience method to append all the characters in - * iterator to the StringBuffer result. + * Convenience method to append all the characters in iterator to the StringBuffer + * result. 
*/ private void append(StringBuffer result, CharacterIterator iterator) { if (iterator.first() != CharacterIterator.DONE) { @@ -1782,10 +1719,7 @@ private void append(StringBuffer result, CharacterIterator iterator) { } } - - private void makeFormat(int position, int offsetNumber, - StringBuffer[] segments) - { + private void makeFormat(int position, int offsetNumber, StringBuffer[] segments) { // get the argument number // int argumentNumber; // try { @@ -1793,11 +1727,11 @@ private void makeFormat(int position, int offsetNumber, // } catch (NumberFormatException e) { // throw new IllegalArgumentException("can't parse argument number " // + segments[1]); - //} + // } // if (argumentNumber < 0) { // throw new IllegalArgumentException("negative argument number " // + argumentNumber); - //} + // } // resize format information arrays if necessary if (offsetNumber >= formats.length) { @@ -1807,8 +1741,7 @@ private void makeFormat(int position, int offsetNumber, final String[] newArgumentNames = new String[newLength]; System.arraycopy(formats, 0, newFormats, 0, maxOffset + 1); System.arraycopy(offsets, 0, newOffsets, 0, maxOffset + 1); - System.arraycopy(argumentNames, 0, newArgumentNames, 0, - maxOffset + 1); + System.arraycopy(argumentNames, 0, newArgumentNames, 0, maxOffset + 1); formats = newFormats; offsets = newOffsets; argumentNames = newArgumentNames; @@ -1831,15 +1764,14 @@ private void makeFormat(int position, int offsetNumber, argumentNamesAreNumeric = argumentNumber >= 0; } - if (argumentNamesAreNumeric && argumentNumber < 0 || - !argumentNamesAreNumeric && - !isAlphaIdentifier(argumentNames[offsetNumber])) { + if (argumentNamesAreNumeric && argumentNumber < 0 + || !argumentNamesAreNumeric && !isAlphaIdentifier(argumentNames[offsetNumber])) { throw new IllegalArgumentException( - "All argument identifiers have to be either non-negative " + - "numbers or strings following the pattern " + - "([:ID_Start:] [:ID_Continue:]*).\n" + - "For more details on these 
unicode sets, visit " + - "http://demo.icu-project.org/icu-bin/ubrowse"); + "All argument identifiers have to be either non-negative " + + "numbers or strings following the pattern " + + "([:ID_Start:] [:ID_Continue:]*).\n" + + "For more details on these unicode sets, visit " + + "http://demo.icu-project.org/icu-bin/ubrowse"); } // now get the format @@ -1847,18 +1779,22 @@ private void makeFormat(int position, int offsetNumber, final String subType = segments[3].toString(); try { - formats[offsetNumber] = mainType.length() == 0 ? null : formatRegistry.getFormat(mainType, subType, ulocale); + formats[offsetNumber] = + mainType.length() == 0 + ? null + : formatRegistry.getFormat(mainType, subType, ulocale); } catch (final IllegalArgumentException e) { maxOffset = oldMaxOffset; throw e; } - segments[1].setLength(0); // throw away other segments + segments[1].setLength(0); // throw away other segments segments[2].setLength(0); segments[3].setLength(0); } - private static final void copyAndFixQuotes(String source, int start, int end, StringBuffer target) { + private static final void copyAndFixQuotes( + String source, int start, int end, StringBuffer target) { // added 'gotLB' logic from ICU4C - questionable [alan] boolean gotLB = false; for (int i = start; i < end; ++i) { @@ -1882,8 +1818,9 @@ private static final void copyAndFixQuotes(String source, int start, int end, St } /** - * After reading an object from the input stream, do a simple verification - * to maintain class invariants. + * After reading an object from the input stream, do a simple verification to maintain class + * invariants. + * * @throws InvalidObjectException if the objects read from the stream is invalid. 
*/ private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { @@ -1895,36 +1832,37 @@ private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundE argumentNames[i] = String.valueOf(argumentNumbers[i]); } } - boolean isValid = maxOffset >= -1 - && formats.length > maxOffset - && offsets.length > maxOffset - && argumentNames.length > maxOffset; - if (isValid) { - int lastOffset = pattern.length() + 1; - for (int i = maxOffset; i >= 0; --i) { - if ((offsets[i] < 0) || (offsets[i] > lastOffset)) { - isValid = false; - break; - } else { - lastOffset = offsets[i]; - } - } - } - if (!isValid) { - throw new InvalidObjectException("Could not reconstruct MessageFormat from corrupt stream."); - } - if (ulocale == null) { - ulocale = ULocale.forLocale(locale); + boolean isValid = + maxOffset >= -1 + && formats.length > maxOffset + && offsets.length > maxOffset + && argumentNames.length > maxOffset; + if (isValid) { + int lastOffset = pattern.length() + 1; + for (int i = maxOffset; i >= 0; --i) { + if ((offsets[i] < 0) || (offsets[i] > lastOffset)) { + isValid = false; + break; + } else { + lastOffset = offsets[i]; } + } + } + if (!isValid) { + throw new InvalidObjectException( + "Could not reconstruct MessageFormat from corrupt stream."); + } + if (ulocale == null) { + ulocale = ULocale.forLocale(locale); + } } /** - * This is a helper method for converting an object array into a map. The - * key set of the map is [0, ..., array.length]. The value associated with - * each key is the ith entry of the passed object array. + * This is a helper method for converting an object array into a map. The key set of the map is + * [0, ..., array.length]. The value associated with each key is the ith entry of the passed + * object array. * - * @throws InvalidObjectException - * if the objects read from the stream is invalid. + * @throws InvalidObjectException if the objects read from the stream is invalid. 
*/ private Map arrayToMap(Object[] array) { final Map map = new HashMap(); @@ -1940,9 +1878,9 @@ private boolean isAlphaIdentifier(String argument) { if (argument.length() == 0) { return false; } - for (int i = 0; i < argument.length(); ++i ) { - if (i == 0 && !IDStartChars.contains(argument.charAt(i)) || - i > 0 && !IDContChars.contains(argument.charAt(i))){ + for (int i = 0; i < argument.length(); ++i) { + if (i == 0 && !IDStartChars.contains(argument.charAt(i)) + || i > 0 && !IDContChars.contains(argument.charAt(i))) { return false; } } @@ -1962,17 +1900,14 @@ private boolean isAlphaIdentifier(String argument) { private static UnicodeSet IDContChars = new UnicodeSet("[:ID_Continue:]"); /** - * Convert an 'apostrophe-friendly' pattern into a standard - * pattern. Standard patterns treat all apostrophes as - * quotes, which is problematic in some languages, e.g. - * French, where apostrophe is commonly used. This utility - * assumes that only an unpaired apostrophe immediately before - * a brace is a true quote. Other unpaired apostrophes are paired, - * and the resulting standard pattern string is returned. - * - *

Note it is not guaranteed that the returned pattern - * is indeed a valid pattern. The only effect is to convert - * between patterns having different quoting semantics. + * Convert an 'apostrophe-friendly' pattern into a standard pattern. Standard patterns treat all + * apostrophes as quotes, which is problematic in some languages, e.g. French, where apostrophe + * is commonly used. This utility assumes that only an unpaired apostrophe immediately before a + * brace is a true quote. Other unpaired apostrophes are paired, and the resulting standard + * pattern string is returned. + * + *

Note it is not guaranteed that the returned pattern is indeed a valid pattern. The + * only effect is to convert between patterns having different quoting semantics. * * @param pattern the 'apostrophe-friendly' patttern to convert * @return the standard equivalent of the original pattern @@ -1985,55 +1920,55 @@ public static String autoQuoteApostrophe(String pattern) { for (int i = 0, j = pattern.length(); i < j; ++i) { final char c = pattern.charAt(i); switch (state) { - case STATE_INITIAL: - switch (c) { - case SINGLE_QUOTE: - state = STATE_SINGLE_QUOTE; - break; - case CURLY_BRACE_LEFT: - state = STATE_MSG_ELEMENT; - ++braceCount; - break; - } - break; - case STATE_SINGLE_QUOTE: - switch (c) { - case SINGLE_QUOTE: - state = STATE_INITIAL; - break; - case CURLY_BRACE_LEFT: - case CURLY_BRACE_RIGHT: - state = STATE_IN_QUOTE; - break; - default: - buf.append(SINGLE_QUOTE); - state = STATE_INITIAL; + case STATE_INITIAL: + switch (c) { + case SINGLE_QUOTE: + state = STATE_SINGLE_QUOTE; + break; + case CURLY_BRACE_LEFT: + state = STATE_MSG_ELEMENT; + ++braceCount; + break; + } break; - } - break; - case STATE_IN_QUOTE: - switch (c) { - case SINGLE_QUOTE: - state = STATE_INITIAL; + case STATE_SINGLE_QUOTE: + switch (c) { + case SINGLE_QUOTE: + state = STATE_INITIAL; + break; + case CURLY_BRACE_LEFT: + case CURLY_BRACE_RIGHT: + state = STATE_IN_QUOTE; + break; + default: + buf.append(SINGLE_QUOTE); + state = STATE_INITIAL; + break; + } break; - } - break; - case STATE_MSG_ELEMENT: - switch (c) { - case CURLY_BRACE_LEFT: - ++braceCount; + case STATE_IN_QUOTE: + switch (c) { + case SINGLE_QUOTE: + state = STATE_INITIAL; + break; + } break; - case CURLY_BRACE_RIGHT: - if (--braceCount == 0) { - state = STATE_INITIAL; + case STATE_MSG_ELEMENT: + switch (c) { + case CURLY_BRACE_LEFT: + ++braceCount; + break; + case CURLY_BRACE_RIGHT: + if (--braceCount == 0) { + state = STATE_INITIAL; + } + break; } break; - } - break; - ///CLOVER:OFF - default: // Never happens. 
- break; - ///CLOVER:ON + /// CLOVER:OFF + default: // Never happens. + break; + /// CLOVER:ON } buf.append(c); } @@ -2059,7 +1994,8 @@ private static AttributedCharacterIterator _createAttributedCharacterIterator(St return as.getIterator(); } - private static AttributedCharacterIterator _createAttributedCharacterIterator(AttributedCharacterIterator[] iterators) { + private static AttributedCharacterIterator _createAttributedCharacterIterator( + AttributedCharacterIterator[] iterators) { if (iterators == null || iterators.length == 0) { return _createAttributedCharacterIterator(""); } @@ -2084,8 +2020,7 @@ private static AttributedCharacterIterator _createAttributedCharacterIterator(At final int len = iterator.getRunLimit() - start; // run length if (map.size() > 0) { for (final Map.Entry entry : map.entrySet()) { - as.addAttribute(entry.getKey(), entry.getValue(), - offset, offset + len); + as.addAttribute(entry.getKey(), entry.getValue(), offset, offset + len); } } offset += len; @@ -2100,15 +2035,17 @@ private static AttributedCharacterIterator _createAttributedCharacterIterator(At return as.getIterator(); } - private static AttributedCharacterIterator _createAttributedCharacterIterator(AttributedCharacterIterator iterator, - AttributedCharacterIterator.Attribute key, Object value) { + private static AttributedCharacterIterator _createAttributedCharacterIterator( + AttributedCharacterIterator iterator, + AttributedCharacterIterator.Attribute key, + Object value) { final AttributedString as = new AttributedString(iterator); as.addAttribute(key, value); return as.getIterator(); } - private static AttributedCharacterIterator _createAttributedCharacterIterator(String text, - AttributedCharacterIterator.Attribute key, Object value) { + private static AttributedCharacterIterator _createAttributedCharacterIterator( + String text, AttributedCharacterIterator.Attribute key, Object value) { final AttributedString as = new AttributedString(text); as.addAttribute(key, 
value); return as.getIterator(); diff --git a/unicodetools/src/main/java/org/unicode/draft/Misc.java b/unicodetools/src/main/java/org/unicode/draft/Misc.java index d8d8d53ca..dc9a4001a 100644 --- a/unicodetools/src/main/java/org/unicode/draft/Misc.java +++ b/unicodetools/src/main/java/org/unicode/draft/Misc.java @@ -1,4 +1,13 @@ package org.unicode.draft; + +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.SpoofChecker; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeCompressor; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.lang.reflect.Method; @@ -20,16 +29,6 @@ import java.util.TreeSet; import java.util.regex.Pattern; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.Normalizer2; -import com.ibm.icu.text.SpoofChecker; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeCompressor; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; - - public class Misc { public static void main(String[] args) throws ParseException { showRegions(); @@ -61,8 +60,8 @@ public static void main(String[] args) throws ParseException { return; } - Map am; - Map cm; + Map am; + Map cm; TreeSet as; TreeSet bs; @@ -80,23 +79,35 @@ public static void main(String[] args) throws ParseException { d.addAll(bs); System.out.println("addAll:\t" + d); - am = MapBuilder.of(new TreeMap(col)).put("A", 1).put("B", 2).unmodifiable(); - cm = MapBuilder.of(new TreeMap(col)).put("b", 3).put("c", 4).unmodifiable(); + am = + MapBuilder.of(new TreeMap(col)) + .put("A", 1) + .put("B", 2) + .unmodifiable(); + cm = + MapBuilder.of(new TreeMap(col)) + .put("b", 3) + .put("c", 4) + .unmodifiable(); addAll(am, cm, true); System.out.println(am); } - enum Region { // these are generated by a tool from CLDR data - U001, US, IT, ZZ; + U001, + 
US, + IT, + ZZ; RegionData data; + Region() { // load all the data after all have been initialized if ("ZZ".equals(name())) { RegionData.load(); } } + Set getContained() { return data.contained; } @@ -106,18 +117,33 @@ private static final class RegionData { final Region continentalContainer; // = null for World, Unknown final Set contained; // = null for Territories, Unknown final Set territoriesContained; // = null for Territories, Unknown - public RegionData(Region continentalContainer, Set contained, Set territoriesContained) { + + public RegionData( + Region continentalContainer, + Set contained, + Set territoriesContained) { super(); this.continentalContainer = continentalContainer; this.contained = contained; this.territoriesContained = territoriesContained; } + public static void load() { - final HashMap,EnumSet> internCache = new HashMap,EnumSet>(); + final HashMap, EnumSet> internCache = + new HashMap, EnumSet>(); for (final Region r : Region.values()) { - switch(r) { - case U001: r.data = new RegionData(null, intern(internCache, EnumSet.of(Region.US, Region.IT)), null); break; - case US: case IT: r.data = new RegionData(Region.U001, null, null); break; + switch (r) { + case U001: + r.data = + new RegionData( + null, + intern(internCache, EnumSet.of(Region.US, Region.IT)), + null); + break; + case US: + case IT: + r.data = new RegionData(Region.U001, null, null); + break; } } } @@ -138,30 +164,35 @@ private static void showRegions() { } } - private static void nameUniqueness() { - final Map map = new HashMap(); + final Map map = new HashMap(); for (final String s : new UnicodeSet("[:^C:]")) { final int codePoint = s.codePointAt(0); final String name = UCharacter.getName(codePoint); - final String skeleton = name.replace(" ","").replace("-", ""); + final String skeleton = name.replace(" ", "").replace("-", ""); final Integer old = map.get(skeleton); if (old == null) { map.put(skeleton, codePoint); } else { System.out.println("Name collision: " + skeleton); 
- System.out.println(Utility.hex(codePoint,4) + " " + name); - System.out.println(Utility.hex(old,4) + " " + UCharacter.getName(old)); + System.out.println(Utility.hex(codePoint, 4) + " " + name); + System.out.println(Utility.hex(old, 4) + " " + UCharacter.getName(old)); } } } private static void checkRegex() { - for (final String string : new String[]{"MARK", "Mark"}) { - for (final String expression : new String[]{"[\\p{Lu}&&[\\u0000-\\u007F]]*", "[A-Z]*"}) { + for (final String string : new String[] {"MARK", "Mark"}) { + for (final String expression : + new String[] {"[\\p{Lu}&&[\\u0000-\\u007F]]*", "[A-Z]*"}) { for (final boolean caseless : new boolean[] {false, true}) { - final boolean matches = Pattern.compile(expression,caseless ? Pattern.CASE_INSENSITIVE : 0).matcher(string).matches(); - System.out.format("expression: %s,\tcaseless: %s, \tstring: \"%s\",\tresult: %s\n", expression, caseless, string, matches); + final boolean matches = + Pattern.compile(expression, caseless ? Pattern.CASE_INSENSITIVE : 0) + .matcher(string) + .matches(); + System.out.format( + "expression: %s,\tcaseless: %s, \tstring: \"%s\",\tresult: %s\n", + expression, caseless, string, matches); } } } @@ -170,21 +201,31 @@ private static void checkRegex() { private static void checkName(String string) { final ArrayList reasons = new ArrayList(); final boolean result = checkName(string, ULocale.ENGLISH, reasons); - System.out.format("%s\t%s\t%s\n", ""+result, string, reasons); + System.out.format("%s\t%s\t%s\n", "" + result, string, reasons); } - static final UnicodeSet ALLOW_IN_NAME = new UnicodeSet("[" - + "[:alphabetic:]" // include alphabetics - + "[:Mn:]" // include combining marks - + "[\\u0020  \\- ‐ – ゠ ・・ , .. 
' ’ /]" // add special punctuation and whitespace, from linguists - + "-[:N:]" // exclude all numbers - + "]").freeze(); + static final UnicodeSet ALLOW_IN_NAME = + new UnicodeSet( + "[" + + "[:alphabetic:]" // include alphabetics + + "[:Mn:]" // include combining marks + + "[\\u0020  \\- ‐ – ゠ ・・ , .. ' ’ /]" // add special + // punctuation and + // whitespace, from + // linguists + + "-[:N:]" // exclude all numbers + + "]") + .freeze(); static final UnicodeSet WHITESPACE = new UnicodeSet("[:whitespace:]").freeze(); - static final Normalizer2 NFKC = Normalizer2.getNFKCInstance(); // Normalizer2.getInstance(null, "NFKC", Mode.COMPOSE); + static final Normalizer2 NFKC = + Normalizer2.getNFKCInstance(); // Normalizer2.getInstance(null, "NFKC", Mode.COMPOSE); - static final SpoofChecker spoofChecker = new SpoofChecker.Builder().setChecks(SpoofChecker.SINGLE_SCRIPT | SpoofChecker.INVISIBLE).build(); + static final SpoofChecker spoofChecker = + new SpoofChecker.Builder() + .setChecks(SpoofChecker.SINGLE_SCRIPT | SpoofChecker.INVISIBLE) + .build(); private static boolean checkName(String possibleName, ULocale locale, List reasons) { // is prototype: strings need to be pulled out for localization, avoiding concatenation @@ -193,7 +234,9 @@ private static boolean checkName(String possibleName, ULocale locale, List= Character.MIN_HIGH_SURROGATE - && c1 <= Character.MAX_HIGH_SURROGATE) - ? c1 + final int cp = + !((c1 = s.charAt(index++)) >= Character.MIN_HIGH_SURROGATE + && c1 <= Character.MAX_HIGH_SURROGATE) + ? c1 : index < s.length() - && ((c2 = s.charAt(index)) >= Character.MIN_LOW_SURROGATE - && c2 <= Character.MAX_LOW_SURROGATE) - ? Character.toCodePoint(c1, c2) + && ((c2 = s.charAt(index)) + >= Character.MIN_LOW_SURROGATE + && c2 <= Character.MAX_LOW_SURROGATE) + ? Character.toCodePoint(c1, c2) : c1; - if (!isInterchangeValidCodePoint(cp)) { - return false; - } - index += cp >= Character.MIN_SUPPLEMENTARY_CODE_POINT? 
2 : 1; + if (!isInterchangeValidCodePoint(cp)) { + return false; + } + index += cp >= Character.MIN_SUPPLEMENTARY_CODE_POINT ? 2 : 1; } return true; } public static int codePointAt(CharSequence s, int index) { char c1, c2; - return !Character.isHighSurrogate(c1 = s.charAt(index++)) ? c1 - : index < s.length() && Character.isLowSurrogate(c2 = s.charAt(index)) ? Character.toCodePoint(c1, c2) + return !Character.isHighSurrogate(c1 = s.charAt(index++)) + ? c1 + : index < s.length() && Character.isLowSurrogate(c2 = s.charAt(index)) + ? Character.toCodePoint(c1, c2) : c1; } @@ -272,11 +319,9 @@ private static boolean isInterchangeValidCodePoint(int cp) { } private static boolean isSurrogate(int codePoint) { - return codePoint >= Character.MIN_SURROGATE - && codePoint <= Character.MAX_SURROGATE; + return codePoint >= Character.MIN_SURROGATE && codePoint <= Character.MAX_SURROGATE; } - private static void checkSize(String string) { final byte[] utf8 = string.getBytes(Charset.forName("utf-8")); final byte[] scsu = UnicodeCompressor.compress(string); @@ -295,7 +340,7 @@ private static void checkSize(String string) { int len = 0; try { final java.util.zip.GZIPOutputStream foo = new java.util.zip.GZIPOutputStream(outBytes); - //foo.setLevel(9); + // foo.setLevel(9); foo.write(utf8); foo.flush(); len = outBytes.toByteArray().length; @@ -303,14 +348,20 @@ private static void checkSize(String string) { } catch (final IOException e) { } - - System.out.println("'" + string + "'" - + "\tchars:\t" + UTF16.countCodePoint(string) - + "\tutf8:\t" + utf8.length - + "\tscsu:\t" + scsu.length - + "\tscsu:\t" + scsu.length - + "\tgzip:\t" + len - ); + System.out.println( + "'" + + string + + "'" + + "\tchars:\t" + + UTF16.countCodePoint(string) + + "\tutf8:\t" + + utf8.length + + "\tscsu:\t" + + scsu.length + + "\tscsu:\t" + + scsu.length + + "\tgzip:\t" + + len); } @SuppressWarnings("unchecked") @@ -323,7 +374,7 @@ public static T clone(T source) { } } - static Map addAll(Map toChange, Map 
other, boolean override) { + static Map addAll(Map toChange, Map other, boolean override) { for (final T key : other.keySet()) { if (!override && toChange.containsKey(key)) { continue; @@ -333,7 +384,7 @@ static Map addAll(Map toChange, Map other, boolean override return toChange; } - static Map removeAll(Map toChange, Map other) { + static Map removeAll(Map toChange, Map other) { for (final T key : other.keySet()) { toChange.remove(key); } @@ -341,7 +392,7 @@ static Map removeAll(Map toChange, Map other) { } // change only the cases where A has a mapping - static Map replaceAll(Map toChange, Map other) { + static Map replaceAll(Map toChange, Map other) { for (final T key : other.keySet()) { if (toChange.containsKey(key)) { toChange.put(key, other.get(key)); @@ -350,8 +401,7 @@ static Map replaceAll(Map toChange, Map other) { return toChange; } - - static Map retainAll(Map toChange, Map other, boolean override) { + static Map retainAll(Map toChange, Map other, boolean override) { final Iterator it = toChange.keySet().iterator(); while (it.hasNext()) { final T key = it.next(); @@ -365,20 +415,22 @@ static Map retainAll(Map toChange, Map other, boolean overr return toChange; } - static class MapBuilder> { + static class MapBuilder> { private M map; - public static > MapBuilder of(M map) { - return new MapBuilder(map); + public static > MapBuilder of(M map) { + return new MapBuilder(map); } private MapBuilder(M map) { this.map = map; } - public MapBuilder put(K key, V value) { + + public MapBuilder put(K key, V value) { map.put(key, value); return this; } + public M finish() { final M result = map; map = null; @@ -387,22 +439,24 @@ public M finish() { @SuppressWarnings("unchecked") public M unmodifiable() { - final M result = map instanceof SortedMap ? (M) Collections.unmodifiableSortedMap((SortedMap) map) : - (M) Collections.unmodifiableMap(map); + final M result = + map instanceof SortedMap + ? 
(M) Collections.unmodifiableSortedMap((SortedMap) map) + : (M) Collections.unmodifiableMap(map); map = null; return result; } } - public static class CollectionBuilder> { + public static class CollectionBuilder> { private C collection; private CollectionBuilder(C collection) { this.collection = collection; } - public static > CollectionBuilder of(C collection) { - return new CollectionBuilder(collection); + public static > CollectionBuilder of(C collection) { + return new CollectionBuilder(collection); } public C finish() { @@ -415,22 +469,25 @@ public C finish() { public C unmodifiable() { // ugly, but don't see a way around it final C result = - collection instanceof SortedSet ? (C) Collections.unmodifiableSortedSet((SortedSet)collection) : - collection instanceof Set ? (C) Collections.unmodifiableSet((Set)collection) : - collection instanceof List ? (C) Collections.unmodifiableList((List)collection) : - (C) Collections.unmodifiableCollection(collection); - collection = null; - return result; + collection instanceof SortedSet + ? (C) Collections.unmodifiableSortedSet((SortedSet) collection) + : collection instanceof Set + ? (C) Collections.unmodifiableSet((Set) collection) + : collection instanceof List + ? (C) Collections.unmodifiableList((List) collection) + : (C) Collections.unmodifiableCollection(collection); + collection = null; + return result; } - public CollectionBuilder add(E... elements) { + public CollectionBuilder add(E... 
elements) { for (final E element : elements) { collection.add(element); } return this; } - public CollectionBuilder addAll(Iterable iterable) { + public CollectionBuilder addAll(Iterable iterable) { for (final E element : iterable) { collection.add(element); } diff --git a/unicodetools/src/main/java/org/unicode/draft/OldPunycode.java b/unicodetools/src/main/java/org/unicode/draft/OldPunycode.java index 002355f94..2f56f7759 100644 --- a/unicodetools/src/main/java/org/unicode/draft/OldPunycode.java +++ b/unicodetools/src/main/java/org/unicode/draft/OldPunycode.java @@ -12,132 +12,126 @@ /** * Ported code from ICU punycode.c + * * @author ram */ /** * Class that implements the PunyCode algorithm for encode/decode + * * @draft */ -final public class OldPunycode { - static boolean showProgress = false; +public final class OldPunycode { + static boolean showProgress = false; /* Punycode parameters for Bootstring */ - private static final int BASE = 36; - private static final int TMIN = 1; - private static final int TMAX = 26; - private static final int SKEW = 38; - private static final int DAMP = 700; - private static final int INITIAL_BIAS = 72; - private static final int INITIAL_N = 0x80; + private static final int BASE = 36; + private static final int TMIN = 1; + private static final int TMAX = 26; + private static final int SKEW = 38; + private static final int DAMP = 700; + private static final int INITIAL_BIAS = 72; + private static final int INITIAL_N = 0x80; /* "Basic" Unicode/ASCII code points */ - private static final int HYPHEN = 0x2d; - private static final int DELIMITER = HYPHEN; - - private static final int ZERO = 0x30; - //private static final int NINE = 0x39; - - private static final int SMALL_A = 0x61; - private static final int SMALL_Z = 0x7a; - - private static final int CAPITAL_A = 0x41; - private static final int CAPITAL_Z = 0x5a; - private static final int MAX_CP_COUNT = 200; - //private static final int UINT_MAGIC = 0x80000000; - //private static 
final long ULONG_MAGIC = 0x8000000000000000L; - - private static int adaptBias(int delta, int length, boolean firstTime){ - if(firstTime){ - delta /=DAMP; - }else{ - delta /= 2; + private static final int HYPHEN = 0x2d; + private static final int DELIMITER = HYPHEN; + + private static final int ZERO = 0x30; + // private static final int NINE = 0x39; + + private static final int SMALL_A = 0x61; + private static final int SMALL_Z = 0x7a; + + private static final int CAPITAL_A = 0x41; + private static final int CAPITAL_Z = 0x5a; + private static final int MAX_CP_COUNT = 200; + // private static final int UINT_MAGIC = 0x80000000; + // private static final long ULONG_MAGIC = 0x8000000000000000L; + + private static int adaptBias(int delta, int length, boolean firstTime) { + if (firstTime) { + delta /= DAMP; + } else { + delta /= 2; } - delta += delta/length; + delta += delta / length; - int count=0; - for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { - delta/=(BASE-TMIN); + int count = 0; + for (; delta > ((BASE - TMIN) * TMAX) / 2; count += BASE) { + delta /= (BASE - TMIN); } - return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); + return count + (((BASE - TMIN + 1) * delta) / (delta + SKEW)); } /** - * basicToDigit[] contains the numeric value of a basic code - * point (for use in representing integers) in the range 0 to - * BASE-1, or -1 if b is does not represent a value. + * basicToDigit[] contains the numeric value of a basic code point (for use in representing + * integers) in the range 0 to BASE-1, or -1 if b is does not represent a value. 
*/ - static final int[] basicToDigit= new int[]{ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, - - -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, - - -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - - ///CLOVER:OFF + static final int[] basicToDigit = + new int[] { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + + /// CLOVER:OFF private static char asciiCaseMap(char b, boolean uppercase) { - if(uppercase) { - if(SMALL_A<=b && b<=SMALL_Z) { - b-=(SMALL_A-CAPITAL_A); + if (uppercase) { + if (SMALL_A <= b && b <= SMALL_Z) { + b -= (SMALL_A - CAPITAL_A); } } else { - if(CAPITAL_A<=b && b<=CAPITAL_Z) { - b+=(SMALL_A-CAPITAL_A); + if (CAPITAL_A <= b && b <= CAPITAL_Z) { + b += (SMALL_A - CAPITAL_A); } } return b; } - ///CLOVER:ON + /// CLOVER:ON /** - * digitToBasic() returns the basic code point whose value - * (when used for representing integers) is d, which must be in the - * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is - * nonzero, in which case the uppercase form is used. + * digitToBasic() returns the basic code point whose value (when used for representing integers) + * is d, which must be in the range 0 to BASE-1. The lowercase form is used unless the uppercase + * flag is nonzero, in which case the uppercase form is used. */ private static char digitToBasic(int digit, boolean uppercase) { /* 0..25 map to ASCII a..z or A..Z */ /* 26..35 map to ASCII 0..9 */ - if(digit<26) { - if(uppercase) { - return (char)(CAPITAL_A+digit); + if (digit < 26) { + if (uppercase) { + return (char) (CAPITAL_A + digit); } else { - return (char)(SMALL_A+digit); + return (char) (SMALL_A + digit); } } else { - return (char)((ZERO-26)+digit); + return (char) ((ZERO - 26) + digit); } } /** - * Converts Unicode to Punycode. - * The input string must not contain single, unpaired surrogates. + * Converts Unicode to Punycode. The input string must not contain single, unpaired surrogates. * The output will be represented as an array of ASCII code points. 
- * + * * @param src * @param caseFlags * @return * @throws ParseException * @draft */ - public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws StringPrepParseException{ + public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) + throws StringPrepParseException { if (showProgress) { System.out.println("OLD DECODE"); } @@ -152,46 +146,44 @@ public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws * Handle the basic code points and * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): */ - srcCPCount=destLength=0; + srcCPCount = destLength = 0; - for(j=0; j0) { - if(destLength 0) { + if (destLength < destCapacity) { + dest[destLength] = DELIMITER; } ++destLength; } @@ -207,12 +199,12 @@ public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws */ /* Initialize the state: */ - n=INITIAL_N; - delta=0; - bias=INITIAL_BIAS; + n = INITIAL_N; + delta = 0; + bias = INITIAL_BIAS; /* Main encoding loop: */ - for(handledCPCount=basicLength; handledCPCount state to , but guard against overflow: */ - if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) { + if (m - n > (0x7fffffff - MAX_CP_COUNT - delta) / (handledCPCount + 1)) { throw new IllegalStateException("Internal program error"); } - delta+=(m-n)*(handledCPCount+1); - n=m; + delta += (m - n) * (handledCPCount + 1); + n = m; if (showProgress) { System.out.println("\tStart Delta: " + Integer.toString(delta, 16)); @@ -247,50 +239,46 @@ public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws int startWriting = 0; /* Encode a sequence of same code points n */ - for(j=0; jTMAX) { - t=TMAX; - } + /** + * RAM: comment out the old code for conformance with + * draft-ietf-idn-punycode-03.txt + * + *

t=k-bias; if(tTMAX) { t=TMAX; } */ - - t=k-bias; - if(t=(bias+TMAX)) { - t=TMAX; + t = k - bias; + if (t < TMIN) { + t = TMIN; + } else if (k >= (bias + TMAX)) { + t = TMAX; } - if(q= CAPITAL_Z); + /// CLOVER:OFF + private static boolean isBasicUpperCase(int ch) { + return (CAPITAL_A <= ch && ch >= CAPITAL_Z); } - ///CLOVER:ON - private static boolean isSurrogate(int ch){ - return (((ch)&0xfffff800)==0xd800); + /// CLOVER:ON + private static boolean isSurrogate(int ch) { + return (((ch) & 0xfffff800) == 0xd800); } /** - * Converts Punycode to Unicode. - * The Unicode string will be at most as long as the Punycode string. - * + * Converts Punycode to Unicode. The Unicode string will be at most as long as the Punycode + * string. + * * @param src * @param caseFlags * @return @@ -327,11 +315,24 @@ private static boolean isSurrogate(int ch){ * @draft */ public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) - throws StringPrepParseException{ + throws StringPrepParseException { final int srcLength = src.length(); final StringBuffer result = new StringBuffer(); - int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t, - destCPCount, firstSupplementaryIndex, cpLength; + int n, + destLength, + i, + bias, + basicLength, + j, + in, + oldi, + w, + k, + digit, + t, + destCPCount, + firstSupplementaryIndex, + cpLength; char b; final int destCapacity = MAX_CP_COUNT; final char[] dest = new char[destCapacity]; @@ -344,40 +345,41 @@ public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) * * The two following loops iterate backward. 
*/ - for(j=srcLength; j>0;) { - if(src.charAt(--j)==DELIMITER) { + for (j = srcLength; j > 0; ) { + if (src.charAt(--j) == DELIMITER) { break; } } - destLength=basicLength=destCPCount=j; + destLength = basicLength = destCPCount = j; - while(j>0) { - b=src.charAt(--j); - if(!isBasic(b)) { - throw new StringPrepParseException("Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND); + while (j > 0) { + b = src.charAt(--j); + if (!isBasic(b)) { + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND); } - if(j0 ? basicLength+1 : 0; in 0 ? basicLength + 1 : 0; in < srcLength; /* no op */ ) { /* * in is the index of the next character to be consumed, and * destCPCount is the number of code points in the output array. @@ -387,36 +389,40 @@ public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) * if we increase i as we go, then subtract off its starting * value at the end to obtain delta. */ - for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) { - if(in>=srcLength) { - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + for (oldi = i, w = 1, k = BASE; /* no condition */ ; k += BASE) { + if (in >= srcLength) { + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } - digit=basicToDigit[src.charAt(in++) & 0xFF]; - if(digit<0) { - throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND); + digit = basicToDigit[src.charAt(in++) & 0xFF]; + if (digit < 0) { + throw new StringPrepParseException( + "Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND); } - if(digit>(0x7fffffff-i)/w) { + if (digit > (0x7fffffff - i) / w) { /* integer overflow */ - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + throw new StringPrepParseException( + "Illegal char found", 
StringPrepParseException.ILLEGAL_CHAR_FOUND); } - i+=digit*w; - t=k-bias; - if(t=(bias+TMAX)) { - t=TMAX; + i += digit * w; + t = k - bias; + if (t < TMIN) { + t = TMIN; + } else if (k >= (bias + TMAX)) { + t = TMAX; } - if(digit0x7fffffff/(BASE-t)) { + if (w > 0x7fffffff / (BASE - t)) { /* integer overflow */ - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } - w*=BASE-t; + w *= BASE - t; } /* @@ -425,30 +431,32 @@ public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) * where needed instead of in for() loop tail. */ ++destCPCount; - bias=adaptBias(i-oldi, destCPCount, (oldi==0)); + bias = adaptBias(i - oldi, destCPCount, (oldi == 0)); /* * i was supposed to wrap around from (incremented) destCPCount to 0, * incrementing n each time, so we'll fix that now: */ - if(i/destCPCount>(0x7fffffff-n)) { + if (i / destCPCount > (0x7fffffff - n)) { /* integer overflow */ - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } - n+=i/destCPCount; - i%=destCPCount; + n += i / destCPCount; + i %= destCPCount; /* not needed for Punycode: */ /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ - if(n>0x10ffff || isSurrogate(n)) { + if (n > 0x10ffff || isSurrogate(n)) { /* Unicode code point overflow */ - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } /* Insert n at position i of the output: */ - cpLength=UTF16.getCharCount(n); - if((destLength+cpLength)1) { - firstSupplementaryIndex=codeUnitIndex; + if (i <= firstSupplementaryIndex) { + codeUnitIndex = i; + if 
(cpLength > 1) { + firstSupplementaryIndex = codeUnitIndex; } else { ++firstSupplementaryIndex; } } else { - codeUnitIndex=firstSupplementaryIndex; - codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex); + codeUnitIndex = firstSupplementaryIndex; + codeUnitIndex = + UTF16.moveCodePointOffset( + dest, 0, destLength, codeUnitIndex, i - codeUnitIndex); } /* use the UChar index codeUnitIndex instead of the code point index i */ - if(codeUnitIndex { - /** - * - */ + private final class CharPickerCellRenderer extends JLabel implements ListCellRenderer { + /** */ private static final long serialVersionUID = 869587839960963873L; CharPickerCellRenderer() { @@ -56,27 +50,31 @@ private final class CharPickerCellRenderer extends JLabel implements @Override public Component getListCellRendererComponent( - JList list, String value, int index, - boolean isSelected, boolean cellHasFocus) { + JList list, + String value, + int index, + boolean isSelected, + boolean cellHasFocus) { setText(value); setBackground(isSelected ? 
Color.YELLOW : frame.getBackground()); return this; } } - final private JList catList; - final private JList grpList; - final private JList charList; - final private JPanel label; - final private JFrame frame; + private final JList catList; + private final JList grpList; + private final JList charList; + private final JPanel label; + private final JFrame frame; public static void main(String args[]) { - SwingUtilities.invokeLater(new Runnable() { - @Override - public void run() { - new PickerApp(); - } - }); + SwingUtilities.invokeLater( + new Runnable() { + @Override + public void run() { + new PickerApp(); + } + }); } public PickerApp() { @@ -85,40 +83,45 @@ public PickerApp() { pickerFont = mainFont.deriveFont(24); biggerFont = mainFont.deriveFont(36); - frame = new JFrame("CLDR " + CLDRFile.GEN_VERSION + " CharPicker http://cldr.unicode.org, ICU:" + VersionInfo.ICU_VERSION); + frame = + new JFrame( + "CLDR " + + CLDRFile.GEN_VERSION + + " CharPicker http://cldr.unicode.org, ICU:" + + VersionInfo.ICU_VERSION); frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); frame.setSize(512, 342); // JLabel l = new JLabel("s+rl"); // j.getContentPane().add(l); frame.setLayout(new GridLayout(3, 1)); - ListModel model = new ListModel() { - final List categories = PickerData2.CATEGORIES; + ListModel model = + new ListModel() { + final List categories = PickerData2.CATEGORIES; - @Override - public int getSize() { - // TODO Auto-generated method stub - return categories.size(); - } - - @Override - public String getElementAt(int index) { - // TODO Auto-generated method stub - return categories.get(index); - } + @Override + public int getSize() { + // TODO Auto-generated method stub + return categories.size(); + } - @Override - public void addListDataListener(ListDataListener l) { - // never changes - //throw new InternalError("foo"); - } + @Override + public String getElementAt(int index) { + // TODO Auto-generated method stub + return categories.get(index); + } - @Override - 
public void removeListDataListener(ListDataListener l) { - // throw new InternalError("foo"); - } + @Override + public void addListDataListener(ListDataListener l) { + // never changes + // throw new InternalError("foo"); + } - }; + @Override + public void removeListDataListener(ListDataListener l) { + // throw new InternalError("foo"); + } + }; catList = new JList(model); catList.setSelectionMode(ListSelectionModel.SINGLE_INTERVAL_SELECTION); catList.addListSelectionListener(this); @@ -153,16 +156,18 @@ public void removeListDataListener(ListDataListener l) { @Override public void valueChanged(ListSelectionEvent e) { - //if(!e.getValueIsAdjusting()) return; - //System.err.println(e.toString()); + // if(!e.getValueIsAdjusting()) return; + // System.err.println(e.toString()); int c = e.getFirstIndex(); if (e.getSource() == catList) { - //System.err.println("catlist click " + c); + // System.err.println("catlist click " + c); grpList.setListData(PickerData2.getSubCategories(c)); frame.pack(); } else if (e.getSource() == grpList) { - //System.err.println("grplist click" + c); - List chars = PickerData2.getStringArray(catList.getSelectedIndex(), grpList.getSelectedIndex()); + // System.err.println("grplist click" + c); + List chars = + PickerData2.getStringArray( + catList.getSelectedIndex(), grpList.getSelectedIndex()); Vector stringVector = new Vector(); for (Interval interval : chars) { for (int i = interval.first(); i <= interval.last(); i++) { @@ -194,13 +199,15 @@ public void valueChanged(ListSelectionEvent e) { label.add(new JLabel(sb.toString())); JButton copyButton = new JButton("Copy"); - copyButton.addActionListener(new ActionListener() { - @Override - public void actionPerformed(ActionEvent e) { - Toolkit.getDefaultToolkit().getSystemClipboard() - .setContents(new StringSelection(str), null); - } - }); + copyButton.addActionListener( + new ActionListener() { + @Override + public void actionPerformed(ActionEvent e) { + Toolkit.getDefaultToolkit() + 
.getSystemClipboard() + .setContents(new StringSelection(str), null); + } + }); label.add(copyButton); label.setVisible(true); diff --git a/unicodetools/src/main/java/org/unicode/draft/PickerData2.java b/unicodetools/src/main/java/org/unicode/draft/PickerData2.java index c9fb19370..657ff8ea4 100644 --- a/unicodetools/src/main/java/org/unicode/draft/PickerData2.java +++ b/unicodetools/src/main/java/org/unicode/draft/PickerData2.java @@ -3,10 +3,8 @@ import java.util.LinkedList; import java.util.List; import java.util.Vector; - import org.unicode.cldr.draft.CharacterListCompressor; import org.unicode.cldr.draft.CharacterListCompressor.Interval; - import org.unicode.picker.CharData; public class PickerData2 { @@ -23,7 +21,7 @@ public class PickerData2 { public static int SUBCATEGORY_BASE88 = 1; static { - List categories = new LinkedList(/*CharData.CATEGORIES.length*/); + List categories = new LinkedList(/*CharData.CATEGORIES.length*/ ); for (int i = 0; i < CharData.CATEGORIES.length; i++) { categories.add(CharData.CATEGORIES[i][CATEGORY_TITLE][CATEGORY_TITLE]); } @@ -43,7 +41,7 @@ public static String[] getSubCategories(int c) { } public static List getStringArray(int a, int b) { - return CharacterListCompressor.base88DecodeList(CharData.CATEGORIES[a][b + SUBCATEGORY_OFFSET][SUBCATEGORY_BASE88]); + return CharacterListCompressor.base88DecodeList( + CharData.CATEGORIES[a][b + SUBCATEGORY_OFFSET][SUBCATEGORY_BASE88]); } - } diff --git a/unicodetools/src/main/java/org/unicode/draft/Punycode.java b/unicodetools/src/main/java/org/unicode/draft/Punycode.java index a1dc0411d..dd773b73d 100644 --- a/unicodetools/src/main/java/org/unicode/draft/Punycode.java +++ b/unicodetools/src/main/java/org/unicode/draft/Punycode.java @@ -12,32 +12,34 @@ /** * Ported code from ICU punycode.c + * * @author ram */ /** * Class that implements the PunyCode algorithm for encode/decode + * * @draft */ -final public class Punycode { +public final class Punycode { boolean showProgress = true; /* 
Punycode parameters for Bootstring */ private final int BASE; - private static final int TMIN = 1; - private static final int TMAX = 26; - private static final int SKEW = 38; - private static final int DAMP = 700; - private static final int INITIAL_BIAS = 72; - private static final int INITIAL_N = 0x80; + private static final int TMIN = 1; + private static final int TMAX = 26; + private static final int SKEW = 38; + private static final int DAMP = 700; + private static final int INITIAL_BIAS = 72; + private static final int INITIAL_N = 0x80; /* "Basic" Unicode/ASCII code points */ private final char DELIMITER; - private static final int MAX_CP_COUNT = 200; - //private static final int UINT_MAGIC = 0x80000000; - //private static final long ULONG_MAGIC = 0x8000000000000000L; + private static final int MAX_CP_COUNT = 200; + // private static final int UINT_MAGIC = 0x80000000; + // private static final long ULONG_MAGIC = 0x8000000000000000L; public Punycode() { this("abcdefghijklmnopqrstuvwxyz0123456789", '-'); @@ -52,10 +54,12 @@ public Punycode(String digits, char delimiter) { for (int i = 0; i < length; ++i) { final char c = digits.charAt(i); if (c > 0xFF) { - throw new IllegalArgumentException("Illegal character, must be 0..FF: " + Integer.toHexString(c)); + throw new IllegalArgumentException( + "Illegal character, must be 0..FF: " + Integer.toHexString(c)); } if (basicToDigit[c] >= 0) { - throw new IllegalArgumentException("Illegal character, cannot repeat in string: " + Integer.toHexString(c)); + throw new IllegalArgumentException( + "Illegal character, cannot repeat in string: " + Integer.toHexString(c)); } basicToDigit[c] = i; digitToBasic2[i] = c; @@ -64,50 +68,56 @@ public Punycode(String digits, char delimiter) { DELIMITER = delimiter; } - private int adaptBias(int delta, int length, boolean firstTime){ - if(firstTime){ - delta /=DAMP; - }else{ - delta /= 2; + private int adaptBias(int delta, int length, boolean firstTime) { + if (firstTime) { + delta /= 
DAMP; + } else { + delta /= 2; } - delta += delta/length; + delta += delta / length; - int count=0; - for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { - delta/=(BASE-TMIN); + int count = 0; + for (; delta > ((BASE - TMIN) * TMAX) / 2; count += BASE) { + delta /= (BASE - TMIN); } - return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); + return count + (((BASE - TMIN + 1) * delta) / (delta + SKEW)); } /** - * basicToDigit[] contains the numeric value of a basic code - * point (for use in representing integers) in the range 0 to - * BASE-1, or -1 if b is does not represent a value. + * basicToDigit[] contains the numeric value of a basic code point (for use in representing + * integers) in the range 0 to BASE-1, or -1 if b is does not represent a value. */ final int[] basicToDigit = new int[256]; - /** - * The reverse mapping, digit to basic - */ - + /** The reverse mapping, digit to basic */ final int[] digitToBasic2 = new int[256]; /** - * Converts Unicode to Punycode. - * The input string must not contain single, unpaired surrogates. + * Converts Unicode to Punycode. The input string must not contain single, unpaired surrogates. * The output will be represented as an array of ASCII code points. 
- * + * * @param src * @param caseFlags * @return * @throws ParseException * @draft */ - public StringBuilder encode(CharSequence src, StringBuilder dest) throws StringPrepParseException{ + public StringBuilder encode(CharSequence src, StringBuilder dest) + throws StringPrepParseException { final int srcLength = src.length(); final int[] cpBuffer = new int[srcLength]; - int cpBeingEncoded, delta, handledCPCount, basicLength, bias, j, nextLargerCodePoint, q, k, t, srcCPCount; + int cpBeingEncoded, + delta, + handledCPCount, + basicLength, + bias, + j, + nextLargerCodePoint, + q, + k, + t, + srcCPCount; char c, c2; if (showProgress) { @@ -118,34 +128,37 @@ public StringBuilder encode(CharSequence src, StringBuilder dest) throws StringP * Handle the basic code points and * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): */ - srcCPCount=0; - - for(j=0; j= 0 - ) { - cpBuffer[srcCPCount++]=0; + srcCPCount = 0; + + for (j = 0; j < srcLength; ++j) { + c = src.charAt(j); + if (c < 0x80 + // && basicToDigit[c] >= 0 + ) { + cpBuffer[srcCPCount++] = 0; dest.append(c); } else { - cpBeingEncoded=0; - if(!UTF16.isSurrogate(c)) { - cpBeingEncoded|=c; - } else if(UTF16.isLeadSurrogate(c) && (j+1)0) { + if (basicLength > 0) { dest.append(DELIMITER); } if (showProgress) { @@ -153,7 +166,6 @@ public StringBuilder encode(CharSequence src, StringBuilder dest) throws StringP System.out.println("cpBuffer: " + show(cpBuffer)); } - /* * handledCPCount is the number of code points that have been handled * basicLength is the number of basic code points @@ -161,12 +173,12 @@ public StringBuilder encode(CharSequence src, StringBuilder dest) throws StringP */ /* Initialize the state: */ - cpBeingEncoded=INITIAL_N; - delta=0; - bias=INITIAL_BIAS; + cpBeingEncoded = INITIAL_N; + delta = 0; + bias = INITIAL_BIAS; /* Main encoding loop: */ - for(handledCPCount=basicLength; handledCPCount state to , but guard against overflow: */ - 
if(nextLargerCodePoint-cpBeingEncoded>(0x7fffffff-srcLength-delta)/(handledCPCount+1)) { + if (nextLargerCodePoint - cpBeingEncoded + > (0x7fffffff - srcLength - delta) / (handledCPCount + 1)) { throw new IllegalStateException("Internal program error"); } // delta is composed of the delta to the next character * gap (the characters so far) - // below, we'll also add to it slowly as we find each new identical character in the input + // below, we'll also add to it slowly as we find each new identical character in the + // input - delta+=(nextLargerCodePoint-cpBeingEncoded)*(handledCPCount+1); - cpBeingEncoded=nextLargerCodePoint; + delta += (nextLargerCodePoint - cpBeingEncoded) * (handledCPCount + 1); + cpBeingEncoded = nextLargerCodePoint; if (showProgress) { System.out.println("\tStart Delta: " + Integer.toString(delta, 16)); } /* Encode a sequence of same code points n */ int startWriting = 0; - for(j=0; j=(bias+TMAX)) { - t=TMAX; + t = k - bias; + if (t < TMIN) { + t = TMIN; + } else if (k >= (bias + TMAX)) { + t = TMAX; } - if(q0;) { - if(src.charAt(--j)==DELIMITER) { + for (j = srcLength; j > 0; ) { + if (src.charAt(--j) == DELIMITER) { break; } } - basicLength=destCPCount=j; + basicLength = destCPCount = j; for (j = 0; j < basicLength; ++j) { - final char b=src.charAt(j); - if(b > 0x80 - // || basicToDigit[b] < 0 - ) { - throw new StringPrepParseException("Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND); + final char b = src.charAt(j); + if (b > 0x80 + // || basicToDigit[b] < 0 + ) { + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND); } dest.append(b); } /* Initialize the state: */ - n=INITIAL_N; - i=0; - bias=INITIAL_BIAS; - firstSupplementaryIndex=1000000000; + n = INITIAL_N; + i = 0; + bias = INITIAL_BIAS; + firstSupplementaryIndex = 1000000000; /* * Main decoding loop: * Start just after the last delimiter if any * basic code points were copied; start at the beginning otherwise. 
*/ - for(in=basicLength>0 ? basicLength+1 : 0; in 0 ? basicLength + 1 : 0; in < srcLength; /* no op */ ) { /* * in is the index of the next character to be consumed, and * destCPCount is the number of code points in the output array. @@ -318,36 +345,40 @@ public StringBuffer decode(CharSequence src, StringBuffer dest) throws StringPre * if we increase i as we go, then subtract off its starting * value at the end to obtain delta. */ - for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) { - if(in>=srcLength) { - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + for (oldi = i, w = 1, k = BASE; /* no condition */ ; k += BASE) { + if (in >= srcLength) { + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } - digit=basicToDigit[src.charAt(in++) & 0xFF]; - if(digit<0) { - throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND); + digit = basicToDigit[src.charAt(in++) & 0xFF]; + if (digit < 0) { + throw new StringPrepParseException( + "Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND); } - if(digit>(0x7fffffff-i)/w) { + if (digit > (0x7fffffff - i) / w) { /* integer overflow */ - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } - i+=digit*w; - t=k-bias; - if(t=(bias+TMAX)) { - t=TMAX; + i += digit * w; + t = k - bias; + if (t < TMIN) { + t = TMIN; + } else if (k >= (bias + TMAX)) { + t = TMAX; } - if(digit0x7fffffff/(BASE-t)) { + if (w > 0x7fffffff / (BASE - t)) { /* integer overflow */ - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } - w*=BASE-t; + w *= BASE - t; } /* @@ 
-356,25 +387,27 @@ public StringBuffer decode(CharSequence src, StringBuffer dest) throws StringPre * where needed instead of in for() loop tail. */ ++destCPCount; - bias=adaptBias(i-oldi, destCPCount, (oldi==0)); + bias = adaptBias(i - oldi, destCPCount, (oldi == 0)); /* * i was supposed to wrap around from (incremented) destCPCount to 0, * incrementing n each time, so we'll fix that now: */ - if(i/destCPCount>(0x7fffffff-n)) { + if (i / destCPCount > (0x7fffffff - n)) { /* integer overflow */ - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } - n+=i/destCPCount; - i%=destCPCount; + n += i / destCPCount; + i %= destCPCount; /* not needed for Punycode: */ /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ - if(n>0x10ffff || isSurrogate(n)) { + if (n > 0x10ffff || isSurrogate(n)) { /* Unicode code point overflow */ - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } /* Insert n at position i of the output: */ @@ -391,16 +424,16 @@ public StringBuffer decode(CharSequence src, StringBuffer dest) throws StringPre * Only the rare cases with supplementary code points are handled * more slowly - but not too bad since this is an insertion anyway. 
*/ - if(i<=firstSupplementaryIndex) { - codeUnitIndex=i; - if(n > 0xFFFF) { - firstSupplementaryIndex=codeUnitIndex; + if (i <= firstSupplementaryIndex) { + codeUnitIndex = i; + if (n > 0xFFFF) { + firstSupplementaryIndex = codeUnitIndex; } else { ++firstSupplementaryIndex; } } else { - codeUnitIndex=firstSupplementaryIndex; - codeUnitIndex=UTF16.moveCodePointOffset(dest, codeUnitIndex, i-codeUnitIndex); + codeUnitIndex = firstSupplementaryIndex; + codeUnitIndex = UTF16.moveCodePointOffset(dest, codeUnitIndex, i - codeUnitIndex); } /* use the UChar index codeUnitIndex instead of the code point index i */ @@ -412,14 +445,23 @@ public StringBuffer decode(CharSequence src, StringBuffer dest) throws StringPre } String valueOfCodePoint(int codepoint) { - return codepoint < 0x10000 ? String.valueOf((char)codepoint) : new StringBuilder(2).appendCodePoint(codepoint).toString(); + return codepoint < 0x10000 + ? String.valueOf((char) codepoint) + : new StringBuilder(2).appendCodePoint(codepoint).toString(); } + StringBuilder insertCodePoint(StringBuilder target, int offset, int codepoint) { - return codepoint < 0x10000 ? target.insert(offset, (char) codepoint) : target.insert(offset, valueOfCodePoint(codepoint)); + return codepoint < 0x10000 + ? target.insert(offset, (char) codepoint) + : target.insert(offset, valueOfCodePoint(codepoint)); } + StringBuffer insertCodePoint(StringBuffer target, int offset, int codepoint) { - return codepoint < 0x10000 ? target.insert(offset, (char) codepoint) : target.insert(offset, valueOfCodePoint(codepoint)); + return codepoint < 0x10000 + ? 
target.insert(offset, (char) codepoint) + : target.insert(offset, valueOfCodePoint(codepoint)); } + static String show(int[] source) { String result = ""; for (final int item : source) { @@ -428,4 +470,3 @@ static String show(int[] source) { return result; } } - diff --git a/unicodetools/src/main/java/org/unicode/draft/RadicalStroke2.java b/unicodetools/src/main/java/org/unicode/draft/RadicalStroke2.java index 6daeeedc7..0f137c63b 100644 --- a/unicodetools/src/main/java/org/unicode/draft/RadicalStroke2.java +++ b/unicodetools/src/main/java/org/unicode/draft/RadicalStroke2.java @@ -1,5 +1,7 @@ package org.unicode.draft; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; @@ -8,16 +10,11 @@ import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.CodePoints; -import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.PatternCache; import org.unicode.text.utility.Settings; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; - public class RadicalStroke2 { // U+3433 kRSUnicode 9.3 private static Pattern RAD_STROKE = PatternCache.get("U\\+([A-Z0-9]+)\\s+kRSUnicode\\s+(.*)"); @@ -43,14 +40,18 @@ private RadicalStroke2() { Matcher radStrokeMatcher = RAD_STROKE.matcher(""); Matcher radDataMatcher = RAD_DATA.matcher(""); Matcher iiCore = IICORE.matcher(""); - radStrokesToRadToRemainingStrokes = new TreeMap>>(); - remainder = ScriptCategories2.parseUnicodeSet("[:script=Han:]").removeAll(GeneratePickerData2.SKIP); + radStrokesToRadToRemainingStrokes = + new TreeMap>>(); + remainder = + ScriptCategories2.parseUnicodeSet("[:script=Han:]") + .removeAll(GeneratePickerData2.SKIP); String dataDir = Settings.CLDR.UCD_DATA_DIRECTORY + "/Unihan/"; - BufferedReader in = new BufferedReader( - new FileReader( - 
Subheader2.getFileNameFromPattern( - dataDir, "Unihan_RadicalStrokeCounts.*\\.txt"))); + BufferedReader in = + new BufferedReader( + new FileReader( + Subheader2.getFileNameFromPattern( + dataDir, "Unihan_RadicalStrokeCounts.*\\.txt"))); while (true) { String line = in.readLine(); @@ -70,10 +71,12 @@ private RadicalStroke2() { Integer radicalChar = ScriptCategories2.RADICAL_NUM2CHAR.get(radical); if (radicalChar == null) { in.close(); - throw new IllegalArgumentException("No radical value for <" + radical + ">"); + throw new IllegalArgumentException( + "No radical value for <" + radical + ">"); } charToRadical.put(cp, radicalChar); - int radicalStrokes = ScriptCategories2.RADICAL_CHAR2STROKES.get(radicalChar); + int radicalStrokes = + ScriptCategories2.RADICAL_CHAR2STROKES.get(radicalChar); int remainingStrokes = Integer.parseInt(radDataMatcher.group(2)); charToTotalStrokes.put(cp, radicalStrokes + remainingStrokes); charToRemainingStrokes.put(cp, remainingStrokes); @@ -81,10 +84,15 @@ private RadicalStroke2() { // if (radical.startsWith("211")) { // System.out.println(line); // } - // String baseRadical = radical.endsWith("'") ? radical.substring(0, radical.length()-1) : + // String baseRadical = radical.endsWith("'") ? 
radical.substring(0, + // radical.length()-1) : // radical; - RadicalStroke2.mapToUnicodeSetAdd(radStrokesToRadToRemainingStrokes, radicalStrokes, radical, - remainingStrokes, cp); + RadicalStroke2.mapToUnicodeSetAdd( + radStrokesToRadToRemainingStrokes, + radicalStrokes, + radical, + remainingStrokes, + cp); remainder.remove(cp); // if (radDataMatcher.group(2).equals("0") && radical.endsWith("'")) { // String radicalString = Normalizer.normalize(cp, Normalizer.NFKC); @@ -92,7 +100,8 @@ private RadicalStroke2() { // if (old == null) { // radicalToChar.put(radical, radicalString); // } else if (!radicalString.equals(old)) { - // System.out.println("Duplicate radical: " + line + " with " + radicalString + " and " + old); + // System.out.println("Duplicate radical: " + line + " with " + + // radicalString + " and " + old); // } // } } @@ -108,7 +117,8 @@ private RadicalStroke2() { charToRemainingStrokes.freeze(); charToRadical.freeze(); - radStrokesToRadToRemainingStrokes = CldrUtility.protectCollection(radStrokesToRadToRemainingStrokes); + radStrokesToRadToRemainingStrokes = + CldrUtility.protectCollection(radStrokesToRadToRemainingStrokes); // UnicodeSet temp = new UnicodeSet(); // for (UnicodeSetIterator it = new @@ -116,24 +126,33 @@ private RadicalStroke2() { // temp.add(it.codepoint); // if (temp.size() >= 800) { // int code = temp.charAt(0); - // CATEGORYTABLE.add("Han (CJK)", false, UTF16.valueOf(code) + " Han " + toHex(code, false), false, temp); + // CATEGORYTABLE.add("Han (CJK)", false, UTF16.valueOf(code) + " Han " + toHex(code, + // false), false, temp); // temp.clear(); // } // } // if (temp.size() > 0) { // int code = temp.charAt(0); - // CATEGORYTABLE.add("Han (CJK)", false, UTF16.valueOf(code) + " Han " + toHex(code, false), false, temp); + // CATEGORYTABLE.add("Han (CJK)", false, UTF16.valueOf(code) + " Han " + toHex(code, + // false), false, temp); // } } catch (IOException e) { throw new IllegalArgumentException(e); } } - static void 
mapToUnicodeSetAdd(Map>> index, - int radicalStrokes, String radicalChar, int remainingStrokes, int cp) { + static void mapToUnicodeSetAdd( + Map>> index, + int radicalStrokes, + String radicalChar, + int remainingStrokes, + int cp) { Map> subIndex = index.get(radicalStrokes); if (subIndex == null) { - index.put(radicalStrokes, subIndex = new TreeMap>(GeneratePickerData2.UCA)); + index.put( + radicalStrokes, + subIndex = + new TreeMap>(GeneratePickerData2.UCA)); } Map uset = subIndex.get(radicalChar); if (uset == null) { @@ -146,43 +165,43 @@ static void mapToUnicodeSetAdd(Map RadicalStrokeComparator = new Comparator() { - CodePoints cps1 = new CodePoints(""); - CodePoints cps2 = new CodePoints(""); - - public int compare(CharSequence o1, CharSequence o2) { - cps1.reset(o1); - cps2.reset(o2); - boolean n1 = cps1.next(); - boolean n2 = cps2.next(); - // shorter strings are less - if (!n1) { - return n2 ? -1 : 0; - } else if (!n2) { - return 1; - } - int cp1 = cps1.getCodePoint(); - int cp2 = cps2.getCodePoint(); - - // lower stroke counts are less (null counts as zero) - Integer s1 = SINGLETON.charToTotalStrokes.get(cp1); - Integer s2 = SINGLETON.charToTotalStrokes.get(cp2); - if (s1 == null && s2 == null) { - // no info, return codepoint order - return cp1 - cp2; - } - int ss1 = s1 == null ? 0 : s1; - int ss2 = s2 == null ? 0 : s2; - if (ss1 < ss2) return -1; - if (ss1 > ss2) return 1; - - Integer r1 = SINGLETON.charToRadical.get(cp1); - Integer r2 = SINGLETON.charToRadical.get(cp2); - if (r1 < r2) return -1; - if (r1 > r2) return 1; - // no other diff, return codepoint order - return cp1 - cp2; - } - }; - + static Comparator RadicalStrokeComparator = + new Comparator() { + CodePoints cps1 = new CodePoints(""); + CodePoints cps2 = new CodePoints(""); + + public int compare(CharSequence o1, CharSequence o2) { + cps1.reset(o1); + cps2.reset(o2); + boolean n1 = cps1.next(); + boolean n2 = cps2.next(); + // shorter strings are less + if (!n1) { + return n2 ? 
-1 : 0; + } else if (!n2) { + return 1; + } + int cp1 = cps1.getCodePoint(); + int cp2 = cps2.getCodePoint(); + + // lower stroke counts are less (null counts as zero) + Integer s1 = SINGLETON.charToTotalStrokes.get(cp1); + Integer s2 = SINGLETON.charToTotalStrokes.get(cp2); + if (s1 == null && s2 == null) { + // no info, return codepoint order + return cp1 - cp2; + } + int ss1 = s1 == null ? 0 : s1; + int ss2 = s2 == null ? 0 : s2; + if (ss1 < ss2) return -1; + if (ss1 > ss2) return 1; + + Integer r1 = SINGLETON.charToRadical.get(cp1); + Integer r2 = SINGLETON.charToRadical.get(cp2); + if (r1 < r2) return -1; + if (r1 > r2) return 1; + // no other diff, return codepoint order + return cp1 - cp2; + } + }; } diff --git a/unicodetools/src/main/java/org/unicode/draft/ScriptCategories2.java b/unicodetools/src/main/java/org/unicode/draft/ScriptCategories2.java index ee8f11eb6..30ae88f06 100644 --- a/unicodetools/src/main/java/org/unicode/draft/ScriptCategories2.java +++ b/unicodetools/src/main/java/org/unicode/draft/ScriptCategories2.java @@ -1,5 +1,12 @@ package org.unicode.draft; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.VersionInfo; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; @@ -13,125 +20,140 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.draft.ScriptMetadata; import org.unicode.cldr.draft.ScriptMetadata.IdUsage; import org.unicode.cldr.util.With; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.VersionInfo; - public class ScriptCategories2 { private static final 
boolean DEBUG = false; private static final boolean DEBUG_MAIN = false; - //@formatter:off + // @formatter:off // From: http://www.phon.ucl.ac.uk/home/wells/ipa-unicode.htm - public static final UnicodeSet IPA = (UnicodeSet) new UnicodeSet( - "[a-zæçðøħŋœǀ-ǃɐ-ɨɪ-ɶ ɸ-ɻɽɾʀ-ʄʈ-ʒʔʕʘʙʛ-ʝʟʡʢ ʤʧʰ-ʲʴʷʼˈˌːˑ˞ˠˤ̀́̃̄̆̈ ̘̊̋̏-̜̚-̴̠̤̥̩̪̬̯̰̹-̽͜ ͡βθχ↑-↓↗↘]" - ).freeze(); + public static final UnicodeSet IPA = + (UnicodeSet) + new UnicodeSet( + "[a-zæçðøħŋœǀ-ǃɐ-ɨɪ-ɶ ɸ-ɻɽɾʀ-ʄʈ-ʒʔʕʘʙʛ-ʝʟʡʢ ʤʧʰ-ʲʴʷʼˈˌːˑ˞ˠˤ̀́̃̄̆̈ ̘̊̋̏-̜̚-̴̠̤̥̩̪̬̯̰̹-̽͜ ͡βθχ↑-↓↗↘]") + .freeze(); - public static final UnicodeSet IPA_EXTENSIONS = (UnicodeSet) new UnicodeSet( - "[ɩɷɼɿʅ-ʇʓʖʗʚʞʠʣʥʦʨ-ʯ]" - ).freeze(); + public static final UnicodeSet IPA_EXTENSIONS = + (UnicodeSet) new UnicodeSet("[ɩɷɼɿʅ-ʇʓʖʗʚʞʠʣʥʦʨ-ʯ]").freeze(); - public static final UnicodeSet DEPRECATED_NEW = (UnicodeSet) new UnicodeSet( - "[[:deprecated:][\\u0149\\u0F77\\u0F79\\u17A4\\u2329\\u232A]-[\\u0340\\u0341\\u17D3]]").freeze(); + public static final UnicodeSet DEPRECATED_NEW = + (UnicodeSet) + new UnicodeSet( + "[[:deprecated:][\\u0149\\u0F77\\u0F79\\u17A4\\u2329\\u232A]-[\\u0340\\u0341\\u17D3]]") + .freeze(); // removing 0340, 0341, 17D3, and adding 0149, 0F77, 0F79, 17A4, 2329, 232A // TODO - change to Blocks - public static final UnicodeSet ARCHAIC_31 = (UnicodeSet) new UnicodeSet( - // "[[:script=Bugi:][:script=Buhd:][:script=Cari:][:script=Copt:]" + - // "[:script=Cprt:][:script=Dsrt:][:script=Glag:][:script=Goth:][:script=Hano:][:script=Ital:][:script=Khar:][:script=Linb:]" - // + - // "[:script=Lyci:][:script=Lydi:][:script=Ogam:][:script=Osma:][:script=Phag:][:script=Phnx:][:script=Rjng:][:script=Runr:]" - // + - // "[:script=Shaw:][:script=Sund:][:script=Sylo:][:script=Syrc:][:script=Tagb:][:script=Tglg:][:script=Ugar:][:script=Xpeo:][:script=Xsux:]" - // + - // "[:block=Ancient_Greek_Musical_Notation:][:block=Phaistos_Disc:]]" - "[ [:blk=Ancient_Greek_Musical_Notation:]" + - "[:blk=Buginese:] " + - "[:blk=Buhid:] [:blk=Carian:] " + - 
"[:blk=Coptic:] [:blk=Cuneiform:] " + - "[:blk=Cuneiform_Numbers_And_Punctuation:] " + - "[:blk=Cypriot_Syllabary:] [:blk=Deseret:] [:blk=Glagolitic:] " + - "[:blk=Gothic:] [:blk=Hanunoo:] [:blk=Kharoshthi:] [:blk=Linear_B_Ideograms:] " + - "[:blk=Linear_B_Syllabary:] [:blk=Lycian:] [:blk=Lydian:] [:blk=Ogham:]" + - "[:blk=Old_Italic:] [:blk=Old_Persian:] [:blk=Osmanya:] [:blk=Phags_Pa:] " + - "[:blk=Phaistos_Disc:] [:blk=Phoenician:] [:blk=Rejang:] [:blk=Runic:] " + - "[:blk=Shavian:] [:blk=Sundanese:] [:blk=Syloti_Nagri:] [:blk=Syriac:] " + - "[:blk=Tagalog:] [:blk=Tagbanwa:] [:blk=Ugaritic:] [:sc=Copt:]]" - ).freeze(); + public static final UnicodeSet ARCHAIC_31 = + (UnicodeSet) + new UnicodeSet( + // "[[:script=Bugi:][:script=Buhd:][:script=Cari:][:script=Copt:]" + + // "[:script=Cprt:][:script=Dsrt:][:script=Glag:][:script=Goth:][:script=Hano:][:script=Ital:][:script=Khar:][:script=Linb:]" + // + + // "[:script=Lyci:][:script=Lydi:][:script=Ogam:][:script=Osma:][:script=Phag:][:script=Phnx:][:script=Rjng:][:script=Runr:]" + // + + // "[:script=Shaw:][:script=Sund:][:script=Sylo:][:script=Syrc:][:script=Tagb:][:script=Tglg:][:script=Ugar:][:script=Xpeo:][:script=Xsux:]" + // + + // "[:block=Ancient_Greek_Musical_Notation:][:block=Phaistos_Disc:]]" + "[ [:blk=Ancient_Greek_Musical_Notation:]" + + "[:blk=Buginese:] " + + "[:blk=Buhid:] [:blk=Carian:] " + + "[:blk=Coptic:] [:blk=Cuneiform:] " + + "[:blk=Cuneiform_Numbers_And_Punctuation:] " + + "[:blk=Cypriot_Syllabary:] [:blk=Deseret:] [:blk=Glagolitic:] " + + "[:blk=Gothic:] [:blk=Hanunoo:] [:blk=Kharoshthi:] [:blk=Linear_B_Ideograms:] " + + "[:blk=Linear_B_Syllabary:] [:blk=Lycian:] [:blk=Lydian:] [:blk=Ogham:]" + + "[:blk=Old_Italic:] [:blk=Old_Persian:] [:blk=Osmanya:] [:blk=Phags_Pa:] " + + "[:blk=Phaistos_Disc:] [:blk=Phoenician:] [:blk=Rejang:] [:blk=Runic:] " + + "[:blk=Shavian:] [:blk=Sundanese:] [:blk=Syloti_Nagri:] [:blk=Syriac:] " + + "[:blk=Tagalog:] [:blk=Tagbanwa:] [:blk=Ugaritic:] [:sc=Copt:]]") + 
.freeze(); // from the old version of UTS39 - public static final UnicodeSet ARCHAIC_39 = (UnicodeSet) new UnicodeSet( - // "[\\u018D\\u01AA-\\u01AB\\u01B9-\\u01BB\\u01BE\\u01BF\\u021C-\\u021D\\u025F\\u0277\\u027C\\u029E\\u0343" + - // "\\u03D0-\\u03D1\\u03D5-\\u03E1\\u03F7-\\u03F8\\u03F9-\\u03FB\\u0483-\\u0486\\u05A2\\u05C5-\\u05C7\\u066E-\\u066F\\u068E\\u0CDE\\u10F1-\\u10F6\\u1100-\\u1159" - // + - // "\\u115A-\\u115E\\u1161-\\u11A2\\u11A3-\\u11A7\\u11A8-\\u11F9\\u11FA-\\u11FF\\u1680-\\u169A\\u16A0-\\u16EA\\u16EE-\\u16F0\\u1700-\\u170C\\u170E-\\u1714" - // + - // "\\u1720-\\u1734\\u1740-\\u1753\\u1760-\\u176C\\u176E-\\u1770\\u1772-\\u1773\\u17A8\\u17D1\\u17DD\\u1B00-\\u1B4B\\u1B50-\\u1B7C\\u1DC0-\\u1DC3" - // + - // "\\u2C00-\\u2C2E\\u2C30-\\u2C5E\\u3165-\\u318E\\uA700-\\uA707\\uA840-\\uA877"+ - // "\\U00010000-\\U0001000B\\U0001000D-\\U00010026\\U00010028-\\U0001003A\\U0001003C-\\U0001003D\\U0001003F-\\U0001004D" - // + - // "\\U00010050-\\U0001005D\\U00010080-\\U000100FA\\U00010140-\\U00010174\\U00010300-\\U0001031E\\U00010330-\\U0001034A" - // + - // "\\U00010380-\\U0001039D\\U0001039F-\\U000103C3\\U000103C8-\\U000103D5\\U00010400-\\U0001049D\\U000104A0-\\U000104A9" - // + - // "\\U00010800-\\U00010805\\U00010808\\U0001080A-\\U00010835\\U00010837-\\U00010838\\U0001083C\\U0001083F\\U00010900-\\U00010919" - // + - // "\\U0001091F\\U00010A00-\\U00010A03\\U00010A05-\\U00010A06\\U00010A0C-\\U00010A13\\U00010A15-\\U00010A17\\U00010A19-\\U00010A33" - // + - // "\\U00010A38-\\U00010A3A\\U00010A3F-\\U00010A47\\U00010A50-\\U00010A58\\U00012000-\\U0001236E\\U00012400-\\U00012462\\U00012470-\\U00012473]" - "[ " + - // "[:blk=Balinese:] " + - "[:blk=Ancient_Greek_Numbers:]" + - "[:Block=Hangul_Jamo:]" + - "[:Block=Hangul_Compatibility_Jamo:]" + - "[֢ ׅ ̓ ᷀-᷃ ҃-҆ ׇ ៑ ៝ ׆ ꜀-꜇ ɟ ʞ ɷ ɼ ƪ ƾ ƫ ƍ ƹ ƺ ȝȜ ƿ ƻ ϐ ϝϜ ϛϚ ϑ ϗ ϖ ϻϺ ϟϞ ϙϘ Ϲ ϕ ϡϠ ϸ Ϸ ჱ-ჶ ٮ ڎ ٯ ೞ ឨ]" + - "]" - // 
"[\u018D\u01AA\u01AB\u01B9-\u01BB\u01BE\u01BF\u021C\u021D\u025F\u0277\u027C\u029E\u0343\u03D0\u03D1\u03D5-\u03E1\u03F7-\u03FB\u0483-\u0486\u05A2\u05C5-\u05C7\u066E\u066F\u068E\u0CDE\u10F1-\u10F6\u1100-\u115E\u1161-\u11FF\u17A8\u17D1\u17DD\u1DC0-\u1DC3\u3165-\u318E\uA700-\uA707\\U00010140-\\U00010174]]" - ).freeze(); - - public static final UnicodeSet ARCHAIC_HEURISTIC = (UnicodeSet) new UnicodeSet( - "[ " + - "[:blk=Ancient_Symbols:]" + - "[:blk=Ancient_Greek_Musical_Notation:] " + - "[:blk=Cyrillic_Extended_A:] " + - "[:blk=Cyrillic_Extended_B:]" + - "[˯-˿ͣ-ͳͶͷߨ-ߪ᷎-᷿ᷦ᷾ẜẝẟ Ỻ-ỿ⁖⁘-⁞ↀ-Ↄↅ-ↈⱷ-ⱽ⸀-⸗⸪-⸰ ꜠꜡ꜰ-ꝸꟻ-ꟿ[ݾ ݿ ػ-ؿ]]" + - "]" - // "[\u02EF-\u02FF\u0363-\u0373\u0376\u0377\u07E8-\u07EA\u1DCE-\u1DE6\u1DFE\u1DFF\u1E9C\u1E9D\u1E9F\u1EFA-\u1EFF\u2056\u2058-\u205E\u2180-\u2183\u2185-\u2188\u2C77-\u2C7D\u2E00-\u2E17\u2E2A-\u2E30\uA720\uA721\uA730-\uA778\uA7FB-\uA7FF]]" - ).freeze(); - - public static final UnicodeSet ARCHAIC_ADDITIONS = (UnicodeSet) new UnicodeSet( - "[ " + - "[:blk=Aegean_Numbers:] " + - "[:blk=Byzantine_Musical_Symbols:] " + - "[:block=Georgian Supplement:]" + - "[ͻ-ͽϏϽ-Ͽ[ƨ ƽ ƅ][ؕ-ؚ ۖ-ۤ ۧ ۨ ۪-ۭ ۩ ۥ ۦ][֑-֯][ׄ ׅ][ﬠ-ﬨ][ﭏ][Ⴀ-Ⴆ Ⴡ Ⴇ-Ⴌ Ⴢ Ⴍ-Ⴒ Ⴣ Ⴓ-Ⴞ Ⴤ Ⴟ Ⴠ Ⴥ][Ⴀ-Ⴥ][ƄƧƸƼǷϲϴↄ]჻]" + - "]" - // "[\u0269\u027F\u0285-\u0287\u0293\u0296\u0297\u029A\u02A0\u02A3\u02A5\u02A6\u02A8-\u02AF\u0313\u037B-\u037D\u03CF\u03FD-\u03FF]]" - ).freeze(); - - public static final UnicodeSet ARCHAIC = (UnicodeSet) new UnicodeSet(ARCHAIC_31) - .addAll(ARCHAIC_39) - .addAll(ARCHAIC_HEURISTIC) - .addAll(ARCHAIC_ADDITIONS) - .removeAll(IPA) - .removeAll(IPA_EXTENSIONS) - .freeze(); + public static final UnicodeSet ARCHAIC_39 = + (UnicodeSet) + new UnicodeSet( + // "[\\u018D\\u01AA-\\u01AB\\u01B9-\\u01BB\\u01BE\\u01BF\\u021C-\\u021D\\u025F\\u0277\\u027C\\u029E\\u0343" + + // "\\u03D0-\\u03D1\\u03D5-\\u03E1\\u03F7-\\u03F8\\u03F9-\\u03FB\\u0483-\\u0486\\u05A2\\u05C5-\\u05C7\\u066E-\\u066F\\u068E\\u0CDE\\u10F1-\\u10F6\\u1100-\\u1159" + // + + // 
"\\u115A-\\u115E\\u1161-\\u11A2\\u11A3-\\u11A7\\u11A8-\\u11F9\\u11FA-\\u11FF\\u1680-\\u169A\\u16A0-\\u16EA\\u16EE-\\u16F0\\u1700-\\u170C\\u170E-\\u1714" + // + + // "\\u1720-\\u1734\\u1740-\\u1753\\u1760-\\u176C\\u176E-\\u1770\\u1772-\\u1773\\u17A8\\u17D1\\u17DD\\u1B00-\\u1B4B\\u1B50-\\u1B7C\\u1DC0-\\u1DC3" + // + + // "\\u2C00-\\u2C2E\\u2C30-\\u2C5E\\u3165-\\u318E\\uA700-\\uA707\\uA840-\\uA877"+ + // "\\U00010000-\\U0001000B\\U0001000D-\\U00010026\\U00010028-\\U0001003A\\U0001003C-\\U0001003D\\U0001003F-\\U0001004D" + // + + // "\\U00010050-\\U0001005D\\U00010080-\\U000100FA\\U00010140-\\U00010174\\U00010300-\\U0001031E\\U00010330-\\U0001034A" + // + + // "\\U00010380-\\U0001039D\\U0001039F-\\U000103C3\\U000103C8-\\U000103D5\\U00010400-\\U0001049D\\U000104A0-\\U000104A9" + // + + // "\\U00010800-\\U00010805\\U00010808\\U0001080A-\\U00010835\\U00010837-\\U00010838\\U0001083C\\U0001083F\\U00010900-\\U00010919" + // + + // "\\U0001091F\\U00010A00-\\U00010A03\\U00010A05-\\U00010A06\\U00010A0C-\\U00010A13\\U00010A15-\\U00010A17\\U00010A19-\\U00010A33" + // + + // "\\U00010A38-\\U00010A3A\\U00010A3F-\\U00010A47\\U00010A50-\\U00010A58\\U00012000-\\U0001236E\\U00012400-\\U00012462\\U00012470-\\U00012473]" + "[ " + + + // "[:blk=Balinese:] " + + "[:blk=Ancient_Greek_Numbers:]" + + "[:Block=Hangul_Jamo:]" + + "[:Block=Hangul_Compatibility_Jamo:]" + + "[֢ ׅ ̓ ᷀-᷃ ҃-҆ ׇ ៑ ៝ ׆ ꜀-꜇ ɟ ʞ ɷ ɼ ƪ ƾ ƫ ƍ ƹ ƺ ȝȜ ƿ ƻ ϐ ϝϜ ϛϚ ϑ ϗ ϖ ϻϺ ϟϞ ϙϘ Ϲ ϕ ϡϠ ϸ Ϸ ჱ-ჶ ٮ ڎ ٯ ೞ ឨ]" + + "]" + // "[\u018D\u01AA\u01AB\u01B9-\u01BB\u01BE\u01BF\u021C\u021D\u025F\u0277\u027C\u029E\u0343\u03D0\u03D1\u03D5-\u03E1\u03F7-\u03FB\u0483-\u0486\u05A2\u05C5-\u05C7\u066E\u066F\u068E\u0CDE\u10F1-\u10F6\u1100-\u115E\u1161-\u11FF\u17A8\u17D1\u17DD\u1DC0-\u1DC3\u3165-\u318E\uA700-\uA707\\U00010140-\\U00010174]]" + ) + .freeze(); + + public static final UnicodeSet ARCHAIC_HEURISTIC = + (UnicodeSet) + new UnicodeSet( + "[ " + + "[:blk=Ancient_Symbols:]" + + "[:blk=Ancient_Greek_Musical_Notation:] " + + 
"[:blk=Cyrillic_Extended_A:] " + + "[:blk=Cyrillic_Extended_B:]" + + "[˯-˿ͣ-ͳͶͷߨ-ߪ᷎-᷿ᷦ᷾ẜẝẟ Ỻ-ỿ⁖⁘-⁞ↀ-Ↄↅ-ↈⱷ-ⱽ⸀-⸗⸪-⸰ ꜠꜡ꜰ-ꝸꟻ-ꟿ[ݾ ݿ ػ-ؿ]]" + + "]" + // "[\u02EF-\u02FF\u0363-\u0373\u0376\u0377\u07E8-\u07EA\u1DCE-\u1DE6\u1DFE\u1DFF\u1E9C\u1E9D\u1E9F\u1EFA-\u1EFF\u2056\u2058-\u205E\u2180-\u2183\u2185-\u2188\u2C77-\u2C7D\u2E00-\u2E17\u2E2A-\u2E30\uA720\uA721\uA730-\uA778\uA7FB-\uA7FF]]" + ) + .freeze(); + + public static final UnicodeSet ARCHAIC_ADDITIONS = + (UnicodeSet) + new UnicodeSet( + "[ " + + "[:blk=Aegean_Numbers:] " + + "[:blk=Byzantine_Musical_Symbols:] " + + "[:block=Georgian Supplement:]" + + "[ͻ-ͽϏϽ-Ͽ[ƨ ƽ ƅ][ؕ-ؚ ۖ-ۤ ۧ ۨ ۪-ۭ ۩ ۥ ۦ][֑-֯][ׄ ׅ][ﬠ-ﬨ][ﭏ][Ⴀ-Ⴆ Ⴡ Ⴇ-Ⴌ Ⴢ Ⴍ-Ⴒ Ⴣ Ⴓ-Ⴞ Ⴤ Ⴟ Ⴠ Ⴥ][Ⴀ-Ⴥ][ƄƧƸƼǷϲϴↄ]჻]" + + "]" + // "[\u0269\u027F\u0285-\u0287\u0293\u0296\u0297\u029A\u02A0\u02A3\u02A5\u02A6\u02A8-\u02AF\u0313\u037B-\u037D\u03CF\u03FD-\u03FF]]" + ) + .freeze(); + + public static final UnicodeSet ARCHAIC = + (UnicodeSet) + new UnicodeSet(ARCHAIC_31) + .addAll(ARCHAIC_39) + .addAll(ARCHAIC_HEURISTIC) + .addAll(ARCHAIC_ADDITIONS) + .removeAll(IPA) + .removeAll(IPA_EXTENSIONS) + .freeze(); + static { - UnicodeSet knownOk = new UnicodeSet("[\u0392\u0398\u03A0\u03A6\u03B2\u03B8\u03C0\u03C6[\u10C7\u10CD]\u1C88]"); - final UnicodeSet caseProblems = new UnicodeSet(ARCHAIC).closeOver(UnicodeSet.CASE).removeAll(ARCHAIC) - .removeAll(knownOk); + UnicodeSet knownOk = + new UnicodeSet( + "[\u0392\u0398\u03A0\u03A6\u03B2\u03B8\u03C0\u03C6[\u10C7\u10CD]\u1C88]"); + final UnicodeSet caseProblems = + new UnicodeSet(ARCHAIC) + .closeOver(UnicodeSet.CASE) + .removeAll(ARCHAIC) + .removeAll(knownOk); if (caseProblems.size() != 0) { System.err.println("Case: " + caseProblems); } @@ -147,248 +169,248 @@ public class ScriptCategories2 { Map char2strokes = new LinkedHashMap(); String[][] radicalData = { - { "1", "一", "1" }, - { "2", "丨", "1" }, - { "3", "丶", "1" }, - { "4", "丿", "1" }, - { "5", "乙", "1" }, - { "6", "亅", "1" }, - { "7", "二", "2" }, - { "8", "亠", "2" }, - { "9", "人", "2" }, - 
{ "10", "儿", "2" }, - { "11", "入", "2" }, - { "12", "八", "2" }, - { "13", "冂", "2" }, - { "14", "冖", "2" }, - { "15", "冫", "2" }, - { "16", "几", "2" }, - { "17", "凵", "2" }, - { "18", "刀", "2" }, - { "19", "力", "2" }, - { "20", "勹", "2" }, - { "21", "匕", "2" }, - { "22", "匚", "2" }, - { "23", "匸", "2" }, - { "24", "十", "2" }, - { "25", "卜", "2" }, - { "26", "卩", "2" }, - { "27", "厂", "2" }, - { "28", "厶", "2" }, - { "29", "又", "2" }, - { "30", "口", "3" }, - { "31", "囗", "3" }, - { "32", "土", "3" }, - { "33", "士", "3" }, - { "34", "夂", "3" }, - { "35", "夊", "3" }, - { "36", "夕", "3" }, - { "37", "大", "3" }, - { "38", "女", "3" }, - { "39", "子", "3" }, - { "40", "宀", "3" }, - { "41", "寸", "3" }, - { "42", "小", "3" }, - { "43", "尢", "3" }, - { "44", "尸", "3" }, - { "45", "屮", "3" }, - { "46", "山", "3" }, - { "47", "巛", "3" }, - { "48", "工", "3" }, - { "49", "己", "3" }, - { "50", "巾", "3" }, - { "51", "干", "3" }, - { "52", "幺", "3" }, - { "53", "广", "3" }, - { "54", "廴", "3" }, - { "55", "廾", "3" }, - { "56", "弋", "3" }, - { "57", "弓", "3" }, - { "58", "彐", "3" }, - { "59", "彡", "3" }, - { "60", "彳", "3" }, - { "61", "心", "4" }, - { "62", "戈", "4" }, - { "63", "戶", "4" }, - { "64", "手", "4" }, - { "65", "支", "4" }, - { "66", "攴", "4" }, - { "67", "文", "4" }, - { "68", "斗", "4" }, - { "69", "斤", "4" }, - { "70", "方", "4" }, - { "71", "无", "4" }, - { "72", "日", "4" }, - { "73", "曰", "4" }, - { "74", "月", "4" }, - { "75", "木", "4" }, - { "76", "欠", "4" }, - { "77", "止", "4" }, - { "78", "歹", "4" }, - { "79", "殳", "4" }, - { "80", "毋", "4" }, - { "81", "比", "4" }, - { "82", "毛", "4" }, - { "83", "氏", "4" }, - { "84", "气", "4" }, - { "85", "水", "4" }, - { "86", "火", "4" }, - { "87", "爪", "4" }, - { "88", "父", "4" }, - { "89", "爻", "4" }, - { "90", "爿", "4" }, - { "90'", "\u4E2C", "3" }, - { "91", "片", "4" }, - { "92", "牙", "4" }, - { "93", "牛", "4" }, - { "94", "犬", "4" }, - { "95", "玄", "5" }, - { "96", "玉", "5" }, - { "97", "瓜", "5" }, - { "98", "瓦", "5" }, - { "99", "甘", 
"5" }, - { "100", "生", "5" }, - { "101", "用", "5" }, - { "102", "田", "5" }, - { "103", "疋", "5" }, - { "104", "疒", "5" }, - { "105", "癶", "5" }, - { "106", "白", "5" }, - { "107", "皮", "5" }, - { "108", "皿", "5" }, - { "109", "目", "5" }, - { "110", "矛", "5" }, - { "111", "矢", "5" }, - { "112", "石", "5" }, - { "113", "示", "5" }, - { "114", "禸", "5" }, - { "115", "禾", "5" }, - { "116", "穴", "5" }, - { "117", "立", "5" }, - { "118", "竹", "6" }, - { "119", "米", "6" }, - { "120", "糸", "6" }, - { "120'", "纟", "3" }, - { "121", "缶", "6" }, - { "122", "网", "6" }, - { "123", "羊", "6" }, - { "124", "羽", "6" }, - { "125", "老", "6" }, - { "126", "而", "6" }, - { "127", "耒", "6" }, - { "128", "耳", "6" }, - { "129", "聿", "6" }, - { "130", "肉", "6" }, - { "131", "臣", "6" }, - { "132", "自", "6" }, - { "133", "至", "6" }, - { "134", "臼", "6" }, - { "135", "舌", "6" }, - { "136", "舛", "6" }, - { "137", "舟", "6" }, - { "138", "艮", "6" }, - { "139", "色", "6" }, - { "140", "艸", "6" }, - { "141", "虍", "6" }, - { "142", "虫", "6" }, - { "143", "血", "6" }, - { "144", "行", "6" }, - { "145", "衣", "6" }, - { "146", "襾", "6" }, - { "147", "見", "7" }, - { "147'", "见", "4" }, - { "148", "角", "7" }, - { "149", "言", "7" }, - { "149'", "讠", "2" }, - { "150", "谷", "7" }, - { "151", "豆", "7" }, - { "152", "豕", "7" }, - { "153", "豸", "7" }, - { "154", "貝", "7" }, - { "154'", "贝", "4" }, - { "155", "赤", "7" }, - { "156", "走", "7" }, - { "157", "足", "7" }, - { "158", "身", "7" }, - { "159", "車", "7" }, - { "159'", "车", "4" }, - { "160", "辛", "7" }, - { "161", "辰", "7" }, - { "162", "辵", "7" }, - { "162'", "\u8FB6", "4" }, + {"1", "一", "1"}, + {"2", "丨", "1"}, + {"3", "丶", "1"}, + {"4", "丿", "1"}, + {"5", "乙", "1"}, + {"6", "亅", "1"}, + {"7", "二", "2"}, + {"8", "亠", "2"}, + {"9", "人", "2"}, + {"10", "儿", "2"}, + {"11", "入", "2"}, + {"12", "八", "2"}, + {"13", "冂", "2"}, + {"14", "冖", "2"}, + {"15", "冫", "2"}, + {"16", "几", "2"}, + {"17", "凵", "2"}, + {"18", "刀", "2"}, + {"19", "力", "2"}, + {"20", "勹", "2"}, + 
{"21", "匕", "2"}, + {"22", "匚", "2"}, + {"23", "匸", "2"}, + {"24", "十", "2"}, + {"25", "卜", "2"}, + {"26", "卩", "2"}, + {"27", "厂", "2"}, + {"28", "厶", "2"}, + {"29", "又", "2"}, + {"30", "口", "3"}, + {"31", "囗", "3"}, + {"32", "土", "3"}, + {"33", "士", "3"}, + {"34", "夂", "3"}, + {"35", "夊", "3"}, + {"36", "夕", "3"}, + {"37", "大", "3"}, + {"38", "女", "3"}, + {"39", "子", "3"}, + {"40", "宀", "3"}, + {"41", "寸", "3"}, + {"42", "小", "3"}, + {"43", "尢", "3"}, + {"44", "尸", "3"}, + {"45", "屮", "3"}, + {"46", "山", "3"}, + {"47", "巛", "3"}, + {"48", "工", "3"}, + {"49", "己", "3"}, + {"50", "巾", "3"}, + {"51", "干", "3"}, + {"52", "幺", "3"}, + {"53", "广", "3"}, + {"54", "廴", "3"}, + {"55", "廾", "3"}, + {"56", "弋", "3"}, + {"57", "弓", "3"}, + {"58", "彐", "3"}, + {"59", "彡", "3"}, + {"60", "彳", "3"}, + {"61", "心", "4"}, + {"62", "戈", "4"}, + {"63", "戶", "4"}, + {"64", "手", "4"}, + {"65", "支", "4"}, + {"66", "攴", "4"}, + {"67", "文", "4"}, + {"68", "斗", "4"}, + {"69", "斤", "4"}, + {"70", "方", "4"}, + {"71", "无", "4"}, + {"72", "日", "4"}, + {"73", "曰", "4"}, + {"74", "月", "4"}, + {"75", "木", "4"}, + {"76", "欠", "4"}, + {"77", "止", "4"}, + {"78", "歹", "4"}, + {"79", "殳", "4"}, + {"80", "毋", "4"}, + {"81", "比", "4"}, + {"82", "毛", "4"}, + {"83", "氏", "4"}, + {"84", "气", "4"}, + {"85", "水", "4"}, + {"86", "火", "4"}, + {"87", "爪", "4"}, + {"88", "父", "4"}, + {"89", "爻", "4"}, + {"90", "爿", "4"}, + {"90'", "\u4E2C", "3"}, + {"91", "片", "4"}, + {"92", "牙", "4"}, + {"93", "牛", "4"}, + {"94", "犬", "4"}, + {"95", "玄", "5"}, + {"96", "玉", "5"}, + {"97", "瓜", "5"}, + {"98", "瓦", "5"}, + {"99", "甘", "5"}, + {"100", "生", "5"}, + {"101", "用", "5"}, + {"102", "田", "5"}, + {"103", "疋", "5"}, + {"104", "疒", "5"}, + {"105", "癶", "5"}, + {"106", "白", "5"}, + {"107", "皮", "5"}, + {"108", "皿", "5"}, + {"109", "目", "5"}, + {"110", "矛", "5"}, + {"111", "矢", "5"}, + {"112", "石", "5"}, + {"113", "示", "5"}, + {"114", "禸", "5"}, + {"115", "禾", "5"}, + {"116", "穴", "5"}, + {"117", "立", "5"}, + {"118", "竹", 
"6"}, + {"119", "米", "6"}, + {"120", "糸", "6"}, + {"120'", "纟", "3"}, + {"121", "缶", "6"}, + {"122", "网", "6"}, + {"123", "羊", "6"}, + {"124", "羽", "6"}, + {"125", "老", "6"}, + {"126", "而", "6"}, + {"127", "耒", "6"}, + {"128", "耳", "6"}, + {"129", "聿", "6"}, + {"130", "肉", "6"}, + {"131", "臣", "6"}, + {"132", "自", "6"}, + {"133", "至", "6"}, + {"134", "臼", "6"}, + {"135", "舌", "6"}, + {"136", "舛", "6"}, + {"137", "舟", "6"}, + {"138", "艮", "6"}, + {"139", "色", "6"}, + {"140", "艸", "6"}, + {"141", "虍", "6"}, + {"142", "虫", "6"}, + {"143", "血", "6"}, + {"144", "行", "6"}, + {"145", "衣", "6"}, + {"146", "襾", "6"}, + {"147", "見", "7"}, + {"147'", "见", "4"}, + {"148", "角", "7"}, + {"149", "言", "7"}, + {"149'", "讠", "2"}, + {"150", "谷", "7"}, + {"151", "豆", "7"}, + {"152", "豕", "7"}, + {"153", "豸", "7"}, + {"154", "貝", "7"}, + {"154'", "贝", "4"}, + {"155", "赤", "7"}, + {"156", "走", "7"}, + {"157", "足", "7"}, + {"158", "身", "7"}, + {"159", "車", "7"}, + {"159'", "车", "4"}, + {"160", "辛", "7"}, + {"161", "辰", "7"}, + {"162", "辵", "7"}, + {"162'", "\u8FB6", "4"}, // 162'; 2ECC; 8FB6 - { "163", "邑", "7" }, - { "164", "酉", "7" }, - { "165", "釆", "7" }, - { "166", "里", "7" }, - { "167", "金", "8" }, - { "167'", "钅", "5" }, - { "168", "長", "8" }, - { "168'", "长", "5" }, - { "169", "門", "8" }, - { "169'", "门", "3" }, - { "170", "阜", "8" }, - { "171", "隶", "8" }, - { "172", "隹", "8" }, - { "173", "雨", "8" }, - { "174", "靑", "8" }, - { "175", "非", "8" }, - { "176", "面", "9" }, - { "177", "革", "9" }, - { "178", "韋", "9" }, - { "178'", "韦", "4" }, - { "179", "韭", "9" }, - { "180", "音", "9" }, - { "181", "頁", "9" }, - { "181'", "页", "6" }, - { "182", "風", "9" }, - { "182'", "风", "4" }, - { "183", "飛", "9" }, - { "183'", "飞", "3" }, - { "184", "食", "9" }, - { "184'", "饣", "3" }, - { "185", "首", "9" }, - { "186", "香", "9" }, - { "187", "馬", "10" }, - { "187'", "马", "3" }, - { "188", "骨", "10" }, - { "189", "高", "10" }, - { "190", "髟", "10" }, - { "191", "鬥", "10" }, - { "192", "鬯", "10" }, 
- { "193", "鬲", "10" }, - { "194", "鬼", "10" }, - { "195", "魚", "11" }, - { "195'", "鱼", "8" }, - { "196", "鳥", "11" }, - { "196'", "鸟", "5" }, - { "197'", "卤", "7" }, - { "197", "鹵", "11" }, - { "198", "鹿", "11" }, - { "199", "麥", "11" }, - { "199'", "麦", "11" }, - { "200", "麻", "11" }, - { "201", "黃", "12" }, - { "201'", "\u9EC4", "11" }, + {"163", "邑", "7"}, + {"164", "酉", "7"}, + {"165", "釆", "7"}, + {"166", "里", "7"}, + {"167", "金", "8"}, + {"167'", "钅", "5"}, + {"168", "長", "8"}, + {"168'", "长", "5"}, + {"169", "門", "8"}, + {"169'", "门", "3"}, + {"170", "阜", "8"}, + {"171", "隶", "8"}, + {"172", "隹", "8"}, + {"173", "雨", "8"}, + {"174", "靑", "8"}, + {"175", "非", "8"}, + {"176", "面", "9"}, + {"177", "革", "9"}, + {"178", "韋", "9"}, + {"178'", "韦", "4"}, + {"179", "韭", "9"}, + {"180", "音", "9"}, + {"181", "頁", "9"}, + {"181'", "页", "6"}, + {"182", "風", "9"}, + {"182'", "风", "4"}, + {"183", "飛", "9"}, + {"183'", "飞", "3"}, + {"184", "食", "9"}, + {"184'", "饣", "3"}, + {"185", "首", "9"}, + {"186", "香", "9"}, + {"187", "馬", "10"}, + {"187'", "马", "3"}, + {"188", "骨", "10"}, + {"189", "高", "10"}, + {"190", "髟", "10"}, + {"191", "鬥", "10"}, + {"192", "鬯", "10"}, + {"193", "鬲", "10"}, + {"194", "鬼", "10"}, + {"195", "魚", "11"}, + {"195'", "鱼", "8"}, + {"196", "鳥", "11"}, + {"196'", "鸟", "5"}, + {"197'", "卤", "7"}, + {"197", "鹵", "11"}, + {"198", "鹿", "11"}, + {"199", "麥", "11"}, + {"199'", "麦", "11"}, + {"200", "麻", "11"}, + {"201", "黃", "12"}, + {"201'", "\u9EC4", "11"}, // 201'; 2EE9; 9EC4 - { "202", "黍", "12" }, - { "203", "黑", "12" }, - { "204", "黹", "12" }, - { "205", "黽", "13" }, - { "205'", "黾", "13" }, - { "206", "鼎", "13" }, - { "207", "鼓", "13" }, - { "208", "鼠", "13" }, - { "209", "鼻", "14" }, - { "210", "齊", "14" }, - { "210'", "齐", "6" }, - { "211", "齒", "15" }, - { "211'", "齿", "8" }, - { "212", "龍", "16" }, - { "212'", "龙", "5" }, - { "213", "龜", "16" }, - { "213'", "龟", "7" }, - { "214", "龠", "17" }, + {"202", "黍", "12"}, + {"203", "黑", "12"}, + {"204", 
"黹", "12"}, + {"205", "黽", "13"}, + {"205'", "黾", "13"}, + {"206", "鼎", "13"}, + {"207", "鼓", "13"}, + {"208", "鼠", "13"}, + {"209", "鼻", "14"}, + {"210", "齊", "14"}, + {"210'", "齐", "6"}, + {"211", "齒", "15"}, + {"211'", "齿", "8"}, + {"212", "龍", "16"}, + {"212'", "龙", "5"}, + {"213", "龜", "16"}, + {"213'", "龟", "7"}, + {"214", "龠", "17"}, }; for (String[] pair : radicalData) { @@ -405,132 +427,173 @@ public class ScriptCategories2 { // START OF GENERATED CODE - public static final UnicodeSet SCRIPT_CHANGED = (UnicodeSet) new UnicodeSet( - "[\\^`\\u00A8\\u00AF\\u00B4\\u00B5\\u00B8\\u02B9-\\u02DF\\u02E5-\\u02FF\\u0374\\u0375\\u037E\\u0385\\u0387\\u03F6\\u0589\\u0600-\\u0603\\u060C\\u061B\\u061F\\u0640\\u064B-\\u0655\\u0660-\\u0669\\u0670\\u06DD\\u0951\\u0952\\u0964\\u0965\\u0970\\u0CF1\\u0CF2\\u10FB\\u16EB-\\u16ED\\u1735\\u1736\\u1802\\u1803\\u1805\\u1D26-\\u1D2B\\u1D5D-\\u1D61\\u1D66-\\u1D6A\\u1D78\\u1DBF\\u2100-\\u2125\\u2127\\u2128\\u212C-\\u2131\\u2133\\u2134\\u2139-\\u213B\\u2145-\\u214A\\u214C\\u214D\\u249C-\\u24E9\\u2FF0-\\u2FFF\\u3001-\\u3004\\u3006\\u3008-\\u3020\\u302A-\\u3037\\u303C-\\u303F\\u3099-\\u309C\\u30A0\\u30FB\\u30FC\\u3190-\\u319F\\u31C0-\\u31E3\\u3220-\\u3243\\u3250\\u327F-\\u32B0\\u32C0-\\u32CF\\u3358-\\u33FF\\uA700-\\uA721\\uA788-\\uA78A\\uFDFD\\uFE45\\uFE46\\uFF61-\\uFF65\\uFF70\\uFF9E\\uFF9F]") - .freeze(); + public static final UnicodeSet SCRIPT_CHANGED = + (UnicodeSet) + new UnicodeSet( + 
"[\\^`\\u00A8\\u00AF\\u00B4\\u00B5\\u00B8\\u02B9-\\u02DF\\u02E5-\\u02FF\\u0374\\u0375\\u037E\\u0385\\u0387\\u03F6\\u0589\\u0600-\\u0603\\u060C\\u061B\\u061F\\u0640\\u064B-\\u0655\\u0660-\\u0669\\u0670\\u06DD\\u0951\\u0952\\u0964\\u0965\\u0970\\u0CF1\\u0CF2\\u10FB\\u16EB-\\u16ED\\u1735\\u1736\\u1802\\u1803\\u1805\\u1D26-\\u1D2B\\u1D5D-\\u1D61\\u1D66-\\u1D6A\\u1D78\\u1DBF\\u2100-\\u2125\\u2127\\u2128\\u212C-\\u2131\\u2133\\u2134\\u2139-\\u213B\\u2145-\\u214A\\u214C\\u214D\\u249C-\\u24E9\\u2FF0-\\u2FFF\\u3001-\\u3004\\u3006\\u3008-\\u3020\\u302A-\\u3037\\u303C-\\u303F\\u3099-\\u309C\\u30A0\\u30FB\\u30FC\\u3190-\\u319F\\u31C0-\\u31E3\\u3220-\\u3243\\u3250\\u327F-\\u32B0\\u32C0-\\u32CF\\u3358-\\u33FF\\uA700-\\uA721\\uA788-\\uA78A\\uFDFD\\uFE45\\uFE46\\uFF61-\\uFF65\\uFF70\\uFF9E\\uFF9F]") + .freeze(); public static final Map SCRIPT_NEW; + static { String[][] data = { - { "Arabic", "[\\u0600-\\u0603\\u060C\\u061B\\u061F\\u0640\\u064B-\\u0655\\u0660-\\u0669\\u0670\\uFDFD]" }, - { "Armenian", "[\\u0589]" }, - { "Bengali", "[\\u0964\\u0965\\u0CF1\\u0CF2]" }, + { + "Arabic", + "[\\u0600-\\u0603\\u060C\\u061B\\u061F\\u0640\\u064B-\\u0655\\u0660-\\u0669\\u0670\\uFDFD]" + }, + {"Armenian", "[\\u0589]"}, + {"Bengali", "[\\u0964\\u0965\\u0CF1\\u0CF2]"}, { "Bopomofo", - "[\\u02EA\\u02EB\\u3001-\\u3004\\u3006\\u3008-\\u3011\\u3013-\\u3020\\u302A-\\u302D\\u3030\\u3037\\u303C-\\u303F\\uFE45\\uFE46\\uFF61-\\uFF64]" }, - { "Buhid", "[\\u1735\\u1736]" }, - { "Common", "[\\u03F6\\u06DD]" }, - { "Cyrillic", "[\\u02BC]" }, - { "Devanagari", "[\\u0951\\u0952\\u0964\\u0965\\u0970\\u0CF1\\u0CF2]" }, - { "Georgian", "[\\u0589\\u10FB]" }, - { "Greek", "[\\u00B5\\u0374\\u0375\\u037E\\u0385\\u0387]" }, - { "Gujarati", "[\\u0CF1\\u0CF2]" }, - { "Gurmukhi", "[\\u0964\\u0965\\u0CF1\\u0CF2]" }, + "[\\u02EA\\u02EB\\u3001-\\u3004\\u3006\\u3008-\\u3011\\u3013-\\u3020\\u302A-\\u302D\\u3030\\u3037\\u303C-\\u303F\\uFE45\\uFE46\\uFF61-\\uFF64]" + }, + {"Buhid", "[\\u1735\\u1736]"}, + {"Common", 
"[\\u03F6\\u06DD]"}, + {"Cyrillic", "[\\u02BC]"}, + {"Devanagari", "[\\u0951\\u0952\\u0964\\u0965\\u0970\\u0CF1\\u0CF2]"}, + {"Georgian", "[\\u0589\\u10FB]"}, + {"Greek", "[\\u00B5\\u0374\\u0375\\u037E\\u0385\\u0387]"}, + {"Gujarati", "[\\u0CF1\\u0CF2]"}, + {"Gurmukhi", "[\\u0964\\u0965\\u0CF1\\u0CF2]"}, { "Han", - "[\\u2FF0-\\u2FFF\\u3001-\\u3004\\u3006\\u3008-\\u3011\\u3013-\\u3020\\u302A-\\u302D\\u3030\\u3037\\u303C-\\u303F\\u31C0-\\u31E3\\u3220-\\u3243\\u3280-\\u32B0\\u32C0-\\u32CB\\u3358-\\u3370\\u337B-\\u337F\\u33E0-\\u33FE\\uFE45\\uFE46\\uFF61-\\uFF64]" }, + "[\\u2FF0-\\u2FFF\\u3001-\\u3004\\u3006\\u3008-\\u3011\\u3013-\\u3020\\u302A-\\u302D\\u3030\\u3037\\u303C-\\u303F\\u31C0-\\u31E3\\u3220-\\u3243\\u3280-\\u32B0\\u32C0-\\u32CB\\u3358-\\u3370\\u337B-\\u337F\\u33E0-\\u33FE\\uFE45\\uFE46\\uFF61-\\uFF64]" + }, { "Hangul", - "[\\u3001-\\u3004\\u3006\\u3008-\\u3011\\u3013-\\u3020\\u302E-\\u3030\\u3037\\u303C-\\u303F\\u327F\\uFE45\\uFE46\\uFF61-\\uFF64]" }, - { "Hanunoo", "[\\u1735\\u1736]" }, + "[\\u3001-\\u3004\\u3006\\u3008-\\u3011\\u3013-\\u3020\\u302E-\\u3030\\u3037\\u303C-\\u303F\\u327F\\uFE45\\uFE46\\uFF61-\\uFF64]" + }, + {"Hanunoo", "[\\u1735\\u1736]"}, { "Hiragana", - "[\\u3001-\\u3004\\u3006\\u3008-\\u3020\\u3030-\\u3037\\u303C-\\u303F\\u3099-\\u309C\\u30A0\\u30FB\\u30FC\\u3190-\\u319F\\uFE45\\uFE46\\uFF61-\\uFF65\\uFF70\\uFF9E\\uFF9F]" }, - { "Kannada", "[\\u0CF1\\u0CF2]" }, + "[\\u3001-\\u3004\\u3006\\u3008-\\u3020\\u3030-\\u3037\\u303C-\\u303F\\u3099-\\u309C\\u30A0\\u30FB\\u30FC\\u3190-\\u319F\\uFE45\\uFE46\\uFF61-\\uFF65\\uFF70\\uFF9E\\uFF9F]" + }, + {"Kannada", "[\\u0CF1\\u0CF2]"}, { "Katakana", - "[\\u3001-\\u3004\\u3006\\u3008-\\u3020\\u3030-\\u3037\\u303C-\\u303F\\u3099-\\u309C\\u30A0\\u30FB\\u30FC\\u3190-\\u319F\\uFE45\\uFE46\\uFF61-\\uFF65\\uFF70\\uFF9E\\uFF9F]" }, + "[\\u3001-\\u3004\\u3006\\u3008-\\u3020\\u3030-\\u3037\\u303C-\\u303F\\u3099-\\u309C\\u30A0\\u30FB\\u30FC\\u3190-\\u319F\\uFE45\\uFE46\\uFF61-\\uFF65\\uFF70\\uFF9E\\uFF9F]" + }, 
{ "Latin", - "[\\^`\\u00A8\\u00AF\\u00B4\\u00B8\\u02B9-\\u02DF\\u02E5-\\u02E9\\u02EC-\\u02FF\\u1D26-\\u1D2B\\u1D5D-\\u1D61\\u1D66-\\u1D6A\\u1D78\\u1DBF\\u2100-\\u2125\\u2127\\u2128\\u212C-\\u2131\\u2133\\u2134\\u2139-\\u213B\\u2145-\\u214A\\u214C\\u214D\\u249C-\\u24E9\\u3250\\u32CC-\\u32CF\\u3371-\\u337A\\u3380-\\u33DF\\u33FF\\uA700-\\uA721\\uA788-\\uA78A]" }, - { "Malayalam", "[\\u0CF1\\u0CF2]" }, - { "Mongolian", "[\\u1802\\u1803\\u1805]" }, - { "Oriya", "[\\u0964\\u0965\\u0CF1\\u0CF2]" }, - { "Phags_Pa", "[\\u1802\\u1803\\u1805\\u3001\\u3002\\u3008-\\u3011\\u3014-\\u301B\\uFF61-\\uFF64]" }, - { "Runic", "[\\u16EB-\\u16ED]" }, - { "Syriac", "[\\u060C\\u061B\\u061F\\u0640\\u064B-\\u0655\\u0670]" }, - { "Tagalog", "[\\u1735\\u1736]" }, - { "Tagbanwa", "[\\u1735\\u1736]" }, - { "Tamil", "[\\u0CF1\\u0CF2]" }, - { "Telugu", "[\\u0CF1\\u0CF2]" }, - { "Thaana", "[\\u060C\\u061B\\u061F\\u0660-\\u0669]" }, - { "Tibetan", "[\\u3001\\u3002\\u3008-\\u3011\\u3014-\\u301B\\uFF61-\\uFF64]" }, - { "Yi", "[\\u3001\\u3002\\u3008-\\u3011\\u3014-\\u301B\\uFF61-\\uFF64]" }, + "[\\^`\\u00A8\\u00AF\\u00B4\\u00B8\\u02B9-\\u02DF\\u02E5-\\u02E9\\u02EC-\\u02FF\\u1D26-\\u1D2B\\u1D5D-\\u1D61\\u1D66-\\u1D6A\\u1D78\\u1DBF\\u2100-\\u2125\\u2127\\u2128\\u212C-\\u2131\\u2133\\u2134\\u2139-\\u213B\\u2145-\\u214A\\u214C\\u214D\\u249C-\\u24E9\\u3250\\u32CC-\\u32CF\\u3371-\\u337A\\u3380-\\u33DF\\u33FF\\uA700-\\uA721\\uA788-\\uA78A]" + }, + {"Malayalam", "[\\u0CF1\\u0CF2]"}, + {"Mongolian", "[\\u1802\\u1803\\u1805]"}, + {"Oriya", "[\\u0964\\u0965\\u0CF1\\u0CF2]"}, + { + "Phags_Pa", + "[\\u1802\\u1803\\u1805\\u3001\\u3002\\u3008-\\u3011\\u3014-\\u301B\\uFF61-\\uFF64]" + }, + {"Runic", "[\\u16EB-\\u16ED]"}, + {"Syriac", "[\\u060C\\u061B\\u061F\\u0640\\u064B-\\u0655\\u0670]"}, + {"Tagalog", "[\\u1735\\u1736]"}, + {"Tagbanwa", "[\\u1735\\u1736]"}, + {"Tamil", "[\\u0CF1\\u0CF2]"}, + {"Telugu", "[\\u0CF1\\u0CF2]"}, + {"Thaana", "[\\u060C\\u061B\\u061F\\u0660-\\u0669]"}, + {"Tibetan", 
"[\\u3001\\u3002\\u3008-\\u3011\\u3014-\\u301B\\uFF61-\\uFF64]"}, + {"Yi", "[\\u3001\\u3002\\u3008-\\u3011\\u3014-\\u301B\\uFF61-\\uFF64]"}, }; SCRIPT_NEW = loadData(data); } - public static final UnicodeSet CATEGORY_CHANGED = (UnicodeSet) new UnicodeSet( - "[\\u2102\\u210A-\\u2113\\u2115\\u2119-\\u211D\\u2124\\u2128\\u2129\\u212C\\u212D\\u212F-\\u2131\\u2133-\\u2138\\u213C-\\u213F\\u2145-\\u2149\\U0001D165\\U0001D166\\U0001D16D-\\U0001D172\\U0001D400-\\U0001D7FF]") - .freeze(); + + public static final UnicodeSet CATEGORY_CHANGED = + (UnicodeSet) + new UnicodeSet( + "[\\u2102\\u210A-\\u2113\\u2115\\u2119-\\u211D\\u2124\\u2128\\u2129\\u212C\\u212D\\u212F-\\u2131\\u2133-\\u2138\\u213C-\\u213F\\u2145-\\u2149\\U0001D165\\U0001D166\\U0001D16D-\\U0001D172\\U0001D400-\\U0001D7FF]") + .freeze(); public static final Map CATEGORY_NEW; + static { String[][] data = { { "Math_Symbol", - "[\\u2102\\u210A-\\u2113\\u2115\\u2119-\\u211D\\u2124\\u2128\\u2129\\u212C\\u212D\\u212F-\\u2131\\u2133-\\u2138\\u213C-\\u213F\\u2145-\\u2149\\U0001D400-\\U0001D7FF]" }, - { "Modifier_Symbol", "[\\U0001D165\\U0001D166\\U0001D16D-\\U0001D172]" }, + "[\\u2102\\u210A-\\u2113\\u2115\\u2119-\\u211D\\u2124\\u2128\\u2129\\u212C\\u212D\\u212F-\\u2131\\u2133-\\u2138\\u213C-\\u213F\\u2145-\\u2149\\U0001D400-\\U0001D7FF]" + }, + {"Modifier_Symbol", "[\\U0001D165\\U0001D166\\U0001D16D-\\U0001D172]"}, { "Symbol", - 
"[\\$+<->\\^`|~\\u00A2-\\u00A9\\u00AC\\u00AE-\\u00B1\\u00B4\\u00B6\\u00B8\\u00D7\\u00F7\\u02C2-\\u02C5\\u02D2-\\u02DF\\u02E5-\\u02EB\\u02ED\\u02EF-\\u02FF\\u0375\\u0384\\u0385\\u03F6\\u0482\\u0606-\\u0608\\u060B\\u060E\\u060F\\u06E9\\u06FD\\u06FE\\u07F6\\u09F2\\u09F3\\u09FA\\u0AF1\\u0B70\\u0BF3-\\u0BFA\\u0C7F\\u0CF1\\u0CF2\\u0D79\\u0E3F\\u0F01-\\u0F03\\u0F13-\\u0F17\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC\\u0FCE\\u0FCF\\u109E\\u109F\\u1360\\u1390-\\u1399\\u17DB\\u1940\\u19E0-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u1FBD\\u1FBF-\\u1FC1\\u1FCD-\\u1FCF\\u1FDD-\\u1FDF\\u1FED-\\u1FEF\\u1FFD\\u1FFE\\u2044\\u2052\\u207A-\\u207C\\u208A-\\u208C\\u20A0-\\u20B5\\u2100-\\u2106\\u2108-\\u2125\\u2127-\\u2129\\u212C-\\u2131\\u2133-\\u2138\\u213A-\\u214D\\u214F\\u2190-\\u2328\\u232B-\\u23E7\\u2400-\\u2426\\u2440-\\u244A\\u249C-\\u24E9\\u2500-\\u269D\\u26A0-\\u26BC\\u26C0-\\u26C3\\u2701-\\u2704\\u2706-\\u2709\\u270C-\\u2727\\u2729-\\u274B\\u274D\\u274F-\\u2752\\u2756\\u2758-\\u275E\\u2761-\\u2767\\u2794\\u2798-\\u27AF\\u27B1-\\u27BE\\u27C0-\\u27C4\\u27C7-\\u27CA\\u27CC\\u27D0-\\u27E5\\u27F0-\\u2982\\u2999-\\u29D7\\u29DC-\\u29FB\\u29FE-\\u2B4C\\u2B50-\\u2B54\\u2CE5-\\u2CEA\\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u309B\\u309C\\u3190\\u3191\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3243\\u3250\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0-\\u4DFF\\uA490-\\uA4C6\\uA700-\\uA716\\uA720\\uA721\\uA789\\uA78A\\uA828-\\uA82B\\uFB29\\uFDFC\\uFDFD\\uFE62\\uFE64-\\uFE66\\uFE69\\uFF04\\uFF0B\\uFF1C-\\uFF1E\\uFF3E\\uFF40\\uFF5C\\uFF5E\\uFFE0-\\uFFE6\\uFFE8-\\uFFEE\\uFFFC\\uFFFD\\U00010102\\U00010137-\\U0001013F\\U00010179-\\U00010189\\U00010190-\\U0001019B\\U000101D0-\\U000101FC\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129-\\U0001D166\\U0001D16A-\\U0001D172\\U0001D183\\U0001D184\\U0001D18C-\\U0001D1A9\\U0001D1AE-\\U0001D1DD\\U0001D200-\\U0001D241\\U000
1D245\\U0001D300-\\U0001D356\\U0001D400-\\U0001D7FF\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093]" }, + "[\\$+<->\\^`|~\\u00A2-\\u00A9\\u00AC\\u00AE-\\u00B1\\u00B4\\u00B6\\u00B8\\u00D7\\u00F7\\u02C2-\\u02C5\\u02D2-\\u02DF\\u02E5-\\u02EB\\u02ED\\u02EF-\\u02FF\\u0375\\u0384\\u0385\\u03F6\\u0482\\u0606-\\u0608\\u060B\\u060E\\u060F\\u06E9\\u06FD\\u06FE\\u07F6\\u09F2\\u09F3\\u09FA\\u0AF1\\u0B70\\u0BF3-\\u0BFA\\u0C7F\\u0CF1\\u0CF2\\u0D79\\u0E3F\\u0F01-\\u0F03\\u0F13-\\u0F17\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC\\u0FCE\\u0FCF\\u109E\\u109F\\u1360\\u1390-\\u1399\\u17DB\\u1940\\u19E0-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u1FBD\\u1FBF-\\u1FC1\\u1FCD-\\u1FCF\\u1FDD-\\u1FDF\\u1FED-\\u1FEF\\u1FFD\\u1FFE\\u2044\\u2052\\u207A-\\u207C\\u208A-\\u208C\\u20A0-\\u20B5\\u2100-\\u2106\\u2108-\\u2125\\u2127-\\u2129\\u212C-\\u2131\\u2133-\\u2138\\u213A-\\u214D\\u214F\\u2190-\\u2328\\u232B-\\u23E7\\u2400-\\u2426\\u2440-\\u244A\\u249C-\\u24E9\\u2500-\\u269D\\u26A0-\\u26BC\\u26C0-\\u26C3\\u2701-\\u2704\\u2706-\\u2709\\u270C-\\u2727\\u2729-\\u274B\\u274D\\u274F-\\u2752\\u2756\\u2758-\\u275E\\u2761-\\u2767\\u2794\\u2798-\\u27AF\\u27B1-\\u27BE\\u27C0-\\u27C4\\u27C7-\\u27CA\\u27CC\\u27D0-\\u27E5\\u27F0-\\u2982\\u2999-\\u29D7\\u29DC-\\u29FB\\u29FE-\\u2B4C\\u2B50-\\u2B54\\u2CE5-\\u2CEA\\u2E80-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u309B\\u309C\\u3190\\u3191\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3243\\u3250\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0-\\u4DFF\\uA490-\\uA4C6\\uA700-\\uA716\\uA720\\uA721\\uA789\\uA78A\\uA828-\\uA82B\\uFB29\\uFDFC\\uFDFD\\uFE62\\uFE64-\\uFE66\\uFE69\\uFF04\\uFF0B\\uFF1C-\\uFF1E\\uFF3E\\uFF40\\uFF5C\\uFF5E\\uFFE0-\\uFFE6\\uFFE8-\\uFFEE\\uFFFC\\uFFFD\\U00010102\\U00010137-\\U0001013F\\U00010179-\\U00010189\\U00010190-\\U0001019B\\U000101D0-\\U000101FC\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129-\\U0001D166\\U0001D16A-\\U
0001D172\\U0001D183\\U0001D184\\U0001D18C-\\U0001D1A9\\U0001D1AE-\\U0001D1DD\\U0001D200-\\U0001D241\\U0001D245\\U0001D300-\\U0001D356\\U0001D400-\\U0001D7FF\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093]" + }, }; CATEGORY_NEW = loadData(data); } // END OF GENERATED CODE - //@formatter:on + // @formatter:on // UnicodeSet override - static UnicodeSet.XSymbolTable myXSymbolTable = new UnicodeSet.XSymbolTable() { - public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { - int propEnum = -1; - int valueEnum = -1; - if (propertyValue.trim().length() != 0) { - propEnum = UCharacter.getPropertyEnum(propertyName); - } else { - try { - propEnum = UProperty.GENERAL_CATEGORY_MASK; - valueEnum = UCharacter.getPropertyValueEnum(propEnum, propertyName); - propertyValue = UCharacter.getPropertyValueName(propEnum, valueEnum, UProperty.NameChoice.LONG); - } catch (IllegalArgumentException e) { - try { - propEnum = UProperty.SCRIPT; - valueEnum = UCharacter.getPropertyValueEnum(propEnum, propertyName); - propertyValue = UCharacter.getPropertyValueName(propEnum, valueEnum, UProperty.NameChoice.LONG); - } catch (Exception e1) { - return false; + static UnicodeSet.XSymbolTable myXSymbolTable = + new UnicodeSet.XSymbolTable() { + public boolean applyPropertyAlias( + String propertyName, String propertyValue, UnicodeSet result) { + int propEnum = -1; + int valueEnum = -1; + if (propertyValue.trim().length() != 0) { + propEnum = UCharacter.getPropertyEnum(propertyName); + } else { + try { + propEnum = UProperty.GENERAL_CATEGORY_MASK; + valueEnum = UCharacter.getPropertyValueEnum(propEnum, propertyName); + propertyValue = + UCharacter.getPropertyValueName( + propEnum, valueEnum, UProperty.NameChoice.LONG); + } catch (IllegalArgumentException e) { + try { + propEnum = UProperty.SCRIPT; + valueEnum = UCharacter.getPropertyValueEnum(propEnum, propertyName); + propertyValue = + UCharacter.getPropertyValueName( + propEnum, valueEnum, 
UProperty.NameChoice.LONG); + } catch (Exception e1) { + return false; + } + } } - } - } - String pvalue; - UnicodeSet result2; - UnicodeSet additions; - switch (propEnum) { - case UProperty.SCRIPT: - pvalue = getFixedPropertyValue(propEnum, propertyValue, UProperty.NameChoice.LONG); - result2 = new UnicodeSet().applyIntPropertyValue(propEnum, - UCharacter.getPropertyValueEnum(propEnum, pvalue)).removeAll(SCRIPT_CHANGED); - additions = SCRIPT_NEW.get(pvalue); - if (additions != null) { - result2.addAll(additions); - } - result.set(result2); - return true; - case UProperty.GENERAL_CATEGORY_MASK: - case UProperty.GENERAL_CATEGORY: - // TODO: fix Mask - pvalue = getFixedPropertyValue(propEnum, propertyValue, UProperty.NameChoice.LONG); - result2 = new UnicodeSet().applyIntPropertyValue(propEnum, - UCharacter.getPropertyValueEnum(propEnum, pvalue)).removeAll(CATEGORY_CHANGED); - additions = CATEGORY_NEW.get(pvalue); - if (additions != null) { - result2.addAll(additions); + String pvalue; + UnicodeSet result2; + UnicodeSet additions; + switch (propEnum) { + case UProperty.SCRIPT: + pvalue = + getFixedPropertyValue( + propEnum, propertyValue, UProperty.NameChoice.LONG); + result2 = + new UnicodeSet() + .applyIntPropertyValue( + propEnum, + UCharacter.getPropertyValueEnum( + propEnum, pvalue)) + .removeAll(SCRIPT_CHANGED); + additions = SCRIPT_NEW.get(pvalue); + if (additions != null) { + result2.addAll(additions); + } + result.set(result2); + return true; + case UProperty.GENERAL_CATEGORY_MASK: + case UProperty.GENERAL_CATEGORY: + // TODO: fix Mask + pvalue = + getFixedPropertyValue( + propEnum, propertyValue, UProperty.NameChoice.LONG); + result2 = + new UnicodeSet() + .applyIntPropertyValue( + propEnum, + UCharacter.getPropertyValueEnum( + propEnum, pvalue)) + .removeAll(CATEGORY_CHANGED); + additions = CATEGORY_NEW.get(pvalue); + if (additions != null) { + result2.addAll(additions); + } + result.set(result2); + return true; + } + return false; } - result.set(result2); 
- return true; - } - return false; - } - }; + }; public static UnicodeSet parseUnicodeSet(String input) { String parseInput = input.trim(); @@ -539,11 +602,13 @@ public static UnicodeSet parseUnicodeSet(String input) { int parseEnd = parsePosition.getIndex(); if (parseEnd != parseInput.length()) { parseEnd--; // get input offset - throw new IllegalArgumentException("Additional characters past the end of the set, at " - + parseEnd + ", ..." - + input.substring(Math.max(0, parseEnd - 10), parseEnd) - + "|" - + input.substring(parseEnd, Math.min(input.length(), parseEnd + 10))); + throw new IllegalArgumentException( + "Additional characters past the end of the set, at " + + parseEnd + + ", ..." + + input.substring(Math.max(0, parseEnd - 10), parseEnd) + + "|" + + input.substring(parseEnd, Math.min(input.length(), parseEnd + 10))); } if (DEBUG) { checkDifferences(input, result, new UnicodeSet(input)); @@ -555,14 +620,20 @@ private static void checkDifferences(String input, UnicodeSet result, UnicodeSet if (!original.equals(result)) { final UnicodeSet removed = new UnicodeSet(original).removeAll(result); final UnicodeSet added = new UnicodeSet(result).removeAll(original); - System.out.println(" *Altered UnicodeSet - removed: " + removed.size() + ", added: " + added.size() - + ", input: " + input); + System.out.println( + " *Altered UnicodeSet - removed: " + + removed.size() + + ", added: " + + added.size() + + ", input: " + + input); if (!removed.isEmpty()) System.out.println("\tRemoved: " + removed.toPattern(false)); if (!added.isEmpty()) System.out.println("\tAdded: " + added.toPattern(false)); } } - public static UnicodeSet applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { + public static UnicodeSet applyPropertyAlias( + String propertyName, String propertyValue, UnicodeSet result) { UnicodeSet original; if (DEBUG) { original = new UnicodeSet(result).applyPropertyAlias(propertyName, propertyValue); @@ -597,53 +668,61 @@ public String 
transform(String script) { // Standard items public static final Set EUROPEAN = ScriptMetadata.Groupings.EUROPEAN.scripts; - public static final Set MIDDLE_EASTERN = ScriptMetadata.Groupings.MIDDLE_EASTERN.scripts; + public static final Set MIDDLE_EASTERN = + ScriptMetadata.Groupings.MIDDLE_EASTERN.scripts; public static final Set SOUTH_ASIAN = ScriptMetadata.Groupings.SOUTH_ASIAN.scripts; - public static final Set SOUTHEAST_ASIAN = ScriptMetadata.Groupings.SOUTHEAST_ASIAN.scripts; + public static final Set SOUTHEAST_ASIAN = + ScriptMetadata.Groupings.SOUTHEAST_ASIAN.scripts; public static final Set EAST_ASIAN = ScriptMetadata.Groupings.EAST_ASIAN.scripts; public static final Set AFRICAN = ScriptMetadata.Groupings.AFRICAN.scripts; public static final Set AMERICAN = ScriptMetadata.Groupings.AMERICAN.scripts; - public static final Set HISTORIC_SCRIPTS = With - .in(ScriptMetadata.getScripts()) - .toUnmodifiableCollection(new UsageFilter(IdUsage.EXCLUSION, IdUsage.LIMITED_USE), - new TreeSet()); - -// public static final Set OLD_EUROPEAN = loadUnmodifiable(new TreeSet(), -// "Latin", "Greek", "Coptic", "Cyrillic", -// "Glag", "Armenian", "Georgian", "Shavian", "braille", -// "ogham", "runic", "Gothic", "Cypriot", "Linear b", -// "old italic"); -// -// public static final Set OLD_MIDDLE_EASTERN = loadUnmodifiable(new TreeSet(), -// "Hebrew", "Arabic", "Syriac", "Thaana", "Carian", "Lycian", "Lydian", "Phoenician", -// "Cuneiform", "old persian", "ugaritic" -// ); -// public static final Set OLD_SOUTH_ASIAN = loadUnmodifiable(new TreeSet(), -// "Devanagari", "Bengali", "Gurmukhi", "Gujarati", -// "Oriya", "Tamil", "Telugu", "Kannada", "Malayalam", -// "Sinhala", "Tibetan", "Phags-Pa", "Limbu", "Sylo", "Kharoshthi", "lepcha", "saurashtra", "ol chiki" -// ); -// public static final Set OLD_SOUTHEAST_ASIAN = loadUnmodifiable(new TreeSet(), -// "Thai", "Lao", "Myanmar", "Khmer", -// "Tai_Le", "New Tai Lue", "Tagalog", "Hanunoo", "Buhid", -// "Tagbanwa", "Buginese", 
"Balinese", "Cham", "kayah li", "rejang", "sundanese" -// ); -// public static final Set OLD_EAST_ASIAN = loadUnmodifiable(new TreeSet(), -// "Bopomofo", "Hiragana", "Katakana", "Mongolian", "Yi", "Han", "Hangul" -// ); -// public static final Set OLD_AFRICAN = loadUnmodifiable(new TreeSet(), -// "Ethiopic", "Osmanya", "Tifinagh", "Nko", "vai" -// ); -// public static final Set OLD_AMERICAN = loadUnmodifiable(new TreeSet(), -// "Cherokee", "CANS", "Deseret" -// ); -// -// public static final Set OLD_HISTORIC_SCRIPTS = loadUnmodifiable(new TreeSet(), -// "Buginese", "Buhid", "Carian", "Coptic", "Cypriot", "Deseret", "Glagolitic", -// "Gothic", "Hanunoo", "Old_Italic", "Kharoshthi", "Linear_B", "Lycian", "Lydian", -// "Ogham", "Osmanya", "Phags_Pa", "Phoenician", "Rejang", "Runic", "Shavian", "Sundanese", -// "Syloti_Nagri", "Syriac", "Tagbanwa", "Tagalog", "Ugaritic", "Old_Persian", "Cuneiform" -// ); + public static final Set HISTORIC_SCRIPTS = + With.in(ScriptMetadata.getScripts()) + .toUnmodifiableCollection( + new UsageFilter(IdUsage.EXCLUSION, IdUsage.LIMITED_USE), new TreeSet()); + + // public static final Set OLD_EUROPEAN = loadUnmodifiable(new TreeSet(), + // "Latin", "Greek", "Coptic", "Cyrillic", + // "Glag", "Armenian", "Georgian", "Shavian", "braille", + // "ogham", "runic", "Gothic", "Cypriot", "Linear b", + // "old italic"); + // + // public static final Set OLD_MIDDLE_EASTERN = loadUnmodifiable(new + // TreeSet(), + // "Hebrew", "Arabic", "Syriac", "Thaana", "Carian", "Lycian", "Lydian", "Phoenician", + // "Cuneiform", "old persian", "ugaritic" + // ); + // public static final Set OLD_SOUTH_ASIAN = loadUnmodifiable(new TreeSet(), + // "Devanagari", "Bengali", "Gurmukhi", "Gujarati", + // "Oriya", "Tamil", "Telugu", "Kannada", "Malayalam", + // "Sinhala", "Tibetan", "Phags-Pa", "Limbu", "Sylo", "Kharoshthi", "lepcha", + // "saurashtra", "ol chiki" + // ); + // public static final Set OLD_SOUTHEAST_ASIAN = loadUnmodifiable(new + // TreeSet(), + // 
"Thai", "Lao", "Myanmar", "Khmer", + // "Tai_Le", "New Tai Lue", "Tagalog", "Hanunoo", "Buhid", + // "Tagbanwa", "Buginese", "Balinese", "Cham", "kayah li", "rejang", "sundanese" + // ); + // public static final Set OLD_EAST_ASIAN = loadUnmodifiable(new TreeSet(), + // "Bopomofo", "Hiragana", "Katakana", "Mongolian", "Yi", "Han", "Hangul" + // ); + // public static final Set OLD_AFRICAN = loadUnmodifiable(new TreeSet(), + // "Ethiopic", "Osmanya", "Tifinagh", "Nko", "vai" + // ); + // public static final Set OLD_AMERICAN = loadUnmodifiable(new TreeSet(), + // "Cherokee", "CANS", "Deseret" + // ); + // + // public static final Set OLD_HISTORIC_SCRIPTS = loadUnmodifiable(new + // TreeSet(), + // "Buginese", "Buhid", "Carian", "Coptic", "Cypriot", "Deseret", "Glagolitic", + // "Gothic", "Hanunoo", "Old_Italic", "Kharoshthi", "Linear_B", "Lycian", "Lydian", + // "Ogham", "Osmanya", "Phags_Pa", "Phoenician", "Rejang", "Runic", "Shavian", + // "Sundanese", + // "Syloti_Nagri", "Syriac", "Tagbanwa", "Tagalog", "Ugaritic", "Old_Persian", + // "Cuneiform" + // ); // public static final UnicodeSet OTHER_SCRIPTS = (UnicodeSet) // parseUnicodeSet("[^[:script=common:][:script=inherited:]]") @@ -676,12 +755,16 @@ private static Set loadUnmodifiable(Set set, U... 
items) { } enum RemapType { - NONE, SCRIPT, CATEGORY + NONE, + SCRIPT, + CATEGORY }; - static Map> getRemapData(String filename) throws IOException { + static Map> getRemapData(String filename) + throws IOException { BufferedReader in = new BufferedReader(new FileReader(filename)); - Map> data = new TreeMap>(); + Map> data = + new TreeMap>(); data.put(RemapType.SCRIPT, new TreeMap()); data.put(RemapType.CATEGORY, new TreeMap()); @@ -721,20 +804,27 @@ static Map> getRemapData(String filename) thr // check try { // scriptSet = new UnicodeSet("[:script=" + part + ":]"); - fixed = getFixedPropertyValue(UProperty.SCRIPT, part, UProperty.NameChoice.LONG); + fixed = + getFixedPropertyValue( + UProperty.SCRIPT, part, UProperty.NameChoice.LONG); newRemapType = RemapType.SCRIPT; propertiesToAddTo.add(fixed); } catch (Exception e) { // scriptSet = new UnicodeSet("[:gc=" + part + ":]"); - fixed = getFixedPropertyValue(UProperty.GENERAL_CATEGORY, part, UProperty.NameChoice.LONG); + fixed = + getFixedPropertyValue( + UProperty.GENERAL_CATEGORY, + part, + UProperty.NameChoice.LONG); newRemapType = RemapType.CATEGORY; propertiesToAddTo.add(fixed); } if (remapType == RemapType.NONE) { remapType = newRemapType; } else if (remapType != newRemapType) { - throw new IllegalArgumentException("Mixing Script and Category on one line: " + line); + throw new IllegalArgumentException( + "Mixing Script and Category on one line: " + line); } // addToMapToUnicodeSet(data.get(remapType), fixed, scriptSet); } @@ -746,10 +836,13 @@ static Map> getRemapData(String filename) thr throw new IllegalArgumentException(); } for (String fixed : propertiesToAddTo) { - addToMapToUnicodeSet(data.get(remapType), fixed, - new UnicodeSet().add(Integer.parseInt(line, 16))); + addToMapToUnicodeSet( + data.get(remapType), + fixed, + new UnicodeSet().add(Integer.parseInt(line, 16))); } - // data.composeWith(new UnicodeSet().add(Integer.parseInt(line,16)), currentData, composer); + // data.composeWith(new 
UnicodeSet().add(Integer.parseInt(line,16)), + // currentData, composer); } else if (line.startsWith("[") && !line.startsWith("[Ed")) { // unicode set UnicodeSet set = new UnicodeSet(line); for (String fixed : propertiesToAddTo) { @@ -776,7 +869,8 @@ public static String getFixedPropertyValue(String propertyName, String valueName return getFixedPropertyValue(UCharacter.getPropertyEnum(propertyName), valueName, length); } - private static void addToMapToUnicodeSet(Map mapToUnicodeSet, T key, UnicodeSet additions) { + private static void addToMapToUnicodeSet( + Map mapToUnicodeSet, T key, UnicodeSet additions) { UnicodeSet oldSet = mapToUnicodeSet.get(key); if (oldSet == null) { mapToUnicodeSet.put(key, oldSet = new UnicodeSet()); @@ -785,7 +879,8 @@ private static void addToMapToUnicodeSet(Map mapToUnicodeSet, oldSet.addAll(additions); } - private static UnicodeSet getChanged(Map> data, RemapType remapType) { + private static UnicodeSet getChanged( + Map> data, RemapType remapType) { UnicodeSet changed = new UnicodeSet(); for (UnicodeSet value : data.get(remapType).values()) { changed.addAll(value); @@ -796,17 +891,31 @@ private static UnicodeSet getChanged(Map> dat public static void main(String[] args) throws IOException { BitSet scripts = new BitSet(); - for (String item : Arrays.asList("EUROPEAN", "MIDDLE_EASTERN", "AFRICAN", "SOUTH_ASIAN", "EAST_ASIAN", - "AMERICAN", "SOUTHEAST_ASIAN")) { + for (String item : + Arrays.asList( + "EUROPEAN", + "MIDDLE_EASTERN", + "AFRICAN", + "SOUTH_ASIAN", + "EAST_ASIAN", + "AMERICAN", + "SOUTHEAST_ASIAN")) { Collection c = - item.equals("EUROPEAN") ? EUROPEAN : - item.equals("MIDDLE_EASTERN") ? MIDDLE_EASTERN : - item.equals("AFRICAN") ? AFRICAN : - item.equals("SOUTH_ASIAN") ? SOUTH_ASIAN : - item.equals("EAST_ASIAN") ? EAST_ASIAN : - item.equals("AMERICAN") ? AMERICAN : - item.equals("SOUTHEAST_ASIAN") ? SOUTHEAST_ASIAN : - null; + item.equals("EUROPEAN") + ? EUROPEAN + : item.equals("MIDDLE_EASTERN") + ? 
MIDDLE_EASTERN + : item.equals("AFRICAN") + ? AFRICAN + : item.equals("SOUTH_ASIAN") + ? SOUTH_ASIAN + : item.equals("EAST_ASIAN") + ? EAST_ASIAN + : item.equals("AMERICAN") + ? AMERICAN + : item.equals("SOUTHEAST_ASIAN") + ? SOUTHEAST_ASIAN + : null; for (String scriptName : (Collection) c) { int sc = UScript.getCodeFromName(scriptName); String shortName = "[:script=" + UScript.getShortName(sc) + ":]"; @@ -853,9 +962,12 @@ private static String getAge(UnicodeSet unicodeSet) { private static void checkHistoricClosure() { testNormalizationConsistency("Historic", ARCHAIC, Normalizer.NFKD); - for (int pValue = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT); pValue <= UCharacter - .getIntPropertyMaxValue(UProperty.SCRIPT); pValue++) { - String pValueName = UCharacter.getPropertyValueName(UProperty.SCRIPT, pValue, UProperty.NameChoice.LONG); + for (int pValue = UCharacter.getIntPropertyMinValue(UProperty.SCRIPT); + pValue <= UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT); + pValue++) { + String pValueName = + UCharacter.getPropertyValueName( + UProperty.SCRIPT, pValue, UProperty.NameChoice.LONG); // System.out.println("Checking " + pValueName); UnicodeSet temp = new UnicodeSet(); boolean t = myXSymbolTable.applyPropertyAlias("script", pValueName, temp); @@ -863,13 +975,19 @@ private static void checkHistoricClosure() { } } - private static void testNormalizationConsistency(String title, UnicodeSet testSet, Normalizer.Mode mode) { + private static void testNormalizationConsistency( + String title, UnicodeSet testSet, Normalizer.Mode mode) { UnicodeSet inSet = new UnicodeSet(); UnicodeSet inDecompSet = new UnicodeSet(); - for (UnicodeSetIterator it = new UnicodeSetIterator(new UnicodeSet("[[:nfkdqc=n:]" + - // "-[:hangulsyllabletype=LV:]" + - // "-[:hangulsyllabletype=LVT:]" + - "]")); it.next();) { + for (UnicodeSetIterator it = + new UnicodeSetIterator( + new UnicodeSet( + "[[:nfkdqc=n:]" + + + // "-[:hangulsyllabletype=LV:]" + + // 
"-[:hangulsyllabletype=LVT:]" + + "]")); + it.next(); ) { String nfkd = Normalizer.normalize(it.codepoint, mode); boolean isHistoric = testSet.contains(it.codepoint); boolean nfkdContainsHistoric = testSet.containsSome(nfkd); @@ -881,7 +999,8 @@ private static void testNormalizationConsistency(String title, UnicodeSet testSe if (inSet.size() != 0 || inDecompSet.size() != 0) { System.out.println("// Possible Problems in " + title + ":"); System.out.println("// \t In ordinary, but not in decomps: " + inSet.toPattern(false)); - System.out.println("// \t In decomps, but not in ordinary: " + inDecompSet.toPattern(false)); + System.out.println( + "// \t In decomps, but not in ordinary: " + inDecompSet.toPattern(false)); } } @@ -889,32 +1008,49 @@ private static void generateRemappingCode(String[] args) throws IOException { Map> data = getRemapData(args[0] + "ScriptData.txt"); for (RemapType r : data.keySet()) { UnicodeSet changed = getChanged(data, r); - System.out.println("public static final UnicodeSet " + r + "_CHANGED = (UnicodeSet) new UnicodeSet(\"" - + changed.toString().replace("\\", "\\\\") + "\").freeze();"); - System.out.println("public static final Map " + r + "_NEW;\n" + - "static {\n" + - "String[][] data = {"); + System.out.println( + "public static final UnicodeSet " + + r + + "_CHANGED = (UnicodeSet) new UnicodeSet(\"" + + changed.toString().replace("\\", "\\\\") + + "\").freeze();"); + System.out.println( + "public static final Map " + + r + + "_NEW;\n" + + "static {\n" + + "String[][] data = {"); Map map = data.get(r); for (String key : map.keySet()) { - System.out.println(" {\"" + key + "\", \"" + map.get(key).toString().replace("\\", "\\\\") + "\"},"); + System.out.println( + " {\"" + + key + + "\", \"" + + map.get(key).toString().replace("\\", "\\\\") + + "\"},"); } if (r == RemapType.CATEGORY) { - System.out.println(" {\"" + "Symbol" + "\", \"" + new UnicodeSet("[:Symbol:]").addAll(changed) - .toString().replace("\\", "\\\\") + "\"},"); + 
System.out.println( + " {\"" + + "Symbol" + + "\", \"" + + new UnicodeSet("[:Symbol:]") + .addAll(changed) + .toString() + .replace("\\", "\\\\") + + "\"},"); } - System.out.println("};\n" + - "" + r + "_NEW = loadData(data);\n" + - "}"); + System.out.println("};\n" + "" + r + "_NEW = loadData(data);\n" + "}"); } - // System.out.println(data.toString().replace(" ", "\n ").replace("{", "{\n ").replace("}", "\n}")); + // System.out.println(data.toString().replace(" ", "\n ").replace("{", "{\n ").replace("}", + // "\n}")); } /** - * @param radical + * @param radical * @return the radicalNum2char */ public static Integer getRadicalNum2char(String radical) { return RADICAL_NUM2CHAR.get(radical); } - } diff --git a/unicodetools/src/main/java/org/unicode/draft/ScriptCount.java b/unicodetools/src/main/java/org/unicode/draft/ScriptCount.java index 2fe706a9b..e973dc084 100644 --- a/unicodetools/src/main/java/org/unicode/draft/ScriptCount.java +++ b/unicodetools/src/main/java/org/unicode/draft/ScriptCount.java @@ -1,5 +1,15 @@ package org.unicode.draft; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UProperty.NameChoice; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.DecimalFormat; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.UTF16; import java.io.PrintWriter; import java.util.BitSet; import java.util.HashMap; @@ -10,7 +20,6 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.tool.Option; import org.unicode.cldr.tool.Option.Options; import org.unicode.cldr.util.Counter; @@ -19,21 +28,11 @@ import org.unicode.text.UCA.UCA; import org.unicode.text.utility.Settings; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory; 
-import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UProperty.NameChoice; -import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.DecimalFormat; -import com.ibm.icu.text.Normalizer2; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.UTF16; - public class ScriptCount { private static final double LOG2 = Math.log(2); - final static Options myOptions = new Options(); + static final Options myOptions = new Options(); + enum MyOptions { ranked("(true|false)", "true", "Use ranked frequencies"), language(".*", "mul", "Language code (mul for all)."), @@ -43,21 +42,24 @@ enum MyOptions { ; // boilerplate final Option option; + MyOptions(String argumentPattern, String defaultArgument, String helpText) { option = myOptions.add(this, argumentPattern, defaultArgument, helpText); } } - static class SecondaryInfo implements Comparable{ + static class SecondaryInfo implements Comparable { final int secondary; long frequency; int codePointCount; int sampleCodePoint; long sampleCount; int sampleLength = Integer.MAX_VALUE; + SecondaryInfo(int secondary) { this.secondary = secondary; } + public void add(int sampleCodePoint2, int length, long count2) { frequency += count2; codePointCount++; @@ -70,9 +72,10 @@ public void add(int sampleCodePoint2, int length, long count2) { sampleLength = length; } } + @Override public int compareTo(SecondaryInfo arg0) { - if (frequency != arg0.frequency){ + if (frequency != arg0.frequency) { return frequency > arg0.frequency ? 
-1 : 1; } return arg0.secondary - secondary; @@ -86,7 +89,7 @@ static class SecondaryCounts { void add(int sourceString, long count) { final CEList celist = uca.getCEList(UTF16.valueOf(sourceString), true); final int length = celist.length(); - for (int i = 0; i < length; ++i) { + for (int i = 0; i < length; ++i) { final int ce = celist.at(i); final int secondary = UCA.getSecondary(ce); SecondaryInfo info = counter.get(secondary); @@ -96,6 +99,7 @@ void add(int sourceString, long count) { info.add(sourceString, length, count); } } + Set getSorted() { return new TreeSet(counter.values()); } @@ -110,10 +114,10 @@ public static void main(String[] args) { final boolean secondary = MyOptions.secondary.option.doesOccur(); Counter langCounter = CharacterFrequency.getCodePointCounter(language, ranked); - //System.out.println(langCounter.getItemCount()); + // System.out.println(langCounter.getItemCount()); final Normalizer2 nfkc = Normalizer2.getNFKCInstance(); final Normalizer2 toNfd = Normalizer2.getNFDInstance(); - final Map> keyCounter = new TreeMap>(); + final Map> keyCounter = new TreeMap>(); final BitSet bitset = new BitSet(); SecondaryCounts secondaryCounts = null; if (secondary) { @@ -145,14 +149,28 @@ public static void main(String[] args) { for (final Entry> entry : keyCounter.entrySet()) { final String key = entry.getKey(); final Counter counter = entry.getValue(); - System.out.println(key + "\t" + Math.log(counter.getTotal()) + "\t" + counter.getItemCount() + "\t" + getTop(32, counter, langCounter)); + System.out.println( + key + + "\t" + + Math.log(counter.getTotal()) + + "\t" + + counter.getItemCount() + + "\t" + + getTop(32, counter, langCounter)); } // Only supplementary characters for (final Entry> entry : keyCounter.entrySet()) { final String key = entry.getKey(); final Counter counter0 = entry.getValue(); final Counter counter = filterCounter(counter0); - System.out.println(key + "\t" + Math.log(counter.getTotal()) + "\t" + counter.getItemCount() + "\t" 
+ getTop(32, counter, langCounter)); + System.out.println( + key + + "\t" + + Math.log(counter.getTotal()) + + "\t" + + counter.getItemCount() + + "\t" + + getTop(32, counter, langCounter)); } final DecimalFormat pf = (DecimalFormat) NumberFormat.getInstance(); @@ -162,54 +180,81 @@ public static void main(String[] args) { // pf.setMaximumSignificantDigits(3); final int counter = 0; final double max = langCounter.getTotal(); - PrintWriter out = org.unicode.text.utility.Utility.openPrintWriter(Settings.Output.GEN_DIR + "/frequency-text", - language - + (nfd ? "-nfd" : "") - + (filter != null ? "-" + filter : "") + - ".txt", org.unicode.text.utility.Utility.UTF8_WINDOWS); + PrintWriter out = + org.unicode.text.utility.Utility.openPrintWriter( + Settings.Output.GEN_DIR + "/frequency-text", + language + + (nfd ? "-nfd" : "") + + (filter != null ? "-" + filter : "") + + ".txt", + org.unicode.text.utility.Utility.UTF8_WINDOWS); final Matcher m = filter == null ? null : Pattern.compile(filter).matcher(""); for (final int ch : langCounter.getKeysetSortedByCount(false)) { final long count = langCounter.get(ch); // 0% 忌 U+5FCC Lo Hani CJK UNIFIED IDEOGRAPH-5FCC - final String catString = propValue(ch, UProperty.GENERAL_CATEGORY, UProperty.NameChoice.SHORT); + final String catString = + propValue(ch, UProperty.GENERAL_CATEGORY, UProperty.NameChoice.SHORT); final String scriptString = propValue(ch, UProperty.SCRIPT, UProperty.NameChoice.SHORT); if (m != null && !m.reset(catString).matches() && !m.reset(scriptString).matches()) { continue; } - out.println(pf.format(Math.log(count/max)/LOG2) - + "\t" + show(ch) - + "\tU+" + Utility.hex(ch, 4) - + "\t" + catString - + "\t" + scriptString - + "\t" + UCharacter.getExtendedName(ch)); - //if (count < 10000) break; + out.println( + pf.format(Math.log(count / max) / LOG2) + + "\t" + + show(ch) + + "\tU+" + + Utility.hex(ch, 4) + + "\t" + + catString + + "\t" + + scriptString + + "\t" + + UCharacter.getExtendedName(ch)); + // if (count 
< 10000) break; } out.close(); if (secondary) { - out = org.unicode.text.utility.Utility.openPrintWriter(Settings.Output.GEN_DIR + "/frequency-text", - language + "-sec" - + (nfd ? "-nfd" : "") - + (filter != null ? "-" + filter : "") + - ".txt", org.unicode.text.utility.Utility.UTF8_WINDOWS); + out = + org.unicode.text.utility.Utility.openPrintWriter( + Settings.Output.GEN_DIR + "/frequency-text", + language + + "-sec" + + (nfd ? "-nfd" : "") + + (filter != null ? "-" + filter : "") + + ".txt", + org.unicode.text.utility.Utility.UTF8_WINDOWS); for (final SecondaryInfo secondaryInfo : secondaryCounts.getSorted()) { if (secondaryInfo.secondary == 0) { continue; } final int ch = secondaryInfo.sampleCodePoint; - final String catString = propValue(ch, UProperty.GENERAL_CATEGORY, UProperty.NameChoice.SHORT); - final String scriptString = propValue(ch, UProperty.SCRIPT, UProperty.NameChoice.SHORT); - if (m != null && !m.reset(catString).matches() && !m.reset(scriptString).matches()) { + final String catString = + propValue(ch, UProperty.GENERAL_CATEGORY, UProperty.NameChoice.SHORT); + final String scriptString = + propValue(ch, UProperty.SCRIPT, UProperty.NameChoice.SHORT); + if (m != null + && !m.reset(catString).matches() + && !m.reset(scriptString).matches()) { continue; } - out.println(pf.format(Math.log(secondaryInfo.frequency/max)/LOG2) - + "\t0x" + Utility.hex(secondaryInfo.secondary) - + "\t" + secondaryInfo.sampleLength - + "\t" + secondaryInfo.codePointCount - + "\t" + show(ch) - + "\tU+" + Utility.hex(secondaryInfo.sampleCodePoint, 4) - + "\t" + catString - + "\t" + scriptString - + "\t" + UCharacter.getExtendedName(ch)); + out.println( + pf.format(Math.log(secondaryInfo.frequency / max) / LOG2) + + "\t0x" + + Utility.hex(secondaryInfo.secondary) + + "\t" + + secondaryInfo.sampleLength + + "\t" + + secondaryInfo.codePointCount + + "\t" + + show(ch) + + "\tU+" + + Utility.hex(secondaryInfo.sampleCodePoint, 4) + + "\t" + + catString + + "\t" + + scriptString + + 
"\t" + + UCharacter.getExtendedName(ch)); } out.close(); } @@ -225,9 +270,12 @@ private static Counter filterCounter(Counter counter0) { return result; } - public static Integer addCharacter(Normalizer2 nfkc, - Map> keyCounter, BitSet bitset, - Integer cp, long count) { + public static Integer addCharacter( + Normalizer2 nfkc, + Map> keyCounter, + BitSet bitset, + Integer cp, + long count) { int cat = UCharacter.getType(cp); if (cat == ECharacterCategory.ENCLOSING_MARK) { cat = ECharacterCategory.NON_SPACING_MARK; @@ -253,30 +301,40 @@ public static Integer addCharacter(Normalizer2 nfkc, key = "*WS\tWhitespace"; addCount(keyCounter, cp, count, key); } else { - key = "*" + UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, cat, NameChoice.SHORT) - + "\t" + UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, cat, NameChoice.LONG); + key = + "*" + + UCharacter.getPropertyValueName( + UProperty.GENERAL_CATEGORY, cat, NameChoice.SHORT) + + "\t" + + UCharacter.getPropertyValueName( + UProperty.GENERAL_CATEGORY, cat, NameChoice.LONG); addCount(keyCounter, cp, count, key); } return cp; } public static boolean isLetter(int cat) { - return cat == ECharacterCategory.UPPERCASE_LETTER || cat == ECharacterCategory.LOWERCASE_LETTER || cat == ECharacterCategory.MODIFIER_LETTER || cat == ECharacterCategory.TITLECASE_LETTER - || cat == ECharacterCategory.OTHER_LETTER || cat == ECharacterCategory.COMBINING_SPACING_MARK; + return cat == ECharacterCategory.UPPERCASE_LETTER + || cat == ECharacterCategory.LOWERCASE_LETTER + || cat == ECharacterCategory.MODIFIER_LETTER + || cat == ECharacterCategory.TITLECASE_LETTER + || cat == ECharacterCategory.OTHER_LETTER + || cat == ECharacterCategory.COMBINING_SPACING_MARK; } - public static void addScript(Map> keyCounter, - int cp2, BitSet bitset, long count) { + public static void addScript( + Map> keyCounter, int cp2, BitSet bitset, long count) { UScript.getScriptExtensions(cp2, bitset); - for (int script = bitset.nextSetBit(0); 
script >= 0; script = bitset.nextSetBit(script+1)) { + for (int script = bitset.nextSetBit(0); + script >= 0; + script = bitset.nextSetBit(script + 1)) { final String key = UScript.getShortName(script) + "\t" + UScript.getName(script); addCount(keyCounter, cp2, count, key); } } public static Counter addCount( - Map> keyCounter, Integer cp, long count, - String key) { + Map> keyCounter, Integer cp, long count, String key) { Counter counter = keyCounter.get(key); if (counter == null) { keyCounter.put(key, counter = new Counter()); @@ -285,7 +343,8 @@ public static Counter addCount( return counter; } - private static String getTop(int max, Counter counter, Counter languageCounter) { + private static String getTop( + int max, Counter counter, Counter languageCounter) { final StringBuilder b = new StringBuilder(); for (final int cp : counter.getKeysetSortedByCount(false)) { if (--max < 0) { @@ -294,9 +353,9 @@ private static String getTop(int max, Counter counter, Counter if (b.length() != 0) { b.append("\t"); } - //b.append('“'); + // b.append('“'); b.append(show(cp)); - //b.append('”').append("(").append((int)Math.round(100*Math.log(langCounter.get(0x20)/langCounter.get(cp)))).append(")"); + // b.append('”').append("(").append((int)Math.round(100*Math.log(langCounter.get(0x20)/langCounter.get(cp)))).append(")"); } return b.toString(); } @@ -316,13 +375,18 @@ private static String getExtendedName(String s, String separator) { } private static String propValue(int ch, int propEnum, int nameChoice) { - return UCharacter.getPropertyValueName(propEnum, UCharacter.getIntPropertyValue(ch, propEnum), nameChoice); + return UCharacter.getPropertyValueName( + propEnum, UCharacter.getIntPropertyValue(ch, propEnum), nameChoice); } private static String show(int s) { final int cat = UCharacter.getType(s); - if (cat == ECharacterCategory.FORMAT || cat == ECharacterCategory.CONTROL || cat == ECharacterCategory.PRIVATE_USE - || cat == ECharacterCategory.SPACE_SEPARATOR || cat == 
ECharacterCategory.LINE_SEPARATOR || cat == ECharacterCategory.PARAGRAPH_SEPARATOR) { + if (cat == ECharacterCategory.FORMAT + || cat == ECharacterCategory.CONTROL + || cat == ECharacterCategory.PRIVATE_USE + || cat == ECharacterCategory.SPACE_SEPARATOR + || cat == ECharacterCategory.LINE_SEPARATOR + || cat == ECharacterCategory.PARAGRAPH_SEPARATOR) { return "U+" + Utility.hex(s); } if (s == '\'' || s == '"' || s == '=') { diff --git a/unicodetools/src/main/java/org/unicode/draft/SetComparator.java b/unicodetools/src/main/java/org/unicode/draft/SetComparator.java index 627d0362b..0f4e19c89 100644 --- a/unicodetools/src/main/java/org/unicode/draft/SetComparator.java +++ b/unicodetools/src/main/java/org/unicode/draft/SetComparator.java @@ -27,4 +27,4 @@ public int compare(Set o1, Set o2) { // make it through the gauntlet, we're done return 0; } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/draft/SimpleFormatRegistry.java b/unicodetools/src/main/java/org/unicode/draft/SimpleFormatRegistry.java index 53f2d1f9c..69535d952 100644 --- a/unicodetools/src/main/java/org/unicode/draft/SimpleFormatRegistry.java +++ b/unicodetools/src/main/java/org/unicode/draft/SimpleFormatRegistry.java @@ -1,7 +1,4 @@ package org.unicode.draft; -import java.text.ChoiceFormat; -import java.text.Format; -import java.util.Date; import com.ibm.icu.text.DateFormat; import com.ibm.icu.text.DecimalFormat; @@ -11,7 +8,9 @@ import com.ibm.icu.text.RuleBasedNumberFormat; import com.ibm.icu.text.SimpleDateFormat; import com.ibm.icu.util.ULocale; - +import java.text.ChoiceFormat; +import java.text.Format; +import java.util.Date; public class SimpleFormatRegistry implements FormatRegistry { @@ -26,7 +25,7 @@ public Format getFormatForObject(Class classType, ULocale ulocale) { } else if (classType.isAssignableFrom(Date.class)) { // format a Date if can return DateFormat.getDateTimeInstance( - DateFormat.SHORT, DateFormat.SHORT, ulocale);//fix + DateFormat.SHORT, 
DateFormat.SHORT, ulocale); // fix } else { return null; } @@ -34,18 +33,10 @@ public Format getFormatForObject(Class classType, ULocale ulocale) { // UGLY CODE, move somewhere else /** - if (subFormatter instanceof ChoiceFormat - || subFormatter instanceof PluralFormat) { - arg = formats[i].format(obj); - // TODO: This should be made more robust. - // Does this work with '{' in quotes? - if (arg.indexOf('{') >= 0) { - subFormatter = new MessageFormat(arg, ulocale); - obj = arguments; - arg = null; - } - } - + * if (subFormatter instanceof ChoiceFormat || subFormatter instanceof PluralFormat) { arg = + * formats[i].format(obj); // TODO: This should be made more robust. // Does this work with '{' + * in quotes? if (arg.indexOf('{') >= 0) { subFormatter = new MessageFormat(arg, ulocale); obj = + * arguments; arg = null; } } */ // for now, just hard-code /* (non-Javadoc) @@ -66,40 +57,40 @@ public String getKey(Format formats, ULocale ulocale) { } else if (formats.equals(NumberFormat.getIntegerInstance(ulocale))) { return ("number,integer"); } else { - return ("number," + - ((DecimalFormat)formats).toPattern()); + return ("number," + ((DecimalFormat) formats).toPattern()); } } else if (formats instanceof SimpleDateFormat) { - if (formats.equals(DateFormat.getDateInstance(DateFormat.DEFAULT,ulocale))) { + if (formats.equals(DateFormat.getDateInstance(DateFormat.DEFAULT, ulocale))) { return ("date"); - } else if (formats.equals(DateFormat.getDateInstance(DateFormat.SHORT,ulocale))) { + } else if (formats.equals(DateFormat.getDateInstance(DateFormat.SHORT, ulocale))) { return ("date,short"); - //This code will never be executed [alan] - // } else if (inputFormat.equals(DateFormat.getDateInstance(DateFormat.DEFAULT,ulocale))) { + // This code will never be executed [alan] + // } else if + // (inputFormat.equals(DateFormat.getDateInstance(DateFormat.DEFAULT,ulocale))) { // return ("date,medium"); - } else if 
(formats.equals(DateFormat.getDateInstance(DateFormat.LONG,ulocale))) { + } else if (formats.equals(DateFormat.getDateInstance(DateFormat.LONG, ulocale))) { return ("date,long"); - } else if (formats.equals(DateFormat.getDateInstance(DateFormat.FULL,ulocale))) { + } else if (formats.equals(DateFormat.getDateInstance(DateFormat.FULL, ulocale))) { return ("date,full"); - } else if (formats.equals(DateFormat.getTimeInstance(DateFormat.DEFAULT,ulocale))) { + } else if (formats.equals(DateFormat.getTimeInstance(DateFormat.DEFAULT, ulocale))) { return ("time"); - } else if (formats.equals(DateFormat.getTimeInstance(DateFormat.SHORT,ulocale))) { + } else if (formats.equals(DateFormat.getTimeInstance(DateFormat.SHORT, ulocale))) { return ("time,short"); - //This code will never be executed [alan] - // } else if (inputFormat.equals(DateFormat.getTimeInstance(DateFormat.DEFAULT,ulocale))) { + // This code will never be executed [alan] + // } else if + // (inputFormat.equals(DateFormat.getTimeInstance(DateFormat.DEFAULT,ulocale))) { // return ("time,medium"); - } else if (formats.equals(DateFormat.getTimeInstance(DateFormat.LONG,ulocale))) { + } else if (formats.equals(DateFormat.getTimeInstance(DateFormat.LONG, ulocale))) { return ("time,long"); - } else if (formats.equals(DateFormat.getTimeInstance(DateFormat.FULL,ulocale))) { + } else if (formats.equals(DateFormat.getTimeInstance(DateFormat.FULL, ulocale))) { return ("time,full"); } else { - return ("date," + ((SimpleDateFormat)formats).toPattern()); + return ("date," + ((SimpleDateFormat) formats).toPattern()); } } else if (formats instanceof ChoiceFormat) { - return ("choice," - + ((ChoiceFormat) formats).toPattern()); + return ("choice," + ((ChoiceFormat) formats).toPattern()); } else if (formats instanceof PluralFormat) { - String pat = ((PluralFormat)formats).toPattern(); + String pat = ((PluralFormat) formats).toPattern(); // TODO: PluralFormat doesn't do the single quote thing, just reapply if (pat.indexOf('\'') != 
0) { final StringBuffer buf = new StringBuffer(); @@ -118,37 +109,34 @@ public String getKey(Format formats, ULocale ulocale) { } } - private static final String[] typeList = {"", "number", "date", "time", "choice", "spellout", "ordinal", "duration", "plural"}; + private static final String[] typeList = { + "", "number", "date", "time", "choice", "spellout", "ordinal", "duration", "plural" + }; - private static final int - TYPE_EMPTY = 0, - TYPE_NUMBER = 1, - TYPE_DATE = 2, - TYPE_TIME = 3, - TYPE_CHOICE = 4, - TYPE_SPELLOUT = 5, - TYPE_ORDINAL = 6, - TYPE_DURATION = 7, - TYPE_PLURAL = 8; + private static final int TYPE_EMPTY = 0, + TYPE_NUMBER = 1, + TYPE_DATE = 2, + TYPE_TIME = 3, + TYPE_CHOICE = 4, + TYPE_SPELLOUT = 5, + TYPE_ORDINAL = 6, + TYPE_DURATION = 7, + TYPE_PLURAL = 8; - private static final String[] modifierList = - {"", "currency", "percent", "integer"}; + private static final String[] modifierList = {"", "currency", "percent", "integer"}; - private static final int - MODIFIER_EMPTY = 0, - MODIFIER_CURRENCY = 1, - MODIFIER_PERCENT = 2, - MODIFIER_INTEGER = 3; + private static final int MODIFIER_EMPTY = 0, + MODIFIER_CURRENCY = 1, + MODIFIER_PERCENT = 2, + MODIFIER_INTEGER = 3; - private static final String[] dateModifierList = - {"", "short", "medium", "long", "full"}; + private static final String[] dateModifierList = {"", "short", "medium", "long", "full"}; - private static final int - DATE_MODIFIER_EMPTY = 0, - DATE_MODIFIER_SHORT = 1, - DATE_MODIFIER_MEDIUM = 2, - DATE_MODIFIER_LONG = 3, - DATE_MODIFIER_FULL = 4; + private static final int DATE_MODIFIER_EMPTY = 0, + DATE_MODIFIER_SHORT = 1, + DATE_MODIFIER_MEDIUM = 2, + DATE_MODIFIER_LONG = 3, + DATE_MODIFIER_FULL = 4; private static final int findKeyword(String s, String[] list) { s = s.trim().toLowerCase(); @@ -160,8 +148,6 @@ private static final int findKeyword(String s, String[] list) { return -1; } - - /* (non-Javadoc) * @see FormatRegistry2#getFormat(java.lang.String, java.lang.String, 
com.ibm.icu.util.ULocale, boolean[]) */ @@ -169,153 +155,152 @@ private static final int findKeyword(String s, String[] list) { public Format getFormat(String mainType, String subType, ULocale ulocale) { Format newFormat = null; switch (findKeyword(mainType, typeList)) { - case TYPE_EMPTY: - break; - case TYPE_NUMBER: - switch (findKeyword(subType, modifierList)) { - case MODIFIER_EMPTY: - newFormat = NumberFormat.getInstance(ulocale); - break; - case MODIFIER_CURRENCY: - newFormat = NumberFormat.getCurrencyInstance(ulocale); - break; - case MODIFIER_PERCENT: - newFormat = NumberFormat.getPercentInstance(ulocale); - break; - case MODIFIER_INTEGER: - newFormat = NumberFormat.getIntegerInstance(ulocale); - break; - default: // pattern - newFormat = new DecimalFormat(subType, new DecimalFormatSymbols(ulocale)); + case TYPE_EMPTY: break; - } - break; - case TYPE_DATE: - switch (findKeyword(subType, dateModifierList)) { - case DATE_MODIFIER_EMPTY: - newFormat = DateFormat.getDateInstance(DateFormat.DEFAULT, ulocale); - break; - case DATE_MODIFIER_SHORT: - newFormat = DateFormat.getDateInstance(DateFormat.SHORT, ulocale); - break; - case DATE_MODIFIER_MEDIUM: - newFormat = DateFormat.getDateInstance(DateFormat.DEFAULT, ulocale); - break; - case DATE_MODIFIER_LONG: - newFormat = DateFormat.getDateInstance(DateFormat.LONG, ulocale); - break; - case DATE_MODIFIER_FULL: - newFormat = DateFormat.getDateInstance(DateFormat.FULL, ulocale); - break; - default: - newFormat = new SimpleDateFormat(subType, ulocale); - break; - } - break; - case TYPE_TIME: - switch (findKeyword(subType, dateModifierList)) { - case DATE_MODIFIER_EMPTY: - newFormat = DateFormat.getTimeInstance(DateFormat.DEFAULT, ulocale); - break; - case DATE_MODIFIER_SHORT: - newFormat = DateFormat.getTimeInstance(DateFormat.SHORT, ulocale); - break; - case DATE_MODIFIER_MEDIUM: - newFormat = DateFormat.getTimeInstance(DateFormat.DEFAULT, ulocale); - break; - case DATE_MODIFIER_LONG: - newFormat = 
DateFormat.getTimeInstance(DateFormat.LONG, ulocale); - break; - case DATE_MODIFIER_FULL: - newFormat = DateFormat.getTimeInstance(DateFormat.FULL, ulocale); - break; - default: - newFormat = new SimpleDateFormat(subType, ulocale); + case TYPE_NUMBER: + switch (findKeyword(subType, modifierList)) { + case MODIFIER_EMPTY: + newFormat = NumberFormat.getInstance(ulocale); + break; + case MODIFIER_CURRENCY: + newFormat = NumberFormat.getCurrencyInstance(ulocale); + break; + case MODIFIER_PERCENT: + newFormat = NumberFormat.getPercentInstance(ulocale); + break; + case MODIFIER_INTEGER: + newFormat = NumberFormat.getIntegerInstance(ulocale); + break; + default: // pattern + newFormat = new DecimalFormat(subType, new DecimalFormatSymbols(ulocale)); + break; + } break; - } - break; - case TYPE_CHOICE: - try { - newFormat = new ChoiceFormat(subType); - } catch (final Exception e) { - throw new IllegalArgumentException("Choice Pattern incorrect", e); - } - break; - case TYPE_SPELLOUT: - { - final RuleBasedNumberFormat rbnf = new RuleBasedNumberFormat(ulocale, RuleBasedNumberFormat.SPELLOUT); - final String ruleset = subType.trim(); - if (ruleset.length() != 0) { - try { - rbnf.setDefaultRuleSet(ruleset); + case TYPE_DATE: + switch (findKeyword(subType, dateModifierList)) { + case DATE_MODIFIER_EMPTY: + newFormat = DateFormat.getDateInstance(DateFormat.DEFAULT, ulocale); + break; + case DATE_MODIFIER_SHORT: + newFormat = DateFormat.getDateInstance(DateFormat.SHORT, ulocale); + break; + case DATE_MODIFIER_MEDIUM: + newFormat = DateFormat.getDateInstance(DateFormat.DEFAULT, ulocale); + break; + case DATE_MODIFIER_LONG: + newFormat = DateFormat.getDateInstance(DateFormat.LONG, ulocale); + break; + case DATE_MODIFIER_FULL: + newFormat = DateFormat.getDateInstance(DateFormat.FULL, ulocale); + break; + default: + newFormat = new SimpleDateFormat(subType, ulocale); + break; } - catch (final Exception e) { - // warn invalid ruleset + break; + case TYPE_TIME: + switch 
(findKeyword(subType, dateModifierList)) { + case DATE_MODIFIER_EMPTY: + newFormat = DateFormat.getTimeInstance(DateFormat.DEFAULT, ulocale); + break; + case DATE_MODIFIER_SHORT: + newFormat = DateFormat.getTimeInstance(DateFormat.SHORT, ulocale); + break; + case DATE_MODIFIER_MEDIUM: + newFormat = DateFormat.getTimeInstance(DateFormat.DEFAULT, ulocale); + break; + case DATE_MODIFIER_LONG: + newFormat = DateFormat.getTimeInstance(DateFormat.LONG, ulocale); + break; + case DATE_MODIFIER_FULL: + newFormat = DateFormat.getTimeInstance(DateFormat.FULL, ulocale); + break; + default: + newFormat = new SimpleDateFormat(subType, ulocale); + break; } - } - newFormat = rbnf; - } - break; - case TYPE_ORDINAL: - { - final RuleBasedNumberFormat rbnf = new RuleBasedNumberFormat(ulocale, RuleBasedNumberFormat.ORDINAL); - final String ruleset = subType.trim(); - if (ruleset.length() != 0) { + break; + case TYPE_CHOICE: try { - rbnf.setDefaultRuleSet(ruleset); + newFormat = new ChoiceFormat(subType); + } catch (final Exception e) { + throw new IllegalArgumentException("Choice Pattern incorrect", e); } - catch (final Exception e) { - // warn invalid ruleset - } - } - newFormat = rbnf; - } - break; - case TYPE_DURATION: - { - final RuleBasedNumberFormat rbnf = new RuleBasedNumberFormat(ulocale, RuleBasedNumberFormat.DURATION); - final String ruleset = subType.trim(); - if (ruleset.length() != 0) { - try { - rbnf.setDefaultRuleSet(ruleset); + break; + case TYPE_SPELLOUT: + { + final RuleBasedNumberFormat rbnf = + new RuleBasedNumberFormat(ulocale, RuleBasedNumberFormat.SPELLOUT); + final String ruleset = subType.trim(); + if (ruleset.length() != 0) { + try { + rbnf.setDefaultRuleSet(ruleset); + } catch (final Exception e) { + // warn invalid ruleset + } + } + newFormat = rbnf; } - catch (final Exception e) { - // warn invalid ruleset + break; + case TYPE_ORDINAL: + { + final RuleBasedNumberFormat rbnf = + new RuleBasedNumberFormat(ulocale, RuleBasedNumberFormat.ORDINAL); + final 
String ruleset = subType.trim(); + if (ruleset.length() != 0) { + try { + rbnf.setDefaultRuleSet(ruleset); + } catch (final Exception e) { + // warn invalid ruleset + } + } + newFormat = rbnf; } - } - newFormat = rbnf; - } - break; - case TYPE_PLURAL: - { - // PluralFormat does not handle quotes. - // Remove quotes. - // TODO: Should PluralFormat handle quotes? - final StringBuffer unquotedPattern = new StringBuffer(); - final String quotedPattern = subType; - boolean inQuote = false; - for (int i = 0; i < quotedPattern.length(); ++i) { - final char ch = quotedPattern.charAt(i); - if (ch == '\'') { - if (i+1 < quotedPattern.length() && - quotedPattern.charAt(i+1) == '\'') { - unquotedPattern.append(ch); - ++i; - } else { - inQuote = !inQuote; + break; + case TYPE_DURATION: + { + final RuleBasedNumberFormat rbnf = + new RuleBasedNumberFormat(ulocale, RuleBasedNumberFormat.DURATION); + final String ruleset = subType.trim(); + if (ruleset.length() != 0) { + try { + rbnf.setDefaultRuleSet(ruleset); + } catch (final Exception e) { + // warn invalid ruleset + } } - } else { - unquotedPattern.append(ch); + newFormat = rbnf; } - } + break; + case TYPE_PLURAL: + { + // PluralFormat does not handle quotes. + // Remove quotes. + // TODO: Should PluralFormat handle quotes? 
+ final StringBuffer unquotedPattern = new StringBuffer(); + final String quotedPattern = subType; + boolean inQuote = false; + for (int i = 0; i < quotedPattern.length(); ++i) { + final char ch = quotedPattern.charAt(i); + if (ch == '\'') { + if (i + 1 < quotedPattern.length() + && quotedPattern.charAt(i + 1) == '\'') { + unquotedPattern.append(ch); + ++i; + } else { + inQuote = !inQuote; + } + } else { + unquotedPattern.append(ch); + } + } - final PluralFormat pls = new PluralFormat(ulocale, - unquotedPattern.toString()); - newFormat = pls; - } - break; - default: - throw new IllegalArgumentException("unknown format type at "); + final PluralFormat pls = new PluralFormat(ulocale, unquotedPattern.toString()); + newFormat = pls; + } + break; + default: + throw new IllegalArgumentException("unknown format type at "); } return newFormat; } diff --git a/unicodetools/src/main/java/org/unicode/draft/Snippet.java b/unicodetools/src/main/java/org/unicode/draft/Snippet.java index f06534076..339f5a2bf 100644 --- a/unicodetools/src/main/java/org/unicode/draft/Snippet.java +++ b/unicodetools/src/main/java/org/unicode/draft/Snippet.java @@ -1,8 +1,5 @@ package org.unicode.draft; - public class Snippet { - public static void main(String[] args) { - - } -} \ No newline at end of file + public static void main(String[] args) {} +} diff --git a/unicodetools/src/main/java/org/unicode/draft/Subheader2.java b/unicodetools/src/main/java/org/unicode/draft/Subheader2.java index 4d3f5b8ca..fa2107f5b 100644 --- a/unicodetools/src/main/java/org/unicode/draft/Subheader2.java +++ b/unicodetools/src/main/java/org/unicode/draft/Subheader2.java @@ -1,8 +1,12 @@ -/** - * - */ +/** */ package org.unicode.draft; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UTF16.StringComparator; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.BufferedReader; import java.io.File; 
import java.io.FileNotFoundException; @@ -17,16 +21,8 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Matcher; - import org.unicode.cldr.util.PatternCache; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UTF16.StringComparator; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - class Subheader2 { Matcher isArchaic = GeneratePickerData2.IS_ARCHAIC.matcher(""); Matcher subheadMatcher = PatternCache.get("(@+)\\s+(.*)").matcher(""); @@ -44,7 +40,8 @@ class Subheader2 { // if (false) { // if (GeneratePickerData.DEBUG) // System.out.println("*** Fixing plurals"); - // for (java.util.Iterator it = subblock2UnicodeSet.keySet().iterator(); it.hasNext();) { + // for (java.util.Iterator it = subblock2UnicodeSet.keySet().iterator(); + // it.hasNext();) { // String subblock = it.next(); // final String pluralSubblock = subblock + "s"; // UnicodeSet plural = subblock2UnicodeSet.get(pluralSubblock); @@ -61,12 +58,15 @@ class Subheader2 { for (String subblock : subblock2UnicodeSet.keySet()) { final UnicodeSet uset = subblock2UnicodeSet.get(subblock); - for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.next(); ) { codePoint2Subblock.put(it.codepoint, subblock); - String block = UCharacter - .getStringPropertyValue(UProperty.BLOCK, it.codepoint, UProperty.NameChoice.LONG).toString() - .replace('_', ' ').intern(); + String block = + UCharacter.getStringPropertyValue( + UProperty.BLOCK, it.codepoint, UProperty.NameChoice.LONG) + .toString() + .replace('_', ' ') + .intern(); Set set = block2subblock.get(block); if (set == null) { @@ -81,7 +81,9 @@ class Subheader2 { set.add(block); String name = UCharacter.getExtendedName(it.codepoint); - if (isArchaic.reset(block).find() || isArchaic.reset(subblock).find() || isArchaic.reset(name).find()) { + if 
(isArchaic.reset(block).find() + || isArchaic.reset(subblock).find() + || isArchaic.reset(name).find()) { archaicSubblock.add(it.codepoint); } } @@ -92,19 +94,34 @@ class Subheader2 { private void writeBlockInfo(String outputDirectory) throws IOException, FileNotFoundException { System.out.println("***Block/Subblock start"); - PrintWriter out = GeneratePickerData2.getFileWriter(outputDirectory, "blocks_subblocks.html"); + PrintWriter out = + GeneratePickerData2.getFileWriter(outputDirectory, "blocks_subblocks.html"); htmlHeader(out); - out.println("" + "Block" + "" + "Notes" + "" + "Subblock" + ""); + out.println( + "" + + "Block" + + "" + + "Notes" + + "" + + "Subblock" + + ""); for (String block : block2subblock.keySet()) { final Set set = block2subblock.get(block); for (String subblock2 : set) { - out.println("" + block + "" + - (subblock2.equalsIgnoreCase(block) || subblock2.equalsIgnoreCase(block + "s") ? "duplicate" : "") + - (set.size() < 2 ? " singleton" : "") - + "\u00a0" - + "" + subblock2 + - ""); + out.println( + "" + + block + + "" + + (subblock2.equalsIgnoreCase(block) + || subblock2.equalsIgnoreCase(block + "s") + ? "duplicate" + : "") + + (set.size() < 2 ? 
" singleton" : "") + + "\u00a0" + + "" + + subblock2 + + ""); } } out.println(""); @@ -114,7 +131,14 @@ private void writeBlockInfo(String outputDirectory) throws IOException, FileNotF out = GeneratePickerData2.getFileWriter(outputDirectory, "subblocks_blocks.html"); htmlHeader(out); - out.println("" + "Subblock" + "" + "Notes" + "" + "Blocks" + ""); + out.println( + "" + + "Subblock" + + "" + + "Notes" + + "" + + "Blocks" + + ""); StringComparator caseless = new UTF16.StringComparator(true, true, 0); TreeSet tests = new TreeSet(caseless); tests.addAll(subblock2block.keySet()); @@ -123,10 +147,14 @@ private void writeBlockInfo(String outputDirectory) throws IOException, FileNotF final String first = set.iterator().next(); String otherString = String.valueOf(set); otherString = otherString.substring(1, otherString.length() - 1) + '\u00a0'; - out.println("" + subblock2 - + "" + getComments(subblock2, tests) - + "" + otherString - + ""); + out.println( + "" + + subblock2 + + "" + + getComments(subblock2, tests) + + "" + + otherString + + ""); } System.out.println("***Block/Subblock end"); out.close(); @@ -135,34 +163,30 @@ private void writeBlockInfo(String outputDirectory) throws IOException, FileNotF private String getComments(String subblock2, Set keySet) { if (keySet.contains(subblock2 + "s") - || keySet.contains("Additional " + subblock2) - || keySet.contains("Additional " + subblock2 + "s") - || keySet.contains("Other " + subblock2) - || keySet.contains("Other " + subblock2 + "s") - || keySet.contains("Miscellaneous " + subblock2) - || keySet.contains("Miscellaneous " + subblock2 + "s")) return "has-longer"; + || keySet.contains("Additional " + subblock2) + || keySet.contains("Additional " + subblock2 + "s") + || keySet.contains("Other " + subblock2) + || keySet.contains("Other " + subblock2 + "s") + || keySet.contains("Miscellaneous " + subblock2) + || keySet.contains("Miscellaneous " + subblock2 + "s")) return "has-longer"; return "\u00a0"; } private void 
htmlHeader(PrintWriter out) { - out.println("" - + - "" - + - "" + - "" + - "" - ); + out.println( + "" + + "" + + "" + + "" + + "
"); } - private String getDataFromFile(String dir, String filenameRegex) throws FileNotFoundException, IOException { + private String getDataFromFile(String dir, String filenameRegex) + throws FileNotFoundException, IOException { String subblock = "?"; File actualName = getFileNameFromPattern(dir, filenameRegex); BufferedReader in = new BufferedReader(new FileReader(actualName)); @@ -194,8 +218,13 @@ public static File getFileNameFromPattern(String directory, String filenameRegex } String[] files = dir.list(new RegexFileFilter(filenameRegex)); if (files.length != 1) { - throw new IllegalArgumentException("Not a unique match for : " + dir.getCanonicalPath() + " / " - + filenameRegex + " : " + Arrays.asList(files)); + throw new IllegalArgumentException( + "Not a unique match for : " + + dir.getCanonicalPath() + + " / " + + filenameRegex + + " : " + + Arrays.asList(files)); } return new File(directory, files[0]); } catch (IOException e) { @@ -222,4 +251,4 @@ public void set(String regex) { String getSubheader(int codepoint) { return codePoint2Subblock.get(codepoint); } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/draft/TimeEntry.java b/unicodetools/src/main/java/org/unicode/draft/TimeEntry.java index cd4b22d95..ef679ed91 100644 --- a/unicodetools/src/main/java/org/unicode/draft/TimeEntry.java +++ b/unicodetools/src/main/java/org/unicode/draft/TimeEntry.java @@ -1,17 +1,16 @@ package org.unicode.draft; + import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; import java.util.Random; - import org.unicode.cldr.util.Timer; - public class TimeEntry { public static void main(String[] args) { final Random rand = new Random(0); - final Map samples = new HashMap(); - for (int i = 0 ; i < 1000000; ++i) { + final Map samples = new HashMap(); + for (int i = 0; i < 1000000; ++i) { samples.put(rand.nextLong(), rand.nextLong()); } final Timer timer = new Timer(); @@ -22,7 +21,7 @@ public static void main(String[] args) { 
System.out.println(timer); final Timer timer2 = new Timer(); - for (final Entry entry : samples.entrySet()) { + for (final Entry entry : samples.entrySet()) { final Long key = entry.getKey(); final Long value = entry.getValue(); } diff --git a/unicodetools/src/main/java/org/unicode/draft/UnicodeDataInput.java b/unicodetools/src/main/java/org/unicode/draft/UnicodeDataInput.java index af544a181..b8f60ff64 100644 --- a/unicodetools/src/main/java/org/unicode/draft/UnicodeDataInput.java +++ b/unicodetools/src/main/java/org/unicode/draft/UnicodeDataInput.java @@ -1,10 +1,9 @@ package org.unicode.draft; -import java.io.DataInput; -import java.io.IOException; - import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.text.UnicodeSet; +import java.io.DataInput; +import java.io.IOException; public class UnicodeDataInput { @@ -22,6 +21,7 @@ public DataInput get() { /** * Reads a UnicodeSet in the format of writeUnicodeSet. + * * @param input * @return set read * @throws IOException @@ -43,7 +43,7 @@ public UnicodeSet readUnicodeSet() throws IOException { return result; } - public static abstract class ItemReader { + public abstract static class ItemReader { public abstract T read(DataInput in) throws IOException; public T[] readArray(DataInput input) throws IOException { @@ -74,14 +74,15 @@ public UnicodeMap readUnicodeMap(ItemReader reader) throws IOException return readUnicodeMap(reader, input); } - public static UnicodeMap readUnicodeMap(ItemReader reader, DataInput dataInput) throws IOException { + public static UnicodeMap readUnicodeMap(ItemReader reader, DataInput dataInput) + throws IOException { final UnicodeMap result = new UnicodeMap(); // values final T[] values = reader.readArray(dataInput); // transitions final int transitionCount = dataInput.readInt(); - final int[] transitions = new int[transitionCount+1]; + final int[] transitions = new int[transitionCount + 1]; int last = 0; for (int i = 0; i < transitionCount; ++i) { transitions[i] = last = 
dataInput.readInt() + last; @@ -90,12 +91,11 @@ public static UnicodeMap readUnicodeMap(ItemReader reader, DataInput d // values for (int i = 0; i < transitionCount; ++i) { final int valueIndex = dataInput.readInt(); - if (valueIndex < 0) - { + if (valueIndex < 0) { continue; // no value } final T value = values[valueIndex]; - result.putAll(transitions[i], transitions[i+1]-1, value); + result.putAll(transitions[i], transitions[i + 1] - 1, value); } // strings final int stringCount = dataInput.readInt(); @@ -107,5 +107,4 @@ public static UnicodeMap readUnicodeMap(ItemReader reader, DataInput d } return result; } - } diff --git a/unicodetools/src/main/java/org/unicode/draft/UnicodeDataOutput.java b/unicodetools/src/main/java/org/unicode/draft/UnicodeDataOutput.java index 467147c3f..05bf502c4 100644 --- a/unicodetools/src/main/java/org/unicode/draft/UnicodeDataOutput.java +++ b/unicodetools/src/main/java/org/unicode/draft/UnicodeDataOutput.java @@ -1,5 +1,8 @@ package org.unicode.draft; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.DataOutput; import java.io.IOException; import java.util.Collection; @@ -7,10 +10,6 @@ import java.util.Map; import java.util.Set; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - public class UnicodeDataOutput { private DataOutput output; @@ -26,18 +25,20 @@ public DataOutput get() { } /** - * Writes a UnicodeSet as: the count of ranges (int), each pair of ranges (int), a count of strings, and then the strings. + * Writes a UnicodeSet as: the count of ranges (int), each pair of ranges (int), a count of + * strings, and then the strings. 
+ * * @param output * @param toWrite * @throws IOException */ - public void writeUnicodeSet(UnicodeSet toWrite) throws IOException { + public void writeUnicodeSet(UnicodeSet toWrite) throws IOException { final int rangeCount = toWrite.getRangeCount(); int last = 0; output.writeInt(rangeCount); int count = 0; boolean firstString = true; - for (final UnicodeSetIterator it = new UnicodeSetIterator(toWrite); it.nextRange();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(toWrite); it.nextRange(); ) { if (it.codepoint != UnicodeSetIterator.IS_STRING) { output.writeInt(it.codepoint - last); last = it.codepoint; @@ -46,7 +47,9 @@ public void writeUnicodeSet(UnicodeSet toWrite) throws IOException { count += it.codepointEnd - it.codepoint + 1; } else { if (firstString) { - output.writeInt(toWrite.size() - count); // write terminator. negatives will not occur above. + output.writeInt( + toWrite.size() + - count); // write terminator. negatives will not occur above. firstString = false; } output.writeUTF(it.string); @@ -57,18 +60,20 @@ public void writeUnicodeSet(UnicodeSet toWrite) throws IOException { } } - public static abstract class ItemWriter { + public abstract static class ItemWriter { public abstract void write(DataOutput out, T item) throws IOException; /** * Can be overridden for efficiency. The collection is actually a set. + * * @param output * @param values * @return * @throws IOException */ - public Map writeArray(DataOutput output, Collection values) throws IOException { - final Map valuesToInts = new LinkedHashMap(); + public Map writeArray(DataOutput output, Collection values) + throws IOException { + final Map valuesToInts = new LinkedHashMap(); int index = 0; output.writeInt(values.size()); for (final T value : values) { @@ -92,19 +97,22 @@ public void write(DataOutput out, String item) throws IOException { * count of transitions, list of transitions
* list of corresponding value indexes (same count - 1)
* count of strings, list of string/valueIndex pairs
- * The transitions and corresponding value indexes are written separately to allow compression of the former. - * + * The transitions and corresponding value indexes are written separately to allow compression + * of the former. + * * @param * @param output * @param toWrite * @param writer * @throws IOException */ - public void writeUnicodeMap(UnicodeMap toWrite, ItemWriter writer) throws IOException { + public void writeUnicodeMap(UnicodeMap toWrite, ItemWriter writer) + throws IOException { writeUnicodeMap(toWrite, writer, output); } - public static void writeUnicodeMap(UnicodeMap toWrite, ItemWriter writer, DataOutput dataOutput) throws IOException { + public static void writeUnicodeMap( + UnicodeMap toWrite, ItemWriter writer, DataOutput dataOutput) throws IOException { // values final Collection values = toWrite.getAvailableValues(); final Map valuesToInts = writer.writeArray(dataOutput, values); diff --git a/unicodetools/src/main/java/org/unicode/draft/UnicodeIntMap.java b/unicodetools/src/main/java/org/unicode/draft/UnicodeIntMap.java index 291ebb84b..852413c65 100644 --- a/unicodetools/src/main/java/org/unicode/draft/UnicodeIntMap.java +++ b/unicodetools/src/main/java/org/unicode/draft/UnicodeIntMap.java @@ -6,6 +6,13 @@ */ package org.unicode.draft; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.StringTransform; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.Freezable; import java.util.Collection; import java.util.Collections; import java.util.Comparator; @@ -18,31 +25,22 @@ import java.util.TreeMap; import java.util.TreeSet; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.StringTransform; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.Freezable; - /** - * Class for 
mapping Unicode characters and strings to values, optimized for single code points, - * where ranges of code points have the same value. - * Much smaller storage than using HashMap, and much faster and more compact than - * a list of UnicodeSets. The API design mimics Map but can't extend it due to some - * necessary changes (much as UnicodeSet mimics Set). Note that nulls are not permitted as values; - * that is, a put(x,null) is the same as remove(x).
+ * Class for mapping Unicode characters and strings to values, optimized for single code points, + * where ranges of code points have the same value. Much smaller storage than using HashMap, and + * much faster and more compact than a list of UnicodeSets. The API design mimics Map but + * can't extend it due to some necessary changes (much as UnicodeSet mimics Set). Note that + * nulls are not permitted as values; that is, a put(x,null) is the same as remove(x).
* At this point "" is also not allowed as a key, although that may change. + * * @author markdavis */ - -public final class UnicodeIntMap implements Cloneable, Freezable, StringTransform, Iterable { - /** - * For serialization - */ - //private static final long serialVersionUID = -6540936876295804105L; +public final class UnicodeIntMap + implements Cloneable, Freezable, StringTransform, Iterable { + /** For serialization */ + // private static final long serialVersionUID = -6540936876295804105L; static final boolean ASSERTIONS = false; + static final long GROWTH_PERCENT = 200; // 100 is no growth! static final long GROWTH_GAP = 10; // extra bump! public static final int UNASSIGNED = Integer.MAX_VALUE; @@ -56,25 +54,26 @@ public final class UnicodeIntMap implements Cloneable, Freezable, private transient boolean staleAvailableValues; private transient boolean errorOnReset; - private volatile transient boolean locked; + private transient volatile boolean locked; private int lastIndex; - private TreeMap stringMap; + private TreeMap stringMap; - { clear(); } - - public UnicodeIntMap() { + { + clear(); } + public UnicodeIntMap() {} + public UnicodeIntMap(UnicodeIntMap other) { this.putAll(other); } - + public UnicodeIntMap clear() { if (locked) { throw new UnsupportedOperationException("Attempt to modify locked object"); } length = 2; - transitions = new int[] {0,0x110000,0,0,0,0,0,0,0,0}; + transitions = new int[] {0, 0x110000, 0, 0, 0, 0, 0, 0, 0, 0}; values = new int[10]; values[0] = UNASSIGNED; @@ -93,7 +92,7 @@ public boolean equals(Object other) { try { UnicodeIntMap that = (UnicodeIntMap) other; if (length != that.length) return false; - for (int i = 0; i < length-1; ++i) { + for (int i = 0; i < length - 1; ++i) { if (transitions[i] != that.transitions[i]) return false; if (values[i] != that.values[i]) return false; } @@ -106,22 +105,20 @@ public boolean equals(Object other) { public int hashCode() { int result = length; // TODO might want to abbreviate this for 
speed. - for (int i = 0; i < length-1; ++i) { - result = 37*result + transitions[i]; - result = 37*result; + for (int i = 0; i < length - 1; ++i) { + result = 37 * result + transitions[i]; + result = 37 * result; if (values[i] != UNASSIGNED) { - result += values[i]; + result += values[i]; } } if (stringMap != null) { - result = 37*result + stringMap.hashCode(); + result = 37 * result + stringMap.hashCode(); } return result; } - /** - * Standard clone. Warning, as with Collections, does not do deep clone. - */ + /** Standard clone. Warning, as with Collections, does not do deep clone. */ public UnicodeIntMap cloneAsThawed() { UnicodeIntMap that = new UnicodeIntMap(); that.length = length; @@ -136,35 +133,48 @@ public UnicodeIntMap cloneAsThawed() { /* for internal consistency checking */ void _checkInvariants() { - if (length < 2 - || length > transitions.length - || transitions.length != values.length) { + if (length < 2 || length > transitions.length || transitions.length != values.length) { throw new IllegalArgumentException("Invariant failed: Lengths bad"); } - for (int i = 1; i < length-1; ++i) { - if (values[i-1] == values[i]) { - throw new IllegalArgumentException("Invariant failed: values shared at " - + "\t" + Utility.hex(i-1) + ": <" + values[i-1] + ">" - + "\t" + Utility.hex(i) + ": <" + values[i] + ">" - ); + for (int i = 1; i < length - 1; ++i) { + if (values[i - 1] == values[i]) { + throw new IllegalArgumentException( + "Invariant failed: values shared at " + + "\t" + + Utility.hex(i - 1) + + ": <" + + values[i - 1] + + ">" + + "\t" + + Utility.hex(i) + + ": <" + + values[i] + + ">"); } } - if (transitions[0] != 0 || transitions[length-1] != 0x110000) { + if (transitions[0] != 0 || transitions[length - 1] != 0x110000) { throw new IllegalArgumentException("Invariant failed: bounds set wrong"); } - for (int i = 1; i < length-1; ++i) { - if (transitions[i-1] >= transitions[i]) { - throw new IllegalArgumentException("Invariant failed: not monotonic" - + 
"\t" + Utility.hex(i-1) + ": " + transitions[i-1] - + "\t" + Utility.hex(i) + ": " + transitions[i] - ); + for (int i = 1; i < length - 1; ++i) { + if (transitions[i - 1] >= transitions[i]) { + throw new IllegalArgumentException( + "Invariant failed: not monotonic" + + "\t" + + Utility.hex(i - 1) + + ": " + + transitions[i - 1] + + "\t" + + Utility.hex(i) + + ": " + + transitions[i]); } } } /** - * Finds an index such that inversionList[i] <= codepoint < inversionList[i+1] - * Assumes that 0 <= codepoint <= 0x10FFFF + * Finds an index such that inversionList[i] <= codepoint < inversionList[i+1] Assumes that 0 <= + * codepoint <= 0x10FFFF + * * @param codepoint * @return the index */ @@ -189,56 +199,62 @@ private int _findIndex(int c) { private void _checkFind(int codepoint, int value) { int other = __findIndex(codepoint); if (other != value) { - throw new IllegalArgumentException("Invariant failed: binary search" - + "\t" + Utility.hex(codepoint) + ": " + value - + "\tshould be: " + other); + throw new IllegalArgumentException( + "Invariant failed: binary search" + + "\t" + + Utility.hex(codepoint) + + ": " + + value + + "\tshould be: " + + other); } } private int __findIndex(int codepoint) { - for (int i = length-1; i > 0; --i) { + for (int i = length - 1; i > 0; --i) { if (transitions[i] <= codepoint) return i; } return 0; } /* - * Try indexed lookup - - static final int SHIFT = 8; - int[] starts = new int[0x10FFFF>>SHIFT]; // lowest transition index where codepoint>>x can be found - boolean startsValid = false; - private int findIndex(int codepoint) { - if (!startsValid) { - int start = 0; - for (int i = 1; i < length; ++i) { - - } + * Try indexed lookup + + static final int SHIFT = 8; + int[] starts = new int[0x10FFFF>>SHIFT]; // lowest transition index where codepoint>>x can be found + boolean startsValid = false; + private int findIndex(int codepoint) { + if (!startsValid) { + int start = 0; + for (int i = 1; i < length; ++i) { + + } + } + for (int i = 
length-1; i > 0; --i) { + if (transitions[i] <= codepoint) return i; } - for (int i = length-1; i > 0; --i) { - if (transitions[i] <= codepoint) return i; - } - return 0; - } - */ + return 0; + } + */ /** - * Remove the items from index through index+count-1. - * Logically reduces the size of the internal arrays. + * Remove the items from index through index+count-1. Logically reduces the size of the internal + * arrays. + * * @param index * @param count */ private void _removeAt(int index, int count) { for (int i = index + count; i < length; ++i) { - transitions[i-count] = transitions[i]; - values[i-count] = values[i]; + transitions[i - count] = transitions[i]; + values[i - count] = values[i]; } length -= count; } /** - * Add a gap from index to index+count-1. - * The values there are undefined, and must be set. + * Add a gap from index to index+count-1. The values there are undefined, and must be set. * Logically grows arrays to accomodate. Actual growth is limited + * * @param index * @param count */ @@ -254,17 +270,18 @@ private void _insertGapAt(int index, int count) { transitions[i] = oldtransitions[i]; values[i] = oldvalues[i]; } - } + } for (int i = length - 1; i >= index; --i) { - transitions[i+count] = oldtransitions[i]; - values[i+count] = oldvalues[i]; + transitions[i + count] = oldtransitions[i]; + values[i + count] = oldvalues[i]; } length = newLength; } /** - * Associates code point with value. Removes any previous association. - * All code that calls this MUST check for frozen first! + * Associates code point with value. Removes any previous association. All code that calls this + * MUST check for frozen first! + * * @param codepoint * @param value * @return this, for chaining @@ -274,10 +291,9 @@ private UnicodeIntMap _put(int codepoint, int value) { // be defined such that transitions[baseIndex] < codepoint // at end of this routine. 
int baseIndex; - if (transitions[lastIndex] <= codepoint - && codepoint < transitions[lastIndex+1]) { + if (transitions[lastIndex] <= codepoint && codepoint < transitions[lastIndex + 1]) { baseIndex = lastIndex; - } else { + } else { baseIndex = _findIndex(codepoint); } int limitIndex = baseIndex + 1; @@ -287,13 +303,18 @@ private UnicodeIntMap _put(int codepoint, int value) { throw new UnsupportedOperationException("Attempt to modify locked object"); } if (errorOnReset && values[baseIndex] != UNASSIGNED) { - throw new UnsupportedOperationException("Attempt to reset value for " + Utility.hex(codepoint) - + " when that is disallowed. Old: " + values[baseIndex] + "; New: " + value); + throw new UnsupportedOperationException( + "Attempt to reset value for " + + Utility.hex(codepoint) + + " when that is disallowed. Old: " + + values[baseIndex] + + "; New: " + + value); } // adjust the available values staleAvailableValues = true; - availableValues.add(value); // add if not there already + availableValues.add(value); // add if not there already int baseCP = transitions[baseIndex]; int limitCP = transitions[limitIndex]; @@ -303,13 +324,12 @@ private UnicodeIntMap _put(int codepoint, int value) { if (baseCP == codepoint) { // CASE: At very start of range - boolean connectsWithPrevious = - baseIndex != 0 && value == values[baseIndex-1]; + boolean connectsWithPrevious = baseIndex != 0 && value == values[baseIndex - 1]; if (limitCP == codepoint + 1) { // CASE: Single codepoint range boolean connectsWithFollowing = - baseIndex < length - 2 && value == values[limitIndex]; // was -1 + baseIndex < length - 2 && value == values[limitIndex]; // was -1 if (connectsWithPrevious) { // A1a connects with previous & following, so remove index @@ -321,31 +341,31 @@ private UnicodeIntMap _put(int codepoint, int value) { --baseIndex; // fix up } else if (connectsWithFollowing) { _removeAt(baseIndex, 1); // extend following backwards - transitions[baseIndex] = codepoint; + 
transitions[baseIndex] = codepoint; } else { // doesn't connect on either side, just reset values[baseIndex] = value; } - } else if (connectsWithPrevious) { + } else if (connectsWithPrevious) { // A.1: start of multi codepoint range // if connects ++transitions[baseIndex]; // extend previous } else { // otherwise insert new transition - transitions[baseIndex] = codepoint+1; // fix following range + transitions[baseIndex] = codepoint + 1; // fix following range _insertGapAt(baseIndex, 1); values[baseIndex] = value; transitions[baseIndex] = codepoint; } } else if (limitCP == codepoint + 1) { - // CASE: at end of range + // CASE: at end of range // if connects, just back up range boolean connectsWithFollowing = - baseIndex < length - 2 && value == values[limitIndex]; // was -1 + baseIndex < length - 2 && value == values[limitIndex]; // was -1 if (connectsWithFollowing) { - --transitions[limitIndex]; - return this; + --transitions[limitIndex]; + return this; } else { _insertGapAt(limitIndex, 1); transitions[limitIndex] = codepoint; @@ -354,11 +374,11 @@ private UnicodeIntMap _put(int codepoint, int value) { } else { // CASE: in middle of range // insert gap, then set the new range - _insertGapAt(++baseIndex,2); + _insertGapAt(++baseIndex, 2); transitions[baseIndex] = codepoint; values[baseIndex] = value; - transitions[baseIndex+1] = codepoint + 1; - values[baseIndex+1] = values[baseIndex-1]; // copy lower range values + transitions[baseIndex + 1] = codepoint + 1; + values[baseIndex + 1] = values[baseIndex - 1]; // copy lower range values } lastIndex = baseIndex; // store for next time return this; @@ -375,6 +395,7 @@ private UnicodeIntMap _putAll(int startCodePoint, int endCodePoint, int value) { /** * Sets the codepoint value. + * * @param codepoint * @param value * @return this (for chaining) @@ -390,6 +411,7 @@ public UnicodeIntMap put(int codepoint, int value) { /** * Sets the codepoint value. 
+ * * @param codepoint * @param value * @return this (for chaining) @@ -402,7 +424,7 @@ public UnicodeIntMap put(String string, int value) { } if (value != UNASSIGNED) { if (stringMap == null) { - stringMap = new TreeMap(); + stringMap = new TreeMap(); } stringMap.put(string, value); staleAvailableValues = true; @@ -418,6 +440,7 @@ public UnicodeIntMap put(String string, int value) { /** * Adds bunch o' codepoints; otherwise like put. + * * @param codepoints * @param value * @return this (for chaining) @@ -436,6 +459,7 @@ public UnicodeIntMap putAll(UnicodeSet codepoints, int value) { /** * Adds bunch o' codepoints; otherwise like add. + * * @param startCodePoint * @param endCodePoint * @param value @@ -446,28 +470,32 @@ public UnicodeIntMap putAll(int startCodePoint, int endCodePoint, int value) { throw new UnsupportedOperationException("Attempt to modify locked object"); } if (startCodePoint < 0 || endCodePoint > 0x10FFFF) { - throw new IllegalArgumentException("Codepoint out of range: " - + Utility.hex(startCodePoint) + ".." + Utility.hex(endCodePoint)); + throw new IllegalArgumentException( + "Codepoint out of range: " + + Utility.hex(startCodePoint) + + ".." 
+ + Utility.hex(endCodePoint)); } return _putAll(startCodePoint, endCodePoint, value); } /** * Add all the (main) values from a UnicodeMap + * * @param unicodeMap the property to add to the map * @return this (for chaining) */ - public UnicodeIntMap putAll(UnicodeIntMap unicodeMap) { + public UnicodeIntMap putAll(UnicodeIntMap unicodeMap) { for (int i = 0; i < unicodeMap.length; ++i) { int value = unicodeMap.values[i]; if (value != UNASSIGNED) { - _putAll(unicodeMap.transitions[i], unicodeMap.transitions[i+1]-1, value); + _putAll(unicodeMap.transitions[i], unicodeMap.transitions[i + 1] - 1, value); } if (ASSERTIONS) _checkInvariants(); } if (unicodeMap.stringMap != null && !unicodeMap.stringMap.isEmpty()) { if (stringMap == null) { - stringMap = new TreeMap(); + stringMap = new TreeMap(); } stringMap.putAll(unicodeMap.stringMap); } @@ -476,12 +504,13 @@ public UnicodeIntMap putAll(UnicodeIntMap unicodeMap) { /** * Add all the (main) values from a Unicode property + * * @param prop the property to add to the map * @return this (for chaining) */ public UnicodeIntMap putAllFiltered(UnicodeIntMap prop, UnicodeSet filter) { // TODO optimize - for (UnicodeSetIterator it = new UnicodeSetIterator(filter); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(filter); it.next(); ) { if (it.codepoint != UnicodeSetIterator.IS_STRING) { int value = prop.getValue(it.codepoint); if (value != UNASSIGNED) { @@ -501,6 +530,7 @@ public UnicodeIntMap putAllFiltered(UnicodeIntMap prop, UnicodeSet filter) { /** * Set the currently unmapped Unicode code points to the given value. + * * @param value the value to set * @return this (for chaining) */ @@ -518,22 +548,21 @@ public UnicodeIntMap setMissing(int value) { } } /** - * Returns the keyset consisting of all the keys that would produce the given value. Deposits into - * result if it is not null. Remember to clear if you just want - * the new values. 
+ * Returns the keyset consisting of all the keys that would produce the given value. Deposits + * into result if it is not null. Remember to clear if you just want the new values. */ public UnicodeSet keySet(int value, UnicodeSet result) { if (result == null) result = new UnicodeSet(); for (int i = 0; i < length - 1; ++i) { if (value == values[i]) { - result.add(transitions[i], transitions[i+1]-1); - } + result.add(transitions[i], transitions[i + 1] - 1); + } } if (value != UNASSIGNED && stringMap != null) { for (String key : stringMap.keySet()) { int newValue = stringMap.get(key); if (value == newValue) { - result.add((String)key); + result.add((String) key); } } } @@ -541,22 +570,20 @@ public UnicodeSet keySet(int value, UnicodeSet result) { } /** - * Returns the keyset consisting of all the keys that would produce the given value. - * the new values. + * Returns the keyset consisting of all the keys that would produce the given value. the new + * values. */ public UnicodeSet keySet(int value) { - return keySet(value,null); + return keySet(value, null); } - - /** - * Returns the keyset consisting of all the keys that would produce (non-null) values. - */ + + /** Returns the keyset consisting of all the keys that would produce (non-null) values. */ public UnicodeSet keySet() { UnicodeSet result = new UnicodeSet(); for (int i = 0; i < length - 1; ++i) { if (values[i] != UNASSIGNED) { - result.add(transitions[i], transitions[i+1]-1); - } + result.add(transitions[i], transitions[i + 1] - 1); + } } if (stringMap != null) { result.addAll(stringMap.keySet()); @@ -565,9 +592,9 @@ public UnicodeSet keySet() { } /** - * Returns the list of possible values. Deposits each non-null value into - * result. Creates result if it is null. Remember to clear result if - * you are not appending to existing collection. + * Returns the list of possible values. Deposits each non-null value into result. Creates result + * if it is null. 
Remember to clear result if you are not appending to existing collection. + * * @param result * @return result */ @@ -592,15 +619,13 @@ public > U values(U result) { return result; } - /** - * Convenience method - */ + /** Convenience method */ public Set values() { return getAvailableValues(null); } /** - * Gets the value associated with a given code point. - * Returns null, if there is no such value. + * Gets the value associated with a given code point. Returns null, if there is no such value. + * * @param codepoint * @return the value */ @@ -612,8 +637,8 @@ public int get(int codepoint) { } /** - * Gets the value associated with a given code point. - * Returns null, if there is no such value. + * Gets the value associated with a given code point. Returns null, if there is no such value. + * * @param codepoint * @return the value */ @@ -627,11 +652,11 @@ public int get(String value) { return getValue(UTF16.charAt(value, 0)); } - /** - * Change a new string from the source string according to the mappings. - * For each code point cp, if getValue(cp) is null, append the character, otherwise append getValue(cp).toString() - * TODO: extend to strings + * Change a new string from the source string according to the mappings. For each code point cp, + * if getValue(cp) is null, append the character, otherwise append getValue(cp).toString() TODO: + * extend to strings + * * @param source * @return */ @@ -652,12 +677,15 @@ public String transform(String source) { /** * Used to add complex values, where the value isn't replaced but in some sense composed + * * @author markdavis */ public abstract static class Composer { /** - * This will be called with either a string or a code point. The result is the new value for that item. - * If the codepoint is used, the string is null; if the string is used, the codepoint is -1. + * This will be called with either a string or a code point. The result is the new value for + * that item. 
If the codepoint is used, the string is null; if the string is used, the + * codepoint is -1. + * * @param a * @param b */ @@ -673,7 +701,7 @@ public UnicodeIntMap composeWith(UnicodeIntMap other, Composer composer } public UnicodeIntMap composeWith(UnicodeSet set, int value, Composer composer) { - for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next(); ) { int i = it.codepoint; if (i == UnicodeSetIterator.IS_STRING) { String s = it.string; @@ -681,7 +709,7 @@ public UnicodeIntMap composeWith(UnicodeSet set, int value, Composer co int v3 = composer.compose(-1, s, v1, value); if (v1 != v3 && (v1 == UNASSIGNED || v1 != v3)) { put(s, v3); - } + } } else { int v1 = getValue(i); int v3 = composer.compose(i, null, v1, value); @@ -698,25 +726,28 @@ public String toString() { } public String toString(Comparator collected) { - StringBuffer result = new StringBuffer(); + StringBuffer result = new StringBuffer(); if (collected == null) { - for (int i = 0; i < length-1; ++i) { + for (int i = 0; i < length - 1; ++i) { int value = values[i]; if (value == UNASSIGNED) continue; int start = transitions[i]; - int end = transitions[i+1]-1; + int end = transitions[i + 1] - 1; result.append(Utility.hex(start)); if (start != end) result.append("-").append(Utility.hex(end)); result.append("=").append(String.valueOf(value)).append("\n"); } if (stringMap != null) { for (String s : stringMap.keySet()) { - result.append(Utility.hex(s)).append("=").append(stringMap.get(s).toString()).append("\n"); + result.append(Utility.hex(s)) + .append("=") + .append(stringMap.get(s).toString()) + .append("\n"); } } } else { Set set = values(new TreeSet(collected)); - for (Iterator it = set.iterator(); it.hasNext();) { + for (Iterator it = set.iterator(); it.hasNext(); ) { int value = it.next(); UnicodeSet s = keySet(value); result.append(value).append("=").append(s.toString()).append("\n"); @@ -731,7 +762,9 @@ public 
boolean getErrorOnReset() { return errorOnReset; } /** - * Puts the UnicodeMap into a state whereby new mappings are accepted, but changes to old mappings cause an exception. + * Puts the UnicodeMap into a state whereby new mappings are accepted, but changes to old + * mappings cause an exception. + * * @param errorOnReset The errorOnReset to set. */ public UnicodeIntMap setErrorOnReset(boolean errorOnReset) { @@ -755,11 +788,8 @@ public UnicodeIntMap freeze() { return this; } - /** - * Utility to find the maximal common prefix of two strings. - * TODO: fix supplemental support - */ - static public int findCommonPrefix(String last, String s) { + /** Utility to find the maximal common prefix of two strings. TODO: fix supplemental support */ + public static int findCommonPrefix(String last, String s) { int minLen = Math.min(last.length(), s.length()); for (int i = 0; i < minLen; ++i) { if (last.charAt(i) != s.charAt(i)) return i; @@ -768,35 +798,37 @@ static public int findCommonPrefix(String last, String s) { } /** - * Get the number of ranges; used for getRangeStart/End. The ranges together cover all of the single-codepoint keys in the UnicodeMap. Other keys can be gotten with getStrings(). + * Get the number of ranges; used for getRangeStart/End. The ranges together cover all of the + * single-codepoint keys in the UnicodeMap. Other keys can be gotten with getStrings(). */ public int getRangeCount() { - return length-1; + return length - 1; } /** - * Get the start of a range. All code points between start and end are in the UnicodeMap's keyset. + * Get the start of a range. All code points between start and end are in the UnicodeMap's + * keyset. */ public int getRangeStart(int range) { return transitions[range]; } /** - * Get the start of a range. All code points between start and end are in the UnicodeMap's keyset. + * Get the start of a range. All code points between start and end are in the UnicodeMap's + * keyset. 
*/ public int getRangeEnd(int range) { - return transitions[range+1] - 1; + return transitions[range + 1] - 1; } - /** - * Get the value for the range. - */ + /** Get the value for the range. */ public int getRangeValue(int range) { return values[range]; } /** * Get the strings that are not in the ranges. Returns null if there are none. + * * @return */ public Set getNonRangeStrings() { @@ -842,13 +874,14 @@ public boolean isEmpty() { */ public UnicodeIntMap putAll(Map map) { for (String key : map.keySet()) { - put(key,map.get(key)); + put(key, map.get(key)); } return this; } /** * Utility for extracting map + * * @deprecated */ public UnicodeIntMap putAllIn(Map map) { @@ -858,9 +891,7 @@ public UnicodeIntMap putAllIn(Map map) { return this; } - /** - * Utility for extracting map - */ + /** Utility for extracting map */ public > U putAllInto(U map) { for (EntryRange entry : entryRanges()) { if (entry.string != null) { @@ -874,9 +905,7 @@ public > U putAllInto(U map) { return map; } - /** - * Utility for extracting map - */ + /** Utility for extracting map */ public > U putAllCodepointsInto(U map) { for (EntryRange entry : entryRanges()) { if (entry.string != null) { @@ -908,10 +937,10 @@ public UnicodeIntMap remove(int key) { */ public int size() { int result = stringMap == null ? 
0 : stringMap.size(); - for (int i = 0; i < length-1; ++i) { + for (int i = 0; i < length - 1; ++i) { int value = values[i]; if (value == UNASSIGNED) continue; - result += transitions[i+1] - transitions[i]; + result += transitions[i + 1] - transitions[i]; } return result; } @@ -919,7 +948,7 @@ public int size() { /* (non-Javadoc) * @see java.util.Map#entrySet() */ - public Iterable> entrySet() { + public Iterable> entrySet() { return new EntrySetX(); } @@ -927,9 +956,10 @@ private class EntrySetX implements Iterable> { public Iterator> iterator() { return new IteratorX(); } + public String toString() { StringBuffer b = new StringBuffer(); - for (Iterator it = iterator(); it.hasNext();) { + for (Iterator it = iterator(); it.hasNext(); ) { Object item = it.next(); b.append(item.toString()).append(' '); } @@ -961,49 +991,60 @@ public Entry next() { public void remove() { throw new UnsupportedOperationException(); } - } - + /** - * Struct-like class used to iterate over a UnicodeMap in a for loop. - * If the value is a string, then codepoint == codepointEnd == -1. Otherwise the string is null; - * Caution: The contents may change during the iteration! + * Struct-like class used to iterate over a UnicodeMap in a for loop. If the value is a string, + * then codepoint == codepointEnd == -1. Otherwise the string is null; Caution: The contents may + * change during the iteration! */ public static class EntryRange { public int codepoint; public int codepointEnd; public String string; public T value; + @Override public String toString() { - return (string != null ? Utility.hex(string) - : Utility.hex(codepoint) + (codepoint == codepointEnd ? "" : ".." + Utility.hex(codepointEnd))) - + "=" + value; + return (string != null + ? Utility.hex(string) + : Utility.hex(codepoint) + + (codepoint == codepointEnd + ? "" + : ".." + Utility.hex(codepointEnd))) + + "=" + + value; } } - + /** - * Returns an Iterable over EntryRange, designed for efficient for loops over UnicodeMaps. 
- * Caution: For efficiency, the EntryRange may be reused, so the EntryRange may change on each iteration! - * The value is guaranteed never to be null. The entryRange.string values (non-null) are after all the ranges. + * Returns an Iterable over EntryRange, designed for efficient for loops over UnicodeMaps. + * Caution: For efficiency, the EntryRange may be reused, so the EntryRange may change on each + * iteration! The value is guaranteed never to be null. The entryRange.string values (non-null) + * are after all the ranges. + * * @return entry range, for for loops */ public Iterable> entryRanges() { return new EntryRanges(); } - private class EntryRanges implements Iterable>, Iterator> { + private class EntryRanges + implements Iterable>, Iterator> { private int pos; private EntryRange result = new EntryRange(); - private int lastRealRange = values[length-2] == UNASSIGNED ? length - 2 : length - 1; - private Iterator> stringIterator = stringMap == null ? null : stringMap.entrySet().iterator(); - + private int lastRealRange = values[length - 2] == UNASSIGNED ? length - 2 : length - 1; + private Iterator> stringIterator = + stringMap == null ? 
null : stringMap.entrySet().iterator(); + public Iterator> iterator() { return this; } + public boolean hasNext() { return pos < lastRealRange || (stringIterator != null && stringIterator.hasNext()); } + public EntryRange next() { // a range may be null, but then the next one must not be (except the final range) if (pos < lastRealRange) { @@ -1012,7 +1053,7 @@ public EntryRange next() { temp = values[++pos]; } result.codepoint = transitions[pos]; - result.codepointEnd = transitions[pos+1]-1; + result.codepointEnd = transitions[pos + 1] - 1; result.string = null; result.value = temp; ++pos; @@ -1024,6 +1065,7 @@ public EntryRange next() { } return result; } + public void remove() { throw new UnsupportedOperationException(); } @@ -1036,50 +1078,39 @@ public Iterator iterator() { return keySet().iterator(); } - /** - * Old form for compatibility - */ + /** Old form for compatibility */ public int getValue(String key) { return get(key); } - /** - * Old form for compatibility - */ + /** Old form for compatibility */ public int getValue(int key) { // TODO Auto-generated method stub return get(key); } - /** - * Old form for compatibility - */ + /** Old form for compatibility */ public Collection getAvailableValues() { return values(); } - /** - * Old form for compatibility - */ + /** Old form for compatibility */ public > U getAvailableValues(U result) { return values(result); } - /** - * Old form for compatibility - */ + /** Old form for compatibility */ public UnicodeSet getSet(int value) { return keySet(value); } - /** - * Old form for compatibility - */ + /** Old form for compatibility */ public UnicodeSet getSet(int value, UnicodeSet result) { return keySet(value, result); } - // This is to support compressed serialization. It works; just commented out for now as we shift to Generics + // This is to support compressed serialization. It works; just commented out for now as we shift + // to Generics // TODO Fix once generics are cleaned up. 
// // TODO Fix to serialize more than just strings. // // Only if all the items are strings will we do the following compression @@ -1094,7 +1125,7 @@ public UnicodeSet getSet(int value, UnicodeSet result) { // if (allAreString(availableVals)) { // sc.writeStringSet(new TreeSet(availableVals), object_index); // } else { - // sc.writeCollection(availableVals, object_index); + // sc.writeCollection(availableVals, object_index); // } // sc.writeUInt(length); // int lastTransition = -1; @@ -1102,7 +1133,8 @@ public UnicodeSet getSet(int value, UnicodeSet result) { // if (DEBUG_WRITE) System.out.println("Trans count: " + length); // for (int i = 0; i < length; ++i) { // int valueNumber = ((Integer)object_index.get(values[i])).intValue(); - // if (DEBUG_WRITE) System.out.println("Trans: " + transitions[i] + ",\t" + valueNumber); + // if (DEBUG_WRITE) System.out.println("Trans: " + transitions[i] + ",\t" + + // valueNumber); // // int deltaTransition = transitions[i] - lastTransition; // lastTransition = transitions[i]; @@ -1123,7 +1155,7 @@ public UnicodeSet getSet(int value, UnicodeSet result) { // } // // /** - // * + // * // */ // private boolean allAreString(Collection availableValues2) { // //if (true) return false; @@ -1141,7 +1173,7 @@ public UnicodeSet getSet(int value, UnicodeSet result) { // if (allStrings) { // valuesList = sc.readStringSet(availableValues); // } else { - // valuesList = sc.readCollection(availableValues); + // valuesList = sc.readCollection(availableValues); // } // length = sc.readUInt(); // transitions = new int[length]; @@ -1163,10 +1195,11 @@ public UnicodeSet getSet(int value, UnicodeSet result) { // deltaTransition = 1; // } // transitions[i] = currentTransition += deltaTransition; // delta value - // if (DEBUG_WRITE) System.out.println("Trans: " + transitions[i] + ",\t" + currentValue); + // if (DEBUG_WRITE) System.out.println("Trans: " + transitions[i] + ",\t" + + // currentValue); // } // } - + public final UnicodeIntMap 
removeAll(UnicodeSet set) { return putAll(set, UNASSIGNED); } @@ -1216,20 +1249,22 @@ private final UnicodeIntMap removeRetainAll(UnicodeIntMap reference, boolean rem } return putAll(toNuke, UNASSIGNED); } - + /** * Returns the keys that consist of multiple code points. + * * @return */ public final Set stringKeys() { return getNonRangeStrings(); } - + /** * Gets the inverse of this map, adding to the target. Like putAllIn + * * @return */ - public > U addInverseTo(U target) { + public > U addInverseTo(U target) { for (int value : values()) { UnicodeSet uset = getSet(value); target.put(value, uset); @@ -1239,10 +1274,11 @@ public > U addInverseTo(U target) { /** * Freeze an inverse map. + * * @param target * @return */ - public static Map freeze(Map target) { + public static Map freeze(Map target) { for (UnicodeSet entry : target.values()) { entry.freeze(); } @@ -1259,8 +1295,8 @@ public UnicodeIntMap putAllInverse(Map source) { } return this; } - - public class ImmutableEntry implements Map.Entry { + + public class ImmutableEntry implements Map.Entry { final K k; final V v; @@ -1269,9 +1305,13 @@ public ImmutableEntry(K key, V value) { v = value; } - public K getKey() {return k;} + public K getKey() { + return k; + } - public V getValue() {return v;} + public V getValue() { + return v; + } public V setValue(V value) { throw new UnsupportedOperationException(); @@ -1279,7 +1319,7 @@ public V setValue(V value) { public boolean equals(Object o) { try { - Map.Entry e = (Map.Entry)o; + Map.Entry e = (Map.Entry) o; return UnicodeMap.areEqual(e.getKey(), k) && UnicodeMap.areEqual(e.getValue(), v); } catch (ClassCastException e) { return false; @@ -1287,12 +1327,11 @@ public boolean equals(Object o) { } public int hashCode() { - return ((k==null ? 0 : k.hashCode()) ^ (v==null ? 0 : v.hashCode())); + return ((k == null ? 0 : k.hashCode()) ^ (v == null ? 
0 : v.hashCode())); } public String toString() { - return k+"="+v; + return k + "=" + v; } } - } diff --git a/unicodetools/src/main/java/org/unicode/draft/WebpageCharacterData.java b/unicodetools/src/main/java/org/unicode/draft/WebpageCharacterData.java index 9d0f6e591..95d3f20cd 100644 --- a/unicodetools/src/main/java/org/unicode/draft/WebpageCharacterData.java +++ b/unicodetools/src/main/java/org/unicode/draft/WebpageCharacterData.java @@ -1,4 +1,6 @@ package org.unicode.draft; + +import com.ibm.icu.text.UnicodeSet; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; @@ -6,76 +8,90 @@ import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.tool.LanguageCodeConverter; import org.unicode.cldr.util.Counter; import org.unicode.text.utility.Settings; -import com.ibm.icu.text.UnicodeSet; - /** * Tool to process raw character data, extracting a subset for faster processing by other tools. - * Here are the latest results of the code point frequencies for one -whole Base segment: - -http://www.corp.google.com/~erikv/unicode-count62.txt - -The 1st column is the code point. - -Then there are 3 groups of 4 columns, where each group is: - -pre-HTML code point count, post-HTML code point count, document count, UTF-8 document count - -The 1st group includes "bad" docs (error during input conversion or -contains unassigned or high private use), 2nd group excludes "bad" -docs, 3rd group is multiplied by pagerank (and excludes "bad" docs). - -Then there are up to 3 groups, where each group is: - -navboost, pagerank, language, encoding, url - -...Data/unicode-count62.txt + * Here are the latest results of the code point frequencies for one whole Base segment: + * + *

http://www.corp.google.com/~erikv/unicode-count62.txt + * + *

The 1st column is the code point. + * + *

Then there are 3 groups of 4 columns, where each group is: + * + *

pre-HTML code point count, post-HTML code point count, document count, UTF-8 document count + * + *

The 1st group includes "bad" docs (error during input conversion or contains unassigned or + * high private use), 2nd group excludes "bad" docs, 3rd group is multiplied by pagerank (and + * excludes "bad" docs). + * + *

Then there are up to 3 groups, where each group is: + * + *

navboost, pagerank, language, encoding, url + * + *

...Data/unicode-count62.txt */ public class WebpageCharacterData { - private static final UnicodeSet DEBUG_SET = new UnicodeSet(0x0020,0x0020).freeze(); - private static final String SOURCE_DATA = "Apr.11.2015.tsv"; // "unicode-count75.txt"; // "unicode-count-2012-July-21.txt"; + private static final UnicodeSet DEBUG_SET = new UnicodeSet(0x0020, 0x0020).freeze(); + private static final String SOURCE_DATA = + "Apr.11.2015.tsv"; // "unicode-count75.txt"; // "unicode-count-2012-July-21.txt"; enum Columns { - // 000009 ht 954857442 0 0 0 953577889 0 0 0 11182029595621 0 0 0 804363 56255 139 22 http://www.palmbeachschools.org/ 71269 55048 139 22 http://www.palmbeachschools.org/jobs/ 50871 54366 139 22 http://rtghaiti.com/ + // 000009 ht 954857442 0 0 0 953577889 0 0 0 11182029595621 + // 0 0 0 804363 56255 139 22 http://www.palmbeachschools.org/ 71269 + // 55048 139 22 http://www.palmbeachschools.org/jobs/ 50871 54366 139 + // 22 http://rtghaiti.com/ codePoint, language, - preHtmlCount1, postHtmlCount1, documentCount1, utf8DocumentCount1, - preHtmlCount2, postHtmlCount2, documentCount2, utf8DocumentCount2, - preHtmlCount3, postHtmlCount3, documentCount3, utf8DocumentCount3; + preHtmlCount1, + postHtmlCount1, + documentCount1, + utf8DocumentCount1, + preHtmlCount2, + postHtmlCount2, + documentCount2, + utf8DocumentCount2, + preHtmlCount3, + postHtmlCount3, + documentCount3, + utf8DocumentCount3; static String[] parts; + public static void set(String line) { parts = line.split("\t"); } + public String get() { return parts[ordinal()]; } + @Override public String toString() { return name() + "(" + get() + ")"; } } - private static Map> lang2chars = new HashMap>(); - private static Map> lang2charsPageRank = new HashMap>(); + private static Map> lang2chars = + new HashMap>(); + private static Map> lang2charsPageRank = + new HashMap>(); public static void main(String[] args) throws IOException { doData(); System.out.println("DONE"); } - static public void doData() throws 
IOException { - final BufferedReader in = FileUtilities.openUTF8Reader( - Settings.Output.GEN_DIR + "frequency/", SOURCE_DATA); + public static void doData() throws IOException { + final BufferedReader in = + FileUtilities.openUTF8Reader(Settings.Output.GEN_DIR + "frequency/", SOURCE_DATA); int lineCounter = 0; final int zeroCountLines = 0; - final HashMap langSeen = new HashMap(); + final HashMap langSeen = new HashMap(); while (true) { final String line = in.readLine(); if (line == null) { @@ -101,7 +117,6 @@ static public void doData() throws IOException { } } - final long good = Long.parseLong(Columns.postHtmlCount2.get()); addToCounter(lang2chars, lang, codePoint, good); addToCounter(lang2chars, "mul", codePoint, good); @@ -111,15 +126,14 @@ static public void doData() throws IOException { } in.close(); System.out.println("Writing data"); - //System.out.println("zeroCountLines " + zeroCountLines); - writeData(lang2chars, Settings.Output.GEN_DIR + - "frequency/languages"); + // System.out.println("zeroCountLines " + zeroCountLines); + writeData(lang2chars, Settings.Output.GEN_DIR + "frequency/languages"); System.out.println("Writing ranked data"); - writeData(lang2charsPageRank, Settings.Output.GEN_DIR + - "frequency/languages-rank"); + writeData(lang2charsPageRank, Settings.Output.GEN_DIR + "frequency/languages-rank"); } - public static void writeData(Map> map, String directory) throws IOException { + public static void writeData(Map> map, String directory) + throws IOException { final Counter totalLang = new Counter(); final Counter totalLangChars = new Counter(); for (final Entry> entry : map.entrySet()) { @@ -132,7 +146,10 @@ public static void writeData(Map> map, String directory final long count = counter.getCount(cp); totalCount += count; totalChars += 1; - out.println(com.ibm.icu.impl.Utility.hex(cp) + " ; " + count); // + " # " + UCharacter.getExtendedName(cp)); + out.println( + com.ibm.icu.impl.Utility.hex(cp) + + " ; " + + count); // + " # " + 
UCharacter.getExtendedName(cp)); } totalLang.add(lang, totalCount); totalLangChars.add(lang, totalChars); @@ -145,7 +162,8 @@ public static void writeData(Map> map, String directory } } - public static void addToCounter(Map> map, String lang, int codePoint, long post) { + public static void addToCounter( + Map> map, String lang, int codePoint, long post) { if (post == 0) { return; } diff --git a/unicodetools/src/main/java/org/unicode/draft/WordSolver.java b/unicodetools/src/main/java/org/unicode/draft/WordSolver.java index 276636707..293c23b49 100644 --- a/unicodetools/src/main/java/org/unicode/draft/WordSolver.java +++ b/unicodetools/src/main/java/org/unicode/draft/WordSolver.java @@ -1,4 +1,5 @@ package org.unicode.draft; + import java.io.PrintWriter; import java.io.StringWriter; import java.util.ArrayList; @@ -57,7 +58,7 @@ public WordSolver(String[] itemsIn) { shared[i] = new int[count]; for (int j = 0; j < count; ++j) { final int share = shared[i][j] = countShared(i, j); - bits[i] |= (1<= 0) { - line = line.substring(0,pos2); + line = line.substring(0, pos2); } line = line.trim(); - if (line.length() == 0) {continue;} + if (line.length() == 0) { + continue; + } final String[] parts = line.split("\\s*;\\s*"); - // 00A1..00A7 ; valid ; ; NV8 # 1.1 INVERTED EXCLAMATION MARK..SECTION SIGN + // 00A1..00A7 ; valid ; ; NV8 # 1.1 INVERTED EXCLAMATION + // MARK..SECTION SIGN try { final int pos = parts[0].indexOf(".."); int start, end; if (pos < 0) { start = end = Integer.parseInt(parts[0], 16); } else { - start = Integer.parseInt(parts[0].substring(0,pos), 16); - end = Integer.parseInt(parts[0].substring(pos+2), 16); + start = Integer.parseInt(parts[0].substring(0, pos), 16); + end = Integer.parseInt(parts[0].substring(pos + 2), 16); } final String status = parts[1]; final String nv8 = parts.length >= 4 ? 
parts[3] : null; - if (status.equals("valid") - && !"NV8".equals(nv8)) { + if (status.equals("valid") && !"NV8".equals(nv8)) { uts46.addAll(start, end); } utsData.putAll(start, end, allButFirst(parts)); @@ -63,12 +67,16 @@ public static void main(String[] args) { throw new IllegalArgumentException(line, e); } } - //UnicodeMap gc = iup.load(UcdProperty.General_Category); + // UnicodeMap gc = iup.load(UcdProperty.General_Category); // gc.getSet(PropertyValues.General_Category_Values.Unassigned.toString()); final UnicodeSet cn = new UnicodeSet(); if (!uts46.equals(patriks2008)) { - show("\nUTS46 - IDNA2008\n", new UnicodeSet(uts46).removeAll(patriks2008).removeAll(cn)); - show("\nIDNA2008 - UTS46\n", new UnicodeSet(patriks2008).removeAll(uts46).removeAll(cn)); + show( + "\nUTS46 - IDNA2008\n", + new UnicodeSet(uts46).removeAll(patriks2008).removeAll(cn)); + show( + "\nIDNA2008 - UTS46\n", + new UnicodeSet(patriks2008).removeAll(uts46).removeAll(cn)); } } @@ -90,8 +98,8 @@ private static void show(String string, UnicodeSet diff) { final String p = patriksData.get(cp); final String u = utsData.get(cp); final String name = iup.getResolvedValue(UcdProperty.Name, cp); - System.out.println(Utility.hex(s) + "\t" + name + "\n\tIDNA2008:\t" + p + "\n\tUTS46:\t\t" + u); + System.out.println( + Utility.hex(s) + "\t" + name + "\n\tIDNA2008:\t" + p + "\n\tUTS46:\t\t" + u); } - } } diff --git a/unicodetools/src/main/java/org/unicode/idna/CompareCompatProperties.java b/unicodetools/src/main/java/org/unicode/idna/CompareCompatProperties.java index 7ea965f5a..49152c6b6 100644 --- a/unicodetools/src/main/java/org/unicode/idna/CompareCompatProperties.java +++ b/unicodetools/src/main/java/org/unicode/idna/CompareCompatProperties.java @@ -1,35 +1,33 @@ package org.unicode.idna; +import com.google.common.base.CharMatcher; +import com.google.common.base.Objects; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableList; +import 
com.google.common.collect.ImmutableMap; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; - import org.unicode.cldr.draft.FileUtilities; -import com.google.common.base.CharMatcher; -import com.google.common.base.Objects; -import com.google.common.base.Splitter; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; - public class CompareCompatProperties { /* -# Column 1: source - The source string to be tested -# Column 2: toUnicode - The result of applying toUnicode to the source, with Transitional_Processing=false. -# A blank value means the same as the source value. -# Column 3: toUnicodeStatus - A set of status codes, each corresponding to a particular test. -# A blank value means [] (no errors). -# Column 4: toAsciiN - The result of applying toASCII to the source, with Transitional_Processing=false. -# A blank value means the same as the toUnicode value. -# Column 5: toAsciiNStatus - A set of status codes, each corresponding to a particular test. -# A blank value means the same as the toUnicodeStatus value. An explicit [] means no errors. -# Column 6: toAsciiT - The result of applying toASCII to the source, with Transitional_Processing=true. -# A blank value means the same as the toAsciiN value. -# Column 7: toAsciiTStatus - A set of status codes, each corresponding to a particular test. -# A blank value means the same as the toAsciiNStatus value. An explicit [] means no errors. - */ + # Column 1: source - The source string to be tested + # Column 2: toUnicode - The result of applying toUnicode to the source, with Transitional_Processing=false. + # A blank value means the same as the source value. + # Column 3: toUnicodeStatus - A set of status codes, each corresponding to a particular test. + # A blank value means [] (no errors). 
+ # Column 4: toAsciiN - The result of applying toASCII to the source, with Transitional_Processing=false. + # A blank value means the same as the toUnicode value. + # Column 5: toAsciiNStatus - A set of status codes, each corresponding to a particular test. + # A blank value means the same as the toUnicodeStatus value. An explicit [] means no errors. + # Column 6: toAsciiT - The result of applying toASCII to the source, with Transitional_Processing=true. + # A blank value means the same as the toAsciiN value. + # Column 7: toAsciiTStatus - A set of status codes, each corresponding to a particular test. + # A blank value means the same as the toAsciiNStatus value. An explicit [] means no errors. + */ enum Column { source, @@ -46,23 +44,27 @@ enum Column { private Column() { this(null); } + private Column(Column getFrom) { this.getFrom = getFrom; } + public Column next() { - return values()[ordinal()+1]; + return values()[ordinal() + 1]; } + public Column addFixedAndNext(String item, List lineParts) { if (item.isEmpty()) { if (getFrom != null) { item = lineParts.get(getFrom.ordinal()); } else if (this == toUnicodeStatus) { - item ="[]"; + item = "[]"; } } lineParts.add(item); return next(); } + static boolean equalsIgnoringErrorDiffs(List a, List b) { if (a == b) { return true; @@ -76,13 +78,13 @@ static boolean equalsIgnoringErrorDiffs(List a, List b) { String itemA = a.get(col.ordinal()); String itemB = b.get(col.ordinal()); if (!itemA.equals(itemB)) { - switch(col) { - case toUnicodeStatus: - case toAsciiNStatus: - case toAsciiTStatus: - return a.equals("[]") == b.equals("[]"); - default: - return false; + switch (col) { + case toUnicodeStatus: + case toAsciiNStatus: + case toAsciiTStatus: + return a.equals("[]") == b.equals("[]"); + default: + return false; } } } @@ -91,7 +93,7 @@ static boolean equalsIgnoringErrorDiffs(List a, List b) { } static final Splitter semiSplitter = Splitter.on(';').trimResults(CharMatcher.anyOf(" \t")); - + public static void 
main(String[] args) { Map> oldFile = fleshOut("internal-IdnaTestV2-fixed.txt"); Map> newFile = fleshOut("IdnaTestV2.txt"); diff --git a/unicodetools/src/main/java/org/unicode/idna/FilteredUnicodeTransform.java b/unicodetools/src/main/java/org/unicode/idna/FilteredUnicodeTransform.java index 0faac4931..8e1f64683 100644 --- a/unicodetools/src/main/java/org/unicode/idna/FilteredUnicodeTransform.java +++ b/unicodetools/src/main/java/org/unicode/idna/FilteredUnicodeTransform.java @@ -1,17 +1,18 @@ package org.unicode.idna; -import org.unicode.text.utility.UnicodeTransform; - import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSet.SpanCondition; +import org.unicode.text.utility.UnicodeTransform; public class FilteredUnicodeTransform extends UnicodeTransform { final UnicodeSet transformOnly; final UnicodeTransform baseTransform; + public FilteredUnicodeTransform(UnicodeTransform transform, UnicodeSet unicodeSet) { baseTransform = transform; transformOnly = unicodeSet; } + @Override public String transform(String source) { final StringBuilder builder = new StringBuilder(); @@ -33,7 +34,6 @@ public String transform(String source) { if (end == source.length()) { break; } - } return builder.toString(); } diff --git a/unicodetools/src/main/java/org/unicode/idna/GenerateIdna.java b/unicodetools/src/main/java/org/unicode/idna/GenerateIdna.java index f979fee2e..017e6de4b 100644 --- a/unicodetools/src/main/java/org/unicode/idna/GenerateIdna.java +++ b/unicodetools/src/main/java/org/unicode/idna/GenerateIdna.java @@ -1,43 +1,43 @@ package org.unicode.idna; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.impl.Row.R3; +import com.ibm.icu.text.DateFormat; +import com.ibm.icu.text.SimpleDateFormat; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.TimeZone; +import com.ibm.icu.util.ULocale; import 
java.io.IOException; import java.io.PrintWriter; import java.util.Arrays; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; +import org.unicode.idna.Idna.IdnaType; import org.unicode.props.BagFormatter; import org.unicode.props.BagFormatter.NameLabel; -import org.unicode.props.UnicodeProperty; -import org.unicode.props.UnicodeProperty.UnicodeMapProperty; -import org.unicode.idna.Idna.IdnaType; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues.Age_Values; +import org.unicode.props.UnicodeProperty; +import org.unicode.props.UnicodeProperty.UnicodeMapProperty; import org.unicode.text.UCD.Default; import org.unicode.text.UCD.ToolUnicodePropertySource; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.impl.Row.R3; -import com.ibm.icu.text.DateFormat; -import com.ibm.icu.text.SimpleDateFormat; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.TimeZone; -import com.ibm.icu.util.ULocale; - public class GenerateIdna { static { // MUST BE FIRST GenerateIdnaTest.setUnicodeVersion(); } - public static final String GEN_IDNA_DIR = Settings.Output.GEN_DIR + "idna/" + Default.ucdVersion() + "/"; + + public static final String GEN_IDNA_DIR = + Settings.Output.GEN_DIR + "idna/" + Default.ucdVersion() + "/"; // Utility.WORKSPACE_DIRECTORY + "draft/reports/tr46/data"; private static final int MAX_STATUS_LENGTH = "disallowed_STD3_mapped".length(); @@ -50,22 +50,27 @@ public class GenerateIdna { static ToolUnicodePropertySource properties; static UnicodeSet cn; static UnicodeSet bidiControls; - public static DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss 'GMT'", ULocale.US); + public static DateFormat 
dateFormat = + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss 'GMT'", ULocale.US); + static { dateFormat.setTimeZone(TimeZone.getTimeZone("GMT")); } + static UnicodeSet IDNA2008Valid = GenerateIdnaTest.getIdna2008Valid(); public static void main(String[] args) throws IOException { System.setProperty("line.separator", "\n"); - switch (args.length){ - case 0: - break; - case 1: - Default.setUCD(args[0]); - break; - default: throw new IllegalArgumentException("Only single argument allowed:\t" + Arrays.asList(args)); + switch (args.length) { + case 0: + break; + case 1: + Default.setUCD(args[0]); + break; + default: + throw new IllegalArgumentException( + "Only single argument allowed:\t" + Arrays.asList(args)); } U32 = new UnicodeSet("[:age=3.2:]").freeze(); @@ -75,8 +80,6 @@ public static void main(String[] args) throws IOException { cn = properties.getSet("gc=Cn").freeze(); bidiControls = properties.getSet("bidi_control=true"); - - final UnicodeMap> mappingTable = createMappingTable(true); final UnicodeMap> mappingTableNSTD3 = createMappingTable(false); { @@ -106,16 +109,24 @@ public static void main(String[] args) throws IOException { rawStatus.put(cp, status); Age_Values age = AGE.get(cp); if (age.compareTo(Age_Values.V3_2) > 0 && age != Age_Values.Unassigned) { - filteredStatus.put(cp, status == IdnaType.deviation || status == IdnaType.ignored ? IdnaType.mapped : status); + filteredStatus.put( + cp, + status == IdnaType.deviation || status == IdnaType.ignored + ? IdnaType.mapped + : status); } final IdnaType statusNstd3 = valueNstd3.get0(); - String endStatus = statusNstd3 == status ? status.toString() : status + "_STD3_" + statusNstd3; + String endStatus = + statusNstd3 == status ? 
status.toString() : status + "_STD3_" + statusNstd3; final String mapping = value.get1(); final String mappingNstd3 = valueNstd3.get1(); // if mapped, add info - if (status == IdnaType.mapped || status == IdnaType.deviation || statusNstd3 == IdnaType.mapped || statusNstd3 == IdnaType.deviation) { - endStatus += Utility.repeat(" ", MAX_STATUS_LENGTH-endStatus.length()) + " ; "; + if (status == IdnaType.mapped + || status == IdnaType.deviation + || statusNstd3 == IdnaType.mapped + || statusNstd3 == IdnaType.deviation) { + endStatus += Utility.repeat(" ", MAX_STATUS_LENGTH - endStatus.length()) + " ; "; if (mapping != null && mapping.length() != 0) { endStatus += Utility.hex(mapping); } else if (mappingNstd3 != null && mappingNstd3.length() != 0) { @@ -128,27 +139,31 @@ public static void main(String[] args) throws IOException { } if (status == IdnaType.valid && !IDNA2008Valid.contains(cp) && cp != '.') { - endStatus += Utility.repeat(" ", MAX_STATUS_LENGTH-endStatus.length()) + " ; ; NV8"; + endStatus += + Utility.repeat(" ", MAX_STATUS_LENGTH - endStatus.length()) + + " ; ; NV8"; } else if (Idna2008.GRANDFATHERED_VALID.contains(cp)) { - endStatus += Utility.repeat(" ", MAX_STATUS_LENGTH-endStatus.length()) + " ; ; XV8"; + endStatus += + Utility.repeat(" ", MAX_STATUS_LENGTH - endStatus.length()) + + " ; ; XV8"; } stringMappingTable.put(cp, endStatus); if (!U32.contains(cp) && !UNASSIGNED.contains(cp)) { - switch(status) { - case mapped: - case ignored: - case deviation: - mappedSet.add(cp); - break; - case disallowed: - disallowedSet.add(cp); - break; - case valid: - validSet.add(cp); - break; - default: - throw new IllegalAccessError(); + switch (status) { + case mapped: + case ignored: + case deviation: + mappedSet.add(cp); + break; + case disallowed: + disallowedSet.add(cp); + break; + case valid: + validSet.add(cp); + break; + default: + throw new IllegalAccessError(); } } } @@ -160,8 +175,10 @@ public static void main(String[] args) throws IOException { 
showAge(rawStatus); showAge("ValidSet", validSet); - // System.out.println("After running, copy the data file to the jsp directory, and run org.unicode.jsptest.TestGenerate to generate the differences table.\n" + - // "Then run org.unicode.jsptest.TestUt6s46 with the argument 'generate' to generate the tests."); + // System.out.println("After running, copy the data file to the jsp directory, and + // run org.unicode.jsptest.TestGenerate to generate the differences table.\n" + + // "Then run org.unicode.jsptest.TestUt6s46 with the argument 'generate' to generate + // the tests."); final UnicodeSet IDNA2008Disallowed = new UnicodeSet(IDNA2008Valid).complement().freeze(); @@ -171,9 +188,15 @@ public static void main(String[] args) throws IOException { // showSet("Mapped/Ignored\tDisallowed", mappedSet, IDNA2008Disallowed); showSet("Valid\tValid", filteredStatus.getSet(IdnaType.valid), IDNA2008Valid); - showSet("Disallowed\tDisallowed", filteredStatus.getSet(IdnaType.disallowed), IDNA2008Disallowed); + showSet( + "Disallowed\tDisallowed", + filteredStatus.getSet(IdnaType.disallowed), + IDNA2008Disallowed); showSet("Valid\tDisallowed", filteredStatus.getSet(IdnaType.valid), IDNA2008Disallowed); - showSet("Mapped/Ignored\tDisallowed", filteredStatus.getSet(IdnaType.mapped), IDNA2008Disallowed); + showSet( + "Mapped/Ignored\tDisallowed", + filteredStatus.getSet(IdnaType.mapped), + IDNA2008Disallowed); int missing = 0; for (Age_Values age : AGE.values()) { @@ -189,6 +212,7 @@ public static void main(String[] args) throws IOException { static final IndexUnicodeProperties IUP = IndexUnicodeProperties.make(); static final UnicodeMap AGE = IUP.loadEnum(UcdProperty.Age, Age_Values.class); static final UnicodeSet UNASSIGNED = AGE.getSet(Age_Values.Unassigned); + static { for (Age_Values age : AGE.values()) { UnicodeSet ageSet = AGE.getSet(age); @@ -199,7 +223,7 @@ public static void main(String[] args) throws IOException { private static void showSet(String title, UnicodeSet 
validSet, UnicodeSet iDNA2008Valid2) { final UnicodeSet intersect = new UnicodeSet(validSet).retainAll(iDNA2008Valid2); System.out.println(intersect.size() + "\t" + title + intersect.toPattern(false)); - //showAge(title, intersect); + // showAge(title, intersect); } private static void showAge(String title, final UnicodeSet intersect) { @@ -207,7 +231,14 @@ private static void showAge(String title, final UnicodeSet intersect) { UnicodeSet ageSet = AGE.getSet(age); if (intersect.containsSome(ageSet)) { UnicodeSet intersect2 = new UnicodeSet(ageSet).retainAll(intersect); - System.out.println("\t" + age + "\t" + intersect2.size() + "\t" + title + intersect2.toPattern(false)); + System.out.println( + "\t" + + age + + "\t" + + intersect2.size() + + "\t" + + title + + intersect2.toPattern(false)); } } } @@ -221,15 +252,24 @@ private static void showAge(final UnicodeMap map) { UnicodeSet intersect = map.getSet(idnaType); if (intersect.containsSome(ageSet)) { UnicodeSet intersect2 = new UnicodeSet(ageSet).retainAll(intersect); - System.out.println("\t" + idnaType + "\t" + intersect2.size() + intersect2.toPattern(false)); + System.out.println( + "\t" + + idnaType + + "\t" + + intersect2.size() + + intersect2.toPattern(false)); } } } } - private static void verifyDifferences(UnicodeMap mappings, UnicodeMap types, UnicodeMap> mappingTable) { + private static void verifyDifferences( + UnicodeMap mappings, + UnicodeMap types, + UnicodeMap> mappingTable) { System.out.println("Verifying Differences"); - final UnicodeMap, R2>> diff = new UnicodeMap, R2>>(); + final UnicodeMap, R2>> diff = + new UnicodeMap, R2>>(); for (int i = 0; i <= 0x10FFFF; ++i) { if (!U32.contains(i)) { continue; @@ -237,20 +277,35 @@ private static void verifyDifferences(UnicodeMap mappings, UnicodeMap data46 = mappingTable.get(i); final R2 data2003 = Row.of(types.get(i), mappings.get(i)); if (!equals(data46, data2003)) { - diff.put(i, Row.of( - (data2003.get0() == IdnaType.disallowed ? 
"D" : "-") - + (data46.get0() == IdnaType.disallowed ? "D" : "-") - , data2003, data46)); - //System.out.println(Utility.hex(i) + " - ust46: " + data46 + "\t idna2003: " + data2003 + "\t" + UCharacter.getExtendedName(i) + status); + diff.put( + i, + Row.of( + (data2003.get0() == IdnaType.disallowed ? "D" : "-") + + (data46.get0() == IdnaType.disallowed ? "D" : "-"), + data2003, + data46)); + // System.out.println(Utility.hex(i) + " - ust46: " + data46 + "\t idna2003: " + + // data2003 + "\t" + UCharacter.getExtendedName(i) + status); } } - for (final R3, R2> item : new TreeSet, R2>>(diff.values())) { + for (final R3, R2> item : + new TreeSet, R2>>( + diff.values())) { final UnicodeSet set = diff.getSet(item); final String ok = item.get0(); final R2 data2003 = item.get1(); final R2 data46 = item.get2(); - System.out.println(ok + "\tidna2003: " + data2003 + "\tust46: " + data46 + "\t" + set.size() + "\t" + set); + System.out.println( + ok + + "\tidna2003: " + + data2003 + + "\tust46: " + + data46 + + "\t" + + set.size() + + "\t" + + set); } } @@ -271,71 +326,89 @@ private static UnicodeMap> createMappingTable(boolean S baseMapping.putAll(bidiControls, null); baseMapping.freeze(); - - final UnicodeSet labelSeparator = new UnicodeSet("[\\u002E \\uFF0E \\u3002 \\uFF61]").freeze(); - - final UnicodeSet baseValidSet = new UnicodeSet(0,0x10FFFF) - .removeAll(properties.getSet("Changes_When_NFKC_Casefolded=true")) - .removeAll(properties.getSet("gc=Cc")) - .removeAll(properties.getSet("gc=Cf")) - .removeAll(cn) - .removeAll(properties.getSet("gc=Co")) - .removeAll(properties.getSet("gc=Cs")) - .removeAll(properties.getSet("gc=Zl")) - .removeAll(properties.getSet("gc=Zp")) - .removeAll(properties.getSet("gc=Zs")) - .removeAll(properties.getSet("Block=Ideographic_Description_Characters")) - .removeAll(new UnicodeSet("[\\u0000-\\u007F]")) - //.addAll(0x200c, 0x200d) - .addAll(STD3 ? 
VALID_ASCII : NSTD3_ASCII).freeze() - ; - - System.out.println(STD3 + " Base Valid Set & nfkcqc=n" + new UnicodeSet("[:nfkcqc=n:]").retainAll(baseValidSet)); - - final R2 baseExclusionSetInfo = computeBaseExclusionSet(baseMapping, baseValidSet, STD3); + final UnicodeSet labelSeparator = + new UnicodeSet("[\\u002E \\uFF0E \\u3002 \\uFF61]").freeze(); + + final UnicodeSet baseValidSet = + new UnicodeSet(0, 0x10FFFF) + .removeAll(properties.getSet("Changes_When_NFKC_Casefolded=true")) + .removeAll(properties.getSet("gc=Cc")) + .removeAll(properties.getSet("gc=Cf")) + .removeAll(cn) + .removeAll(properties.getSet("gc=Co")) + .removeAll(properties.getSet("gc=Cs")) + .removeAll(properties.getSet("gc=Zl")) + .removeAll(properties.getSet("gc=Zp")) + .removeAll(properties.getSet("gc=Zs")) + .removeAll(properties.getSet("Block=Ideographic_Description_Characters")) + .removeAll(new UnicodeSet("[\\u0000-\\u007F]")) + // .addAll(0x200c, 0x200d) + .addAll(STD3 ? VALID_ASCII : NSTD3_ASCII) + .freeze(); + + System.out.println( + STD3 + + " Base Valid Set & nfkcqc=n" + + new UnicodeSet("[:nfkcqc=n:]").retainAll(baseValidSet)); + + final R2 baseExclusionSetInfo = + computeBaseExclusionSet(baseMapping, baseValidSet, STD3); final UnicodeSet disallowedExclusionSet = baseExclusionSetInfo.get0(); final UnicodeSet mappingChanged = baseExclusionSetInfo.get1(); - final UnicodeSet baseExclusionSet = new UnicodeSet(disallowedExclusionSet).addAll(mappingChanged); - final UnicodeSet baseExclusionSet2 = new UnicodeSet("[" + - "\\u04C0 \\u10A0-\\u10C5 \\u2132 \\u2183" + - "\\U0002F868 \\U0002F874 \\U0002F91F \\U0002F95F \\U0002F9BF" + - "\u3164 \uFFA0 \u115F \u1160 \u17B4 \u17B5 \u1806 \uFFFC \uFFFD" + - "[\\u200E\\u200F\\u202A-\\u202E\\u2061-\\u2063\\u206A-\\u206F\\U0001D173-\\U0001D17A\\U000E0001\\U000E0020-\\U000E007F]" + - "[\u200B\u2060\uFEFF]" + - "]").freeze(); //.addAll(cn) + final UnicodeSet baseExclusionSet = + new UnicodeSet(disallowedExclusionSet).addAll(mappingChanged); + final 
UnicodeSet baseExclusionSet2 = + new UnicodeSet( + "[" + + "\\u04C0 \\u10A0-\\u10C5 \\u2132 \\u2183" + + "\\U0002F868 \\U0002F874 \\U0002F91F \\U0002F95F \\U0002F9BF" + + "\u3164 \uFFA0 \u115F \u1160 \u17B4 \u17B5 \u1806 \uFFFC \uFFFD" + + "[\\u200E\\u200F\\u202A-\\u202E\\u2061-\\u2063\\u206A-\\u206F\\U0001D173-\\U0001D17A\\U000E0001\\U000E0020-\\U000E007F]" + + "[\u200B\u2060\uFEFF]" + + "]") + .freeze(); // .addAll(cn) System.out.println(STD3 + " base valid set:\t" + baseValidSet); - System.out.println(STD3 + " computed base exclusion disallowed:\t" + disallowedExclusionSet); + System.out.println( + STD3 + " computed base exclusion disallowed:\t" + disallowedExclusionSet); System.out.println(STD3 + " computed base exclusion mapping changed:\t" + mappingChanged); if (false && !baseExclusionSet.equals(baseExclusionSet2)) { - System.out.println("computed-static:\t" + new UnicodeSet(baseExclusionSet).removeAll(baseExclusionSet2)); - System.out.println("static-computed:\t" + new UnicodeSet(baseExclusionSet2).removeAll(baseExclusionSet)); + System.out.println( + "computed-static:\t" + + new UnicodeSet(baseExclusionSet).removeAll(baseExclusionSet2)); + System.out.println( + "static-computed:\t" + + new UnicodeSet(baseExclusionSet2).removeAll(baseExclusionSet)); throw new IllegalArgumentException(); } - System.out.println(STD3 + " ***Overlap with baseValidSet and baseExclusionSet:\t" + new UnicodeSet( - baseValidSet).retainAll(baseExclusionSet)); + System.out.println( + STD3 + + " ***Overlap with baseValidSet and baseExclusionSet:\t" + + new UnicodeSet(baseValidSet).retainAll(baseExclusionSet)); - final UnicodeSet deviationSet = new UnicodeSet("[\u200C \u200D \u00DF \u03C2]").freeze(); // \u200C \u200D + final UnicodeSet deviationSet = + new UnicodeSet("[\u200C \u200D \u00DF \u03C2]").freeze(); // \u200C \u200D /** - * 1. If the code point is in the deviation set the status is deviation and - * the mapping value is the base mapping value for that code point
- * 2. Otherwise, if (a) the code point is in the base exclusion set, or if - * (b) any code point in its base mapping value is not in the base valid set - * the status is disallowed and there is no mapping value in the table
- * 3. Otherwise, if the base mapping value is an empty string the status is - * ignored and there is no mapping value in the table
- * 4. Otherwise, if the base mapping value is the same as the code point the - * status is valid and there is no mapping value in the table
- * 5. Otherwise, the status is mapping and the mapping value is the base - * mapping value for that code point + * 1. If the code point is in the deviation set the status is deviation and the mapping + * value is the base mapping value for that code point
+ * 2. Otherwise, if (a) the code point is in the base exclusion set, or if (b) any code + * point in its base mapping value is not in the base valid set the status is disallowed and + * there is no mapping value in the table
+ * 3. Otherwise, if the base mapping value is an empty string the status is ignored and + * there is no mapping value in the table
+ * 4. Otherwise, if the base mapping value is the same as the code point the status is valid + * and there is no mapping value in the table
+ * 5. Otherwise, the status is mapping and the mapping value is the base mapping value for + * that code point */ - final UnicodeMap> mappingTable = new UnicodeMap>(); - final R2 disallowedResult = Row.of(IdnaType.disallowed, (String)null); - final R2 ignoredResult = Row.of(IdnaType.ignored, (String)null); - final R2 validResult = Row.of(IdnaType.valid, (String)null); + final UnicodeMap> mappingTable = + new UnicodeMap>(); + final R2 disallowedResult = Row.of(IdnaType.disallowed, (String) null); + final R2 ignoredResult = Row.of(IdnaType.ignored, (String) null); + final R2 validResult = Row.of(IdnaType.valid, (String) null); for (int cp = 0; cp <= 0x10FFFF; ++cp) { if (TESTING && cp == 0x10C7) { @@ -352,7 +425,8 @@ private static UnicodeMap> createMappingTable(boolean S } else if (baseExclusionSet.contains(cp) || false && bidiControls.contains(cp)) { // Step 5. result = disallowedResult; - } else if (!labelSeparator.contains(cp) && !baseValidSet.containsAll(baseMappingValue)) { + } else if (!labelSeparator.contains(cp) + && !baseValidSet.containsAll(baseMappingValue)) { result = disallowedResult; } else if (cn.contains(cp)) { // do this in a different order just for debuggin result = disallowedResult; @@ -363,7 +437,7 @@ private static UnicodeMap> createMappingTable(boolean S } else { result = Row.of(IdnaType.mapped, baseMappingValue); } - //if (0==(cp&0xFFF)) System.out.println(cp + " = " + result); + // if (0==(cp&0xFFF)) System.out.println(cp + " = " + result); mappingTable.put(cp, result); } final UnicodeSet excluded = new UnicodeSet(); @@ -378,8 +452,11 @@ private static UnicodeMap> createMappingTable(boolean S excluded.add(valid); } } - final UnicodeSet mappedSet = new UnicodeSet(0,0x10FFFF).removeAll(validSet) - .removeAll(disallowedSet).removeAll(ignoredSet); + final UnicodeSet mappedSet = + new UnicodeSet(0, 0x10FFFF) + .removeAll(validSet) + .removeAll(disallowedSet) + .removeAll(ignoredSet); for (final String mapped : mappedSet) { final R2 mappedValue = 
mappingTable.get(mapped); final String mapResult = mappedValue.get1(); @@ -404,44 +481,44 @@ private static UnicodeMap> createMappingTable(boolean S return mappingTable.freeze(); } - - private static R2 computeBaseExclusionSet(UnicodeMap baseMapping, UnicodeSet baseValidSet, boolean STD3) { + private static R2 computeBaseExclusionSet( + UnicodeMap baseMapping, UnicodeSet baseValidSet, boolean STD3) { final Idna Idna2003Data = STD3 ? Idna2003.SINGLETON : Idna2003.SINGLETON_NSTD3; final UnicodeSet disallowed = new UnicodeSet(); final UnicodeSet mappingChanged = new UnicodeSet(); - for (final UnicodeSetIterator it = new UnicodeSetIterator(U32); it.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(U32); it.next(); ) { final int i = it.codepoint; if (TESTING && i == 0x41) { System.out.println("??TEST??"); } final IdnaType type = Idna2003Data.types.get(i); switch (type) { - case disallowed: - if (baseValidSet.contains(i)) { - disallowed.add(i); + case disallowed: + if (baseValidSet.contains(i)) { + disallowed.add(i); + break; + } + final String base2 = baseMapping.get(i); + if (base2 != null && baseValidSet.containsAll(base2)) { + disallowed.add(i); + } + break; + default: + String idna2003 = Idna2003Data.mappings.get(i); + String base = baseMapping.get(i); + if (base == idna2003) { + continue; + } + if (base == null) { + base = UTF16.valueOf(i); + } + if (idna2003 == null) { + idna2003 = UTF16.valueOf(i); + } + if (!base.equals(idna2003)) { + mappingChanged.add(i); + } break; - } - final String base2 = baseMapping.get(i); - if (base2 != null && baseValidSet.containsAll(base2)) { - disallowed.add(i); - } - break; - default: - String idna2003 = Idna2003Data.mappings.get(i); - String base = baseMapping.get(i); - if (base == idna2003) { - continue; - } - if (base == null) { - base = UTF16.valueOf(i); - } - if (idna2003 == null) { - idna2003 = UTF16.valueOf(i); - } - if (!base.equals(idna2003)) { - mappingChanged.add(i); - } - break; } } return 
Row.of(disallowed.freeze(), mappingChanged.freeze()); @@ -452,22 +529,24 @@ private static void writeDataFile(UnicodeMap mappingTable) throws IOExce final String unversionedFileName = "IdnaMappingTable.txt"; final PrintWriter writer = FileUtilities.openUTF8Writer(GEN_IDNA_DIR, unversionedFileName); - writer.println(Utility.getBaseDataHeader( - unversionedFileName, - 46, - "Unicode IDNA Compatible Preprocessing", - Default.ucdVersion())); -// writer.println( -// "#\n" + -// "# Unicode IDNA Compatible Preprocessing (UTS #46)\n" + -// "# For documentation, see http://www.unicode.org/reports/tr46/\n"); - - final UnicodeProperty ASSIGNED = new UnicodeProperty.SimpleProperty() { - @Override - protected String _getValue(int codepoint) { - return cn.contains(codepoint) ? "Cn" : "As"; - } - }; + writer.println( + Utility.getBaseDataHeader( + unversionedFileName, + 46, + "Unicode IDNA Compatible Preprocessing", + Default.ucdVersion())); + // writer.println( + // "#\n" + + // "# Unicode IDNA Compatible Preprocessing (UTS #46)\n" + + // "# For documentation, see http://www.unicode.org/reports/tr46/\n"); + + final UnicodeProperty ASSIGNED = + new UnicodeProperty.SimpleProperty() { + @Override + protected String _getValue(int codepoint) { + return cn.contains(codepoint) ? 
"Cn" : "As"; + } + }; final UnicodeProperty age = properties.getProperty("age"); // UnicodeMap ageValue = age0.getUnicodeMap(); // UnicodeSet unassigned = ageValue.getSet("unassigned"); diff --git a/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaStableSamples.java b/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaStableSamples.java index 8646cdcd3..1a5d19824 100644 --- a/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaStableSamples.java +++ b/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaStableSamples.java @@ -1,5 +1,7 @@ package org.unicode.idna; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.XSymbolTable; import org.unicode.props.UnicodeProperty; import org.unicode.props.UnicodePropertySymbolTable; import org.unicode.text.UCD.Default; @@ -7,45 +9,43 @@ import org.unicode.text.UCD.ToolUnicodeTransformFactory; import org.unicode.text.utility.UnicodeTransform; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.XSymbolTable; - public class GenerateIdnaStableSamples { public static void main(String[] args) { Default.setUCD("9.0.0"); UnicodeTransform.setFactory(new ToolUnicodeTransformFactory()); - final ToolUnicodePropertySource toolUPS1 = ToolUnicodePropertySource.make(Default.ucdVersion()); + final ToolUnicodePropertySource toolUPS1 = + ToolUnicodePropertySource.make(Default.ucdVersion()); final XSymbolTable toolUPS = new UnicodePropertySymbolTable(toolUPS1); UnicodeSet.setDefaultXSymbolTable(toolUPS); UnicodeProperty.ResetCacheProperties(); final String[] samples = { - "// bidi", - "[[:bc=R:][:bc=AL:]-[:Cn:]]", - "[[:bc=L:]-[:Cn:]]", - "[[:bc=ES:][:bc=CS:][:bc=ET:][:bc=ON:][:bc=BN:][:bc=NSM:]-[:Cn:]]", - "[[:bc=EN:]-[:Cn:]]", - "[[:bc=AN:]-[:Cn:]]", - "[[:bc=NSM:]-[:Cn:]]", - "// contextj",// - "[\u200C\u200D]", - "[[:ccc=virama:]-[:Cn:]]", - "[[:jt=T:]-[:Cn:]]", - "[[:jt=L:][:jt=D:]-[:Cn:]]", - "[[:jt=R:][:jt=D:]-[:Cn:]]", - "// syntax",// - "[-]", - "// changed mapping from 
2003",// - "[[\u04C0 \u10A0-\u10C5 \u2132 \u2183 \u2F868 \u2F874 \u2F91F \u2F95F \u2F9BF \u3164 \uFFA0 \u115F \u1160 \u17B4 \u17B5 \u1806]-[:Cn:]]", - "// disallowed in 2003",// disallowed in 2003 - "[[\u200E-\u200F \u202A-\u202E \u2061-\u2063 \uFFFC \uFFFD \u1D173-\u1D17A \u206A-\u206F \uE0001 \uE0020-\uE007F]-[:Cn:]]", - "// Step 7",// - "[[\u2260 \u226E \u226F \uFE12 \u2488]-[:Cn:]]", - "// disallowed", - "[[:S:][:P:][:C:]-[:Cn:][:noncharactercodepoint:][\\U000D0000\\U000E0000\\U000F0000\\U00100000]]", - "// deviations", // - "[[\\u200C\\u200D\\u00DF\\u03C2]-[:Cn:]]", + "// bidi", + "[[:bc=R:][:bc=AL:]-[:Cn:]]", + "[[:bc=L:]-[:Cn:]]", + "[[:bc=ES:][:bc=CS:][:bc=ET:][:bc=ON:][:bc=BN:][:bc=NSM:]-[:Cn:]]", + "[[:bc=EN:]-[:Cn:]]", + "[[:bc=AN:]-[:Cn:]]", + "[[:bc=NSM:]-[:Cn:]]", + "// contextj", // + "[\u200C\u200D]", + "[[:ccc=virama:]-[:Cn:]]", + "[[:jt=T:]-[:Cn:]]", + "[[:jt=L:][:jt=D:]-[:Cn:]]", + "[[:jt=R:][:jt=D:]-[:Cn:]]", + "// syntax", // + "[-]", + "// changed mapping from 2003", // + "[[\u04C0 \u10A0-\u10C5 \u2132 \u2183 \u2F868 \u2F874 \u2F91F \u2F95F \u2F9BF \u3164 \uFFA0 \u115F \u1160 \u17B4 \u17B5 \u1806]-[:Cn:]]", + "// disallowed in 2003", // disallowed in 2003 + "[[\u200E-\u200F \u202A-\u202E \u2061-\u2063 \uFFFC \uFFFD \u1D173-\u1D17A \u206A-\u206F \uE0001 \uE0020-\uE007F]-[:Cn:]]", + "// Step 7", // + "[[\u2260 \u226E \u226F \uFE12 \u2488]-[:Cn:]]", + "// disallowed", + "[[:S:][:P:][:C:]-[:Cn:][:noncharactercodepoint:][\\U000D0000\\U000E0000\\U000F0000\\U00100000]]", + "// deviations", // + "[[\\u200C\\u200D\\u00DF\\u03C2]-[:Cn:]]", }; for (int i = 0; i < samples.length; ++i) { @@ -58,7 +58,6 @@ public static void main(String[] args) { System.out.println(sample); } } - } private static UnicodeSet trim(UnicodeSet source, int maxRanges) { @@ -70,10 +69,10 @@ private static UnicodeSet trim(UnicodeSet source, int maxRanges) { return result; } // take first half and last half, to get a mix - for (int i = 0; i < maxRanges/2; ++i) { + for (int i = 0; i 
< maxRanges / 2; ++i) { result.addAll(source.getRangeStart(i), source.getRangeEnd(i)); } - for (int i = rangeCount-(maxRanges - maxRanges/2); i < rangeCount; ++i) { + for (int i = rangeCount - (maxRanges - maxRanges / 2); i < rangeCount; ++i) { result.addAll(source.getRangeStart(i), source.getRangeEnd(i)); } if (result.getRangeCount() != maxRanges) { diff --git a/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaTest.java b/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaTest.java index c32c10d8f..3fc42eccc 100644 --- a/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaTest.java +++ b/unicodetools/src/main/java/org/unicode/idna/GenerateIdnaTest.java @@ -1,5 +1,16 @@ package org.unicode.idna; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.DateFormat; +import com.ibm.icu.text.SimpleDateFormat; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.XSymbolTable; +import com.ibm.icu.util.ULocale; import java.io.IOException; import java.io.PrintWriter; import java.util.Collections; @@ -9,15 +20,14 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.CldrUtility; -import org.unicode.props.UnicodeProperty; -import org.unicode.props.UnicodePropertySymbolTable; import org.unicode.idna.Idna2008.Idna2008Type; import org.unicode.idna.LoadIdnaTest.TestLine; import org.unicode.idna.Uts46.Errors; import org.unicode.idna.Uts46.IdnaChoice; +import org.unicode.props.UnicodeProperty; +import org.unicode.props.UnicodePropertySymbolTable; import org.unicode.text.UCD.Default; import org.unicode.text.UCD.ToolUnicodePropertySource; import org.unicode.text.UCD.ToolUnicodeTransformFactory; @@ -26,18 +36,6 @@ import 
org.unicode.text.utility.UnicodeTransform; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.DateFormat; -import com.ibm.icu.text.SimpleDateFormat; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.XSymbolTable; -import com.ibm.icu.util.ULocale; - public class GenerateIdnaTest { static { @@ -47,20 +45,20 @@ public class GenerateIdnaTest { private static final String TO_ESCAPE = "[[:c:][:z:][:m:][:di:][:bc=R:][:bc=AL:][:bc=AN:]#;]"; - private static final Pattern IDNA2003_LABEL_SEPARATOR = Pattern.compile("[.\uFF0E \u3002\uFF61]"); + private static final Pattern IDNA2003_LABEL_SEPARATOR = + Pattern.compile("[.\uFF0E \u3002\uFF61]"); private static final boolean NEW_FORMAT = true; private static final int UNDEFINED; - static { // find a character that is likely to remain undefined, and is if possible in the BMP. // so we take the highest BMP if possible, then the highest smp UnicodeSet unassigned = new UnicodeSet("[[:cn:]-[:NChar:]]").freeze(); UnicodeSet unassignedBMP = new UnicodeSet("[\\u0000-\\uFFFF]").retainAll(unassigned); if (!unassignedBMP.isEmpty()) { - UNDEFINED = unassignedBMP.getRangeEnd(unassignedBMP.getRangeCount()-1); + UNDEFINED = unassignedBMP.getRangeEnd(unassignedBMP.getRangeCount() - 1); } else { - UNDEFINED = unassigned.getRangeEnd(unassigned.getRangeCount()-1); + UNDEFINED = unassigned.getRangeEnd(unassigned.getRangeCount() - 1); } } @@ -69,50 +67,66 @@ public static void main(String[] args) throws IOException { final int count = new GenerateIdnaTest().generateTests(1000); System.out.println("DONE " + count); - System.out.println("NOTE: use ICU to test until TestIdna is updated." 
- + "\nCopy the new data to {workspace}/unicodetools/data/idna and run TestIdna -prop:DIR=draft"); + System.out.println( + "NOTE: use ICU to test until TestIdna is updated." + + "\nCopy the new data to {workspace}/unicodetools/data/idna and run TestIdna -prop:DIR=draft"); } // private static void assertEquals(String string, T expected, T actual) { // if (!Objects.equal(expected, actual)) { - // throw new IllegalArgumentException(string + ": expected: " + expected + " ≠ " + actual); + // throw new IllegalArgumentException(string + ": expected: " + expected + " ≠ " + + // actual); // } // } public static void setUnicodeVersion() { Default.setUCD(Settings.latestVersion); UnicodeTransform.setFactory(new ToolUnicodeTransformFactory()); - final ToolUnicodePropertySource toolUPS1 = ToolUnicodePropertySource.make(Default.ucdVersion()); + final ToolUnicodePropertySource toolUPS1 = + ToolUnicodePropertySource.make(Default.ucdVersion()); final XSymbolTable toolUPS = new UnicodePropertySymbolTable(toolUPS1); UnicodeSet.setDefaultXSymbolTable(toolUPS); UnicodeProperty.ResetCacheProperties(); } - public static DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss 'GMT'", ULocale.US); + public static DateFormat dateFormat = + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss 'GMT'", ULocale.US); public static String internalOldName = "internal-IdnaTest.txt"; public static String NEW_FILE_NAME = "IdnaTestV2.txt"; int generateTests(int lines) throws IOException { - final PrintWriter out = org.unicode.cldr.draft.FileUtilities.openUTF8Writer(GenerateIdna.GEN_IDNA_DIR, internalOldName); + final PrintWriter out = + org.unicode.cldr.draft.FileUtilities.openUTF8Writer( + GenerateIdna.GEN_IDNA_DIR, internalOldName); out.println(Utility.getDataHeader(internalOldName)); - FileUtilities.appendFile(this.getClass().getResource("IdnaTestHeader.txt").toString().substring(5), "UTF-8", out); + FileUtilities.appendFile( + 
this.getClass().getResource("IdnaTestHeader.txt").toString().substring(5), + "UTF-8", + out); - final PrintWriter out2 = org.unicode.cldr.draft.FileUtilities.openUTF8Writer(GenerateIdna.GEN_IDNA_DIR, NEW_FILE_NAME); -// out2.println(Utility.getDataHeader(NEW_FILE_NAME)); - out2.println(Utility.getBaseDataHeader( - NEW_FILE_NAME, - 46, - "Unicode IDNA Compatible Preprocessing", - Default.ucdVersion())); + final PrintWriter out2 = + org.unicode.cldr.draft.FileUtilities.openUTF8Writer( + GenerateIdna.GEN_IDNA_DIR, NEW_FILE_NAME); + // out2.println(Utility.getDataHeader(NEW_FILE_NAME)); + out2.println( + Utility.getBaseDataHeader( + NEW_FILE_NAME, + 46, + "Unicode IDNA Compatible Preprocessing", + Default.ucdVersion())); - FileUtilities.appendFile(this.getClass().getResource("IdnaTestHeader2.txt").toString().substring(5), "UTF-8", out2); + FileUtilities.appendFile( + this.getClass().getResource("IdnaTestHeader2.txt").toString().substring(5), + "UTF-8", + out2); // out.println( // "# Format\n" + // "# source ; type ; toASCII ; toUnicode\n" + // "# type: T for transitional, N for nontransitional, B for both\n" + - // "# In case of errors, field 3 and 4 show errors in [....] instead of a result\n" + + // "# In case of errors, field 3 and 4 show errors in [....] 
instead of a + // result\n" + // "# The errors are based on the step numbers in UTS46.\n" + // "# Pn for Section 4 Processing step n\n" + // "# Vn for 4.1 Validity Criteria step n\n" + @@ -129,7 +143,6 @@ int generateTests(int lines) throws IOException { out.println("\n# BIDI TESTS\n"); out2.println("\n# BIDI TESTS\n"); - for (final String[] testCase : bidiTests) { count += generateLine(testCase[0], out, out2); } @@ -137,7 +150,6 @@ int generateTests(int lines) throws IOException { out.println("\n# CONTEXT TESTS\n"); out2.println("\n# CONTEXT TESTS\n"); - for (final String[] testCase : contextTests) { count += generateLine(testCase[0], out, out2); } @@ -145,7 +157,6 @@ int generateTests(int lines) throws IOException { out.println("\n# SELECTED TESTS\n"); out2.println("\n# SELECTED TESTS\n"); - count += generateLine("\u00a1", out, out2); for (String s : Idna2008.GRANDFATHERED_VALID) { count += generateLine(s, out, out2); @@ -172,11 +183,12 @@ int generateTests(int lines) throws IOException { throw new AssertionError( "update GenerateIdnaTest.ucdTypesLastVersion to match " + lastVersion); } - Set testLines = LoadIdnaTest.load( - Settings.UnicodeTools.DATA_DIR + "idna/" + lastVersion); + Set testLines = + LoadIdnaTest.load(Settings.UnicodeTools.DATA_DIR + "idna/" + lastVersion); for (TestLine testLine : testLines) { - count += generateLine(replaceNewerThan(testLine.source, ucdTypesLastVersion), out, out2); + count += + generateLine(replaceNewerThan(testLine.source, ucdTypesLastVersion), out, out2); } // final RandomString randomString = new RandomString(); @@ -187,7 +199,8 @@ int generateTests(int lines) throws IOException { // // for (int line = 0; line < lines; ++line) { // sb.setLength(0); - // randomString.resetRandom(line); // provide predictable results based on line number + // randomString.resetRandom(line); // provide predictable results based on line + // number // // random number of labels // int labels = RandomString.random.nextInt(labelLength); // for (; 
labels >= 0; --labels) { @@ -223,74 +236,105 @@ int generateLine(String source, PrintWriter out, PrintWriter out2) { } int result = 0; final Set toUnicodeErrors = EnumSet.noneOf(Errors.class); - final String unicode = Uts46.SINGLETON.toUnicode(source, IdnaChoice.nontransitional, toUnicodeErrors); + final String unicode = + Uts46.SINGLETON.toUnicode(source, IdnaChoice.nontransitional, toUnicodeErrors); if (!Collections.disjoint(toUnicodeErrors, Errors.TO_ASCII_ERRORS)) { - System.err.println("Should never have ASCII errors in toUnicode:\t" + source + "\ty==>\t" + toUnicodeErrors); + System.err.println( + "Should never have ASCII errors in toUnicode:\t" + + source + + "\ty==>\t" + + toUnicodeErrors); } -// if (MATCH_OLD) { -// replace(toUnicodeErrors, Errors.X4_2, Errors.A4_2); -// replace(toUnicodeErrors, Errors.P4, Errors.A3); -// replace(toUnicodeErrors, Errors.X3, Errors.A3); -// } + // if (MATCH_OLD) { + // replace(toUnicodeErrors, Errors.X4_2, Errors.A4_2); + // replace(toUnicodeErrors, Errors.P4, Errors.A3); + // replace(toUnicodeErrors, Errors.X3, Errors.A3); + // } final Set transitionalErrors = EnumSet.noneOf(Errors.class); - final String transitional = Uts46.SINGLETON.toASCII(source, IdnaChoice.transitional, transitionalErrors); + final String transitional = + Uts46.SINGLETON.toASCII(source, IdnaChoice.transitional, transitionalErrors); replace(transitionalErrors, Errors.X4_2, Errors.A4_2); -// if (MATCH_OLD) { -// replace(transitionalErrors, Errors.P4, Errors.A3); -// replace(transitionalErrors, Errors.X3, Errors.A3); -// } + // if (MATCH_OLD) { + // replace(transitionalErrors, Errors.P4, Errors.A3); + // replace(transitionalErrors, Errors.X3, Errors.A3); + // } final Set nonTransitionalErrors = EnumSet.noneOf(Errors.class); - final String nontransitional = Uts46.SINGLETON.toASCII(source, IdnaChoice.nontransitional, nonTransitionalErrors); + final String nontransitional = + Uts46.SINGLETON.toASCII(source, IdnaChoice.nontransitional, 
nonTransitionalErrors); replace(nonTransitionalErrors, Errors.X4_2, Errors.A4_2); -// if (MATCH_OLD) { -// replace(nonTransitionalErrors, Errors.P4, Errors.A3); -// replace(nonTransitionalErrors, Errors.X3, Errors.A3); -// } - -// Set toUnicodeErrors2 = toUnicodeErrors; -// if (!IDNA2008Valid.containsAll(source)) { -// toUnicodeErrors2 = EnumSet.copyOf(toUnicodeErrors2); -// toUnicodeErrors2.add(Errors.NV8); -// } -// -// // Hack to check whether problems were introduced. Needs to be deeper check in processMap -// -// final Set throwAway = EnumSet.noneOf(Errors.class); -// Set nonTransitionalErrors2 = nonTransitionalErrors; -// final String nontransitional2 = Uts46.SINGLETON.toASCII(unicode, IdnaChoice.nontransitional, throwAway); -// if (!IDNA2008Valid.containsAll(nontransitional2)) { -// nonTransitionalErrors2 = EnumSet.copyOf(nonTransitionalErrors); -// nonTransitionalErrors2.add(Errors.NV8); -// } -// -// Set transitionalErrors2 = transitionalErrors; -// final String transitional2 = Uts46.SINGLETON.toASCII(unicode, IdnaChoice.transitional, throwAway); -// if (!IDNA2008Valid.containsAll(transitional2)) { -// nonTransitionalErrors2 = EnumSet.copyOf(nonTransitionalErrors); -// nonTransitionalErrors2.add(Errors.NV8); -// } - - out2.println(hexForTest.transform(source) - + "; " + escapeIfDifferentElseEmpty(unicode, source) - + "; " + CldrUtility.ifEqual(toUnicodeErrors, Collections.EMPTY_SET, "") - + "; " + escapeIfDifferentElseEmpty(nontransitional, unicode) - + "; " + CldrUtility.ifEqual(nonTransitionalErrors, toUnicodeErrors, "") - + "; " + escapeIfDifferentElseEmpty(transitional, nontransitional) - + "; " + CldrUtility.ifEqual(transitionalErrors, nonTransitionalErrors, "") - + " # " + removeInvisible.transform(unicode) - ); + // if (MATCH_OLD) { + // replace(nonTransitionalErrors, Errors.P4, Errors.A3); + // replace(nonTransitionalErrors, Errors.X3, Errors.A3); + // } + + // Set toUnicodeErrors2 = toUnicodeErrors; + // if (!IDNA2008Valid.containsAll(source)) { + 
// toUnicodeErrors2 = EnumSet.copyOf(toUnicodeErrors2); + // toUnicodeErrors2.add(Errors.NV8); + // } + // + // // Hack to check whether problems were introduced. Needs to be deeper check in + // processMap + // + // final Set throwAway = EnumSet.noneOf(Errors.class); + // Set nonTransitionalErrors2 = nonTransitionalErrors; + // final String nontransitional2 = Uts46.SINGLETON.toASCII(unicode, + // IdnaChoice.nontransitional, throwAway); + // if (!IDNA2008Valid.containsAll(nontransitional2)) { + // nonTransitionalErrors2 = EnumSet.copyOf(nonTransitionalErrors); + // nonTransitionalErrors2.add(Errors.NV8); + // } + // + // Set transitionalErrors2 = transitionalErrors; + // final String transitional2 = Uts46.SINGLETON.toASCII(unicode, + // IdnaChoice.transitional, throwAway); + // if (!IDNA2008Valid.containsAll(transitional2)) { + // nonTransitionalErrors2 = EnumSet.copyOf(nonTransitionalErrors); + // nonTransitionalErrors2.add(Errors.NV8); + // } + + out2.println( + hexForTest.transform(source) + + "; " + + escapeIfDifferentElseEmpty(unicode, source) + + "; " + + CldrUtility.ifEqual(toUnicodeErrors, Collections.EMPTY_SET, "") + + "; " + + escapeIfDifferentElseEmpty(nontransitional, unicode) + + "; " + + CldrUtility.ifEqual(nonTransitionalErrors, toUnicodeErrors, "") + + "; " + + escapeIfDifferentElseEmpty(transitional, nontransitional) + + "; " + + CldrUtility.ifEqual(transitionalErrors, nonTransitionalErrors, "") + + " # " + + removeInvisible.transform(unicode)); if (!transitionalErrors.equals(nonTransitionalErrors) || !transitional.equals(nontransitional) - //&& transitionalErrors.size() == 0 - ) { + // && transitionalErrors.size() == 0 + ) { showLine(source, "T", transitional, transitionalErrors, unicode, toUnicodeErrors, out); - showLine(source, "N", nontransitional, nonTransitionalErrors, unicode, toUnicodeErrors, out); + showLine( + source, + "N", + nontransitional, + nonTransitionalErrors, + unicode, + toUnicodeErrors, + out); result += 2; } else { - 
showLine(source, "B", nontransitional, nonTransitionalErrors, unicode, toUnicodeErrors, out); + showLine( + source, + "B", + nontransitional, + nonTransitionalErrors, + unicode, + toUnicodeErrors, + out); result += 1; } if (NEW_FORMAT) { @@ -304,12 +348,12 @@ int generateLine(String source, PrintWriter out, PrintWriter out2) { result += generateLine(UCharacter.foldCase(source, true), out, out2); result += generateLine(UCharacter.toTitleCase(source, null), out, out2); - //if (transitionalErrors.size() == 0) { + // if (transitionalErrors.size() == 0) { result += generateLine(transitional, out, out2); - //} - //if (nonTransitionalErrors.size() == 0) { + // } + // if (nonTransitionalErrors.size() == 0) { result += generateLine(nontransitional, out, out2); - //} + // } if (toUnicodeErrors.size() == 0) { result += generateLine(unicode, out, out2); } @@ -324,7 +368,8 @@ private String escapeIfDifferentElseEmpty(String target, String source) { } } - private void replace(final Set transitionalErrors, Errors toReplace, Errors replacement) { + private void replace( + final Set transitionalErrors, Errors toReplace, Errors replacement) { if (transitionalErrors.contains(toReplace)) { transitionalErrors.remove(toReplace); transitionalErrors.add(replacement); @@ -343,6 +388,7 @@ private boolean alreadyDone(String source) { } Matcher labelSeparator = IDNA2003_LABEL_SEPARATOR.matcher(""); + String getCanonicalString(String source) { labelSeparator.reset(source); StringBuilder result = new StringBuilder(); @@ -353,7 +399,6 @@ String getCanonicalString(String source) { } result.append(getCanonicalLabel(source.substring(last))); return result.toString(); - } // we uppercase IF all ascii, otherwise leave it alone String getCanonicalLabel(String source) { @@ -379,116 +424,137 @@ String getCanonicalLabel(String source) { return result.toString(); } - /** - * Draws line - */ - private void showLine(String source, String type, String ascii, Set asciiErrors, String unicode, Set 
toUnicodeErrors, PrintWriter out) { + /** Draws line */ + private void showLine( + String source, + String type, + String ascii, + Set asciiErrors, + String unicode, + Set toUnicodeErrors, + PrintWriter out) { final String unicodeReveal = hexForTest.transform(unicode); final boolean hasUnicodeErrors = toUnicodeErrors.size() != 0; final boolean hasAsciiErrors = asciiErrors.size() != 0; final Set extraErrors = EnumSet.noneOf(Errors.class); - final boolean validIdna2008 = IDNA2008Valid.containsAll(unicode) && Uts46.hasBidiOrContextError(unicode, extraErrors ) == 0; - out.println(type - + ";\t" - + hexForTest.transform(source) - + ";\t" - + (hasUnicodeErrors ? showErrors(toUnicodeErrors) : unicode.equals(source) ? "" : unicodeReveal) - + ";\t" - + (hasAsciiErrors ? showErrors(asciiErrors) : unicode.equals(ascii) ? "" : hexForTest.transform(ascii)) - + (Idna2008.GRANDFATHERED_VALID.containsSome(unicode) ? ";\tXV8" - : hasUnicodeErrors || validIdna2008 ? "" : ";\tNV8") // checking - + (!NEW_FORMAT ? "" : "" - + (unicodeReveal.equals(unicode) ? "" : "\t#\t" + removeInvisible.transform(unicode))) - ); + final boolean validIdna2008 = + IDNA2008Valid.containsAll(unicode) + && Uts46.hasBidiOrContextError(unicode, extraErrors) == 0; + out.println( + type + + ";\t" + + hexForTest.transform(source) + + ";\t" + + (hasUnicodeErrors + ? showErrors(toUnicodeErrors) + : unicode.equals(source) ? "" : unicodeReveal) + + ";\t" + + (hasAsciiErrors + ? showErrors(asciiErrors) + : unicode.equals(ascii) ? "" : hexForTest.transform(ascii)) + + (Idna2008.GRANDFATHERED_VALID.containsSome(unicode) + ? ";\tXV8" + : hasUnicodeErrors || validIdna2008 ? "" : ";\tNV8") // checking + + (!NEW_FORMAT + ? "" + : "" + + (unicodeReveal.equals(unicode) + ? 
"" + : "\t#\t" + removeInvisible.transform(unicode)))); } static class RandomString { static Random random = new Random(0); static UnicodeSet[] sampleSets; + static { final String[] samplesNew = { - // // bidi - // "[[:bc=R:][:bc=AL:]]", - // "[[:bc=L:]]", - // "[[:bc=ES:][:bc=CS:][:bc=ET:][:bc=ON:][:bc=BN:][:bc=NSM:]]", - // "[[:bc=EN:]]", - // "[[:bc=AN:]]", - // "[[:bc=NSM:]]", - // // contextj - // "[\u200C\u200D]", - // "[[:ccc=virama:]]", - // "[[:jt=T:]]", - // "[[:jt=L:][:jt=D:]]", - // "[[:jt=R:][:jt=D:]]", - // // syntax - // "[-]", - // // changed mapping from 2003 - // "[\u04C0 \u10A0-\u10C5 \u2132 \u2183 \u2F868 \u2F874 \u2F91F \u2F95F \u2F9BF \u3164 \uFFA0 \u115F \u1160 \u17B4 \u17B5 \u1806]", - // // disallowed in 2003 - // "[\u200E-\u200F \u202A-\u202E \u2061-\u2063 \uFFFC \uFFFD \u1D173-\u1D17A \u206A-\u206F \uE0001 \uE0020-\uE007F]", - // // Step 7 - // "[\u2260 \u226E \u226F \uFE12 \u2488]", - // // disallowed - // "[:age=9.0:]", - // // deviations - // "[\\u200C\\u200D\\u00DF\\u03C2]", - // stable sets - // bidi - "[\\u05BE\\u05C0\\u05C3\\u05C6\\u05D0-\\u05EA\\u05F0-\\u05F4\\u0608\\u060B\\u060D\\u061B\\u061C\\U0001EE67-\\U0001EE6A\\U0001EE6C-\\U0001EE72\\U0001EE74-\\U0001EE77\\U0001EE79-\\U0001EE7C\\U0001EE7E\\U0001EE80-\\U0001EE89\\U0001EE8B-\\U0001EE9B\\U0001EEA1-\\U0001EEA3\\U0001EEA5-\\U0001EEA9\\U0001EEAB-\\U0001EEBB]", - "[A-Za-z\\u00AA\\u00B5\\u00BA\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02B8\\u02BB-\\u02C1\\u02D0\\u02D1\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250\\U0001F251\\U00020000-\\U0002A6D6\\U0002A700-\\U0002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1\\U0002F800-\\U0002FA1D\\U000F0000-\\U000FFFFD\\U00100000-\\U0010FFFD]", - 
"[\\u0000-\\u0008\\u000E-\\u001B!-/\\:-@\\[-`\\{-\\u0084\\u0086-\\u00A9\\u00AB-\\u00B1\\u00B4\\u00B6-\\u00B8\\U0001F920-\\U0001F927\\U0001F930\\U0001F933-\\U0001F93E\\U0001F940-\\U0001F94B\\U0001F950-\\U0001F95E\\U0001F980-\\U0001F991\\U0001F9C0\\U000E0001\\U000E0020-\\U000E007F\\U000E0100-\\U000E01EF]", - "[0-9\\u00B2\\u00B3\\u00B9\\u06F0-\\u06F9\\u2070\\u2074-\\u2079\\u2080-\\u2089\\u2488-\\u249B\\uFF10-\\uFF19\\U000102E1-\\U000102FB\\U0001D7CE-\\U0001D7FF\\U0001F100-\\U0001F10A]", - "[\\u0600-\\u0605\\u0660-\\u0669\\u066B\\u066C\\u06DD\\u08E2\\U00010E60-\\U00010E7E]", - "[\\u0300-\\u036F\\u0483-\\u0489\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5\\u05C7\\u0610-\\u061A\\u064B-\\u065F\\u0670\\U0001DA9B-\\U0001DA9F\\U0001DAA1-\\U0001DAAF\\U0001E000-\\U0001E006\\U0001E008-\\U0001E018\\U0001E01B-\\U0001E021\\U0001E023\\U0001E024\\U0001E026-\\U0001E02A\\U0001E8D0-\\U0001E8D6\\U0001E944-\\U0001E94A\\U000E0100-\\U000E01EF]", - // contextj - "[\\u200C\\u200D]", - "[\\u094D\\u09CD\\u0A4D\\u0ACD\\u0B4D\\u0BCD\\u0C4D\\u0CCD\\u0D4D\\u0DCA\\U00011235\\U000112EA\\U0001134D\\U00011442\\U000114C2\\U000115BF\\U0001163F\\U000116B6\\U0001172B\\U00011C3F]", - "[\\u00AD\\u0300-\\u036F\\u0483-\\u0489\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5\\u05C7\\u0610-\\u061A\\u061C\\U0001E000-\\U0001E006\\U0001E008-\\U0001E018\\U0001E01B-\\U0001E021\\U0001E023\\U0001E024\\U0001E026-\\U0001E02A\\U0001E8D0-\\U0001E8D6\\U0001E944-\\U0001E94A\\U000E0001\\U000E0020-\\U000E007F\\U000E0100-\\U000E01EF]", - "[\\u0620\\u0626\\u0628\\u062A-\\u062E\\u0633-\\u063F\\u0641-\\u0647\\u0649\\u064A\\u066E\\u066F\\u0678-\\u0687\\u069A-\\u06BF\\U00010ADE-\\U00010AE0\\U00010AEB-\\U00010AEE\\U00010B80\\U00010B82\\U00010B86-\\U00010B88\\U00010B8A\\U00010B8B\\U00010B8D\\U00010B90\\U00010BAD\\U00010BAE\\U0001E900-\\U0001E943]", - 
"[\\u0620\\u0622-\\u063F\\u0641-\\u064A\\u066E\\u066F\\u0671-\\u0673\\u0675-\\u06D3\\u06D5\\u06EE\\u06EF\\u06FA-\\u06FC\\u06FF\\U00010AC0-\\U00010AC5\\U00010AC7\\U00010AC9\\U00010ACA\\U00010ACE-\\U00010AD6\\U00010AD8-\\U00010AE1\\U00010AE4\\U00010AEB-\\U00010AEF\\U00010B80-\\U00010B91\\U00010BA9-\\U00010BAE\\U0001E900-\\U0001E943]", - // syntax - "[\\-]", - // changed mapping from 2003 - "[48F\\u04C0\\u10A0-\\u10C5\\u115F\\u1160\\u17B4\\u17B5\\u1806\\u2132\\u2183\\u2F86\\u2F87\\u2F91\\u2F95\\u2F9B\\u3164\\uFFA0]", - // disallowed in 2003 - "[\\-0-\\u0377\\u037A-\\u037F\\u0384-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u052F\\u0531-\\u0556\\u0559-\\u055F\\u0561-\\u0587\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAB30-\\uAB65\\uAB70-\\uABED\\uABF0-\\uABF9\\uAC00-\\uD7A3\\uD7B0-\\uD7C6\\uD7CB-\\uD7FB\\uD800-\\uE007\\uFFFC\\uFFFD]", - // Step 7 - "[\\u2260\\u226E\\u226F\\u2488\\uFE12]", - // disallowed - "[^\\ 0-9A-Za-z\\u00A0\\u00AA\\u00B2\\u00B3\\u00B5\\u00B9\\u00BA\\u00BC-\\U0006FFFD\\U00070000-\\U0007FFFD\\U00080000-\\U0008FFFD\\U00090000-\\U0009FFFD\\U000A0000-\\U000AFFFD\\U000B0000-\\U000BFFFD\\U000C0000-\\U000CFFFD\\U000D0001-\\U000DFFFD\\U000E0002-\\U000E001F\\U000E0080-\\U000EFFFD]", - // deviations - "[\\u00DF\\u03C2\\u200C\\u200D]", + // // bidi + // "[[:bc=R:][:bc=AL:]]", + // "[[:bc=L:]]", + // "[[:bc=ES:][:bc=CS:][:bc=ET:][:bc=ON:][:bc=BN:][:bc=NSM:]]", + // "[[:bc=EN:]]", + // "[[:bc=AN:]]", + // "[[:bc=NSM:]]", + // // contextj + // "[\u200C\u200D]", + // "[[:ccc=virama:]]", + // "[[:jt=T:]]", + // "[[:jt=L:][:jt=D:]]", + // "[[:jt=R:][:jt=D:]]", + // // syntax + // "[-]", + // // changed mapping from 2003 + // "[\u04C0 \u10A0-\u10C5 \u2132 \u2183 \u2F868 \u2F874 \u2F91F + // \u2F95F \u2F9BF \u3164 \uFFA0 \u115F \u1160 \u17B4 \u17B5 \u1806]", + // // disallowed in 2003 + // "[\u200E-\u200F \u202A-\u202E \u2061-\u2063 \uFFFC \uFFFD + // \u1D173-\u1D17A \u206A-\u206F \uE0001 \uE0020-\uE007F]", + // // Step 7 + // "[\u2260 \u226E \u226F \uFE12 \u2488]", + // // disallowed 
+ // "[:age=9.0:]", + // // deviations + // "[\\u200C\\u200D\\u00DF\\u03C2]", + // stable sets + // bidi + "[\\u05BE\\u05C0\\u05C3\\u05C6\\u05D0-\\u05EA\\u05F0-\\u05F4\\u0608\\u060B\\u060D\\u061B\\u061C\\U0001EE67-\\U0001EE6A\\U0001EE6C-\\U0001EE72\\U0001EE74-\\U0001EE77\\U0001EE79-\\U0001EE7C\\U0001EE7E\\U0001EE80-\\U0001EE89\\U0001EE8B-\\U0001EE9B\\U0001EEA1-\\U0001EEA3\\U0001EEA5-\\U0001EEA9\\U0001EEAB-\\U0001EEBB]", + "[A-Za-z\\u00AA\\u00B5\\u00BA\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02B8\\u02BB-\\u02C1\\u02D0\\u02D1\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250\\U0001F251\\U00020000-\\U0002A6D6\\U0002A700-\\U0002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1\\U0002F800-\\U0002FA1D\\U000F0000-\\U000FFFFD\\U00100000-\\U0010FFFD]", + "[\\u0000-\\u0008\\u000E-\\u001B!-/\\:-@\\[-`\\{-\\u0084\\u0086-\\u00A9\\u00AB-\\u00B1\\u00B4\\u00B6-\\u00B8\\U0001F920-\\U0001F927\\U0001F930\\U0001F933-\\U0001F93E\\U0001F940-\\U0001F94B\\U0001F950-\\U0001F95E\\U0001F980-\\U0001F991\\U0001F9C0\\U000E0001\\U000E0020-\\U000E007F\\U000E0100-\\U000E01EF]", + "[0-9\\u00B2\\u00B3\\u00B9\\u06F0-\\u06F9\\u2070\\u2074-\\u2079\\u2080-\\u2089\\u2488-\\u249B\\uFF10-\\uFF19\\U000102E1-\\U000102FB\\U0001D7CE-\\U0001D7FF\\U0001F100-\\U0001F10A]", + "[\\u0600-\\u0605\\u0660-\\u0669\\u066B\\u066C\\u06DD\\u08E2\\U00010E60-\\U00010E7E]", + "[\\u0300-\\u036F\\u0483-\\u0489\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5\\u05C7\\u0610-\\u061A\\u064B-\\u065F\\u0670\\U0001DA9B-\\U0001DA9F\\U0001DAA1-\\U0001DAAF\\U0001E000-\\U0001E006\\U0001E008-\\U0001E018\\U0001E01B-\\U0001E021\\U0001E023\\U0001E024\\U0001E026-\\U0001E02A\\U0001E8D0-\\U0001E8D6\\U0001E944-\\U0001E94A\\U000E0100-\\U000E01EF]", + // contextj + "[\\u200C\\u200D]", + "[\\u094D\\u09CD\\u0A4D\\u0ACD\\u0B4D\\u0BCD\\u0C4D\\u0CCD\\u0D4D\\u0DCA\\U00011235\\U000112EA\\U0001134D\\U00011442\\U000114C2\\U000115BF\\U0001163F\\U000116B6\\U0001172B\\U00011C3F]", + 
"[\\u00AD\\u0300-\\u036F\\u0483-\\u0489\\u0591-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u05C5\\u05C7\\u0610-\\u061A\\u061C\\U0001E000-\\U0001E006\\U0001E008-\\U0001E018\\U0001E01B-\\U0001E021\\U0001E023\\U0001E024\\U0001E026-\\U0001E02A\\U0001E8D0-\\U0001E8D6\\U0001E944-\\U0001E94A\\U000E0001\\U000E0020-\\U000E007F\\U000E0100-\\U000E01EF]", + "[\\u0620\\u0626\\u0628\\u062A-\\u062E\\u0633-\\u063F\\u0641-\\u0647\\u0649\\u064A\\u066E\\u066F\\u0678-\\u0687\\u069A-\\u06BF\\U00010ADE-\\U00010AE0\\U00010AEB-\\U00010AEE\\U00010B80\\U00010B82\\U00010B86-\\U00010B88\\U00010B8A\\U00010B8B\\U00010B8D\\U00010B90\\U00010BAD\\U00010BAE\\U0001E900-\\U0001E943]", + "[\\u0620\\u0622-\\u063F\\u0641-\\u064A\\u066E\\u066F\\u0671-\\u0673\\u0675-\\u06D3\\u06D5\\u06EE\\u06EF\\u06FA-\\u06FC\\u06FF\\U00010AC0-\\U00010AC5\\U00010AC7\\U00010AC9\\U00010ACA\\U00010ACE-\\U00010AD6\\U00010AD8-\\U00010AE1\\U00010AE4\\U00010AEB-\\U00010AEF\\U00010B80-\\U00010B91\\U00010BA9-\\U00010BAE\\U0001E900-\\U0001E943]", + // syntax + "[\\-]", + // changed mapping from 2003 + "[48F\\u04C0\\u10A0-\\u10C5\\u115F\\u1160\\u17B4\\u17B5\\u1806\\u2132\\u2183\\u2F86\\u2F87\\u2F91\\u2F95\\u2F9B\\u3164\\uFFA0]", + // disallowed in 2003 + "[\\-0-\\u0377\\u037A-\\u037F\\u0384-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u052F\\u0531-\\u0556\\u0559-\\u055F\\u0561-\\u0587\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAB30-\\uAB65\\uAB70-\\uABED\\uABF0-\\uABF9\\uAC00-\\uD7A3\\uD7B0-\\uD7C6\\uD7CB-\\uD7FB\\uD800-\\uE007\\uFFFC\\uFFFD]", + // Step 7 + "[\\u2260\\u226E\\u226F\\u2488\\uFE12]", + // disallowed + "[^\\ 0-9A-Za-z\\u00A0\\u00AA\\u00B2\\u00B3\\u00B5\\u00B9\\u00BA\\u00BC-\\U0006FFFD\\U00070000-\\U0007FFFD\\U00080000-\\U0008FFFD\\U00090000-\\U0009FFFD\\U000A0000-\\U000AFFFD\\U000B0000-\\U000BFFFD\\U000C0000-\\U000CFFFD\\U000D0001-\\U000DFFFD\\U000E0002-\\U000E001F\\U000E0080-\\U000EFFFD]", + // deviations + "[\\u00DF\\u03C2\\u200C\\u200D]", }; final String[] samplesOld = { - // bidi - "[[:bc=R:][:bc=AL:]]", - "[[:bc=L:]]", - 
"[[:bc=ES:][:bc=CS:][:bc=ET:][:bc=ON:][:bc=BN:][:bc=NSM:]]", - "[[:bc=EN:]]", - "[[:bc=AN:]]", - "[[:bc=NSM:]]", - // contextj - "[\u200C\u200D]", - "[:ccc=virama:]", - "[:jt=T:]", - "[[:jt=L:][:jt=D:]]", - "[[:jt=R:][:jt=D:]]", - // syntax - "[-]", - "[\\u200C\\u200D\\u00DF\\u03C2]", // deviations + // bidi + "[[:bc=R:][:bc=AL:]]", + "[[:bc=L:]]", + "[[:bc=ES:][:bc=CS:][:bc=ET:][:bc=ON:][:bc=BN:][:bc=NSM:]]", + "[[:bc=EN:]]", + "[[:bc=AN:]]", + "[[:bc=NSM:]]", + // contextj + "[\u200C\u200D]", + "[:ccc=virama:]", + "[:jt=T:]", + "[[:jt=L:][:jt=D:]]", + "[[:jt=R:][:jt=D:]]", + // syntax + "[-]", + "[\\u200C\\u200D\\u00DF\\u03C2]", // deviations }; final String[] samples = NEW_FORMAT ? samplesNew : samplesOld; // OLD B; \u063D\uD803\uDDD6\u1039; [P1 V6]; [P1 V6] // NEW B; \u063D\uFBB0\u0BCD⁰.\uDB40\uDDD6\uD803\uDE71; [B1]; [B1] sampleSets = new UnicodeSet[samples.length]; - //UnicodeSet age = new UnicodeSet("[:age=6.0:]"); + // UnicodeSet age = new UnicodeSet("[:age=6.0:]"); for (int i = 0; i < samples.length; ++i) { - sampleSets[i] = new UnicodeSet(samples[i]) - // .retainAll(age) - .freeze(); + sampleSets[i] = + new UnicodeSet(samples[i]) + // .retainAll(age) + .freeze(); } } + void appendNext(StringBuilder sb) { int len = 1 + random.nextInt(NEW_FORMAT ? 
4 : 7); // random contents, picking from random set @@ -501,6 +567,7 @@ void appendNext(StringBuilder sb) { sb.appendCodePoint(cp); } } + void resetRandom(long seed) { random.setSeed(seed); } @@ -512,36 +579,42 @@ private String showErrors(Set errors) { public static UnicodeSet getIdna2008Valid() { // IdnaLabelTester tester = getIdna2008Tester(); - // UnicodeSet valid2008 = UnicodeSetUtilities.parseUnicodeSet(tester.getVariable("$Valid"), TableStyle.simple); + // UnicodeSet valid2008 = + // UnicodeSetUtilities.parseUnicodeSet(tester.getVariable("$Valid"), TableStyle.simple); // return valid2008; final UnicodeMap typeMapping = Idna2008.getTypeMapping(); return new UnicodeSet(typeMapping.getSet(Idna2008Type.PVALID)) .addAll(typeMapping.getSet(Idna2008Type.CONTEXTJ)) - .addAll(typeMapping.getSet(Idna2008Type.CONTEXTO)) - ; + .addAll(typeMapping.getSet(Idna2008Type.CONTEXTO)); } - public static Transliterator hexForTest = Transliterator.getInstance( - "[" + TO_ESCAPE + "&[\\u0000-\\uFFFF]] any-hex;" + TO_ESCAPE + " any-hex/perl;"); - Transliterator removeInvisible = Transliterator.getInstance("[[:di:][:c:]-[:whitespace:]] remove"); + public static Transliterator hexForTest = + Transliterator.getInstance( + "[" + + TO_ESCAPE + + "&[\\u0000-\\uFFFF]] any-hex;" + + TO_ESCAPE + + " any-hex/perl;"); + Transliterator removeInvisible = + Transliterator.getInstance("[[:di:][:c:]-[:whitespace:]] remove"); static UnicodeSet IDNA2008Valid = new UnicodeSet(getIdna2008Valid()).add('.').freeze(); // 1. The first character must be a character with BIDI property L, R // or AL. If it has the R or AL property, it is an RTL label; if it // has the L property, it is an LTR label. // - //2. In an RTL label, only characters with the BIDI properties R, AL, + // 2. In an RTL label, only characters with the BIDI properties R, AL, // AN, EN, ES, CS, ET, ON, BN and NSM are allowed. // in practice, this excludes L // - //3. In an RTL label, the end of the label must be a character with + // 3. 
In an RTL label, the end of the label must be a character with // BIDI property R, AL, EN or AN, followed by zero or more // characters with BIDI property NSM. // - //4. In an RTL label, if an EN is present, no AN may be present, and + // 4. In an RTL label, if an EN is present, no AN may be present, and // vice versa. // - //5. In an LTR label, only characters with the BIDI properties L, EN, + // 5. In an LTR label, only characters with the BIDI properties L, EN, // ES, CS. ET, ON, BN and NSM are allowed. // in practice, this excludes R, AL // @@ -558,357 +631,545 @@ public static UnicodeSet getIdna2008Valid() { static final char SAMPLE_NSM = '\u0308'; // U+02C7 ( ˇ ) CARON public static String[][] bidiTests = { - {"à" + SAMPLE_R_AL, "B5", "B6"}, - {"0à." + SAMPLE_R_AL,"B1"}, - {"à." + SAMPLE_R_AL + SAMPLE_NSM}, - {"à." + SAMPLE_R_AL + SAMPLE_EN + SAMPLE_AN + SAMPLE_R_AL, "B4"}, - {SAMPLE_NSM + "." + SAMPLE_R_AL + "","B3"}, - {"à." + SAMPLE_R_AL + "0" + SAMPLE_AN,"B4"}, - {"à" + SAMPLE_ES_CS_ET_ON_BN + "." + SAMPLE_R_AL + "","B6"}, - {"à" + SAMPLE_NSM + "." + SAMPLE_R_AL + ""}, + {"à" + SAMPLE_R_AL, "B5", "B6"}, + {"0à." + SAMPLE_R_AL, "B1"}, + {"à." + SAMPLE_R_AL + SAMPLE_NSM}, + {"à." + SAMPLE_R_AL + SAMPLE_EN + SAMPLE_AN + SAMPLE_R_AL, "B4"}, + {SAMPLE_NSM + "." + SAMPLE_R_AL + "", "B3"}, + {"à." + SAMPLE_R_AL + "0" + SAMPLE_AN, "B4"}, + {"à" + SAMPLE_ES_CS_ET_ON_BN + "." + SAMPLE_R_AL + "", "B6"}, + {"à" + SAMPLE_NSM + "." 
+ SAMPLE_R_AL + ""}, }; - public static String[][] contextTests = new String[][] { - {"a\u200Cb","C1"}, - {"a\u094D\u200Cb"}, - {"\u0308\u200C\u0308بb","C1"}, - {"aب\u0308\u200C\u0308","C1"}, - {"aب\u0308\u200C\u0308بb"}, - - {"a\u200Db","C2"}, - {"a\u094D\u200Db"}, - {"\u0308\u200D\u0308بb","C2"}, - {"aب\u0308\u200D\u0308","C2"}, - {"aب\u0308\u200D\u0308بb","C2"}, - }; + public static String[][] contextTests = + new String[][] { + {"a\u200Cb", "C1"}, + {"a\u094D\u200Cb"}, + {"\u0308\u200C\u0308بb", "C1"}, + {"aب\u0308\u200C\u0308", "C1"}, + {"aب\u0308\u200C\u0308بb"}, + {"a\u200Db", "C2"}, + {"a\u094D\u200Db"}, + {"\u0308\u200D\u0308بb", "C2"}, + {"aب\u0308\u200D\u0308", "C2"}, + {"aب\u0308\u200D\u0308بb", "C2"}, + }; public static final Object[][] testCases = { - // special case - { "。", "B", "。", 0 }, - // special case - { "\uAB60", "B", "\uAB60", 0 }, - - { "1234567890\u00E41234567890123456789012345678901234567890123456", "B", - "1234567890\u00E41234567890123456789012345678901234567890123456", - Uts46.UIDNA_ERROR_LABEL_TOO_LONG }, - // all ASCII - { "www.eXample.cOm", "B", "www.example.com", 0 }, - // u-umlaut - { "B\u00FCcher.de", "B", "b\u00FCcher.de", 0 }, - // O-umlaut - { "\u00D6BB", "B", "\u00F6bb", 0 }, - { "fa\u00DF.de", "N", "fa\u00DF.de", 0 }, // sharp s - // sharp s - { "fa\u00DF.de", "T", "fass.de", 0 }, - // sharp s in Punycode - { "XN--fA-hia.dE", "B", "fa\u00DF.de", 0 }, - // Greek with final sigma - { "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "N", - "\u03B2\u03CC\u03BB\u03BF\u03C2.com", 0 }, - // Greek with final - // sigma - { "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "T", - "\u03B2\u03CC\u03BB\u03BF\u03C3.com", 0 }, - { "xn--nxasmm1c", "B", // Greek with final sigma in Punycode - "\u03B2\u03CC\u03BB\u03BF\u03C2", 0 }, - { "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "N", // "Sri" in "Sri - // Lanka" has a ZWJ - "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", 0 }, - { "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "T", // "Sri" in "Sri - // Lanka" has a ZWJ - 
"www.\u0DC1\u0DCA\u0DBB\u0DD3.com", 0 }, - { "www.xn--10cl1a0b660p.com", "B", // "Sri" in Punycode - "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", 0 }, - { "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "N", // ZWNJ - "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", 0 }, - { "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "T", // ZWNJ - "\u0646\u0627\u0645\u0647\u0627\u06CC", 0 }, - { "xn--mgba3gch31f060k.com", "B", // ZWNJ in Punycode - "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC.com", 0 }, - { "a.b\uFF0Ec\u3002d\uFF61", "B", - "a.b.c.d.", 0 }, - { "U\u0308.xn--tda", "B", // U+umlaut.u-umlaut - "\u00FC.\u00FC", 0 }, - { "xn--u-ccb", "B", // u+umlaut in Punycode - "xn--u-ccb\uFFFD", Uts46.UIDNA_ERROR_INVALID_ACE_LABEL }, - { "a\u2488com", "B", // contains 1-dot - "a\uFFFDcom", Uts46.UIDNA_ERROR_DISALLOWED }, - { "xn--a-ecp.ru", "B", // contains 1-dot in Punycode - "xn--a-ecp\uFFFD.ru", Uts46.UIDNA_ERROR_INVALID_ACE_LABEL }, - { "xn--0.pt", "B", // invalid Punycode - "xn--0\uFFFD.pt", Uts46.UIDNA_ERROR_PUNYCODE }, - { "xn--a.pt", "B", // U+0080 - "xn--a\uFFFD.pt", Uts46.UIDNA_ERROR_INVALID_ACE_LABEL }, - { "xn--a-\u00C4.pt", "B", // invalid Punycode - "xn--a-\u00E4.pt", Uts46.UIDNA_ERROR_PUNYCODE }, - { "\u65E5\u672C\u8A9E\u3002\uFF2A\uFF30", "B", // Japanese with - // fullwidth ".jp" - "\u65E5\u672C\u8A9E.jp", 0 }, - { "\u2615", "B", "\u2615", 0 }, // Unicode 4.0 HOT BEVERAGE - // many deviation characters, test the special mapping code - { "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd" + // special case + {"。", "B", "。", 0}, + // special case + {"\uAB60", "B", "\uAB60", 0}, + { + "1234567890\u00E41234567890123456789012345678901234567890123456", + "B", + "1234567890\u00E41234567890123456789012345678901234567890123456", + Uts46.UIDNA_ERROR_LABEL_TOO_LONG + }, + // all ASCII + {"www.eXample.cOm", "B", "www.example.com", 0}, + // u-umlaut + {"B\u00FCcher.de", "B", "b\u00FCcher.de", 0}, + // O-umlaut + {"\u00D6BB", "B", "\u00F6bb", 0}, + {"fa\u00DF.de", "N", 
"fa\u00DF.de", 0}, // sharp s + // sharp s + {"fa\u00DF.de", "T", "fass.de", 0}, + // sharp s in Punycode + {"XN--fA-hia.dE", "B", "fa\u00DF.de", 0}, + // Greek with final sigma + {"\u03B2\u03CC\u03BB\u03BF\u03C2.com", "N", "\u03B2\u03CC\u03BB\u03BF\u03C2.com", 0}, + // Greek with final + // sigma + {"\u03B2\u03CC\u03BB\u03BF\u03C2.com", "T", "\u03B2\u03CC\u03BB\u03BF\u03C3.com", 0}, + { + "xn--nxasmm1c", + "B", // Greek with final sigma in Punycode + "\u03B2\u03CC\u03BB\u03BF\u03C2", + 0 + }, + { + "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", + "N", // "Sri" in "Sri + // Lanka" has a ZWJ + "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", + 0 + }, + { + "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", + "T", // "Sri" in "Sri + // Lanka" has a ZWJ + "www.\u0DC1\u0DCA\u0DBB\u0DD3.com", + 0 + }, + { + "www.xn--10cl1a0b660p.com", + "B", // "Sri" in Punycode + "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", + 0 + }, + { + "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", + "N", // ZWNJ + "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", + 0 + }, + { + "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", + "T", // ZWNJ + "\u0646\u0627\u0645\u0647\u0627\u06CC", + 0 + }, + { + "xn--mgba3gch31f060k.com", + "B", // ZWNJ in Punycode + "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC.com", + 0 + }, + {"a.b\uFF0Ec\u3002d\uFF61", "B", "a.b.c.d.", 0}, + { + "U\u0308.xn--tda", + "B", // U+umlaut.u-umlaut + "\u00FC.\u00FC", + 0 + }, + { + "xn--u-ccb", + "B", // u+umlaut in Punycode + "xn--u-ccb\uFFFD", + Uts46.UIDNA_ERROR_INVALID_ACE_LABEL + }, + { + "a\u2488com", + "B", // contains 1-dot + "a\uFFFDcom", + Uts46.UIDNA_ERROR_DISALLOWED + }, + { + "xn--a-ecp.ru", + "B", // contains 1-dot in Punycode + "xn--a-ecp\uFFFD.ru", + Uts46.UIDNA_ERROR_INVALID_ACE_LABEL + }, + { + "xn--0.pt", + "B", // invalid Punycode + "xn--0\uFFFD.pt", + Uts46.UIDNA_ERROR_PUNYCODE + }, + { + "xn--a.pt", + "B", // U+0080 + "xn--a\uFFFD.pt", + Uts46.UIDNA_ERROR_INVALID_ACE_LABEL + }, + { + "xn--a-\u00C4.pt", + "B", // invalid Punycode + 
"xn--a-\u00E4.pt", + Uts46.UIDNA_ERROR_PUNYCODE + }, + { + "\u65E5\u672C\u8A9E\u3002\uFF2A\uFF30", + "B", // Japanese with + // fullwidth ".jp" + "\u65E5\u672C\u8A9E.jp", + 0 + }, + {"\u2615", "B", "\u2615", 0}, // Unicode 4.0 HOT BEVERAGE + // many deviation characters, test the special mapping code + { + "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd" + "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe" + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx" + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy" - + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", "N", - "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd" - + "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe" - + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx" - + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy" - + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", - Uts46.UIDNA_ERROR_LABEL_TOO_LONG | Uts46.UIDNA_ERROR_CONTEXTJ }, - { "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd" + + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", + "N", + "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd" + "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe" + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx" + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy" - + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", "T", - "1.assbcssssssssd" - + "\u03C3\u03C3sssssssssssssssse" - + "ssssssssssssssssssssx" - + "ssssssssssssssssssssy" - + "sssssssssssssss\u015Dssz", - Uts46.UIDNA_ERROR_LABEL_TOO_LONG }, - // "xn--bss" with deviation characters - { "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "N", - "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", Uts46.UIDNA_ERROR_CONTEXTJ }, - { "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "T", - "\u5919", 0 }, - // "xn--bssffl" written as: - // 02E3 MODIFIER LETTER SMALL X 
- // 034F COMBINING GRAPHEME JOINER (ignored) - // 2115 DOUBLE-STRUCK CAPITAL N - // 200B ZERO WIDTH SPACE (ignored) - // FE63 SMALL HYPHEN-MINUS - // 00AD SOFT HYPHEN (ignored) - // FF0D FULLWIDTH HYPHEN-MINUS - // 180C MONGOLIAN FREE VARIATION SELECTOR TWO (ignored) - // 212C SCRIPT CAPITAL B - // FE00 VARIATION SELECTOR-1 (ignored) - // 017F LATIN SMALL LETTER LONG S - // 2064 INVISIBLE PLUS (ignored) - // 1D530 MATHEMATICAL FRAKTUR SMALL S - // E01EF VARIATION SELECTOR-256 (ignored) - // FB04 LATIN SMALL LIGATURE FFL - { "\u02E3\u034F\u2115\u200B\uFE63\u00AD\uFF0D\u180C" + + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", + Uts46.UIDNA_ERROR_LABEL_TOO_LONG | Uts46.UIDNA_ERROR_CONTEXTJ + }, + { + "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd" + + "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe" + + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx" + + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy" + + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", + "T", + "1.assbcssssssssd" + + "\u03C3\u03C3sssssssssssssssse" + + "ssssssssssssssssssssx" + + "ssssssssssssssssssssy" + + "sssssssssssssss\u015Dssz", + Uts46.UIDNA_ERROR_LABEL_TOO_LONG + }, + // "xn--bss" with deviation characters + { + "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", + "N", + "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", + Uts46.UIDNA_ERROR_CONTEXTJ + }, + {"\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "T", "\u5919", 0}, + // "xn--bssffl" written as: + // 02E3 MODIFIER LETTER SMALL X + // 034F COMBINING GRAPHEME JOINER (ignored) + // 2115 DOUBLE-STRUCK CAPITAL N + // 200B ZERO WIDTH SPACE (ignored) + // FE63 SMALL HYPHEN-MINUS + // 00AD SOFT HYPHEN (ignored) + // FF0D FULLWIDTH HYPHEN-MINUS + // 180C MONGOLIAN FREE VARIATION SELECTOR TWO (ignored) + // 212C SCRIPT CAPITAL B + // FE00 VARIATION SELECTOR-1 (ignored) + // 017F LATIN SMALL LETTER LONG S + // 2064 INVISIBLE PLUS (ignored) + // 1D530 MATHEMATICAL FRAKTUR SMALL S + // 
E01EF VARIATION SELECTOR-256 (ignored) + // FB04 LATIN SMALL LIGATURE FFL + { + "\u02E3\u034F\u2115\u200B\uFE63\u00AD\uFF0D\u180C" + "\u212C\uFE00\u017F\u2064" // + "\\U0001D530" + UTF16.valueOf(0x1D530) // + "\\U000E01EF" + UTF16.valueOf(0xE01EF) - + "\uFB04", "B", - "\u5921\u591E\u591C\u5919", 0 }, - { "123456789012345678901234567890123456789012345678901234567890123." + + "\uFB04", + "B", + "\u5921\u591E\u591C\u5919", + 0 + }, + { + "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890123456789012345678901234567890123456789012345678901", + "B", + "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890123456789012345678901234567890123456789012345678901", + 0 + }, + { + "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890123." + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890123456789012345678901234567890123456789012345678901.", + "B", + "123456789012345678901234567890123456789012345678901234567890123." + "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901", "B", - "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901", - 0 }, - { "123456789012345678901234567890123456789012345678901234567890123." + "123456789012345678901234567890123456789012345678901234567890123." 
+ + "1234567890123456789012345678901234567890123456789012345678901.", + 0 + }, + { + "123456789012345678901234567890123456789012345678901234567890123." + "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901.", "B", - "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901.", - 0 }, - { "123456789012345678901234567890123456789012345678901234567890123." + "123456789012345678901234567890123456789012345678901234567890123." + + "12345678901234567890123456789012345678901234567890123456789012", + "B", + "123456789012345678901234567890123456789012345678901234567890123." + "123456789012345678901234567890123456789012345678901234567890123." - + "12345678901234567890123456789012345678901234567890123456789012", "B", - "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890123." - + "12345678901234567890123456789012345678901234567890123456789012", - Uts46.UIDNA_ERROR_DOMAIN_NAME_TOO_LONG }, - { "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890123." + + "12345678901234567890123456789012345678901234567890123456789012", + Uts46.UIDNA_ERROR_DOMAIN_NAME_TOO_LONG + }, + { + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890123456789012345678901234567890123456789012345678901234." + + "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890", + "B", + "123456789012345678901234567890123456789012345678901234567890123." 
+ + "1234567890123456789012345678901234567890123456789012345678901234." + + "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890", + Uts46.UIDNA_ERROR_LABEL_TOO_LONG + }, + { + "123456789012345678901234567890123456789012345678901234567890123." + "1234567890123456789012345678901234567890123456789012345678901234." + "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890", "B", - "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901234." - + "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890", - Uts46.UIDNA_ERROR_LABEL_TOO_LONG }, - { "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890.", + "B", + "123456789012345678901234567890123456789012345678901234567890123." + "1234567890123456789012345678901234567890123456789012345678901234." + "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890.", "B", - "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901234." - + "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890.", - Uts46.UIDNA_ERROR_LABEL_TOO_LONG }, - { "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890.", + Uts46.UIDNA_ERROR_LABEL_TOO_LONG + }, + { + "123456789012345678901234567890123456789012345678901234567890123." + "1234567890123456789012345678901234567890123456789012345678901234." + "123456789012345678901234567890123456789012345678901234567890123." 
- + "1234567890123456789012345678901234567890123456789012345678901", "B", - "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901234." - + "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901", - Uts46.UIDNA_ERROR_LABEL_TOO_LONG | Uts46.UIDNA_ERROR_DOMAIN_NAME_TOO_LONG }, - // label length 63: - // xn--1234567890123456789012345678901234567890123456789012345-9te - { "\u00E41234567890123456789012345678901234567890123456789012345", "B", - "\u00E41234567890123456789012345678901234567890123456789012345", 0 }, - { "1234567890\u00E41234567890123456789012345678901234567890123456", "B", - "1234567890\u00E41234567890123456789012345678901234567890123456", - Uts46.UIDNA_ERROR_LABEL_TOO_LONG }, - { "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890123456789012345678901234567890123456789012345678901", + "B", + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890123456789012345678901234567890123456789012345678901234." + + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890123456789012345678901234567890123456789012345678901", + Uts46.UIDNA_ERROR_LABEL_TOO_LONG | Uts46.UIDNA_ERROR_DOMAIN_NAME_TOO_LONG + }, + // label length 63: + // xn--1234567890123456789012345678901234567890123456789012345-9te + { + "\u00E41234567890123456789012345678901234567890123456789012345", + "B", + "\u00E41234567890123456789012345678901234567890123456789012345", + 0 + }, + { + "1234567890\u00E41234567890123456789012345678901234567890123456", + "B", + "1234567890\u00E41234567890123456789012345678901234567890123456", + Uts46.UIDNA_ERROR_LABEL_TOO_LONG + }, + { + "123456789012345678901234567890123456789012345678901234567890123." + "1234567890\u00E4123456789012345678901234567890123456789012345." 
+ "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901", "B", - "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890\u00E4123456789012345678901234567890123456789012345." - + "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901", - 0 }, - { "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890123456789012345678901234567890123456789012345678901", + "B", + "123456789012345678901234567890123456789012345678901234567890123." + "1234567890\u00E4123456789012345678901234567890123456789012345." + "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901.", "B", - "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890\u00E4123456789012345678901234567890123456789012345." - + "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901.", - 0 }, - { "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890123456789012345678901234567890123456789012345678901", + 0 + }, + { + "123456789012345678901234567890123456789012345678901234567890123." + "1234567890\u00E4123456789012345678901234567890123456789012345." + "123456789012345678901234567890123456789012345678901234567890123." - + "12345678901234567890123456789012345678901234567890123456789012", "B", - "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890\u00E4123456789012345678901234567890123456789012345." - + "123456789012345678901234567890123456789012345678901234567890123." - + "12345678901234567890123456789012345678901234567890123456789012", - Uts46.UIDNA_ERROR_DOMAIN_NAME_TOO_LONG }, - { "123456789012345678901234567890123456789012345678901234567890123." 
+ + "1234567890123456789012345678901234567890123456789012345678901.", + "B", + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890\u00E4123456789012345678901234567890123456789012345." + + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890123456789012345678901234567890123456789012345678901.", + 0 + }, + { + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890\u00E4123456789012345678901234567890123456789012345." + + "123456789012345678901234567890123456789012345678901234567890123." + + "12345678901234567890123456789012345678901234567890123456789012", + "B", + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890\u00E4123456789012345678901234567890123456789012345." + + "123456789012345678901234567890123456789012345678901234567890123." + + "12345678901234567890123456789012345678901234567890123456789012", + Uts46.UIDNA_ERROR_DOMAIN_NAME_TOO_LONG + }, + { + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890\u00E41234567890123456789012345678901234567890123456." + + "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890", + "B", + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890\u00E41234567890123456789012345678901234567890123456." + + "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890", + Uts46.UIDNA_ERROR_LABEL_TOO_LONG + }, + { + "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890\u00E41234567890123456789012345678901234567890123456." + + "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890.", + "B", + "123456789012345678901234567890123456789012345678901234567890123." 
+ "1234567890\u00E41234567890123456789012345678901234567890123456." + "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890", "B", - "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890\u00E41234567890123456789012345678901234567890123456." - + "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890", - Uts46.UIDNA_ERROR_LABEL_TOO_LONG }, - { "123456789012345678901234567890123456789012345678901234567890123." + + "123456789012345678901234567890123456789012345678901234567890.", + Uts46.UIDNA_ERROR_LABEL_TOO_LONG + }, + { + "123456789012345678901234567890123456789012345678901234567890123." + "1234567890\u00E41234567890123456789012345678901234567890123456." + "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890.", "B", - "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890\u00E41234567890123456789012345678901234567890123456." - + "123456789012345678901234567890123456789012345678901234567890123." - + "123456789012345678901234567890123456789012345678901234567890.", - Uts46.UIDNA_ERROR_LABEL_TOO_LONG }, - { "123456789012345678901234567890123456789012345678901234567890123." + + "1234567890123456789012345678901234567890123456789012345678901", + "B", + "123456789012345678901234567890123456789012345678901234567890123." + "1234567890\u00E41234567890123456789012345678901234567890123456." + "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890123456789012345678901234567890123456789012345678901", "B", - "123456789012345678901234567890123456789012345678901234567890123." - + "1234567890\u00E41234567890123456789012345678901234567890123456." - + "123456789012345678901234567890123456789012345678901234567890123." 
- + "1234567890123456789012345678901234567890123456789012345678901", - Uts46.UIDNA_ERROR_LABEL_TOO_LONG | Uts46.UIDNA_ERROR_DOMAIN_NAME_TOO_LONG }, - // hyphen errors and empty-label errors - // "xn---q----jra"=="-q--a-umlaut-" - { "a.b..-q--a-.e", "B", "a.b..-q--a-.e", - Uts46.UIDNA_ERROR_EMPTY_LABEL | Uts46.UIDNA_ERROR_LEADING_HYPHEN | Uts46.UIDNA_ERROR_TRAILING_HYPHEN - | - Uts46.UIDNA_ERROR_HYPHEN_3_4 }, - { "a.b..-q--\u00E4-.e", "B", "a.b..-q--\u00E4-.e", - Uts46.UIDNA_ERROR_EMPTY_LABEL | Uts46.UIDNA_ERROR_LEADING_HYPHEN | Uts46.UIDNA_ERROR_TRAILING_HYPHEN - | - Uts46.UIDNA_ERROR_HYPHEN_3_4 }, - { "a.b..xn---q----jra.e", "B", "a.b..-q--\u00E4-.e", - Uts46.UIDNA_ERROR_EMPTY_LABEL | Uts46.UIDNA_ERROR_LEADING_HYPHEN | Uts46.UIDNA_ERROR_TRAILING_HYPHEN - | - Uts46.UIDNA_ERROR_HYPHEN_3_4 }, - { "a..c", "B", "a..c", Uts46.UIDNA_ERROR_EMPTY_LABEL }, - { "a.-b.", "B", "a.-b.", Uts46.UIDNA_ERROR_LEADING_HYPHEN }, - { "a.b-.c", "B", "a.b-.c", Uts46.UIDNA_ERROR_TRAILING_HYPHEN }, - { "a.-.c", "B", "a.-.c", Uts46.UIDNA_ERROR_LEADING_HYPHEN | Uts46.UIDNA_ERROR_TRAILING_HYPHEN }, - { "a.bc--de.f", "B", "a.bc--de.f", Uts46.UIDNA_ERROR_HYPHEN_3_4 }, - { "\u00E4.\u00AD.c", "B", "\u00E4..c", Uts46.UIDNA_ERROR_EMPTY_LABEL }, - { "\u00E4.-b.", "B", "\u00E4.-b.", Uts46.UIDNA_ERROR_LEADING_HYPHEN }, - { "\u00E4.b-.c", "B", "\u00E4.b-.c", Uts46.UIDNA_ERROR_TRAILING_HYPHEN }, - { "\u00E4.-.c", "B", "\u00E4.-.c", Uts46.UIDNA_ERROR_LEADING_HYPHEN | Uts46.UIDNA_ERROR_TRAILING_HYPHEN }, - { "\u00E4.bc--de.f", "B", "\u00E4.bc--de.f", Uts46.UIDNA_ERROR_HYPHEN_3_4 }, - { "a.b.\u0308c.d", "B", "a.b.\uFFFDc.d", Uts46.UIDNA_ERROR_LEADING_COMBINING_MARK }, - { "a.b.xn--c-bcb.d", "B", "a.b.xn--c-bcb\uFFFD.d", Uts46.UIDNA_ERROR_LEADING_COMBINING_MARK }, - // BiDi - { "A0", "B", "a0", 0 }, - { "0A", "B", "0a", 0 }, // all-LTR is ok to start with a digit (EN) - { "0A.\u05D0", "B", // ASCII label does not start with L/R/AL - "0a.\u05D0", Uts46.UIDNA_ERROR_BIDI }, - { "c.xn--0-eha.xn--4db", "B", // 
2nd label does not start with - // L/R/AL - "c.0\u00FC.\u05D0", Uts46.UIDNA_ERROR_BIDI }, - { "b-.\u05D0", "B", // label does not end with L/EN - "b-.\u05D0", Uts46.UIDNA_ERROR_TRAILING_HYPHEN | Uts46.UIDNA_ERROR_BIDI }, - { "d.xn----dha.xn--4db", "B", // 2nd label does not end with L/EN - "d.\u00FC-.\u05D0", Uts46.UIDNA_ERROR_TRAILING_HYPHEN | Uts46.UIDNA_ERROR_BIDI }, - { "a\u05D0", "B", "a\u05D0", Uts46.UIDNA_ERROR_BIDI }, // first dir - // != last - // dir - { "\u05D0\u05C7", "B", "\u05D0\u05C7", 0 }, - { "\u05D09\u05C7", "B", "\u05D09\u05C7", 0 }, - { "\u05D0a\u05C7", "B", "\u05D0a\u05C7", Uts46.UIDNA_ERROR_BIDI }, // first - // dir - // != - // last - // dir - { "\u05D0\u05EA", "B", "\u05D0\u05EA", 0 }, - { "\u05D0\u05F3\u05EA", "B", "\u05D0\u05F3\u05EA", 0 }, - { "a\u05D0Tz", "B", "a\u05D0tz", Uts46.UIDNA_ERROR_BIDI }, // mixed - // dir - { "\u05D0T\u05EA", "B", "\u05D0t\u05EA", Uts46.UIDNA_ERROR_BIDI }, // mixed - // dir - { "\u05D07\u05EA", "B", "\u05D07\u05EA", 0 }, - { "\u05D0\u0667\u05EA", "B", "\u05D0\u0667\u05EA", 0 }, // Arabic 7 - // in the - // middle - { "a7\u0667z", "B", "a7\u0667z", Uts46.UIDNA_ERROR_BIDI }, // AN - // digit - // in LTR - { "\u05D07\u0667\u05EA", "B", // mixed EN/AN digits in RTL - "\u05D07\u0667\u05EA", Uts46.UIDNA_ERROR_BIDI }, - // ZWJ - { "\u0BB9\u0BCD\u200D", "N", "\u0BB9\u0BCD\u200D", 0 }, // Virama+ZWJ - { "\u0BB9\u200D", "N", "\u0BB9\u200D", Uts46.UIDNA_ERROR_CONTEXTJ }, // no - // Virama - { "\u200D", "N", "\u200D", Uts46.UIDNA_ERROR_CONTEXTJ }, // no - // Virama - // ZWNJ - { "\u0BB9\u0BCD\u200C", "N", "\u0BB9\u0BCD\u200C", 0 }, // Virama+ZWNJ - { "\u0BB9\u200C", "N", "\u0BB9\u200C", Uts46.UIDNA_ERROR_CONTEXTJ }, // no - // Virama - { "\u200C", "N", "\u200C", Uts46.UIDNA_ERROR_CONTEXTJ }, // no - // Virama - { "\u0644\u0670\u200C\u06ED\u06EF", "N", // Joining types D T ZWNJ T - // R - "\u0644\u0670\u200C\u06ED\u06EF", 0 }, - { "\u0644\u0670\u200C\u06EF", "N", // D T ZWNJ R - "\u0644\u0670\u200C\u06EF", 0 }, - { 
"\u0644\u200C\u06ED\u06EF", "N", // D ZWNJ T R - "\u0644\u200C\u06ED\u06EF", 0 }, - { "\u0644\u200C\u06EF", "N", // D ZWNJ R - "\u0644\u200C\u06EF", 0 }, - { "\u0644\u0670\u200C\u06ED", "N", // D T ZWNJ T - "\u0644\u0670\u200C\u06ED", Uts46.UIDNA_ERROR_BIDI | Uts46.UIDNA_ERROR_CONTEXTJ }, - { "\u06EF\u200C\u06EF", "N", // R ZWNJ R - "\u06EF\u200C\u06EF", Uts46.UIDNA_ERROR_CONTEXTJ }, - { "\u0644\u200C", "N", // D ZWNJ - "\u0644\u200C", Uts46.UIDNA_ERROR_BIDI | Uts46.UIDNA_ERROR_CONTEXTJ }, - // { "", "B", - // "", 0 }, - {"0à.\u05D0"}, - {"à.\u05D00\u0660"}, - {"a。。b"}, - {"\u200D。。\u06B9\u200C"}, - {"\u05D0\u0030\u0660"}, - {"$"}, + + "1234567890123456789012345678901234567890123456789012345678901", + Uts46.UIDNA_ERROR_LABEL_TOO_LONG | Uts46.UIDNA_ERROR_DOMAIN_NAME_TOO_LONG + }, + // hyphen errors and empty-label errors + // "xn---q----jra"=="-q--a-umlaut-" + { + "a.b..-q--a-.e", + "B", + "a.b..-q--a-.e", + Uts46.UIDNA_ERROR_EMPTY_LABEL + | Uts46.UIDNA_ERROR_LEADING_HYPHEN + | Uts46.UIDNA_ERROR_TRAILING_HYPHEN + | Uts46.UIDNA_ERROR_HYPHEN_3_4 + }, + { + "a.b..-q--\u00E4-.e", + "B", + "a.b..-q--\u00E4-.e", + Uts46.UIDNA_ERROR_EMPTY_LABEL + | Uts46.UIDNA_ERROR_LEADING_HYPHEN + | Uts46.UIDNA_ERROR_TRAILING_HYPHEN + | Uts46.UIDNA_ERROR_HYPHEN_3_4 + }, + { + "a.b..xn---q----jra.e", + "B", + "a.b..-q--\u00E4-.e", + Uts46.UIDNA_ERROR_EMPTY_LABEL + | Uts46.UIDNA_ERROR_LEADING_HYPHEN + | Uts46.UIDNA_ERROR_TRAILING_HYPHEN + | Uts46.UIDNA_ERROR_HYPHEN_3_4 + }, + {"a..c", "B", "a..c", Uts46.UIDNA_ERROR_EMPTY_LABEL}, + {"a.-b.", "B", "a.-b.", Uts46.UIDNA_ERROR_LEADING_HYPHEN}, + {"a.b-.c", "B", "a.b-.c", Uts46.UIDNA_ERROR_TRAILING_HYPHEN}, + { + "a.-.c", + "B", + "a.-.c", + Uts46.UIDNA_ERROR_LEADING_HYPHEN | Uts46.UIDNA_ERROR_TRAILING_HYPHEN + }, + {"a.bc--de.f", "B", "a.bc--de.f", Uts46.UIDNA_ERROR_HYPHEN_3_4}, + {"\u00E4.\u00AD.c", "B", "\u00E4..c", Uts46.UIDNA_ERROR_EMPTY_LABEL}, + {"\u00E4.-b.", "B", "\u00E4.-b.", Uts46.UIDNA_ERROR_LEADING_HYPHEN}, + {"\u00E4.b-.c", "B", 
"\u00E4.b-.c", Uts46.UIDNA_ERROR_TRAILING_HYPHEN}, + { + "\u00E4.-.c", + "B", + "\u00E4.-.c", + Uts46.UIDNA_ERROR_LEADING_HYPHEN | Uts46.UIDNA_ERROR_TRAILING_HYPHEN + }, + {"\u00E4.bc--de.f", "B", "\u00E4.bc--de.f", Uts46.UIDNA_ERROR_HYPHEN_3_4}, + {"a.b.\u0308c.d", "B", "a.b.\uFFFDc.d", Uts46.UIDNA_ERROR_LEADING_COMBINING_MARK}, + {"a.b.xn--c-bcb.d", "B", "a.b.xn--c-bcb\uFFFD.d", Uts46.UIDNA_ERROR_LEADING_COMBINING_MARK}, + // BiDi + {"A0", "B", "a0", 0}, + {"0A", "B", "0a", 0}, // all-LTR is ok to start with a digit (EN) + { + "0A.\u05D0", + "B", // ASCII label does not start with L/R/AL + "0a.\u05D0", + Uts46.UIDNA_ERROR_BIDI + }, + { + "c.xn--0-eha.xn--4db", + "B", // 2nd label does not start with + // L/R/AL + "c.0\u00FC.\u05D0", + Uts46.UIDNA_ERROR_BIDI + }, + { + "b-.\u05D0", + "B", // label does not end with L/EN + "b-.\u05D0", + Uts46.UIDNA_ERROR_TRAILING_HYPHEN | Uts46.UIDNA_ERROR_BIDI + }, + { + "d.xn----dha.xn--4db", + "B", // 2nd label does not end with L/EN + "d.\u00FC-.\u05D0", + Uts46.UIDNA_ERROR_TRAILING_HYPHEN | Uts46.UIDNA_ERROR_BIDI + }, + {"a\u05D0", "B", "a\u05D0", Uts46.UIDNA_ERROR_BIDI}, // first dir + // != last + // dir + {"\u05D0\u05C7", "B", "\u05D0\u05C7", 0}, + {"\u05D09\u05C7", "B", "\u05D09\u05C7", 0}, + {"\u05D0a\u05C7", "B", "\u05D0a\u05C7", Uts46.UIDNA_ERROR_BIDI}, // first + // dir + // != + // last + // dir + {"\u05D0\u05EA", "B", "\u05D0\u05EA", 0}, + {"\u05D0\u05F3\u05EA", "B", "\u05D0\u05F3\u05EA", 0}, + {"a\u05D0Tz", "B", "a\u05D0tz", Uts46.UIDNA_ERROR_BIDI}, // mixed + // dir + {"\u05D0T\u05EA", "B", "\u05D0t\u05EA", Uts46.UIDNA_ERROR_BIDI}, // mixed + // dir + {"\u05D07\u05EA", "B", "\u05D07\u05EA", 0}, + {"\u05D0\u0667\u05EA", "B", "\u05D0\u0667\u05EA", 0}, // Arabic 7 + // in the + // middle + {"a7\u0667z", "B", "a7\u0667z", Uts46.UIDNA_ERROR_BIDI}, // AN + // digit + // in LTR + { + "\u05D07\u0667\u05EA", + "B", // mixed EN/AN digits in RTL + "\u05D07\u0667\u05EA", + Uts46.UIDNA_ERROR_BIDI + }, + // ZWJ + 
{"\u0BB9\u0BCD\u200D", "N", "\u0BB9\u0BCD\u200D", 0}, // Virama+ZWJ + {"\u0BB9\u200D", "N", "\u0BB9\u200D", Uts46.UIDNA_ERROR_CONTEXTJ}, // no + // Virama + {"\u200D", "N", "\u200D", Uts46.UIDNA_ERROR_CONTEXTJ}, // no + // Virama + // ZWNJ + {"\u0BB9\u0BCD\u200C", "N", "\u0BB9\u0BCD\u200C", 0}, // Virama+ZWNJ + {"\u0BB9\u200C", "N", "\u0BB9\u200C", Uts46.UIDNA_ERROR_CONTEXTJ}, // no + // Virama + {"\u200C", "N", "\u200C", Uts46.UIDNA_ERROR_CONTEXTJ}, // no + // Virama + { + "\u0644\u0670\u200C\u06ED\u06EF", + "N", // Joining types D T ZWNJ T + // R + "\u0644\u0670\u200C\u06ED\u06EF", + 0 + }, + { + "\u0644\u0670\u200C\u06EF", + "N", // D T ZWNJ R + "\u0644\u0670\u200C\u06EF", + 0 + }, + { + "\u0644\u200C\u06ED\u06EF", + "N", // D ZWNJ T R + "\u0644\u200C\u06ED\u06EF", + 0 + }, + { + "\u0644\u200C\u06EF", + "N", // D ZWNJ R + "\u0644\u200C\u06EF", + 0 + }, + { + "\u0644\u0670\u200C\u06ED", + "N", // D T ZWNJ T + "\u0644\u0670\u200C\u06ED", + Uts46.UIDNA_ERROR_BIDI | Uts46.UIDNA_ERROR_CONTEXTJ + }, + { + "\u06EF\u200C\u06EF", + "N", // R ZWNJ R + "\u06EF\u200C\u06EF", + Uts46.UIDNA_ERROR_CONTEXTJ + }, + { + "\u0644\u200C", + "N", // D ZWNJ + "\u0644\u200C", + Uts46.UIDNA_ERROR_BIDI | Uts46.UIDNA_ERROR_CONTEXTJ + }, + // { "", "B", + // "", 0 }, + {"0à.\u05D0"}, + {"à.\u05D00\u0660"}, + {"a。。b"}, + {"\u200D。。\u06B9\u200C"}, + {"\u05D0\u0030\u0660"}, + {"$"}, }; } diff --git a/unicodetools/src/main/java/org/unicode/idna/Idna.java b/unicodetools/src/main/java/org/unicode/idna/Idna.java index 1a0c1bf67..d7bbc81bb 100644 --- a/unicodetools/src/main/java/org/unicode/idna/Idna.java +++ b/unicodetools/src/main/java/org/unicode/idna/Idna.java @@ -1,15 +1,12 @@ package org.unicode.idna; - -import java.util.regex.Pattern; - -import org.unicode.text.utility.UnicodeTransform; -import org.unicode.text.utility.UnicodeTransform.Type; - import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.text.StringPrepParseException; import com.ibm.icu.text.StringTransform; import 
com.ibm.icu.text.UnicodeSet; +import java.util.regex.Pattern; +import org.unicode.text.utility.UnicodeTransform; +import org.unicode.text.utility.UnicodeTransform.Type; public class Idna implements StringTransform { @@ -17,12 +14,18 @@ public class Idna implements StringTransform { public static final UnicodeTransform NFD = UnicodeTransform.getInstance(Type.NFD); public static final UnicodeTransform NFKC = UnicodeTransform.getInstance(Type.NFKC); public static final UnicodeTransform NFKD = UnicodeTransform.getInstance(Type.NFKD); - public static final UnicodeTransform NFKC_3_2 = new FilteredUnicodeTransform(NFKC, new UnicodeSet("[:age=3.2:]")); - public static final UnicodeTransform CASEFOLD = UnicodeTransform.getInstance(UnicodeTransform.Type.CASEFOLD); + public static final UnicodeTransform NFKC_3_2 = + new FilteredUnicodeTransform(NFKC, new UnicodeSet("[:age=3.2:]")); + public static final UnicodeTransform CASEFOLD = + UnicodeTransform.getInstance(UnicodeTransform.Type.CASEFOLD); public static final Pattern FULL_STOP = Pattern.compile("\\."); public enum IdnaType { - valid, ignored, mapped, deviation, disallowed; + valid, + ignored, + mapped, + deviation, + disallowed; } public UnicodeMap types = new UnicodeMap(); @@ -32,11 +35,12 @@ public enum IdnaType { public UnicodeSet validSet_transitional = new UnicodeSet(); protected boolean checkPunycodeValidity = false; private final String name; - //static final Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE); + // static final Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", + // Normalizer2.Mode.COMPOSE); protected Idna() { final String[] names = this.getClass().getName().split("[.]"); - name = names[names.length-1]; + name = names[names.length - 1]; } public IdnaType getType(int i) { @@ -122,27 +126,26 @@ public String toPunyCode(String source, boolean[] error) { public int isValidLabel(String string, boolean display) { /* -The label must contain at least one code point. 
-The label must not contain a U+002D HYPHEN-MINUS character in both the third position and fourth positions. -The label must neither begin nor end with a U+002D HYPHEN-MINUS character. -The label must be in Unicode Normalization Form NFC. -The label must not contain a U+002E ( . ) FULL STOP. -Each code point in the label must only have certain status values according to Section 5, IDNA Mapping Table: -For Transitional Processing, each value must be valid. -For Nontransitional Processing, each value must be either valid or deviation. -The label must not begin with a combining mark, that is: General_Category=Mark. - */ + The label must contain at least one code point. + The label must not contain a U+002D HYPHEN-MINUS character in both the third position and fourth positions. + The label must neither begin nor end with a U+002D HYPHEN-MINUS character. + The label must be in Unicode Normalization Form NFC. + The label must not contain a U+002E ( . ) FULL STOP. + Each code point in the label must only have certain status values according to Section 5, IDNA Mapping Table: + For Transitional Processing, each value must be valid. + For Nontransitional Processing, each value must be either valid or deviation. + The label must not begin with a combining mark, that is: General_Category=Mark. 
+ */ if (string.length() == 0) { return 1; } - if (string.length() > 3 && string.charAt(2) == '-' && string.charAt(3) == '-') - { + if (string.length() > 3 && string.charAt(2) == '-' && string.charAt(3) == '-') { return 2; // fix to use code points } if (string.startsWith("-") || string.endsWith("-")) { return 3; } - if (!NFC.isTransformed(string)) { //Normalizer.isNormalized(string, Normalizer.NFC, 0)) + if (!NFC.isTransformed(string)) { // Normalizer.isNormalized(string, Normalizer.NFC, 0)) return 4; } if (string.contains(".")) { @@ -159,7 +162,7 @@ public int isValidLabel(String string, boolean display) { public boolean isValid(String string) { final String trans = transform(string); - return NFC.isTransformed(trans) && validSet.containsAll(trans); // Normalizer.isNormalized(trans, Normalizer.NFC, 0) + return NFC.isTransformed(trans) + && validSet.containsAll(trans); // Normalizer.isNormalized(trans, Normalizer.NFC, 0) } } - diff --git a/unicodetools/src/main/java/org/unicode/idna/Idna2003.java b/unicodetools/src/main/java/org/unicode/idna/Idna2003.java index ff00225dc..78da49e05 100644 --- a/unicodetools/src/main/java/org/unicode/idna/Idna2003.java +++ b/unicodetools/src/main/java/org/unicode/idna/Idna2003.java @@ -1,6 +1,5 @@ package org.unicode.idna; - public class Idna2003 extends Idna { private Idna2003(boolean STD3) { @@ -12,7 +11,7 @@ private Idna2003(boolean STD3) { public static Idna2003 SINGLETON = new Idna2003(true); public static Idna2003 SINGLETON_NSTD3 = new Idna2003(false); - static public IdnaType getIDNA2003Type(int cp) { + public static IdnaType getIDNA2003Type(int cp) { return SINGLETON.getType(cp); } @@ -72,13 +71,15 @@ public static String toIdna2003(String s) { // // private static void convertWithHack() throws StringPrepParseException { // try { - // Idna2003.intermediate = IDNA.convertToASCII(Idna2003.inbuffer, IDNA.USE_STD3_RULES); // USE_STD3_RULES, + // Idna2003.intermediate = IDNA.convertToASCII(Idna2003.inbuffer, 
IDNA.USE_STD3_RULES); // + // USE_STD3_RULES, // } catch (StringPrepParseException e) { // if (!e.getMessage().contains("BIDI")) { // throw e; // } // inbuffer.append("\\u05D9"); - // Idna2003.intermediate = IDNA.convertToASCII(Idna2003.inbuffer, IDNA.USE_STD3_RULES); // USE_STD3_RULES, + // Idna2003.intermediate = IDNA.convertToASCII(Idna2003.inbuffer, IDNA.USE_STD3_RULES); // + // USE_STD3_RULES, // } // } // @@ -89,12 +90,14 @@ public static String toIdna2003(String s) { // // UnicodeUtilities.inbuffer.setLength(0); // // UTF16.append(UnicodeUtilities.inbuffer, cp); // // try { - // // UnicodeUtilities.intermediate = IDNA.convertToASCII(UnicodeUtilities.inbuffer, IDNA.USE_STD3_RULES); // USE_STD3_RULES, + // // UnicodeUtilities.intermediate = IDNA.convertToASCII(UnicodeUtilities.inbuffer, + // IDNA.USE_STD3_RULES); // USE_STD3_RULES, // // // DEFAULT // // if (UnicodeUtilities.intermediate.length() == 0) { // // return ""; // // } - // // UnicodeUtilities.outbuffer = IDNA.convertToUnicode(UnicodeUtilities.intermediate, IDNA.USE_STD3_RULES); + // // UnicodeUtilities.outbuffer = IDNA.convertToUnicode(UnicodeUtilities.intermediate, + // IDNA.USE_STD3_RULES); // // } catch (StringPrepParseException e) { // // if (e.getMessage().startsWith("Found zero length")) { // // return ""; @@ -125,7 +128,8 @@ public static String toIdna2003(String s) { // throw new IllegalArgumentException(); // } // idna2003.append(UnicodeUtilities.toHTML.transform(Idna2003.intermediate.toString())); - // idna2003back.append(IDNA.convertToUnicode(Idna2003.intermediate, IDNA.USE_STD3_RULES).toString()); + // idna2003back.append(IDNA.convertToUnicode(Idna2003.intermediate, + // IDNA.USE_STD3_RULES).toString()); // } catch (Exception e) { // idna2003.append('\uFFFD'); // idna2003back.append('\uFFFD'); diff --git a/unicodetools/src/main/java/org/unicode/idna/Idna2008.java b/unicodetools/src/main/java/org/unicode/idna/Idna2008.java index 2ebb4734f..657dece84 100644 --- 
a/unicodetools/src/main/java/org/unicode/idna/Idna2008.java +++ b/unicodetools/src/main/java/org/unicode/idna/Idna2008.java @@ -9,14 +9,19 @@ public class Idna2008 extends Idna { public static final UnicodeSet GRANDFATHERED_VALID = new UnicodeSet().add(0x19DA).freeze(); public enum Idna2008Type { - UNASSIGNED, DISALLOWED, PVALID, CONTEXTJ, CONTEXTO + UNASSIGNED, + DISALLOWED, + PVALID, + CONTEXTJ, + CONTEXTO } static final UnicodeMap IDNA2008Computed; static { // A: General_Category(cp) is in {Ll, Lu, Lo, Nd, Lm, Mn, Mc} - final UnicodeSet LetterDigits = new UnicodeSet("[[:Ll:][:Lu:][:Lo:][:Nd:][:Lm:][:Mn:][:Mc:]]").freeze(); + final UnicodeSet LetterDigits = + new UnicodeSet("[[:Ll:][:Lu:][:Lo:][:Nd:][:Lm:][:Mn:][:Mc:]]").freeze(); // B: toNFKC(toCaseFold(toNFKC(cp))) != cp final UnicodeSet Unstable = new UnicodeSet(); @@ -34,15 +39,21 @@ public enum Idna2008Type { // C: Default_Ignorable_Code_Point(cp) = True or // White_Space(cp) = True or // Noncharacter_Code_Point(cp) = True - final UnicodeSet IgnorableProperties = new UnicodeSet("[[:Default_Ignorable_Code_Point:]" + - "[:White_Space:]" + - "[:Noncharacter_Code_Point:]]").freeze(); + final UnicodeSet IgnorableProperties = + new UnicodeSet( + "[[:Default_Ignorable_Code_Point:]" + + "[:White_Space:]" + + "[:Noncharacter_Code_Point:]]") + .freeze(); // Block(cp) is in {Combining Diacritical Marks for Symbols, // Musical Symbols, Ancient Greek Musical Notation} - final UnicodeSet IgnorableBlocks = new UnicodeSet("[[:block=Combining Diacritical Marks for Symbols:]" + - "[:block=Musical Symbols:]" + - "[:block=Ancient Greek Musical Notation:]]").freeze(); + final UnicodeSet IgnorableBlocks = + new UnicodeSet( + "[[:block=Combining Diacritical Marks for Symbols:]" + + "[:block=Musical Symbols:]" + + "[:block=Ancient Greek Musical Notation:]]") + .freeze(); // E: cp is in {002D, 0030..0039, 0061..007A} final UnicodeSet LDH = new UnicodeSet("[\u002D\u0030-\u0039\u0061-\u007A]").freeze(); @@ -54,11 +65,20 @@ public enum 
Idna2008Type { // 302E, 302F, 3031, 3032, 3033, 3034, 3035, 303B, // 30FB} - final UnicodeMap Exceptions = new UnicodeMap() - .putAll(new UnicodeSet("[\u00DF\u03C2\u06FD\u06FE\u0F0B\u3007]"), Idna2008Type.PVALID) - .putAll(new UnicodeSet("[\u00B7\u0375\u05F3\u05F4\u30FB\u0660-\u0669\u06F0-\u06F9]"), Idna2008Type.CONTEXTO) - .putAll(new UnicodeSet("[\u0640\u07FA\u302E\u302F\u3031\u3032\u3033\u3034\u3035\u303B]"), Idna2008Type.DISALLOWED) - .freeze(); + final UnicodeMap Exceptions = + new UnicodeMap() + .putAll( + new UnicodeSet("[\u00DF\u03C2\u06FD\u06FE\u0F0B\u3007]"), + Idna2008Type.PVALID) + .putAll( + new UnicodeSet( + "[\u00B7\u0375\u05F3\u05F4\u30FB\u0660-\u0669\u06F0-\u06F9]"), + Idna2008Type.CONTEXTO) + .putAll( + new UnicodeSet( + "[\u0640\u07FA\u302E\u302F\u3031\u3032\u3033\u3034\u3035\u303B]"), + Idna2008Type.DISALLOWED) + .freeze(); // G: cp is in {} @@ -70,9 +90,11 @@ public enum Idna2008Type { // Hangul_Syllable_Type(cp) is in {L, V, T} - final UnicodeSet OldHangulJamo = new UnicodeSet("[[:Hangul_Syllable_Type=L:]" + - "[:Hangul_Syllable_Type=V:]" + - "[:Hangul_Syllable_Type=T:]]"); + final UnicodeSet OldHangulJamo = + new UnicodeSet( + "[[:Hangul_Syllable_Type=L:]" + + "[:Hangul_Syllable_Type=V:]" + + "[:Hangul_Syllable_Type=T:]]"); // J: General_Category(cp) is in {Cn} and // Noncharacter_Code_Point(cp) = False @@ -91,8 +113,10 @@ public enum Idna2008Type { // Else If .cp. .in. 
LetterDigits Then PVALID; // Else DISALLOWED; - - final UnicodeMap Incompatible = new UnicodeMap().putAll(GRANDFATHERED_VALID, Idna2008Type.PVALID).freeze(); + final UnicodeMap Incompatible = + new UnicodeMap() + .putAll(GRANDFATHERED_VALID, Idna2008Type.PVALID) + .freeze(); IDNA2008Computed = new UnicodeMap(); @@ -128,21 +152,21 @@ public enum Idna2008Type { IDNA2008Computed.freeze(); } - public static Idna2008 SINGLETON = new Idna2008(); + public static Idna2008 SINGLETON = new Idna2008(); private Idna2008() { for (final Idna2008Type oldType : IDNA2008Computed.values()) { final UnicodeSet uset = IDNA2008Computed.getSet(oldType); switch (oldType) { - case UNASSIGNED: - case DISALLOWED: - types.putAll(uset, Idna.IdnaType.disallowed); - break; - case PVALID: - case CONTEXTJ: - case CONTEXTO: - types.putAll(uset, Idna.IdnaType.valid); - break; + case UNASSIGNED: + case DISALLOWED: + types.putAll(uset, Idna.IdnaType.disallowed); + break; + case PVALID: + case CONTEXTJ: + case CONTEXTO: + types.putAll(uset, Idna.IdnaType.valid); + break; } } types.put('.', IdnaType.valid); @@ -158,7 +182,8 @@ public static UnicodeMap getTypeMapping() { public static UnicodeSet getIdna2008Valid() { // IdnaLabelTester tester = getIdna2008Tester(); - // UnicodeSet valid2008 = UnicodeSetUtilities.parseUnicodeSet(tester.getVariable("$Valid"), TableStyle.simple); + // UnicodeSet valid2008 = + // UnicodeSetUtilities.parseUnicodeSet(tester.getVariable("$Valid"), TableStyle.simple); // return valid2008; UnicodeMap typeMapping = Idna2008.getTypeMapping(); return new UnicodeSet(typeMapping.getSet(Idna2008Type.PVALID)) diff --git a/unicodetools/src/main/java/org/unicode/idna/Idna2008t.java b/unicodetools/src/main/java/org/unicode/idna/Idna2008t.java index 2e0f44e04..720171a78 100644 --- a/unicodetools/src/main/java/org/unicode/idna/Idna2008t.java +++ b/unicodetools/src/main/java/org/unicode/idna/Idna2008t.java @@ -1,15 +1,13 @@ package org.unicode.idna; +import com.ibm.icu.dev.util.UnicodeMap; 
+import com.ibm.icu.text.UnicodeSet; import java.io.BufferedReader; import java.io.InputStreamReader; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.idna.Idna2008.Idna2008Type; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; - public class Idna2008t extends Idna { public static Idna2008t SINGLETON = new Idna2008t(); @@ -24,15 +22,15 @@ private Idna2008t() { for (final Idna2008Type oldType : oldTypes.values()) { final UnicodeSet uset = oldTypes.getSet(oldType); switch (oldType) { - case UNASSIGNED: - case DISALLOWED: - types.putAll(uset, Idna.IdnaType.disallowed); - break; - case PVALID: - case CONTEXTJ: - case CONTEXTO: - types.putAll(uset, Idna.IdnaType.valid); - break; + case UNASSIGNED: + case DISALLOWED: + types.putAll(uset, Idna.IdnaType.disallowed); + break; + case PVALID: + case CONTEXTJ: + case CONTEXTO: + types.putAll(uset, Idna.IdnaType.valid); + break; } } types.put('.', IdnaType.valid); @@ -50,18 +48,21 @@ public static UnicodeMap getTypeMapping() { private static void initData() { - final Matcher DATALINE = Pattern.compile( - "([0-9a-fA-F]{4,6})" + - "(?:\\.\\.([0-9a-fA-F]{4,6}))?" + - "\\s*;\\s*" + - "(PVALID|DISALLOWED|UNASSIGNED|CONTEXTJ|CONTEXTO)" + - "\\s*#\\s*" + - "(.*)").matcher(""); + final Matcher DATALINE = + Pattern.compile( + "([0-9a-fA-F]{4,6})" + + "(?:\\.\\.([0-9a-fA-F]{4,6}))?" 
+ + "\\s*;\\s*" + + "(PVALID|DISALLOWED|UNASSIGNED|CONTEXTJ|CONTEXTO)" + + "\\s*#\\s*" + + "(.*)") + .matcher(""); try { - final BufferedReader in = new BufferedReader( - new InputStreamReader( - Idna2008.class.getResourceAsStream("tables.txt"))); + final BufferedReader in = + new BufferedReader( + new InputStreamReader( + Idna2008.class.getResourceAsStream("tables.txt"))); // FileUtilities.openReader(Utility.DATA_DIRECTORY + "/IDN/", // "draft-faltstrom-idnabis-tables-05.txt", "ascii"); boolean inTable = false; @@ -87,7 +88,9 @@ private static void initData() { if (!inTable) { continue; } - if (line.length() == 0 || line.startsWith("Faltstrom") || line.startsWith("Internet-Draft")) { + if (line.length() == 0 + || line.startsWith("Faltstrom") + || line.startsWith("Internet-Draft")) { continue; } // we now have real data @@ -96,8 +99,10 @@ private static void initData() { continue; } final int startChar = Integer.parseInt(DATALINE.group(1), 16); - final int endChar = DATALINE.group(2) == null ? startChar : Integer.parseInt(DATALINE - .group(2), 16); + final int endChar = + DATALINE.group(2) == null + ? 
startChar + : Integer.parseInt(DATALINE.group(2), 16); final Idna2008Type idnaType = Idna2008Type.valueOf(DATALINE.group(3)); oldTypes.putAll(startChar, endChar, idnaType); } diff --git a/unicodetools/src/main/java/org/unicode/idna/IdnaTypes.java b/unicodetools/src/main/java/org/unicode/idna/IdnaTypes.java index 846b053bb..6b06e298f 100644 --- a/unicodetools/src/main/java/org/unicode/idna/IdnaTypes.java +++ b/unicodetools/src/main/java/org/unicode/idna/IdnaTypes.java @@ -1,8 +1,7 @@ package org.unicode.idna; -import java.util.regex.Pattern; - import com.ibm.icu.text.UnicodeSet; +import java.util.regex.Pattern; public class IdnaTypes { diff --git a/unicodetools/src/main/java/org/unicode/idna/LoadIdnaTest.java b/unicodetools/src/main/java/org/unicode/idna/LoadIdnaTest.java index 1e8392bf3..2a25e0bcb 100644 --- a/unicodetools/src/main/java/org/unicode/idna/LoadIdnaTest.java +++ b/unicodetools/src/main/java/org/unicode/idna/LoadIdnaTest.java @@ -1,5 +1,8 @@ package org.unicode.idna; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableSet; +import com.ibm.icu.text.Transliterator; import java.util.Collections; import java.util.EnumSet; import java.util.LinkedHashSet; @@ -7,25 +10,30 @@ import java.util.Objects; import java.util.Set; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.idna.Uts46.Errors; import org.unicode.text.utility.Settings; -import com.google.common.base.Splitter; -import com.google.common.collect.ImmutableSet; -import com.ibm.icu.text.Transliterator; - public class LoadIdnaTest { public static final Splitter semi = Splitter.on(';').trimResults(); - public static final Splitter SPACE_SPLITTER = Splitter.on(Pattern.compile(",? ")).trimResults().omitEmptyStrings(); - + public static final Splitter SPACE_SPLITTER = + Splitter.on(Pattern.compile(",? 
")).trimResults().omitEmptyStrings(); + public static final Transliterator fromHex = Transliterator.getInstance("hex-any/java"); - public enum Type {T, N, B}; - public enum Idna2008Status {V8, NV8, XV8}; + public enum Type { + T, + N, + B + }; - public final static class TestLine { + public enum Idna2008Status { + V8, + NV8, + XV8 + }; + + public static final class TestLine { public final String source; public final String toUnicode; public final Set toUnicodeErrors; @@ -35,8 +43,9 @@ public final static class TestLine { public final Set toAsciiTErrors; /** - * Create a test line from a string. Comments are removed and resulting empty lines are ignored. - * but it will do \\u and \\x expansion. + * Create a test line from a string. Comments are removed and resulting empty lines are + * ignored. but it will do \\u and \\x expansion. + * * @param test */ public static TestLine from(String line) { @@ -52,67 +61,68 @@ private TestLine(String test) { if (test.contains("\\u")) { int debug = 0; } - /* -OLD -# Column 1: type - T for transitional, N for nontransitional, B for both -# Column 2: source - The source string to be tested -# Column 3: toUnicode - The result of applying toUnicode to the source, using nontransitional. -# A blank value means the same as the source value; a value in [...] is a set of error codes. -# Column 4: toASCII - The result of applying toASCII to the source, using the specified type: T, N, or B. -# A blank value means the same as the toUnicode value; a value in [...] is a set of error codes. -# Column 5: idna2008 - NV8 is only present if the status is valid but the character is excluded by IDNA2008 -# from all domain names for all versions of Unicode. -# XV8 is present when the character is excluded by IDNA2008 for the current version of Unicode. -# These are informative values only. 
- -V2 -# Column 1: source - The source string to be tested -# Column 2: toUnicode - The result of applying toUnicode to the source, -# with Transitional_Processing=false. -# A blank value means the same as the source value. -# Column 3: toUnicodeStatus - A set of status codes, each corresponding to a particular test. -# A blank value means [] (no errors). -# Column 4: toAsciiN - The result of applying toASCII to the source, -# with Transitional_Processing=false. -# A blank value means the same as the toUnicode value. -# Column 5: toAsciiNStatus - A set of status codes, each corresponding to a particular test. -# A blank value means the same as the toUnicodeStatus value. -# An explicit [] means no errors. -# Column 6: toAsciiT - The result of applying toASCII to the source, -# with Transitional_Processing=true. -# A blank value means the same as the toAsciiN value. -# Column 7: toAsciiTStatus - A set of status codes, each corresponding to a particular test. -# A blank value means the same as the toAsciiNStatus value. -# An explicit [] means no errors. - - */ + /* + OLD + # Column 1: type - T for transitional, N for nontransitional, B for both + # Column 2: source - The source string to be tested + # Column 3: toUnicode - The result of applying toUnicode to the source, using nontransitional. + # A blank value means the same as the source value; a value in [...] is a set of error codes. + # Column 4: toASCII - The result of applying toASCII to the source, using the specified type: T, N, or B. + # A blank value means the same as the toUnicode value; a value in [...] is a set of error codes. + # Column 5: idna2008 - NV8 is only present if the status is valid but the character is excluded by IDNA2008 + # from all domain names for all versions of Unicode. + # XV8 is present when the character is excluded by IDNA2008 for the current version of Unicode. + # These are informative values only. 
+ + V2 + # Column 1: source - The source string to be tested + # Column 2: toUnicode - The result of applying toUnicode to the source, + # with Transitional_Processing=false. + # A blank value means the same as the source value. + # Column 3: toUnicodeStatus - A set of status codes, each corresponding to a particular test. + # A blank value means [] (no errors). + # Column 4: toAsciiN - The result of applying toASCII to the source, + # with Transitional_Processing=false. + # A blank value means the same as the toUnicode value. + # Column 5: toAsciiNStatus - A set of status codes, each corresponding to a particular test. + # A blank value means the same as the toUnicodeStatus value. + # An explicit [] means no errors. + # Column 6: toAsciiT - The result of applying toASCII to the source, + # with Transitional_Processing=true. + # A blank value means the same as the toAsciiN value. + # Column 7: toAsciiTStatus - A set of status codes, each corresponding to a particular test. + # A blank value means the same as the toAsciiNStatus value. + # An explicit [] means no errors. 
+ + */ List parts = semi.splitToList(test); int col = 0; - + // TODO (maybe) enable for old format also - + // type = Type.valueOf(parts.get(0)); try { source = fromHex.transform(parts.get(col++)); - + toUnicode = getWithFallback(fromHex.transform(parts.get(col++)), source); toUnicodeErrors = parseEnumSet(parts.get(col++)); - + toAsciiN = getWithFallback(fromHex.transform(parts.get(col++)), toUnicode); toAsciiNErrors = parseEnumSet(parts.get(col++)); - + toAsciiT = getWithFallback(fromHex.transform(parts.get(col++)), toUnicode); toAsciiTErrors = parseEnumSet(parts.get(col++)); } catch (Exception e) { throw e; // pause for debugging } - } private Set parseEnumSet(String toUnicodeRaw) { if (toUnicodeRaw.startsWith("[") && toUnicodeRaw.endsWith("]")) { Set toUnicodeErrorsRaw = EnumSet.noneOf(Errors.class); - for (String item : SPACE_SPLITTER.split(toUnicodeRaw.substring(1, toUnicodeRaw.length()-1))) { + for (String item : + SPACE_SPLITTER.split( + toUnicodeRaw.substring(1, toUnicodeRaw.length() - 1))) { try { toUnicodeErrorsRaw.add(Errors.valueOf(item)); } catch (Exception e) { @@ -123,33 +133,44 @@ private Set parseEnumSet(String toUnicodeRaw) { } return Collections.emptySet(); } - + private String getWithFallback(String string, String fallback) { return string.isEmpty() ? 
fallback : string; } + @Override public String toString() { - return source - + ";\t" + toUnicode + ";\t" + toUnicodeErrors - + ";\t" + toAsciiN + ";\t" + toAsciiNErrors - + ";\t" + toAsciiT + ";\t" + toAsciiTErrors - ; + return source + + ";\t" + + toUnicode + + ";\t" + + toUnicodeErrors + + ";\t" + + toAsciiN + + ";\t" + + toAsciiNErrors + + ";\t" + + toAsciiT + + ";\t" + + toAsciiTErrors; } + @Override public boolean equals(Object obj) { if (obj.getClass() != TestLine.class) { return false; } - TestLine that = (TestLine)obj; + TestLine that = (TestLine) obj; return source == that.source; // rest should be consistent } + @Override public int hashCode() { return Objects.hash(source); } } - - static public Set load(String directory) { + + public static Set load(String directory) { Set result = new LinkedHashSet<>(); for (String line : FileUtilities.in(directory, "IdnaTestV2.txt")) { diff --git a/unicodetools/src/main/java/org/unicode/idna/Punycode.java b/unicodetools/src/main/java/org/unicode/idna/Punycode.java index 57f0c7c25..97e07e944 100644 --- a/unicodetools/src/main/java/org/unicode/idna/Punycode.java +++ b/unicodetools/src/main/java/org/unicode/idna/Punycode.java @@ -10,9 +10,9 @@ import com.ibm.icu.text.StringPrepParseException; import com.ibm.icu.text.UTF16; - /** * Ported code from ICU punycode.c + * * @author ram */ @@ -20,121 +20,113 @@ public final class Punycode { /* Punycode parameters for Bootstring */ - private static final int BASE = 36; - private static final int TMIN = 1; - private static final int TMAX = 26; - private static final int SKEW = 38; - private static final int DAMP = 700; - private static final int INITIAL_BIAS = 72; - private static final int INITIAL_N = 0x80; + private static final int BASE = 36; + private static final int TMIN = 1; + private static final int TMAX = 26; + private static final int SKEW = 38; + private static final int DAMP = 700; + private static final int INITIAL_BIAS = 72; + private static final int INITIAL_N = 
0x80; /* "Basic" Unicode/ASCII code points */ - private static final int HYPHEN = 0x2d; - private static final int DELIMITER = HYPHEN; - - private static final int ZERO = 0x30; - //private static final int NINE = 0x39; - - private static final int SMALL_A = 0x61; - private static final int SMALL_Z = 0x7a; - - private static final int CAPITAL_A = 0x41; - private static final int CAPITAL_Z = 0x5a; - private static final int MAX_CP_COUNT = 200; - //private static final int UINT_MAGIC = 0x80000000; - //private static final long ULONG_MAGIC = 0x8000000000000000L; - - private static int adaptBias(int delta, int length, boolean firstTime){ - if(firstTime){ - delta /=DAMP; - }else{ - delta /= 2; + private static final int HYPHEN = 0x2d; + private static final int DELIMITER = HYPHEN; + + private static final int ZERO = 0x30; + // private static final int NINE = 0x39; + + private static final int SMALL_A = 0x61; + private static final int SMALL_Z = 0x7a; + + private static final int CAPITAL_A = 0x41; + private static final int CAPITAL_Z = 0x5a; + private static final int MAX_CP_COUNT = 200; + // private static final int UINT_MAGIC = 0x80000000; + // private static final long ULONG_MAGIC = 0x8000000000000000L; + + private static int adaptBias(int delta, int length, boolean firstTime) { + if (firstTime) { + delta /= DAMP; + } else { + delta /= 2; } - delta += delta/length; + delta += delta / length; - int count=0; - for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { - delta/=(BASE-TMIN); + int count = 0; + for (; delta > ((BASE - TMIN) * TMAX) / 2; count += BASE) { + delta /= (BASE - TMIN); } - return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); + return count + (((BASE - TMIN + 1) * delta) / (delta + SKEW)); } /** - * basicToDigit[] contains the numeric value of a basic code - * point (for use in representing integers) in the range 0 to - * BASE-1, or -1 if b is does not represent a value. 
+ * basicToDigit[] contains the numeric value of a basic code point (for use in representing + * integers) in the range 0 to BASE-1, or -1 if b is does not represent a value. */ - static final int[] basicToDigit= new int[]{ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, - - -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, - - -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - - ///CLOVER:OFF + static final int[] basicToDigit = + new int[] { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + + /// CLOVER:OFF private static char asciiCaseMap(char b, boolean uppercase) { - if(uppercase) { - if(SMALL_A<=b && b<=SMALL_Z) { - b-=(SMALL_A-CAPITAL_A); + if (uppercase) { + if (SMALL_A <= b && b <= SMALL_Z) { + b -= (SMALL_A - CAPITAL_A); } } else { - if(CAPITAL_A<=b && b<=CAPITAL_Z) { - b+=(SMALL_A-CAPITAL_A); + if (CAPITAL_A <= b && b <= CAPITAL_Z) { + b += (SMALL_A - CAPITAL_A); } } return b; } - ///CLOVER:ON + /// CLOVER:ON /** - * digitToBasic() returns the basic code point whose value - * (when used for representing integers) is d, which must be in the - * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is - * nonzero, in which case the uppercase form is used. + * digitToBasic() returns the basic code point whose value (when used for representing integers) + * is d, which must be in the range 0 to BASE-1. The lowercase form is used unless the uppercase + * flag is nonzero, in which case the uppercase form is used. */ private static char digitToBasic(int digit, boolean uppercase) { /* 0..25 map to ASCII a..z or A..Z */ /* 26..35 map to ASCII 0..9 */ - if(digit<26) { - if(uppercase) { - return (char)(CAPITAL_A+digit); + if (digit < 26) { + if (uppercase) { + return (char) (CAPITAL_A + digit); } else { - return (char)(SMALL_A+digit); + return (char) (SMALL_A + digit); } } else { - return (char)((ZERO-26)+digit); + return (char) ((ZERO - 26) + digit); } } /** - * Converts Unicode to Punycode. - * The input string must not contain single, unpaired surrogates. + * Converts Unicode to Punycode. The input string must not contain single, unpaired surrogates. 
* The output will be represented as an array of ASCII code points. - * + * * @param src * @param caseFlags * @return * @throws ParseException */ - public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws StringPrepParseException{ + public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) + throws StringPrepParseException { final int[] cpBuffer = new int[MAX_CP_COUNT]; int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount; @@ -147,44 +139,44 @@ public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws * Handle the basic code points and * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): */ - srcCPCount=destLength=0; + srcCPCount = destLength = 0; - for(j=0; j0) { - if(destLength 0) { + if (destLength < destCapacity) { + dest[destLength] = DELIMITER; } ++destLength; } @@ -196,20 +188,20 @@ public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws */ /* Initialize the state: */ - n=INITIAL_N; - delta=0; - bias=INITIAL_BIAS; + n = INITIAL_N; + delta = 0; + bias = INITIAL_BIAS; /* Main encoding loop: */ - for(handledCPCount=basicLength; handledCPCount state to , but guard against overflow: */ - if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) { + if (m - n > (0x7fffffff - MAX_CP_COUNT - delta) / (handledCPCount + 1)) { throw new IllegalStateException("Internal program error"); } - delta+=(m-n)*(handledCPCount+1); - n=m; + delta += (m - n) * (handledCPCount + 1); + n = m; /* Encode a sequence of same code points n */ - for(j=0; jTMAX) { - t=TMAX; - } + /** + * RAM: comment out the old code for conformance with + * draft-ietf-idn-punycode-03.txt + * + *

t=k-bias; if(tTMAX) { t=TMAX; } */ - - t=k-bias; - if(t=(bias+TMAX)) { - t=TMAX; + t = k - bias; + if (t < TMIN) { + t = TMIN; + } else if (k >= (bias + TMAX)) { + t = TMAX; } - if(q= CAPITAL_Z); + /// CLOVER:OFF + private static boolean isBasicUpperCase(int ch) { + return (CAPITAL_A <= ch && ch >= CAPITAL_Z); } - ///CLOVER:ON - private static boolean isSurrogate(int ch){ - return (((ch)&0xfffff800)==0xd800); + /// CLOVER:ON + private static boolean isSurrogate(int ch) { + return (((ch) & 0xfffff800) == 0xd800); } /** - * Converts Punycode to Unicode. - * The Unicode string will be at most as long as the Punycode string. - * + * Converts Punycode to Unicode. The Unicode string will be at most as long as the Punycode + * string. + * * @param src * @param caseFlags * @return * @throws ParseException */ public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) - throws StringPrepParseException{ + throws StringPrepParseException { final int srcLength = src.length(); final StringBuffer result = new StringBuffer(); - int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t, - destCPCount, firstSupplementaryIndex, cpLength; + int n, + destLength, + i, + bias, + basicLength, + j, + in, + oldi, + w, + k, + digit, + t, + destCPCount, + firstSupplementaryIndex, + cpLength; char b; final int destCapacity = MAX_CP_COUNT; final char[] dest = new char[destCapacity]; @@ -313,40 +314,41 @@ public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) * * The two following loops iterate backward. 
*/ - for(j=srcLength; j>0;) { - if(src.charAt(--j)==DELIMITER) { + for (j = srcLength; j > 0; ) { + if (src.charAt(--j) == DELIMITER) { break; } } - destLength=basicLength=destCPCount=j; + destLength = basicLength = destCPCount = j; - while(j>0) { - b=src.charAt(--j); - if(!isBasic(b)) { - throw new StringPrepParseException("Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND); + while (j > 0) { + b = src.charAt(--j); + if (!isBasic(b)) { + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND); } - if(j0 ? basicLength+1 : 0; in 0 ? basicLength + 1 : 0; in < srcLength; /* no op */ ) { /* * in is the index of the next character to be consumed, and * destCPCount is the number of code points in the output array. @@ -356,36 +358,40 @@ public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) * if we increase i as we go, then subtract off its starting * value at the end to obtain delta. */ - for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) { - if(in>=srcLength) { - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + for (oldi = i, w = 1, k = BASE; /* no condition */ ; k += BASE) { + if (in >= srcLength) { + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } - digit=basicToDigit[src.charAt(in++) & 0xFF]; - if(digit<0) { - throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND); + digit = basicToDigit[src.charAt(in++) & 0xFF]; + if (digit < 0) { + throw new StringPrepParseException( + "Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND); } - if(digit>(0x7fffffff-i)/w) { + if (digit > (0x7fffffff - i) / w) { /* integer overflow */ - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + throw new StringPrepParseException( + "Illegal char found", 
StringPrepParseException.ILLEGAL_CHAR_FOUND); } - i+=digit*w; - t=k-bias; - if(t=(bias+TMAX)) { - t=TMAX; + i += digit * w; + t = k - bias; + if (t < TMIN) { + t = TMIN; + } else if (k >= (bias + TMAX)) { + t = TMAX; } - if(digit0x7fffffff/(BASE-t)) { + if (w > 0x7fffffff / (BASE - t)) { /* integer overflow */ - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } - w*=BASE-t; + w *= BASE - t; } /* @@ -394,30 +400,32 @@ public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) * where needed instead of in for() loop tail. */ ++destCPCount; - bias=adaptBias(i-oldi, destCPCount, (oldi==0)); + bias = adaptBias(i - oldi, destCPCount, (oldi == 0)); /* * i was supposed to wrap around from (incremented) destCPCount to 0, * incrementing n each time, so we'll fix that now: */ - if(i/destCPCount>(0x7fffffff-n)) { + if (i / destCPCount > (0x7fffffff - n)) { /* integer overflow */ - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } - n+=i/destCPCount; - i%=destCPCount; + n += i / destCPCount; + i %= destCPCount; /* not needed for Punycode: */ /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ - if(n>0x10ffff || isSurrogate(n)) { + if (n > 0x10ffff || isSurrogate(n)) { /* Unicode code point overflow */ - throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); + throw new StringPrepParseException( + "Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND); } /* Insert n at position i of the output: */ - cpLength=UTF16.getCharCount(n); - if((destLength+cpLength)1) { - firstSupplementaryIndex=codeUnitIndex; + if (i <= firstSupplementaryIndex) { + codeUnitIndex = i; + if 
(cpLength > 1) { + firstSupplementaryIndex = codeUnitIndex; } else { ++firstSupplementaryIndex; } } else { - codeUnitIndex=firstSupplementaryIndex; - codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex); + codeUnitIndex = firstSupplementaryIndex; + codeUnitIndex = + UTF16.moveCodePointOffset( + dest, 0, destLength, codeUnitIndex, i - codeUnitIndex); } /* use the UChar index codeUnitIndex instead of the code point index i */ - if(codeUnitIndex matchList, String[] destArray) { + public static String[] split( + Matcher m, String input, ArrayList matchList, String[] destArray) { int lastPos = 0; while (true) { final boolean found = m.find(); @@ -49,5 +50,4 @@ public static String[] split(Matcher m, String input, } return matchList.toArray(destArray); } - } diff --git a/unicodetools/src/main/java/org/unicode/idna/StringPrepData.java b/unicodetools/src/main/java/org/unicode/idna/StringPrepData.java index fe221b9e2..ded575a3c 100644 --- a/unicodetools/src/main/java/org/unicode/idna/StringPrepData.java +++ b/unicodetools/src/main/java/org/unicode/idna/StringPrepData.java @@ -1,76 +1,62 @@ -/** - * - */ +/** */ package org.unicode.idna; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.BufferedReader; import java.io.IOException; import java.util.EnumSet; import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; - -import org.unicode.props.UnicodeProperty; import org.unicode.idna.Idna.IdnaType; import org.unicode.jsp.FileUtilities; - -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; +import org.unicode.props.UnicodeProperty; public class StringPrepData { private static final boolean DEBUG = 
getDebugFlag(StringPrepData.class); /** -3. Mapping - This profile specifies mapping using the following tables from - [STRINGPREP]: - Table B.1 - Table B.2 -4. Normalization - This profile specifies using Unicode normalization form KC, as - described in [STRINGPREP]. -5. Prohibited Output - Table C.1.2 - Table C.2.2 - Table C.3 - Table C.4 - Table C.5 - Table C.6 - Table C.7 - Table C.8 - Table C.9 + * 3. Mapping This profile specifies mapping using the following tables from [STRINGPREP]: Table + * B.1 Table B.2 4. Normalization This profile specifies using Unicode normalization form KC, as + * described in [STRINGPREP]. 5. Prohibited Output Table C.1.2 Table C.2.2 Table C.3 Table C.4 + * Table C.5 Table C.6 Table C.7 Table C.8 Table C.9 + * * @param STD3 TODO */ - - public static void getIdna2003Tables(UnicodeMap mappings, UnicodeMap types, boolean STD3) { + public static void getIdna2003Tables( + UnicodeMap mappings, UnicodeMap types, boolean STD3) { final UnicodeSet prohibited = new UnicodeSet(); - getNamePrepData32(EnumSet.of( - Idna2003Table.B_1, - Idna2003Table.B_2, - Idna2003Table.C_1_2 - , Idna2003Table.C_2_2 - , Idna2003Table.C_3 - , Idna2003Table.C_4 - , Idna2003Table.C_5 - , Idna2003Table.C_6 - , Idna2003Table.C_7 - , Idna2003Table.C_8 - , Idna2003Table.C_9 - ), mappings, prohibited, STD3); + getNamePrepData32( + EnumSet.of( + Idna2003Table.B_1, + Idna2003Table.B_2, + Idna2003Table.C_1_2, + Idna2003Table.C_2_2, + Idna2003Table.C_3, + Idna2003Table.C_4, + Idna2003Table.C_5, + Idna2003Table.C_6, + Idna2003Table.C_7, + Idna2003Table.C_8, + Idna2003Table.C_9), + mappings, + prohibited, + STD3); - types.putAll(0,0x10FFFF, IdnaType.disallowed); + types.putAll(0, 0x10FFFF, IdnaType.disallowed); types.putAll(IdnaTypes.U32, IdnaType.valid); types.putAll(prohibited, IdnaType.disallowed); final UnicodeSet ignored = mappings.getSet(""); final UnicodeSet hasMapping = mappings.keySet(); types.putAll(ignored, IdnaType.ignored); types.putAll(new 
UnicodeSet(hasMapping).removeAll(ignored), IdnaType.mapped); - //mappings.putAll(ignored, null); + // mappings.putAll(ignored, null); // special handling for separators - mappings.putAll(IdnaTypes.OTHER_DOT_SET,"."); + mappings.putAll(IdnaTypes.OTHER_DOT_SET, "."); // special old exceptions mappings.put(0x2F868, UTF16.valueOf(0x2136A)); @@ -79,8 +65,8 @@ public static void getIdna2003Tables(UnicodeMap mappings, UnicodeMap class1, String flagName) { String className = class1.getName().toLowerCase(Locale.ROOT); final int lastPart = className.lastIndexOf('.'); if (lastPart >= 0) { - className = className.substring(lastPart+1); + className = className.substring(lastPart + 1); } - return System.getProperty(className+"_" + flagName) != null; + return System.getProperty(className + "_" + flagName) != null; } + // static Normalizer normalizer32 = new Normalizer(UCD_Types.NFKC, "3.2.0"); - - //static Normalizer normalizer32 = new Normalizer(UCD_Types.NFKC, "3.2.0"); - - // private static String normalizeAndCheckString(String inputString, UnicodeMap> rawIdna2003Data) { + // private static String normalizeAndCheckString(String inputString, + // UnicodeMap> rawIdna2003Data) { // String string = Normalizer.normalize(inputString, Normalizer.NFKC); // int cp; // for (int i = 0; i < string.length(); i += Character.charCount(cp)) { @@ -111,89 +96,80 @@ private static boolean getDebugFlag(Class class1, String flagName) { // R3 data = rawIdna2003Data.get(cp); // StringPrepData.Idna2003Table type = data == null ? 
Idna2003Table.none : data.get0(); // switch (type) { - // case A_1: case C_1_2: case C_2_1: case C_2_2: case C_3: case C_4: case C_5: case C_6: case C_7: case C_8: case C_9: + // case A_1: case C_1_2: case C_2_1: case C_2_2: case C_3: case C_4: case C_5: case C_6: + // case C_7: case C_8: case C_9: // return null; // } // } // return string; // } - enum Idna2003Table {none, A_1, B_1, B_2, B_3, C_1_1, C_1_2, C_2_1, C_2_2, C_3, C_4, C_5, C_6, C_7, C_8, C_9, D_1, D_2} + enum Idna2003Table { + none, + A_1, + B_1, + B_2, + B_3, + C_1_1, + C_1_2, + C_2_1, + C_2_2, + C_3, + C_4, + C_5, + C_6, + C_7, + C_8, + C_9, + D_1, + D_2 + } /** - A.1 Unassigned code points in Unicode 3.2 - ----- Start Table A.1 ----- - 0221 - B.1 Commonly mapped to nothing - ----- Start Table B.1 ----- - 00AD; ; Map to nothing - B.2 Mapping for case-folding used with NFKC - ----- Start Table B.2 ----- - 0041; 0061; Case map - B.3 Mapping for case-folding used with no normalization - ----- Start Table B.3 ----- - 0041; 0061; Case map - C.1.1 ASCII space characters - ----- Start Table C.1.1 ----- - 0020; SPACE - C.1.2 Non-ASCII space characters - ----- Start Table C.1.2 ----- - 00A0; NO-BREAK SPACE - C.2.1 ASCII control characters - ----- Start Table C.2.1 ----- - 0000-001F; [CONTROL CHARACTERS] - C.2.2 Non-ASCII control characters - ----- Start Table C.2.2 ----- - 0080-009F; [CONTROL CHARACTERS] - C.2.2 Non-ASCII control characters - ----- Start Table C.2.2 ----- - 0080-009F; [CONTROL CHARACTERS] - C.3 Private use - ----- Start Table C.3 ----- - E000-F8FF; [PRIVATE USE, PLANE 0] - C.4 Non-character code points - ----- Start Table C.4 ----- - FDD0-FDEF; [NONCHARACTER CODE POINTS] - C.5 Surrogate codes - ----- Start Table C.5 ----- - D800-DFFF; [SURROGATE CODES] - C.6 Inappropriate for plain text - ----- Start Table C.6 ----- - FFF9; INTERLINEAR ANNOTATION ANCHOR - C.7 Inappropriate for canonical representation - ----- Start Table C.7 ----- - 2FF0-2FFB; [IDEOGRAPHIC DESCRIPTION CHARACTERS] - C.8 Change 
display properties or are deprecated - ----- Start Table C.8 ----- - 0340; COMBINING GRAVE TONE MARK - C.9 Tagging characters - ----- Start Table C.9 ----- - E0001; LANGUAGE TAG - D.1 Characters with bidirectional property "R" or "AL" - ----- Start Table D.1 ----- - 05BE - D.2 Characters with bidirectional property "L" - ----- Start Table D.2 ----- - 0041-005A + * A.1 Unassigned code points in Unicode 3.2 ----- Start Table A.1 ----- 0221 B.1 Commonly + * mapped to nothing ----- Start Table B.1 ----- 00AD; ; Map to nothing B.2 Mapping for + * case-folding used with NFKC ----- Start Table B.2 ----- 0041; 0061; Case map B.3 Mapping for + * case-folding used with no normalization ----- Start Table B.3 ----- 0041; 0061; Case map + * C.1.1 ASCII space characters ----- Start Table C.1.1 ----- 0020; SPACE C.1.2 Non-ASCII space + * characters ----- Start Table C.1.2 ----- 00A0; NO-BREAK SPACE C.2.1 ASCII control characters + * ----- Start Table C.2.1 ----- 0000-001F; [CONTROL CHARACTERS] C.2.2 Non-ASCII control + * characters ----- Start Table C.2.2 ----- 0080-009F; [CONTROL CHARACTERS] C.2.2 Non-ASCII + * control characters ----- Start Table C.2.2 ----- 0080-009F; [CONTROL CHARACTERS] C.3 Private + * use ----- Start Table C.3 ----- E000-F8FF; [PRIVATE USE, PLANE 0] C.4 Non-character code + * points ----- Start Table C.4 ----- FDD0-FDEF; [NONCHARACTER CODE POINTS] C.5 Surrogate codes + * ----- Start Table C.5 ----- D800-DFFF; [SURROGATE CODES] C.6 Inappropriate for plain text + * ----- Start Table C.6 ----- FFF9; INTERLINEAR ANNOTATION ANCHOR C.7 Inappropriate for + * canonical representation ----- Start Table C.7 ----- 2FF0-2FFB; [IDEOGRAPHIC DESCRIPTION + * CHARACTERS] C.8 Change display properties or are deprecated ----- Start Table C.8 ----- 0340; + * COMBINING GRAVE TONE MARK C.9 Tagging characters ----- Start Table C.9 ----- E0001; LANGUAGE + * TAG D.1 Characters with bidirectional property "R" or "AL" ----- Start Table D.1 ----- 05BE + * D.2 Characters with 
bidirectional property "L" ----- Start Table D.2 ----- 0041-005A */ + static Pattern TABLE_DELIMITER = + Pattern.compile("\\Q-----\\E\\s*(Start|End)\\s*Table\\s*(\\S+)\\s*\\Q-----\\E"); - static Pattern TABLE_DELIMITER = Pattern.compile("\\Q-----\\E\\s*(Start|End)\\s*Table\\s*(\\S+)\\s*\\Q-----\\E"); - static Pattern MAP_LINE = Pattern.compile("([A-Z0-9]{4,6})" + - "(?:-([A-Z0-9]{4,6}))?" + - "(?:\\s*;\\s*((?:[A-Z0-9]{4,6}\\s*)*))?" + - "(?:\\s*;\\s*.*)?"); - static Pattern SET_LINE = Pattern.compile("([A-Z0-9]{4,6})" + - "(?:-([A-Z0-9]{4,6}))?" + - "(?:\\s*;\\s*.*)?"); + static Pattern MAP_LINE = + Pattern.compile( + "([A-Z0-9]{4,6})" + + "(?:-([A-Z0-9]{4,6}))?" + + "(?:\\s*;\\s*((?:[A-Z0-9]{4,6}\\s*)*))?" + + "(?:\\s*;\\s*.*)?"); + static Pattern SET_LINE = + Pattern.compile("([A-Z0-9]{4,6})" + "(?:-([A-Z0-9]{4,6}))?" + "(?:\\s*;\\s*.*)?"); - private static void getNamePrepData32(EnumSet allowed, UnicodeMap mappings, UnicodeSet prohibited, boolean STD3) { + private static void getNamePrepData32( + EnumSet allowed, + UnicodeMap mappings, + UnicodeSet prohibited, + boolean STD3) { try { final Matcher tableDelimiter = TABLE_DELIMITER.matcher(""); final Matcher mapLine = MAP_LINE.matcher(""); final Matcher setLine = SET_LINE.matcher(""); final BufferedReader in = FileUtilities.openFile(StringPrepData.class, "nameprep.txt"); - //BufferedReader in = FileUtilities.openUTF8Reader(UCD_Types.BASE_DIR + "idna/", "nameprep.txt"); + // BufferedReader in = FileUtilities.openUTF8Reader(UCD_Types.BASE_DIR + "idna/", + // "nameprep.txt"); StringPrepData.Idna2003Table table = null; boolean inTable = false; boolean isMapping = false; @@ -211,16 +187,19 @@ private static void getNamePrepData32(EnumSet allo throw new IllegalArgumentException("Bad syntax: " + line); } inTable = tableDelimiter.group(1).equals("Start"); - final StringPrepData.Idna2003Table newTable = Idna2003Table.valueOf(tableDelimiter.group(2).replace(".","_")); + final StringPrepData.Idna2003Table newTable = + 
Idna2003Table.valueOf(tableDelimiter.group(2).replace(".", "_")); if (inTable) { if (table != null) { - throw new IllegalArgumentException("Table not terminated: " + table + "; " + line); + throw new IllegalArgumentException( + "Table not terminated: " + table + "; " + line); } table = newTable; isMapping = newTable.toString().startsWith("B"); } else { if (newTable != table) { - throw new IllegalArgumentException("Bad table end: " + newTable + " != " + table + "; " + line); + throw new IllegalArgumentException( + "Bad table end: " + newTable + " != " + table + "; " + line); } table = null; isMapping = false; @@ -240,16 +219,21 @@ private static void getNamePrepData32(EnumSet allo if (!lineMatcher.reset(line).matches()) { throw new IllegalArgumentException("Illegal range-value syntax: " + line); } - final int startCode = Utility.fromHex(lineMatcher.group(1),4," ").codePointAt(0); - final String endCodeString = lineMatcher.groupCount() < 2 ? null : lineMatcher.group(2); + final int startCode = Utility.fromHex(lineMatcher.group(1), 4, " ").codePointAt(0); + final String endCodeString = + lineMatcher.groupCount() < 2 ? null : lineMatcher.group(2); final String group3 = lineMatcher.groupCount() < 3 ? null : lineMatcher.group(3); final String group4 = lineMatcher.groupCount() < 4 ? null : lineMatcher.group(4); - final int endCode = endCodeString == null ? startCode : Utility.fromHex(endCodeString,4," ").codePointAt(0); + final int endCode = + endCodeString == null + ? startCode + : Utility.fromHex(endCodeString, 4, " ").codePointAt(0); String comment, mapValueString; if (isMapping) { comment = group4; try { - mapValueString = group3.length() == 0 ? "" : Utility.fromHex(group3,4," "); + mapValueString = + group3.length() == 0 ? 
"" : Utility.fromHex(group3, 4, " "); } catch (final RuntimeException e) { throw e; } @@ -263,7 +247,13 @@ private static void getNamePrepData32(EnumSet allo if (mapValueString != null) { final String oldValue = mappings.get(i); if (oldValue != null && !UnicodeProperty.equals(mapValueString, oldValue)) { - throw new IllegalArgumentException("Duplicates: " + Utility.hex(i) + "\told: " + oldValue + "\t skipping new: " + mapValueString); + throw new IllegalArgumentException( + "Duplicates: " + + Utility.hex(i) + + "\told: " + + oldValue + + "\t skipping new: " + + mapValueString); } mappings.put(i, mapValueString); } else { @@ -281,15 +271,14 @@ private static void getNamePrepData32(EnumSet allo prohibited.add("."); } - for (int i = 'A'; i <= 'Z'; ++i) { - mappings.put(i, UTF16.valueOf(i-'A'+'a')); + mappings.put(i, UTF16.valueOf(i - 'A' + 'a')); } // fix up mappings // add normalization maps for all unmapped characters final UnicodeSet addedMappings = new UnicodeSet(); - for (final UnicodeSetIterator it = new UnicodeSetIterator(IdnaTypes.U32); it.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(IdnaTypes.U32); it.next(); ) { final int i = it.codepoint; final String mapValue = mappings.get(i); if (mapValue == null) { @@ -297,18 +286,36 @@ private static void getNamePrepData32(EnumSet allo continue; } addedMappings.add(i); - mappings.put(i, Idna.NFKC_3_2.transform(i)); // Normalizer.normalize(i, Normalizer.NFKC, Normalizer.UNICODE_3_2)); - } else if (!Idna.NFKC_3_2.isTransformed(mapValue)) { // (!Normalizer.isNormalized(mapValue, Normalizer.NFKC, Normalizer.UNICODE_3_2)) { - final String newValue = Idna.NFKC_3_2.transform(mapValue); // Normalizer.normalize(mapValue, Normalizer.NFKC, Normalizer.UNICODE_3_2); + mappings.put( + i, + Idna.NFKC_3_2.transform(i)); // Normalizer.normalize(i, Normalizer.NFKC, + // Normalizer.UNICODE_3_2)); + } else if (!Idna.NFKC_3_2.isTransformed( + mapValue)) { // (!Normalizer.isNormalized(mapValue, Normalizer.NFKC, + 
// Normalizer.UNICODE_3_2)) { + final String newValue = + Idna.NFKC_3_2.transform( + mapValue); // Normalizer.normalize(mapValue, Normalizer.NFKC, + // Normalizer.UNICODE_3_2); if (DEBUG) { - System.out.println("Change for NFKC mapping of " + Utility.hex(i) + ", \t" + Utility.hex(mapValue) + " \t => \t" + Utility.hex(newValue)); + System.out.println( + "Change for NFKC mapping of " + + Utility.hex(i) + + ", \t" + + Utility.hex(mapValue) + + " \t => \t" + + Utility.hex(newValue)); } addedMappings.add(i); mappings.put(i, newValue); } } if (DEBUG) { - System.out.println("Adding NFKC mapping for " + addedMappings.toPattern(false) + ",\t" + addedMappings); + System.out.println( + "Adding NFKC mapping for " + + addedMappings.toPattern(false) + + ",\t" + + addedMappings); } // remove identical mapping @@ -320,20 +327,32 @@ private static void getNamePrepData32(EnumSet allo } } if (DEBUG) { - System.out.println("Removing Identical mapping for " + identicals.toPattern(false) + ",\t" + identicals); + System.out.println( + "Removing Identical mapping for " + + identicals.toPattern(false) + + ",\t" + + identicals); } mappings.putAll(identicals, null); // fix the prohibition according to the resulting characters for (final String source : mappings) { - final int cpSource = source.codePointAt(0); + final int cpSource = source.codePointAt(0); boolean shouldBeProhibited = false; final String mapping = mappings.get(source); for (int i = 0; i < mapping.length(); i += Character.charCount(cpSource)) { final int cpInMapping = mapping.codePointAt(i); final String otherMap = mappings.get(cpInMapping); if (otherMap != null) { - throw new IllegalArgumentException("Recursive mapping\t" + Utility.hex(source) + ",\t" + Utility.hex(mapping) + ",\t" + Utility.hex(cpInMapping) + ",\t" + Utility.hex(otherMap)); + throw new IllegalArgumentException( + "Recursive mapping\t" + + Utility.hex(source) + + ",\t" + + Utility.hex(mapping) + + ",\t" + + Utility.hex(cpInMapping) + + ",\t" + + 
Utility.hex(otherMap)); } if (prohibited.contains(cpInMapping)) { shouldBeProhibited = true; @@ -343,12 +362,24 @@ private static void getNamePrepData32(EnumSet allo if (wasProhibited != shouldBeProhibited) { if (shouldBeProhibited) { if (DEBUG) { - System.out.println("Changing to prohibited for " + source + "\t" + Utility.hex(cpSource) + ",\t" + shouldBeProhibited); + System.out.println( + "Changing to prohibited for " + + source + + "\t" + + Utility.hex(cpSource) + + ",\t" + + shouldBeProhibited); } prohibited.add(cpSource); } else { if (DEBUG) { - System.out.println("Removing from prohibited for " + source + "\t" + Utility.hex(cpSource) + ",\t" + shouldBeProhibited); + System.out.println( + "Removing from prohibited for " + + source + + "\t" + + Utility.hex(cpSource) + + ",\t" + + shouldBeProhibited); } prohibited.remove(cpSource); } diff --git a/unicodetools/src/main/java/org/unicode/idna/Uts46.java b/unicodetools/src/main/java/org/unicode/idna/Uts46.java index a003bce9d..ecd6896a7 100644 --- a/unicodetools/src/main/java/org/unicode/idna/Uts46.java +++ b/unicodetools/src/main/java/org/unicode/idna/Uts46.java @@ -1,19 +1,16 @@ package org.unicode.idna; -import java.util.List; -import java.util.Set; -import java.util.regex.Pattern; - -import org.unicode.text.utility.Settings; - import com.google.common.base.Splitter; import com.google.common.collect.ImmutableSortedSet; import com.ibm.icu.impl.Utility; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSet.SpanCondition; - +import java.util.List; +import java.util.Set; +import java.util.regex.Pattern; import org.unicode.jsp.FileUtilities; +import org.unicode.text.utility.Settings; public class Uts46 extends Idna { @@ -28,7 +25,8 @@ private Uts46() { mappings.freeze(); mappings_display.freeze(); validSet = new UnicodeSet(types.getSet(IdnaType.valid)).freeze(); - validSet_transitional = new UnicodeSet(validSet).addAll(types.getSet(IdnaType.deviation)).freeze(); + 
validSet_transitional = + new UnicodeSet(validSet).addAll(types.getSet(IdnaType.deviation)).freeze(); checkPunycodeValidity = true; } // private @@ -46,24 +44,24 @@ public boolean handleLine(int start, int end, String[] items) { String value; switch (type) { - case mapped: - value = Utility.fromHex(items[2], 4, " "); - break; - case deviation: - if (items.length > 2) { + case mapped: value = Utility.fromHex(items[2], 4, " "); - } else { + break; + case deviation: + if (items.length > 2) { + value = Utility.fromHex(items[2], 4, " "); + } else { + value = ""; + } + break; + case ignored: value = ""; - } - break; - case ignored: - value = ""; - break; - case disallowed: - case valid: - default: - value = null; - break; + break; + case disallowed: + case valid: + default: + value = null; + break; } if (mappings != null) { mappings.putAll(start, end, value); @@ -163,25 +161,26 @@ public boolean handleLine(int start, int end, String[] items) { * NSM. */ - static final UnicodeSet R_AL_AN = new UnicodeSet("[[:bc=R:][:bc=AL:][:bc=AN:]]").freeze(); - static final UnicodeSet R_AL = new UnicodeSet("[[:bc=R:][:bc=AL:]]").freeze(); - static final UnicodeSet L = new UnicodeSet("[[:bc=L:]]").freeze(); - static final UnicodeSet ES_CS_ET_ON_BN_NSM = new UnicodeSet("[[:bc=ES:][:bc=CS:][:bc=ET:][:bc=ON:][:bc=BN:][:bc=NSM:]]").freeze(); - static final UnicodeSet R_AL_AN_EN = new UnicodeSet("[[:bc=R:][:bc=AL:][:bc=AN:][:bc=EN:]]").freeze(); - static final UnicodeSet R_AL_AN_EN_ES_CS_ET_ON_BN_NSM = new UnicodeSet(R_AL_AN_EN).addAll(ES_CS_ET_ON_BN_NSM).freeze(); - static final UnicodeSet L_EN = new UnicodeSet("[[:bc=L:][:bc=EN:]]").freeze(); - static final UnicodeSet L_EN_ES_CS_ET_ON_BN_NSM = new UnicodeSet(L_EN).addAll(ES_CS_ET_ON_BN_NSM).freeze(); - static final UnicodeSet EN = new UnicodeSet("[[:bc=EN:]]").freeze(); - static final UnicodeSet AN = new UnicodeSet("[[:bc=AN:]]").freeze(); - static final UnicodeSet NSM = new UnicodeSet("[[:bc=NSM:]]").freeze(); + static final UnicodeSet 
R_AL_AN = new UnicodeSet("[[:bc=R:][:bc=AL:][:bc=AN:]]").freeze(); + static final UnicodeSet R_AL = new UnicodeSet("[[:bc=R:][:bc=AL:]]").freeze(); + static final UnicodeSet L = new UnicodeSet("[[:bc=L:]]").freeze(); + static final UnicodeSet ES_CS_ET_ON_BN_NSM = + new UnicodeSet("[[:bc=ES:][:bc=CS:][:bc=ET:][:bc=ON:][:bc=BN:][:bc=NSM:]]").freeze(); + static final UnicodeSet R_AL_AN_EN = + new UnicodeSet("[[:bc=R:][:bc=AL:][:bc=AN:][:bc=EN:]]").freeze(); + static final UnicodeSet R_AL_AN_EN_ES_CS_ET_ON_BN_NSM = + new UnicodeSet(R_AL_AN_EN).addAll(ES_CS_ET_ON_BN_NSM).freeze(); + static final UnicodeSet L_EN = new UnicodeSet("[[:bc=L:][:bc=EN:]]").freeze(); + static final UnicodeSet L_EN_ES_CS_ET_ON_BN_NSM = + new UnicodeSet(L_EN).addAll(ES_CS_ET_ON_BN_NSM).freeze(); + static final UnicodeSet EN = new UnicodeSet("[[:bc=EN:]]").freeze(); + static final UnicodeSet AN = new UnicodeSet("[[:bc=AN:]]").freeze(); + static final UnicodeSet NSM = new UnicodeSet("[[:bc=NSM:]]").freeze(); /** * Checks a string for IDNA2008 bidi errors. label must not be empty * - * @param domainName - * the string to be tested - * @param errors - * if an error is found, then an error string is added to this - * set. + * @param domainName the string to be tested + * @param errors if an error is found, then an error string is added to this set. * @return true if errors are found, otherwise false. 
*/ public static boolean hasBidiError(String label, Set errors) { @@ -248,14 +247,14 @@ public static boolean hasBidiError(String label, Set errors) { } static final UnicodeSet JOINER_SET = new UnicodeSet("[\u200C\u200D]"); - static final UnicodeSet VIRAMAS = new UnicodeSet("[:ccc=virama:]"); - static final UnicodeSet T = new UnicodeSet("[:jt=T:]"); - static final UnicodeSet L_D = new UnicodeSet("[[:jt=L:][:jt=D:]]"); - static final UnicodeSet R_D = new UnicodeSet("[[:jt=R:][:jt=D:]]"); + static final UnicodeSet VIRAMAS = new UnicodeSet("[:ccc=virama:]"); + static final UnicodeSet T = new UnicodeSet("[:jt=T:]"); + static final UnicodeSet L_D = new UnicodeSet("[[:jt=L:][:jt=D:]]"); + static final UnicodeSet R_D = new UnicodeSet("[[:jt=R:][:jt=D:]]"); - static final Pattern JOINERS = Pattern.compile("[\u200C\u200D]"); - static final int NON_JOINER = 0x200C; - static final int JOINER = 0x200D; + static final Pattern JOINERS = Pattern.compile("[\u200C\u200D]"); + static final int NON_JOINER = 0x200C; + static final int JOINER = 0x200D; // U+200C ( ) ZERO WIDTH NON-JOINER // U+200D ( ) ZERO WIDTH JOINER @@ -308,7 +307,11 @@ public static boolean hasContextJError(String domain, Set errors) { if (beforeT > 0) { final int previousChar = Character.codePointBefore(domain, beforeT); if (L_D.contains(previousChar)) { - final int afterT = (i + 1) + T.span(domain.subSequence(i + 1, domain.length()), SpanCondition.CONTAINED); + final int afterT = + (i + 1) + + T.span( + domain.subSequence(i + 1, domain.length()), + SpanCondition.CONTAINED); if (afterT < domain.length()) { final int nextChar = Character.codePointAt(domain, afterT); if (R_D.contains(nextChar)) { @@ -370,26 +373,27 @@ protected String fromPunycode(String label, Set errors) { } } - static final UnicodeSet ASCII = new UnicodeSet("[\\u0000-\\u007F]").freeze(); + static final UnicodeSet ASCII = new UnicodeSet("[\\u0000-\\u007F]").freeze(); public enum IdnaChoice { - transitional, nontransitional + transitional, + 
nontransitional } - public static final int UIDNA_ERROR_INVALID_ACE_LABEL = 1; - public static final int UIDNA_ERROR_DISALLOWED = 2; - public static final int UIDNA_ERROR_PUNYCODE = 4; - public static final int UIDNA_ERROR_CONTEXTJ = 8; - public static final int UIDNA_ERROR_LABEL_TOO_LONG = 16; - public static final int UIDNA_ERROR_DOMAIN_NAME_TOO_LONG = 32; - public static final int UIDNA_ERROR_EMPTY_LABEL = 64; - public static final int UIDNA_ERROR_LEADING_HYPHEN = 128; - public static final int UIDNA_ERROR_HYPHEN_3_4 = 256; - public static final int UIDNA_ERROR_TRAILING_HYPHEN = 512; + public static final int UIDNA_ERROR_INVALID_ACE_LABEL = 1; + public static final int UIDNA_ERROR_DISALLOWED = 2; + public static final int UIDNA_ERROR_PUNYCODE = 4; + public static final int UIDNA_ERROR_CONTEXTJ = 8; + public static final int UIDNA_ERROR_LABEL_TOO_LONG = 16; + public static final int UIDNA_ERROR_DOMAIN_NAME_TOO_LONG = 32; + public static final int UIDNA_ERROR_EMPTY_LABEL = 64; + public static final int UIDNA_ERROR_LEADING_HYPHEN = 128; + public static final int UIDNA_ERROR_HYPHEN_3_4 = 256; + public static final int UIDNA_ERROR_TRAILING_HYPHEN = 512; public static final int UIDNA_ERROR_LEADING_COMBINING_MARK = 1024; - public static final int UIDNA_ERROR_BIDI = 2048; - public static final int UIDNA_ERROR_LABEL_HAS_DOT = 4096; - public static final int UIDNA_NOT_IDNA2008 = 8192; + public static final int UIDNA_ERROR_BIDI = 2048; + public static final int UIDNA_ERROR_LABEL_HAS_DOT = 4096; + public static final int UIDNA_NOT_IDNA2008 = 8192; public enum Errors { B1(UIDNA_ERROR_BIDI), @@ -441,23 +445,25 @@ public String process(String domainName, IdnaChoice idnaChoice, Set erro } /** - * Returns number of new errors. Must already be in canonical form, unicode with period separators. + * Returns number of new errors. Must already be in canonical form, unicode with period + * separators. 
+ * * @param domainName * @param errors * @return */ public static int hasBidiOrContextError(String domainName, Set errors) { if (domainName.endsWith(".")) { - domainName = domainName.substring(0,domainName.length() - 1); + domainName = domainName.substring(0, domainName.length() - 1); } -// From end of https://tools.ietf.org/html/rfc5893#section-1.4 -// An RTL label is a label that contains at least one character of type -// R, AL, or AN. -// -// An LTR label is any label that is not an RTL label. -// -// A "Bidi domain name" is a domain name that contains at least one RTL -// label. + // From end of https://tools.ietf.org/html/rfc5893#section-1.4 + // An RTL label is a label that contains at least one character of type + // R, AL, or AN. + // + // An LTR label is any label that is not an RTL label. + // + // A "Bidi domain name" is a domain name that contains at least one RTL + // label. boolean isBidi = R_AL_AN.containsSome(domainName); final int oldErrorLength = errors.size(); for (final String label : PERIOD.split(domainName)) { @@ -487,44 +493,45 @@ private String processMap(String domainName, IdnaChoice idnaChoice, Set // disallowed: Leave the code point unchanged in the string, and // record that there was an error. switch (type) { - case disallowed: - errors.add(Errors.P1); - buffer.appendCodePoint(cp); - break; - // ignored: Remove the code point from the string. This is - // equivalent to mapping the code point to an empty string. - case ignored: - break; - // mapped: Replace the code point in the string by the value for the - // mapping in Section 5, IDNA Mapping Table. - case mapped: - String mapped = mappings.get(cp); - buffer.append(mapped); - break; - // deviation: - // For Transitional Processing, replace the code point in the string - // by the value for the mapping in Section 5, IDNA Mapping Table. - // For Nontransitional Processing, leave the code point unchanged in - // the string. 
- case deviation: - if (idnaChoice == IdnaChoice.transitional) { - mapped = mappings.get(cp); + case disallowed: + errors.add(Errors.P1); + buffer.appendCodePoint(cp); + break; + // ignored: Remove the code point from the string. This is + // equivalent to mapping the code point to an empty string. + case ignored: + break; + // mapped: Replace the code point in the string by the value for the + // mapping in Section 5, IDNA Mapping Table. + case mapped: + String mapped = mappings.get(cp); buffer.append(mapped); - } else { + break; + // deviation: + // For Transitional Processing, replace the code point in the string + // by the value for the mapping in Section 5, IDNA Mapping Table. + // For Nontransitional Processing, leave the code point unchanged in + // the string. + case deviation: + if (idnaChoice == IdnaChoice.transitional) { + mapped = mappings.get(cp); + buffer.append(mapped); + } else { + buffer.appendCodePoint(cp); + } + break; + // valid: Leave the code point unchanged in the string. + case valid: buffer.appendCodePoint(cp); - } - break; - // valid: Leave the code point unchanged in the string. 
- case valid: - buffer.appendCodePoint(cp); - break; + break; } } domainName = buffer.toString(); return domainName; } - private String processConvertValidateLabels(IdnaChoice idnaChoice, Set errors, Iterable labels) { + private String processConvertValidateLabels( + IdnaChoice idnaChoice, Set errors, Iterable labels) { String domainName; final StringBuilder buffer = new StringBuilder(); boolean first = true; @@ -562,17 +569,18 @@ private String processConvertValidateLabels(IdnaChoice idnaChoice, Set e } buffer.append(label); } -// drop final period -// if (buffer.length() > 0 && Character.codePointBefore(buffer, buffer.length()) == '.') { -// buffer.setLength(buffer.length()-1); -// } + // drop final period + // if (buffer.length() > 0 && Character.codePointBefore(buffer, buffer.length()) == + // '.') { + // buffer.setLength(buffer.length()-1); + // } domainName = buffer.toString(); return domainName; } - static final Pattern HYPHEN34 = Pattern.compile("..--.*"); - static final Pattern HYPHEN_START_END = Pattern.compile("(-.*)|(.*-)"); - static final UnicodeSet MARKS = new UnicodeSet("[:M:]").freeze(); + static final Pattern HYPHEN34 = Pattern.compile("..--.*"); + static final Pattern HYPHEN_START_END = Pattern.compile("(-.*)|(.*-)"); + static final UnicodeSet MARKS = new UnicodeSet("[:M:]").freeze(); private void checkLabelValidity(String label, IdnaChoice idnaChoice, Set errors) { // Each of the following criteria must be satisfied for a label: @@ -613,15 +621,15 @@ private void checkLabelValidity(String label, IdnaChoice idnaChoice, Set // For Nontransitional Processing, each value must be either valid // or deviation. 
switch (type) { - case valid: - break; - case deviation: - if (idnaChoice == IdnaChoice.transitional) { + case valid: + break; + case deviation: + if (idnaChoice == IdnaChoice.transitional) { + errors.add(Errors.V6); + } + break; + default: errors.add(Errors.V6); - } - break; - default: - errors.add(Errors.V6); } } } @@ -666,17 +674,20 @@ public String toASCII(String domainName, IdnaChoice idnaChoice, Set erro } buffer.append(label); } -// drop final period -// if (buffer.length() > 0 && Character.codePointBefore(buffer, buffer.length()) == '.') { -// buffer.setLength(buffer.length()-1); -// } + // drop final period + // if (buffer.length() > 0 && Character.codePointBefore(buffer, buffer.length()) == + // '.') { + // buffer.setLength(buffer.length()-1); + // } domainName = buffer.toString(); // Verify DNS length restrictions. This may record an error. For more // information, see [STD13] and [STD3]. // The length of the domain name, excluding the root label and its dot, // is from 1 to 253. 
final int labelDomainNameLength = UTF16.countCodePoint(domainName); - if (labelDomainNameLength < 0 || labelDomainNameLength > 254 || labelDomainNameLength == 254 && !domainName.endsWith(".")) { + if (labelDomainNameLength < 0 + || labelDomainNameLength > 254 + || labelDomainNameLength == 254 && !domainName.endsWith(".")) { errors.add(Errors.A4_1); } // If an error was recorded, then the operation failed, and no DNS diff --git a/unicodetools/src/main/java/org/unicode/jsp/CharEncoder.java b/unicodetools/src/main/java/org/unicode/jsp/CharEncoder.java index 645aa42d3..d6483d9b4 100644 --- a/unicodetools/src/main/java/org/unicode/jsp/CharEncoder.java +++ b/unicodetools/src/main/java/org/unicode/jsp/CharEncoder.java @@ -1,6 +1,4 @@ -/** - * - */ +/** */ package org.unicode.jsp; import java.nio.BufferUnderflowException; @@ -28,7 +26,6 @@ public class CharEncoder { private final CharBuffer returnCharBuffer = CharBuffer.wrap(returnChars); /** - * * @param charset * @param verifyRoundtrip * @param justCheck @@ -36,12 +33,14 @@ public class CharEncoder { public CharEncoder(Charset charset, boolean verifyRoundtrip, boolean justCheck) { this.verifyRoundtrip = verifyRoundtrip; this.justCheck = justCheck; - encoder = charset.newEncoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - decoder = charset.newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); + encoder = + charset.newEncoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + decoder = + charset.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); } public boolean isVerifyRoundtrip() { @@ -49,8 +48,10 @@ public boolean isVerifyRoundtrip() { } /** - * Convert the code point. Return -1 if fails. If justCheck, then return 1 if success. 
Otherwise return length of the bytes - * converted, and fill in the destination. In either case, if isVerifyRoundtrip() then check that the roundtrip works. + * Convert the code point. Return -1 if fails. If justCheck, then return 1 if success. Otherwise + * return length of the bytes converted, and fill in the destination. In either case, if + * isVerifyRoundtrip() then check that the roundtrip works. + * * @param codepoint * @param destination * @param offset @@ -100,4 +101,4 @@ public int getValue(int codepoint, byte[] destination, int offset) { return -1; } } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/jsp/FileUtilities.java b/unicodetools/src/main/java/org/unicode/jsp/FileUtilities.java index 93ff7b817..366627e50 100644 --- a/unicodetools/src/main/java/org/unicode/jsp/FileUtilities.java +++ b/unicodetools/src/main/java/org/unicode/jsp/FileUtilities.java @@ -14,12 +14,14 @@ public final class FileUtilities { - public static abstract class SemiFileReader { - public final static Pattern SPLIT = Pattern.compile("\\s*;\\s*"); + public abstract static class SemiFileReader { + public static final Pattern SPLIT = Pattern.compile("\\s*;\\s*"); private int lineCount; protected void handleStart() {} + protected abstract boolean handleLine(int start, int end, String[] items); + protected void handleEnd() {} public int getLineCount() { @@ -39,23 +41,28 @@ public SemiFileReader process(Class classLocation, String fileName) { try { in = FileUtilities.openFile(classLocation, fileName); } catch (final Exception e) { - throw (RuntimeException) new IllegalArgumentException(classLocation.getName() + ", " + fileName).initCause(e); + throw (RuntimeException) + new IllegalArgumentException(classLocation.getName() + ", " + fileName) + .initCause(e); } try { return process(in, fileName); } catch (final Exception e) { - throw (RuntimeException) new IllegalArgumentException(lineCount + ":\t" + 0).initCause(e); + throw (RuntimeException) + new 
IllegalArgumentException(lineCount + ":\t" + 0).initCause(e); } } public SemiFileReader process(String directory, String fileName) { try { final FileInputStream fileStream = new FileInputStream(directory + "/" + fileName); - final InputStreamReader reader = new InputStreamReader(fileStream, StandardCharsets.UTF_8); - final BufferedReader bufferedReader = new BufferedReader(reader,1024*64); + final InputStreamReader reader = + new InputStreamReader(fileStream, StandardCharsets.UTF_8); + final BufferedReader bufferedReader = new BufferedReader(reader, 1024 * 64); return process(bufferedReader, fileName); } catch (final Exception e) { - throw (RuntimeException) new IllegalArgumentException(lineCount + ":\t" + 0).initCause(e); + throw (RuntimeException) + new IllegalArgumentException(lineCount + ":\t" + 0).initCause(e); } } @@ -72,7 +79,7 @@ public SemiFileReader process(BufferedReader in, String fileName) { final int comment = line.indexOf("#"); if (comment >= 0) { processComment(line, comment); - line = line.substring(0,comment); + line = line.substring(0, comment); } if (line.startsWith("\uFEFF")) { line = line.substring(1); @@ -87,8 +94,8 @@ public SemiFileReader process(BufferedReader in, String fileName) { final String source = parts[0]; final int range = source.indexOf(".."); if (range >= 0) { - start = Integer.parseInt(source.substring(0,range),16); - end = Integer.parseInt(source.substring(range+2),16); + start = Integer.parseInt(source.substring(0, range), 16); + end = Integer.parseInt(source.substring(range + 2), 16); } else { start = end = Integer.parseInt(source, 16); } @@ -102,21 +109,23 @@ public SemiFileReader process(BufferedReader in, String fileName) { in.close(); handleEnd(); } catch (final Exception e) { - throw (RuntimeException) new IllegalArgumentException(lineCount + ":\t" + line).initCause(e); + throw (RuntimeException) + new IllegalArgumentException(lineCount + ":\t" + line).initCause(e); } return this; } - protected void 
processComment(String line, int comment) { - } + + protected void processComment(String line, int comment) {} } // - // public static SemiFileReader fillMapFromSemi(Class classLocation, String fileName, SemiFileReader handler) { + // public static SemiFileReader fillMapFromSemi(Class classLocation, String fileName, + // SemiFileReader handler) { // return handler.process(classLocation, fileName); // } public static BufferedReader openFile(Class class1, String file) throws IOException { - //URL path = null; - //String externalForm = null; + // URL path = null; + // String externalForm = null; try { // //System.out.println("Reading:\t" + file1.getCanonicalPath()); // path = class1.getResource(file); @@ -128,20 +137,27 @@ public static BufferedReader openFile(Class class1, String file) throws IOExcept // boolean x = file1.canRead(); // final InputStream resourceAsStream = new FileInputStream(file1); final InputStream resourceAsStream = class1.getResourceAsStream(file); - final InputStreamReader reader = new InputStreamReader(resourceAsStream, StandardCharsets.UTF_8); - final BufferedReader bufferedReader = new BufferedReader(reader,1024*64); + final InputStreamReader reader = + new InputStreamReader(resourceAsStream, StandardCharsets.UTF_8); + final BufferedReader bufferedReader = new BufferedReader(reader, 1024 * 64); return bufferedReader; } catch (final Exception e) { final File file1 = new File(file); final String foo = class1.getResource(".").toString(); - throw (RuntimeException) new IllegalArgumentException("Bad file name: " - // + path + "\t" + externalForm + "\t" + - + file1.getCanonicalPath() - + "\n" + foo - + "\n" + new File(".").getCanonicalFile() + " => " + Arrays.asList(new File(".").getCanonicalFile().list()) - ) - .initCause(e); + throw (RuntimeException) + new IllegalArgumentException( + "Bad file name: " + // + path + "\t" + externalForm + "\t" + + + file1.getCanonicalPath() + + "\n" + + foo + + "\n" + + new File(".").getCanonicalFile() + + " => " + 
+ Arrays.asList( + new File(".").getCanonicalFile().list())) + .initCause(e); } } @@ -155,27 +171,28 @@ static String[] splitCommaSeparated(String line) { boolean inQuote = false; for (int i = 0; i < line.length(); ++i) { final char ch = line.charAt(i); // don't worry about supplementaries - switch(ch) { - case '"': - inQuote = !inQuote; - // at start or end, that's enough - // if get a quote when we are not in a quote, and not at start, then add it and return to inQuote - if (inQuote && item.length() != 0) { - item.append('"'); - inQuote = true; - } - break; - case ',': - if (!inQuote) { - result.add(item.toString()); - item.setLength(0); - } else { + switch (ch) { + case '"': + inQuote = !inQuote; + // at start or end, that's enough + // if get a quote when we are not in a quote, and not at start, then add it and + // return to inQuote + if (inQuote && item.length() != 0) { + item.append('"'); + inQuote = true; + } + break; + case ',': + if (!inQuote) { + result.add(item.toString()); + item.setLength(0); + } else { + item.append(ch); + } + break; + default: item.append(ch); - } - break; - default: - item.append(ch); - break; + break; } } result.add(item.toString()); diff --git a/unicodetools/src/main/java/org/unicode/jsp/ICUPropertyFactory.java b/unicodetools/src/main/java/org/unicode/jsp/ICUPropertyFactory.java index 8e8fa7547..7e80548d0 100644 --- a/unicodetools/src/main/java/org/unicode/jsp/ICUPropertyFactory.java +++ b/unicodetools/src/main/java/org/unicode/jsp/ICUPropertyFactory.java @@ -6,6 +6,15 @@ */ package org.unicode.jsp; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.VersionInfo; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -18,535 +27,585 @@ import 
java.util.Map; import java.util.Set; import java.util.TreeSet; - -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.VersionInfo; - import org.unicode.props.UnicodeProperty; - /** - * Provides a general interface for Unicode Properties, and - * extracting sets based on those values. + * Provides a general interface for Unicode Properties, and extracting sets based on those values. + * * @author Davis */ - public class ICUPropertyFactory extends UnicodeProperty.Factory { - static class ICUProperty extends UnicodeProperty { - protected int propEnum = Integer.MIN_VALUE; - - protected ICUProperty(String propName, int propEnum) { - setName(propName); - this.propEnum = propEnum; - setType(internalGetPropertyType(propEnum)); - if (propEnum == UProperty.DEFAULT_IGNORABLE_CODE_POINT - || propEnum == UProperty.BIDI_CLASS - || propEnum == UProperty.BLOCK - || propEnum == UProperty.EAST_ASIAN_WIDTH - || propEnum == UProperty.LINE_BREAK - || propEnum == UProperty.NONCHARACTER_CODE_POINT - || propEnum == UProperty.PATTERN_SYNTAX - || propEnum == UProperty.PATTERN_WHITE_SPACE - || propEnum == UProperty.CHANGES_WHEN_CASEFOLDED - || propEnum == UProperty.EMOJI - || propEnum == UProperty.EMOJI_MODIFIER - || propEnum == UProperty.EMOJI_MODIFIER_BASE - || propEnum == UProperty.EMOJI_PRESENTATION - || propEnum == UProperty.EXTENDED_PICTOGRAPHIC - ) { - setUniformUnassigned(false); - } - } + static class ICUProperty extends UnicodeProperty { + protected int propEnum = Integer.MIN_VALUE; + + protected ICUProperty(String propName, int propEnum) { + setName(propName); + this.propEnum = propEnum; + setType(internalGetPropertyType(propEnum)); + if (propEnum == UProperty.DEFAULT_IGNORABLE_CODE_POINT + || propEnum == UProperty.BIDI_CLASS + || 
propEnum == UProperty.BLOCK + || propEnum == UProperty.EAST_ASIAN_WIDTH + || propEnum == UProperty.LINE_BREAK + || propEnum == UProperty.NONCHARACTER_CODE_POINT + || propEnum == UProperty.PATTERN_SYNTAX + || propEnum == UProperty.PATTERN_WHITE_SPACE + || propEnum == UProperty.CHANGES_WHEN_CASEFOLDED + || propEnum == UProperty.EMOJI + || propEnum == UProperty.EMOJI_MODIFIER + || propEnum == UProperty.EMOJI_MODIFIER_BASE + || propEnum == UProperty.EMOJI_PRESENTATION + || propEnum == UProperty.EXTENDED_PICTOGRAPHIC) { + setUniformUnassigned(false); + } + } - boolean shownException = false; - - public String _getValue(int codePoint) { - switch(propEnum) { - case UProperty.AGE: return getAge(codePoint); - case UProperty.BIDI_MIRRORING_GLYPH: return UTF16.valueOf(UCharacter.getMirror(codePoint)); - case UProperty.CASE_FOLDING: return UCharacter.foldCase(UTF16.valueOf(codePoint),true); - case UProperty.ISO_COMMENT: return UCharacter.getISOComment(codePoint); - case UProperty.LOWERCASE_MAPPING: return UCharacter.toLowerCase(Locale.ENGLISH,UTF16.valueOf(codePoint)); - case UProperty.NAME: return UCharacter.getName(codePoint); - case UProperty.SIMPLE_CASE_FOLDING: return UTF16.valueOf(UCharacter.foldCase(codePoint,true)); - case UProperty.SIMPLE_LOWERCASE_MAPPING: return UTF16.valueOf(UCharacter.toLowerCase(codePoint)); - case UProperty.SIMPLE_TITLECASE_MAPPING: return UTF16.valueOf(UCharacter.toTitleCase(codePoint)); - case UProperty.SIMPLE_UPPERCASE_MAPPING: return UTF16.valueOf(UCharacter.toUpperCase(codePoint)); - case UProperty.TITLECASE_MAPPING: return UCharacter.toTitleCase(Locale.ENGLISH,UTF16.valueOf(codePoint),null); - case UProperty.UNICODE_1_NAME: return UCharacter.getName1_0(codePoint); - case UProperty.UPPERCASE_MAPPING: return UCharacter.toUpperCase(Locale.ENGLISH,UTF16.valueOf(codePoint)); -// case NFC: return Normalizer.normalize(codePoint, Normalizer.NFC); -// case NFD: return Normalizer.normalize(codePoint, Normalizer.NFD); -// case NFKC: return 
Normalizer.normalize(codePoint, Normalizer.NFKC); -// case NFKD: return Normalizer.normalize(codePoint, Normalizer.NFKD); - case isNFC: return String.valueOf(Normalizer.normalize(codePoint, Normalizer.NFC).equals(UTF16.valueOf(codePoint))); - case isNFD: return String.valueOf(Normalizer.normalize(codePoint, Normalizer.NFD).equals(UTF16.valueOf(codePoint))); - case isNFKC: return String.valueOf(Normalizer.normalize(codePoint, Normalizer.NFKC).equals(UTF16.valueOf(codePoint))); - case isNFKD: return String.valueOf(Normalizer.normalize(codePoint, Normalizer.NFKD).equals(UTF16.valueOf(codePoint))); - case isLowercase: return String.valueOf(UCharacter.toLowerCase(Locale.ENGLISH,UTF16.valueOf(codePoint)).equals(UTF16.valueOf(codePoint))); - case isUppercase: return String.valueOf(UCharacter.toUpperCase(Locale.ENGLISH,UTF16.valueOf(codePoint)).equals(UTF16.valueOf(codePoint))); - case isTitlecase: return String.valueOf(UCharacter.toTitleCase(Locale.ENGLISH,UTF16.valueOf(codePoint),null).equals(UTF16.valueOf(codePoint))); - case isCasefolded: return String.valueOf(UCharacter.foldCase(UTF16.valueOf(codePoint),true).equals(UTF16.valueOf(codePoint))); - case isCased: return String.valueOf(UCharacter.toLowerCase(Locale.ENGLISH,UTF16.valueOf(codePoint)).equals(UTF16.valueOf(codePoint))); - } - if (propEnum < UProperty.INT_LIMIT) { - int enumValue = -1; - String value = null; - try { - enumValue = UCharacter.getIntPropertyValue(codePoint, propEnum); - if (enumValue >= 0) value = fixedGetPropertyValueName(propEnum,enumValue, UProperty.NameChoice.LONG); - } catch (IllegalArgumentException e) { - if (!shownException) { - System.out.println("Fail: " + getName() + ", " + Integer.toHexString(codePoint)); - shownException = true; - } + boolean shownException = false; + + public String _getValue(int codePoint) { + switch (propEnum) { + case UProperty.AGE: + return getAge(codePoint); + case UProperty.BIDI_MIRRORING_GLYPH: + return UTF16.valueOf(UCharacter.getMirror(codePoint)); + case 
UProperty.CASE_FOLDING: + return UCharacter.foldCase(UTF16.valueOf(codePoint), true); + case UProperty.ISO_COMMENT: + return UCharacter.getISOComment(codePoint); + case UProperty.LOWERCASE_MAPPING: + return UCharacter.toLowerCase(Locale.ENGLISH, UTF16.valueOf(codePoint)); + case UProperty.NAME: + return UCharacter.getName(codePoint); + case UProperty.SIMPLE_CASE_FOLDING: + return UTF16.valueOf(UCharacter.foldCase(codePoint, true)); + case UProperty.SIMPLE_LOWERCASE_MAPPING: + return UTF16.valueOf(UCharacter.toLowerCase(codePoint)); + case UProperty.SIMPLE_TITLECASE_MAPPING: + return UTF16.valueOf(UCharacter.toTitleCase(codePoint)); + case UProperty.SIMPLE_UPPERCASE_MAPPING: + return UTF16.valueOf(UCharacter.toUpperCase(codePoint)); + case UProperty.TITLECASE_MAPPING: + return UCharacter.toTitleCase(Locale.ENGLISH, UTF16.valueOf(codePoint), null); + case UProperty.UNICODE_1_NAME: + return UCharacter.getName1_0(codePoint); + case UProperty.UPPERCASE_MAPPING: + return UCharacter.toUpperCase(Locale.ENGLISH, UTF16.valueOf(codePoint)); + // case NFC: return Normalizer.normalize(codePoint, Normalizer.NFC); + // case NFD: return Normalizer.normalize(codePoint, Normalizer.NFD); + // case NFKC: return Normalizer.normalize(codePoint, Normalizer.NFKC); + // case NFKD: return Normalizer.normalize(codePoint, Normalizer.NFKD); + case isNFC: + return String.valueOf( + Normalizer.normalize(codePoint, Normalizer.NFC) + .equals(UTF16.valueOf(codePoint))); + case isNFD: + return String.valueOf( + Normalizer.normalize(codePoint, Normalizer.NFD) + .equals(UTF16.valueOf(codePoint))); + case isNFKC: + return String.valueOf( + Normalizer.normalize(codePoint, Normalizer.NFKC) + .equals(UTF16.valueOf(codePoint))); + case isNFKD: + return String.valueOf( + Normalizer.normalize(codePoint, Normalizer.NFKD) + .equals(UTF16.valueOf(codePoint))); + case isLowercase: + return String.valueOf( + UCharacter.toLowerCase(Locale.ENGLISH, UTF16.valueOf(codePoint)) + .equals(UTF16.valueOf(codePoint))); + 
case isUppercase: + return String.valueOf( + UCharacter.toUpperCase(Locale.ENGLISH, UTF16.valueOf(codePoint)) + .equals(UTF16.valueOf(codePoint))); + case isTitlecase: + return String.valueOf( + UCharacter.toTitleCase(Locale.ENGLISH, UTF16.valueOf(codePoint), null) + .equals(UTF16.valueOf(codePoint))); + case isCasefolded: + return String.valueOf( + UCharacter.foldCase(UTF16.valueOf(codePoint), true) + .equals(UTF16.valueOf(codePoint))); + case isCased: + return String.valueOf( + UCharacter.toLowerCase(Locale.ENGLISH, UTF16.valueOf(codePoint)) + .equals(UTF16.valueOf(codePoint))); + } + if (propEnum < UProperty.INT_LIMIT) { + int enumValue = -1; + String value = null; + try { + enumValue = UCharacter.getIntPropertyValue(codePoint, propEnum); + if (enumValue >= 0) + value = + fixedGetPropertyValueName( + propEnum, enumValue, UProperty.NameChoice.LONG); + } catch (IllegalArgumentException e) { + if (!shownException) { + System.out.println( + "Fail: " + getName() + ", " + Integer.toHexString(codePoint)); + shownException = true; + } + } + return value != null ? value : String.valueOf(enumValue); + } else if (propEnum < UProperty.DOUBLE_LIMIT) { + double num = UCharacter.getUnicodeNumericValue(codePoint); + if (num == UCharacter.NO_NUMERIC_VALUE) return null; + return Double.toString(num); + // TODO: Fix HACK -- API deficient + } + return null; } - return value != null ? 
value : String.valueOf(enumValue); - } else if (propEnum < UProperty.DOUBLE_LIMIT) { - double num = UCharacter.getUnicodeNumericValue(codePoint); - if (num == UCharacter.NO_NUMERIC_VALUE) return null; - return Double.toString(num); - // TODO: Fix HACK -- API deficient - } - return null; - } - private String getAge(int codePoint) { - String temp = UCharacter.getAge(codePoint).toString(); - if (temp.equals("0.0.0.0")) return "unassigned"; - if (temp.endsWith(".0.0")) return temp.substring(0,temp.length()-4); - return temp; - } + private String getAge(int codePoint) { + String temp = UCharacter.getAge(codePoint).toString(); + if (temp.equals("0.0.0.0")) return "unassigned"; + if (temp.endsWith(".0.0")) return temp.substring(0, temp.length() - 4); + return temp; + } - /** - * @param propId TODO - * @param valueAlias null if unused. - * @param valueEnum -1 if unused - * @param nameChoice - * @return - */ - private String getFixedValueAlias(int propId, String valueAlias, int valueEnum, int nameChoice) { - if (propId >= UProperty.STRING_START) { - if (nameChoice > UProperty.NameChoice.LONG) throw new IllegalArgumentException(); - if (nameChoice != UProperty.NameChoice.LONG) return null; - return ""; - } else if (propId >= UProperty.DOUBLE_START) { - if (nameChoice > UProperty.NameChoice.LONG) throw new IllegalArgumentException(); - if (nameChoice != UProperty.NameChoice.LONG) return null; - return ""; - } - if (valueAlias != null && !valueAlias.equals("")) { - valueEnum = fixedGetPropertyValueEnum(propId,valueAlias); - } - // because these are defined badly, there may be no normal (long) name. 
- // if there is - String result = fixedGetPropertyValueName(propId, valueEnum, nameChoice); - if (result != null) return result; - // HACK try other namechoice - if (nameChoice == UProperty.NameChoice.LONG) { - result = fixedGetPropertyValueName(propId,valueEnum, UProperty.NameChoice.SHORT); - if (result != null) return result; - if (isCombiningClassProperty()) return null; - return ""; - } - return null; - } + /** + * @param propId TODO + * @param valueAlias null if unused. + * @param valueEnum -1 if unused + * @param nameChoice + * @return + */ + private String getFixedValueAlias( + int propId, String valueAlias, int valueEnum, int nameChoice) { + if (propId >= UProperty.STRING_START) { + if (nameChoice > UProperty.NameChoice.LONG) throw new IllegalArgumentException(); + if (nameChoice != UProperty.NameChoice.LONG) return null; + return ""; + } else if (propId >= UProperty.DOUBLE_START) { + if (nameChoice > UProperty.NameChoice.LONG) throw new IllegalArgumentException(); + if (nameChoice != UProperty.NameChoice.LONG) return null; + return ""; + } + if (valueAlias != null && !valueAlias.equals("")) { + valueEnum = fixedGetPropertyValueEnum(propId, valueAlias); + } + // because these are defined badly, there may be no normal (long) name. 
+ // if there is + String result = fixedGetPropertyValueName(propId, valueEnum, nameChoice); + if (result != null) return result; + // HACK try other namechoice + if (nameChoice == UProperty.NameChoice.LONG) { + result = fixedGetPropertyValueName(propId, valueEnum, UProperty.NameChoice.SHORT); + if (result != null) return result; + if (isCombiningClassProperty()) return null; + return ""; + } + return null; + } - public boolean isCombiningClassProperty() { - return (propEnum == UProperty.CANONICAL_COMBINING_CLASS - || propEnum == UProperty.LEAD_CANONICAL_COMBINING_CLASS - || propEnum == UProperty.TRAIL_CANONICAL_COMBINING_CLASS - ); - } + public boolean isCombiningClassProperty() { + return (propEnum == UProperty.CANONICAL_COMBINING_CLASS + || propEnum == UProperty.LEAD_CANONICAL_COMBINING_CLASS + || propEnum == UProperty.TRAIL_CANONICAL_COMBINING_CLASS); + } - private static int fixedGetPropertyValueEnum(int propEnum, String valueAlias) { - try { - if (propEnum < BINARY_LIMIT) { - propEnum = UProperty.ALPHABETIC; + private static int fixedGetPropertyValueEnum(int propEnum, String valueAlias) { + try { + if (propEnum < BINARY_LIMIT) { + propEnum = UProperty.ALPHABETIC; + } + return UCharacter.getPropertyValueEnum(propEnum, valueAlias); + } catch (Exception e) { + return Integer.parseInt(valueAlias); + } } - return UCharacter.getPropertyValueEnum(propEnum, valueAlias); - } catch (Exception e) { - return Integer.parseInt(valueAlias); - } - } - static Map fixSkeleton = new HashMap(); - private static String fixedGetPropertyValueName(int propEnum, int valueEnum, int nameChoice) { + static Map fixSkeleton = new HashMap(); + + private static String fixedGetPropertyValueName( + int propEnum, int valueEnum, int nameChoice) { + + String value = UCharacter.getPropertyValueName(propEnum, valueEnum, nameChoice); + String newValue = (String) fixSkeleton.get(value); + if (newValue == null) { + newValue = value; + if (propEnum == UProperty.JOINING_GROUP) { + newValue = newValue 
== null ? null : newValue.toLowerCase(Locale.ENGLISH); + } + newValue = regularize(newValue, true); + if (propEnum == UProperty.BLOCK && newValue.equals("Sutton_Sign_Writing")) { + newValue = "Sutton_SignWriting"; + } + fixSkeleton.put(value, newValue); + } + return newValue; + } - String value = UCharacter.getPropertyValueName(propEnum,valueEnum,nameChoice); - String newValue = (String) fixSkeleton.get(value); - if (newValue == null) { - newValue = value; - if (propEnum == UProperty.JOINING_GROUP) { - newValue = newValue == null ? null : newValue.toLowerCase(Locale.ENGLISH); + public List _getNameAliases(List result) { + if (result == null) result = new ArrayList(); + // String alias = String_Extras.get(propEnum); + // if (alias == null) + String alias = Binary_Extras.get(propEnum); + if (alias != null) { + addUnique(alias, result); + } else { + addUnique(getFixedPropertyName(propEnum, UProperty.NameChoice.SHORT), result); + addUnique(getFixedPropertyName(propEnum, UProperty.NameChoice.LONG), result); + } + return result; } - newValue = regularize(newValue, true); - if (propEnum == UProperty.BLOCK && newValue.equals("Sutton_Sign_Writing")) { - newValue = "Sutton_SignWriting"; + + public String getFixedPropertyName(int propName, int nameChoice) { + try { + return UCharacter.getPropertyName(propEnum, nameChoice); + } catch (IllegalArgumentException e) { + return null; + } } - fixSkeleton.put(value, newValue); - } - return newValue; - } - public List _getNameAliases(List result) { - if (result == null) result = new ArrayList(); -// String alias = String_Extras.get(propEnum); -// if (alias == null) - String alias = Binary_Extras.get(propEnum); - if (alias != null) { - addUnique(alias, result); - } else { - addUnique(getFixedPropertyName(propEnum, UProperty.NameChoice.SHORT), result); - addUnique(getFixedPropertyName(propEnum, UProperty.NameChoice.LONG), result); - } - return result; - } + private static Map cccHack = new HashMap(); + private static Set cccExtras = new 
HashSet(); + + static { + int start = UCharacter.getIntPropertyMinValue(UProperty.CANONICAL_COMBINING_CLASS); + int end = UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS); + for (int i = 0; i <= 255; ++i) { + String alias = + UCharacter.getPropertyValueName( + UProperty.CANONICAL_COMBINING_CLASS, i, UProperty.NameChoice.LONG); + String numStr = String.valueOf(i); + if (alias != null) { + cccHack.put(alias, numStr); + } else { + cccHack.put(numStr, numStr); + cccExtras.add(numStr); + } + } + } - public String getFixedPropertyName(int propName, int nameChoice) { - try { - return UCharacter.getPropertyName(propEnum, nameChoice); - } catch (IllegalArgumentException e) { - return null; - } - } + public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) { + result = super.getSet(matcher, result); + if (propEnum == UProperty.GENERAL_CATEGORY) { + for (String multiprop : SPECIAL_GC.keySet()) { + R2 value = SPECIAL_GC.get(multiprop); + if (matcher.test(multiprop) || matcher.test(value.get0())) { + result.addAll(value.get1()); + } + } + } + return result; + } - private static Map cccHack = new HashMap(); - private static Set cccExtras = new HashSet(); - static { - int start = UCharacter.getIntPropertyMinValue(UProperty.CANONICAL_COMBINING_CLASS); - int end = UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS); - for (int i = 0; i <= 255; ++i) { - String alias = UCharacter.getPropertyValueName(UProperty.CANONICAL_COMBINING_CLASS, i, UProperty.NameChoice.LONG); - String numStr = String.valueOf(i); - if (alias != null) { - cccHack.put(alias, numStr); - } else { - cccHack.put(numStr, numStr); - cccExtras.add(numStr); + static Map> SPECIAL_GC = + new LinkedHashMap>(); + + static { + String[][] extras = { + {"Other", "C", "[[:Cc:][:Cf:][:Cn:][:Co:][:Cs:]]"}, + {"Letter", "L", "[[:Ll:][:Lm:][:Lo:][:Lt:][:Lu:]]"}, + {"Cased_Letter", "LC", "[[:Ll:][:Lt:][:Lu:]]"}, + {"Mark", "M", "[[:Mc:][:Me:][:Mn:]]"}, + {"Number", "N", 
"[[:Nd:][:Nl:][:No:]]"}, + {"Punctuation", "P", "[[:Pc:][:Pd:][:Pe:][:Pf:][:Pi:][:Po:][:Ps:]]"}, + {"Symbol", "S", "[[:Sc:][:Sk:][:Sm:][:So:]]"}, + {"Separator", "Z", "[[:Zl:][:Zp:][:Zs:]]"}, + }; + for (String[] extra : extras) { + SPECIAL_GC.put( + extra[0], + (R2) + Row.of(extra[1], new UnicodeSet(extra[2]).freeze()).freeze()); + } } - } - } - public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) { - result = super.getSet(matcher, result); - if (propEnum == UProperty.GENERAL_CATEGORY) { - for (String multiprop : SPECIAL_GC.keySet()) { - R2 value = SPECIAL_GC.get(multiprop); - if (matcher.test(multiprop) || matcher.test(value.get0())) { - result.addAll(value.get1()); - } + public List _getAvailableValues(List result) { + if (result == null) result = new ArrayList(); + if (propEnum == UProperty.AGE) { + addAllUnique(getAges(), result); + return result; + } + if (propEnum < UProperty.INT_LIMIT) { + if (Binary_Extras.isInRange(propEnum)) { + propEnum = UProperty.BINARY_START; // HACK + } + addValues(propEnum, result); + if (propEnum == UProperty.GENERAL_CATEGORY) { + for (String item : SPECIAL_GC.keySet()) { + addUnique(item, result); + } + } + } else if (propEnum >= UProperty.DOUBLE_START && propEnum < UProperty.DOUBLE_LIMIT) { + UnicodeMap map = getUnicodeMap(); + Collection values = map.values(); + addAllUnique(values, result); + } else { + String alias = getFixedValueAlias(propEnum, null, -1, UProperty.NameChoice.LONG); + addUnique(alias, result); + } + return result; } - } - return result; - } + private void addValues(int propertyId, List result) { + int start = UCharacter.getIntPropertyMinValue(propertyId); + int end = UCharacter.getIntPropertyMaxValue(propertyId); + for (int i = start; i <= end; ++i) { + String alias = getFixedValueAlias(propEnum, null, i, UProperty.NameChoice.LONG); + String alias2 = getFixedValueAlias(propEnum, null, i, UProperty.NameChoice.SHORT); + if (alias == null) { + alias = alias2; + if (alias == null && 
isCombiningClassProperty()) { + alias = String.valueOf(i); + } + } + // System.out.println(propertyAlias + "\t" + i + ":\t" + alias); + addUnique(alias, result); + } + } - static Map> SPECIAL_GC = new LinkedHashMap>(); - static { - String[][] extras = { - {"Other", "C", "[[:Cc:][:Cf:][:Cn:][:Co:][:Cs:]]"}, - {"Letter", "L", "[[:Ll:][:Lm:][:Lo:][:Lt:][:Lu:]]"}, - {"Cased_Letter", "LC", "[[:Ll:][:Lt:][:Lu:]]"}, - {"Mark", "M", "[[:Mc:][:Me:][:Mn:]]"}, - {"Number", "N", "[[:Nd:][:Nl:][:No:]]"}, - {"Punctuation", "P", "[[:Pc:][:Pd:][:Pe:][:Pf:][:Pi:][:Po:][:Ps:]]"}, - {"Symbol", "S", "[[:Sc:][:Sk:][:Sm:][:So:]]"}, - {"Separator", "Z", "[[:Zl:][:Zp:][:Zs:]]"}, - }; - for (String[] extra : extras) { - SPECIAL_GC.put(extra[0], (R2) Row.of(extra[1], new UnicodeSet(extra[2]).freeze()).freeze()); - } - } + static String[] AGES = null; + + private String[] getAges() { + if (AGES == null) { + Set ages = new TreeSet(); + for (int i = 0; i < 0x10FFFF; ++i) { + ages.add(getAge(i)); + } + AGES = (String[]) ages.toArray(new String[ages.size()]); + } + return AGES; + } - public List _getAvailableValues(List result) { - if (result == null) result = new ArrayList(); - if (propEnum == UProperty.AGE) { - addAllUnique(getAges(), result); - return result; + public List _getValueAliases(String valueAlias, List result) { + if (result == null) result = new ArrayList(); + if (propEnum == UProperty.AGE) { + addUnique(valueAlias, result); + return result; + } + if (isCombiningClassProperty()) { + addUnique(cccHack.get(valueAlias), result); // add number + } + int type = getType(); + if (type == UnicodeProperty.NUMERIC || type == EXTENDED_NUMERIC) { + addUnique(valueAlias, result); + if (valueAlias.endsWith(".0")) { + addUnique(valueAlias.substring(0, valueAlias.length() - 2), result); + } + } else { + R2 temp; + if (propEnum == UProperty.GENERAL_CATEGORY + && (temp = SPECIAL_GC.get(valueAlias)) != null) { + addUnique(valueAlias, result); + addUnique(temp.get0(), result); + } else { + 
addAliases(propEnum, valueAlias, result); + } + } + return result; + } - } - if (propEnum < UProperty.INT_LIMIT) { - if (Binary_Extras.isInRange(propEnum)) { - propEnum = UProperty.BINARY_START; // HACK + private void addAliases(int propId, String valueAlias, List result) { + for (int nameChoice = UProperty.NameChoice.SHORT; ; ++nameChoice) { + try { + addUnique(getFixedValueAlias(propId, valueAlias, -1, nameChoice), result); + } catch (Exception e) { + break; + } + } } - addValues(propEnum, result); - if (propEnum == UProperty.GENERAL_CATEGORY) { - for (String item : SPECIAL_GC.keySet()) { - addUnique(item, result); - } + + /* (non-Javadoc) + * @see com.ibm.icu.dev.util.UnicodePropertySource#getPropertyType() + */ + private int internalGetPropertyType(int prop) { + switch (prop) { + case UProperty.AGE: + case UProperty.BLOCK: + case UProperty.SCRIPT: + return UnicodeProperty.CATALOG; + case UProperty.ISO_COMMENT: + case UProperty.NAME: + case UProperty.UNICODE_1_NAME: + return UnicodeProperty.MISC; + case UProperty.BIDI_MIRRORING_GLYPH: + case UProperty.CASE_FOLDING: + case UProperty.LOWERCASE_MAPPING: + case UProperty.SIMPLE_CASE_FOLDING: + case UProperty.SIMPLE_LOWERCASE_MAPPING: + case UProperty.SIMPLE_TITLECASE_MAPPING: + case UProperty.SIMPLE_UPPERCASE_MAPPING: + case UProperty.TITLECASE_MAPPING: + case UProperty.UPPERCASE_MAPPING: + return UnicodeProperty.EXTENDED_STRING; + } + if (prop < UProperty.BINARY_START) return UnicodeProperty.UNKNOWN; + if (prop < UProperty.BINARY_LIMIT) return UnicodeProperty.BINARY; + if (prop < UProperty.INT_START) return UnicodeProperty.EXTENDED_BINARY; + if (prop < UProperty.INT_LIMIT) return UnicodeProperty.ENUMERATED; + if (prop < UProperty.DOUBLE_START) return UnicodeProperty.EXTENDED_ENUMERATED; + if (prop < UProperty.DOUBLE_LIMIT) return UnicodeProperty.NUMERIC; + if (prop < UProperty.STRING_START) return UnicodeProperty.EXTENDED_NUMERIC; + if (prop < UProperty.STRING_LIMIT) return UnicodeProperty.STRING; + return 
UnicodeProperty.EXTENDED_STRING; } - } else if (propEnum >= UProperty.DOUBLE_START && propEnum < UProperty.DOUBLE_LIMIT) { - UnicodeMap map = getUnicodeMap(); - Collection values = map.values(); - addAllUnique(values, result); - } else { - String alias = getFixedValueAlias(propEnum, null,-1, UProperty.NameChoice.LONG); - addUnique(alias, result); - } - return result; - } - private void addValues(int propertyId, List result) { - int start = UCharacter.getIntPropertyMinValue(propertyId); - int end = UCharacter.getIntPropertyMaxValue(propertyId); - for (int i = start; i <= end; ++i) { - String alias = getFixedValueAlias(propEnum, null, i, UProperty.NameChoice.LONG); - String alias2 = getFixedValueAlias(propEnum, null, i, UProperty.NameChoice.SHORT); - if (alias == null) { - alias = alias2; - if (alias == null && isCombiningClassProperty()) { - alias = String.valueOf(i); - } + /* (non-Javadoc) + * @see com.ibm.icu.dev.util.UnicodeProperty#getVersion() + */ + public String _getVersion() { + return VersionInfo.ICU_VERSION.toString(); } - //System.out.println(propertyAlias + "\t" + i + ":\t" + alias); - addUnique(alias, result); - } } - static String[] AGES = null; - private String[] getAges() { - if (AGES == null) { - Set ages = new TreeSet(); - for (int i = 0; i < 0x10FFFF; ++i) { - ages.add(getAge(i)); + /*{ + matchIterator = new UnicodeSetIterator( + new UnicodeSet("[^[:Cn:]-[:Default_Ignorable_Code_Point:]]")); + }*/ + + /* + * Other Missing Functions: + Expands_On_NFC + Expands_On_NFD + Expands_On_NFKC + Expands_On_NFKD + Composition_Exclusion + Decomposition_Mapping + FC_NFKC_Closure + ISO_Comment + NFC_Quick_Check + NFD_Quick_Check + NFKC_Quick_Check + NFKD_Quick_Check + Special_Case_Condition + Unicode_Radical_Stroke + */ + + static final Names Binary_Extras = + new Names( + UProperty.BINARY_LIMIT, + new String[] { + "isNFC", + "isNFD", + "isNFKC", + "isNFKD", + "isLowercase", + "isUppercase", + "isTitlecase", + "isCasefolded", + "isCased", + }); + + // static 
final Names String_Extras = new Names(UProperty.STRING_LIMIT, + // new String[] { + // "toNFC", "toNFD", "toNFKC", "toNKFD", + // }); + + static final int isNFC = UProperty.BINARY_LIMIT, + isNFD = UProperty.BINARY_LIMIT + 1, + isNFKC = UProperty.BINARY_LIMIT + 2, + isNFKD = UProperty.BINARY_LIMIT + 3, + isLowercase = UProperty.BINARY_LIMIT + 4, + isUppercase = UProperty.BINARY_LIMIT + 5, + isTitlecase = UProperty.BINARY_LIMIT + 6, + isCasefolded = UProperty.BINARY_LIMIT + 7, + isCased = UProperty.BINARY_LIMIT + 8, + BINARY_LIMIT = UProperty.BINARY_LIMIT + 9 + + // NFC = UProperty.STRING_LIMIT, + // NFD = UProperty.STRING_LIMIT+1, + // NFKC = UProperty.STRING_LIMIT+2, + // NFKD = UProperty.STRING_LIMIT+3 + ; + + private ICUPropertyFactory() { + Collection c = getInternalAvailablePropertyAliases(new ArrayList()); + Iterator it = c.iterator(); + while (it.hasNext()) { + add(getInternalProperty((String) it.next())); } - AGES = (String[]) ages.toArray(new String[ages.size()]); - } - return AGES; } - public List _getValueAliases(String valueAlias, List result) { - if (result == null) result = new ArrayList(); - if (propEnum == UProperty.AGE) { - addUnique(valueAlias, result); - return result; - } - if (isCombiningClassProperty()) { - addUnique(cccHack.get(valueAlias), result); // add number - } - int type = getType(); - if (type == UnicodeProperty.NUMERIC || type == EXTENDED_NUMERIC) { - addUnique(valueAlias, result); - if (valueAlias.endsWith(".0")) { - addUnique(valueAlias.substring(0, valueAlias.length() - 2), result); - } - } else { - R2 temp; - if (propEnum == UProperty.GENERAL_CATEGORY && (temp = SPECIAL_GC.get(valueAlias)) != null) { - addUnique(valueAlias, result); - addUnique(temp.get0(), result); - } else { - addAliases(propEnum, valueAlias, result); - } - } - return result; + private static ICUPropertyFactory singleton = null; + + public static synchronized ICUPropertyFactory make() { + if (singleton != null) return singleton; + singleton = new 
ICUPropertyFactory(); + return singleton; } - private void addAliases(int propId, String valueAlias, List result) { - for (int nameChoice = UProperty.NameChoice.SHORT; ; ++nameChoice) { - try { - addUnique(getFixedValueAlias(propId, valueAlias, -1, nameChoice), result); - } catch (Exception e) { - break; + public List getInternalAvailablePropertyAliases(List result) { + int[][] ranges = { + {UProperty.BINARY_START, UProperty.BINARY_LIMIT}, + {UProperty.INT_START, UProperty.INT_LIMIT}, + {UProperty.DOUBLE_START, UProperty.DOUBLE_LIMIT}, + {UProperty.STRING_START, UProperty.STRING_LIMIT}, + }; + for (int i = 0; i < ranges.length; ++i) { + for (int j = ranges[i][0]; j < ranges[i][1]; ++j) { + String alias; + try { + alias = UCharacter.getPropertyName(j, UProperty.NameChoice.LONG); + } catch (Exception e) { + continue; // probably mismatch in ICU version + } + UnicodeProperty.addUnique(alias, result); + if (!result.contains(alias)) result.add(alias); + } } - } + // result.addAll(String_Extras.getNames()); + result.addAll(Binary_Extras.getNames()); + return result; } - - /* (non-Javadoc) - * @see com.ibm.icu.dev.util.UnicodePropertySource#getPropertyType() - */ - private int internalGetPropertyType(int prop) { - switch(prop) { - case UProperty.AGE: - case UProperty.BLOCK: - case UProperty.SCRIPT: - return UnicodeProperty.CATALOG; - case UProperty.ISO_COMMENT: - case UProperty.NAME: - case UProperty.UNICODE_1_NAME: - return UnicodeProperty.MISC; - case UProperty.BIDI_MIRRORING_GLYPH: - case UProperty.CASE_FOLDING: - case UProperty.LOWERCASE_MAPPING: - case UProperty.SIMPLE_CASE_FOLDING: - case UProperty.SIMPLE_LOWERCASE_MAPPING: - case UProperty.SIMPLE_TITLECASE_MAPPING: - case UProperty.SIMPLE_UPPERCASE_MAPPING: - case UProperty.TITLECASE_MAPPING: - case UProperty.UPPERCASE_MAPPING: - return UnicodeProperty.EXTENDED_STRING; - } - if (prop < UProperty.BINARY_START) return UnicodeProperty.UNKNOWN; - if (prop < UProperty.BINARY_LIMIT) return UnicodeProperty.BINARY; - if 
(prop < UProperty.INT_START) return UnicodeProperty.EXTENDED_BINARY; - if (prop < UProperty.INT_LIMIT) return UnicodeProperty.ENUMERATED; - if (prop < UProperty.DOUBLE_START) return UnicodeProperty.EXTENDED_ENUMERATED; - if (prop < UProperty.DOUBLE_LIMIT) return UnicodeProperty.NUMERIC; - if (prop < UProperty.STRING_START) return UnicodeProperty.EXTENDED_NUMERIC; - if (prop < UProperty.STRING_LIMIT) return UnicodeProperty.STRING; - return UnicodeProperty.EXTENDED_STRING; + public UnicodeProperty getInternalProperty(String propertyAlias) { + int propEnum; + main: + { + int possibleItem = Binary_Extras.get(propertyAlias); + if (possibleItem >= 0) { + propEnum = possibleItem; + break main; + } + // possibleItem = String_Extras.get(propertyAlias); + // if (possibleItem >= 0) { + // propEnum = possibleItem; + // break main; + // } + propEnum = UCharacter.getPropertyEnum(propertyAlias); + } + return new ICUProperty(propertyAlias, propEnum); } /* (non-Javadoc) - * @see com.ibm.icu.dev.util.UnicodeProperty#getVersion() + * @see com.ibm.icu.dev.util.UnicodePropertySource#getProperty(java.lang.String) */ - public String _getVersion() { - return VersionInfo.ICU_VERSION.toString(); - } - } - - /*{ - matchIterator = new UnicodeSetIterator( - new UnicodeSet("[^[:Cn:]-[:Default_Ignorable_Code_Point:]]")); - }*/ - - - - /* - * Other Missing Functions: - Expands_On_NFC - Expands_On_NFD - Expands_On_NFKC - Expands_On_NFKD - Composition_Exclusion - Decomposition_Mapping - FC_NFKC_Closure - ISO_Comment - NFC_Quick_Check - NFD_Quick_Check - NFKC_Quick_Check - NFKD_Quick_Check - Special_Case_Condition - Unicode_Radical_Stroke - */ - - static final Names Binary_Extras = new Names(UProperty.BINARY_LIMIT, - new String[] { - "isNFC", "isNFD", "isNFKC", "isNFKD", - "isLowercase", "isUppercase", "isTitlecase", "isCasefolded", "isCased", - }); - -// static final Names String_Extras = new Names(UProperty.STRING_LIMIT, -// new String[] { -// "toNFC", "toNFD", "toNFKC", "toNKFD", -// }); - - 
static final int - isNFC = UProperty.BINARY_LIMIT, - isNFD = UProperty.BINARY_LIMIT+1, - isNFKC = UProperty.BINARY_LIMIT+2, - isNFKD = UProperty.BINARY_LIMIT+3, - isLowercase = UProperty.BINARY_LIMIT+4, - isUppercase = UProperty.BINARY_LIMIT+5, - isTitlecase = UProperty.BINARY_LIMIT+6, - isCasefolded = UProperty.BINARY_LIMIT+7, - isCased = UProperty.BINARY_LIMIT+8, - BINARY_LIMIT = UProperty.BINARY_LIMIT+9 - -// NFC = UProperty.STRING_LIMIT, -// NFD = UProperty.STRING_LIMIT+1, -// NFKC = UProperty.STRING_LIMIT+2, -// NFKD = UProperty.STRING_LIMIT+3 - ; - - private ICUPropertyFactory() { - Collection c = getInternalAvailablePropertyAliases(new ArrayList()); - Iterator it = c.iterator(); - while (it.hasNext()) { - add(getInternalProperty((String)it.next())); - } - } - - private static ICUPropertyFactory singleton = null; - - public static synchronized ICUPropertyFactory make() { - if (singleton != null) return singleton; - singleton = new ICUPropertyFactory(); - return singleton; - } - - public List getInternalAvailablePropertyAliases(List result) { - int[][] ranges = { - {UProperty.BINARY_START, UProperty.BINARY_LIMIT}, - {UProperty.INT_START, UProperty.INT_LIMIT}, - {UProperty.DOUBLE_START, UProperty.DOUBLE_LIMIT}, - {UProperty.STRING_START, UProperty.STRING_LIMIT}, - }; - for (int i = 0; i < ranges.length; ++i) { - for (int j = ranges[i][0]; j < ranges[i][1]; ++j) { - String alias; - try { - alias = UCharacter.getPropertyName(j, UProperty.NameChoice.LONG); - } catch (Exception e) { - continue; // probably mismatch in ICU version + // TODO file bug on getPropertyValueName for Canonical_Combining_Class + + public static class Names { + private String[] names; + private int base; + + public Names(int base, String[] names) { + this.base = base; + this.names = names; + } + + public int get(String name) { + for (int i = 0; i < names.length; ++i) { + if (name.equalsIgnoreCase(names[i])) return base + i; + } + return -1; + } + + public String get(int number) { + number -= 
base; + if (number < 0 || names.length <= number) return null; + return names[number]; + } + + public boolean isInRange(int number) { + number -= base; + return (0 <= number && number < names.length); + } + + public List getNames() { + return Arrays.asList(names); } - UnicodeProperty.addUnique(alias, result); - if (!result.contains(alias)) result.add(alias); - } - } - //result.addAll(String_Extras.getNames()); - result.addAll(Binary_Extras.getNames()); - return result; - } - - public UnicodeProperty getInternalProperty(String propertyAlias) { - int propEnum; - main: - { - int possibleItem = Binary_Extras.get(propertyAlias); - if (possibleItem >= 0) { - propEnum = possibleItem; - break main; - } -// possibleItem = String_Extras.get(propertyAlias); -// if (possibleItem >= 0) { -// propEnum = possibleItem; -// break main; -// } - propEnum = UCharacter.getPropertyEnum(propertyAlias); - } - return new ICUProperty(propertyAlias, propEnum); - } - - /* (non-Javadoc) - * @see com.ibm.icu.dev.util.UnicodePropertySource#getProperty(java.lang.String) - */ - // TODO file bug on getPropertyValueName for Canonical_Combining_Class - - public static class Names { - private String[] names; - private int base; - public Names(int base, String[] names) { - this.base = base; - this.names = names; - } - public int get(String name) { - for (int i = 0; i < names.length; ++i) { - if (name.equalsIgnoreCase(names[i])) return base + i; - } - return -1; - } - public String get(int number) { - number -= base; - if (number < 0 || names.length <= number) return null; - return names[number]; - } - public boolean isInRange(int number) { - number -= base; - return (0 <= number && number < names.length); - } - public List getNames() { - return Arrays.asList(names); } - } } diff --git a/unicodetools/src/main/java/org/unicode/jsp/MySymbolTable.java b/unicodetools/src/main/java/org/unicode/jsp/MySymbolTable.java index e97ad99c5..5ba954e1d 100644 --- 
a/unicodetools/src/main/java/org/unicode/jsp/MySymbolTable.java +++ b/unicodetools/src/main/java/org/unicode/jsp/MySymbolTable.java @@ -1,13 +1,11 @@ package org.unicode.jsp; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.util.Comparator; import java.util.List; - import org.unicode.props.UnicodeProperty; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - public class MySymbolTable extends UnicodeSet.XSymbolTable { private UnicodeRegex unicodeRegex; private UnicodeProperty.Factory factory; @@ -17,7 +15,6 @@ public MySymbolTable(UnicodeProperty.Factory propertyFactory) { unicodeRegex = new UnicodeRegex().setSymbolTable(this); } - // public boolean applyPropertyAlias0(String propertyName, // String propertyValue, UnicodeSet result) { // if (!propertyName.contains("*")) { @@ -31,8 +28,8 @@ public MySymbolTable(UnicodeProperty.Factory propertyFactory) { // return null; // } - public boolean applyPropertyAlias(String propertyName, - String propertyValue, UnicodeSet result) { + public boolean applyPropertyAlias( + String propertyName, String propertyValue, UnicodeSet result) { boolean status = false; boolean invert = false; int posNotEqual = propertyName.indexOf('\u2260'); @@ -41,9 +38,11 @@ public boolean applyPropertyAlias(String propertyName, if (posNotEqual < 0) posNotEqual = propertyName.length(); if (posColon < 0) posColon = propertyName.length(); int opPos = posNotEqual < posColon ? posNotEqual : posColon; - propertyValue = propertyValue.length() == 0 ? propertyName.substring(opPos+1) - : propertyName.substring(opPos+1) + "=" + propertyValue; - propertyName = propertyName.substring(0,opPos); + propertyValue = + propertyValue.length() == 0 + ? 
propertyName.substring(opPos + 1) + : propertyName.substring(opPos + 1) + "=" + propertyValue; + propertyName = propertyName.substring(0, opPos); if (posNotEqual < posColon) { invert = true; } @@ -58,15 +57,21 @@ public boolean applyPropertyAlias(String propertyName, } else { try { status = applyPropertyAlias0("gc", propertyName, result); - } catch (Exception e) {}; + } catch (Exception e) { + } + ; if (!status) { try { status = applyPropertyAlias0("sc", propertyName, result); - } catch (Exception e) {}; + } catch (Exception e) { + } + ; if (!status) { try { status = applyPropertyAlias0(propertyName, "Yes", result); - } catch (Exception e) {}; + } catch (Exception e) { + } + ; if (!status) { status = applyPropertyAlias0(propertyName, "", result); } @@ -79,17 +84,22 @@ public boolean applyPropertyAlias(String propertyName, return status; } - public boolean applyPropertyAlias0(String propertyName, - String propertyValue, UnicodeSet result) { + public boolean applyPropertyAlias0( + String propertyName, String propertyValue, UnicodeSet result) { result.clear(); UnicodeProperty.PatternMatcher patternMatcher = null; - if (propertyValue.length() > 1 && propertyValue.startsWith("/") && propertyValue.endsWith("/")) { - String fixedRegex = unicodeRegex.transform(propertyValue.substring(1, propertyValue.length() - 1)); + if (propertyValue.length() > 1 + && propertyValue.startsWith("/") + && propertyValue.endsWith("/")) { + String fixedRegex = + unicodeRegex.transform(propertyValue.substring(1, propertyValue.length() - 1)); patternMatcher = new UnicodeProperty.RegexMatcher().set(fixedRegex); } UnicodeProperty otherProperty = null; boolean testCp = false; - if (propertyValue.length() > 1 && propertyValue.startsWith("@") && propertyValue.endsWith("@")) { + if (propertyValue.length() > 1 + && propertyValue.startsWith("@") + && propertyValue.endsWith("@")) { String otherPropName = propertyValue.substring(1, propertyValue.length() - 1).trim(); if 
("cp".equalsIgnoreCase(otherPropName)) { testCp = true; @@ -119,12 +129,21 @@ public boolean applyPropertyAlias0(String propertyName, } } else if (patternMatcher == null) { if (!isValid(prop, propertyValue)) { - throw new IllegalArgumentException("The value '" + propertyValue + "' is illegal. Values for " + propertyName - + " must be in " - + prop.getAvailableValues() + " or in " + prop.getValueAliases()); + throw new IllegalArgumentException( + "The value '" + + propertyValue + + "' is illegal. Values for " + + propertyName + + " must be in " + + prop.getAvailableValues() + + " or in " + + prop.getValueAliases()); } if (isAge) { - set = prop.getSet(new ComparisonMatcher(propertyValue, ComparisonMatcher.Relation.geq)); + set = + prop.getSet( + new ComparisonMatcher( + propertyValue, ComparisonMatcher.Relation.geq)); } else { set = prop.getSet(propertyValue); } @@ -149,8 +168,6 @@ public boolean applyPropertyAlias0(String propertyName, throw new IllegalArgumentException("Illegal property: " + propertyName); } - - private boolean isValid(UnicodeProperty prop, String propertyValue) { // if (prop.getName().equals("General_Category")) { // if (propertyValue) @@ -160,8 +177,16 @@ private boolean isValid(UnicodeProperty prop, String propertyValue) { public static class ComparisonMatcher implements UnicodeProperty.PatternMatcher { Relation relation; - enum Relation {less, leq, equal, geq, greater} - static Comparator comparator = new UTF16.StringComparator(true, false,0); + + enum Relation { + less, + leq, + equal, + geq, + greater + } + + static Comparator comparator = new UTF16.StringComparator(true, false, 0); String pattern; @@ -174,11 +199,16 @@ public ComparisonMatcher(String pattern, Relation comparator) { public boolean test(String value) { int comp = comparator.compare(pattern, value.toString()); switch (relation) { - case less: return comp < 0; - case leq: return comp <= 0; - default: return comp == 0; - case geq: return comp >= 0; - case greater: return comp > 
0; + case less: + return comp < 0; + case leq: + return comp <= 0; + default: + return comp == 0; + case geq: + return comp >= 0; + case greater: + return comp > 0; } } diff --git a/unicodetools/src/main/java/org/unicode/jsp/UnicodeRegex.java b/unicodetools/src/main/java/org/unicode/jsp/UnicodeRegex.java index a9a7eebbc..be8f90f78 100644 --- a/unicodetools/src/main/java/org/unicode/jsp/UnicodeRegex.java +++ b/unicodetools/src/main/java/org/unicode/jsp/UnicodeRegex.java @@ -1,4 +1,4 @@ -//##header +// ##header /* ******************************************************************************* * Copyright (C) 2009, Google, International Business Machines Corporation and * @@ -7,6 +7,10 @@ */ package org.unicode.jsp; +import com.ibm.icu.text.StringTransform; +import com.ibm.icu.text.SymbolTable; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.Freezable; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; @@ -21,15 +25,9 @@ import java.util.TreeMap; import java.util.regex.Pattern; -import com.ibm.icu.text.StringTransform; -import com.ibm.icu.text.SymbolTable; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.Freezable; - /** - * Contains utilities to supplement the JDK Regex, since it doesn't handle - * Unicode well. - * + * Contains utilities to supplement the JDK Regex, since it doesn't handle Unicode well. + * * @author markdavis */ public class UnicodeRegex implements Cloneable, Freezable, StringTransform { @@ -49,24 +47,22 @@ public UnicodeRegex setSymbolTable(SymbolTable symbolTable) { } /** - * Adds full Unicode property support, with the latest version of Unicode, - * to Java Regex, bringing it up to Level 1 (see - * http://www.unicode.org/reports/tr18/). It does this by preprocessing the - * regex pattern string and interpreting the character classes (\p{...}, - * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. 
With - * this utility, Java regex expressions can be updated to work with the - * latest version of Unicode, and with all Unicode properties. Note that the - * UnicodeSet syntax has not yet, however, been updated to be completely - * consistent with Java regex, so be careful of the differences. + * Adds full Unicode property support, with the latest version of Unicode, to Java Regex, + * bringing it up to Level 1 (see http://www.unicode.org/reports/tr18/). It does this by + * preprocessing the regex pattern string and interpreting the character classes (\p{...}, + * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With this utility, Java + * regex expressions can be updated to work with the latest version of Unicode, and with all + * Unicode properties. Note that the UnicodeSet syntax has not yet, however, been updated to be + * completely consistent with Java regex, so be careful of the differences. + * *

Not thread-safe; create a separate copy for different threads. + * *

In the future, we may extend this to support other regex packages. - * - * @regex A modified Java regex pattern, as in the input to - * Pattern.compile(), except that all "character classes" are - * processed as if they were UnicodeSet patterns. Example: - * "abc[:bc=N:]. See UnicodeSet for the differences in syntax. - * @return A processed Java regex pattern, suitable for input to - * Pattern.compile(). + * + * @regex A modified Java regex pattern, as in the input to Pattern.compile(), except that all + * "character classes" are processed as if they were UnicodeSet patterns. Example: + * "abc[:bc=N:]. See UnicodeSet for the differences in syntax. + * @return A processed Java regex pattern, suitable for input to Pattern.compile(). */ public String transform(String regex) { StringBuilder result = new StringBuilder(); @@ -82,43 +78,43 @@ public String transform(String regex) { // look for UnicodeSets, allowing for quoting with \ and \Q char ch = regex.charAt(i); switch (state) { - case 0: // we only care about \, and '['. - if (ch == '\\') { - if (UnicodeSet.resemblesPattern(regex, i)) { - // should only happen with \p - i = processSet(regex, i, result, temp, pos); - continue; - } - state = 1; - } else if (ch == '[') { - // if we have what looks like a UnicodeSet - if (UnicodeSet.resemblesPattern(regex, i)) { - i = processSet(regex, i, result, temp, pos); - continue; + case 0: // we only care about \, and '['. 
+ if (ch == '\\') { + if (UnicodeSet.resemblesPattern(regex, i)) { + // should only happen with \p + i = processSet(regex, i, result, temp, pos); + continue; + } + state = 1; + } else if (ch == '[') { + // if we have what looks like a UnicodeSet + if (UnicodeSet.resemblesPattern(regex, i)) { + i = processSet(regex, i, result, temp, pos); + continue; + } } - } - break; + break; - case 1: // we are after a \ - if (ch == 'Q') { - state = 1; - } else { - state = 0; - } - break; + case 1: // we are after a \ + if (ch == 'Q') { + state = 1; + } else { + state = 0; + } + break; - case 2: // we are in a \Q... - if (ch == '\\') { - state = 3; - } - break; + case 2: // we are in a \Q... + if (ch == '\\') { + state = 3; + } + break; - case 3: // we are in at \Q...\ - if (ch == 'E') { - state = 0; - } - state = 2; - break; + case 3: // we are in at \Q...\ + if (ch == 'E') { + state = 0; + } + state = 2; + break; } result.append(ch); } @@ -127,6 +123,7 @@ public String transform(String regex) { /** * Convenience static function, using standard parameters. + * * @param regex as in process() * @return processed regex pattern, as in process() */ @@ -136,9 +133,8 @@ public static String fix(String regex) { /** * Compile a regex string, after processing by fix(...). - * - * @param regex - * Raw regex pattern, as in fix(...). + * + * @param regex Raw regex pattern, as in fix(...). * @return Pattern */ public static Pattern compile(String regex) { @@ -147,7 +143,7 @@ public static Pattern compile(String regex) { /** * Compile a composed string from a set of BNF lines; see the List version for more information. - * + * * @param bnfLines Series of BNF lines. * @return Pattern */ @@ -156,11 +152,12 @@ public String compileBnf(String bnfLines) { } /** - * Compile a composed string from a set of BNF lines, such as for composing a regex - * expression. The lines can be in any order, but there must not be any - * cycles. The result can be used as input for fix(). - *

- * Example: + * Compile a composed string from a set of BNF lines, such as for composing a regex expression. + * The lines can be in any order, but there must not be any cycles. The result can be used as + * input for fix(). + * + *

Example: + * *

      * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
      * scheme = reserved+;
@@ -169,17 +166,14 @@ public String compileBnf(String bnfLines) {
      * fragment = reserved+;
      * reserved = [[:ascii:][:alphabetic:]];
      * 
- *

- * Caveats: at this point the parsing is simple; for example, # cannot be - * quoted (use \\u0023); you can set it to null to disable. - * The equality sign and a few others can be reset with - * setBnfX(). - * - * @param bnfLines - * Series of lines that represent a BNF expression. The lines contain - * a series of statements that of the form x=y;. A statement can take - * multiple lines, but there can't be multiple statements on a line. - * A hash quotes to the end of the line. + * + *

Caveats: at this point the parsing is simple; for example, # cannot be quoted (use + * \\u0023); you can set it to null to disable. The equality sign and a few others can be reset + * with setBnfX(). + * + * @param bnfLines Series of lines that represent a BNF expression. The lines contain a series + * of statements that of the form x=y;. A statement can take multiple lines, but there can't + * be multiple statements on a line. A hash quotes to the end of the line. * @return Pattern */ public String compileBnf(List lines) { @@ -203,7 +197,8 @@ public String compileBnf(List lines) { try { log.append(variable2 + "=" + altered2 + ";"); } catch (IOException e) { - throw (IllegalArgumentException) new IllegalArgumentException().initCause(e); + throw (IllegalArgumentException) + new IllegalArgumentException().initCause(e); } } } @@ -218,9 +213,8 @@ public String compileBnf(List lines) { /** * Compile a regex string, after processing by fix(...). - * - * @param regex - * Raw regex pattern, as in fix(...). + * + * @param regex Raw regex pattern, as in fix(...). * @return Pattern */ public static Pattern compile(String regex, int options) { @@ -253,13 +247,15 @@ public void setBnfLineSeparator(String bnfLineSeparator) { /** * Utility for loading lines from a UTF8 file. 
+ * * @param file * @param result * @return * @throws IOException */ public static List loadFile(String file, List result) throws IOException { - BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")); + BufferedReader in = + new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")); while (true) { String line = in.readLine(); if (line == null) { @@ -270,7 +266,6 @@ public static List loadFile(String file, List result) throws IOE return result; } - /* (non-Javadoc) * @see com.ibm.icu.util.Freezable#cloneAsThawed() */ @@ -301,7 +296,8 @@ public boolean isFrozen() { // ===== PRIVATES ===== - private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) { + private int processSet( + String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) { try { pos.setIndex(i); UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0); @@ -310,7 +306,8 @@ private int processSet(String regex, int i, StringBuilder result, UnicodeSet tem i = pos.getIndex() - 1; // allow for the loop increment return i; } catch (Exception e) { - throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e); + throw (IllegalArgumentException) + new IllegalArgumentException("Error in " + regex).initCause(e); } } @@ -320,20 +317,20 @@ private int processSet(String regex, int i, StringBuilder result, UnicodeSet tem private String bnfLineSeparator = "\n"; private Appendable log = null; - private Comparator LongestFirst = new Comparator () { - public int compare(String arg0, String arg1) { - int len0 = arg0.length(); - int len1 = arg1.length(); - if (len0 != len1) { - return len1 - len0; - } - return arg0.compareTo(arg1); - } - }; - + private Comparator LongestFirst = + new Comparator() { + public int compare(String arg0, String arg1) { + int len0 = arg0.length(); + int len1 = arg1.length(); + if (len0 != len1) { + return len1 - len0; + } + 
return arg0.compareTo(arg1); + } + }; - private Map getVariables(List lines) { - Map variables = new TreeMap(LongestFirst); + private Map getVariables(List lines) { + Map variables = new TreeMap(LongestFirst); String variable = null; StringBuffer definition = new StringBuffer(); int count = 0; @@ -365,18 +362,18 @@ private Map getVariables(List lines) { } boolean terminated = trimline.endsWith(";"); if (terminated) { - linePart = linePart.substring(0,linePart.lastIndexOf(';')); + linePart = linePart.substring(0, linePart.lastIndexOf(';')); } int equalsPos = linePart.indexOf(bnfVariableInfix); if (equalsPos >= 0) { if (variable != null) { throw new IllegalArgumentException("Missing ';' before " + count + ") " + line); } - variable = linePart.substring(0,equalsPos).trim(); + variable = linePart.substring(0, equalsPos).trim(); if (variables.containsKey(variable)) { throw new IllegalArgumentException("Duplicate variable definition in " + line); } - definition.append(linePart.substring(equalsPos+1).trim()); + definition.append(linePart.substring(equalsPos + 1).trim()); } else { // no equals, so if (variable == null) { throw new IllegalArgumentException("Missing '=' at " + count + ") " + line); @@ -395,5 +392,4 @@ private Map getVariables(List lines) { } return variables; } - } diff --git a/unicodetools/src/main/java/org/unicode/jsp/XIDModifications.java b/unicodetools/src/main/java/org/unicode/jsp/XIDModifications.java index b2b2f3c6e..3b6320061 100644 --- a/unicodetools/src/main/java/org/unicode/jsp/XIDModifications.java +++ b/unicodetools/src/main/java/org/unicode/jsp/XIDModifications.java @@ -11,37 +11,43 @@ static class MyReader extends FileUtilities.SemiFileReader { @Override protected boolean handleLine(int start, int end, String[] items) { -// String type = items[1]; -// if (type.equalsIgnoreCase("allowed")) { -// reasons.putAll(start, end, items[2]); -// } else if (type.equalsIgnoreCase("restricted")) { -// // allowed.remove(start, end); -// } else { -// throw 
new IllegalArgumentException(type); -// } + // String type = items[1]; + // if (type.equalsIgnoreCase("allowed")) { + // reasons.putAll(start, end, items[2]); + // } else if (type.equalsIgnoreCase("restricted")) { + // // allowed.remove(start, end); + // } else { + // throw new IllegalArgumentException(type); + // } allowed.putAll(start, end, items[1]); reasons.putAll(start, end, items[2]); return true; } } + static { - //# @missing: 0000..10FFFF; Restricted ; Not-Characters - allowed.putAll(0,0x10FFFF,"Restricted"); - reasons.putAll(0,0x10FFFF,"Not-Characters"); - //reasons.putAll(new UnicodeSet("[[:gc=cn:][:gc=co:][:gc=cs:][:gc=cc:]-[:whitespace:]]"),"not-char"); + // # @missing: 0000..10FFFF; Restricted ; Not-Characters + allowed.putAll(0, 0x10FFFF, "Restricted"); + reasons.putAll(0, 0x10FFFF, "Not-Characters"); + // reasons.putAll(new + // UnicodeSet("[[:gc=cn:][:gc=co:][:gc=cs:][:gc=cc:]-[:whitespace:]]"),"not-char"); new MyReader().process(XIDModifications.class, "xidmodifications.txt"); allowed.freeze(); reasons.freeze(); } + public static UnicodeMap getTypes() { return reasons; } + public static UnicodeMap getReasons() { return reasons; } + public static UnicodeMap getStatus() { return allowed; } + public static UnicodeSet getAllowed() { return allowed.getSet("Restricted"); } @@ -49,6 +55,7 @@ public static UnicodeSet getAllowed() { public static boolean isAllowed(int codePoint) { return allowed.get(codePoint).equals("Restricted"); } + public static String getType(int codePoint) { return reasons.get(codePoint); } diff --git a/unicodetools/src/main/java/org/unicode/parse/EBNF.java b/unicodetools/src/main/java/org/unicode/parse/EBNF.java index 186e85936..9c36b93d2 100644 --- a/unicodetools/src/main/java/org/unicode/parse/EBNF.java +++ b/unicodetools/src/main/java/org/unicode/parse/EBNF.java @@ -1,5 +1,6 @@ package org.unicode.parse; +import com.ibm.icu.text.UnicodeSet; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -7,11 
+8,8 @@ import java.util.List; import java.util.Map; import java.util.Set; - import org.unicode.parse.Tokenizer.Result; -import com.ibm.icu.text.UnicodeSet; - public class EBNF { boolean DEBUG = true; @@ -25,7 +23,7 @@ public String getInternal() { } /* - + "rule = string '=' alternation;"; + + "rule = string '=' alternation;"; + "alternation = sequence (weight? ('|' sequence weight?)+)?;" + "sequence = (core quantifier*)+;" + "core = string | unicodeSet | '(' alternation ')';" @@ -42,21 +40,20 @@ public String getInternal() { ? Match 1 or 0 times {n} Match exactly n times {n,} Match at least n times - {n,m} Match at least n but not more than m times + {n,m} Match at least n but not more than m times */ public EBNF addRules(String rules) { - t.setSource(rules); - while (addRule()) { - } + t.setSource(rules); + while (addRule()) {} return this; // for chaining } public EBNF build() { // check that the rules match the variables, except for $root in rules Set ruleSet = map.keySet(); - // add also + // add also variables.add("$root"); variables.addAll(t.getLookedUpItems()); if (!ruleSet.equals(variables)) { @@ -66,10 +63,10 @@ public EBNF build() { if (temp.length() != 0) temp = "Warning: Defined but not used: " + temp; if (msg.length() == 0) msg = temp; else if (temp.length() != 0) { - msg = msg + "; " + temp; + msg = msg + "; " + temp; } - error(msg); - } + error(msg); + } if (!ruleSet.equals(variables)) { String msg = showDiff(variables, ruleSet); @@ -78,19 +75,19 @@ else if (temp.length() != 0) { if (temp.length() != 0) temp = "Defined but not used: " + temp; if (msg.length() == 0) msg = temp; else if (temp.length() != 0) { - msg = msg + "; " + temp; + msg = msg + "; " + temp; } - error(msg); - } + error(msg); + } // replace variables by definitions Iterator it = ruleSet.iterator(); while (it.hasNext()) { String key = (String) it.next(); Pick expression = (Pick) map.get(key); - Iterator it2 = ruleSet.iterator(); + Iterator it2 = ruleSet.iterator(); if (DEBUG && 
key.equals("$crlf")) { - System.out.println("debug") ; + System.out.println("debug"); } while (it2.hasNext()) { Object key2 = it2.next(); @@ -169,8 +166,7 @@ String showDiff(Set a, Set b) { } void error(String msg) { - throw new IllegalArgumentException(msg - + "\r\n" + t.toString()); + throw new IllegalArgumentException(msg + "\r\n" + t.toString()); } private boolean addRule() { @@ -193,12 +189,11 @@ private boolean addRule() { } t.addSymbol(s, t.getSource(), startBody, t.index); if (t.nextCodePoint() != ';') { - error("missing ;"); + error("missing ;"); } return addPick(s, rule.setName(s)); } - protected boolean addPick(String s, Pick rule) { Pick temp = map.get(s); if (temp != null) { @@ -214,7 +209,7 @@ protected boolean addPick(String s, Pick rule) { public EBNF addSet(String variable, UnicodeSet set) { if (set != null) { String body = set.toString(); - t.addSymbol(variable, body, 0, body.length()); + t.addSymbol(variable, body, 0, body.length()); addPick(variable, Pick.codePoint(set)); } return this; @@ -223,30 +218,30 @@ public EBNF addSet(String variable, UnicodeSet set) { Pick qualify(Pick item) { Result result = t.next(); if (result == Result.CODEPOINT) { - switch(t.getCodePoint()) { - case '?': - return Pick.repeat(0, 1, item); - case '*': - return Pick.repeat(0, Integer.MAX_VALUE, item); - case '+': - return Pick.repeat(1, Integer.MAX_VALUE, item); - case '{': - if (t.next() != Result.NUMBER) error("missing number"); - int start = (int) t.getNumber(); - int end = start; - result = t.next(); - if (t.getCodePoint() == ',') { - end = Integer.MAX_VALUE; + switch (t.getCodePoint()) { + case '?': + return Pick.repeat(0, 1, item); + case '*': + return Pick.repeat(0, Integer.MAX_VALUE, item); + case '+': + return Pick.repeat(1, Integer.MAX_VALUE, item); + case '{': + if (t.next() != Result.NUMBER) error("missing number"); + int start = (int) t.getNumber(); + int end = start; result = t.next(); - if (result == Result.NUMBER) { - end = (int) t.getNumber(); + if 
(t.getCodePoint() == ',') { + end = Integer.MAX_VALUE; result = t.next(); + if (result == Result.NUMBER) { + end = (int) t.getNumber(); + result = t.next(); + } } - } - if (t.getCodePoint() != '}') { - error("missing }"); - } - return Pick.repeat(start, end, item); + if (t.getCodePoint() != '}') { + error("missing }"); + } + return Pick.repeat(start, end, item); } } t.backup(); @@ -265,7 +260,7 @@ Pick getCore() { return Pick.string(s); } if (token == Result.UNICODESET) { - return Pick.codePoint(t.getUnicodeSet()); + return Pick.codePoint(t.getUnicodeSet()); } if (t.getCodePoint() != '(') { t.backup(); @@ -274,9 +269,9 @@ Pick getCore() { Pick temp = getAlternation(); token = t.next(); if (t.getCodePoint() != ')') { - error("missing )"); + error("missing )"); } - return temp; + return temp; } Pick getSequence() { @@ -299,7 +294,7 @@ Pick getSequence() { if (last == null) { last = item; } else { - if (result == null) result = Pick.makeSequence().and2(last); + if (result == null) result = Pick.makeSequence().and2(last); result = result.and2(item); } } @@ -320,7 +315,7 @@ Pick getAlternation() { if (result == null) { result = Pick.makeAlternation().or2(last); } - result = result.or2(temp); + result = result.or2(temp); } t.next(); if (t.getCodePoint() != '|') { @@ -328,6 +323,6 @@ Pick getAlternation() { if (result != null) return result; if (last != null) return last; } - } + } } } diff --git a/unicodetools/src/main/java/org/unicode/parse/Pick.java b/unicodetools/src/main/java/org/unicode/parse/Pick.java index ba6df853d..1eadd25d6 100644 --- a/unicodetools/src/main/java/org/unicode/parse/Pick.java +++ b/unicodetools/src/main/java/org/unicode/parse/Pick.java @@ -6,15 +6,13 @@ */ package org.unicode.parse; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.util.HashSet; import java.util.Set; - import org.unicode.parse.EBNF.Position; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - -abstract public class Pick { +public 
abstract class Pick { private static boolean DEBUG = false; // for Building @@ -29,22 +27,23 @@ public Pick setName(String nameStr) { return this; } - static public Pick.Sequence makeSequence() { + public static Pick.Sequence makeSequence() { return new Sequence(); } - static public Pick.Alternation makeAlternation() { + public static Pick.Alternation makeAlternation() { return new Alternation(); } - static public Pick repeat(int minCount, int maxCount, Pick item) { + public static Pick repeat(int minCount, int maxCount, Pick item) { return new Repeat(minCount, maxCount, item); } - static public Pick codePoint(UnicodeSet source) { + public static Pick codePoint(UnicodeSet source) { return new CodePoint(source); } - static public Pick string(String source) { + + public static Pick string(String source) { return new Literal(source); } @@ -63,21 +62,23 @@ public String toString() { public abstract boolean match(String input, int inputPos, Position p); public static class Sequence extends ListPick { - public Sequence and2 (Pick item) { + public Sequence and2(Pick item) { addInternal(new Pick[] {item}); // we don't care about perf return this; // for chaining } - public Sequence and2 (Pick[] itemArray) { + + public Sequence and2(Pick[] itemArray) { addInternal(itemArray); return this; // for chaining } + public String getInternal(int depth, Set alreadySeen) { String result = checkName(name, alreadySeen); if (result.startsWith("$")) return result; result = indent(depth) + result + "SEQ("; for (int i = 0; i < items.length; ++i) { if (i != 0) result += ", "; - result += items[i].getInternal(depth+1, alreadySeen); + result += items[i].getInternal(depth + 1, alreadySeen); } result += ")"; return result; @@ -109,11 +110,12 @@ String checkName(String nameStr, Set alreadySeen) { public static class Alternation extends ListPick { - public Alternation or2 (Pick item) { + public Alternation or2(Pick item) { addInternal(new Pick[] {item}); // we don't care about perf return this; 
// for chaining } - public Alternation or2 (Pick[] itemArray) { + + public Alternation or2(Pick[] itemArray) { addInternal(itemArray); return this; // for chaining } @@ -124,13 +126,13 @@ public String getInternal(int depth, Set alreadySeen) { result = indent(depth) + result + "OR("; for (int i = 0; i < items.length; ++i) { if (i != 0) result += ", "; - result += items[i].getInternal(depth+1, alreadySeen); + result += items[i].getInternal(depth + 1, alreadySeen); } return result + ")"; } // keep private private Alternation() { - sep="|"; + sep = "|"; } // take first matching option @@ -168,9 +170,12 @@ private Repeat(int minCount, int maxCount, Pick item) { public String getInternal(int depth, Set alreadySeen) { String result = checkName(name, alreadySeen); if (result.startsWith("$")) return result; - result = indent(depth) + result + "REPEAT(" - + item.getInternal(depth+1, alreadySeen) - + ")"; + result = + indent(depth) + + result + + "REPEAT(" + + item.getInternal(depth + 1, alreadySeen) + + ")"; return result; } @@ -181,9 +186,9 @@ public boolean match(String input, int inputPos, Position p) { for (int i = 0; i < maxCount; ++i) { if (!item.match(input, inputPos, p)) { break; - } + } inputPos = p.getIndex(); - count++; + count++; } if (count >= minCount) { return true; @@ -191,17 +196,21 @@ public boolean match(String input, int inputPos, Position p) { p.restoreState(state); return false; } + @Override public String toString(int depth) { - return name != null ? name : - item.toString(depth) + - (maxCount == Integer.MAX_VALUE ? - (minCount == 0 ? "*" - : minCount == 1 ? "+" - : "{" + minCount + ",}") - : maxCount == 1 ? "?" - : maxCount == minCount ? "{" + minCount + "}" - : "{" + minCount + "," + maxCount + "}"); + return name != null + ? name + : item.toString(depth) + + (maxCount == Integer.MAX_VALUE + ? (minCount == 0 + ? "*" + : minCount == 1 ? "+" : "{" + minCount + ",}") + : maxCount == 1 + ? "?" + : maxCount == minCount + ? 
"{" + minCount + "}" + : "{" + minCount + "," + maxCount + "}"); } } @@ -214,6 +223,7 @@ private CodePoint(UnicodeSet source) { } this.source = source; } + public boolean match(String s, int inputPos, Position p) { if (inputPos >= s.length()) { return false; @@ -222,35 +232,36 @@ public boolean match(String s, int inputPos, Position p) { if (match < inputPos) { return false; } else { - p.addIndex(match-inputPos, s.substring(inputPos, match)); + p.addIndex(match - inputPos, s.substring(inputPos, match)); return true; } } + public String getInternal(int depth, Set alreadySeen) { String result = checkName(name, alreadySeen); if (result.startsWith("$")) return result; return source.toString(); } + public String toString() { return source.toString(); } + public String toString(int depth) { return toString(); } } - - /* Add character if we can */ static int getChar(String newValue, int newIndex, StringBuffer mergeBuffer, boolean copy) { if (newIndex >= newValue.length()) return newIndex; - int cp = UTF16.charAt(newValue,newIndex); + int cp = UTF16.charAt(newValue, newIndex); if (copy) UTF16.append(mergeBuffer, cp); return newIndex + UTF16.getCharCount(cp); } - /* + /* // quoted add appendQuoted(target, addBuffer.toString(), quoteBuffer); // fix buffers @@ -261,14 +272,15 @@ static int getChar(String newValue, int newIndex, StringBuffer mergeBuffer, bool } */ - private static class Literal extends FinalPick { public String toString() { return "'" + name + "'"; - } - private Literal(String source) { + } + + private Literal(String source) { this.name = source; } + public boolean match(String input, int inputPos, Position p) { int len = name.length(); if (input.regionMatches(inputPos, name, 0, len)) { @@ -277,9 +289,11 @@ public boolean match(String input, int inputPos, Position p) { } return false; } + public String getInternal(int depth, Set alreadySeen) { return toString(); } + public String toString(int depth) { return toString(); } @@ -291,11 +305,13 @@ abstract static 
class Visitor { Set already = new HashSet(); // Note: each visitor should return the Pick that will replace a (or a itself) abstract Pick handle(Pick a); + boolean alreadyEntered(Pick item) { boolean result = already.contains(item); already.add(item); return result; } + void reset() { already.clear(); } @@ -306,19 +322,21 @@ void reset() { static class Replacer extends Visitor { String toReplace; Pick replacement; + Replacer(String toReplace, Pick replacement) { this.toReplace = toReplace; this.replacement = replacement; } + public Pick handle(Pick a) { if (toReplace.equals(a.name)) { a = replacement; - } + } return a; } } - abstract private static class FinalPick extends Pick { + private abstract static class FinalPick extends Pick { public Pick visit(Visitor visitor) { return visitor.handle(this); } @@ -327,7 +345,7 @@ public Pick visit(Visitor visitor) { private abstract static class ItemPick extends Pick { protected Pick item; - ItemPick (Pick item) { + ItemPick(Pick item) { this.item = item; } @@ -354,11 +372,11 @@ int size() { } Pick getLast() { - return items[items.length-1]; + return items[items.length - 1]; } void setLast(Pick newOne) { - items[items.length-1] = newOne; + items[items.length - 1] = newOne; } protected void addInternal(Pick[] objs) { @@ -382,8 +400,9 @@ public Pick visit(Visitor visitor) { public String toString(int depth) { if (name != null) { return name; - } if (items.length == 1) { - return items[0].toString(depth-1); + } + if (items.length == 1) { + return items[0].toString(depth - 1); } else if (depth < 0) { return "?"; } @@ -393,33 +412,32 @@ public String toString(int depth) { if (b.length() != 1) { b.append(sep); } - b.append(item.toString(depth-1)); + b.append(item.toString(depth - 1)); } return b.append(")").toString(); } } - // these utilities really ought to be in Java public static double[] realloc(double[] source, int newSize) { double[] temp = new double[newSize]; if (newSize > source.length) newSize = source.length; - if 
(newSize != 0) System.arraycopy(source,0,temp,0,newSize); + if (newSize != 0) System.arraycopy(source, 0, temp, 0, newSize); return temp; } public static int[] realloc(int[] source, int newSize) { int[] temp = new int[newSize]; if (newSize > source.length) newSize = source.length; - if (newSize != 0) System.arraycopy(source,0,temp,0,newSize); + if (newSize != 0) System.arraycopy(source, 0, temp, 0, newSize); return temp; } public static Pick[] realloc(Pick[] source, int newSize) { Pick[] temp = new Pick[newSize]; if (newSize > source.length) newSize = source.length; - if (newSize != 0) System.arraycopy(source,0,temp,0,newSize); + if (newSize != 0) System.arraycopy(source, 0, temp, 0, newSize); return temp; } @@ -456,15 +474,15 @@ static public class SimpleSpread implements Spread { static final Spread FLAT = new SimpleSpread(1.0); boolean flat = false; double aa, bb, cc; - public SimpleSpread(double maxWeight) { + public SimpleSpread(double maxWeight) { if (maxWeight > 0.999 && maxWeight < 1.001) { flat = true; - } else { + } else { double q = (maxWeight - 1.0); aa = -1/q; bb = 1/(q*q); cc = (2.0+q)/q; - } + } } public double spread(double value) { if (flat) return value; @@ -480,5 +498,4 @@ static public int pick(Spread spread, Random random, int start, int end) { */ - -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/parse/Tokenizer.java b/unicodetools/src/main/java/org/unicode/parse/Tokenizer.java index 36d6e4835..2e009e7ca 100644 --- a/unicodetools/src/main/java/org/unicode/parse/Tokenizer.java +++ b/unicodetools/src/main/java/org/unicode/parse/Tokenizer.java @@ -6,21 +6,19 @@ */ package org.unicode.parse; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.SymbolTable; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeMatcher; +import com.ibm.icu.text.UnicodeSet; import java.text.ParsePosition; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; - import 
org.unicode.cldr.util.InternalCldrException; import org.unicode.text.utility.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.SymbolTable; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeMatcher; -import com.ibm.icu.text.UnicodeSet; - public class Tokenizer { static final boolean DEBUG = true; @@ -37,31 +35,29 @@ public class Tokenizer { Result lastValue = Result.BACKEDUP_TOO_FAR; TokenSymbolTable symbolTable = new TokenSymbolTable(); - private static final char - QUOTE = '\'', - BSLASH = '\\'; - - private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH) - .freeze(); - private static final UnicodeSet WHITESPACE = new UnicodeSet("[" + - "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" + - "]") - .freeze(); - private static final UnicodeSet SYNTAX = new UnicodeSet("[" + - "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E" + - "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE" + - "\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7" + - "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF" + - "\\u3001\\u3003\\u3008-\\u3020\\u3030" + - "\\uFD3E\\uFD3F\\uFE45\\uFE46" + - "]").removeAll(QUOTERS).remove('$') - .freeze(); - private static final UnicodeSet NEWLINE = new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]") - .freeze(); - //private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]"); - private static final UnicodeSet NON_STRING = new UnicodeSet() - .addAll(WHITESPACE) - .addAll(SYNTAX); + private static final char QUOTE = '\'', BSLASH = '\\'; + + private static final UnicodeSet QUOTERS = new UnicodeSet().add(QUOTE).add(BSLASH).freeze(); + private static final UnicodeSet WHITESPACE = + new UnicodeSet("[" + "\\u0009-\\u000D\\u0020\\u0085\\u200E\\u200F\\u2028\\u2029" + "]") + .freeze(); + private static final UnicodeSet SYNTAX = + new UnicodeSet( + "[" + + "\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E" + + "\\u00A1-\\u00A7\\u00A9\\u00AB-\\u00AC\\u00AE" + + 
"\\u00B0-\\u00B1\\u00B6\\u00B7\\u00BB\\u00BF\\u00D7\\u00F7" + + "\\u2010-\\u2027\\u2030-\\u205E\\u2190-\\u2BFF" + + "\\u3001\\u3003\\u3008-\\u3020\\u3030" + + "\\uFD3E\\uFD3F\\uFE45\\uFE46" + + "]") + .removeAll(QUOTERS) + .remove('$') + .freeze(); + private static final UnicodeSet NEWLINE = + new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]").freeze(); + // private static final UnicodeSet DECIMAL = new UnicodeSet("[:Nd:]"); + private static final UnicodeSet NON_STRING = new UnicodeSet().addAll(WHITESPACE).addAll(SYNTAX); protected UnicodeSet whiteSpace = WHITESPACE; protected UnicodeSet syntax = SYNTAX; @@ -69,16 +65,13 @@ public class Tokenizer { private void fixSets() { if (syntax.containsSome(QUOTERS) || syntax.containsSome(whiteSpace)) { - syntax = ((UnicodeSet)syntax.clone()).removeAll(QUOTERS).removeAll(whiteSpace) - .freeze(); + syntax = + ((UnicodeSet) syntax.clone()).removeAll(QUOTERS).removeAll(whiteSpace).freeze(); } if (whiteSpace.containsSome(QUOTERS)) { - whiteSpace = ((UnicodeSet)whiteSpace.clone()).removeAll(QUOTERS) - .freeze(); + whiteSpace = ((UnicodeSet) whiteSpace.clone()).removeAll(QUOTERS).freeze(); } - non_string = new UnicodeSet(syntax) - .addAll(whiteSpace) - .freeze(); + non_string = new UnicodeSet(syntax).addAll(whiteSpace).freeze(); } public Tokenizer setSource(String source) { @@ -93,20 +86,20 @@ public Tokenizer setIndex(int index) { } public enum Result { - CODEPOINT, + CODEPOINT, IDENTIFIER, - NUMBER, - STRING, - UNICODESET, - DONE, - UNTERMINATED_QUOTE, + NUMBER, + STRING, + UNICODESET, + DONE, + UNTERMINATED_QUOTE, BACKEDUP_TOO_FAR, ILLEGAL_CHARACTER, - INCOMPLETE_BACKSLASH, + INCOMPLETE_BACKSLASH, } public String toStringFull() { - StringBuilder result = new StringBuilder(source.substring(0,index)).append("$$$"); + StringBuilder result = new StringBuilder(source.substring(0, index)).append("$$$"); return toString(result).append(source.substring(index)).toString(); } @@ -116,25 +109,25 @@ public String toString() { private 
StringBuilder toString(StringBuilder result) { result.append(lastValue); - switch(lastValue) { - case STRING: - case IDENTIFIER: - result.append(":«").append(getString()).append('»'); - break; - case NUMBER: - result.append(':').append(getNumber()); - break; - case UNICODESET: - result.append(':').append(getUnicodeSet().toPattern(false)); - break; - case CODEPOINT: - result.append(':').appendCodePoint(getCodePoint()); - break; - case ILLEGAL_CHARACTER: - result.append("\\x{").append(Utility.hex(getCodePoint())).append("}"); - break; - default: - break; + switch (lastValue) { + case STRING: + case IDENTIFIER: + result.append(":«").append(getString()).append('»'); + break; + case NUMBER: + result.append(':').append(getNumber()); + break; + case UNICODESET: + result.append(':').append(getUnicodeSet().toPattern(false)); + break; + case CODEPOINT: + result.append(':').appendCodePoint(getCodePoint()); + break; + case ILLEGAL_CHARACTER: + result.append("\\x{").append(Utility.hex(getCodePoint())).append("}"); + break; + default: + break; } return result.append(backedup ? 
"@" : ""); } @@ -154,7 +147,7 @@ public int next2() { int result = next(); System.out.println(toString(result, backedupBefore)); return result; - } + } */ public Result next() { @@ -192,8 +185,8 @@ public Result next() { lastIndex = index; if (cp == '[' || cp == '\\') { - ParsePosition pos = new ParsePosition(index-1); - unicodeSet = new UnicodeSet(source,pos,symbolTable); + ParsePosition pos = new ParsePosition(index - 1); + unicodeSet = new UnicodeSet(source, pos, symbolTable); if (unicodeSet == null) { throw new NullPointerException(); } @@ -226,10 +219,10 @@ public Result next() { number *= 10; number += UCharacter.getNumericValue(cp); } - lastValue = Result.NUMBER; + lastValue = Result.NUMBER; if (DEBUG) { System.out.println("Tokenizer: " + this); - } + } return lastValue; } buffer.setLength(0); @@ -238,37 +231,44 @@ public Result next() { while (index < source.length()) { cp = nextChar(); switch (result) { - case UNTERMINATED_QUOTE: - if (cp == QUOTE) { - lastValue = Result.STRING; - if (DEBUG) { - System.out.println("Tokenizer: " + this); - } - return lastValue; - } else if (cp == BSLASH) { - result = Result.INCOMPLETE_BACKSLASH; - } else { - UTF16.append(buffer,cp); - } - break; - case INCOMPLETE_BACKSLASH: - switch(cp) { - case 'n': cp = '\n'; break; - case 'r': cp = '\r'; break; - case 't': cp = '\t'; break; - default: break; - } - UTF16.append(buffer,cp); - result = Result.UNTERMINATED_QUOTE; - break; - default: - throw new InternalCldrException("Internal error"); + case UNTERMINATED_QUOTE: + if (cp == QUOTE) { + lastValue = Result.STRING; + if (DEBUG) { + System.out.println("Tokenizer: " + this); + } + return lastValue; + } else if (cp == BSLASH) { + result = Result.INCOMPLETE_BACKSLASH; + } else { + UTF16.append(buffer, cp); + } + break; + case INCOMPLETE_BACKSLASH: + switch (cp) { + case 'n': + cp = '\n'; + break; + case 'r': + cp = '\r'; + break; + case 't': + cp = '\t'; + break; + default: + break; + } + UTF16.append(buffer, cp); + result = 
Result.UNTERMINATED_QUOTE; + break; + default: + throw new InternalCldrException("Internal error"); } } lastValue = result; if (DEBUG) { System.out.println("Tokenizer: " + this); - } + } return lastValue; } @@ -286,19 +286,21 @@ public Result next() { lastValue = Result.IDENTIFIER; if (DEBUG) { System.out.println("Tokenizer: " + this); - } + } return lastValue; } codePoint = cp; lastValue = Result.ILLEGAL_CHARACTER; if (DEBUG) { System.out.println("Tokenizer: " + this); - } + } return lastValue; } public String getString() { - return lastValue != Result.STRING && lastValue != Result.IDENTIFIER ? null : buffer.toString(); + return lastValue != Result.STRING && lastValue != Result.IDENTIFIER + ? null + : buffer.toString(); } public long getNumber() { @@ -310,7 +312,9 @@ public UnicodeSet getUnicodeSet() { } public int getCodePoint() { - return lastValue != Result.CODEPOINT && lastValue != Result.ILLEGAL_CHARACTER ? -1 : codePoint; + return lastValue != Result.CODEPOINT && lastValue != Result.ILLEGAL_CHARACTER + ? 
-1 + : codePoint; } public int nextCodePoint() { @@ -319,26 +323,32 @@ public int nextCodePoint() { } private int nextChar() { - int cp = UTF16.charAt(source,index); + int cp = UTF16.charAt(source, index); index += UTF16.getCharCount(cp); return cp; } + public int getIndex() { return index; } + public String getSource() { return source; } + public UnicodeSet getSyntax() { return syntax; } + public UnicodeSet getWhiteSpace() { return whiteSpace; } + public void setSyntax(UnicodeSet set) { syntax = set; fixSets(); } + public void setWhiteSpace(UnicodeSet set) { whiteSpace = set; fixSets(); @@ -370,7 +380,7 @@ public void add(String var, char[] body) { */ public char[] lookup(String s) { itemsLookedUp.add('$' + s); - return (char[])contents.get(s); + return (char[]) contents.get(s); } /* (non-Javadoc) @@ -395,9 +405,7 @@ public String parseReference(String text, ParsePosition pos, int limit) { } } pos.setIndex(i); - return text.substring(start,i); + return text.substring(start, i); } - } - } diff --git a/unicodetools/src/main/java/org/unicode/picker/CharData.java b/unicodetools/src/main/java/org/unicode/picker/CharData.java index 2b10ffac7..7027973f7 100644 --- a/unicodetools/src/main/java/org/unicode/picker/CharData.java +++ b/unicodetools/src/main/java/org/unicode/picker/CharData.java @@ -1,724 +1,808 @@ package org.unicode.picker; -public class CharData { -public static String[][] CHARACTERS_TO_NAME = { -{" ","SPACE"}, -{"-","HYPHEN-MINUS"}, -{" ","NO-BREAK SPACE"}, -{"­","SOFT HYPHEN"}, -{"͏","COMBINING GRAPHEME JOINER"}, -{"֊","ARMENIAN HYPHEN"}, -{"־","HEBREW PUNCTUATION MAQAF"}, -{"؀","ARABIC NUMBER SIGN"}, -{"؁","ARABIC SIGN SANAH"}, -{"؂","ARABIC FOOTNOTE MARKER"}, -{"؃","ARABIC SIGN SAFHA"}, -{"؄","ARABIC SIGN SAMVAT"}, -{"؅","ARABIC NUMBER MARK ABOVE"}, -{"؜","ARABIC LETTER MARK"}, -{"۝","ARABIC END OF AYAH"}, -{"܏","SYRIAC ABBREVIATION MARK"}, -{"ᅟ","HANGUL CHOSEONG FILLER"}, -{"ᅠ","HANGUL JUNGSEONG FILLER"}, -{"᐀","CANADIAN SYLLABICS HYPHEN"}, -{" 
","OGHAM SPACE MARK"}, -{"឴","KHMER VOWEL INHERENT AQ"}, -{"឵","KHMER VOWEL INHERENT AA"}, -{"᠆","MONGOLIAN TODO SOFT HYPHEN"}, -{"᠋","MONGOLIAN FREE VARIATION SELECTOR ONE"}, -{"᠌","MONGOLIAN FREE VARIATION SELECTOR TWO"}, -{"᠍","MONGOLIAN FREE VARIATION SELECTOR THREE"}, -{"᠎","MONGOLIAN VOWEL SEPARATOR"}, -{" ","EN QUAD"}, -{" ","EM QUAD"}, -{" ","EN SPACE"}, -{" ","EM SPACE"}, -{" ","THREE-PER-EM SPACE"}, -{" ","FOUR-PER-EM SPACE"}, -{" ","SIX-PER-EM SPACE"}, -{" ","FIGURE SPACE"}, -{" ","PUNCTUATION SPACE"}, -{" ","THIN SPACE"}, -{" ","HAIR SPACE"}, -{"​","ZERO WIDTH SPACE"}, -{"‌","ZERO WIDTH NON-JOINER"}, -{"‍","ZERO WIDTH JOINER"}, -{"‎","LEFT-TO-RIGHT MARK"}, -{"‏","RIGHT-TO-LEFT MARK"}, -{"‐","HYPHEN"}, -{"‑","NON-BREAKING HYPHEN"}, -{"‒","FIGURE DASH"}, -{"–","EN DASH"}, -{"—","EM DASH"}, -{"―","HORIZONTAL BAR"}, -{"\u2028","LINE SEPARATOR"}, -{"\u2029","PARAGRAPH SEPARATOR"}, -{"‪","LEFT-TO-RIGHT EMBEDDING"}, -{"‫","RIGHT-TO-LEFT EMBEDDING"}, -{"‬","POP DIRECTIONAL FORMATTING"}, -{"‭","LEFT-TO-RIGHT OVERRIDE"}, -{"‮","RIGHT-TO-LEFT OVERRIDE"}, -{" ","NARROW NO-BREAK SPACE"}, -{" ","MEDIUM MATHEMATICAL SPACE"}, -{"⁠","WORD JOINER"}, -{"⁡","FUNCTION APPLICATION"}, -{"⁢","INVISIBLE TIMES"}, -{"⁣","INVISIBLE SEPARATOR"}, -{"⁤","INVISIBLE PLUS"}, -{"⁦","LEFT-TO-RIGHT ISOLATE"}, -{"⁧","RIGHT-TO-LEFT ISOLATE"}, -{"⁨","FIRST STRONG ISOLATE"}, -{"⁩","POP DIRECTIONAL ISOLATE"}, -{"⸗","DOUBLE OBLIQUE HYPHEN"}, -{"⸚","HYPHEN WITH DIAERESIS"}, -{"⸺","TWO-EM DASH"}, -{"⸻","THREE-EM DASH"}, -{"⹀","DOUBLE HYPHEN"}, -{" ","IDEOGRAPHIC SPACE"}, -{"〜","WAVE DASH"}, -{"〰","WAVY DASH"}, -{"゠","KATAKANA-HIRAGANA DOUBLE HYPHEN"}, -{"ㅤ","HANGUL FILLER"}, -{"︀","VARIATION SELECTOR-1"}, -{"︁","VARIATION SELECTOR-2"}, -{"︂","VARIATION SELECTOR-3"}, -{"︃","VARIATION SELECTOR-4"}, -{"︄","VARIATION SELECTOR-5"}, -{"︅","VARIATION SELECTOR-6"}, -{"︆","VARIATION SELECTOR-7"}, -{"︇","VARIATION SELECTOR-8"}, -{"︈","VARIATION SELECTOR-9"}, -{"︉","VARIATION SELECTOR-10"}, -{"︊","VARIATION 
SELECTOR-11"}, -{"︋","VARIATION SELECTOR-12"}, -{"︌","VARIATION SELECTOR-13"}, -{"︍","VARIATION SELECTOR-14"}, -{"︎","VARIATION SELECTOR-15"}, -{"️","VARIATION SELECTOR-16"}, -{"︱","PRESENTATION FORM FOR VERTICAL EM DASH"}, -{"︲","PRESENTATION FORM FOR VERTICAL EN DASH"}, -{"﹘","SMALL EM DASH"}, -{"﹣","SMALL HYPHEN-MINUS"}, -{"","ZERO WIDTH NO-BREAK SPACE"}, -{"-","FULLWIDTH HYPHEN-MINUS"}, -{"ᅠ","HALFWIDTH HANGUL FILLER"}, -{"","INTERLINEAR ANNOTATION ANCHOR"}, -{"","INTERLINEAR ANNOTATION SEPARATOR"}, -{"","INTERLINEAR ANNOTATION TERMINATOR"}, -{"𑂽","KAITHI NUMBER SIGN"}, -{"𛲠","SHORTHAND FORMAT LETTER OVERLAP"}, -{"𛲡","SHORTHAND FORMAT CONTINUING OVERLAP"}, -{"𛲢","SHORTHAND FORMAT DOWN STEP"}, -{"𛲣","SHORTHAND FORMAT UP STEP"}, -{"𝅳","MUSICAL SYMBOL BEGIN BEAM"}, -{"𝅴","MUSICAL SYMBOL END BEAM"}, -{"𝅵","MUSICAL SYMBOL BEGIN TIE"}, -{"𝅶","MUSICAL SYMBOL END TIE"}, -{"𝅷","MUSICAL SYMBOL BEGIN SLUR"}, -{"𝅸","MUSICAL SYMBOL END SLUR"}, -{"𝅹","MUSICAL SYMBOL BEGIN PHRASE"}, -{"𝅺","MUSICAL SYMBOL END PHRASE"}, -{"󠀠","TAG SPACE"}, -{"󠀡","TAG EXCLAMATION MARK"}, -{"󠀢","TAG QUOTATION MARK"}, -{"󠀣","TAG NUMBER SIGN"}, -{"󠀤","TAG DOLLAR SIGN"}, -{"󠀥","TAG PERCENT SIGN"}, -{"󠀦","TAG AMPERSAND"}, -{"󠀧","TAG APOSTROPHE"}, -{"󠀨","TAG LEFT PARENTHESIS"}, -{"󠀩","TAG RIGHT PARENTHESIS"}, -{"󠀪","TAG ASTERISK"}, -{"󠀫","TAG PLUS SIGN"}, -{"󠀬","TAG COMMA"}, -{"󠀭","TAG HYPHEN-MINUS"}, -{"󠀮","TAG FULL STOP"}, -{"󠀯","TAG SOLIDUS"}, -{"󠀰","TAG DIGIT ZERO"}, -{"󠀱","TAG DIGIT ONE"}, -{"󠀲","TAG DIGIT TWO"}, -{"󠀳","TAG DIGIT THREE"}, -{"󠀴","TAG DIGIT FOUR"}, -{"󠀵","TAG DIGIT FIVE"}, -{"󠀶","TAG DIGIT SIX"}, -{"󠀷","TAG DIGIT SEVEN"}, -{"󠀸","TAG DIGIT EIGHT"}, -{"󠀹","TAG DIGIT NINE"}, -{"󠀺","TAG COLON"}, -{"󠀻","TAG SEMICOLON"}, -{"󠀼","TAG LESS-THAN SIGN"}, -{"󠀽","TAG EQUALS SIGN"}, -{"󠀾","TAG GREATER-THAN SIGN"}, -{"󠀿","TAG QUESTION MARK"}, -{"󠁀","TAG COMMERCIAL AT"}, -{"󠁁","TAG LATIN CAPITAL LETTER A"}, -{"󠁂","TAG LATIN CAPITAL LETTER B"}, -{"󠁃","TAG LATIN CAPITAL LETTER C"}, -{"󠁄","TAG LATIN 
CAPITAL LETTER D"}, -{"󠁅","TAG LATIN CAPITAL LETTER E"}, -{"󠁆","TAG LATIN CAPITAL LETTER F"}, -{"󠁇","TAG LATIN CAPITAL LETTER G"}, -{"󠁈","TAG LATIN CAPITAL LETTER H"}, -{"󠁉","TAG LATIN CAPITAL LETTER I"}, -{"󠁊","TAG LATIN CAPITAL LETTER J"}, -{"󠁋","TAG LATIN CAPITAL LETTER K"}, -{"󠁌","TAG LATIN CAPITAL LETTER L"}, -{"󠁍","TAG LATIN CAPITAL LETTER M"}, -{"󠁎","TAG LATIN CAPITAL LETTER N"}, -{"󠁏","TAG LATIN CAPITAL LETTER O"}, -{"󠁐","TAG LATIN CAPITAL LETTER P"}, -{"󠁑","TAG LATIN CAPITAL LETTER Q"}, -{"󠁒","TAG LATIN CAPITAL LETTER R"}, -{"󠁓","TAG LATIN CAPITAL LETTER S"}, -{"󠁔","TAG LATIN CAPITAL LETTER T"}, -{"󠁕","TAG LATIN CAPITAL LETTER U"}, -{"󠁖","TAG LATIN CAPITAL LETTER V"}, -{"󠁗","TAG LATIN CAPITAL LETTER W"}, -{"󠁘","TAG LATIN CAPITAL LETTER X"}, -{"󠁙","TAG LATIN CAPITAL LETTER Y"}, -{"󠁚","TAG LATIN CAPITAL LETTER Z"}, -{"󠁛","TAG LEFT SQUARE BRACKET"}, -{"󠁜","TAG REVERSE SOLIDUS"}, -{"󠁝","TAG RIGHT SQUARE BRACKET"}, -{"󠁞","TAG CIRCUMFLEX ACCENT"}, -{"󠁟","TAG LOW LINE"}, -{"󠁠","TAG GRAVE ACCENT"}, -{"󠁡","TAG LATIN SMALL LETTER A"}, -{"󠁢","TAG LATIN SMALL LETTER B"}, -{"󠁣","TAG LATIN SMALL LETTER C"}, -{"󠁤","TAG LATIN SMALL LETTER D"}, -{"󠁥","TAG LATIN SMALL LETTER E"}, -{"󠁦","TAG LATIN SMALL LETTER F"}, -{"󠁧","TAG LATIN SMALL LETTER G"}, -{"󠁨","TAG LATIN SMALL LETTER H"}, -{"󠁩","TAG LATIN SMALL LETTER I"}, -{"󠁪","TAG LATIN SMALL LETTER J"}, -{"󠁫","TAG LATIN SMALL LETTER K"}, -{"󠁬","TAG LATIN SMALL LETTER L"}, -{"󠁭","TAG LATIN SMALL LETTER M"}, -{"󠁮","TAG LATIN SMALL LETTER N"}, -{"󠁯","TAG LATIN SMALL LETTER O"}, -{"󠁰","TAG LATIN SMALL LETTER P"}, -{"󠁱","TAG LATIN SMALL LETTER Q"}, -{"󠁲","TAG LATIN SMALL LETTER R"}, -{"󠁳","TAG LATIN SMALL LETTER S"}, -{"󠁴","TAG LATIN SMALL LETTER T"}, -{"󠁵","TAG LATIN SMALL LETTER U"}, -{"󠁶","TAG LATIN SMALL LETTER V"}, -{"󠁷","TAG LATIN SMALL LETTER W"}, -{"󠁸","TAG LATIN SMALL LETTER X"}, -{"󠁹","TAG LATIN SMALL LETTER Y"}, -{"󠁺","TAG LATIN SMALL LETTER Z"}, -{"󠁻","TAG LEFT CURLY BRACKET"}, -{"󠁼","TAG VERTICAL LINE"}, -{"󠁽","TAG 
RIGHT CURLY BRACKET"}, -{"󠁾","TAG TILDE"}, -{"󠄀","VARIATION SELECTOR-17"}, -{"󠄁","VARIATION SELECTOR-18"}, -{"󠄂","VARIATION SELECTOR-19"}, -{"󠄃","VARIATION SELECTOR-20"}, -{"󠄄","VARIATION SELECTOR-21"}, -{"󠄅","VARIATION SELECTOR-22"}, -{"󠄆","VARIATION SELECTOR-23"}, -{"󠄇","VARIATION SELECTOR-24"}, -{"󠄈","VARIATION SELECTOR-25"}, -{"󠄉","VARIATION SELECTOR-26"}, -{"󠄊","VARIATION SELECTOR-27"}, -{"󠄋","VARIATION SELECTOR-28"}, -{"󠄌","VARIATION SELECTOR-29"}, -{"󠄍","VARIATION SELECTOR-30"}, -{"󠄎","VARIATION SELECTOR-31"}, -{"󠄏","VARIATION SELECTOR-32"}, -{"󠄐","VARIATION SELECTOR-33"}, -{"󠄑","VARIATION SELECTOR-34"}, -{"󠄒","VARIATION SELECTOR-35"}, -{"󠄓","VARIATION SELECTOR-36"}, -{"󠄔","VARIATION SELECTOR-37"}, -{"󠄕","VARIATION SELECTOR-38"}, -{"󠄖","VARIATION SELECTOR-39"}, -{"󠄗","VARIATION SELECTOR-40"}, -{"󠄘","VARIATION SELECTOR-41"}, -{"󠄙","VARIATION SELECTOR-42"}, -{"󠄚","VARIATION SELECTOR-43"}, -{"󠄛","VARIATION SELECTOR-44"}, -{"󠄜","VARIATION SELECTOR-45"}, -{"󠄝","VARIATION SELECTOR-46"}, -{"󠄞","VARIATION SELECTOR-47"}, -{"󠄟","VARIATION SELECTOR-48"}, -{"󠄠","VARIATION SELECTOR-49"}, -{"󠄡","VARIATION SELECTOR-50"}, -{"󠄢","VARIATION SELECTOR-51"}, -{"󠄣","VARIATION SELECTOR-52"}, -{"󠄤","VARIATION SELECTOR-53"}, -{"󠄥","VARIATION SELECTOR-54"}, -{"󠄦","VARIATION SELECTOR-55"}, -{"󠄧","VARIATION SELECTOR-56"}, -{"󠄨","VARIATION SELECTOR-57"}, -{"󠄩","VARIATION SELECTOR-58"}, -{"󠄪","VARIATION SELECTOR-59"}, -{"󠄫","VARIATION SELECTOR-60"}, -{"󠄬","VARIATION SELECTOR-61"}, -{"󠄭","VARIATION SELECTOR-62"}, -{"󠄮","VARIATION SELECTOR-63"}, -{"󠄯","VARIATION SELECTOR-64"}, -{"󠄰","VARIATION SELECTOR-65"}, -{"󠄱","VARIATION SELECTOR-66"}, -{"󠄲","VARIATION SELECTOR-67"}, -{"󠄳","VARIATION SELECTOR-68"}, -{"󠄴","VARIATION SELECTOR-69"}, -{"󠄵","VARIATION SELECTOR-70"}, -{"󠄶","VARIATION SELECTOR-71"}, -{"󠄷","VARIATION SELECTOR-72"}, -{"󠄸","VARIATION SELECTOR-73"}, -{"󠄹","VARIATION SELECTOR-74"}, -{"󠄺","VARIATION SELECTOR-75"}, -{"󠄻","VARIATION SELECTOR-76"}, -{"󠄼","VARIATION SELECTOR-77"}, 
-{"󠄽","VARIATION SELECTOR-78"}, -{"󠄾","VARIATION SELECTOR-79"}, -{"󠄿","VARIATION SELECTOR-80"}, -{"󠅀","VARIATION SELECTOR-81"}, -{"󠅁","VARIATION SELECTOR-82"}, -{"󠅂","VARIATION SELECTOR-83"}, -{"󠅃","VARIATION SELECTOR-84"}, -{"󠅄","VARIATION SELECTOR-85"}, -{"󠅅","VARIATION SELECTOR-86"}, -{"󠅆","VARIATION SELECTOR-87"}, -{"󠅇","VARIATION SELECTOR-88"}, -{"󠅈","VARIATION SELECTOR-89"}, -{"󠅉","VARIATION SELECTOR-90"}, -{"󠅊","VARIATION SELECTOR-91"}, -{"󠅋","VARIATION SELECTOR-92"}, -{"󠅌","VARIATION SELECTOR-93"}, -{"󠅍","VARIATION SELECTOR-94"}, -{"󠅎","VARIATION SELECTOR-95"}, -{"󠅏","VARIATION SELECTOR-96"}, -{"󠅐","VARIATION SELECTOR-97"}, -{"󠅑","VARIATION SELECTOR-98"}, -{"󠅒","VARIATION SELECTOR-99"}, -{"󠅓","VARIATION SELECTOR-100"}, -{"󠅔","VARIATION SELECTOR-101"}, -{"󠅕","VARIATION SELECTOR-102"}, -{"󠅖","VARIATION SELECTOR-103"}, -{"󠅗","VARIATION SELECTOR-104"}, -{"󠅘","VARIATION SELECTOR-105"}, -{"󠅙","VARIATION SELECTOR-106"}, -{"󠅚","VARIATION SELECTOR-107"}, -{"󠅛","VARIATION SELECTOR-108"}, -{"󠅜","VARIATION SELECTOR-109"}, -{"󠅝","VARIATION SELECTOR-110"}, -{"󠅞","VARIATION SELECTOR-111"}, -{"󠅟","VARIATION SELECTOR-112"}, -{"󠅠","VARIATION SELECTOR-113"}, -{"󠅡","VARIATION SELECTOR-114"}, -{"󠅢","VARIATION SELECTOR-115"}, -{"󠅣","VARIATION SELECTOR-116"}, -{"󠅤","VARIATION SELECTOR-117"}, -{"󠅥","VARIATION SELECTOR-118"}, -{"󠅦","VARIATION SELECTOR-119"}, -{"󠅧","VARIATION SELECTOR-120"}, -{"󠅨","VARIATION SELECTOR-121"}, -{"󠅩","VARIATION SELECTOR-122"}, -{"󠅪","VARIATION SELECTOR-123"}, -{"󠅫","VARIATION SELECTOR-124"}, -{"󠅬","VARIATION SELECTOR-125"}, -{"󠅭","VARIATION SELECTOR-126"}, -{"󠅮","VARIATION SELECTOR-127"}, -{"󠅯","VARIATION SELECTOR-128"}, -{"󠅰","VARIATION SELECTOR-129"}, -{"󠅱","VARIATION SELECTOR-130"}, -{"󠅲","VARIATION SELECTOR-131"}, -{"󠅳","VARIATION SELECTOR-132"}, -{"󠅴","VARIATION SELECTOR-133"}, -{"󠅵","VARIATION SELECTOR-134"}, -{"󠅶","VARIATION SELECTOR-135"}, -{"󠅷","VARIATION SELECTOR-136"}, -{"󠅸","VARIATION SELECTOR-137"}, -{"󠅹","VARIATION SELECTOR-138"}, 
-{"󠅺","VARIATION SELECTOR-139"}, -{"󠅻","VARIATION SELECTOR-140"}, -{"󠅼","VARIATION SELECTOR-141"}, -{"󠅽","VARIATION SELECTOR-142"}, -{"󠅾","VARIATION SELECTOR-143"}, -{"󠅿","VARIATION SELECTOR-144"}, -{"󠆀","VARIATION SELECTOR-145"}, -{"󠆁","VARIATION SELECTOR-146"}, -{"󠆂","VARIATION SELECTOR-147"}, -{"󠆃","VARIATION SELECTOR-148"}, -{"󠆄","VARIATION SELECTOR-149"}, -{"󠆅","VARIATION SELECTOR-150"}, -{"󠆆","VARIATION SELECTOR-151"}, -{"󠆇","VARIATION SELECTOR-152"}, -{"󠆈","VARIATION SELECTOR-153"}, -{"󠆉","VARIATION SELECTOR-154"}, -{"󠆊","VARIATION SELECTOR-155"}, -{"󠆋","VARIATION SELECTOR-156"}, -{"󠆌","VARIATION SELECTOR-157"}, -{"󠆍","VARIATION SELECTOR-158"}, -{"󠆎","VARIATION SELECTOR-159"}, -{"󠆏","VARIATION SELECTOR-160"}, -{"󠆐","VARIATION SELECTOR-161"}, -{"󠆑","VARIATION SELECTOR-162"}, -{"󠆒","VARIATION SELECTOR-163"}, -{"󠆓","VARIATION SELECTOR-164"}, -{"󠆔","VARIATION SELECTOR-165"}, -{"󠆕","VARIATION SELECTOR-166"}, -{"󠆖","VARIATION SELECTOR-167"}, -{"󠆗","VARIATION SELECTOR-168"}, -{"󠆘","VARIATION SELECTOR-169"}, -{"󠆙","VARIATION SELECTOR-170"}, -{"󠆚","VARIATION SELECTOR-171"}, -{"󠆛","VARIATION SELECTOR-172"}, -{"󠆜","VARIATION SELECTOR-173"}, -{"󠆝","VARIATION SELECTOR-174"}, -{"󠆞","VARIATION SELECTOR-175"}, -{"󠆟","VARIATION SELECTOR-176"}, -{"󠆠","VARIATION SELECTOR-177"}, -{"󠆡","VARIATION SELECTOR-178"}, -{"󠆢","VARIATION SELECTOR-179"}, -{"󠆣","VARIATION SELECTOR-180"}, -{"󠆤","VARIATION SELECTOR-181"}, -{"󠆥","VARIATION SELECTOR-182"}, -{"󠆦","VARIATION SELECTOR-183"}, -{"󠆧","VARIATION SELECTOR-184"}, -{"󠆨","VARIATION SELECTOR-185"}, -{"󠆩","VARIATION SELECTOR-186"}, -{"󠆪","VARIATION SELECTOR-187"}, -{"󠆫","VARIATION SELECTOR-188"}, -{"󠆬","VARIATION SELECTOR-189"}, -{"󠆭","VARIATION SELECTOR-190"}, -{"󠆮","VARIATION SELECTOR-191"}, -{"󠆯","VARIATION SELECTOR-192"}, -{"󠆰","VARIATION SELECTOR-193"}, -{"󠆱","VARIATION SELECTOR-194"}, -{"󠆲","VARIATION SELECTOR-195"}, -{"󠆳","VARIATION SELECTOR-196"}, -{"󠆴","VARIATION SELECTOR-197"}, -{"󠆵","VARIATION SELECTOR-198"}, -{"󠆶","VARIATION 
SELECTOR-199"}, -{"󠆷","VARIATION SELECTOR-200"}, -{"󠆸","VARIATION SELECTOR-201"}, -{"󠆹","VARIATION SELECTOR-202"}, -{"󠆺","VARIATION SELECTOR-203"}, -{"󠆻","VARIATION SELECTOR-204"}, -{"󠆼","VARIATION SELECTOR-205"}, -{"󠆽","VARIATION SELECTOR-206"}, -{"󠆾","VARIATION SELECTOR-207"}, -{"󠆿","VARIATION SELECTOR-208"}, -{"󠇀","VARIATION SELECTOR-209"}, -{"󠇁","VARIATION SELECTOR-210"}, -{"󠇂","VARIATION SELECTOR-211"}, -{"󠇃","VARIATION SELECTOR-212"}, -{"󠇄","VARIATION SELECTOR-213"}, -{"󠇅","VARIATION SELECTOR-214"}, -{"󠇆","VARIATION SELECTOR-215"}, -{"󠇇","VARIATION SELECTOR-216"}, -{"󠇈","VARIATION SELECTOR-217"}, -{"󠇉","VARIATION SELECTOR-218"}, -{"󠇊","VARIATION SELECTOR-219"}, -{"󠇋","VARIATION SELECTOR-220"}, -{"󠇌","VARIATION SELECTOR-221"}, -{"󠇍","VARIATION SELECTOR-222"}, -{"󠇎","VARIATION SELECTOR-223"}, -{"󠇏","VARIATION SELECTOR-224"}, -{"󠇐","VARIATION SELECTOR-225"}, -{"󠇑","VARIATION SELECTOR-226"}, -{"󠇒","VARIATION SELECTOR-227"}, -{"󠇓","VARIATION SELECTOR-228"}, -{"󠇔","VARIATION SELECTOR-229"}, -{"󠇕","VARIATION SELECTOR-230"}, -{"󠇖","VARIATION SELECTOR-231"}, -{"󠇗","VARIATION SELECTOR-232"}, -{"󠇘","VARIATION SELECTOR-233"}, -{"󠇙","VARIATION SELECTOR-234"}, -{"󠇚","VARIATION SELECTOR-235"}, -{"󠇛","VARIATION SELECTOR-236"}, -{"󠇜","VARIATION SELECTOR-237"}, -{"󠇝","VARIATION SELECTOR-238"}, -{"󠇞","VARIATION SELECTOR-239"}, -{"󠇟","VARIATION SELECTOR-240"}, -{"󠇠","VARIATION SELECTOR-241"}, -{"󠇡","VARIATION SELECTOR-242"}, -{"󠇢","VARIATION SELECTOR-243"}, -{"󠇣","VARIATION SELECTOR-244"}, -{"󠇤","VARIATION SELECTOR-245"}, -{"󠇥","VARIATION SELECTOR-246"}, -{"󠇦","VARIATION SELECTOR-247"}, -{"󠇧","VARIATION SELECTOR-248"}, -{"󠇨","VARIATION SELECTOR-249"}, -{"󠇩","VARIATION SELECTOR-250"}, -{"󠇪","VARIATION SELECTOR-251"}, -{"󠇫","VARIATION SELECTOR-252"}, -{"󠇬","VARIATION SELECTOR-253"}, -{"󠇭","VARIATION SELECTOR-254"}, -{"󠇮","VARIATION SELECTOR-255"}, -{"󠇯","VARIATION SELECTOR-256"}, - - }; - public static String[][][] CATEGORIES = { -{{"Symbol"}, -/*116,6*/ {"Alchemical Symbols@Other 
","A2j1dA"}, -/*589,116*/ {"Arrows","%=68k11I3706:%M%G7AnTMm6e6HDk%`O728F1f4V1PNF2WF1G}58?]514M]Ol1%2l2%1#1GsGV1f172otW06gQ01U:1Un2MH$|W}4$,ml3f1MW|L+%0M"}, -/*256,6*/ {"Braille",";oA0FN"}, -/*3,6*/ {"Common Indic Number Forms@Other ","wgi068"}, -/*39,6*/ {"Control Pictures","(j90d3"}, -/*53,50*/ {"Currency","H3XBMQQ10HB(2106uPM]N:qol202S20#2;.Z0^xM0:91E]J6O6"}, -/*613,223*/ {"Emoji",";O906vIMOG%I9UGOun294v1O%1916$6n16]1]1u68WGWGGUGG88:48WE891X4EH2f@695^O1091vs8g0U1nE6916P1G,P9692:1G!]16HH]26G68#18M8-3W#191-1$s888]1M8|f2EG68Gt18#188738cGt58sGt4G8U868Mmm?8EeF1GM$Mv1|fE@18E8U8E8@2W?X4GE88GG8E8886u868kWOGc8"}, -/*80,6*/ {"Emoticons@Other ",";(i1F7"}, -/*272,42*/ {"Game Pieces","Q6A06f5#1H2,]4MeEY[W1@3W}891N1GN18N18N3P#k"}, -/*14,7*/ {"Gender and Genealogical","2JA0sOc"}, -/*434,65*/ {"Geometric Shapes","oG90nMcPTFNfFEQE10t2v2EO71%26f1cGsH26O|8sX2M;>t0%E6OW6^<$0sW6Xq#5"}, -/*63,30*/ {"Keyboard and UI","Qz80XqUGv771.Uv46%7Y^Y1F2mc]1M"}, -/*2,4*/ {"Latin 1 Supplement@Other Punctuation","9FP1"}, -/*1182,227*/ {"Math","wug1M8V2868G8,8M88mW888E868G8888868GM8k8M8M88,8d1eE8U8d1++g1f1E:2v2894WX3:2v+]lEQ?60f2E11OH1P1M]1U11U]571WO6WUv3f111MuUmH6Ue6WGGu:26G8:2NO$M:16H8%2V28H211cvg.]4s9AnU#5PNdkX4-1Gc24P1P2:2P2:2P2:2P2:2P2QB606bf$0:;c8%Ef1Ev28v28]BmM"}, -/*1052,127*/ {"Math Alphanumeric","w010EGX26G6gy70bm806e2Y806Gce?]Au,8OUmOO68E86uMeU^`Q1t78V686GG6GM8|88k8-58MGs8k8d28M8U8Ok8-UGF28F28#28F28#28F28#28F28#28F28sGd4"}, -/*470,200*/ {"Miscellaneous","w=B0rzB0GW8Y040Mg%50EHB686WU8l1$Uv4?8En1E8|:29168U8718k8kG8M868M8686e686888,v2M118MO8|8E]7(V10c2tN1cYf8068n2EG6G:1W]3M:1Mm6X3888-1W91,e|O6G86%1:18H3m6%5$6%468eGWc8c11126v1V191t28t38#7X29DuM8E86m8ULN%0"}, -/*568,29*/ {"Musical",";DA0k2mO1NM[d3GVH92N6g-80c92s"}, -/*2,5*/ {"Number Forms@Other ","g=806"}, -/*48,6*/ {"Ornamental Dingbats@Other ","A;i1N4"}, -/*82,28*/ {"Stars/Asterisks",";OA0v5l2W,g510E^jW1WV1:lvx-1"}, -/*29,13*/ {"Subscript","Qq80N1871QC30"}, -/*35,41*/ {"Superscript","XFX1x6e1oUg2701+6G|nE8I0302QW069JPC6^A}06"}, -/*14,10*/ 
{"Supplemental Arrows C@Other ","gcj1sWM916"}, -/*15,10*/ {"Supplemental Symbols And Pictographs@Other ","Aoj1!X9UX5"}, -/*200,26*/ {"Technical","gM90-2G6$l7H1!%2N2O?mF2P6?"}, -/*4,5*/ {"Tibetan@Other ","YG40M"}, -/*98,12*/ {"Transport And Map Symbols@Other ","g?i1N7X171OM"}, -/*10,9*/ {"Vedic Extensions@Other Letter","(u70M8MO6"}, -/*79,26*/ {"Weather and Astrological","Q4A0F1mv3}1v8,uUe^zX171:1|"}, -/*165,20*/ {"Yijing / Tai Xuan Jing","w8A0sf7c2WA0#5A>E1-7"}, -/*158,16*/ {"Compatibility","^dh1#28F5m-3:6N2"}, -/*67,14*/ {"Historic","I{)0%4!P7|%4}3"}, -/*60,18*/ {"Compatibility","(PD0M(ZU16H1-3e!u6"}, -}, -{{"Punctuation"}, -/*20,22*/ {"ASCII Based","]2E8EG886[6O6f2H6eP16u"}, -/*17,28*/ {"Dash/Connector","14f4gX80c%36%1gu30:26W;2t0XG"}, -/*47,39*/ {"Other","(s70:<.MO$EGGG8OEms88Iu3068G6n1!GM8(iW0"}, -/*139,93*/ {"Paired","n36f48v2894X1;P80sP26[6]46P16nvMPF6f3c1^F1H76:2,va@1%5M]26;7106G,H2Hf,Gs2Ms06nPcXF6f48v288686"}, -/*50,23*/ {"Historic","gm808kQT30MnN72v1?(%t0E"}, -/*105,63*/ {"Compatibility","Ig80e91E91686W8$EH1X36P162pw0,12-1G|8F18W86nDE8c8M[6O6X2E8f2886"}, -}, -{{"Number"}, -/*500,166*/ {"Decimal","P4,]A6egh10,HC,1I,fb,%A,%A,%A,%A,%A,%A,%A,%A,%A,%8,%A,X6,PP,X6,Q]10,f3,PR,vB,9F,m,nG,]K,m,A710Ocm,^SZ0,vz,f3,1I,12,:7,]a,w{L0,oo40,vB,f5,9D,PP,%g,1Y,P9,P9,Xc,;lL0,]K,"}, -/*198,37*/ {"Enclosed/Dotted","gs90#7%4@1Pvt2g+20,%2s8N1]2,n3N1g2U16"}, -/*40,20*/ {"Fractions/Related","9G6eGEoX80Ocm,1IV1%3"}, -/*427,153*/ {"Other","ot20cHYc]AE9Ck]Lcvd,^910#1oF10,vh2}1073GMQ:30P2!P1EHVMI2V0,9TcA|N0V2(a10sP2kn3!:6U9H6GV1G74XB6%2E:6Uf9sH2s%3k1Uc1W#2fg#1fY#1wY1069d!;+L0kIiR0l1gu50!oE20?"}, -/*271,60*/ {"Historic","o560EgM10,Yk10EGMo230w6u0}39175n16%aMv2$HCUXI,^E10cnQso,60}9"}, -/*60,24*/ {"Compatibility","w.80-2o?30EHVM2Us0,w{#0?"}, -}, -{{"Format & Whitespace"}, -/*140,53*/ {"Format","vF;Z10c12o%40;920UX2Uf4U8M2n#0Iej0MQi50sY)W9l8bk0AvME"}, -/*262,22*/ {"Variation Selector","]=oY506%7E^$zA#LDF1AV1"}, -/*18,25*/ {"Whitespace","^a)05Y)0nBQQ80,n26eP4wB40"}, -/*7,23*/ 
{"Historic","w-10f4^#206IV10(970ols0"}, -/*16,19*/ {"Compatibility","fEAQ80?P3P4wB40^@s0"}, -}, -{{"Modifier"}, -/*8,11*/ {"Enclosing","Q670Ys10M8E"}, -/*225,67*/ {"Nonspacing","%+#5GG,8t1QE60F1HmE8718kWmO6XI,P2N1m6v%71WO|A(x0Yss0En1sGk%2MT_t0F1"}, -/*132,54*/ {"Spacing","f!!.M%3M91gz30(C30f1695E8?8l18d2X4N32D40XH2zW0]ZUo@|0U"}, -/*47,22*/ {"Historic","%?71HP62x60M[F2926^Py0"}, -/*4,5*/ {"Compatibility","n<686"}, -}, -{{"Latin"}, -/*403,153*/ {"Common",":5N2mN2P6}18#28V1G,GcGcGcGMW68cGs8MGcGMGMGsGd1GWG6OU8GEOG6H168E11M.s$$6f16%2MG6P3P168688uW.128$IN706126f16m6W6:16m6$6P16Gc916[878QAa06zph0696UG6OX2.o2706"}, -/*89,16*/ {"Enclosed","Q!90t4Y#X1M8-2:5"}, -/*105,257*/ {"Flipped/Mirrored","]r=i1jKjnjQq40L!401GCpwGi0Trh04pM83:liJK1qQMnmaJQE10jm10(;50Lj50wX50{W50A1i0TJd0bB506(T40v]a8zE50I0105010IUi0{Zh0:7=w*Uc:V%Dih:h`h9X%B41n1WSL1Qau9q`jh_Bnm4lPm*mHn6amfmSmH6;+80j630Lj50wX50{W50QW80P1T#806f=^Y40(d30gtZ0bUi06AL10D9102g70+M70(#80+q80P3*jA#80{z80"}, -/*729,223*/ {"Other","]N6[6m6m6m6m6G]16m6W6W6$6v186O6G6m86OE86GUGGEGOEv2s8sG!OEOt2$F38?A570@3%5718}2H9|G@1GV1GcGMG#1GcGsGF1G6m|GcXyf2o]20}1u62cW0F1v6N1e@2Gs%5Gv;-3eUDKj081s868EG?8E8EGcu8E8UGEw^60t5H193N3v!H1f171X9O11G6e6O88m11X186IWZ072f9E]96%?M"}, -/*183,87*/ {"Phonetics (IPA)","%8N2%96$uH4H3u:9M%CF28718M868UO?86G68E8868GHOeP1SPE8GW11OO6918Of26868886OV3WU%2Wg|70EO6"}, -/*24,20*/ {"Phonetics (X-IPA)","1uH1WGeE11G6GO8G868s"}, -/*148,53*/ {"Historic","HZ6uP268691s15P361Jd1oQ7068H8cHw!Y?20kAZW0sH26P1l6:BU"}, -/*358,91*/ {"Compatibility","HF8WWO8:A6116v5H6!P3E%KcgT706vtM8E8?86GUGE8O8M8E86W8.U12-2X.}6;l30HBMvE,et8:2Qtq0kg710N2mN2"}, -}, -{{"Other European Scripts"}, -/*303,20*/ {"Cyrillic","2510#B$}E`uHfWE;1(06"}, -/*277,79*/ {"Greek","P]m8E88#18@3P3$wC70@1GcGV3GcGs8888l1888888O#48U8eE8E88OEOUeE8k8eE8E88Y=a0bai06W"}, -/*53,8*/ {"Historic - Caucasian Albanian","g6+0t411"}, -/*130,17*/ {"Historic - Cyrillic","^G106g^A0-2o,V0t8"}, -/*143,15*/ {"Historic - Duployan","2bT1t9e71O!u,GM"}, -/*40,6*/ {"Historic - Elbasan","A2+0l3"}, -/*94,9*/ 
{"Historic - Glagolitic","^tB0F48F4"}, -/*27,6*/ {"Historic - Gothic","^l*0V2"}, -/*183,34*/ {"Historic - Greek","]@MG6OEX7EO71f18GU8E;{(0@6%1Y9t0N6"}, -/*341,11*/ {"Historic - Linear A","YP+0FS.@1[s"}, -/*211,23*/ {"Historic - Linear B","(z)0|8N28t1868N1GF1937B"}, -/*29,6*/ {"Historic - Ogham","o_50l2"}, -/*108,12*/ {"Historic - Old Hungarian","w0-0l4H1l4uc"}, -/*36,6*/ {"Historic - Old Italic","oh*0F3"}, -/*43,6*/ {"Historic - Old Permic",";o*0-3"}, -/*89,6*/ {"Historic - Runic","g|50}7"}, -/*48,6*/ {"Historic - Shavian","A;*0N4"}, -/*44,46*/ {"Compatibility - Greek","XG%$e68%6Ef26OoN70888888n58Uu88EOu8EOu8E.886:Q"}, -}, -{{"American Scripts"}, -/*710,20*/ {"Canadian Aboriginal","gP50NuGd1]oN6TR10Xu6"}, -/*172,14*/ {"Historic - Cherokee","wG50#7Gco4e0F7"}, -/*80,6*/ {"Historic - Deseret",";(*0F7"}, -/*672,12*/ {"Historic - SignWriting","w.a1FxX1U8N1"}, -}, -{{"African Scripts"}, -/*495,87*/ {"Ethiopic",";(40l68MGk88MGt38MG@28MGk88MGN18758MG}5X3V1w<60}1.k8k8k8k8k8k8k8kI8X0cGcGc.k8kDDe0-2%1,"}, -/*59,10*/ {"Tifinagh","o_B0}4u6P1"}, -/*657,12*/ {"Historic - Bamum","(5i0@7Y4p0tp"}, -/*36,8*/ {"Historic - Bassa Vah","o_71t2Gc"}, -/*137,14*/ {"Historic - Coptic","Q210F12$A0dAek"}, -/*1071,8*/ {"Historic - Egyptian Hieroglyphs",";Y[0}}N9"}, -/*213,9*/ {"Historic - Mende Kikakui","25f1-HGV1"}, -/*90,12*/ {"Historic - Meroitic Cursive","(L,072W#1G74"}, -/*32,6*/ {"Historic - Meroitic Hieroglyphs",";I,0-2"}, -/*59,6*/ {"Historic - Nko","Q420N5"}, -/*40,8*/ {"Historic - Osmanya","g?*0t2G,"}, -/*300,6*/ {"Historic - Vai","^th0FR"}, -}, -{{"Middle Eastern Scripts"}, -/*357,84*/ {"Arabic","gs10V2m,f368W-18F68H26[EGP774XQ-1A}$05!%0U8N1mG6]2[73G19f2,O61il2^A+0#2YVx06{S$0V1]p"}, -/*88,17*/ {"Armenian","(W10V3[V344k%36GE"}, -/*44,20*/ {"Georgian","Yc40eG@2mMGEz230Y230"}, -/*53,19*/ {"Hebrew","Il10V2eE`5#1P46o:$0"}, -/*583,6*/ {"Historic - Anatolian Hieroglyphs","Qy{0@q"}, -/*39,22*/ {"Historic - Arabic","gr10c]2UH46%2f6k8V19D6"}, -/*49,6*/ {"Historic - Carian",";Y*0V4"}, 
-/*1234,16*/ {"Historic - Cuneiform","gE=0#_P9}98U11#H"}, -/*55,13*/ {"Historic - Cypriot","^-+0cG8@386OG"}, -/*85,18*/ {"Historic - Georgian",";Y40V3]3cW2a70V38e"}, -/*26,10*/ {"Historic - Hatran","Q4,0t186eU"}, -/*45,16*/ {"Historic - Hebrew","gf10#2:1M;>$0!f3"}, -/*29,6*/ {"Historic - Lycian","^V*0l2"}, -/*27,7*/ {"Historic - Lydian","AA,0N2e"}, -/*40,8*/ {"Historic - Nabataean","o_+0#2$!"}, -/*32,6*/ {"Historic - Old North Arabian","Ag,0-2"}, -/*32,6*/ {"Historic - Old South Arabian","Id,0-2"}, -/*32,6*/ {"Historic - Palmyrene","w[+0-2"}, -/*29,7*/ {"Historic - Phoenician","I7,0d2O"}, -/*61,9*/ {"Historic - Samaritan","AA2074GN1"}, -/*93,22*/ {"Historic - Syriac","wq10P1O]2[?X21DF18V5GE"}, -/*31,7*/ {"Historic - Ugaritic","It*0t28"}, -/*876,89*/ {"Compatibility - Arabic","I!10MA($0-813@Wv1#5G-4v371fAE88FCgI#0M8V2868G8,8M88mW888E868G8888868GM8k8M8M88,8d1eE8U8d1"}, -/*6,9*/ {"Compatibility - Armenian","oe10g^$0U"}, -/*35,17*/ {"Compatibility - Hebrew","2a(08.F18U886868!"}, -}, -{{"South Asian Scripts"}, -/*94,49*/ {"Bengali","2j20WsG6G@18k8OMOf1n16P16%+6y^6f2E958kG6GE.[6G,G,"}, -/*152,51*/ {"Devanagari","(X20-4Ov1X16f1F19[6gMf0cO8TRg0M]4E8l18k[V1YEg0l1mE8"}, -/*87,43*/ {"Gujarati","(*20!8E8@18k868UOv1X1692%j6*uE958s8E8E:16G|"}, -/*77,43*/ {"Gurmukhi","Av20cW6G@18k8GG693]1E:v6y^69EE958UW6GEO:1|O"}, -/*86,39*/ {"Kannada","QR30s8E8}18,8UO936X16KAE958k8E8Mu6116G,"}, -/*102,39*/ {"Malayalam","Ab306v1s8E8t3Gf1f1EH2caBEP5k8E8M.[6GV1O"}, -/*92,47*/ {"Oriya","Y[20sG6G@18k868UO13EX1:Y6y^6XbE958kG6GE$6[6G?8c"}, -/*110,34*/ {"Sinhala","oo30l1O728!8GkC66X6Wc88sm,GEA%*0#1"}, -/*74,40*/ {"Tamil",";3308cOE8MO6886O6OEO|12HQ6aXX5UOE8M.P1-1"}, -/*98,41*/ {"Telugu","wF30s8E8}18V1OX2Ee61D6`LMP5k8E8Mu6116G,$s"}, -/*63,19*/ {"Thaana","g|10V311KcP1O:5,%S?"}, -/*57,12*/ {"Historic - Ahom","2+:0N2ON1WV1"}, -/*61,8*/ {"Historic - Avestan","(r,0-4Ok"}, -/*109,11*/ {"Historic - Brahmi","A^-0}6Wt2X1"}, -/*67,9*/ {"Historic - Chakma","YH.0#48F1"}, -/*85,32*/ {"Historic - 
Grantha","^#.0M8sG6G@18k868UG!G6GEGmekGkOU"}, -/*31,8*/ {"Historic - Imperial Aramaic","(>+0@18!"}, -/*27,8*/ {"Historic - Inscriptional Pahlavi","g!,0t1es"}, -/*30,8*/ {"Historic - Inscriptional Parthian","ox,0@1Gs"}, -/*66,6*/ {"Historic - Kaithi","(5.0@5"}, -/*1,4*/ {"Historic - Kannada","YZ30"}, -/*65,20*/ {"Historic - Kharoshthi","gU,0M86es8E8V2WEW!$!"}, -/*61,9*/ {"Historic - Khojki","we.0l18-3"}, -/*69,8*/ {"Historic - Khudawadi","wu.0N5e,"}, -/*74,11*/ {"Historic - Lepcha","oZ70}4ON1OE"}, -/*68,13*/ {"Historic - Limbu","(r60#28|W|WO|"}, -/*39,6*/ {"Historic - Mahajani","wO.0d3"}, -/*29,7*/ {"Historic - Mandaic","^F20d2G"}, -/*79,12*/ {"Historic - Meetei Mayek","wGj0}1PI74G,"}, -/*79,9*/ {"Historic - Modi","(j:0F611,"}, -/*43,10*/ {"Historic - Mro","g,71#28,W6"}, -/*38,13*/ {"Historic - Multani","Qq.0k88M8N18?"}, -/*48,6*/ {"Historic - Ol Chiki",";g70N4"}, -/*50,9*/ {"Historic - Old Persian","Aw*0F3WF1"}, -/*81,8*/ {"Historic - Saurashtra","Yni0F6.|"}, -/*94,9*/ {"Historic - Sharada","2T.0}6GV1"}, -/*92,9*/ {"Historic - Siddham","AY:0-4GV3"}, -/*35,8*/ {"Historic - Sora Sompeng","2D.0F2u,"}, -/*44,6*/ {"Historic - Syloti Nagri","(bi0@3"}, -/*66,8*/ {"Historic - Takri","Yv:0}4$,"}, -/*82,8*/ {"Historic - Tirhuta",";A:0d6$,"}, -/*84,8*/ {"Historic - Warang Citi",";I;0d791"}, -/*3,6*/ {"Compatibility - Bengali","Yr2068"}, -/*8,5*/ {"Compatibility - Devanagari","Yf20s"}, -/*6,9*/ {"Compatibility - Gurmukhi","Qz20G93EG"}, -/*2,5*/ {"Compatibility - Oriya","Q0306"}, -}, -{{"Southeast Asian Scripts"}, -/*141,26*/ {"Khmer",";I6073GE8?v3q3l28,W,m,Hi-2"}, -/*67,41*/ {"Lao","g:3068G68GmM8k8E88G68M86.GU92MC4Gc86.8cG,"}, -/*223,79*/ {"Myanmar","QK40-3:1f1cWMOO6uEW7191Ame0UG![U:8V18cOO6r-e0#18V1mMWE8EGkOMH1|8d12le06.,%AmE8E"}, -/*86,19*/ {"Thai",";z30N48691c*1Gk11@1"}, -/*121,9*/ {"Historic - Balinese","QC70-6W}3"}, -/*56,8*/ {"Historic - Batak","(T70t4$M"}, -/*30,8*/ {"Historic - Buginese","2>60d2G6"}, -/*22,8*/ {"Historic - Buhid","2C606.#1"}, -/*83,13*/ {"Historic - 
Cham","Q`i0@4.F1G,GM"}, -/*23,6*/ {"Historic - Hanunoo","AA60}1"}, -/*90,10*/ {"Historic - Javanese","w.i0}6G,W6"}, -/*47,7*/ {"Historic - Kayah Li","2zi0748"}, -/*3,8*/ {"Historic - Khmer","gM60v311"}, -/*127,16*/ {"Historic - Pahawh Hmong","^}71N6[,8k8-1et1"}, -/*57,6*/ {"Historic - Pau Cin Hau","Q*;075"}, -/*37,8*/ {"Historic - Rejang","Y%i0F311"}, -/*72,9*/ {"Historic - Sundanese","^N70#5PNs"}, -/*22,11*/ {"Historic - Tagalog","I760718k]26"}, -/*20,13*/ {"Historic - Tagbanwa","2C606%3718E86"}, -/*127,16*/ {"Historic - Tai Tham","^@60t58l2G?m,mF1"}, -/*72,9*/ {"Historic - Tai Viet","^7j0}5H2U"}, -}, -{{"Hangul"}, -/*112,38*/ {"Other","ozC0:42Pi0}1WV4Lbi0MO,8F1H1EmeEPqQ?r06"}, -/*1176,8*/ {"ᄀ HANGUL CHOSEONG KIYEOK",";gj0}}-I"}, -/*588,6*/ {"ᄂ HANGUL CHOSEONG NIEUN","(zk0Vr"}, -/*1180,13*/ {"ᄃ HANGUL CHOSEONG TIKEUT","(+i0MAj20}}-I"}, -/*599,11*/ {"ᄅ HANGUL CHOSEONG RIEUL","A,i0?2#30Vr"}, -/*591,11*/ {"ᄆ HANGUL CHOSEONG MIEUM","A-i0EIS40Vr"}, -/*1179,13*/ {"ᄇ HANGUL CHOSEONG PIEUP","Y-i0EY]40}}-I"}, -/*1177,12*/ {"ᄉ HANGUL CHOSEONG SIOS","w-i0IC60}}-I"}, -/*590,11*/ {"ᄋ HANGUL CHOSEONG IEUNG","(-i06^U70Vr"}, -/*1177,12*/ {"ᄌ HANGUL CHOSEONG CIEUC","^-i0Q`70}}-I"}, -/*588,6*/ {"ᄎ HANGUL CHOSEONG CHIEUCH","I}r0Vr"}, -/*588,6*/ {"ᄏ HANGUL CHOSEONG KHIEUKH","wqs0Vr"}, -/*589,10*/ {"ᄐ HANGUL CHOSEONG THIEUTH","2.i02YA0Vr"}, -/*589,10*/ {"ᄑ HANGUL CHOSEONG PHIEUPH","A.i0Y}A0Vr"}, -/*589,10*/ {"ᄒ HANGUL CHOSEONG HIEUH","I.i0(qB0Vr"}, -/*1,4*/ {"ᅙ HANGUL CHOSEONG YEORINHIEUH","Q.i0"}, -/*350,12*/ {"Historic","oh40FN^L80d8"}, -/*118,27*/ {"Compatibility","oJD0#2]5#2IGs0MX5#2OcGcGcGE"}, -}, -{{"Other East Asian Scripts"}, -/*111,40*/ {"Bopomofo","ozC0:4HIt3XAV2bXC06I]B0MO,8F1.MGmeEwgs06"}, -/*142,49*/ {"Hiragana","ozC0:4W#7AZD1zmD1MOF2X1c8eE986G68H86XD6^Bs061R946"}, -/*162,52*/ {"Katakana","ozC0:49978PMV1I2D1rmD1MOF2X1c8eE986eH8MHD6^Bs061R946"}, -/*133,14*/ {"Miao","2591F611F4f1d1"}, -/*156,23*/ {"Mongolian","YX60738t4$t38aFN18,%3H9"}, -/*207,70*/ 
{"Tibetan","2{30%5E8M8M8M8M8M8|8Ef2UqC?8l4f468ek8cec8M8M8M8M8M8|8E8N18kW6Ii806e,Gs"}, -/*1240,30*/ {"Yi","oRg0-18}}-FL.U06e,Gs^rT0IG10@4"}, -/*48,6*/ {"Historic - Lisu","oph0N4"}, -/*51,8*/ {"Historic - Manichaean","^l,0d3W|"}, -/*83,13*/ {"Historic - New Tai Lue","Y%60@3WN2m?O6"}, -/*73,6*/ {"Historic - Old Turkic","2>,0l6"}, -/*83,26*/ {"Historic - Phags Pa","wU6068AU606e,Gs2*V0}4w|M0M"}, -/*29,11*/ {"Historic - Psalter Pahlavi","Y%,0l1uM91k"}, -/*35,8*/ {"Historic - Tai Le","2z60t2GU"}, -/*4,5*/ {"Compatibility - Bopomofo","Ql)0M"}, -/*24,21*/ {"Compatibility - Hiragana","^%C0996G1MF1gas0U2E$0"}, -/*213,29*/ {"Compatibility - Katakana","^%C0996]8PDF1vRF48@7g`r0N18}3"}, -/*22,30*/ {"Compatibility - Tibetan","A|30]4.WWW91.868$n1.WWW91YX#0M"}, -/*4,5*/ {"Compatibility - Yi","Ql)0M"}, -}, -{{"Han - Other"}, -/*149,15*/ {"CJK Strokes","AQC0N28M8d7H%F3"}, -/*12,5*/ {"Ideographic Description","oxC0|"}, -/*21003,66*/ {"Other","AzC0d18V2GmOUY=70}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}du({O06"}, -/*1383,65*/ {"Compatibility","^SC0n791VJ]8E9iF3f5V4X1|%CF2[U%8#2;8q0dOG8G,88G6O76Gl9YH10Modj1Fn"}, -/*59438,155*/ {"Less Common","A(D0}}}}}}}}}}}}N,^oj06886[886GEwL+0}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}#A%3}}}}}}}}7P117KG}}}}}}}}}}#_"}, -}, -{{"TODO"}, -/*29,40*/ {"Missing","A-80A4R1f6G686G6W918u]5W6$un2We8Eu]U6nQ6"}, -}, - }; +public class CharData { + public static String[][] CHARACTERS_TO_NAME = { + {" ", "SPACE"}, + {"-", "HYPHEN-MINUS"}, + {" ", "NO-BREAK SPACE"}, + {"­", "SOFT HYPHEN"}, + {"͏", "COMBINING GRAPHEME JOINER"}, + {"֊", "ARMENIAN HYPHEN"}, + {"־", "HEBREW PUNCTUATION MAQAF"}, + {"؀", "ARABIC NUMBER SIGN"}, + {"؁", "ARABIC SIGN SANAH"}, + {"؂", "ARABIC FOOTNOTE MARKER"}, + {"؃", "ARABIC SIGN SAFHA"}, + {"؄", "ARABIC SIGN SAMVAT"}, + {"؅", "ARABIC NUMBER MARK ABOVE"}, + {"؜", "ARABIC LETTER MARK"}, + {"۝", "ARABIC END OF AYAH"}, + {"܏", "SYRIAC ABBREVIATION MARK"}, + {"ᅟ", "HANGUL CHOSEONG FILLER"}, + {"ᅠ", 
"HANGUL JUNGSEONG FILLER"}, + {"᐀", "CANADIAN SYLLABICS HYPHEN"}, + {" ", "OGHAM SPACE MARK"}, + {"឴", "KHMER VOWEL INHERENT AQ"}, + {"឵", "KHMER VOWEL INHERENT AA"}, + {"᠆", "MONGOLIAN TODO SOFT HYPHEN"}, + {"᠋", "MONGOLIAN FREE VARIATION SELECTOR ONE"}, + {"᠌", "MONGOLIAN FREE VARIATION SELECTOR TWO"}, + {"᠍", "MONGOLIAN FREE VARIATION SELECTOR THREE"}, + {"᠎", "MONGOLIAN VOWEL SEPARATOR"}, + {" ", "EN QUAD"}, + {" ", "EM QUAD"}, + {" ", "EN SPACE"}, + {" ", "EM SPACE"}, + {" ", "THREE-PER-EM SPACE"}, + {" ", "FOUR-PER-EM SPACE"}, + {" ", "SIX-PER-EM SPACE"}, + {" ", "FIGURE SPACE"}, + {" ", "PUNCTUATION SPACE"}, + {" ", "THIN SPACE"}, + {" ", "HAIR SPACE"}, + {"​", "ZERO WIDTH SPACE"}, + {"‌", "ZERO WIDTH NON-JOINER"}, + {"‍", "ZERO WIDTH JOINER"}, + {"‎", "LEFT-TO-RIGHT MARK"}, + {"‏", "RIGHT-TO-LEFT MARK"}, + {"‐", "HYPHEN"}, + {"‑", "NON-BREAKING HYPHEN"}, + {"‒", "FIGURE DASH"}, + {"–", "EN DASH"}, + {"—", "EM DASH"}, + {"―", "HORIZONTAL BAR"}, + {"\u2028", "LINE SEPARATOR"}, + {"\u2029", "PARAGRAPH SEPARATOR"}, + {"‪", "LEFT-TO-RIGHT EMBEDDING"}, + {"‫", "RIGHT-TO-LEFT EMBEDDING"}, + {"‬", "POP DIRECTIONAL FORMATTING"}, + {"‭", "LEFT-TO-RIGHT OVERRIDE"}, + {"‮", "RIGHT-TO-LEFT OVERRIDE"}, + {" ", "NARROW NO-BREAK SPACE"}, + {" ", "MEDIUM MATHEMATICAL SPACE"}, + {"⁠", "WORD JOINER"}, + {"⁡", "FUNCTION APPLICATION"}, + {"⁢", "INVISIBLE TIMES"}, + {"⁣", "INVISIBLE SEPARATOR"}, + {"⁤", "INVISIBLE PLUS"}, + {"⁦", "LEFT-TO-RIGHT ISOLATE"}, + {"⁧", "RIGHT-TO-LEFT ISOLATE"}, + {"⁨", "FIRST STRONG ISOLATE"}, + {"⁩", "POP DIRECTIONAL ISOLATE"}, + {"⸗", "DOUBLE OBLIQUE HYPHEN"}, + {"⸚", "HYPHEN WITH DIAERESIS"}, + {"⸺", "TWO-EM DASH"}, + {"⸻", "THREE-EM DASH"}, + {"⹀", "DOUBLE HYPHEN"}, + {" ", "IDEOGRAPHIC SPACE"}, + {"〜", "WAVE DASH"}, + {"〰", "WAVY DASH"}, + {"゠", "KATAKANA-HIRAGANA DOUBLE HYPHEN"}, + {"ㅤ", "HANGUL FILLER"}, + {"︀", "VARIATION SELECTOR-1"}, + {"︁", "VARIATION SELECTOR-2"}, + {"︂", "VARIATION SELECTOR-3"}, + {"︃", "VARIATION SELECTOR-4"}, + {"︄", 
"VARIATION SELECTOR-5"}, + {"︅", "VARIATION SELECTOR-6"}, + {"︆", "VARIATION SELECTOR-7"}, + {"︇", "VARIATION SELECTOR-8"}, + {"︈", "VARIATION SELECTOR-9"}, + {"︉", "VARIATION SELECTOR-10"}, + {"︊", "VARIATION SELECTOR-11"}, + {"︋", "VARIATION SELECTOR-12"}, + {"︌", "VARIATION SELECTOR-13"}, + {"︍", "VARIATION SELECTOR-14"}, + {"︎", "VARIATION SELECTOR-15"}, + {"️", "VARIATION SELECTOR-16"}, + {"︱", "PRESENTATION FORM FOR VERTICAL EM DASH"}, + {"︲", "PRESENTATION FORM FOR VERTICAL EN DASH"}, + {"﹘", "SMALL EM DASH"}, + {"﹣", "SMALL HYPHEN-MINUS"}, + {"", "ZERO WIDTH NO-BREAK SPACE"}, + {"-", "FULLWIDTH HYPHEN-MINUS"}, + {"ᅠ", "HALFWIDTH HANGUL FILLER"}, + {"", "INTERLINEAR ANNOTATION ANCHOR"}, + {"", "INTERLINEAR ANNOTATION SEPARATOR"}, + {"", "INTERLINEAR ANNOTATION TERMINATOR"}, + {"𑂽", "KAITHI NUMBER SIGN"}, + {"𛲠", "SHORTHAND FORMAT LETTER OVERLAP"}, + {"𛲡", "SHORTHAND FORMAT CONTINUING OVERLAP"}, + {"𛲢", "SHORTHAND FORMAT DOWN STEP"}, + {"𛲣", "SHORTHAND FORMAT UP STEP"}, + {"𝅳", "MUSICAL SYMBOL BEGIN BEAM"}, + {"𝅴", "MUSICAL SYMBOL END BEAM"}, + {"𝅵", "MUSICAL SYMBOL BEGIN TIE"}, + {"𝅶", "MUSICAL SYMBOL END TIE"}, + {"𝅷", "MUSICAL SYMBOL BEGIN SLUR"}, + {"𝅸", "MUSICAL SYMBOL END SLUR"}, + {"𝅹", "MUSICAL SYMBOL BEGIN PHRASE"}, + {"𝅺", "MUSICAL SYMBOL END PHRASE"}, + {"󠀠", "TAG SPACE"}, + {"󠀡", "TAG EXCLAMATION MARK"}, + {"󠀢", "TAG QUOTATION MARK"}, + {"󠀣", "TAG NUMBER SIGN"}, + {"󠀤", "TAG DOLLAR SIGN"}, + {"󠀥", "TAG PERCENT SIGN"}, + {"󠀦", "TAG AMPERSAND"}, + {"󠀧", "TAG APOSTROPHE"}, + {"󠀨", "TAG LEFT PARENTHESIS"}, + {"󠀩", "TAG RIGHT PARENTHESIS"}, + {"󠀪", "TAG ASTERISK"}, + {"󠀫", "TAG PLUS SIGN"}, + {"󠀬", "TAG COMMA"}, + {"󠀭", "TAG HYPHEN-MINUS"}, + {"󠀮", "TAG FULL STOP"}, + {"󠀯", "TAG SOLIDUS"}, + {"󠀰", "TAG DIGIT ZERO"}, + {"󠀱", "TAG DIGIT ONE"}, + {"󠀲", "TAG DIGIT TWO"}, + {"󠀳", "TAG DIGIT THREE"}, + {"󠀴", "TAG DIGIT FOUR"}, + {"󠀵", "TAG DIGIT FIVE"}, + {"󠀶", "TAG DIGIT SIX"}, + {"󠀷", "TAG DIGIT SEVEN"}, + {"󠀸", "TAG DIGIT EIGHT"}, + {"󠀹", "TAG DIGIT 
NINE"}, + {"󠀺", "TAG COLON"}, + {"󠀻", "TAG SEMICOLON"}, + {"󠀼", "TAG LESS-THAN SIGN"}, + {"󠀽", "TAG EQUALS SIGN"}, + {"󠀾", "TAG GREATER-THAN SIGN"}, + {"󠀿", "TAG QUESTION MARK"}, + {"󠁀", "TAG COMMERCIAL AT"}, + {"󠁁", "TAG LATIN CAPITAL LETTER A"}, + {"󠁂", "TAG LATIN CAPITAL LETTER B"}, + {"󠁃", "TAG LATIN CAPITAL LETTER C"}, + {"󠁄", "TAG LATIN CAPITAL LETTER D"}, + {"󠁅", "TAG LATIN CAPITAL LETTER E"}, + {"󠁆", "TAG LATIN CAPITAL LETTER F"}, + {"󠁇", "TAG LATIN CAPITAL LETTER G"}, + {"󠁈", "TAG LATIN CAPITAL LETTER H"}, + {"󠁉", "TAG LATIN CAPITAL LETTER I"}, + {"󠁊", "TAG LATIN CAPITAL LETTER J"}, + {"󠁋", "TAG LATIN CAPITAL LETTER K"}, + {"󠁌", "TAG LATIN CAPITAL LETTER L"}, + {"󠁍", "TAG LATIN CAPITAL LETTER M"}, + {"󠁎", "TAG LATIN CAPITAL LETTER N"}, + {"󠁏", "TAG LATIN CAPITAL LETTER O"}, + {"󠁐", "TAG LATIN CAPITAL LETTER P"}, + {"󠁑", "TAG LATIN CAPITAL LETTER Q"}, + {"󠁒", "TAG LATIN CAPITAL LETTER R"}, + {"󠁓", "TAG LATIN CAPITAL LETTER S"}, + {"󠁔", "TAG LATIN CAPITAL LETTER T"}, + {"󠁕", "TAG LATIN CAPITAL LETTER U"}, + {"󠁖", "TAG LATIN CAPITAL LETTER V"}, + {"󠁗", "TAG LATIN CAPITAL LETTER W"}, + {"󠁘", "TAG LATIN CAPITAL LETTER X"}, + {"󠁙", "TAG LATIN CAPITAL LETTER Y"}, + {"󠁚", "TAG LATIN CAPITAL LETTER Z"}, + {"󠁛", "TAG LEFT SQUARE BRACKET"}, + {"󠁜", "TAG REVERSE SOLIDUS"}, + {"󠁝", "TAG RIGHT SQUARE BRACKET"}, + {"󠁞", "TAG CIRCUMFLEX ACCENT"}, + {"󠁟", "TAG LOW LINE"}, + {"󠁠", "TAG GRAVE ACCENT"}, + {"󠁡", "TAG LATIN SMALL LETTER A"}, + {"󠁢", "TAG LATIN SMALL LETTER B"}, + {"󠁣", "TAG LATIN SMALL LETTER C"}, + {"󠁤", "TAG LATIN SMALL LETTER D"}, + {"󠁥", "TAG LATIN SMALL LETTER E"}, + {"󠁦", "TAG LATIN SMALL LETTER F"}, + {"󠁧", "TAG LATIN SMALL LETTER G"}, + {"󠁨", "TAG LATIN SMALL LETTER H"}, + {"󠁩", "TAG LATIN SMALL LETTER I"}, + {"󠁪", "TAG LATIN SMALL LETTER J"}, + {"󠁫", "TAG LATIN SMALL LETTER K"}, + {"󠁬", "TAG LATIN SMALL LETTER L"}, + {"󠁭", "TAG LATIN SMALL LETTER M"}, + {"󠁮", "TAG LATIN SMALL LETTER N"}, + {"󠁯", "TAG LATIN SMALL LETTER O"}, + {"󠁰", "TAG LATIN SMALL 
LETTER P"}, + {"󠁱", "TAG LATIN SMALL LETTER Q"}, + {"󠁲", "TAG LATIN SMALL LETTER R"}, + {"󠁳", "TAG LATIN SMALL LETTER S"}, + {"󠁴", "TAG LATIN SMALL LETTER T"}, + {"󠁵", "TAG LATIN SMALL LETTER U"}, + {"󠁶", "TAG LATIN SMALL LETTER V"}, + {"󠁷", "TAG LATIN SMALL LETTER W"}, + {"󠁸", "TAG LATIN SMALL LETTER X"}, + {"󠁹", "TAG LATIN SMALL LETTER Y"}, + {"󠁺", "TAG LATIN SMALL LETTER Z"}, + {"󠁻", "TAG LEFT CURLY BRACKET"}, + {"󠁼", "TAG VERTICAL LINE"}, + {"󠁽", "TAG RIGHT CURLY BRACKET"}, + {"󠁾", "TAG TILDE"}, + {"󠄀", "VARIATION SELECTOR-17"}, + {"󠄁", "VARIATION SELECTOR-18"}, + {"󠄂", "VARIATION SELECTOR-19"}, + {"󠄃", "VARIATION SELECTOR-20"}, + {"󠄄", "VARIATION SELECTOR-21"}, + {"󠄅", "VARIATION SELECTOR-22"}, + {"󠄆", "VARIATION SELECTOR-23"}, + {"󠄇", "VARIATION SELECTOR-24"}, + {"󠄈", "VARIATION SELECTOR-25"}, + {"󠄉", "VARIATION SELECTOR-26"}, + {"󠄊", "VARIATION SELECTOR-27"}, + {"󠄋", "VARIATION SELECTOR-28"}, + {"󠄌", "VARIATION SELECTOR-29"}, + {"󠄍", "VARIATION SELECTOR-30"}, + {"󠄎", "VARIATION SELECTOR-31"}, + {"󠄏", "VARIATION SELECTOR-32"}, + {"󠄐", "VARIATION SELECTOR-33"}, + {"󠄑", "VARIATION SELECTOR-34"}, + {"󠄒", "VARIATION SELECTOR-35"}, + {"󠄓", "VARIATION SELECTOR-36"}, + {"󠄔", "VARIATION SELECTOR-37"}, + {"󠄕", "VARIATION SELECTOR-38"}, + {"󠄖", "VARIATION SELECTOR-39"}, + {"󠄗", "VARIATION SELECTOR-40"}, + {"󠄘", "VARIATION SELECTOR-41"}, + {"󠄙", "VARIATION SELECTOR-42"}, + {"󠄚", "VARIATION SELECTOR-43"}, + {"󠄛", "VARIATION SELECTOR-44"}, + {"󠄜", "VARIATION SELECTOR-45"}, + {"󠄝", "VARIATION SELECTOR-46"}, + {"󠄞", "VARIATION SELECTOR-47"}, + {"󠄟", "VARIATION SELECTOR-48"}, + {"󠄠", "VARIATION SELECTOR-49"}, + {"󠄡", "VARIATION SELECTOR-50"}, + {"󠄢", "VARIATION SELECTOR-51"}, + {"󠄣", "VARIATION SELECTOR-52"}, + {"󠄤", "VARIATION SELECTOR-53"}, + {"󠄥", "VARIATION SELECTOR-54"}, + {"󠄦", "VARIATION SELECTOR-55"}, + {"󠄧", "VARIATION SELECTOR-56"}, + {"󠄨", "VARIATION SELECTOR-57"}, + {"󠄩", "VARIATION SELECTOR-58"}, + {"󠄪", "VARIATION SELECTOR-59"}, + {"󠄫", "VARIATION 
SELECTOR-60"}, + {"󠄬", "VARIATION SELECTOR-61"}, + {"󠄭", "VARIATION SELECTOR-62"}, + {"󠄮", "VARIATION SELECTOR-63"}, + {"󠄯", "VARIATION SELECTOR-64"}, + {"󠄰", "VARIATION SELECTOR-65"}, + {"󠄱", "VARIATION SELECTOR-66"}, + {"󠄲", "VARIATION SELECTOR-67"}, + {"󠄳", "VARIATION SELECTOR-68"}, + {"󠄴", "VARIATION SELECTOR-69"}, + {"󠄵", "VARIATION SELECTOR-70"}, + {"󠄶", "VARIATION SELECTOR-71"}, + {"󠄷", "VARIATION SELECTOR-72"}, + {"󠄸", "VARIATION SELECTOR-73"}, + {"󠄹", "VARIATION SELECTOR-74"}, + {"󠄺", "VARIATION SELECTOR-75"}, + {"󠄻", "VARIATION SELECTOR-76"}, + {"󠄼", "VARIATION SELECTOR-77"}, + {"󠄽", "VARIATION SELECTOR-78"}, + {"󠄾", "VARIATION SELECTOR-79"}, + {"󠄿", "VARIATION SELECTOR-80"}, + {"󠅀", "VARIATION SELECTOR-81"}, + {"󠅁", "VARIATION SELECTOR-82"}, + {"󠅂", "VARIATION SELECTOR-83"}, + {"󠅃", "VARIATION SELECTOR-84"}, + {"󠅄", "VARIATION SELECTOR-85"}, + {"󠅅", "VARIATION SELECTOR-86"}, + {"󠅆", "VARIATION SELECTOR-87"}, + {"󠅇", "VARIATION SELECTOR-88"}, + {"󠅈", "VARIATION SELECTOR-89"}, + {"󠅉", "VARIATION SELECTOR-90"}, + {"󠅊", "VARIATION SELECTOR-91"}, + {"󠅋", "VARIATION SELECTOR-92"}, + {"󠅌", "VARIATION SELECTOR-93"}, + {"󠅍", "VARIATION SELECTOR-94"}, + {"󠅎", "VARIATION SELECTOR-95"}, + {"󠅏", "VARIATION SELECTOR-96"}, + {"󠅐", "VARIATION SELECTOR-97"}, + {"󠅑", "VARIATION SELECTOR-98"}, + {"󠅒", "VARIATION SELECTOR-99"}, + {"󠅓", "VARIATION SELECTOR-100"}, + {"󠅔", "VARIATION SELECTOR-101"}, + {"󠅕", "VARIATION SELECTOR-102"}, + {"󠅖", "VARIATION SELECTOR-103"}, + {"󠅗", "VARIATION SELECTOR-104"}, + {"󠅘", "VARIATION SELECTOR-105"}, + {"󠅙", "VARIATION SELECTOR-106"}, + {"󠅚", "VARIATION SELECTOR-107"}, + {"󠅛", "VARIATION SELECTOR-108"}, + {"󠅜", "VARIATION SELECTOR-109"}, + {"󠅝", "VARIATION SELECTOR-110"}, + {"󠅞", "VARIATION SELECTOR-111"}, + {"󠅟", "VARIATION SELECTOR-112"}, + {"󠅠", "VARIATION SELECTOR-113"}, + {"󠅡", "VARIATION SELECTOR-114"}, + {"󠅢", "VARIATION SELECTOR-115"}, + {"󠅣", "VARIATION SELECTOR-116"}, + {"󠅤", "VARIATION SELECTOR-117"}, + {"󠅥", "VARIATION 
SELECTOR-118"}, + {"󠅦", "VARIATION SELECTOR-119"}, + {"󠅧", "VARIATION SELECTOR-120"}, + {"󠅨", "VARIATION SELECTOR-121"}, + {"󠅩", "VARIATION SELECTOR-122"}, + {"󠅪", "VARIATION SELECTOR-123"}, + {"󠅫", "VARIATION SELECTOR-124"}, + {"󠅬", "VARIATION SELECTOR-125"}, + {"󠅭", "VARIATION SELECTOR-126"}, + {"󠅮", "VARIATION SELECTOR-127"}, + {"󠅯", "VARIATION SELECTOR-128"}, + {"󠅰", "VARIATION SELECTOR-129"}, + {"󠅱", "VARIATION SELECTOR-130"}, + {"󠅲", "VARIATION SELECTOR-131"}, + {"󠅳", "VARIATION SELECTOR-132"}, + {"󠅴", "VARIATION SELECTOR-133"}, + {"󠅵", "VARIATION SELECTOR-134"}, + {"󠅶", "VARIATION SELECTOR-135"}, + {"󠅷", "VARIATION SELECTOR-136"}, + {"󠅸", "VARIATION SELECTOR-137"}, + {"󠅹", "VARIATION SELECTOR-138"}, + {"󠅺", "VARIATION SELECTOR-139"}, + {"󠅻", "VARIATION SELECTOR-140"}, + {"󠅼", "VARIATION SELECTOR-141"}, + {"󠅽", "VARIATION SELECTOR-142"}, + {"󠅾", "VARIATION SELECTOR-143"}, + {"󠅿", "VARIATION SELECTOR-144"}, + {"󠆀", "VARIATION SELECTOR-145"}, + {"󠆁", "VARIATION SELECTOR-146"}, + {"󠆂", "VARIATION SELECTOR-147"}, + {"󠆃", "VARIATION SELECTOR-148"}, + {"󠆄", "VARIATION SELECTOR-149"}, + {"󠆅", "VARIATION SELECTOR-150"}, + {"󠆆", "VARIATION SELECTOR-151"}, + {"󠆇", "VARIATION SELECTOR-152"}, + {"󠆈", "VARIATION SELECTOR-153"}, + {"󠆉", "VARIATION SELECTOR-154"}, + {"󠆊", "VARIATION SELECTOR-155"}, + {"󠆋", "VARIATION SELECTOR-156"}, + {"󠆌", "VARIATION SELECTOR-157"}, + {"󠆍", "VARIATION SELECTOR-158"}, + {"󠆎", "VARIATION SELECTOR-159"}, + {"󠆏", "VARIATION SELECTOR-160"}, + {"󠆐", "VARIATION SELECTOR-161"}, + {"󠆑", "VARIATION SELECTOR-162"}, + {"󠆒", "VARIATION SELECTOR-163"}, + {"󠆓", "VARIATION SELECTOR-164"}, + {"󠆔", "VARIATION SELECTOR-165"}, + {"󠆕", "VARIATION SELECTOR-166"}, + {"󠆖", "VARIATION SELECTOR-167"}, + {"󠆗", "VARIATION SELECTOR-168"}, + {"󠆘", "VARIATION SELECTOR-169"}, + {"󠆙", "VARIATION SELECTOR-170"}, + {"󠆚", "VARIATION SELECTOR-171"}, + {"󠆛", "VARIATION SELECTOR-172"}, + {"󠆜", "VARIATION SELECTOR-173"}, + {"󠆝", "VARIATION SELECTOR-174"}, + {"󠆞", "VARIATION 
SELECTOR-175"}, + {"󠆟", "VARIATION SELECTOR-176"}, + {"󠆠", "VARIATION SELECTOR-177"}, + {"󠆡", "VARIATION SELECTOR-178"}, + {"󠆢", "VARIATION SELECTOR-179"}, + {"󠆣", "VARIATION SELECTOR-180"}, + {"󠆤", "VARIATION SELECTOR-181"}, + {"󠆥", "VARIATION SELECTOR-182"}, + {"󠆦", "VARIATION SELECTOR-183"}, + {"󠆧", "VARIATION SELECTOR-184"}, + {"󠆨", "VARIATION SELECTOR-185"}, + {"󠆩", "VARIATION SELECTOR-186"}, + {"󠆪", "VARIATION SELECTOR-187"}, + {"󠆫", "VARIATION SELECTOR-188"}, + {"󠆬", "VARIATION SELECTOR-189"}, + {"󠆭", "VARIATION SELECTOR-190"}, + {"󠆮", "VARIATION SELECTOR-191"}, + {"󠆯", "VARIATION SELECTOR-192"}, + {"󠆰", "VARIATION SELECTOR-193"}, + {"󠆱", "VARIATION SELECTOR-194"}, + {"󠆲", "VARIATION SELECTOR-195"}, + {"󠆳", "VARIATION SELECTOR-196"}, + {"󠆴", "VARIATION SELECTOR-197"}, + {"󠆵", "VARIATION SELECTOR-198"}, + {"󠆶", "VARIATION SELECTOR-199"}, + {"󠆷", "VARIATION SELECTOR-200"}, + {"󠆸", "VARIATION SELECTOR-201"}, + {"󠆹", "VARIATION SELECTOR-202"}, + {"󠆺", "VARIATION SELECTOR-203"}, + {"󠆻", "VARIATION SELECTOR-204"}, + {"󠆼", "VARIATION SELECTOR-205"}, + {"󠆽", "VARIATION SELECTOR-206"}, + {"󠆾", "VARIATION SELECTOR-207"}, + {"󠆿", "VARIATION SELECTOR-208"}, + {"󠇀", "VARIATION SELECTOR-209"}, + {"󠇁", "VARIATION SELECTOR-210"}, + {"󠇂", "VARIATION SELECTOR-211"}, + {"󠇃", "VARIATION SELECTOR-212"}, + {"󠇄", "VARIATION SELECTOR-213"}, + {"󠇅", "VARIATION SELECTOR-214"}, + {"󠇆", "VARIATION SELECTOR-215"}, + {"󠇇", "VARIATION SELECTOR-216"}, + {"󠇈", "VARIATION SELECTOR-217"}, + {"󠇉", "VARIATION SELECTOR-218"}, + {"󠇊", "VARIATION SELECTOR-219"}, + {"󠇋", "VARIATION SELECTOR-220"}, + {"󠇌", "VARIATION SELECTOR-221"}, + {"󠇍", "VARIATION SELECTOR-222"}, + {"󠇎", "VARIATION SELECTOR-223"}, + {"󠇏", "VARIATION SELECTOR-224"}, + {"󠇐", "VARIATION SELECTOR-225"}, + {"󠇑", "VARIATION SELECTOR-226"}, + {"󠇒", "VARIATION SELECTOR-227"}, + {"󠇓", "VARIATION SELECTOR-228"}, + {"󠇔", "VARIATION SELECTOR-229"}, + {"󠇕", "VARIATION SELECTOR-230"}, + {"󠇖", "VARIATION SELECTOR-231"}, + {"󠇗", "VARIATION 
SELECTOR-232"}, + {"󠇘", "VARIATION SELECTOR-233"}, + {"󠇙", "VARIATION SELECTOR-234"}, + {"󠇚", "VARIATION SELECTOR-235"}, + {"󠇛", "VARIATION SELECTOR-236"}, + {"󠇜", "VARIATION SELECTOR-237"}, + {"󠇝", "VARIATION SELECTOR-238"}, + {"󠇞", "VARIATION SELECTOR-239"}, + {"󠇟", "VARIATION SELECTOR-240"}, + {"󠇠", "VARIATION SELECTOR-241"}, + {"󠇡", "VARIATION SELECTOR-242"}, + {"󠇢", "VARIATION SELECTOR-243"}, + {"󠇣", "VARIATION SELECTOR-244"}, + {"󠇤", "VARIATION SELECTOR-245"}, + {"󠇥", "VARIATION SELECTOR-246"}, + {"󠇦", "VARIATION SELECTOR-247"}, + {"󠇧", "VARIATION SELECTOR-248"}, + {"󠇨", "VARIATION SELECTOR-249"}, + {"󠇩", "VARIATION SELECTOR-250"}, + {"󠇪", "VARIATION SELECTOR-251"}, + {"󠇫", "VARIATION SELECTOR-252"}, + {"󠇬", "VARIATION SELECTOR-253"}, + {"󠇭", "VARIATION SELECTOR-254"}, + {"󠇮", "VARIATION SELECTOR-255"}, + {"󠇯", "VARIATION SELECTOR-256"}, + }; + public static String[][][] CATEGORIES = { + { + {"Symbol"}, + /*116,6*/ {"Alchemical Symbols@Other ", "A2j1dA"}, + /*589,116*/ { + "Arrows", + "%=68k11I3706:%M%G7AnTMm6e6HDk%`O728F1f4V1PNF2WF1G}58?]514M]Ol1%2l2%1#1GsGV1f172otW06gQ01U:1Un2MH$|W}4$,ml3f1MW|L+%0M" + }, + /*256,6*/ {"Braille", ";oA0FN"}, + /*3,6*/ {"Common Indic Number Forms@Other ", "wgi068"}, + /*39,6*/ {"Control Pictures", "(j90d3"}, + /*53,50*/ {"Currency", "H3XBMQQ10HB(2106uPM]N:qol202S20#2;.Z0^xM0:91E]J6O6"}, + /*613,223*/ { + "Emoji", + ";O906vIMOG%I9UGOun294v1O%1916$6n16]1]1u68WGWGGUGG88:48WE891X4EH2f@695^O1091vs8g0U1nE6916P1G,P9692:1G!]16HH]26G68#18M8-3W#191-1$s888]1M8|f2EG68Gt18#188738cGt58sGt4G8U868Mmm?8EeF1GM$Mv1|fE@18E8U8E8@2W?X4GE88GG8E8886u868kWOGc8" + }, + /*80,6*/ {"Emoticons@Other ", ";(i1F7"}, + /*272,42*/ {"Game Pieces", "Q6A06f5#1H2,]4MeEY[W1@3W}891N1GN18N18N3P#k"}, + /*14,7*/ {"Gender and Genealogical", "2JA0sOc"}, + /*434,65*/ { + "Geometric Shapes", + "oG90nMcPTFNfFEQE10t2v2EO71%26f1cGsH26O|8sX2M;>t0%E6OW6^<$0sW6Xq#5" + }, + /*63,30*/ {"Keyboard and UI", "Qz80XqUGv771.Uv46%7Y^Y1F2mc]1M"}, + /*2,4*/ {"Latin 1 Supplement@Other 
Punctuation", "9FP1"}, + /*1182,227*/ { + "Math", + "wug1M8V2868G8,8M88mW888E868G8888868GM8k8M8M88,8d1eE8U8d1++g1f1E:2v2894WX3:2v+]lEQ?60f2E11OH1P1M]1U11U]571WO6WUv3f111MuUmH6Ue6WGGu:26G8:2NO$M:16H8%2V28H211cvg.]4s9AnU#5PNdkX4-1Gc24P1P2:2P2:2P2:2P2:2P2QB606bf$0:;c8%Ef1Ev28v28]BmM" + }, + /*1052,127*/ { + "Math Alphanumeric", + "w010EGX26G6gy70bm806e2Y806Gce?]Au,8OUmOO68E86uMeU^`Q1t78V686GG6GM8|88k8-58MGs8k8d28M8U8Ok8-UGF28F28#28F28#28F28#28F28#28F28sGd4" + }, + /*470,200*/ { + "Miscellaneous", + "w=B0rzB0GW8Y040Mg%50EHB686WU8l1$Uv4?8En1E8|:29168U8718k8kG8M868M8686e686888,v2M118MO8|8E]7(V10c2tN1cYf8068n2EG6G:1W]3M:1Mm6X3888-1W91,e|O6G86%1:18H3m6%5$6%468eGWc8c11126v1V191t28t38#7X29DuM8E86m8ULN%0" + }, + /*568,29*/ {"Musical", ";DA0k2mO1NM[d3GVH92N6g-80c92s"}, + /*2,5*/ {"Number Forms@Other ", "g=806"}, + /*48,6*/ {"Ornamental Dingbats@Other ", "A;i1N4"}, + /*82,28*/ {"Stars/Asterisks", ";OA0v5l2W,g510E^jW1WV1:lvx-1"}, + /*29,13*/ {"Subscript", "Qq80N1871QC30"}, + /*35,41*/ {"Superscript", "XFX1x6e1oUg2701+6G|nE8I0302QW069JPC6^A}06"}, + /*14,10*/ {"Supplemental Arrows C@Other ", "gcj1sWM916"}, + /*15,10*/ {"Supplemental Symbols And Pictographs@Other ", "Aoj1!X9UX5"}, + /*200,26*/ {"Technical", "gM90-2G6$l7H1!%2N2O?mF2P6?"}, + /*4,5*/ {"Tibetan@Other ", "YG40M"}, + /*98,12*/ {"Transport And Map Symbols@Other ", "g?i1N7X171OM"}, + /*10,9*/ {"Vedic Extensions@Other Letter", "(u70M8MO6"}, + /*79,26*/ {"Weather and Astrological", "Q4A0F1mv3}1v8,uUe^zX171:1|"}, + /*165,20*/ {"Yijing / Tai Xuan Jing", "w8A0sf7c2WA0#5A>E1-7"}, + /*158,16*/ {"Compatibility", "^dh1#28F5m-3:6N2"}, + /*67,14*/ {"Historic", "I{)0%4!P7|%4}3"}, + /*60,18*/ {"Compatibility", "(PD0M(ZU16H1-3e!u6"}, + }, + { + {"Punctuation"}, + /*20,22*/ {"ASCII Based", "]2E8EG886[6O6f2H6eP16u"}, + /*17,28*/ {"Dash/Connector", "14f4gX80c%36%1gu30:26W;2t0XG"}, + /*47,39*/ {"Other", "(s70:<.MO$EGGG8OEms88Iu3068G6n1!GM8(iW0"}, + /*139,93*/ { + "Paired", + 
"n36f48v2894X1;P80sP26[6]46P16nvMPF6f3c1^F1H76:2,va@1%5M]26;7106G,H2Hf,Gs2Ms06nPcXF6f48v288686" + }, + /*50,23*/ {"Historic", "gm808kQT30MnN72v1?(%t0E"}, + /*105,63*/ { + "Compatibility", "Ig80e91E91686W8$EH1X36P162pw0,12-1G|8F18W86nDE8c8M[6O6X2E8f2886" + }, + }, + { + {"Number"}, + /*500,166*/ { + "Decimal", + "P4,]A6egh10,HC,1I,fb,%A,%A,%A,%A,%A,%A,%A,%A,%A,%8,%A,X6,PP,X6,Q]10,f3,PR,vB,9F,m,nG,]K,m,A710Ocm,^SZ0,vz,f3,1I,12,:7,]a,w{L0,oo40,vB,f5,9D,PP,%g,1Y,P9,P9,Xc,;lL0,]K," + }, + /*198,37*/ {"Enclosed/Dotted", "gs90#7%4@1Pvt2g+20,%2s8N1]2,n3N1g2U16"}, + /*40,20*/ {"Fractions/Related", "9G6eGEoX80Ocm,1IV1%3"}, + /*427,153*/ { + "Other", + "ot20cHYc]AE9Ck]Lcvd,^910#1oF10,vh2}1073GMQ:30P2!P1EHVMI2V0,9TcA|N0V2(a10sP2kn3!:6U9H6GV1G74XB6%2E:6Uf9sH2s%3k1Uc1W#2fg#1fY#1wY1069d!;+L0kIiR0l1gu50!oE20?" + }, + /*271,60*/ {"Historic", "o560EgM10,Yk10EGMo230w6u0}39175n16%aMv2$HCUXI,^E10cnQso,60}9"}, + /*60,24*/ {"Compatibility", "w.80-2o?30EHVM2Us0,w{#0?"}, + }, + { + {"Format & Whitespace"}, + /*140,53*/ {"Format", "vF;Z10c12o%40;920UX2Uf4U8M2n#0Iej0MQi50sY)W9l8bk0AvME"}, + /*262,22*/ {"Variation Selector", "]=oY506%7E^$zA#LDF1AV1"}, + /*18,25*/ {"Whitespace", "^a)05Y)0nBQQ80,n26eP4wB40"}, + /*7,23*/ {"Historic", "w-10f4^#206IV10(970ols0"}, + /*16,19*/ {"Compatibility", "fEAQ80?P3P4wB40^@s0"}, + }, + { + {"Modifier"}, + /*8,11*/ {"Enclosing", "Q670Ys10M8E"}, + /*225,67*/ { + "Nonspacing", "%+#5GG,8t1QE60F1HmE8718kWmO6XI,P2N1m6v%71WO|A(x0Yss0En1sGk%2MT_t0F1" + }, + /*132,54*/ {"Spacing", "f!!.M%3M91gz30(C30f1695E8?8l18d2X4N32D40XH2zW0]ZUo@|0U"}, + /*47,22*/ {"Historic", "%?71HP62x60M[F2926^Py0"}, + /*4,5*/ {"Compatibility", "n<686"}, + }, + { + {"Latin"}, + /*403,153*/ { + "Common", + ":5N2mN2P6}18#28V1G,GcGcGcGMW68cGs8MGcGMGMGsGd1GWG6OU8GEOG6H168E11M.s$$6f16%2MG6P3P168688uW.128$IN706126f16m6W6:16m6$6P16Gc916[878QAa06zph0696UG6OX2.o2706" + }, + /*89,16*/ {"Enclosed", "Q!90t4Y#X1M8-2:5"}, + /*105,257*/ { + "Flipped/Mirrored", + 
"]r=i1jKjnjQq40L!401GCpwGi0Trh04pM83:liJK1qQMnmaJQE10jm10(;50Lj50wX50{W50A1i0TJd0bB506(T40v]a8zE50I0105010IUi0{Zh0:7=w*Uc:V%Dih:h`h9X%B41n1WSL1Qau9q`jh_Bnm4lPm*mHn6amfmSmH6;+80j630Lj50wX50{W50QW80P1T#806f=^Y40(d30gtZ0bUi06AL10D9102g70+M70(#80+q80P3*jA#80{z80" + }, + /*729,223*/ { + "Other", + "]N6[6m6m6m6m6G]16m6W6W6$6v186O6G6m86OE86GUGGEGOEv2s8sG!OEOt2$F38?A570@3%5718}2H9|G@1GV1GcGMG#1GcGsGF1G6m|GcXyf2o]20}1u62cW0F1v6N1e@2Gs%5Gv;-3eUDKj081s868EG?8E8EGcu8E8UGEw^60t5H193N3v!H1f171X9O11G6e6O88m11X186IWZ072f9E]96%?M" + }, + /*183,87*/ { + "Phonetics (IPA)", + "%8N2%96$uH4H3u:9M%CF28718M868UO?86G68E8868GHOeP1SPE8GW11OO6918Of26868886OV3WU%2Wg|70EO6" + }, + /*24,20*/ {"Phonetics (X-IPA)", "1uH1WGeE11G6GO8G868s"}, + /*148,53*/ {"Historic", "HZ6uP268691s15P361Jd1oQ7068H8cHw!Y?20kAZW0sH26P1l6:BU"}, + /*358,91*/ { + "Compatibility", + "HF8WWO8:A6116v5H6!P3E%KcgT706vtM8E8?86GUGE8O8M8E86W8.U12-2X.}6;l30HBMvE,et8:2Qtq0kg710N2mN2" + }, + }, + { + {"Other European Scripts"}, + /*303,20*/ {"Cyrillic", "2510#B$}E`uHfWE;1(06"}, + /*277,79*/ { + "Greek", + "P]m8E88#18@3P3$wC70@1GcGV3GcGs8888l1888888O#48U8eE8E88OEOUeE8k8eE8E88Y=a0bai06W" + }, + /*53,8*/ {"Historic - Caucasian Albanian", "g6+0t411"}, + /*130,17*/ {"Historic - Cyrillic", "^G106g^A0-2o,V0t8"}, + /*143,15*/ {"Historic - Duployan", "2bT1t9e71O!u,GM"}, + /*40,6*/ {"Historic - Elbasan", "A2+0l3"}, + /*94,9*/ {"Historic - Glagolitic", "^tB0F48F4"}, + /*27,6*/ {"Historic - Gothic", "^l*0V2"}, + /*183,34*/ {"Historic - Greek", "]@MG6OEX7EO71f18GU8E;{(0@6%1Y9t0N6"}, + /*341,11*/ {"Historic - Linear A", "YP+0FS.@1[s"}, + /*211,23*/ {"Historic - Linear B", "(z)0|8N28t1868N1GF1937B"}, + /*29,6*/ {"Historic - Ogham", "o_50l2"}, + /*108,12*/ {"Historic - Old Hungarian", "w0-0l4H1l4uc"}, + /*36,6*/ {"Historic - Old Italic", "oh*0F3"}, + /*43,6*/ {"Historic - Old Permic", ";o*0-3"}, + /*89,6*/ {"Historic - Runic", "g|50}7"}, + /*48,6*/ {"Historic - Shavian", "A;*0N4"}, + /*44,46*/ {"Compatibility - Greek", 
"XG%$e68%6Ef26OoN70888888n58Uu88EOu8EOu8E.886:Q"}, + }, + { + {"American Scripts"}, + /*710,20*/ {"Canadian Aboriginal", "gP50NuGd1]oN6TR10Xu6"}, + /*172,14*/ {"Historic - Cherokee", "wG50#7Gco4e0F7"}, + /*80,6*/ {"Historic - Deseret", ";(*0F7"}, + /*672,12*/ {"Historic - SignWriting", "w.a1FxX1U8N1"}, + }, + { + {"African Scripts"}, + /*495,87*/ { + "Ethiopic", + ";(40l68MGk88MGt38MG@28MGk88MGN18758MG}5X3V1w<60}1.k8k8k8k8k8k8k8kI8X0cGcGc.k8kDDe0-2%1," + }, + /*59,10*/ {"Tifinagh", "o_B0}4u6P1"}, + /*657,12*/ {"Historic - Bamum", "(5i0@7Y4p0tp"}, + /*36,8*/ {"Historic - Bassa Vah", "o_71t2Gc"}, + /*137,14*/ {"Historic - Coptic", "Q210F12$A0dAek"}, + /*1071,8*/ {"Historic - Egyptian Hieroglyphs", ";Y[0}}N9"}, + /*213,9*/ {"Historic - Mende Kikakui", "25f1-HGV1"}, + /*90,12*/ {"Historic - Meroitic Cursive", "(L,072W#1G74"}, + /*32,6*/ {"Historic - Meroitic Hieroglyphs", ";I,0-2"}, + /*59,6*/ {"Historic - Nko", "Q420N5"}, + /*40,8*/ {"Historic - Osmanya", "g?*0t2G,"}, + /*300,6*/ {"Historic - Vai", "^th0FR"}, + }, + { + {"Middle Eastern Scripts"}, + /*357,84*/ { + "Arabic", + "gs10V2m,f368W-18F68H26[EGP774XQ-1A}$05!%0U8N1mG6]2[73G19f2,O61il2^A+0#2YVx06{S$0V1]p" + }, + /*88,17*/ {"Armenian", "(W10V3[V344k%36GE"}, + /*44,20*/ {"Georgian", "Yc40eG@2mMGEz230Y230"}, + /*53,19*/ {"Hebrew", "Il10V2eE`5#1P46o:$0"}, + /*583,6*/ {"Historic - Anatolian Hieroglyphs", "Qy{0@q"}, + /*39,22*/ {"Historic - Arabic", "gr10c]2UH46%2f6k8V19D6"}, + /*49,6*/ {"Historic - Carian", ";Y*0V4"}, + /*1234,16*/ {"Historic - Cuneiform", "gE=0#_P9}98U11#H"}, + /*55,13*/ {"Historic - Cypriot", "^-+0cG8@386OG"}, + /*85,18*/ {"Historic - Georgian", ";Y40V3]3cW2a70V38e"}, + /*26,10*/ {"Historic - Hatran", "Q4,0t186eU"}, + /*45,16*/ {"Historic - Hebrew", "gf10#2:1M;>$0!f3"}, + /*29,6*/ {"Historic - Lycian", "^V*0l2"}, + /*27,7*/ {"Historic - Lydian", "AA,0N2e"}, + /*40,8*/ {"Historic - Nabataean", "o_+0#2$!"}, + /*32,6*/ {"Historic - Old North Arabian", "Ag,0-2"}, + /*32,6*/ {"Historic - Old South 
Arabian", "Id,0-2"}, + /*32,6*/ {"Historic - Palmyrene", "w[+0-2"}, + /*29,7*/ {"Historic - Phoenician", "I7,0d2O"}, + /*61,9*/ {"Historic - Samaritan", "AA2074GN1"}, + /*93,22*/ {"Historic - Syriac", "wq10P1O]2[?X21DF18V5GE"}, + /*31,7*/ {"Historic - Ugaritic", "It*0t28"}, + /*876,89*/ { + "Compatibility - Arabic", + "I!10MA($0-813@Wv1#5G-4v371fAE88FCgI#0M8V2868G8,8M88mW888E868G8888868GM8k8M8M88,8d1eE8U8d1" + }, + /*6,9*/ {"Compatibility - Armenian", "oe10g^$0U"}, + /*35,17*/ {"Compatibility - Hebrew", "2a(08.F18U886868!"}, + }, + { + {"South Asian Scripts"}, + /*94,49*/ {"Bengali", "2j20WsG6G@18k8OMOf1n16P16%+6y^6f2E958kG6GE.[6G,G,"}, + /*152,51*/ {"Devanagari", "(X20-4Ov1X16f1F19[6gMf0cO8TRg0M]4E8l18k[V1YEg0l1mE8"}, + /*87,43*/ {"Gujarati", "(*20!8E8@18k868UOv1X1692%j6*uE958s8E8E:16G|"}, + /*77,43*/ {"Gurmukhi", "Av20cW6G@18k8GG693]1E:v6y^69EE958UW6GEO:1|O"}, + /*86,39*/ {"Kannada", "QR30s8E8}18,8UO936X16KAE958k8E8Mu6116G,"}, + /*102,39*/ {"Malayalam", "Ab306v1s8E8t3Gf1f1EH2caBEP5k8E8M.[6GV1O"}, + /*92,47*/ {"Oriya", "Y[20sG6G@18k868UO13EX1:Y6y^6XbE958kG6GE$6[6G?8c"}, + /*110,34*/ {"Sinhala", "oo30l1O728!8GkC66X6Wc88sm,GEA%*0#1"}, + /*74,40*/ {"Tamil", ";3308cOE8MO6886O6OEO|12HQ6aXX5UOE8M.P1-1"}, + /*98,41*/ {"Telugu", "wF30s8E8}18V1OX2Ee61D6`LMP5k8E8Mu6116G,$s"}, + /*63,19*/ {"Thaana", "g|10V311KcP1O:5,%S?"}, + /*57,12*/ {"Historic - Ahom", "2+:0N2ON1WV1"}, + /*61,8*/ {"Historic - Avestan", "(r,0-4Ok"}, + /*109,11*/ {"Historic - Brahmi", "A^-0}6Wt2X1"}, + /*67,9*/ {"Historic - Chakma", "YH.0#48F1"}, + /*85,32*/ {"Historic - Grantha", "^#.0M8sG6G@18k868UG!G6GEGmekGkOU"}, + /*31,8*/ {"Historic - Imperial Aramaic", "(>+0@18!"}, + /*27,8*/ {"Historic - Inscriptional Pahlavi", "g!,0t1es"}, + /*30,8*/ {"Historic - Inscriptional Parthian", "ox,0@1Gs"}, + /*66,6*/ {"Historic - Kaithi", "(5.0@5"}, + /*1,4*/ {"Historic - Kannada", "YZ30"}, + /*65,20*/ {"Historic - Kharoshthi", "gU,0M86es8E8V2WEW!$!"}, + /*61,9*/ {"Historic - Khojki", "we.0l18-3"}, + /*69,8*/ {"Historic - 
Khudawadi", "wu.0N5e,"}, + /*74,11*/ {"Historic - Lepcha", "oZ70}4ON1OE"}, + /*68,13*/ {"Historic - Limbu", "(r60#28|W|WO|"}, + /*39,6*/ {"Historic - Mahajani", "wO.0d3"}, + /*29,7*/ {"Historic - Mandaic", "^F20d2G"}, + /*79,12*/ {"Historic - Meetei Mayek", "wGj0}1PI74G,"}, + /*79,9*/ {"Historic - Modi", "(j:0F611,"}, + /*43,10*/ {"Historic - Mro", "g,71#28,W6"}, + /*38,13*/ {"Historic - Multani", "Qq.0k88M8N18?"}, + /*48,6*/ {"Historic - Ol Chiki", ";g70N4"}, + /*50,9*/ {"Historic - Old Persian", "Aw*0F3WF1"}, + /*81,8*/ {"Historic - Saurashtra", "Yni0F6.|"}, + /*94,9*/ {"Historic - Sharada", "2T.0}6GV1"}, + /*92,9*/ {"Historic - Siddham", "AY:0-4GV3"}, + /*35,8*/ {"Historic - Sora Sompeng", "2D.0F2u,"}, + /*44,6*/ {"Historic - Syloti Nagri", "(bi0@3"}, + /*66,8*/ {"Historic - Takri", "Yv:0}4$,"}, + /*82,8*/ {"Historic - Tirhuta", ";A:0d6$,"}, + /*84,8*/ {"Historic - Warang Citi", ";I;0d791"}, + /*3,6*/ {"Compatibility - Bengali", "Yr2068"}, + /*8,5*/ {"Compatibility - Devanagari", "Yf20s"}, + /*6,9*/ {"Compatibility - Gurmukhi", "Qz20G93EG"}, + /*2,5*/ {"Compatibility - Oriya", "Q0306"}, + }, + { + {"Southeast Asian Scripts"}, + /*141,26*/ {"Khmer", ";I6073GE8?v3q3l28,W,m,Hi-2"}, + /*67,41*/ {"Lao", "g:3068G68GmM8k8E88G68M86.GU92MC4Gc86.8cG,"}, + /*223,79*/ { + "Myanmar", + "QK40-3:1f1cWMOO6uEW7191Ame0UG![U:8V18cOO6r-e0#18V1mMWE8EGkOMH1|8d12le06.,%AmE8E" + }, + /*86,19*/ {"Thai", ";z30N48691c*1Gk11@1"}, + /*121,9*/ {"Historic - Balinese", "QC70-6W}3"}, + /*56,8*/ {"Historic - Batak", "(T70t4$M"}, + /*30,8*/ {"Historic - Buginese", "2>60d2G6"}, + /*22,8*/ {"Historic - Buhid", "2C606.#1"}, + /*83,13*/ {"Historic - Cham", "Q`i0@4.F1G,GM"}, + /*23,6*/ {"Historic - Hanunoo", "AA60}1"}, + /*90,10*/ {"Historic - Javanese", "w.i0}6G,W6"}, + /*47,7*/ {"Historic - Kayah Li", "2zi0748"}, + /*3,8*/ {"Historic - Khmer", "gM60v311"}, + /*127,16*/ {"Historic - Pahawh Hmong", "^}71N6[,8k8-1et1"}, + /*57,6*/ {"Historic - Pau Cin Hau", "Q*;075"}, + /*37,8*/ {"Historic - Rejang", 
"Y%i0F311"}, + /*72,9*/ {"Historic - Sundanese", "^N70#5PNs"}, + /*22,11*/ {"Historic - Tagalog", "I760718k]26"}, + /*20,13*/ {"Historic - Tagbanwa", "2C606%3718E86"}, + /*127,16*/ {"Historic - Tai Tham", "^@60t58l2G?m,mF1"}, + /*72,9*/ {"Historic - Tai Viet", "^7j0}5H2U"}, + }, + { + {"Hangul"}, + /*112,38*/ {"Other", "ozC0:42Pi0}1WV4Lbi0MO,8F1H1EmeEPqQ?r06"}, + /*1176,8*/ {"ᄀ HANGUL CHOSEONG KIYEOK", ";gj0}}-I"}, + /*588,6*/ {"ᄂ HANGUL CHOSEONG NIEUN", "(zk0Vr"}, + /*1180,13*/ {"ᄃ HANGUL CHOSEONG TIKEUT", "(+i0MAj20}}-I"}, + /*599,11*/ {"ᄅ HANGUL CHOSEONG RIEUL", "A,i0?2#30Vr"}, + /*591,11*/ {"ᄆ HANGUL CHOSEONG MIEUM", "A-i0EIS40Vr"}, + /*1179,13*/ {"ᄇ HANGUL CHOSEONG PIEUP", "Y-i0EY]40}}-I"}, + /*1177,12*/ {"ᄉ HANGUL CHOSEONG SIOS", "w-i0IC60}}-I"}, + /*590,11*/ {"ᄋ HANGUL CHOSEONG IEUNG", "(-i06^U70Vr"}, + /*1177,12*/ {"ᄌ HANGUL CHOSEONG CIEUC", "^-i0Q`70}}-I"}, + /*588,6*/ {"ᄎ HANGUL CHOSEONG CHIEUCH", "I}r0Vr"}, + /*588,6*/ {"ᄏ HANGUL CHOSEONG KHIEUKH", "wqs0Vr"}, + /*589,10*/ {"ᄐ HANGUL CHOSEONG THIEUTH", "2.i02YA0Vr"}, + /*589,10*/ {"ᄑ HANGUL CHOSEONG PHIEUPH", "A.i0Y}A0Vr"}, + /*589,10*/ {"ᄒ HANGUL CHOSEONG HIEUH", "I.i0(qB0Vr"}, + /*1,4*/ {"ᅙ HANGUL CHOSEONG YEORINHIEUH", "Q.i0"}, + /*350,12*/ {"Historic", "oh40FN^L80d8"}, + /*118,27*/ {"Compatibility", "oJD0#2]5#2IGs0MX5#2OcGcGcGE"}, + }, + { + {"Other East Asian Scripts"}, + /*111,40*/ {"Bopomofo", "ozC0:4HIt3XAV2bXC06I]B0MO,8F1.MGmeEwgs06"}, + /*142,49*/ {"Hiragana", "ozC0:4W#7AZD1zmD1MOF2X1c8eE986G68H86XD6^Bs061R946"}, + /*162,52*/ {"Katakana", "ozC0:49978PMV1I2D1rmD1MOF2X1c8eE986eH8MHD6^Bs061R946"}, + /*133,14*/ {"Miao", "2591F611F4f1d1"}, + /*156,23*/ {"Mongolian", "YX60738t4$t38aFN18,%3H9"}, + /*207,70*/ { + "Tibetan", "2{30%5E8M8M8M8M8M8|8Ef2UqC?8l4f468ek8cec8M8M8M8M8M8|8E8N18kW6Ii806e,Gs" + }, + /*1240,30*/ {"Yi", "oRg0-18}}-FL.U06e,Gs^rT0IG10@4"}, + /*48,6*/ {"Historic - Lisu", "oph0N4"}, + /*51,8*/ {"Historic - Manichaean", "^l,0d3W|"}, + /*83,13*/ {"Historic - New Tai Lue", "Y%60@3WN2m?O6"}, + 
/*73,6*/ {"Historic - Old Turkic", "2>,0l6"}, + /*83,26*/ {"Historic - Phags Pa", "wU6068AU606e,Gs2*V0}4w|M0M"}, + /*29,11*/ {"Historic - Psalter Pahlavi", "Y%,0l1uM91k"}, + /*35,8*/ {"Historic - Tai Le", "2z60t2GU"}, + /*4,5*/ {"Compatibility - Bopomofo", "Ql)0M"}, + /*24,21*/ {"Compatibility - Hiragana", "^%C0996G1MF1gas0U2E$0"}, + /*213,29*/ {"Compatibility - Katakana", "^%C0996]8PDF1vRF48@7g`r0N18}3"}, + /*22,30*/ {"Compatibility - Tibetan", "A|30]4.WWW91.868$n1.WWW91YX#0M"}, + /*4,5*/ {"Compatibility - Yi", "Ql)0M"}, + }, + { + {"Han - Other"}, + /*149,15*/ {"CJK Strokes", "AQC0N28M8d7H%F3"}, + /*12,5*/ {"Ideographic Description", "oxC0|"}, + /*21003,66*/ { + "Other", "AzC0d18V2GmOUY=70}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}du({O06" + }, + /*1383,65*/ { + "Compatibility", "^SC0n791VJ]8E9iF3f5V4X1|%CF2[U%8#2;8q0dOG8G,88G6O76Gl9YH10Modj1Fn" + }, + /*59438,155*/ { + "Less Common", + "A(D0}}}}}}}}}}}}N,^oj06886[886GEwL+0}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}#A%3}}}}}}}}7P117KG}}}}}}}}}}#_" + }, + }, + { + {"TODO"}, /*29,40*/ {"Missing", "A-80A4R1f6G686G6W918u]5W6$un2We8Eu]U6nQ6"}, + }, + }; } diff --git a/unicodetools/src/main/java/org/unicode/props/BagFormatter.java b/unicodetools/src/main/java/org/unicode/props/BagFormatter.java index 9ee80cd69..27e43ebf5 100644 --- a/unicodetools/src/main/java/org/unicode/props/BagFormatter.java +++ b/unicodetools/src/main/java/org/unicode/props/BagFormatter.java @@ -6,6 +6,11 @@ */ package org.unicode.props; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.io.PrintWriter; import java.io.StringWriter; import java.text.MessageFormat; @@ -14,33 +19,26 @@ import java.util.HashSet; import java.util.Locale; import java.util.Map; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.Tabber; import 
org.unicode.cldr.util.Visitor; import org.unicode.cldr.util.props.UnicodeLabel; import org.unicode.jsp.ICUPropertyFactory; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - public class BagFormatter { static final boolean DEBUG = false; public static final boolean SHOW_FILES; + static { - boolean showFiles = false; - try { - showFiles = System.getProperty("SHOW_FILES") != null; - } - catch (SecurityException e) { - } - SHOW_FILES = showFiles; + boolean showFiles = false; + try { + showFiles = System.getProperty("SHOW_FILES") != null; + } catch (SecurityException e) { + } + SHOW_FILES = showFiles; } - public static final PrintWriter CONSOLE = new PrintWriter(System.out,true); + public static final PrintWriter CONSOLE = new PrintWriter(System.out, true); private static PrintWriter log = CONSOLE; @@ -55,7 +53,7 @@ public class BagFormatter { private UnicodeLabel valueSource; private String propName = ""; private boolean showCount = true; - //private boolean suppressReserved = true; + // private boolean suppressReserved = true; private boolean hexValue = false; private static final String NULL_VALUE = "_NULL_VALUE_"; private int fullTotal = -1; @@ -65,29 +63,22 @@ public class BagFormatter { /** * Compare two UnicodeSets, and show the differences + * * @param name1 name of first set to be compared * @param set1 first set * @param name2 name of second set to be compared * @param set2 second set * @return formatted string */ - public String showSetDifferences( - String name1, - UnicodeSet set1, - String name2, - UnicodeSet set2) { + public String showSetDifferences(String name1, UnicodeSet set1, String name2, UnicodeSet set2) { StringWriter result = new StringWriter(); - showSetDifferences(new PrintWriter(result),name1,set1,name2,set2); + showSetDifferences(new PrintWriter(result), name1, set1, name2, set2); result.flush(); return 
result.getBuffer().toString(); } - public String showSetDifferences( - String name1, - Collection set1, - String name2, - Collection set2) { + public String showSetDifferences(String name1, Collection set1, String name2, Collection set2) { StringWriter result = new StringWriter(); showSetDifferences(new PrintWriter(result), name1, set1, name2, set2); @@ -96,34 +87,30 @@ public String showSetDifferences( } public void showSetDifferences( - PrintWriter pw, - String name1, - UnicodeSet set1, - String name2, - UnicodeSet set2) { + PrintWriter pw, String name1, UnicodeSet set1, String name2, UnicodeSet set2) { showSetDifferences(pw, name1, set1, name2, set2, -1); } /** * Compare two UnicodeSets, and show the differences + * * @param name1 name of first set to be compared * @param set1 first set * @param name2 name of second set to be compared * @param set2 second set */ public void showSetDifferences( - PrintWriter pw, - String name1, - UnicodeSet set1, - String name2, - UnicodeSet set2, - int flags) - { + PrintWriter pw, + String name1, + UnicodeSet set1, + String name2, + UnicodeSet set2, + int flags) { if (pw == null) pw = FileUtilities.CONSOLE; - String[] names = { name1, name2 }; + String[] names = {name1, name2}; UnicodeSet temp; - if ((flags&1) != 0) { + if ((flags & 1) != 0) { temp = new UnicodeSet(set1).removeAll(set2); pw.print(lineSeparator); pw.print(inOut.format(names)); @@ -131,7 +118,7 @@ public void showSetDifferences( showSetNames(pw, temp); } - if ((flags&2) != 0) { + if ((flags & 2) != 0) { temp = new UnicodeSet(set2).removeAll(set1); pw.print(lineSeparator); pw.print(outIn.format(names)); @@ -139,7 +126,7 @@ public void showSetDifferences( showSetNames(pw, temp); } - if ((flags&4) != 0) { + if ((flags & 4) != 0) { temp = new UnicodeSet(set2).retainAll(set1); pw.print(lineSeparator); pw.print(inIn.format(names)); @@ -150,14 +137,10 @@ public void showSetDifferences( } public void showSetDifferences( - PrintWriter pw, - String name1, - Collection set1, 
- String name2, - Collection set2) { + PrintWriter pw, String name1, Collection set1, String name2, Collection set2) { if (pw == null) pw = FileUtilities.CONSOLE; - String[] names = { name1, name2 }; + String[] names = {name1, name2}; // damn'd collection doesn't have a clone, so // we go with Set, even though that // may not preserve order and duplicates @@ -183,21 +166,23 @@ public void showSetDifferences( } /** - * Returns a list of items in the collection, with each separated by the separator. - * Each item must not be null; its toString() is called for a printable representation + * Returns a list of items in the collection, with each separated by the separator. Each item + * must not be null; its toString() is called for a printable representation + * * @param c source collection * @return a String representation of the list */ public String showSetNames(Object c) { StringWriter buffer = new StringWriter(); PrintWriter output = new PrintWriter(buffer); - showSetNames(output,c); + showSetNames(output, c); return buffer.toString(); } /** - * Returns a list of items in the collection, with each separated by the separator. - * Each item must not be null; its toString() is called for a printable representation + * Returns a list of items in the collection, with each separated by the separator. 
Each item + * must not be null; its toString() is called for a printable representation + * * @param output destination to which to write names * @param c source collection */ @@ -206,35 +191,26 @@ public void showSetNames(PrintWriter output, Object c) { output.flush(); } - public String getAbbreviatedName( - String src, - String pattern, - String substitute) { + public String getAbbreviatedName(String src, String pattern, String substitute) { int matchEnd = NameIterator.findMatchingEnd(src, pattern); int sdiv = src.length() - matchEnd; int pdiv = pattern.length() - matchEnd; StringBuffer result = new StringBuffer(); - addMatching( - src.substring(0, sdiv), - pattern.substring(0, pdiv), - substitute, - result); - addMatching( - src.substring(sdiv), - pattern.substring(pdiv), - substitute, - result); + addMatching(src.substring(0, sdiv), pattern.substring(0, pdiv), substitute, result); + addMatching(src.substring(sdiv), pattern.substring(pdiv), substitute, result); return result.toString(); } - abstract public static class Relation { - abstract public String getRelation(String a, String b); + public abstract static class Relation { + public abstract String getRelation(String a, String b); } static class NullRelation extends Relation { @Override - public String getRelation(String a, String b) { return ""; } + public String getRelation(String a, String b) { + return ""; + } } private Relation r = new NullRelation(); @@ -249,8 +225,8 @@ public Relation getRelation() { } /* - r.getRelati on(last, s) + quote(s) + "\t#" + UnicodeSetFormatter.getResolvedName(s) - */ + r.getRelati on(last, s) + quote(s) + "\t#" + UnicodeSetFormatter.getResolvedName(s) + */ /* static final UnicodeSet NO_NAME = new UnicodeSet("[\\u0080\\u0081\\u0084\\u0099\\p{Cn}\\p{Co}]"); @@ -288,6 +264,7 @@ public BagFormatter setMergeRanges(boolean in) { mergeRanges = in; return this; } + public BagFormatter setShowSetAlso(boolean b) { showSetAlso = b; return this; @@ -303,7 +280,7 @@ public String 
getName(String sep, int start, int end) { if (start == end) return sep + result; String endString = getName(end, false); if (result.length() == 0 && endString.length() == 0) return sep; - if (abbreviated) endString = getAbbreviatedName(endString,result,"~"); + if (abbreviated) endString = getAbbreviatedName(endString, result, "~"); return sep + result + ".." + endString; } @@ -328,12 +305,9 @@ public NameLabel(UnicodeProperty.Factory source) { @Override public String getValue(int codePoint, boolean isShort) { - String hcp = !isShort - ? "U+" + Utility.hex(codePoint, 4) + " " - : ""; + String hcp = !isShort ? "U+" + Utility.hex(codePoint, 4) + " " : ""; String result = nameProp.getValue(codePoint); - if (result != null) - return hcp + result; + if (result != null) return hcp + result; if (control.contains(codePoint)) { return ""; } @@ -346,10 +320,9 @@ public String getValue(int codePoint, boolean isShort) { if (noncharacter.contains(codePoint)) { return ""; } - //if (suppressReserved) return ""; + // if (suppressReserved) return ""; return hcp + ""; } - } // refactored @@ -364,7 +337,7 @@ public String getName(String s, boolean withCodePoint) { } public String hex(String s) { - return hex(s,separator); + return hex(s, separator); } public String hex(String s, String sep) { @@ -372,9 +345,9 @@ public String hex(String s, String sep) { } public String hex(int start, int end) { - String s = Utility.hex(start,4); + String s = Utility.hex(start, 4); if (start == end) return s; - return s + ".." + Utility.hex(end,4); + return s + ".." 
+ Utility.hex(end, 4); } public BagFormatter setUnicodePropertyFactory(UnicodeProperty.Factory source) { @@ -387,10 +360,9 @@ private UnicodeProperty.Factory getUnicodePropertyFactory() { return source; } - public BagFormatter () { - } + public BagFormatter() {} - public BagFormatter (UnicodeProperty.Factory source) { + public BagFormatter(UnicodeProperty.Factory source) { setUnicodePropertyFactory(source); } @@ -425,31 +397,23 @@ private String getLabels(int start, int end) { } */ - private void addMatching( - String src, - String pattern, - String substitute, - StringBuffer result) { + private void addMatching(String src, String pattern, String substitute, StringBuffer result) { NameIterator n1 = new NameIterator(src); NameIterator n2 = new NameIterator(pattern); boolean first = true; while (true) { String s1 = n1.next(); - if (s1 == null) - break; + if (s1 == null) break; String s2 = n2.next(); - if (!first) - result.append(" "); + if (!first) result.append(" "); first = false; - if (s1.equals(s2)) - result.append(substitute); - else - result.append(s1); + if (s1.equals(s2)) result.append(substitute); + else result.append(s1); } } - private static NumberFormat nf = - NumberFormat.getIntegerInstance(Locale.ENGLISH); + private static NumberFormat nf = NumberFormat.getIntegerInstance(Locale.ENGLISH); + static { nf.setGroupingUsed(false); } @@ -458,7 +422,7 @@ private void addMatching( private int maxLabelWidthOverride = -1; public BagFormatter setValueWidthOverride(int maxWidthOverride) { - this.maxWidthOverride = maxWidthOverride; + this.maxWidthOverride = maxWidthOverride; return this; } @@ -467,7 +431,7 @@ public int getValueWidthOverride() { } public BagFormatter setLabelWidthOverride(int maxWidthOverride) { - this.maxLabelWidthOverride = maxWidthOverride; + this.maxLabelWidthOverride = maxWidthOverride; return this; } @@ -475,7 +439,6 @@ public int getLabelWidthOverride() { return maxLabelWidthOverride; } - private class MyVisitor extends Visitor { private 
PrintWriter output; String commentSeparator; @@ -494,10 +457,8 @@ public void toOutput(String s) { output.print("

"); } output.print(s); - if (isHtml) - output.println("

"); - else - output.print(lineSeparator); + if (isHtml) output.println("

"); + else output.print(lineSeparator); } public void toTable(String s) { @@ -505,7 +466,7 @@ public void toTable(String s) { output.print("
"); inTable = true; } - output.print(tabber.process(s) + lineSeparator); + output.print(tabber.process(s) + lineSeparator); } public void doAt(Object c, PrintWriter out) { @@ -518,43 +479,55 @@ public void doAt(Object c, PrintWriter out) { // 0009..000D ; White_Space # Cc [5] .. // new // 0009..000D ; White_Space #Cc [5] .. - tabber.add(mergeRanges ? 14 : 6,Tabber.LEFT); + tabber.add(mergeRanges ? 14 : 6, Tabber.LEFT); if (propName.length() > 0) { - tabber.add(propName.length() + 2,Tabber.LEFT); + tabber.add(propName.length() + 2, Tabber.LEFT); } - valueSize = maxWidthOverride > 0 ? maxWidthOverride : getValueSource().getMaxWidth(shortValue); + valueSize = + maxWidthOverride > 0 + ? maxWidthOverride + : getValueSource().getMaxWidth(shortValue); if (DEBUG) System.out.println("ValueSize: " + valueSize); if (valueSize > 0) { - tabber.add(valueSize + 2,Tabber.LEFT); // value + tabber.add(valueSize + 2, Tabber.LEFT); // value } - tabber.add(3,Tabber.LEFT); // comment character + tabber.add(3, Tabber.LEFT); // comment character - labelSize = maxLabelWidthOverride > 0 ? maxLabelWidthOverride : getLabelSource(true).getMaxWidth(shortLabel); + labelSize = + maxLabelWidthOverride > 0 + ? maxLabelWidthOverride + : getLabelSource(true).getMaxWidth(shortLabel); if (labelSize > 0) { - tabber.add(labelSize + 1,Tabber.LEFT); // value + tabber.add(labelSize + 1, Tabber.LEFT); // value } if (mergeRanges && showCount) { - tabber.add(5,Tabber.RIGHT); + tabber.add(5, Tabber.RIGHT); } if (showLiteral != null) { - tabber.add(4,Tabber.LEFT); + tabber.add(4, Tabber.LEFT); } - //myTabber.add(7,Tabber.LEFT); + // myTabber.add(7,Tabber.LEFT); - commentSeparator = (showCount || showLiteral != null - || getLabelSource(true) != UnicodeLabel.NULL - || getNameSource() != UnicodeLabel.NULL) - ? "\t #" : ""; + commentSeparator = + (showCount + || showLiteral != null + || getLabelSource(true) != UnicodeLabel.NULL + || getNameSource() != UnicodeLabel.NULL) + ? 
"\t #" + : ""; if (DEBUG) System.out.println("Tabber: " + tabber.toString()); - if (DEBUG) System.out.println("Tabber: " + tabber.process( - "200C..200D\t; White_Space\t #\tCf\t [2]\t ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER")); + if (DEBUG) + System.out.println( + "Tabber: " + + tabber.process( + "200C..200D\t; White_Space\t #\tCf\t [2]\t ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER")); doAt(c); } @@ -577,15 +550,17 @@ protected void doBefore(Object container, Object o) { } @Override - protected void doBetween(Object container, Object lastItem, Object nextItem) { - } + protected void doBetween(Object container, Object lastItem, Object nextItem) {} @Override protected void doAfter(Object container, Object o) { if (fullTotal != -1 && fullTotal != counter) { if (showTotal) { toOutput(""); - toOutput("# The above property value applies to " + nf.format(fullTotal-counter) + " code points not listed here."); + toOutput( + "# The above property value applies to " + + nf.format(fullTotal - counter) + + " code points not listed here."); toOutput("# Total code points: " + nf.format(fullTotal)); } fullTotal = -1; @@ -598,7 +573,7 @@ protected void doAfter(Object container, Object o) { @Override protected void doSimpleAt(Object o) { if (o instanceof Map.Entry) { - Map.Entry oo = (Map.Entry)o; + Map.Entry oo = (Map.Entry) o; Object key = oo.getKey(); Object value = oo.getValue(); doBefore(o, key); @@ -611,18 +586,24 @@ protected void doSimpleAt(Object o) { doAt((Visitor.CodePointRange) o); } else { String thing = o.toString(); - String value = getValueSource() == UnicodeLabel.NULL ? "" : getValueSource().getValue(thing, ",", true); + String value = + getValueSource() == UnicodeLabel.NULL + ? "" + : getValueSource().getValue(thing, ",", true); if (getValueSource() != UnicodeLabel.NULL) value = "\t; " + value; - String label = getLabelSource(true) == UnicodeLabel.NULL ? 
"" : getLabelSource(true).getValue(thing, ",", true); + String label = + getLabelSource(true) == UnicodeLabel.NULL + ? "" + : getLabelSource(true).getValue(thing, ",", true); if (label.length() != 0) label = " " + label; toTable( - hex(thing) - + value - + commentSeparator - + label - + insertLiteral(thing) - + "\t" - + getName(thing)); + hex(thing) + + value + + commentSeparator + + label + + insertLiteral(thing) + + "\t" + + getName(thing)); counter++; } } @@ -653,43 +634,49 @@ private void showLine(int start, int end) { if (valueSize > 0) { value = "\t; " + value; } else if (value.length() > 0) { - throw new IllegalArgumentException("maxwidth bogus " + value + "," + getValueSource().getMaxWidth(shortValue)); + throw new IllegalArgumentException( + "maxwidth bogus " + value + "," + getValueSource().getMaxWidth(shortValue)); } if (labelSize > 0) { label = "\t" + label; } else if (label.length() > 0) { - throw new IllegalArgumentException("maxwidth bogus " + label + ", " + getLabelSource(true).getMaxWidth(shortLabel)); + throw new IllegalArgumentException( + "maxwidth bogus " + + label + + ", " + + getLabelSource(true).getMaxWidth(shortLabel)); } String count = ""; if (mergeRanges && showCount) { if (end == start) count = "\t"; - else count = "\t ["+ nf.format(end - start + 1)+ "]"; + else count = "\t [" + nf.format(end - start + 1) + "]"; } toTable( - hex(start, end) - + pn - + value - + commentSeparator - + label - + count - + insertLiteral(start, end) - + getName("\t ", start, end)); + hex(start, end) + + pn + + value + + commentSeparator + + label + + count + + insertLiteral(start, end) + + getName("\t ", start, end)); } private String insertLiteral(String thing) { - return (showLiteral == null ? "" - : " \t(" + showLiteral.transliterate(thing) + ") "); + return (showLiteral == null ? "" : " \t(" + showLiteral.transliterate(thing) + ") "); } private String insertLiteral(int start, int end) { - return (showLiteral == null ? 
"" : - " \t(" + showLiteral.transliterate(UTF16.valueOf(start)) - + ((start != end) - ? (".." + showLiteral.transliterate(UTF16.valueOf(end))) - : "") - + ") "); + return (showLiteral == null + ? "" + : " \t(" + + showLiteral.transliterate(UTF16.valueOf(start)) + + ((start != end) + ? (".." + showLiteral.transliterate(UTF16.valueOf(end))) + : "") + + ") "); } /* private String insertLiteral(int cp) { @@ -701,6 +688,7 @@ private String insertLiteral(int cp) { /** * Iterate through a string, breaking at words. + * * @author Davis */ private static class NameIterator { @@ -714,14 +702,13 @@ private static class NameIterator { } /** * Find next word, including trailing spaces + * * @return the next word */ String next() { - if (position >= limit) - return null; + if (position >= limit) return null; int pos = source.indexOf(' ', position); - if (pos < 0 || pos >= limit) - pos = limit; + if (pos < 0 || pos >= limit) pos = limit; String result = source.substring(position, pos); position = pos + 1; return result; @@ -734,15 +721,14 @@ static int findMatchingEnd(String s1, String s2) { while (true) { --i; // decrement both before calling function! 
--j; - if (s1.charAt(i) != s2.charAt(j)) - break; + if (s1.charAt(i) != s2.charAt(j)) break; } - } catch (Exception e) {} // run off start + } catch (Exception e) { + } // run off start ++i; // counteract increment i = s1.indexOf(' ', i); // move forward to space - if (i < 0) - return 0; + if (i < 0) return 0; return s1.length() - i; } } @@ -750,20 +736,27 @@ static int findMatchingEnd(String s1, String s2) { private class RangeFinder { int start, limit; private int veryLimit; - //String label, value; + // String label, value; void reset(int rangeStart, int rangeLimit) { limit = rangeStart; veryLimit = rangeLimit; } + boolean next() { - if (limit >= veryLimit) - return false; + if (limit >= veryLimit) return false; start = limit; // set to end of last String label = getLabelSource(false).getValue(limit, true); String value = getValue(limit, true); - String breaker = getRangeBreakSource().getValue(limit,true); + String breaker = getRangeBreakSource().getValue(limit, true); if (DEBUG && 0x3FFD < limit && limit < 0x9FD6) { - System.out.println(Utility.hex(limit) + ", Label: " + label + ", Value: " + value + ", Break: " + breaker); + System.out.println( + Utility.hex(limit) + + ", Label: " + + label + + ", Value: " + + value + + ", Break: " + + breaker); } limit++; for (; limit < veryLimit; limit++) { @@ -771,11 +764,16 @@ boolean next() { String v = getValue(limit, true); String b = getRangeBreakSource().getValue(limit, true); if (DEBUG && limit > 0x9FD4) { - System.out.println(Utility.hex(limit) + ", *Label: " + s + ", Value: " + v + ", Break: " + b); + System.out.println( + Utility.hex(limit) + + ", *Label: " + + s + + ", Value: " + + v + + ", Break: " + + b); } - if (!equalTo(s, label) - || !equalTo(v, value) - || !equalTo(b, breaker)) { + if (!equalTo(s, label) || !equalTo(v, value) || !equalTo(b, breaker)) { break; } } @@ -824,14 +822,15 @@ public BagFormatter setAbbreviated(boolean b) { public UnicodeLabel getLabelSource(boolean visible) { if (labelSource == 
null) { Map labelMap = new HashMap(); - //labelMap.put("Lo","L&"); - labelMap.put("Lu","L&"); - labelMap.put("Lt","L&"); - labelMap.put("Ll","L&"); - labelSource = new UnicodeProperty.FilteredProperty( - getUnicodePropertyFactory().getProperty("General_Category"), - new UnicodeProperty.MapFilter(labelMap) - ).setAllowValueAliasCollisions(true); + // labelMap.put("Lo","L&"); + labelMap.put("Lu", "L&"); + labelMap.put("Lt", "L&"); + labelMap.put("Ll", "L&"); + labelSource = + new UnicodeProperty.FilteredProperty( + getUnicodePropertyFactory().getProperty("General_Category"), + new UnicodeProperty.MapFilter(labelMap)) + .setAllowValueAliasCollisions(true); } return labelSource; } @@ -846,19 +845,22 @@ public static void addAll(UnicodeSet source, Collection target) { // UTILITIES - public static final Transliterator hex = Transliterator.getInstance( - "[^\\u0009\\u0020-\\u007E\\u00A0-\\u00FF] hex"); + public static final Transliterator hex = + Transliterator.getInstance("[^\\u0009\\u0020-\\u007E\\u00A0-\\u00FF] hex"); public String getSeparator() { return separator; } + public BagFormatter setSeparator(String string) { separator = string; return this; } + public Transliterator getShowLiteral() { return showLiteral; } + public BagFormatter setShowLiteral(Transliterator transliterator) { showLiteral = transliterator; return this; @@ -867,27 +869,33 @@ public BagFormatter setShowLiteral(Transliterator transliterator) { // ===== CONVENIENCES ===== private class Join extends Visitor { StringBuffer output = new StringBuffer(); + @SuppressWarnings("unused") int depth = 0; - String join (Object o) { + + String join(Object o) { output.setLength(0); doAt(o); return output.toString(); } + @Override protected void doBefore(Object container, Object item) { ++depth; output.append(prefix); } + @Override protected void doAfter(Object container, Object item) { output.append(suffix); --depth; } + @Override protected void doBetween(Object container, Object lastItem, Object nextItem) { 
output.append(separator); } + @Override protected void doSimpleAt(Object o) { if (o != null) output.append(o.toString()); @@ -1066,12 +1074,10 @@ public UnicodeLabel getRangeBreakSource() { labelMap.put("Zp", "Cf"); rangeBreakSource = - new UnicodeProperty - .FilteredProperty( - getUnicodePropertyFactory().getProperty( - "General_Category"), - new UnicodeProperty.MapFilter(labelMap)) - .setAllowValueAliasCollisions(true); + new UnicodeProperty.FilteredProperty( + getUnicodePropertyFactory().getProperty("General_Category"), + new UnicodeProperty.MapFilter(labelMap)) + .setAllowValueAliasCollisions(true); /* "Cn", // = Other, Not Assigned 0 diff --git a/unicodetools/src/main/java/org/unicode/props/DefaultValues.java b/unicodetools/src/main/java/org/unicode/props/DefaultValues.java index af31e53eb..7466ebba6 100644 --- a/unicodetools/src/main/java/org/unicode/props/DefaultValues.java +++ b/unicodetools/src/main/java/org/unicode/props/DefaultValues.java @@ -1,16 +1,12 @@ package org.unicode.props; -import org.unicode.props.UcdPropertyValues.Bidi_Class_Values; -import org.unicode.props.UcdPropertyValues.Block_Values; - import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.VersionInfo; +import org.unicode.props.UcdPropertyValues.Bidi_Class_Values; +import org.unicode.props.UcdPropertyValues.Block_Values; -/** - * Default property values for some properties and certain ranges - * other than all of Unicode. - */ +/** Default property values for some properties and certain ranges other than all of Unicode. 
*/ public final class DefaultValues { public static final class BidiClass { private static final Bidi_Class_Values L = Bidi_Class_Values.Left_To_Right; @@ -19,7 +15,10 @@ public static final class BidiClass { private static final Bidi_Class_Values BN = Bidi_Class_Values.Boundary_Neutral; private static final Bidi_Class_Values ET = Bidi_Class_Values.European_Terminator; - public static enum Option { ALL, OMIT_BN }; + public static enum Option { + ALL, + OMIT_BN + }; private static final class Builder { int compositeVersion; @@ -120,8 +119,7 @@ private void addBlockValueIfAtLeast( } } - public static UnicodeMap forVersion( - VersionInfo version, Option option) { + public static UnicodeMap forVersion(VersionInfo version, Option option) { return new Builder(version).build(option); } } diff --git a/unicodetools/src/main/java/org/unicode/props/GenerateEnums.java b/unicodetools/src/main/java/org/unicode/props/GenerateEnums.java index 25c36e461..521ea2c66 100644 --- a/unicodetools/src/main/java/org/unicode/props/GenerateEnums.java +++ b/unicodetools/src/main/java/org/unicode/props/GenerateEnums.java @@ -1,5 +1,11 @@ package org.unicode.props; +import com.google.common.base.Splitter; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.util.ICUException; +import com.ibm.icu.util.VersionInfo; import java.io.IOException; import java.io.PrintWriter; import java.util.Arrays; @@ -16,28 +22,25 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.RegexUtilities; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.google.common.base.Splitter; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.util.ICUException; -import 
com.ibm.icu.util.VersionInfo; - public class GenerateEnums { public static final String ENUM_VERSION = Settings.latestVersion; - public static final VersionInfo ENUM_VERSION_INFO = VersionInfo.getInstance(GenerateEnums.ENUM_VERSION); + public static final VersionInfo ENUM_VERSION_INFO = + VersionInfo.getInstance(GenerateEnums.ENUM_VERSION); - public static final String PROPERTY_FILE_OUTPUT = Settings.UnicodeTools.UNICODETOOLS_JAVA_DIR + "/org/unicode/props/UcdProperty.java"; - public static final String PROPERTY_VALUE_OUTPUT = Settings.UnicodeTools.UNICODETOOLS_JAVA_DIR + "/org/unicode/props/UcdPropertyValues.java"; + public static final String PROPERTY_FILE_OUTPUT = + Settings.UnicodeTools.UNICODETOOLS_JAVA_DIR + "/org/unicode/props/UcdProperty.java"; + public static final String PROPERTY_VALUE_OUTPUT = + Settings.UnicodeTools.UNICODETOOLS_JAVA_DIR + + "/org/unicode/props/UcdPropertyValues.java"; // private static class Locations { - // private static Set files = addAll(new HashSet(), new File(SOURCE_DIR)); + // private static Set files = addAll(new HashSet(), new + // File(SOURCE_DIR)); // public static boolean contains(String file) { // return files.contains(file.replace("_","")); // } @@ -62,24 +65,28 @@ public class GenerateEnums { // } // } + static Map lookup = new HashMap(); + static Map lookupMain = new TreeMap(); - static Map lookup = new HashMap(); - static Map lookupMain = new TreeMap(); + private static final Pattern PROPERTY_LONG_NAME = + Pattern.compile("[A-Z]+[0-9]?[a-z]*(_[A-Z0-9]+[a-z]*)*"); + private static final Pattern PROPERTY_VALUE_LONG_NAME = + Pattern.compile("[A-Z]+[0-9]?[a-z]*(_[A-Z0-9]+[a-z]*)*(\\d*)"); + private static final Pattern PROPER_CJK_LONG_NAME = + Pattern.compile("(cj)?k[A-Z][a-z]*(_?[A-Z0-9][a-z]*)*"); - private static final Pattern PROPERTY_LONG_NAME = Pattern.compile("[A-Z]+[0-9]?[a-z]*(_[A-Z0-9]+[a-z]*)*"); - private static final Pattern PROPERTY_VALUE_LONG_NAME = 
Pattern.compile("[A-Z]+[0-9]?[a-z]*(_[A-Z0-9]+[a-z]*)*(\\d*)"); - private static final Pattern PROPER_CJK_LONG_NAME = Pattern.compile("(cj)?k[A-Z][a-z]*(_?[A-Z0-9][a-z]*)*"); - - static class PropName implements Comparable{ + static class PropName implements Comparable { final PropertyType propertyType; final String shortName; final String longName; final List others; final Map subnames = new TreeMap(); - PropName(PropertyType type, OverrideChoice override, String...strings) { + + PropName(PropertyType type, OverrideChoice override, String... strings) { shortName = strings[0]; longName = strings[1]; - propertyType = longName.equals("Script_Extensions") ? PropertyType.Catalog : type; // HACK + propertyType = + longName.equals("Script_Extensions") ? PropertyType.Catalog : type; // HACK final String badName = isProperLongName(longName, PROPERTY_LONG_NAME, true); if (badName != null) { addWarning("Improper Property long name: " + badName); @@ -102,7 +109,9 @@ static class PropName implements Comparable{ } lookupMain.put(longName, this); } - private static String isProperLongName(String longName2, Pattern pattern, boolean allowCjk) { + + private static String isProperLongName( + String longName2, Pattern pattern, boolean allowCjk) { boolean result = pattern.matcher(longName2).matches(); if (result == false && allowCjk) { result = PROPER_CJK_LONG_NAME.matcher(longName2).matches(); @@ -112,10 +121,20 @@ private static String isProperLongName(String longName2, Pattern pattern, boolea } return null; } + @Override public String toString() { - return "{" + propertyType + ", " + longName + ", " + shortName + ", " + others + "}"; + return "{" + + propertyType + + ", " + + longName + + ", " + + shortName + + ", " + + others + + "}"; } + @Override public int compareTo(PropName arg0) { if (longName.contains("10")) { @@ -126,32 +145,47 @@ public int compareTo(PropName arg0) { } static final RuleBasedCollator COL = (RuleBasedCollator) Collator.getInstance(Locale.ROOT); + static { 
COL.setNumericCollation(true); COL.freeze(); } - private static final Comparator ARRAY_SORT = new Comparator() { - @Override - public int compare(String[] o1, String[] o2) { - int min = o1.length < o2.length ? o1.length : o2.length; - for (int i = 0; i < min; ++i) { - int diff = COL.compare(o1[i], o2[i]); - if (diff != 0) { - return diff; + + private static final Comparator ARRAY_SORT = + new Comparator() { + @Override + public int compare(String[] o1, String[] o2) { + int min = o1.length < o2.length ? o1.length : o2.length; + for (int i = 0; i < min; ++i) { + int diff = COL.compare(o1[i], o2[i]); + if (diff != 0) { + return diff; + } + } + return o1.length - o2.length; } - } - return o1.length - o2.length; - } - }; + }; - enum OverrideChoice {allow, disallow} + enum OverrideChoice { + allow, + disallow + } public static void main(String[] args) throws IOException { final Map> values = new TreeMap>(); - addPropertyAliases(values, FileUtilities.in("", Utility.getMostRecentUnicodeDataFile("PropertyAliases", ENUM_VERSION, true, true)), OverrideChoice.disallow); - addPropertyAliases(values, FileUtilities.in(GenerateEnums.class, "ExtraPropertyAliases.txt"), OverrideChoice.allow); + addPropertyAliases( + values, + FileUtilities.in( + "", + Utility.getMostRecentUnicodeDataFile( + "PropertyAliases", ENUM_VERSION, true, true)), + OverrideChoice.disallow); + addPropertyAliases( + values, + FileUtilities.in(GenerateEnums.class, "ExtraPropertyAliases.txt"), + OverrideChoice.allow); writeMainUcdFile(); @@ -165,59 +199,84 @@ public static void main(String[] args) throws IOException { } public static String getNameStuff2(final String enumName) { - return ";\n" + - " private final PropertyNames<" + enumName + "> names;\n"+ - " private " + enumName + " (String shortName, String...otherNames) {\n"+ - " names = new PropertyNames<" + enumName + ">(\n" - + " " + enumName + ".class, this, shortName, otherNames);\n"+ - " }\n"+ - " public PropertyNames<" + enumName + "> getNames() {\n"+ - 
" return names;\n"+ - " }\n" + - " public String getShortName() {\n" + - " return names.getShortName();\n" + - " }\n" + - " private static final NameMatcher<" + enumName + "> NAME_MATCHER = PropertyNames.getNameToEnums(" + enumName + ".class);\n" + - " public static " + enumName + " forName(String name) {\n" + - " return NAME_MATCHER.get(name);\n" + - " }\n" + - " }\n"; + return ";\n" + + " private final PropertyNames<" + + enumName + + "> names;\n" + + " private " + + enumName + + " (String shortName, String...otherNames) {\n" + + " names = new PropertyNames<" + + enumName + + ">(\n" + + " " + + enumName + + ".class, this, shortName, otherNames);\n" + + " }\n" + + " public PropertyNames<" + + enumName + + "> getNames() {\n" + + " return names;\n" + + " }\n" + + " public String getShortName() {\n" + + " return names.getShortName();\n" + + " }\n" + + " private static final NameMatcher<" + + enumName + + "> NAME_MATCHER = PropertyNames.getNameToEnums(" + + enumName + + ".class);\n" + + " public static " + + enumName + + " forName(String name) {\n" + + " return NAME_MATCHER.get(name);\n" + + " }\n" + + " }\n"; } public static void writeValueEnumFile(Map> values) throws IOException { final PrintWriter output = FileUtilities.openUTF8Writer("", PROPERTY_VALUE_OUTPUT); - output.println("package org.unicode.props;\n" - + "import org.unicode.props.PropertyNames.NameMatcher;\n" - + "import org.unicode.props.PropertyNames.Named;\n" - + "\n" - + "/**\n" - + " Machine-generated file for property values, produced by GenerateEnums.java\n" - + " from PropertyValueAliases.txt and ExtraPropertyValueAliases.txt.\n" - + " The ordering of property value enums is alphabetical (ASCII),\n" - + " but the order of the values for the enums is based on the order within those two files\n" - + " with the ones in PropertyValueAliases coming first.\n" - + "*/\n" - + "public class UcdPropertyValues {"); - - //[Alpha, N, No, F, False] - addPropertyValueAliases(values, FileUtilities.in("", 
Utility.getMostRecentUnicodeDataFile("PropertyValueAliases", ENUM_VERSION, true, true))); - addPropertyValueAliases(values, FileUtilities.in(GenerateEnums.class, "ExtraPropertyValueAliases.txt")); + output.println( + "package org.unicode.props;\n" + + "import org.unicode.props.PropertyNames.NameMatcher;\n" + + "import org.unicode.props.PropertyNames.Named;\n" + + "\n" + + "/**\n" + + " Machine-generated file for property values, produced by GenerateEnums.java\n" + + " from PropertyValueAliases.txt and ExtraPropertyValueAliases.txt.\n" + + " The ordering of property value enums is alphabetical (ASCII),\n" + + " but the order of the values for the enums is based on the order within those two files\n" + + " with the ones in PropertyValueAliases coming first.\n" + + "*/\n" + + "public class UcdPropertyValues {"); + + // [Alpha, N, No, F, False] + addPropertyValueAliases( + values, + FileUtilities.in( + "", + Utility.getMostRecentUnicodeDataFile( + "PropertyValueAliases", ENUM_VERSION, true, true))); + addPropertyValueAliases( + values, FileUtilities.in(GenerateEnums.class, "ExtraPropertyValueAliases.txt")); output.println( - "\n public enum Binary implements Named {\n"+ - " No(\"N\", \"F\", \"False\"),\n"+ - " Yes(\"Y\", \"T\", \"True\")" + - getNameStuff2("Binary") - // ";\n"+ - // - // " private final PropertyNames names;\n"+ - // " private Binary (String shortName, String...otherNames) {\n"+ - // " names = new PropertyNames(Binary.class, this, shortName, otherNames);\n"+ - // " }\n"+ - // " public PropertyNames getNames() {\n"+ - // " return names;\n"+ - // " }\n"+ - // " }\n" + "\n public enum Binary implements Named {\n" + + " No(\"N\", \"F\", \"False\"),\n" + + " Yes(\"Y\", \"T\", \"True\")" + + getNameStuff2("Binary") + // ";\n"+ + // + // " private final PropertyNames names;\n"+ + // " private Binary (String shortName, + // String...otherNames) {\n"+ + // " names = new PropertyNames(Binary.class, this, + // shortName, otherNames);\n"+ + // " }\n"+ + // " public 
PropertyNames getNames() {\n"+ + // " return names;\n"+ + // " }\n"+ + // " }\n" ); for (final Entry> value : values.entrySet()) { @@ -231,7 +290,8 @@ public static void writeValueEnumFile(Map> values) throw output.println(" // " + propName.longName); continue; } - output.println(" public enum " + (propName.longName + "_Values") + " implements Named {"); + output.println( + " public enum " + (propName.longName + "_Values") + " implements Named {"); final StringBuilder constants = new StringBuilder(); boolean first = true; for (final String[] parts : partList) { @@ -241,7 +301,8 @@ public static void writeValueEnumFile(Map> values) throw if (propName.shortName.equals("ccc")) { longName = parts[3]; } - final String badName = PropName.isProperLongName(longName, PROPERTY_VALUE_LONG_NAME, false); + final String badName = + PropName.isProperLongName(longName, PROPERTY_VALUE_LONG_NAME, false); if (badName != null) { addWarning("Improper long value name for " + parts[0] + ": " + badName); } @@ -256,7 +317,11 @@ public static void writeValueEnumFile(Map> values) throw for (int i = 1; i < parts.length; ++i) { final String otherName = parts[i]; - if (i == 2 || otherName.equals("n/a") || otherName.equals(longName) || otherName.contains("-") || otherName.charAt(0) < 'A') { + if (i == 2 + || otherName.equals("n/a") + || otherName.equals(longName) + || otherName.contains("-") + || otherName.charAt(0) < 'A') { continue; } if (constants.length() != 0) { @@ -267,14 +332,14 @@ public static void writeValueEnumFile(Map> values) throw } final String enumName = propName.longName; - output.println(getNameStuff2(enumName+"_Values")); + output.println(getNameStuff2(enumName + "_Values")); } output.println("}"); output.close(); } - static Set WARNINGS = new LinkedHashSet(); + private static void addWarning(String string) { WARNINGS.add(string); } @@ -305,10 +370,12 @@ public static void writeOtherNames2(PrintWriter output, String longName, String. 
output.print(")"); } - static Map NAME2CARD = new HashMap<>(); + static Map NAME2CARD = new HashMap<>(); + static { Splitter SEMICOLON = Splitter.on(";").trimResults(); - for (String line : FileUtilities.in(IndexUnicodeProperties.class, "IndexPropertyRegex.txt")) { + for (String line : + FileUtilities.in(IndexUnicodeProperties.class, "IndexPropertyRegex.txt")) { line = line.trim(); if (line.startsWith("$") || line.isEmpty() || line.startsWith("#")) { continue; @@ -323,12 +390,22 @@ public static void writeOtherNames2(PrintWriter output, String longName, String. } catch (Exception e) { throw new ICUException("Bad line " + line, e); } - switch(multivalued) { - case "ORDERED": NAME2CARD.put(propertyName, ValueCardinality.Ordered); break; - case "MULTI_VALUED": NAME2CARD.put(propertyName, ValueCardinality.Unordered); break; - case "SINGLE_VALUED": NAME2CARD.put(propertyName, ValueCardinality.Singleton); break; - case "EXTENSIBLE": NAME2CARD.put(propertyName, ValueCardinality.Singleton); break; - default: throw new UnicodePropertyException("IndexPropertyRegex: didn't expect " + multivalued); + switch (multivalued) { + case "ORDERED": + NAME2CARD.put(propertyName, ValueCardinality.Ordered); + break; + case "MULTI_VALUED": + NAME2CARD.put(propertyName, ValueCardinality.Unordered); + break; + case "SINGLE_VALUED": + NAME2CARD.put(propertyName, ValueCardinality.Singleton); + break; + case "EXTENSIBLE": + NAME2CARD.put(propertyName, ValueCardinality.Singleton); + break; + default: + throw new UnicodePropertyException( + "IndexPropertyRegex: didn't expect " + multivalued); } } } @@ -337,30 +414,33 @@ public static void writeMainUcdFile() throws IOException { final PrintWriter output = FileUtilities.openUTF8Writer("", PROPERTY_FILE_OUTPUT); output.print( - "package org.unicode.props;\n" + - "import java.util.EnumSet;\n" + - "import java.util.Set;\n"+ - "import org.unicode.props.PropertyNames.NameMatcher;\n" - //"import org.unicode.props.UcdPropertyValues.*;\n\n" + "package 
org.unicode.props;\n" + + "import java.util.EnumSet;\n" + + "import java.util.Set;\n" + + "import org.unicode.props.PropertyNames.NameMatcher;\n" + // "import org.unicode.props.UcdPropertyValues.*;\n\n" ); output.println("import org.unicode.props.UcdPropertyValues.Binary;"); TreeSet imports = new TreeSet<>(); for (final Entry i : lookupMain.entrySet()) { final PropName pname = i.getValue(); switch (pname.propertyType) { - case Enumerated: - case Catalog: - final String longName = pname.longName; - if (!pname.longName.equals("Script_Extensions")) { // exception, since uses Script_Values - imports.add(longName); - } -// final ValueCardinality cardinality = NAME2CARD.get(longName.toLowerCase(Locale.ENGLISH)); -// if (true || cardinality == null || cardinality == ValueCardinality.Singleton) { -// imports.add(longName); -// } - break; - default: - break; + case Enumerated: + case Catalog: + final String longName = pname.longName; + if (!pname.longName.equals( + "Script_Extensions")) { // exception, since uses Script_Values + imports.add(longName); + } + // final ValueCardinality cardinality = + // NAME2CARD.get(longName.toLowerCase(Locale.ENGLISH)); + // if (true || cardinality == null || cardinality == + // ValueCardinality.Singleton) { + // imports.add(longName); + // } + break; + default: + break; } } for (String s : imports) { @@ -368,12 +448,15 @@ public static void writeMainUcdFile() throws IOException { } output.println(); - output.println("/**\n" - + " Machine-generated file for properties, produced by GenerateEnums.java\n" - + " from PropertyAliases.txt and ExtraPropertyAliases.txt.\n" - + " The ordering of properties is first by category, then alphabetical (ASCII order).\n" - + "*/\n" - + "public enum " + "UcdProperty" + " {"); + output.println( + "/**\n" + + " Machine-generated file for properties, produced by GenerateEnums.java\n" + + " from PropertyAliases.txt and ExtraPropertyAliases.txt.\n" + + " The ordering of properties is first by category, then 
alphabetical (ASCII order).\n" + + "*/\n" + + "public enum " + + "UcdProperty" + + " {"); Set missingCardinality = new TreeSet<>(); Set extraCardinality = new TreeSet<>(NAME2CARD.keySet()); @@ -402,19 +485,23 @@ public static void writeMainUcdFile() throws IOException { } switch (pt) { - case Binary: - classItem = //"UcdPropertyValues." + - "Binary.class"; - break; - case Enumerated: - case Catalog: - classItem = // "UcdPropertyValues." + - ("Script_Extensions".equals(pname.longName) ? "Script" : pname.longName) + "_Values.class"; // HACK! - break; - default: - break; // leave classItem = null + case Binary: + classItem = // "UcdPropertyValues." + + "Binary.class"; + break; + case Enumerated: + case Catalog: + classItem = // "UcdPropertyValues." + + ("Script_Extensions".equals(pname.longName) + ? "Script" + : pname.longName) + + "_Values.class"; // HACK! + break; + default: + break; // leave classItem = null } - writeOtherNames(output, type, classItem, cardinality, pname.shortName, pname.others); + writeOtherNames( + output, type, classItem, cardinality, pname.shortName, pname.others); output.print(",\n"); } } @@ -425,82 +512,89 @@ public static void writeMainUcdFile() throws IOException { System.err.println("Extra Cardinality for " + extraCardinality); } output.println(" ;"); - output.println("\n" + - "private final PropertyType type;\n"+ - " private final PropertyNames names;\n"+ - " // for enums\n"+ - " private final NameMatcher name2enum;\n"+ - " private final EnumSet enums;\n"+ - " private final Class enumClass;\n"+ - " private final ValueCardinality cardinality;\n"+ - " \n"+ - " private UcdProperty(PropertyType type, String shortName, String...otherNames) {\n"+ - " this.type = type;\n"+ - " names = new PropertyNames(UcdProperty.class, this, shortName, otherNames);\n"+ - " name2enum = null;\n"+ - " enums = null;\n"+ - " enumClass = null;\n"+ - " cardinality = ValueCardinality.Singleton;\n"+ - " }\n"+ - " private UcdProperty(PropertyType type, Class classItem, 
ValueCardinality _cardinality, String shortName, String...otherNames) {\n"+ - " this.type = type;\n"+ - " names = new PropertyNames(UcdProperty.class, this, shortName, otherNames);\n"+ - " cardinality = _cardinality == null ? ValueCardinality.Singleton : _cardinality;\n"+ - " if (classItem == null) {\n" + - " name2enum = null;\n"+ - " enums = null;\n"+ - " enumClass = null;\n"+ - " } else {\n" + - " enums = EnumSet.allOf(classItem);\n"+ - " name2enum = PropertyNames.getNameToEnums(classItem);\n"+ - " enumClass = classItem;\n"+ - " }\n" + - " }\n"+ - " \n"+ - " public ValueCardinality getCardinality() {\n"+ - " return cardinality;\n"+ - " }\n"+ - " public Class getEnumClass() {\n"+ - " return enumClass;\n"+ - " }\n"+ - " public PropertyType getType() {\n"+ - " return type;\n"+ - " }\n"+ - " public PropertyNames getNames() {\n"+ - " return names;\n"+ - " }\n"+ - " public String getShortName() {\n" + - " return names.getShortName();\n" + - " }\n" + - " public static UcdProperty forString(String name) {\n"+ - " return Numeric_Value.names.forString(name);\n"+ - " }\n"+ - " public Enum getEnum(String name) {\n"+ - " return name2enum == null ? null : name2enum.get(name);\n"+ - " }\n"+ - " public PropertyNames getEnumNames() {\n"+ - " return name2enum == null ? 
null : name2enum.getNames();\n"+ - " }\n" + - " public Set getEnums() {\n"+ - " return enums;\n"+ - " }\n" - ); + output.println( + "\n" + + "private final PropertyType type;\n" + + " private final PropertyNames names;\n" + + " // for enums\n" + + " private final NameMatcher name2enum;\n" + + " private final EnumSet enums;\n" + + " private final Class enumClass;\n" + + " private final ValueCardinality cardinality;\n" + + " \n" + + " private UcdProperty(PropertyType type, String shortName, String...otherNames) {\n" + + " this.type = type;\n" + + " names = new PropertyNames(UcdProperty.class, this, shortName, otherNames);\n" + + " name2enum = null;\n" + + " enums = null;\n" + + " enumClass = null;\n" + + " cardinality = ValueCardinality.Singleton;\n" + + " }\n" + + " private UcdProperty(PropertyType type, Class classItem, ValueCardinality _cardinality, String shortName, String...otherNames) {\n" + + " this.type = type;\n" + + " names = new PropertyNames(UcdProperty.class, this, shortName, otherNames);\n" + + " cardinality = _cardinality == null ? 
ValueCardinality.Singleton : _cardinality;\n" + + " if (classItem == null) {\n" + + " name2enum = null;\n" + + " enums = null;\n" + + " enumClass = null;\n" + + " } else {\n" + + " enums = EnumSet.allOf(classItem);\n" + + " name2enum = PropertyNames.getNameToEnums(classItem);\n" + + " enumClass = classItem;\n" + + " }\n" + + " }\n" + + " \n" + + " public ValueCardinality getCardinality() {\n" + + " return cardinality;\n" + + " }\n" + + " public Class getEnumClass() {\n" + + " return enumClass;\n" + + " }\n" + + " public PropertyType getType() {\n" + + " return type;\n" + + " }\n" + + " public PropertyNames getNames() {\n" + + " return names;\n" + + " }\n" + + " public String getShortName() {\n" + + " return names.getShortName();\n" + + " }\n" + + " public static UcdProperty forString(String name) {\n" + + " return Numeric_Value.names.forString(name);\n" + + " }\n" + + " public Enum getEnum(String name) {\n" + + " return name2enum == null ? null : name2enum.get(name);\n" + + " }\n" + + " public PropertyNames getEnumNames() {\n" + + " return name2enum == null ? null : name2enum.getNames();\n" + + " }\n" + + " public Set getEnums() {\n" + + " return enums;\n" + + " }\n"); output.println("\n}"); output.close(); } - - public static void writeOtherNames(PrintWriter output, String type, - String classItem, ValueCardinality cardinality, - String shortName, List otherNames) { + public static void writeOtherNames( + PrintWriter output, + String type, + String classItem, + ValueCardinality cardinality, + String shortName, + List otherNames) { output.print("("); - //if (shortName != null) { + // if (shortName != null) { output.print(type); if (classItem != null || cardinality != ValueCardinality.Singleton) { - output.print(", " + classItem + ", " - + (cardinality == ValueCardinality.Singleton ? "null" - : "ValueCardinality." + cardinality.toString())); + output.print( + ", " + + classItem + + ", " + + (cardinality == ValueCardinality.Singleton + ? 
"null" + : "ValueCardinality." + cardinality.toString())); } output.print(", \"" + shortName + "\""); for (final String otherName : otherNames) { @@ -509,8 +603,8 @@ public static void writeOtherNames(PrintWriter output, String type, output.print(")"); } - - public static void addPropertyValueAliases(Map> values, Iterable lines) { + public static void addPropertyValueAliases( + Map> values, Iterable lines) { for (final String line : lines) { final String[] parts = FileUtilities.cleanSemiFields(line); if (parts == null) { @@ -522,12 +616,14 @@ public static void addPropertyValueAliases(Map> values, } final Set set = values.get(propName); set.add(parts); - //System.out.println(propName.longName + " " + Arrays.asList(parts)); + // System.out.println(propName.longName + " " + Arrays.asList(parts)); } } - public static void addPropertyAliases(Map> values, Iterable lines, OverrideChoice override) { - final Matcher propType = Pattern.compile("#\\s+(\\p{Alpha}+)\\s+Properties\\s*").matcher(""); + public static void addPropertyAliases( + Map> values, Iterable lines, OverrideChoice override) { + final Matcher propType = + Pattern.compile("#\\s+(\\p{Alpha}+)\\s+Properties\\s*").matcher(""); PropertyType type = null; for (final String line : lines) { System.out.println(line); @@ -539,13 +635,18 @@ public static void addPropertyAliases(Map> values, Itera continue; } final PropName propName = new PropName(type, override, parts); - values.put(propName, propName.longName.equals("Age") ? new TreeSet<>(ARRAY_SORT) : new LinkedHashSet<>()); + values.put( + propName, + propName.longName.equals("Age") + ? 
new TreeSet<>(ARRAY_SORT) + : new LinkedHashSet<>()); System.out.println(propName); // if (!Locations.contains(propName.longName)) { // System.out.println("Missing file: " + propName.longName); // } } } + private static String fix(String string) { final char ch = string.charAt(0); if ('0' <= ch && ch <= '9') { diff --git a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java index cf2f176cd..b7c901add 100644 --- a/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java +++ b/unicodetools/src/main/java/org/unicode/props/IndexUnicodeProperties.java @@ -1,5 +1,19 @@ package org.unicode.props; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.Transform; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ICUException; +import com.ibm.icu.util.VersionInfo; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; @@ -24,7 +38,6 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; - import org.unicode.draft.CldrUtility.VariableReplacer; import org.unicode.draft.UnicodeDataInput; import org.unicode.draft.UnicodeDataInput.ItemReader; @@ -38,50 +51,35 @@ import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.google.common.base.Splitter; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import 
com.ibm.icu.impl.Relation; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.text.Normalizer2; -import com.ibm.icu.text.Transform; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ICUException; -import com.ibm.icu.util.VersionInfo; - /** * TODO StandardizedVariants and NameSequences* - * @author markdavis * + * @author markdavis */ public class IndexUnicodeProperties extends UnicodeProperty.Factory { static final String SET_SEPARATOR = "|"; - /** - * Control file caching - */ + /** Control file caching */ static final boolean GZIP = true; + static final boolean SIMPLE_COMPRESSION = true; static final boolean FILE_CACHE = System.getProperty("DISABLE_PROP_FILE_CACHE") == null; - /** - * Debugging - */ + /** Debugging */ static final boolean SHOW_DEFAULTS = false; + private static final boolean CHECK_PROPERTY_STATUS = false; - static final UcdProperty CHECK_PROPERTY = null; // UcdProperty.Bidi_Class; // UcdProperty.Numeric_Value; // + static final UcdProperty CHECK_PROPERTY = + null; // UcdProperty.Bidi_Class; // UcdProperty.Numeric_Value; // static Normalizer2 NFD = Normalizer2.getNFDInstance(); // - //static Normalizer2 NFD2 = Normalizer2.getInstance(null, "NFC", Mode.DECOMPOSE); + // static Normalizer2 NFD2 = Normalizer2.getInstance(null, "NFC", Mode.DECOMPOSE); static final boolean SHOW_LOADED = false; - public final static String FIELD_SEPARATOR = "; "; - private static final Relation DATA_LOADING_ERRORS - = Relation.of(new EnumMap>(UcdProperty.class), LinkedHashSet.class); + public static final String FIELD_SEPARATOR = "; "; + private static final Relation DATA_LOADING_ERRORS = + Relation.of( + new EnumMap>(UcdProperty.class), LinkedHashSet.class); public enum DefaultValueType { LITERAL(null), @@ -92,7 +90,9 @@ public enum DefaultValueType { Simple_Lowercase_Mapping(UcdProperty.Simple_Lowercase_Mapping), Simple_Titlecase_Mapping(UcdProperty.Simple_Titlecase_Mapping), 
Simple_Uppercase_Mapping(UcdProperty.Simple_Uppercase_Mapping); - static final HashMap mapping = new HashMap(); + static final HashMap mapping = + new HashMap(); + static { mapping.put("", NONE); mapping.put("", Simple_Lowercase_Mapping); @@ -101,22 +101,27 @@ public enum DefaultValueType { mapping.put("", CODE_POINT); mapping.put("", CODE_POINT); mapping.put("\n" + - "\n" + - ""); + // PrintWriter out = FileUtilities.openUTF8Writer("C:/DATA/GEN/charts/namelist/", + // chartPrefix + fileName); + PrintWriter out = + Utility.openPrintWriter( + NAMESLIST_DIR, chartPrefix + fileName, Utility.UTF8_WINDOWS); + final String heading = + TransliteratorUtilities.toHTML.transliterate(getHeading(lineParts[2])); + + out.println( + "\n" + + "\n" + + "\n" + + "\n" + + "" + + heading + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + ""); // header - out.println("
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
" + lineParts[1] + "" + heading + "" + lineParts[3] + "
"); + out.println( + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
" + + lineParts[1] + + "" + + heading + + "" + + lineParts[3] + + "
"); if ("Unassigned".equals(lineParts[2])) { System.out.println("debug"); @@ -132,13 +160,13 @@ public static void main(String[] args) throws Exception { // first pass through and collect all the code points collectedCodePoints.clear(); for (int i = 1; i < lines.size(); ++i) { - final String line = (String)lines.get(i); + final String line = (String) lines.get(i); final int cp1 = line.charAt(0); if (cp1 == ';') { continue; } if (cp1 != '@' && cp1 != '\t') { - final int cp = Integer.parseInt(line.split("\t")[0],16); + final int cp = Integer.parseInt(line.split("\t")[0], 16); collectedCodePoints.add(cp); } } @@ -149,7 +177,8 @@ public static void main(String[] args) throws Exception { } else { out.println("

"); int counter = 0; - for (final UnicodeSetIterator it = new UnicodeSetIterator(collectedCodePoints); it.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(collectedCodePoints); + it.next(); ) { if ((counter % 16) == 0 && counter != 0) { out.println(""); } @@ -168,14 +197,27 @@ public static void main(String[] args) throws Exception { String title = ""; final String name = Default.ucd().getName(it.codepoint); if (name != null) { - title = " title='" + TransliteratorUtilities.toHTML.transliterate(name.toLowerCase()) + "'"; + title = + " title='" + + TransliteratorUtilities.toHTML.transliterate( + name.toLowerCase()) + + "'"; } - out.println(""); + out.println( + ""); counter++; } if (counter > 16) { @@ -190,20 +232,24 @@ public static void main(String[] args) throws Exception { } out.println("\n"); out.close(); - //out = FileUtilities.openUTF8Writer("C:/DATA/GEN/charts/namelist/", namePrefix + fileName); - out = Utility.openPrintWriter(NAMESLIST_DIR, namePrefix + fileName, Utility.UTF8_WINDOWS); - out.println("\n" + - UtilityBase.HTML_HEAD + - "none\n" + - "\n" + - "\n" + - ""); + // out = FileUtilities.openUTF8Writer("C:/DATA/GEN/charts/namelist/", namePrefix + + // fileName); + out = + Utility.openPrintWriter( + NAMESLIST_DIR, namePrefix + fileName, Utility.UTF8_WINDOWS); + out.println( + "\n" + + UtilityBase.HTML_HEAD + + "none\n" + + "\n" + + "\n" + + ""); out.println("

" + heading + "

\n

\u00A0" - + showChar(it.codepoint, true) - + "\u00A0
" + - hexcp + "
\u00A0" + + showChar(it.codepoint, true) + + "\u00A0
" + + hexcp + + "
"); // now do the characters boolean inTable = true; boolean firstInTable = true; for (int i = 1; i < lines.size(); ++i) { - String line = (String)lines.get(i); + String line = (String) lines.get(i); try { if (line.startsWith("@") && !line.startsWith("@+\t*")) { finishItem(out); @@ -213,31 +259,32 @@ public static void main(String[] args) throws Exception { // } line = line.substring(1); if (line.equals("@+")) { - // skip @@+ which is an index tab - } else if (line.startsWith("~") || line.startsWith("@~") || line.startsWith("@@~")) { + // skip @@+ which is an index tab + } else if (line.startsWith("~") + || line.startsWith("@~") + || line.startsWith("@@~")) { // skip @~, @@~, @@@~ which are variation subheaders - if ((i + 1 < lines.size()) && ((String)lines.get(i + 1)).startsWith("@+")) { - // also skip a following @+ notice line that continues the variation subheader + if ((i + 1 < lines.size()) + && ((String) lines.get(i + 1)).startsWith("@+")) { + // also skip a following @+ notice line that continues the variation + // subheader ++i; } } else if (line.startsWith("+")) { line = line.substring(1).trim(); - out.println(""); + out.println( + ""); } else if (line.startsWith("@")) { System.err.println("*** Can't handle line: " + i + "\t" + line); } else { line = line.trim(); - out.println(""); + out.println(""); } } else { boolean convertHex = true; if (line.startsWith("@+\t*")) { - line = line.substring(2); // handle like regular informative note - convertHex = false; // but without converting hex numbers + line = line.substring(2); // handle like regular informative note + convertHex = false; // but without converting hex numbers } if (!inTable) { out.println("
" - + line - + "
" + line + "

" - + line - + "

" + line + "

"); @@ -247,31 +294,53 @@ public static void main(String[] args) throws Exception { if (line.startsWith("\t")) { String body = line.trim(); if (false && line.indexOf(body) != 1) { - System.out.println("Format error: too much inital whitespace: <" + line + ">"); + System.out.println( + "Format error: too much inital whitespace: <" + line + ">"); } final char firstChar = body.charAt(0); switch (firstChar) { - case '*': body = "\u2022 " + body.substring(2); break; - case '%': body = "\u203B " + body.substring(2); break; - case ':': body = checkCanonical(lastCodePoint, body); break; - case '#': body = checkCompatibility(lastCodePoint, body); break; - case 'x': body = getOther(body); break; - case '=': break; - case ';': continue; - case '~': continue; - default: throw new IllegalArgumentException("Huh? " + body); + case '*': + body = "\u2022 " + body.substring(2); + break; + case '%': + body = "\u203B " + body.substring(2); + break; + case ':': + body = checkCanonical(lastCodePoint, body); + break; + case '#': + body = checkCompatibility(lastCodePoint, body); + break; + case 'x': + body = getOther(body); + break; + case '=': + break; + case ';': + continue; + case '~': + continue; + default: + throw new IllegalArgumentException("Huh? 
" + body); } final char firstDisplayChar = body.charAt(0); body = body.substring(1).trim(); - out.println("\u00A0" - + "" - + ""); + out.println( + "\u00A0" + + "" + + ""); convertHex = true; } else if (line.startsWith(";")) { System.err.println("*** Ignoring:" + line); @@ -280,73 +349,101 @@ public static void main(String[] args) throws Exception { finishItem(out); lineParts = line.split("\t"); final String x = lineParts[0]; - lastCodePoint = Integer.parseInt(x,16); + lastCodePoint = Integer.parseInt(x, 16); final boolean lastCodePointIsNew = isNew(lastCodePoint); if (lastCodePointIsNew) { - nameListNew.set(nameList.size()-1, true); + nameListNew.set(nameList.size() - 1, true); } - out.println("" + x + "" - + "" - + ""); + out.println( + "" + + x + + "" + + "" + + ""); lastDecompType = Default.ucd().getDecompositionType(lastCodePoint); } firstInTable = false; } } catch (final Exception e) { - throw (IllegalArgumentException) new IllegalArgumentException("Error on line: " + line) - .initCause(e); + throw (IllegalArgumentException) + new IllegalArgumentException("Error on line: " + line).initCause(e); } } finishItem(out); - out.println("
\u00A0" - + firstDisplayChar - + "" - + maybeNameStyle(showTextConvertingHex(body, convertHex && firstChar != '=' && firstChar != '%'), firstChar == '=') - + "
\u00A0" + + firstDisplayChar + + "" + + maybeNameStyle( + showTextConvertingHex( + body, + convertHex + && firstChar != '=' + && firstChar != '%'), + firstChar == '=') + + "
\u00A0" - + showChar(lastCodePoint, true) + "\u00A0" - + nameStyle(showTextConvertingHex(lineParts[1], false)) + "
\u00A0" + + showChar(lastCodePoint, true) + + "\u00A0" + + nameStyle(showTextConvertingHex(lineParts[1], false)) + + "
\n" + - "




































\n" + - "\n"); + out.println( + "\n" + + "




































\n" + + "\n"); out.close(); } blockInfo.in.close(); - // PrintWriter out = FileUtilities.openUTF8Writer("C:/DATA/GEN/charts/namelist/", "mainList.html"); - final PrintWriter out = Utility.openPrintWriter(NAMESLIST_DIR, "mainList.html", Utility.UTF8_WINDOWS); + // PrintWriter out = FileUtilities.openUTF8Writer("C:/DATA/GEN/charts/namelist/", + // "mainList.html"); + final PrintWriter out = + Utility.openPrintWriter(NAMESLIST_DIR, "mainList.html", Utility.UTF8_WINDOWS); FileUtilities.appendFile(WriteCharts.class, "nameslist_chart_header.html", out); - // out.println("" + - // "Main List" + + // out.println("" + + // "Main List" + // ""); for (int i = 0; i < nameList.size(); ++i) { final String line = (String) nameList.get(i); final String[] lineParts = line.split("\t"); final String fileName = lineParts[1] + ".html"; - out.println("" + getHeading(lineParts[2]) + ""); + out.println( + "" + + getHeading(lineParts[2]) + + ""); } out.println("
" + lineParts[1] + - "" + - lineParts[3] +"
" + + lineParts[1] + + "" + + lineParts[3] + + "
"); WriteCharts.closeIndexFile(out, "", WriteCharts.NAMELIST, true); - //out.close(); - //final BagFormatter bf = new BagFormatter(); - //System.out.println(bf.showSetDifferences("Has name in decomps", hasName, "Has no name in decomps", hasNoName)); + // out.close(); + // final BagFormatter bf = new BagFormatter(); + // System.out.println(bf.showSetDifferences("Has name in decomps", hasName, "Has no name in + // decomps", hasNoName)); System.out.println("Name differences: Canonical"); showNameDifferences(hasNameCan, hasNoNameCan); System.out.println("Name differences: Compatibility"); showNameDifferences(hasNameComp, hasNoNameComp); - // System.out.println("Characters with names in decomps: " + hasName.toPattern(true)); - // System.out.println("Characters without names in decomps: " + hasNoName.toPattern(true)); - // System.out.println("Characters sometimes with, sometimes without names in decomps: " + both.toPattern(true)); + // System.out.println("Characters with names in decomps: " + + // hasName.toPattern(true)); + // System.out.println("Characters without names in decomps: " + + // hasNoName.toPattern(true)); + // System.out.println("Characters sometimes with, sometimes without names in decomps: + // " + both.toPattern(true)); System.out.println("Done"); } private static void checkFile() throws IOException { - final BufferedReader in = Utility.openUnicodeFile("NamesList", Default.ucdVersion(), true, Utility.LATIN1_WINDOWS); + final BufferedReader in = + Utility.openUnicodeFile( + "NamesList", Default.ucdVersion(), true, Utility.LATIN1_WINDOWS); final Set missing = new TreeSet(EnumSet.allOf(LineMatcher.class)); - final Map examples = new TreeMap(); + final Map examples = new TreeMap(); while (true) { final String line = in.readLine(); if (line == null) { @@ -376,9 +473,9 @@ private static boolean isNew(int codepoint) { private static void showNameDifferences(Map hasName, Map hasNoName) { final Set both = new TreeSet(hasNoName.keySet()); 
both.retainAll(hasName.keySet()); - //hasNoName.removeAll(both); - //hasName.removeAll(both); - for (final Iterator it = both.iterator(); it.hasNext();) { + // hasNoName.removeAll(both); + // hasName.removeAll(both); + for (final Iterator it = both.iterator(); it.hasNext(); ) { final String decomp = (String) it.next(); System.out.println(); System.out.println("decomp: " + Utility.hex(decomp)); @@ -406,15 +503,43 @@ private static void finishItem(PrintWriter out) { System.out.println("Alert: missing decomp for " + Utility.hex(lastCodePoint)); } final String str = UTF16.valueOf(lastCodePoint); - final String upper = showForm(out, str, null, null, Default.ucd().getCase(str,UCD_Types.FULL,UCD_Types.UPPER), "\u2191"); - showForm(out, str, upper, null, Default.ucd().getCase(str,UCD_Types.FULL,UCD_Types.TITLE), "\u2195"); - final String lower = showForm(out, str, null, null, Default.ucd().getCase(str,UCD_Types.FULL,UCD_Types.LOWER), "\u2193"); - showForm(out, lower, null, null, Default.ucd().getCase(str,UCD_Types.FULL,UCD_Types.FOLD), "\u2194"); + final String upper = + showForm( + out, + str, + null, + null, + Default.ucd().getCase(str, UCD_Types.FULL, UCD_Types.UPPER), + "\u2191"); + showForm( + out, + str, + upper, + null, + Default.ucd().getCase(str, UCD_Types.FULL, UCD_Types.TITLE), + "\u2195"); + final String lower = + showForm( + out, + str, + null, + null, + Default.ucd().getCase(str, UCD_Types.FULL, UCD_Types.LOWER), + "\u2193"); + showForm( + out, + lower, + null, + null, + Default.ucd().getCase(str, UCD_Types.FULL, UCD_Types.FOLD), + "\u2194"); final String dc = Default.ucd().getDecompositionMapping(lastCodePoint); - final String nfd = showForm(out, dc, str, null, Default.nfd().normalize(lastCodePoint), "\u21DB"); - //String nfc = showForm(out, dc, null, Default.nfc().normalize(lastCodePoint), "\u21DB"); - final String nfkd = showForm(out, dc, str, nfd, Default.nfkd().normalize(lastCodePoint), "\u21DD"); + final String nfd = + showForm(out, dc, str, null, 
Default.nfd().normalize(lastCodePoint), "\u21DB"); + // String nfc = showForm(out, dc, null, Default.nfc().normalize(lastCodePoint), "\u21DB"); + final String nfkd = + showForm(out, dc, str, nfd, Default.nfkd().normalize(lastCodePoint), "\u21DD"); // if (nfkd.equals(str)) { // Set s = ti.getConfusables(lastCodePoint, "MA"); @@ -427,11 +552,14 @@ private static void finishItem(PrintWriter out) { // for (Iterator it = sortedSet.iterator(); it.hasNext();) { // String other = (String)it.next(); // if (nfkd.equals(Default.nfkd().normalize(other))) continue; - // out.println("\u00A0\u00A0\u279F\u00A0" + // out.println("\u00A0\u00A0\u279F\u00A0" // + showTextConvertingHex(Utility.hex(other, 4, " + "), true) // + " " - // + Default.ucd().getName(other, UCD.NORMAL, " + ").toLowerCase() - // // maybeNameStyle(showTextConvertingHex(upper, firstChar != '='), firstChar == '=') + // + Default.ucd().getName(other, UCD.NORMAL, " + + // ").toLowerCase() + // // maybeNameStyle(showTextConvertingHex(upper, firstChar != + // '='), firstChar == '=') // + ""); // } // } @@ -441,19 +569,33 @@ private static void finishItem(PrintWriter out) { static Set sortedSet = new TreeSet(Collator.getInstance(ULocale.ENGLISH)); - private static String showForm(PrintWriter out, String str, String str2, String str3, String transformed, String symbol) { + private static String showForm( + PrintWriter out, + String str, + String str2, + String str3, + String transformed, + String symbol) { if (!transformed.equals(str) && !transformed.equals(str2) && !transformed.equals(str3)) { - out.println("\u00A0\u00A0" + symbol + "" - + showTextConvertingHex(Utility.hex(transformed, 4, " + "), true) - + (UTF16.countCodePoint(transformed) != 1 ? 
"" : - " " + Default.ucd().getName(transformed, UCD_Types.NORMAL, " + ").toLowerCase()) - // maybeNameStyle(showTextConvertingHex(upper, firstChar != '='), firstChar == '=') - + ""); + out.println( + "\u00A0\u00A0" + + symbol + + "" + + showTextConvertingHex(Utility.hex(transformed, 4, " + "), true) + + (UTF16.countCodePoint(transformed) != 1 + ? "" + : " " + + Default.ucd() + .getName(transformed, UCD_Types.NORMAL, " + ") + .toLowerCase()) + // maybeNameStyle(showTextConvertingHex(upper, firstChar != '='), + // firstChar == '=') + + ""); } return transformed; } - static public String getHeading(String name) { + public static String getHeading(String name) { final int pos = name.lastIndexOf(" ("); if (pos < 0) { return name; @@ -468,10 +610,12 @@ private static String maybeNameStyle(String string, boolean b) { return string; } - private static String nameStyle(String string) { // TODO Auto-generated method stub - String result = "" + Default.ucd().getCase(string, UCD_Types.FULL, UCD_Types.TITLE) + ""; + String result = + "" + + Default.ucd().getCase(string, UCD_Types.FULL, UCD_Types.TITLE) + + ""; // if it has any &xxx;, then restore them. 
int position = 0; while (true) { @@ -480,9 +624,10 @@ private static String nameStyle(String string) { } final int start = escapeMatch.start(); position = escapeMatch.end(); - result = result.substring(0,start) - + result.substring(start, position).toLowerCase() - + result.substring(position); + result = + result.substring(0, start) + + result.substring(start, position).toLowerCase() + + result.substring(position); } return result; } @@ -503,14 +648,17 @@ private static String showTextConvertingHex(String body, boolean addCharToHex) { if (len < 4 || len > 6) { continue; } - final int cp = Integer.parseInt(findHex.group(),16); + final int cp = Integer.parseInt(findHex.group(), 16); if (cp > 0x10FFFF) { continue; } final String insert = "\u00A0" + showChar(cp, true); - final String beginning = body.substring(0,start) - + "" + body.substring(start, position) + "" - + insert; + final String beginning = + body.substring(0, start) + + "" + + body.substring(start, position) + + "" + + insert; body = beginning + body.substring(position); position = beginning.length(); } @@ -519,24 +667,24 @@ private static String showTextConvertingHex(String body, boolean addCharToHex) { } /* -CROSS_REF: TAB "x" SP CHAR SP LCNAME LF - | TAB "x" SP CHAR SP "<" LCNAME ">" LF - // x is replaced by a right arrow - - | TAB "x" SP "(" LCNAME SP "-" SP CHAR ")" LF - | TAB "x" SP "(" "<" LCNAME ">" SP "-" SP CHAR ")" LF - // x is replaced by a right arrow; - // (second type as used for control and noncharacters) - - // In the forms with parentheses the "(","-" and ")" are removed - // and the order of CHAR and LCNAME is reversed; - // i.e. 
all inputs result in the same order of output - - | TAB "x" SP CHAR LF - // x is replaced by a right arrow - // (this type is the only one without LCNAME - // and is used for ideographs) - */ + CROSS_REF: TAB "x" SP CHAR SP LCNAME LF + | TAB "x" SP CHAR SP "<" LCNAME ">" LF + // x is replaced by a right arrow + + | TAB "x" SP "(" LCNAME SP "-" SP CHAR ")" LF + | TAB "x" SP "(" "<" LCNAME ">" SP "-" SP CHAR ")" LF + // x is replaced by a right arrow; + // (second type as used for control and noncharacters) + + // In the forms with parentheses the "(","-" and ")" are removed + // and the order of CHAR and LCNAME is reversed; + // i.e. all inputs result in the same order of output + + | TAB "x" SP CHAR LF + // x is replaced by a right arrow + // (this type is the only one without LCNAME + // and is used for ideographs) + */ static Matcher pointer = Pattern.compile("x \\((.*) - ([0-9A-F]+)\\)").matcher(""); static Matcher pointer1 = Pattern.compile("x ([0-9A-F]{4,6}) (.*)").matcher(""); static Matcher pointer2 = Pattern.compile("x ([0-9A-F]{4,6})").matcher(""); @@ -548,13 +696,13 @@ private static String getOther(String body) { int cp; String name = null; if (pointer.reset(body).matches()) { - cp = Integer.parseInt(pointer.group(2),16); + cp = Integer.parseInt(pointer.group(2), 16); name = checkName(body, cp, pointer.group(1)); } else if (pointer1.reset(body).matches()) { - cp = Integer.parseInt(pointer1.group(1),16); + cp = Integer.parseInt(pointer1.group(1), 16); name = checkName(body, cp, pointer1.group(2)); } else if (pointer2.reset(body).matches()) { - cp = Integer.parseInt(pointer2.group(1),16); + cp = Integer.parseInt(pointer2.group(1), 16); // name = UCharacter.getName(cp).toLowerCase(); // System.out.println("Irregular format: " + body); } else { @@ -562,7 +710,9 @@ private static String getOther(String body) { String mismatch2 = RegexUtilities.showMismatch(pointer2, body); throw new IllegalArgumentException("Bad format:\n\t" + mismatch + "\n\t" + mismatch2); 
} - return "\u2192 " + Utility.hex(cp,4) /*+ " " + showChar(cp)*/ + (name != null ? " " + name : ""); + return "\u2192 " + + Utility.hex(cp, 4) /*+ " " + showChar(cp)*/ + + (name != null ? " " + name : ""); } public static String checkName(String body, int cp, String name) { @@ -571,7 +721,8 @@ public static String checkName(String body, int cp, String name) { name2 = ""; } if (!name.equalsIgnoreCase(name2)) { - System.out.println("Mismatch in name for " + body + " in " + Utility.hex(lastCodePoint)); + System.out.println( + "Mismatch in name for " + body + " in " + Utility.hex(lastCodePoint)); System.out.println("\tName is: " + name2); } return name; @@ -585,13 +736,14 @@ static String showChar(int cp, boolean addRlmIfNeeded) { } else if (cp == 0x7F) { rep = 0x2421; } - return "" + (char)rep + ""; + return "" + (char) rep + ""; } if (usePicture.contains(cp)) { return ""; - //String hex = Utility.hex(cp); - //return "" + hex + ""; + // String hex = Utility.hex(cp); + // return "" + hex + ""; } if (isWhiteSpace.contains(cp)) { return ""; @@ -610,7 +762,7 @@ static String showChar(int cp, boolean addRlmIfNeeded) { return result; } - //static final UnicodeSet noname = new UnicodeSet("[[:ascii:][:ideographic:]]"); + // static final UnicodeSet noname = new UnicodeSet("[[:ascii:][:ideographic:]]"); static final Map hasNoNameCan = new TreeMap(); static final Map hasNameCan = new TreeMap(); static final Map hasNoNameComp = new TreeMap(); @@ -620,7 +772,8 @@ static String showChar(int cp, boolean addRlmIfNeeded) { private static String checkCanonical(int codePoint, String body) { body = body.substring(2); if (lastDecompType != UCD_Types.CANONICAL) { - System.out.println("Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint)); + System.out.println( + "Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint)); } final String lastDecomp = Default.ucd().getDecompositionMapping(lastCodePoint); final String hexed = Utility.hex(lastDecomp, 4, 
" "); @@ -633,7 +786,8 @@ private static String checkCanonical(int codePoint, String body) { } else if (hexed2.equalsIgnoreCase(body)) { hasNameCan.put(lastDecomp, UTF16.valueOf(codePoint)); } else { - System.out.println("Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint)); + System.out.println( + "Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint)); System.out.println("\tShould be: " + hexed); } lastDecompType = UCD_Types.NONE; @@ -643,7 +797,8 @@ private static String checkCanonical(int codePoint, String body) { private static String checkCompatibility(int codePoint, String body) { body = body.substring(2); if (lastDecompType <= UCD_Types.CANONICAL) { - System.out.println("Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint)); + System.out.println( + "Mismatching Decomposition Type: " + body + " in " + Utility.hex(codePoint)); } final String lastDecomp = Default.ucd().getDecompositionMapping(lastCodePoint); String hexed = Utility.hex(lastDecomp, 4, " "); @@ -660,7 +815,8 @@ private static String checkCompatibility(int codePoint, String body) { } else if (hexed2.equalsIgnoreCase(body)) { hasNameComp.put(lastDecomp, UTF16.valueOf(codePoint)); } else { - System.out.println("Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint)); + System.out.println( + "Mismatching Decomposition: " + body + " in " + Utility.hex(codePoint)); System.out.println("\tShould be: " + hexed); } lastDecompType = UCD_Types.NONE; @@ -670,10 +826,12 @@ private static String checkCompatibility(int codePoint, String body) { static class BlockInfo { BufferedReader in; String lastLine; - BlockInfo (String version, String filename) throws IOException { + + BlockInfo(String version, String filename) throws IOException { in = Utility.openUnicodeFile(filename, version, true, Utility.LATIN1_WINDOWS); - //in = FileUtilities.openUTF8Reader(dir, filename); + // in = FileUtilities.openUTF8Reader(dir, filename); } + boolean next(List 
inout) throws IOException { inout.clear(); if (lastLine != null) { @@ -693,19 +851,24 @@ boolean next(List inout) throws IOException { } return inout.size() > 0; } - } public static final String[][] LINE_MATCHER_VARIABLES = { {"$char", "[0-9A-F]{4,6}"}, - {"$name", "[0-9A-Z](?:[0-9A-Z\\- ]*[0-9A-Z])?"}, // alphanumeric, separated by spaces and '-' - {"$lcname", "[0-9a-zA-Z](?:[0-9a-zA-Z \\-]*[0-9a-zA-Z])?"}, // lowercase alphanumeric, separated by spaces and '-' + { + "$name", "[0-9A-Z](?:[0-9A-Z\\- ]*[0-9A-Z])?" + }, // alphanumeric, separated by spaces and '-' + { + "$lcname", "[0-9a-zA-Z](?:[0-9a-zA-Z \\-]*[0-9a-zA-Z])?" + }, // lowercase alphanumeric, separated by spaces and '-' // NOTE: lcname can contain uppercase characters - {"$comment", "\\([A-Za-z](?:[0-9A-Za-z, \\-]*[0-9A-Za-z])?\\)"}, // '(' alphanumeric (upper or lower) separated by spaces ')' + { + "$comment", "\\([A-Za-z](?:[0-9A-Za-z, \\-]*[0-9A-Za-z])?\\)" + }, // '(' alphanumeric (upper or lower) separated by spaces ')' }; enum LineMatcher { - //NAME_LINE: CHAR NAME LF + // NAME_LINE: CHAR NAME LF // NOTE: sometimes , sometimes TAB // // The CHAR and the corresponding image are echoed, // // followed by the name as given in NAME @@ -723,30 +886,30 @@ enum LineMatcher { NAME_LINE3("($char)\t($name) ($comment)(?: (\\*))?"), // NOTE: COMMENT should be "(" ... ")" // - //RESERVED_LINE: CHAR TAB + // RESERVED_LINE: CHAR TAB // // The CHAR is echoed followed by an icon for the // // reserved character and a fixed string e.g. 
// RESERVED_LINE("$char\t()"), - //COMMENT_LINE: "*" SP EXPAND_LINE + // COMMENT_LINE: "*" SP EXPAND_LINE // // * is replaced by BULLET, output line as comment // EXPAND_LINE // // Output line as comment COMMENT_LINE("\t\\* (.*)"), // - //ALIAS_LINE: "=" SP LINE + // ALIAS_LINE: "=" SP LINE // // Replace = by itself, output line as alias ALIAS_LINE("\t= (.*)"), // - //FORMALALIAS_LINE: "%" SP LINE + // FORMALALIAS_LINE: "%" SP LINE // // Replace % by U+203B, output line as formal alias FORMALALIAS_LINE("\t% (.*)"), // - //CROSS_REF: "X" SP CHAR SP LCNAME + // CROSS_REF: "X" SP CHAR SP LCNAME // // X is replaced by a right arrow CROSS_REF1("\tx ($char) ($name)"), CROSS_REF_SPACE("\tx ($char)(?: ($name))?"), - //NOTE: " x 5382" doesn't have name + // NOTE: " x 5382" doesn't have name // "X" SP "(" LCNAME SP "-" SP CHAR ")" CROSS_REF2("\tx \\(($lcname) - ($char)\\)"), CROSS_REF3("\tx \\(<($lcname)> - ($char)\\)"), @@ -757,46 +920,47 @@ enum LineMatcher { // // i.e. both inputs result in the same output CROSS_REF_XTRATAB1("\t\tx ($char) ($name)"), CROSS_REF_XTRATAB2("\t\tx \\(($lcname) - ($char)\\)"), - //NOTE: is "x", not "X" + // NOTE: is "x", not "X" // - //FILE_COMMENT: ";" LINE + // FILE_COMMENT: ";" LINE FILE_COMMENT(";(.*)"), - //EMPTY_LINE: LF + // EMPTY_LINE: LF // // Empty and ignored lines as well as // // file comments are ignored EMPTY_LINE(""), // - //SIDEBAR_LINE: ";;" LINE + // SIDEBAR_LINE: ";;" LINE // // Skip ';;' characters, output line // // as marginal note SIDEBAR_LINE(";;(.*)"), // - //IGNORED_LINE: ";" EXPAND_LINE + // IGNORED_LINE: ";" EXPAND_LINE // // Skip ':' character, ignore text // NOTE: : is wrong IGNORED_LINE("\t;(.*)"), // - //DECOMPOSITION: ":" SP EXPAND_LINE + // DECOMPOSITION: ":" SP EXPAND_LINE // // Replace ':' by EQUIV, expand line into // // decomposition DECOMPOSITION("\t: (.*)"), // - //COMPAT_MAPPING: "#" SP EXPAND_LINE - //COMPAT_MAPPING: "#" SP "<" LCTAG ">" SP EXPAND_LINE + // COMPAT_MAPPING: "#" SP EXPAND_LINE + // 
COMPAT_MAPPING: "#" SP "<" LCTAG ">" SP EXPAND_LINE // // Replace '#' by APPROX, output line as mapping; // // check the for balanced < > COMPAT_MAPPING2("\t# <($lctag)> (.*)"), COMPAT_MAPPING1("\t# (.*)"), - //NOTE: out of order + // NOTE: out of order // - //NOTICE_LINE: "@+" LINE + // NOTICE_LINE: "@+" LINE // // Skip '@+', output text as notice // "@+" TAB * SP LINE NOTICE_LINE_XTRATAB2("@\\+\t\t\\* (.*)"), NOTICE_LINE_XTRATAB1("@\\+\t ?\t(.*)"), NOTICE_LINE2("@\\+\t\\* (.*)"), NOTICE_LINE1("@\\+\t(.*)"), - // NOTE: @+ Italic symbols already encoded in the Letterlike Symbols block are omitted here to avoid duplicate encoding. + // NOTE: @+ Italic symbols already encoded in the Letterlike Symbols block are omitted + // here to avoid duplicate encoding. // has TAB SP TAB // NOTE: out of order // // Skip '@', output text as notice @@ -806,17 +970,17 @@ enum LineMatcher { // // a character code apply to the page/block/column // // and are italicized, but not indented // - //SUBTITLE: "@@@+" LINE + // SUBTITLE: "@@@+" LINE // // Skip "@@@+", output text as subtitle SUBTITLE("@@@\\+\t(.*)"), // - //SUBHEADER: "@" LINE + // SUBHEADER: "@" LINE // // Skip '@', output line as text as column header SUBHEADER_XTRATAB("@\t\t(.*)"), SUBHEADER("@\t(.*)"), // NOTE: has 2 tabs // - //BLOCKHEADER: "@@" BLOCKSTART BLOCKNAME BLOCKEND + // BLOCKHEADER: "@@" BLOCKSTART BLOCKNAME BLOCKEND // // Skip "@@", cause a page break and optional // // blank page, then output one or more charts // // followed by the list of character names. 
@@ -832,7 +996,7 @@ enum LineMatcher { // // If a comment is present it replaces the blockname // // when an ISO-style namelist is laid out // - //BLOCKNAME: LABEL + // BLOCKNAME: LABEL // LABEL SP "(" LABEL ")" BLOCKNAME2("@@\t(.*)\\(.*\\)"), BLOCKNAME("@@\t(.*)"), @@ -841,20 +1005,20 @@ enum LineMatcher { // // the blockname when an ISO-style namelist is // // laid out; it is ignored in the Unicode charts // - //BLOCKSTART: CHAR // First character position in block + // BLOCKSTART: CHAR // First character position in block BLOCKSTARTOREND("$char"), - //BLOCKEND: CHAR // Last character position in block - //PAGE_BREAK: "@@" // Insert a (column) break + // BLOCKEND: CHAR // Last character position in block + // PAGE_BREAK: "@@" // Insert a (column) break PAGE_BREAK("$char"), - //INDEX_TAB: "@@+" // Start a new index tab at latest BLOCKSTART + // INDEX_TAB: "@@+" // Start a new index tab at latest BLOCKSTART INDEX_TAB("@@\\+"), // - //TITLE: "@@@" LINE + // TITLE: "@@@" LINE TITLE("@@@\t(.*)"), // // Skip "@@@", output line as text // // Title is used in page headers // - //EXPAND_LINE: {CHAR | STRING}+ LF + // EXPAND_LINE: {CHAR | STRING}+ LF // // All instances of CHAR *) are replaced by // // CHAR NBSP x NBSP where x is the single Unicode // // character corresponding to CHAR. @@ -862,10 +1026,10 @@ enum LineMatcher { // // CHAR NBSP x NBSP where is the // // dotted circle NO_DEFINITION("\t(.*)"), - // NOTE: this is not defined. Example: " Final Unicode 5.1 names list." - // "00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK *" is not defined + // NOTE: this is not defined. Example: " Final Unicode 5.1 names list." 
+ // "00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK *" is not defined - ; + ; Matcher matcher; private LineMatcher(String regexPattern) { @@ -883,15 +1047,19 @@ public static LineMatcher match(String input) { } return null; } + public String group() { return matcher.group(); } + public String group(int arg0) { return matcher.group(arg0); } + public int groupCount() { return matcher.groupCount(); } + @Override public String toString() { final StringBuilder result = new StringBuilder(name()); @@ -911,4 +1079,4 @@ public String toString() { return result.toString(); } } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/UCA/MappingsForFractionalUCA.java b/unicodetools/src/main/java/org/unicode/text/UCA/MappingsForFractionalUCA.java index 83bdd6e52..f1b094752 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCA/MappingsForFractionalUCA.java +++ b/unicodetools/src/main/java/org/unicode/text/UCA/MappingsForFractionalUCA.java @@ -1,5 +1,7 @@ package org.unicode.text.UCA; +import com.ibm.icu.text.CanonicalIterator; +import com.ibm.icu.text.UTF16; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; @@ -7,15 +9,11 @@ import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; - import org.unicode.text.UCA.UCA.AppendToCe; import org.unicode.text.UCD.Default; import org.unicode.text.UCD.UCD; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.CanonicalIterator; -import com.ibm.icu.text.UTF16; - /** * Prepares the UCA mappings for generation of FractionalUCA.txt. * @@ -26,28 +24,22 @@ public final class MappingsForFractionalUCA { private final UCA uca; /** - * UCA collation mapping data. - * Comparison is by sort key first, then by the string. - * CEs do not contain completely ignorable CEs. + * UCA collation mapping data. Comparison is by sort key first, then by the string. CEs do not + * contain completely ignorable CEs. 
*/ /* package */ static class MappingWithSortKey implements Comparable { - /** - * Optional prefix (context) string. null if none. - */ + /** Optional prefix (context) string. null if none. */ private final String prefix; + private final String s; - /** - * Only non-zero collation elements, enforced by the constructors. - */ + /** Only non-zero collation elements, enforced by the constructors. */ private final CEList ces; /** - * Modified CEs, if any. - * If not null, then these are the CEs to be transformed into fractional CEs. + * Modified CEs, if any. If not null, then these are the CEs to be transformed into + * fractional CEs. */ private CEList modifiedCEs; - /** - * Standard 3-level UCA sort key "string" corresponding to ces. - */ + /** Standard 3-level UCA sort key "string" corresponding to ces. */ private final String sortKey; private MappingWithSortKey(UCA uca, String s) { @@ -73,9 +65,7 @@ public boolean hasPrefix() { return prefix != null; } - /** - * Returns the optional prefix (context) string. null if none. - */ + /** Returns the optional prefix (context) string. null if none. */ public String getPrefix() { return prefix; } @@ -90,6 +80,7 @@ public CEList getOriginalCEs() { /** * Returns the modified CEs, if set, or else the original CEs. + * * @see #setModifiedCEs(CEList) */ public CEList getCEs() { @@ -144,9 +135,8 @@ private static int comparePrefixes(String p1, String p2) { } /** - * Returns the mappings for FractionalUCA.txt. - * The mappings are sorted by UCA sort keys, canonically closed, - * and modified for improved collation performance. + * Returns the mappings for FractionalUCA.txt. The mappings are sorted by UCA sort keys, + * canonically closed, and modified for improved collation performance. 
*/ /* package */ SortedSet getMappings() { final SortedSet ordered = getSortedUCAMappings(); @@ -157,8 +147,7 @@ private static int comparePrefixes(String p1, String p2) { /** * Returns a set of UCA mappings, sorted by their nearly-UCA-type sort key strings. * - *

This method also adds canonical equivalents (canonical closure), - * if any are missing. + *

This method also adds canonical equivalents (canonical closure), if any are missing. */ private SortedSet getSortedUCAMappings() { final String highCompat = UTF16.valueOf(0x2F805); @@ -175,9 +164,13 @@ private SortedSet getSortedUCAMappings() { break; } if (s.equals("\uFFFF") || s.equals("\uFFFE")) { - continue; // Suppress the FFFF and FFFE, since we are adding them artificially later. + continue; // Suppress the FFFF and FFFE, since we are adding them artificially + // later. } - if (s.equals("\uFA36") || s.equals("\uF900") || s.equals("\u2ADC") || s.equals(highCompat)) { + if (s.equals("\uFA36") + || s.equals("\uF900") + || s.equals("\u2ADC") + || s.equals(highCompat)) { System.out.println(" * " + Default.ucd().getCodeAndName(s)); } contentsForCanonicalIteration.add(s); @@ -251,7 +244,6 @@ private SortedSet getSortedUCAMappings() { continue; } - // Skip anything that is not FCD. if (!Default.nfd().isFCD(s)) { continue; @@ -260,8 +252,10 @@ private SortedSet getSortedUCAMappings() { // We ONLY add if the sort key would be different // Than what we would get if we didn't decompose!! 
final CEList ces = uca.getCEList(s, true); - final String sortKey = uca.getSortKey(ces, UCA_Types.NON_IGNORABLE, AppendToCe.none); - final String nonDecompSortKey = uca.getSortKey(s, UCA_Types.NON_IGNORABLE, false, AppendToCe.none); + final String sortKey = + uca.getSortKey(ces, UCA_Types.NON_IGNORABLE, AppendToCe.none); + final String nonDecompSortKey = + uca.getSortKey(s, UCA_Types.NON_IGNORABLE, false, AppendToCe.none); if (sortKey.equals(nonDecompSortKey)) { continue; } @@ -281,31 +275,32 @@ private SortedSet getSortedUCAMappings() { System.out.println("Done Adding canonical Equivalents -- added " + canCount); /* - for (int ch = 0; ch < 0x10FFFF; ++ch) { - Utility.dot(ch); - byte type = collator.getCEType(ch); - if (type >= UCA.FIXED_CE && !nfd.hasDecomposition(ch)) - continue; - } - String s = org.unicode.text.UTF16.valueOf(ch); - ordered.put(collator.getSortKey(s, UCA.NON_IGNORABLE) + '\u0000' + s, s); - } - - Hashtable multiTable = collator.getContracting(); - Enumeration enum = multiTable.keys(); - int ecount = 0; - while (enum.hasMoreElements()) { - Utility.dot(ecount++); - String s = (String)enum.nextElement(); - ordered.put(collator.getSortKey(s, UCA.NON_IGNORABLE) + '\u0000' + s, s); - } - */ + for (int ch = 0; ch < 0x10FFFF; ++ch) { + Utility.dot(ch); + byte type = collator.getCEType(ch); + if (type >= UCA.FIXED_CE && !nfd.hasDecomposition(ch)) + continue; + } + String s = org.unicode.text.UTF16.valueOf(ch); + ordered.put(collator.getSortKey(s, UCA.NON_IGNORABLE) + '\u0000' + s, s); + } + + Hashtable multiTable = collator.getContracting(); + Enumeration enum = multiTable.keys(); + int ecount = 0; + while (enum.hasMoreElements()) { + Utility.dot(ecount++); + String s = (String)enum.nextElement(); + ordered.put(collator.getSortKey(s, UCA.NON_IGNORABLE) + '\u0000' + s, s); + } + */ // JUST FOR TESTING final boolean TESTING = false; if (TESTING) { - final String sample = "\u3400\u3401\u4DB4\u4DB5\u4E00\u4E01\u9FA4\u9FA5\uAC00\uAC01\uD7A2\uD7A3"; + final 
String sample = + "\u3400\u3401\u4DB4\u4DB5\u4E00\u4E01\u9FA4\u9FA5\uAC00\uAC01\uD7A2\uD7A3"; for (int i = 0; i < sample.length(); ++i) { - final String s = sample.substring(i, i+1); + final String s = sample.substring(i, i + 1); ordered.add(new MappingWithSortKey(uca, s)); } } @@ -315,12 +310,12 @@ private SortedSet getSortedUCAMappings() { /** * Modifies some of the UCA mappings before they are converted to fractional CEs. + * *

    - *
  • Turns L+middle dot contractions into prefix rules. - *
  • Merges artificial secondary CEs into the preceding primary ones. - * DUCET primary CEs only use the "common" secondary weight. - * All secondary distinctions are made via additional secondary CEs. - * In FractionalUCA we change that, to reduce the number of expansions. + *
  • Turns L+middle dot contractions into prefix rules. + *
  • Merges artificial secondary CEs into the preceding primary ones. DUCET primary CEs only + * use the "common" secondary weight. All secondary distinctions are made via additional + * secondary CEs. In FractionalUCA we change that, to reduce the number of expansions. *
*/ private void modifyMappings(SortedSet ordered) { @@ -358,9 +353,13 @@ private void modifyMappings(SortedSet ordered) { // does not get merged into the L's primary CE. // (That would prevent it from turning into a prefix mapping.) String s = mapping.s; - if (s.length() == 2 && ces.length() == 2 && mapping.prefix == null - && (s.equals("l\u00B7") || s.equals("L\u00B7") - || s.equals("l\u0387") || s.equals("L\u0387"))) { + if (s.length() == 2 + && ces.length() == 2 + && mapping.prefix == null + && (s.equals("l\u00B7") + || s.equals("L\u00B7") + || s.equals("l\u0387") + || s.equals("L\u0387"))) { it.remove(); // Move the l/L to the prefix. final String prefix = s.substring(0, 1); @@ -386,8 +385,9 @@ private void modifyMappings(SortedSet ordered) { for (int i = 1; i < ces.length(); ++i) { if (CEList.getPrimary(ces.at(i)) != 0) { throw new IllegalArgumentException( - "UCA Mapping " + mapping + - "contains a primary CE after the initial ignorable CE"); + "UCA Mapping " + + mapping + + "contains a primary CE after the initial ignorable CE"); } } } else { @@ -397,12 +397,14 @@ private void modifyMappings(SortedSet ordered) { if (CEList.getPrimary(ce) != 0) { if (sec != UCA_Types.NEUTRAL_SECONDARY && sec != 0) { throw new IllegalArgumentException( - "UCA Mapping " + mapping + - "contains a primary CE with a non-common secondary weight"); + "UCA Mapping " + + mapping + + "contains a primary CE with a non-common secondary weight"); } } else if (sec > maxNormalSecondary) { - if (ces.length() == 2 && sec == lMiddleDotSec && - CEList.getPrimary(firstCE) == lMiddleDotPri) { + if (ces.length() == 2 + && sec == lMiddleDotSec + && CEList.getPrimary(firstCE) == lMiddleDotPri) { break; } if ((i + 1) < ces.length()) { @@ -411,8 +413,9 @@ private void modifyMappings(SortedSet ordered) { final int nextSec = CEList.getSecondary(nextCE); if (nextPri == 0 && nextSec > maxNormalSecondary) { throw new IllegalArgumentException( - "UCA Mapping " + mapping + - "contains two artificial 
secondary CEs in a row"); + "UCA Mapping " + + mapping + + "contains two artificial secondary CEs in a row"); } } // Check that the previous CE is a primary CE. @@ -441,7 +444,8 @@ private void modifyMappings(SortedSet ordered) { // Reduce the secondary weight to just after the common weight. sec = UCA_Types.NEUTRAL_SECONDARY + sec - maxNormalSecondary; // TODO: This is broken! - // Map secondaries of primary CEs vs. ignorable CEs to separate ranges of fractional secondaries. + // Map secondaries of primary CEs vs. ignorable CEs to separate ranges of + // fractional secondaries. final int previousTer = CEList.getTertiary(previousCE); newCEs[previous] = UCA.makeKey(previousPri, sec, previousTer); while (++previous < i) { @@ -454,7 +458,8 @@ private void modifyMappings(SortedSet ordered) { } // Store the modified CEs and continue with looking at the next CE. // We do not replace the whole mapping because the modified CEs - // are not well-formed (secondary weights of primary vs. ignorable CEs overlap now) + // are not well-formed (secondary weights of primary vs. ignorable CEs + // overlap now) // and therefore we should not use them to create a sort key. 
mapping.modifiedCEs = ces = new CEList(newCEs); --i; @@ -465,7 +470,7 @@ private void modifyMappings(SortedSet ordered) { } ordered.addAll(newMappings); System.out.println( - "Number of artificial secondary CEs merged into the preceding primary CEs: " + - numSecondariesMerged); + "Number of artificial secondary CEs merged into the preceding primary CEs: " + + numSecondariesMerged); } } diff --git a/unicodetools/src/main/java/org/unicode/text/UCA/PrimariesToFractional.java b/unicodetools/src/main/java/org/unicode/text/UCA/PrimariesToFractional.java index a447952c2..21fa9e4e1 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCA/PrimariesToFractional.java +++ b/unicodetools/src/main/java/org/unicode/text/UCA/PrimariesToFractional.java @@ -1,17 +1,15 @@ package org.unicode.text.UCA; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.util.BitSet; import java.util.Iterator; import java.util.NoSuchElementException; - import org.unicode.text.UCD.Default; import org.unicode.text.UCD.UCD_Types; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - /** * Maps UCA-style primary weights to byte-fractional primaries. * @@ -24,18 +22,17 @@ public final class PrimariesToFractional { private final UCA uca; /** - * Fractional primary weight for numeric sorting (CODAN). - * Single-byte weight, lead byte for all computed whole-number CEs. + * Fractional primary weight for numeric sorting (CODAN). Single-byte weight, lead byte for all + * computed whole-number CEs. * - *

This must be a "homeless" weight. - * If any character or string mapped to a weight with this same lead byte, - * then we would get an illegal prefix overlap. + *

This must be a "homeless" weight. If any character or string mapped to a weight with this + * same lead byte, then we would get an illegal prefix overlap. */ private int numericFractionalPrimary; /** - * One flag per fractional primary lead byte for whether - * the fractional weights that start with that byte are sort-key-compressible. + * One flag per fractional primary lead byte for whether the fractional weights that start with + * that byte are sort-key-compressible. */ private final BitSet compressibleBytes = new BitSet(256); @@ -59,12 +56,12 @@ private static final class ScriptOptions { boolean beginsByte; boolean endsByte; /** - * If true, then primary weights of this group/script all have the same lead byte - * and are therefore compressible when writing sort keys. - * We need to know this before assigning + * If true, then primary weights of this group/script all have the same lead byte and are + * therefore compressible when writing sort keys. We need to know this before assigning * fractional primary weights so that we can assign them optimally. 
*/ boolean compressible = true; + boolean defaultTwoBytePrimaries; boolean defaultTwoBytePunctuation; boolean twoBytesIfVariants = true; @@ -98,45 +95,49 @@ ScriptOptions finishByte() { endsByte = true; return this; } + ScriptOptions notCompressible() { if (!beginsByte && !endsByte) { throw new IllegalArgumentException( - "non-compressible script must begin or end with a lead byte boundary, " + - "or both; see LDML collation spec"); + "non-compressible script must begin or end with a lead byte boundary, " + + "or both; see LDML collation spec"); } compressible = false; return this; } + ScriptOptions twoBytePrimaries() { defaultTwoBytePrimaries = defaultTwoBytePunctuation = true; return this; } + ScriptOptions noTwoBytePrimariesIfVariants() { twoBytesIfVariants = false; return this; } + ScriptOptions twoBytePunctuation() { defaultTwoBytePunctuation = true; return this; } + ScriptOptions threeBytePunctuation() { defaultTwoBytePunctuation = false; return this; } + ScriptOptions minimalGap3() { useMinimalGap3 = true; return this; } } - /** - * FractionalUCA properties for a UCA primary weight. - */ + /** FractionalUCA properties for a UCA primary weight. */ public static class PrimaryToFractional { private ScriptOptions options; /** - * true if this primary is at the start of a group or script - * that begins with a new primary lead byte. + * true if this primary is at the start of a group or script that begins with a new primary + * lead byte. */ private boolean newByte; @@ -146,41 +147,39 @@ public static class PrimaryToFractional { private int fractionalPrimary; /** - * Stores fractional primaries for a siniform ideographic range, otherwise null. - * Offset by options.implicitRange.startCP. 0 for unassigned code points. + * Stores fractional primaries for a siniform ideographic range, otherwise null. Offset by + * options.implicitRange.startCP. 0 for unassigned code points. 
*/ private int[] rangePrimaries; /** - * FractionalUCA sets neutralSec and neutralTer to the sec/ter values - * when secTerToFractional==null and both values are either 0 or neutral. - * These values are then added to secTerToFractional when it is allocated. + * FractionalUCA sets neutralSec and neutralTer to the sec/ter values when + * secTerToFractional==null and both values are either 0 or neutral. These values are then + * added to secTerToFractional when it is allocated. */ int neutralSec = -1; + int neutralTer = -1; /** - * {@link PrimaryToFractional} serves as a container for {@link SecTerToFractional}. - * {@link PrimaryToFractional} does not set or use this reference at all. - * We just avoid yet another map from primary weights to values, - * and another map lookup for the same primary. + * {@link PrimaryToFractional} serves as a container for {@link SecTerToFractional}. {@link + * PrimaryToFractional} does not set or use this reference at all. We just avoid yet another + * map from primary weights to values, and another map lookup for the same primary. * - *

This is null until there is a non-neutral secondary or tertiary weight - * for this primary. + *

This is null until there is a non-neutral secondary or tertiary weight for this + * primary. */ public SecTerToFractional secTerToFractional; private PrimaryToFractional() {} - /** - * Returns the planned number of fractional primary weight bytes. - */ + /** Returns the planned number of fractional primary weight bytes. */ private int getFractionalLength() { if (useSingleBytePrimary) { return 1; } - if (useTwoBytePrimary || - (options.defaultTwoBytePrimaries && !useThreeBytePrimary) || - (options.twoBytesIfVariants && secTerToFractional != null)) { + if (useTwoBytePrimary + || (options.defaultTwoBytePrimaries && !useThreeBytePrimary) + || (options.twoBytesIfVariants && secTerToFractional != null)) { return 2; } return 3; @@ -195,10 +194,10 @@ int getReorderCode() { } /** - * Returns the script-first fractional primary that precedes this UCA primary's - * own fractional primary, if this is the first primary of a group or script, - * otherwise returns 0. - * The script-first primary is reset, so that the next call with the same UCA primary returns 0. + * Returns the script-first fractional primary that precedes this UCA primary's own + * fractional primary, if this is the first primary of a group or script, otherwise returns + * 0. The script-first primary is reset, so that the next call with the same UCA primary + * returns 0. */ public int getAndResetScriptFirstFractionalPrimary() { if (options == null || !options.needToWriteScriptFirstFractional) { @@ -213,9 +212,7 @@ public boolean beginsByte() { return newByte; } - /** - * Returns the fractional primary weight for the UCA primary. - */ + /** Returns the fractional primary weight for the UCA primary. 
*/ public int getFractionalPrimary() { return fractionalPrimary; } @@ -225,34 +222,35 @@ public int getSiniformRangeFractionalPrimary(int c) { } /** - * @return the script-first fractional primary of the reserved range before - * this primary's script, or 0 if there is none + * @return the script-first fractional primary of the reserved range before this primary's + * script, or 0 if there is none */ public int getReservedBeforeFractionalPrimary() { - return options != null && options.reservedBefore != null ? - options.reservedBefore.scriptFirstFractional : 0; + return options != null && options.reservedBefore != null + ? options.reservedBefore.scriptFirstFractional + : 0; } /** - * @return the special reorder code of the reserved range before - * this primary's script, or -1 if there is none + * @return the special reorder code of the reserved range before this primary's script, or + * -1 if there is none */ public int getReservedBeforeReorderCode() { - return options != null && options.reservedBefore != null ? - options.reservedBefore.reorderCode : -1; + return options != null && options.reservedBefore != null + ? options.reservedBefore.reorderCode + : -1; } } /** - * Computes valid FractionalUCA primary weights of desired byte lengths. - * Always starts with the first primary weight after 02. - * {@link PrimaryWeight#next(int)} increments - * one 1/2/3-byte weight to another 1/2/3-byte weight. + * Computes valid FractionalUCA primary weights of desired byte lengths. Always starts with the + * first primary weight after 02. {@link PrimaryWeight#next(int)} increments one 1/2/3-byte + * weight to another 1/2/3-byte weight. */ private static class PrimaryWeight { /** - * For most bytes except a primary weight's lead byte, 02 is ok. - * It just needs to be greater than the level separator 01. + * For most bytes except a primary weight's lead byte, 02 is ok. It just needs to be greater + * than the level separator 01. 
*/ private static final int MIN_BYTE = 2; @@ -260,30 +258,30 @@ private static class PrimaryWeight { private static final int MAX2_UNCOMPRESSED = 0xff; /** - * Primary compression for sort keys uses bytes 03 and FF as compression terminators. - * The low terminator must be greater than the end-of-merged-string separator 02. + * Primary compression for sort keys uses bytes 03 and FF as compression terminators. The + * low terminator must be greater than the end-of-merged-string separator 02. */ private static final int MIN2_COMPRESSED = MIN_BYTE + 2; + private static final int MAX2_COMPRESSED = 0xfe; /** - * Increment byte2 a little more around single-byte primaries, - * for tailoring of at least 4 two-byte primaries or more than 1000 three-byte primaries. + * Increment byte2 a little more around single-byte primaries, for tailoring of at least 4 + * two-byte primaries or more than 1000 three-byte primaries. */ private static final int GAP2_FOR_SINGLE = 4; /** * Increment byte3 with a tailoring gap. * - *

When the gap is too large, then we allocate too many primary weights for - * a minor script and might overflow the single lead byte of a compressible reordering group. + *

When the gap is too large, then we allocate too many primary weights for a minor + * script and might overflow the single lead byte of a compressible reordering group. * *

When the gap is too small, then only a small number of characters can be tailored * (efficiently or at all) between root collation weights. * *

We can adjust in the {@link PrimariesToFractional#PrimariesToFractional(UCA)} - * constructor which script starts a new lead byte. - * See the comments there for criteria. + * constructor which script starts a new lead byte. See the comments there for criteria. */ private static final int GAP3 = 6; @@ -324,7 +322,7 @@ public int startNewByte(ScriptOptions options) { // At least a normal three-byte gap. inc1 = byte2 < maxByte2 || (byte3 + gap3) <= 0xff ? 1 : 2; } - if(inc1 != 1 && compressibleLeadByte) { + if (inc1 != 1 && compressibleLeadByte) { ++numErrors; System.out.flush(); System.err.printf( @@ -383,61 +381,61 @@ public int next(int newByteLength) { final int oByte2 = byte2; switch (lastByteLength) { - case 1: - switch (newByteLength) { - case 1: - // Gap of 1 lead byte between singles. - addTo1(2); - break; - case 2: - // Larger two-byte gap after a single. - addTo1(1); - byte2 = minByte2 + GAP2_FOR_SINGLE; - break; - case 3: - // Larger two-byte gap after a single. - addTo1(1); - byte2 = minByte2 + GAP2_FOR_SINGLE; - byte3 = MIN_BYTE; - break; - } - break; - case 2: - switch (newByteLength) { case 1: - // At least a larger two-byte gap before a single. - addTo1((byte2 + GAP2_FOR_SINGLE) <= maxByte2 ? 1 : 2); - byte2 = 0; - break; - case 2: - // Normal two-byte gap. - addTo2(2); - break; - case 3: - // At least a two-byte gap after a double. - addTo2(2); - byte3 = MIN_BYTE; - break; - } - break; - case 3: - switch (newByteLength) { - case 1: - // At least a larger two-byte gap before a single. - addTo1((byte2 + GAP2_FOR_SINGLE) <= maxByte2 ? 1 : 2); - byte2 = byte3 = 0; + switch (newByteLength) { + case 1: + // Gap of 1 lead byte between singles. + addTo1(2); + break; + case 2: + // Larger two-byte gap after a single. + addTo1(1); + byte2 = minByte2 + GAP2_FOR_SINGLE; + break; + case 3: + // Larger two-byte gap after a single. 
+ addTo1(1); + byte2 = minByte2 + GAP2_FOR_SINGLE; + byte3 = MIN_BYTE; + break; + } break; case 2: - // At least a two-byte gap before a double. - addTo2(2); - byte3 = 0; + switch (newByteLength) { + case 1: + // At least a larger two-byte gap before a single. + addTo1((byte2 + GAP2_FOR_SINGLE) <= maxByte2 ? 1 : 2); + byte2 = 0; + break; + case 2: + // Normal two-byte gap. + addTo2(2); + break; + case 3: + // At least a two-byte gap after a double. + addTo2(2); + byte3 = MIN_BYTE; + break; + } break; case 3: - // Normal three-byte gap. - addTo3(gap3 + 1); + switch (newByteLength) { + case 1: + // At least a larger two-byte gap before a single. + addTo1((byte2 + GAP2_FOR_SINGLE) <= maxByte2 ? 1 : 2); + byte2 = byte3 = 0; + break; + case 2: + // At least a two-byte gap before a double. + addTo2(2); + byte3 = 0; + break; + case 3: + // Normal three-byte gap. + addTo3(gap3 + 1); + break; + } break; - } - break; } check(oByte1, oByte2, newByteLength, false); @@ -446,34 +444,33 @@ public int next(int newByteLength) { return getIntValue(); } - /** - * Checks that we made a good transition. - */ + /** Checks that we made a good transition. */ private void check(int oByte1, int oByte2, int newByteLength, boolean newFirstByte) { // verify results // right bytes are filled in, as requested switch (newByteLength) { - case 1: - assertTrue(byte1 != 0 && byte2 == 0 && byte3 == 0); - break; - case 2: - assertTrue(byte1 != 0 && byte2 != 0 && byte3 == 0); - break; - case 3: - assertTrue(byte1 != 0 && byte2 != 0 && byte3 != 0); - break; + case 1: + assertTrue(byte1 != 0 && byte2 == 0 && byte3 == 0); + break; + case 2: + assertTrue(byte1 != 0 && byte2 != 0 && byte3 == 0); + break; + case 3: + assertTrue(byte1 != 0 && byte2 != 0 && byte3 != 0); + break; } // neither is prefix of the other if (lastByteLength != newByteLength) { - final int minLength = lastByteLength < newByteLength ? lastByteLength : newByteLength; + final int minLength = + lastByteLength < newByteLength ? 
lastByteLength : newByteLength; switch (minLength) { - case 1: - assertTrue(byte1 != oByte1); - break; - case 2: - assertTrue(byte1 != oByte1 || byte2 != oByte2); - break; + case 1: + assertTrue(byte1 != oByte1); + break; + case 2: + assertTrue(byte1 != oByte1 || byte2 != oByte2); + break; } } @@ -558,9 +555,9 @@ public PrimaryToFractional next() { } /** - * This constructor just performs basic initialization. - * You must call {@link PrimariesToFractional#assignFractionalPrimaries(StringBuilder)} - * to build the data for the rest of the API. + * This constructor just performs basic initialization. You must call {@link + * PrimariesToFractional#assignFractionalPrimaries(StringBuilder)} to build the data for the + * rest of the API. */ public PrimariesToFractional(UCA uca) { this.uca = uca; @@ -572,11 +569,17 @@ public PrimariesToFractional(UCA uca) { // Some spaces and punctuation share a lead byte. setOptionsForScript(ReorderCodes.SPACE).newByte().notCompressible().twoBytePrimaries(); - setOptionsForScript(ReorderCodes.PUNCTUATION).finishByte().notCompressible().twoBytePrimaries(); + setOptionsForScript(ReorderCodes.PUNCTUATION) + .finishByte() + .notCompressible() + .twoBytePrimaries(); // Some general and currency symbols share a lead byte. setOptionsForScript(ReorderCodes.SYMBOL).newByte().notCompressible().twoBytePrimaries(); - setOptionsForScript(ReorderCodes.CURRENCY).finishByte().notCompressible().twoBytePrimaries(); + setOptionsForScript(ReorderCodes.CURRENCY) + .finishByte() + .notCompressible() + .twoBytePrimaries(); setOptionsForScript(ReorderCodes.DIGIT).wholeByte().notCompressible().twoBytePrimaries(); @@ -614,20 +617,27 @@ public PrimariesToFractional(UCA uca) { // Mark reserved ranges as not compressible, to avoid confusion, // and to avoid tools code issues with using multiple lead bytes. 
setOptionsForReservedRangeBeforeScript( - ReorderCodes.REORDER_RESERVED_BEFORE_LATIN, UCD_Types.LATIN_SCRIPT) - .wholeByte().notCompressible(); + ReorderCodes.REORDER_RESERVED_BEFORE_LATIN, UCD_Types.LATIN_SCRIPT) + .wholeByte() + .notCompressible(); // Latin uses multiple lead bytes, with single-byte primaries for A-Z. - setOptionsForScript(UCD_Types.LATIN_SCRIPT).wholeByte().notCompressible().twoBytePrimaries(); + setOptionsForScript(UCD_Types.LATIN_SCRIPT) + .wholeByte() + .notCompressible() + .twoBytePrimaries(); setOptionsForReservedRangeBeforeScript( - ReorderCodes.REORDER_RESERVED_AFTER_LATIN, UCD_Types.GREEK_SCRIPT) - .wholeByte().notCompressible(); + ReorderCodes.REORDER_RESERVED_AFTER_LATIN, UCD_Types.GREEK_SCRIPT) + .wholeByte() + .notCompressible(); // Recommended Script, and cased. setOptionsForScript(UCD_Types.GREEK_SCRIPT).newByte().twoBytePrimaries(); // Not a Recommended Script but cased, and easily fits into the same lead byte as Greek. setOptionsForScript(UCD_Types.COPTIC).twoBytePrimaries().threeBytePunctuation(); // Cyrillic uses one lead byte, with two-byte primaries for common characters. - setOptionsForScript(UCD_Types.CYRILLIC_SCRIPT).wholeByte() - .noTwoBytePrimariesIfVariants().twoBytePunctuation(); + setOptionsForScript(UCD_Types.CYRILLIC_SCRIPT) + .wholeByte() + .noTwoBytePrimariesIfVariants() + .twoBytePunctuation(); // Recommended Script, and cased; avoid lead byte overflow. setOptionsForScript(UCD_Types.GEORGIAN_SCRIPT).newByte().twoBytePrimaries(); // Recommended Script, and cased. Does not fit in a lead byte with Georgian. @@ -635,8 +645,10 @@ public PrimariesToFractional(UCA uca) { // Recommended Script, few primaries, with active computer/internet usage. setOptionsForScript(UCD_Types.HEBREW_SCRIPT).newByte().twoBytePrimaries(); // Arabic uses one lead byte, with two-byte primaries for common characters. 
- setOptionsForScript(UCD_Types.ARABIC_SCRIPT).wholeByte() - .noTwoBytePrimariesIfVariants().twoBytePunctuation(); + setOptionsForScript(UCD_Types.ARABIC_SCRIPT) + .wholeByte() + .noTwoBytePrimariesIfVariants() + .twoBytePunctuation(); // Recommended Script, few primaries. setOptionsForScript(UCD_Types.THAANA_SCRIPT).twoBytePrimaries(); // Ethiopic is a Recommended Script but needs three-byte primaries so that @@ -681,11 +693,15 @@ public PrimariesToFractional(UCA uca) { // Limited Use Script, avoid lead byte overflow. setOptionsForScript(UCD_Types.Vai).newByte(); // Hangul uses one lead byte, with two-byte primaries for conjoining Jamo L/V/T. - setOptionsForScript(UCD_Types.HANGUL_SCRIPT).wholeByte() - .noTwoBytePrimariesIfVariants().twoBytePunctuation(); + setOptionsForScript(UCD_Types.HANGUL_SCRIPT) + .wholeByte() + .noTwoBytePrimariesIfVariants() + .twoBytePunctuation(); // Kana uses one lead byte, with two-byte primaries for common characters. setOptionsForScripts( - UCD_Types.HIRAGANA_SCRIPT, UCD_Types.KATAKANA_SCRIPT, UCD_Types.KATAKANA_OR_HIRAGANA) + UCD_Types.HIRAGANA_SCRIPT, + UCD_Types.KATAKANA_SCRIPT, + UCD_Types.KATAKANA_OR_HIRAGANA) .wholeByte(); // Recommended Script, some characters have variants. setOptionsForScript(UCD_Types.BOPOMOFO_SCRIPT).newByte().twoBytePrimaries(); @@ -703,17 +719,20 @@ public PrimariesToFractional(UCA uca) { setOptionsForScripts(UCD_Types.Tangut).minimalGap3(); // Han uses many lead bytes, so that tailoring tens of thousands of characters // can use many two-byte primaries. - setOptionsForScript(UCD_Types.HAN_SCRIPT).wholeByte().notCompressible().twoBytePunctuation(); + setOptionsForScript(UCD_Types.HAN_SCRIPT) + .wholeByte() + .notCompressible() + .twoBytePunctuation(); // All other scripts get default options. 
} - + private ScriptOptions setOptionsForScript(int script) { return scriptOptions[script] = getOrCreateOptionsForScript(script); } - private ScriptOptions setOptionsForScripts(int ...scripts) { - ScriptOptions o = getOrCreateOptionsForScript(scripts[0]); // The other scripts are aliases. + private ScriptOptions setOptionsForScripts(int... scripts) { + ScriptOptions o = getOrCreateOptionsForScript(scripts[0]); // The other scripts are aliases. for (int script : scripts) { scriptOptions[script] = o; } @@ -740,9 +759,8 @@ private ScriptOptions getOrCreateOptionsForScript(int script) { } /** - * Loads the set of UCA primary weights, finds reordering group boundaries, - * assigns a fractional primary weight for every UCA primary, - * and writes the [top_byte] information. + * Loads the set of UCA primary weights, finds reordering group boundaries, assigns a fractional + * primary weight for every UCA primary, and writes the [top_byte] information. */ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo) { System.out.println("Finding Bumps"); @@ -768,7 +786,7 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo final Iterator regularPrimaries = uca.getRegularPrimaries().iterator(); final Iterator siniformRanges = getSiniformRanges().iterator(); - for (;;) { + for (; ; ) { int primary; int nextPrimary; int representativeCP; @@ -818,8 +836,13 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo assert options.beginsByte; final int firstFractional = fractionalPrimary.startNewByte(options); final int leadByte = Fractional.getLeadByte(firstFractional); - appendTopByteInfo(topByteInfo, previousGroupIsCompressible, - previousGroupLeadByte, leadByte, groupInfo, numPrimaries); + appendTopByteInfo( + topByteInfo, + previousGroupIsCompressible, + previousGroupLeadByte, + leadByte, + groupInfo, + numPrimaries); previousGroupLeadByte = leadByte; previousGroupIsCompressible = 
options.compressible; @@ -841,8 +864,13 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo final int leadByte = Fractional.getLeadByte(firstFractional); // Finish the previous reordering group. - appendTopByteInfo(topByteInfo, previousGroupIsCompressible, - previousGroupLeadByte, leadByte, groupInfo, numPrimaries); + appendTopByteInfo( + topByteInfo, + previousGroupIsCompressible, + previousGroupLeadByte, + leadByte, + groupInfo, + numPrimaries); previousGroupLeadByte = leadByte; previousGroupIsCompressible = options.compressible; @@ -851,7 +879,7 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo groupInfo.setLength(0); groupInfo.append(ReorderCodes.getShortName(reorderCode)); if (reorderCode == UCD_Types.HIRAGANA_SCRIPT) { - groupInfo.append(" Hrkt Kana"); // script aliases + groupInfo.append(" Hrkt Kana"); // script aliases } String groupComment = " starts new lead byte"; if (options.compressible) { @@ -885,7 +913,7 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo } groupInfo.append(ReorderCodes.getShortName(reorderCode)); if (reorderCode == UCD_Types.Meroitic_Cursive) { - groupInfo.append(" Mero"); // script aliases + groupInfo.append(" Mero"); // script aliases } System.out.printf( "[%s] # %s first primary\n", @@ -896,7 +924,9 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo } int currentByteLength = props.getFractionalLength(); - if (currentByteLength == 3 && fractionalPrimary.lastByteLength <= 2 && nextPrimary >= 0) { + if (currentByteLength == 3 + && fractionalPrimary.lastByteLength <= 2 + && nextPrimary >= 0) { // We slightly optimize the assignment of primary weights: // If a 3-byte primary is surrounded by one-or-two-byte primaries, // then we can shorten the middle one to two bytes as well, @@ -919,9 +949,10 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo // We do not want to 
auto-shorten just before the first-script primary weights // because that could increase the gap in between. final PrimaryToFractional nextProps = getProps(nextPrimary); - if (nextPrimary < Implicit.START && - nextProps != null && nextProps.getFractionalLength() <= 2 && - !nextProps.isFirstForScript(nextPrimary)) { + if (nextPrimary < Implicit.START + && nextProps != null + && nextProps.getFractionalLength() <= 2 + && !nextProps.isFirstForScript(nextPrimary)) { currentByteLength = 2; } } @@ -936,20 +967,25 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo if (DEBUG_FW) { System.out.println( currentByteLength - + ", " + old - + " => " + fractionalPrimary.toString() - + "\t" + Utility.hex(representativeCP)); + + ", " + + old + + " => " + + fractionalPrimary.toString() + + "\t" + + Utility.hex(representativeCP)); } if (implicitRange != null) { // We only computed the first fractional primary. // Now compute and store the remaining ones. props.rangePrimaries = new int[implicitRange.lastCP - implicitRange.startCP + 1]; - props.rangePrimaries[implicitRange.firstCP - implicitRange.startCP] = props.fractionalPrimary; + props.rangePrimaries[implicitRange.firstCP - implicitRange.startCP] = + props.fractionalPrimary; UnicodeSetIterator iter = new UnicodeSetIterator(implicitRange.set); - iter.next(); // Skip the first code point. + iter.next(); // Skip the first code point. while (iter.next()) { - props.rangePrimaries[iter.codepoint - implicitRange.startCP] = fractionalPrimary.next(currentByteLength); + props.rangePrimaries[iter.codepoint - implicitRange.startCP] = + fractionalPrimary.next(currentByteLength); ++numPrimaries; } } @@ -962,8 +998,13 @@ public PrimariesToFractional assignFractionalPrimaries(StringBuilder topByteInfo int leadByte = Fractional.getLeadByte(firstFractional); // Finish the previous reordering group. 
- appendTopByteInfo(topByteInfo, previousGroupIsCompressible, - previousGroupLeadByte, leadByte, groupInfo, numPrimaries); + appendTopByteInfo( + topByteInfo, + previousGroupIsCompressible, + previousGroupLeadByte, + leadByte, + groupInfo, + numPrimaries); previousGroupLeadByte = leadByte; // Record Hani. @@ -1006,8 +1047,14 @@ public Iterator iterator() { } }; } - private static void appendTopByteInfo(StringBuilder topByteInfo, boolean compress, - int b, int limit, CharSequence groupInfo, int count) { + + private static void appendTopByteInfo( + StringBuilder topByteInfo, + boolean compress, + int b, + int limit, + CharSequence groupInfo, + int count) { final boolean canCompress = (limit - b) == 1; if (compress) { if (!canCompress) { @@ -1016,24 +1063,30 @@ private static void appendTopByteInfo(StringBuilder topByteInfo, boolean compres // after printing error messages. System.out.flush(); System.err.println( - "reordering group {" + groupInfo + - "} marked for compression but uses more than one lead byte " + - Utility.hex(b, 2) + ".." + - Utility.hex(limit - 1, 2)); + "reordering group {" + + groupInfo + + "} marked for compression but uses more than one lead byte " + + Utility.hex(b, 2) + + ".." + + Utility.hex(limit - 1, 2)); System.err.flush(); compress = false; } } else if (canCompress) { System.out.println( - "# Note: reordering group {" + groupInfo + "} " + - "not marked for compression but uses only one lead byte " + - Utility.hex(b, 2)); + "# Note: reordering group {" + + groupInfo + + "} " + + "not marked for compression but uses only one lead byte " + + Utility.hex(b, 2)); } while (b < limit) { - topByteInfo.append("[top_byte\t"). - append(Utility.hex(b, 2)). 
- append("\t").append(groupInfo); + topByteInfo + .append("[top_byte\t") + .append(Utility.hex(b, 2)) + .append("\t") + .append(groupInfo); if (compress) { topByteInfo.append("\tCOMPRESS"); } @@ -1072,10 +1125,9 @@ private static int pinPrimary(int ucaPrimary) { } /** - * Returns the properties for the UCA primary weight. - * Returns the same object for all primary lead weights for each - * UCA implicit-weights range (only one for Han). - * Returns null if there are no data for this primary. + * Returns the properties for the UCA primary weight. Returns the same object for all primary + * lead weights for each UCA implicit-weights range (only one for Han). Returns null if there + * are no data for this primary. */ public PrimaryToFractional getProps(int ucaPrimary) { int index = pinPrimary(ucaPrimary); @@ -1083,8 +1135,8 @@ public PrimaryToFractional getProps(int ucaPrimary) { } /** - * Returns the properties for the UCA primary weight. - * Creates and caches a new one if there are no data for this primary yet. + * Returns the properties for the UCA primary weight. Creates and caches a new one if there are + * no data for this primary yet. */ public PrimaryToFractional getOrCreateProps(int ucaPrimary) { int index = pinPrimary(ucaPrimary); @@ -1114,9 +1166,10 @@ public int getNumericFractionalPrimary() { /** * Analyzes the UCA primary weights. + * *

    - *
  • Determines the lengths of the corresponding fractional weights. - *
  • Sets a flag for the first UCA primary in each reordering group. + *
  • Determines the lengths of the corresponding fractional weights. + *
  • Sets a flag for the first UCA primary in each reordering group. *
*/ private void findBumps() { @@ -1139,12 +1192,13 @@ private void findBumps() { // do nothing, assume Latin 1 is "frequent" } else { final byte cat = Fractional.getFixedCategory(ch); - if (cat == UCD_Types.OTHER_SYMBOL || - cat == UCD_Types.MATH_SYMBOL || - cat == UCD_Types.MODIFIER_SYMBOL || - (script != UCD_Types.COMMON_SCRIPT && - script != UCD_Types.INHERITED_SCRIPT && - !getOrCreateOptionsForScript(script).defaultTwoBytePunctuation)) { + if (cat == UCD_Types.OTHER_SYMBOL + || cat == UCD_Types.MATH_SYMBOL + || cat == UCD_Types.MODIFIER_SYMBOL + || (script != UCD_Types.COMMON_SCRIPT + && script != UCD_Types.INHERITED_SCRIPT + && !getOrCreateOptionsForScript(script) + .defaultTwoBytePunctuation)) { // Note: We do not test reorderCode == ReorderCodes.SYMBOL // because that includes Lm etc. props.useThreeBytePrimary = true; @@ -1153,8 +1207,9 @@ private void findBumps() { script = (short) reorderCode; } - if (script == UCD_Types.COMMON_SCRIPT || script == UCD_Types.INHERITED_SCRIPT || - script == UCD_Types.Unknown_Script) { + if (script == UCD_Types.COMMON_SCRIPT + || script == UCD_Types.INHERITED_SCRIPT + || script == UCD_Types.Unknown_Script) { // Not a real script, keep current options. } else if (script != options.reorderCode) { ScriptOptions newOptions = getOrCreateOptionsForScript(script); @@ -1166,14 +1221,21 @@ private void findBumps() { // - We emit (and implementations test) compressibility by primary lead bytes. // - The set of bytes usable for primary second bytes depends on // whether the lead byte is compressible. - props.newByte = options.endsByte || newOptions.beginsByte || - options.compressible != newOptions.compressible; + props.newByte = + options.endsByte + || newOptions.beginsByte + || options.compressible != newOptions.compressible; System.out.println( - (props.newByte ? 
"New primary lead byte:\t" : "Continue lead byte: \t") - + Utility.hex(primary) + " " - + ReorderCodes.getName(script) + " " - + Utility.hex(ch) + " " - + Default.ucd().getName(ch)); + (props.newByte + ? "New primary lead byte:\t" + : "Continue lead byte: \t") + + Utility.hex(primary) + + " " + + ReorderCodes.getName(script) + + " " + + Utility.hex(ch) + + " " + + Default.ucd().getName(ch)); options = newOptions; } } @@ -1184,8 +1246,9 @@ private void findBumps() { // Iterate over siniform ideographic ranges, start scripts. for (Implicit.Range r : uca.implicit.ranges) { short script = Fractional.getFixedScript(r.firstCP); - assert script != UCD_Types.COMMON_SCRIPT && script != UCD_Types.INHERITED_SCRIPT && - script != UCD_Types.Unknown_Script; + assert script != UCD_Types.COMMON_SCRIPT + && script != UCD_Types.INHERITED_SCRIPT + && script != UCD_Types.Unknown_Script; PrimaryToFractional props = getOrCreateProps(r.leadPrimary); props.options = getOrCreateOptionsForScript(script); props.options.firstPrimary = r.leadPrimary; @@ -1197,10 +1260,7 @@ private void findBumps() { hanProps.options.firstPrimary = Implicit.CJK_BASE; hanProps.newByte = true; - final char[][] singlePairs = { - {'a','z'}, {' ', ' '}, - {'0', '9'}, {'.', '.'}, {',', ','} - }; + final char[][] singlePairs = {{'a', 'z'}, {' ', ' '}, {'0', '9'}, {'.', '.'}, {',', ','}}; for (final char[] singlePair : singlePairs) { final char start = singlePair[0]; final char end = singlePair[1]; @@ -1209,45 +1269,56 @@ private void findBumps() { } } - final UnicodeSet twoByteChars = new UnicodeSet( - "[" + - // Cyrillic main exemplar characters from CLDR 22, - // for common locales plus Mongolian. - // We could make this dynamic, using CLDR's tools to fetch this data. - // Consider adding Cyrillic auxiliary exemplar characters. 
- "\u0430-\u044F\u0451-\u045C\u045E-\u045F\u0491\u0493\u0495\u049B\u049D" + - "\u04A3\u04A5\u04AF\u04B1\u04B3\u04B7\u04B9\u04BB\u04CA" + - "\u04D5\u04D9\u04E3\u04E9\u04EF" + - // Try to mostly fill the Cyrillic lead byte. - // Most of Cyrillic & Cyrillic Supplement - // but not archaic/historic characters. - "\u0400-\u045F\u048A-\u04F9\u0500-\u050D" + - - // Arabic main exemplar characters from CLDR 22, - // except for primary ignorable characters. - "\u0621-\u063A\u0641-\u064A\u066E\u0672\u0679\u067C\u067E" + - "\u0681\u0685\u0686\u0688\u0689\u0691\u0693\u0696\u0698\u069A" + - "\u06A9\u06AB\u06AF\u06BA\u06BC\u06BE\u06C1\u06C2\u06C4\u06C7\u06C9\u06CC\u06CD" + - "\u06D0\u06D2" + - // Try to mostly fill the Arabic lead byte. - // Some of the Unicode 1.1 Extended Arabic letters. - "\u0674-\u06A0" + - // Jamo L, V, T - "\u1100-\u1112\u1161-\u1175\u11A8-\u11C2" + - - // Try to mostly fill the Hangul lead byte. - // Some old Hangul letters, mostly with variants. - // Old initial consonants - "\u1114-\u1115\u111A-\u1123\u1127-\u112F\u1157-\u1159" + - // Old medial vowels - "\u1184-\u1188\u1191-\u1194\u119E-\u11A1" + - // Old final consonants - "\u11C7\u11C8\u11CC-\u11CE" + - - // Hiragana & Katakana main-block letters, - // but not extensions nor Hentaigana. - "\\u3041-\\u309E\u30A1-\u30FE" + - "]"); + final UnicodeSet twoByteChars = + new UnicodeSet( + "[" + + + // Cyrillic main exemplar characters from CLDR 22, + // for common locales plus Mongolian. + // We could make this dynamic, using CLDR's tools to fetch this + // data. + // Consider adding Cyrillic auxiliary exemplar characters. + "\u0430-\u044F\u0451-\u045C\u045E-\u045F\u0491\u0493\u0495\u049B\u049D" + + "\u04A3\u04A5\u04AF\u04B1\u04B3\u04B7\u04B9\u04BB\u04CA" + + "\u04D5\u04D9\u04E3\u04E9\u04EF" + + + // Try to mostly fill the Cyrillic lead byte. + // Most of Cyrillic & Cyrillic Supplement + // but not archaic/historic characters. 
+ "\u0400-\u045F\u048A-\u04F9\u0500-\u050D" + + + + // Arabic main exemplar characters from CLDR 22, + // except for primary ignorable characters. + "\u0621-\u063A\u0641-\u064A\u066E\u0672\u0679\u067C\u067E" + + "\u0681\u0685\u0686\u0688\u0689\u0691\u0693\u0696\u0698\u069A" + + "\u06A9\u06AB\u06AF\u06BA\u06BC\u06BE\u06C1\u06C2\u06C4\u06C7\u06C9\u06CC\u06CD" + + "\u06D0\u06D2" + + + // Try to mostly fill the Arabic lead byte. + // Some of the Unicode 1.1 Extended Arabic letters. + "\u0674-\u06A0" + + + // Jamo L, V, T + "\u1100-\u1112\u1161-\u1175\u11A8-\u11C2" + + + + // Try to mostly fill the Hangul lead byte. + // Some old Hangul letters, mostly with variants. + // Old initial consonants + "\u1114-\u1115\u111A-\u1123\u1127-\u112F\u1157-\u1159" + + + // Old medial vowels + "\u1184-\u1188\u1191-\u1194\u119E-\u11A1" + + + // Old final consonants + "\u11C7\u11C8\u11CC-\u11CE" + + + + // Hiragana & Katakana main-block letters, + // but not extensions nor Hentaigana. + "\\u3041-\\u309E\u30A1-\u30FE" + + "]"); final UnicodeSetIterator twoByteIter = new UnicodeSetIterator(twoByteChars); while (twoByteIter.next()) { setTwoBytePrimaryFor(firstScriptPrimary, twoByteIter.codepoint); diff --git a/unicodetools/src/main/java/org/unicode/text/UCA/RadicalStroke.java b/unicodetools/src/main/java/org/unicode/text/UCA/RadicalStroke.java index e10d5af1f..8bfd4a53b 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCA/RadicalStroke.java +++ b/unicodetools/src/main/java/org/unicode/text/UCA/RadicalStroke.java @@ -1,61 +1,52 @@ package org.unicode.text.UCA; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.IOException; import java.io.Writer; import java.util.Arrays; - import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.text.UCD.ToolUnicodePropertySource; import org.unicode.text.utility.Utility; -import 
com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - public final class RadicalStroke { private static final int MAX_RADICAL_NUMBER = 214; /** - * The Unicode 1.1 Unihan block was U+4E00..U+9FA5. - * The ideographs there were allocated in radical-stroke order, - * but some of the radical-stroke data was changed later. + * The Unicode 1.1 Unihan block was U+4E00..U+9FA5. The ideographs there were allocated in + * radical-stroke order, but some of the radical-stroke data was changed later. */ private static final int LAST_UNIHAN_11 = 0x9fa5; private static final boolean DEBUG = false; private String unicodeVersion; - /** - * Han character data in code point order. - */ + /** Han character data in code point order. */ private long[] rawHan; - /** - * Han character data in UCA radical-stroke order. - */ + /** Han character data in UCA radical-stroke order. */ private long[] orderedHan; - /** - * Maps radicalNumberAndSimplified to the radical character. - */ + /** Maps radicalNumberAndSimplified to the radical character. */ private String[] radToChar = new String[(MAX_RADICAL_NUMBER + 1) * 2]; - /** - * Maps radicalNumberAndSimplified to the radical character and its ideograph sibling. - */ + /** Maps radicalNumberAndSimplified to the radical character and its ideograph sibling. */ private String[] radToChars = new String[(MAX_RADICAL_NUMBER + 1) * 2]; - /** - * Radical strings. Avoid constructing them over and over. - */ + /** Radical strings. Avoid constructing them over and over. */ private String[] radicalStrings = new String[(MAX_RADICAL_NUMBER + 1) * 2]; /** - * Han characters for which code point order == radical-stroke order. - * Hand-picked exceptions that are hard to detect optimally - * (because there are 2 or 3 in a row out of order) are removed here, - * while other characters are removed automatically. 
+ * Han characters for which code point order == radical-stroke order. Hand-picked exceptions + * that are hard to detect optimally (because there are 2 or 3 in a row out of order) are + * removed here, while other characters are removed automatically. */ - private UnicodeSet hanInCPOrder = new UnicodeSet(0x4e00, LAST_UNIHAN_11) - .remove(0x561f).remove(0x5620) - .remove(0x7adf).remove(0x7ae0) - .remove(0x9824).remove(0x9825); + private UnicodeSet hanInCPOrder = + new UnicodeSet(0x4e00, LAST_UNIHAN_11) + .remove(0x561f) + .remove(0x5620) + .remove(0x7adf) + .remove(0x7ae0) + .remove(0x9824) + .remove(0x9825); + private UnicodeSet hanNotInCPOrder; public RadicalStroke(String unicodeVersion) { @@ -73,15 +64,18 @@ public RadicalStroke(String unicodeVersion) { while (hanIter.next()) { int c = hanIter.codepoint; assert c >= 0; - int extension = (0x4E00 <= c && c <= 0xFFFF) ? 0 : 1; // see UCA implicit weights BASE FB40 vs. FB80 + int extension = + (0x4E00 <= c && c <= 0xFFFF) + ? 0 + : 1; // see UCA implicit weights BASE FB40 vs. FB80 String rs = rsUnicode.get(c); if (rs == null) { rs = "214'.63"; } // Use only the first radical-stroke value if there are multiple. - int delim = rs.indexOf(' '); // value separator in Unihan data files + int delim = rs.indexOf(' '); // value separator in Unihan data files if (delim < 0) { - delim = rs.indexOf('|'); // The new parser rewrites multi-values with a | separator. + delim = rs.indexOf('|'); // The new parser rewrites multi-values with a | separator. 
if (delim < 0) { delim = rs.length(); } @@ -98,43 +92,61 @@ public RadicalStroke(String unicodeVersion) { int radicalNumberAndSimplified = (radicalNumber << 1) | simplified; int residualStrokeCount = parseInt(rs, dot + 1, delim); long order = - ((long)radicalNumberAndSimplified << 31) | - (residualStrokeCount << 24) | - (extension << 23) | - c; + ((long) radicalNumberAndSimplified << 31) + | (residualStrokeCount << 24) + | (extension << 23) + | c; if (DEBUG) { String radical = rs.substring(0, dot); - System.out.println("U+" + Utility.hex(c) + " -> " + Long.toHexString(order) + - " rad " + radical + - " chars " + radToChars[radicalNumberAndSimplified] + - " residual " + residualStrokeCount); + System.out.println( + "U+" + + Utility.hex(c) + + " -> " + + Long.toHexString(order) + + " rad " + + radical + + " chars " + + radToChars[radicalNumberAndSimplified] + + " residual " + + residualStrokeCount); } if (extension == 0 && 0x4E00 <= c && c <= LAST_UNIHAN_11) { if (!hanInCPOrder.contains(c)) { if (DEBUG) { - System.out.println("*** out of order: " + - Long.toHexString(prevPrevOrder) + ' ' + - Long.toHexString(prevOrder) + " (" + - Long.toHexString(order) + ") (manually removed)"); + System.out.println( + "*** out of order: " + + Long.toHexString(prevPrevOrder) + + ' ' + + Long.toHexString(prevOrder) + + " (" + + Long.toHexString(order) + + ") (manually removed)"); } } else if (prevPrevOrder <= order && prevOrder > order) { // The previous character sorts higher than the surrounding ones. 
if (DEBUG) { - System.out.println("*** out of order: " + - Long.toHexString(prevPrevOrder) + " (" + - Long.toHexString(prevOrder) + ") " + - Long.toHexString(order)); + System.out.println( + "*** out of order: " + + Long.toHexString(prevPrevOrder) + + " (" + + Long.toHexString(prevOrder) + + ") " + + Long.toHexString(order)); } - int prevCodePoint = ((int)prevOrder) & 0x1fffff; + int prevCodePoint = ((int) prevOrder) & 0x1fffff; hanInCPOrder.remove(prevCodePoint); prevOrder = order; } else if (prevOrder > order) { // The current character sorts lower than the previous one. if (DEBUG) { - System.out.println("*** out of order: " + - Long.toHexString(prevPrevOrder) + ' ' + - Long.toHexString(prevOrder) + " (" + - Long.toHexString(order) + ')'); + System.out.println( + "*** out of order: " + + Long.toHexString(prevPrevOrder) + + ' ' + + Long.toHexString(prevOrder) + + " (" + + Long.toHexString(order) + + ')'); } hanInCPOrder.remove(c); } else { @@ -168,10 +180,10 @@ public void printRadicalStrokeOrder(Writer writer) throws IOException { /** * Prints the next radical and the Han ideographs that sort with it. * - * @param pos The data position for the next radical; - * initially use 0, then pass in the return value from the previous call. + * @param pos The data position for the next radical; initially use 0, then pass in the return + * value from the previous call. * @return The next radical's position, or -1 if there are none more. 
- * @throws IOException + * @throws IOException */ private int printNextRadical(int pos, Writer writer) throws IOException { if (pos == orderedHan.length) { @@ -179,19 +191,21 @@ private int printNextRadical(int pos, Writer writer) throws IOException { } StringBuilder sb = new StringBuilder("[radical "); long order = orderedHan[pos]; - int radicalNumberAndSimplified = (int)(order >> 31); + int radicalNumberAndSimplified = (int) (order >> 31); String radicalChars = radToChars[radicalNumberAndSimplified]; - sb.append(getRadicalStringFromShortData(getShortData(order))).append('='). - append(radicalChars).append(':'); + sb.append(getRadicalStringFromShortData(getShortData(order))) + .append('=') + .append(radicalChars) + .append(':'); int start = 0; int prev = 0; do { - int c = (int)order & 0x1fffff; + int c = (int) order & 0x1fffff; if (c != (prev + 1)) { // c does not continue a range. if (start < prev) { // Finish the previous range. - if ((start + 2) <= prev) { // at least 3 code points + if ((start + 2) <= prev) { // at least 3 code points sb.append('-'); } sb.appendCodePoint(prev); @@ -205,10 +219,10 @@ private int printNextRadical(int pos, Writer writer) throws IOException { break; } order = orderedHan[pos]; - } while ((int)(order >> 31) == radicalNumberAndSimplified); + } while ((int) (order >> 31) == radicalNumberAndSimplified); if (start < prev) { // Finish the last range. - if ((start + 2) <= prev) { // at least 3 code points + if ((start + 2) <= prev) { // at least 3 code points sb.append('-'); } sb.appendCodePoint(prev); @@ -221,15 +235,15 @@ private int printNextRadical(int pos, Writer writer) throws IOException { } public void printUnihanIndex(Writer writer) throws IOException { - writer.append("# Index characters for the unihan sort order in root.\n"). - append("# Each index character is an ideograph representing a radical,\n"). 
- append("# and sorts like the first ideograph in the radical-stroke order.\n"); + writer.append("# Index characters for the unihan sort order in root.\n") + .append("# Each index character is an ideograph representing a radical,\n") + .append("# and sorts like the first ideograph in the radical-stroke order.\n"); writer.append("# Unicode ").append(unicodeVersion).append('\n'); StringBuilder sb = new StringBuilder(); - for (int pos = 0;;) { + for (int pos = 0; ; ) { long order = orderedHan[pos]; - int c = (int)order & 0x1fffff; // First code point for the radical. - int radicalNumberAndSimplified = (int)(order >> 31); + int c = (int) order & 0x1fffff; // First code point for the radical. + int radicalNumberAndSimplified = (int) (order >> 31); String radicalChars = radToChars[radicalNumberAndSimplified]; // All radicals should be BMP characters. assert radicalChars.length() == UTF16.countCodePoint(radicalChars); @@ -237,45 +251,44 @@ public void printUnihanIndex(Writer writer) throws IOException { // use the one at index 1 which is in the original Unihan block // which has good font support, // rather than the one at index 0 which is in the radicals block. - sb.replace(0, sb.length(), "&").appendCodePoint(c). - append("=\\uFDD0").append(radicalChars.charAt(1)). - append(" # radical "). - append(getRadicalStringFromShortData(getShortData(order))).append('\n'); + sb.replace(0, sb.length(), "&") + .appendCodePoint(c) + .append("=\\uFDD0") + .append(radicalChars.charAt(1)) + .append(" # radical ") + .append(getRadicalStringFromShortData(getShortData(order))) + .append('\n'); writer.append(sb); do { if (++pos == orderedHan.length) { return; } order = orderedHan[pos]; - } while ((int)(order >> 31) == radicalNumberAndSimplified); + } while ((int) (order >> 31) == radicalNumberAndSimplified); } } - /** - * Returns a set of Han characters for which code point order == radical-stroke order. 
- */ + /** Returns a set of Han characters for which code point order == radical-stroke order. */ public UnicodeSet getHanInCPOrder() { return hanInCPOrder; } - /** - * Returns a set of Han characters for which code point order != radical-stroke order. - */ + /** Returns a set of Han characters for which code point order != radical-stroke order. */ public UnicodeSet getHanNotInCPOrder() { return hanNotInCPOrder; } /** - * Returns a long for the UCA order of ideographs, including the code point tie-breaker. - * Returns 0 for non-ideographs. + * Returns a long for the UCA order of ideographs, including the code point tie-breaker. Returns + * 0 for non-ideographs. */ public long getLongOrder(int cp) { return getData(cp); } /** - * Returns data in bit sets: 15..8=radicalNumber, 7=simplified, 6..0=residualStrokes. - * Returns 0 for non-ideographs. + * Returns data in bit sets: 15..8=radicalNumber, 7=simplified, 6..0=residualStrokes. Returns 0 + * for non-ideographs. */ public int getShortData(int cp) { return getShortData(getData(cp)); @@ -301,24 +314,22 @@ public static int getResidualStrokesFromShortData(int data) { return data & 0x3f; } - /** - * Returns the radical character for its number and simplified-ness. - */ + /** Returns the radical character for its number and simplified-ness. */ public String getRadicalCharFromShortData(int data) { int radicalNumberAndSimplified = data >> 7; assert radicalNumberAndSimplified >= 2; - return radicalNumberAndSimplified < radToChar.length ? - radToChar[radicalNumberAndSimplified] : null; + return radicalNumberAndSimplified < radToChar.length + ? radToChar[radicalNumberAndSimplified] + : null; } - /** - * Returns a string like "90" or "90'". - */ + /** Returns a string like "90" or "90'". */ public String getRadicalStringFromShortData(int data) { int radicalNumberAndSimplified = data >> 7; assert radicalNumberAndSimplified >= 2; - return radicalNumberAndSimplified < radicalStrings.length ? 
- radicalStrings[radicalNumberAndSimplified] : null; + return radicalNumberAndSimplified < radicalStrings.length + ? radicalStrings[radicalNumberAndSimplified] + : null; } private long getData(int cp) { @@ -327,7 +338,7 @@ private long getData(int cp) { int limit = rawHan.length; while (start < limit) { int i = (start + limit) / 2; - int midCP = (int)(rawHan[i] & 0x1fffff); + int midCP = (int) (rawHan[i] & 0x1fffff); if (cp < midCP) { limit = i; } else if (cp > midCP) { @@ -336,11 +347,11 @@ private long getData(int cp) { return rawHan[i]; } } - return 0; // not found + return 0; // not found } private static int getShortData(long order) { - return (int)(order >> 24); + return (int) (order >> 24); } private void getCJKRadicals(IndexUnicodeProperties iup) { @@ -364,22 +375,22 @@ private void getCJKRadicals(IndexUnicodeProperties iup) { assert c >= 0; String oldValue = radToChar[radicalNumberAndSimplified]; if (oldValue == null) { - assert c < 0x3000; // should be a radical code point - radToChar[radicalNumberAndSimplified] = radToChars[radicalNumberAndSimplified] = - Character.toString((char)c); + assert c < 0x3000; // should be a radical code point + radToChar[radicalNumberAndSimplified] = + radToChars[radicalNumberAndSimplified] = Character.toString((char) c); } else { assert 0x4e00 <= c && c <= LAST_UNIHAN_11; int oldCodePoint = oldValue.codePointAt(0); assert oldCodePoint < 0x3000; assert oldValue == radToChars[radicalNumberAndSimplified]; - radToChars[radicalNumberAndSimplified] = oldValue + (char)c; + radToChars[radicalNumberAndSimplified] = oldValue + (char) c; } } } /** - * Parses a small (max 3 digits) integer from a subsequence. - * Avoids creation of a subsequence object. + * Parses a small (max 3 digits) integer from a subsequence. Avoids creation of a subsequence + * object. 
*/ private static int parseInt(String s, int start, int limit) { assert start < limit; diff --git a/unicodetools/src/main/java/org/unicode/text/UCA/ReorderCodes.java b/unicodetools/src/main/java/org/unicode/text/UCA/ReorderCodes.java index 18fcd1e55..b46b53c81 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCA/ReorderCodes.java +++ b/unicodetools/src/main/java/org/unicode/text/UCA/ReorderCodes.java @@ -5,14 +5,12 @@ import org.unicode.text.UCD.UCD_Types; /** - * Helper class for reorder codes: - * Script code (0..FF) or reordering group code - * ({@link ReorderCodes#FIRST}..{@link ReorderCodes#LIMIT}). + * Helper class for reorder codes: Script code (0..FF) or reordering group code ({@link + * ReorderCodes#FIRST}..{@link ReorderCodes#LIMIT}). * - *

Note: The names of the special-group constants are the same as in - * {@link com.ibm.icu.text.Collator.ReorderCodes} - * but the script and reorder code numeric values - * are totally different from those in ICU. + *

Note: The names of the special-group constants are the same as in {@link + * com.ibm.icu.text.Collator.ReorderCodes} but the script and reorder code numeric values are + * totally different from those in ICU. */ public class ReorderCodes { public static final int FIRST = 0x100; @@ -29,48 +27,53 @@ public class ReorderCodes { public static final int FULL_LIMIT = 0x108; private static final String[] SPECIAL_NAMES = { - "SPACE", "PUNCTUATION", "SYMBOL", "CURRENCY", "DIGIT", null, - "REORDER_RESERVED_BEFORE_LATIN", "REORDER_RESERVED_AFTER_LATIN" + "SPACE", + "PUNCTUATION", + "SYMBOL", + "CURRENCY", + "DIGIT", + null, + "REORDER_RESERVED_BEFORE_LATIN", + "REORDER_RESERVED_AFTER_LATIN" }; /** - * Sample characters for collation-specific reordering groups. - * See the comments on {@link #getSampleCharacter(int)}. + * Sample characters for collation-specific reordering groups. See the comments on {@link + * #getSampleCharacter(int)}. */ private static final String[] SPECIAL_SAMPLES = { - "\u00A0", "\u201C", "\u263A", "\u20AC", "4", null, - "\uFF21", "\uFF3A" + "\u00A0", "\u201C", "\u263A", "\u20AC", "4", null, "\uFF21", "\uFF3A" }; public static final int getSpecialReorderCode(int ch) { final byte cat = Fractional.getFixedCategory(ch); switch (cat) { - case UCD_Types.SPACE_SEPARATOR: - case UCD_Types.LINE_SEPARATOR: - case UCD_Types.PARAGRAPH_SEPARATOR: - case UCD_Types.CONTROL: - return SPACE; - case UCD_Types.DASH_PUNCTUATION: - case UCD_Types.START_PUNCTUATION: - case UCD_Types.END_PUNCTUATION: - case UCD_Types.CONNECTOR_PUNCTUATION: - case UCD_Types.OTHER_PUNCTUATION: - case UCD_Types.INITIAL_PUNCTUATION: - case UCD_Types.FINAL_PUNCTUATION: - return PUNCTUATION; - case UCD_Types.OTHER_SYMBOL: - case UCD_Types.MATH_SYMBOL: - case UCD_Types.MODIFIER_SYMBOL: - return SYMBOL; - case UCD_Types.CURRENCY_SYMBOL: - return CURRENCY; - case UCD_Types.DECIMAL_DIGIT_NUMBER: - case UCD_Types.LETTER_NUMBER: - case UCD_Types.OTHER_NUMBER: - return DIGIT; - default: - // Lm etc. 
- return SYMBOL; + case UCD_Types.SPACE_SEPARATOR: + case UCD_Types.LINE_SEPARATOR: + case UCD_Types.PARAGRAPH_SEPARATOR: + case UCD_Types.CONTROL: + return SPACE; + case UCD_Types.DASH_PUNCTUATION: + case UCD_Types.START_PUNCTUATION: + case UCD_Types.END_PUNCTUATION: + case UCD_Types.CONNECTOR_PUNCTUATION: + case UCD_Types.OTHER_PUNCTUATION: + case UCD_Types.INITIAL_PUNCTUATION: + case UCD_Types.FINAL_PUNCTUATION: + return PUNCTUATION; + case UCD_Types.OTHER_SYMBOL: + case UCD_Types.MATH_SYMBOL: + case UCD_Types.MODIFIER_SYMBOL: + return SYMBOL; + case UCD_Types.CURRENCY_SYMBOL: + return CURRENCY; + case UCD_Types.DECIMAL_DIGIT_NUMBER: + case UCD_Types.LETTER_NUMBER: + case UCD_Types.OTHER_NUMBER: + return DIGIT; + default: + // Lm etc. + return SYMBOL; } } @@ -88,10 +91,10 @@ public static final String getName(int reorderCode) { // - Remove scripts supported by ICU4J UScript and CLDR ScriptMetadata. // - Add scripts not yet supported there. switch (reorderCode) { -// case UCD_Types.Old_Hungarian: -// return "Old_Hungarian"; - default: - throw new UnsupportedOperationException("unknown reorderCode " + reorderCode); + // case UCD_Types.Old_Hungarian: + // return "Old_Hungarian"; + default: + throw new UnsupportedOperationException("unknown reorderCode " + reorderCode); } } else { return SPECIAL_NAMES[reorderCode - FIRST]; @@ -108,10 +111,10 @@ public static final String getShortName(int reorderCode) { // - Remove scripts supported by ICU4J UScript and CLDR ScriptMetadata. // - Add scripts not yet supported there. 
switch (reorderCode) { -// case UCD_Types.Old_Hungarian: -// return "Hung"; - default: - throw new UnsupportedOperationException("unknown reorderCode " + reorderCode); + // case UCD_Types.Old_Hungarian: + // return "Hung"; + default: + throw new UnsupportedOperationException("unknown reorderCode " + reorderCode); } } else { return SPECIAL_NAMES[reorderCode - FIRST]; @@ -119,15 +122,16 @@ public static final String getShortName(int reorderCode) { } /** - * Returns a sample character string for the reorder code. - * For regular scripts, it is the sample character defined in CLDR ScriptMetadata.txt. - * For collation-specific codes it is a hardcoded value. + * Returns a sample character string for the reorder code. For regular scripts, it is the sample + * character defined in CLDR ScriptMetadata.txt. For collation-specific codes it is a hardcoded + * value. * *

It is probably a good idea to use sample characters that map to a single primary CE. */ public static final String getSampleCharacter(int reorderCode) { if (reorderCode < FIRST) { - final String scriptName = UCD.getScriptID_fromIndex((short) reorderCode, UCD_Types.SHORT); + final String scriptName = + UCD.getScriptID_fromIndex((short) reorderCode, UCD_Types.SHORT); final ScriptMetadata.Info info = ScriptMetadata.getInfo(scriptName); if (info != null) { return info.sampleChar; @@ -136,15 +140,16 @@ public static final String getSampleCharacter(int reorderCode) { // - Remove scripts supported by ICU4J UScript and CLDR ScriptMetadata. // - Add scripts not yet supported there. switch (reorderCode) { -// case UCD_Types.Old_Hungarian: -// return "𐲡"; - default: - throw new UnsupportedOperationException("unknown reorderCode " + reorderCode); + // case UCD_Types.Old_Hungarian: + // return "𐲡"; + default: + throw new UnsupportedOperationException("unknown reorderCode " + reorderCode); } } else { return SPECIAL_SAMPLES[reorderCode - FIRST]; } } + public static final String getScriptStartString(int reorderCode) { String sampleChar = getSampleCharacter(reorderCode); // Use the U+FDD0 prefix for reorder-reserved ranges diff --git a/unicodetools/src/main/java/org/unicode/text/UCA/ReorderingTokens.java b/unicodetools/src/main/java/org/unicode/text/UCA/ReorderingTokens.java index 72a905638..cc96a8fea 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCA/ReorderingTokens.java +++ b/unicodetools/src/main/java/org/unicode/text/UCA/ReorderingTokens.java @@ -1,21 +1,17 @@ -/** - * - */ +/** */ package org.unicode.text.UCA; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.text.UnicodeSet; import java.io.IOException; import java.util.Collections; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.util.Counter; import org.unicode.text.UCD.Default; import org.unicode.text.UCD.UCD; import org.unicode.text.UCD.UCD_Types; -import 
com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.text.UnicodeSet; - class ReorderingTokens { Counter reorderingToken = new Counter(); @@ -23,8 +19,7 @@ class ReorderingTokens { private final Set primaryCount = new TreeSet(); private final UnicodeSet chars = new UnicodeSet(); - public ReorderingTokens() { - } + public ReorderingTokens() {} public ReorderingTokens(ReorderingTokens scriptSet) { or(scriptSet); @@ -42,21 +37,23 @@ public void or(ReorderingTokens set) { } public int cardinality() { - return reorderingToken.size() - + types.size(); + return reorderingToken.size() + types.size(); } void addInfoFrom(long primary, String source) { primaryCount.add(primary); chars.add(source); final int cp = Default.nfkd().normalize(source).codePointAt(0); - //for (int i = 0; i < source.length(); i += Character.charCount(cp)) { - //cp = source.codePointAt(i); + // for (int i = 0; i < source.length(); i += Character.charCount(cp)) { + // cp = source.codePointAt(i); final byte cat = Fractional.getFixedCategory(cp); final int script = Fractional.getFixedScript(cp); if (!(script == UCD_Types.Unknown_Script || script == UCD_Types.COMMON_SCRIPT) - && (cat == UCD_Types.OTHER_LETTER || cat == UCD_Types.UPPERCASE_LETTER || cat == UCD_Types.LOWERCASE_LETTER || cat == UCD_Types.TITLECASE_LETTER)) { + && (cat == UCD_Types.OTHER_LETTER + || cat == UCD_Types.UPPERCASE_LETTER + || cat == UCD_Types.LOWERCASE_LETTER + || cat == UCD_Types.TITLECASE_LETTER)) { // Add script aliases Hira & Hrkt before adding Kana. 
if (script == UCD_Types.KATAKANA_SCRIPT && !reorderingToken.containsKey("Hira")) { reorderingToken.add("Hira", 1); @@ -71,7 +68,7 @@ void addInfoFrom(long primary, String source) { } else { types.add(UCD.getCategoryID_fromIndex(cat, UCD_Types.SHORT), 1); } - //} + // } } static String getTypesCombined(String chr) { @@ -84,20 +81,22 @@ static String getTypesCombined(String chr) { return typeKD; } - public static String getTypes(String source) { - //StringBuilder result = new StringBuilder(); + // StringBuilder result = new StringBuilder(); String result; final int cp = source.codePointAt(0); - //for (int i = 0; i < source.length(); i += Character.charCount(cp)) { - //cp = source.codePointAt(i); + // for (int i = 0; i < source.length(); i += Character.charCount(cp)) { + // cp = source.codePointAt(i); final byte cat = Fractional.getFixedCategory(cp); final int script = Fractional.getFixedScript(cp); // if (result.length() != 0) { // result.append(' '); // } if (!(script == UCD_Types.Unknown_Script || script == UCD_Types.COMMON_SCRIPT) - && (cat == UCD_Types.OTHER_LETTER || cat == UCD_Types.UPPERCASE_LETTER || cat == UCD_Types.LOWERCASE_LETTER || cat == UCD_Types.TITLECASE_LETTER)) { + && (cat == UCD_Types.OTHER_LETTER + || cat == UCD_Types.UPPERCASE_LETTER + || cat == UCD_Types.LOWERCASE_LETTER + || cat == UCD_Types.TITLECASE_LETTER)) { result = (UCD.getScriptID_fromIndex((short) script, UCD_Types.SHORT)); } else { result = (UCD.getCategoryID_fromIndex(cat, UCD_Types.SHORT)); @@ -112,13 +111,14 @@ public String toString() { } static Set common = new TreeSet(); + static { common.add(UCD.getScriptID_fromIndex(UCD_Types.COMMON_SCRIPT, UCD_Types.SHORT)); common.add(UCD.getScriptID_fromIndex(UCD_Types.Unknown_Script, UCD_Types.SHORT)); common.add(UCD.getScriptID_fromIndex(UCD_Types.INHERITED_SCRIPT, UCD_Types.SHORT)); } - T appendTo(T result, boolean categoriesAlso) { + T appendTo(T result, boolean categoriesAlso) { try { if (reorderingToken.size() == 0 && types.size() == 
0) { return result; @@ -127,10 +127,14 @@ T appendTo(T result, boolean categoriesAlso) { result.append("[").append(primaryCount.size() + "").append("]\t"); } if (!categoriesAlso) { - final String scriptNames = reorderingToken.size() != 0 ? CollectionUtilities.join(reorderingToken.keySet(), " ") : "Zyyy"; + final String scriptNames = + reorderingToken.size() != 0 + ? CollectionUtilities.join(reorderingToken.keySet(), " ") + : "Zyyy"; result.append(scriptNames); } else { - final String scriptNames = reorderingToken.size() != 0 ? joinCounter(reorderingToken) : ""; + final String scriptNames = + reorderingToken.size() != 0 ? joinCounter(reorderingToken) : ""; result.append(scriptNames); if (types.size() != 0) { final String catNames = joinCounter(types); @@ -139,7 +143,7 @@ T appendTo(T result, boolean categoriesAlso) { } result.append(catNames); } - //result.append("\t").append(chars.toPattern(false)); + // result.append("\t").append(chars.toPattern(false)); } return result; } catch (final IOException e) { diff --git a/unicodetools/src/main/java/org/unicode/text/UCA/SecTerToFractional.java b/unicodetools/src/main/java/org/unicode/text/UCA/SecTerToFractional.java index 73eb3d866..49f112567 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCA/SecTerToFractional.java +++ b/unicodetools/src/main/java/org/unicode/text/UCA/SecTerToFractional.java @@ -3,9 +3,7 @@ import java.util.Map; import java.util.TreeMap; -/** - * Maps per-one-primary secondary and tertiary UCA weights to fractional weights. - */ +/** Maps per-one-primary secondary and tertiary UCA weights to fractional weights. */ public final class SecTerToFractional { // There is one instance of this class per primary weight // that has non-trivial secondary and/or tertiary weights. 
@@ -36,37 +34,31 @@ public final class SecTerToFractional { private static final int UCA_TERTIARY_LIMIT = UCA_Types.MAX_TERTIARY + 1; /** - * If true, then we store the secondary and tertiary weights - * for CEs like [00, 00, tt] and [00, ss, tt]. - * ss cannot be the common weight. - * {@link #commonSecTs2f} stores the tertiary weights of tertiary CEs [00, 00, tt], - * if there are any besides 00. + * If true, then we store the secondary and tertiary weights for CEs like [00, 00, tt] and [00, + * ss, tt]. ss cannot be the common weight. {@link #commonSecTs2f} stores the tertiary weights + * of tertiary CEs [00, 00, tt], if there are any besides 00. * - *

If false, then we store the weights for CEs like [pp, ss, tt] (where pp!=0). - * ss cannot be 00. - * {@link #commonSecTs2f} stores the tertiary weights of primary CEs like [pp, ss, tt] - * where ss=common, if there are any besides the common tertiary. + *

If false, then we store the weights for CEs like [pp, ss, tt] (where pp!=0). ss cannot be + * 00. {@link #commonSecTs2f} stores the tertiary weights of primary CEs like [pp, ss, tt] where + * ss=common, if there are any besides the common tertiary. */ private final boolean isPrimaryIgnorable; + private int commonSecLowestUCATer; /** - * Tertiaries for the 00 or common secondary weight. - * null if only 00 and common tertiary weights. + * Tertiaries for the 00 or common secondary weight. null if only 00 and common tertiary + * weights. * * @see #isPrimaryIgnorable */ private int[] commonSecTs2f; - /** - * Secondaries-to-fractional. - */ + /** Secondaries-to-fractional. */ private Map ss2f; private static final class SecondaryToFractional { private int fractionalSecondary; private int lowestUCATer; - /** - * Tertiaries-to-fractional. - */ + /** Tertiaries-to-fractional. */ private int[] ts2f; } @@ -110,8 +102,8 @@ private void appendTertiaries(int[] secTs2f, StringBuilder sb) { } /** - * After adding all of the secondary and tertiary weights for a primary, - * this method must be called before using {@link #getFractionalSecAndTer(int, int)}. + * After adding all of the secondary and tertiary weights for a primary, this method must be + * called before using {@link #getFractionalSecAndTer(int, int)}. 
*/ public void assignFractionalWeights() { if (commonSecTs2f != null) { @@ -122,7 +114,8 @@ public void assignFractionalWeights() { terForcedToCommon = -1; } else { int secondLowestTer = commonSecLowestUCATer + 1; - while (secondLowestTer <= UCA_Types.MAX_TERTIARY && commonSecTs2f[secondLowestTer] == 0) { + while (secondLowestTer <= UCA_Types.MAX_TERTIARY + && commonSecTs2f[secondLowestTer] == 0) { ++secondLowestTer; } if (secondLowestTer == UCA_Types.NORMAL_HIRAGANA_TERTIARY) { @@ -178,7 +171,8 @@ public void assignFractionalWeights() { } } - private void setFractionalTertiaries(int[] secTs2f, int terForcedToCommon, Fractional.WeightIterator iter) { + private void setFractionalTertiaries( + int[] secTs2f, int terForcedToCommon, Fractional.WeightIterator iter) { for (int ter = UCA_Types.NEUTRAL_TERTIARY + 1; ter < secTs2f.length; ++ter) { if (ter == terForcedToCommon) { secTs2f[ter] = Fractional.COMMON_TER; @@ -230,9 +224,8 @@ public void addUCASecondaryAndTertiary(int sec, int ter) { } /** - * Converts the UCA secondary & tertiary weights to fractional weights. - * Returns an int with the fractional secondary in the upper 16 bits - * and the fractional tertiary in the lower 16 bits. + * Converts the UCA secondary & tertiary weights to fractional weights. Returns an int with the + * fractional secondary in the upper 16 bits and the fractional tertiary in the lower 16 bits. 
*/ public int getFractionalSecAndTer(int sec, int ter) { checkUCAWeights(sec, ter); diff --git a/unicodetools/src/main/java/org/unicode/text/UCA/TestCompatibilityCharacters.java b/unicodetools/src/main/java/org/unicode/text/UCA/TestCompatibilityCharacters.java index aee26adda..7dfb5686d 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCA/TestCompatibilityCharacters.java +++ b/unicodetools/src/main/java/org/unicode/text/UCA/TestCompatibilityCharacters.java @@ -1,5 +1,6 @@ package org.unicode.text.UCA; +import com.ibm.icu.text.UTF16; import java.io.File; import java.io.IOException; import java.io.PrintWriter; @@ -7,14 +8,11 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeMap; - import org.unicode.text.UCD.Default; import org.unicode.text.UCD.UCD; import org.unicode.text.UCD.UCD_Types; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.UTF16; - public final class TestCompatibilityCharacters { private static final byte MULTIPLES = 0x20, COMPRESSED = 0x40, OTHER_MASK = 0x1F; private static final BitSet compressSet = new BitSet(); @@ -22,7 +20,11 @@ public final class TestCompatibilityCharacters { static void testCompatibilityCharacters(UCA uca) throws IOException { final String fullFileName = "UCA_CompatComparison.txt"; - final PrintWriter testLog = Utility.openPrintWriter(UCA.getOutputDir() + File.separator + "log", fullFileName, Utility.UTF8_WINDOWS); + final PrintWriter testLog = + Utility.openPrintWriter( + UCA.getOutputDir() + File.separator + "log", + fullFileName, + Utility.UTF8_WINDOWS); final int[] kenCes = new int[50]; final int[] markCes = new int[50]; @@ -50,7 +52,7 @@ static void testCompatibilityCharacters(UCA uca) throws IOException { // fix type type = getDecompType(i); - final String s = String.valueOf((char)i); + final String s = String.valueOf((char) i); final int kenLen = uca.getCEs(s, decompType, kenCes); // true final int markLen = fixCompatibilityCE(uca, s, true, markCes, false); @@ -59,32 +61,34 @@ 
static void testCompatibilityCharacters(UCA uca) throws IOException { final String comp = CEList.toString(kenComp, kenCLen); if (arraysMatch(kenCes, kenLen, kenComp, kenCLen)) { - forLater.put((char)(COMPRESSED | type) + s, comp); + forLater.put((char) (COMPRESSED | type) + s, comp); continue; } if (type == UCD_Types.CANONICAL && multipleZeroPrimaries(markCes, markLen)) { - forLater.put((char)(MULTIPLES | type) + s, comp); + forLater.put((char) (MULTIPLES | type) + s, comp); continue; } - forLater.put((char)type + s, comp); + forLater.put((char) type + s, comp); } } final Iterator it = forLater.keySet().iterator(); - byte oldType = (byte)0xFF; // anything unique + byte oldType = (byte) 0xFF; // anything unique int caseCount = 0; WriteCollationData.writeVersionAndDate(testLog, fullFileName, true); - //log.println("# UCA Version: " + collator.getDataVersion() + "/" + collator.getUCDVersion()); - //log.println("Generated: " + getNormalDate()); + // log.println("# UCA Version: " + collator.getDataVersion() + "/" + + // collator.getUCDVersion()); + // log.println("Generated: " + getNormalDate()); while (it.hasNext()) { final String key = it.next(); - final byte type = (byte)key.charAt(0); + final byte type = (byte) key.charAt(0); if (type != oldType) { oldType = type; testLog.println("==============================================================="); testLog.print("CASE " + (caseCount++) + ": "); - final byte rType = (byte)(type & OTHER_MASK); - testLog.println(" Decomposition Type = " + UCD.getDecompositionTypeID_fromIndex(rType)); + final byte rType = (byte) (type & OTHER_MASK); + testLog.println( + " Decomposition Type = " + UCD.getDecompositionTypeID_fromIndex(rType)); if ((type & COMPRESSED) != 0) { testLog.println(" Successfully Compressed a la Ken"); testLog.println(" [XXXX.0020.YYYY][0000.ZZZZ.0002] => [XXXX.ZZZZ.YYYY]"); @@ -104,10 +108,12 @@ static void testCompatibilityCharacters(UCA uca) throws IOException { final String markStr = CEList.toString(markCes, 
markLen); if ((type & COMPRESSED) != 0) { - testLog.println("COMPRESSED #" + (++count) + ": " + Default.ucd().getCodeAndName(s)); + testLog.println( + "COMPRESSED #" + (++count) + ": " + Default.ucd().getCodeAndName(s)); testLog.println(" : " + comp); } else { - testLog.println("DIFFERENCE #" + (++count) + ": " + Default.ucd().getCodeAndName(s)); + testLog.println( + "DIFFERENCE #" + (++count) + ": " + Default.ucd().getCodeAndName(s)); testLog.println("generated : " + markStr); if (!markStr.equals(comp)) { testLog.println("compressed: " + comp); @@ -119,8 +125,8 @@ static void testCompatibilityCharacters(UCA uca) throws IOException { if (!nfd.equals(nfkd)) { testLog.println("NFD : " + Default.ucd().getCodeAndName(nfd)); } - //kenCLen = collator.getCEs(decomp, true, kenComp); - //log.println("decomp ce: " + CEList.toString(kenComp, kenCLen)); + // kenCLen = collator.getCEs(decomp, true, kenComp); + // log.println("decomp ce: " + CEList.toString(kenComp, kenCLen)); } testLog.println(); } @@ -182,10 +188,11 @@ private static int kenCompress(int[] markCes, int markLen) { if (CEList.getPrimary(next) == 0 && CEList.getSecondary(prev) == 0x20 && CEList.getTertiary(next) == 0x2) { - markCes[out - 1] = UCA.makeKey( - CEList.getPrimary(prev), - CEList.getSecondary(next), - CEList.getTertiary(prev)); + markCes[out - 1] = + UCA.makeKey( + CEList.getPrimary(prev), + CEList.getSecondary(next), + CEList.getTertiary(prev)); compressSet.set(CEList.getSecondary(next)); } else { markCes[out++] = next; @@ -206,9 +213,10 @@ private static boolean arraysMatch(int[] a, int aLen, int[] b, int bLen) { return true; } - private static int fixCompatibilityCE(UCA uca, String s, boolean decompose, int[] output, boolean compress) { + private static int fixCompatibilityCE( + UCA uca, String s, boolean decompose, int[] output, boolean compress) { final byte type = getDecompType(UTF16.charAt(s, 0)); - //char ch = s.charAt(0); + // char ch = s.charAt(0); final String decomp = 
Default.nfkd().normalize(s); int len = 0; @@ -217,39 +225,37 @@ private static int fixCompatibilityCE(UCA uca, String s, boolean decompose, int[ markLen = kenCompress(markCes, markLen); } - //for (int j = 0; j < decomp.length(); ++j) { + // for (int j = 0; j < decomp.length(); ++j) { for (int k = 0; k < markLen; ++k) { int t = CEList.getTertiary(markCes[k]); t = CEList.remap(k, type, t); /* - if (type != CANONICAL) { - if (0x3041 <= ch && ch <= 0x3094) t = 0xE; // hiragana - else if (0x30A1 <= ch && ch <= 0x30FA) t = 0x11; // katakana - } - switch (type) { - case COMPATIBILITY: t = (t == 8) ? 0xA : 4; break; - case COMPAT_FONT: t = (t == 8) ? 0xB : 5; break; - case COMPAT_NOBREAK: t = 0x1B; break; - case COMPAT_INITIAL: t = 0x17; break; - case COMPAT_MEDIAL: t = 0x18; break; - case COMPAT_FINAL: t = 0x19; break; - case COMPAT_ISOLATED: t = 0x1A; break; - case COMPAT_CIRCLE: t = (t == 0x11) ? 0x13 : (t == 8) ? 0xC : 6; break; - case COMPAT_SUPER: t = 0x14; break; - case COMPAT_SUB: t = 0x15; break; - case COMPAT_VERTICAL: t = 0x16; break; - case COMPAT_WIDE: t= (t == 8) ? 9 : 3; break; - case COMPAT_NARROW: t = (0xFF67 <= ch && ch <= 0xFF6F) ? 0x10 : 0x12; break; - case COMPAT_SMALL: t = (t == 0xE) ? 0xE : 0xF; break; - case COMPAT_SQUARE: t = (t == 8) ? 0x1D : 0x1C; break; - case COMPAT_FRACTION: t = 0x1E; break; - } - */ - output[len++] = UCA.makeKey( - CEList.getPrimary(markCes[k]), - CEList.getSecondary(markCes[k]), - t); - //} + if (type != CANONICAL) { + if (0x3041 <= ch && ch <= 0x3094) t = 0xE; // hiragana + else if (0x30A1 <= ch && ch <= 0x30FA) t = 0x11; // katakana + } + switch (type) { + case COMPATIBILITY: t = (t == 8) ? 0xA : 4; break; + case COMPAT_FONT: t = (t == 8) ? 0xB : 5; break; + case COMPAT_NOBREAK: t = 0x1B; break; + case COMPAT_INITIAL: t = 0x17; break; + case COMPAT_MEDIAL: t = 0x18; break; + case COMPAT_FINAL: t = 0x19; break; + case COMPAT_ISOLATED: t = 0x1A; break; + case COMPAT_CIRCLE: t = (t == 0x11) ? 0x13 : (t == 8) ? 
0xC : 6; break; + case COMPAT_SUPER: t = 0x14; break; + case COMPAT_SUB: t = 0x15; break; + case COMPAT_VERTICAL: t = 0x16; break; + case COMPAT_WIDE: t= (t == 8) ? 9 : 3; break; + case COMPAT_NARROW: t = (0xFF67 <= ch && ch <= 0xFF6F) ? 0x10 : 0x12; break; + case COMPAT_SMALL: t = (t == 0xE) ? 0xE : 0xF; break; + case COMPAT_SQUARE: t = (t == 8) ? 0x1D : 0x1C; break; + case COMPAT_FRACTION: t = 0x1E; break; + } + */ + output[len++] = + UCA.makeKey(CEList.getPrimary(markCes[k]), CEList.getSecondary(markCes[k]), t); + // } } return len; } diff --git a/unicodetools/src/main/java/org/unicode/text/UCA/UCA.java b/unicodetools/src/main/java/org/unicode/text/UCA/UCA.java index 6a11d2c25..4c50e9bea 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCA/UCA.java +++ b/unicodetools/src/main/java/org/unicode/text/UCA/UCA.java @@ -1,18 +1,18 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCA/UCA.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCA/UCA.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCA; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.BufferedReader; -import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; @@ -28,7 +28,6 @@ import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.text.UCA.UCA_Statistics.RoBitSet; import org.unicode.text.UCD.Default; import org.unicode.text.UCD.Normalizer; @@ -39,44 +38,37 @@ import org.unicode.text.utility.UTF16Plus; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - /** - * UCA is a working version of the UTS #10 Unicode Collation Algorithm, - * as described on https://www.unicode.org/reports/tr10/ + * UCA is a working version of the UTS #10 Unicode Collation Algorithm, as described on + * https://www.unicode.org/reports/tr10/ + * * @author Mark Davis - -

It is not optimized, although it does use some techniques that are required for -a real optimization, such as squeezing all the weights into 32 bits. - -

Invariants relied upon by the algorithm: - -

UCA Data: -

    -
  1. Tertiary values are less than 0x80 -
  2. Variables (marked with *), have a distinct, closed range of primaries. - That is, there are no variable CEs X, Z and non-ignorable CE Y such that X[1] <= Y[1] <= Z[1]
    - This saves a bit in each CE. -
  3. It needs to be fixed when reading: only non-zero weights (levels 1-3) are really variable! -
- -

Limits: If any of the weight limits are reached (FFFF for primary, 1FF for secondary, 7F for tertiary), -expanding characters can be used to achieve the right results, as discussed in UTR#10. - -

Remarks: -

Neither the old 14651 nor the old UCA algorithms for backwards really worked. -This is because of shared -characters between scripts with different directions, like French with Arabic or Greek. + *

It is not optimized, although it does use some techniques that are required for a real + * optimization, such as squeezing all the weights into 32 bits. + *

Invariants relied upon by the algorithm: + *

UCA Data: + *

    + *
  1. Tertiary values are less than 0x80 + *
  2. Variables (marked with *), have a distinct, closed range of primaries. That is, there + * are no variable CEs X, Z and non-ignorable CE Y such that X[1] <= Y[1] <= Z[1]
    + * This saves a bit in each CE. + *
  3. It needs to be fixed when reading: only non-zero weights (levels 1-3) are really + * variable! + *
+ *

Limits: If any of the weight limits are reached (FFFF for primary, 1FF for secondary, 7F + * for tertiary), expanding characters can be used to achieve the right results, as discussed in + * UTR#10. + *

Remarks: + *

Neither the old 14651 nor the old UCA algorithms for backwards really worked. This is + * because of shared characters between scripts with different directions, like French with + * Arabic or Greek. */ - -final public class UCA implements Comparator, UCA_Types { +public final class UCA implements Comparator, UCA_Types { // Utility function copied from // icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/RoundTripTest.java /** - * If abbreviated=true, returns a set which only a sampling of the original code points. - * density is the approximate total number of code points to returned for the entire set. + * If abbreviated=true, returns a set which only a sampling of the original code points. density + * is the approximate total number of code points to returned for the entire set. */ private static UnicodeSet abbreviateSet(UnicodeSet set, boolean abbreviated, int density) { if (!abbreviated) { @@ -107,9 +99,13 @@ private static UnicodeSet abbreviateSet(UnicodeSet set, boolean abbreviated) { return abbreviateSet(set, abbreviated, 100); } - //-------------------------------------------------------------------- + // -------------------------------------------------------------------- - public enum CollatorType {ducet, cldr, cldrWithoutFFFx} + public enum CollatorType { + ducet, + cldr, + cldrWithoutFFFx + } public static final String copyright = "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved."; @@ -119,10 +115,7 @@ public int compare(String a, String b) { return getSortKey(a).compareTo(getSortKey(b)); } - - /** - * Records the codeversion - */ + /** Records the codeversion */ private static final String codeVersion = "7"; // ============================================================= @@ -145,9 +138,7 @@ public int compare(String a, String b) { private final UCA_Data ucaData; public final Implicit implicit; - /** - * Sample characters and strings for charts and conformance tests. 
- */ + /** Sample characters and strings for charts and conformance tests. */ private final UnicodeSet moreSamples; // ============================================================= @@ -156,13 +147,15 @@ public int compare(String a, String b) { private String fileVersion = "??"; - // TODO: create these objects (with final fields) while building the data, not later when iterating + // TODO: create these objects (with final fields) while building the data, not later when + // iterating static final class Primary { int primary = -1; int nextPrimary = -1; private CharSequence representative; private Primary() {} + String getRepresentative() { // TODO: statistics should already store a String not a CharSequence return representative.toString(); @@ -214,12 +207,13 @@ public Iterator iterator() { } /** - * Initializes the collation from a stream of rules in the normal formal. - * If the source is null, uses the normal Unicode data files, which - * need to be in BASE_DIR. + * Initializes the collation from a stream of rules in the normal formal. If the source is null, + * uses the normal Unicode data files, which need to be in BASE_DIR. + * * @param type */ - public UCA(String sourceFile, String unicodeVersion, Remap primaryRemap) throws java.io.IOException { + public UCA(String sourceFile, String unicodeVersion, Remap primaryRemap) + throws java.io.IOException { fullData = sourceFile == null; fileVersion = sourceFile; @@ -263,25 +257,20 @@ public UCA(String sourceFile, String unicodeVersion, Remap primaryRemap) throws } } else */ { - final BufferedReader in = new BufferedReader( - new FileReader(sourceFile), BUFFER_SIZE); + final BufferedReader in = new BufferedReader(new FileReader(sourceFile), BUFFER_SIZE); addCollationElements(in); in.close(); } cleanup(); } - /** - * Returns all non-ignorable, below-implicit UCA primary weights. - */ + /** Returns all non-ignorable, below-implicit UCA primary weights. 
*/ Iterable getRegularPrimaries() { // Start after the ignorable primary 0. return new PrimaryIterable(1); } - /** - * Returns all below-Han UCA primary weights, starting with ignorable 0. - */ + /** Returns all below-Han UCA primary weights, starting with ignorable 0. */ PrimaryIterable getIgnorableAndRegularPrimaries() { return new PrimaryIterable(0); } @@ -296,45 +285,49 @@ public Normalizer getNFDNormalizer() { /** * Constructs a sort key for given CEs. + * * @param ces collation elements * @param alternate choice of different 4th level weight construction * @param appendIdentical whether to append an identical level, and which kind of one - * @return Result is a String not really of Unicodes, but of weights. - * String is just a handy way of returning them in Java, since there are no - * unsigned shorts. + * @return Result is a String not really of Unicodes, but of weights. String is just a handy way + * of returning them in Java, since there are no unsigned shorts. */ public String getSortKey(CEList ces, byte alternate, AppendToCe appendIdentical) { return getSortKey(ces, "", alternate, defaultDecomposition, appendIdentical); } /** - * Constructs a sort key for a string of input Unicode characters. Uses - * default values for alternate and decomposition. + * Constructs a sort key for a string of input Unicode characters. Uses default values for + * alternate and decomposition. + * * @param sourceString string to make a sort key for. - * @return Result is a String not of really of Unicodes, but of weights. - * String is just a handy way of returning them in Java, since there are no - * unsigned shorts. + * @return Result is a String not of really of Unicodes, but of weights. String is just a handy + * way of returning them in Java, since there are no unsigned shorts. 
*/ public String getSortKey(String sourceString) { - return getSortKey(null, sourceString, defaultAlternate, defaultDecomposition, AppendToCe.none); + return getSortKey( + null, sourceString, defaultAlternate, defaultDecomposition, AppendToCe.none); } /** - * Constructs a sort key for a string of input Unicode characters. Uses - * default value decomposition. + * Constructs a sort key for a string of input Unicode characters. Uses default value + * decomposition. + * * @param sourceString string to make a sort key for. * @param alternate choice of different 4th level weight construction - * @return Result is a String not of really of Unicodes, but of weights. - * String is just a handy way of returning them in Java, since there are no - * unsigned shorts. + * @return Result is a String not of really of Unicodes, but of weights. String is just a handy + * way of returning them in Java, since there are no unsigned shorts. */ - public String getSortKey(String sourceString, byte alternate) { return getSortKey(null, sourceString, alternate, defaultDecomposition, AppendToCe.none); } public static final int CE_FFFE = UCA.makeKey(0x1, 0x20, 2); - public enum AppendToCe {none, nfd, tieBreaker} + public enum AppendToCe { + none, + nfd, + tieBreaker + } private void setSourceString(String sourceString, boolean decomposition) { decompositionBuffer.setLength(0); @@ -348,42 +341,50 @@ private void setSourceString(String sourceString, boolean decomposition) { /** * Constructs a sort key for a string of input Unicode characters. + * * @param sourceString string to make a sort key for. * @param alternate choice of different 4th level weight construction - * @param decomposition true for UCA, false where the text is guaranteed to be - * normalization form C with no combining marks of class 0. + * @param decomposition true for UCA, false where the text is guaranteed to be normalization + * form C with no combining marks of class 0. 
* @param appendIdentical whether to append an identical level, and which kind of one - * @return Result is a String not of really of Unicodes, but of weights. - * String is just a handy way of returning them in Java, since there are no - * unsigned shorts. - */ - public String getSortKey(String sourceString, byte alternate, boolean decomposition, AppendToCe appendIdentical) { + * @return Result is a String not of really of Unicodes, but of weights. String is just a handy + * way of returning them in Java, since there are no unsigned shorts. + */ + public String getSortKey( + String sourceString, + byte alternate, + boolean decomposition, + AppendToCe appendIdentical) { return getSortKey(null, sourceString, alternate, decomposition, appendIdentical); } /** - * Constructs a sort key for given CEs, and/or a string of input Unicode characters. - * When the CEs are used up, then the sourceString is processed. + * Constructs a sort key for given CEs, and/or a string of input Unicode characters. When the + * CEs are used up, then the sourceString is processed. + * * @param ces collation elements to be considered first, can be null * @param sourceString string to make a sort key for, can be empty but not null * @param alternate choice of different 4th level weight construction - * @param decomposition true for UCA, false where the text is guaranteed to be - * normalization form C with no combining marks of class 0. + * @param decomposition true for UCA, false where the text is guaranteed to be normalization + * form C with no combining marks of class 0. * @param appendIdentical whether to append an identical level, and which kind of one - * @return Result is a String not really of Unicodes, but of weights. - * String is just a handy way of returning them in Java, since there are no - * unsigned shorts. 
- */ - public String getSortKey(CEList ces, String sourceString, byte alternate, boolean decomposition, AppendToCe appendIdentical) { + * @return Result is a String not really of Unicodes, but of weights. String is just a handy way + * of returning them in Java, since there are no unsigned shorts. + */ + public String getSortKey( + CEList ces, + String sourceString, + byte alternate, + boolean decomposition, + AppendToCe appendIdentical) { setSourceString(sourceString, decomposition); // Weight strings - not chars, weights. - primaries.setLength(0); // clear out - secondaries.setLength(0); // clear out - tertiaries.setLength(0); // clear out - quaternaries.setLength(0); // clear out - if (SHOW_CE) - { + primaries.setLength(0); // clear out + secondaries.setLength(0); // clear out + tertiaries.setLength(0); // clear out + quaternaries.setLength(0); // clear out + if (SHOW_CE) { debugList.setLength(0); // clear out } @@ -411,31 +412,31 @@ public String getSortKey(CEList ces, String sourceString, byte alternate, boolea } switch (alternate) { - case ZEROED: - if (isVariable(ce)) { - ce = 0; - } - break; - case SHIFTED_TRIMMED: - case SHIFTED: - if (CEList.getTertiary(ce) == 0) { - weight4 = 0; - } else if (ce == CE_FFFE) { - weight4 = getPrimary(ce); - lastWasVariable = false; - } else if (isVariable(ce)) { // variables - weight4 = getPrimary(ce); - lastWasVariable = true; - ce = 0; - } else if (lastWasVariable && getPrimary(ce) == 0) { // zap trailing ignorables - ce = 0; - weight4 = 0; - } else { // above variables - lastWasVariable = false; - weight4 = '\uFFFF'; - } - break; - // case NON_IGNORABLE: // doesn't ever change! 
+ case ZEROED: + if (isVariable(ce)) { + ce = 0; + } + break; + case SHIFTED_TRIMMED: + case SHIFTED: + if (CEList.getTertiary(ce) == 0) { + weight4 = 0; + } else if (ce == CE_FFFE) { + weight4 = getPrimary(ce); + lastWasVariable = false; + } else if (isVariable(ce)) { // variables + weight4 = getPrimary(ce); + lastWasVariable = true; + ce = 0; + } else if (lastWasVariable && getPrimary(ce) == 0) { // zap trailing ignorables + ce = 0; + weight4 = 0; + } else { // above variables + lastWasVariable = false; + weight4 = '\uFFFF'; + } + break; + // case NON_IGNORABLE: // doesn't ever change! } if (SHOW_CE) { if (debugList.length() != 0) { @@ -478,24 +479,24 @@ public String getSortKey(CEList ces, String sourceString, byte alternate, boolea final StringBuilder result = primaries; if (strength >= 2) { - result.append(LEVEL_SEPARATOR); // separator + result.append(LEVEL_SEPARATOR); // separator result.append(secondaries); if (strength >= 3) { - result.append(LEVEL_SEPARATOR); // separator + result.append(LEVEL_SEPARATOR); // separator result.append(tertiaries); if (strength >= 4) { - result.append(LEVEL_SEPARATOR); // separator + result.append(LEVEL_SEPARATOR); // separator if (alternate == SHIFTED_TRIMMED) { int q; - for (q = quaternaries.length()-1; q >= 0; --q) { + for (q = quaternaries.length() - 1; q >= 0; --q) { if (quaternaries.charAt(q) != '\uFFFF') { break; } } - quaternaries.setLength(q+1); + quaternaries.setLength(q + 1); } result.append(quaternaries); - //appendInCodePointOrder(decompositionBuffer, result); + // appendInCodePointOrder(decompositionBuffer, result); } } } @@ -530,8 +531,7 @@ public static int strengthDifference(String sortKey1, String sortKey2) { if (c1 > c2) { return strength; } - if (c1 == LEVEL_SEPARATOR) - { + if (c1 == LEVEL_SEPARATOR) { --strength; // Separator! } } @@ -544,80 +544,60 @@ public static int strengthDifference(String sortKey1, String sortKey2) { return 0; } - /** - * Turns backwards (e.g. 
for French) on globally for all secondaries - */ + /** Turns backwards (e.g. for French) on globally for all secondaries */ public void setBackwards(boolean backwards) { useBackwards = backwards; } - /** - * Retrieves value applied by set. - */ + /** Retrieves value applied by set. */ public boolean isBackwards() { return useBackwards; } - /** - * Causes variables (those with *) to be set to all zero weights (level 1-3). - */ + /** Causes variables (those with *) to be set to all zero weights (level 1-3). */ public void setDecompositionState(boolean state) { defaultDecomposition = state; } - /** - * Retrieves value applied by set. - */ + /** Retrieves value applied by set. */ public boolean isDecomposed() { return defaultDecomposition; } - /** - * Causes variables (those with *) to be set to all zero weights (level 1-3). - */ + /** Causes variables (those with *) to be set to all zero weights (level 1-3). */ public void setAlternate(byte status) { defaultAlternate = status; } - /** - * Retrieves value applied by set. - */ + /** Retrieves value applied by set. */ public byte getAlternate() { return defaultAlternate; } /** - * Sets the maximum strength level to be included in the string. - * E.g. with 3, only weights of 1, 2, and 3 are included: level 4 weights are discarded. + * Sets the maximum strength level to be included in the string. E.g. with 3, only weights of 1, + * 2, and 3 are included: level 4 weights are discarded. */ public void setStrength(int inStrength) { strength = inStrength; } - /** - * Retrieves value applied by set. - */ + /** Retrieves value applied by set. 
*/ public int getStrength() { return strength; } - /** - * Retrieves version - */ + /** Retrieves version */ public String getCodeVersion() { return codeVersion; } - /** - * Retrieves versions - */ + /** Retrieves versions */ public String getDataVersion() { return dataVersion; } - /** - * Retrieves versions - */ + /** Retrieves versions */ public String getUCDVersion() { return ucdVersion; } @@ -627,20 +607,20 @@ public static String codePointOrder(CharSequence s) { } /** - * Appends UTF-16 string - * with the values swapped around so that they compare in - * code-point order. Replace 0000 and 0001 by 0001 0001/2 + * Appends UTF-16 string with the values swapped around so that they compare in code-point + * order. Replace 0000 and 0001 by 0001 0001/2 + * * @param source Normal UTF-16 (Java) string * @return sort key (as string) - * @author Markus Scherer (cast into Java by MD) - * NOTE: changed to be longer, but handle isolated surrogates + * @author Markus Scherer (cast into Java by MD) NOTE: changed to be longer, but handle isolated + * surrogates */ public static StringBuffer appendInCodePointOrder(CharSequence source, StringBuffer target) { int cp; for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(source, i); - target.append((char)((cp >> 15) | 0x8000)); - target.append((char)(cp | 0x8000)); + target.append((char) ((cp >> 15) | 0x8000)); + target.append((char) (cp | 0x8000)); /* if (ch <= 1) { // hack to avoid nulls target.append('\u0001'); @@ -656,8 +636,8 @@ public static StringBuffer appendInCodePointOrder(CharSequence source, StringBuf * Gets all collation elements for the input string and appends them to the output stack. 
* * @param sourceString input string - * @param decomposition if true then the string is NFD'ed, - * otherwise it must already be normalized + * @param decomposition if true then the string is NFD'ed, otherwise it must already be + * normalized * @param output IntStack gets the collation elements appended */ public void getCEs(String sourceString, boolean decomposition, IntStack output) { @@ -673,8 +653,8 @@ public void getCEs(String sourceString, boolean decomposition, IntStack output) * Gets all collation elements for the input string and writes them to the output array. * * @param sourceString input string - * @param decomposition if true then the string is NFD'ed, - * otherwise it must already be normalized + * @param decomposition if true then the string is NFD'ed, otherwise it must already be + * normalized * @param output array where the collation elements are written * @return number of collation element integers written to the output * @throws IndexOutOfBoundsException if the output array is too short @@ -695,8 +675,8 @@ public int getCEs(String sourceString, boolean decomposition, int[] output) { * Gets all collation elements for the input string and returns them as a CEList. * * @param sourceString input string - * @param decomposition if true then the string is NFD'ed, - * otherwise it must already be normalized + * @param decomposition if true then the string is NFD'ed, otherwise it must already be + * normalized * @return a CEList with the collation elements */ public CEList getCEList(String sourceString, boolean decomposition) { @@ -705,7 +685,7 @@ public CEList getCEList(String sourceString, boolean decomposition) { // If there is only one CEList, then return that one. 
final CEList ces1 = nextCEs(); if (ces1 == null) { - return null; // not even one (should only happen for an empty sourceString) + return null; // not even one (should only happen for an empty sourceString) } CEList ces = nextCEs(); if (ces == null) { @@ -729,16 +709,13 @@ public CEList getCEList(String sourceString, boolean decomposition) { return stack.isEmpty() ? CEList.EMPTY : new CEList(stack); } - /** - * Returns true if there is an explicit mapping for ch, or one that starts with ch. - */ + /** Returns true if there is an explicit mapping for ch, or one that starts with ch. */ public boolean codePointHasExplicitMappings(int ch) { return ucaData.codePointHasExplicitMappings(ch); } /** - * Returns the primary weight from a 32-bit CE. - * The primary is 16 bits, stored in b31..b16. + * Returns the primary weight from a 32-bit CE. The primary is 16 bits, stored in b31..b16. * * @deprecated use {@link CEList#getPrimary(int)} */ @@ -748,8 +725,7 @@ public static char getPrimary(int ce) { } /** - * Returns the secondary weight from a 32-bit CE. - * The secondary is 9 bits, stored in b15..b7. + * Returns the secondary weight from a 32-bit CE. The secondary is 9 bits, stored in b15..b7. * * @deprecated use {@link CEList#getSecondary(int)} */ @@ -759,8 +735,7 @@ public static char getSecondary(int ce) { } /** - * Returns the tertiary weight from a 32-bit CE. - * The tertiary is 7 bits, stored in b6..b0. + * Returns the tertiary weight from a 32-bit CE. The tertiary is 7 bits, stored in b6..b0. * * @deprecated use {@link CEList#getTertiary(int)} */ @@ -769,33 +744,24 @@ public static char getTertiary(int ce) { return CEList.getTertiary(ce); } - /** - * Utility, used to determine whether a CE is variable or not. - */ - + /** Utility, used to determine whether a CE is variable or not. */ public boolean isVariable(int ce) { return (variableLowCE <= ce && ce <= variableHighCE); } - /** - * Utility, used to determine whether a CE is variable or not. 
- */ - + /** Utility, used to determine whether a CE is variable or not. */ public int getVariableLowCE() { return variableLowCE; } - /** - * Utility, used to determine whether a CE is variable or not. - */ - + /** Utility, used to determine whether a CE is variable or not. */ public int getVariableHighCE() { return variableHighCE; } /** - * Utility, used to make a CE from the pieces. They must already - * be in the right range of values. + * Utility, used to make a CE from the pieces. They must already be in the right range of + * values. */ public static int makeKey(int primary, int secondary, int tertiary) { return (primary << 16) | (secondary << 7) | tertiary; @@ -805,14 +771,11 @@ public static int makeKey(int primary, int secondary, int tertiary) { // Utility methods // ============================================================= - static public String toString(String sortKey) { + public static String toString(String sortKey) { return toString(sortKey, Integer.MAX_VALUE); } - /** - * Produces a human-readable string for a sort key. - * The 0000 separator is replaced by a '|' - */ - static public String toString(String sortKey, int level) { + /** Produces a human-readable string for a sort key. The 0000 separator is replaced by a '|' */ + public static String toString(String sortKey, int level) { final StringBuffer result = new StringBuffer(); boolean needSep = false; result.append("["); @@ -839,19 +802,18 @@ static public String toString(String sortKey, int level) { return result.toString(); } - static final int variableBottom = UCA.getPrimary(CE_FFFE)+1; + static final int variableBottom = UCA.getPrimary(CE_FFFE) + 1; /* * Produces a human-readable string for a sort key. 
* removed after unicodetools svn r641 */ - // static public String toStringUCA(String sortKey, String original, int variableTop, StringBuilder extraComment) + // static public String toStringUCA(String sortKey, String original, int variableTop, + // StringBuilder extraComment) - public static boolean isVariablePrimary(char primary, int variableTop, - boolean lastWasVariable) { - return primary == 0 ? lastWasVariable : - primary <= variableTop - && variableBottom <= primary; + public static boolean isVariablePrimary( + char primary, int variableTop, boolean lastWasVariable) { + return primary == 0 ? lastWasVariable : primary <= variableTop && variableBottom <= primary; } public static String toStringUCA(CEList ceList, int variableTop, StringBuilder extraComment) { @@ -872,12 +834,14 @@ public static String toStringUCA(CEList ceList, int variableTop, StringBuilder e lastWasVariable = isVariable; - result - .append("[") - .append(isVariable ? "*" : ".").append(Utility.hex(p)) - .append(".").append(Utility.hex(s)) - .append(".").append(Utility.hex(t)) - .append("]"); + result.append("[") + .append(isVariable ? "*" : ".") + .append(Utility.hex(p)) + .append(".") + .append(Utility.hex(s)) + .append(".") + .append(Utility.hex(t)) + .append("]"); } return result.toString(); } @@ -886,78 +850,58 @@ public static boolean isImplicitLeadCE(int ce) { return Implicit.isImplicitLeadPrimary(getPrimary(ce)); } - /** - * NFD required - */ + /** NFD required */ private static Normalizer toD; - /** - * Records the dataversion - */ + /** Records the dataversion */ public static final String BADVERSION = "Missing @version in data!!"; + private String dataVersion = BADVERSION; - /** - * Records the dataversion - */ + /** Records the dataversion */ private String ucdVersion = "?"; - /** - * Turns backwards (e.g. for French) on globally for all secondaries - */ + /** Turns backwards (e.g. 
for French) on globally for all secondaries */ private boolean useBackwards = false; - /** - * Choice of how to handle variables (those with *) - */ + /** Choice of how to handle variables (those with *) */ private byte defaultAlternate = SHIFTED; - /** - * For testing - */ + /** For testing */ private boolean defaultDecomposition = true; /** - * Sets the maximum strength level to be included in the string. - * E.g. with 3, only weights of 1, 2, and 3 are included: level 4 weights are discarded. + * Sets the maximum strength level to be included in the string. E.g. with 3, only weights of 1, + * 2, and 3 are included: level 4 weights are discarded. */ private int strength = 4; - /** - * Position in decompositionBuffer used when constructing sort key - */ + /** Position in decompositionBuffer used when constructing sort key */ private final int[] index = new int[1]; - /** - * File buffer size, used to make reads faster. - */ - private static final int BUFFER_SIZE = 64*1024; + /** File buffer size, used to make reads faster. */ + private static final int BUFFER_SIZE = 64 * 1024; // ============================================================= // Collation Element Memory Data Table Formats // ============================================================= - /** - * Temporary buffer used in getSortKey for the decomposed string - */ + /** Temporary buffer used in getSortKey for the decomposed string */ private final StringBuffer decompositionBuffer = new StringBuffer(); // was 0xFFC20101; /** - * We take advantage of the variables being in a closed range to save a bit per CE. - * The low and high values are initially set to be at the opposite ends of the range, - * as the table is built from the UCA data, they are narrowed in. - * The first three values are used in building; the last two in testing. + * We take advantage of the variables being in a closed range to save a bit per CE. 
The low and + * high values are initially set to be at the opposite ends of the range, as the table is built + * from the UCA data, they are narrowed in. The first three values are used in building; the + * last two in testing. */ - private int variableLowCE; // used for testing against - private int variableHighCE; // used for testing against + private int variableLowCE; // used for testing against - /** - * Marks whether we are using the full data set, or an abbreviated version for - * an applet. - */ + private int variableHighCE; // used for testing against + /** Marks whether we are using the full data set, or an abbreviated version for an applet. */ private final boolean fullData; // ============================================================= @@ -966,28 +910,24 @@ public static boolean isImplicitLeadCE(int ce) { // ============================================================= /** - * Temporary buffers used in getSortKey to store weights. - * These are NOT strings of Unicode characters--they are - * lists of weights. But this is a convenient way to store them, - * since Java doesn't have unsigned shorts. + * Temporary buffers used in getSortKey to store weights. These are NOT strings of Unicode + * characters--they are lists of weights. But this is a convenient way to store them, since Java + * doesn't have unsigned shorts. */ private final StringBuilder primaries = new StringBuilder(100); + private final StringBuilder secondaries = new StringBuilder(100); private final StringBuilder tertiaries = new StringBuilder(100); private final StringBuilder quaternaries = new StringBuilder(100); - /** - * Temporary buffer used to collect progress data for debugging - */ + /** Temporary buffer used to collect progress data for debugging */ StringBuffer debugList = new StringBuffer(100); private final StringBuffer hangulBuffer = new StringBuffer(); /** - * Returns the collation elements for the character or substring - * of the decomposition buffer starting at the index. 
- * Advances the index past that. - * Returns null at the end of the input. + * Returns the collation elements for the character or substring of the decomposition buffer + * starting at the index. Advances the index past that. Returns null at the end of the input. */ private CEList nextCEs() { if (index[0] >= decompositionBuffer.length()) { @@ -1018,16 +958,19 @@ CEList getCEListForImplicit(int c) { return new CEList(p, q); } - /** - * Constants for Hangul - */ + /** Constants for Hangul */ static final int // constants - SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7, - LCount = 19, VCount = 21, TCount = 28, - NCount = VCount * TCount, // 588 - SCount = LCount * NCount, // 11172 - LastInitial = LBase + LCount-1, // last initial jamo - LastPrimary = SBase + (LCount-1) * VCount * TCount; // last corresponding primary + SBase = 0xAC00, + LBase = 0x1100, + VBase = 0x1161, + TBase = 0x11A7, + LCount = 19, + VCount = 21, + TCount = 28, + NCount = VCount * TCount, // 588 + SCount = LCount * NCount, // 11172 + LastInitial = LBase + LCount - 1, // last initial jamo + LastPrimary = SBase + (LCount - 1) * VCount * TCount; // last corresponding primary public static StringBuffer decomposeHangul(int s, StringBuffer result) { final int SIndex = s - SBase; @@ -1037,10 +980,10 @@ public static StringBuffer decomposeHangul(int s, StringBuffer result) { final int L = LBase + SIndex / NCount; final int V = VBase + (SIndex % NCount) / TCount; final int T = TBase + SIndex % TCount; - result.append((char)L); - result.append((char)V); + result.append((char) L); + result.append((char) V); if (T != TBase) { - result.append((char)T); + result.append((char) T); } return result; } @@ -1050,8 +993,7 @@ public static StringBuffer decomposeHangul(int s, StringBuffer result) { // ============================================================= /** - * Value for returning int as well as function return, - * since Java doesn't have output parameters + * Value for returning int as well 
as function return, since Java doesn't have output parameters */ private final int[] position = new int[1]; @@ -1060,7 +1002,8 @@ public UCAContents getContents(Normalizer skipDecomps) { } public class UCAContents { - private final Iterator> iter = ucaData.getSortedMappings().entrySet().iterator(); + private final Iterator> iter = + ucaData.getSortedMappings().entrySet().iterator(); private CEList ces; private final Normalizer skipDecomps; private final Normalizer nfkd; @@ -1093,9 +1036,7 @@ public void setDoEnableSamples(boolean newValue) { doSamples = newValue; } - /** - * returns a string - */ + /** returns a string */ public String next() { String result = null; // null if done ces = null; @@ -1106,7 +1047,7 @@ public String next() { result = entry.getKey(); if (UTF16Plus.isSingleCodePoint(result)) { final int c = result.codePointAt(0); - if (skipDecomps != null && !skipDecomps.isNormalized(c)) { // CHECK THIS + if (skipDecomps != null && !skipDecomps.isNormalized(c)) { // CHECK THIS result = null; continue; } @@ -1121,7 +1062,8 @@ public String next() { // Update statistics once after all mappings have been enumerated. 
if (!getStatistics().haveUnspecified) { if (DEBUG) { - System.out.println("Specified = " + getStatistics().unspecified.toPattern(true)); + System.out.println( + "Specified = " + getStatistics().unspecified.toPattern(true)); } final UnicodeSet temp = new UnicodeSet(); for (int i = 0; i <= 0x10ffff; ++i) { @@ -1147,7 +1089,8 @@ public String next() { getStatistics().unspecified = temp; usi.reset(abbreviateSet(getStatistics().unspecified, true)); if (DEBUG) { - System.out.println("Unspecified = " + getStatistics().unspecified.toPattern(true)); + System.out.println( + "Unspecified = " + getStatistics().unspecified.toPattern(true)); } getStatistics().haveUnspecified = true; } @@ -1191,13 +1134,15 @@ public String next() { ++currentRange; if (currentRange < SAMPLE_RANGES.length) { startOfRange = itemInRange = SAMPLE_RANGES[currentRange][0]; - endOfRange = SAMPLE_RANGES[currentRange].length > 1 - ? SAMPLE_RANGES[currentRange][1] + endOfRange = + SAMPLE_RANGES[currentRange].length > 1 + ? SAMPLE_RANGES[currentRange][1] : startOfRange; - //skip = ((endOfRange - startOfRange) / 3); + // skip = ((endOfRange - startOfRange) / 3); } - } else if (itemInRange > startOfRange + 5 && itemInRange < endOfRange - 5 /* - skip*/) { - //itemInRange += skip; + } else if (itemInRange > startOfRange + 5 + && itemInRange < endOfRange - 5 /* - skip*/) { + // itemInRange += skip; itemInRange = endOfRange - 5; } ces = getCEList(result, true); @@ -1207,9 +1152,7 @@ public String next() { return null; } - /** - * Returns the CEs for the string that was returned by the last call to next(). - */ + /** Returns the CEs for the string that was returned by the last call to next(). */ public CEList getCEs() { return ces; } @@ -1256,8 +1199,8 @@ public boolean isDoSamples() { private static final boolean VERBOSE = false; /** - * Adds the collation elements from a file (or other stream) in the UCA format. - * Values will override any previous mappings. 
+ * Adds the collation elements from a file (or other stream) in the UCA format. Values will + * override any previous mappings. */ private void addCollationElements(BufferedReader in) throws java.io.IOException { final IntStack tempStack = new IntStack(100); @@ -1268,9 +1211,8 @@ private void addCollationElements(BufferedReader in) throws java.io.IOException while (true) { try { inputLine = in.readLine(); - if (inputLine == null) - { - break; // means file is done + if (inputLine == null) { + break; // means file is done } // HACK @@ -1281,19 +1223,18 @@ private void addCollationElements(BufferedReader in) throws java.io.IOException } final String line = cleanLine(inputLine); // remove comments, extra whitespace - if (line.isEmpty()) - { - continue; // skip empty lines + if (line.isEmpty()) { + continue; // skip empty lines } if (DEBUG_SHOW_LINE) { System.out.println("Processing: " + inputLine); } - position[0] = 0; // start at front of line + position[0] = 0; // start at front of line if (line.startsWith("@")) { if (line.startsWith("@version")) { - dataVersion = line.substring("@version".length()+1).trim(); + dataVersion = line.substring("@version".length() + 1).trim(); continue; } else if (line.startsWith("@implicitweights ")) { // @implicitweights 17000..18AFF; FB00 # Tangut and Tangut Components @@ -1315,7 +1256,7 @@ private void addCollationElements(BufferedReader in) throws java.io.IOException } // collect characters - multiChars.setLength(0); // clear buffer + multiChars.setLength(0); // clear buffer final char value = getChar(line, position); multiChars.append(value); @@ -1336,18 +1277,20 @@ private void addCollationElements(BufferedReader in) throws java.io.IOException // } } if (!fullData && RECORDING_DATA) { - if (value == 0 || value == '\t' || value == '\n' || value == '\r' + if (value == 0 + || value == '\t' + || value == '\n' + || value == '\r' || (0x20 <= value && value <= 0x7F) || (0x80 <= value && value <= 0xFF) - || (0x300 <= value && value <= 
0x3FF) - ) { + || (0x300 <= value && value <= 0x3FF)) { System.out.println(" + \"" + inputLine + "\\n\""); } } // for recording information boolean record = true; /* if (multiChars.length() > 0) record = false; - else */ + else */ if (!toD.isNormalized(value)) { record = false; } @@ -1355,16 +1298,31 @@ record = false; // collect CEs wasImplicitLeadPrimary[0] = false; - final int ce = getCEFromLine(firstCodePoint, line, position, record, wasImplicitLeadPrimary, true); - int ce2 = getCEFromLine(firstCodePoint, line, position, record, wasImplicitLeadPrimary, false); + final int ce = + getCEFromLine( + firstCodePoint, + line, + position, + record, + wasImplicitLeadPrimary, + true); + int ce2 = + getCEFromLine( + firstCodePoint, + line, + position, + record, + wasImplicitLeadPrimary, + false); if (CHECK_UNIQUE && (ce2 == TERMINATOR || CHECK_UNIQUE_EXPANSIONS)) { if (!CHECK_UNIQUE_VARIABLES) { checkUnique(value, ce, 0, inputLine); // only need to check first value } else { final int key1 = ce >>> 16; - if (isVariable(ce)) { - checkUnique(value, 0, key1, inputLine); // only need to check first value - } + if (isVariable(ce)) { + checkUnique( + value, 0, key1, inputLine); // only need to check first value + } } } @@ -1373,7 +1331,14 @@ record = false; while (ce2 != TERMINATOR) { tempStack.push(ce2); - ce2 = getCEFromLine(firstCodePoint, line, position, record, wasImplicitLeadPrimary, false); + ce2 = + getCEFromLine( + firstCodePoint, + line, + position, + record, + wasImplicitLeadPrimary, + false); if (ce2 == TERMINATOR) { break; } @@ -1406,17 +1371,17 @@ public void overrideCE(String multiChars, int primary, int secondary, int tertia ucaData.add(multiChars, tempStack); } - /** - * - */ + /** */ private UnicodeSet extractSet(String inputLine) { - //# Variant secondaries: 0177..017B (5) - //# Digit secondaries: 017C..0198 (29) - final Matcher m = Pattern.compile(".*:\\s*([0-9A-Fa-f]+)\\.\\.([0-9A-Fa-f]+).*").matcher(""); + // # Variant secondaries: 0177..017B (5) + // # 
Digit secondaries: 017C..0198 (29) + final Matcher m = + Pattern.compile(".*:\\s*([0-9A-Fa-f]+)\\.\\.([0-9A-Fa-f]+).*").matcher(""); if (!m.reset(inputLine).matches()) { - throw new IllegalArgumentException("Failed to recognized special Ken lines: " + inputLine); + throw new IllegalArgumentException( + "Failed to recognized special Ken lines: " + inputLine); } - return new UnicodeSet(Integer.parseInt(m.group(1),16), Integer.parseInt(m.group(2),16)); + return new UnicodeSet(Integer.parseInt(m.group(1), 16), Integer.parseInt(m.group(2), 16)); } /* @@ -1429,9 +1394,7 @@ public Set getContractions() { return ucaData.getContractions(); } - /** - * Checks the internal tables corresponding to the UCA data. - */ + /** Checks the internal tables corresponding to the UCA data. */ private void cleanup() { ucaData.checkConsistency(); @@ -1482,16 +1445,22 @@ private void cleanup() { } */ - //fixlater; - variableLowCE = makeKey(1,0,0); - variableHighCE = makeKey(ucaData.variableHigh, CEList.SECONDARY_MAX, CEList.TERTIARY_MAX); // turn on bottom bits + // fixlater; + variableLowCE = makeKey(1, 0, 0); + variableHighCE = + makeKey( + ucaData.variableHigh, + CEList.SECONDARY_MAX, + CEList.TERTIARY_MAX); // turn on bottom bits - //int hangulHackBottom; - //int hangulHackTop; + // int hangulHackBottom; + // int hangulHackTop; - //hangulHackBottom = collationElements[0x1100] & 0xFFFF0000; // remove secondaries & tertiaries - //hangulHackTop = collationElements[0x11F9] | 0xFFFF; // bump up secondaries and tertiaries - //if (SHOW_STATS) System.out.println("\tHangul Hack: " + Utility.hex(hangulHackBottom) + ", " + Utility.hex(hangulHackTop)); + // hangulHackBottom = collationElements[0x1100] & 0xFFFF0000; // remove secondaries & + // tertiaries + // hangulHackTop = collationElements[0x11F9] | 0xFFFF; // bump up secondaries and tertiaries + // if (SHOW_STATS) System.out.println("\tHangul Hack: " + Utility.hex(hangulHackBottom) + ", + // " + Utility.hex(hangulHackTop)); // show some 
statistics if (SHOW_STATS) { @@ -1508,17 +1477,33 @@ private void cleanup() { } if (SHOW_STATS) { - System.out.println("\tMIN1/MAX1: " + Utility.hex(getStatistics().MIN1) + "/" + Utility.hex(getStatistics().MAX1)); + System.out.println( + "\tMIN1/MAX1: " + + Utility.hex(getStatistics().MIN1) + + "/" + + Utility.hex(getStatistics().MAX1)); } if (SHOW_STATS) { - System.out.println("\tMIN2/MAX2: " + Utility.hex(getStatistics().MIN2) + "/" + Utility.hex(getStatistics().MAX2)); + System.out.println( + "\tMIN2/MAX2: " + + Utility.hex(getStatistics().MIN2) + + "/" + + Utility.hex(getStatistics().MAX2)); } if (SHOW_STATS) { - System.out.println("\tMIN3/MAX3: " + Utility.hex(getStatistics().MIN3) + "/" + Utility.hex(getStatistics().MAX3)); + System.out.println( + "\tMIN3/MAX3: " + + Utility.hex(getStatistics().MIN3) + + "/" + + Utility.hex(getStatistics().MAX3)); } if (SHOW_STATS) { - System.out.println("\tVar Min/Max: " + Utility.hex(ucaData.variableLow) + "/" + Utility.hex(ucaData.variableHigh)); + System.out.println( + "\tVar Min/Max: " + + Utility.hex(ucaData.variableLow) + + "/" + + Utility.hex(ucaData.variableHigh)); } if (SHOW_STATS) { System.out.println("\tNon-Var Min: " + Utility.hex(ucaData.nonVariableLow)); @@ -1529,26 +1514,25 @@ private void cleanup() { } } - /** - * Remove comments, extra whitespace - */ + /** Remove comments, extra whitespace */ private String cleanLine(String line) { int commentPosition = line.indexOf('#'); if (commentPosition >= 0) { - line = line.substring(0,commentPosition); + line = line.substring(0, commentPosition); } commentPosition = line.indexOf('%'); if (commentPosition >= 0) { - line = line.substring(0,commentPosition); + line = line.substring(0, commentPosition); } return line.trim(); } /** * Get a char from a line, of form: ( | )* * - *@param position on input, the place to start at. - * On output, updated to point to the next place to search. 
- *@return the character, or NOT_A_CHAR when done + * + * @param position on input, the place to start at. On output, updated to point to the next + * place to search. + * @return the character, or NOT_A_CHAR when done */ // NOTE in case of surrogates, we buffer up the second character!! @@ -1580,11 +1564,11 @@ private char getChar(String line, int[] position) { } if (hexLimit >= start + 4) { position[0] = hexLimit; - final int cp = Integer.parseInt(line.substring(start,hexLimit),16); + final int cp = Integer.parseInt(line.substring(start, hexLimit), 16); if (cp <= 0xFFFF) { - return (char)cp; + return (char) cp; } - //DEBUGCHAR = true; + // DEBUGCHAR = true; charBuffer = UTF16.getTrailSurrogate(cp); return UTF16.getLeadSurrogate(cp); } @@ -1594,7 +1578,6 @@ private char getChar(String line, int[] position) { boolean DEBUGCHAR = false; - private CharSequence getRepresentativePrimary(int primary) { StringBuilder result = getStatistics().representativePrimary.get(primary); if (result == null) { @@ -1604,16 +1587,27 @@ private CharSequence getRepresentativePrimary(int primary) { } public int writeUsedWeights(PrintWriter p, int strength, MessageFormat mf) { - final RoBitSet weights = (strength == 1 ? getStatistics().getPrimarySet() - : strength == 2 ? getStatistics().getSecondarySet() - : getStatistics().getTertiarySet()); // strength == 1 ? getStatistics().primarySet : strength == 2 ? getStatistics().secondarySet : getStatistics().tertiarySet; + final RoBitSet weights = + (strength == 1 + ? getStatistics().getPrimarySet() + : strength == 2 + ? getStatistics().getSecondarySet() + : getStatistics().getTertiarySet()); // strength == 1 ? + // getStatistics().primarySet : strength + // == 2 ? 
getStatistics().secondarySet : + // getStatistics().tertiarySet; int first = -1; int count = 0; for (int i = 0; i <= weights.length(); ++i) { if (strength > 1) { if (weights.get(i)) { count++; - p.println(mf.format(new Object[] {Utility.hex((char)i), new Integer(getStatistics().stCounts[strength][i])})); + p.println( + mf.format( + new Object[] { + Utility.hex((char) i), + new Integer(getStatistics().stCounts[strength][i]) + })); } continue; } @@ -1622,11 +1616,18 @@ public int writeUsedWeights(PrintWriter p, int strength, MessageFormat mf) { first = i; } } else if (first != -1) { - final int last = i-1; + final int last = i - 1; final int diff = last - first + 1; count += diff; - final String lastStr = last == first ? "" : Utility.hex((char)last); - p.println(mf.format(new Object[] {Utility.hex((char)first),lastStr,new Integer(diff), new Integer(count)})); + final String lastStr = last == first ? "" : Utility.hex((char) last); + p.println( + mf.format( + new Object[] { + Utility.hex((char) first), + lastStr, + new Integer(diff), + new Integer(count) + })); first = -1; } } @@ -1635,24 +1636,31 @@ public int writeUsedWeights(PrintWriter p, int strength, MessageFormat mf) { /** * Gets a CE from a UCA format line - *@param value the first character for the line. Just used for statistics. - *@param line a string of form "[.0000.0000.0000.0000]..." - *@param position on input, the place to start at. - * On output, updated to point to the next place to search. + * + * @param value the first character for the line. Just used for statistics. + * @param line a string of form "[.0000.0000.0000.0000]..." + * @param position on input, the place to start at. On output, updated to point to the next + * place to search. 
*/ - boolean haveVariableWarning = false; + boolean haveZeroVariableWarning = false; - private int getCEFromLine(int value, String line, int[] position, boolean record, boolean[] lastWasImplicitLead, boolean first) { + private int getCEFromLine( + int value, + String line, + int[] position, + boolean record, + boolean[] lastWasImplicitLead, + boolean first) { final int start = line.indexOf('[', position[0]); if (start == -1) { return TERMINATOR; } - boolean variable = line.charAt(start+1) == '*'; - int key1 = Integer.parseInt(line.substring(start+2,start+6),16); - final int key2 = Integer.parseInt(line.substring(start+7,start+11),16); - final int key3 = Integer.parseInt(line.substring(start+12,start+16),16); + boolean variable = line.charAt(start + 1) == '*'; + int key1 = Integer.parseInt(line.substring(start + 2, start + 6), 16); + final int key2 = Integer.parseInt(line.substring(start + 7, start + 11), 16); + final int key3 = Integer.parseInt(line.substring(start + 12, start + 16), 16); if (key1 == 0 && variable) { if (!haveZeroVariableWarning) { System.out.println("\tBAD DATA: Zero L1s cannot be variable!!: " + line); @@ -1661,17 +1669,23 @@ private int getCEFromLine(int value, String line, int[] position, boolean record variable = false; // FIX DATA FILE } if (key2 > CEList.SECONDARY_MAX) { - throw new IllegalArgumentException("Weight2 doesn't fit: " + Utility.hex(key2) + "," + line); + throw new IllegalArgumentException( + "Weight2 doesn't fit: " + Utility.hex(key2) + "," + line); } if (key3 > CEList.TERTIARY_MAX) { - throw new IllegalArgumentException("Weight3 doesn't fit: " + Utility.hex(key3) + "," + line); + throw new IllegalArgumentException( + "Weight3 doesn't fit: " + Utility.hex(key3) + "," + line); } // adjust variable bounds, if needed if (variable) { if (key1 > ucaData.nonVariableLow) { if (!haveVariableWarning) { - System.out.println("\tBAD DATA: Variable overlap, nonvariable low: " - + Utility.hex(ucaData.nonVariableLow) + ", line: \"" + line + 
"\""); + System.out.println( + "\tBAD DATA: Variable overlap, nonvariable low: " + + Utility.hex(ucaData.nonVariableLow) + + ", line: \"" + + line + + "\""); haveVariableWarning = true; } } else { @@ -1685,8 +1699,12 @@ private int getCEFromLine(int value, String line, int[] position, boolean record } else if (key1 != 0) { // not variable, not zero if (key1 < ucaData.variableHigh) { if (!haveVariableWarning) { - System.out.println("\tBAD DATA: Variable overlap, variable high: " - + Utility.hex(ucaData.variableHigh) + ", line: \"" + line + "\""); + System.out.println( + "\tBAD DATA: Variable overlap, variable high: " + + Utility.hex(ucaData.variableHigh) + + ", line: \"" + + line + + "\""); haveVariableWarning = true; } } else { @@ -1712,25 +1730,24 @@ private int getCEFromLine(int value, String line, int[] position, boolean record return makeKey(key1, key2, key3); } - - /** - * Used for checking data file integrity - */ + /** Used for checking data file integrity */ private final Map uniqueTable = new HashMap(); - /** - * Used for checking data file integrity - */ + /** Used for checking data file integrity */ private void checkUnique(char value, int result, int fourth, String line) { - if (!toD.isNormalized(value)) - { + if (!toD.isNormalized(value)) { return; // don't check decomposables. } - final Long ceObj = new Long(((long)result << 16) | fourth); + final Long ceObj = new Long(((long) result << 16) | fourth); final Character probe = uniqueTable.get(ceObj); if (probe != null) { - System.out.println("\tCE(" + Utility.hex(value) - + ")=CE(" + Utility.hex(probe.charValue()) + "); " + line); + System.out.println( + "\tCE(" + + Utility.hex(value) + + ")=CE(" + + Utility.hex(probe.charValue()) + + "); " + + line); } else { uniqueTable.put(ceObj, new Character(value)); } @@ -1748,13 +1765,14 @@ public static final String getOutputDir() { return Settings.Output.GEN_UCA_DIR + Default.ucdVersion() + "/"; } - /** * @return Returns the homelessSecondaries. 
*/ public UnicodeSet getHomelessSecondaries() { if (getStatistics().homelessSecondaries == null) { - getStatistics().homelessSecondaries = new UnicodeSet(getStatistics().variantSecondaries).addAll(getStatistics().digitSecondaries); + getStatistics().homelessSecondaries = + new UnicodeSet(getStatistics().variantSecondaries) + .addAll(getStatistics().digitSecondaries); } return getStatistics().homelessSecondaries; } @@ -1765,7 +1783,12 @@ public static UCA buildCollator(Remap primaryRemap) { final Path dataPath = Settings.UnicodeTools.getDataPathForLatestVersion("uca"); final String file = Utility.searchDirectory(dataPath.toFile(), "allkeys", true, ".txt"); final UCA collator = new UCA(file, Default.ucdVersion(), primaryRemap); - if (VERBOSE) System.out.println("Built version " + collator.getDataVersion() + "/ucd: " + collator.getUCDVersion()); + if (VERBOSE) + System.out.println( + "Built version " + + collator.getDataVersion() + + "/ucd: " + + collator.getUCDVersion()); if (VERBOSE) System.out.println("Building UCD data"); return collator; } catch (final IOException e) { @@ -1779,19 +1802,21 @@ UCA_Statistics getStatistics() { public static final class Remap { private int counter = 0x100; - private final Map primaryRemap = new TreeMap(); - private final Map characterRemap = new TreeMap(); + private final Map primaryRemap = new TreeMap(); + private final Map characterRemap = new TreeMap(); private int variableHigh; private int firstDucetNonVariable; public Integer getRemappedPrimary(int ducetPrimary) { return primaryRemap.get(ducetPrimary); } + public IntStack getRemappedCharacter(int source) { return characterRemap.get(source); } /** * The UnicodeSet is not characters but primaries.... 
+ * * @param items * @return */ @@ -1801,30 +1826,34 @@ public Remap addItems(UnicodeSet items) { } return this; } + public Remap putRemappedCharacters(int codePoint) { final IntStack stack = new IntStack(1); stack.append(makeKey(counter++, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY)); characterRemap.put(codePoint, stack); - //*130D.0020.0002 + // *130D.0020.0002 return this; } + public Map getCharacterRemap() { return Collections.unmodifiableMap(characterRemap); } + public void setFirstDucetNonVariable(int firstDucetNonVariable) { this.firstDucetNonVariable = primaryRemap.get(firstDucetNonVariable); } + public int getFirstDucetNonVariable() { return firstDucetNonVariable; } + public int getVariableHigh() { return variableHigh; } + public Remap setVariableHigh() { - variableHigh = counter-1; + variableHigh = counter - 1; return this; } } - } - diff --git a/unicodetools/src/main/java/org/unicode/text/UCA/UCA_Data.java b/unicodetools/src/main/java/org/unicode/text/UCA/UCA_Data.java index 2e827f6c3..7289fa981 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCA/UCA_Data.java +++ b/unicodetools/src/main/java/org/unicode/text/UCA/UCA_Data.java @@ -1,16 +1,16 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCA/UCA_Data.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCA/UCA_Data.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCA; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; @@ -20,16 +20,12 @@ import java.util.SortedMap; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.text.UCA.UCA.Remap; import org.unicode.text.UCD.Normalizer; import org.unicode.text.UCD.UCD; import org.unicode.text.utility.IntStack; import org.unicode.text.utility.UTF16Plus; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - public class UCA_Data { static final boolean DEBUG = false; static final boolean DEBUG_SHOW_ADD = false; @@ -62,35 +58,33 @@ public UCA_Data(Normalizer toD, UCD ucd, Remap primaryRemap) { private final Map cesMap = new HashMap(); /** - * Maps single-character strings to one of the longest mappings - * (single-character or contractions) starting with those characters. - * (null for no mappings) + * Maps single-character strings to one of the longest mappings (single-character or + * contractions) starting with those characters. (null for no mappings) */ private final Map longestMap = new HashMap(); /** - * Set of single-character strings that start contractions - * which end with a non-zero combining class. + * Set of single-character strings that start contractions which end with a non-zero combining + * class. */ private final Set hasDiscontiguousContractions = new HashSet(); /** - * Set of all contraction strings. - * Same as all multi-character strings in cesMap, except in code point order. - * Used only for API that accesses this set. + * Set of all contraction strings. Same as all multi-character strings in cesMap, except in code + * point order. Used only for API that accesses this set. 
*/ - private final Set contractions = new TreeSet( - new UTF16.StringComparator(true, false, UTF16.StringComparator.FOLD_CASE_DEFAULT)); + private final Set contractions = + new TreeSet( + new UTF16.StringComparator( + true, false, UTF16.StringComparator.FOLD_CASE_DEFAULT)); { checkConsistency(); } - /** - * Returns true if there is an explicit mapping for ch, or one that starts with ch. - */ + /** Returns true if there is an explicit mapping for ch, or one that starts with ch. */ public boolean codePointHasExplicitMappings(int ch) { - final String s = Character.toString((char)ch); + final String s = Character.toString((char) ch); final String longest = longestMap.get(s); return longest != null; } @@ -152,7 +146,8 @@ public void add(CharSequence source, IntStack ces) { } else { StringBuilder reps = statistics.representativePrimarySeconds.get(key1); if (reps == null) { - statistics.representativePrimarySeconds.put(key1, reps = new StringBuilder()); + statistics.representativePrimarySeconds.put( + key1, reps = new StringBuilder()); } reps.appendCodePoint(firstCodePoint); } @@ -187,28 +182,28 @@ public void add(CharSequence source, IntStack ces) { } // gather some statistics if (key1 != 0 && key1 < statistics.MIN1) { - statistics.MIN1 = (char)key1; + statistics.MIN1 = (char) key1; } if (key2 != 0 && key2 < statistics.MIN2) { - statistics.MIN2 = (char)key2; + statistics.MIN2 = (char) key2; } if (key3 != 0 && key3 < statistics.MIN3) { - statistics.MIN3 = (char)key3; + statistics.MIN3 = (char) key3; } if (key1 > statistics.MAX1) { - statistics.MAX1 = (char)key1; + statistics.MAX1 = (char) key1; } if (key2 > statistics.MAX2) { - statistics.MAX2 = (char)key2; + statistics.MAX2 = (char) key2; } if (key3 > statistics.MAX3) { - statistics.MAX3 = (char)key3; + statistics.MAX3 = (char) key3; } } - if (DEBUG_SHOW_ADD) { - System.out.println("Adding: " + ucd.getCodeAndName(sourceString) + CEList.toString(ces)); + System.out.println( + "Adding: " + ucd.getCodeAndName(sourceString) 
+ CEList.toString(ces)); } cesMap.put(sourceString, new CEList(ces)); @@ -229,16 +224,16 @@ public void add(CharSequence source, IntStack ces) { hasDiscontiguousContractions.add(firstChar); } } - //if (DEBUG) checkConsistency(); + // if (DEBUG) checkConsistency(); } /** - * Returns the CEs for the longest matching buffer substring starting at i - * and moves the index to just after that substring. - * Discontiguously matched combining marks are removed from the buffer. + * Returns the CEs for the longest matching buffer substring starting at i and moves the index + * to just after that substring. Discontiguously matched combining marks are removed from the + * buffer. * - *

If there is no mapping for any character or substring at i, - * then the index is unchanged and null is returned. + *

If there is no mapping for any character or substring at i, then the index is unchanged + * and null is returned. */ public CEList fetchCEs(StringBuffer buffer, int[] index) { final int i = index[0]; @@ -265,7 +260,7 @@ public CEList fetchCEs(StringBuffer buffer, int[] index) { } j = next; } - j = i + s.length(); // Limit of the longest contiguous match. + j = i + s.length(); // Limit of the longest contiguous match. // Discontiguous-contraction matching. if (maybeDiscontiguous && s.length() < longest.length() && j < buffer.length()) { @@ -280,17 +275,18 @@ public CEList fetchCEs(StringBuffer buffer, int[] index) { while (k < buffer.length()) { nextCodePoint = buffer.codePointAt(k); cc = toD.getCanonicalClass(nextCodePoint); - if (cc == 0) { // stop with any zero (non-accent) + if (cc == 0) { // stop with any zero (non-accent) break; } final int next = k + Character.charCount(nextCodePoint); - if (cc == prevCC) { // blocked if same class as last + if (cc == prevCC) { // blocked if same class as last k = next; continue; } - prevCC = cc; // remember for next time + prevCC = cc; // remember for next time // nextString = s + nextCodePoint - final String nextString = new StringBuilder(s).appendCodePoint(nextCodePoint).toString(); + final String nextString = + new StringBuilder(s).appendCodePoint(nextCodePoint).toString(); final CEList nextCEs = cesMap.get(nextString); if (nextCEs == null) { k = next; @@ -313,12 +309,16 @@ public CEList fetchCEs(StringBuffer buffer, int[] index) { return ces; } - private static final UnicodeSet ILLEGAL_CODE_POINTS = new UnicodeSet("[:cs:]").freeze(); // doesn't depend on version + private static final UnicodeSet ILLEGAL_CODE_POINTS = + new UnicodeSet("[:cs:]").freeze(); // doesn't depend on version private void checkForIllegal(String string) { - if(ILLEGAL_CODE_POINTS.containsSome(string)) { - throw new IllegalArgumentException("String contains illegal characters: <" - + string + "> " + new 
UnicodeSet().addAll(string).retainAll(ILLEGAL_CODE_POINTS)); + if (ILLEGAL_CODE_POINTS.containsSome(string)) { + throw new IllegalArgumentException( + "String contains illegal characters: <" + + string + + "> " + + new UnicodeSet().addAll(string).retainAll(ILLEGAL_CODE_POINTS)); } } @@ -329,6 +329,7 @@ void checkConsistency() { private class MappingComparator implements Comparator { private final UTF16.StringComparator cmp = new UTF16.StringComparator(true, false, UTF16.StringComparator.FOLD_CASE_DEFAULT); + @Override public int compare(String left, String right) { // Note: add() enforces that mapping strings are not empty strings. @@ -343,11 +344,10 @@ public int compare(String left, String right) { } } - /** - * Returns an immutable Map of sorted strings (characters & contractions) to CEs. - */ + /** Returns an immutable Map of sorted strings (characters & contractions) to CEs. */ SortedMap getSortedMappings() { - final SortedMap sorted = new TreeMap(new MappingComparator()); + final SortedMap sorted = + new TreeMap(new MappingComparator()); sorted.putAll(cesMap); return Collections.unmodifiableSortedMap(sorted); } diff --git a/unicodetools/src/main/java/org/unicode/text/UCA/UCA_Statistics.java b/unicodetools/src/main/java/org/unicode/text/UCA/UCA_Statistics.java index 59e9e2dcf..d27057841 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCA/UCA_Statistics.java +++ b/unicodetools/src/main/java/org/unicode/text/UCA/UCA_Statistics.java @@ -1,14 +1,11 @@ -/** - * - */ +/** */ package org.unicode.text.UCA; +import com.ibm.icu.text.UnicodeSet; import java.util.BitSet; import java.util.HashMap; import java.util.Map; -import com.ibm.icu.text.UnicodeSet; - public class UCA_Statistics { int[] secondaryCount = new int[0x200]; @@ -22,25 +19,24 @@ public class UCA_Statistics { private final RoBitSet secondarySetRo = new RoBitSet(secondarySet); private final RoBitSet tertiarySetRo = new RoBitSet(tertiarySet); - Map representativePrimary = new HashMap(); - Map 
representativePrimarySeconds = new HashMap(); + Map representativePrimary = new HashMap(); + Map representativePrimarySeconds = + new HashMap(); - /** - * For recording statistics - */ + /** For recording statistics */ int count1 = 0, count2 = 0, count3 = 0, max2 = 0, max3 = 0; + int oldKey1 = -1, oldKey2 = -1, oldKey3 = -1; UnicodeSet found = new UnicodeSet(); boolean haveUnspecified = false; UnicodeSet unspecified = new UnicodeSet(); - UnicodeSet variantSecondaries = new UnicodeSet(0x0153,0x0154); // TODO, fix - UnicodeSet digitSecondaries = new UnicodeSet(0x155,0x017F); // TODO, fix + UnicodeSet variantSecondaries = new UnicodeSet(0x0153, 0x0154); // TODO, fix + UnicodeSet digitSecondaries = new UnicodeSet(0x155, 0x017F); // TODO, fix UnicodeSet homelessSecondaries; - /** - * Just for statistics - */ + /** Just for statistics */ int lastUniqueVariable = 0; + int renumberedVariable = 50; char MIN1 = '\uFFFF'; // start large; will be reset as table is built char MIN2 = '\uFFFF'; // start large; will be reset as table is built @@ -55,30 +51,37 @@ public class UCA_Statistics { public RoBitSet getPrimarySet() { return primarySetRo; } + public RoBitSet getSecondarySet() { return secondarySetRo; } + public RoBitSet getTertiarySet() { return tertiarySetRo; } // HACK for CJK - //secondarySet.set(0x0040); + // secondarySet.set(0x0040); public static class RoBitSet { private final BitSet guts; + public RoBitSet(BitSet bitSetToProtect) { guts = bitSetToProtect; } + public int length() { return guts.length(); } + public boolean get(int i) { return guts.get(i); } + public int nextSetBit(int i) { return guts.nextSetBit(i); } + public int size() { return guts.size(); } @@ -87,10 +90,12 @@ public int size() { public void setPrimary(int key1) { primarySet.set(key1); } + public void setSecondary(int key2) { secondarySet.set(key2); } + public void setTertiary(int key3) { tertiarySet.set(key3); } -} \ No newline at end of file +} diff --git 
a/unicodetools/src/main/java/org/unicode/text/UCA/UCA_Types.java b/unicodetools/src/main/java/org/unicode/text/UCA/UCA_Types.java index 5204819ad..312e09f2e 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCA/UCA_Types.java +++ b/unicodetools/src/main/java/org/unicode/text/UCA/UCA_Types.java @@ -1,14 +1,12 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCA/UCA_Types.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCA/UCA_Types.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCA; import java.util.BitSet; @@ -24,22 +22,21 @@ public interface UCA_Types { static final int MAX_TERTIARY = 0x1F; /** - * Uppercase and normal-Kana UCA tertiary weights. - * Bits/weights 08-0C, 0E, 11, 12, 1D. - * See http://www.unicode.org/reports/tr10/#Tertiary_Weight_Table + * Uppercase and normal-Kana UCA tertiary weights. Bits/weights 08-0C, 0E, 11, 12, 1D. See + * http://www.unicode.org/reports/tr10/#Tertiary_Weight_Table */ - static final BitSet uppercaseTertiaries = BitSet.valueOf(new long[] { 0x20065F00 }); + static final BitSet uppercaseTertiaries = BitSet.valueOf(new long[] {0x20065F00}); /** Enum for alternate handling */ - public static final byte SHIFTED = 0, ZEROED = 1, NON_IGNORABLE = 2, SHIFTED_TRIMMED = 3, LAST = 3; + public static final byte SHIFTED = 0, + ZEROED = 1, + NON_IGNORABLE = 2, + SHIFTED_TRIMMED = 3, + LAST = 3; - /** - * Used to terminate a list of CEs - */ - public static final int TERMINATOR = 0xFFFFFFFF; // CE that marks end of string + /** Used to terminate a list of CEs */ + public static final int TERMINATOR = 0xFFFFFFFF; // CE that marks end of string - /** - * Special char value that means failed or terminated - */ + /** Special char value that means failed or terminated */ static final char NOT_A_CHAR = '\uFFFF'; } diff --git a/unicodetools/src/main/java/org/unicode/text/UCA/Validity.java b/unicodetools/src/main/java/org/unicode/text/UCA/Validity.java index cbe2c7948..67c9a9f25 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCA/Validity.java +++ b/unicodetools/src/main/java/org/unicode/text/UCA/Validity.java @@ -1,5 +1,11 @@ package org.unicode.text.UCA; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.CanonicalIterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import 
java.io.IOException; import java.io.PrintWriter; import java.util.Collection; @@ -13,7 +19,6 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.util.TransliteratorUtilities; import org.unicode.cldr.util.With; import org.unicode.props.BagFormatter; @@ -28,35 +33,32 @@ import org.unicode.text.utility.Pair; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.CanonicalIterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - final class Validity { private static final boolean DEBUG = false; private static final boolean GENERATED_NFC_MISMATCHES = true; - private static UnicodeSet compatibilityExceptions = new UnicodeSet("[\u0CCB\u0DDD\u017F\u1E9B\uFB05]"); - private static TreeMap sortedD = new TreeMap(); - private static TreeMap sortedN = new TreeMap(); - private static HashMap backD = new HashMap(); - private static HashMap backN = new HashMap(); - private static TreeMap duplicates = new TreeMap(); - private static PrintWriter log; - private static UCA uca; + private static UnicodeSet compatibilityExceptions = + new UnicodeSet("[\u0CCB\u0DDD\u017F\u1E9B\uFB05]"); + private static TreeMap sortedD = new TreeMap(); + private static TreeMap sortedN = new TreeMap(); + private static HashMap backD = new HashMap(); + private static HashMap backN = new HashMap(); + private static TreeMap duplicates = new TreeMap(); + private static PrintWriter log; + private static UCA uca; private static String ERROR_STRING = "ERROR"; private static String KNOWN_ISSUE_STRING = "Known Issue"; // Called by UCA.Main. 
static void writeCollationValidityLog() throws IOException { - log = Utility.openPrintWriter(UCA.getOutputDir(), "CheckCollationValidity.html", Utility.UTF8_WINDOWS); + log = + Utility.openPrintWriter( + UCA.getOutputDir(), "CheckCollationValidity.html", Utility.UTF8_WINDOWS); uca = WriteCollationData.getCollator(CollatorType.ducet); - log.println("\n" + - ""); + log.println( + "\n" + + ""); log.println("UCA Validity Log"); log.println(""); - indexFile.println("

UCA Default Collation Table

"); - indexFile.println("

Help"); - */ + indexFile.println(""); + indexFile.println("UCA Default Collation Table"); + indexFile.println(""); + indexFile.println(""); + indexFile.println("

UCA Default Collation Table

"); + indexFile.println("

Help"); + */ int columnCount = 0; @@ -568,11 +613,11 @@ static public void scriptChart() throws IOException { final Pair p = (Pair) it.next(); final int script = ((Integer) p.first).intValue(); - final int cp = ((Integer)((Pair)p.second).second).intValue(); + final int cp = ((Integer) ((Pair) p.second).second).intValue(); if (script != oldScript - // && (script != COMMON_SCRIPT && script != INHERITED_SCRIPT) - ) { + // && (script != COMMON_SCRIPT && script != INHERITED_SCRIPT) + ) { closeFile(output); output = null; oldScript = script; @@ -596,7 +641,7 @@ static public void scriptChart() throws IOException { closeIndexFile(indexFile, "", SCRIPT, true); } - static public void addMapChar(Map m, Set stoplist, String key, String ch) { + public static void addMapChar(Map m, Set stoplist, String key, String ch) { if (stoplist.contains(key)) { return; } @@ -606,7 +651,7 @@ static public void addMapChar(Map m, Set stoplist, String key, String ch) { return; } } - Set result = (Set)m.get(key); + Set result = (Set) m.get(key); if (result == null) { result = new TreeSet(); m.put(key, result); @@ -614,13 +659,15 @@ static public void addMapChar(Map m, Set stoplist, String key, String ch) { result.add(ch); } - static public void indexChart() throws IOException { + public static void indexChart() throws IOException { HACK_KANA = false; final Map map = new TreeMap(); final Set stoplist = new TreeSet(); - final String[] stops = {"LETTER", "CHARACTER", "AND", "CAPITAL", "SMALL", "COMPATIBILITY", "WITH"}; + final String[] stops = { + "LETTER", "CHARACTER", "AND", "CAPITAL", "SMALL", "COMPATIBILITY", "WITH" + }; stoplist.addAll(Arrays.asList(stops)); System.out.println("Stop-list: " + stoplist); @@ -682,15 +729,23 @@ static public void indexChart() throws IOException { final int oldScript = -127; final int counter = 0; - final String[] replacement = new String[] {"%%%", "Name Index Charts", "$initialPage$", "chart_X.html"}; + final String[] replacement = + new String[] {"%%%", "Name 
Index Charts", "$initialPage$", "chart_X.html"}; final String folder = GEN_CHARTS_DIR + "name/"; - Utility.copyTextFile(Settings.SRC_UCA_DIR + "index.html", Utility.UTF8, folder + "index.html", replacement); - Utility.copyTextFile(Settings.SRC_UCA_DIR + "charts.css", Utility.LATIN1, folder + "charts.css"); - Utility.copyTextFile(Settings.SRC_UCA_DIR + "name_help.html", Utility.UTF8, folder + "help.html"); + Utility.copyTextFile( + Settings.SRC_UCA_DIR + "index.html", + Utility.UTF8, + folder + "index.html", + replacement); + Utility.copyTextFile( + Settings.SRC_UCA_DIR + "charts.css", Utility.LATIN1, folder + "charts.css"); + Utility.copyTextFile( + Settings.SRC_UCA_DIR + "name_help.html", Utility.UTF8, folder + "help.html"); indexFile = Utility.openPrintWriter(folder, "index_list.html", Utility.UTF8_WINDOWS); - Utility.appendFile(Settings.SRC_UCA_DIR + "index_header.html", Utility.UTF8, indexFile, replacement); + Utility.appendFile( + Settings.SRC_UCA_DIR + "index_header.html", Utility.UTF8, indexFile, replacement); int columnCount = 0; char lastInitial = 0; @@ -731,7 +786,6 @@ static public void indexChart() throws IOException { } output.println(""); - } closeFile(output); @@ -750,22 +804,30 @@ public static String showCell(String comp, String classType) { + "> "; } return "" + addCircle.transliterate(comp) - + "
" + Utility.hex(comp) + ""; + + (classType.isEmpty() ? " " : " class='" + classType + "'") + + " title='" + + Utility.hex(comp) + + " " + + Default.ucd().getName(comp) + + "'>" + + addCircle.transliterate(comp) + + "
" + + Utility.hex(comp) + + ""; } - static void showCell(PrintWriter output, String s, String classType, String extra, boolean skipName) { -// if (s.equals("\u0300")) { -// System.out.println(); -// } + static void showCell( + PrintWriter output, String s, String classType, String extra, boolean skipName) { + // if (s.equals("\u0300")) { + // System.out.println(); + // } if (isNew(s)) { classType = "new"; indexHasNew = true; } final String name = Default.ucd().getName(s); String comp = Default.nfc().normalize(s); - final int cat = Default.ucd().getCategory(UTF16.charAt(comp,0)); + final int cat = Default.ucd().getCategory(UTF16.charAt(comp, 0)); if (cat == Mn || cat == Mc || cat == Me) { comp = '\u25CC' + comp; if (s.equals("\u0300")) { @@ -773,33 +835,30 @@ static void showCell(PrintWriter output, String s, String classType, String extr } } - final String outline = "" - + Utility.quoteXML(comp, true) - + "
" - + Utility.hex(s) - //+ "
" + script - + "
"; + final String outline = + "" + + Utility.quoteXML(comp, true) + + "
" + + Utility.hex(s) + // + "
" + script + + "
"; output.println(outline); } - private static String showCell2( - String sortKey, - String s, - short script, - String classname) { + private static String showCell2(String sortKey, String s, short script, String classname) { final String name = Default.ucd().getName(s); - // if (s.equals("\u1eaf")) { // System.out.println("debug"); // } String comp = Default.nfc().normalize(s); - final int cat = Default.ucd().getCategory(UTF16.charAt(comp,0)); + final int cat = Default.ucd().getCategory(UTF16.charAt(comp, 0)); if (cat == Mn || cat == Mc || cat == Me) { comp = '\u25CC' + comp; if (s.equals("\u0300")) { @@ -813,18 +872,22 @@ private static String showCell2( // TODO: merge with showCell - final String outline = "" - + Utility.quoteXML(comp, true) - + "
" - + Utility.hex(s) - //+ "
" + script - + "
" - + (script == UNSUPPORTED - ? "" + Utility.quoteXML(name, true) + "" - : "") - ; + final String outline = + "" + + Utility.quoteXML(comp, true) + + "
" + + Utility.hex(s) + // + "
" + script + + "
" + + (script == UNSUPPORTED + ? "" + Utility.quoteXML(name, true) + "" + : ""); return outline; } @@ -841,8 +904,10 @@ static short getBestScript(String s) { return COMMON_SCRIPT; } - //static final IndexUnicodeProperties INDEX_UNICODE_PROPS = IndexUnicodeProperties.make(Default.ucd().getVersion()); - //static final UnicodeMap SCRIPT_EXTENSIONS = INDEX_UNICODE_PROPS.load(UcdProperty.Script_Extensions); + // static final IndexUnicodeProperties INDEX_UNICODE_PROPS = + // IndexUnicodeProperties.make(Default.ucd().getVersion()); + // static final UnicodeMap SCRIPT_EXTENSIONS = + // INDEX_UNICODE_PROPS.load(UcdProperty.Script_Extensions); static BitSet getBestScript(int original, String transformed, BitSet toReturn) { toReturn.clear(); @@ -860,7 +925,8 @@ static BitSet getBestScript(int original, String transformed, BitSet toReturn) { return toReturn; } - static ToolUnicodePropertySource properties = ToolUnicodePropertySource.make(Default.ucdVersion()); + static ToolUnicodePropertySource properties = + ToolUnicodePropertySource.make(Default.ucdVersion()); static UnicodeProperty SCRIPT_EXTENSIONS = properties.getProperty("script extensions"); private static void addScript(int cp, BitSet toReturn) { @@ -902,23 +968,9 @@ static int getFirstPrimary(String sortKey) { return (result << 16); } - static final String[] CLASSNAME = { - "q", - "q", - "q", - "t", - "s", - "p" - }; - - static final String[] XCLASSNAME = { - "eq", - "eq", - "eq", - "et", - "es", - "ep" - }; + static final String[] CLASSNAME = {"q", "q", "q", "t", "s", "p"}; + + static final String[] XCLASSNAME = {"eq", "eq", "eq", "et", "es", "ep"}; static PrintWriter indexFile; static String indexAnchorText; @@ -928,16 +980,23 @@ static int getFirstPrimary(String sortKey) { static PrintWriter openFile(int count, String directory, int script) throws IOException { final String scriptName = getChunkName(script, LONG); final String shortScriptName = getChunkName(script, SHORT); - final String hover = 
scriptName.equals(shortScriptName) ? "" : "' title='" + shortScriptName; - - final String fileName = "chart_" + scriptName.replace('/', '_').replace(' ', '_') + (count > 1 ? count + "" : "") + ".html"; - final PrintWriter output = Utility.openPrintWriter(directory, fileName, Utility.UTF8_WINDOWS); + final String hover = + scriptName.equals(shortScriptName) ? "" : "' title='" + shortScriptName; + + final String fileName = + "chart_" + + scriptName.replace('/', '_').replace(' ', '_') + + (count > 1 ? count + "" : "") + + ".html"; + final PrintWriter output = + Utility.openPrintWriter(directory, fileName, Utility.UTF8_WINDOWS); Utility.fixDot(); System.out.println("Writing: " + scriptName); showIndex(scriptName, fileName + hover); final String title = "UCA: " + scriptName; - output.println("\n" - + UtilityBase.HTML_HEAD); + output.println( + "\n" + + UtilityBase.HTML_HEAD); output.println("" + title + ""); output.println(""); output.println("

" + scriptName + "

"); @@ -947,13 +1006,15 @@ static PrintWriter openFile(int count, String directory, int script) throws IOEx static PrintWriter openFile2(int count, String directory, String name) throws IOException { final String fileName = "chart_" + name + (count > 1 ? count + "" : "") + ".html"; - final PrintWriter output = Utility.openPrintWriter(directory, fileName, Utility.UTF8_WINDOWS); + final PrintWriter output = + Utility.openPrintWriter(directory, fileName, Utility.UTF8_WINDOWS); Utility.fixDot(); System.out.println("Writing: " + name); showIndex(name, fileName); final String title = name; - output.println("\n" + - UtilityBase.HTML_HEAD); + output.println( + "\n" + + UtilityBase.HTML_HEAD); output.println("" + title + ""); output.println(""); output.println(""); @@ -979,23 +1040,21 @@ static void closeFile(PrintWriter output) { output.close(); } - static final int - NULL_ORDER = -7, - IGNORABLE_ORDER = -6, - SPACE = -5, - PUNCT = -4, - SYMBOL = -3, - CURRENCY = -2, - DIGIT = -1, - // scripts in here - CJK = 300, - CJK_EXTENSIONS = CJK + 1, - UNSUPPORTED = CJK_EXTENSIONS + 1, - CAT_OFFSET = UNSUPPORTED + 10, - // categories in here - NO_CASE_MAPPING = CAT_OFFSET+50, - SCRIPT_LIMIT = NO_CASE_MAPPING + 5 - NULL_ORDER; - + static final int NULL_ORDER = -7, + IGNORABLE_ORDER = -6, + SPACE = -5, + PUNCT = -4, + SYMBOL = -3, + CURRENCY = -2, + DIGIT = -1, + // scripts in here + CJK = 300, + CJK_EXTENSIONS = CJK + 1, + UNSUPPORTED = CJK_EXTENSIONS + 1, + CAT_OFFSET = UNSUPPORTED + 10, + // categories in here + NO_CASE_MAPPING = CAT_OFFSET + 50, + SCRIPT_LIMIT = NO_CASE_MAPPING + 5 - NULL_ORDER; static { if (CJK <= UCD_Names.SCRIPT.length) { @@ -1006,92 +1065,143 @@ static void closeFile(PrintWriter output) { static final Matcher CAT_REMAP = Pattern.compile("([A-Z][a-z]*)([A-Z].+)").matcher(""); static String getChunkName(int script, byte length) { - switch(script) { - case NO_CASE_MAPPING: return "NoCaseMapping"; - case NULL_ORDER: return "Ignored"; - case IGNORABLE_ORDER: 
return "Secondary"; - case SPACE: return "Whitespace"; - case PUNCT: return "Punctuation"; - case SYMBOL: return "General-Symbol"; - case CURRENCY: return "Currency-Symbol"; - case DIGIT: return "Digits"; - case CJK: return "CJK"; - case CJK_EXTENSIONS: return "CJK-Extensions"; - case UNSUPPORTED: return "Unsupported"; - default: - if (script >= CAT_OFFSET) { - Default.ucd(); - final String cat = UCD.getCategoryID_fromIndex((short)(script - CAT_OFFSET), length); - if (!CAT_REMAP.reset(cat).matches()) { - return cat; + switch (script) { + case NO_CASE_MAPPING: + return "NoCaseMapping"; + case NULL_ORDER: + return "Ignored"; + case IGNORABLE_ORDER: + return "Secondary"; + case SPACE: + return "Whitespace"; + case PUNCT: + return "Punctuation"; + case SYMBOL: + return "General-Symbol"; + case CURRENCY: + return "Currency-Symbol"; + case DIGIT: + return "Digits"; + case CJK: + return "CJK"; + case CJK_EXTENSIONS: + return "CJK-Extensions"; + case UNSUPPORTED: + return "Unsupported"; + default: + if (script >= CAT_OFFSET) { + Default.ucd(); + final String cat = + UCD.getCategoryID_fromIndex((short) (script - CAT_OFFSET), length); + if (!CAT_REMAP.reset(cat).matches()) { + return cat; + } else { + return CAT_REMAP.group(2) + "-" + CAT_REMAP.group(1); + } + } else if (script == HIRAGANA_SCRIPT && HACK_KANA) { + return length == SHORT ? "Kata-Hira" : "Katakana/Hiragana"; + } else if (script == Meroitic_Hieroglyphs) { + return length == SHORT ? "Meroitic" : "Meroitic_Hieroglyphs/Cursive"; } else { - return CAT_REMAP.group(2) + "-" + CAT_REMAP.group(1); + Default.ucd(); + return Default.ucd() + .getCase( + UCD.getScriptID_fromIndex((short) script, length), FULL, TITLE); } - } else if (script == HIRAGANA_SCRIPT && HACK_KANA) { - return length == SHORT ? "Kata-Hira" : "Katakana/Hiragana"; - } else if (script == Meroitic_Hieroglyphs ) { - return length == SHORT ? 
"Meroitic" : "Meroitic_Hieroglyphs/Cursive"; - } else { - Default.ucd(); - return Default.ucd().getCase(UCD.getScriptID_fromIndex((short)script, length), FULL, TITLE); - } } } - static public final byte COLLATION = 0, NORMALIZATION = 1, CASE = 2, NAME = 3, SCRIPT = 4, NAMELIST = 5; + public static final byte COLLATION = 0, + NORMALIZATION = 1, + CASE = 2, + NAME = 3, + SCRIPT = 4, + NAMELIST = 5; - static public void closeIndexFile(PrintWriter indexFile, String extra, byte choice, boolean doBreak) { + public static void closeIndexFile( + PrintWriter indexFile, String extra, byte choice, boolean doBreak) { final SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd HH:mm:ss"); df.setTimeZone(TimeZone.getTimeZone("GMT")); indexFile.println("


"); boolean gotOne = false; - gotOne = doIndexItem("Collation Charts", "collation", choice, COLLATION, gotOne, indexFile); - gotOne = doIndexItem("Normalization Charts", "normalization", choice, NORMALIZATION, gotOne, indexFile); + gotOne = + doIndexItem( + "Collation Charts", "collation", choice, COLLATION, gotOne, indexFile); + gotOne = + doIndexItem( + "Normalization Charts", + "normalization", + choice, + NORMALIZATION, + gotOne, + indexFile); gotOne = doIndexItem("Case Charts", "case", choice, CASE, gotOne, indexFile); gotOne = doIndexItem("Script Charts", "script", choice, SCRIPT, gotOne, indexFile); - //gotOne = doIndexItem("Name Index Charts", "name", choice, NAME, gotOne, indexFile); - gotOne = doIndexItem("Names List Charts", "nameslist", choice, NAMELIST, gotOne, indexFile); + // gotOne = doIndexItem("Name Index Charts", "name", choice, NAME, gotOne, + // indexFile); + gotOne = + doIndexItem( + "Names List Charts", + "nameslist", + choice, + NAMELIST, + gotOne, + indexFile); // if (choice != NORMALIZATION) { // if (gotOne && doBreak) indexFile.println("
"); - // indexFile.println("Normalization Charts
"); + // indexFile.println("Normalization Charts
"); // gotOne = true; // } // if (choice != CASE) { // if (gotOne && doBreak) indexFile.println("
"); - // indexFile.println("Case Charts
"); + // indexFile.println("Case Charts
"); // gotOne = true; // } // if (choice != SCRIPT) { // if (gotOne && doBreak) indexFile.println("
"); - // indexFile.println("Script Charts
"); + // indexFile.println("Script Charts
"); // gotOne = true; // } // if (choice != NAME) { // if (gotOne && doBreak) indexFile.println("
"); - // indexFile.println("Name Index Charts
"); + // indexFile.println("Name Index Charts
"); // gotOne = true; // } // if (choice != NAMELIST) { // if (gotOne && doBreak) indexFile.println("
"); - // indexFile.println("Names List Charts
"); + // indexFile.println("Names List Charts
"); // gotOne = true; // } indexFile.println("


"); indexFile.println("UCD: " + Default.ucd().getVersion() + extra); - indexFile.println("
" + WriteCollationData.getNormalDate() /*+ " MED"*/); + indexFile.println( + "
" + + WriteCollationData + .getNormalDate() /*+ " MED"*/); indexFile.println("

"); indexFile.close(); } - private static boolean doIndexItem(String htmlTitle, String folderName, - byte choice, byte thisChoice, boolean gotOne, PrintWriter indexFile) { + private static boolean doIndexItem( + String htmlTitle, + String folderName, + byte choice, + byte thisChoice, + boolean gotOne, + PrintWriter indexFile) { if (choice != thisChoice) { - indexFile.println("" + - htmlTitle + - "
"); + indexFile.println( + "" + + htmlTitle + + "
"); gotOne = true; } else { indexFile.println(htmlTitle + "
"); @@ -1118,11 +1228,12 @@ static boolean containsCase(String s) { return false; } - static final Transliterator addCircle = Transliterator.createFromRules( - "any-addCircle", "([[:Mn:][:Me:]]) > \u25CC $1", Transliterator.FORWARD); + static final Transliterator addCircle = + Transliterator.createFromRules( + "any-addCircle", "([[:Mn:][:Me:]]) > \u25CC $1", Transliterator.FORWARD); public static void writeCompositionChart() throws IOException { - final UCA uca = new UCA(null,"",null); + final UCA uca = new UCA(null, "", null); final Set letters = new TreeSet(); final Set marks = new TreeSet(uca); @@ -1133,14 +1244,20 @@ public static void writeCompositionChart() throws IOException { // UnicodeSet latin = new UnicodeSet("[:latin:]"); - final PrintWriter out = Utility.openPrintWriter(Settings.Output.GEN_DIR + "log/", "composition_chart.html", Utility.UTF8_WINDOWS); + final PrintWriter out = + Utility.openPrintWriter( + Settings.Output.GEN_DIR + "log/", + "composition_chart.html", + Utility.UTF8_WINDOWS); try { - out.println("\n" + - UtilityBase.HTML_HEAD); + out.println( + "\n" + + UtilityBase.HTML_HEAD); out.println(""); - - // header - out.print(""); - out.println(""); - out.println(""); - out.println(""); - out.println("

Instructions

" + HTMLString(title) + " Version" + version + "

" - + (show ? "Hide" : "Show") + " Key

"); - /* - - - - Instructions - - - - - // index - out.print("
- Collation Version-2.1.9d7 - -

Show Key -

"); - out.println(""); - out.println(""); - out.println(""); - out.println("

Instructions

" + HTMLString(title) + " Version" + version + "

" - + (show ? "Hide" : "Show") + " Key

"); - - out.print(""); - out.print(""); - out.println(""); - out.println("

"); - if (counter > 0) { - out.print("<<"); - } else { - out.print("<<"); - } - out.println("

"); - boolean lastFar = false; - for (int i = 0; i <= end; ++i) { - boolean far = (i < counter-2 || i > counter+2); - if (far && ((i % 5) != 0) && (i != end)) continue; - if (i != 0 && lastFar != far) out.print(" - "); - lastFar = far; - if (i != counter) { - out.print("" + i + ""); + static void showDiff(boolean showName, boolean firstColumn, int line, Object chobj) { + String ch = chobj.toString(); + String decomp = nfd.normalize(ch); + if (showName) { + if (ch.equals(decomp)) { + log.println(//title + counter + " " + Utility.hex(ch, " ") + + " " + ucd.getName(ch) + ); + } else { + log.println(//title + counter + " " + "" + Utility.hex(ch, " ") + + " " + ucd.getName(ch) + "" + ); + } } else { - out.print("" + i + ""); + String keyD = printableKey(backD.get(chobj)); + String keyN = printableKey(backN.get(chobj)); + if (keyD.equals(keyN)) { + log.println(//title + counter + " " + Utility.hex(ch, " ") + " " + keyN); + } else { + log.println(//title + counter + " " + "" + Utility.hex(ch, " ") + " " + keyN + + "
" + Utility.hex(decomp, " ") + " " + keyD + "" + ); + } } - out.println(); - } - out.println("

"); - if (counter < end) { - out.print(">>"); - } else { - out.print(">>"); } - out.println("

"); - // standard template!!! - out.println("
"); - //out.println("

"); - return out; - } - static int getStrengthDifference(String old, String newStr) { - int result = 5; - int min = old.length(); - if (newStr.length() < min) min = newStr.length(); - for (int i = 0; i < min; ++i) { - char ch1 = old.charAt(i); - char ch2 = newStr.charAt(i); - if (ch1 != ch2) return result; - // see if we get difference before we get 0000. - if (ch1 == 0) --result; + static String printableKey(Object keyobj) { + String sortKey; + if (keyobj == null) { + sortKey = "NULL!!"; + } else { + sortKey = keyobj.toString(); + sortKey = sortKey.substring(0,sortKey.length()-1); + sortKey = UCA.toString(sortKey); + } + return sortKey; + } + + + LINKS
+ CONTENTS + + + + static void writeTail(PrintWriter out, int counter, String title, String other, boolean show) throws IOException { + copyFile(out, "HTML-Part2.txt"); + + out.println("
"); + out.println(""); + + out.close(); + } + + static String pad (int number) { + String num = Integer.toString(number); + if (num.length() < 2) num = "0" + number; + return num; + } + + static PrintWriter writeHead(int counter, int end, String title, String other, String version, boolean show) throws IOException { + + PrintWriter out = new PrintWriter( + new BufferedWriter( + new OutputStreamWriter( + new FileOutputStream(UCA.BASE_DIR+"\\generated\\collation/" + title + pad(counter) + ".html"), + "UTF8"), + 4*1024)); + copyFile(out, "HTML-Part1.txt"); + + out.println(""); + out.println(""); + out.println("" + HTMLString(title) + ""); + out.println(""); + + // header + out.print(""); + out.println(""); + out.println(""); + out.println(""); + out.println("

Instructions

" + HTMLString(title) + " Version" + version + "

" + + (show ? "Hide" : "Show") + " Key

"); + /* + + + + Instructions + + + + + // index + out.print("
+ Collation Version-2.1.9d7 + +

Show Key +

"); + out.println(""); + out.println(""); + out.println(""); + out.println("

Instructions

" + HTMLString(title) + " Version" + version + "

" + + (show ? "Hide" : "Show") + " Key

"); + + out.print(""); + out.print(""); + out.println(""); + out.println("

"); + if (counter > 0) { + out.print("<<"); + } else { + out.print("<<"); + } + out.println("

"); + boolean lastFar = false; + for (int i = 0; i <= end; ++i) { + boolean far = (i < counter-2 || i > counter+2); + if (far && ((i % 5) != 0) && (i != end)) continue; + if (i != 0 && lastFar != far) out.print(" - "); + lastFar = far; + if (i != counter) { + out.print("" + i + ""); + } else { + out.print("" + i + ""); + } + out.println(); + } + out.println("

"); + if (counter < end) { + out.print(">>"); + } else { + out.print(">>"); + } + out.println("

"); + // standard template!!! + out.println("
"); + //out.println("

"); + return out; + } + + static int getStrengthDifference(String old, String newStr) { + int result = 5; + int min = old.length(); + if (newStr.length() < min) min = newStr.length(); + for (int i = 0; i < min; ++i) { + char ch1 = old.charAt(i); + char ch2 = newStr.charAt(i); + if (ch1 != ch2) return result; + // see if we get difference before we get 0000. + if (ch1 == 0) --result; + } + if (newStr.length() != old.length()) return 1; + return 0; } - if (newStr.length() != old.length()) return 1; - return 0; - } - static final boolean needsXMLQuote(String source, boolean quoteApos) { - for (int i = 0; i < source.length(); ++i) { - char ch = source.charAt(i); - if (ch < ' ' || ch == '<' || ch == '&' || ch == '>') return true; - if (quoteApos & ch == '\'') return true; - if (ch == '\"') return true; - if (ch >= '\uD800' && ch <= '\uDFFF') return true; - if (ch >= '\uFFFE') return true; + static final boolean needsXMLQuote(String source, boolean quoteApos) { + for (int i = 0; i < source.length(); ++i) { + char ch = source.charAt(i); + if (ch < ' ' || ch == '<' || ch == '&' || ch == '>') return true; + if (quoteApos & ch == '\'') return true; + if (ch == '\"') return true; + if (ch >= '\uD800' && ch <= '\uDFFF') return true; + if (ch >= '\uFFFE') return true; + } + return false; } - return false; - } - - public static final String XMLString(int[] cps) { - return XMLBaseString(cps, cps.length, true); - } - public static final String XMLString(int[] cps, int len) { - return XMLBaseString(cps, len, true); - } - - public static final String XMLString(String source) { - return XMLBaseString(source, true); - } - - public static final String HTMLString(int[] cps) { - return XMLBaseString(cps, cps.length, false); - } + public static final String XMLString(int[] cps) { + return XMLBaseString(cps, cps.length, true); + } - public static final String HTMLString(int[] cps, int len) { - return XMLBaseString(cps, len, false); - } + public static final String XMLString(int[] cps, int 
len) { + return XMLBaseString(cps, len, true); + } - public static final String HTMLString(String source) { - return XMLBaseString(source, false); - } + public static final String XMLString(String source) { + return XMLBaseString(source, true); + } - public static final String XMLBaseString(int[] cps, int len, boolean quoteApos) { - StringBuffer temp = new StringBuffer(); - for (int i = 0; i < len; ++i) { - temp.append((char)cps[i]); + public static final String HTMLString(int[] cps) { + return XMLBaseString(cps, cps.length, false); } - return XMLBaseString(temp.toString(), quoteApos); - } - public static final String XMLBaseString(String source, boolean quoteApos) { - if (!needsXMLQuote(source, quoteApos)) return source; - StringBuffer result = new StringBuffer(); - for (int i = 0; i < source.length(); ++i) { - char ch = source.charAt(i); - if (ch < ' ' - || ch >= '\u007F' && ch <= '\u009F' - || ch >= '\uD800' && ch <= '\uDFFF' - || ch >= '\uFFFE') { - result.append('\uFFFD'); - result.append("#x"); - result.append(cpName(ch)); - result.append(";"); - - } else if (quoteApos && ch == '\'') { - result.append("'"); - } else if (ch == '\"') { - result.append("""); - } else if (ch == '<') { - result.append("<"); - } else if (ch == '&') { - result.append("&"); - } else if (ch == '>') { - result.append(">"); - } else { - result.append(ch); - } + public static final String HTMLString(int[] cps, int len) { + return XMLBaseString(cps, len, false); } - return result.toString(); - } - static int mapToStartOfRange(int ch) { - if (ch <= 0x3400) return ch; // CJK Ideograph Extension A - if (ch <= 0x4DB5) return 0x3400; - if (ch <= 0x4E00) return ch; // CJK Ideograph - if (ch <= 0x9FA5) return 0x4E00; - if (ch <= 0xAC00) return ch; // Hangul Syllable - if (ch <= 0xD7A3) return 0xAC00; - if (ch <= 0xD800) return ch; // Non Private Use High Surrogate - if (ch <= 0xDB7F) return 0xD800; - if (ch <= 0xDB80) return ch; // Private Use High Surrogate - if (ch <= 0xDBFF) return 0xDB80; - 
if (ch <= 0xDC00) return ch; // Low Surrogate - if (ch <= 0xDFFF) return 0xDC00; - if (ch <= 0xE000) return ch; // Private Use - if (ch <= 0xF8FF) return 0xE000; - if (ch <= 0xF0000) return ch; // Plane 15 Private Use - if (ch <= 0xFFFFD) return 0xF0000; - if (ch <= 0x100000) return ch; // Plane 16 Private Use - return 0x100000; - } + public static final String HTMLString(String source) { + return XMLBaseString(source, false); + } - */ -} \ No newline at end of file + public static final String XMLBaseString(int[] cps, int len, boolean quoteApos) { + StringBuffer temp = new StringBuffer(); + for (int i = 0; i < len; ++i) { + temp.append((char)cps[i]); + } + return XMLBaseString(temp.toString(), quoteApos); + } + + public static final String XMLBaseString(String source, boolean quoteApos) { + if (!needsXMLQuote(source, quoteApos)) return source; + StringBuffer result = new StringBuffer(); + for (int i = 0; i < source.length(); ++i) { + char ch = source.charAt(i); + if (ch < ' ' + || ch >= '\u007F' && ch <= '\u009F' + || ch >= '\uD800' && ch <= '\uDFFF' + || ch >= '\uFFFE') { + result.append('\uFFFD'); + result.append("#x"); + result.append(cpName(ch)); + result.append(";"); + + } else if (quoteApos && ch == '\'') { + result.append("'"); + } else if (ch == '\"') { + result.append("""); + } else if (ch == '<') { + result.append("<"); + } else if (ch == '&') { + result.append("&"); + } else if (ch == '>') { + result.append(">"); + } else { + result.append(ch); + } + } + return result.toString(); + } + + static int mapToStartOfRange(int ch) { + if (ch <= 0x3400) return ch; // CJK Ideograph Extension A + if (ch <= 0x4DB5) return 0x3400; + if (ch <= 0x4E00) return ch; // CJK Ideograph + if (ch <= 0x9FA5) return 0x4E00; + if (ch <= 0xAC00) return ch; // Hangul Syllable + if (ch <= 0xD7A3) return 0xAC00; + if (ch <= 0xD800) return ch; // Non Private Use High Surrogate + if (ch <= 0xDB7F) return 0xD800; + if (ch <= 0xDB80) return ch; // Private Use High Surrogate + if (ch <= 
0xDBFF) return 0xDB80; + if (ch <= 0xDC00) return ch; // Low Surrogate + if (ch <= 0xDFFF) return 0xDC00; + if (ch <= 0xE000) return ch; // Private Use + if (ch <= 0xF8FF) return 0xE000; + if (ch <= 0xF0000) return ch; // Plane 15 Private Use + if (ch <= 0xFFFFD) return 0xF0000; + if (ch <= 0x100000) return ch; // Plane 16 Private Use + return 0x100000; + } + + */ +} diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/BuildNames.java b/unicodetools/src/main/java/org/unicode/text/UCD/BuildNames.java index 28eca2419..526d7d303 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/BuildNames.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/BuildNames.java @@ -1,16 +1,15 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/BuildNames.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/BuildNames.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; +import com.ibm.icu.text.UTF16; import java.io.IOException; import java.io.PrintWriter; import java.util.HashMap; @@ -19,13 +18,9 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.text.utility.LengthFirstComparator; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.UTF16; - - public class BuildNames implements UCD_Types { static final boolean DEBUG = false; @@ -42,7 +37,10 @@ public static void main(String[] args) throws IOException { static int[] letters = new int[128]; static class Count { - Count(int count) {this.count = count;} + Count(int count) { + this.count = count; + } + int count; } @@ -92,11 +90,11 @@ static String transform(String line) { final char c = line.charAt(i); if (c == '-' || c == '<' || c == '>') { - if (result.length() > 0 && result.charAt(result.length()-1) != ' ') { + if (result.length() > 0 && result.charAt(result.length() - 1) != ' ') { result.append(' '); } result.append(c); - if (i + 1 < line.length() && line.charAt(i+1) != ' ') { + if (i + 1 < line.length() && line.charAt(i + 1) != ' ') { result.append(' '); } changed = true; @@ -104,12 +102,12 @@ static String transform(String line) { } if ('a' <= c && c <= 'z') { - result.append((char)(c - 'a' + 'A')); + result.append((char) (c - 'a' + 'A')); changed = true; continue; } if ('0' <= c && c <= '9') { - result.append('*').append((char)(c - '0' + 'A')); + result.append('*').append((char) (c - '0' + 'A')); changed = true; continue; } @@ -131,7 +129,9 @@ static void printWords(Map words) { while (it.hasNext()) { final String word = (String) it.next(); final Count count = (Count) words.get(word); - biggest.put(new Integer(-count.count * word.length()), word); // make it negative just to reverse the sort + biggest.put( + new Integer(-count.count * word.length()), + word); // make it negative just to reverse the sort } 
it = biggest.keySet().iterator(); @@ -153,9 +153,9 @@ static void collectWords() throws IOException { final PrintWriter log = Utility.openPrintWriterGenDir(fname, Utility.LATIN1_WINDOWS); System.out.println("Gathering data"); - //Counter counter = new Counter(); + // Counter counter = new Counter(); final String[] parts = new String[100]; - //int total = 0; + // int total = 0; int used = 0; int sum = 0; int longSum = 0; @@ -190,10 +190,9 @@ static void collectWords() throws IOException { for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp2)) { cp2 = UTF16.charAt(str, i); name = Default.ucd().getName(cp2, SHORT); - if (name == null) - { + if (name == null) { continue; - //name = transform(name); + // name = transform(name); } sum += name.length(); @@ -212,12 +211,12 @@ static void collectWords() throws IOException { } log.close(); Utility.fixDot(); - //System.out.println("Overhead: " + (lastLink - used) + ", " + ((lastLink - used) * 100 / used) + "%"); - //System.out.println("Strings: " + sum + ", " + (lastLink*4)); - System.out.println("Short Names sum: " + sum + ", average: " + (sum + 0.0)/used); - System.out.println("Long Names sum: " + longSum + ", average: " + (longSum + 0.0)/used); - System.out.println("Savings: " + (1 - (sum+0.0)/longSum)); - + // System.out.println("Overhead: " + (lastLink - used) + ", " + ((lastLink - used) * 100 / + // used) + "%"); + // System.out.println("Strings: " + sum + ", " + (lastLink*4)); + System.out.println("Short Names sum: " + sum + ", average: " + (sum + 0.0) / used); + System.out.println("Long Names sum: " + longSum + ", average: " + (longSum + 0.0) / used); + System.out.println("Savings: " + (1 - (sum + 0.0) / longSum)); printWords(words); printWords(doubleWords); @@ -240,8 +239,15 @@ static void collectWords() throws IOException { final String round = CompactName.stringFromToken(test); final boolean goesRound = round.equals(s); if (false || !goesRound) { - System.out.println("Compacting: '" + s + "': " + i++ + "(" 
+ CompactName.lastToken + ")" - + (goesRound ? ": NO RT: '" + round + "'" : "")); + System.out.println( + "Compacting: '" + + s + + "': " + + i++ + + "(" + + CompactName.lastToken + + ")" + + (goesRound ? ": NO RT: '" + round + "'" : "")); } } @@ -260,8 +266,15 @@ static void collectWords() throws IOException { final String round = CompactName.stringFromToken(test); final boolean goesRound = round.equals(s); if (false || !goesRound) { - System.out.println("Compacting: '" + s + "': " + i++ + "(" + CompactName.lastToken + ")" - + (!goesRound ? ": NO RT: '" + round + "'" : "")); + System.out.println( + "Compacting: '" + + s + + "': " + + i++ + + "(" + + CompactName.lastToken + + ")" + + (!goesRound ? ": NO RT: '" + round + "'" : "")); } } @@ -271,35 +284,37 @@ static void collectWords() throws IOException { System.out.println(i + ": '" + s + "'"); }*/ - System.out.println("Strings: " + sum - + ", " + (CompactName.spacedMinimum*4) - + ", " + (CompactName.lastToken*4) - ); - + System.out.println( + "Strings: " + + sum + + ", " + + (CompactName.spacedMinimum * 4) + + ", " + + (CompactName.lastToken * 4)); } /* - Set stuff = new TreeSet(); - for (int i = 0; i < letters.length; ++i) { - if (letters[i] != 0) { - stuff.add(new Integer((letters[i] << 8) + i)); - } - } - - it = stuff.iterator(); - while (it.hasNext()) { - int in = ((Integer) it.next()).intValue(); - System.out.println((char)(in & 0xFF) + ":\t" + String.valueOf(in >> 8)); - } - int r = addString(name); - if (!DEBUG && !rname.equals(name)) { - System.out.println("\tNo Round Trip: '" + rname + "'"); - } - */ + Set stuff = new TreeSet(); + for (int i = 0; i < letters.length; ++i) { + if (letters[i] != 0) { + stuff.add(new Integer((letters[i] << 8) + i)); + } + } + + it = stuff.iterator(); + while (it.hasNext()) { + int in = ((Integer) it.next()).intValue(); + System.out.println((char)(in & 0xFF) + ":\t" + String.valueOf(in >> 8)); + } + int r = addString(name); + if (!DEBUG && !rname.equals(name)) { + 
System.out.println("\tNo Round Trip: '" + rname + "'"); + } + */ static Map stringToInt = new HashMap(); static Map intToString = new HashMap(); - static final int[] remap = new int['Z'+1]; + static final int[] remap = new int['Z' + 1]; static final int maxToken; static { @@ -318,12 +333,13 @@ static void collectWords() throws IOException { } static final String[] unmap = new String[maxToken]; + static { unmap[0] = ""; for (int i = 0; i < remap.length; ++i) { final int x = remap[i]; if (x != 0) { - unmap[x] = String.valueOf((char)i); + unmap[x] = String.valueOf((char) i); } } } @@ -352,9 +368,9 @@ static String lookup(int i) { } else { final int value = links[i]; final int lead = value >>> 16; - final int trail = value & 0xFFFF; - //if (DEBUG) System.out.println("lead: " + lead + ", trail: " + trail); - result = lookup(lead) + lookup(trail); + final int trail = value & 0xFFFF; + // if (DEBUG) System.out.println("lead: " + lead + ", trail: " + trail); + result = lookup(lead) + lookup(trail); } if (trailingSpace) { result += ' '; @@ -378,7 +394,7 @@ static int getInt(String s) { if (in == null) { return -1; } - return ((Integer)in).intValue(); + return ((Integer) in).intValue(); } static int putString(String s, int lead, int trail) { @@ -391,7 +407,8 @@ static int putString(String s, int lead, int trail) { links[lastLink++] = value; if (DEBUG) { - System.out.println("'" + s + "', link[" + result + "] = lead: " + lead + ", trail: " + trail); + System.out.println( + "'" + s + "', link[" + result + "] = lead: " + lead + ", trail: " + trail); final String roundTrip = lookup(result); if (!roundTrip.equals(s)) { System.out.println("\t*** No Round Trip: '" + roundTrip + "'"); @@ -419,7 +436,7 @@ static int addString(String s) { // invariant. We break after a space if there is one. 
for (int i = 1; i < limit; ++i) { - final char c = s.charAt(i-1); + final char c = s.charAt(i - 1); spaceBits = 0; endOfFirst = i; if (c == ' ') { @@ -437,8 +454,14 @@ static int addString(String s) { final int trail = getInt(lastPart); if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair if (DEBUG) { - System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "") - + "' # '" + lastPart + "' MATCH BOTH"); + System.out.println( + s + + " => '" + + firstPart + + (spaceBits != 0 ? "*" : "") + + "' # '" + + lastPart + + "' MATCH BOTH"); } return putString(s, spaceBits | lead, trail); } @@ -483,14 +506,26 @@ static int addString(String s) { final int trail = getInt(lastPart); if (lead >= 0) { if (DEBUG) { - System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "") - + "' # '" + lastPart + "' MATCH FIRST"); + System.out.println( + s + + " => '" + + firstPart + + (spaceBits != 0 ? "*" : "") + + "' # '" + + lastPart + + "' MATCH FIRST"); } return putString(s, spaceBits | lead, addString(lastPart)); } else { if (DEBUG) { - System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "") - + "' # '" + lastPart + "' MATCH SECOND"); + System.out.println( + s + + " => '" + + firstPart + + (spaceBits != 0 ? "*" : "") + + "' # '" + + lastPart + + "' MATCH SECOND"); } return putString(s, spaceBits | addString(firstPart), trail); } @@ -498,7 +533,6 @@ static int addString(String s) { // otherwise, we failed to find anything. Then break before the last word, if there is one // otherwise break in the middle (but at even value) - if (lastSpace >= 0) { best_i = lastSpace; endOfFirst = lastSpace - 1; @@ -509,8 +543,14 @@ static int addString(String s) { final String firstPart = s.substring(0, endOfFirst); final String lastPart = s.substring(best_i); if (DEBUG) { - System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? 
"*" : "") - + "' # '" + lastPart + "' FALLBACK"); + System.out.println( + s + + " => '" + + firstPart + + (spaceBits != 0 ? "*" : "") + + "' # '" + + lastPart + + "' FALLBACK"); } return putString(s, spaceBits | addString(firstPart), addString(lastPart)); } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/Charts.java b/unicodetools/src/main/java/org/unicode/text/UCD/Charts.java index a8409f82a..8c9504fee 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/Charts.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/Charts.java @@ -1,16 +1,12 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/Charts.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/Charts.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; - -public class Charts { -} \ No newline at end of file +public class Charts {} diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/CheckCollator.java b/unicodetools/src/main/java/org/unicode/text/UCD/CheckCollator.java index 427dfb45f..c513cb08f 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/CheckCollator.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/CheckCollator.java @@ -1,12 +1,11 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/CheckCollator.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/CheckCollator.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ // http://java.sun.com/j2se/1.3/docs/guide/intl/encoding.doc.html @@ -21,18 +20,19 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Locale; - import org.unicode.text.utility.Utility; /** - * This is a quick and dirty program to get some idea of collation performance, comparing old Java to new stuff. + * This is a quick and dirty program to get some idea of collation performance, comparing old Java + * to new stuff. */ -abstract public class CheckCollator { +public abstract class CheckCollator { static final String PREFIX = "C:\\ICUInternal\\icu4c\\collation-perf-data\\TestNames_"; static final boolean DO_RAW = false; static final NumberFormat nf = NumberFormat.getInstance(); static final NumberFormat percent = NumberFormat.getPercentInstance(); + static { nf.setMaximumFractionDigits(2); } @@ -41,8 +41,9 @@ public static void main(String[] args) throws IOException { // later, drive off of args - // choices are: Asian, Chinese, Japanese, Japanese_h, Japanese_k, Korean, Latin, Russian, Thai - //test(Locale.KOREAN, "Korean"); + // choices are: Asian, Chinese, Japanese, Japanese_h, Japanese_k, Korean, Latin, Russian, + // Thai + // test(Locale.KOREAN, "Korean"); test(Locale.ENGLISH, "Latin"); test(Locale.FRENCH, "Latin"); test(Locale.JAPANESE, "Japanese"); @@ -60,7 +61,7 @@ public static void test(Locale loc, String name) throws IOException { final FileInputStream fis = new FileInputStream(fileName); final InputStreamReader isr = new InputStreamReader(fis, "UnicodeLittle"); - final BufferedReader br = new BufferedReader(isr, 32*1024); + final BufferedReader br = new BufferedReader(isr, 32 * 1024); int counter = 0; @@ -87,7 +88,6 @@ public static void test(Locale loc, String name) throws IOException { int size = list.size(); - // later, adjust these so we always get a reasonble number of tries int extraIterations = 200; @@ -95,7 +95,7 @@ public static void 
test(Locale loc, String name) throws IOException { size = limit; } - final String[] tests = new String [size]; + final String[] tests = new String[size]; for (int i = 0; i < size; ++i) { tests[i] = (String) list.get(i); @@ -106,12 +106,10 @@ public static void test(Locale loc, String name) throws IOException { final com.ibm.icu.text.Collator newCol = com.ibm.icu.text.Collator.getInstance(loc); final java.text.Collator oldCol = java.text.Collator.getInstance(loc); - double startTime, endTime; double delta, oldDelta; String probe; - // load classes at least once before starting newCol.compare("a", "b"); @@ -129,14 +127,18 @@ public static void test(Locale loc, String name) throws IOException { final byte[] oldKey = oldCol.getCollationKey(tests[i]).toByteArray(); oldSize += oldKey.length; } - delta = stringSize/(size + 0.0); + delta = stringSize / (size + 0.0); System.out.println("string size: " + nf.format(delta) + " bytes per key"); System.out.println(); - delta = oldDelta = (oldSize/(size + 0.0)); + delta = oldDelta = (oldSize / (size + 0.0)); System.out.println("old sortkey size: " + nf.format(delta) + " bytes per key "); - delta = (newSize/(size + 0.0)); - System.out.println("new sortkey size: " + nf.format(delta) + " bytes per key " + percent.format(delta/oldDelta)); + delta = (newSize / (size + 0.0)); + System.out.println( + "new sortkey size: " + + nf.format(delta) + + " bytes per key " + + percent.format(delta / oldDelta)); System.out.println(); // ================================================ @@ -152,7 +154,7 @@ public static void test(Locale loc, String name) throws IOException { } } endTime = System.currentTimeMillis(); - double overhead = (1000*(endTime - startTime) / counter); + double overhead = (1000 * (endTime - startTime) / counter); System.out.println("overhead: " + nf.format((endTime - startTime) / counter) + " micros"); counter = 0; @@ -166,9 +168,9 @@ public static void test(Locale loc, String name) throws IOException { } } endTime = 
System.currentTimeMillis(); - oldDelta = delta = (1000*(endTime - startTime) / counter) - overhead; - System.out.println("Old sort key time: " + nf.format(delta) - + " micros (" + counter + " iterations)"); + oldDelta = delta = (1000 * (endTime - startTime) / counter) - overhead; + System.out.println( + "Old sort key time: " + nf.format(delta) + " micros (" + counter + " iterations)"); // Sort Key: new time @@ -183,9 +185,14 @@ public static void test(Locale loc, String name) throws IOException { } } endTime = System.currentTimeMillis(); - delta = (1000*(endTime - startTime) / counter) - overhead; - System.out.println("New sort key time: " + nf.format(delta) - + " micros (" + counter + " iterations) " + percent.format(delta/oldDelta)); + delta = (1000 * (endTime - startTime) / counter) - overhead; + System.out.println( + "New sort key time: " + + nf.format(delta) + + " micros (" + + counter + + " iterations) " + + percent.format(delta / oldDelta)); System.out.println(); // ================================================ @@ -205,8 +212,9 @@ public static void test(Locale loc, String name) throws IOException { } } endTime = System.currentTimeMillis(); - overhead = (1000*(endTime - startTime) / counter); - System.out.println("overhead: " + nf.format((endTime - startTime) / counter) + " micros"); + overhead = (1000 * (endTime - startTime) / counter); + System.out.println( + "overhead: " + nf.format((endTime - startTime) / counter) + " micros"); // Raw Compare: old time @@ -221,9 +229,13 @@ public static void test(Locale loc, String name) throws IOException { } } endTime = System.currentTimeMillis(); - oldDelta = delta = (1000*(endTime - startTime) / counter) - overhead; - System.out.println("Old raw compare time: " + nf.format(delta) - + " micros (" + counter + " iterations)"); + oldDelta = delta = (1000 * (endTime - startTime) / counter) - overhead; + System.out.println( + "Old raw compare time: " + + nf.format(delta) + + " micros (" + + counter + + " iterations)"); 
// Raw Compare: new time @@ -238,9 +250,14 @@ public static void test(Locale loc, String name) throws IOException { } } endTime = System.currentTimeMillis(); - delta = (1000*(endTime - startTime) / counter) - overhead; - System.out.println("New raw compare time: " + nf.format(delta) - + " micros (" + counter + " iterations) " + percent.format(delta/oldDelta)); + delta = (1000 * (endTime - startTime) / counter) - overhead; + System.out.println( + "New raw compare time: " + + nf.format(delta) + + " micros (" + + counter + + " iterations) " + + percent.format(delta / oldDelta)); System.out.println(); } @@ -263,9 +280,9 @@ public static void test(Locale loc, String name) throws IOException { } } endTime = System.currentTimeMillis(); - overhead = delta = (1000*(endTime - startTime) / iterations); - System.out.println("Overhead: " + nf.format(delta) - + " micros (" + iterations + " iterations)"); + overhead = delta = (1000 * (endTime - startTime) / iterations); + System.out.println( + "Overhead: " + nf.format(delta) + " micros (" + iterations + " iterations)"); // old time @@ -279,10 +296,13 @@ public static void test(Locale loc, String name) throws IOException { } } endTime = System.currentTimeMillis(); - oldDelta = delta = (1000*(endTime - startTime) / iterations) - overhead; - System.out.println("Old binary search time: " + nf.format(delta) - + " micros (" + iterations + " iterations)"); - + oldDelta = delta = (1000 * (endTime - startTime) / iterations) - overhead; + System.out.println( + "Old binary search time: " + + nf.format(delta) + + " micros (" + + iterations + + " iterations)"); // new time @@ -297,9 +317,14 @@ public static void test(Locale loc, String name) throws IOException { } } endTime = System.currentTimeMillis(); - delta = (1000*(endTime - startTime) / iterations) - overhead; - System.out.println("New binary search time: " + nf.format(delta) - + " micros (" + iterations + " iterations) " + percent.format(delta/oldDelta)); + delta = (1000 * (endTime - 
startTime) / iterations) - overhead; + System.out.println( + "New binary search time: " + + nf.format(delta) + + " micros (" + + iterations + + " iterations) " + + percent.format(delta / oldDelta)); System.out.println(); // ================================================ @@ -320,9 +345,9 @@ public static void test(Locale loc, String name) throws IOException { } } endTime = System.currentTimeMillis(); - overhead = delta = (1000*(endTime - startTime) / iterations); - System.out.println("overhead: " + nf.format(delta) - + " micros (" + iterations + " iterations)"); + overhead = delta = (1000 * (endTime - startTime) / iterations); + System.out.println( + "overhead: " + nf.format(delta) + " micros (" + iterations + " iterations)"); // old time @@ -335,9 +360,9 @@ public static void test(Locale loc, String name) throws IOException { } } endTime = System.currentTimeMillis(); - oldDelta = delta = (1000*(endTime - startTime) / iterations) - overhead; - System.out.println("Old sort time: " + nf.format(delta) - + " micros (" + iterations + " iterations)"); + oldDelta = delta = (1000 * (endTime - startTime) / iterations) - overhead; + System.out.println( + "Old sort time: " + nf.format(delta) + " micros (" + iterations + " iterations)"); // new time @@ -350,9 +375,13 @@ public static void test(Locale loc, String name) throws IOException { } } endTime = System.currentTimeMillis(); - delta = (1000*(endTime - startTime) / iterations) - overhead; - System.out.println("New sort time: " + nf.format(delta) - + " micros (" + iterations + " iterations) " + percent.format(delta/oldDelta)); - + delta = (1000 * (endTime - startTime) / iterations) - overhead; + System.out.println( + "New sort time: " + + nf.format(delta) + + " micros (" + + iterations + + " iterations) " + + percent.format(delta / oldDelta)); } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/CheckICU.java b/unicodetools/src/main/java/org/unicode/text/UCD/CheckICU.java index 
170eb9397..8cad89ce8 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/CheckICU.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/CheckICU.java @@ -1,5 +1,8 @@ package org.unicode.text.UCD; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; @@ -9,19 +12,14 @@ import java.util.Iterator; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; -import org.unicode.props.BagFormatter; import org.unicode.cldr.util.props.UnicodeLabel; import org.unicode.jsp.ICUPropertyFactory; +import org.unicode.props.BagFormatter; import org.unicode.props.UnicodeProperty; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; - public class CheckICU { static final BagFormatter bf = new BagFormatter(); @@ -37,21 +35,23 @@ public static void main(String[] args) throws IOException { static class ReplaceLabel extends UnicodeLabel { UnicodeProperty p; + ReplaceLabel(UnicodeProperty p) { this.p = p; } + @Override public String getValue(int codepoint, boolean isShort) { // TODO Auto-generated method stub - return p.getValue(codepoint, isShort).replace('_',' '); + return p.getValue(codepoint, isShort).replace('_', ' '); } + @Override public int getMaxWidth(boolean v) { return p.getMaxWidth(v); } } - public static void test() throws IOException { checkAvailable(); if (true) { @@ -63,9 +63,9 @@ public static void test() throws IOException { toolFactory = ToolUnicodePropertySource.make("4.0.0"); final String[] quickList = { - // "Canonical_Combining_Class", - // "Script", "Bidi_Mirroring_Glyph", "Case_Folding", - //"Numeric_Value" + // "Canonical_Combining_Class", + // "Script", "Bidi_Mirroring_Glyph", "Case_Folding", + // "Numeric_Value" }; for (final String element : 
quickList) { testProperty(element, -1); @@ -77,7 +77,8 @@ public static void test() throws IOException { final Collection availableTool = toolFactory.getAvailableNames(); final Collection availableICU = icuFactory.getAvailableNames(); - System.out.println(showDifferences("Property Aliases", "ICU", availableICU, "Tool", availableTool)); + System.out.println( + showDifferences("Property Aliases", "ICU", availableICU, "Tool", availableTool)); final Collection common = new TreeSet(availableICU); common.retainAll(availableTool); @@ -86,18 +87,16 @@ public static void test() throws IOException { System.out.println(UnicodeProperty.getTypeName(j)); final Iterator it = common.iterator(); while (it.hasNext()) { - final String prop = (String)it.next(); + final String prop = (String) it.next(); testProperty(prop, j); } } } - /** - * - */ + /** */ private static void checkAvailable() { - //generateFile("4.0.0", "DerivedCombiningClass"); - //generateFile("4.0.0", "DerivedCoreProperties"); + // generateFile("4.0.0", "DerivedCombiningClass"); + // generateFile("4.0.0", "DerivedCoreProperties"); final ULocale[] locales = Collator.getAvailableULocales(); System.out.println("Collation"); @@ -112,9 +111,7 @@ private static void checkAvailable() { } } System.out.println("Differing Collators:"); - final Set testSet = new HashSet(Arrays.asList(new String[] { - "nl", "de", "de_DE", "zh_TW" - })); + final Set testSet = new HashSet(Arrays.asList(new String[] {"nl", "de", "de_DE", "zh_TW"})); for (int k = 0; k < locales.length; ++k) { if (!testSet.contains(locales[k].toString())) { continue; @@ -123,43 +120,59 @@ private static void checkAvailable() { } } - /** - * - */ + /** */ private static void showCollationVariants(ULocale locale) { final String[] keywords = Collator.getKeywords(); System.out.println(locale.getDisplayName(ULocale.ENGLISH) + " [" + locale + "]"); for (int i = 0; i < Collator.getKeywords().length; ++i) { - final ULocale base = Collator.getFunctionalEquivalent(keywords[i], 
- locale - //new ULocale(locale + "@" + keywords[i] + "=standard") - ); + final ULocale base = + Collator.getFunctionalEquivalent( + keywords[i], locale + // new ULocale(locale + "@" + keywords[i] + "=standard") + ); if (true) { - System.out.println("\"" + base + "\" == Collator.getFunctionalEquivalent(\"" + keywords[i] + "\", \"" + locale + "\");"); + System.out.println( + "\"" + + base + + "\" == Collator.getFunctionalEquivalent(\"" + + keywords[i] + + "\", \"" + + locale + + "\");"); } final String[] values = Collator.getKeywordValues(keywords[i]); for (int j = 0; j < Collator.getKeywordValues(keywords[i]).length; ++j) { - final ULocale other = Collator.getFunctionalEquivalent(keywords[i], - new ULocale(locale + "@" + keywords[i] + "=" + values[j])); + final ULocale other = + Collator.getFunctionalEquivalent( + keywords[i], + new ULocale(locale + "@" + keywords[i] + "=" + values[j])); if (true) { System.out.println( - "\"" + other - + "\" == Collator.getFunctionalEquivalent(\"" + keywords[i] + "\"" + + other + + "\" == Collator.getFunctionalEquivalent(\"" + + keywords[i] + "\", new ULocale(\"" - + locale + "@" + keywords[i] + "=" + values[j] + "\");"); + + locale + + "@" + + keywords[i] + + "=" + + values[j] + + "\");"); } // HACK: commented line should work but doesn't if (!other.equals(base)) { - //if (other.toString().indexOf("@") >= 0) { - System.out.println("\t" + keywords[i] + "=" + values[j] + "; \t" + base + "; \t" + other); + // if (other.toString().indexOf("@") >= 0) { + System.out.println( + "\t" + keywords[i] + "=" + values[j] + "; \t" + base + "; \t" + other); } } } } /** - * Sample code that prints out the variants that 'make a difference' for a given locale. - * To iterate through the locales, use Collator.getVariant + * Sample code that prints out the variants that 'make a difference' for a given locale. 
To + * iterate through the locales, use Collator.getVariant */ private static void showCollationVariants2(ULocale locale) { final String[] keywords = Collator.getKeywords(); @@ -168,10 +181,13 @@ private static void showCollationVariants2(ULocale locale) { final ULocale base = Collator.getFunctionalEquivalent(keywords[i], locale); final String[] values = Collator.getKeywordValues(keywords[i]); for (int j = 0; j < Collator.getKeywordValues(keywords[i]).length; ++j) { - final ULocale other = Collator.getFunctionalEquivalent(keywords[i], - new ULocale(locale + "@" + keywords[i] + "=" + values[j])); + final ULocale other = + Collator.getFunctionalEquivalent( + keywords[i], + new ULocale(locale + "@" + keywords[i] + "=" + values[j])); if (!other.equals(base)) { - System.out.println("\t" + keywords[i] + "=" + values[j] + "; \t" + base + "; \t" + other); + System.out.println( + "\t" + keywords[i] + "=" + values[j] + "; \t" + base + "; \t" + other); } } } @@ -194,9 +210,10 @@ private static void checkUCD() throws IOException { leading.add(i); } } - final PrintWriter pw = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR, "Trailing.txt"); + final PrintWriter pw = + FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR, "Trailing.txt"); pw.println("+Trailing+Starter"); - bf.showSetNames(pw, new UnicodeSet(trailing).retainAll(starter)); + bf.showSetNames(pw, new UnicodeSet(trailing).retainAll(starter)); pw.println("+Trailing-Starter"); bf.showSetNames(pw, new UnicodeSet(trailing).removeAll(starter)); pw.println("-Trailing-Starter"); @@ -208,17 +225,17 @@ private static void checkUCD() throws IOException { pw.close(); } /* - * int icuType; - int toolType; - Collection icuAliases; - Collection toolAliases; - String firstDiffICU; - String firstDiffTool; - String firstDiffCP; - String icuProp; - String toolProp; - - */ + * int icuType; + int toolType; + Collection icuAliases; + Collection toolAliases; + String firstDiffICU; + String firstDiffTool; + String firstDiffCP; + String 
icuProp; + String toolProp; + + */ private static void testProperty(String prop, int typeFilter) { final UnicodeProperty icuProp = icuFactory.getProperty(prop); @@ -234,8 +251,11 @@ private static void testProperty(String prop, int typeFilter) { final int toolType = toolProp.getType(); if (icuType != toolType) { - System.out.println("FAILURE Type: ICU: " + UnicodeProperty.getTypeName(icuType) - + "\tTool: " + UnicodeProperty.getTypeName(toolType)); + System.out.println( + "FAILURE Type: ICU: " + + UnicodeProperty.getTypeName(icuType) + + "\tTool: " + + UnicodeProperty.getTypeName(toolType)); } Collection icuAliases = icuProp.getNameAliases(new ArrayList()); @@ -244,7 +264,8 @@ private static void testProperty(String prop, int typeFilter) { icuAliases = icuProp.getAvailableValues(new ArrayList()); toolAliases = toolProp.getAvailableValues(new ArrayList()); - System.out.println(showDifferences("Value Aliases", "ICU", icuAliases, "Tool", toolAliases)); + System.out.println( + showDifferences("Value Aliases", "ICU", icuAliases, "Tool", toolAliases)); // TODO do property value aliases itemFailures.clear(); @@ -274,9 +295,8 @@ private static void testProperty(String prop, int typeFilter) { if (firstDiffTool != null) { firstDiffTool = BagFormatter.hex.transliterate(firstDiffTool); } - System.out.println(firstDiffCP - + "\tICU: <" + firstDiffICU - + ">\tTool: <" + firstDiffTool + ">"); + System.out.println( + firstDiffCP + "\tICU: <" + firstDiffICU + ">\tTool: <" + firstDiffTool + ">"); } System.out.println("done"); @@ -295,12 +315,8 @@ static boolean equals(Object a, Object b) { return a.equals(b); } - static public String showDifferences( - String title, - String name1, - Collection set1, - String name2, - Collection set2) { + public static String showDifferences( + String title, String name1, Collection set1, String name2, Collection set2) { final Collection temp = new TreeSet(set1); temp.retainAll(set2); @@ -323,7 +339,6 @@ static public String showDifferences( 
result.append("\n"); } - temp.clear(); temp.addAll(set1); temp.removeAll(set2); @@ -342,9 +357,6 @@ static public String showDifferences( result.append("\n"); } - return result.toString(); } - - } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/ChineseFrequency.java b/unicodetools/src/main/java/org/unicode/text/UCD/ChineseFrequency.java index 439c2b4eb..859866e47 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/ChineseFrequency.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/ChineseFrequency.java @@ -1,5 +1,8 @@ package org.unicode.text.UCD; +import com.ibm.icu.text.DecimalFormat; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.UTF16; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; @@ -7,16 +10,11 @@ import java.util.Iterator; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.text.utility.Pair; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.DecimalFormat; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.UTF16; - public class ChineseFrequency { static NumberFormat percent = new DecimalFormat("0.000000%"); static NumberFormat percent3 = new DecimalFormat("000.000000%"); @@ -25,34 +23,37 @@ public class ChineseFrequency { static class InverseCompareTo implements Comparator { @Override public int compare(Object o1, Object o2) { - return -((Comparable)o1).compareTo(o2); + return -((Comparable) o1).compareTo(o2); } } - public static void test() throws IOException{ + public static void test() throws IOException { final Set freq_char = new TreeSet(new InverseCompareTo()); - final BufferedReader br = FileUtilities.openUTF8Reader(Settings.UnicodeTools.DICT_DIR, "kHYPLCDPF.txt"); + final BufferedReader br = + FileUtilities.openUTF8Reader(Settings.UnicodeTools.DICT_DIR, "kHYPLCDPF.txt"); double grandTotal = 0.0; while (true) { final String line = 
br.readLine(); if (line == null) { break; } - final String[] pieces = Utility.split(line,'\t'); - final int cp = Integer.parseInt(pieces[0],16); - final String[] says = Utility.split(pieces[1],','); + final String[] pieces = Utility.split(line, '\t'); + final int cp = Integer.parseInt(pieces[0], 16); + final String[] says = Utility.split(pieces[1], ','); long total = 0; for (final String say : says) { final int start = say.indexOf('('); final int end = say.indexOf(')'); - final long count = Long.parseLong(say.substring(start+1, end)); + final long count = Long.parseLong(say.substring(start + 1, end)); total += count; } grandTotal += total; freq_char.add(new Pair(new Long(total), new Integer(cp))); } br.close(); - final PrintWriter pw = FileUtilities.openUTF8Writer(Settings.UnicodeTools.DICT_DIR, "kHYPLCDPF_frequency.txt"); + final PrintWriter pw = + FileUtilities.openUTF8Writer( + Settings.UnicodeTools.DICT_DIR, "kHYPLCDPF_frequency.txt"); pw.write("\uFEFF"); pw.println("No.\tPercentage\tAccummulated\tHex\tChar"); @@ -61,7 +62,7 @@ public static void test() throws IOException{ double cummulative = 0; double cummulativePercentage = 0; while (it.hasNext()) { - final Pair item = (Pair)it.next(); + final Pair item = (Pair) it.next(); final Long total = (Long) item.first; final Integer cp = (Integer) item.second; final double current = total.longValue(); @@ -70,14 +71,18 @@ public static void test() throws IOException{ cummulativePercentage += percentage; pw.println( ++counter - //+ "\t" + number.format(current) - //+ "\t" + number.format(cummulative) - + "\t" + percent.format(percentage) - + "\t" + percent3.format(cummulativePercentage) - + "\t" + Integer.toHexString(cp.intValue()).toUpperCase() - + "\t" + UTF16.valueOf(cp.intValue())); + // + "\t" + number.format(current) + // + "\t" + number.format(cummulative) + + "\t" + + percent.format(percentage) + + "\t" + + percent3.format(cummulativePercentage) + + "\t" + + Integer.toHexString(cp.intValue()).toUpperCase() + + 
"\t" + + UTF16.valueOf(cp.intValue())); } - //pw.println("Grand total: " + (long)grandTotal); + // pw.println("Grand total: " + (long)grandTotal); pw.close(); } } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/CodePointProperty.java b/unicodetools/src/main/java/org/unicode/text/UCD/CodePointProperty.java index 9d1b4efc5..1bb15039b 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/CodePointProperty.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/CodePointProperty.java @@ -1,29 +1,29 @@ package org.unicode.text.UCD; -import java.util.Iterator; import com.ibm.icu.text.UnicodeSet; +import java.util.Iterator; // Enumerated properties will be IntCodePointProperty. // The string values they return will be the property value names. // Binary properties are Enumerated properties. They return 0 or 1 -abstract public class CodePointProperty { +public abstract class CodePointProperty { // styles for names and string values static final byte SHORT = 0, DEFAULT = 1, LONG = 2, NORMAL_LIMIT = 3; // gets the property name - abstract public String getName(byte style); + public abstract String getName(byte style); // value may also be numeric, etc, but this returns string equivalent. - abstract public String getValue(int codePoint, byte style); + public abstract String getValue(int codePoint, byte style); // returns true if the code point has the value // works with any style that getValue takes - abstract public boolean hasValue(int codePoint, String value); + public abstract boolean hasValue(int codePoint, String value); // returns the set of all code points with that value. 
// same effect as using hasValue one by one, but faster internal implementation - abstract public UnicodeSet getSet(String value); + public abstract UnicodeSet getSet(String value); // returns a list of all possible values // logically the same as looping from 0..10FFFF with getValue and getStyleLimit, @@ -45,19 +45,25 @@ public boolean isUniformOverCategory(byte generalCategory) { // subclasses - static abstract public class IntCodePointProperty extends CodePointProperty { + public abstract static class IntCodePointProperty extends CodePointProperty { abstract int getNumericValue(int codePoint); + abstract int getMaxValue(); + abstract int getMinValue(); + static Iterator getAllNumericValues() { return null; } } - static abstract public class DoubleCodePointProperty extends CodePointProperty { + public abstract static class DoubleCodePointProperty extends CodePointProperty { abstract double getNumericValue(int codePoint); + abstract double getMaxValue(); + abstract double getMinValue(); + static Iterator getAllNumericValues() { return null; } @@ -67,7 +73,7 @@ static Iterator getAllNumericValues() { // register a new property static void register(CodePointProperty newProp) { - //... + // ... 
} // finds a registered property by name @@ -96,11 +102,13 @@ static Iterator getAllRegistered() { public boolean hasParameters() { return false; } + public void setParameters(String parameters) {} + public String getParameters() { return null; } // that way we could have [[:letter:]&[:contains(dot):]] -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/CompactName.java b/unicodetools/src/main/java/org/unicode/text/UCD/CompactName.java index 49f842c86..471d8e4e5 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/CompactName.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/CompactName.java @@ -1,14 +1,12 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/CompactName.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/CompactName.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; import java.io.IOException; @@ -31,10 +29,8 @@ public static void main(String[] args) throws IOException { final String s = CompactName.stringFromToken(i); System.out.println(s); } - } - static final char[] compactMap = new char[128]; static final char[] compactUnmap = new char[128]; @@ -108,7 +104,8 @@ static int addTokenForString(String s, int lead, int trail) { tokenList[lastToken++] = value; if (DEBUG) { - System.out.println("'" + s + "', tokenList[" + result + "] = lead: " + lead + ", trail: " + trail); + System.out.println( + "'" + s + "', tokenList[" + result + "] = lead: " + lead + ", trail: " + trail); final String roundTrip = stringFromToken(result); if (!roundTrip.equals(s)) { System.out.println("\t*** No Round Trip: '" + roundTrip + "'"); @@ -160,10 +157,9 @@ static int tokenFromString(String s) { if (in == null) { return -1; } - return ((Integer)in).intValue(); + return ((Integer) in).intValue(); } - static int addWord(String s) { final int result = tokenFromString(s); @@ -233,8 +229,7 @@ static int addWord(String s) { } static void show(String s, String firstPart, String lastPart, String comment) { - System.out.println((s) + " => '" + (firstPart) - + "' # '" + (lastPart) + "' " + comment); + System.out.println((s) + " => '" + (firstPart) + "' # '" + (lastPart) + "' " + comment); } static void startLines() { @@ -259,7 +254,7 @@ static int addLine(String s) { } final String firstPart = s.substring(0, i); - final String lastPart = s.substring(i+1); + final String lastPart = s.substring(i + 1); final int lead = tokenFromString(firstPart); final int trail = tokenFromString(lastPart); @@ -302,4 +297,4 @@ static int addLine(String s) { System.out.println("SHOULD HAVE MATCHED!!"); throw new IllegalArgumentException("SHOULD HAVE MATCHED!! 
" + s); } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/Compare14652.java b/unicodetools/src/main/java/org/unicode/text/UCD/Compare14652.java index ff7d78cb7..6954d11ba 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/Compare14652.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/Compare14652.java @@ -1,25 +1,21 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/Compare14652.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/Compare14652.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; +import com.ibm.icu.text.UnicodeSet; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; - import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.UnicodeSet; - // quick and dirty function for grabbing contents of ISO 14652 file public class Compare14652 implements UCD_Types { @@ -30,50 +26,50 @@ public static UnicodeSet getSet(int prop, byte propValue) { return UnifiedBinaryProperty.make(prop | propValue).getSet(); } - static UnicodeSet - titleSet = getSet(CATEGORY, Lt), - combiningSet = getSet(CATEGORY, Mc) - .addAll(getSet(CATEGORY, Me)) - .addAll(getSet(CATEGORY, Mn)), - zSet = getSet(CATEGORY, Zs) - .addAll(getSet(CATEGORY, Zl)) - .addAll(getSet(CATEGORY, Zp)), - pSet = getSet(CATEGORY, Pd) - .addAll(getSet(CATEGORY, Ps)) - .addAll(getSet(CATEGORY, Pe)) - .addAll(getSet(CATEGORY, Pc)) - .addAll(getSet(CATEGORY, Po)) - .addAll(getSet(CATEGORY, Pi)) - .addAll(getSet(CATEGORY, Pf)), - sSet = getSet(CATEGORY, Sm) - .addAll(getSet(CATEGORY, Sc)) - .addAll(getSet(CATEGORY, Sk)) - .addAll(getSet(CATEGORY, So)), - noSet = getSet(CATEGORY, No), - csSet = getSet(CATEGORY, Cs), - cfSet = getSet(CATEGORY, Cf), - cnSet = getSet(CATEGORY, Cn), - circled = getSet(DECOMPOSITION_TYPE, COMPAT_CIRCLE), - whitespaceSet = getSet(BINARY_PROPERTIES, White_space), - alphaSet = getSet(DERIVED, PropAlphabetic).addAll(combiningSet), - lowerSet = getSet(DERIVED, PropLowercase).addAll(titleSet).removeAll(circled), - upperSet = getSet(DERIVED, PropUppercase).addAll(titleSet).removeAll(circled), - digitSet = getSet(CATEGORY, Nd), - xdigitSet = new UnicodeSet("[a-fA-F\uFF21-\uFF26\uFF41-\uFF46]").addAll(digitSet), - spaceSet = whitespaceSet.size() == 0 ? 
zSet : whitespaceSet, + static UnicodeSet titleSet = getSet(CATEGORY, Lt), + combiningSet = + getSet(CATEGORY, Mc).addAll(getSet(CATEGORY, Me)).addAll(getSet(CATEGORY, Mn)), + zSet = getSet(CATEGORY, Zs).addAll(getSet(CATEGORY, Zl)).addAll(getSet(CATEGORY, Zp)), + pSet = + getSet(CATEGORY, Pd) + .addAll(getSet(CATEGORY, Ps)) + .addAll(getSet(CATEGORY, Pe)) + .addAll(getSet(CATEGORY, Pc)) + .addAll(getSet(CATEGORY, Po)) + .addAll(getSet(CATEGORY, Pi)) + .addAll(getSet(CATEGORY, Pf)), + sSet = + getSet(CATEGORY, Sm) + .addAll(getSet(CATEGORY, Sc)) + .addAll(getSet(CATEGORY, Sk)) + .addAll(getSet(CATEGORY, So)), + noSet = getSet(CATEGORY, No), + csSet = getSet(CATEGORY, Cs), + cfSet = getSet(CATEGORY, Cf), + cnSet = getSet(CATEGORY, Cn), + circled = getSet(DECOMPOSITION_TYPE, COMPAT_CIRCLE), + whitespaceSet = getSet(BINARY_PROPERTIES, White_space), + alphaSet = getSet(DERIVED, PropAlphabetic).addAll(combiningSet), + lowerSet = getSet(DERIVED, PropLowercase).addAll(titleSet).removeAll(circled), + upperSet = getSet(DERIVED, PropUppercase).addAll(titleSet).removeAll(circled), + digitSet = getSet(CATEGORY, Nd), + xdigitSet = new UnicodeSet("[a-fA-F\uFF21-\uFF26\uFF41-\uFF46]").addAll(digitSet), + spaceSet = whitespaceSet.size() == 0 ? 
zSet : whitespaceSet, controlSet = getSet(CATEGORY, Cc), punctSet = new UnicodeSet(pSet).addAll(sSet), - graphSet = new UnicodeSet(0,0x10ffff) - .removeAll(controlSet) - //.removeAll(getSet(CATEGORY, Cf)) - .removeAll(csSet) - .removeAll(cnSet) - .removeAll(zSet), - // Cc, Cf, Cs, Cn, Z - blankSet = new UnicodeSet(spaceSet).removeAll(new UnicodeSet("[\\u000A-\\u000D\\u0085]")) - .removeAll(getSet(CATEGORY, Zl)) - .removeAll(getSet(CATEGORY, Zp)); - + graphSet = + new UnicodeSet(0, 0x10ffff) + .removeAll(controlSet) + // .removeAll(getSet(CATEGORY, Cf)) + .removeAll(csSet) + .removeAll(cnSet) + .removeAll(zSet), + // Cc, Cf, Cs, Cn, Z + blankSet = + new UnicodeSet(spaceSet) + .removeAll(new UnicodeSet("[\\u000A-\\u000D\\u0085]")) + .removeAll(getSet(CATEGORY, Zl)) + .removeAll(getSet(CATEGORY, Zp)); static class Prop { String name; @@ -103,7 +99,7 @@ static class Prop { } else if (name.equals("space")) { guess = wsname; guessContents = spaceSet; - //Utility.showSetNames("Whitespace", spaceSet, true, Default.ucd); + // Utility.showSetNames("Whitespace", spaceSet, true, Default.ucd); } else if (name.equals("cntrl")) { guess = "gc=Cc"; guessContents = controlSet; @@ -121,21 +117,20 @@ static class Prop { guessContents = combiningSet; } - /*upper -lower -alpha -digit -outdigit -space -cntrl -punct -graph -xdigit -blank -toupper -tolower - */ + lower + alpha + digit + outdigit + space + cntrl + punct + graph + xdigit + blank + toupper + tolower + */ } void show(PrintWriter pw) { @@ -162,8 +157,9 @@ void show(PrintWriter pw) { pw.println("**************************************************"); pw.println(name); pw.println("**************************************************"); - Utility.showSetDifferences(pw, name, contents, guess, guessContents, false, true, null, Default.ucd()); - //pw.println(props[i].contents); + Utility.showSetDifferences( + pw, name, contents, guess, guessContents, false, true, null, Default.ucd()); + // pw.println(props[i].contents); } } @@ -173,21 
+169,30 @@ void show(PrintWriter pw) { public static void main(String[] args) throws IOException { final String version = Default.ucd().getVersion(); - final PrintWriter log = Utility.openPrintWriterGenDir("Diff14652_" + version + ".txt", Utility.UTF8_WINDOWS); + final PrintWriter log = + Utility.openPrintWriterGenDir( + "Diff14652_" + version + ".txt", Utility.UTF8_WINDOWS); try { log.write('\uFEFF'); log.print("Version: " + version); if (false) { - final UnicodeSet ID = getSet(DERIVED, ID_Start).addAll(getSet(DERIVED, ID_Continue_NO_Cf)); - final UnicodeSet XID = getSet(DERIVED, Mod_ID_Start).addAll(getSet(DERIVED, Mod_ID_Continue_NO_Cf)); - final UnicodeSet alphanumSet = new UnicodeSet(alphaSet).addAll(digitSet).addAll(getSet(CATEGORY, Pc)); + final UnicodeSet ID = + getSet(DERIVED, ID_Start).addAll(getSet(DERIVED, ID_Continue_NO_Cf)); + final UnicodeSet XID = + getSet(DERIVED, Mod_ID_Start) + .addAll(getSet(DERIVED, Mod_ID_Continue_NO_Cf)); + final UnicodeSet alphanumSet = + new UnicodeSet(alphaSet).addAll(digitSet).addAll(getSet(CATEGORY, Pc)); Utility.showSetDifferences("ID", ID, "XID", XID, false, Default.ucd()); - Utility.showSetDifferences("ID", ID, "Alphabetic+Digit+Pc", alphanumSet, false, Default.ucd()); + Utility.showSetDifferences( + "ID", ID, "Alphabetic+Digit+Pc", alphanumSet, false, Default.ucd()); } - final BufferedReader br = Utility.openReadFile(Settings.UnicodeTools.DATA_DIR + "ISO14652_CTYPE.txt", Utility.LATIN1); + final BufferedReader br = + Utility.openReadFile( + Settings.UnicodeTools.DATA_DIR + "ISO14652_CTYPE.txt", Utility.LATIN1); while (true) { String line = br.readLine(); if (line == null) { @@ -213,7 +218,7 @@ public static void main(String[] args) throws IOException { continue; } if (ch == '<') { - addItems(line, props[propCount-1].contents); + addItems(line, props[propCount - 1].contents); } else { // new property System.out.println(line); @@ -237,30 +242,29 @@ public static void main(String[] args) throws IOException { 
log.println("**************************************************"); log.println(); /* -alpha, digit, punct, cntrl are all disjoint -space, cntrl, blank are pairwise disjoint with any of alpha, digit, xdigit -alpha includes upper, lower -graph includes alpha, digit, punct -print includes graph -xdigit includes digit - */ - final Prop - alpha = getProp("ISO_14652_alpha"), - upper = getProp("ISO_14652_upper"), - lower = getProp("ISO_14652_lower"), - graph = getProp("ISO_14652_graph"), - //print = getProp("ISO_14652_print"), - punct = getProp("ISO_14652_punct"), - digit = getProp("ISO_14652_digit"), - xdigit = getProp("ISO_14652_xdigit"), - space = getProp("ISO_14652_space"), - blank = getProp("ISO_14652_blank"), - cntrl = getProp("ISO_14652_cntrl"); + alpha, digit, punct, cntrl are all disjoint + space, cntrl, blank are pairwise disjoint with any of alpha, digit, xdigit + alpha includes upper, lower + graph includes alpha, digit, punct + print includes graph + xdigit includes digit + */ + final Prop alpha = getProp("ISO_14652_alpha"), + upper = getProp("ISO_14652_upper"), + lower = getProp("ISO_14652_lower"), + graph = getProp("ISO_14652_graph"), + // print = getProp("ISO_14652_print"), + punct = getProp("ISO_14652_punct"), + digit = getProp("ISO_14652_digit"), + xdigit = getProp("ISO_14652_xdigit"), + space = getProp("ISO_14652_space"), + blank = getProp("ISO_14652_blank"), + cntrl = getProp("ISO_14652_cntrl"); checkDisjoint(log, new Prop[] {alpha, digit, punct, cntrl}); - final Prop [] l1 = new Prop[] {space, cntrl, blank}; - final Prop [] l2 = new Prop[] {alpha, digit, xdigit}; + final Prop[] l1 = new Prop[] {space, cntrl, blank}; + final Prop[] l2 = new Prop[] {alpha, digit, xdigit}; for (int i = 0; i < l1.length; ++i) { for (int j = i + 1; j < l2.length; ++j) { checkDisjoint(log, l1[i], l2[j]); @@ -271,33 +275,35 @@ public static void main(String[] args) throws IOException { checkIncludes(log, graph, alpha); checkIncludes(log, graph, digit); checkIncludes(log, 
graph, punct); - //checkIncludes(log, print, graph); + // checkIncludes(log, print, graph); checkIncludes(log, xdigit, digit); - // possibly alpha, digit, punct, cntrl, space cover the !(Cn,Cs) - final UnicodeSet trRemainder = new UnicodeSet(cnSet) - .complement() - .removeAll(csSet) - .removeAll(digit.contents) - .removeAll(punct.contents) - .removeAll(alpha.contents) - .removeAll(cntrl.contents) - .removeAll(space.contents); + final UnicodeSet trRemainder = + new UnicodeSet(cnSet) + .complement() + .removeAll(csSet) + .removeAll(digit.contents) + .removeAll(punct.contents) + .removeAll(alpha.contents) + .removeAll(cntrl.contents) + .removeAll(space.contents); Utility.showSetNames(log, "TR Remainder: ", trRemainder, false, false, Default.ucd()); - final UnicodeSet propRemainder = new UnicodeSet(cnSet) - .complement() - .removeAll(csSet) - //.removeAll(noSet) - //.removeAll(cfSet) - .removeAll(digit.guessContents) - .removeAll(punct.guessContents) - .removeAll(alpha.guessContents) - .removeAll(cntrl.guessContents) - .removeAll(space.guessContents); - Utility.showSetNames(log, "Prop Remainder: ", propRemainder, false, false, Default.ucd()); + final UnicodeSet propRemainder = + new UnicodeSet(cnSet) + .complement() + .removeAll(csSet) + // .removeAll(noSet) + // .removeAll(cfSet) + .removeAll(digit.guessContents) + .removeAll(punct.guessContents) + .removeAll(alpha.guessContents) + .removeAll(cntrl.guessContents) + .removeAll(space.guessContents); + Utility.showSetNames( + log, "Prop Remainder: ", propRemainder, false, false, Default.ucd()); /* checkDisjoint(new Prop[] {alpha, digit, punct, cntrl}); @@ -339,7 +345,8 @@ static void checkDisjoint(PrintWriter log, Prop prop1, Prop prop2) { checkDisjoint(log, prop1.guess, prop1.guessContents, prop2.guess, prop2.guessContents); } - static void checkDisjoint(PrintWriter log, String name, UnicodeSet set, String name2, UnicodeSet set2) { + static void checkDisjoint( + PrintWriter log, String name, UnicodeSet set, String 
name2, UnicodeSet set2) { if (set.containsSome(set2)) { log.println(); log.println("Fails test: " + name + " disjoint-with " + name2); @@ -353,7 +360,8 @@ static void checkIncludes(PrintWriter log, Prop prop1, Prop prop2) { checkIncludes(log, prop1.guess, prop1.guessContents, prop2.guess, prop2.guessContents); } - static void checkIncludes(PrintWriter log, String name, UnicodeSet set, String name2, UnicodeSet set2) { + static void checkIncludes( + PrintWriter log, String name, UnicodeSet set, String name2, UnicodeSet set2) { if (!set.containsAll(set2)) { log.println(); log.println("Fails test:" + name + " includes " + name2); @@ -378,8 +386,8 @@ static void addItems(String line, UnicodeSet contents) { int start, end; final int rangePoint = piece.indexOf(".."); if (rangePoint >= 0) { - start = parse(piece.substring(0,rangePoint)); - end = parse(piece.substring(rangePoint+2)); + start = parse(piece.substring(0, rangePoint)); + end = parse(piece.substring(rangePoint + 2)); } else { start = end = parse(piece); } @@ -391,18 +399,18 @@ static int parse(String piece) { if (!piece.startsWith("")) { throw new IllegalArgumentException("Bogus code point: " + piece); } - return Integer.parseInt(piece.substring(2,piece.length()-1), 16); + return Integer.parseInt(piece.substring(2, piece.length() - 1), 16); } static Prop getProp(String name) { - //System.out.println("Searching for: " + name); + // System.out.println("Searching for: " + name); for (int i = 0; i < propCount; ++i) { - //System.out.println("Checking: " + props[i].name); + // System.out.println("Checking: " + props[i].name); if (props[i].name.equals(name)) { return props[i]; } } - //System.out.println("Missed"); + // System.out.println("Missed"); return null; } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/CompareProperties.java b/unicodetools/src/main/java/org/unicode/text/UCD/CompareProperties.java index 3b7e9e9ed..c96df8b07 100644 --- 
a/unicodetools/src/main/java/org/unicode/text/UCD/CompareProperties.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/CompareProperties.java @@ -1,16 +1,16 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/CompareProperties.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/CompareProperties.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.IOException; import java.io.PrintWriter; import java.text.NumberFormat; @@ -24,14 +24,9 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - -import org.unicode.text.utility.Settings; import org.unicode.text.utility.UnicodeDataFile.FileInfix; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - public class CompareProperties implements UCD_Types { static final boolean DO_DISJOINT = false; @@ -77,16 +72,18 @@ public int compare(Object o1, Object o2) { } /* - * + * * @author Davis * * Reverses the order of a comparison, for getting a list in reverse order */ public static class InverseComparator implements Comparator { private final Comparator other; + public InverseComparator(Comparator other) { this.other = other; } + @Override public int compare(Object a, Object b) { return other.compare(b, a); @@ -94,7 +91,7 @@ public int compare(Object a, Object b) { } /* - * + * * @author Davis * * Reverses the order of a comparison, for getting a list in reverse order @@ -102,47 +99,42 @@ public int compare(Object a, Object b) { public static class MethodComparator implements Comparator { @Override public int compare(Object a, Object b) { - return ((Comparable)a).compareTo(b); + return ((Comparable) a).compareTo(b); } } - public final static class UnicodeSetComparator implements Comparator { + public static final class UnicodeSetComparator implements Comparator { /** - * Compares two UnicodeSets, producing a transitive ordering. - * The ordering is based on the first codepoint that differs between them. - * @return -1 if first set contains the first different code point - * 1 if the second set does. - * 0 if there is no difference. 
- * If compareTo were added to UnicodeSet, this can be optimized to use list[i]. - * @author Davis + * Compares two UnicodeSets, producing a transitive ordering. The ordering is based on the + * first codepoint that differs between them. * + * @return -1 if first set contains the first different code point 1 if the second set does. + * 0 if there is no difference. If compareTo were added to UnicodeSet, this can be + * optimized to use list[i]. + * @author Davis */ @Override public int compare(Object o1, Object o2) { final UnicodeSetIterator it1 = new UnicodeSetIterator((UnicodeSet) o1); final UnicodeSetIterator it2 = new UnicodeSetIterator((UnicodeSet) o2); while (it1.nextRange()) { - if (!it2.nextRange()) - { + if (!it2.nextRange()) { return -1; // first has range while second exhausted } - if (it1.codepoint < it2.codepoint) - { + if (it1.codepoint < it2.codepoint) { return -1; // first has code point not in second } if (it1.codepoint > it2.codepoint) { return 1; } - if (it1.codepointEnd < it2.codepointEnd) - { + if (it1.codepointEnd < it2.codepointEnd) { return 1; // second has codepoint not in first } if (it1.codepointEnd > it2.codepointEnd) { return -1; } } - if (it2.nextRange()) - { + if (it2.nextRange()) { return 1; // second has range while first is exhausted } return 0; // otherwise we ran out in both of them, so equal @@ -195,7 +187,8 @@ private void fillPropertyValues() { value = new UnicodeSet(); map.put(probe.clone(), value); // Utility.fixDot(); - // System.out.println("Set Size: " + map.size() + ", total: " + total + ", " + Default.ucd.getCodeAndName(cp)); + // System.out.println("Set Size: " + map.size() + ", total: " + total + ", " + + // Default.ucd.getCodeAndName(cp)); } value.add(cp); } @@ -230,7 +223,8 @@ private void getProperties() { System.out.println("\tSkipping " + getPropName(up) + "; default value"); continue; } - // System.out.println(Utility.hex(i) + " " + up.getName(LONG) + "(" + up.getName(SHORT) + ")"); + // 
System.out.println(Utility.hex(i) + " " + up.getName(LONG) + "(" + up.getName(SHORT) + // + ")"); // System.out.println("\t" + up.getValue(LONG) + "(" + up.getValue(SHORT) + ")"); sets[count] = new UnicodeSet(); disjoints[count] = new BitSet(); @@ -245,8 +239,10 @@ private void getProperties() { public void printPartition() throws IOException { System.out.println("Set Size: " + map.size()); - final PrintWriter output = Utility.openPrintWriterGenDir("Partition" - + FileInfix.getDefault().getFileSuffix(".txt"), Utility.LATIN1_WINDOWS); + final PrintWriter output = + Utility.openPrintWriterGenDir( + "Partition" + FileInfix.getDefault().getFileSuffix(".txt"), + Utility.LATIN1_WINDOWS); final Iterator it = map.keySet().iterator(); while (it.hasNext()) { @@ -271,8 +267,10 @@ public void printPartition() throws IOException { public void printStatistics() throws IOException { System.out.println("Set Size: " + map.size()); - final PrintWriter output = Utility.openPrintWriterGenDir("Statistics" - + FileInfix.getDefault().getFileSuffix(".txt"), Utility.LATIN1_WINDOWS); + final PrintWriter output = + Utility.openPrintWriterGenDir( + "Statistics" + FileInfix.getDefault().getFileSuffix(".txt"), + Utility.LATIN1_WINDOWS); System.out.println("Finding disjoints/contains"); for (int i = 0; i < count; ++i) { @@ -310,7 +308,7 @@ public void printStatistics() throws IOException { tempContains[i].andNot(contains[j]); } } - b = disjoints[i]; // don't worry + b = disjoints[i]; // don't worry for (int j = 0; j < b.size(); ++j) { if (b.get(j)) { b.andNot(contains[j]); @@ -333,7 +331,7 @@ public void printStatistics() throws IOException { Iterator it = m.keySet().iterator(); while (it.hasNext()) { final Object key = it.next(); - final int index = ((Integer)m.get(key)).intValue(); + final int index = ((Integer) m.get(key)).intValue(); boolean haveName = printBitSet(output, index, "EQUALS: ", equals[index], false); haveName = printBitSet(output, index, "CONTAINS: ", contains[index], haveName); 
haveName = printBitSet(output, index, "IS CONTAINED IN: ", isin[index], haveName); @@ -360,7 +358,8 @@ public void printStatistics() throws IOException { output.close(); } - private boolean printBitSet(PrintWriter output, int index, String title, BitSet b, boolean haveName) { + private boolean printBitSet( + PrintWriter output, int index, String title, BitSet b, boolean haveName) { if (!b.isEmpty()) { if (!haveName) { output.println(); @@ -390,10 +389,10 @@ private boolean printBitSet(PrintWriter output, int index, String title, BitSet } /* - UnicodeSet a_b = new UnicodeSet(); - UnicodeSet ab = new UnicodeSet(); - UnicodeSet _ab = new UnicodeSet(); - */ + UnicodeSet a_b = new UnicodeSet(); + UnicodeSet ab = new UnicodeSet(); + UnicodeSet _ab = new UnicodeSet(); + */ /* a_b.set(sets[i]).removeAll(sets[j]); ab.set(sets[i]).retainAll(sets[j]); @@ -411,14 +410,20 @@ private boolean printBitSet(PrintWriter output, int index, String title, BitSet if (gotName) output.println(); */ - private boolean showDiff(PrintWriter output, String title, int propIndex, UnicodeSet a_b, - double total, double limit, boolean gotName) { + private boolean showDiff( + PrintWriter output, + String title, + int propIndex, + UnicodeSet a_b, + double total, + double limit, + boolean gotName) { if (0 < a_b.size() && a_b.size() < limit) { if (!gotName) { gotName = true; output.print("\t" + getPropName(propIndex)); } - output.print("\t" + title + percent.format(a_b.size()/total)); + output.print("\t" + title + percent.format(a_b.size() / total)); } return gotName; } @@ -437,15 +442,18 @@ public static void listDifferences() throws IOException { Utility.openPrintWriterGenDir( "PropertyDifferences" + FileInfix.getDefault().getFileSuffix(".txt"), Utility.LATIN1_UNIX); - output.println("# Listing of relationships among properties, suitable for analysis by spreadsheet"); + output.println( + "# Listing of relationships among properties, suitable for analysis by spreadsheet"); output.println("# Generated 
for " + Default.ucd().getVersion()); output.println(Utility.generateDateLine()); output.println("# P1\tP2\tR(P1,P2)\tC(P1&P2)\tC(P1-P2)\tC(P2-P1)"); - for (int i = 1; i < UCD_Types.LIMIT_ENUM; ++i) { final int iType = i & 0xFF00; - if (iType == UCD_Types.JOINING_GROUP || iType == UCD_Types.AGE || iType == UCD_Types.COMBINING_CLASS || iType == UCD_Types.SCRIPT) { + if (iType == UCD_Types.JOINING_GROUP + || iType == UCD_Types.AGE + || iType == UCD_Types.COMBINING_CLASS + || iType == UCD_Types.SCRIPT) { continue; } final UCDProperty upi = UnifiedBinaryProperty.make(i, Default.ucd()); @@ -470,9 +478,12 @@ public static void listDifferences() throws IOException { output.println("#" + iNameLong); int last = -1; - for (int j = i+1; j < UCD_Types.LIMIT_ENUM; ++j) { + for (int j = i + 1; j < UCD_Types.LIMIT_ENUM; ++j) { final int jType = j & 0xFF00; - if (jType == UCD_Types.JOINING_GROUP || jType == UCD_Types.AGE || jType == UCD_Types.COMBINING_CLASS || jType == UCD_Types.SCRIPT + if (jType == UCD_Types.JOINING_GROUP + || jType == UCD_Types.AGE + || jType == UCD_Types.COMBINING_CLASS + || jType == UCD_Types.SCRIPT || (jType == iType && jType != UCD_Types.BINARY_PROPERTIES)) { continue; } @@ -487,13 +498,12 @@ public static void listDifferences() throws IOException { continue; } - if ((j >> 8) != last) { last = j >> 8; - System.out.println(); - System.out.print("\t" + UCD_Names.SHORT_UNIFIED_PROPERTIES[last]); - output.flush(); - output.println("#\t" + UCD_Names.SHORT_UNIFIED_PROPERTIES[last]); + System.out.println(); + System.out.print("\t" + UCD_Names.SHORT_UNIFIED_PROPERTIES[last]); + output.flush(); + output.println("#\t" + UCD_Names.SHORT_UNIFIED_PROPERTIES[last]); } else { System.out.print('.'); } @@ -503,7 +513,9 @@ public static void listDifferences() throws IOException { for (int cp = 0; cp <= 0x10FFFF; ++cp) { final int cat = Default.ucd().getCategory(cp); - if (cat == UCD_Types.UNASSIGNED || cat == UCD_Types.PRIVATE_USE || cat == UCD_Types.SURROGATE) { + if (cat 
== UCD_Types.UNASSIGNED + || cat == UCD_Types.PRIVATE_USE + || cat == UCD_Types.SURROGATE) { continue; } if (!Default.ucd().isAllocated(cp)) { @@ -532,24 +544,47 @@ public static void listDifferences() throws IOException { } final String jNameShort = upj.getFullName(UCD_Types.SHORT); - //String jNameLong = ubp.getFullID(j, LONG); + // String jNameLong = ubp.getFullID(j, LONG); - final String rel = bothCount == 0 ? "DISJOINT" - : i_jPropCount == 0 && j_iPropCount == 0 ? "EQUALS" - : i_jPropCount == 0 ? "CONTAINS" // depends on reverse output - : j_iPropCount == 0 ? "CONTAINS" - : "OVERLAPS"; + final String rel = + bothCount == 0 + ? "DISJOINT" + : i_jPropCount == 0 && j_iPropCount == 0 + ? "EQUALS" + : i_jPropCount == 0 + ? "CONTAINS" // depends on reverse output + : j_iPropCount == 0 ? "CONTAINS" : "OVERLAPS"; if (j_iPropCount > i_jPropCount) { // reverse output - output.println(jNameShort + "\t" + iNameShort + "\t" + rel - + "\t" + bothCount + "\t" + j_iPropCount + "\t" + i_jPropCount); + output.println( + jNameShort + + "\t" + + iNameShort + + "\t" + + rel + + "\t" + + bothCount + + "\t" + + j_iPropCount + + "\t" + + i_jPropCount); } else { - output.println(iNameShort + "\t" + jNameShort + "\t" + rel - + "\t" + bothCount + "\t" + i_jPropCount + "\t" + j_iPropCount); + output.println( + iNameShort + + "\t" + + jNameShort + + "\t" + + rel + + "\t" + + bothCount + + "\t" + + i_jPropCount + + "\t" + + j_iPropCount); } } } output.close(); } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/ConvertUCD.java b/unicodetools/src/main/java/org/unicode/text/UCD/ConvertUCD.java index 0cb73b02b..501e4e308 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/ConvertUCD.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/ConvertUCD.java @@ -1,16 +1,15 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * 
others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/ConvertUCD.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/ConvertUCD.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; +import com.ibm.icu.text.UTF16; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -28,18 +27,15 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.text.utility.ChainException; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.UTF16; - - -/** Simple program to merge UCD files into XML. Not yet documented!! +/** + * Simple program to merge UCD files into XML. Not yet documented!! + * * @author Mark Davis */ - public final class ConvertUCD implements UCD_Types { public static final boolean SHOW = false; public static final boolean DEBUG = false; @@ -60,26 +56,44 @@ public final class ConvertUCD implements UCD_Types { public static final String BASE_DIR31 = DATA_DIR + "\\3.1-Update\\"; */ - //public static final String blocksnamePlain = "Blocks.txt"; - //public static final String blocksname31 = "Blocks-4d2.beta"; + // public static final String blocksnamePlain = "Blocks.txt"; + // public static final String blocksname31 = "Blocks-4d2.beta"; - /** First item is file name, rest are field names (skipping character). - * "OMIT" is special -- means don't record + /** + * First item is file name, rest are field names (skipping character). "OMIT" is special -- + * means don't record */ - static String[][] labelList = { // Labels for the incoming files. Labels MUST match field order in file. // IMPORTANT - defaults of form y-=x must occur after x is encountered! // The one exception is "st", which is handled specially. // So file order is important. 
- //* - // 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER N J; ; ;01CC;01CB - // n gc cc bc dm dd dv nv bm on cm, uc lc tc - {"UnicodeData", "n", "gc", "ccc", "bc", "dm", "dd", "dv", "nv", "bm", "on", "OMIT", "*uc", "*lc", "*tc"}, - //{"ExtraProperties", "xp"}, + // * + // 01CA;LATIN CAPITAL LETTER NJ;Lu;0; L; 004E 004A; ; ; ;N ;LATIN CAPITAL LETTER + // N J; ; ;01CC;01CB + // n gc cc bc dm dd dv nv bm on + // cm, uc lc tc + { + "UnicodeData", + "n", + "gc", + "ccc", + "bc", + "dm", + "dd", + "dv", + "nv", + "bm", + "on", + "OMIT", + "*uc", + "*lc", + "*tc" + }, + // {"ExtraProperties", "xp"}, {"PropList", "binary"}, - //{"ExtraProperties", "xp"}, + // {"ExtraProperties", "xp"}, {"EastAsianWidth", "ea", "OMIT"}, {"LineBreak", "lb", "OMIT"}, @@ -89,15 +103,18 @@ public final class ConvertUCD implements UCD_Types { {"ArabicShaping", "OMIT", "jt", "jg"}, {"BidiMirroring", "*bg"}, {"Scripts", "sn"}, - {"BidiBrackets", "bpb", "bpt"}, // 0028; 0029; o # LEFT PARENTHESIS Bidi_Paired_Bracket, Bidi_Paired_Bracket_Type + { + "BidiBrackets", "bpb", "bpt" + }, // 0028; 0029; o # LEFT PARENTHESIS Bidi_Paired_Bracket, Bidi_Paired_Bracket_Type {"VerticalOrientation", "vo", "OMIT"}, - //{"Jamo", "jn"}, - //{"Scripts-1d4", "RANGE", "sn"}, - //{"Age", "*sn"}, - //*/ + // {"Jamo", "jn"}, + // {"Scripts-1d4", "RANGE", "sn"}, + // {"Age", "*sn"}, + // */ /* //*/ }; + static HashMap isHex = new HashMap(); static HashMap defaults = new HashMap(); @@ -108,7 +125,7 @@ public final class ConvertUCD implements UCD_Types { for (int i = 1; i < labels.length; ++i) { boolean hex = false; final String def = null; - //char appendChar = '\u0000'; + // char appendChar = '\u0000'; // pull off "*": hex interpretation if (labels[i].charAt(0) == '*') { // HEX value @@ -136,7 +153,7 @@ public final class ConvertUCD implements UCD_Types { if (hex) { isHex.put(labels[i], ""); } - //if (appendChar != 0) appendDuplicates.put(labels[i], String.valueOf(appendChar)); + // if 
(appendChar != 0) appendDuplicates.put(labels[i], String.valueOf(appendChar)); defaults.put(labels[i], def); } } @@ -227,7 +244,7 @@ public final class ConvertUCD implements UCD_Types { // handles public static final String blocksname = "Blocks"; - //public static final String[][] labelList; + // public static final String[][] labelList; public static final boolean NEWPROPS = true; /* @@ -259,17 +276,19 @@ public final class ConvertUCD implements UCD_Types { */ static final String dataFilePrefix = "UCD_Data"; - // MAIN!! - public static void main (String[] args) throws Exception { + public static void main(String[] args) throws Exception { System.out.println("Building binary version of UCD"); - log = new PrintWriter(new BufferedWriter( - new OutputStreamWriter( - new FileOutputStream(Settings.Output.GEN_DIR + "ConvertUCD-log.txt"), - "UTF8"), - 32*1024)); + log = + new PrintWriter( + new BufferedWriter( + new OutputStreamWriter( + new FileOutputStream( + Settings.Output.GEN_DIR + "ConvertUCD-log.txt"), + "UTF8"), + 32 * 1024)); log.write("\uFEFF"); // BOM try { @@ -352,19 +371,19 @@ void toJava(String version) throws Exception { } static PrintWriter log; - //static String directory = BASE_DIR; - //static Map appendDuplicates = new HashMap(); + // static String directory = BASE_DIR; + // static Map appendDuplicates = new HashMap(); - /** First item in labels is file name, rest are field names (skipping character). - * "OMIT" is special -- means don't record + /** + * First item in labels is file name, rest are field names (skipping character). 
"OMIT" is + * special -- means don't record */ - - List blockData = new LinkedList(); void readBlocks() throws Exception { System.out.println("Reading 'Blocks'"); - final BufferedReader input = Utility.openUnicodeFile(blocksname, version, true, Utility.LATIN1); + final BufferedReader input = + Utility.openUnicodeFile(blocksname, version, true, Utility.LATIN1); String line = ""; try { final String[] parts = new String[20]; @@ -386,11 +405,14 @@ void readBlocks() throws Exception { continue; } - final int count = Utility.split(line,';',parts); + final int count = Utility.split(line, ';', parts); if (count != 3) { throw new ChainException("Bad count in Blocks", null); } - blockData.add(new String[] {Utility.fromHex(parts[0]), Utility.fromHex(parts[1]), parts[2].trim()}); + blockData.add( + new String[] { + Utility.fromHex(parts[0]), Utility.fromHex(parts[1]), parts[2].trim() + }); } } catch (final Exception e) { @@ -416,7 +438,8 @@ void readSemi(String[] labels) throws Exception { if (version.equals(Settings.latestVersion)) { tempVersion = ""; } - final BufferedReader input = Utility.openUnicodeFile(labels[0], tempVersion, true, Utility.LATIN1); + final BufferedReader input = + Utility.openUnicodeFile(labels[0], tempVersion, true, Utility.LATIN1); if (input == null) { System.out.println("COULDN'T OPEN: " + labels[0]); return; @@ -450,7 +473,7 @@ void readSemi(String[] labels) throws Exception { continue; } - int count = Utility.split(line,';',parts); + int count = Utility.split(line, ';', parts); // fix malformed or simple lists. 
@@ -466,11 +489,12 @@ void readSemi(String[] labels) throws Exception { } } else { System.out.println("Too few fields: " + original); - throw new ChainException("too few fields: {0}", + throw new ChainException( + "too few fields: {0}", new Object[] {new Integer(line), new Integer(count)}); } } else if (count > labels.length) { - if (count == labels.length + 1 && parts[count-1].equals("")) { + if (count == labels.length + 1 && parts[count - 1].equals("")) { if (labels[0].equals("SpecialCasing")) { // In SpecialCasing.txt, the condition_list field is optional, // and the semicolon is documented as being a terminator, @@ -486,7 +510,8 @@ void readSemi(String[] labels) throws Exception { } } else { System.out.println("Too many fields: " + original); - throw new ChainException("too many fields: {0}", + throw new ChainException( + "too many fields: {0}", new Object[] {new Integer(line), new Integer(count)}); } } @@ -497,8 +522,8 @@ void readSemi(String[] labels) throws Exception { int cpStart; final int ddot = parts[0].indexOf("."); if (ddot >= 0) { - cpStart = UTF16.charAt(Utility.fromHex(parts[0].substring(0,ddot)), 0); - cpTop = UTF16.charAt(Utility.fromHex(parts[0].substring(ddot+2)), 0); + cpStart = UTF16.charAt(Utility.fromHex(parts[0].substring(0, ddot)), 0); + cpTop = UTF16.charAt(Utility.fromHex(parts[0].substring(ddot + 2)), 0); // System.out.println(Utility.hex(cpStart) + " ... " + Utility.hex(cpTop)); } else { cpStart = UTF16.charAt(Utility.fromHex(parts[0]), 0); @@ -519,7 +544,8 @@ void readSemi(String[] labels) throws Exception { } // END FIX!! 
properties.add(prop); - if (Utility.find(prop, UCD_Names.DeletedProperties, true) == -1) { // only undeleted + if (Utility.find(prop, UCD_Names.DeletedProperties, true) + == -1) { // only undeleted int end = UTF16.charAt(Utility.fromHex(parts[1]), 0); if (end == 0) { end = cpStart; @@ -545,55 +571,69 @@ void readSemi(String[] labels) throws Exception { } else { val = parts[i].trim(); } - if (key.equals("OMIT")) - { + if (key.equals("OMIT")) { continue; } - if (key.equals("RANGE")) - { + if (key.equals("RANGE")) { continue; } - if (val.equals("")) - { + if (val.equals("")) { continue; // skip empty values, they mean default } for (int cps = cpStart; cps <= cpTop; ++cps) { - if (UCD.mapToRepresentative(cps, Integer.MAX_VALUE) != cps) - { - continue; // skip condensed ranges + if (UCD.mapToRepresentative(cps, Integer.MAX_VALUE) != cps) { + continue; // skip condensed ranges } if (key.equals("binary")) { appendCharProperties(cps, val); } else if (key.equals("fc")) { final UData data = getEntry(cps); - final String type = parts[i-1].trim(); - if (type.equals("F") || type.equals("C") || type.equals("E") || type.equals("L")) { + final String type = parts[i - 1].trim(); + if (type.equals("F") + || type.equals("C") + || type.equals("E") + || type.equals("L")) { data.fullCaseFolding = val; - //System.out.println("*<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val)); + // System.out.println("*<" + parts[i-1] + "> Setting " + + // Utility.hex(cps) + ": " + Utility.hex(val)); } if (type.equals("S") || type.equals("C") || type.equals("L")) { data.simpleCaseFolding = val; - //System.out.println("<" + parts[i-1] + "> Setting " + Utility.hex(cps) + ": " + Utility.hex(val)); + // System.out.println("<" + parts[i-1] + "> Setting " + + // Utility.hex(cps) + ": " + Utility.hex(val)); } if (type.equals("I")) { data.simpleCaseFolding = val; setBinaryProperty(cps, CaseFoldTurkishI); if (DEBUG) { - System.out.println("SPOT-CHECK: <" + parts[i-1] + "> Setting " - + 
Utility.hex(cps) + ": " + Utility.hex(val)); + System.out.println( + "SPOT-CHECK: <" + + parts[i - 1] + + "> Setting " + + Utility.hex(cps) + + ": " + + Utility.hex(val)); } } - } else if (labels[0].equals("SpecialCasing") // special handling for special casing + } else if (labels[0].equals( + "SpecialCasing") // special handling for special + // casing && labels[4].equals("sc") && parts[4].trim().length() > 0) { if (i < 4) { if (DEBUG) { - System.out.println("Got special: " + Utility.hex(cps) + ", " - + Utility.hex(key) + ":" + Utility.hex(val)); + System.out.println( + "Got special: " + + Utility.hex(cps) + + ", " + + Utility.hex(key) + + ":" + + Utility.hex(val)); } - addCharData(cps, "sc", parts[4].trim() + ":" + key + ":" + val); + addCharData( + cps, "sc", parts[4].trim() + ":" + key + ":" + val); } } else { /*if (key.equals("sn")) { // SKIP UNDEFINED!! @@ -609,7 +649,7 @@ void readSemi(String[] labels) throws Exception { } catch (final Exception e) { System.err.println("*Exception at: " + line + ", " + e.getMessage()); throw e; - //System.err.println(e.getMessage()); + // System.err.println(e.getMessage()); } } } catch (final Throwable e) { @@ -618,8 +658,8 @@ void readSemi(String[] labels) throws Exception { } finally { input.close(); } - //printValues("JOINING_TYPE", jtSet); - //printValues("JOINING_GROUP", jgSet); + // printValues("JOINING_TYPE", jtSet); + // printValues("JOINING_GROUP", jgSet); } static void printValues(String title, Set s) { @@ -635,7 +675,8 @@ static void printValues(String title, Set s) { int count = 0; while (it.hasNext()) { final String value = it.next(); - System.out.println(" " + value.replace(' ', '-').toUpperCase() + " = " + (count++) + ","); + System.out.println( + " " + value.replace(' ', '-').toUpperCase() + " = " + (count++) + ","); } System.out.println(" LIMIT_" + title + " = " + count); System.out.println(";"); @@ -692,7 +733,8 @@ static void writeXML() throws IOException { String value = Utility.quoteXML((String) 
data.get(label)); output.write(" " + label + "='" + value + "'"); } - *//* + */ + /* output.write("/>\n"); } @@ -710,10 +752,15 @@ void writeJavaData() throws IOException { final int codePoint = -1; System.out.println("Writing " + dataFilePrefix + version); Settings.Output.ensureOutputDirs(); - final DataOutputStream dataOut = new DataOutputStream( - new BufferedOutputStream( - new FileOutputStream(Settings.Output.BIN_DIR + dataFilePrefix + version + ".bin"), - 128*1024)); + final DataOutputStream dataOut = + new DataOutputStream( + new BufferedOutputStream( + new FileOutputStream( + Settings.Output.BIN_DIR + + dataFilePrefix + + version + + ".bin"), + 128 * 1024)); // write header dataOut.writeByte(BINARY_FORMAT); @@ -732,7 +779,7 @@ void writeJavaData() throws IOException { while (it.hasNext()) { final Object cc = it.next(); - //codePoint = UTF32.char32At(cc,0); + // codePoint = UTF32.char32At(cc,0); if (DEBUG) { System.out.println(Utility.hex(cc)); } @@ -754,13 +801,14 @@ void writeJavaData() throws IOException { } System.out.println("Wrote Data " + count); } catch (final Exception e) { - throw new ChainException("Bad data write {0}", new Object [] {Utility.hex(codePoint)}, e); + throw new ChainException( + "Bad data write {0}", new Object[] {Utility.hex(codePoint)}, e); } finally { dataOut.close(); } } - //static String[] xsSplit = new String[40]; + // static String[] xsSplit = new String[40]; // Cache a little bit for speed int getEntryCodePoint = -1; @@ -791,15 +839,13 @@ UData getEntry(int cp) { if (charEntry == null) { charEntry = new UData(cp); charData.put(cc, charEntry); - //charEntry.put("c", cc); + // charEntry.put("c", cc); } getEntryCodePoint = cp; getEntryUData = charEntry; return charEntry; } - /** Adds the character data. Signals duplicates with an exception - */ - + /** Adds the character data. 
Signals duplicates with an exception */ void setBinaryProperty(int cp, int binProp) { final UData charEntry = getEntry(cp); charEntry.binaryProperties |= (1L << binProp); @@ -810,12 +856,12 @@ void appendCharProperties(int cp, String key) { setBinaryProperty(cp, ind); } - /** Adds the character data. Signals duplicates with an exception - */ + /** Adds the character data. Signals duplicates with an exception */ void addCharData(int cp, String key, String value) { - //if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + Utility.quoteJavaString(value)); + // if (cp < 10) System.out.println("A: " + Utility.hex(cp) + ", " + key + ", " + + // Utility.quoteJavaString(value)); final UData charEntry = getEntry(cp); - //if (cp < 10) System.out.println(" " + charEntry); + // if (cp < 10) System.out.println(" " + charEntry); if (SHOW_SAMPLE && cp == 0x221) { System.out.println("Sample: " + cp + ", " + key + ", " + value); @@ -836,24 +882,24 @@ void addCharData(int cp, String key, String value) { charEntry.decompositionType = CANONICAL; if (value.charAt(0) == '<') { final int pos = value.indexOf('>'); - String dType = value.substring(1,pos); + String dType = value.substring(1, pos); if (major < 2) { if (dType.charAt(0) == '+') { dType = dType.substring(1); } } - value = value.substring(pos+1); + value = value.substring(pos + 1); setField(charEntry, "dt", dType); } // FIX OLD if (major < 2) { int oldStyle = value.indexOf('<'); if (oldStyle > 0) { - value = value.substring(0,oldStyle); + value = value.substring(0, oldStyle); } oldStyle = value.indexOf('{'); if (oldStyle > 0) { - value = value.substring(0,oldStyle); + value = value.substring(0, oldStyle); } } setField(charEntry, key, Utility.fromHex(value)); @@ -875,10 +921,10 @@ void addCharData(int cp, String key, String value) { } setField(charEntry, "nv", value); /*} else if (key.equals("jt")) { - jtSet.add(value); - } else if (key.equals("jg")) { - jgSet.add(value); - */ + jtSet.add(value); + } else 
if (key.equals("jg")) { + jgSet.add(value); + */ } else { setField(charEntry, key, value); } @@ -886,7 +932,6 @@ void addCharData(int cp, String key, String value) { System.out.println("Sample Result:"); System.out.println(charEntry); } - } public void setField(UData uData, String fieldName, String fieldValue) { @@ -908,7 +953,8 @@ public void setField(UData uData, String fieldName, String fieldValue) { uData.fullUppercase = fieldValue; } else if (fieldName.equals("sl")) { if (DEBUG) { - System.out.println("Setting full lowercase to " + Utility.hex(fieldValue) + uData); + System.out.println( + "Setting full lowercase to " + Utility.hex(fieldValue) + uData); } uData.fullLowercase = fieldValue; } else if (fieldName.equals("st")) { @@ -922,16 +968,18 @@ public void setField(UData uData, String fieldName, String fieldValue) { } else if (fieldName.equals("xp")) { uData.binaryProperties |= 1L << Utility.lookup(fieldValue, UCD_Names.BP, true); - //UCD_Names.BP_OLD + // UCD_Names.BP_OLD } else if (fieldName.equals("gc")) { - uData.generalCategory = Utility.lookup(fieldValue, UCD_Names.GENERAL_CATEGORY, true); + uData.generalCategory = + Utility.lookup(fieldValue, UCD_Names.GENERAL_CATEGORY, true); // if (major >= 5 && uData.script == Unknown_Script // && uData.generalCategory != Cn // && uData.generalCategory != Cs // && uData.generalCategory != Co) { // uData.script = COMMON_SCRIPT; - // System.out.println("Resetting to Common Script: " + Utility.hex(uData.codePoint)); + // System.out.println("Resetting to Common Script: " + + // Utility.hex(uData.codePoint)); // } } else if (fieldName.equals("bc")) { uData.bidiClass = Utility.lookup(fieldValue, UCD_Names.BIDI_CLASS, true); @@ -953,7 +1001,8 @@ public void setField(UData uData, String fieldName, String fieldValue) { fieldValue = "compat"; } } - uData.decompositionType = Utility.lookup(fieldValue, UCD_Names.LONG_DECOMPOSITION_TYPE, true); + uData.decompositionType = + Utility.lookup(fieldValue, 
UCD_Names.LONG_DECOMPOSITION_TYPE, true); } else if (fieldName.equals("nt")) { uData.numericType = Utility.lookup(fieldValue, UCD_Names.LONG_NUMERIC_TYPE, true); @@ -969,8 +1018,12 @@ public void setField(UData uData, String fieldName, String fieldValue) { if (true) { // codepoint == 0x10D1D) { int debug = 0; } - uData.joiningGroup = Utility.lookupShort(fieldValue, - UCD_Names.OLD_JOINING_GROUP, UCD_Names.JOINING_GROUP, true); + uData.joiningGroup = + Utility.lookupShort( + fieldValue, + UCD_Names.OLD_JOINING_GROUP, + UCD_Names.JOINING_GROUP, + true); } else if (fieldName.equals("nv")) { if (major < 2) { if (fieldValue.equals("-")) { @@ -979,35 +1032,42 @@ public void setField(UData uData, String fieldName, String fieldValue) { } uData.numericValue = Utility.doubleFrom(fieldValue); } else if (fieldName.equals("ccc")) { - uData.combiningClass = (byte)Utility.intFrom(fieldValue); + uData.combiningClass = (byte) Utility.intFrom(fieldValue); if (uData.combiningClass == 9 && major >= 5) { if (DEBUG) { - System.out.println("setting Grapheme_Link " + Utility.hex(uData.codePoint) + "\t" + uData.name); + System.out.println( + "setting Grapheme_Link " + + Utility.hex(uData.codePoint) + + "\t" + + uData.name); } - uData.binaryProperties |= (1<= 5 && (uData.binaryProperties & 1<= 5 && (uData.binaryProperties & + // 1<$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/DerivedProperty.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; + +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.io.PrintWriter; import java.util.BitSet; import java.util.HashMap; import java.util.Map; - import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues.Binary; import org.unicode.text.utility.ChainException; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - public final class DerivedProperty implements UCD_Types { UCD ucdData; @@ -38,11 +35,11 @@ public final class DerivedProperty implements UCD_Types { // ADD CONSTANT to UCD_TYPES - static public UCDProperty make(int derivedPropertyID) { + public static UCDProperty make(int derivedPropertyID) { return make(derivedPropertyID, Default.ucd()); } - static public UCDProperty make(int derivedPropertyID, UCD ucd) { + public static UCDProperty make(int derivedPropertyID, UCD ucd) { if (derivedPropertyID < 0 || derivedPropertyID >= DERIVED_PROPERTY_LIMIT) { return null; } @@ -109,19 +106,18 @@ public String getValue(int cp, int propNumber) { */ private final UCDProperty[] dprops = new UCDProperty[50]; - static final String[] CaseNames = { - "Uppercase", - "Lowercase", - "Mixedcase"}; + static final String[] CaseNames = {"Uppercase", "Lowercase", "Mixedcase"}; class ExDProp extends UCDProperty { Normalizer nfx; + ExDProp(int i) { type = DERIVED_NORMALIZATION; nfx = nf[i]; name = "Expands_On_" + nfx.getName(); shortName = "XO_" + nfx.getName(); } + @Override public boolean hasValue(int cp) { if (ucdData.getDecompositionType(cp) == NONE) { @@ -133,11 +129,12 @@ public boolean hasValue(int cp) { } return false; } - }; + } + ; class NF_UnsafeStartProp extends UCDProperty { Normalizer nfx; - //int prop; + // int prop; 
NF_UnsafeStartProp(int i) { isStandard = false; @@ -146,6 +143,7 @@ class NF_UnsafeStartProp extends UCDProperty { name = nfx.getName() + "_UnsafeStart"; shortName = nfx.getName() + "_SS"; } + @Override public boolean hasValue(int cp) { if (ucdData.getCombiningClass(cp) != 0) { @@ -156,15 +154,13 @@ public boolean hasValue(int cp) { if (ucdData.getCombiningClass(first) != 0) { return true; } - if (nfx.isComposition() - && dprops[NFC_TrailingZero].hasValue(first)) - { + if (nfx.isComposition() && dprops[NFC_TrailingZero].hasValue(first)) { return true; // 1,3 == composing } return false; } - }; - + } + ; /* class HangulSyllableType extends UnicodeProperty { @@ -194,7 +190,6 @@ public boolean hasValue(int cp) { }; */ - class NFC_Prop extends UCDProperty { BitSet bitset; boolean filter = false; @@ -204,18 +199,26 @@ class NFC_Prop extends UCDProperty { isStandard = false; type = DERIVED_NORMALIZATION; final BitSet[] bitsets = new BitSet[3]; - switch(i) { - case NFC_Leading: bitsets[0] = bitset = new BitSet(); break; - case NFC_Resulting: bitsets[2] = bitset = new BitSet(); break; - case NFC_TrailingZero: keepNonZero = false; // FALL THRU - case NFC_TrailingNonZero: bitsets[1] = bitset = new BitSet(); break; + switch (i) { + case NFC_Leading: + bitsets[0] = bitset = new BitSet(); + break; + case NFC_Resulting: + bitsets[2] = bitset = new BitSet(); + break; + case NFC_TrailingZero: + keepNonZero = false; // FALL THRU + case NFC_TrailingNonZero: + bitsets[1] = bitset = new BitSet(); + break; } filter = bitsets[1] != null; nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]); - name = Names[i-NFC_Leading]; - shortName = SNames[i-NFC_Leading]; + name = Names[i - NFC_Leading]; + shortName = SNames[i - NFC_Leading]; } + @Override public boolean hasValue(int cp) { boolean result = bitset.get(cp); @@ -224,21 +227,25 @@ public boolean hasValue(int cp) { } return result; } - final String[] Names = {"NFC_Leading", "NFC_TrailingNonZero", "NFC_TrailingZero", 
"NFC_Resulting"}; + + final String[] Names = { + "NFC_Leading", "NFC_TrailingNonZero", "NFC_TrailingZero", "NFC_Resulting" + }; final String[] SNames = {"NFC_L", "NFC_TNZ", "NFC_TZ", "NFC_R"}; final String[] Description = { - "Characters that can combine with following characters in NFC", - "Characters that can combine with previous characters in NFC, and have non-zero combining class", - "Characters that can combine with previous characters in NFC, and have zero combining class", - "Characters that can result from a combination of other characters in NFC", + "Characters that can combine with following characters in NFC", + "Characters that can combine with previous characters in NFC, and have non-zero combining class", + "Characters that can combine with previous characters in NFC, and have zero combining class", + "Characters that can result from a combination of other characters in NFC", }; - }; + } + ; class GenDProp extends UCDProperty { Normalizer nfx; Normalizer nfComp = null; - GenDProp (int i) { + GenDProp(int i) { isStandard = false; setValueType(STRING_PROP); type = DERIVED_NORMALIZATION; @@ -274,32 +281,37 @@ public String getValue(int cp, byte style) { final String normal = nfx.normalize(cps); if (!comp.equals(normal)) { final String norm = Utility.hex(normal); - final String pad = Utility.repeat(" ", 14-norm.length()); + final String pad = Utility.repeat(" ", 14 - norm.length()); cacheStr = name + "; " + norm + pad; } } return cacheStr; - //if (cp >= 0xAC00 && cp <= 0xD7A3) return true; - //System.out.println(Utility.hex(cps) + " => " + Utility.hex(nf[i-4].normalize(cps))); + // if (cp >= 0xAC00 && cp <= 0xD7A3) return true; + // System.out.println(Utility.hex(cps) + " => " + Utility.hex(nf[i-4].normalize(cps))); } // default + @Override - public boolean hasValue(int cp) { return getValue(cp).length() != 0; } - }; + public boolean hasValue(int cp) { + return getValue(cp).length() != 0; + } + } + ; class CaseDProp extends UCDProperty { byte val; - 
CaseDProp (int i) { + + CaseDProp(int i) { type = DERIVED_CORE; isStandard = false; val = (i == Missing_Uppercase ? Lu : i == Missing_Lowercase ? Ll : Lt); - name = "Possible_Missing_" + CaseNames[i-Missing_Uppercase]; + name = "Possible_Missing_" + CaseNames[i - Missing_Uppercase]; } + @Override public boolean hasValue(int cp) { final byte cat = ucdData.getCategory(cp); - if (cat == val - || val != Lt && ucdData.getBinaryProperty(cp, Other_Uppercase)) { + if (cat == val || val != Lt && ucdData.getBinaryProperty(cp, Other_Uppercase)) { return false; } final byte xCat = getDecompCat(cp); @@ -308,14 +320,16 @@ public boolean hasValue(int cp) { } return false; } - }; + } + ; class QuickDProp extends UCDProperty { String NO; String MAYBE; Normalizer nfx; - QuickDProp (int i) { - //setValueType((i == NFC || i == NFKC) ? ENUMERATED_PROP : BINARY_PROP); + + QuickDProp(int i) { + // setValueType((i == NFC || i == NFKC) ? ENUMERATED_PROP : BINARY_PROP); setValueType(ENUMERATED_PROP); type = DERIVED_NORMALIZATION; nfx = nf[i]; @@ -342,25 +356,27 @@ public String getListingValue(int cp) { } @Override - public boolean hasValue(int cp) { return getValue(cp).length() != 0; } - }; + public boolean hasValue(int cp) { + return getValue(cp).length() != 0; + } + } + ; private DerivedProperty(UCD ucd) { ucdData = ucd; final IndexUnicodeProperties iupCurrent = IndexUnicodeProperties.make(ucd.getVersion()); - nfd = nf[NFD] = new Normalizer(UCD_Types.NFD, ucdData.getVersion()); nfc = nf[NFC] = new Normalizer(UCD_Types.NFC, ucdData.getVersion()); nfkd = nf[NFKD] = new Normalizer(UCD_Types.NFKD, ucdData.getVersion()); nfkc = nf[NFKC] = new Normalizer(UCD_Types.NFKC, ucdData.getVersion()); for (int i = ExpandsOnNFD; i <= ExpandsOnNFKC; ++i) { - dprops[i] = new ExDProp(i-ExpandsOnNFD); + dprops[i] = new ExDProp(i - ExpandsOnNFD); } for (int i = GenNFD; i <= GenNFKC; ++i) { - dprops[i] = new GenDProp(i-GenNFD); + dprops[i] = new GenDProp(i - GenNFD); } for (int i = NFC_Leading; i <= 
NFC_Resulting; ++i) { @@ -368,47 +384,51 @@ private DerivedProperty(UCD ucd) { } for (int i = NFD_UnsafeStart; i <= NFKC_UnsafeStart; ++i) { - dprops[i] = new NF_UnsafeStartProp(i-NFD_UnsafeStart); + dprops[i] = new NF_UnsafeStartProp(i - NFD_UnsafeStart); } - dprops[ID_Start] = new UCDProperty() { - { - type = DERIVED_CORE; - name = "ID_Start"; - shortName = "IDS"; - } - @Override - public boolean hasValue(int cp) { - return ucdData.isIdentifierStart(cp); - } - }; + dprops[ID_Start] = + new UCDProperty() { + { + type = DERIVED_CORE; + name = "ID_Start"; + shortName = "IDS"; + } - dprops[ID_Continue_NO_Cf] = new UCDProperty() { - { - name = "ID_Continue"; - type = DERIVED_CORE; - shortName = "IDC"; - } - @Override - public boolean hasValue(int cp) { - return ucdData.isIdentifierContinue_NO_Cf(cp); - } - }; + @Override + public boolean hasValue(int cp) { + return ucdData.isIdentifierStart(cp); + } + }; + + dprops[ID_Continue_NO_Cf] = + new UCDProperty() { + { + name = "ID_Continue"; + type = DERIVED_CORE; + shortName = "IDC"; + } + + @Override + public boolean hasValue(int cp) { + return ucdData.isIdentifierContinue_NO_Cf(cp); + } + }; final StringBuffer tempBuf = new StringBuffer(); - //System.out.println("Deriving data for XID"); + // System.out.println("Deriving data for XID"); // special hack for middle dot XID_Continue_Set.add(0x00B7); - //System.out.println("Adding (2)" + ucdData.getCodeAndName(0x00B7)); - + // System.out.println("Adding (2)" + ucdData.getCodeAndName(0x00B7)); for (int cp = 0; cp < 0x10FFFF; ++cp) { // skip cases that can't matter if (!ucdData.isAssigned(cp)) { continue; } - if (ucdData.getBinaryProperty(cp, Pattern_Syntax) || ucdData.getBinaryProperty(cp, Pattern_White_Space)) { + if (ucdData.getBinaryProperty(cp, Pattern_Syntax) + || ucdData.getBinaryProperty(cp, Pattern_White_Space)) { continue; } @@ -447,15 +467,15 @@ public boolean hasValue(int cp) { // Now see if the statuses are compatible. 
if (status != status2) { - //System.out.println("Need to do something with:"); - //System.out.println(" " + status + ": " + ucdData.getCodeAndName(cp)); - //System.out.println(" " + status2 + ": " + ucdData.getCodeAndName(tempBuf.toString())); + // System.out.println("Need to do something with:"); + // System.out.println(" " + status + ": " + ucdData.getCodeAndName(cp)); + // System.out.println(" " + status2 + ": " + + // ucdData.getCodeAndName(tempBuf.toString())); if (status2 == 0) { status = 0; - } else if (status2 > status) - { + } else if (status2 > status) { status = status2; - //System.out.println(" " + status + ": " + ucdData.getCodeAndName(cp)); + // System.out.println(" " + status + ": " + ucdData.getCodeAndName(cp)); } } } @@ -468,513 +488,572 @@ public boolean hasValue(int cp) { } } - dprops[Mod_ID_Start] = new UCDProperty() { - { - type = DERIVED_CORE; - name = "XID_Start"; - shortName = "XIDS"; - } - @Override - public boolean hasValue(int cp) { - return XID_Start_Set.contains(cp); - } - }; + dprops[Mod_ID_Start] = + new UCDProperty() { + { + type = DERIVED_CORE; + name = "XID_Start"; + shortName = "XIDS"; + } - dprops[Mod_ID_Continue_NO_Cf] = new UCDProperty() { - { - type = DERIVED_CORE; - name = "XID_Continue"; - shortName = "XIDC"; - } - @Override - public boolean hasValue(int cp) { - return XID_Continue_Set.contains(cp); - } - }; + @Override + public boolean hasValue(int cp) { + return XID_Start_Set.contains(cp); + } + }; - dprops[PropMath] = new UCDProperty() { - { - type = DERIVED_CORE; - name = "Math"; - shortName = name; - } - @Override - public boolean hasValue(int cp) { - final byte cat = ucdData.getCategory(cp); - if (cat == Sm - || ucdData.getBinaryProperty(cp,Math_Property)) { - return true; - } - return false; - } - }; + dprops[Mod_ID_Continue_NO_Cf] = + new UCDProperty() { + { + type = DERIVED_CORE; + name = "XID_Continue"; + shortName = "XIDC"; + } - dprops[PropAlphabetic] = new UCDProperty() { - { - type = DERIVED_CORE; - name = 
"Alphabetic"; - shortName = "Alpha"; - } - @Override - public boolean hasValue(int cp) { - final byte cat = ucdData.getCategory(cp); - if (cat == Lu || cat == Ll || cat == Lt || cat == Lm || cat == Lo || cat == Nl - || ucdData.getBinaryProperty(cp, Other_Alphabetic)) { - return true; - } - return false; - } - }; + @Override + public boolean hasValue(int cp) { + return XID_Continue_Set.contains(cp); + } + }; - dprops[PropLowercase] = new UCDProperty() { - { - type = DERIVED_CORE; - name = "Lowercase"; - shortName = "Lower"; - } - @Override - public boolean hasValue(int cp) { - final byte cat = ucdData.getCategory(cp); - if (cat == Ll - || ucdData.getBinaryProperty(cp, Other_Lowercase)) { - return true; - } - return false; - } - }; + dprops[PropMath] = + new UCDProperty() { + { + type = DERIVED_CORE; + name = "Math"; + shortName = name; + } - dprops[PropUppercase] = new UCDProperty() { - { - type = DERIVED_CORE; - name = "Uppercase"; - shortName = "Upper"; - } - @Override - public boolean hasValue(int cp) { - final byte cat = ucdData.getCategory(cp); - if (cat == Lu - || ucdData.getBinaryProperty(cp, Other_Uppercase)) { - return true; - } - return false; - } - }; + @Override + public boolean hasValue(int cp) { + final byte cat = ucdData.getCategory(cp); + if (cat == Sm || ucdData.getBinaryProperty(cp, Math_Property)) { + return true; + } + return false; + } + }; + + dprops[PropAlphabetic] = + new UCDProperty() { + { + type = DERIVED_CORE; + name = "Alphabetic"; + shortName = "Alpha"; + } + + @Override + public boolean hasValue(int cp) { + final byte cat = ucdData.getCategory(cp); + if (cat == Lu + || cat == Ll + || cat == Lt + || cat == Lm + || cat == Lo + || cat == Nl + || ucdData.getBinaryProperty(cp, Other_Alphabetic)) { + return true; + } + return false; + } + }; + + dprops[PropLowercase] = + new UCDProperty() { + { + type = DERIVED_CORE; + name = "Lowercase"; + shortName = "Lower"; + } + + @Override + public boolean hasValue(int cp) { + final byte cat = 
ucdData.getCategory(cp); + if (cat == Ll || ucdData.getBinaryProperty(cp, Other_Lowercase)) { + return true; + } + return false; + } + }; + + dprops[PropUppercase] = + new UCDProperty() { + { + type = DERIVED_CORE; + name = "Uppercase"; + shortName = "Upper"; + } + + @Override + public boolean hasValue(int cp) { + final byte cat = ucdData.getCategory(cp); + if (cat == Lu || ucdData.getBinaryProperty(cp, Other_Uppercase)) { + return true; + } + return false; + } + }; for (int i = Missing_Uppercase; i <= Missing_Mixedcase; ++i) { dprops[i] = new CaseDProp(i); } /* -(3) Singleton Decompositions: characters that can be derived from the UnicodeData file by -including all characters whose canonical decomposition consists of a single character. -(4) Non-Starter Decompositions: characters that can be derived from the UnicodeData -file by including all characters whose canonical decomposition consists of a sequence -of characters, the first of which has a non-zero combining class. - */ - dprops[FullCompExclusion] = new UCDProperty() { - { - type = DERIVED_NORMALIZATION; - name = "Full_Composition_Exclusion"; - shortName = "Comp_Ex"; - defaultValueStyle = defaultPropertyStyle = SHORT; - } - @Override - public boolean hasValue(int cp) { - if (!ucdData.isRepresented(cp)) { - return false; - } - final byte dtype = ucdData.getDecompositionType(cp); - if (dtype != CANONICAL) { - return false; - } + (3) Singleton Decompositions: characters that can be derived from the UnicodeData file by + including all characters whose canonical decomposition consists of a single character. + (4) Non-Starter Decompositions: characters that can be derived from the UnicodeData + file by including all characters whose canonical decomposition consists of a sequence + of characters, the first of which has a non-zero combining class. 
+ */ + dprops[FullCompExclusion] = + new UCDProperty() { + { + type = DERIVED_NORMALIZATION; + name = "Full_Composition_Exclusion"; + shortName = "Comp_Ex"; + defaultValueStyle = defaultPropertyStyle = SHORT; + } - if (isCompEx(cp)) { - return true; - } - return false; - } - /*public String getListingValue(int cp) { - return "Comp_Ex"; - }*/ - /* - public String getListingValue(int cp) { - if (getValueType() != BINARY) return getValue(cp, SHORT); - return getProperty(SHORT); + @Override + public boolean hasValue(int cp) { + if (!ucdData.isRepresented(cp)) { + return false; + } + final byte dtype = ucdData.getDecompositionType(cp); + if (dtype != CANONICAL) { + return false; } - */ - }; - dprops[FullCompInclusion] = new UCDProperty() { - { - isStandard = false; - type = DERIVED_NORMALIZATION; - name = "Full_Composition_Inclusion"; - shortName = "Comp_In"; - defaultValueStyle = defaultPropertyStyle = SHORT; - } - @Override - public boolean hasValue(int cp) { - if (!ucdData.isRepresented(cp)) { - return false; - } - final byte dtype = ucdData.getDecompositionType(cp); - if (dtype != CANONICAL) { - return false; - } + if (isCompEx(cp)) { + return true; + } + return false; + } + /*public String getListingValue(int cp) { + return "Comp_Ex"; + }*/ + /* + public String getListingValue(int cp) { + if (getValueType() != BINARY) return getValue(cp, SHORT); + return getProperty(SHORT); + } + */ + }; + + dprops[FullCompInclusion] = + new UCDProperty() { + { + isStandard = false; + type = DERIVED_NORMALIZATION; + name = "Full_Composition_Inclusion"; + shortName = "Comp_In"; + defaultValueStyle = defaultPropertyStyle = SHORT; + } - if (isCompEx(cp)) { - return true; - } - return false; - } - }; + @Override + public boolean hasValue(int cp) { + if (!ucdData.isRepresented(cp)) { + return false; + } + final byte dtype = ucdData.getDecompositionType(cp); + if (dtype != CANONICAL) { + return false; + } - dprops[FC_NFKC_Closure] = new UCDProperty() { - { - type = DERIVED_NORMALIZATION; 
- setValueType(STRING_PROP); - name = "FC_NFKC_Closure"; - shortName = "FC_NFKC"; - } - @Override - public String getValue(int cp, byte style) { - if (!ucdData.isRepresented(cp)) { - return ""; - } - final String b = nfkc.normalize(fold(cp)); - final String c = nfkc.normalize(fold(b)); - if (c.equals(b)) { - return ""; - } - return "FNC; " + Utility.hex(c); - } // default - @Override - public boolean hasValue(int cp) { return getValue(cp).length() != 0; } - }; + if (isCompEx(cp)) { + return true; + } + return false; + } + }; - dprops[FC_NFC_Closure] = new UCDProperty() { - { - type = DERIVED_NORMALIZATION; - isStandard = false; - name = "FC_NFC_Closure"; - setValueType(STRING_PROP); - shortName = "FC_NFC"; - } - @Override - public String getValue(int cp, byte style) { - if (!ucdData.isRepresented(cp)) { - return ""; - } - final String b = nfc.normalize(fold(cp)); - final String c = nfc.normalize(fold(b)); - if (c.equals(b)) { - return ""; - } - return "FN; " + Utility.hex(c); - } // default - @Override - public boolean hasValue(int cp) { return getValue(cp).length() != 0; } - }; + dprops[FC_NFKC_Closure] = + new UCDProperty() { + { + type = DERIVED_NORMALIZATION; + setValueType(STRING_PROP); + name = "FC_NFKC_Closure"; + shortName = "FC_NFKC"; + } + + @Override + public String getValue(int cp, byte style) { + if (!ucdData.isRepresented(cp)) { + return ""; + } + final String b = nfkc.normalize(fold(cp)); + final String c = nfkc.normalize(fold(b)); + if (c.equals(b)) { + return ""; + } + return "FNC; " + Utility.hex(c); + } // default + + @Override + public boolean hasValue(int cp) { + return getValue(cp).length() != 0; + } + }; + + dprops[FC_NFC_Closure] = + new UCDProperty() { + { + type = DERIVED_NORMALIZATION; + isStandard = false; + name = "FC_NFC_Closure"; + setValueType(STRING_PROP); + shortName = "FC_NFC"; + } + + @Override + public String getValue(int cp, byte style) { + if (!ucdData.isRepresented(cp)) { + return ""; + } + final String b = 
nfc.normalize(fold(cp)); + final String c = nfc.normalize(fold(b)); + if (c.equals(b)) { + return ""; + } + return "FN; " + Utility.hex(c); + } // default + + @Override + public boolean hasValue(int cp) { + return getValue(cp).length() != 0; + } + }; for (int i = QuickNFD; i <= QuickNFKC; ++i) { dprops[i] = new QuickDProp(i - QuickNFD); } - dprops[DefaultIgnorable] = new UCDProperty() { - { - type = DERIVED_CORE; - name = "Default_Ignorable_Code_Point"; - hasUnassigned = true; - shortName = "DI"; - } - - - UnicodeSet removals; - { - // Prepended_Concatenation_Mark characters - try { - UnicodeMap pcm = iupCurrent.loadEnum(UcdProperty.Prepended_Concatenation_Mark, Binary.class); - removals = new UnicodeSet(pcm.getSet(Binary.Yes)).freeze(); - } catch (Exception e) { - removals = UnicodeSet.EMPTY; - } - } + dprops[DefaultIgnorable] = + new UCDProperty() { + { + type = DERIVED_CORE; + name = "Default_Ignorable_Code_Point"; + hasUnassigned = true; + shortName = "DI"; + } + UnicodeSet removals; - /** - (See MakeUnicodeFiles.txt) - # Derived Property: Default_Ignorable_Code_Point - # Generated from - # Other_Default_Ignorable_Code_Point - # + Cf (Format characters) - # + Variation_Selector - # - White_Space - # - FFF9..FFFB (Interlinear annotation format characters) - # - 13430..13440 (Egyptian hieroglyph format characters) - # - Prepended_Concatenation_Mark (Exceptional format characters that should be visible) - */ - @Override - public boolean hasValue(int cp) { - if (removals.contains(cp)) { - return false; - } - if (ucdData.getBinaryProperty(cp, White_space)) { - return false; - } - if (ucdData.getBinaryProperty(cp, Other_Default_Ignorable_Code_Point)) { - return true; - } + { + // Prepended_Concatenation_Mark characters + try { + UnicodeMap pcm = + iupCurrent.loadEnum( + UcdProperty.Prepended_Concatenation_Mark, Binary.class); + removals = new UnicodeSet(pcm.getSet(Binary.Yes)).freeze(); + } catch (Exception e) { + removals = UnicodeSet.EMPTY; + } + } - if 
(ucdData.getCompositeVersion() > 0x040000 && cp >= 0xFFF9 && cp <= 0xFFFB) { - return false; - } + /** + * (See MakeUnicodeFiles.txt) # Derived Property: Default_Ignorable_Code_Point # + * Generated from # Other_Default_Ignorable_Code_Point # + Cf (Format + * characters) # + Variation_Selector # - White_Space # - FFF9..FFFB + * (Interlinear annotation format characters) # - 13430..13440 (Egyptian + * hieroglyph format characters) # - Prepended_Concatenation_Mark (Exceptional + * format characters that should be visible) + */ + @Override + public boolean hasValue(int cp) { + if (removals.contains(cp)) { + return false; + } + if (ucdData.getBinaryProperty(cp, White_space)) { + return false; + } + if (ucdData.getBinaryProperty(cp, Other_Default_Ignorable_Code_Point)) { + return true; + } - // Unicode 12.0: 13430..13438 (Egyptian hieroglyph format characters) - if (ucdData.getCompositeVersion() >= 0x0c0000 && cp >= 0x13430 && cp <= 0x13438) { - return false; - } - // Unicode 15.0: 13439..13440 (Egyptian hieroglyph format characters) - if (ucdData.getCompositeVersion() >= 0x0f0000 && cp >= 0x13439 && cp <= 0x13440) { - return false; - } + if (ucdData.getCompositeVersion() > 0x040000 + && cp >= 0xFFF9 + && cp <= 0xFFFB) { + return false; + } - final byte cat = ucdData.getCategory(cp); - if (cat == Cf) { - return true; - } + // Unicode 12.0: 13430..13438 (Egyptian hieroglyph format characters) + if (ucdData.getCompositeVersion() >= 0x0c0000 + && cp >= 0x13430 + && cp <= 0x13438) { + return false; + } + // Unicode 15.0: 13439..13440 (Egyptian hieroglyph format characters) + if (ucdData.getCompositeVersion() >= 0x0f0000 + && cp >= 0x13439 + && cp <= 0x13440) { + return false; + } - if (ucdData.getCompositeVersion() <= 0x040000) { - return false; - } + final byte cat = ucdData.getCategory(cp); + if (cat == Cf) { + return true; + } - if (ucdData.getBinaryProperty(cp, Variation_Selector)) { - return true; - } - return false; - } + if (ucdData.getCompositeVersion() <= 0x040000) 
{ + return false; + } - }; + if (ucdData.getBinaryProperty(cp, Variation_Selector)) { + return true; + } + return false; + } + }; - dprops[Case_Sensitive] = new UCDProperty() { - { - type = DERIVED_CORE; - isStandard = false; - name = "Case_Sensitive"; - hasUnassigned = false; - shortName = "CS"; - } + dprops[Case_Sensitive] = + new UCDProperty() { + { + type = DERIVED_CORE; + isStandard = false; + name = "Case_Sensitive"; + hasUnassigned = false; + shortName = "CS"; + } - UnicodeSet case_sensitive = null; - UnicodeSet tempSet = new UnicodeSet(); - UnicodeSet cased = null; - PrintWriter log; + UnicodeSet case_sensitive = null; + UnicodeSet tempSet = new UnicodeSet(); + UnicodeSet cased = null; + PrintWriter log; - private void addCase(String cps, byte c1, byte c2) { - final String temp = ucdData.getCase(cps, c1, c2); - if (temp.equals(cps)) { - return; - } + private void addCase(String cps, byte c1, byte c2) { + final String temp = ucdData.getCase(cps, c1, c2); + if (temp.equals(cps)) { + return; + } - //temp = nfc.normalize(temp); - //if (temp.equals(cps)) return; - - tempSet.clear(); - tempSet.addAll(cps); - tempSet.addAll(temp); - if (!case_sensitive.containsAll(tempSet)) { - tempSet.removeAll(case_sensitive); - if (!cased.containsAll(tempSet)) { - log.println(); - log.println("Adding " + tempSet + " because of: "); - log.println("\t" + ucdData.getCodeAndName(cps)); - log.println("=>\t" + ucdData.getCodeAndName(temp)); + // temp = nfc.normalize(temp); + // if (temp.equals(cps)) return; + + tempSet.clear(); + tempSet.addAll(cps); + tempSet.addAll(temp); + if (!case_sensitive.containsAll(tempSet)) { + tempSet.removeAll(case_sensitive); + if (!cased.containsAll(tempSet)) { + log.println(); + log.println("Adding " + tempSet + " because of: "); + log.println("\t" + ucdData.getCodeAndName(cps)); + log.println("=>\t" + ucdData.getCodeAndName(temp)); + } + case_sensitive.addAll(tempSet); + } } - case_sensitive.addAll(tempSet); - } - } - @Override - public boolean 
hasValue(int cp) { - if (case_sensitive == null) { - try { - log = Utility.openPrintWriterGenDir("log/Case_Sensitive_Log.txt", Utility.UTF8_UNIX); - - System.out.println("Building Case-Sensitive cache"); - case_sensitive = new UnicodeSet(); - cased = DerivedProperty.make(PropLowercase, ucdData).getSet() - .addAll(DerivedProperty.make(PropUppercase, ucdData).getSet()) - .addAll(UnifiedBinaryProperty.make(CATEGORY | Lt).getSet()); - for (int c = 0; c < 0x10FFFF; ++c) { - Utility.dot(c); - // skip cases that can't matter - if (!ucdData.isAssigned(c)) { - continue; + @Override + public boolean hasValue(int cp) { + if (case_sensitive == null) { + try { + log = + Utility.openPrintWriterGenDir( + "log/Case_Sensitive_Log.txt", Utility.UTF8_UNIX); + + System.out.println("Building Case-Sensitive cache"); + case_sensitive = new UnicodeSet(); + cased = + DerivedProperty.make(PropLowercase, ucdData) + .getSet() + .addAll( + DerivedProperty.make(PropUppercase, ucdData) + .getSet()) + .addAll( + UnifiedBinaryProperty.make(CATEGORY | Lt) + .getSet()); + for (int c = 0; c < 0x10FFFF; ++c) { + Utility.dot(c); + // skip cases that can't matter + if (!ucdData.isAssigned(c)) { + continue; + } + + final String cps = UTF16.valueOf(c); + addCase(cps, FULL, LOWER); + addCase(cps, FULL, UPPER); + addCase(cps, FULL, TITLE); + addCase(cps, FULL, FOLD); + addCase(cps, SIMPLE, LOWER); + addCase(cps, SIMPLE, UPPER); + addCase(cps, SIMPLE, TITLE); + addCase(cps, SIMPLE, FOLD); + } + Utility.fixDot(); + UnicodeSet temp; + log.println("Cased, but not Case_Sensitive"); + temp = new UnicodeSet().addAll(cased).removeAll(case_sensitive); + Utility.showSetNames(log, "", temp, false, false, ucdData); + + log.println("Case_Sensitive, but not Cased"); + temp = new UnicodeSet().addAll(case_sensitive).removeAll(cased); + Utility.showSetNames(log, "", temp, false, false, ucdData); + + log.println("Both Case_Sensitive, and Cased"); + temp = new UnicodeSet().addAll(case_sensitive).retainAll(cased); + 
log.println(temp); + System.out.println("Done Building Case-Sensitive cache"); + + log.close(); + + } catch (final Exception e) { + throw new ChainException("internal error", null, e); } - - final String cps = UTF16.valueOf(c); - addCase(cps, FULL, LOWER); - addCase(cps, FULL, UPPER); - addCase(cps, FULL, TITLE); - addCase(cps, FULL, FOLD); - addCase(cps, SIMPLE, LOWER); - addCase(cps, SIMPLE, UPPER); - addCase(cps, SIMPLE, TITLE); - addCase(cps, SIMPLE, FOLD); } - Utility.fixDot(); - UnicodeSet temp; - log.println("Cased, but not Case_Sensitive"); - temp = new UnicodeSet().addAll(cased).removeAll(case_sensitive); - Utility.showSetNames(log, "", temp, false, false, ucdData); - - log.println("Case_Sensitive, but not Cased"); - temp = new UnicodeSet().addAll(case_sensitive).removeAll(cased); - Utility.showSetNames(log, "", temp, false, false, ucdData); + return case_sensitive.contains(cp); + } + }; - log.println("Both Case_Sensitive, and Cased"); - temp = new UnicodeSet().addAll(case_sensitive).retainAll(cased); - log.println(temp); - System.out.println("Done Building Case-Sensitive cache"); + dprops[Other_Case_Ignorable] = + new UCDProperty() { + { + name = "Other_Case_Ignorable"; + shortName = "OCI"; + isStandard = false; + } - log.close(); + @Override + public boolean hasValue(int cp) { + switch (cp) { + case 0x27: + case 0x2019: + case 0xAD: + return true; + // case 0x2d: case 0x2010: case 0x2011: + /* + 0027 ; Other_Case_Ignorable # Po APOSTROPHE + 00AD ; Other_Case_Ignorable # Pd SOFT HYPHEN + 2019 ; Other_Case_Ignorable # Pf RIGHT SINGLE QUOTATION MARK + */ + } + return false; + } + }; - } catch (final Exception e) { - throw new ChainException("internal error", null, e); + dprops[Type_i] = + new UCDProperty() { + { + type = DERIVED_CORE; + isStandard = false; + name = "DSoft_Dotted"; + shortName = "DSDot"; } - } - return case_sensitive.contains(cp); - } - }; - dprops[Other_Case_Ignorable] = new UCDProperty() { - { - name = "Other_Case_Ignorable"; - shortName = 
"OCI"; - isStandard = false; + @Override + public boolean hasValue(int cp) { + if (hasSoftDot(cp)) { + return true; + } + if (nfkd.isNormalized(cp)) { + return false; + } + final String decomp = nfd.normalize(cp); + boolean ok = false; + for (int i = decomp.length() - 1; i >= 0; --i) { + final int ch = UTF16.charAt(decomp, i); + final int cc = ucdData.getCombiningClass(ch); + if (cc == 230) { + return false; + } + if (cc == 0) { + if (!hasSoftDot(ch)) { + return false; + } + ok = true; + } + } + return ok; + } - } - @Override - public boolean hasValue(int cp) { - switch(cp) { - case 0x27: case 0x2019: case 0xAD: return true; - // case 0x2d: case 0x2010: case 0x2011: - /* -0027 ; Other_Case_Ignorable # Po APOSTROPHE -00AD ; Other_Case_Ignorable # Pd SOFT HYPHEN -2019 ; Other_Case_Ignorable # Pf RIGHT SINGLE QUOTATION MARK - */ - } - return false; - } - }; + boolean hasSoftDot(int ch) { + return ch == 'i' + || ch == 'j' + || ch == 0x0268 + || ch == 0x0456 + || ch == 0x0458; + } + }; - dprops[Type_i] = new UCDProperty() { - { - type = DERIVED_CORE; - isStandard = false; - name = "DSoft_Dotted"; - shortName = "DSDot"; - } - @Override - public boolean hasValue(int cp) { - if (hasSoftDot(cp)) { - return true; - } - if (nfkd.isNormalized(cp)) { - return false; - } - final String decomp = nfd.normalize(cp); - boolean ok = false; - for (int i = decomp.length()-1; i >= 0; --i) { - final int ch = UTF16.charAt(decomp, i); - final int cc = ucdData.getCombiningClass(ch); - if (cc == 230) { - return false; + dprops[Case_Ignorable] = + new UCDProperty() { + { + name = "Case_Ignorable"; + isStandard = false; + shortName = "CI"; } - if (cc == 0) { - if (!hasSoftDot(ch)) { - return false; + + @Override + public boolean hasValue(int cp) { + final byte cat = ucdData.getCategory(cp); + // Word_Break(C) = MidLetter or MidNumLet, or + // General_Category(C) = Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format + // (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk). 
+ + if (cat == Lm || cat == Cf || cat == Mn || cat == Me) { + return true; + } + if (dprops[Other_Case_Ignorable].hasValue(cp)) { + return true; } - ok = true; + return false; } - } - return ok; - } - boolean hasSoftDot(int ch) { - return ch == 'i' || ch == 'j' || ch == 0x0268 || ch == 0x0456 || ch == 0x0458; - } - }; - - dprops[Case_Ignorable] = new UCDProperty() { - { - name = "Case_Ignorable"; - isStandard = false; - shortName = "CI"; - } - @Override - public boolean hasValue(int cp) { - final byte cat = ucdData.getCategory(cp); - //Word_Break(C) = MidLetter or MidNumLet, or - //General_Category(C) = Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk). - - if (cat == Lm || cat == Cf || cat == Mn || cat == Me) { - return true; - } - if (dprops[Other_Case_Ignorable].hasValue(cp)) { - return true; - } - return false; - } - }; + }; /* - GraphemeExtend = 27, - GraphemeBase = 28, -# GraphemeExtend := Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink -# GraphemeBase := + GraphemeExtend = 27, + GraphemeBase = 28, + # GraphemeExtend := Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink + # GraphemeBase := - */ - dprops[GraphemeExtend] = new UCDProperty() { - { - type = DERIVED_CORE; - name = "Grapheme_Extend"; - shortName = "Gr_Ext"; - } - @Override - public boolean hasValue(int cp) { - //if (cp == 0x034F) return false; - //if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false; - // || cat == Mc - final byte cat = ucdData.getCategory(cp); - if (cat == Me || cat == Mn - || ucdData.getBinaryProperty(cp,Other_GraphemeExtend)) { - return true; - } - return false; - } - }; + */ + dprops[GraphemeExtend] = + new UCDProperty() { + { + type = DERIVED_CORE; + name = "Grapheme_Extend"; + shortName = "Gr_Ext"; + } - dprops[GraphemeBase] = new UCDProperty() { - { - type = DERIVED_CORE; - name = "Grapheme_Base"; - shortName = "Gr_Base"; + @Override + public boolean hasValue(int cp) { + // if (cp == 0x034F) return false; + // 
if (ucdData.getBinaryProperty(cp, GraphemeLink)) return false; + // || cat == Mc + final byte cat = ucdData.getCategory(cp); + if (cat == Me + || cat == Mn + || ucdData.getBinaryProperty(cp, Other_GraphemeExtend)) { + return true; + } + return false; + } + }; - } - @Override - public boolean hasValue(int cp) { - //if (cp == 0x034F) return false; - final byte cat = ucdData.getCategory(cp); - if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn || cat == Zl || cat == Zp) { - return false; - } - // || ucdData.getBinaryProperty(cp,GraphemeLink) - if (dprops[GraphemeExtend].hasValue(cp)) { - return false; - } - return true; - } - }; + dprops[GraphemeBase] = + new UCDProperty() { + { + type = DERIVED_CORE; + name = "Grapheme_Base"; + shortName = "Gr_Base"; + } + + @Override + public boolean hasValue(int cp) { + // if (cp == 0x034F) return false; + final byte cat = ucdData.getCategory(cp); + if (cat == Cc || cat == Cf || cat == Cs || cat == Co || cat == Cn + || cat == Zl || cat == Zp) { + return false; + } + // || ucdData.getBinaryProperty(cp,GraphemeLink) + if (dprops[GraphemeExtend].hasValue(cp)) { + return false; + } + return true; + } + }; for (final UCDProperty up : dprops) { if (up == null) { @@ -991,12 +1070,10 @@ public boolean hasValue(int cp) { byte getDecompCat(int cp) { final byte cat = ucdData.getCategory(cp); - if (cat == Lu - || ucdData.getBinaryProperty(cp, Other_Uppercase)) { + if (cat == Lu || ucdData.getBinaryProperty(cp, Other_Uppercase)) { return Lu; } - if (cat == Ll - || ucdData.getBinaryProperty(cp, Other_Lowercase)) { + if (cat == Ll || ucdData.getBinaryProperty(cp, Other_Lowercase)) { return Ll; } if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) { diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/DerivedPropertyLister.java b/unicodetools/src/main/java/org/unicode/text/UCD/DerivedPropertyLister.java index eb456c5c1..ae19764dd 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/DerivedPropertyLister.java +++ 
b/unicodetools/src/main/java/org/unicode/text/UCD/DerivedPropertyLister.java @@ -1,30 +1,29 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/DerivedPropertyLister.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/DerivedPropertyLister.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; + import java.io.PrintWriter; final class DerivedPropertyLister extends PropertyLister { static final boolean BRIDGE = false; - //static int enum = 0; + // static int enum = 0; - //private int propMask; - //private DerivedProperty dprop; + // private int propMask; + // private DerivedProperty dprop; private final UCDProperty uprop; int width; boolean varies; public DerivedPropertyLister(UCD ucd, int propMask, PrintWriter output) { - //this.propMask = propMask; + // this.propMask = propMask; this.output = output; ucdData = ucd; // this.dprop = new DerivedProperty(ucd); @@ -33,16 +32,20 @@ public DerivedPropertyLister(UCD ucd, int propMask, PrintWriter output) { width = super.minPropertyWidth(); switch (propMask) { - case UCD_Types.GenNFD: case UCD_Types.GenNFC: case UCD_Types.GenNFKD: case UCD_Types.GenNFKC: - alwaysBreaks = true; - break; - case UCD_Types.FC_NFKC_Closure: - alwaysBreaks = true; - width = 21; - break; - case UCD_Types.QuickNFC: case UCD_Types.QuickNFKC: - width = 11; - break; + case UCD_Types.GenNFD: + case UCD_Types.GenNFC: + case UCD_Types.GenNFKD: + case UCD_Types.GenNFKC: + alwaysBreaks = true; + break; + case UCD_Types.FC_NFKC_Closure: + alwaysBreaks = true; + width = 21; + break; + case UCD_Types.QuickNFC: + case UCD_Types.QuickNFKC: + width = 11; + break; } } @@ -51,17 +54,15 @@ public String valueName(int cp) { return uprop.getListingValue(cp); } - //public String optionalComment(int cp) { + // public String optionalComment(int cp) { // return super.optionalComment(cp) + " [" + ucdData.getCodeAndName(computedValue) + "]"; - //} - + // } @Override public int minPropertyWidth() { return width; } - /* public String optionalComment(int cp) { String id = ucdData.getCategoryID(cp); @@ -115,4 +116,3 @@ static void setComputedValue(int cp, String value) { */ } - diff --git 
a/unicodetools/src/main/java/org/unicode/text/UCD/DiffPropertyLister.java b/unicodetools/src/main/java/org/unicode/text/UCD/DiffPropertyLister.java index 5244c02d7..e3eba9eba 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/DiffPropertyLister.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/DiffPropertyLister.java @@ -1,25 +1,24 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/DiffPropertyLister.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/DiffPropertyLister.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; -import java.io.PrintWriter; import com.ibm.icu.text.UnicodeSet; +import java.io.PrintWriter; class DiffPropertyLister extends PropertyLister { private UCD oldUCD; private final UnicodeSet set = new UnicodeSet(); private static final int NOPROPERTY = -1; - public DiffPropertyLister(String oldUCDName, String newUCDName, PrintWriter output, int property) { + public DiffPropertyLister( + String oldUCDName, String newUCDName, PrintWriter output, int property) { this.output = output; ucdData = UCD.make(newUCDName); if (property != NOPROPERTY) { @@ -74,15 +73,13 @@ public String optionalComment(int cp) { return normal; } - @Override byte getModCat(int cp) { final byte result = ucdData.getModCat(cp, breakByCategory ? CASED_LETTER_MASK : -1); - //System.out.println(breakByCategory + ", " + ucdData.getModCatID_fromIndex(result)); + // System.out.println(breakByCategory + ", " + ucdData.getModCatID_fromIndex(result)); return result; } - @Override public byte status(int cp) { if (newProp == null) { @@ -121,16 +118,18 @@ public byte status(int cp) { public String headerString() { String result; if (oldUCD != null) { - result = "# Differences between " - + major_minor_only(ucdData.getVersion()) - + " and " - + major_minor_only(oldUCD.getVersion()); + result = + "# Differences between " + + major_minor_only(ucdData.getVersion()) + + " and " + + major_minor_only(oldUCD.getVersion()); } else { - result = "# Designated as of " - + major_minor_only(ucdData.getVersion()) - + " [excluding removed Hangul Syllables]"; + result = + "# Designated as of " + + major_minor_only(ucdData.getVersion()) + + " [excluding removed Hangul Syllables]"; } - //System.out.println("hs: " + result); + // System.out.println("hs: " + result); return result; } @@ -167,6 +166,4 @@ private String major_minor_only(String s) { return s.substring(0, s.lastIndexOf('.')); } - } - diff --git 
a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java index 0a417d5ba..573828b04 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateBreakTest.java @@ -1,16 +1,18 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateBreakTest.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateBreakTest.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; @@ -27,10 +29,9 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Matcher; - -import org.unicode.props.UnicodeProperty; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; +import org.unicode.props.UnicodeProperty; import org.unicode.text.utility.Settings; import org.unicode.text.utility.UnicodeDataFile; import org.unicode.text.utility.Utility; @@ -38,12 +39,7 @@ import org.unicode.tools.Segmenter; import org.unicode.tools.Segmenter.Builder; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - -abstract public class GenerateBreakTest implements UCD_Types { +public abstract class GenerateBreakTest implements UCD_Types { private static final String DEBUG_STRING = "\u0001\u0061\u2060"; private static final boolean DEBUG_RULE_REPLACEMENT = true; @@ -78,7 +74,8 @@ public static void main(String[] args) throws IOException { throw new IllegalArgumentException( "obsolete command-line argument CLDR_BREAK: set -DCLDR=true instead"); } - System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61"); + System.out.println( + "Remember to add length marks (half & full) and other punctuation for sentence, with FF61"); UCD ucd = Default.ucd(); new GenerateGraphemeBreakTest(ucd, Segmenter.Target.FOR_UCD).run(); new GenerateWordBreakTest(ucd, Segmenter.Target.FOR_UCD).run(); @@ -122,8 +119,8 @@ int addToMapLast(String label) { final int result = addToMap(label); final Set values = new HashSet(prop.getAvailableValues()); if 
(!values.equals(labels)) { - throw new IllegalArgumentException("Missing Property Values: " + prop.getName() - + ": " + values.removeAll(labels)); + throw new IllegalArgumentException( + "Missing Property Values: " + prop.getName() + ": " + values.removeAll(labels)); } return result; } @@ -158,7 +155,7 @@ public static boolean onCodepointBoundary(String s, int offset) { if (offset == 0 || offset == s.length()) { return true; } - if (UTF16.isLeadSurrogate(s.charAt(offset-1)) + if (UTF16.isLeadSurrogate(s.charAt(offset - 1)) && UTF16.isTrailSurrogate(s.charAt(offset))) { return false; } @@ -171,7 +168,7 @@ public int findFirstBase(String source, int start, int limit) { for (int i = start; i < limit; i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(source, i); final byte cat = ucd.getCategory(cp); - if (((1< " + showData(ucd, decomp, INFOPROPS, "\n\t")); shown = true; } - System.out.println(j + ": " + test2.fileName); + System.out.println(j + ": " + test2.fileName); } } } @@ -245,7 +245,9 @@ static String showData(UCD ucd, String source, UCDProperty[] props, String separ result.append(ucd.getCodeAndName(cp)); for (final UCDProperty prop2 : props) { result.append(", "); - result.append(prop2.getPropertyName(SHORT)).append('=').append(prop2.getValue(cp,SHORT)); + result.append(prop2.getPropertyName(SHORT)) + .append('=') + .append(prop2.getValue(cp, SHORT)); } } return result.toString(); @@ -263,19 +265,20 @@ boolean isBaseNSMStar(String source) { for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { cp = UTF16.charAt(source, i); final byte cat = ucd.getCategory(cp); - final int catMask = 1< samples = new ArrayList(); // should have one per property value, for the cross chart and before+after test - protected List extraSamples = new ArrayList(); // extras that are used in before+after test - protected List extraSingleSamples = new ArrayList(); // extras that are just added straight, no before+after, and also appear on charts - protected Set 
extraTestSamples = new LinkedHashSet<>(); // extras that are just added to tests, not to charts + protected List samples = + new ArrayList(); // should have one per property value, for the cross chart and + // before+after test + protected List extraSamples = + new ArrayList(); // extras that are used in before+after test + protected List extraSingleSamples = + new ArrayList< + String>(); // extras that are just added straight, no before+after, and also + // appear on charts + protected Set extraTestSamples = + new LinkedHashSet<>(); // extras that are just added to tests, not to charts protected int tableLimit = -1; protected int[] skippedSamples = new int[100]; @@ -369,7 +382,7 @@ void generateTerminalClosure() { private int ruleListCount = 0; protected boolean collectingRules = false; protected boolean needsFullBreakSample = true; - protected Map variables; + protected Map variables; public void setRule(String rule) { if (collectingRules) { @@ -386,8 +399,8 @@ public void run() throws IOException { findSamples(); // test individual cases - //printLine(out, samples[LB_ZW], "", samples[LB_CL]); - //printLine(out, samples[LB_ZW], " ", samples[LB_CL]); + // printLine(out, samples[LB_ZW], "", samples[LB_CL]); + // printLine(out, samples[LB_ZW], " ", samples[LB_CL]); boolean forCLDR = seg.target == Segmenter.Target.FOR_CLDR; String path = "UCD/" + ucd.getVersion() + '/' + (forCLDR ? "cldr/" : "auxiliary/"); @@ -396,100 +409,121 @@ public void run() throws IOException { outFilename = outFilename + "-cldr"; } final UnicodeDataFile fc = - UnicodeDataFile.openHTMLAndWriteHeader(path, outFilename). 
- setSkipCopyright(Settings.SKIP_COPYRIGHT); + UnicodeDataFile.openHTMLAndWriteHeader(path, outFilename) + .setSkipCopyright(Settings.SKIP_COPYRIGHT); final PrintWriter out = fc.out; /* PrintWriter out = Utility.openPrintWriter("auxiliary/" - + fileName + "BreakTest-" - + ucd.getVersion() - + ".html", Utility.UTF8_WINDOWS); - */ + + fileName + "BreakTest-" + + ucd.getVersion() + + ".html", Utility.UTF8_WINDOWS); + */ out.println(DOCTYPE); - out.println(""); + out.println( + ""); out.println("" + fileName + " Break Chart"); out.println(""); - out.println(""); out.println("

" + propertyName + " Chart

"); out.println("

Unicode Version: " + ucd.getVersion() + "

"); if (!Settings.BUILD_FOR_COMPARE) { out.println("

Date: " + Default.getDate() + "

"); } - out.println("

This page illustrates the application of the " + propertyName + " specification. " - + "The material here is informative, not normative.

" - + "

The first chart shows where breaks would appear between different sample characters or strings. " - + "The sample characters are chosen mechanically to represent the different properties used by the specification.

" - + "

Each cell shows the break-status for the position between the character(s) in its row header and the character(s) in its column header. " - + "The × symbol indicates no break, while the ÷ symbol indicated a break. " - + "The cells with × are also shaded to make it easier to scan the table. " - + "For example, in the cell at the intersection of the row headed by “CR” and the column headed by “LF”, there is a × symbol, " - + "indicating that there is no break between CR and LF.

"); + out.println( + "

This page illustrates the application of the " + + propertyName + + " specification. " + + "The material here is informative, not normative.

" + + "

The first chart shows where breaks would appear between different sample characters or strings. " + + "The sample characters are chosen mechanically to represent the different properties used by the specification.

" + + "

Each cell shows the break-status for the position between the character(s) in its row header and the character(s) in its column header. " + + "The × symbol indicates no break, while the ÷ symbol indicated a break. " + + "The cells with × are also shaded to make it easier to scan the table. " + + "For example, in the cell at the intersection of the row headed by “CR” and the column headed by “LF”, there is a × symbol, " + + "indicating that there is no break between CR and LF.

"); out.print("

"); if (fileName.equals("Grapheme") || fileName.equals("Word")) { - out.print("After the heavy blue line in the table are additional rows, either with different sample characters or for sequences" - + (fileName.equals("Word") ? ", such as “ALetter MidLetter”. " : ". ")); - } - out.println("Some column headers may be composed, reflecting “treat as” or “ignore” rules.

"); - out.print("

If your browser handles titles (tooltips), then hovering the mouse over the row header will show a sample character of that type. " - + "Hovering over a column header will show the sample character, plus its abbreviated general category and script. " - + "Hovering over the intersected cells shows the rule number that produces the break-status. " - + "For example, hovering over the cell at the intersection of "); - switch(fileName) { - case "Line": - out.print("H3 and JT shows ×, with the rule 26.03. "); break; - case "Grapheme": - out.print("LVT and T shows ×, with the rule 8.0. "); break; - case "Word": - out.print("ExtendNumLet and ALetter shows ×, with the rule 13.2. "); break; - case "Sentence": - out.print("ATerm and Close shows ×, with the rule 9.0. "); break; + out.print( + "After the heavy blue line in the table are additional rows, either with different sample characters or for sequences" + + (fileName.equals("Word") ? ", such as “ALetter MidLetter”. " : ". ")); + } + out.println( + "Some column headers may be composed, reflecting “treat as” or “ignore” rules.

"); + out.print( + "

If your browser handles titles (tooltips), then hovering the mouse over the row header will show a sample character of that type. " + + "Hovering over a column header will show the sample character, plus its abbreviated general category and script. " + + "Hovering over the intersected cells shows the rule number that produces the break-status. " + + "For example, hovering over the cell at the intersection of "); + switch (fileName) { + case "Line": + out.print("H3 and JT shows ×, with the rule 26.03. "); + break; + case "Grapheme": + out.print("LVT and T shows ×, with the rule 8.0. "); + break; + case "Word": + out.print("ExtendNumLet and ALetter shows ×, with the rule 13.2. "); + break; + case "Sentence": + out.print("ATerm and Close shows ×, with the rule 9.0. "); + break; } out.print("Checking below the table, "); - switch(fileName) { - case "Line": - out.print("rule 26.03 is “JT | H3 × JT”"); break; - case "Grapheme": - out.print("rule 8.0 is “( LVT | T) × T”"); break; - case "Word": - out.print("rule 13.2 is “ExtendNumLet × (AHLetter | Numeric | Katakana)”"); break; - case "Sentence": - out.print("rule 9.0 is “SATerm Close* × ( Close | Sp | ParaSep )”"); break; - } - out.println(", which is the one that applies to that case. " - + "Note that a rule is invoked only when no lower-numbered rules have applied.

"); + switch (fileName) { + case "Line": + out.print("rule 26.03 is “JT | H3 × JT”"); + break; + case "Grapheme": + out.print("rule 8.0 is “( LVT | T) × T”"); + break; + case "Word": + out.print("rule 13.2 is “ExtendNumLet × (AHLetter | Numeric | Katakana)”"); + break; + case "Sentence": + out.print("rule 9.0 is “SATerm Close* × ( Close | Sp | ParaSep )”"); + break; + } + out.println( + ", which is the one that applies to that case. " + + "Note that a rule is invoked only when no lower-numbered rules have applied.

"); if (fileName.equals("Line")) { - out.println("

The " + propertyName + " tests use tailoring of numbers described in Example 7 of Section 8.2, “Examples of Customization” of UAX #14.

"); + out.println( + "

The " + + propertyName + + " tests use tailoring of numbers described in Example 7 of Section 8.2, “Examples of Customization” of UAX #14.

"); } generateTable(out); - if (false) { out.println("

Character Type Breakdown

"); out.println("
"); for (int i = 0; i < sampleMap.size(); ++i) { - out.println(""); + out.println( + ""); } out.println("
" + sampleMap.getLabelFromIndex(i) - + "" + sampleMap.getSetFromIndex(i) - + "
" + + sampleMap.getLabelFromIndex(i) + + "" + + sampleMap.getSetFromIndex(i) + + "
"); } - out.println("
\n" + - "
\n" + - "
\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "
\n" + - "
\n" + - "
\n" + - "
"); + out.println( + "
\n" + + "
\n" + + "
\n" + + "\n" + + "\n" + + "\n" + + "\n" + + "
\n" + + "
\n" + + "
\n" + + "
"); for (int i = 0; i < 50; ++i) { out.println("
"); // leave blank lines so scroll-to-top works. } @@ -499,24 +533,23 @@ public void run() throws IOException { } private void generateTest( - boolean shortVersion, String path, String outFilename, - String propertyName) throws IOException { - TreeMap rulesFound = new TreeMap<>(); + boolean shortVersion, String path, String outFilename, String propertyName) + throws IOException { + TreeMap rulesFound = new TreeMap<>(); final List testCases = new ArrayList(); // do main test final UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader( - path, - outFilename + (shortVersion ? "_SHORT" : "")). - setSkipCopyright(Settings.SKIP_COPYRIGHT); + path, outFilename + (shortVersion ? "_SHORT" : "")) + .setSkipCopyright(Settings.SKIP_COPYRIGHT); final PrintWriter out = fc.out; /* PrintWriter out = Utility.openPrintWriter("TR29/" + fileName + "BreakTest" - + (shortVersion ? "_SHORT" : "") - + "-" + ucd.getVersion() - + ".txt", Utility.UTF8_WINDOWS); - */ + + (shortVersion ? "_SHORT" : "") + + "-" + ucd.getVersion() + + ".txt", Utility.UTF8_WINDOWS); + */ int counter = 0; out.println("#"); @@ -578,13 +611,18 @@ private void generateTest( fc.close(); Set numbers = getMissing(fileName, rulesFound); if (!numbers.isEmpty()) { - //throw new IllegalArgumentException - System.err.println("***Rules missing from TESTS for " + fileName + ": " + numbers - + "You will need to add samples that trigger those rules. " - + "See https://sites.google.com/site/unicodetools/home/changing-ucd-properties#TOC-Adding-Segmentation-Sample-Strings"); + // throw new IllegalArgumentException + System.err.println( + "***Rules missing from TESTS for " + + fileName + + ": " + + numbers + + "You will need to add samples that trigger those rules. 
" + + "See https://sites.google.com/site/unicodetools/home/changing-ucd-properties#TOC-Adding-Segmentation-Sample-Strings"); } for (Entry entry : rulesFound.entrySet()) { - System.out.println("\"" + escaper.transform(entry.getValue()) + "\",\t\t//" + entry.getKey()); + System.out.println( + "\"" + escaper.transform(entry.getValue()) + "\",\t\t//" + entry.getKey()); } } @@ -601,13 +639,13 @@ private Set getMissing(String fileName, Map rulesFound) public void sampleDescription(PrintWriter out) {} - abstract public boolean isBreak(String source, int offset); + public abstract boolean isBreak(String source, int offset); - abstract public String fullBreakSample(); + public abstract String fullBreakSample(); - abstract public byte getType (int cp); + public abstract byte getType(int cp); - public byte getSampleType (int cp) { + public byte getSampleType(int cp) { return getType(cp); } @@ -619,7 +657,7 @@ public boolean highlightTableEntry(int x, int y, String s) { return false; } - abstract public String getTypeID(int s); + public abstract String getTypeID(int s); public String getTypeID(String s) { if (s == null) { @@ -693,9 +731,9 @@ String getInfo(String s) { result.append(", "); } result.append(ucd.getCodeAndName(cp)); - result.append(", gc=" + UCD.getCategoryID_fromIndex(ucd.getCategory(cp),SHORT)); - result.append(", sc=" + UCD.getScriptID_fromIndex(ucd.getScript(cp),SHORT)); - //result.append(", lb=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp)) + result.append(", gc=" + UCD.getCategoryID_fromIndex(ucd.getCategory(cp), SHORT)); + result.append(", sc=" + UCD.getScriptID_fromIndex(ucd.getScript(cp), SHORT)); + // result.append(", lb=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp)) // + "=" + ucd.getLineBreakID_fromIndex(ucd.getLineBreak(cp), LONG)); } return result.toString(); @@ -713,10 +751,17 @@ public void generateTable(PrintWriter out) { } final String h = getTypeID(after); - types += "" + h + ""; + types += + "" + + h + + ""; - - //codes += "" + 
Utility.hex(after) + ""; + // codes += "" + Utility.hex(after) + + // ""; } out.println("" + types + ""); @@ -726,7 +771,10 @@ public void generateTable(PrintWriter out) { final String[] rule2 = new String[1]; for (int type = 0; type < samples.size(); ++type) { if (type == tableLimit) { - out.println(" "); + out.println( + " "); } final String before = samples.get(type); if (before == null) { @@ -734,7 +782,12 @@ public void generateTable(PrintWriter out) { } final String h = getTypeID(before); - String line = "" + h + ""; + String line = + "" + + h + + ""; for (int type2 = 0; type2 < tableLimit; ++type2) { @@ -758,7 +811,14 @@ public void generateTable(PrintWriter out) { } else if (t.equals(NOBREAK)) { background = " bgcolor='#CCCCFF'"; } - line += "" + t + ""; + line += + "" + + t + + ""; } out.println(line + ""); } @@ -776,49 +836,64 @@ public void generateTable(PrintWriter out) { } // gather the data for the rules - if (needsFullBreakSample ) { + if (needsFullBreakSample) { collectingRules = true; isBreak(fullBreakSample(), 1); collectingRules = false; } out.println("

" + linkAndAnchor("rules", "Rules") + "

"); - out.print("

This section shows the rules. They are mechanically modified for programmatic generation of the tables and test code, and" - + " thus do not match the UAX rules precisely. " - + "In particular:

" - + "
    " - + "
  1. The rules are cast into a form that is more like regular expressions.
  2. " - + "
  3. The rules “sot " + (fileName.equals("Line") ? "×" : "÷") + "”, “÷ eot”, and “÷ Any” are added mechanically, and have artificial numbers.
  4. " - + "
  5. The rules are given decimal numbers using tenths, and are written without prefix. For example, "); - switch(fileName) { - case "Line": - out.print("rule LB21a is given the number 21.1"); break; - case "Grapheme": - out.print("rule GB9a is given the number 9.1"); break; - case "Word": - out.print("rule WB13a is given the number 13.1"); break; - case "Sentence": - out.print("rule SB8a is given the number 8.1"); break; - } - out.print(".
  6. " - + "
  7. Any “treat as” or “ignore” rules are handled as discussed in UAX #" - + (fileName.equals("Line") ? "14" : "29") - + ", and thus reflected in a transformation of the rules usually not visible here. "); + out.print( + "

    This section shows the rules. They are mechanically modified for programmatic generation of the tables and test code, and" + + " thus do not match the UAX rules precisely. " + + "In particular:

    " + + "
      " + + "
    1. The rules are cast into a form that is more like regular expressions.
    2. " + + "
    3. The rules “sot " + + (fileName.equals("Line") ? "×" : "÷") + + "”, “÷ eot”, and “÷ Any” are added mechanically, and have artificial numbers.
    4. " + + "
    5. The rules are given decimal numbers using tenths, and are written without prefix. For example, "); + switch (fileName) { + case "Line": + out.print("rule LB21a is given the number 21.1"); + break; + case "Grapheme": + out.print("rule GB9a is given the number 9.1"); + break; + case "Word": + out.print("rule WB13a is given the number 13.1"); + break; + case "Sentence": + out.print("rule SB8a is given the number 8.1"); + break; + } + out.print( + ".
    6. " + + "
    7. Any “treat as” or “ignore” rules are handled as discussed in UAX #" + + (fileName.equals("Line") ? "14" : "29") + + ", and thus reflected in a transformation of the rules usually not visible here. "); if (fileName.equals("Line")) { - out.print("Where it does show up, an extra variable like CM+ may appear, and the rule may be recast. "); + out.print( + "Where it does show up, an extra variable like CM+ may appear, and the rule may be recast. "); } - out.print("In addition, final rules like “Any ÷ Any” may be recast as the equivalent expression “÷ Any”.
    8. "); + out.print( + "In addition, final rules like “Any ÷ Any” may be recast as the equivalent expression “÷ Any”.
    9. "); if (fileName.equals("Line")) { - out.print("Where a rule has multiple parts (lines), each one is numbered using hundredths, " - + "such as 21.01) × BA, 21.02) × HY, ... "); - } - out.println("In some cases, the numbering and form of a rule is changed due to “treat as” rules.
    10. " - + "
    " + "

    For the original rules" - + (fileName.equals("Word") || fileName.equals("Sentence") ? " and the macro values they use" : "") - + ", see UAX #" - + (fileName.equals("Line") ? "14" : "29") - + ".

    "); - //out.println("
      "); + out.print( + "Where a rule has multiple parts (lines), each one is numbered using hundredths, " + + "such as 21.01) × BA, 21.02) × HY, ... "); + } + out.println( + "In some cases, the numbering and form of a rule is changed due to “treat as” rules." + + "
" + + "

For the original rules" + + (fileName.equals("Word") || fileName.equals("Sentence") + ? " and the macro values they use" + : "") + + ", see UAX #" + + (fileName.equals("Line") ? "14" : "29") + + ".

"); + // out.println("
    "); out.println(""); // same pattern, but require _ at the end. final Matcher identifierMatcher = Segmenter.IDENTIFIER_PATTERN.matcher(""); @@ -838,49 +913,67 @@ public void generateTable(PrintWriter out) { if (replacement == null) { throw new IllegalArgumentException("Can't find variable: " + variable); } - final String prefix = ruleString.substring(0,identifierMatcher.start()); + final String prefix = ruleString.substring(0, identifierMatcher.start()); final String suffix = ruleString.substring(identifierMatcher.end()); if (DEBUG_RULE_REPLACEMENT) { - System.out.println("Replacing " + prefix + "$$" + variable + "$$" + suffix + "\t by \t" + replacement); + System.out.println( + "Replacing " + + prefix + + "$$" + + variable + + "$$" + + suffix + + "\t by \t" + + replacement); } ruleString = prefix + replacement + suffix; pos = identifierMatcher.start() + replacement.length(); } - String cleanRule = ruleString.replaceAll("[$]",""); - if (!isBreak("a",0)) { + String cleanRule = ruleString.replaceAll("[$]", ""); + if (!isBreak("a", 0)) { cleanRule = cleanRule.replace("sot ÷", "sot ×"); } final int parenPos = cleanRule.indexOf(')'); - final String ruleNumber = cleanRule.substring(0,parenPos); - final String ruleBody = cleanRule.substring(parenPos+1).trim(); + final String ruleNumber = cleanRule.substring(0, parenPos); + final String ruleBody = cleanRule.substring(parenPos + 1).trim(); int breakPoint = ruleBody.indexOf('×'); if (breakPoint < 0) { breakPoint = ruleBody.indexOf('÷'); } - out.println("" + - ""); - //out.println("
  • " + cleanRule + "
  • "); + out.println( + "" + + ""); + // out.println("
  • " + cleanRule + "
  • "); } out.println("
    " + linkAndAnchor("r" + ruleNumber, ruleNumber) + "" + ruleBody.substring(0,breakPoint) - + "" + ruleBody.substring(breakPoint, breakPoint+1) - + "" + ruleBody.substring(breakPoint+1) - + "
    " + + linkAndAnchor("r" + ruleNumber, ruleNumber) + + "" + + ruleBody.substring(0, breakPoint) + + "" + + ruleBody.substring(breakPoint, breakPoint + 1) + + "" + + ruleBody.substring(breakPoint + 1) + + "
    "); - //out.println("
"); + // out.println(""); Map rulesFound = new TreeMap<>(); if (extraSingleSamples.size() > 0) { out.println("

" + linkAndAnchor("samples", "Sample Strings") + "

"); - out.println("

" + - "The following samples illustrate the application of the rules. " + - "The blue lines indicate possible break points. " + - "If your browser supports titles (tooltips), then positioning the mouse over each character will show its name, " + - "while positioning between characters shows the number of the rule responsible for the break-status." + - "

"); + out.println( + "

" + + "The following samples illustrate the application of the rules. " + + "The blue lines indicate possible break points. " + + "If your browser supports titles (tooltips), then positioning the mouse over each character will show its name, " + + "while positioning between characters shows the number of the rule responsible for the break-status." + + "

"); out.println(""); for (int ii = 0; ii < extraSingleSamples.size(); ++ii) { - final String ruleNumber = String.valueOf(ii+1); - out.println(""); } @@ -897,7 +990,7 @@ Set getRuleNumbers() { for (int ii = 0; ii < ruleListCount; ++ii) { String ruleString = ruleList[ii]; final int parenPos = ruleString.indexOf(')'); - final String ruleNumber = ruleString.substring(0,parenPos); + final String ruleNumber = ruleString.substring(0, parenPos); results.add(Double.parseDouble(ruleNumber)); } return results; @@ -910,7 +1003,12 @@ public String linkAndAnchor(String anchor, String text) { static final String BREAK = "\u00F7"; static final String NOBREAK = "\u00D7"; - public void printLine(PrintWriter out, String source, boolean comments, boolean html, Map rulesFound) { + public void printLine( + PrintWriter out, + String source, + boolean comments, + boolean html, + Map rulesFound) { int cp; final StringBuffer string = new StringBuffer(); final StringBuffer comment = new StringBuffer("\t# "); @@ -920,7 +1018,12 @@ public void printLine(PrintWriter out, String source, boolean comments, boolean String status; if (html) { status = hasBreak ? " style='border-right: 1px solid blue'" : ""; - string.append("  "); + string.append( + "  "); } else { status = hasBreak ? BREAK : NOBREAK; string.append(status); @@ -935,12 +1038,19 @@ public void printLine(PrintWriter out, String source, boolean comments, boolean if (html) { status = hasBreak ? " style='border-right: 1px solid blue'" : ""; - string.append("" - + Utility.quoteXML(UtilityBase.getDisplay(cp), true) - + ""); - string.append("  "); + string.append( + "" + + Utility.quoteXML(UtilityBase.getDisplay(cp), true) + + ""); + string.append( + "  "); } else { if (string.length() > 0) { string.append(' '); @@ -980,20 +1090,21 @@ private void addToRules(Map rulesFound, String source, boolean h public void findSamples() { - // what we want is a list of sample characters. In the simple case, this is just one per type. 
- // However, if there are characters that have different types (when recommended or not), then + // what we want is a list of sample characters. In the simple case, this is just one per + // type. + // However, if there are characters that have different types (when recommended or not), + // then // we want a type for each cross-section /** - * Set of lb values that already have sample characters. - * Faster to test than Map.contains(lb) because - * it avoids creating an Integer object for each code point. + * Set of lb values that already have sample characters. Faster to test than + * Map.contains(lb) because it avoids creating an Integer object for each code point. */ final BitSet bitset = new BitSet(); /** - * Maps lb values to sample characters. - * We do not really need this map -- we could add each sample character directly to samples -- - * but adding them in sorted order by lb value stabilizes the output. + * Maps lb values to sample characters. We do not really need this map -- we could add each + * sample character directly to samples -- but adding them in sorted order by lb value + * stabilizes the output. 
*/ final Map lbToSampleChar = new TreeMap(); @@ -1041,7 +1152,7 @@ public void findSamples() { public int findLastNon(String source, int offset, byte notLBType) { int cp; - for (int i = offset-1; i >= 0; i -= UTF16.getCharCount(cp)) { + for (int i = offset - 1; i >= 0; i -= UTF16.getCharCount(cp)) { cp = UTF16.charAt(source, i); final byte f = getResolvedType(cp); if (f != notLBType) { @@ -1055,25 +1166,45 @@ public static UnicodeSet getSet(UCD ucd, int prop, byte propValue) { return UnifiedBinaryProperty.make(prop | propValue, ucd).getSet(); } - static public class Context { + public static class Context { public int cpBefore2, cpBefore, cpAfter, cpAfter2; public byte tBefore2, tBefore, tAfter, tAfter2; + @Override public String toString() { return "[" - + Utility.hex(cpBefore2) + "(" + tBefore2 + "), " - + Utility.hex(cpBefore) + "(" + tBefore + "), " - + Utility.hex(cpAfter) + "(" + tAfter + "), " - + Utility.hex(cpAfter2) + "(" + tAfter2 + ")]"; + + Utility.hex(cpBefore2) + + "(" + + tBefore2 + + "), " + + Utility.hex(cpBefore) + + "(" + + tBefore + + "), " + + Utility.hex(cpAfter) + + "(" + + tAfter + + "), " + + Utility.hex(cpAfter2) + + "(" + + tAfter2 + + ")]"; } } - public void getGraphemeBases(MyBreakIterator graphemeIterator, String source, int offset, int ignoreType, Context context) { + public void getGraphemeBases( + MyBreakIterator graphemeIterator, + String source, + int offset, + int ignoreType, + Context context) { context.cpBefore2 = context.cpBefore = context.cpAfter = context.cpAfter2 = -1; context.tBefore2 = context.tBefore = context.tAfter = context.tAfter2 = -1; - //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(source) + "; " + offset + "; " + ignoreType); + // if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(source) + "; " + offset + "; " + + // ignoreType); - //MyBreakIterator graphemeIterator = new MyBreakIterator(new GenerateGraphemeBreakTest(ucd)); + // MyBreakIterator graphemeIterator = new MyBreakIterator(new + // 
GenerateGraphemeBreakTest(ucd)); graphemeIterator.set(source, offset); while (true) { @@ -1117,31 +1248,36 @@ public void getGraphemeBases(MyBreakIterator graphemeIterator, String source, in } } - - //============================================== + // ============================================== static class XGenerateBreakTest extends GenerateBreakTest { String sample; + { needsFullBreakSample = false; } - public XGenerateBreakTest(UCD ucd, Segmenter.Builder segBuilder, String sample, String filename, - String[] extraSamples, String[] extraSingleSamples) { + public XGenerateBreakTest( + UCD ucd, + Segmenter.Builder segBuilder, + String sample, + String filename, + String[] extraSamples, + String[] extraSingleSamples) { super(ucd, segBuilder.make()); this.sample = sample; final List rules = segBuilder.getRules(); collectingRules = true; - for (final Iterator it = rules.iterator(); it.hasNext();) { + for (final Iterator it = rules.iterator(); it.hasNext(); ) { final String rule = it.next(); setRule(rule); } variables = segBuilder.getOriginalVariables(); collectingRules = false; - map.add("Other", new UnicodeSet(0,0x10FFFF)); + map.add("Other", new UnicodeSet(0, 0x10FFFF)); final UnicodeMap segSamples = seg.getSamples(); final Collection x = segSamples.getAvailableValues(); - for (final Iterator it = x.iterator(); it.hasNext();) { + for (final Iterator it = x.iterator(); it.hasNext(); ) { final String label = it.next(); UnicodeSet values = segSamples.keySet(label); if (label.contains("ExtPict")) { // hack to use reasonable values @@ -1152,17 +1288,12 @@ public XGenerateBreakTest(UCD ucd, Segmenter.Builder segBuilder, String sample, map.add(label, values, true, false); } fileName = filename; - propertyName = (filename.equals("Grapheme") ? "Grapheme_Cluster" : fileName) - + "_Break"; + propertyName = (filename.equals("Grapheme") ? 
"Grapheme_Cluster" : fileName) + "_Break"; sampleMap = map; this.extraSamples.addAll(Arrays.asList(extraSamples)); - this.extraSingleSamples.addAll(Arrays.asList( - "\r\na\n\u0308", - "a\u0308", - " \u200D\u0646", - "\u0646\u200D " - )); + this.extraSingleSamples.addAll( + Arrays.asList("\r\na\n\u0308", "a\u0308", " \u200D\u0646", "\u0646\u200D ")); this.extraSingleSamples.addAll(Arrays.asList(extraSingleSamples)); } @@ -1193,358 +1324,368 @@ public byte getType(int cp) { static class Sampler { final UnicodeProperty prop; + Sampler(String propName) { prop = unicodePropertySource.getProperty(propName); } + String get(String value) { - return get(value,1); + return get(value, 1); } + String get(String value, int count) { for (String s : prop.getSet(value)) { if (--count == 0) { return s; } } - throw new IllegalArgumentException(prop.getName() + ":" + value - + " doesn't have " + count + " values"); + throw new IllegalArgumentException( + prop.getName() + ":" + value + " doesn't have " + count + " values"); } } static class GenerateGraphemeBreakTest extends XGenerateBreakTest { public GenerateGraphemeBreakTest(UCD ucd, Segmenter.Target target) { - super(ucd, + super( + ucd, Segmenter.make( ToolUnicodePropertySource.make(ucd.getVersion()), - "GraphemeClusterBreak", target), + "GraphemeClusterBreak", + target), "aa", "Grapheme", - new String[]{unicodePropertySource.getSet("GC=Cn").iterator().next()}, - new String[]{}); + new String[] {unicodePropertySource.getSet("GC=Cn").iterator().next()}, + new String[] {}); System.out.println("Target: " + seg.target); Sampler GCB = new Sampler("GCB"); - this.extraSingleSamples.addAll(Arrays.asList( - GCB.get("L") + GCB.get("L"), - GCB.get("LV") + GCB.get("T") + GCB.get("L"), - GCB.get("LVT") + GCB.get("T") + GCB.get("L"), - GCB.get("RI") + GCB.get("RI",2) + GCB.get("RI",3) + "b", - "a" + GCB.get("RI") + GCB.get("RI",2) + GCB.get("RI",3) + "b", - "a" + GCB.get("RI") + GCB.get("RI",2) + zwj + GCB.get("RI",3) + "b", - "a" + 
GCB.get("RI") + zwj + GCB.get("RI",2) + GCB.get("RI",3) + "b", - "a" + GCB.get("RI") + GCB.get("RI",2) + GCB.get("RI",3) + GCB.get("RI",4) + "b", - "a" + zwj, - "a" + "\u0308" + "b", - "a" + GCB.get("SpacingMark") + "b", - "a" + GCB.get("Prepend") + "b", - - //"a" + GCB.get("LinkingConsonant") + GCB.get("Virama") + GCB.get("LinkingConsonant") + "b", - //"a" + GCB.get("Virama") + GCB.get("LinkingConsonant") + "b", - //"a" + zwj + GCB.get("LinkingConsonant") + "b", - - //GCB.get("E_Base") + GCB.get("E_Modifier") + GCB.get("E_Base"), - //"a" + GCB.get("E_Modifier") + GCB.get("E_Base"), - - sampleEBase + sampleEMod + sampleEBase, - "a" + sampleEMod + sampleEBase, - "a" + sampleEMod + sampleEBase + zwj + sampleEmoji, - - sampleEBase + sampleEMod + sampleMn + zwj + sampleEBase + sampleEMod, - - sampleEmoji + zwj + sampleEmoji, - "a" + zwj + sampleEmoji, - sampleEXP + zwj + sampleEXP, - "a" + zwj + sampleEXP - - //zwj + GCB.get("EBG") + GCB.get("E_Modifier"), - //zwj + GCB.get("Glue_After_Zwj"), - //zwj + GCB.get("EBG"), - //GCB.get("EBG") + GCB.get("EBG") - )); + this.extraSingleSamples.addAll( + Arrays.asList( + GCB.get("L") + GCB.get("L"), + GCB.get("LV") + GCB.get("T") + GCB.get("L"), + GCB.get("LVT") + GCB.get("T") + GCB.get("L"), + GCB.get("RI") + GCB.get("RI", 2) + GCB.get("RI", 3) + "b", + "a" + GCB.get("RI") + GCB.get("RI", 2) + GCB.get("RI", 3) + "b", + "a" + GCB.get("RI") + GCB.get("RI", 2) + zwj + GCB.get("RI", 3) + "b", + "a" + GCB.get("RI") + zwj + GCB.get("RI", 2) + GCB.get("RI", 3) + "b", + "a" + + GCB.get("RI") + + GCB.get("RI", 2) + + GCB.get("RI", 3) + + GCB.get("RI", 4) + + "b", + "a" + zwj, + "a" + "\u0308" + "b", + "a" + GCB.get("SpacingMark") + "b", + "a" + GCB.get("Prepend") + "b", + + // "a" + GCB.get("LinkingConsonant") + GCB.get("Virama") + + // GCB.get("LinkingConsonant") + "b", + // "a" + GCB.get("Virama") + GCB.get("LinkingConsonant") + "b", + // "a" + zwj + GCB.get("LinkingConsonant") + "b", + + // GCB.get("E_Base") + GCB.get("E_Modifier") + 
GCB.get("E_Base"), + // "a" + GCB.get("E_Modifier") + GCB.get("E_Base"), + + sampleEBase + sampleEMod + sampleEBase, + "a" + sampleEMod + sampleEBase, + "a" + sampleEMod + sampleEBase + zwj + sampleEmoji, + sampleEBase + sampleEMod + sampleMn + zwj + sampleEBase + sampleEMod, + sampleEmoji + zwj + sampleEmoji, + "a" + zwj + sampleEmoji, + sampleEXP + zwj + sampleEXP, + "a" + zwj + sampleEXP + + // zwj + GCB.get("EBG") + GCB.get("E_Modifier"), + // zwj + GCB.get("Glue_After_Zwj"), + // zwj + GCB.get("EBG"), + // GCB.get("EBG") + GCB.get("EBG") + )); if (seg.target == Segmenter.Target.FOR_CLDR) { - this.extraSingleSamples.addAll(Arrays.asList( - "क" + "त", - "क" + "\u094D" + "त", - "क" + "\u094D" + "\u094D" + "त", - "क" + "\u094D" + zwj + "त", - "क" + "\u093C" + zwj + "\u094D" + "त", - "क" + "\u093C" + "\u094D" + zwj + "त", - "क" + "\u094D" + "त" + '\u094D' + "य", - "क" + "\u094D" + "a", - "a" + "\u094D" + "त", - "?" + "\u094D" + "त" - )); + this.extraSingleSamples.addAll( + Arrays.asList( + "क" + "त", + "क" + "\u094D" + "त", + "क" + "\u094D" + "\u094D" + "त", + "क" + "\u094D" + zwj + "त", + "क" + "\u093C" + zwj + "\u094D" + "त", + "क" + "\u093C" + "\u094D" + zwj + "त", + "क" + "\u094D" + "त" + '\u094D' + "य", + "क" + "\u094D" + "a", + "a" + "\u094D" + "त", + "?" 
+ "\u094D" + "त")); } } } static class GenerateLineBreakTest extends XGenerateBreakTest { public GenerateLineBreakTest(UCD ucd, Segmenter.Target target) { - super(ucd, + super( + ucd, Segmenter.make( - ToolUnicodePropertySource.make(ucd.getVersion()), - "LineBreak", target), - "aa", "Line", + ToolUnicodePropertySource.make(ucd.getVersion()), "LineBreak", target), + "aa", + "Line", // extraSamples - new String[]{}, + new String[] {}, // extraSingleSamples - new String[]{ - "\u000Bぁ", //4.0 - "\rぁ", //5.02 - "\u0085ぁ", //5.04 - "\u200D☝", //8.1 - "ぁ\u2060", //11.01 - "\u2060ぁ", //11.02 - "ぁ̈ ", //12.2 - "\u200D ", //12.3 - "\u200D/", //13.04 - "——", //17.0 - "ぁ", //20.01 - "ぁ", //20.02 - "ぁ-", //21.02 - "ก․", //22.01 - "!․", //22.02 - "․․", //22.04 - "0․", //22.05 - "☝%", //23.01 - "ก0", //23.02 - "$☝", //24.01 - "$ก", //24.02 - "%ก", //24.03 - "ᄀ\u1160", //26.01 - "\u1160\u1160", //26.02 - "ᆨᆨ", //26.03 - "\u1160․", //27.01 - "\u1160%", //27.02 - "$\u1160", //27.03 - "☝🏻", //30.2 - "final", //999.0 - - "can't", - "can\u2019t", - "'can' not", - "can 'not'", - "bug(s) ", - "bug(s)\u00a0 ", - "..ます。XMLの..", - "ab\u00ADby", - "-3", - "e.g.", - "\u4e00.\u4e00.", - "a b", - "a \u200bb", - "a \u0308b", - "1\u0308b(a)-(b)", - "give book(s).", - "ま(す)", - "find .com", - "equals .35 cents", - "(s)he", - "{s}he", - "ˈsIləb(ə)l", - "ˈsIləb{ə}l", - "code(s).", - "code(s.)", - "code(s)!", - "code(s!)", - "code\\(s\\)", - "code( s )", - "code{s}", - "code{s}.", - "code{s}!", - "code\\{s\\}", - "code{ s }", - "cod(e)…(s)", - "(cod(e)…)s", - "cod{e}…{s}", - "{cod{e}…}s", - "(con-)lang", - "(con\u00AD)lang", - "(con‑)lang", - "(con)-lang", - "(con)\u00ADlang", - "(con)‑lang", - "{con-}lang", - "{con\u00AD}lang", - "{con‑}lang", - "{con}-lang", - "{con}\u00ADlang", - "{con}‑lang", - "cre\u0301(e\u0301)(e)", - "cre\u0301[er|e\u0301(e)(s)]", - "cre\u0301{er|e\u0301(e)(s)}", - "ambigu(̈)(e\u0308)", - "ambigu(«̈»)(e\u0308)", - "ambigu(« ̈ »)(e\u0308)", - "ambigu« ( ̈ ) »(e\u0308)", - 
"ambigu«\u202F( ̈ )\u202F»(e\u0308)", - "ambigu{̈}(e\u0308)", - "ambigu{«̈»}(e\u0308)", - "ambigu{« ̈ »}(e\u0308)", - "ambigu« { ̈ } »(e\u0308)", - "ambigu«\u202F{ ̈ }\u202F»(e\u0308)", - "(czerwono\u00AD‑)niebieska", - "(czerwono\u00AD)‑niebieska", - "(czerwono)\u00AD‑niebieska", - "{czerwono\u00AD‑}niebieska", - "{czerwono\u00AD}‑niebieska", - "{czerwono}\u00AD‑niebieska", - "operator[](0);", - "operator[](){}", - "本(を)読む", - "本(「を」)読む", - "本「(を)」読む", - "本{を}読む", - "本{「を」}読む", - "本[(を)]読む", - "(ニュー・)ヨーク", - "(ニュー)・ヨーク", - "{ニュー・}ヨーク", - "{ニュー}・ヨーク", - "(ᡐᡆᡑᡆ᠆)ᠪᠢᠴᠢᠭ\u180C", - "(ᡐᡆᡑᡆ)᠆ᠪᠢᠴᠢᠭ\u180C", - "{ᡐᡆᡑᡆ᠆}ᠪᠢᠴᠢᠭ\u180C", - "{ᡐᡆᡑᡆ}᠆ᠪᠢᠴᠢᠭ\u180C", - "(http://)xn--a", - "{http://}xn--a", - "(0,1)+(2,3)⊕(−4,5)⊖(6,7)", - "{0,1}+{2,3}⊕{−4,5}⊖{6,7}", - "ab", - "ab ", - "ab c", - "aま", - "हिन्दी ", - "यसगुचितीयसा ", - "印本", - "読む", - "入力しエ", - "位。記", - "本。", - "険」の", - "しょう", - "まa本", - "없어요 or 못", - "まab ", - "で使", - "する", - "のパン", - "う え お」", - "る 은영 に", - "しょう。", - "ムの一", - "フリ", - "フリー百", - "ピュータで使用する", - "ターキーを押", - "ション", - "a.2 ", - "a.2 क", - "a.2 本", - "a.2 本", - "a.2 ま", - "a.2 3", - "ab. 2", - "A.1 못", - "봤어. A.2 볼", - "봐요. A.3 못", - "요. 
A.4 못", - "a.2 「", - "に「バ(ba)」や「ス", - "る「UKポンド」)、エ", - "は、「=rand()」と", - "で、「!」と", - "訳「す", - "て「봤어?」と", - "の「そ", - "は「エ", - "例:「あ い", - "く、「평양은", - "に「제목(題名)은", - "典『ウィキ", - "で『英語", - "(s) 本", - "(s) ま", - "(s) ク", - "る。dog(犬)を", - "本(ま", - "本 (a", - "点 [編集]", - "a(s) ", - "(ザ・クイック・ブ", - "p(クイック・ブ", - "ab(ク", - "(印本)", - "ス(い", - "ド(ポ", - "ド (質", - "s)」ま", - "a)』", - "る」)は", - "ド」)、エ", - "rk)」も", - "ク(ab cd)」も", - "ン・マーク(ex", - "マー(ma)」な", - "ガワ」。こ", - "ク」ま", - "ワ」。こ", - "ク」ま、本", - "ク」、ク", - "ディア(ab)』", - "쪽이에요?」と聞", - "名)은 알아요?」と", - "貨) - (po", - "量) 〜 (po", - "ド重) 〜 力・重", - "ab\"(ま", - "は \"s\" ", - "は、\"The ", - "dog\" を", - "90\" と", - "ス・オーバー・ザ・レ", - "ス・ジャン", - "ン・フォック", - "イジー・ドッグ、和", - "メーション・マーク", - "ン・ク(a", - "ション・マ", - "本: ", - "本: ク", - "出典: フリー百", - "後…に", - "しょう。。。", - "き、!!、!!!と", - "は、?と!を", - "た、⁉(!?)の", - "や、⁈(?!)の", - "た ‽と", - "せ!100%の完", - "23本", - "ァベット26字を", - "例:£23", - "記号 £。", - "れる。qu", - "ま。", - "ま。ab ", - "る。数", - "る。こ", - "い。パ", - "ガワ」。これ", - "語のioの、2字を", - "、和", - "、タ", - "、か", - "、これでは ", - "し、abと", - // U+1F1E6 = base RI - "a\uD83C\uDDE6b", - "\uD83C\uDDF7\uD83C\uDDFA", - "\uD83C\uDDF7\uD83C\uDDFA\uD83C\uDDF8", - "\uD83C\uDDF7\uD83C\uDDFA\uD83C\uDDF8\uD83C\uDDEA", - "\uD83C\uDDF7\uD83C\uDDFA\u200B\uD83C\uDDF8\uD83C\uDDEA", - "\u05D0-\u05D0", - }); + new String[] { + "\u000Bぁ", // 4.0 + "\rぁ", // 5.02 + "\u0085ぁ", // 5.04 + "\u200D☝", // 8.1 + "ぁ\u2060", // 11.01 + "\u2060ぁ", // 11.02 + "ぁ̈ ", // 12.2 + "\u200D ", // 12.3 + "\u200D/", // 13.04 + "——", // 17.0 + "ぁ", // 20.01 + "ぁ", // 20.02 + "ぁ-", // 21.02 + "ก․", // 22.01 + "!․", // 22.02 + "․․", // 22.04 + "0․", // 22.05 + "☝%", // 23.01 + "ก0", // 23.02 + "$☝", // 24.01 + "$ก", // 24.02 + "%ก", // 24.03 + "ᄀ\u1160", // 26.01 + "\u1160\u1160", // 26.02 + "ᆨᆨ", // 26.03 + "\u1160․", // 27.01 + "\u1160%", // 27.02 + "$\u1160", // 27.03 + "☝🏻", // 30.2 + "final", // 999.0 + "can't", + "can\u2019t", + "'can' not", + "can 'not'", + "bug(s) ", + "bug(s)\u00a0 ", + "..ます。XMLの..", + 
"ab\u00ADby", + "-3", + "e.g.", + "\u4e00.\u4e00.", + "a b", + "a \u200bb", + "a \u0308b", + "1\u0308b(a)-(b)", + "give book(s).", + "ま(す)", + "find .com", + "equals .35 cents", + "(s)he", + "{s}he", + "ˈsIləb(ə)l", + "ˈsIləb{ə}l", + "code(s).", + "code(s.)", + "code(s)!", + "code(s!)", + "code\\(s\\)", + "code( s )", + "code{s}", + "code{s}.", + "code{s}!", + "code\\{s\\}", + "code{ s }", + "cod(e)…(s)", + "(cod(e)…)s", + "cod{e}…{s}", + "{cod{e}…}s", + "(con-)lang", + "(con\u00AD)lang", + "(con‑)lang", + "(con)-lang", + "(con)\u00ADlang", + "(con)‑lang", + "{con-}lang", + "{con\u00AD}lang", + "{con‑}lang", + "{con}-lang", + "{con}\u00ADlang", + "{con}‑lang", + "cre\u0301(e\u0301)(e)", + "cre\u0301[er|e\u0301(e)(s)]", + "cre\u0301{er|e\u0301(e)(s)}", + "ambigu(̈)(e\u0308)", + "ambigu(«̈»)(e\u0308)", + "ambigu(« ̈ »)(e\u0308)", + "ambigu« ( ̈ ) »(e\u0308)", + "ambigu«\u202F( ̈ )\u202F»(e\u0308)", + "ambigu{̈}(e\u0308)", + "ambigu{«̈»}(e\u0308)", + "ambigu{« ̈ »}(e\u0308)", + "ambigu« { ̈ } »(e\u0308)", + "ambigu«\u202F{ ̈ }\u202F»(e\u0308)", + "(czerwono\u00AD‑)niebieska", + "(czerwono\u00AD)‑niebieska", + "(czerwono)\u00AD‑niebieska", + "{czerwono\u00AD‑}niebieska", + "{czerwono\u00AD}‑niebieska", + "{czerwono}\u00AD‑niebieska", + "operator[](0);", + "operator[](){}", + "本(を)読む", + "本(「を」)読む", + "本「(を)」読む", + "本{を}読む", + "本{「を」}読む", + "本[(を)]読む", + "(ニュー・)ヨーク", + "(ニュー)・ヨーク", + "{ニュー・}ヨーク", + "{ニュー}・ヨーク", + "(ᡐᡆᡑᡆ᠆)ᠪᠢᠴᠢᠭ\u180C", + "(ᡐᡆᡑᡆ)᠆ᠪᠢᠴᠢᠭ\u180C", + "{ᡐᡆᡑᡆ᠆}ᠪᠢᠴᠢᠭ\u180C", + "{ᡐᡆᡑᡆ}᠆ᠪᠢᠴᠢᠭ\u180C", + "(http://)xn--a", + "{http://}xn--a", + "(0,1)+(2,3)⊕(−4,5)⊖(6,7)", + "{0,1}+{2,3}⊕{−4,5}⊖{6,7}", + "ab", + "ab ", + "ab c", + "aま", + "हिन्दी ", + "यसगुचितीयसा ", + "印本", + "読む", + "入力しエ", + "位。記", + "本。", + "険」の", + "しょう", + "まa本", + "없어요 or 못", + "まab ", + "で使", + "する", + "のパン", + "う え お」", + "る 은영 に", + "しょう。", + "ムの一", + "フリ", + "フリー百", + "ピュータで使用する", + "ターキーを押", + "ション", + "a.2 ", + "a.2 क", + "a.2 本", + "a.2 本", + "a.2 ま", + "a.2 3", + "ab. 
2", + "A.1 못", + "봤어. A.2 볼", + "봐요. A.3 못", + "요. A.4 못", + "a.2 「", + "に「バ(ba)」や「ス", + "る「UKポンド」)、エ", + "は、「=rand()」と", + "で、「!」と", + "訳「す", + "て「봤어?」と", + "の「そ", + "は「エ", + "例:「あ い", + "く、「평양은", + "に「제목(題名)은", + "典『ウィキ", + "で『英語", + "(s) 本", + "(s) ま", + "(s) ク", + "る。dog(犬)を", + "本(ま", + "本 (a", + "点 [編集]", + "a(s) ", + "(ザ・クイック・ブ", + "p(クイック・ブ", + "ab(ク", + "(印本)", + "ス(い", + "ド(ポ", + "ド (質", + "s)」ま", + "a)』", + "る」)は", + "ド」)、エ", + "rk)」も", + "ク(ab cd)」も", + "ン・マーク(ex", + "マー(ma)」な", + "ガワ」。こ", + "ク」ま", + "ワ」。こ", + "ク」ま、本", + "ク」、ク", + "ディア(ab)』", + "쪽이에요?」と聞", + "名)은 알아요?」と", + "貨) - (po", + "量) 〜 (po", + "ド重) 〜 力・重", + "ab\"(ま", + "は \"s\" ", + "は、\"The ", + "dog\" を", + "90\" と", + "ス・オーバー・ザ・レ", + "ス・ジャン", + "ン・フォック", + "イジー・ドッグ、和", + "メーション・マーク", + "ン・ク(a", + "ション・マ", + "本: ", + "本: ク", + "出典: フリー百", + "後…に", + "しょう。。。", + "き、!!、!!!と", + "は、?と!を", + "た、⁉(!?)の", + "や、⁈(?!)の", + "た ‽と", + "せ!100%の完", + "23本", + "ァベット26字を", + "例:£23", + "記号 £。", + "れる。qu", + "ま。", + "ま。ab ", + "る。数", + "る。こ", + "い。パ", + "ガワ」。これ", + "語のioの、2字を", + "、和", + "、タ", + "、か", + "、これでは ", + "し、abと", + // U+1F1E6 = base RI + "a\uD83C\uDDE6b", + "\uD83C\uDDF7\uD83C\uDDFA", + "\uD83C\uDDF7\uD83C\uDDFA\uD83C\uDDF8", + "\uD83C\uDDF7\uD83C\uDDFA\uD83C\uDDF8\uD83C\uDDEA", + "\uD83C\uDDF7\uD83C\uDDFA\u200B\uD83C\uDDF8\uD83C\uDDEA", + "\u05D0-\u05D0", + }); // Additions for Unicode 14 LB30b [\p{Extended_Pictographic}&\p{Cn}] × EM ToolUnicodePropertySource propSource = ToolUnicodePropertySource.make(ucd.getVersion()); @@ -1558,18 +1699,21 @@ public GenerateLineBreakTest(UCD ucd, Segmenter.Target target) { UnicodeSet lb_EBase = propSource.getSet("lb=EB"); // [\p{Extended_Pictographic}-\p{Cn}-\p{lb=EB}] - UnicodeSet extPictAssigned = extPict.cloneAsThawed().removeAll(unassigned).removeAll(lb_EBase); + UnicodeSet extPictAssigned = + extPict.cloneAsThawed().removeAll(unassigned).removeAll(lb_EBase); String firstExtPictAssigned = UTF16.valueOf(extPictAssigned.charAt(0)); // 
[\p{Extended_Pictographic}-\p{Cn}-\p{lb=EB}] ÷ EM extraSingleSamples.add(firstExtPictAssigned + sampleEMod); } + @Override public boolean isBreak(String source, int offset) { return offset == 0 ? false : super.isBreak(source, offset); } + @Override public List genTestItems(String before, String after, List results) { - super.genTestItems(before,after,results); + super.genTestItems(before, after, results); results.add(before + " " + after); return results; } @@ -1577,15 +1721,21 @@ public List genTestItems(String before, String after, List resul static class GenerateSentenceBreakTest extends XGenerateBreakTest { public GenerateSentenceBreakTest(UCD ucd, Segmenter.Target target) { - super(ucd, - makeSegmenter(ucd, target), "aa", "Sentence", - new String[]{}, + super( + ucd, + makeSegmenter(ucd, target), + "aa", + "Sentence", + new String[] {}, getExtraSamples(ucd, target)); } + private static Builder makeSegmenter(UCD ucd, Segmenter.Target target) { - final Builder result = Segmenter.make( - ToolUnicodePropertySource.make(ucd.getVersion()), - "SentenceBreak", target); + final Builder result = + Segmenter.make( + ToolUnicodePropertySource.make(ucd.getVersion()), + "SentenceBreak", + target); final Segmenter segmenter = result.make(); final boolean failure = segmenter.breaksAt("etc.)\u2019 \u2018(the", 7); if (failure) { @@ -1593,38 +1743,41 @@ private static Builder makeSegmenter(UCD ucd, Segmenter.Target target) { } return result; } + static String[] getExtraSamples(UCD ucd, Segmenter.Target target) { final GenerateBreakTest grapheme = new GenerateGraphemeBreakTest(ucd, target); - String[] extraSingleSamples = new String[] { - "(\"Go.\") (He did.)", - "(\u201CGo?\u201D) (He did.)", - "U.S.A\u0300. is", - "U.S.A\u0300? 
He", - "U.S.A\u0300.", - "3.4", - "c.d", - "C.d", - "c.D", - "C.D", - "etc.)\u2019 the", - "etc.)\u2019 The", - "etc.)\u2019 \u2018(the", - "etc.)\u2019 \u2018(The", - "etc.)\u2019 \u0308the", - "etc.)\u2019 \u0308The", - "etc.)\u2019\u0308The", - "etc.)\n\u0308The", - "the resp. leaders are", - "\u5B57.\u5B57", - "etc.\u5B83", - "etc.\u3002", - "\u5B57\u3002\u5B83", - "!\u0020\u0020", - }; - final String[] temp = new String [extraSingleSamples.length * 2]; + String[] extraSingleSamples = + new String[] { + "(\"Go.\") (He did.)", + "(\u201CGo?\u201D) (He did.)", + "U.S.A\u0300. is", + "U.S.A\u0300? He", + "U.S.A\u0300.", + "3.4", + "c.d", + "C.d", + "c.D", + "C.D", + "etc.)\u2019 the", + "etc.)\u2019 The", + "etc.)\u2019 \u2018(the", + "etc.)\u2019 \u2018(The", + "etc.)\u2019 \u0308the", + "etc.)\u2019 \u0308The", + "etc.)\u2019\u0308The", + "etc.)\n\u0308The", + "the resp. leaders are", + "\u5B57.\u5B57", + "etc.\u5B83", + "etc.\u3002", + "\u5B57\u3002\u5B83", + "!\u0020\u0020", + }; + final String[] temp = new String[extraSingleSamples.length * 2]; System.arraycopy(extraSingleSamples, 0, temp, 0, extraSingleSamples.length); for (int i = 0; i < extraSingleSamples.length; ++i) { - temp[i+extraSingleSamples.length] = insertEverywhere(extraSingleSamples[i], "\u2060", grapheme); + temp[i + extraSingleSamples.length] = + insertEverywhere(extraSingleSamples[i], "\u2060", grapheme); } extraSingleSamples = temp; return extraSingleSamples; @@ -1633,67 +1786,86 @@ static String[] getExtraSamples(UCD ucd, Segmenter.Target target) { static class GenerateWordBreakTest extends XGenerateBreakTest { public GenerateWordBreakTest(UCD ucd, Segmenter.Target target) { - super(ucd, + super( + ucd, Segmenter.make( - ToolUnicodePropertySource.make(ucd.getVersion()), - "WordBreak", target), + ToolUnicodePropertySource.make(ucd.getVersion()), "WordBreak", target), "aa", "Word", new String[] { - /*"\uFF70", "\uFF65", "\u30FD", */ "a\u2060", - "a:", - "a'", - "a'\u2060", - "a,", - "1:", - 
"1'", - "1,", - "1.\u2060", + /*"\uFF70", "\uFF65", "\u30FD", */ "a\u2060", + "a:", + "a'", + "a'\u2060", + "a,", + "1:", + "1'", + "1,", + "1.\u2060", }, - new String[]{} - ); + new String[] {}); System.out.println(); Sampler WB = new Sampler("WB"); - this.extraSingleSamples.addAll(Arrays.asList( - WB.get("ALetter") + WB.get("ALetter") + WB.get("ALetter"), - WB.get("ALetter") + WB.get("MidLetter") + WB.get("ALetter"), - WB.get("ALetter") + WB.get("MidLetter") + WB.get("MidLetter") + WB.get("ALetter"), - WB.get("Hebrew_Letter") + WB.get("Single_Quote"), - WB.get("Hebrew_Letter") + WB.get("Double_Quote") + WB.get("Hebrew_Letter"), - WB.get("ALetter") + WB.get("Numeric") + WB.get("Numeric") + WB.get("ALetter"), - WB.get("Numeric") + WB.get("MidNum") + WB.get("Numeric"), - WB.get("Numeric") + WB.get("MidNum") + WB.get("MidNum") + WB.get("Numeric"), - WB.get("Katakana") + WB.get("Katakana"), - WB.get("ALetter") + WB.get("ExtendNumLet") - + WB.get("Numeric") + WB.get("ExtendNumLet") - + WB.get("Katakana") + WB.get("ExtendNumLet"), - WB.get("ALetter") + WB.get("ExtendNumLet") + WB.get("ExtendNumLet") + WB.get("ALetter"), - WB.get("RI") + WB.get("RI",2) + WB.get("RI",3) + "b", - "a" + WB.get("RI") + WB.get("RI",2) + WB.get("RI",3) + "b", - "a" + WB.get("RI") + WB.get("RI",2) + zwj + WB.get("RI",3) + "b", - "a" + WB.get("RI") + zwj + WB.get("RI",2) + WB.get("RI",3) + "b", - "a" + WB.get("RI") + WB.get("RI",2) + WB.get("RI",3) + WB.get("RI",4) + "b", - - sampleEBase + sampleEMod + sampleEBase, - - sampleEmoji + zwj + sampleEmoji, - "a" + zwj + sampleEmoji, - sampleEXP + zwj + sampleEXP, - "a" + zwj + sampleEXP, - - sampleEBase + sampleEMod + sampleMn + zwj + sampleEBase + sampleEMod, - - sampleEmoji + sampleEMod, - zwj + sampleEmoji + sampleEMod, - zwj + sampleEmoji, - zwj + sampleEmoji, - sampleEmoji + sampleEmoji, - "a" + sampleMn + zwj + sampleMn + "b", - "a b" - )); - - // 1. 
÷ (Numeric|ALetter) ÷ (MidLetter|MidNum|MidNumLet) ÷ (MidLetter|MidNum|MidNumLet) ÷ (Numeric|ALetter) ÷ - // 2. ÷ (Numeric|ALetter) × ExtendNumLet × (Numeric|ALetter) ÷ (MidLetter|MidNum|MidNumLet) ÷ (MidLetter|MidNum|MidNumLet) ÷ (Numeric|ALetter) ÷ + this.extraSingleSamples.addAll( + Arrays.asList( + WB.get("ALetter") + WB.get("ALetter") + WB.get("ALetter"), + WB.get("ALetter") + WB.get("MidLetter") + WB.get("ALetter"), + WB.get("ALetter") + + WB.get("MidLetter") + + WB.get("MidLetter") + + WB.get("ALetter"), + WB.get("Hebrew_Letter") + WB.get("Single_Quote"), + WB.get("Hebrew_Letter") + + WB.get("Double_Quote") + + WB.get("Hebrew_Letter"), + WB.get("ALetter") + + WB.get("Numeric") + + WB.get("Numeric") + + WB.get("ALetter"), + WB.get("Numeric") + WB.get("MidNum") + WB.get("Numeric"), + WB.get("Numeric") + + WB.get("MidNum") + + WB.get("MidNum") + + WB.get("Numeric"), + WB.get("Katakana") + WB.get("Katakana"), + WB.get("ALetter") + + WB.get("ExtendNumLet") + + WB.get("Numeric") + + WB.get("ExtendNumLet") + + WB.get("Katakana") + + WB.get("ExtendNumLet"), + WB.get("ALetter") + + WB.get("ExtendNumLet") + + WB.get("ExtendNumLet") + + WB.get("ALetter"), + WB.get("RI") + WB.get("RI", 2) + WB.get("RI", 3) + "b", + "a" + WB.get("RI") + WB.get("RI", 2) + WB.get("RI", 3) + "b", + "a" + WB.get("RI") + WB.get("RI", 2) + zwj + WB.get("RI", 3) + "b", + "a" + WB.get("RI") + zwj + WB.get("RI", 2) + WB.get("RI", 3) + "b", + "a" + + WB.get("RI") + + WB.get("RI", 2) + + WB.get("RI", 3) + + WB.get("RI", 4) + + "b", + sampleEBase + sampleEMod + sampleEBase, + sampleEmoji + zwj + sampleEmoji, + "a" + zwj + sampleEmoji, + sampleEXP + zwj + sampleEXP, + "a" + zwj + sampleEXP, + sampleEBase + sampleEMod + sampleMn + zwj + sampleEBase + sampleEMod, + sampleEmoji + sampleEMod, + zwj + sampleEmoji + sampleEMod, + zwj + sampleEmoji, + zwj + sampleEmoji, + sampleEmoji + sampleEmoji, + "a" + sampleMn + zwj + sampleMn + "b", + "a b")); + + // 1. 
÷ (Numeric|ALetter) ÷ (MidLetter|MidNum|MidNumLet) ÷ (MidLetter|MidNum|MidNumLet) + // ÷ (Numeric|ALetter) ÷ + // 2. ÷ (Numeric|ALetter) × ExtendNumLet × (Numeric|ALetter) ÷ + // (MidLetter|MidNum|MidNumLet) ÷ (MidLetter|MidNum|MidNumLet) ÷ (Numeric|ALetter) ÷ for (String numLet : Arrays.asList("1", "a")) { for (String mid : Arrays.asList(":", ".", ",")) { for (String mid2 : Arrays.asList(":", ".", ",")) { @@ -1707,683 +1879,720 @@ public GenerateWordBreakTest(UCD ucd, Segmenter.Target target) { } } } + static String[] getExtraSamples(UCD ucd, Segmenter.Target target) { final GenerateBreakTest grapheme = new GenerateGraphemeBreakTest(ucd, target); - final String [] temp = { - "can't", - "can\u2019t", - "ab\u00ADby", - "a$-34,567.14%b", - "3a", - "c.d", - "C.d", - "c.D", - "C.D", + final String[] temp = { + "can't", + "can\u2019t", + "ab\u00ADby", + "a$-34,567.14%b", + "3a", + "c.d", + "C.d", + "c.D", + "C.D", }; - final String[] extraSingleSamples = new String [temp.length * 2]; + final String[] extraSingleSamples = new String[temp.length * 2]; System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length); for (int i = 0; i < temp.length; ++i) { - extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme); + extraSingleSamples[i + temp.length] = insertEverywhere(temp[i], "\u2060", grapheme); } return extraSingleSamples; } } - //static class OLDGenerateGraphemeBreakTest extends GenerateBreakTest { + // static class OLDGenerateGraphemeBreakTest extends GenerateBreakTest { - //OLDGenerateGraphemeBreakTest(UCD ucd) { - //super(ucd); - //fileName = "Grapheme"; - //sampleMap = map; - //} + // OLDGenerateGraphemeBreakTest(UCD ucd) { + // super(ucd); + // fileName = "Grapheme"; + // sampleMap = map; + // } - //Object foo = prop = unicodePropertySource.getProperty("Grapheme_Cluster_Break"); + // Object foo = prop = unicodePropertySource.getProperty("Grapheme_Cluster_Break"); - //final int - //CR = addToMap("CR"), - //LF = addToMap("LF"), - //Control = 
addToMap("Control"), - //Extend = addToMap("Extend"), - //L = addToMap("L"), - //V = addToMap("V"), - //T = addToMap("T"), - //LV = addToMap("LV"), - //LVT = addToMap("LVT"), - //Other = addToMapLast("Other"); + // final int + // CR = addToMap("CR"), + // LF = addToMap("LF"), + // Control = addToMap("Control"), + // Extend = addToMap("Extend"), + // L = addToMap("L"), + // V = addToMap("V"), + // T = addToMap("T"), + // LV = addToMap("LV"), + // LVT = addToMap("LVT"), + // Other = addToMapLast("Other"); //// stuff that subclasses need to override - //public String getTypeID(int cp) { - //return map.getLabel(cp); - //} + // public String getTypeID(int cp) { + // return map.getLabel(cp); + // } //// stuff that subclasses need to override - //public byte getType(int cp) { - //return (byte) map.getIndex(cp); - //} + // public byte getType(int cp) { + // return (byte) map.getIndex(cp); + // } - //public String fullBreakSample() { - //return "aa"; - //} + // public String fullBreakSample() { + // return "aa"; + // } - //public boolean isBreak(String source, int offset) { + // public boolean isBreak(String source, int offset) { - //setRule("1: sot ÷"); - //if (offset < 0 || offset > source.length()) return false; - //if (offset == 0) return true; + // setRule("1: sot ÷"); + // if (offset < 0 || offset > source.length()) return false; + // if (offset == 0) return true; - //setRule("2: ÷ eot"); - //if (offset == source.length()) return true; + // setRule("2: ÷ eot"); + // if (offset == source.length()) return true; //// UTF-16: never break in the middle of a code point - //if (!onCodepointBoundary(source, offset)) return false; + // if (!onCodepointBoundary(source, offset)) return false; //// now get the character before and after, and their types + // int cpBefore = UTF16.charAt(source, offset-1); + // int cpAfter = UTF16.charAt(source, offset); - //int cpBefore = UTF16.charAt(source, offset-1); - //int cpAfter = UTF16.charAt(source, offset); + // byte before = 
getResolvedType(cpBefore); + // byte after = getResolvedType(cpAfter); - //byte before = getResolvedType(cpBefore); - //byte after = getResolvedType(cpAfter); + // setRule("3: CR × LF"); + // if (before == CR && after == LF) return false; - //setRule("3: CR × LF"); - //if (before == CR && after == LF) return false; + // setRule("4: ( Control | CR | LF ) ÷"); + // if (before == CR || before == LF || before == Control) return true; - //setRule("4: ( Control | CR | LF ) ÷"); - //if (before == CR || before == LF || before == Control) return true; + // setRule("5: ÷ ( Control | CR | LF )"); + // if (after == Control || after == LF || after == CR) return true; - //setRule("5: ÷ ( Control | CR | LF )"); - //if (after == Control || after == LF || after == CR) return true; + // setRule("6: L × ( L | V | LV | LVT )"); + // if (before == L && (after == L || after == V || after == LV || after == LVT)) return false; - //setRule("6: L × ( L | V | LV | LVT )"); - //if (before == L && (after == L || after == V || after == LV || after == LVT)) return false; + // setRule("7: ( LV | V ) × ( V | T )"); + // if ((before == LV || before == V) && (after == V || after == T)) return false; - //setRule("7: ( LV | V ) × ( V | T )"); - //if ((before == LV || before == V) && (after == V || after == T)) return false; + // setRule("8: ( LVT | T ) × T"); + // if ((before == LVT || before == T) && (after == T)) return false; - //setRule("8: ( LVT | T ) × T"); - //if ((before == LVT || before == T) && (after == T)) return false; - - //setRule("9: × Extend"); - //if (after == Extend) return false; + // setRule("9: × Extend"); + // if (after == Extend) return false; //// Otherwise break after all characters. 
- //setRule("10: Any ÷ Any"); - //return true; - - //} - - //} - - //============================================== - - //static class XGenerateWordBreakTest extends GenerateBreakTest { - - //GenerateGraphemeBreakTest grapheme; - //MyBreakIterator breaker; - //Context context = new Context(); - - //XGenerateWordBreakTest(UCD ucd) { - //super(ucd); - //grapheme = new GenerateGraphemeBreakTest(ucd); - //breaker = new MyBreakIterator(grapheme); - //fileName = "Word"; - //sampleMap = map; - //extraSamples = new String[] { - ///*"\uFF70", "\uFF65", "\u30FD", */ "a\u2060", "a:", "a'", "a'\u2060", "a,", "1:", "1'", "1,", "1.\u2060" - //}; - - //String [] temp = {"can't", "can\u2019t", "ab\u00ADby", "a$-34,567.14%b", "3a" }; - //extraSingleSamples = new String [temp.length * 2]; - //System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length); - //for (int i = 0; i < temp.length; ++i) { - //extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme); - //} - - //if (false) Utility.showSetDifferences("Katakana", map.getSetFromIndex(Katakana), - //"Script=Katakana", getSet(ucd, SCRIPT, KATAKANA_SCRIPT), false, ucd); - - //} - - //Object foo = prop = unicodePropertySource.getProperty("Word_Break"); - - ////static String LENGTH = "[\u30FC\uFF70]"; - ////static String HALFWIDTH_KATAKANA = "[\uFF66-\uFF9F]"; - ////static String KATAKANA_ITERATION = "[\u30FD\u30FE]"; - ////static String HIRAGANA_ITERATION = "[\u309D\u309E]"; - - //final int - //Format = addToMap("Format"), - //Katakana = addToMap("Katakana"), - //ALetter = addToMap("ALetter"), - //MidLetter = addToMap("MidLetter"), - ////MidNumLet = addToMap("MidNumLet"), - //MidNum = addToMap("MidNum"), - //Numeric = addToMap("Numeric"), - //ExtendNumLet = addToMap("ExtendNumLet"), - //Other = addToMapLast("Other"); + // setRule("10: Any ÷ Any"); + // return true; + + // } + + // } + + // ============================================== + + // static class XGenerateWordBreakTest extends GenerateBreakTest { + 
+ // GenerateGraphemeBreakTest grapheme; + // MyBreakIterator breaker; + // Context context = new Context(); + + // XGenerateWordBreakTest(UCD ucd) { + // super(ucd); + // grapheme = new GenerateGraphemeBreakTest(ucd); + // breaker = new MyBreakIterator(grapheme); + // fileName = "Word"; + // sampleMap = map; + // extraSamples = new String[] { + /// *"\uFF70", "\uFF65", "\u30FD", */ "a\u2060", "a:", "a'", "a'\u2060", "a,", "1:", "1'", "1,", + // "1.\u2060" + // }; + + // String [] temp = {"can't", "can\u2019t", "ab\u00ADby", "a$-34,567.14%b", "3a" }; + // extraSingleSamples = new String [temp.length * 2]; + // System.arraycopy(temp, 0, extraSingleSamples, 0, temp.length); + // for (int i = 0; i < temp.length; ++i) { + // extraSingleSamples[i+temp.length] = insertEverywhere(temp[i], "\u2060", grapheme); + // } + + // if (false) Utility.showSetDifferences("Katakana", map.getSetFromIndex(Katakana), + // "Script=Katakana", getSet(ucd, SCRIPT, KATAKANA_SCRIPT), false, ucd); + + // } + + // Object foo = prop = unicodePropertySource.getProperty("Word_Break"); + + //// static String LENGTH = "[\u30FC\uFF70]"; + //// static String HALFWIDTH_KATAKANA = "[\uFF66-\uFF9F]"; + //// static String KATAKANA_ITERATION = "[\u30FD\u30FE]"; + //// static String HIRAGANA_ITERATION = "[\u309D\u309E]"; + + // final int + // Format = addToMap("Format"), + // Katakana = addToMap("Katakana"), + // ALetter = addToMap("ALetter"), + // MidLetter = addToMap("MidLetter"), + //// MidNumLet = addToMap("MidNumLet"), + // MidNum = addToMap("MidNum"), + // Numeric = addToMap("Numeric"), + // ExtendNumLet = addToMap("ExtendNumLet"), + // Other = addToMapLast("Other"); //// stuff that subclasses need to override - //public String getTypeID(int cp) { - //return map.getLabel(cp); - //} + // public String getTypeID(int cp) { + // return map.getLabel(cp); + // } //// stuff that subclasses need to override - //public byte getType(int cp) { - //return (byte) map.getIndex(cp); - //} + // public byte 
getType(int cp) { + // return (byte) map.getIndex(cp); + // } - //public String fullBreakSample() { - //return " a"; - //} + // public String fullBreakSample() { + // return " a"; + // } - //public int genTestItems(String before, String after, String[] results) { - //results[0] = before + after; - //results[1] = 'a' + before + "\u0301\u0308" + after + "\u0301\u0308" + 'a'; - //results[2] = 'a' + before + "\u0301\u0308" + samples[MidLetter] + after + "\u0301\u0308" + 'a'; - //results[3] = 'a' + before + "\u0301\u0308" + samples[MidNum] + after + "\u0301\u0308" + 'a'; - //return 3; - //} + // public int genTestItems(String before, String after, String[] results) { + // results[0] = before + after; + // results[1] = 'a' + before + "\u0301\u0308" + after + "\u0301\u0308" + 'a'; + // results[2] = 'a' + before + "\u0301\u0308" + samples[MidLetter] + after + "\u0301\u0308" + + // 'a'; + // results[3] = 'a' + before + "\u0301\u0308" + samples[MidNum] + after + "\u0301\u0308" + 'a'; + // return 3; + // } - //public boolean isBreak(String source, int offset) { + // public boolean isBreak(String source, int offset) { - //setRule("1: sot ÷"); - //if (offset < 0 || offset > source.length()) return false; + // setRule("1: sot ÷"); + // if (offset < 0 || offset > source.length()) return false; - //if (offset == 0) return true; + // if (offset == 0) return true; - //setRule("2: ÷ eot"); - //if (offset == source.length()) return true; + // setRule("2: ÷ eot"); + // if (offset == source.length()) return true; //// Treat a grapheme cluster as if it were a single character: //// the first base character, if there is one; otherwise the first character. 
- //setRule("3: GC -> FC"); - //if (!grapheme.isBreak( source, offset)) return false; + // setRule("3: GC -> FC"); + // if (!grapheme.isBreak( source, offset)) return false; - //setRule("4: X Format* -> X"); - //byte afterChar = getResolvedType(source.charAt(offset)); - //if (afterChar == Format) return false; + // setRule("4: X Format* -> X"); + // byte afterChar = getResolvedType(source.charAt(offset)); + // if (afterChar == Format) return false; //// now get the base character before and after, and their types - //getGraphemeBases(breaker, source, offset, Format, context); + // getGraphemeBases(breaker, source, offset, Format, context); - //byte before = context.tBefore; - //byte after = context.tAfter; - //byte before2 = context.tBefore2; - //byte after2 = context.tAfter2; + // byte before = context.tBefore; + // byte after = context.tAfter; + // byte before2 = context.tBefore2; + // byte after2 = context.tAfter2; - ////Don't break between most letters + //// Don't break between most letters - //setRule("5: ALetter × ALetter"); - //if (before == ALetter && after == ALetter) return false; + // setRule("5: ALetter × ALetter"); + // if (before == ALetter && after == ALetter) return false; //// Don’t break letters across certain punctuation - //setRule("6: ALetter × MidLetter ALetter"); - //if (before == ALetter && after == MidLetter && after2 == ALetter) return false; + // setRule("6: ALetter × MidLetter ALetter"); + // if (before == ALetter && after == MidLetter && after2 == ALetter) return false; - //setRule("7: ALetter (MidLetter | MidNumLet) × ALetter"); - //if (before2 == ALetter && before == MidLetter && after == ALetter) return false; + // setRule("7: ALetter (MidLetter | MidNumLet) × ALetter"); + // if (before2 == ALetter && before == MidLetter && after == ALetter) return false; //// Don’t break within sequences of digits, or digits adjacent to letters. 
- //setRule("8: Numeric × Numeric"); - //if (before == Numeric && after == Numeric) return false; - - //setRule("9: ALetter × Numeric"); - //if (before == ALetter && after == Numeric) return false; + // setRule("8: Numeric × Numeric"); + // if (before == Numeric && after == Numeric) return false; - //setRule("10: Numeric × ALetter"); - //if (before == Numeric && after == ALetter) return false; + // setRule("9: ALetter × Numeric"); + // if (before == ALetter && after == Numeric) return false; + // setRule("10: Numeric × ALetter"); + // if (before == Numeric && after == ALetter) return false; //// Don’t break within sequences like: '-3.2' - //setRule("11: Numeric (MidNum | MidNumLet) × Numeric"); - //if (before2 == Numeric && before == MidNum && after == Numeric) return false; + // setRule("11: Numeric (MidNum | MidNumLet) × Numeric"); + // if (before2 == Numeric && before == MidNum && after == Numeric) return false; - //setRule("12: Numeric × (MidNum | MidNumLet) Numeric"); - //if (before == Numeric && after == MidNum && after2 == Numeric) return false; + // setRule("12: Numeric × (MidNum | MidNumLet) Numeric"); + // if (before == Numeric && after == MidNum && after2 == Numeric) return false; //// Don't break between Katakana - //setRule("13: Katakana × Katakana"); - //if (before == Katakana && after == Katakana) return false; + // setRule("13: Katakana × Katakana"); + // if (before == Katakana && after == Katakana) return false; //// Do not break from extenders - //setRule("13a: (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet"); - //if ((before == ALetter || before == Numeric || before == Katakana || before == ExtendNumLet) && after == ExtendNumLet) return false; + // setRule("13a: (ALetter | Numeric | Katakana | ExtendNumLet) × + // ExtendNumLet"); + // if ((before == ALetter || before == Numeric || before == Katakana || before == ExtendNumLet) + // && after == ExtendNumLet) return false; - //setRule("13b: ExtendNumLet × (ALetter | Numeric | 
Katakana)"); - //if (before == ExtendNumLet && (after == ALetter || after == Numeric || after == Katakana)) return false; + // setRule("13b: ExtendNumLet × (ALetter | Numeric | Katakana)"); + // if (before == ExtendNumLet && (after == ALetter || after == Numeric || after == Katakana)) + // return false; //// Otherwise break always. - //setRule("14: Any ÷ Any"); - //return true; + // setRule("14: Any ÷ Any"); + // return true; - //} + // } - //} + // } // ======================================== - //static class XGenerateLineBreakTest extends GenerateBreakTest { - - //GenerateGraphemeBreakTest grapheme; - //MyBreakIterator breaker; - //Context context = new Context(); - - //XGenerateLineBreakTest(UCD ucd) { - //super(ucd); - //grapheme = new GenerateGraphemeBreakTest(ucd); - //breaker = new MyBreakIterator(grapheme); - - //sampleMap = map; - //fileName = "Line"; - //extraSingleSamples = new String[] {"can't", "can\u2019t", "ab\u00ADby", - //"-3", - //"e.g.", - //"\u4e00.\u4e00.", - //"a b", - //"a \u200bb", - //"a \u0308b", - //"1\u0308b(a)-(b)", - //}; - //} + // static class XGenerateLineBreakTest extends GenerateBreakTest { + + // GenerateGraphemeBreakTest grapheme; + // MyBreakIterator breaker; + // Context context = new Context(); + + // XGenerateLineBreakTest(UCD ucd) { + // super(ucd); + // grapheme = new GenerateGraphemeBreakTest(ucd); + // breaker = new MyBreakIterator(grapheme); + + // sampleMap = map; + // fileName = "Line"; + // extraSingleSamples = new String[] {"can't", "can\u2019t", "ab\u00ADby", + // "-3", + // "e.g.", + // "\u4e00.\u4e00.", + // "a b", + // "a \u200bb", + // "a \u0308b", + // "1\u0308b(a)-(b)", + // }; + // } //// all the other items are supplied in UCD_TYPES - ///*static byte LB_L = LB_LIMIT + hL, LB_V = LB_LIMIT + hV, LB_T = LB_LIMIT + hT, - //LB_LV = LB_LIMIT + hLV, LB_LVT = LB_LIMIT + hLVT, LB_SUP = LB_LIMIT + hLIMIT, - //LB2_LIMIT = (byte)(LB_SUP + 1); - //*/ + /// *static byte LB_L = LB_LIMIT + hL, LB_V = LB_LIMIT + hV, LB_T = 
LB_LIMIT + hT, + // LB_LV = LB_LIMIT + hLV, LB_LVT = LB_LIMIT + hLVT, LB_SUP = LB_LIMIT + hLIMIT, + // LB2_LIMIT = (byte)(LB_SUP + 1); + // */ - ///* - //private byte[] AsmusOrderToMyOrder = { - //LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO, - //LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM, + /// * + // private byte[] AsmusOrderToMyOrder = { + // LB_OP, LB_CL, LB_QU, LB_GL, LB_NS, LB_EX, LB_SY, LB_IS, LB_PR, LB_PO, + // LB_NU, LB_AL, LB_ID, LB_IN, LB_HY, LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM, //// missing from Pair Table - //LB_SP, LB_BK, LB_CR, LB_LF, + // LB_SP, LB_BK, LB_CR, LB_LF, //// resolved types below - //LB_CB, LB_AI, LB_SA, LB_SG, LB_XX, + // LB_CB, LB_AI, LB_SA, LB_SG, LB_XX, //// 3 JAMO CLASSES, plus supplementary - //LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP - //}; - - //private byte[] MyOrderToAsmusOrder = new byte[AsmusOrderToMyOrder.length]; - //{ - //for (byte i = 0; i < AsmusOrderToMyOrder.length; ++i) { - //MyOrderToAsmusOrder[AsmusOrderToMyOrder[i]] = i; - //} - //*/ - - //{ - ////System.out.println("Adding Linebreak"); - //for (int i = 0; i <= 0x10FFFF; ++i) { - //map.put(i, ucd.getLineBreak(i)); - //} - //for (int i = 0; i < LB_LIMIT; ++i) { - //map.setLabel(i, ucd.getLineBreakID_fromIndex((byte)i, SHORT)); - //} - ////System.out.println(map.getSetFromIndex(LB_CL)); - ////System.out.println("Done adding Linebreak"); - //} - - //public int mapType(int input) { - //int old = input; - //switch (input) { - //case LB_BA: input = 16; break; - //case LB_BB: input = 17; break; - //case LB_B2: input = 18; break; - //case LB_ZW: input = 19; break; - //case LB_CM: input = 20; break; - //case LB_WJ: input = 21; break; - - //case LB_SP: input = 22; break; - //case LB_BK: input = 23; break; - //case LB_NL: input = 24; break; - //case LB_CR: input = 25; break; - //case LB_LF: input = 26; break; - - //case LB_CB: input = 27; break; - //case LB_SA: input = 28; break; - //case LB_AI: input = 29; break; - //case LB_SG: 
input = 30; break; - //} - ////if (old != input) System.out.println(old + " => " + input); - //return input; - //} - - - //public void sampleDescription(PrintWriter out) { - //out.println("# Samples:"); - //out.println("# The test currently takes all pairs of linebreak types*,"); - //out.println("# picks a sample for each type, and generates three strings: "); - //out.println("#\t- the pair alone"); - //out.println("#\t- the pair alone with an imbeded space"); - //out.println("#\t- the pair alone with embedded combining marks"); - //out.println("# The sample for each type is simply the first code point (above NULL)"); - //out.println("# with that property."); - //out.println("# * Note:"); - //out.println("#\t- SG is omitted"); - //out.println("#\t- 3 different Jamo characters and a supplementary character are added"); - //out.println("#\t The syllable types for the Jamo (L, V, T) are displayed in comments"); - //out.println("#\t instead of the linebreak property"); - //out.println("#"); - //} + // LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP + // }; + + // private byte[] MyOrderToAsmusOrder = new byte[AsmusOrderToMyOrder.length]; + // { + // for (byte i = 0; i < AsmusOrderToMyOrder.length; ++i) { + // MyOrderToAsmusOrder[AsmusOrderToMyOrder[i]] = i; + // } + // */ + + // { + //// System.out.println("Adding Linebreak"); + // for (int i = 0; i <= 0x10FFFF; ++i) { + // map.put(i, ucd.getLineBreak(i)); + // } + // for (int i = 0; i < LB_LIMIT; ++i) { + // map.setLabel(i, ucd.getLineBreakID_fromIndex((byte)i, SHORT)); + // } + //// System.out.println(map.getSetFromIndex(LB_CL)); + //// System.out.println("Done adding Linebreak"); + // } + + // public int mapType(int input) { + // int old = input; + // switch (input) { + // case LB_BA: input = 16; break; + // case LB_BB: input = 17; break; + // case LB_B2: input = 18; break; + // case LB_ZW: input = 19; break; + // case LB_CM: input = 20; break; + // case LB_WJ: input = 21; break; + + // case LB_SP: input = 22; break; + // 
case LB_BK: input = 23; break; + // case LB_NL: input = 24; break; + // case LB_CR: input = 25; break; + // case LB_LF: input = 26; break; + + // case LB_CB: input = 27; break; + // case LB_SA: input = 28; break; + // case LB_AI: input = 29; break; + // case LB_SG: input = 30; break; + // } + //// if (old != input) System.out.println(old + " => " + input); + // return input; + // } + + // public void sampleDescription(PrintWriter out) { + // out.println("# Samples:"); + // out.println("# The test currently takes all pairs of linebreak types*,"); + // out.println("# picks a sample for each type, and generates three strings: "); + // out.println("#\t- the pair alone"); + // out.println("#\t- the pair alone with an imbeded space"); + // out.println("#\t- the pair alone with embedded combining marks"); + // out.println("# The sample for each type is simply the first code point (above NULL)"); + // out.println("# with that property."); + // out.println("# * Note:"); + // out.println("#\t- SG is omitted"); + // out.println("#\t- 3 different Jamo characters and a supplementary character are added"); + // out.println("#\t The syllable types for the Jamo (L, V, T) are displayed in comments"); + // out.println("#\t instead of the linebreak property"); + // out.println("#"); + // } //// stuff that subclasses need to override - //public int genTestItems(String before, String after, String[] results) { - //results[0] = before + after; - //results[1] = before + " " + after; - //results[2] = before + "\u0301\u0308" + after; - //return 3; - //} + // public int genTestItems(String before, String after, String[] results) { + // results[0] = before + after; + // results[1] = before + " " + after; + // results[2] = before + "\u0301\u0308" + after; + // return 3; + // } //// stuff that subclasses need to override - //boolean skipType(int type) { - //return type == LB_AI || type == LB_SA || type == LB_SG || type == LB_XX - //|| type == LB_CB || type == LB_CR || type == LB_BK || type == 
LB_LF - //|| type == LB_NL || type == LB_SP; - //} + // boolean skipType(int type) { + // return type == LB_AI || type == LB_SA || type == LB_SG || type == LB_XX + // || type == LB_CB || type == LB_CR || type == LB_BK || type == LB_LF + // || type == LB_NL || type == LB_SP; + // } //// stuff that subclasses need to override - //public String getTypeID(int cp) { - ///* - //byte result = getType(cp); - //if (result == LB_SUP) return "SUP"; - //if (result >= LB_LIMIT) return hNames[result - LB_LIMIT]; - //*/ + // public String getTypeID(int cp) { + /// * + // byte result = getType(cp); + // if (result == LB_SUP) return "SUP"; + // if (result >= LB_LIMIT) return hNames[result - LB_LIMIT]; + // */ //// return ucd.getLineBreakID_fromIndex(cp); // AsmusOrderToMyOrder[result]); - //return ucd.getLineBreakID(cp); // AsmusOrderToMyOrder[result]); - //} + // return ucd.getLineBreakID(cp); // AsmusOrderToMyOrder[result]); + // } - //public String fullBreakSample() { - //return ")a"; - //} + // public String fullBreakSample() { + // return ")a"; + // } //// stuff that subclasses need to override - //public byte getType(int cp) { - ///*if (cp > 0xFFFF) return LB_SUP; - //byte result = getHangulType(cp); - //if (result != hNot) return (byte)(result + LB_LIMIT); - //*/ + // public byte getType(int cp) { + /// *if (cp > 0xFFFF) return LB_SUP; + // byte result = getHangulType(cp); + // if (result != hNot) return (byte)(result + LB_LIMIT); + // */ //// return MyOrderToAsmusOrder[ucd.getLineBreak(cp)]; - //return ucd.getLineBreak(cp); - //} - - //public String getTableEntry(String before, String after, String[] ruleOut) { - //String t = "_"; // break - //boolean spaceBreak = isBreak(before + " " + after, before.length()+1); - //String spaceRule = getRule(); - - //boolean spaceBreak2 = isBreak(before + " " + after, before.length()); - //String spaceRule2 = getRule(); - - //boolean normalBreak = isBreak(before + after, before.length()); - //String normalRule = getRule(); - - 
//ruleOut[0] = normalRule; - //if (!normalBreak) { - //if (!spaceBreak && !spaceBreak2) { - //t = "^"; // don't break, even with intervening spaces - //} else { - //t = "%"; // don't break, but break with intervening spaces - //} - //if (!spaceRule2.equals(normalRule)) { - //ruleOut[0] += " [" + spaceRule2 + "]"; - //} - //if (!spaceRule.equals(normalRule) && !spaceRule.equals(spaceRule2)) { - //ruleOut[0] += " {" + spaceRule + "}"; - //} - //} - //return t; - //} - - //public boolean highlightTableEntry(int x, int y, String s) { - //return false; - ///* - //try { - //return !oldLineBreak[x][y].equals(s); - //} catch (Exception e) {} - //return true; - //*/ - //} - - ///* - //String[][] oldLineBreak = { - //{"^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "^", "%"}, - //{"_", "^", "%", "%", "^", "^", "^", "^", "", "%", "_", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, - //{"^", "^", "%", "%", "%", "^", "^", "^", "%", "%", "%", "%", "%", "%", "%", "%", "%", "%", "^", "%"}, - //{"%", "^", "%", "%", "%", "^", "^", "^", "%", "%", "%", "%", "%", "%", "%", "%", "%", "%", "^", "%"}, - //{"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, - //{"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, - //{"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "%", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, - //{"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "%", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, - //{"%", "^", "%", "%", "%", "^", "^", "^", "_", "_", "%", "%", "%", "_", "%", "%", "_", "_", "^", "%"}, - //{"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, - //{"_", "^", "%", "%", "%", "^", "^", "^", "_", "%", "%", "%", "_", "%", "%", "%", "_", "_", "^", "%"}, - //{"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "%", "%", "_", "%", "%", "%", "_", "_", "^", "%"}, - //{"_", "^", 
"%", "%", "%", "^", "^", "^", "_", "%", "_", "_", "_", "%", "%", "%", "_", "_", "^", "%"}, - //{"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "%", "%", "%", "_", "_", "^", "%"}, - //{"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, - //{"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "_", "%", "%", "_", "_", "^", "%"}, - //{"%", "^", "%", "%", "%", "^", "^", "^", "%", "%", "%", "%", "%", "%", "%", "%", "%", "%", "^", "%"}, - //{"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "_", "_", "_", "_", "%", "%", "_", "^", "^", "%"}, - //{"_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "_", "^", "%"}, - //{"_", "^", "%", "%", "%", "^", "^", "^", "_", "_", "%", "%", "_", "%", "%", "%", "_", "_", "^", "%"} - //}; - //*/ - - //public byte getResolvedType (int cp) { + // return ucd.getLineBreak(cp); + // } + + // public String getTableEntry(String before, String after, String[] ruleOut) { + // String t = "_"; // break + // boolean spaceBreak = isBreak(before + " " + after, before.length()+1); + // String spaceRule = getRule(); + + // boolean spaceBreak2 = isBreak(before + " " + after, before.length()); + // String spaceRule2 = getRule(); + + // boolean normalBreak = isBreak(before + after, before.length()); + // String normalRule = getRule(); + + // ruleOut[0] = normalRule; + // if (!normalBreak) { + // if (!spaceBreak && !spaceBreak2) { + // t = "^"; // don't break, even with intervening spaces + // } else { + // t = "%"; // don't break, but break with intervening spaces + // } + // if (!spaceRule2.equals(normalRule)) { + // ruleOut[0] += " [" + spaceRule2 + "]"; + // } + // if (!spaceRule.equals(normalRule) && !spaceRule.equals(spaceRule2)) { + // ruleOut[0] += " {" + spaceRule + "}"; + // } + // } + // return t; + // } + + // public boolean highlightTableEntry(int x, int y, String s) { + // return false; + /// * + // try { + // return 
!oldLineBreak[x][y].equals(s); + // } catch (Exception e) {} + // return true; + // */ + // } + + /// * + // String[][] oldLineBreak = { + // {"^", "^", "^", "^", "^", "^", "^", "^", + // "^", "^", "^", "^", "^", "^", "^", "^", + // "^", "^", "^", "%"}, + // {"_", "^", "%", "%", "^", "^", "^", "^", + // "", "%", "_", "_", "_", "_", "%", "%", + // "_", "_", "^", "%"}, + // {"^", "^", "%", "%", "%", "^", "^", "^", + // "%", "%", "%", "%", "%", "%", "%", "%", + // "%", "%", "^", "%"}, + // {"%", "^", "%", "%", "%", "^", "^", "^", + // "%", "%", "%", "%", "%", "%", "%", "%", + // "%", "%", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "_", "_", "_", "_", "_", "%", "%", + // "_", "_", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "_", "_", "_", "_", "_", "%", "%", + // "_", "_", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "_", "%", "_", "_", "_", "%", "%", + // "_", "_", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "_", "%", "_", "_", "_", "%", "%", + // "_", "_", "^", "%"}, + // {"%", "^", "%", "%", "%", "^", "^", "^", + // "_", "_", "%", "%", "%", "_", "%", "%", + // "_", "_", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "_", "_", "_", "_", "_", "%", "%", + // "_", "_", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "%", "%", "%", "_", "%", "%", "%", + // "_", "_", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "_", "%", "%", "_", "%", "%", "%", + // "_", "_", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "%", "_", "_", "_", "%", "%", "%", + // "_", "_", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "_", "_", "_", "_", "%", "%", "%", + // "_", "_", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "_", "_", "_", "_", "_", "%", "%", + // "_", "_", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "_", "_", "_", "_", "_", "%", "%", + // "_", "_", 
"^", "%"}, + // {"%", "^", "%", "%", "%", "^", "^", "^", + // "%", "%", "%", "%", "%", "%", "%", "%", + // "%", "%", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "_", "_", "_", "_", "_", "%", "%", + // "_", "^", "^", "%"}, + // {"_", "_", "_", "_", "_", "_", "_", "_", + // "_", "_", "_", "_", "_", "_", "_", "_", + // "_", "_", "^", "%"}, + // {"_", "^", "%", "%", "%", "^", "^", "^", + // "_", "_", "%", "%", "_", "%", "%", "%", + // "_", "_", "^", "%"} + // }; + // */ + + // public byte getResolvedType (int cp) { //// LB 1 Assign a line break category to each character of the input. - //// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm. - //byte result = getType(cp); - //switch (result) { - //case LB_AI: result = LB_AI; break; + //// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this + // algorithm. + // byte result = getType(cp); + // switch (result) { + // case LB_AI: result = LB_AI; break; //// case LB_CB: result = LB_ID; break; - //case LB_SA: result = LB_AL; break; + // case LB_SA: result = LB_AL; break; //// case LB_SG: result = LB_XX; break; Surrogates; will never occur - //case LB_XX: result = LB_AL; break; - //} - ///* - //if (recommended) { - //if (getHangulType(cp) != hNot) { - //result = LB_ID; - //} - //} - //*/ - - //return result; - //} - - //public byte getSampleType (int cp) { - //if (ucd.getHangulSyllableType(cp) != NA) return LB_XX; - //return getType(cp); - //} - + // case LB_XX: result = LB_AL; break; + // } + /// * + // if (recommended) { + // if (getHangulType(cp) != hNot) { + // result = LB_ID; + // } + // } + // */ + + // return result; + // } + + // public byte getSampleType (int cp) { + // if (ucd.getHangulSyllableType(cp) != NA) return LB_XX; + // return getType(cp); + // } //// find out whether there is a break at offset //// WARNING: as a side effect, sets "rule" - //public boolean isBreak(String source, int offset) { + 
// public boolean isBreak(String source, int offset) { //// LB 1 Assign a line break category to each character of the input. - //// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm. + //// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this + // algorithm. //// this is taken care of in the getResolvedType function //// LB 2a Never break at the start of text - //setRule("2a: × sot"); - //if (offset <= 0) return false; + // setRule("2a: × sot"); + // if (offset <= 0) return false; //// LB 2b Always break at the end of text - //setRule("2b: ! eot"); - //if (offset >= source.length()) return true; - + // setRule("2b: ! eot"); + // if (offset >= source.length()) return true; //// UTF-16: never break in the middle of a code point //// now get the base character before and after, and their types - //getGraphemeBases(breaker, source, offset, -1, context); + // getGraphemeBases(breaker, source, offset, -1, context); - //byte before = context.tBefore; - //byte after = context.tAfter; - //byte before2 = context.tBefore2; - //byte after2 = context.tAfter2; - - - ////if (!onCodepointBoundary(source, offset)) return false; + // byte before = context.tBefore; + // byte after = context.tAfter; + // byte before2 = context.tBefore2; + // byte after2 = context.tAfter2; + //// if (!onCodepointBoundary(source, offset)) return false; //// now get the character before and after, and their types + //// int cpBefore = UTF16.charAt(source, offset-1); + //// int cpAfter = UTF16.charAt(source, offset); - ////int cpBefore = UTF16.charAt(source, offset-1); - ////int cpAfter = UTF16.charAt(source, offset); - - ////byte before = getResolvedType(cpBefore); - ////byte after = getResolvedType(cpAfter); + //// byte before = getResolvedType(cpBefore); + //// byte after = getResolvedType(cpAfter); - - //setRule("3a: CR × LF ; ( BK | CR | LF | NL ) !"); + // setRule("3a: CR × LF ; ( BK | CR | LF | NL ) !"); 
//// Always break after hard line breaks (but never between CR and LF). //// CR ^ LF - //if (before == LB_CR && after == LB_LF) return false; - //if (before == LB_BK || before == LB_LF || before == LB_CR) return true; + // if (before == LB_CR && after == LB_LF) return false; + // if (before == LB_BK || before == LB_LF || before == LB_CR) return true; - ////LB 3b Don’t break before hard line breaks. - //setRule("3b: × ( BK | CR | LF )"); - //if (after == LB_BK || after == LB_LF || after == LB_CR) return false; + //// LB 3b Don’t break before hard line breaks. + // setRule("3b: × ( BK | CR | LF )"); + // if (after == LB_BK || after == LB_LF || after == LB_CR) return false; //// LB 4 Don’t break before spaces or zero-width space. - //setRule("4: × ( SP | ZW )"); - //if (after == LB_SP || after == LB_ZW) return false; + // setRule("4: × ( SP | ZW )"); + // if (after == LB_SP || after == LB_ZW) return false; //// LB 5 Break after zero-width space. - //setRule("5: ZW ÷"); - //if (before == LB_ZW) return true; - - //// LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos. - //setRule("6: DGC -> FC"); - //if (!grapheme.isBreak( source, offset)) return false; - - ///* - //if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false; - //if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false; - //if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false; - //*/ - - //byte backBase = -1; - //boolean setBase = false; - //if (before == LB_CM) { - //setBase = true; - //int backOffset = findLastNon(source, offset, LB_CM); - //if (backOffset >= 0) { - //backBase = getResolvedType(UTF16.charAt(source, backOffset)); - //} - //} - + // setRule("5: ZW ÷"); + // if (before == LB_ZW) return true; + + //// LB 6 Don’t break graphemes (before combining marks, around virama or on sequences of + // conjoining Jamos. 
+ // setRule("6: DGC -> FC"); + // if (!grapheme.isBreak( source, offset)) return false; + + /// * + // if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) + // return false; + // if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false; + // if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false; + // */ + + // byte backBase = -1; + // boolean setBase = false; + // if (before == LB_CM) { + // setBase = true; + // int backOffset = findLastNon(source, offset, LB_CM); + // if (backOffset >= 0) { + // backBase = getResolvedType(UTF16.charAt(source, backOffset)); + // } + // } //// LB 7 In all of the following rules, if a space is the base character for a combining mark, //// the space is changed to type ID. In other words, break before SP CM* in the same cases as //// one would break before an ID. - //setRule("7: SP CM* -> ID"); - //if (setBase && backBase == LB_SP) before = LB_ID; - //if (after == LB_SP && after2 == LB_CM) after = LB_ID; - - //setRule("7a: X CM* -> X"); - //if (after == LB_CM) return false; - //if (setBase && backBase != -1) before = LB_ID; + // setRule("7: SP CM* -> ID"); + // if (setBase && backBase == LB_SP) before = LB_ID; + // if (after == LB_SP && after2 == LB_CM) after = LB_ID; - //setRule("7b: CM -> AL"); - //if (setBase && backBase == -1) before = LB_AL; + // setRule("7a: X CM* -> X"); + // if (after == LB_CM) return false; + // if (setBase && backBase != -1) before = LB_ID; + // setRule("7b: CM -> AL"); + // if (setBase && backBase == -1) before = LB_AL; //// LB 8 Don’t break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. 
//// × CL, × EX, × IS, × SY - //setRule("8: × ( CL | EX | IS | SY )"); - //if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false; - + // setRule("8: × ( CL | EX | IS | SY )"); + // if (after == LB_CL || after == LB_EX || after == LB_SY | after == LB_IS) return false; //// find the last non-space character; we will need it - //byte lastNonSpace = before; - //if (lastNonSpace == LB_SP) { - //int backOffset = findLastNon(source, offset, LB_SP); - //if (backOffset >= 0) { - //lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset)); - //} - //} + // byte lastNonSpace = before; + // if (lastNonSpace == LB_SP) { + // int backOffset = findLastNon(source, offset, LB_SP); + // if (backOffset >= 0) { + // lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset)); + // } + // } //// LB 9 Don’t break after ‘[’, even after spaces. //// OP SP* × - //setRule("9: OP SP* ×"); - //if (lastNonSpace == LB_OP) return false; + // setRule("9: OP SP* ×"); + // if (lastNonSpace == LB_OP) return false; //// LB 10 Don’t break within ‘�?[’, , even with intervening spaces. //// QU SP* × OP - //setRule("10: QU SP* × OP"); - //if (lastNonSpace == LB_QU && after == LB_OP) return false; + // setRule("10: QU SP* × OP"); + // if (lastNonSpace == LB_QU && after == LB_OP) return false; //// LB 11 Don’t break within ‘]h’, even with intervening spaces. //// CL SP* × NS - //setRule("11: CL SP* × NS"); - //if (lastNonSpace == LB_CL && after == LB_NS) return false; + // setRule("11: CL SP* × NS"); + // if (lastNonSpace == LB_CL && after == LB_NS) return false; //// LB 11a Don’t break within ‘——’, even with intervening spaces. 
//// B2 × B2 - //setRule("11a: B2 × B2"); - //if (lastNonSpace == LB_B2 && after == LB_B2) return false; - + // setRule("11a: B2 × B2"); + // if (lastNonSpace == LB_B2 && after == LB_B2) return false; //// LB 13 Don’t break before or after NBSP or WORD JOINER //// × GL //// GL × - //setRule("11b: × WJ ; WJ ×"); - //if (after == LB_WJ || before == LB_WJ) return false; + // setRule("11b: × WJ ; WJ ×"); + // if (after == LB_WJ || before == LB_WJ) return false; - //// [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.] + //// [Note: by this time, all of the "X" in the table are accounted for. We can safely break + // after spaces.] //// LB 12 Break after spaces - //setRule("12: SP ÷"); - //if (before == LB_SP) return true; + // setRule("12: SP ÷"); + // if (before == LB_SP) return true; //// LB 13 Don’t break before or after NBSP or WORD JOINER - //setRule("13: × GL ; GL ×"); - //if (after == LB_GL || before == LB_GL) return false; + // setRule("13: × GL ; GL ×"); + // if (after == LB_GL || before == LB_GL) return false; //// LB 14 Don’t break before or after ‘�?’ - //setRule("14: × QU ; QU ×"); - //if (before == LB_QU || after == LB_QU) return false; + // setRule("14: × QU ; QU ×"); + // if (before == LB_QU || after == LB_QU) return false; //// LB 14a Break before and after CB - //setRule("14a: ÷ CB ; CB ÷"); - //if (before == LB_CB || after == LB_CB) return true; + // setRule("14a: ÷ CB ; CB ÷"); + // if (before == LB_CB || after == LB_CB) return true; //// LB 15 Don’t break before hyphen-minus, other hyphens, fixed-width spaces, //// small kana and other non- starters, or after acute accents: - //setRule("15: × ( BA | HY | NS ) ; BB ×"); - //if (after == LB_NS) return false; - //if (after == LB_HY) return false; - //if (after == LB_BA) return false; - //if (before == LB_BB) return false; - + // setRule("15: × ( BA | HY | NS ) ; BB ×"); + // if (after == LB_NS) return false; + // if (after == LB_HY) return false; + 
// if (after == LB_BA) return false; + // if (before == LB_BB) return false; - ////setRule("15a: HY × NU"); // NEW - ////if (before == LB_HY && after == LB_NU) return false; + //// setRule("15a: HY × NU"); // NEW + //// if (before == LB_HY && after == LB_NU) return false; //// LB 16 Don’t break between two ellipses, or between letters or numbers and ellipsis: //// Examples: ’9...’, ‘a...’, ‘H...’ - //setRule("16: ( AL | ID | IN | NU ) × IN"); - //if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false; - //if (before == LB_IN && after == LB_IN) return false; + // setRule("16: ( AL | ID | IN | NU ) × IN"); + // if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false; + // if (before == LB_IN && after == LB_IN) return false; //// Don't break alphanumerics. //// LB 17 Don’t break within ‘a9’, ‘3a’, or ‘H%’ @@ -2391,10 +2600,10 @@ static String[] getExtraSamples(UCD ucd, Segmenter.Target target) { //// Examples: $(12.35) 2,1234 (12)¢ 12.54¢ //// This is approximated with the following rules. (Some cases already handled above, //// like ‘9,’, ‘[9’.) - //setRule("17: ID × PO ; AL × NU; NU × AL"); - //if (before == LB_ID && after == LB_PO) return false; - //if (before == LB_AL && after == LB_NU) return false; - //if (before == LB_NU && after == LB_AL) return false; + // setRule("17: ID × PO ; AL × NU; NU × AL"); + // if (before == LB_ID && after == LB_PO) return false; + // if (before == LB_AL && after == LB_NU) return false; + // if (before == LB_NU && after == LB_AL) return false; //// LB 18 Don’t break between the following pairs of classes. 
//// CL × PO @@ -2410,362 +2619,372 @@ static String[] getExtraSamples(UCD ucd, Segmenter.Target target) { //// SY × NU //// Example pairs: ‘$9’, ‘$[’, ‘$-‘, ‘-9’, ‘/9’, ‘99’, ‘,9’, ‘9%’ ‘]%’ - //setRule("18: CL × PO ; NU × PO ; ( IS | NU | HY | PR | SY ) × NU ; PR × ( AL | HY | ID | OP )"); - //if (before == LB_CL && after == LB_PO) return false; - //if (before == LB_IS && after == LB_NU) return false; - //if (before == LB_NU && after == LB_NU) return false; - //if (before == LB_NU && after == LB_PO) return false; + // setRule("18: CL × PO ; NU × PO ; ( IS | NU | HY | PR | SY ) × NU ; PR × ( AL | HY | ID | OP + // )"); + // if (before == LB_CL && after == LB_PO) return false; + // if (before == LB_IS && after == LB_NU) return false; + // if (before == LB_NU && after == LB_NU) return false; + // if (before == LB_NU && after == LB_PO) return false; - //if (before == LB_HY && after == LB_NU) return false; + // if (before == LB_HY && after == LB_NU) return false; - //if (before == LB_PR && after == LB_AL) return false; - //if (before == LB_PR && after == LB_HY) return false; - //if (before == LB_PR && after == LB_ID) return false; - //if (before == LB_PR && after == LB_NU) return false; - //if (before == LB_PR && after == LB_OP) return false; + // if (before == LB_PR && after == LB_AL) return false; + // if (before == LB_PR && after == LB_HY) return false; + // if (before == LB_PR && after == LB_ID) return false; + // if (before == LB_PR && after == LB_NU) return false; + // if (before == LB_PR && after == LB_OP) return false; - //if (before == LB_SY && after == LB_NU) return false; + // if (before == LB_SY && after == LB_NU) return false; //// LB 15b Break after hyphen-minus, and before acute accents: - //setRule("18b: HY ÷ ; ÷ BB"); - //if (before == LB_HY) return true; - //if (after == LB_BB) return true; + // setRule("18b: HY ÷ ; ÷ BB"); + // if (before == LB_HY) return true; + // if (after == LB_BB) return true; //// LB 19 Don’t break between alphabetics (“at�?) 
//// AL × AL - //setRule("19: AL × AL"); - //if (before == LB_AL && after == LB_AL) return false; + // setRule("19: AL × AL"); + // if (before == LB_AL && after == LB_AL) return false; //// LB 20 Break everywhere else //// ALL ÷ //// ÷ ALL - //if (ucd.getCompositeVersion() > 0x040000) { - //setRule("19b: IS × AL"); - //if (before == LB_IS && after == LB_AL) return false; - //} + // if (ucd.getCompositeVersion() > 0x040000) { + // setRule("19b: IS × AL"); + // if (before == LB_IS && after == LB_AL) return false; + // } //// LB 20 Break everywhere else //// ALL ÷ //// ÷ ALL - //setRule("20: ALL ÷ ; ÷ ALL"); - //return true; - //} - //} - - //============================================== - - //static class XGenerateSentenceBreakTest extends GenerateBreakTest { - - //GenerateGraphemeBreakTest grapheme; - //MyBreakIterator breaker; - - //XGenerateSentenceBreakTest(UCD ucd) { - //super(ucd); - //grapheme = new GenerateGraphemeBreakTest(ucd); - //breaker = new MyBreakIterator(grapheme); - - //fileName = "Sentence"; - //extraSamples = new String[] { - //}; - - //extraSingleSamples = new String[] { - //"(\"Go.\") (He did.)", - //"(\u201CGo?\u201D) (He did.)", - //"U.S.A\u0300. is", - //"U.S.A\u0300? He", - //"U.S.A\u0300.", - //"3.4", - //"c.d", - //"etc.)\u2019 \u2018(the", - //"etc.)\u2019 \u2018(The", - //"the resp. 
leaders are", - //"\u5B57.\u5B57", - //"etc.\u5B83", - //"etc.\u3002", - //"\u5B57\u3002\u5B83", - //}; - //String[] temp = new String [extraSingleSamples.length * 2]; - //System.arraycopy(extraSingleSamples, 0, temp, 0, extraSingleSamples.length); - //for (int i = 0; i < extraSingleSamples.length; ++i) { - //temp[i+extraSingleSamples.length] = insertEverywhere(extraSingleSamples[i], "\u2060", grapheme); - //} - //extraSingleSamples = temp; - - //} - - //Object foo = prop = unicodePropertySource.getProperty("Sentence_Break"); - - //final int - //CR = addToMap("CR"), - //LF = addToMap("LF"), - //Extend = addToMap("Extend"), - //Sep = addToMap("Sep"), - //Format = addToMap("Format"), - //Sp = addToMap("Sp"), - //Lower = addToMap("Lower"), - //Upper = addToMap("Upper"), - //OLetter = addToMap("OLetter"), - //Numeric = addToMap("Numeric"), - //ATerm = addToMap("ATerm"), - //STerm = addToMap("STerm"), - //Close = addToMap("Close"), - //SContinue = addToMap("SContinue"), - //Other = addToMapLast("Other"); + // setRule("20: ALL ÷ ; ÷ ALL"); + // return true; + // } + // } + + // ============================================== + + // static class XGenerateSentenceBreakTest extends GenerateBreakTest { + + // GenerateGraphemeBreakTest grapheme; + // MyBreakIterator breaker; + + // XGenerateSentenceBreakTest(UCD ucd) { + // super(ucd); + // grapheme = new GenerateGraphemeBreakTest(ucd); + // breaker = new MyBreakIterator(grapheme); + + // fileName = "Sentence"; + // extraSamples = new String[] { + // }; + + // extraSingleSamples = new String[] { + // "(\"Go.\") (He did.)", + // "(\u201CGo?\u201D) (He did.)", + // "U.S.A\u0300. is", + // "U.S.A\u0300? He", + // "U.S.A\u0300.", + // "3.4", + // "c.d", + // "etc.)\u2019 \u2018(the", + // "etc.)\u2019 \u2018(The", + // "the resp. 
leaders are", + // "\u5B57.\u5B57", + // "etc.\u5B83", + // "etc.\u3002", + // "\u5B57\u3002\u5B83", + // }; + // String[] temp = new String [extraSingleSamples.length * 2]; + // System.arraycopy(extraSingleSamples, 0, temp, 0, extraSingleSamples.length); + // for (int i = 0; i < extraSingleSamples.length; ++i) { + // temp[i+extraSingleSamples.length] = insertEverywhere(extraSingleSamples[i], "\u2060", + // grapheme); + // } + // extraSingleSamples = temp; + + // } + + // Object foo = prop = unicodePropertySource.getProperty("Sentence_Break"); + + // final int + // CR = addToMap("CR"), + // LF = addToMap("LF"), + // Extend = addToMap("Extend"), + // Sep = addToMap("Sep"), + // Format = addToMap("Format"), + // Sp = addToMap("Sp"), + // Lower = addToMap("Lower"), + // Upper = addToMap("Upper"), + // OLetter = addToMap("OLetter"), + // Numeric = addToMap("Numeric"), + // ATerm = addToMap("ATerm"), + // STerm = addToMap("STerm"), + // Close = addToMap("Close"), + // SContinue = addToMap("SContinue"), + // Other = addToMapLast("Other"); //// stuff that subclasses need to override - //public String getTypeID(int cp) { - //return map.getLabel(cp); - //} + // public String getTypeID(int cp) { + // return map.getLabel(cp); + // } - //public String fullBreakSample() { - //return "!a"; - //} + // public String fullBreakSample() { + // return "!a"; + // } //// stuff that subclasses need to override - //public byte getType(int cp) { - //return (byte) map.getIndex(cp); - //} - - ///*LB_XX = 0, LB_OP = 1, LB_CL = 2, LB_QU = 3, LB_GL = 4, LB_NS = 5, LB_EX = 6, LB_SY = 7, - //LB_IS = 8, LB_PR = 9, LB_PO = 10, LB_NU = 11, LB_AL = 12, LB_ID = 13, LB_IN = 14, LB_HY = 15, - //LB_CM = 16, LB_BB = 17, LB_BA = 18, LB_SP = 19, LB_BK = 20, LB_CR = 21, LB_LF = 22, LB_CB = 23, - //LB_SA = 24, LB_AI = 25, LB_B2 = 26, LB_SG = 27, LB_ZW = 28, - //LB_NL = 29, - //LB_WJ = 30, - //*/ - ///* - //static final byte Format = 0, Sep = 1, Sp = 2, OLetter = 3, Lower = 4, Upper = 5, - //Numeric = 6, Close 
= 7, ATerm = 8, Term = 9, Other = 10, - //LIMIT = Other + 1; - - //static final String[] Names = {"Format", "Sep", "Sp", "OLetter", "Lower", "Upper", "Numeric", - //"Close", "ATerm", "Term", "Other" }; - - - //static UnicodeSet sepSet = new UnicodeSet("[\\u000a\\u000d\\u0085\\u2029\\u2028]"); - //static UnicodeSet atermSet = new UnicodeSet("[\\u002E]"); - //static UnicodeSet termSet = new UnicodeSet( - //"[\\u0021\\u003F\\u0589\\u061f\\u06d4\\u0700-\\u0702\\u0934" - //+ "\\u1362\\u1367\\u1368\\u104A\\u104B\\u166E" - //+ "\\u1803\\u1809\\u203c\\u203d" - //+ "\\u2048\\u2049\\u3002\\ufe52\\ufe57\\uff01\\uff0e\\uff1f\\uff61]"); - - //static UnicodeProperty lowercaseProp = UnifiedBinaryProperty.make(DERIVED | PropLowercase); - //static UnicodeProperty uppercaseProp = UnifiedBinaryProperty.make(DERIVED | PropUppercase); - - //UnicodeSet linebreakNS = UnifiedBinaryProperty.make(LINE_BREAK | LB_NU).getSet(); - //*/ - - ///* + // public byte getType(int cp) { + // return (byte) map.getIndex(cp); + // } + + /// *LB_XX = 0, LB_OP = 1, LB_CL = 2, LB_QU = 3, LB_GL = 4, LB_NS = 5, LB_EX = 6, LB_SY = 7, + // LB_IS = 8, LB_PR = 9, LB_PO = 10, LB_NU = 11, LB_AL = 12, LB_ID = 13, LB_IN = 14, LB_HY = 15, + // LB_CM = 16, LB_BB = 17, LB_BA = 18, LB_SP = 19, LB_BK = 20, LB_CR = 21, LB_LF = 22, LB_CB = + // 23, + // LB_SA = 24, LB_AI = 25, LB_B2 = 26, LB_SG = 27, LB_ZW = 28, + // LB_NL = 29, + // LB_WJ = 30, + // */ + /// * + // static final byte Format = 0, Sep = 1, Sp = 2, OLetter = 3, Lower = 4, Upper = 5, + // Numeric = 6, Close = 7, ATerm = 8, Term = 9, Other = 10, + // LIMIT = Other + 1; + + // static final String[] Names = {"Format", "Sep", "Sp", "OLetter", "Lower", "Upper", "Numeric", + // "Close", "ATerm", "Term", "Other" }; + + // static UnicodeSet sepSet = new UnicodeSet("[\\u000a\\u000d\\u0085\\u2029\\u2028]"); + // static UnicodeSet atermSet = new UnicodeSet("[\\u002E]"); + // static UnicodeSet termSet = new UnicodeSet( + // 
"[\\u0021\\u003F\\u0589\\u061f\\u06d4\\u0700-\\u0702\\u0934" + // + "\\u1362\\u1367\\u1368\\u104A\\u104B\\u166E" + // + "\\u1803\\u1809\\u203c\\u203d" + // + "\\u2048\\u2049\\u3002\\ufe52\\ufe57\\uff01\\uff0e\\uff1f\\uff61]"); + + // static UnicodeProperty lowercaseProp = UnifiedBinaryProperty.make(DERIVED | PropLowercase); + // static UnicodeProperty uppercaseProp = UnifiedBinaryProperty.make(DERIVED | PropUppercase); + + // UnicodeSet linebreakNS = UnifiedBinaryProperty.make(LINE_BREAK | LB_NU).getSet(); + // */ + + /// * //// stuff that subclasses need to override - //public String getTypeID(int cp) { - //byte type = getType(cp); - //return Names[type]; - //} + // public String getTypeID(int cp) { + // byte type = getType(cp); + // return Names[type]; + // } //// stuff that subclasses need to override - //public byte getType(int cp) { - //byte cat = ucd.getCategory(cp); - - //if (cat == Cf) return Format; - //if (sepSet.contains(cp)) return Sep; - //if (ucd.getBinaryProperty(cp, White_space)) return Sp; - //if (linebreakNS.contains(cp)) return Numeric; - //if (lowercaseProp.hasValue(cp)) return Lower; - //if (uppercaseProp.hasValue(cp) || cat == Lt) return Upper; - //if (alphabeticSet.contains(cp)) return OLetter; - //if (atermSet.contains(cp)) return ATerm; - //if (termSet.contains(cp)) return Term; - //if (cat == Po || cat == Pe - //|| ucd.getLineBreak(cp) == LB_QU) return Close; - //return Other; - //} - //*/ - - //public int genTestItems(String before, String after, String[] results) { - //results[0] = before + after; - ///* - //results[1] = 'a' + before + "\u0301\u0308" + after + "\u0301\u0308" + 'a'; - //results[2] = 'a' + before + "\u0301\u0308" + samples[MidLetter] + after + "\u0301\u0308" + 'a'; - //results[3] = 'a' + before + "\u0301\u0308" + samples[MidNum] + after + "\u0301\u0308" + 'a'; - //*/ - //return 1; - //} - - //static Context context = new Context(); - - //public boolean isBreak(String source, int offset) { + // public byte getType(int cp) { 
+ // byte cat = ucd.getCategory(cp); + + // if (cat == Cf) return Format; + // if (sepSet.contains(cp)) return Sep; + // if (ucd.getBinaryProperty(cp, White_space)) return Sp; + // if (linebreakNS.contains(cp)) return Numeric; + // if (lowercaseProp.hasValue(cp)) return Lower; + // if (uppercaseProp.hasValue(cp) || cat == Lt) return Upper; + // if (alphabeticSet.contains(cp)) return OLetter; + // if (atermSet.contains(cp)) return ATerm; + // if (termSet.contains(cp)) return Term; + // if (cat == Po || cat == Pe + // || ucd.getLineBreak(cp) == LB_QU) return Close; + // return Other; + // } + // */ + + // public int genTestItems(String before, String after, String[] results) { + // results[0] = before + after; + /// * + // results[1] = 'a' + before + "\u0301\u0308" + after + "\u0301\u0308" + 'a'; + // results[2] = 'a' + before + "\u0301\u0308" + samples[MidLetter] + after + "\u0301\u0308" + + // 'a'; + // results[3] = 'a' + before + "\u0301\u0308" + samples[MidNum] + after + "\u0301\u0308" + 'a'; + // */ + // return 1; + // } + + // static Context context = new Context(); + + // public boolean isBreak(String source, int offset) { //// Break at the start and end of text. - //setRule("1: sot ÷"); - //if (offset < 0 || offset > source.length()) return false; + // setRule("1: sot ÷"); + // if (offset < 0 || offset > source.length()) return false; - //if (offset == 0) return true; + // if (offset == 0) return true; - //setRule("2: ÷ eot"); - //if (offset == source.length()) return true; + // setRule("2: ÷ eot"); + // if (offset == source.length()) return true; - //setRule("3: Sep ÷"); - //byte beforeChar = getResolvedType(source.charAt(offset-1)); - //if (beforeChar == Sep) return true; + // setRule("3: Sep ÷"); + // byte beforeChar = getResolvedType(source.charAt(offset-1)); + // if (beforeChar == Sep) return true; //// Treat a grapheme cluster as if it were a single character: //// the first base character, if there is one; otherwise the first character. 
- //setRule("4: GC -> FC"); - //if (!grapheme.isBreak( source, offset)) return false; + // setRule("4: GC -> FC"); + // if (!grapheme.isBreak( source, offset)) return false; - //// Ignore interior Format characters. That is, ignore Format characters in all subsequent rules. - //setRule("5: X Format* -> X"); - //byte afterChar = getResolvedType(source.charAt(offset)); - //if (afterChar == Format) return false; + //// Ignore interior Format characters. That is, ignore Format characters in all subsequent + // rules. + // setRule("5: X Format* -> X"); + // byte afterChar = getResolvedType(source.charAt(offset)); + // if (afterChar == Format) return false; - //getGraphemeBases(breaker, source, offset, Format, context); - //byte before = context.tBefore; - //byte after = context.tAfter; - //byte before2 = context.tBefore2; - //byte after2 = context.tAfter2; + // getGraphemeBases(breaker, source, offset, Format, context); + // byte before = context.tBefore; + // byte after = context.tAfter; + // byte before2 = context.tBefore2; + // byte after2 = context.tAfter2; //// HACK COPY for rule collection! 
- //if (collectingRules) { - //setRule("6: ATerm × ( Numeric | Lower )"); - //setRule("7: Upper ATerm × Upper"); + // if (collectingRules) { + // setRule("6: ATerm × ( Numeric | Lower )"); + // setRule("7: Upper ATerm × Upper"); //// setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower"); - //setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR | LF | STerm | ATerm) )* Lower"); - //setRule("8a: STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)"); - //setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )"); - //setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )"); - //setRule("11: ( Term | ATerm ) Close* Sp* ÷"); - //setRule("12: Any × Any"); - //collectingRules = false; - //} - - //// Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter, is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence. 
- - //if (before == ATerm) { - //setRule("6: ATerm × ( Numeric | Lower )"); - //if (after == Lower || after == Numeric) return false; - //setRule("7: Upper ATerm × Upper"); - //if (DEBUG_GRAPHEMES) System.out.println(context + ", " + Upper); - //if (before2 == Upper && after == Upper) return false; - //} + // setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR | LF | STerm | ATerm) + // )* Lower"); + // setRule("8a: STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)"); + // setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )"); + // setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )"); + // setRule("11: ( Term | ATerm ) Close* Sp* ÷"); + // setRule("12: Any × Any"); + // collectingRules = false; + // } + + //// Do not break after ambiguous terminators like period, if immediately followed by a number + // or lowercase letter, is between uppercase letters, or if the first following letter + // (optionally after certain punctuation) is lowercase. For example, a period may be an + // abbreviation or numeric period, and not mark the end of a sentence. + + // if (before == ATerm) { + // setRule("6: ATerm × ( Numeric | Lower )"); + // if (after == Lower || after == Numeric) return false; + // setRule("7: Upper ATerm × Upper"); + // if (DEBUG_GRAPHEMES) System.out.println(context + ", " + Upper); + // if (before2 == Upper && after == Upper) return false; + // } //// The following cases are all handled together. //// First we loop backwards, checking for the different types. 
- //MyBreakIterator graphemeIterator = new MyBreakIterator(grapheme); - //graphemeIterator.set(source, offset); + // MyBreakIterator graphemeIterator = new MyBreakIterator(grapheme); + // graphemeIterator.set(source, offset); - //int state = 0; - //int lookAfter = -1; - //int cp; - //byte t; - //boolean gotSpace = false; - //boolean gotClose = false; + // int state = 0; + // int lookAfter = -1; + // int cp; + // byte t; + // boolean gotSpace = false; + // boolean gotClose = false; - //behindLoop: - //while (true) { - //cp = graphemeIterator.previousBase(); - //if (cp == -1) break; - //t = getResolvedType(cp); - //if (SHOW_TYPE) System.out.println(ucd.getCodeAndName(cp) + ", " + getTypeID(cp)); + // behindLoop: + // while (true) { + // cp = graphemeIterator.previousBase(); + // if (cp == -1) break; + // t = getResolvedType(cp); + // if (SHOW_TYPE) System.out.println(ucd.getCodeAndName(cp) + ", " + getTypeID(cp)); - //if (t == Format) continue; // ignore all formats! + // if (t == Format) continue; // ignore all formats! 
- //switch (state) { - //case 0: - //if (t == Sp) { + // switch (state) { + // case 0: + // if (t == Sp) { //// loop as long as we have Space - //gotSpace = true; - //continue behindLoop; - //} else if (t == Close) { - //gotClose = true; - //state = 1; // go to close loop - //continue behindLoop; - //} - //break; - //case 1: - //if (t == Close) { + // gotSpace = true; + // continue behindLoop; + // } else if (t == Close) { + // gotClose = true; + // state = 1; // go to close loop + // continue behindLoop; + // } + // break; + // case 1: + // if (t == Close) { //// loop as long as we have Close - //continue behindLoop; - //} - //break; - //} - //if (t == ATerm) { - //lookAfter = ATerm; - //} else if (t == STerm) { - //lookAfter = STerm; - //} - //break; - //} + // continue behindLoop; + // } + // break; + // } + // if (t == ATerm) { + // lookAfter = ATerm; + // } else if (t == STerm) { + // lookAfter = STerm; + // } + // break; + // } //// if we didn't find ATerm or Term, bail - //if (lookAfter == -1) { + // if (lookAfter == -1) { //// Otherwise, do not break //// Any × Any (11) - //setRule("12: Any × Any"); - //return false; - //} + // setRule("12: Any × Any"); + // return false; + // } //// ATerm Close* Sp*×(¬( OLetter))* Lower(8) - //// Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator. + //// Break after sentence terminators, but include closing punctuation, trailing spaces, and + // (optionally) a paragraph separator. //// ( Term | ATerm ) Close*×( Close | Sp | Sep )(9) //// ( Term | ATerm ) Close* Sp×( Sp | Sep )(10) //// ( Term | ATerm ) Close* Sp*÷(11) - //// We DID find one. Loop to see if the right side is ok. 
- //graphemeIterator.set(source, offset); - //boolean isFirst = true; - //while (true) { - //cp = graphemeIterator.nextBase(); - //if (cp == -1) break; - //t = getResolvedType(cp); - //if (SHOW_TYPE) System.out.println(ucd.getCodeAndName(cp) + ", " + getTypeID(cp)); - - //if (t == Format) continue; // skip format characters! - - //if (isFirst) { - //isFirst = false; - //if (lookAfter == ATerm && t == Upper) { - //setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR | LF | STerm | ATerm) )* Lower"); - //return false; - //} - //if (gotSpace) { - //if (t == Sp || t == Sep) { - //setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )"); - //return false; - //} - //} else if (t == Close || t == Sp || t == Sep) { - //setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )"); - //return false; - //} - //if (lookAfter == STerm) break; - //} + // graphemeIterator.set(source, offset); + // boolean isFirst = true; + // while (true) { + // cp = graphemeIterator.nextBase(); + // if (cp == -1) break; + // t = getResolvedType(cp); + // if (SHOW_TYPE) System.out.println(ucd.getCodeAndName(cp) + ", " + getTypeID(cp)); + + // if (t == Format) continue; // skip format characters! + + // if (isFirst) { + // isFirst = false; + // if (lookAfter == ATerm && t == Upper) { + // setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR | LF | STerm | ATerm) + // )* Lower"); + // return false; + // } + // if (gotSpace) { + // if (t == Sp || t == Sep) { + // setRule("10: ( Term | ATerm ) Close* Sp × ( Sp | Sep )"); + // return false; + // } + // } else if (t == Close || t == Sp || t == Sep) { + // setRule("9: ( Term | ATerm ) Close* × ( Close | Sp | Sep )"); + // return false; + // } + // if (lookAfter == STerm) break; + // } //// at this point, we have an ATerm. 
All other conditions are ok, but we need to verify 6 - //if (t != OLetter && t != Upper && t != Lower) continue; - //if (t == Lower) { - //setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower"); - //return false; - //} - //break; - //} - //setRule("11: ( Term | ATerm ) Close* Sp* ÷"); - //return true; - //} - //} + // if (t != OLetter && t != Upper && t != Lower) continue; + // if (t == Lower) { + // setRule("8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower"); + // return false; + // } + // break; + // } + // setRule("11: ( Term | ATerm ) Close* Sp* ÷"); + // return true; + // } + // } static final boolean DEBUG_GRAPHEMES = false; - static final Transliterator escaper = Transliterator.createFromRules( - "escape", "::[[:di:][:c:]] any-hex/c;", Transliterator.FORWARD); + static final Transliterator escaper = + Transliterator.createFromRules( + "escape", "::[[:di:][:c:]] any-hex/c;", Transliterator.FORWARD); static class MyBreakIterator { int offset = 0; @@ -2776,8 +2995,9 @@ static class MyBreakIterator { MyBreakIterator(GenerateBreakTest breaker) { this.breaker = breaker; // = new GenerateGraphemeBreakTest() } + public MyBreakIterator set(String source, int offset) { - //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(string) + "; " + offset); + // if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(string) + "; " + offset); string = source; this.offset = offset; return this; @@ -2793,7 +3013,7 @@ public int nextBase() { break; } } - //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result)); + // if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result)); return result; } @@ -2807,7 +3027,7 @@ public int previousBase() { } } final int result = UTF16.charAt(string, offset); - //if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result)); + // if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result)); return result; } } @@ -2903,6 +3123,7 @@ public static String[] add(String[] strings1, String... 
strings2) { public String getSample(UnicodeProperty prop2, String value) { return getSample(prop2, value, 1); } + public String getSample(UnicodeProperty prop2, String value, int count) { UnicodeSet us = prop2.getSet(value); if (prop.getName().startsWith("Extended_Pictographic")) { diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseFolding.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseFolding.java index 60532cacd..a525af616 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseFolding.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseFolding.java @@ -1,16 +1,15 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateCaseFolding.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateCaseFolding.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; +import com.ibm.icu.text.UTF16; import java.io.IOException; import java.io.PrintWriter; import java.util.BitSet; @@ -19,24 +18,22 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.text.utility.Settings; import org.unicode.text.utility.UnicodeDataFile; import org.unicode.text.utility.UnicodeDataFile.FileInfix; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.UTF16; - public class GenerateCaseFolding implements UCD_Types { public static boolean DEBUG = false; public static boolean COMMENT_DIFFS = false; // ON if we want a comment on mappings != lowercase - public static boolean PICK_SHORT = false; // picks short value for SIMPLE if in FULL, changes weighting - public static boolean NF_CLOSURE = false; // picks short value for SIMPLE if in FULL, changes weighting + public static boolean PICK_SHORT = + false; // picks short value for SIMPLE if in FULL, changes weighting + public static boolean NF_CLOSURE = + false; // picks short value for SIMPLE if in FULL, changes weighting static final int CHECK_CHAR = 0x130; // for debugging, change to actual character, otherwise -1 // PICK_SHORT & NF_CLOSURE = false for old style - /*public static void main(String[] args) throws java.io.IOException { makeCaseFold(arg[0]); //getAge(); @@ -45,15 +42,15 @@ public class GenerateCaseFolding implements UCD_Types { static PrintWriter log; - public static void makeCaseFold(boolean normalized) throws java.io.IOException { PICK_SHORT = NF_CLOSURE = normalized; String suffix = FileInfix.getDefault().getFileSuffix(".txt"); - log = Utility.openPrintWriter( - Settings.Output.GEN_DIR + "/log", - "CaseFoldingLog" + suffix, - Utility.LATIN1_UNIX); + log = + Utility.openPrintWriter( + Settings.Output.GEN_DIR + "/log", + "CaseFoldingLog" + suffix, + Utility.LATIN1_UNIX); System.out.println("Writing Log: " + "CaseFoldingLog" 
+ suffix); System.out.println("Making Full Data"); @@ -81,7 +78,7 @@ public static void makeCaseFold(boolean normalized) throws java.io.IOException { final String directory = "UCD/" + Default.ucd().getVersion() + '/'; final UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(directory, filename) - .setSkipCopyright(Settings.SKIP_COPYRIGHT); + .setSkipCopyright(Settings.SKIP_COPYRIGHT); final PrintWriter out = fc.out; /* @@ -104,7 +101,10 @@ public static void makeCaseFold(boolean normalized) throws java.io.IOException { final String rSimple = simpleData.get(UTF16.valueOf(ch)); final String rFullTurkish = fullDataTurkish.get(UTF16.valueOf(ch)); final String rSimpleTurkish = simpleDataTurkish.get(UTF16.valueOf(ch)); - if (rFull == null && rSimple == null && rFullTurkish == null && rSimpleTurkish == null) { + if (rFull == null + && rSimple == null + && rFullTurkish == null + && rSimpleTurkish == null) { continue; } @@ -119,7 +119,7 @@ public static void makeCaseFold(boolean normalized) throws java.io.IOException { drawLine(out, ch, "T", "i"); } else if (ch == 0x131) { // do nothing - //drawLine(out, ch, "I", "i"); + // drawLine(out, ch, "I", "i"); } else { drawLine(out, ch, type, rFull); } @@ -146,12 +146,12 @@ public static void makeCaseFold(boolean normalized) throws java.io.IOException { /* Goal is following (with no entries for 0131 or 0069) -0049; C; 0069; # LATIN CAPITAL LETTER I -0049; T; 0131; # LATIN CAPITAL LETTER I + 0049; C; 0069; # LATIN CAPITAL LETTER I + 0049; T; 0131; # LATIN CAPITAL LETTER I -0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE -0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE - */ + 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE + 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE + */ static void drawLine(PrintWriter out, int ch, String type, String result) { String comment = ""; @@ -164,21 +164,32 @@ static void drawLine(PrintWriter out, int ch, String type, String result) { } else { 
Utility.fixDot(); System.out.println("PROBLEM WITH: " + Default.ucd().getCodeAndName(ch)); - comment = "[DIFF " + Utility.hex(lower, " ") + ", " + Utility.hex(lower2, " ") + "] "; + comment = + "[DIFF " + + Utility.hex(lower, " ") + + ", " + + Utility.hex(lower2, " ") + + "] "; } } } - out.println(Utility.hex(ch) - + "; " + type - + "; " + Utility.hex(result, " ") - + "; # " + comment + Default.ucd().getName(ch)); + out.println( + Utility.hex(ch) + + "; " + + type + + "; " + + Utility.hex(result, " ") + + "; # " + + comment + + Default.ucd().getName(ch)); } static int probeCh = 0x01f0; static String shower = UTF16.valueOf(probeCh); - static Map getCaseFolding(boolean full, boolean nfClose, String condition) throws java.io.IOException { + static Map getCaseFolding(boolean full, boolean nfClose, String condition) + throws java.io.IOException { final Map> data = new TreeMap>(); final Map repChar = new TreeMap(); @@ -186,7 +197,7 @@ static Map getCaseFolding(boolean full, boolean nfClose, String for (int ch = 0; ch <= 0x10FFFF; ++ch) { Utility.dot(ch); - //if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch)); + // if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch)); if (!Default.ucd().isRepresented(ch)) { continue; } @@ -232,7 +243,7 @@ static Map getCaseFolding(boolean full, boolean nfClose, String Utility.fixDot(); log.println("Non-Optimal Representative " + message); log.println(" Rep:\t" + Default.ucd().getCodeAndName(rep)); - log.println(" Set:\t" + toString(set,true, true)); + log.println(" Set:\t" + toString(set, true, true)); } log.println(); @@ -268,7 +279,7 @@ static int goodness(String s, boolean full, String condition) { if (s == null) { return 0; } - int result = 32-s.length(); + int result = 32 - s.length(); if (!PICK_SHORT) { result = s.length(); } @@ -278,7 +289,7 @@ static int goodness(String s, boolean full, String condition) { // Cherokee case-folds to uppercase letters which were encoded first. 
// It became bicameral in Unicode 8 with the addition of lowercase letters. int first = s.codePointAt(0); - boolean isCherokee = 0x13A0 <= first && first <= 0x13FF; // original Cherokee block + boolean isCherokee = 0x13A0 <= first && first <= 0x13FF; // original Cherokee block final String low; if (isCherokee) { low = upper(lower(s, full, condition), full, condition); @@ -302,7 +313,6 @@ static int goodness(String s, boolean full, String condition) { return result; } - /* static HashSet temp = new HashSet(); static void normalize(HashSet set) { @@ -325,34 +335,39 @@ static void normalize(HashSet set) { */ /* - String - String lower1 = Default.ucd.getLowercase(ch); - String lower2 = Default.ucd.toLowercase(ch,option); - - char ch2 = Default.ucd.getLowercase(Default.ucd.getUppercase(ch).charAt(0)).charAt(0); - //String lower1 = String.valueOf(Default.ucd.getLowercase(ch)); - //String lower = Default.ucd.toLowercase(ch2,option); - String upper = Default.ucd.toUppercase(ch2,option); - String lowerUpper = Default.ucd.toLowercase(upper,option); - //String title = Default.ucd.toTitlecase(ch2,option); - //String lowerTitle = Default.ucd.toLowercase(upper,option); - - if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { // - output.println(Utility.hex(ch) - + "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? 
"S" : "E") - + "; " + Utility.hex(lowerUpper," ") - + ";\t#" + Default.ucd.getName(ch) - ); - //if (!lowerUpper.equals(lower)) { - // output.println("Warning1: " + Utility.hex(lower) + " " + Default.ucd.getName(lower)); - //} - //if (!lowerUpper.equals(lowerTitle)) { - // output.println("Warning2: " + Utility.hex(lowerTitle) + " " + Default.ucd.getName(lowerTitle)); - //} - } - */ - - static void getClosure(int ch, Map> data, boolean full, boolean nfClose, String condition) { + String + String lower1 = Default.ucd.getLowercase(ch); + String lower2 = Default.ucd.toLowercase(ch,option); + + char ch2 = Default.ucd.getLowercase(Default.ucd.getUppercase(ch).charAt(0)).charAt(0); + //String lower1 = String.valueOf(Default.ucd.getLowercase(ch)); + //String lower = Default.ucd.toLowercase(ch2,option); + String upper = Default.ucd.toUppercase(ch2,option); + String lowerUpper = Default.ucd.toLowercase(upper,option); + //String title = Default.ucd.toTitlecase(ch2,option); + //String lowerTitle = Default.ucd.toLowercase(upper,option); + + if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { // + output.println(Utility.hex(ch) + + "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? 
"S" : "E") + + "; " + Utility.hex(lowerUpper," ") + + ";\t#" + Default.ucd.getName(ch) + ); + //if (!lowerUpper.equals(lower)) { + // output.println("Warning1: " + Utility.hex(lower) + " " + Default.ucd.getName(lower)); + //} + //if (!lowerUpper.equals(lowerTitle)) { + // output.println("Warning2: " + Utility.hex(lowerTitle) + " " + Default.ucd.getName(lowerTitle)); + //} + } + */ + + static void getClosure( + int ch, + Map> data, + boolean full, + boolean nfClose, + String condition) { if (ch == '\u023F') { System.out.println("???"); } @@ -378,45 +393,46 @@ static void getClosure(int ch, Map> data, boolean full, bool // close it main: - while (true) { - final Iterator it = set.iterator(); - while (it.hasNext()) { - final String s = it.next(); - // do funny stuff since we can't modify set while iterating - // We don't do this because if the source is not normalized, we don't want to normalize - if (nfClose) { - if (add(set, Default.nfd().normalize(s), data)) { - continue main; - } - if (add(set, Default.nfc().normalize(s), data)) { - continue main; - } - if (add(set, Default.nfkd().normalize(s), data)) { - continue main; - } - if (add(set, Default.nfkc().normalize(s), data)) { - continue main; - } + while (true) { + final Iterator it = set.iterator(); + while (it.hasNext()) { + final String s = it.next(); + // do funny stuff since we can't modify set while iterating + // We don't do this because if the source is not normalized, we don't want to + // normalize + if (nfClose) { + if (add(set, Default.nfd().normalize(s), data)) { + continue main; } - if (add(set, lower(s, full, condition), data)) { + if (add(set, Default.nfc().normalize(s), data)) { continue main; } - if (add(set, title(s, full, condition), data)) { + if (add(set, Default.nfkd().normalize(s), data)) { continue main; } - if (add(set, upper(s, full, condition), data)) { + if (add(set, Default.nfkc().normalize(s), data)) { continue main; } } - break; + if (add(set, lower(s, full, condition), data)) { + 
continue main; + } + if (add(set, title(s, full, condition), data)) { + continue main; + } + if (add(set, upper(s, full, condition), data)) { + continue main; + } } + break; + } if (set.size() > 1) { data.put(charStr, set); } } static String lower(String s, boolean full, String condition) { - final String result = lower2(s,full, condition); + final String result = lower2(s, full, condition); return result.replace('\u03C2', '\u03C3'); // HACK for lower } @@ -501,9 +517,8 @@ static String toString(Set set, boolean name, boolean crtab) { } static boolean specialNormalizationDiffers(int ch) { - if (ch == 0x00DF) - { - return true; // es-zed + if (ch == 0x00DF) { + return true; // es-zed } return !Default.nfkd().isNormalized(ch); } @@ -516,25 +531,21 @@ static String specialNormalization(String s) { } static boolean isExcluded(int ch) { - // if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT ABOVE - if (ch == 0x0132 || ch == 0x0133) - { + // if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT + // ABOVE + if (ch == 0x0132 || ch == 0x0133) { return true; // skip IJ, ij } - if (0x1F16A <= ch && ch <= 0x1F16C) - { + if (0x1F16A <= ch && ch <= 0x1F16C) { return true; // skip raised MC/MD/MR signs } - if (ch == 0x037A) - { - return true; // skip GREEK YPOGEGRAMMENI + if (ch == 0x037A) { + return true; // skip GREEK YPOGEGRAMMENI } - if (0x249C <= ch && ch <= 0x24B5) - { + if (0x249C <= ch && ch <= 0x24B5) { return true; // skip PARENTHESIZED LATIN SMALL LETTER A.. } - if (0x20A8 <= ch && ch <= 0x217B) - { + if (0x20A8 <= ch && ch <= 0x217B) { return true; // skip Rupee.. 
} @@ -542,7 +553,7 @@ static boolean isExcluded(int ch) { if (type == COMPAT_SQUARE) { return true; } - //if (type == COMPAT_UNSPECIFIED) return true; + // if (type == COMPAT_UNSPECIFIED) return true; return false; } @@ -613,12 +624,15 @@ static void generateSpecialCasing(boolean normalize) throws IOException { System.out.println("BUpper: " + Default.ucd().getCodeAndName(bupper)); } - // presumably if there is a single code point, it would already be in the simple mappings + // presumably if there is a single code point, it would already be in the simple + // mappings - if (UTF16.countCodePoint(flower) == 1 && UTF16.countCodePoint(fupper) == 1 + if (UTF16.countCodePoint(flower) == 1 + && UTF16.countCodePoint(fupper) == 1 && UTF16.countCodePoint(title) == 1) { if (ch == CHECK_CHAR) { - System.out.println("Skipping single code point: " + Default.ucd().getCodeAndName(ch)); + System.out.println( + "Skipping single code point: " + Default.ucd().getCodeAndName(ch)); } continue; } @@ -633,7 +647,8 @@ static void generateSpecialCasing(boolean normalize) throws IOException { } // fix special cases - // if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) continue; + // if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) + // continue; if (flower.equals(blower)) { flower = lower; } @@ -655,51 +670,80 @@ static void generateSpecialCasing(boolean normalize) throws IOException { final String name = Default.ucd().getName(ch); - final int order = name.equals("LATIN SMALL LETTER SHARP S") ? 1 - : ch == 0x130 ? 2 - : name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 4 - : name.indexOf("LIGATURE") >= 0 ? 3 - : name.indexOf("GEGRAMMENI") < 0 ? 5 - : UTF16.countCodePoint(ftitle) == 1 ? 6 - : UTF16.countCodePoint(fupper) == 2 ? 
7 - : 8; - - if (ch == CHECK_CHAR) { - System.out.println("Order: " + order + " for " + Default.ucd().getCodeAndName(ch)); - } - - // HACK - final boolean denormalize = !normalize && order != 6 && order != 7; - - final String mapping = Utility.hex(ch) - + "; " + Utility.hex(flower.equals(base) ? chstr : denormalize ? Default.nfd().normalize(flower) : flower) - + "; " + Utility.hex(ftitle.equals(base) ? chstr : denormalize ? Default.nfd().normalize(ftitle) : ftitle) - + "; " + Utility.hex(fupper.equals(base) ? chstr : denormalize ? Default.nfd().normalize(fupper) : fupper) - + "; # " + Default.ucd().getName(ch); - - // special exclusions - if (isExcluded(ch)) { - log.println("# " + mapping); - } else { - int x = ch; - if (ch == 0x01F0) - { - x = 0x03B1; // HACK to reorder the same - } - sorted.put(new Integer((order << 24) | x), mapping); - } + final int order = + name.equals("LATIN SMALL LETTER SHARP S") + ? 1 + : ch == 0x130 + ? 2 + : name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 + ? 4 + : name.indexOf("LIGATURE") >= 0 + ? 3 + : name.indexOf("GEGRAMMENI") < 0 + ? 5 + : UTF16.countCodePoint(ftitle) == 1 + ? 6 + : UTF16.countCodePoint(fupper) + == 2 + ? 7 + : 8; + + if (ch == CHECK_CHAR) { + System.out.println("Order: " + order + " for " + Default.ucd().getCodeAndName(ch)); + } + + // HACK + final boolean denormalize = !normalize && order != 6 && order != 7; + + final String mapping = + Utility.hex(ch) + + "; " + + Utility.hex( + flower.equals(base) + ? chstr + : denormalize + ? Default.nfd().normalize(flower) + : flower) + + "; " + + Utility.hex( + ftitle.equals(base) + ? chstr + : denormalize + ? Default.nfd().normalize(ftitle) + : ftitle) + + "; " + + Utility.hex( + fupper.equals(base) + ? chstr + : denormalize + ? 
Default.nfd().normalize(fupper) + : fupper) + + "; # " + + Default.ucd().getName(ch); + + // special exclusions + if (isExcluded(ch)) { + log.println("# " + mapping); + } else { + int x = ch; + if (ch == 0x01F0) { + x = 0x03B1; // HACK to reorder the same + } + sorted.put(new Integer((order << 24) | x), mapping); + } } log.close(); System.out.println("Writing"); - //String newFile = "DerivedData/SpecialCasing" + suffix2 + UnicodeDataFile.getFileSuffix(true); - //PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); + // String newFile = "DerivedData/SpecialCasing" + suffix2 + + // UnicodeDataFile.getFileSuffix(true); + // PrintWriter out = Utility.openPrintWriter(newFile, Utility.LATIN1_UNIX); final UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader( - "UCD/" + Default.ucd().getVersion() + '/', - "SpecialCasing" + suffix2). - setSkipCopyright(Settings.SKIP_COPYRIGHT); + "UCD/" + Default.ucd().getVersion() + '/', + "SpecialCasing" + suffix2) + .setSkipCopyright(Settings.SKIP_COPYRIGHT); final PrintWriter out = udf.out; /* String[] batName = {""}; @@ -708,7 +752,7 @@ static void generateSpecialCasing(boolean normalize) throws IOException { out.println(UnicodeDataFile.generateDateLine()); out.println("#"); */ - //Utility.appendFile("org/unicode/text/UCD/SpecialCasingHeader.txt", Utility.UTF8, out); + // Utility.appendFile("org/unicode/text/UCD/SpecialCasingHeader.txt", Utility.UTF8, out); final Iterator it = sorted.keySet().iterator(); int lastOrder = -1; @@ -716,33 +760,49 @@ static void generateSpecialCasing(boolean normalize) throws IOException { final Integer key = it.next(); final String line = sorted.get(key); final int order = key.intValue() >> 24; - if (order != lastOrder) { - lastOrder = order; - out.println(); - boolean skipLine = false; - switch(order) { - case 1: - out.println("# The German es-zed is special--the normal mapping is to SS."); - out.println("# Note: the titlecase should never occur in practice. 
It is equal to titlecase(uppercase())"); - break; - case 2: - out.println("# Preserve canonical equivalence for I with dot. Turkic is handled below."); - break; - case 3: out.println("# Ligatures"); break; - case 4: skipLine = true; break; - case 5: out.println("# No corresponding uppercase precomposed character"); break; - case 6: Utility.appendFile(Settings.SRC_UCD_DIR + "SpecialCasingIota.txt", Utility.UTF8, out); break; - case 7: out.println("# Some characters with YPOGEGRAMMENI also have no corresponding titlecases"); break; - case 8: skipLine = true; break; - } - if (!skipLine) { - out.println(); - } - } - out.println(line); + if (order != lastOrder) { + lastOrder = order; + out.println(); + boolean skipLine = false; + switch (order) { + case 1: + out.println("# The German es-zed is special--the normal mapping is to SS."); + out.println( + "# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase())"); + break; + case 2: + out.println( + "# Preserve canonical equivalence for I with dot. 
Turkic is handled below."); + break; + case 3: + out.println("# Ligatures"); + break; + case 4: + skipLine = true; + break; + case 5: + out.println("# No corresponding uppercase precomposed character"); + break; + case 6: + Utility.appendFile( + Settings.SRC_UCD_DIR + "SpecialCasingIota.txt", Utility.UTF8, out); + break; + case 7: + out.println( + "# Some characters with YPOGEGRAMMENI also have no corresponding titlecases"); + break; + case 8: + skipLine = true; + break; + } + if (!skipLine) { + out.println(); + } + } + out.println(line); } Utility.appendFile(Settings.SRC_UCD_DIR + "SpecialCasingFooter.txt", Utility.UTF8, out); udf.close(); - //Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]); + // Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]); } } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseTest.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseTest.java index 924ab0cfd..4ab620e49 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseTest.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateCaseTest.java @@ -1,29 +1,27 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateCaseTest.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateCaseTest.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; +import com.ibm.icu.text.UTF16; import java.io.IOException; import java.io.PrintWriter; - import org.unicode.text.utility.Utility; -import com.ibm.icu.text.UTF16; - -abstract public class GenerateCaseTest implements UCD_Types { +public abstract class GenerateCaseTest implements UCD_Types { public static void main(String[] args) throws IOException { - System.out.println("Remember to add length marks (half & full) and other punctuation for sentence, with FF61"); + System.out.println( + "Remember to add length marks (half & full) and other punctuation for sentence, with FF61"); - final PrintWriter out = Utility.openPrintWriterGenDir("log/CaseTest.txt", Utility.UTF8_WINDOWS); + final PrintWriter out = + Utility.openPrintWriterGenDir("log/CaseTest.txt", Utility.UTF8_WINDOWS); out.println("# CaseTest"); out.println("# Generated: " + Default.getDate() + ", MED"); @@ -47,9 +45,7 @@ public static void main(String[] args) throws IOException { final String upper = Default.ucd().getCase(cp, FULL, UPPER); final String title = Default.ucd().getCase(cp, FULL, TITLE); final String fold = Default.ucd().getCase(cp, FULL, FOLD); - if (lower.equals(upper) - && lower.equals(title) - && lower.equals(fold)) { + if (lower.equals(upper) && lower.equals(title) && lower.equals(fold)) { continue; } @@ -67,11 +63,10 @@ public static void main(String[] args) throws IOException { final String title1 = Default.nfc().normalize(Default.ucd().getCase(s2, FULL, TITLE)); final String fold1 = Default.nfc().normalize(Default.ucd().getCase(s2, FULL, FOLD)); - if (lower1.equals(Default.nfc().normalize(lower+testChar)) - && upper1.equals(Default.nfc().normalize(upper+testChar)) - && title1.equals(Default.nfc().normalize(title+testChar)) - && fold1.equals(Default.nfc().normalize(fold+testChar)) - ) { + if (lower1.equals(Default.nfc().normalize(lower + testChar)) + && 
upper1.equals(Default.nfc().normalize(upper + testChar)) + && title1.equals(Default.nfc().normalize(title + testChar)) + && fold1.equals(Default.nfc().normalize(fold + testChar))) { continue; } @@ -90,13 +85,17 @@ static void write(PrintWriter out, String ss, boolean doComment) { final String upper = Default.nfc().normalize(Default.ucd().getCase(s, FULL, UPPER)); final String title = Default.nfc().normalize(Default.ucd().getCase(s, FULL, TITLE)); final String fold = Default.nfc().normalize(Default.ucd().getCase(s, FULL, FOLD)); - out.println(Utility.hex(ss) + "; " - + Utility.hex(lower) + "; " - + Utility.hex(upper) + "; " - + Utility.hex(title) + "; " - + Utility.hex(fold) - + (doComment ? "\t# " + Default.ucd().getName(ss) : "") - ); + out.println( + Utility.hex(ss) + + "; " + + Utility.hex(lower) + + "; " + + Utility.hex(upper) + + "; " + + Utility.hex(title) + + "; " + + Utility.hex(fold) + + (doComment ? "\t# " + Default.ucd().getName(ss) : "")); counter++; } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateConfusables.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateConfusables.java index 134cdffa9..b0e881226 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateConfusables.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateConfusables.java @@ -1,18 +1,28 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. 
* + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateConfusables.java,v $ - * $Date: 2010-06-19 00:29:21 $ - * $Revision: 1.32 $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateConfusables.java,v $ $Date: + * 2010-06-19 00:29:21 $ $Revision: 1.32 $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.dev.util.UnicodeMap.EntryRange; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Transform; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.LocaleData; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.File; import java.io.IOException; @@ -35,7 +45,6 @@ import java.util.function.Predicate; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.ArrayComparator; import org.unicode.cldr.util.Counter; @@ -43,45 +52,32 @@ import org.unicode.cldr.util.With; import org.unicode.cldr.util.XEquivalenceClass; import org.unicode.cldr.util.XEquivalenceClass.Linkage; -import org.unicode.props.BagFormatter; import org.unicode.cldr.util.props.UnicodeLabel; -import org.unicode.props.UnicodeProperty; import org.unicode.idna.Idna.IdnaType; import org.unicode.idna.Uts46; +import org.unicode.props.BagFormatter; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.ScriptInfo; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues.Binary; import org.unicode.props.UcdPropertyValues.NFKD_Quick_Check_Values; import org.unicode.props.UcdPropertyValues.Script_Values; +import org.unicode.props.UnicodeProperty; import org.unicode.text.utility.Settings; import org.unicode.text.utility.UnicodeTransform; import org.unicode.text.utility.Utility; import org.unicode.tools.Confusables; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import 
com.ibm.icu.dev.util.UnicodeMap.EntryRange; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Transform; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.LocaleData; -import com.ibm.icu.util.ULocale; - - public class GenerateConfusables { static { System.setProperty("SCRIPT_UNICODE_VERSION", Default.ucdVersion()); } + private static final String REFERENCE_VERSION = Settings.lastVersion; static final Normalizer NFKD = Default.nfkd(); - private static final ToolUnicodeTransformFactory TOOL_FACTORY = new ToolUnicodeTransformFactory(); + private static final ToolUnicodeTransformFactory TOOL_FACTORY = + new ToolUnicodeTransformFactory(); // Align these three normally. private static final String version = Settings.latestVersion; private static final String REVISION = Settings.latestVersion; @@ -89,13 +85,15 @@ public class GenerateConfusables { static final String reformatedInternal = Settings.UnicodeTools.getDataPathString("security", REVISION) + "/data/"; - public static final String GEN_SECURITY_DIR = Settings.Output.GEN_DIR + "security/" + REVISION + "/"; + public static final String GEN_SECURITY_DIR = + Settings.Output.GEN_DIR + "security/" + REVISION + "/"; - // static final XIDModifications REFERENCE_VALUES = new XIDModifications(Settings.UNICODETOOLS_DIRECTORY + "data/security/" + // static final XIDModifications REFERENCE_VALUES = new + // XIDModifications(Settings.UNICODETOOLS_DIRECTORY + "data/security/" // + REFERENCE_VERSION // + "/xidmodifications.txt"); - //static final UnicodeSet OLD_CONFUSABLE_TARGETS = new UnicodeSet(); + // static final UnicodeSet OLD_CONFUSABLE_TARGETS = new UnicodeSet(); static final Counter LAST_COUNT = new Counter(); static final boolean DEBUG = false; @@ -103,43 +101,61 @@ public class GenerateConfusables { static { 
String path = Settings.UnicodeTools.getDataPathString("security", REFERENCE_VERSION); Confusables REFERENCE_VALUES = new Confusables(path); - for (EntryRange entry : REFERENCE_VALUES.getRawMapToRepresentative(Confusables.Style.MA).entryRanges()) { + for (EntryRange entry : + REFERENCE_VALUES.getRawMapToRepresentative(Confusables.Style.MA).entryRanges()) { if (entry.string != null) { LAST_COUNT.add(entry.value, 1); } else { LAST_COUNT.add(entry.value, entry.codepointEnd - entry.codepoint + 1); } } -// OLD_CONFUSABLE_TARGETS.addAll(REFERENCE_VALUES.getStyle2map().get(Confusables.Style.MA).values()).freeze(); -// System.out.println(OLD_CONFUSABLE_TARGETS.toPattern(false)); + // + // OLD_CONFUSABLE_TARGETS.addAll(REFERENCE_VALUES.getStyle2map().get(Confusables.Style.MA).values()).freeze(); + // System.out.println(OLD_CONFUSABLE_TARGETS.toPattern(false)); for (String s : LAST_COUNT.getKeysetSortedByCount(false)) { - if (DEBUG) System.out.println(LAST_COUNT.get(s) + "\t" + s + "\tU+" + Utility.hex(s) + "\t" + Default.ucd().getName(s)); + if (DEBUG) + System.out.println( + LAST_COUNT.get(s) + + "\t" + + s + + "\tU+" + + Utility.hex(s) + + "\t" + + Default.ucd().getName(s)); } } static final String indir = reformatedInternal + "source/"; static final UCD DEFAULT_UCD = Default.ucd(); - static final UnicodeProperty.Factory ups = ToolUnicodePropertySource.make(version); // ICUPropertyFactory.make(); + static final UnicodeProperty.Factory ups = + ToolUnicodePropertySource.make(version); // ICUPropertyFactory.make(); + static { // USE the tool unicode set instead of ICU, which may not be using the latest version. 
UnicodeSet.setDefaultXSymbolTable(ups.getXSymbolTable()); UnicodeTransform.setFactory(TOOL_FACTORY); } + static final UnicodeSet COMMON_OR_INHERITED; static final UnicodeSet CASED; static final UnicodeSet COMMON_OR_INHERITED_NFKD; static final UnicodeSet CASED_NFKD; static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(version); - static final UnicodeMap> scriptExtensions = iup.loadEnumSet(UcdProperty.Script_Extensions, Script_Values.class); - static final UnicodeSet notNFKD = iup.loadEnum(UcdProperty.NFKD_Quick_Check, NFKD_Quick_Check_Values.class).getSet(NFKD_Quick_Check_Values.No); + static final UnicodeMap> scriptExtensions = + iup.loadEnumSet(UcdProperty.Script_Extensions, Script_Values.class); + static final UnicodeSet notNFKD = + iup.loadEnum(UcdProperty.NFKD_Quick_Check, NFKD_Quick_Check_Values.class) + .getSet(NFKD_Quick_Check_Values.No); + static { UnicodeSet common = scriptExtensions.getSet(Collections.singleton(Script_Values.Common)); - UnicodeSet inherited = scriptExtensions.getSet(Collections.singleton(Script_Values.Inherited)); + UnicodeSet inherited = + scriptExtensions.getSet(Collections.singleton(Script_Values.Inherited)); COMMON_OR_INHERITED = new UnicodeSet(common).addAll(inherited).freeze(); CASED = iup.loadEnum(UcdProperty.Changes_When_Casefolded, Binary.class).getSet(Binary.Yes); COMMON_OR_INHERITED_NFKD = new UnicodeSet(COMMON_OR_INHERITED); CASED_NFKD = new UnicodeSet(CASED); - for (String s: notNFKD) { + for (String s : notNFKD) { if (s.equals("𝐉")) { int debug = 0; } @@ -161,12 +177,11 @@ public class GenerateConfusables { private static final String EXCAPE_FUNNY_RULE = ":: [[:C:]-[:cn:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]] hex/unicode ; "; - static final Transliterator EXCAPE_FUNNY = Transliterator.createFromRules( - "any-html", EXCAPE_FUNNY_RULE, Transliterator.FORWARD); + static final Transliterator EXCAPE_FUNNY = + Transliterator.createFromRules("any-html", EXCAPE_FUNNY_RULE, Transliterator.FORWARD); 
static BagFormatter makeFormatter() { - return new BagFormatter(ups) - .setLineSeparator("\n"); + return new BagFormatter(ups).setLineSeparator("\n"); } private static final boolean SHOW_SUPPRESS = false; @@ -177,7 +192,7 @@ static BagFormatter makeFormatter() { public static void main(String[] args) throws IOException { System.setProperty("line.separator", "\n"); - //quickTest(); + // quickTest(); if (args.length == 0) { args = new String[] {"-b", "-c"}; } @@ -204,9 +219,10 @@ public static void main(String[] args) throws IOException { e.printStackTrace(); } finally { System.out.println("Done"); - System.out.println("!!! Remember to run TestSecurity.java, after refreshing the generated files in Eclipse at " - + GEN_SECURITY_DIR - + " !!!"); + System.out.println( + "!!! Remember to run TestSecurity.java, after refreshing the generated files in Eclipse at " + + GEN_SECURITY_DIR + + " !!!"); } } @@ -224,34 +240,46 @@ private static void generateAsciify() throws IOException { continue; } builder.append(line).append('\n'); - if (DEBUG) System.out.println(" + \"" + com.ibm.icu.impl.Utility.escape(line) + "\\n\""); + if (DEBUG) + System.out.println(" + \"" + com.ibm.icu.impl.Utility.escape(line) + "\\n\""); } if (DEBUG) System.out.println(";"); in.close(); final String rules = builder.toString(); - final Transliterator asciify = Transliterator.createFromRules("asciify", rules, Transliterator.FORWARD); + final Transliterator asciify = + Transliterator.createFromRules("asciify", rules, Transliterator.FORWARD); in = FileUtilities.openUTF8Reader(indir, "asciify_examples.txt"); if (DEBUG) System.out.println("String[][] translitTestCases = {"); - if (DEBUG) System.out.println("//{\"" + "SAMPLE" + "\", \"" + "EXPECTED TRANSFORM" + "\"},"); + if (DEBUG) + System.out.println("//{\"" + "SAMPLE" + "\", \"" + "EXPECTED TRANSFORM" + "\"},"); while (true) { final String line = Utility.readDataLine(in); if (line == null) { break; } - if (DEBUG) System.out.println("{\"" + 
com.ibm.icu.impl.Utility.escape(line) + "\", \"" + asciify.transform(line) + "\"},"); + if (DEBUG) + System.out.println( + "{\"" + + com.ibm.icu.impl.Utility.escape(line) + + "\", \"" + + asciify.transform(line) + + "\"},"); } if (DEBUG) System.out.println("};"); in.close(); } private static final UnicodeSet LATIN = new UnicodeSet("[:script=latin:]").freeze(); - private static final UnicodeSet LATIN_PLUS = new UnicodeSet("[[:script=latin:][:script=common:][:script=inherited:]]").freeze(); + private static final UnicodeSet LATIN_PLUS = + new UnicodeSet("[[:script=latin:][:script=common:][:script=inherited:]]").freeze(); private static final UnicodeSet ASCII = new UnicodeSet("[:ASCII:]").freeze(); - private static final UnicodeSet MARKS_AND_ASCII = new UnicodeSet("[[:mark:][:ASCII:]]").freeze(); + private static final UnicodeSet MARKS_AND_ASCII = + new UnicodeSet("[[:mark:][:ASCII:]]").freeze(); private static void generateLatin() throws IOException { - // pick out only those items where the source and target both have some latin, and no non-latin - final Map mapping = new TreeMap(UCAComparator); + // pick out only those items where the source and target both have some latin, and no + // non-latin + final Map mapping = new TreeMap(UCAComparator); addLatin(mapping, "confusables-source.txt"); addLatin(mapping, "confusables-intentional.txt"); final Set sorted = new TreeSet(UCAComparator); @@ -275,12 +303,24 @@ private static void generateLatin() throws IOException { reason = "!CH"; } } - if (new UnicodeSet().addAll(Default.nfd().normalize(target)).removeAll(MARKS_AND_ASCII).size() > 0) { + if (new UnicodeSet() + .addAll(Default.nfd().normalize(target)) + .removeAll(MARKS_AND_ASCII) + .size() + > 0) { reason += " XXX"; } - if (DEBUG) System.out.println(source + "\t→\t" + target + - " ; #" + reason + "\t" + DEFAULT_UCD.getCodeAndName(source) + "\t→\t" + DEFAULT_UCD.getCodeAndName(target)); - + if (DEBUG) + System.out.println( + source + + "\t→\t" + + target + + " ; #" + + 
reason + + "\t" + + DEFAULT_UCD.getCodeAndName(source) + + "\t→\t" + + DEFAULT_UCD.getCodeAndName(target)); } } @@ -310,8 +350,9 @@ private static void addLatin(Map mapping, String fileName) throw } final String old = mapping.get(source); - if (old!=null) { - if (DEBUG) System.out.println("Overriding " + source + "=>" + old + " with " + target); + if (old != null) { + if (DEBUG) + System.out.println("Overriding " + source + "=>" + old + " with " + target); } // skip NFKC forms @@ -322,30 +363,29 @@ private static void addLatin(Map mapping, String fileName) throw continue; } - mapping.put(source, target); } catch (final Exception e) { - throw (RuntimeException) new IllegalArgumentException("Can't process <" + oldLine + ">").initCause(e); + throw (RuntimeException) + new IllegalArgumentException("Can't process <" + oldLine + ">") + .initCause(e); } } in.close(); } - private static Matcher HEX = Pattern.compile( - "\\b([A-F0-9]{4,6})\\b" + - "|U+([a-fA-F0-9]{4,6})\\b" + - "|\\\\u([a-fA-F0-9]{4})" + - "|\\\\U([a-fA-F0-9]{6})" + - "|\\\\u\\{([a-fA-F0-9]{1,6})\\}").matcher(""); + private static Matcher HEX = + Pattern.compile( + "\\b([A-F0-9]{4,6})\\b" + + "|U+([a-fA-F0-9]{4,6})\\b" + + "|\\\\u([a-fA-F0-9]{4})" + + "|\\\\U([a-fA-F0-9]{6})" + + "|\\\\u\\{([a-fA-F0-9]{1,6})\\}") + .matcher(""); /** - * Convert a string with a mixture of hex and normal characters. - * Anything like the following is converted from hex to chars - * and all spaces are removed - * hexChar = \b[A-F0-9]{4,6}\b - * | U+[a-fA-F0-9]{4,6} - * | \\u[a-fA-F0-9]{4} - * | \\U[a-fA-F0-9]{6} - * | \\u{[a-fA-F0-9]{1,6} + * Convert a string with a mixture of hex and normal characters. 
Anything like the following is + * converted from hex to chars and all spaces are removed hexChar = \b[A-F0-9]{4,6}\b | + * U+[a-fA-F0-9]{4,6} | \\u[a-fA-F0-9]{4} | \\U[a-fA-F0-9]{6} | \\u{[a-fA-F0-9]{1,6} + * * @param hexOrChars * @return */ @@ -381,9 +421,7 @@ private static String fromHexLenient(String hexOrChars) { // final String result = (String) CollectionUtilities.getBest(x, betterTargetIsLess, -1); // } - /** - * - */ + /** */ // private static UnicodeSet _Non_IICore; // // private static UnicodeSet getNonIICore() { @@ -429,7 +467,8 @@ private static String fromHexLenient(String hexOrChars) { // } // br.close(); // } catch (Exception e) { - // throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e); + // throw (RuntimeException) new RuntimeException("Failure on line " + + // line).initCause(e); // } // _Non_IICore.removeAll(cjk_nic); // } @@ -443,91 +482,88 @@ private static String fromHexLenient(String hexOrChars) { // } private static PrintWriter log; + private static final String ARROW = "→"; // \u2194 private static final String BACKARROW = "\u2190"; static UnicodeSet UNASSIGNED = - ups.getSet("gc=Cn") - .addAll(ups.getSet("gc=Co")) - .addAll(ups.getSet("gc=Cs")).freeze(); + ups.getSet("gc=Cn").addAll(ups.getSet("gc=Co")).addAll(ups.getSet("gc=Cs")).freeze(); private static UnicodeSet SKIP_SET = - ups.getSet("gc=Cc") - .addAll(ups.getSet("gc=Cf")) - .addAll(UNASSIGNED).freeze(); + ups.getSet("gc=Cc").addAll(ups.getSet("gc=Cf")).addAll(UNASSIGNED).freeze(); private static UnicodeSet WHITESPACE = ups.getSet("Whitespace=Yes").freeze(); static UnicodeSet GC_LOWERCASE = ups.getSet("gc=Ll").freeze(); private static UnicodeSet _skipNFKD; private static UnicodeSet COMBINING = - ups.getSet("gc=Mn") - .addAll(ups.getSet("gc=Me")) - .add(0x3099) - .add(0x309A).freeze(); - private static UnicodeSet INVISIBLES = - ups.getSet("default-ignorable-codepoint=true").freeze(); - private static UnicodeSet XIDContinueSet = - 
ups.getSet("XID_Continue=true").freeze(); + ups.getSet("gc=Mn").addAll(ups.getSet("gc=Me")).add(0x3099).add(0x309A).freeze(); + private static UnicodeSet INVISIBLES = ups.getSet("default-ignorable-codepoint=true").freeze(); + private static UnicodeSet XIDContinueSet = ups.getSet("XID_Continue=true").freeze(); private static UnicodeSet XID = XIDContinueSet; private static UnicodeSet RTL = new UnicodeSet("[[:bc=R:][:bc=AL:][:bc=AN:]]").freeze(); private static UnicodeSet CONTROLS = new UnicodeSet("[[:cc:][:Zl:][:Zp:]]").freeze(); private static final char LRM = '\u200E'; - private static UnicodeSet commonAndInherited = new UnicodeSet("[[:script=common:][:script=inherited:]]"); - + private static UnicodeSet commonAndInherited = + new UnicodeSet("[[:script=common:][:script=inherited:]]"); private static Map gatheredNFKD = new TreeMap(); private static UnicodeMap nfcMap; private static UnicodeMap nfkcMap; - private static Comparator codepointComparator = new UTF16.StringComparator(true,false,0); - static Comparator UCAComparator = new org.unicode.cldr.util.MultiComparator(new Comparator[] { - Collator.getInstance(ULocale.ROOT), - //UCA.buildCollator(null), - codepointComparator}); - - private static UnicodeSet setsToAbbreviate = new UnicodeSet("[" + - "\\u3400-\\u4DB5" + - "\\u4E00-\\u9FA5" + - "\\uA000-\\uA48C" + - "\\uAC00-\\uD7A3" + - "\\u1100-\\u11FF" + - "\\uFB00-\\uFEFC" + - "\\u2460-\\u24FF" + - "\\u3251-\\u33FF" + - "\\u4DC0-\\u4DFF" + - "\\u3165-\\u318E" + - "\\uA490-\\uA4C6" + - "\\U00010140-\\U00010174" + - "\\U0001D300-\\U0001D356" + - "\\U0001D000-\\U0001D1DD" + - "\\U00020000-\\U0002A6D6" + - "\\U0001D400-\\U0001D7FF" + - "[:script=Canadian_Aboriginal:]" + - "[:script=ETHIOPIC:]" + - "[:script=Tagalog:]" + - "[:script=Hanunoo:]" + - "[:script=Buhid:]" + - "[:script=Tagbanwa:]" + - "[:script=Deseret:]" + - "[:script=Shavian:]" + - "[:script=Ogham:]" + - "[:script=Old Italic:]" + - "[:script=Runic:]" + - "[:script=Gothic:]" + - "[:script=Ugaritic:]" + - 
"[:script=Linear B:]" + - "[:script=Cypriot:]" + - "[:script=Coptic:]" + - "[:script=Syriac:]" + - "[:script=Glagolitic:]" + - "[:script=Glagolitic:]" + - "[:script=Old Persian:]" + - "[:script=Kharoshthi:]" + - "[:script=Osmanya:]" + - "[:default ignorable code point:]" + - "]").freeze(); + private static Comparator codepointComparator = new UTF16.StringComparator(true, false, 0); + static Comparator UCAComparator = + new org.unicode.cldr.util.MultiComparator( + new Comparator[] { + Collator.getInstance(ULocale.ROOT), + // UCA.buildCollator(null), + codepointComparator + }); + + private static UnicodeSet setsToAbbreviate = + new UnicodeSet( + "[" + + "\\u3400-\\u4DB5" + + "\\u4E00-\\u9FA5" + + "\\uA000-\\uA48C" + + "\\uAC00-\\uD7A3" + + "\\u1100-\\u11FF" + + "\\uFB00-\\uFEFC" + + "\\u2460-\\u24FF" + + "\\u3251-\\u33FF" + + "\\u4DC0-\\u4DFF" + + "\\u3165-\\u318E" + + "\\uA490-\\uA4C6" + + "\\U00010140-\\U00010174" + + "\\U0001D300-\\U0001D356" + + "\\U0001D000-\\U0001D1DD" + + "\\U00020000-\\U0002A6D6" + + "\\U0001D400-\\U0001D7FF" + + "[:script=Canadian_Aboriginal:]" + + "[:script=ETHIOPIC:]" + + "[:script=Tagalog:]" + + "[:script=Hanunoo:]" + + "[:script=Buhid:]" + + "[:script=Tagbanwa:]" + + "[:script=Deseret:]" + + "[:script=Shavian:]" + + "[:script=Ogham:]" + + "[:script=Old Italic:]" + + "[:script=Runic:]" + + "[:script=Gothic:]" + + "[:script=Ugaritic:]" + + "[:script=Linear B:]" + + "[:script=Cypriot:]" + + "[:script=Coptic:]" + + "[:script=Syriac:]" + + "[:script=Glagolitic:]" + + "[:script=Glagolitic:]" + + "[:script=Old Persian:]" + + "[:script=Kharoshthi:]" + + "[:script=Osmanya:]" + + "[:default ignorable code point:]" + + "]") + .freeze(); /** * @throws IOException - * */ private static void generateIDN() throws IOException { final IdentifierInfo info = IdentifierInfo.getIdentifierInfo(); @@ -538,18 +574,13 @@ private static void generateIDN() throws IOException { // static final String PROHIBITED = "Restricted ; "; // static final String UNPROHIBITED 
= "Allowed ; "; private static final boolean suppress_NFKC = true; - /** - * - */ - + /** */ - /** - * - */ + /** */ static void generateDecompFile() throws IOException { final PrintWriter out = FileUtilities.openUTF8Writer(reformatedInternal, "decomps.txt"); final UnicodeProperty dt = ups.getProperty("Decomposition_Type"); - for (final Iterator it = dt.getAvailableValues().iterator(); it.hasNext();) { + for (final Iterator it = dt.getAvailableValues().iterator(); it.hasNext(); ) { final String value = (String) it.next(); if (value.equalsIgnoreCase("none") || value.equalsIgnoreCase("canonical")) { continue; @@ -558,12 +589,12 @@ static void generateDecompFile() throws IOException { out.println(""); out.println("# Decomposition_Type = " + value); out.println(""); - for (final UnicodeSetIterator usi = new UnicodeSetIterator(s); usi.next();) { + for (final UnicodeSetIterator usi = new UnicodeSetIterator(s); usi.next(); ) { final String source = usi.getString(); final String target = getModifiedNKFC(source); writeSourceTargetLine(out, source, "N", target, value, ARROW); } - //bf.showSetNames(out, s); + // bf.showSetNames(out, s); out.flush(); } out.close(); @@ -571,46 +602,45 @@ static void generateDecompFile() throws IOException { public static class FakeBreak extends UnicodeLabel { UnicodeSet nobreakSet = setsToAbbreviate; + @Override public String getValue(int codepoint, boolean isShort) { - return nobreakSet.contains(codepoint) ? "" - : (codepoint & 1) == 0 ? "O" - : "E"; + return nobreakSet.contains(codepoint) ? "" : (codepoint & 1) == 0 ? 
"O" : "E"; } } public static class FakeBreak2 extends UnicodeLabel { - UnicodeSet nobreakSet = new UnicodeSet(setsToAbbreviate) - .addAll(new UnicodeSet(IDNOutputSet).complement()) - .addAll(new UnicodeSet(IdentifierInfo.getIdentifierInfo().xidPlus).complement()); + UnicodeSet nobreakSet = + new UnicodeSet(setsToAbbreviate) + .addAll(new UnicodeSet(IDNOutputSet).complement()) + .addAll( + new UnicodeSet(IdentifierInfo.getIdentifierInfo().xidPlus) + .complement()); @Override public String getValue(int codepoint, boolean isShort) { - return nobreakSet.contains(codepoint) ? "" - : (codepoint & 1) == 0 ? "O" - : "E"; + return nobreakSet.contains(codepoint) ? "" : (codepoint & 1) == 0 ? "O" : "E"; } } - /** - * - */ + /** */ // private static void showRemapped(PrintWriter out, String title, UnicodeMap remap) { // out.println(""); // out.println("# " + title); // out.println(""); // int count = 0; - // for (final UnicodeSetIterator usi = new UnicodeSetIterator(remap.keySet()); usi.next();) { - // writeSourceTargetLine(out, usi.getString(), "remap-to", (String)remap.getValue(usi.codepoint), null, ARROW); + // for (final UnicodeSetIterator usi = new UnicodeSetIterator(remap.keySet()); + // usi.next();) { + // writeSourceTargetLine(out, usi.getString(), "remap-to", + // (String)remap.getValue(usi.codepoint), null, ARROW); // count++; // } // out.println(""); // out.println("# Total code points: " + count); // } - /** - * - */ + /** */ static UnicodeSet IDNOutputSet; + static UnicodeSet IDNInputSet; private static UnicodeSet _preferredIDSet; @@ -627,19 +657,21 @@ static UnicodeSet getIdentifierSet() { continue; } // get IDNA - //int idnaType = GenerateStringPrep.getIDNAType(cp); - //if (idnaType == GenerateStringPrep.OK) IDNOutputSet.add(cp); - //if (idnaType != GenerateStringPrep.ILLEGAL) IDNInputSet.add(cp); + // int idnaType = GenerateStringPrep.getIDNAType(cp); + // if (idnaType == GenerateStringPrep.OK) IDNOutputSet.add(cp); + // if (idnaType != 
GenerateStringPrep.ILLEGAL) IDNInputSet.add(cp); final IdnaType idnaType = Uts46.SINGLETON.getType(cp); switch (idnaType) { - case valid: case deviation: - IDNOutputSet.add(cp); - // fall thru! - case mapped: case ignored: - IDNInputSet.add(cp); - break; - case disallowed: - // no action + case valid: + case deviation: + IDNOutputSet.add(cp); + // fall thru! + case mapped: + case ignored: + IDNInputSet.add(cp); + break; + case disallowed: + // no action } } _preferredIDSet = new UnicodeSet(IDNOutputSet).addAll(XIDContinueSet); @@ -648,7 +680,8 @@ static UnicodeSet getIdentifierSet() { return _preferredIDSet; } - private static UnicodeSet SKIP_EXCEPTIONS = new UnicodeSet().add(0x1E9A).add('ſ').add('ſt').add('ẛ').add("Ϲ").add("ϲ").freeze(); + private static UnicodeSet SKIP_EXCEPTIONS = + new UnicodeSet().add(0x1E9A).add('ſ').add('ſt').add('ẛ').add("Ϲ").add("ϲ").freeze(); private static UnicodeSet getSkipNFKD() { nfcMap = new UnicodeMap(); @@ -684,8 +717,7 @@ private static UnicodeSet getSkipNFKD() { || decompType == UCD_Types.COMPAT_WIDE || decompType == UCD_Types.COMPAT_WIDE || cp == '﬩' - || cp == '︒' - ) { + || cp == '︒') { _skipNFKD.add(cp); continue; } @@ -698,7 +730,7 @@ private static UnicodeSet getSkipNFKD() { if (DEBUG) System.out.println("\t" + DEFAULT_UCD.getCodeAndName(kmapped)); kmapped = getModifiedNKFC(source); // for debugging } - nfkcMap.put(cp,kmapped); + nfkcMap.put(cp, kmapped); } if (mapped.equals(source)) { continue; @@ -719,10 +751,11 @@ private static UnicodeSet getSkipNFKD() { /** * Returns the script of the input text. Script values of COMMON and INHERITED are ignored. + * * @param source Input text. - * @return Script value found in the text. - * If more than one script values are found, then UCD_Types.UNUSED_SCRIPT is returned. - * If no script value is found (other than COMMON or INHERITED), then UCD_Types.COMMON_SCRIPT is returned. + * @return Script value found in the text. 
If more than one script values are found, then + * UCD_Types.UNUSED_SCRIPT is returned. If no script value is found (other than COMMON or + * INHERITED), then UCD_Types.COMMON_SCRIPT is returned. */ public static int getSingleScript(String source) { if (source.length() == 0) { @@ -745,19 +778,21 @@ public static int getSingleScript(String source) { return lastScript; } - /** - * - */ + /** */ private static void generateConfusables() throws IOException { log = FileUtilities.openUTF8Writer(reformatedInternal, "log.txt"); - //fixMichel(indir, outdir); + // fixMichel(indir, outdir); generateConfusables(indir, reformatedInternal, GEN_SECURITY_DIR); log.close(); if (false) { - for (final Iterator it = gatheredNFKD.keySet().iterator(); it.hasNext();) { - final String source = (String)it.next(); - if (DEBUG) System.out.println(DEFAULT_UCD.getCodeAndName(source) - + " => " + DEFAULT_UCD.getCodeAndName((String)gatheredNFKD.get(source))); + for (final Iterator it = gatheredNFKD.keySet().iterator(); it.hasNext(); ) { + final String source = (String) it.next(); + if (DEBUG) + System.out.println( + DEFAULT_UCD.getCodeAndName(source) + + " => " + + DEFAULT_UCD.getCodeAndName( + (String) gatheredNFKD.get(source))); } } } @@ -794,20 +829,28 @@ public int compareTo(Object o) { /** * @param relation TODO - * */ - private static void writeSourceTargetLine(PrintWriter out, String source, String tag, String target, String reason, String relation) { + private static void writeSourceTargetLine( + PrintWriter out, + String source, + String tag, + String target, + String reason, + String relation) { out.print( Utility.hex(source) - + " ;\t" + Utility.hex(target) - + (tag == null ? "" : " ;\t" + tag) - //+ " ;\t" + (preferredID.contains(source) ? "ID" : "") - + "\t#" - + (isXid(source) ? "" : "*") - + arrowLiterals(source, target, relation) - + DEFAULT_UCD.getName(source) + " " + relation + " " - + DEFAULT_UCD.getName(target) - ); + + " ;\t" + + Utility.hex(target) + + (tag == null ? 
"" : " ;\t" + tag) + // + " ;\t" + (preferredID.contains(source) ? "ID" : "") + + "\t#" + + (isXid(source) ? "" : "*") + + arrowLiterals(source, target, relation) + + DEFAULT_UCD.getName(source) + + " " + + relation + + " " + + DEFAULT_UCD.getName(target)); if (reason != null) { out.print("\t# " + reason); } @@ -829,10 +872,11 @@ private static String rtlProtect(String source) { return source; } - private static class MyEquivalenceClass extends XEquivalenceClass { + private static class MyEquivalenceClass extends XEquivalenceClass { public MyEquivalenceClass() { super("NONE"); } + public boolean addCheck(String a, String b, String reason) { // quick check for illegal containment, before changing object if (checkForBad(a, b, reason) || checkForBad(b, a, reason)) { @@ -842,49 +886,54 @@ public boolean addCheck(String a, String b, String reason) { // full check for any resulting illegal containment. // illegal if for any x, y, x is a proper superstring of y final Set equivalences = getEquivalences(a); - for (final Iterator it = equivalences.iterator(); it.hasNext();) { - final String x = (String)it.next(); - if (!UTF16.hasMoreCodePointsThan(x,1)) { + for (final Iterator it = equivalences.iterator(); it.hasNext(); ) { + final String x = (String) it.next(); + if (!UTF16.hasMoreCodePointsThan(x, 1)) { continue; } - for (final Iterator it2 = equivalences.iterator(); it2.hasNext();) { - final String y = (String)it2.next(); + for (final Iterator it2 = equivalences.iterator(); it2.hasNext(); ) { + final String y = (String) it2.next(); if (x.equals(y)) { continue; } if (x.indexOf(y) >= 0) { - throw new RuntimeException("Illegal containment: " - + DEFAULT_UCD.getCodeAndName(x) + " contains " - + DEFAULT_UCD.getCodeAndName(y) + " because " - + DEFAULT_UCD.getCodeAndName(a) + " ~ " - + DEFAULT_UCD.getCodeAndName(b) + " because of " - + reason); + throw new RuntimeException( + "Illegal containment: " + + DEFAULT_UCD.getCodeAndName(x) + + " contains " + + 
DEFAULT_UCD.getCodeAndName(y) + + " because " + + DEFAULT_UCD.getCodeAndName(a) + + " ~ " + + DEFAULT_UCD.getCodeAndName(b) + + " because of " + + reason); } } } return true; } - /** - * - */ + /** */ private boolean checkForBad(String a, String b, String reason) { final Set equivalences = getEquivalences(b); - for (final Iterator it = equivalences.iterator(); it.hasNext();) { - final String b2 = (String)it.next(); + for (final Iterator it = equivalences.iterator(); it.hasNext(); ) { + final String b2 = (String) it.next(); if (a.equals(b2)) { continue; } if (b2.indexOf(a) >= 0 || a.indexOf(b2) >= 0) { - log.println("Illegal containment: " - + DEFAULT_UCD.getCodeAndName(a) - + " overlaps " - + DEFAULT_UCD.getCodeAndName(b2) - + "\n\tfrom " - + DEFAULT_UCD.getCodeAndName(b) - + "\n\twith reason " - + reason + " plus " - + getReasons(b2, b)); + log.println( + "Illegal containment: " + + DEFAULT_UCD.getCodeAndName(a) + + " overlaps " + + DEFAULT_UCD.getCodeAndName(b2) + + "\n\tfrom " + + DEFAULT_UCD.getCodeAndName(b) + + "\n\twith reason " + + reason + + " plus " + + getReasons(b2, b)); return true; } } @@ -938,17 +987,19 @@ public void close(String reason) { // do all the combinations for all the paradigms for (int lower = 0; lower < 2; ++lower) { for (int sameScript = 0; sameScript < 2; ++sameScript) { - for (final Iterator it = cloneForSafety.iterator(); it.hasNext();) { + for (final Iterator it = cloneForSafety.iterator(); it.hasNext(); ) { final String item = (String) it.next(); - if (!UTF16.hasMoreCodePointsThan(item,1)) - { + if (!UTF16.hasMoreCodePointsThan(item, 1)) { continue; // just for speed } reasons.setLength(0); - final String mapped = mapString(item, reasons, lower == 1, sameScript == 1); + final String mapped = + mapString(item, reasons, lower == 1, sameScript == 1); if (!isEquivalent(item, mapped)) { if (addCheck(item, mapped, reasons.toString())) { - // if (DEBUG) System.out.println("Closing: " + DEFAULT_UCD.getCodeAndName(item) + " => " + 
DEFAULT_UCD.getCodeAndName(mapped)); + // if (DEBUG) System.out.println("Closing: " + + // DEFAULT_UCD.getCodeAndName(item) + " => " + + // DEFAULT_UCD.getCodeAndName(mapped)); addedItem = true; } } @@ -958,10 +1009,9 @@ public void close(String reason) { } while (addedItem); } - /** - * - */ - private String mapString(String item, StringBuffer reasons, boolean onlyLowercase, boolean onlySameScript) { + /** */ + private String mapString( + String item, StringBuffer reasons, boolean onlyLowercase, boolean onlySameScript) { if (DEBUG && item.startsWith("\u03D2")) { System.out.println("foo"); } @@ -984,18 +1034,20 @@ private String mapString(String item, StringBuffer reasons, boolean onlyLowercas private Object getBestForm(Collection x) { if (x.size() != 1) { - return "[" + x + "]"; + return "[" + x + "]"; } final Object item = x.iterator().next(); if (!(item instanceof Collection)) { return x.toString(); } - return getBestForm((Collection)item); + return getBestForm((Collection) item); } public String getParadigm(String item, boolean onlyLowercase, boolean onlySameScript) { - // 0049 ; 006C ; MA # ( I → l ) LATIN CAPITAL LETTER I → LATIN SMALL LETTER L # - // 042E ; 0049 004F ; MA # ( Ю → IO ) CYRILLIC CAPITAL LETTER YU → LATIN CAPITAL LETTER I, LATIN CAPITAL LETTER O # + // 0049 ; 006C ; MA # ( I → l ) LATIN CAPITAL LETTER I → LATIN SMALL + // LETTER L # + // 042E ; 0049 004F ; MA # ( Ю → IO ) CYRILLIC CAPITAL LETTER YU → + // LATIN CAPITAL LETTER I, LATIN CAPITAL LETTER O # // fails, since 0049 should not occur in the target Set filteredSet = new HashSet(); final Set equivalences = getEquivalences(item); @@ -1004,50 +1056,58 @@ public String getParadigm(String item, boolean onlyLowercase, boolean onlySameSc // } main: - for (final Object element : equivalences) { - final String other = (String) element; - if (item.equals("\u2CFE") && (other.equals("\u22D7") || other.equals("\u00B7\u1433"))) { - int debug = 0; - } + for (final Object element : equivalences) { + 
final String other = (String) element; + if (item.equals("\u2CFE") + && (other.equals("\u22D7") || other.equals("\u00B7\u1433"))) { + int debug = 0; + } - final String combined = item + other; + final String combined = item + other; - if (onlyLowercase) { - final boolean isLowercase = combined.equals(DEFAULT_UCD.getCase(combined, UCD_Types.FULL, UCD_Types.FOLD)); - if (!isLowercase) { - continue; - } + if (onlyLowercase) { + final boolean isLowercase = + combined.equals( + DEFAULT_UCD.getCase(combined, UCD_Types.FULL, UCD_Types.FOLD)); + if (!isLowercase) { + continue; } - if (onlySameScript) { - final boolean isMixed = ScriptInfo.isMixedScript(combined); - if (isMixed) { - continue; - } + } + if (onlySameScript) { + final boolean isMixed = ScriptInfo.isMixedScript(combined); + if (isMixed) { + continue; } + } - // verify idempotence - final int[] codePointArray = With.codePointArray(other); - if (codePointArray.length == 1) { - // String otherParadigm = getParadigm(other, onlyLowercase, onlySameScript); - // if (otherParadigm != null && !item.equals(otherParadigm)) { - // continue main; - // } - } else { - for (int codepoint : codePointArray) { - final String codePointString = UTF16.valueOf(codepoint); - String otherParadigm = getParadigm(codePointString, onlyLowercase, onlySameScript); - if (otherParadigm != null && !codePointString.equals(otherParadigm)) { - continue main; - } + // verify idempotence + final int[] codePointArray = With.codePointArray(other); + if (codePointArray.length == 1) { + // String otherParadigm = getParadigm(other, + // onlyLowercase, onlySameScript); + // if (otherParadigm != null && + // !item.equals(otherParadigm)) { + // continue main; + // } + } else { + for (int codepoint : codePointArray) { + final String codePointString = UTF16.valueOf(codepoint); + String otherParadigm = + getParadigm(codePointString, onlyLowercase, onlySameScript); + if (otherParadigm != null && !codePointString.equals(otherParadigm)) { + continue main; } } - - 
filteredSet.add(other); } + + filteredSet.add(other); + } // } - return CollectionUtilities.getBest(filteredSet, + return CollectionUtilities.getBest( + filteredSet, // onlyLowercase || onlySameScript ? betterTargetIsLessFavorNeutral : - betterTargetIsLess, -1); + betterTargetIsLess, + -1); } public Set getOrderedExplicitItems() { @@ -1055,14 +1115,13 @@ public Set getOrderedExplicitItems() { cloneForSafety.addAll(getExplicitItems()); return cloneForSafety; } - /** - * - */ + /** */ // public void writeSource(PrintWriter out) { // final Set items = getOrderedExplicitItems(); // for (final Iterator it = items.iterator(); it.hasNext();) { // final String item = (String) it.next(); - // final String paradigm = CollectionUtilities.getBest(getEquivalences(item), betterTargetIsLess, -1); + // final String paradigm = CollectionUtilities.getBest(getEquivalences(item), + // betterTargetIsLess, -1); // if (item.equals(paradigm)) { // continue; // } @@ -1072,13 +1131,13 @@ public Set getOrderedExplicitItems() { } private static class RawData { - Map> data = new TreeMap>(); + Map> data = new TreeMap>(); public void add(String source, String target, String type) { if (betterTargetIsLess.compare(source, target) < 0) { - add2(source,target,type); + add2(source, target, type); } else { - add2(target,source,type); + add2(target, source, type); } } @@ -1109,7 +1168,8 @@ private static class DataSet { private static String testChar = UTF16.valueOf(0x10A3A); - public DataSet add(String source, String target, String type, int lineCount, String errorLine) { + public DataSet add( + String source, String target, String type, int lineCount, String errorLine) { if (SKIP_SET.containsAll(source) || SKIP_SET.containsAll(target)) { return this; } @@ -1121,11 +1181,19 @@ public DataSet add(String source, String target, String type, int lineCount, Str COMBINING.containsAll(nsource); COMBINING.containsAll(ntarget); } - System.err.println("ERROR: Mixed combining classes: " + lineCount + "\t" + 
errorLine + "\t" + Utility.hex(nsource) + "\t" + Utility.hex(ntarget)); + System.err.println( + "ERROR: Mixed combining classes: " + + lineCount + + "\t" + + errorLine + + "\t" + + Utility.hex(nsource) + + "\t" + + Utility.hex(ntarget)); } // if it is just a compatibility match, return - //if (nsource.equals(ntarget)) return this; + // if (nsource.equals(ntarget)) return this; if (type.indexOf("skip") >= 0) { return this; } @@ -1135,10 +1203,11 @@ public DataSet add(String source, String target, String type, int lineCount, Str type = getReasonFromFilename(type); - // if it is base + combining sequence => base2 + same combining sequence, do just the base - final int nsourceFirst = UTF16.charAt(nsource,0); + // if it is base + combining sequence => base2 + same combining sequence, do just the + // base + final int nsourceFirst = UTF16.charAt(nsource, 0); final String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst)); - final int ntargetFirst = UTF16.charAt(ntarget,0); + final int ntargetFirst = UTF16.charAt(ntarget, 0); final String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst)); if (nsourceRest.length() != 0 && nsourceRest.equals(ntargetRest)) { @@ -1146,16 +1215,17 @@ public DataSet add(String source, String target, String type, int lineCount, Str target = UTF16.valueOf(ntargetFirst); type += "-base"; } - //type += ":" + lineCount; + // type += ":" + lineCount; final String combined = source + target; if (DEBUG && combined.indexOf("\u0430") >= 0) { System.out.println(DEFAULT_UCD.getCodeAndName(combined)); } - final boolean isLowercase = combined.equals(DEFAULT_UCD.getCase(combined, UCD_Types.FULL, UCD_Types.FOLD)); + final boolean isLowercase = + combined.equals(DEFAULT_UCD.getCase(combined, UCD_Types.FULL, UCD_Types.FOLD)); final boolean isMixed = ScriptInfo.isMixedScript(combined); // Here's where we add data, if you need to debug - raw.add(source,target,type); + raw.add(source, target, type); dataMixedAnycase.add(source, 
target, type); // if (isLowercase) { // dataMixedLowercase.add(source, target, type); @@ -1174,10 +1244,11 @@ public String toString() { return dataMixedAnycase.toString(); } - /* *//** + /* */ + /** * @param errorLine TODO - * - *//* + */ + /* private DataSet add(Data newData, String errorLine) { if (controls.containsSome(newData.source) || controls.containsSome(newData.target)) { System.out.println("Problem with " + errorLine); @@ -1193,8 +1264,10 @@ private DataSet add(Data newData, String errorLine) { } return this; } - */ // Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt" + */ + // Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt" private static final int NORMAL = 0, FOLDING = 1, OLD = 2; + private static final UnicodeSet NSM = new UnicodeSet("[[:Mn:][:Me:]]").freeze(); public DataSet addFile(String directory, String filename) throws IOException { @@ -1223,7 +1296,7 @@ public DataSet addFile(String directory, String filename) throws IOException { isFont = true; continue; } - final String[] pieces = Utility.split(line,';'); + final String[] pieces = Utility.split(line, ';'); if (pieces.length < 2) { System.err.println("Error on: (" + count + ")\t" + line); continue; @@ -1233,9 +1306,14 @@ public DataSet addFile(String directory, String filename) throws IOException { final String targetString = INVISIBLES.stripFrom(pieces[1].trim(), true); if (!targetString.equals(pieces[1].trim())) { - if (DEBUG) System.out.println("**\t" + Utility.hex(pieces[0].trim()) + ";\t" + Utility.hex(targetString)); + if (DEBUG) + System.out.println( + "**\t" + + Utility.hex(pieces[0].trim()) + + ";\t" + + Utility.hex(targetString)); } - if (kind==FOLDING) { + if (kind == FOLDING) { final String target = fromHexOld(targetString); final String source = fromHexOld(sourceString); final String nsource = NFKD.normalize(source); @@ -1267,26 +1345,47 @@ public DataSet addFile(String directory, String filename) throws IOException { in.close(); return this; } catch (final 
Exception e) { - throw (RuntimeException) new RuntimeException("Failure with file: " - + directory + filename + " on line: " + count - + ": " + line).initCause(e); + throw (RuntimeException) + new RuntimeException( + "Failure with file: " + + directory + + filename + + " on line: " + + count + + ": " + + line) + .initCause(e); } } private void add2(String source, String target, String type, int count, String line) { - //if (pieces.length > 2) type = pieces[2].trim(); + // if (pieces.length > 2) type = pieces[2].trim(); final String nfkdSource = NFKD.normalize(source); final String nfkdTarget = NFKD.normalize(target); if (NSM.containsAll(source) && NSM.containsNone(target) || NSM.containsAll(target) && NSM.containsNone(source)) { if (SHOW_SUPPRESS) { - System.out.println("*** SUPPRESSING NSM Difference\t" - + count + "\t" + DEFAULT_UCD.getCodeAndName(source) + ";\t" + DEFAULT_UCD.getCodeAndName(target) + ";\t" + line); + System.out.println( + "*** SUPPRESSING NSM Difference\t" + + count + + "\t" + + DEFAULT_UCD.getCodeAndName(source) + + ";\t" + + DEFAULT_UCD.getCodeAndName(target) + + ";\t" + + line); } } else if (suppress_NFKC && nfkdSource.equals(nfkdTarget)) { if (SHOW_SUPPRESS) { - System.out.println("*** Suppressing nfkc for:\t" - + count + "\t" + DEFAULT_UCD.getCodeAndName(source) + ";\t" + DEFAULT_UCD.getCodeAndName(target) + ";\t" + line); + System.out.println( + "*** Suppressing nfkc for:\t" + + count + + "\t" + + DEFAULT_UCD.getCodeAndName(source) + + ";\t" + + DEFAULT_UCD.getCodeAndName(target) + + ";\t" + + line); } } else { add(source, target, type, count, line); @@ -1294,7 +1393,8 @@ private void add2(String source, String target, String type, int count, String l } public void writeSource(String directory, String filename) throws IOException { - final PrintWriter out = openAndWriteHeader(directory, filename, "Source File for IDN Confusables"); + final PrintWriter out = + openAndWriteHeader(directory, filename, "Source File for IDN Confusables"); // 
PrintWriter out = FileUtilities.openUTF8Writer(directory, filename); // out.println("# Source File for IDN Confusables"); // out.println("# $ Revision: 1.32 $"); @@ -1304,8 +1404,12 @@ public void writeSource(String directory, String filename) throws IOException { out.close(); } - public void writeSourceOrder(String directory, String filename, boolean appendFile, boolean skipNFKEquivs) throws IOException { - final PrintWriter out = openAndWriteHeader(directory, filename, "Recommended confusable mapping for IDN"); + public void writeSourceOrder( + String directory, String filename, boolean appendFile, boolean skipNFKEquivs) + throws IOException { + final PrintWriter out = + openAndWriteHeader( + directory, filename, "Recommended confusable mapping for IDN"); // PrintWriter out = FileUtilities.openUTF8Writer(directory, filename); // out.println("# Recommended confusable mapping for IDN"); // out.println("# $ Revision: 1.32 $"); @@ -1314,23 +1418,38 @@ public void writeSourceOrder(String directory, String filename, boolean appendFi if (appendFile) { final String[] replacements = {"%date%", Default.getDate()}; - Utility.appendFile(Settings.SRC_UCD_DIR + "confusablesHeader.txt", - Utility.UTF8_WINDOWS, out, replacements); - } - Relation, String> confusableMap - = Relation.of(new TreeMap(MyPairComparator), TreeSet.class); + Utility.appendFile( + Settings.SRC_UCD_DIR + "confusablesHeader.txt", + Utility.UTF8_WINDOWS, + out, + replacements); + } + Relation, String> confusableMap = + Relation.of(new TreeMap(MyPairComparator), TreeSet.class); if (true) { - // writeSourceOrder(out, dataMixedAnycase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, + // writeSourceOrder(out, dataMixedAnycase, "SL", "Single-Script, + // Lowercase Confusables", skipNFKEquivs, // true, true, confusableMap); - // writeSourceOrder(out, dataMixedAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, + // writeSourceOrder(out, dataMixedAnycase, "SA", "Single-Script, + // 
Anycase Confusables", skipNFKEquivs, // false, true, confusableMap); - // writeSourceOrder(out, dataMixedAnycase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, + // writeSourceOrder(out, dataMixedAnycase, "ML", "Mixed-Script, + // Lowercase Confusables", skipNFKEquivs, // true, false, confusableMap); - writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, - false, false, confusableMap); + writeSourceOrder( + out, + dataMixedAnycase, + "MA", + "Mixed-Script, Anycase Confusables", + skipNFKEquivs, + false, + false, + confusableMap); Counter> counter = new Counter(); - Map, Pair> examples = new HashMap, Pair>(); - for (Entry, Set> entry : confusableMap.keyValuesSet()) { + Map, Pair> examples = + new HashMap, Pair>(); + for (Entry, Set> entry : + confusableMap.keyValuesSet()) { final Set set = entry.getValue(); counter.add(set, 1); if (!examples.containsKey(set)) { @@ -1338,54 +1457,70 @@ public void writeSourceOrder(String directory, String filename, boolean appendFi } } for (Set entry : counter) { - if (DEBUG) System.out.println(counter.get(entry) + "\t" + entry + "\t" + examples.get(entry)); + if (DEBUG) + System.out.println( + counter.get(entry) + "\t" + entry + "\t" + examples.get(entry)); } // } else { - // writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, false, false); - // writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, false, false); - // writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, false, false); - // writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, false, false); + // writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, + // Lowercase Confusables", skipNFKEquivs, false, false); + // writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, + // Anycase Confusables", 
skipNFKEquivs, false, false); + // writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, + // Lowercase Confusables", skipNFKEquivs, false, false); + // writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, + // Anycase Confusables", skipNFKEquivs, false, false); } out.close(); } - private static Comparator> MyPairComparator = new Comparator>() { - public int compare(Pair o1, Pair o2) { - int result = UCAComparator.compare(o1.getFirst(), o2.getFirst()); - return result != 0 ? result : UCAComparator.compare(o1.getSecond(), o2.getSecond()); - } - }; + private static Comparator> MyPairComparator = + new Comparator>() { + public int compare(Pair o1, Pair o2) { + int result = UCAComparator.compare(o1.getFirst(), o2.getFirst()); + return result != 0 + ? result + : UCAComparator.compare(o1.getSecond(), o2.getSecond()); + } + }; /** * @param skipNFKEquivs TODO * @param onlyLowercase TODO * @param onlySingleScript TODO * @param confusableMap - * */ - private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String tag, String title, - boolean skipNFKEquivs, boolean onlyLowercase, boolean onlySingleScript, + private void writeSourceOrder( + PrintWriter out, + MyEquivalenceClass data, + String tag, + String title, + boolean skipNFKEquivs, + boolean onlyLowercase, + boolean onlySingleScript, Relation, String> confusableMap) { // first get all the sets. Then get the best paradigm from each. Then sort. 
// Set setOfSets = data.getEquivalenceSets(); // Map orderedResults = new TreeMap(betterTargetIsLess); // for (Iterator it = setOfSets.iterator(); it.hasNext();) { // Set setOfEquivs = (Set) it.next(); - // Object item = CollectionUtilities.getBest(setOfEquivs, betterTargetIsLess, -1); + // Object item = CollectionUtilities.getBest(setOfEquivs, + // betterTargetIsLess, -1); // // } - //int c = codepointComparator.compare("\uFFFF", "\uD800\uDC00"); - //System.out.println("Code Point Compare: " + c); + // int c = codepointComparator.compare("\uFFFF", "\uD800\uDC00"); + // System.out.println("Code Point Compare: " + c); final Set items = data.getOrderedExplicitItems(); // out.println(); // out.println("# " + title); // out.println(); int count = 0; final UnicodeSet preferredID = getIdentifierSet(); - final ArrayComparator ac = new ArrayComparator(new Comparator[] {UCAComparator, UCAComparator}); + final ArrayComparator ac = + new ArrayComparator(new Comparator[] {UCAComparator, UCAComparator}); final Set orderedPairs = new TreeSet(ac); - for (final Iterator it = items.iterator(); it.hasNext();) { + for (final Iterator it = items.iterator(); it.hasNext(); ) { final String source = (String) it.next(); - if (UTF16.hasMoreCodePointsThan(source,1)) { + if (UTF16.hasMoreCodePointsThan(source, 1)) { continue; } if (source.equals("\u2CFE") || source.equals("\u22D7")) { @@ -1404,16 +1539,18 @@ private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String t } } orderedPairs.add(new String[] {target, source}); - Pair pair = new Pair(target, source); + Pair pair = new Pair(target, source); confusableMap.put(pair, tag); } String lastTarget = null; - for (final Iterator it = orderedPairs.iterator(); it.hasNext();) { + for (final Iterator it = orderedPairs.iterator(); it.hasNext(); ) { final String[] pair = (String[]) it.next(); final String source = pair[1]; final String target = pair[0]; - final List> reasons = data.getReasons(source, target); - final String 
reason = XEquivalenceClass.toString(reasons, myLinkageTransform); // fixReason(reasons); + final List> reasons = data.getReasons(source, target); + final String reason = + XEquivalenceClass.toString( + reasons, myLinkageTransform); // fixReason(reasons); if (lastTarget != null && !lastTarget.equals(target)) { out.println(); } @@ -1427,9 +1564,7 @@ private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String t out.println(); } - /** - * - */ + /** */ // private String fixReason(List reasons) { // final List first = (List)reasons.get(0); // String result = ""; @@ -1470,9 +1605,9 @@ private void checkChar(String string) { public Set getEquivalences(String string) { return dataMixedAnycase.getEquivalences(string); } - /* *//** - * - *//* + /* */ + /** */ + /* public DataSet clean() { // remove all skips DataSet tempSet = new DataSet(); @@ -1542,30 +1677,32 @@ public DataSet clean() { } return s; } - *//** - * - *//* + */ + /** */ + /* private void remove(Data already) { String[] key = {already.source, already.target}; dataMap.remove(key); dataSet.remove(already); }*/ - /** - * - */ + /** */ public void close(String reason) { dataMixedAnycase.close(reason); // dataMixedLowercase.close(reason); // dataSingleAnycase.close(reason); // dataSingleLowercase.close(reason); } - /** - * - */ + /** */ public void addUnicodeMap(UnicodeMap decompMap, String type, String errorLine) { int count = 0; - for (final UnicodeSetIterator it = new UnicodeSetIterator(decompMap.keySet()); it.next(); ) { - add(it.getString(), (String)decompMap.getValue(it.codepoint), type, ++count, errorLine); + for (final UnicodeSetIterator it = new UnicodeSetIterator(decompMap.keySet()); + it.next(); ) { + add( + it.getString(), + (String) decompMap.getValue(it.codepoint), + type, + ++count, + errorLine); } } @@ -1580,6 +1717,7 @@ public void addUnicodeMap(UnicodeMap decompMap, String type, String errorLine) { private static class MyCollectionFilter implements Predicate { UnicodeSet 
outputAllowed; int minLength; + @Override public boolean test(String item) { if (!outputAllowed.containsAll(item)) { @@ -1591,14 +1729,18 @@ public boolean test(String item) { } return true; } - }; + } + ; /** * @param script TODO * @throws IOException - * */ - public void writeSummary(String outdir, String filename, boolean outputOnly, UnicodeSet script) throws IOException { - final PrintWriter out = openAndWriteHeader(outdir, filename, "Summary: Recommended confusable mapping for IDN"); + public void writeSummary( + String outdir, String filename, boolean outputOnly, UnicodeSet script) + throws IOException { + final PrintWriter out = + openAndWriteHeader( + outdir, filename, "Summary: Recommended confusable mapping for IDN"); // PrintWriter out = FileUtilities.openUTF8Writer(outdir, filename); // out.print('\uFEFF'); // out.println("# Summary: Recommended confusable mapping for IDN"); @@ -1609,7 +1751,8 @@ public void writeSummary(String outdir, String filename, boolean outputOnly, Uni final MyEquivalenceClass data = dataMixedAnycase; final Set items = data.getOrderedExplicitItems(); // for (Iterator it = items.iterator(); it.hasNext();) { - // if (DEBUG) System.out.println(DEFAULT_UCD.getCodeAndName((String)it.next())); + // if (DEBUG) + // System.out.println(DEFAULT_UCD.getCodeAndName((String)it.next())); // } int count = 0; final UnicodeSet preferredID = getIdentifierSet(); @@ -1617,11 +1760,12 @@ public void writeSummary(String outdir, String filename, boolean outputOnly, Uni final Set itemsSeen = new HashSet(); final Set equivalents = new TreeSet(betterTargetIsLess); final MyCollectionFilter myFilter = new MyCollectionFilter(); - myFilter.outputAllowed= new UnicodeSet("[[\u0021-\u007E]-[:letter:]]") - .addAll(IdentifierInfo.getIdentifierInfo().remainingOutputSet) - .addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict); + myFilter.outputAllowed = + new UnicodeSet("[[\u0021-\u007E]-[:letter:]]") + 
.addAll(IdentifierInfo.getIdentifierInfo().remainingOutputSet) + .addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict); - for (final Iterator it = items.iterator(); it.hasNext();) { + for (final Iterator it = items.iterator(); it.hasNext(); ) { String target = (String) it.next(); if (itemsSeen.contains(target)) { continue; @@ -1643,34 +1787,47 @@ public void writeSummary(String outdir, String filename, boolean outputOnly, Uni } } scriptTest: - if (script != null) { - // see if at least one item contains the target script - for (final Iterator it2 = equivalents.iterator(); it2.hasNext();) { - final String item = (String) it2.next(); - if (script.containsAll(item)) { - target = item; - for (final Iterator it3 = equivalents.iterator(); it3.hasNext();) { - representable.addAll((String)it3.next()); - } - break scriptTest; + if (script != null) { + // see if at least one item contains the target script + for (final Iterator it2 = equivalents.iterator(); it2.hasNext(); ) { + final String item = (String) it2.next(); + if (script.containsAll(item)) { + target = item; + for (final Iterator it3 = equivalents.iterator(); it3.hasNext(); ) { + representable.addAll((String) it3.next()); } + break scriptTest; } - continue; // skip this one } + continue; // skip this one + } out.println(); out.println("#\t" + CollectionUtilities.join(equivalents, "\t")); String status = ""; // getStatus(target); - out.println(status + "\t" + "(\u200E " + target + " \u200E)\t" + Utility.hex(target) + "\t " + DEFAULT_UCD.getName(target)); - //if (UTF16.hasMoreCodePointsThan(source,1)) continue; - for (final Iterator it2 = equivalents.iterator(); it2.hasNext();) { + out.println( + status + + "\t" + + "(\u200E " + + target + + " \u200E)\t" + + Utility.hex(target) + + "\t " + + DEFAULT_UCD.getName(target)); + // if (UTF16.hasMoreCodePointsThan(source,1)) continue; + for (final Iterator it2 = equivalents.iterator(); it2.hasNext(); ) { final String source = (String) it2.next(); if 
(source.equals(target)) { continue; } - //boolean compatEqual = Default.nfkd().normalize(source).equals(Default.nfkd().normalize(target)); - //if (EXCLUDE_CONFUSABLE_COMPAT && compatEqual) continue; - final String reason = XEquivalenceClass.toString(data.getReasons(source, target), myLinkageTransform); // fixReason(data.getReasons(source, target)); - //if (!outputAllowed.containsAll(source)) continue; + // boolean compatEqual = + // Default.nfkd().normalize(source).equals(Default.nfkd().normalize(target)); + // if (EXCLUDE_CONFUSABLE_COMPAT && compatEqual) continue; + final String reason = + XEquivalenceClass.toString( + data.getReasons(source, target), + myLinkageTransform); // fixReason(data.getReasons(source, + // target)); + // if (!outputAllowed.containsAll(source)) continue; // if (compatEqual) { // out.print("\u21D0"); // } else { @@ -1679,8 +1836,17 @@ public void writeSummary(String outdir, String filename, boolean outputOnly, Uni final String reasonOrEmpty = reason.length() == 0 ? 
"" : "\t# " + reason; status = ""; // getStatus(source); - out.println(BACKARROW + status + "\t" + "(\u200E " + source + " \u200E)\t" + Utility.hex(source) + "\t " + DEFAULT_UCD.getName(source) - + reasonOrEmpty); + out.println( + BACKARROW + + status + + "\t" + + "(\u200E " + + source + + " \u200E)\t" + + Utility.hex(source) + + "\t " + + DEFAULT_UCD.getName(source) + + reasonOrEmpty); count++; } } @@ -1700,26 +1866,29 @@ public void writeSummary(String outdir, String filename, boolean outputOnly, Uni out.close(); } - - public void writeWholeScripts(String outdir, String filename) throws IOException { - final UnicodeSet commonAndInherited = new UnicodeSet( - "[[:script=common:][:script=inherited:]]"); - - final WholeScript wsLower = new WholeScript( - new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet) - .removeAll(new UnicodeSet("[A-Z]")), "L"); - final WholeScript wsAny = new WholeScript( - new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet) - .addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict), "A"); + final UnicodeSet commonAndInherited = + new UnicodeSet("[[:script=common:][:script=inherited:]]"); + + final WholeScript wsLower = + new WholeScript( + new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet) + .removeAll(new UnicodeSet("[A-Z]")), + "L"); + final WholeScript wsAny = + new WholeScript( + new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet) + .addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict), + "A"); final MyEquivalenceClass data = new MyEquivalenceClass(); for (final Object element : dataMixedAnycase.getSamples()) { String target = (String) element; final Set equivalents = getEquivalences(target); boolean first = true; - for (final Iterator it2 = equivalents.iterator(); it2.hasNext();) { - final String cleaned = CollectionUtilities.remove((String)it2.next(), commonAndInherited); + for (final Iterator it2 = equivalents.iterator(); it2.hasNext(); ) { + final String 
cleaned = + CollectionUtilities.remove((String) it2.next(), commonAndInherited); if (cleaned.length() == 0) { continue; } @@ -1732,7 +1901,7 @@ public void writeWholeScripts(String outdir, String filename) throws IOException } } final Set itemsSeen = new HashSet(); - for (final Iterator it = data.getOrderedExplicitItems().iterator(); it.hasNext();) { + for (final Iterator it = data.getOrderedExplicitItems().iterator(); it.hasNext(); ) { final String target = (String) it.next(); if (itemsSeen.contains(target)) { continue; @@ -1742,7 +1911,8 @@ public void writeWholeScripts(String outdir, String filename) throws IOException wsAny.addEquivalents(equivalents); wsLower.addEquivalents(equivalents); } - final PrintWriter out = openAndWriteHeader(outdir, filename, "Summary: Whole-Script Confusables"); + final PrintWriter out = + openAndWriteHeader(outdir, filename, "Summary: Whole-Script Confusables"); // PrintWriter out = FileUtilities.openUTF8Writer(outdir, filename); // out.print('\uFEFF'); // out.println("# Summary: Whole-Script Confusables"); @@ -1762,9 +1932,7 @@ public void writeWholeScripts(String outdir, String filename) throws IOException wsAny.write(out); out.close(); } - /** - * - */ + /** */ // private String getStatus(String source) { // // TODO Auto-generated method stub // final int val = betterTargetIsLess.getValue(source); @@ -1802,10 +1970,12 @@ static class WholeScript { private final UnicodeSet[] script_set = new UnicodeSet[UCD_Types.LIMIT_SCRIPT]; private final BagFormatter bf = makeFormatter(); private final String label; + { for (short i = 0; i < UCD_Types.LIMIT_SCRIPT; ++i) { script_representables[i] = new UnicodeSet(); - //script_set[i] = new UnicodeSet("[:script=" + DEFAULT_UCD.getScriptID(i, UCD_Types.LONG) + ":]"); // ugly hack + // script_set[i] = new UnicodeSet("[:script=" + DEFAULT_UCD.getScriptID(i, + // UCD_Types.LONG) + ":]"); // ugly hack script_set[i] = SCRIPT_PROPERTY.getSet(UCD.getScriptID_fromIndex(i)); // ugly hack } 
bf.setValueSource(ups.getProperty("script")); @@ -1824,8 +1994,8 @@ void addEquivalents(Set set) { // if we have y ~ x, and both are single scripts // that means that x can be represented in script(y), // and y can be represented in script(x). - for (final Iterator it = set.iterator(); it.hasNext();) { - final String item1 = (String)it.next(); + for (final Iterator it = set.iterator(); it.hasNext(); ) { + final String item1 = (String) it.next(); if (!filterSet.containsAll(item1)) { continue; } @@ -1833,8 +2003,8 @@ void addEquivalents(Set set) { if (script1 == UCD_Types.UNUSED_SCRIPT) { continue; } - for (final Iterator it2 = set.iterator(); it2.hasNext();) { - final String item2 = (String)it2.next(); + for (final Iterator it2 = set.iterator(); it2.hasNext(); ) { + final String item2 = (String) it2.next(); if (!filterSet.containsAll(item2)) { continue; } @@ -1851,22 +2021,27 @@ public static class UnicodeSetToScript { public short getScript() { return script; } + public UnicodeSetToScript setScript(short script) { this.script = script; return this; } + public UnicodeSet getSet() { return set; } + public UnicodeSetToScript setSet(UnicodeSet set) { this.set = set; return this; } + private UnicodeSet set; private short script; } - UnicodeSetToScript[][] scriptToUnicodeSetToScript = new UnicodeSetToScript[UCD_Types.LIMIT_SCRIPT][]; + UnicodeSetToScript[][] scriptToUnicodeSetToScript = + new UnicodeSetToScript[UCD_Types.LIMIT_SCRIPT][]; UnicodeSet[] fastReject = new UnicodeSet[UCD_Types.LIMIT_SCRIPT]; boolean finished = false; @@ -1894,11 +2069,14 @@ void finish() { if (script_set[j].containsNone(script_representables[k])) { continue; } - final UnicodeSet items = new UnicodeSet(script_set[j]).retainAll(script_representables[k]); - final UnicodeSetToScript uss = new UnicodeSetToScript().setScript(k).setSet(items); + final UnicodeSet items = + new UnicodeSet(script_set[j]).retainAll(script_representables[k]); + final UnicodeSetToScript uss = + new 
UnicodeSetToScript().setScript(k).setSet(items); curr.add(uss); } - scriptToUnicodeSetToScript[j] = (UnicodeSetToScript[]) curr.toArray(new UnicodeSetToScript[curr.size()]); + scriptToUnicodeSetToScript[j] = + (UnicodeSetToScript[]) curr.toArray(new UnicodeSetToScript[curr.size()]); fastReject[j] = accept.complement(); } finished = true; @@ -1907,7 +2085,7 @@ void finish() { void write(PrintWriter out) throws IOException { finish(); - Map, String> reorder = new TreeMap<>(); // reorder alphabetically + Map, String> reorder = new TreeMap<>(); // reorder alphabetically for (short j = 0; j < UCD_Types.LIMIT_SCRIPT; ++j) { final UnicodeSetToScript[] unicodeSetToScripts = scriptToUnicodeSetToScript[j]; @@ -1923,7 +2101,7 @@ void write(PrintWriter out) throws IOException { // get other side UnicodeSet items2 = UnicodeSet.EMPTY; final UnicodeSetToScript[] unicodeSetToScripts2 = scriptToUnicodeSetToScript[k]; - for (int qq = 0; qq < unicodeSetToScripts2.length; ++qq) { + for (int qq = 0; qq < unicodeSetToScripts2.length; ++qq) { final UnicodeSetToScript uss2 = unicodeSetToScripts2[qq]; if (uss2.getScript() == j) { items2 = uss2.getSet(); @@ -1931,19 +2109,35 @@ void write(PrintWriter out) throws IOException { } } - final String sname = UCD.getScriptID_fromIndex(j, UCD_Types.SHORT) + "; " - + UCD.getScriptID_fromIndex(k, UCD_Types.SHORT) + "; " + label; - final String name = getScriptIndexName(j, UCD_Types.LONG) - + "; " + getScriptIndexName(k, UCD_Types.LONG); + final String sname = + UCD.getScriptID_fromIndex(j, UCD_Types.SHORT) + + "; " + + UCD.getScriptID_fromIndex(k, UCD_Types.SHORT) + + "; " + + label; + final String name = + getScriptIndexName(j, UCD_Types.LONG) + + "; " + + getScriptIndexName(k, UCD_Types.LONG); StringWriter b = new StringWriter(); PrintWriter out2 = new PrintWriter(b); - out2.println("# " + name + ": " - + items.toPattern(false) + "; " + items2.toPattern(false) + "\n"); + out2.println( + "# " + + name + + ": " + + items.toPattern(false) + + "; " + + 
items2.toPattern(false) + + "\n"); bf.setValueSource(sname); bf.showSetNames(out2, items); out2.println(""); out2.flush(); - reorder.put(Pair.of(getScriptIndexName(j, UCD_Types.LONG), getScriptIndexName(k, UCD_Types.LONG)), b.toString()); + reorder.put( + Pair.of( + getScriptIndexName(j, UCD_Types.LONG), + getScriptIndexName(k, UCD_Types.LONG)), + b.toString()); out2.close(); } } @@ -1951,19 +2145,21 @@ void write(PrintWriter out) throws IOException { out.print(s.getValue()); } } + public String getScriptIndexName(short scriptIndex, byte length) { - return UCharacter.toTitleCase(Locale.ENGLISH, UCD.getScriptID_fromIndex(scriptIndex, length), null); + return UCharacter.toTitleCase( + Locale.ENGLISH, UCD.getScriptID_fromIndex(scriptIndex, length), null); } - } /** * @throws IOException - * */ // private static void fixMichel(String indir, String outdir) throws IOException { - // final BufferedReader in = FileUtilities.openUTF8Reader(indir + "michel/", "tr36comments-annex.txt"); - // final PrintWriter out = FileUtilities.openUTF8Writer(outdir, "new-tr36comments-annex.txt"); + // final BufferedReader in = FileUtilities.openUTF8Reader(indir + "michel/", + // "tr36comments-annex.txt"); + // final PrintWriter out = FileUtilities.openUTF8Writer(outdir, + // "new-tr36comments-annex.txt"); // while (true) { // final String line = Utility.readDataLine(in); // if (line == null) { @@ -1982,15 +2178,14 @@ public String getScriptIndexName(short scriptIndex, byte length) { // in.close(); // out.close(); // } - /** - * - */ - + /** */ private static void generateSource() throws IOException { final File dir = new File(indir); final String[] names = dir.list(); - final Set sources = new TreeSet(new ArrayComparator( - new Comparator[] {codepointComparator, codepointComparator})); + final Set sources = + new TreeSet( + new ArrayComparator( + new Comparator[] {codepointComparator, codepointComparator})); final int[] count = new int[1]; for (int i = 0; i < names.length; ++i) { @@ -2013,7 
+2208,7 @@ private static void generateSource() throws IOException { if (line.length() == 0) { continue; } - final String[] pieces = Utility.split(line,';'); + final String[] pieces = Utility.split(line, ';'); if (pieces.length < 2) { System.err.println("Error on: " + line); continue; @@ -2022,7 +2217,8 @@ private static void generateSource() throws IOException { String target = fromHexOld(pieces[1]); if (source.length() == 0 || target.length() == 0) { - throw new IllegalArgumentException("zero-length item: " + count[0] + ":\t" + line); + throw new IllegalArgumentException( + "zero-length item: " + count[0] + ":\t" + line); } // check for identical combining sequences @@ -2033,9 +2229,9 @@ private static void generateSource() throws IOException { } if (true) { - final int nsourceFirst = UTF16.charAt(nsource,0); + final int nsourceFirst = UTF16.charAt(nsource, 0); final String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst)); - final int ntargetFirst = UTF16.charAt(ntarget,0); + final int ntargetFirst = UTF16.charAt(ntarget, 0); final String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst)); if (nsourceRest.equals(ntargetRest)) { source = UTF16.valueOf(nsourceFirst); @@ -2052,15 +2248,17 @@ private static void generateSource() throws IOException { } in.close(); } - final PrintWriter out = FileUtilities.openUTF8Writer(reformatedInternal, "confusableSource.txt"); - for (final Iterator it = sources.iterator(); it.hasNext();) { + final PrintWriter out = + FileUtilities.openUTF8Writer(reformatedInternal, "confusableSource.txt"); + for (final Iterator it = sources.iterator(); it.hasNext(); ) { final String[] sourceItem = (String[]) it.next(); writeSourceTargetLine(out, sourceItem[0], null, sourceItem[1], null, ARROW); } out.close(); } - private static void generateConfusables(String indir, String reformatedInternal, String draftDir) throws IOException { + private static void generateConfusables( + String indir, String reformatedInternal, 
String draftDir) throws IOException { final File dir = new File(indir); final String[] names = dir.list(); final DataSet total = new DataSet(); @@ -2112,11 +2310,11 @@ private static void generateConfusables(String indir, String reformatedInternal, if (DEBUG) System.out.println(nfkcMap.get('ſ')); ds.addUnicodeMap(nfkcMap, "nfkc", "nfkc"); - //if (DEBUG) System.out.println(ds); + // if (DEBUG) System.out.println(ds); ds.checkChar("ſ"); ds.close("*"); ds.checkChar("ſ"); - //ds.write(outdir, "new-decomp.txt", false, false); + // ds.write(outdir, "new-decomp.txt", false, false); total.addAll(ds); ds.checkChar("ſ"); total.close("*"); @@ -2125,12 +2323,12 @@ private static void generateConfusables(String indir, String reformatedInternal, total.writeData(reformatedInternal + "/source/", "confusablesRaw.txt"); total.writeSummary(draftDir, "confusablesSummary.txt", false, null); total.writeSummary(reformatedInternal, "confusablesSummaryIdentifier.txt", true, null); - //total.writeSummary(outdir, "confusablesSummaryCyrillic.txt", true, + // total.writeSummary(outdir, "confusablesSummaryCyrillic.txt", true, // new UnicodeSet("[[:script=Cyrillic:][:script=common:][:script=inherited:]]")); - //total.writeWholeScripts(draftDir, "confusablesWholeScript.txt"); + // total.writeWholeScripts(draftDir, "confusablesWholeScript.txt"); total.writeSourceOrder(draftDir, "confusables.txt", false, false); - //DataSet clean = total.clean(); - //clean.write(outdir, "confusables.txt", true); + // DataSet clean = total.clean(); + // clean.write(outdir, "confusables.txt", true); } /* BufferedReader in = FileUtilities.openUTF8Reader(Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt"); @@ -2246,9 +2444,7 @@ private static void gen() throws IOException { // + DEFAULT_UCD.getName(source) // + " " + ARROW + " " + DEFAULT_UCD.getName(target); // } - /** - * - */ + /** */ /* private static void add(Map m, String source, String target, int count) { if (source.length() == 0 || target.length() == 0) 
return; if (preferSecondAsSource(source, target)) { @@ -2271,10 +2467,11 @@ private static void gen() throws IOException { */ private static _BetterTargetIsLess betterTargetIsLess = new _BetterTargetIsLess(false); - //private static _BetterTargetIsLess betterTargetIsLessFavorNeutral = new _BetterTargetIsLess(true); + // private static _BetterTargetIsLess betterTargetIsLessFavorNeutral = new + // _BetterTargetIsLess(true); private static boolean isXid(String x) { - return XID.containsAll(x); + return XID.containsAll(x); } private static class _BetterTargetIsLess implements Comparator { @@ -2284,6 +2481,7 @@ private static class _BetterTargetIsLess implements Comparator { _BetterTargetIsLess(boolean favorNeutral) { this.favorNeutral = favorNeutral; } + @Override public int compare(String a, String b) { if (a.equals(b)) { @@ -2294,7 +2492,7 @@ public int compare(String a, String b) { // longer is better (less) final int ca = UTF16.countCodePoint(a); final int cb = UTF16.countCodePoint(b); - if (ca != cb) { + if (ca != cb) { return ca > cb ? -1 : 1; } @@ -2305,7 +2503,8 @@ public int compare(String a, String b) { long ldiff = lasta - lastb; if (ldiff != 0) { return ldiff < 0 ? 1 : -1; // bigger count is less!! - }; + } + ; if (favorNeutral) { boolean isCommonA = COMMON_OR_INHERITED.containsAll(a); @@ -2376,7 +2575,8 @@ private int getValue(String a) { // lower is better } return lastValue; } - }; + } + ; // private static int compare(boolean a, boolean b) { // return a == b ? 0 : a ? 1 : -1; @@ -2417,49 +2617,57 @@ public static String getMostRecentAge(String a) { // private static String getCodeCharName(String a) { // return UCD.getCode(a) + "( " + a + " ) " + DEFAULT_UCD.getName(a); // } - /** - * Returns the part between - and . - */ + /** Returns the part between - and . 
*/ private static String getReasonFromFilename(String type) { int period = type.lastIndexOf('.'); if (period < 0) { period = type.length(); } final int dash = type.lastIndexOf('-', period); - return type.substring(dash+1,period); + return type.substring(dash + 1, period); } - private static Normalizer modNFKC ; + private static Normalizer modNFKC; static String getModifiedNKFC(String cf) { if (modNFKC == null) { - modNFKC = new Normalizer(UCD_Types.NFKC, Default.ucdVersion()); + modNFKC = new Normalizer(UCD_Types.NFKC, Default.ucdVersion()); modNFKC.setSpacingSubstitute(); } return modNFKC.normalize(cf); } - static PrintWriter openAndWriteHeader(String dir, String filename, String title) throws IOException { + static PrintWriter openAndWriteHeader(String dir, String filename, String title) + throws IOException { final PrintWriter out = FileUtilities.openUTF8Writer(dir, filename); out.print('\uFEFF'); - //int trNumber, String title, String filename, String version - out.println(Utility.getBaseDataHeader(filename, 39, "Unicode Security Mechanisms", version)); -// out.println("# " + title); -// out.println("# File: " + filename); -// out.println("# Version: " + version); -// out.println("# Generated: " + Default.getDate()); -// out.println("# Checkin: $Revision: 1.32 $"); -// out.println("#"); -// out.println("# For documentation and usage, see http://www.unicode.org/reports/tr39/"); -// out.println("#"); + // int trNumber, String title, String filename, String version + out.println( + Utility.getBaseDataHeader(filename, 39, "Unicode Security Mechanisms", version)); + // out.println("# " + title); + // out.println("# File: " + filename); + // out.println("# Version: " + version); + // out.println("# Generated: " + Default.getDate()); + // out.println("# Checkin: $Revision: 1.32 $"); + // out.println("#"); + // out.println("# For documentation and usage, see + // http://www.unicode.org/reports/tr39/"); + // out.println("#"); return out; } private static String 
fromHexOld(String targetString) { - String result = Utility.fromHex(targetString.trim(),true); + String result = Utility.fromHex(targetString.trim(), true); final String result2 = fromHexLenient(targetString); if (!result.equals(result2)) { - if (DEBUG) System.out.println("Changing hex\t" + targetString + "\t=>old\t" + result + "\t=>new\t" + result2); + if (DEBUG) + System.out.println( + "Changing hex\t" + + targetString + + "\t=>old\t" + + result + + "\t=>new\t" + + result2); result = result2; } return result; @@ -2475,27 +2683,37 @@ private static String fromHex(String hexOrChars) { } final String result2 = fromHexLenient(hexOrChars); if (!result.equals(result2)) { - if (DEBUG) System.out.println("Changing hex\t" + hexOrChars + "\t=>old\t" + result + "\t=>new\t" + result2); + if (DEBUG) + System.out.println( + "Changing hex\t" + + hexOrChars + + "\t=>old\t" + + result + + "\t=>new\t" + + result2); result = result2; } return result; } - private static Transform, String> myLinkageTransform = new Transform, String>() { - @Override - public String transform(Linkage source) { - String sourceString = source.reasons.toString(); - sourceString = sourceString.substring(1,sourceString.length()-1); - return source.result == null ? "" : - source.result.length() == 0 ? "\u21d2" : - ARROW + rtlProtect(source.result) + ARROW; - } - - }; + private static Transform, String> myLinkageTransform = + new Transform, String>() { + @Override + public String transform(Linkage source) { + String sourceString = source.reasons.toString(); + sourceString = sourceString.substring(1, sourceString.length() - 1); + return source.result == null + ? "" + : source.result.length() == 0 + ? "\u21d2" + : ARROW + rtlProtect(source.result) + ARROW; + } + }; // Copied from ICU CollectionUtilities. 
/** * Retain matching items + * * @param * @param * @param c @@ -2503,7 +2721,7 @@ public String transform(Linkage source) { * @return */ static > U retainAll(U c, Predicate f) { - for (Iterator it = c.iterator(); it.hasNext();) { + for (Iterator it = c.iterator(); it.hasNext(); ) { T item = it.next(); if (!f.test(item)) it.remove(); } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateConfusablesCopy.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateConfusablesCopy.java index 24f152b45..64e468b65 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateConfusablesCopy.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateConfusablesCopy.java @@ -1,18 +1,29 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateConfusables.java,v $ - * $Date: 2010-06-19 00:29:21 $ - * $Revision: 1.32 $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateConfusables.java,v $ $Date: + * 2010-06-19 00:29:21 $ $Revision: 1.32 $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Transform; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.LocaleData; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.File; import java.io.IOException; @@ -37,7 +48,6 @@ import java.util.function.Predicate; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.draft.ScriptMetadata; import org.unicode.cldr.draft.ScriptMetadata.IdUsage; @@ -48,40 +58,25 @@ import org.unicode.cldr.util.With; import org.unicode.cldr.util.XEquivalenceClass; import org.unicode.cldr.util.XEquivalenceClass.Linkage; -import org.unicode.props.BagFormatter; import org.unicode.cldr.util.props.UnicodeLabel; -import org.unicode.props.UnicodeProperty; import org.unicode.idna.Idna.IdnaType; import org.unicode.idna.Uts46; +import org.unicode.props.BagFormatter; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.ScriptInfo; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues.Binary; import org.unicode.props.UcdPropertyValues.NFKD_Quick_Check_Values; import org.unicode.props.UcdPropertyValues.Script_Values; +import org.unicode.props.UnicodeProperty; import org.unicode.text.utility.Settings; import org.unicode.text.utility.UnicodeTransform; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Relation; -import 
com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Transform; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.LocaleData; -import com.ibm.icu.util.ULocale; - - public class GenerateConfusablesCopy { private static final Normalizer NFKD = Default.nfkd(); - private static final ToolUnicodeTransformFactory TOOL_FACTORY = new ToolUnicodeTransformFactory(); + private static final ToolUnicodeTransformFactory TOOL_FACTORY = + new ToolUnicodeTransformFactory(); // Align these three normally. private static final String version = Settings.latestVersion; private static final String REVISION = Settings.latestVersion; @@ -91,27 +86,35 @@ public class GenerateConfusablesCopy { Settings.UnicodeTools.getDataPathString("security", REVISION) + "/data/"; private static final String indir = outdir + "source/"; private static final UCD DEFAULT_UCD = Default.ucd(); - private static final UnicodeProperty.Factory ups = ToolUnicodePropertySource.make(version); // ICUPropertyFactory.make(); + private static final UnicodeProperty.Factory ups = + ToolUnicodePropertySource.make(version); // ICUPropertyFactory.make(); + static { // USE the tool unicode set instead of ICU, which may not be using the latest version. 
UnicodeSet.setDefaultXSymbolTable(ups.getXSymbolTable()); UnicodeTransform.setFactory(TOOL_FACTORY); } + static final UnicodeSet COMMON_OR_INHERITED; static final UnicodeSet CASED; static final UnicodeSet COMMON_OR_INHERITED_NFKD; static final UnicodeSet CASED_NFKD; static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(version); - static final UnicodeMap> scriptExtensions = iup.loadEnumSet(UcdProperty.Script_Extensions, Script_Values.class); + static final UnicodeMap> scriptExtensions = + iup.loadEnumSet(UcdProperty.Script_Extensions, Script_Values.class); + static { UnicodeSet common = scriptExtensions.getSet(Collections.singleton(Script_Values.Common)); - UnicodeSet inherited = scriptExtensions.getSet(Collections.singleton(Script_Values.Inherited)); + UnicodeSet inherited = + scriptExtensions.getSet(Collections.singleton(Script_Values.Inherited)); COMMON_OR_INHERITED = new UnicodeSet(common).addAll(inherited).freeze(); CASED = iup.loadEnum(UcdProperty.Changes_When_Casefolded, Binary.class).getSet(Binary.Yes); COMMON_OR_INHERITED_NFKD = new UnicodeSet(COMMON_OR_INHERITED); CASED_NFKD = new UnicodeSet(CASED); - final UnicodeSet notNFKD = iup.loadEnum(UcdProperty.NFKD_Quick_Check, NFKD_Quick_Check_Values.class).getSet(NFKD_Quick_Check_Values.No); - for (String s: notNFKD) { + final UnicodeSet notNFKD = + iup.loadEnum(UcdProperty.NFKD_Quick_Check, NFKD_Quick_Check_Values.class) + .getSet(NFKD_Quick_Check_Values.No); + for (String s : notNFKD) { if (s.equals("𝐉")) { int debug = 0; } @@ -133,12 +136,11 @@ public class GenerateConfusablesCopy { private static final String EXCAPE_FUNNY_RULE = ":: [[:C:]-[:cn:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]] hex/unicode ; "; - private static final Transliterator EXCAPE_FUNNY = Transliterator.createFromRules( - "any-html", EXCAPE_FUNNY_RULE, Transliterator.FORWARD); + private static final Transliterator EXCAPE_FUNNY = + Transliterator.createFromRules("any-html", EXCAPE_FUNNY_RULE, 
Transliterator.FORWARD); private static BagFormatter makeFormatter() { - return new BagFormatter(ups) - .setLineSeparator("\n"); + return new BagFormatter(ups).setLineSeparator("\n"); } private static final boolean SHOW_SUPPRESS = false; @@ -148,7 +150,7 @@ private static BagFormatter makeFormatter() { // private static final UnicodeSet SPECIAL = new UnicodeSet("[\u01DD\u0259]").freeze(); public static void main(String[] args) throws IOException { - //quickTest(); + // quickTest(); try { for (final String arg : args) { @@ -196,7 +198,8 @@ private static void generateAsciify() throws IOException { System.out.println(";"); in.close(); final String rules = builder.toString(); - final Transliterator asciify = Transliterator.createFromRules("asciify", rules, Transliterator.FORWARD); + final Transliterator asciify = + Transliterator.createFromRules("asciify", rules, Transliterator.FORWARD); in = FileUtilities.openUTF8Reader(indir, "asciify_examples.txt"); System.out.println("String[][] translitTestCases = {"); System.out.println("//{\"" + "SAMPLE" + "\", \"" + "EXPECTED TRANSFORM" + "\"},"); @@ -205,20 +208,28 @@ private static void generateAsciify() throws IOException { if (line == null) { break; } - System.out.println("{\"" + com.ibm.icu.impl.Utility.escape(line) + "\", \"" + asciify.transform(line) + "\"},"); + System.out.println( + "{\"" + + com.ibm.icu.impl.Utility.escape(line) + + "\", \"" + + asciify.transform(line) + + "\"},"); } System.out.println("};"); in.close(); } private static final UnicodeSet LATIN = new UnicodeSet("[:script=latin:]").freeze(); - private static final UnicodeSet LATIN_PLUS = new UnicodeSet("[[:script=latin:][:script=common:][:script=inherited:]]").freeze(); + private static final UnicodeSet LATIN_PLUS = + new UnicodeSet("[[:script=latin:][:script=common:][:script=inherited:]]").freeze(); private static final UnicodeSet ASCII = new UnicodeSet("[:ASCII:]").freeze(); - private static final UnicodeSet MARKS_AND_ASCII = new 
UnicodeSet("[[:mark:][:ASCII:]]").freeze(); + private static final UnicodeSet MARKS_AND_ASCII = + new UnicodeSet("[[:mark:][:ASCII:]]").freeze(); private static void generateLatin() throws IOException { - // pick out only those items where the source and target both have some latin, and no non-latin - final Map mapping = new TreeMap(UCAComparator); + // pick out only those items where the source and target both have some latin, and no + // non-latin + final Map mapping = new TreeMap(UCAComparator); addLatin(mapping, "confusables-source.txt"); addLatin(mapping, "confusables-intentional.txt"); final Set sorted = new TreeSet(UCAComparator); @@ -242,12 +253,23 @@ private static void generateLatin() throws IOException { reason = "!CH"; } } - if (new UnicodeSet().addAll(Default.nfd().normalize(target)).removeAll(MARKS_AND_ASCII).size() > 0) { + if (new UnicodeSet() + .addAll(Default.nfd().normalize(target)) + .removeAll(MARKS_AND_ASCII) + .size() + > 0) { reason += " XXX"; } - System.out.println(source + "\t→\t" + target + - " ; #" + reason + "\t" + DEFAULT_UCD.getCodeAndName(source) + "\t→\t" + DEFAULT_UCD.getCodeAndName(target)); - + System.out.println( + source + + "\t→\t" + + target + + " ; #" + + reason + + "\t" + + DEFAULT_UCD.getCodeAndName(source) + + "\t→\t" + + DEFAULT_UCD.getCodeAndName(target)); } } @@ -277,7 +299,7 @@ private static void addLatin(Map mapping, String fileName) throw } final String old = mapping.get(source); - if (old!=null) { + if (old != null) { System.out.println("Overriding " + source + "=>" + old + " with " + target); } @@ -289,30 +311,29 @@ private static void addLatin(Map mapping, String fileName) throw continue; } - mapping.put(source, target); } catch (final Exception e) { - throw (RuntimeException) new IllegalArgumentException("Can't process <" + oldLine + ">").initCause(e); + throw (RuntimeException) + new IllegalArgumentException("Can't process <" + oldLine + ">") + .initCause(e); } } in.close(); } - private static Matcher HEX = 
Pattern.compile( - "\\b([A-F0-9]{4,6})\\b" + - "|U+([a-fA-F0-9]{4,6})\\b" + - "|\\\\u([a-fA-F0-9]{4})" + - "|\\\\U([a-fA-F0-9]{6})" + - "|\\\\u\\{([a-fA-F0-9]{1,6})\\}").matcher(""); + private static Matcher HEX = + Pattern.compile( + "\\b([A-F0-9]{4,6})\\b" + + "|U+([a-fA-F0-9]{4,6})\\b" + + "|\\\\u([a-fA-F0-9]{4})" + + "|\\\\U([a-fA-F0-9]{6})" + + "|\\\\u\\{([a-fA-F0-9]{1,6})\\}") + .matcher(""); /** - * Convert a string with a mixture of hex and normal characters. - * Anything like the following is converted from hex to chars - * and all spaces are removed - * hexChar = \b[A-F0-9]{4,6}\b - * | U+[a-fA-F0-9]{4,6} - * | \\u[a-fA-F0-9]{4} - * | \\U[a-fA-F0-9]{6} - * | \\u{[a-fA-F0-9]{1,6} + * Convert a string with a mixture of hex and normal characters. Anything like the following is + * converted from hex to chars and all spaces are removed hexChar = \b[A-F0-9]{4,6}\b | + * U+[a-fA-F0-9]{4,6} | \\u[a-fA-F0-9]{4} | \\U[a-fA-F0-9]{6} | \\u{[a-fA-F0-9]{1,6} + * * @param hexOrChars * @return */ @@ -348,9 +369,7 @@ private static String fromHexLenient(String hexOrChars) { // final String result = (String) CollectionUtilities.getBest(x, betterTargetIsLess, -1); // } - /** - * - */ + /** */ // private static UnicodeSet _Non_IICore; // // private static UnicodeSet getNonIICore() { @@ -396,7 +415,8 @@ private static String fromHexLenient(String hexOrChars) { // } // br.close(); // } catch (Exception e) { - // throw (RuntimeException) new RuntimeException("Failure on line " + line).initCause(e); + // throw (RuntimeException) new RuntimeException("Failure on line " + + // line).initCause(e); // } // _Non_IICore.removeAll(cjk_nic); // } @@ -410,91 +430,88 @@ private static String fromHexLenient(String hexOrChars) { // } private static PrintWriter log; + private static final String ARROW = "→"; // \u2194 private static final String BACKARROW = "\u2190"; private static UnicodeSet UNASSIGNED = - ups.getSet("gc=Cn") - .addAll(ups.getSet("gc=Co")) - 
.addAll(ups.getSet("gc=Cs")).freeze(); + ups.getSet("gc=Cn").addAll(ups.getSet("gc=Co")).addAll(ups.getSet("gc=Cs")).freeze(); private static UnicodeSet SKIP_SET = - ups.getSet("gc=Cc") - .addAll(ups.getSet("gc=Cf")) - .addAll(UNASSIGNED).freeze(); + ups.getSet("gc=Cc").addAll(ups.getSet("gc=Cf")).addAll(UNASSIGNED).freeze(); private static UnicodeSet WHITESPACE = ups.getSet("Whitespace=Yes").freeze(); private static UnicodeSet GC_LOWERCASE = ups.getSet("gc=Ll").freeze(); private static UnicodeSet _skipNFKD; private static UnicodeSet COMBINING = - ups.getSet("gc=Mn") - .addAll(ups.getSet("gc=Me")) - .add(0x3099) - .add(0x309A).freeze(); - private static UnicodeSet INVISIBLES = - ups.getSet("default-ignorable-codepoint=true").freeze(); - private static UnicodeSet XIDContinueSet = - ups.getSet("XID_Continue=true").freeze(); + ups.getSet("gc=Mn").addAll(ups.getSet("gc=Me")).add(0x3099).add(0x309A).freeze(); + private static UnicodeSet INVISIBLES = ups.getSet("default-ignorable-codepoint=true").freeze(); + private static UnicodeSet XIDContinueSet = ups.getSet("XID_Continue=true").freeze(); private static UnicodeSet XID = XIDContinueSet; private static UnicodeSet RTL = new UnicodeSet("[[:bc=R:][:bc=AL:][:bc=AN:]]").freeze(); private static UnicodeSet CONTROLS = new UnicodeSet("[[:cc:][:Zl:][:Zp:]]").freeze(); private static final char LRM = '\u200E'; - private static UnicodeSet commonAndInherited = new UnicodeSet("[[:script=common:][:script=inherited:]]"); - + private static UnicodeSet commonAndInherited = + new UnicodeSet("[[:script=common:][:script=inherited:]]"); private static Map gatheredNFKD = new TreeMap(); private static UnicodeMap nfcMap; private static UnicodeMap nfkcMap; - private static Comparator codepointComparator = new UTF16.StringComparator(true,false,0); - private static Comparator UCAComparator = new org.unicode.cldr.util.MultiComparator(new Comparator[] { - Collator.getInstance(ULocale.ROOT), - //UCA.buildCollator(null), - codepointComparator}); - - 
private static UnicodeSet setsToAbbreviate = new UnicodeSet("[" + - "\\u3400-\\u4DB5" + - "\\u4E00-\\u9FA5" + - "\\uA000-\\uA48C" + - "\\uAC00-\\uD7A3" + - "\\u1100-\\u11FF" + - "\\uFB00-\\uFEFC" + - "\\u2460-\\u24FF" + - "\\u3251-\\u33FF" + - "\\u4DC0-\\u4DFF" + - "\\u3165-\\u318E" + - "\\uA490-\\uA4C6" + - "\\U00010140-\\U00010174" + - "\\U0001D300-\\U0001D356" + - "\\U0001D000-\\U0001D1DD" + - "\\U00020000-\\U0002A6D6" + - "\\U0001D400-\\U0001D7FF" + - "[:script=Canadian_Aboriginal:]" + - "[:script=ETHIOPIC:]" + - "[:script=Tagalog:]" + - "[:script=Hanunoo:]" + - "[:script=Buhid:]" + - "[:script=Tagbanwa:]" + - "[:script=Deseret:]" + - "[:script=Shavian:]" + - "[:script=Ogham:]" + - "[:script=Old Italic:]" + - "[:script=Runic:]" + - "[:script=Gothic:]" + - "[:script=Ugaritic:]" + - "[:script=Linear B:]" + - "[:script=Cypriot:]" + - "[:script=Coptic:]" + - "[:script=Syriac:]" + - "[:script=Glagolitic:]" + - "[:script=Glagolitic:]" + - "[:script=Old Persian:]" + - "[:script=Kharoshthi:]" + - "[:script=Osmanya:]" + - "[:default ignorable code point:]" + - "]").freeze(); + private static Comparator codepointComparator = new UTF16.StringComparator(true, false, 0); + private static Comparator UCAComparator = + new org.unicode.cldr.util.MultiComparator( + new Comparator[] { + Collator.getInstance(ULocale.ROOT), + // UCA.buildCollator(null), + codepointComparator + }); + + private static UnicodeSet setsToAbbreviate = + new UnicodeSet( + "[" + + "\\u3400-\\u4DB5" + + "\\u4E00-\\u9FA5" + + "\\uA000-\\uA48C" + + "\\uAC00-\\uD7A3" + + "\\u1100-\\u11FF" + + "\\uFB00-\\uFEFC" + + "\\u2460-\\u24FF" + + "\\u3251-\\u33FF" + + "\\u4DC0-\\u4DFF" + + "\\u3165-\\u318E" + + "\\uA490-\\uA4C6" + + "\\U00010140-\\U00010174" + + "\\U0001D300-\\U0001D356" + + "\\U0001D000-\\U0001D1DD" + + "\\U00020000-\\U0002A6D6" + + "\\U0001D400-\\U0001D7FF" + + "[:script=Canadian_Aboriginal:]" + + "[:script=ETHIOPIC:]" + + "[:script=Tagalog:]" + + "[:script=Hanunoo:]" + + "[:script=Buhid:]" + + 
"[:script=Tagbanwa:]" + + "[:script=Deseret:]" + + "[:script=Shavian:]" + + "[:script=Ogham:]" + + "[:script=Old Italic:]" + + "[:script=Runic:]" + + "[:script=Gothic:]" + + "[:script=Ugaritic:]" + + "[:script=Linear B:]" + + "[:script=Cypriot:]" + + "[:script=Coptic:]" + + "[:script=Syriac:]" + + "[:script=Glagolitic:]" + + "[:script=Glagolitic:]" + + "[:script=Old Persian:]" + + "[:script=Kharoshthi:]" + + "[:script=Osmanya:]" + + "[:default ignorable code point:]" + + "]") + .freeze(); /** * @throws IOException - * */ private static void generateIDN() throws IOException { final IdentifierInfo info = IdentifierInfo.getIdentifierInfo(); @@ -512,15 +529,20 @@ private static IdentifierInfo getIdentifierInfo() { } return info; } catch (final Exception e) { - throw (RuntimeException) new IllegalArgumentException("Unable to access data").initCause(e); + throw (RuntimeException) + new IllegalArgumentException("Unable to access data").initCause(e); } } // private final boolean mergeRanges = true; - private UnicodeSet removalSet, remainingOutputSet, inputSet_strict, inputSet_lenient, nonstarting; + private UnicodeSet removalSet, + remainingOutputSet, + inputSet_strict, + inputSet_lenient, + nonstarting; UnicodeSet propNFKCSet; - //UnicodeSet notInXID; + // UnicodeSet notInXID; UnicodeSet xidPlus; private final UnicodeMap additions = new UnicodeMap(); @@ -550,15 +572,19 @@ private IdentifierInfo() throws IOException { propNFKCSet = ups.getSet("NFKC_QuickCheck=N").complement(); final UnicodeSet propXIDContinueSet = ups.getSet("XID_Continue=Yes"); - //removals.putAll(propNFKCSet.complement(), PROHIBITED + "compat variant"); + // removals.putAll(propNFKCSet.complement(), PROHIBITED + "compat variant"); loadFileData(); - xidPlus = new UnicodeSet(propXIDContinueSet).addAll(additions.keySet()).retainAll(propNFKCSet); + xidPlus = + new UnicodeSet(propXIDContinueSet) + .addAll(additions.keySet()) + .retainAll(propNFKCSet); getIdentifierSet(); - //notInXID = new 
UnicodeSet(IDNOutputSet).removeAll(xidPlus); - //removals.putAll(notInXID, PROHIBITED + NOT_IN_XID); - //UnicodeSet notNfkcXid = new UnicodeSet(xidPlus).removeAll(removals.keySet()).removeAll(propNFKCSet); - //removals.putAll(notNfkcXid, PROHIBITED + "compat variant"); + // notInXID = new UnicodeSet(IDNOutputSet).removeAll(xidPlus); + // removals.putAll(notInXID, PROHIBITED + NOT_IN_XID); + // UnicodeSet notNfkcXid = new + // UnicodeSet(xidPlus).removeAll(removals.keySet()).removeAll(propNFKCSet); + // removals.putAll(notNfkcXid, PROHIBITED + "compat variant"); removalSet = new UnicodeSet(); for (final Reason value : removals.values()) { if (value.isRestricted()) { @@ -569,14 +595,14 @@ private IdentifierInfo() throws IOException { remainingOutputSet = new UnicodeSet(IDNOutputSet).removeAll(removalSet); - final UnicodeSet remainingInputSet1 = new UnicodeSet(IDNInputSet) - .removeAll(removalSet).removeAll(remainingOutputSet); + final UnicodeSet remainingInputSet1 = + new UnicodeSet(IDNInputSet).removeAll(removalSet).removeAll(remainingOutputSet); final UnicodeSet remainingInputSet = new UnicodeSet(); final UnicodeSet specialRemove = new UnicodeSet(); // remove any others that don't normalize/case fold to something in // the output set - for (final UnicodeSetIterator usi = new UnicodeSetIterator( - remainingInputSet1); usi.next();) { + for (final UnicodeSetIterator usi = new UnicodeSetIterator(remainingInputSet1); + usi.next(); ) { final String nss = getModifiedNKFC(usi.getString()); final String cf = DEFAULT_UCD.getCase(nss, UCD_Types.FULL, UCD_Types.FOLD); final String cf2 = getModifiedNKFC(cf); @@ -588,30 +614,30 @@ private IdentifierInfo() throws IOException { } // filter out the items that are case foldings of items in output inputSet_strict = new UnicodeSet(); - for (final UnicodeSetIterator usi = new UnicodeSetIterator( - remainingInputSet); usi.next();) { + for (final UnicodeSetIterator usi = new UnicodeSetIterator(remainingInputSet); + usi.next(); ) { final 
String ss = usi.getString(); final String nss = getModifiedNKFC(ss); final String cf = DEFAULT_UCD.getCase(ss, UCD_Types.FULL, UCD_Types.FOLD); if (usi.codepoint == 0x2126 || usi.codepoint == 0x212B) { System.out.println("check"); } - //> > 2126 ; retained-input-only-CF # (?) OHM SIGN - //> > 212B ; retained-input-only-CF # (?) ANGSTROM SIGN + // > > 2126 ; retained-input-only-CF # (?) OHM SIGN + // > > 212B ; retained-input-only-CF # (?) ANGSTROM SIGN - if (!remainingOutputSet.containsAll(nss) - && remainingOutputSet.containsAll(cf)) { + if (!remainingOutputSet.containsAll(nss) && remainingOutputSet.containsAll(cf)) { inputSet_strict.add(ss); } } // hack inputSet_strict.remove(0x03F4).remove(0x2126).remove(0x212B); - inputSet_lenient = new UnicodeSet(remainingInputSet) - .removeAll(inputSet_strict); - nonstarting = new UnicodeSet(remainingOutputSet).addAll( - remainingInputSet).retainAll(new UnicodeSet("[:M:]")); + inputSet_lenient = new UnicodeSet(remainingInputSet).removeAll(inputSet_strict); + nonstarting = + new UnicodeSet(remainingOutputSet) + .addAll(remainingInputSet) + .retainAll(new UnicodeSet("[:M:]")); reviews = new UnicodeMap(); - //reviews.putAll(removals); + // reviews.putAll(removals); for (final Reason value : removals.values()) { reviews.putAll(removals.getSet(value), value.propertyFileFormat()); } @@ -635,37 +661,37 @@ private IdentifierInfo() throws IOException { lowerIsBetter.freeze(); // add special values: - //lowerIsBetter.putAll(new UnicodeSet("["), new Integer(0)); - - final UnicodeMap nonstartingmap = new UnicodeMap().putAll(nonstarting, - "nonstarting"); - final UnicodeMap.Composer composer = new UnicodeMap.Composer() { - @Override - public Object compose(int codepoint, String string, Object a, Object b) { - if (a == null) { - return b; - } else if (b == null) { - return a; - } else { - return a.toString() + "-" + b.toString(); - } - } - }; + // lowerIsBetter.putAll(new UnicodeSet("["), new Integer(0)); + + final UnicodeMap 
nonstartingmap = new UnicodeMap().putAll(nonstarting, "nonstarting"); + final UnicodeMap.Composer composer = + new UnicodeMap.Composer() { + @Override + public Object compose(int codepoint, String string, Object a, Object b) { + if (a == null) { + return b; + } else if (b == null) { + return a; + } else { + return a.toString() + "-" + b.toString(); + } + } + }; reviews.composeWith(nonstartingmap, composer); reviews.putAll(new UnicodeSet(IDNInputSet).complement(), ""); - final UnicodeMap.Composer composer2 = new UnicodeMap.Composer() { - @Override - public Object compose(int codepoint, String string, Object a, Object b) { - if (b == null) { - return a; - } - return "remap-to-" + Utility.hex(b.toString()); - } - }; - //reviews.composeWith(remap, composer2); + final UnicodeMap.Composer composer2 = + new UnicodeMap.Composer() { + @Override + public Object compose(int codepoint, String string, Object a, Object b) { + if (b == null) { + return a; + } + return "remap-to-" + Utility.hex(b.toString()); + } + }; + // reviews.composeWith(remap, composer2); removals2 = new UnicodeMap().putAll(recastRemovals); - removals2.putAll(ups.getSet("XID_Continue=Yes").complement(), - PROHIBITED + NOT_IN_XID); + removals2.putAll(ups.getSet("XID_Continue=Yes").complement(), PROHIBITED + NOT_IN_XID); removals2.setMissing("future?"); additions.freeze(); @@ -693,37 +719,40 @@ enum Reason { recommended; private static Reason fromString(String string) { - String rawReason = string.trim().replace("-","_"); + String rawReason = string.trim().replace("-", "_"); if (rawReason.equals("allowed")) { rawReason = recommended_scripts; } return valueOf(rawReason); } + public boolean isRestricted() { return this != Reason.inclusion && this != Reason.recommended; } + @Override public String toString() { - return name().replace("_","-"); + return name().replace("_", "-"); } + public String propertyFileFormat() { return (isRestricted() ? 
PROHIBITED : UNPROHIBITED) + toString(); } + public boolean replaceBy(Reason possibleReplacement) { return compareTo(possibleReplacement) > 0 - || this == historic && possibleReplacement == limited_use - ; // && this != historic; + || this == historic + && possibleReplacement == limited_use; // && this != historic; } } - /** - * - */ + /** */ private void loadFileData() throws IOException { BufferedReader br; String line; // get all the removals. br = FileUtilities.openUTF8Reader(indir, "removals.txt"); - removals.putAll(new UnicodeSet("[^[:gc=cn:][:gc=co:][:gc=cs:][:gc=cc:]-[:whitespace:]]"), + removals.putAll( + new UnicodeSet("[^[:gc=cn:][:gc=co:][:gc=cs:][:gc=cc:]-[:whitespace:]]"), Reason.recommended); UnicodeSet sources = new UnicodeSet(); @@ -748,7 +777,9 @@ private void loadFileData() throws IOException { final String codelist = pieces[0].trim(); final Reason reasons = Reason.fromString(pieces[1]); if (pieces[0].startsWith("[")) { - sources = TestUnicodeInvariants.parseUnicodeSet(codelist); //.retainAll(allocated); + sources = + TestUnicodeInvariants.parseUnicodeSet( + codelist); // .retainAll(allocated); } else { final String[] codes = Utility.split(codelist, ' '); for (final String code : codes) { @@ -766,7 +797,8 @@ private void loadFileData() throws IOException { } removals.putAll(sources, reasons); // if (reasons == Reason.recommended) { - // removals.putAll(sources, UNPROHIBITED + recommended_scripts); + // removals.putAll(sources, UNPROHIBITED + + // recommended_scripts); // } else if (reasons.equals("inclusion")) { // removals.putAll(sources, UNPROHIBITED + reasons); // } else { @@ -798,22 +830,22 @@ private void loadFileData() throws IOException { final Info scriptInfo = ScriptMetadata.getInfo(script); final IdUsage idUsage = scriptInfo.idUsage; Reason status; - switch(idUsage) { - case ASPIRATIONAL: - case LIMITED_USE: - status = Reason.limited_use; - break; - case EXCLUSION: - status = Reason.historic; - break; - case RECOMMENDED: - default: - 
status = null; - break; // do nothing; + switch (idUsage) { + case ASPIRATIONAL: + case LIMITED_USE: + status = Reason.limited_use; + break; + case EXCLUSION: + status = Reason.historic; + break; + case RECOMMENDED: + default: + status = null; + break; // do nothing; } if (status != null) { final UnicodeSet us = IDENTIFIER_INFO.getSetWith(script); - //final UnicodeSet us = new UnicodeSet().applyPropertyAlias("script", script); + // final UnicodeSet us = new UnicodeSet().applyPropertyAlias("script", script); for (final String s : us) { if (hasRecommendedScript.contains(s)) { continue; // skip those that have at least one recommended script @@ -821,22 +853,30 @@ private void loadFileData() throws IOException { final Reason old = removals.get(s); if (old == null) { removals.put(s, status); - } else if (!old.equals(status)){ + } else if (!old.equals(status)) { if (old.replaceBy(status)) { - removalCollision.put(s, "REPLACING " + old + "\t!= (script metadata)\t" + status); + removalCollision.put( + s, + "REPLACING " + old + "\t!= (script metadata)\t" + status); removals.put(s, status); } else { - removalCollision.put(s, "Retaining " + old + "\t!= (script metadata)\t" + status); + removalCollision.put( + s, + "Retaining " + old + "\t!= (script metadata)\t" + status); } } } } } for (final String value : removalCollision.values()) { - System.out.println("*Removal Collision\t" + value + "\n\t" + removalCollision.getSet(value).toPattern(false)); + System.out.println( + "*Removal Collision\t" + + value + + "\n\t" + + removalCollision.getSet(value).toPattern(false)); } removals.freeze(); - //removals.putAll(getNonIICore(), PROHIBITED + "~IICore"); + // removals.putAll(getNonIICore(), PROHIBITED + "~IICore"); br.close(); // // get the word chars @@ -880,19 +920,18 @@ void printIDNStuff() throws IOException { generateDecompFile(); } - /** - * - */ + /** */ private void writeIDReview() throws IOException { - final BagFormatter bf = makeFormatter() - 
.setUnicodePropertyFactory(ups) - .setLabelSource(null) - .setShowLiteral(EXCAPE_FUNNY) - .setMergeRanges(true); + final BagFormatter bf = + makeFormatter() + .setUnicodePropertyFactory(ups) + .setLabelSource(null) + .setShowLiteral(EXCAPE_FUNNY) + .setMergeRanges(true); final PrintWriter out = openAndWriteHeader(outdir, "review.txt", "Review List for IDN"); // PrintWriter out = FileUtilities.openUTF8Writer(outdir, "review.txt"); - //reviews.putAll(UNASSIGNED, ""); + // reviews.putAll(UNASSIGNED, ""); // out.print("\uFEFF"); // out.println("# Review List for IDN"); // out.println("# $Revision: 1.32 $"); @@ -901,10 +940,11 @@ private void writeIDReview() throws IOException { final UnicodeSet fullSet = reviews.keySet("").complement(); - bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() { - }).set(reviews).setMain("Reviews", "GCB", - UnicodeProperty.ENUMERATED, "1.0")); - //bf.setMergeRanges(false); + bf.setValueSource( + (new UnicodeProperty.UnicodeMapProperty() {}) + .set(reviews) + .setMain("Reviews", "GCB", UnicodeProperty.ENUMERATED, "1.0")); + // bf.setMergeRanges(false); final FakeBreak fakeBreak = new FakeBreak(); bf.setRangeBreakSource(fakeBreak); @@ -912,33 +952,29 @@ private void writeIDReview() throws IOException { out.println("# Characters allowed in IDNA"); out.println(""); bf.showSetNames(out, new UnicodeSet(fullSet)); // .removeAll(bigSets) - //bf.setMergeRanges(true); + // bf.setMergeRanges(true); // out.println(""); // out.println("# Large Ranges"); // out.println(""); // bf.showSetNames(out, new UnicodeSet(fullSet).retainAll(bigSets)); out.println(""); out.println("# Characters disallowed in IDNA"); - out - .println("# The IDNA spec doesn't allow any of these characters,"); - out - .println("# so don't report any of them as being missing from the above list."); - out - .println("# Some possible future additions, once IDNA updates to Unicode 4.1, are given."); + out.println("# The IDNA spec doesn't allow any of these characters,"); + 
out.println("# so don't report any of them as being missing from the above list."); + out.println( + "# Some possible future additions, once IDNA updates to Unicode 4.1, are given."); out.println(""); - //bf.setRangeBreakSource(UnicodeLabel.NULL); - bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() { - }).set(removals2).setMain("Removals", "GCB", - UnicodeProperty.ENUMERATED, "1.0")); - //bf.setValueSource(UnicodeLabel.NULL); - bf.showSetNames(out, new UnicodeSet(IDNInputSet).complement() - .removeAll(UNASSIGNED)); + // bf.setRangeBreakSource(UnicodeLabel.NULL); + bf.setValueSource( + (new UnicodeProperty.UnicodeMapProperty() {}) + .set(removals2) + .setMain("Removals", "GCB", UnicodeProperty.ENUMERATED, "1.0")); + // bf.setValueSource(UnicodeLabel.NULL); + bf.showSetNames(out, new UnicodeSet(IDNInputSet).complement().removeAll(UNASSIGNED)); out.close(); } - /** - * - */ + /** */ private void writeIDChars() throws IOException { final BagFormatter bf = makeFormatter(); bf.setLabelSource(null); @@ -947,7 +983,9 @@ private void writeIDChars() throws IOException { final UnicodeSet letters = new UnicodeSet("[[:Alphabetic:][:Mark:][:Nd:]]"); - final PrintWriter out = openAndWriteHeader(outdir, "idnchars.txt", "Recommended Identifier Profiles for IDN"); + final PrintWriter out = + openAndWriteHeader( + outdir, "idnchars.txt", "Recommended Identifier Profiles for IDN"); out.println("# Allowed as output characters"); out.println(""); @@ -979,22 +1017,20 @@ private void writeIDChars() throws IOException { bf.setValueSource("nonstarting"); bf.showSetNames(out, nonstarting); - //out.println(""); + // out.println(""); - //showRemapped(out, "Characters remapped on input in GUIs -- Not required by profile!", remap); + // showRemapped(out, "Characters remapped on input in GUIs -- Not required by profile!", + // remap); out.close(); } - - /** - * - */ + /** */ private void showExtras(BagFormatter bf, UnicodeSet source, UnicodeSet letters) { final UnicodeSet extra = new 
UnicodeSet(source).removeAll(letters); if (extra.size() != 0) { final UnicodeSet fixed = new UnicodeSet(); - for (final UnicodeSetIterator it = new UnicodeSetIterator(extra); it.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(extra); it.next(); ) { if (!letters.containsAll(NFKD.normalize(it.getString()))) { fixed.add(it.codepoint); } @@ -1003,16 +1039,18 @@ private void showExtras(BagFormatter bf, UnicodeSet source, UnicodeSet letters) } } - /** - * - */ + /** */ private void printIDModifications() throws IOException { final BagFormatter bf = makeFormatter(); bf.setLabelSource(null); bf.setShowLiteral(EXCAPE_FUNNY); bf.setMergeRanges(true); - PrintWriter out = openAndWriteHeader(outdir + "../", "xidmodifications.txt", "Security Profile for General Identifiers"); + PrintWriter out = + openAndWriteHeader( + outdir + "../", + "xidmodifications.txt", + "Security Profile for General Identifiers"); /* PrintWriter out = FileUtilities.openUTF8Writer(outdir, "xidmodifications.txt"); out.println("# Security Profile for General Identifiers"); @@ -1020,8 +1058,8 @@ private void printIDModifications() throws IOException { out.println("# $Date: 2010-06-19 00:29:21 $"); */ - //String skipping = "[^[:gc=cn:][:gc=co:][:gc=cs:][:gc=cc:]-[:whitespace:]]"; - //UnicodeSet skippingSet = new UnicodeSet(skipping); + // String skipping = "[^[:gc=cn:][:gc=co:][:gc=cs:][:gc=cc:]-[:whitespace:]]"; + // UnicodeSet skippingSet = new UnicodeSet(skipping); out.println("# All code points not explicitly listed "); out.println("# have the values: restricted; not-chars"); @@ -1032,11 +1070,16 @@ private void printIDModifications() throws IOException { * reason1 = (String)it.next(); bf.setValueSource(reason1); * out.println(""); bf.showSetNames(out, removals.getSet(reason1)); } */ - bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() { - }).set(recastRemovals).setMain("Removals", "GCB", - UnicodeProperty.ENUMERATED, "1.0")); - - final Set fullListing = new 
HashSet(Arrays.asList("technical limited-use historic discouraged obsolete".split("\\s+"))); + bf.setValueSource( + (new UnicodeProperty.UnicodeMapProperty() {}) + .set(recastRemovals) + .setMain("Removals", "GCB", UnicodeProperty.ENUMERATED, "1.0")); + + final Set fullListing = + new HashSet( + Arrays.asList( + "technical limited-use historic discouraged obsolete" + .split("\\s+"))); final Set sortedValues = new TreeSet(UCAComparator); sortedValues.addAll(recastRemovals.values()); System.out.println("Restriction Values: " + sortedValues); @@ -1048,8 +1091,9 @@ private void printIDModifications() throws IOException { out.println(""); out.println("#\tStatus/Type:\t" + value); out.println(""); - //bf.setMergeRanges(Collections.disjoint(fullListing, Arrays.asList(value.split("[\\s;]+")))); - //bf.setMergeRanges(value.propertyFileFormat()); + // bf.setMergeRanges(Collections.disjoint(fullListing, + // Arrays.asList(value.split("[\\s;]+")))); + // bf.setMergeRanges(value.propertyFileFormat()); bf.showSetNames(out, uset); } @@ -1060,13 +1104,16 @@ private void printIDModifications() throws IOException { // bf.setValueSource("addition"); // bf.showSetNames(out, additions.keySet()); - //showRemapped(out, "Characters remapped on input", remap); + // showRemapped(out, "Characters remapped on input", remap); out.close(); - out = openAndWriteHeader(outdir, "xidAllowed.txt", "Security Profile for General Identifiers"); + out = + openAndWriteHeader( + outdir, "xidAllowed.txt", "Security Profile for General Identifiers"); final UnicodeSet allowed = new UnicodeSet(xidPlus).removeAll(removals.keySet()); - final UnicodeSet cfAllowed = new UnicodeSet().addAll(allowed).retainAll(isCaseFolded).retainAll(propNFKCSet); + final UnicodeSet cfAllowed = + new UnicodeSet().addAll(allowed).retainAll(isCaseFolded).retainAll(propNFKCSet); allowed.removeAll(cfAllowed); bf.setValueSource("case_folded"); out.println("# XID characters allowed (no uppercase)"); @@ -1080,41 +1127,45 @@ private void 
printIDModifications() throws IOException { out.close(); final UnicodeMap someRemovals = new UnicodeMap(); - final UnicodeMap.Composer myComposer = new UnicodeMap.Composer() { - @Override - public Object compose(int codePoint, String string, Object a, Object b) { - if (b == null) { - return null; - } - String x = (String)b; - if (false) { - if (!IDNOutputSet.contains(codePoint)) { - return "~IDNA"; - } - if (!xidPlus.contains(codePoint)) { - return "~Unicode Identifier"; - } - } - if (x.startsWith(PROHIBITED)) { - x = x.substring(PROHIBITED.length()); - } - //if (!propNFKCSet.contains(codePoint)) x += "*"; - if (GC_LOWERCASE.contains(codePoint)) { - final String upper = DEFAULT_UCD.getCase(codePoint, UCD_Types.FULL, UCD_Types.UPPER); - if (upper.equals(UTF16.valueOf(codePoint)) - && x.equals("technical symbol (phonetic)")) { - x = "technical symbol (phonetic with no uppercase)"; + final UnicodeMap.Composer myComposer = + new UnicodeMap.Composer() { + @Override + public Object compose(int codePoint, String string, Object a, Object b) { + if (b == null) { + return null; + } + String x = (String) b; + if (false) { + if (!IDNOutputSet.contains(codePoint)) { + return "~IDNA"; + } + if (!xidPlus.contains(codePoint)) { + return "~Unicode Identifier"; + } + } + if (x.startsWith(PROHIBITED)) { + x = x.substring(PROHIBITED.length()); + } + // if (!propNFKCSet.contains(codePoint)) x += "*"; + if (GC_LOWERCASE.contains(codePoint)) { + final String upper = + DEFAULT_UCD.getCase( + codePoint, UCD_Types.FULL, UCD_Types.UPPER); + if (upper.equals(UTF16.valueOf(codePoint)) + && x.equals("technical symbol (phonetic)")) { + x = "technical symbol (phonetic with no uppercase)"; + } + } + return x; } - } - return x; - } - }; + }; someRemovals.composeWith(recastRemovals, myComposer); - final UnicodeSet nonIDNA = new UnicodeSet(IDNOutputSet).addAll(IDNInputSet).complement(); + final UnicodeSet nonIDNA = + new UnicodeSet(IDNOutputSet).addAll(IDNInputSet).complement(); 
someRemovals.putAll(nonIDNA, "~IDNA"); someRemovals.putAll(new UnicodeSet(xidPlus).complement(), "~Unicode Identifier"); someRemovals.putAll(UNASSIGNED, null); // clear extras - //someRemovals = removals; + // someRemovals = removals; out = FileUtilities.openUTF8Writer(outdir, "draft-restrictions.txt"); out.println("# Characters restricted in domain names"); out.println("#"); @@ -1122,30 +1173,37 @@ public Object compose(int codePoint, String string, Object a, Object b) { out.println("# UTR #36: Unicode Security Considerations"); out.println("# http://unicode.org/draft/reports/tr36/tr36.html"); out.println("# According to the recommendations in that document, these characters"); - out.println("# would be restricted in domain names: people would only be able to use them"); + out.println( + "# would be restricted in domain names: people would only be able to use them"); out.println("# by using lenient security settings."); out.println("#"); - out.println("# If you have any feedback on this list, please use the submission form at:"); + out.println( + "# If you have any feedback on this list, please use the submission form at:"); out.println("# http://unicode.org/reporting.html."); out.println("#"); out.println("# Notes:"); out.println("# - Characters are listed along with a reason for their removal."); - out.println("# - Characters listed as ~IDNA are excluded at this point in domain names,"); - out.println("# in many cases because the international domain name specification does not contain"); - out.println("# characters beyond Unicode 3.2. At this point in time, feedback on those characters"); + out.println( + "# - Characters listed as ~IDNA are excluded at this point in domain names,"); + out.println( + "# in many cases because the international domain name specification does not contain"); + out.println( + "# characters beyond Unicode 3.2. 
At this point in time, feedback on those characters"); out.println("# is not relevant."); - out.println("# - Characters listed as ~Unicode Identifiers are restricted because they"); + out.println( + "# - Characters listed as ~Unicode Identifiers are restricted because they"); out.println("# do not fit the specification of identifiers given in"); out.println("# UAX #31: Identifier and Pattern Syntax"); out.println("# http://unicode.org/reports/tr31/"); - out.println("# - Characters listed as ~IICore are restricted because they are Ideographic,"); + out.println( + "# - Characters listed as ~IICore are restricted because they are Ideographic,"); out.println("# but not part of the IICore set defined by the IRG as the minimal set"); out.println("# of required ideographs for East Asian use."); bf.setRangeBreakSource(new FakeBreak2()); if (true) { final Set values = new TreeSet(someRemovals.getAvailableValues()); - for (final Iterator it = values.iterator(); it.hasNext();) { + for (final Iterator it = values.iterator(); it.hasNext(); ) { final String reason1 = (String) it.next(); bf.setValueSource(reason1); final UnicodeSet keySet = someRemovals.keySet(reason1); @@ -1156,18 +1214,18 @@ public Object compose(int codePoint, String string, Object a, Object b) { UnicodeSet newRecommended = new UnicodeSet(keySet).retainAll(current); for (String s : newRecommended) { // [:script=Phag:] ; historic # UAX31 T4 # Phags Pa - System.out.println(Utility.hex(s) - + "\t;\thistoric\t#\t" - + DEFAULT_UCD.getName(s)); + System.out.println( + Utility.hex(s) + "\t;\thistoric\t#\t" + DEFAULT_UCD.getName(s)); } } out.println(""); bf.showSetNames(out, keySet); } } else { - bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() { - }).set(someRemovals).setMain("Removals", "GCB", - UnicodeProperty.ENUMERATED, "1.0")); + bf.setValueSource( + (new UnicodeProperty.UnicodeMapProperty() {}) + .set(someRemovals) + .setMain("Removals", "GCB", UnicodeProperty.ENUMERATED, "1.0")); 
bf.showSetNames(out, someRemovals.keySet()); } out.close(); @@ -1178,18 +1236,13 @@ public Object compose(int codePoint, String string, Object a, Object b) { private static final String UNPROHIBITED = "allowed ; "; private static final String NOT_IN_XID = "not in XID+"; private static final boolean suppress_NFKC = true; - /** - * - */ + /** */ - - /** - * - */ + /** */ private static void generateDecompFile() throws IOException { final PrintWriter out = FileUtilities.openUTF8Writer(outdir, "decomps.txt"); final UnicodeProperty dt = ups.getProperty("Decomposition_Type"); - for (final Iterator it = dt.getAvailableValues().iterator(); it.hasNext();) { + for (final Iterator it = dt.getAvailableValues().iterator(); it.hasNext(); ) { final String value = (String) it.next(); if (value.equalsIgnoreCase("none") || value.equalsIgnoreCase("canonical")) { continue; @@ -1198,12 +1251,12 @@ private static void generateDecompFile() throws IOException { out.println(""); out.println("# Decomposition_Type = " + value); out.println(""); - for (final UnicodeSetIterator usi = new UnicodeSetIterator(s); usi.next();) { + for (final UnicodeSetIterator usi = new UnicodeSetIterator(s); usi.next(); ) { final String source = usi.getString(); final String target = getModifiedNKFC(source); writeSourceTargetLine(out, source, "N", target, value, ARROW); } - //bf.showSetNames(out, s); + // bf.showSetNames(out, s); out.flush(); } out.close(); @@ -1211,45 +1264,43 @@ private static void generateDecompFile() throws IOException { private static class FakeBreak extends UnicodeLabel { UnicodeSet nobreakSet = setsToAbbreviate; + @Override public String getValue(int codepoint, boolean isShort) { - return nobreakSet.contains(codepoint) ? "" - : (codepoint & 1) == 0 ? "O" - : "E"; + return nobreakSet.contains(codepoint) ? "" : (codepoint & 1) == 0 ? 
"O" : "E"; } } private static class FakeBreak2 extends UnicodeLabel { - UnicodeSet nobreakSet = new UnicodeSet(setsToAbbreviate) - .addAll(new UnicodeSet(IDNOutputSet).complement()) - .addAll(new UnicodeSet(IdentifierInfo.getIdentifierInfo().xidPlus).complement()); + UnicodeSet nobreakSet = + new UnicodeSet(setsToAbbreviate) + .addAll(new UnicodeSet(IDNOutputSet).complement()) + .addAll( + new UnicodeSet(IdentifierInfo.getIdentifierInfo().xidPlus) + .complement()); @Override public String getValue(int codepoint, boolean isShort) { - return nobreakSet.contains(codepoint) ? "" - : (codepoint & 1) == 0 ? "O" - : "E"; + return nobreakSet.contains(codepoint) ? "" : (codepoint & 1) == 0 ? "O" : "E"; } } - /** - * - */ + /** */ // private static void showRemapped(PrintWriter out, String title, UnicodeMap remap) { // out.println(""); // out.println("# " + title); // out.println(""); // int count = 0; - // for (final UnicodeSetIterator usi = new UnicodeSetIterator(remap.keySet()); usi.next();) { - // writeSourceTargetLine(out, usi.getString(), "remap-to", (String)remap.getValue(usi.codepoint), null, ARROW); + // for (final UnicodeSetIterator usi = new UnicodeSetIterator(remap.keySet()); + // usi.next();) { + // writeSourceTargetLine(out, usi.getString(), "remap-to", + // (String)remap.getValue(usi.codepoint), null, ARROW); // count++; // } // out.println(""); // out.println("# Total code points: " + count); // } - /** - * - */ + /** */ private static UnicodeSet IDNOutputSet, IDNInputSet, _preferredIDSet; private static UnicodeSet getIdentifierSet() { @@ -1265,19 +1316,21 @@ private static UnicodeSet getIdentifierSet() { continue; } // get IDNA - //int idnaType = GenerateStringPrep.getIDNAType(cp); - //if (idnaType == GenerateStringPrep.OK) IDNOutputSet.add(cp); - //if (idnaType != GenerateStringPrep.ILLEGAL) IDNInputSet.add(cp); + // int idnaType = GenerateStringPrep.getIDNAType(cp); + // if (idnaType == GenerateStringPrep.OK) IDNOutputSet.add(cp); + // if (idnaType != 
GenerateStringPrep.ILLEGAL) IDNInputSet.add(cp); final IdnaType idnaType = Uts46.SINGLETON.getType(cp); switch (idnaType) { - case valid: case deviation: - IDNOutputSet.add(cp); - // fall thru! - case mapped: case ignored: - IDNInputSet.add(cp); - break; - case disallowed: - // no action + case valid: + case deviation: + IDNOutputSet.add(cp); + // fall thru! + case mapped: + case ignored: + IDNInputSet.add(cp); + break; + case disallowed: + // no action } } _preferredIDSet = new UnicodeSet(IDNOutputSet).addAll(XIDContinueSet); @@ -1286,7 +1339,8 @@ private static UnicodeSet getIdentifierSet() { return _preferredIDSet; } - private static UnicodeSet SKIP_EXCEPTIONS = new UnicodeSet().add(0x1E9A).add('ſ').add('ſt').add('ẛ').add("Ϲ").add("ϲ").freeze(); + private static UnicodeSet SKIP_EXCEPTIONS = + new UnicodeSet().add(0x1E9A).add('ſ').add('ſt').add('ẛ').add("Ϲ").add("ϲ").freeze(); private static UnicodeSet getSkipNFKD() { nfcMap = new UnicodeMap(); @@ -1322,8 +1376,7 @@ private static UnicodeSet getSkipNFKD() { || decompType == UCD_Types.COMPAT_WIDE || decompType == UCD_Types.COMPAT_WIDE || cp == '﬩' - || cp == '︒' - ) { + || cp == '︒') { _skipNFKD.add(cp); continue; } @@ -1336,7 +1389,7 @@ private static UnicodeSet getSkipNFKD() { System.out.println("\t" + DEFAULT_UCD.getCodeAndName(kmapped)); kmapped = getModifiedNKFC(source); // for debugging } - nfkcMap.put(cp,kmapped); + nfkcMap.put(cp, kmapped); } if (mapped.equals(source)) { continue; @@ -1359,15 +1412,16 @@ private static UnicodeSet getSkipNFKD() { private static boolean isMixedScript(String source) { return IDENTIFIER_INFO.setIdentifier(source).isMultiScript(); - //return getSingleScript(source) == UCD_Types.UNUSED_SCRIPT; + // return getSingleScript(source) == UCD_Types.UNUSED_SCRIPT; } /** * Returns the script of the input text. Script values of COMMON and INHERITED are ignored. + * * @param source Input text. - * @return Script value found in the text. 
- * If more than one script values are found, then UCD_Types.UNUSED_SCRIPT is returned. - * If no script value is found (other than COMMON or INHERITED), then UCD_Types.COMMON_SCRIPT is returned. + * @return Script value found in the text. If more than one script values are found, then + * UCD_Types.UNUSED_SCRIPT is returned. If no script value is found (other than COMMON or + * INHERITED), then UCD_Types.COMMON_SCRIPT is returned. */ public static int getSingleScript(String source) { if (source.length() == 0) { @@ -1390,19 +1444,19 @@ public static int getSingleScript(String source) { return lastScript; } - /** - * - */ + /** */ private static void generateConfusables() throws IOException { log = FileUtilities.openUTF8Writer(outdir, "log.txt"); - //fixMichel(indir, outdir); + // fixMichel(indir, outdir); generateConfusables(indir, outdir); log.close(); if (false) { - for (final Iterator it = gatheredNFKD.keySet().iterator(); it.hasNext();) { - final String source = (String)it.next(); - System.out.println(DEFAULT_UCD.getCodeAndName(source) - + " => " + DEFAULT_UCD.getCodeAndName((String)gatheredNFKD.get(source))); + for (final Iterator it = gatheredNFKD.keySet().iterator(); it.hasNext(); ) { + final String source = (String) it.next(); + System.out.println( + DEFAULT_UCD.getCodeAndName(source) + + " => " + + DEFAULT_UCD.getCodeAndName((String) gatheredNFKD.get(source))); } } } @@ -1439,20 +1493,28 @@ public int compareTo(Object o) { /** * @param relation TODO - * */ - private static void writeSourceTargetLine(PrintWriter out, String source, String tag, String target, String reason, String relation) { + private static void writeSourceTargetLine( + PrintWriter out, + String source, + String tag, + String target, + String reason, + String relation) { out.print( Utility.hex(source) - + " ;\t" + Utility.hex(target) - + (tag == null ? "" : " ;\t" + tag) - //+ " ;\t" + (preferredID.contains(source) ? "ID" : "") - + "\t#" - + (isXid(source) ? 
"" : "*") - + arrowLiterals(source, target, relation) - + DEFAULT_UCD.getName(source) + " " + relation + " " - + DEFAULT_UCD.getName(target) - ); + + " ;\t" + + Utility.hex(target) + + (tag == null ? "" : " ;\t" + tag) + // + " ;\t" + (preferredID.contains(source) ? "ID" : "") + + "\t#" + + (isXid(source) ? "" : "*") + + arrowLiterals(source, target, relation) + + DEFAULT_UCD.getName(source) + + " " + + relation + + " " + + DEFAULT_UCD.getName(target)); if (reason != null) { out.print("\t# " + reason); } @@ -1474,10 +1536,11 @@ private static String rtlProtect(String source) { return source; } - private static class MyEquivalenceClass extends XEquivalenceClass { + private static class MyEquivalenceClass extends XEquivalenceClass { public MyEquivalenceClass() { super("NONE"); } + public boolean addCheck(String a, String b, String reason) { // quick check for illegal containment, before changing object if (checkForBad(a, b, reason) || checkForBad(b, a, reason)) { @@ -1487,49 +1550,54 @@ public boolean addCheck(String a, String b, String reason) { // full check for any resulting illegal containment. 
// illegal if for any x, y, x is a proper superstring of y final Set equivalences = getEquivalences(a); - for (final Iterator it = equivalences.iterator(); it.hasNext();) { - final String x = (String)it.next(); - if (!UTF16.hasMoreCodePointsThan(x,1)) { + for (final Iterator it = equivalences.iterator(); it.hasNext(); ) { + final String x = (String) it.next(); + if (!UTF16.hasMoreCodePointsThan(x, 1)) { continue; } - for (final Iterator it2 = equivalences.iterator(); it2.hasNext();) { - final String y = (String)it2.next(); + for (final Iterator it2 = equivalences.iterator(); it2.hasNext(); ) { + final String y = (String) it2.next(); if (x.equals(y)) { continue; } if (x.indexOf(y) >= 0) { - throw new RuntimeException("Illegal containment: " - + DEFAULT_UCD.getCodeAndName(x) + " contains " - + DEFAULT_UCD.getCodeAndName(y) + " because " - + DEFAULT_UCD.getCodeAndName(a) + " ~ " - + DEFAULT_UCD.getCodeAndName(b) + " because of " - + reason); + throw new RuntimeException( + "Illegal containment: " + + DEFAULT_UCD.getCodeAndName(x) + + " contains " + + DEFAULT_UCD.getCodeAndName(y) + + " because " + + DEFAULT_UCD.getCodeAndName(a) + + " ~ " + + DEFAULT_UCD.getCodeAndName(b) + + " because of " + + reason); } } } return true; } - /** - * - */ + /** */ private boolean checkForBad(String a, String b, String reason) { final Set equivalences = getEquivalences(b); - for (final Iterator it = equivalences.iterator(); it.hasNext();) { - final String b2 = (String)it.next(); + for (final Iterator it = equivalences.iterator(); it.hasNext(); ) { + final String b2 = (String) it.next(); if (a.equals(b2)) { continue; } if (b2.indexOf(a) >= 0 || a.indexOf(b2) >= 0) { - log.println("Illegal containment: " - + DEFAULT_UCD.getCodeAndName(a) - + " overlaps " - + DEFAULT_UCD.getCodeAndName(b2) - + "\n\tfrom " - + DEFAULT_UCD.getCodeAndName(b) - + "\n\twith reason " - + reason + " plus " - + getReasons(b2, b)); + log.println( + "Illegal containment: " + + DEFAULT_UCD.getCodeAndName(a) + + " 
overlaps " + + DEFAULT_UCD.getCodeAndName(b2) + + "\n\tfrom " + + DEFAULT_UCD.getCodeAndName(b) + + "\n\twith reason " + + reason + + " plus " + + getReasons(b2, b)); return true; } } @@ -1575,56 +1643,58 @@ private boolean checkForBad(String a, String b, String reason) { // } public void close(String reason) { - Map,String> mapItems = new HashMap(); - Map newItems = new HashMap(); + Map, String> mapItems = new HashMap(); + Map newItems = new HashMap(); Set bestSelector = new TreeSet(betterTargetIsLess); main: - while (true) { - System.out.println("Starting"); - // do all the combinations for all the paradigms - int count = 0; - for (Set set : getEquivalenceSets()) { - System.out.println(count++ + "\tChecking: " + set); - for (String item : set) { - if (!UTF16.hasMoreCodePointsThan(item,1)) { - continue; - } - newItems.putAll(mapString(item, set)); - } - if (!newItems.isEmpty()) { - System.out.println("\tAdding: " + newItems); - bestSelector.addAll(set); - String baseItem = bestSelector.iterator().next(); - bestSelector.clear(); - - for (Entry mapped : newItems.entrySet()) { - String newItem = mapped.getKey(); - String newReasion = mapped.getValue(); - mapItems.put(Row.of(baseItem, newItem), newReasion); - } - newItems.clear(); + while (true) { + System.out.println("Starting"); + // do all the combinations for all the paradigms + int count = 0; + for (Set set : getEquivalenceSets()) { + System.out.println(count++ + "\tChecking: " + set); + for (String item : set) { + if (!UTF16.hasMoreCodePointsThan(item, 1)) { + continue; } + newItems.putAll(mapString(item, set)); } - if (!mapItems.isEmpty()) { - // if we add anything, then we may have changed the equivalence classes - // so we restart everything from the beginning - // May change this in the future - for (Entry, String> entry : mapItems.entrySet()) { - addCheck(entry.getKey().get0(), entry.getKey().get1(), entry.getValue() + reason); + if (!newItems.isEmpty()) { + System.out.println("\tAdding: " + newItems); + 
bestSelector.addAll(set); + String baseItem = bestSelector.iterator().next(); + bestSelector.clear(); + + for (Entry mapped : newItems.entrySet()) { + String newItem = mapped.getKey(); + String newReasion = mapped.getValue(); + mapItems.put(Row.of(baseItem, newItem), newReasion); } - mapItems.clear(); - continue main; + newItems.clear(); } - return; // we didn't change anything, so we're done } + if (!mapItems.isEmpty()) { + // if we add anything, then we may have changed the equivalence classes + // so we restart everything from the beginning + // May change this in the future + for (Entry, String> entry : mapItems.entrySet()) { + addCheck( + entry.getKey().get0(), + entry.getKey().get1(), + entry.getValue() + reason); + } + mapItems.clear(); + continue main; + } + return; // we didn't change anything, so we're done + } } /** * @param alreadyIn * @param alreadyIn * @param combinations - * */ private Map mapString(String item, Set alreadyIn) { if (false && item.startsWith("\u03D2")) { @@ -1641,18 +1711,19 @@ private Map mapString(String item, Set alreadyIn) { for (Entry sb : combinations.entrySet()) { String result = sb.getKey(); String reasons = sb.getValue(); - newCombinations.put(result+cps, reasons); + newCombinations.put(result + cps, reasons); if (equivs != null) { for (String equiv : equivs) { if (cps.equals(equiv)) { continue; } final List x = getReasons(cps, equiv); - newCombinations.put(result+equiv, reasons + getBestForm(x)); + newCombinations.put(result + equiv, reasons + getBestForm(x)); } } } - // swap the results, so that combinations always contains the current values, and newCombinations is always empty + // swap the results, so that combinations always contains the current values, and + // newCombinations is always empty Map temp = combinations; combinations = newCombinations; newCombinations = temp; @@ -1666,18 +1737,20 @@ private Map mapString(String item, Set alreadyIn) { private Object getBestForm(Collection x) { if (x.size() != 1) { - return 
"[" + x + "]"; + return "[" + x + "]"; } final Object item = x.iterator().next(); if (!(item instanceof Collection)) { return x.toString(); } - return getBestForm((Collection)item); + return getBestForm((Collection) item); } public String getParadigm(String item, boolean onlyLowercase, boolean onlySameScript) { - // 0049 ; 006C ; MA # ( I → l ) LATIN CAPITAL LETTER I → LATIN SMALL LETTER L # - // 042E ; 0049 004F ; MA # ( Ю → IO ) CYRILLIC CAPITAL LETTER YU → LATIN CAPITAL LETTER I, LATIN CAPITAL LETTER O # + // 0049 ; 006C ; MA # ( I → l ) LATIN CAPITAL LETTER I → LATIN SMALL + // LETTER L # + // 042E ; 0049 004F ; MA # ( Ю → IO ) CYRILLIC CAPITAL LETTER YU → + // LATIN CAPITAL LETTER I, LATIN CAPITAL LETTER O # // fails, since 0049 should not occur in the target Set filteredSet = new HashSet(); final Set equivalences = getEquivalences(item); @@ -1686,45 +1759,55 @@ public String getParadigm(String item, boolean onlyLowercase, boolean onlySameSc } main: - for (final Object element : equivalences) { - final String other = (String) element; + for (final Object element : equivalences) { + final String other = (String) element; - final String combined = item + other; + final String combined = item + other; - if (onlyLowercase) { - final boolean isLowercase = combined.equals(DEFAULT_UCD.getCase(combined, UCD_Types.FULL, UCD_Types.FOLD)); - if (!isLowercase) { - continue; - } + if (onlyLowercase) { + final boolean isLowercase = + combined.equals( + DEFAULT_UCD.getCase(combined, UCD_Types.FULL, UCD_Types.FOLD)); + if (!isLowercase) { + continue; } - if (onlySameScript) { - final boolean isMixed = isMixedScript(combined); - if (isMixed) { - continue; - } + } + if (onlySameScript) { + final boolean isMixed = isMixedScript(combined); + if (isMixed) { + continue; } + } - // verify idempotence - final int[] codePointArray = With.codePointArray(other); - if (codePointArray.length == 1) { - // String otherParadigm = getParadigm(other, onlyLowercase, onlySameScript); - // if 
(otherParadigm != null && !item.equals(otherParadigm)) { - // continue main; - // } - } else { - for (int codepoint : codePointArray) { - final String codePointString = UTF16.valueOf(codepoint); - String otherParadigm = getParadigm(codePointString, onlyLowercase, onlySameScript); - if (otherParadigm != null && !codePointString.equals(otherParadigm)) { - continue main; - } + // verify idempotence + final int[] codePointArray = With.codePointArray(other); + if (codePointArray.length == 1) { + // String otherParadigm = getParadigm(other, + // onlyLowercase, onlySameScript); + // if (otherParadigm != null && + // !item.equals(otherParadigm)) { + // continue main; + // } + } else { + for (int codepoint : codePointArray) { + final String codePointString = UTF16.valueOf(codepoint); + String otherParadigm = + getParadigm(codePointString, onlyLowercase, onlySameScript); + if (otherParadigm != null && !codePointString.equals(otherParadigm)) { + continue main; } } - - filteredSet.add(other); } + + filteredSet.add(other); + } // } - return CollectionUtilities.getBest(filteredSet, onlyLowercase || onlySameScript ? betterTargetIsLessFavorNeutral : betterTargetIsLess, -1); + return CollectionUtilities.getBest( + filteredSet, + onlyLowercase || onlySameScript + ? 
betterTargetIsLessFavorNeutral + : betterTargetIsLess, + -1); } public Set getOrderedExplicitItems() { @@ -1732,14 +1815,13 @@ public Set getOrderedExplicitItems() { cloneForSafety.addAll(getExplicitItems()); return cloneForSafety; } - /** - * - */ + /** */ // public void writeSource(PrintWriter out) { // final Set items = getOrderedExplicitItems(); // for (final Iterator it = items.iterator(); it.hasNext();) { // final String item = (String) it.next(); - // final String paradigm = CollectionUtilities.getBest(getEquivalences(item), betterTargetIsLess, -1); + // final String paradigm = CollectionUtilities.getBest(getEquivalences(item), + // betterTargetIsLess, -1); // if (item.equals(paradigm)) { // continue; // } @@ -1749,13 +1831,13 @@ public Set getOrderedExplicitItems() { } private static class RawData { - Map> data = new TreeMap>(); + Map> data = new TreeMap>(); public void add(String source, String target, String type) { if (betterTargetIsLess.compare(source, target) < 0) { - add2(source,target,type); + add2(source, target, type); } else { - add2(target,source,type); + add2(target, source, type); } } @@ -1779,14 +1861,15 @@ public void writeSource(PrintWriter out) { private static class DataSet { MyEquivalenceClass dataMixedAnycase = new MyEquivalenceClass(); -// MyEquivalenceClass dataMixedLowercase = new MyEquivalenceClass(); -// MyEquivalenceClass dataSingleLowercase = new MyEquivalenceClass(); -// MyEquivalenceClass dataSingleAnycase = new MyEquivalenceClass(); + // MyEquivalenceClass dataMixedLowercase = new MyEquivalenceClass(); + // MyEquivalenceClass dataSingleLowercase = new MyEquivalenceClass(); + // MyEquivalenceClass dataSingleAnycase = new MyEquivalenceClass(); RawData raw = new RawData(); private static String testChar = UTF16.valueOf(0x10A3A); - public DataSet add(String source, String target, String type, int lineCount, String errorLine) { + public DataSet add( + String source, String target, String type, int lineCount, String errorLine) { if 
(SKIP_SET.containsAll(source) || SKIP_SET.containsAll(target)) { return this; } @@ -1798,11 +1881,19 @@ public DataSet add(String source, String target, String type, int lineCount, Str COMBINING.containsAll(nsource); COMBINING.containsAll(ntarget); } - System.err.println("ERROR: Mixed combining classes: " + lineCount + "\t" + errorLine + "\t" + Utility.hex(nsource) + "\t" + Utility.hex(ntarget)); + System.err.println( + "ERROR: Mixed combining classes: " + + lineCount + + "\t" + + errorLine + + "\t" + + Utility.hex(nsource) + + "\t" + + Utility.hex(ntarget)); } // if it is just a compatibility match, return - //if (nsource.equals(ntarget)) return this; + // if (nsource.equals(ntarget)) return this; if (type.indexOf("skip") >= 0) { return this; } @@ -1812,10 +1903,11 @@ public DataSet add(String source, String target, String type, int lineCount, Str type = getReasonFromFilename(type); - // if it is base + combining sequence => base2 + same combining sequence, do just the base - final int nsourceFirst = UTF16.charAt(nsource,0); + // if it is base + combining sequence => base2 + same combining sequence, do just the + // base + final int nsourceFirst = UTF16.charAt(nsource, 0); final String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst)); - final int ntargetFirst = UTF16.charAt(ntarget,0); + final int ntargetFirst = UTF16.charAt(ntarget, 0); final String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst)); if (nsourceRest.length() != 0 && nsourceRest.equals(ntargetRest)) { @@ -1823,26 +1915,27 @@ public DataSet add(String source, String target, String type, int lineCount, Str target = UTF16.valueOf(ntargetFirst); type += "-base"; } - //type += ":" + lineCount; + // type += ":" + lineCount; final String combined = source + target; if (combined.indexOf("\u0430") >= 0) { System.out.println(DEFAULT_UCD.getCodeAndName(combined)); } - final boolean isLowercase = combined.equals(DEFAULT_UCD.getCase(combined, UCD_Types.FULL, UCD_Types.FOLD)); + 
final boolean isLowercase = + combined.equals(DEFAULT_UCD.getCase(combined, UCD_Types.FULL, UCD_Types.FOLD)); final boolean isMixed = isMixedScript(combined); // Here's where we add data, if you need to debug - raw.add(source,target,type); + raw.add(source, target, type); dataMixedAnycase.add(source, target, type); -// if (isLowercase) { -// dataMixedLowercase.add(source, target, type); -// } -// if (!isMixed) { -// dataSingleAnycase.add(source, target, type); -// } -// if (!isMixed && isLowercase) { -// dataSingleLowercase.add(source, target, type); -// } + // if (isLowercase) { + // dataMixedLowercase.add(source, target, type); + // } + // if (!isMixed) { + // dataSingleAnycase.add(source, target, type); + // } + // if (!isMixed && isLowercase) { + // dataSingleLowercase.add(source, target, type); + // } return this; } @@ -1851,10 +1944,11 @@ public String toString() { return dataMixedAnycase.toString(); } - /* *//** + /* */ + /** * @param errorLine TODO - * - *//* + */ + /* private DataSet add(Data newData, String errorLine) { if (controls.containsSome(newData.source) || controls.containsSome(newData.target)) { System.out.println("Problem with " + errorLine); @@ -1870,8 +1964,10 @@ private DataSet add(Data newData, String errorLine) { } return this; } - */ // Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt" + */ + // Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt" private static final int NORMAL = 0, FOLDING = 1, OLD = 2; + private static final UnicodeSet NSM = new UnicodeSet("[[:Mn:][:Me:]]").freeze(); public DataSet addFile(String directory, String filename) throws IOException { @@ -1900,7 +1996,7 @@ public DataSet addFile(String directory, String filename) throws IOException { isFont = true; continue; } - final String[] pieces = Utility.split(line,';'); + final String[] pieces = Utility.split(line, ';'); if (pieces.length < 2) { System.out.println("Error on: (" + count + ")\t" + line); continue; @@ -1910,9 +2006,13 @@ public DataSet 
addFile(String directory, String filename) throws IOException { final String targetString = INVISIBLES.stripFrom(pieces[1].trim(), true); if (!targetString.equals(pieces[1].trim())) { - System.out.println("**\t" + Utility.hex(pieces[0].trim()) + ";\t" + Utility.hex(targetString)); + System.out.println( + "**\t" + + Utility.hex(pieces[0].trim()) + + ";\t" + + Utility.hex(targetString)); } - if (kind==FOLDING) { + if (kind == FOLDING) { final String target = fromHexOld(targetString); final String source = fromHexOld(sourceString); final String nsource = NFKD.normalize(source); @@ -1944,26 +2044,47 @@ public DataSet addFile(String directory, String filename) throws IOException { in.close(); return this; } catch (final Exception e) { - throw (RuntimeException) new RuntimeException("Failure with file: " - + directory + filename + " on line: " + count - + ": " + line).initCause(e); + throw (RuntimeException) + new RuntimeException( + "Failure with file: " + + directory + + filename + + " on line: " + + count + + ": " + + line) + .initCause(e); } } private void add2(String source, String target, String type, int count, String line) { - //if (pieces.length > 2) type = pieces[2].trim(); + // if (pieces.length > 2) type = pieces[2].trim(); final String nfkdSource = NFKD.normalize(source); final String nfkdTarget = NFKD.normalize(target); if (NSM.containsAll(source) && NSM.containsNone(target) || NSM.containsAll(target) && NSM.containsNone(source)) { if (SHOW_SUPPRESS) { - System.out.println("*** SUPPRESSING NSM Difference\t" - + count + "\t" + DEFAULT_UCD.getCodeAndName(source) + ";\t" + DEFAULT_UCD.getCodeAndName(target) + ";\t" + line); + System.out.println( + "*** SUPPRESSING NSM Difference\t" + + count + + "\t" + + DEFAULT_UCD.getCodeAndName(source) + + ";\t" + + DEFAULT_UCD.getCodeAndName(target) + + ";\t" + + line); } } else if (suppress_NFKC && nfkdSource.equals(nfkdTarget)) { if (SHOW_SUPPRESS) { - System.out.println("*** Suppressing nfkc for:\t" - + count + "\t" + 
DEFAULT_UCD.getCodeAndName(source) + ";\t" + DEFAULT_UCD.getCodeAndName(target) + ";\t" + line); + System.out.println( + "*** Suppressing nfkc for:\t" + + count + + "\t" + + DEFAULT_UCD.getCodeAndName(source) + + ";\t" + + DEFAULT_UCD.getCodeAndName(target) + + ";\t" + + line); } } else { add(source, target, type, count, line); @@ -1971,7 +2092,8 @@ private void add2(String source, String target, String type, int count, String l } public void writeSource(String directory, String filename) throws IOException { - final PrintWriter out = openAndWriteHeader(directory, filename, "Source File for IDN Confusables"); + final PrintWriter out = + openAndWriteHeader(directory, filename, "Source File for IDN Confusables"); // PrintWriter out = FileUtilities.openUTF8Writer(directory, filename); // out.println("# Source File for IDN Confusables"); // out.println("# $ Revision: 1.32 $"); @@ -1981,8 +2103,12 @@ public void writeSource(String directory, String filename) throws IOException { out.close(); } - public void writeSourceOrder(String directory, String filename, boolean appendFile, boolean skipNFKEquivs) throws IOException { - final PrintWriter out = openAndWriteHeader(directory, filename, "Recommended confusable mapping for IDN"); + public void writeSourceOrder( + String directory, String filename, boolean appendFile, boolean skipNFKEquivs) + throws IOException { + final PrintWriter out = + openAndWriteHeader( + directory, filename, "Recommended confusable mapping for IDN"); // PrintWriter out = FileUtilities.openUTF8Writer(directory, filename); // out.println("# Recommended confusable mapping for IDN"); // out.println("# $ Revision: 1.32 $"); @@ -1991,23 +2117,56 @@ public void writeSourceOrder(String directory, String filename, boolean appendFi if (appendFile) { final String[] replacements = {"%date%", Default.getDate()}; - Utility.appendFile(Settings.SRC_UCD_DIR + "confusablesHeader.txt", - Utility.UTF8_WINDOWS, out, replacements); - } - Relation, String> confusableMap 
- = Relation.of(new TreeMap(MyPairComparator), TreeSet.class); + Utility.appendFile( + Settings.SRC_UCD_DIR + "confusablesHeader.txt", + Utility.UTF8_WINDOWS, + out, + replacements); + } + Relation, String> confusableMap = + Relation.of(new TreeMap(MyPairComparator), TreeSet.class); if (true) { - writeSourceOrder(out, dataMixedAnycase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, - true, true, confusableMap); - writeSourceOrder(out, dataMixedAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, - false, true, confusableMap); - writeSourceOrder(out, dataMixedAnycase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, - true, false, confusableMap); - writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, - false, false, confusableMap); + writeSourceOrder( + out, + dataMixedAnycase, + "SL", + "Single-Script, Lowercase Confusables", + skipNFKEquivs, + true, + true, + confusableMap); + writeSourceOrder( + out, + dataMixedAnycase, + "SA", + "Single-Script, Anycase Confusables", + skipNFKEquivs, + false, + true, + confusableMap); + writeSourceOrder( + out, + dataMixedAnycase, + "ML", + "Mixed-Script, Lowercase Confusables", + skipNFKEquivs, + true, + false, + confusableMap); + writeSourceOrder( + out, + dataMixedAnycase, + "MA", + "Mixed-Script, Anycase Confusables", + skipNFKEquivs, + false, + false, + confusableMap); Counter> counter = new Counter(); - Map, Pair> examples = new HashMap, Pair>(); - for (Entry, Set> entry : confusableMap.keyValuesSet()) { + Map, Pair> examples = + new HashMap, Pair>(); + for (Entry, Set> entry : + confusableMap.keyValuesSet()) { final Set set = entry.getValue(); counter.add(set, 1); if (!examples.containsKey(set)) { @@ -2015,54 +2174,69 @@ public void writeSourceOrder(String directory, String filename, boolean appendFi } } for (Set entry : counter) { - System.out.println(counter.get(entry) + "\t" + entry + "\t" + examples.get(entry)); + System.out.println( + 
counter.get(entry) + "\t" + entry + "\t" + examples.get(entry)); } // } else { - // writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, Lowercase Confusables", skipNFKEquivs, false, false); - // writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, Anycase Confusables", skipNFKEquivs, false, false); - // writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, Lowercase Confusables", skipNFKEquivs, false, false); - // writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, Anycase Confusables", skipNFKEquivs, false, false); + // writeSourceOrder(out, dataSingleLowercase, "SL", "Single-Script, + // Lowercase Confusables", skipNFKEquivs, false, false); + // writeSourceOrder(out, dataSingleAnycase, "SA", "Single-Script, + // Anycase Confusables", skipNFKEquivs, false, false); + // writeSourceOrder(out, dataMixedLowercase, "ML", "Mixed-Script, + // Lowercase Confusables", skipNFKEquivs, false, false); + // writeSourceOrder(out, dataMixedAnycase, "MA", "Mixed-Script, + // Anycase Confusables", skipNFKEquivs, false, false); } out.close(); } - private static Comparator> MyPairComparator = new Comparator>() { - public int compare(Pair o1, Pair o2) { - int result = UCAComparator.compare(o1.getFirst(), o2.getFirst()); - return result != 0 ? result : UCAComparator.compare(o1.getSecond(), o2.getSecond()); - } - }; + private static Comparator> MyPairComparator = + new Comparator>() { + public int compare(Pair o1, Pair o2) { + int result = UCAComparator.compare(o1.getFirst(), o2.getFirst()); + return result != 0 + ? 
result + : UCAComparator.compare(o1.getSecond(), o2.getSecond()); + } + }; /** * @param skipNFKEquivs TODO * @param onlyLowercase TODO * @param onlySingleScript TODO * @param confusableMap - * */ - private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String tag, String title, - boolean skipNFKEquivs, boolean onlyLowercase, boolean onlySingleScript, + private void writeSourceOrder( + PrintWriter out, + MyEquivalenceClass data, + String tag, + String title, + boolean skipNFKEquivs, + boolean onlyLowercase, + boolean onlySingleScript, Relation, String> confusableMap) { // first get all the sets. Then get the best paradigm from each. Then sort. // Set setOfSets = data.getEquivalenceSets(); // Map orderedResults = new TreeMap(betterTargetIsLess); // for (Iterator it = setOfSets.iterator(); it.hasNext();) { // Set setOfEquivs = (Set) it.next(); - // Object item = CollectionUtilities.getBest(setOfEquivs, betterTargetIsLess, -1); + // Object item = CollectionUtilities.getBest(setOfEquivs, + // betterTargetIsLess, -1); // // } - //int c = codepointComparator.compare("\uFFFF", "\uD800\uDC00"); - //System.out.println("Code Point Compare: " + c); + // int c = codepointComparator.compare("\uFFFF", "\uD800\uDC00"); + // System.out.println("Code Point Compare: " + c); final Set items = data.getOrderedExplicitItems(); out.println(); out.println("# " + title); out.println(); int count = 0; final UnicodeSet preferredID = getIdentifierSet(); - final ArrayComparator ac = new ArrayComparator(new Comparator[] {UCAComparator, UCAComparator}); + final ArrayComparator ac = + new ArrayComparator(new Comparator[] {UCAComparator, UCAComparator}); final Set orderedPairs = new TreeSet(ac); - for (final Iterator it = items.iterator(); it.hasNext();) { + for (final Iterator it = items.iterator(); it.hasNext(); ) { final String source = (String) it.next(); - if (UTF16.hasMoreCodePointsThan(source,1)) { + if (UTF16.hasMoreCodePointsThan(source, 1)) { continue; } final String 
target = data.getParadigm(source, onlyLowercase, onlySingleScript); @@ -2078,16 +2252,18 @@ private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String t } } orderedPairs.add(new String[] {target, source}); - Pair pair = new Pair(target, source); + Pair pair = new Pair(target, source); confusableMap.put(pair, tag); } String lastTarget = null; - for (final Iterator it = orderedPairs.iterator(); it.hasNext();) { + for (final Iterator it = orderedPairs.iterator(); it.hasNext(); ) { final String[] pair = (String[]) it.next(); final String source = pair[1]; final String target = pair[0]; - final List> reasons = data.getReasons(source, target); - final String reason = XEquivalenceClass.toString(reasons, myLinkageTransform); // fixReason(reasons); + final List> reasons = data.getReasons(source, target); + final String reason = + XEquivalenceClass.toString( + reasons, myLinkageTransform); // fixReason(reasons); if (lastTarget != null && !lastTarget.equals(target)) { out.println(); } @@ -2100,9 +2276,7 @@ private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String t out.println(); } - /** - * - */ + /** */ // private String fixReason(List reasons) { // final List first = (List)reasons.get(0); // String result = ""; @@ -2129,9 +2303,9 @@ private void writeSourceOrder(PrintWriter out, MyEquivalenceClass data, String t public void addAll(DataSet ds) { dataMixedAnycase.addAll(ds.dataMixedAnycase); -// dataMixedLowercase.addAll(ds.dataMixedLowercase); -// dataSingleAnycase.addAll(ds.dataSingleAnycase); -// dataSingleLowercase.addAll(ds.dataSingleLowercase); + // dataMixedLowercase.addAll(ds.dataMixedLowercase); + // dataSingleAnycase.addAll(ds.dataSingleAnycase); + // dataSingleLowercase.addAll(ds.dataSingleLowercase); } private void checkChar(String string) { @@ -2143,9 +2317,9 @@ private void checkChar(String string) { public Set getEquivalences(String string) { return dataMixedAnycase.getEquivalences(string); } - /* *//** - * - *//* + /* 
*/ + /** */ + /* public DataSet clean() { // remove all skips DataSet tempSet = new DataSet(); @@ -2215,30 +2389,32 @@ public DataSet clean() { } return s; } - *//** - * - *//* + */ + /** */ + /* private void remove(Data already) { String[] key = {already.source, already.target}; dataMap.remove(key); dataSet.remove(already); }*/ - /** - * - */ + /** */ public void close(String reason) { dataMixedAnycase.close(reason); -// dataMixedLowercase.close(reason); -// dataSingleAnycase.close(reason); -// dataSingleLowercase.close(reason); + // dataMixedLowercase.close(reason); + // dataSingleAnycase.close(reason); + // dataSingleLowercase.close(reason); } - /** - * - */ + /** */ public void addUnicodeMap(UnicodeMap decompMap, String type, String errorLine) { int count = 0; - for (final UnicodeSetIterator it = new UnicodeSetIterator(decompMap.keySet()); it.next(); ) { - add(it.getString(), (String)decompMap.getValue(it.codepoint), type, ++count, errorLine); + for (final UnicodeSetIterator it = new UnicodeSetIterator(decompMap.keySet()); + it.next(); ) { + add( + it.getString(), + (String) decompMap.getValue(it.codepoint), + type, + ++count, + errorLine); } } @@ -2253,6 +2429,7 @@ public void addUnicodeMap(UnicodeMap decompMap, String type, String errorLine) { private static class MyCollectionFilter implements Predicate { UnicodeSet outputAllowed; int minLength; + @Override public boolean test(String item) { if (!outputAllowed.containsAll(item)) { @@ -2264,14 +2441,18 @@ public boolean test(String item) { } return true; } - }; + } + ; /** * @param script TODO * @throws IOException - * */ - public void writeSummary(String outdir, String filename, boolean outputOnly, UnicodeSet script) throws IOException { - final PrintWriter out = openAndWriteHeader(outdir, filename, "Summary: Recommended confusable mapping for IDN"); + public void writeSummary( + String outdir, String filename, boolean outputOnly, UnicodeSet script) + throws IOException { + final PrintWriter out = + 
openAndWriteHeader( + outdir, filename, "Summary: Recommended confusable mapping for IDN"); // PrintWriter out = FileUtilities.openUTF8Writer(outdir, filename); // out.print('\uFEFF'); // out.println("# Summary: Recommended confusable mapping for IDN"); @@ -2290,11 +2471,12 @@ public void writeSummary(String outdir, String filename, boolean outputOnly, Uni final Set itemsSeen = new HashSet(); final Set equivalents = new TreeSet(betterTargetIsLess); final MyCollectionFilter myFilter = new MyCollectionFilter(); - myFilter.outputAllowed= new UnicodeSet("[[\u0021-\u007E]-[:letter:]]") - .addAll(IdentifierInfo.getIdentifierInfo().remainingOutputSet) - .addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict); + myFilter.outputAllowed = + new UnicodeSet("[[\u0021-\u007E]-[:letter:]]") + .addAll(IdentifierInfo.getIdentifierInfo().remainingOutputSet) + .addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict); - for (final Iterator it = items.iterator(); it.hasNext();) { + for (final Iterator it = items.iterator(); it.hasNext(); ) { String target = (String) it.next(); if (itemsSeen.contains(target)) { continue; @@ -2316,34 +2498,47 @@ public void writeSummary(String outdir, String filename, boolean outputOnly, Uni } } scriptTest: - if (script != null) { - // see if at least one item contains the target script - for (final Iterator it2 = equivalents.iterator(); it2.hasNext();) { - final String item = (String) it2.next(); - if (script.containsAll(item)) { - target = item; - for (final Iterator it3 = equivalents.iterator(); it3.hasNext();) { - representable.addAll((String)it3.next()); - } - break scriptTest; + if (script != null) { + // see if at least one item contains the target script + for (final Iterator it2 = equivalents.iterator(); it2.hasNext(); ) { + final String item = (String) it2.next(); + if (script.containsAll(item)) { + target = item; + for (final Iterator it3 = equivalents.iterator(); it3.hasNext(); ) { + representable.addAll((String) it3.next()); } + 
break scriptTest; } - continue; // skip this one } + continue; // skip this one + } out.println(); out.println("#\t" + CollectionUtilities.join(equivalents, "\t")); String status = ""; // getStatus(target); - out.println(status + "\t" + "(\u200E " + target + " \u200E)\t" + Utility.hex(target) + "\t " + DEFAULT_UCD.getName(target)); - //if (UTF16.hasMoreCodePointsThan(source,1)) continue; - for (final Iterator it2 = equivalents.iterator(); it2.hasNext();) { + out.println( + status + + "\t" + + "(\u200E " + + target + + " \u200E)\t" + + Utility.hex(target) + + "\t " + + DEFAULT_UCD.getName(target)); + // if (UTF16.hasMoreCodePointsThan(source,1)) continue; + for (final Iterator it2 = equivalents.iterator(); it2.hasNext(); ) { final String source = (String) it2.next(); if (source.equals(target)) { continue; } - //boolean compatEqual = Default.nfkd().normalize(source).equals(Default.nfkd().normalize(target)); - //if (EXCLUDE_CONFUSABLE_COMPAT && compatEqual) continue; - final String reason = XEquivalenceClass.toString(data.getReasons(source, target), myLinkageTransform); // fixReason(data.getReasons(source, target)); - //if (!outputAllowed.containsAll(source)) continue; + // boolean compatEqual = + // Default.nfkd().normalize(source).equals(Default.nfkd().normalize(target)); + // if (EXCLUDE_CONFUSABLE_COMPAT && compatEqual) continue; + final String reason = + XEquivalenceClass.toString( + data.getReasons(source, target), + myLinkageTransform); // fixReason(data.getReasons(source, + // target)); + // if (!outputAllowed.containsAll(source)) continue; // if (compatEqual) { // out.print("\u21D0"); // } else { @@ -2352,8 +2547,17 @@ public void writeSummary(String outdir, String filename, boolean outputOnly, Uni final String reasonOrEmpty = reason.length() == 0 ? 
"" : "\t# " + reason; status = ""; // getStatus(source); - out.println(BACKARROW + status + "\t" + "(\u200E " + source + " \u200E)\t" + Utility.hex(source) + "\t " + DEFAULT_UCD.getName(source) - + reasonOrEmpty); + out.println( + BACKARROW + + status + + "\t" + + "(\u200E " + + source + + " \u200E)\t" + + Utility.hex(source) + + "\t " + + DEFAULT_UCD.getName(source) + + reasonOrEmpty); count++; } } @@ -2373,26 +2577,29 @@ public void writeSummary(String outdir, String filename, boolean outputOnly, Uni out.close(); } - - public void writeWholeScripts(String outdir, String filename) throws IOException { - final UnicodeSet commonAndInherited = new UnicodeSet( - "[[:script=common:][:script=inherited:]]"); - - final WholeScript wsLower = new WholeScript( - new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet) - .removeAll(new UnicodeSet("[A-Z]")), "L"); - final WholeScript wsAny = new WholeScript( - new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet) - .addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict), "A"); + final UnicodeSet commonAndInherited = + new UnicodeSet("[[:script=common:][:script=inherited:]]"); + + final WholeScript wsLower = + new WholeScript( + new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet) + .removeAll(new UnicodeSet("[A-Z]")), + "L"); + final WholeScript wsAny = + new WholeScript( + new UnicodeSet(IdentifierInfo.getIdentifierInfo().remainingOutputSet) + .addAll(IdentifierInfo.getIdentifierInfo().inputSet_strict), + "A"); final MyEquivalenceClass data = new MyEquivalenceClass(); for (final Object element : dataMixedAnycase.getSamples()) { String target = (String) element; final Set equivalents = getEquivalences(target); boolean first = true; - for (final Iterator it2 = equivalents.iterator(); it2.hasNext();) { - final String cleaned = CollectionUtilities.remove((String)it2.next(), commonAndInherited); + for (final Iterator it2 = equivalents.iterator(); it2.hasNext(); ) { + final String 
cleaned = + CollectionUtilities.remove((String) it2.next(), commonAndInherited); if (cleaned.length() == 0) { continue; } @@ -2405,7 +2612,7 @@ public void writeWholeScripts(String outdir, String filename) throws IOException } } final Set itemsSeen = new HashSet(); - for (final Iterator it = data.getOrderedExplicitItems().iterator(); it.hasNext();) { + for (final Iterator it = data.getOrderedExplicitItems().iterator(); it.hasNext(); ) { final String target = (String) it.next(); if (itemsSeen.contains(target)) { continue; @@ -2415,7 +2622,8 @@ public void writeWholeScripts(String outdir, String filename) throws IOException wsAny.addEquivalents(equivalents); wsLower.addEquivalents(equivalents); } - final PrintWriter out = openAndWriteHeader(outdir, filename, "Summary: Whole-Script Confusables"); + final PrintWriter out = + openAndWriteHeader(outdir, filename, "Summary: Whole-Script Confusables"); // PrintWriter out = FileUtilities.openUTF8Writer(outdir, filename); // out.print('\uFEFF'); // out.println("# Summary: Whole-Script Confusables"); @@ -2435,9 +2643,7 @@ public void writeWholeScripts(String outdir, String filename) throws IOException wsAny.write(out); out.close(); } - /** - * - */ + /** */ // private String getStatus(String source) { // // TODO Auto-generated method stub // final int val = betterTargetIsLess.getValue(source); @@ -2475,10 +2681,12 @@ static class WholeScript { private final UnicodeSet[] script_set = new UnicodeSet[UCD_Types.LIMIT_SCRIPT]; private final BagFormatter bf = makeFormatter(); private final String label; + { for (short i = 0; i < UCD_Types.LIMIT_SCRIPT; ++i) { script_representables[i] = new UnicodeSet(); - //script_set[i] = new UnicodeSet("[:script=" + DEFAULT_UCD.getScriptID(i, UCD_Types.LONG) + ":]"); // ugly hack + // script_set[i] = new UnicodeSet("[:script=" + DEFAULT_UCD.getScriptID(i, + // UCD_Types.LONG) + ":]"); // ugly hack script_set[i] = SCRIPT_PROPERTY.getSet(UCD.getScriptID_fromIndex(i)); // ugly hack } 
bf.setValueSource(ups.getProperty("script")); @@ -2497,8 +2705,8 @@ void addEquivalents(Set set) { // if we have y ~ x, and both are single scripts // that means that x can be represented in script(y), // and y can be represented in script(x). - for (final Iterator it = set.iterator(); it.hasNext();) { - final String item1 = (String)it.next(); + for (final Iterator it = set.iterator(); it.hasNext(); ) { + final String item1 = (String) it.next(); if (!filterSet.containsAll(item1)) { continue; } @@ -2506,8 +2714,8 @@ void addEquivalents(Set set) { if (script1 == UCD_Types.UNUSED_SCRIPT) { continue; } - for (final Iterator it2 = set.iterator(); it2.hasNext();) { - final String item2 = (String)it2.next(); + for (final Iterator it2 = set.iterator(); it2.hasNext(); ) { + final String item2 = (String) it2.next(); if (!filterSet.containsAll(item2)) { continue; } @@ -2524,22 +2732,27 @@ public static class UnicodeSetToScript { public short getScript() { return script; } + public UnicodeSetToScript setScript(short script) { this.script = script; return this; } + public UnicodeSet getSet() { return set; } + public UnicodeSetToScript setSet(UnicodeSet set) { this.set = set; return this; } + private UnicodeSet set; private short script; } - UnicodeSetToScript[][] scriptToUnicodeSetToScript = new UnicodeSetToScript[UCD_Types.LIMIT_SCRIPT][]; + UnicodeSetToScript[][] scriptToUnicodeSetToScript = + new UnicodeSetToScript[UCD_Types.LIMIT_SCRIPT][]; UnicodeSet[] fastReject = new UnicodeSet[UCD_Types.LIMIT_SCRIPT]; boolean finished = false; @@ -2567,11 +2780,14 @@ void finish() { if (script_set[j].containsNone(script_representables[k])) { continue; } - final UnicodeSet items = new UnicodeSet(script_set[j]).retainAll(script_representables[k]); - final UnicodeSetToScript uss = new UnicodeSetToScript().setScript(k).setSet(items); + final UnicodeSet items = + new UnicodeSet(script_set[j]).retainAll(script_representables[k]); + final UnicodeSetToScript uss = + new 
UnicodeSetToScript().setScript(k).setSet(items); curr.add(uss); } - scriptToUnicodeSetToScript[j] = (UnicodeSetToScript[]) curr.toArray(new UnicodeSetToScript[curr.size()]); + scriptToUnicodeSetToScript[j] = + (UnicodeSetToScript[]) curr.toArray(new UnicodeSetToScript[curr.size()]); fastReject[j] = accept.complement(); } finished = true; @@ -2580,7 +2796,7 @@ void finish() { void write(PrintWriter out) throws IOException { finish(); - Map, String> reorder = new TreeMap<>(); // reorder alphabetically + Map, String> reorder = new TreeMap<>(); // reorder alphabetically for (short j = 0; j < UCD_Types.LIMIT_SCRIPT; ++j) { final UnicodeSetToScript[] unicodeSetToScripts = scriptToUnicodeSetToScript[j]; @@ -2596,7 +2812,7 @@ void write(PrintWriter out) throws IOException { // get other side UnicodeSet items2 = UnicodeSet.EMPTY; final UnicodeSetToScript[] unicodeSetToScripts2 = scriptToUnicodeSetToScript[k]; - for (int qq = 0; qq < unicodeSetToScripts2.length; ++qq) { + for (int qq = 0; qq < unicodeSetToScripts2.length; ++qq) { final UnicodeSetToScript uss2 = unicodeSetToScripts2[qq]; if (uss2.getScript() == j) { items2 = uss2.getSet(); @@ -2604,19 +2820,35 @@ void write(PrintWriter out) throws IOException { } } - final String sname = UCD.getScriptID_fromIndex(j, UCD_Types.SHORT) + "; " - + UCD.getScriptID_fromIndex(k, UCD_Types.SHORT) + "; " + label; - final String name = getScriptIndexName(j, UCD_Types.LONG) - + "; " + getScriptIndexName(k, UCD_Types.LONG); + final String sname = + UCD.getScriptID_fromIndex(j, UCD_Types.SHORT) + + "; " + + UCD.getScriptID_fromIndex(k, UCD_Types.SHORT) + + "; " + + label; + final String name = + getScriptIndexName(j, UCD_Types.LONG) + + "; " + + getScriptIndexName(k, UCD_Types.LONG); StringWriter b = new StringWriter(); PrintWriter out2 = new PrintWriter(b); - out2.println("# " + name + ": " - + items.toPattern(false) + "; " + items2.toPattern(false) + "\n"); + out2.println( + "# " + + name + + ": " + + items.toPattern(false) + + "; " + + 
items2.toPattern(false) + + "\n"); bf.setValueSource(sname); bf.showSetNames(out2, items); out2.println(""); out2.flush(); - reorder.put(Pair.of(getScriptIndexName(j, UCD_Types.LONG), getScriptIndexName(k, UCD_Types.LONG)), b.toString()); + reorder.put( + Pair.of( + getScriptIndexName(j, UCD_Types.LONG), + getScriptIndexName(k, UCD_Types.LONG)), + b.toString()); out2.close(); } } @@ -2624,19 +2856,21 @@ void write(PrintWriter out) throws IOException { out.print(s.getValue()); } } + public String getScriptIndexName(short scriptIndex, byte length) { - return UCharacter.toTitleCase(Locale.ENGLISH, UCD.getScriptID_fromIndex(scriptIndex, length), null); + return UCharacter.toTitleCase( + Locale.ENGLISH, UCD.getScriptID_fromIndex(scriptIndex, length), null); } - } /** * @throws IOException - * */ // private static void fixMichel(String indir, String outdir) throws IOException { - // final BufferedReader in = FileUtilities.openUTF8Reader(indir + "michel/", "tr36comments-annex.txt"); - // final PrintWriter out = FileUtilities.openUTF8Writer(outdir, "new-tr36comments-annex.txt"); + // final BufferedReader in = FileUtilities.openUTF8Reader(indir + "michel/", + // "tr36comments-annex.txt"); + // final PrintWriter out = FileUtilities.openUTF8Writer(outdir, + // "new-tr36comments-annex.txt"); // while (true) { // final String line = Utility.readDataLine(in); // if (line == null) { @@ -2655,15 +2889,14 @@ public String getScriptIndexName(short scriptIndex, byte length) { // in.close(); // out.close(); // } - /** - * - */ - + /** */ private static void generateSource() throws IOException { final File dir = new File(indir); final String[] names = dir.list(); - final Set sources = new TreeSet(new ArrayComparator( - new Comparator[] {codepointComparator, codepointComparator})); + final Set sources = + new TreeSet( + new ArrayComparator( + new Comparator[] {codepointComparator, codepointComparator})); final int[] count = new int[1]; for (int i = 0; i < names.length; ++i) { @@ -2686,7 
+2919,7 @@ private static void generateSource() throws IOException { if (line.length() == 0) { continue; } - final String[] pieces = Utility.split(line,';'); + final String[] pieces = Utility.split(line, ';'); if (pieces.length < 2) { System.out.println("Error on: " + line); continue; @@ -2695,7 +2928,8 @@ private static void generateSource() throws IOException { String target = fromHexOld(pieces[1]); if (source.length() == 0 || target.length() == 0) { - throw new IllegalArgumentException("zero-length item: " + count[0] + ":\t" + line); + throw new IllegalArgumentException( + "zero-length item: " + count[0] + ":\t" + line); } // check for identical combining sequences @@ -2706,9 +2940,9 @@ private static void generateSource() throws IOException { } if (true) { - final int nsourceFirst = UTF16.charAt(nsource,0); + final int nsourceFirst = UTF16.charAt(nsource, 0); final String nsourceRest = nsource.substring(UTF16.getCharCount(nsourceFirst)); - final int ntargetFirst = UTF16.charAt(ntarget,0); + final int ntargetFirst = UTF16.charAt(ntarget, 0); final String ntargetRest = ntarget.substring(UTF16.getCharCount(ntargetFirst)); if (nsourceRest.equals(ntargetRest)) { source = UTF16.valueOf(nsourceFirst); @@ -2726,7 +2960,7 @@ private static void generateSource() throws IOException { in.close(); } final PrintWriter out = FileUtilities.openUTF8Writer(outdir, "confusableSource.txt"); - for (final Iterator it = sources.iterator(); it.hasNext();) { + for (final Iterator it = sources.iterator(); it.hasNext(); ) { final String[] sourceItem = (String[]) it.next(); writeSourceTargetLine(out, sourceItem[0], null, sourceItem[1], null, ARROW); } @@ -2763,10 +2997,10 @@ private static void generateConfusables(String indir, String outdir) throws IOEx newDir = outdir + "/source/"; } ds.writeSource(newDir, newName); - //ds.close("*"); + // ds.close("*"); total.addAll(ds); } - //total.close("t*"); + // total.close("t*"); // add normalized data // for (int i = 0; i <= 0x10FFFF; ++i) { @@ 
-2777,20 +3011,20 @@ private static void generateConfusables(String indir, String outdir) throws IOEx getSkipNFKD(); DataSet ds = new DataSet(); ds.addUnicodeMap(nfcMap, "nfc", "nfc"); - //ds.close("*"); + // ds.close("*"); total.addAll(ds); - //total.close("*"); + // total.close("*"); total.checkChar("ſ"); ds = new DataSet(); System.out.println(nfkcMap.get('ſ')); ds.addUnicodeMap(nfkcMap, "nfkc", "nfkc"); - //System.out.println(ds); + // System.out.println(ds); ds.checkChar("ſ"); - //ds.close("*"); + // ds.close("*"); ds.checkChar("ſ"); - //ds.write(outdir, "new-decomp.txt", false, false); + // ds.write(outdir, "new-decomp.txt", false, false); total.addAll(ds); ds.checkChar("ſ"); total.close("*"); @@ -2799,12 +3033,12 @@ private static void generateConfusables(String indir, String outdir) throws IOEx total.writeData(outdir + "/source/", "confusablesRaw.txt"); total.writeSummary(outdir + "../", "confusablesSummary.txt", false, null); total.writeSummary(outdir, "confusablesSummaryIdentifier.txt", true, null); - //total.writeSummary(outdir, "confusablesSummaryCyrillic.txt", true, + // total.writeSummary(outdir, "confusablesSummaryCyrillic.txt", true, // new UnicodeSet("[[:script=Cyrillic:][:script=common:][:script=inherited:]]")); total.writeWholeScripts(outdir + "../", "confusablesWholeScript.txt"); total.writeSourceOrder(outdir + "../", "confusables.txt", false, false); - //DataSet clean = total.clean(); - //clean.write(outdir, "confusables.txt", true); + // DataSet clean = total.clean(); + // clean.write(outdir, "confusables.txt", true); } /* BufferedReader in = FileUtilities.openUTF8Reader(Utility.BASE_DIR + "confusables/", "DiacriticFolding.txt"); @@ -2920,9 +3154,7 @@ private static void gen() throws IOException { // + DEFAULT_UCD.getName(source) // + " " + ARROW + " " + DEFAULT_UCD.getName(target); // } - /** - * - */ + /** */ /* private static void add(Map m, String source, String target, int count) { if (source.length() == 0 || target.length() == 0) return; 
if (preferSecondAsSource(source, target)) { @@ -2944,19 +3176,19 @@ private static void gen() throws IOException { }; */ - private static Integer - MARK_NOT_NFC = new Integer(50), - MARK_NFC = new Integer(40), - MARK_INPUT_LENIENT = new Integer(30), - MARK_INPUT_STRICT = new Integer(20), - MARK_OUTPUT = new Integer(10), - MARK_ASCII = new Integer(10); + private static Integer MARK_NOT_NFC = new Integer(50), + MARK_NFC = new Integer(40), + MARK_INPUT_LENIENT = new Integer(30), + MARK_INPUT_STRICT = new Integer(20), + MARK_OUTPUT = new Integer(10), + MARK_ASCII = new Integer(10); private static _BetterTargetIsLess betterTargetIsLess = new _BetterTargetIsLess(false); - private static _BetterTargetIsLess betterTargetIsLessFavorNeutral = new _BetterTargetIsLess(true); + private static _BetterTargetIsLess betterTargetIsLessFavorNeutral = + new _BetterTargetIsLess(true); private static boolean isXid(String x) { - return XID.containsAll(x); + return XID.containsAll(x); } private static class _BetterTargetIsLess implements Comparator { @@ -2966,6 +3198,7 @@ private static class _BetterTargetIsLess implements Comparator { _BetterTargetIsLess(boolean favorNeutral) { this.favorNeutral = favorNeutral; } + @Override public int compare(String a, String b) { if (a.equals(b)) { @@ -3001,7 +3234,7 @@ public int compare(String a, String b) { // longer is better (less) final int ca = UTF16.countCodePoint(a); final int cb = UTF16.countCodePoint(b); - if (ca != cb) { + if (ca != cb) { return ca > cb ? -1 : 1; } @@ -3054,7 +3287,8 @@ private int getValue(String a) { // lower is better } return lastValue; } - }; + } + ; // private static int compare(boolean a, boolean b) { // return a == b ? 0 : a ? 1 : -1; @@ -3095,29 +3329,28 @@ public static String getMostRecentAge(String a) { // private static String getCodeCharName(String a) { // return UCD.getCode(a) + "( " + a + " ) " + DEFAULT_UCD.getName(a); // } - /** - * Returns the part between - and . 
- */ + /** Returns the part between - and . */ private static String getReasonFromFilename(String type) { int period = type.lastIndexOf('.'); if (period < 0) { period = type.length(); } final int dash = type.lastIndexOf('-', period); - return type.substring(dash+1,period); + return type.substring(dash + 1, period); } - private static Normalizer modNFKC ; + private static Normalizer modNFKC; private static String getModifiedNKFC(String cf) { if (modNFKC == null) { - modNFKC = new Normalizer(UCD_Types.NFKC, Default.ucdVersion()); + modNFKC = new Normalizer(UCD_Types.NFKC, Default.ucdVersion()); modNFKC.setSpacingSubstitute(); } return modNFKC.normalize(cf); } - private static PrintWriter openAndWriteHeader(String dir, String filename, String title) throws IOException { + private static PrintWriter openAndWriteHeader(String dir, String filename, String title) + throws IOException { final PrintWriter out = FileUtilities.openUTF8Writer(dir, filename); out.print('\uFEFF'); out.println("# " + title); @@ -3132,10 +3365,11 @@ private static PrintWriter openAndWriteHeader(String dir, String filename, Strin } private static String fromHexOld(String targetString) { - String result = Utility.fromHex(targetString.trim(),true); + String result = Utility.fromHex(targetString.trim(), true); final String result2 = fromHexLenient(targetString); if (!result.equals(result2)) { - System.out.println("Changing hex\t" + targetString + "\t=>old\t" + result + "\t=>new\t" + result2); + System.out.println( + "Changing hex\t" + targetString + "\t=>old\t" + result + "\t=>new\t" + result2); result = result2; } return result; @@ -3151,22 +3385,24 @@ private static String fromHex(String hexOrChars) { } final String result2 = fromHexLenient(hexOrChars); if (!result.equals(result2)) { - System.out.println("Changing hex\t" + hexOrChars + "\t=>old\t" + result + "\t=>new\t" + result2); + System.out.println( + "Changing hex\t" + hexOrChars + "\t=>old\t" + result + "\t=>new\t" + result2); result = 
result2; } return result; } - private static Transform, String> myLinkageTransform = new Transform, String>() { - @Override - public String transform(Linkage source) { - String sourceString = source.reasons.toString(); - sourceString = sourceString.substring(1,sourceString.length()-1); - return source.result == null ? "" : - source.result.length() == 0 ? "\u21d2" : - ARROW + rtlProtect(source.result) + ARROW; - } - - }; - + private static Transform, String> myLinkageTransform = + new Transform, String>() { + @Override + public String transform(Linkage source) { + String sourceString = source.reasons.toString(); + sourceString = sourceString.substring(1, sourceString.length() - 1); + return source.result == null + ? "" + : source.result.length() == 0 + ? "\u21d2" + : ARROW + rtlProtect(source.result) + ARROW; + } + }; } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java index 180be85d5..42706826e 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateData.java @@ -1,33 +1,26 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateData.java,v $ - * $Date: 2009-08-18 23:38:46 $ - * $Revision: 1.43 $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateData.java,v $ $Date: + * 2009-08-18 23:38:46 $ $Revision: 1.43 $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; +import com.ibm.icu.text.UTF16; import java.io.IOException; import java.io.PrintWriter; import java.util.Iterator; import java.util.Set; import java.util.TreeSet; - import org.unicode.text.utility.Settings; import org.unicode.text.utility.UTF32; import org.unicode.text.utility.UnicodeDataFile; -import org.unicode.text.utility.UnicodeDataFile.FileInfix; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.UTF16; - - public class GenerateData implements UCD_Types { /* static final boolean DEBUG = false; @@ -748,21 +741,25 @@ public static void generateVerticalSlice(int startEnum, int endEnum, } */ - static public void writeNormalizerTestSuite(String directory, String fileName) throws IOException { + public static void writeNormalizerTestSuite(String directory, String fileName) + throws IOException { - final UnicodeDataFile fc = UnicodeDataFile.openAndWriteHeader(directory, fileName).setSkipCopyright(Settings.SKIP_COPYRIGHT); + final UnicodeDataFile fc = + UnicodeDataFile.openAndWriteHeader(directory, fileName) + .setSkipCopyright(Settings.SKIP_COPYRIGHT); final PrintWriter log = fc.out; - //final String suffix = FileInfix.getDefault().getFileSuffix(".txt"); - //final String newFile = directory + fileName + suffix; - //PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX); - //String[] batName = {""}; - //String mostRecent = org.unicode.cldr.util.Utility.generateBat(directory, fileName, UnicodeDataFile.getFileSuffix(true), batName); + // final String suffix = FileInfix.getDefault().getFileSuffix(".txt"); + // final String newFile = directory + fileName + suffix; + // PrintWriter log = Utility.openPrintWriter(newFile, Utility.UTF8_UNIX); + // String[] batName = {""}; + // String mostRecent = org.unicode.cldr.util.Utility.generateBat(directory, fileName, + // UnicodeDataFile.getFileSuffix(true), batName); final String[] example = 
new String[256]; - //log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false)); - //log.println(UnicodeDataFile.generateDateLine()); + // log.println("# " + fileName + UnicodeDataFile.getFileSuffix(false)); + // log.println(UnicodeDataFile.generateDateLine()); // log.println("#"); // log.println("# Normalization Test Suite"); // log.println("# Format:"); @@ -771,7 +768,8 @@ static public void writeNormalizerTestSuite(String directory, String fileName) t // log.println("# Comments are indicated with hash marks"); // log.println("#"); // log.println("# CONFORMANCE:"); - // log.println("# 1. The following invariants must be true for all conformant implementations"); + // log.println("# 1. The following invariants must be true for all conformant + // implementations"); // log.println("#"); // log.println("# NFC"); // log.println("# c2 == NFC(c1) == NFC(c2) == NFC(c3)"); @@ -782,13 +780,17 @@ static public void writeNormalizerTestSuite(String directory, String fileName) t // log.println("# c5 == NFD(c4) == NFD(c5)"); // log.println("#"); // log.println("# NFKC"); - // log.println("# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)"); + // log.println("# c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == + // NFKC(c5)"); // log.println("#"); // log.println("# NFKD"); - // log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)"); + // log.println("# c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == + // NFKD(c5)"); // log.println("#"); - // log.println("# 2. For every code point X assigned in this version of Unicode that is not specifically"); - // log.println("# listed in Part 1, the following invariants must be true for all conformant"); + // log.println("# 2. 
For every code point X assigned in this version of Unicode that + // is not specifically"); + // log.println("# listed in Part 1, the following invariants must be true for all + // conformant"); // log.println("# implementations:"); // log.println("#"); // log.println("# X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)"); @@ -807,7 +809,8 @@ static public void writeNormalizerTestSuite(String directory, String fileName) t log.println("#"); log.println("@Part1 # Character by character test"); - log.println("# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms."); + log.println( + "# All characters not explicitly occurring in c1 of Part 1 have identical NFC, D, KC, KD forms."); log.println("#"); for (int ch = 0; ch < 0x10FFFF; ++ch) { @@ -819,7 +822,7 @@ static public void writeNormalizerTestSuite(String directory, String fileName) t continue; } final String cc = UTF32.valueOf32(ch); - writeLine(cc,log, true); + writeLine(cc, log, true); } Utility.fixDot(); @@ -863,7 +866,7 @@ static public void writeNormalizerTestSuite(String directory, String fileName) t // add character with higher class, same class, lower class String sample = ""; - for (int i = c+1; i < example.length; ++i) { + for (int i = c + 1; i < example.length; ++i) { if (example[i] == null) { continue; } @@ -871,7 +874,7 @@ static public void writeNormalizerTestSuite(String directory, String fileName) t break; } sample += example[c]; - for (int i = c-1; i > 0; --i) { + for (int i = c - 1; i > 0; --i) { if (example[i] == null) { continue; } @@ -910,7 +913,7 @@ static public void writeNormalizerTestSuite(String directory, String fileName) t if (Default.ucd().getDecompositionType(ch) != CANONICAL) { continue; } - //if (!Default.nfc().isNormalized(ch)) continue; + // if (!Default.nfc().isNormalized(ch)) continue; final String s = Default.ucd().getDecompositionMapping(ch); if (UTF16.hasMoreCodePointsThan(s, 2)) { continue; @@ -930,15 +933,15 @@ static public void 
writeNormalizerTestSuite(String directory, String fileName) t } Utility.fixDot(); - for (final Iterator it = prilist.iterator(); it.hasNext();) { - writeLine((String)it.next(),log, false); + for (final Iterator it = prilist.iterator(); it.hasNext(); ) { + writeLine((String) it.next(), log, false); } Utility.fixDot(); log.println("#"); log.println("# EOF"); fc.close(); - //Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]); + // Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), batName[0]); } /* @@ -981,18 +984,35 @@ static void writeLine(String cc, PrintWriter log, boolean check) { // printout log.println( - Utility.hex(cc," ") + ";" + Utility.hex(c," ") + ";" + Utility.hex(d," ") + ";" - + Utility.hex(kc," ") + ";" + Utility.hex(kd," ") + Utility.hex(cc, " ") + + ";" + + Utility.hex(c, " ") + + ";" + + Utility.hex(d, " ") + + ";" + + Utility.hex(kc, " ") + + ";" + + Utility.hex(kd, " ") + "; # (" - + comma(cc) + "; " + comma(c) + "; " + comma(d) + "; " + comma(kc) + "; " + comma(kd) + "; " - + ") " + Default.ucd().getName(cc)); + + comma(cc) + + "; " + + comma(c) + + "; " + + comma(d) + + "; " + + comma(kc) + + "; " + + comma(kd) + + "; " + + ") " + + Default.ucd().getName(cc)); } static StringBuffer commaResult = new StringBuffer(); // not recursive!!! 
static final String comma(String s) { - //if (true) return s; + // if (true) return s; commaResult.setLength(0); int cp; for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) { diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateHanTransliterator.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateHanTransliterator.java index 06834678f..b61c527bd 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateHanTransliterator.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateHanTransliterator.java @@ -1,16 +1,25 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateHanTransliterator.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateHanTransliterator.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Replaceable; +import com.ibm.icu.text.ReplaceableString; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeMatcher; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; @@ -26,7 +35,6 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; @@ -35,19 +43,6 @@ import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Replaceable; -import com.ibm.icu.text.ReplaceableString; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeMatcher; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.ULocale; - - public final class GenerateHanTransliterator implements UCD_Types { static final boolean DISAMBIG = false; @@ -71,7 +66,8 @@ public static void readUnihan() throws java.io.IOException { log.println("Unihan check"); log.println(""); - final BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion(), true, Utility.UTF8); + final BufferedReader in = + Utility.openUnicodeFile("Unihan", Default.ucdVersion(), true, Utility.UTF8); final Map properties = new TreeMap(); @@ -101,10 +97,10 @@ public static void readUnihan() throws java.io.IOException { integerCode = new Integer(code); } - 
final int tabPos2 = line.indexOf('\t', tabPos+1); - final String property = line.substring(tabPos+1, tabPos2).trim(); + final int tabPos2 = line.indexOf('\t', tabPos + 1); + final String property = line.substring(tabPos + 1, tabPos2).trim(); - String propertyValue = line.substring(tabPos2+1).trim(); + String propertyValue = line.substring(tabPos2 + 1).trim(); if (propertyValue.indexOf("U+") >= 0) { propertyValue = fromHexUnicode.transliterate(propertyValue); } @@ -149,12 +145,16 @@ public static void readUnihan() throws java.io.IOException { Iterator it = props.iterator(); log.println("

    "); while (it.hasNext()) { - final String property = (String)it.next(); + final String property = (String) it.next(); final HanInfo values = (HanInfo) properties.get(property); log.println("
  1. " + property + "
    • "); - log.println("count: " + values.count - + ", min length: " + values.minLen - + ", max length: " + values.maxLen); + log.println( + "count: " + + values.count + + ", min length: " + + values.minLen + + ", max length: " + + values.maxLen); log.println("
    • samples:"); Utility.print(log, values.samples, "; "); log.println("
  2. "); @@ -178,12 +178,22 @@ public static void readUnihan() throws java.io.IOException { if (ovalue.equals(uvalue)) { redundants.add(key); } else if (++unequalCount < 5) { - log.println("

    " + Integer.toString(key.intValue(),16) - + ": " + ovalue + ", " + uvalue + "

    "); + log.println( + "

    " + + Integer.toString(key.intValue(), 16) + + ": " + + ovalue + + ", " + + uvalue + + "

    "); } } - log.println("

    Total Unique: " + (otherInfo.size() - redundants.size()) - + "(out of" + otherInfo.size() + ")

    "); + log.println( + "

    Total Unique: " + + (otherInfo.size() - redundants.size()) + + "(out of" + + otherInfo.size() + + ")

    "); } log.println("

    Checking Redundants for kTotalStrokes

    "); @@ -192,7 +202,7 @@ public static void readUnihan() throws java.io.IOException { final Map kTotalStrokesMap = ((HanInfo) properties.get("kTotalStrokes")).map; final int[] radCount = new int[512]; it = kRSUnicodeMap.keySet().iterator(); - while(it.hasNext()) { + while (it.hasNext()) { final Integer key = (Integer) it.next(); final String uvalue = (String) kRSUnicodeMap.get(key); if (uvalue.endsWith(".0")) { @@ -201,7 +211,7 @@ public static void readUnihan() throws java.io.IOException { continue; } final int rs = getRadicalStroke(uvalue); - radCount[rs>>8] = Integer.parseInt(tvalue); + radCount[rs >> 8] = Integer.parseInt(tvalue); } } @@ -209,22 +219,32 @@ public static void readUnihan() throws java.io.IOException { it = kTotalStrokesMap.keySet().iterator(); unequalCount = 0; redundants.clear(); - while(it.hasNext()) { + while (it.hasNext()) { final Integer key = (Integer) it.next(); final String uvalue = (String) kRSUnicodeMap.get(key); final int rs = getRadicalStroke(uvalue); final String tvalue = (String) kTotalStrokesMap.get(key); final int t = Integer.parseInt(tvalue); - final int projected = radCount[rs>>8] + (rs & 0xFF); + final int projected = radCount[rs >> 8] + (rs & 0xFF); if (t == projected) { redundants.add(key); } else if (++unequalCount < 5) { - log.println("

    " + Integer.toString(key.intValue(),16) - + ": " + t + ", " + projected + "

    "); - } - } - log.println("

    Total Unique: " + (kTotalStrokesMap.size() - redundants.size()) - + "(out of" + kTotalStrokesMap.size() + ")

    "); + log.println( + "

    " + + Integer.toString(key.intValue(), 16) + + ": " + + t + + ", " + + projected + + "

    "); + } + } + log.println( + "

    Total Unique: " + + (kTotalStrokesMap.size() - redundants.size()) + + "(out of" + + kTotalStrokesMap.size() + + ")

    "); log.println(""); in.close(); @@ -233,13 +253,13 @@ public static void readUnihan() throws java.io.IOException { static int getRadicalStroke(String s) { int dotPos = s.indexOf('.'); - final int strokes = Integer.parseInt(s.substring(dotPos+1)); + final int strokes = Integer.parseInt(s.substring(dotPos + 1)); int radical = 0; if (s.charAt(dotPos - 1) == '\'') { radical = 256; --dotPos; } - radical += Integer.parseInt(s.substring(0,dotPos)); + radical += Integer.parseInt(s.substring(0, dotPos)); return (radical << 8) + strokes; } @@ -297,11 +317,18 @@ public static void fixedMandarin() throws IOException { final UnicodeSet gotAtLeastOne = new UnicodeSet(gotMandarin).addAll(gotHanyu); final Map outmap = new TreeMap(Collator.getInstance(new ULocale("zh"))); for (final UnicodeSetIterator it = new UnicodeSetIterator(gotAtLeastOne); it.next(); ) { - //String code = UTF16.valueOf(it.codepoint); + // String code = UTF16.valueOf(it.codepoint); final String hanyu = (String) kHanyuPinlu.getValue(it.codepoint); final String mandarin = (String) kMandarin.getValue(it.codepoint); - final String hPinyin = hanyu == null ? null : digitPinyin_accentPinyin.transliterate(getUpTo(hanyu,'(')); - final String mPinyin = mandarin == null ? null : digitPinyin_accentPinyin.transliterate(getUpTo(mandarin.toLowerCase(),' ')); + final String hPinyin = + hanyu == null + ? null + : digitPinyin_accentPinyin.transliterate(getUpTo(hanyu, '(')); + final String mPinyin = + mandarin == null + ? null + : digitPinyin_accentPinyin.transliterate( + getUpTo(mandarin.toLowerCase(), ' ')); final String uPinyin = hPinyin != null ? 
hPinyin : mPinyin; UnicodeSet s = (UnicodeSet) outmap.get(uPinyin); if (s == null) { @@ -312,7 +339,7 @@ public static void fixedMandarin() throws IOException { } final String filename = "Raw_Transliterator_Han_Latin.txt"; final PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR, filename); - for (final Iterator it = outmap.keySet().iterator(); it.hasNext();) { + for (final Iterator it = outmap.keySet().iterator(); it.hasNext(); ) { final String pinyin = (String) it.next(); final UnicodeSet uset = (UnicodeSet) outmap.get(pinyin); if (uset.size() == 1) { @@ -329,14 +356,16 @@ public static void fixedMandarin() throws IOException { public static class PairComparator implements Comparator { Comparator first; Comparator second; + PairComparator(Comparator first, Comparator second) { this.first = first; this.second = second; } + @Override public int compare(Object o1, Object o2) { - final Pair p1 = (Pair)o1; - final Pair p2 = (Pair)o2; + final Pair p1 = (Pair) o1; + final Pair p2 = (Pair) o2; final int result = first.compare(p1.first, p2.first); if (result != 0) { return result; @@ -365,8 +394,15 @@ public static void quickMandarin() throws Exception { final String code = UTF16.valueOf(it.codepoint); final String hanyu = (String) kHanyuPinlu.getValue(it.codepoint); final String mandarin = (String) kMandarin.getValue(it.codepoint); - final String hPinyin = hanyu == null ? null : digitPinyin_accentPinyin.transliterate(getUpTo(hanyu,'(')); - final String mPinyin = mandarin == null ? null : digitPinyin_accentPinyin.transliterate(getUpTo(mandarin.toLowerCase(),' ')); + final String hPinyin = + hanyu == null + ? null + : digitPinyin_accentPinyin.transliterate(getUpTo(hanyu, '(')); + final String mPinyin = + mandarin == null + ? null + : digitPinyin_accentPinyin.transliterate( + getUpTo(mandarin.toLowerCase(), ' ')); final String uPinyin = hPinyin != null ? 
hPinyin : mPinyin; String iPinyin = icuPinyin.transliterate(code).trim(); @@ -386,13 +422,21 @@ public static void quickMandarin() throws Exception { } if (gPinyin != null && !gPinyin.equals(uPinyin)) { - log.println((++counter) + "\t" + Utility.hex(it.codepoint) + "\t" + code - + "\t" + (uPinyin == null ? "" : uPinyin) - + "\t" + (iPinyin == null ? "" : iPinyin.equals(gPinyin) ? "" : iPinyin) - + "\t" + (gPinyin == null ? "" : gPinyin) - + "\t" + (hanyu == null ? "" : hanyu + " / ") - + (mandarin == null ? "" : mandarin) - ); + log.println( + (++counter) + + "\t" + + Utility.hex(it.codepoint) + + "\t" + + code + + "\t" + + (uPinyin == null ? "" : uPinyin) + + "\t" + + (iPinyin == null ? "" : iPinyin.equals(gPinyin) ? "" : iPinyin) + + "\t" + + (gPinyin == null ? "" : gPinyin) + + "\t" + + (hanyu == null ? "" : hanyu + " / ") + + (mandarin == null ? "" : mandarin)); if (hanyu != null) { hCount++; } @@ -404,26 +448,42 @@ public static void quickMandarin() throws Exception { if (isEqualOrNull(uPinyin, iPinyin)) { continue; } - log.println((++counter) + "\t" + Utility.hex(it.codepoint) + "\t" + code - + "\t" + (uPinyin == null ? "" : uPinyin) - + "\t" + (iPinyin == null ? "" : iPinyin) - + "\t" + (gPinyin == null ? "" : gPinyin) - + "\t" + (hanyu == null ? "" : hanyu + " / ") - + (mandarin == null ? "" : mandarin) - ); + log.println( + (++counter) + + "\t" + + Utility.hex(it.codepoint) + + "\t" + + code + + "\t" + + (uPinyin == null ? "" : uPinyin) + + "\t" + + (iPinyin == null ? "" : iPinyin) + + "\t" + + (gPinyin == null ? "" : gPinyin) + + "\t" + + (hanyu == null ? "" : hanyu + " / ") + + (mandarin == null ? 
"" : mandarin)); } log.println("kHanyuPinlu count: " + hCount); - final Collator col = Collator.getInstance(new Locale("zh","","PINYIN")); + final Collator col = Collator.getInstance(new Locale("zh", "", "PINYIN")); final UnicodeSet tailored = col.getTailoredSet().addAll(gotAtLeastOne); - final Collator pinyinCollator = new RuleBasedCollator( - "&[before 1] a < \u0101 <<< \u0100 << \u00E1 <<< \u00C1 << \u01CE <<< \u01CD << \u00E0 <<< \u00C0 << a <<< A" + - "&[before 1] e < \u0113 <<< \u0112 << \u00E9 <<< \u00C9 << \u011B <<< \u011A << \u00E8 <<< \u00C8 << e <<< A" + - "&[before 1] i < \u012B <<< \u012A << \u00ED <<< \u00CD << \u01D0 <<< \u01CF << \u00EC <<< \u00CC << i <<< I" + - "&[before 1] o < \u014D <<< \u014C << \u00F3 <<< \u00D3 << \u01D2 <<< \u01D1 << \u00F2 <<< \u00D2 << o <<< O" + - "&[before 1] u < \u016B <<< \u016A << \u00FA <<< \u00DA << \u01D4 <<< \u01D3 << \u00F9 <<< \u00D9 << u <<< U" + - " << \u01D6 <<< \u01D5 << \u01D8 <<< \u01D7 << \u01DA <<< \u01D9 << \u01DC <<< \u01DB << \u00FC"); - printSortedChars("ICU_Pinyin_Sort.txt", col, tailored, reformed, kHanyuPinlu, kMandarin, pinyinCollator); + final Collator pinyinCollator = + new RuleBasedCollator( + "&[before 1] a < \u0101 <<< \u0100 << \u00E1 <<< \u00C1 << \u01CE <<< \u01CD << \u00E0 <<< \u00C0 << a <<< A" + + "&[before 1] e < \u0113 <<< \u0112 << \u00E9 <<< \u00C9 << \u011B <<< \u011A << \u00E8 <<< \u00C8 << e <<< A" + + "&[before 1] i < \u012B <<< \u012A << \u00ED <<< \u00CD << \u01D0 <<< \u01CF << \u00EC <<< \u00CC << i <<< I" + + "&[before 1] o < \u014D <<< \u014C << \u00F3 <<< \u00D3 << \u01D2 <<< \u01D1 << \u00F2 <<< \u00D2 << o <<< O" + + "&[before 1] u < \u016B <<< \u016A << \u00FA <<< \u00DA << \u01D4 <<< \u01D3 << \u00F9 <<< \u00D9 << u <<< U" + + " << \u01D6 <<< \u01D5 << \u01D8 <<< \u01D7 << \u01DA <<< \u01D9 << \u01DC <<< \u01DB << \u00FC"); + printSortedChars( + "ICU_Pinyin_Sort.txt", + col, + tailored, + reformed, + kHanyuPinlu, + kMandarin, + pinyinCollator); /* MultiComparator 
mcol = new MultiComparator(new Comparator[] { new UnicodeMapComparator(reformed, pinyinCollator), col}); @@ -435,14 +495,16 @@ public static void quickMandarin() throws Exception { static class UnicodeMapComparator implements Comparator { UnicodeMap map; Comparator comp; + UnicodeMapComparator(UnicodeMap map, Comparator comp) { this.map = map; this.comp = comp; } + @Override public int compare(Object o1, Object o2) { - final int c1 = UTF16.charAt((String) o1,0); - final int c2 = UTF16.charAt((String) o2,0); + final int c1 = UTF16.charAt((String) o1, 0); + final int c2 = UTF16.charAt((String) o2, 0); final Object v1 = map.getValue(c1); final Object v2 = map.getValue(c2); if (v1 == null) { @@ -460,7 +522,7 @@ public int compare(Object o1, Object o2) { static class MultiComparator implements Comparator { private final Comparator[] comparators; - public MultiComparator (Comparator[] comparators) { + public MultiComparator(Comparator[] comparators) { this.comparators = comparators; } @@ -477,17 +539,23 @@ public int compare(Object arg0, Object arg1) { continue; } if (result > 0) { - return i+1; + return i + 1; } - return -(i+1); + return -(i + 1); } return 0; } } - private static void printSortedChars(String file, Comparator col, UnicodeSet tailored, - UnicodeMap map, UnicodeMap hanyu, UnicodeMap mand, Comparator p2) - throws IOException { + private static void printSortedChars( + String file, + Comparator col, + UnicodeSet tailored, + UnicodeMap map, + UnicodeMap hanyu, + UnicodeMap mand, + Comparator p2) + throws IOException { final Set set = new TreeSet(col); final PrintWriter pw = Utility.openPrintWriterGenDir("log/" + file, Utility.UTF8_WINDOWS); for (final UnicodeSetIterator it = new UnicodeSetIterator(tailored); it.next(); ) { @@ -496,29 +564,29 @@ private static void printSortedChars(String file, Comparator col, UnicodeSet tai String lastm = ""; String lasts = ""; for (final Iterator it2 = set.iterator(); it2.hasNext(); ) { - final String s = (String)it2.next(); 
- String m = map == null ? null : (String) map.getValue(UTF16.charAt(s,0)); + final String s = (String) it2.next(); + String m = map == null ? null : (String) map.getValue(UTF16.charAt(s, 0)); if (m == null) { m = ""; } String info = m; - if (p2.compare(lastm,m) > 0) { + if (p2.compare(lastm, m) > 0) { info = info + "\t" + lastm + " > " + m + "\t"; Object temp; - temp = hanyu.getValue(UTF16.charAt(lasts,0)); + temp = hanyu.getValue(UTF16.charAt(lasts, 0)); if (temp != null) { info += "[" + temp + "]"; } - temp = mand.getValue(UTF16.charAt(lasts,0)); + temp = mand.getValue(UTF16.charAt(lasts, 0)); if (temp != null) { info += "[" + temp + "]"; } info += " > "; - temp = hanyu.getValue(UTF16.charAt(s,0)); + temp = hanyu.getValue(UTF16.charAt(s, 0)); if (temp != null) { info += "[" + temp + "]"; } - temp = mand.getValue(UTF16.charAt(s,0)); + temp = mand.getValue(UTF16.charAt(s, 0)); if (temp != null) { info += "[" + temp + "]"; } @@ -530,7 +598,9 @@ private static void printSortedChars(String file, Comparator col, UnicodeSet tai pw.close(); } - static void addField(String dir, String file, int hexCodeFieldNumber, int valueNumber, UnicodeMap result) throws IOException { + static void addField( + String dir, String file, int hexCodeFieldNumber, int valueNumber, UnicodeMap result) + throws IOException { final BufferedReader br = FileUtilities.openUTF8Reader((String) dir, (String) file); while (true) { String line = br.readLine(); @@ -547,7 +617,7 @@ static void addField(String dir, String file, int hexCodeFieldNumber, int valueN if (line.startsWith("#") || line.length() == 0) { continue; } - final String[] pieces = Utility.split(line,'\t'); + final String[] pieces = Utility.split(line, '\t'); result.put(Integer.parseInt(pieces[hexCodeFieldNumber], 16), pieces[valueNumber]); } br.close(); @@ -559,12 +629,13 @@ static boolean isEqualOrNull(String a, String b) { } return a.equals(b); } + public static String getUpTo(String s, char ch) { final int pos = s.indexOf(ch); if (pos 
< 0) { return s; } - return s.substring(0,pos); + return s.substring(0, pos); } public static void main(int typeIn) throws IOException { @@ -579,24 +650,24 @@ public static void main(int typeIn) throws IOException { System.out.println("Quoting: " + quoteNonLetters.toRules(true)); System.out.println("Quoting: " + quoteNonLetters.toRules(true)); - String key; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn String filename; switch (type) { - case DEFINITION: - key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn - filename = "Raw_Transliterator_Han_Latin_Definition"; - break; - case JAPANESE: - key = "kJapaneseOn"; - filename = "Raw_Transliterator_ja_Latin"; - break; - case CHINESE: - key = "kMandarin"; - filename = "Raw_Transliterator_Han_Latin"; - break; - default: throw new IllegalArgumentException("Unexpected option: must be 0..2"); + case DEFINITION: + key = "kDefinition"; // kMandarin, kKorean, kJapaneseKun, kJapaneseOn + filename = "Raw_Transliterator_Han_Latin_Definition"; + break; + case JAPANESE: + key = "kJapaneseOn"; + filename = "Raw_Transliterator_ja_Latin"; + break; + case CHINESE: + key = "kMandarin"; + filename = "Raw_Transliterator_Han_Latin"; + break; + default: + throw new IllegalArgumentException("Unexpected option: must be 0..2"); } filename += Default.ucd().getVersion() + ".txt"; @@ -619,7 +690,9 @@ public static void main(int typeIn) throws IOException { log.println(); log.println("@Unihan Data"); log.println(); - out2 = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR, "unihan_kmandarinDump.txt"); + out2 = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_DIR, "unihan_kmandarinDump.txt"); readUnihanData(key); @@ -640,31 +713,52 @@ public static void main(int typeIn) throws IOException { } } - it = unihanMap.keySet().iterator(); final Map badPinyin = new TreeMap(); - final PrintWriter out2 = Utility.openPrintWriterGenDir("log/Raw_mapping.txt", Utility.UTF8_WINDOWS); + final PrintWriter out2 = + 
Utility.openPrintWriterGenDir("log/Raw_mapping.txt", Utility.UTF8_WINDOWS); try { while (it.hasNext()) { final String keyChar = (String) it.next(); final String def = (String) unihanMap.get(keyChar); if (!isValidPinyin(def)) { final String fixedDef = fixPinyin(def); - err.println(Default.ucd().getCode(keyChar) + "\t" + keyChar + "\t" + fixedDef + "\t#" + def - + (fixedDef.equals(def) ? " FAIL" : "")); + err.println( + Default.ucd().getCode(keyChar) + + "\t" + + keyChar + + "\t" + + fixedDef + + "\t#" + + def + + (fixedDef.equals(def) ? " FAIL" : "")); Utility.addToSet(badPinyin, def, keyChar); } // check both ways final String digitDef = accentPinyin_digitPinyin.transliterate(def); final String accentDef = digitPinyin_accentPinyin.transliterate(digitDef); if (!accentDef.equals(def)) { - err.println("Failed Digit Pinyin: " - + Default.ucd().getCode(keyChar) + "\t" + keyChar + "\t" - + def + " => " + digitDef + " => " + accentDef); + err.println( + "Failed Digit Pinyin: " + + Default.ucd().getCode(keyChar) + + "\t" + + keyChar + + "\t" + + def + + " => " + + digitDef + + " => " + + accentDef); } - out2.println(toHexUnicode.transliterate(keyChar) - + "\tkMandarin\t" + digitDef.toUpperCase() + "\t# " + keyChar + ";\t" + def); + out2.println( + toHexUnicode.transliterate(keyChar) + + "\tkMandarin\t" + + digitDef.toUpperCase() + + "\t# " + + keyChar + + ";\t" + + def); } err.println(); err.println("Summary of Bad syllables"); @@ -700,8 +794,7 @@ public static void main(int typeIn) throws IOException { while (it.hasNext()) { final String keyChar = (String) it.next(); String def = (String) unihanMap.get(keyChar); - if (def == null) - { + if (def == null) { continue; // skipping // sort longer definitions first! 
} @@ -713,15 +806,19 @@ public static void main(int typeIn) throws IOException { def += " " + toSub.transliterate(String.valueOf(defCount)); } - lenSet.add(new Pair( - new Pair(new Integer(-UTF16.countCodePoint(keyChar)), - new Pair(new Integer(-def.length()), new Integer(rank++))), + lenSet.add( + new Pair( + new Pair( + new Integer(-UTF16.countCodePoint(keyChar)), + new Pair(new Integer(-def.length()), new Integer(rank++))), + new Pair(keyChar, def))); + backSet.add( + new Pair( + new Pair( + new Integer(-def.toString().length()), new Integer(rank++)), new Pair(keyChar, def))); - backSet.add(new Pair( - new Pair(new Integer(-def.toString().length()), new Integer(rank++)), - new Pair(keyChar, def))); - definitionCount.put(oldDef, new Integer(defCount+1)); + definitionCount.put(oldDef, new Integer(defCount + 1)); gotAlready.add(keyChar); } @@ -742,15 +839,21 @@ public static void main(int typeIn) throws IOException { def += " " + toSub.transliterate(String.valueOf(defCount)); } - lenSet.add(new Pair( - new Pair(new Integer(-UTF16.countCodePoint(keyChar)), - new Pair(new Integer(-def.toString().length()), new Integer(rank++))), + lenSet.add( + new Pair( + new Pair( + new Integer(-UTF16.countCodePoint(keyChar)), + new Pair( + new Integer(-def.toString().length()), + new Integer(rank++))), + new Pair(keyChar, def))); + backSet.add( + new Pair( + new Pair( + new Integer(-def.toString().length()), new Integer(rank++)), new Pair(keyChar, def))); - backSet.add(new Pair( - new Pair(new Integer(-def.toString().length()), new Integer(rank++)), - new Pair(keyChar, def))); - definitionCount.put(oldDef, new Integer(defCount+1)); + definitionCount.put(oldDef, new Integer(defCount + 1)); } // First, find the ones that we want a definition for, based on the ranking @@ -770,8 +873,11 @@ public static void main(int typeIn) throws IOException { final String def = (String) p.second; if (!gotIt.contains(def)) { if (unihanNonSingular) { - 
out.println(quoteNonLetters.transliterate(keyChar) - + " < " + quoteNonLetters.transliterate(def) + ";"); + out.println( + quoteNonLetters.transliterate(keyChar) + + " < " + + quoteNonLetters.transliterate(def) + + ";"); } else { doReverse.add(keyChar); } @@ -780,7 +886,6 @@ public static void main(int typeIn) throws IOException { } } - it = lenSet.iterator(); while (it.hasNext()) { Pair p = (Pair) it.next(); @@ -790,9 +895,12 @@ public static void main(int typeIn) throws IOException { final String def = (String) p.second; final String rel = !DO_SIMPLE && doReverse.contains(keyChar) ? "<>" : ">"; - out.println(quoteNonLetters.transliterate(keyChar) + rel - + quoteNonLetters.transliterate(def) + "|\\ ;"); - //if (TESTING) System.out.println("# " + code + " > " + definition); + out.println( + quoteNonLetters.transliterate(keyChar) + + rel + + quoteNonLetters.transliterate(def) + + "|\\ ;"); + // if (TESTING) System.out.println("# " + code + " > " + definition); } out.println("\u3002 <> '.';"); @@ -806,7 +914,6 @@ public static void main(int typeIn) throws IOException { out.println(":: fullwidth-halfwidth ();"); */ - System.out.println("Total: " + totalCount); System.out.println("Defined Count: " + count); @@ -876,39 +983,33 @@ public static void main(int typeIn) throws IOException { } } - //http://fog.ccsf.cc.ca.us/~jliou/phonetic.htm + // http://fog.ccsf.cc.ca.us/~jliou/phonetic.htm // longer ones must be AFTER! // longer ones must be AFTER! 
static final String[] initialPinyin = { - "", - "b", "p", "m", "f", - "d", "t", "n", "l", - "z", "c", "s", - "zh", "ch", "sh", "r", - "j", "q", "x", - "g", "k", "h", - "y", "w"}; // added to make checking simpler + "", "b", "p", "m", "f", "d", "t", "n", "l", "z", "c", "s", "zh", "ch", "sh", "r", "j", "q", + "x", "g", "k", "h", "y", "w" + }; // added to make checking simpler static final String[] finalPinyin = { - "a", "ai", "ao", "an", "ang", - "o", "ou", "ong", - "e", "ei", "er", "en", "eng", - "i", "ia", "iao", "ie", "iu", "ian", "in", "iang", "ing", "iong", - "u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ueng", - "ü", "üe", "üan", "ün" + "a", "ai", "ao", "an", "ang", "o", "ou", "ong", "e", "ei", "er", "en", "eng", "i", "ia", + "iao", "ie", "iu", "ian", "in", "iang", "ing", "iong", "u", "ua", "uo", "uai", "ui", "uan", + "un", "uang", "ueng", "ü", "üe", "üan", "ün" }; // Don't bother with the following rules; just add w,y to initials // When “i” stands alone, a “y” will be added before it as “yi”. // If “i” is the first letter of the syllable it will be changed to “y”. // When “u” stands alone, a “w” will be added before it as “wu”. - // If “u” is the first letter of the syllable it will be changed to “w”. e.g. “uang -> wang”. + // If “u” is the first letter of the syllable it will be changed to “w”. e.g. “uang -> + // wang”. // When “ü” stands alone, a “y” will be added before it and “ü” will be changed to “u” as “yu”. - // If “ü” is the first letter of the syllable, then the spelling will be changed to “yu”. e.g. “üan -> yuan”. - //Note: The nasal final “ueng” never occurs after an initial but always form a syllable by itself. + // If “ü” is the first letter of the syllable, then the spelling will be changed to “yu”. + // e.g. “üan -> yuan”. + // Note: The nasal final “ueng” never occurs after an initial but always form a syllable by + // itself. // The “o” in “iou” is hidden, so it will be wrote as “iu”. But, don’t forget to pronounce it. 
// The “e” in “uei” is hidden, so it will be wrote as “ui”. But, don’t forget to pronounce it. - public static final String[] pinyin_bopomofo = { "a", "\u311a", "ai", "\u311e", @@ -951,7 +1052,7 @@ public static void main(int typeIn) throws IOException { "chong", "\u3114\u3121\u3125", "chou", "\u3114\u3121", "chu", "\u3114\u3128", - //"chua", "XXX", + // "chua", "XXX", "chuai", "\u3114\u3128\u311e", "chuan", "\u3114\u3128\u3122", "chuang", "\u3114\u3128\u3124", @@ -1328,8 +1429,9 @@ public static void main(int typeIn) throws IOException { }; static final Set fullPinyin = new TreeSet(); + static { - for (int i = 0; i < pinyin_bopomofo.length; i+= 2) { + for (int i = 0; i < pinyin_bopomofo.length; i += 2) { fullPinyin.add(pinyin_bopomofo[i]); } } @@ -1344,10 +1446,10 @@ static boolean isValidPinyin(String s) { static boolean isValidPinyin2(String s) { s = dropTones.transliterate(s); - for (int i = initialPinyin.length-1; i >= 0; --i) { + for (int i = initialPinyin.length - 1; i >= 0; --i) { if (s.startsWith(initialPinyin[i])) { final String end = s.substring(initialPinyin[i].length()); - for (int j = finalPinyin.length-1; j >= 0; --j) { + for (int j = finalPinyin.length - 1; j >= 0; --j) { if (end.equals(finalPinyin[j])) { return true; } @@ -1407,47 +1509,48 @@ static boolean isValidPinyin2(String s) { U+7878 · nüè #nuè */ - static Transliterator fixTypos = Transliterator.createFromRules("fix_typos", - "$cons=[bcdfghjklmnpqrstvwxyz];" - +"$nlet=[^[:Letter:][:Mark:]];" - +"$cons{iou}$nlet > iu;" - +"$cons{em}$nlet > an;" - +"$cons{uen}$nlet > ueng;" - +"$cons{ve}$nlet > üe;" - +"$cons{v}$nlet > ü;" - +"$cons{yue}$nlet > iu;" - +"$cons{yng}$nlet > ing;" - +"$cons{yu}$nlet > iu;" - //+"$cons{ue} > üe;" - +"jj > j;" - //+"$nlet{ng}$nlet > eng;" - //+"$nlet{n}$nlet > en;" - //+"$nlet{m}$nlet > en;" - +"$nlet{au}$nlet > ao;" - - // new fixes - +"zhueng}$nlet > zhong;" - +"zhuen}$nlet > zhuan;" - +"lue > lüe;" - +"liong > liang;" - +"nue > nüe;" - +"chua > chuo;" - +"yian > 
yan;" - +"yie > ye;" - +"lüan > luan;" - +"iong > yong;" - , Transliterator.FORWARD); - + static Transliterator fixTypos = + Transliterator.createFromRules( + "fix_typos", + "$cons=[bcdfghjklmnpqrstvwxyz];" + + "$nlet=[^[:Letter:][:Mark:]];" + + "$cons{iou}$nlet > iu;" + + "$cons{em}$nlet > an;" + + "$cons{uen}$nlet > ueng;" + + "$cons{ve}$nlet > üe;" + + "$cons{v}$nlet > ü;" + + "$cons{yue}$nlet > iu;" + + "$cons{yng}$nlet > ing;" + + "$cons{yu}$nlet > iu;" + // +"$cons{ue} > üe;" + + "jj > j;" + // +"$nlet{ng}$nlet > eng;" + // +"$nlet{n}$nlet > en;" + // +"$nlet{m}$nlet > en;" + + "$nlet{au}$nlet > ao;" + + // new fixes + + "zhueng}$nlet > zhong;" + + "zhuen}$nlet > zhuan;" + + "lue > lüe;" + + "liong > liang;" + + "nue > nüe;" + + "chua > chuo;" + + "yian > yan;" + + "yie > ye;" + + "lüan > luan;" + + "iong > yong;", + Transliterator.FORWARD); static String fixPinyin(String s) { final String original = s; - //err.println("Source: " + s); + // err.println("Source: " + s); s = accentPinyin_digitPinyin.transliterate(s); - //err.println("Digit: " + s); + // err.println("Digit: " + s); s = fixTypos.transliterate(s); - //err.println("fixed: " + s); + // err.println("fixed: " + s); s = digitPinyin_accentPinyin.transliterate(s); - //err.println("Result: " + s); + // err.println("Result: " + s); if (isValidPinyin(s)) { return s; } @@ -1478,7 +1581,10 @@ static void readFrequencyData(int type) throws java.io.IOException { if (type == CHINESE) { System.out.println("Reading chinese_frequency.txt"); - br = Utility.openReadFile(Settings.UnicodeTools.DATA_DIR + "dict/chinese_frequency.txt", Utility.UTF8); + br = + Utility.openReadFile( + Settings.UnicodeTools.DATA_DIR + "dict/chinese_frequency.txt", + Utility.UTF8); counter = 0; while (true) { line = Utility.readDataLine(br); @@ -1490,9 +1596,9 @@ static void readFrequencyData(int type) throws java.io.IOException { } Utility.dot(counter++); final int tabPos = line.indexOf('\t'); - final int rank = 
Integer.parseInt(line.substring(0,tabPos)); - final int cp = line.charAt(tabPos+1); - //if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp)); + final int rank = Integer.parseInt(line.substring(0, tabPos)); + final int cp = line.charAt(tabPos + 1); + // if ((rank % 100) == 0) System.out.println(rank + ", " + Utility.hex(cp)); combinedRank.add(new Pair(new Integer(rank), UTF16.valueOf(cp))); } br.close(); @@ -1501,7 +1607,10 @@ static void readFrequencyData(int type) throws java.io.IOException { if (type == JAPANESE) { System.out.println("Reading japanese_frequency.txt"); - br = Utility.openReadFile(Settings.UnicodeTools.DATA_DIR + "dict/japanese_frequency.txt", Utility.UTF8); + br = + Utility.openReadFile( + Settings.UnicodeTools.DATA_DIR + "dict/japanese_frequency.txt", + Utility.UTF8); final Map japaneseMap = new HashMap(); while (true) { line = Utility.readDataLine(br); @@ -1514,15 +1623,17 @@ static void readFrequencyData(int type) throws java.io.IOException { Utility.dot(counter++); final int tabPos = line.indexOf(' '); - final int tabPos2 = line.indexOf(' ', tabPos+1); - final int freq = Integer.parseInt(line.substring(tabPos2+1)); + final int tabPos2 = line.indexOf(' ', tabPos + 1); + final int freq = Integer.parseInt(line.substring(tabPos2 + 1)); - for (int i = tabPos+1; i < tabPos2; ++i) { + for (int i = tabPos + 1; i < tabPos2; ++i) { final int cp = line.charAt(i); final int script = Default.ucd().getScript(cp); if (script != HAN_SCRIPT) { - if (script != HIRAGANA_SCRIPT && script != KATAKANA_SCRIPT - && cp != 0x30FB && cp != 0x30FC) { + if (script != HIRAGANA_SCRIPT + && script != KATAKANA_SCRIPT + && cp != 0x30FB + && cp != 0x30FC) { System.out.println("Huh: " + Default.ucd().getCodeAndName(cp)); } continue; @@ -1540,10 +1651,8 @@ static void readFrequencyData(int type) throws java.io.IOException { final Comparable val = (Comparable) japaneseMap.get(key); combinedRank.add(new Pair(new Integer(++countJapanese), key)); } - } - int 
overallRank = 0; it = combinedRank.iterator(); @@ -1557,7 +1666,7 @@ static void readFrequencyData(int type) throws java.io.IOException { // make up rankMap, rankList - while(it.hasNext()) { + while (it.hasNext()) { final Pair p = (Pair) it.next(); if (showFrequency) { log.println(p.first + ", " + p.second); @@ -1686,7 +1795,6 @@ static void printCollection(PrintWriter p, Collection c) { } } - static Map rankMap = new TreeMap(); // maps from single char strings to overall rank static List rankList = new ArrayList(10000); @@ -1699,7 +1807,9 @@ static void readCDICTDefinitions(int type) throws IOException { } System.out.println("Reading " + fname); - final BufferedReader br = Utility.openReadFile(Settings.UnicodeTools.DATA_DIR + "dict/" + fname, Utility.UTF8); + final BufferedReader br = + Utility.openReadFile( + Settings.UnicodeTools.DATA_DIR + "dict/" + fname, Utility.UTF8); int counter = 0; final String[] pieces = new String[50]; String line = ""; @@ -1715,33 +1825,33 @@ static void readCDICTDefinitions(int type) throws IOException { } Utility.dot(counter++); - final int pinyinStart = line.indexOf('['); - final int pinyinEnd = line.indexOf(']', pinyinStart+1); - final int defStart = line.indexOf('/', pinyinEnd+1); - final int defEnd = line.indexOf('/', defStart+1); + final int pinyinEnd = line.indexOf(']', pinyinStart + 1); + final int defStart = line.indexOf('/', pinyinEnd + 1); + final int defEnd = line.indexOf('/', defStart + 1); final int firstData = pinyinStart >= 0 ? 
pinyinStart : defStart; - final String word = line.substring(0,firstData).trim(); + final String word = line.substring(0, firstData).trim(); if (type == DEFINITION) { - definition = fixDefinition(line.substring(defStart+1, defEnd), line); + definition = fixDefinition(line.substring(defStart + 1, defEnd), line); addCheck(word, definition, line); } else if (pinyinStart >= 0) { - definition = line.substring(pinyinStart+1, pinyinEnd).trim(); + definition = line.substring(pinyinStart + 1, pinyinEnd).trim(); if (type == JAPANESE) { processEdict(word, definition, line); } else { definition = digitToPinyin(definition, line); - //definition = Utility.replace(definition, " ", "\\ "); + // definition = Utility.replace(definition, " ", "\\ "); addCheck(word, definition, line); } } } br.close(); } catch (final Exception e) { - throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e); + throw new ChainException( + "{0} Failed at {1}", new Object[] {new Integer(counter), line}, e); } } @@ -1752,7 +1862,9 @@ static void readOverrides(int type) throws IOException { final String fname = "Chinese_override.txt"; System.out.println("Reading " + fname); - final BufferedReader br = Utility.openReadFile(Settings.UnicodeTools.DATA_DIR + "dict/" + fname, Utility.UTF8); + final BufferedReader br = + Utility.openReadFile( + Settings.UnicodeTools.DATA_DIR + "dict/" + fname, Utility.UTF8); int counter = 0; final String[] pieces = new String[50]; String line = ""; @@ -1767,15 +1879,15 @@ static void readOverrides(int type) throws IOException { continue; } Utility.dot(counter++); - //System.out.println(line); + // System.out.println(line); // skip code - line=line.toLowerCase(); + line = line.toLowerCase(); final int wordStart = line.indexOf('\t') + 1; final int wordEnd = line.indexOf('\t', wordStart); final String word = line.substring(wordStart, wordEnd); - final String definition = fixPinyin(line.substring(wordEnd+1)); + final String definition = 
fixPinyin(line.substring(wordEnd + 1)); final String old = (String) unihanMap.get(word); if (old != null) { if (!old.equals(definition)) { @@ -1783,9 +1895,17 @@ static void readOverrides(int type) throws IOException { System.out.println("Overriding Failure"); noOverrideFailure = false; } - err.println("Overriding Failure: " + word - + "\t" + old + " " + toHexUnicode.transliterate(old) - + "\t" + definition + " " + toHexUnicode.transliterate(definition)); + err.println( + "Overriding Failure: " + + word + + "\t" + + old + + " " + + toHexUnicode.transliterate(old) + + "\t" + + definition + + " " + + toHexUnicode.transliterate(definition)); } } else { addCheck(word, definition, line); @@ -1794,17 +1914,17 @@ static void readOverrides(int type) throws IOException { } br.close(); } catch (final Exception e) { - throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e); + throw new ChainException( + "{0} Failed at {1}", new Object[] {new Integer(counter), line}, e); } } - /* - @Unihan Data + @Unihan Data -Bad pinyin data: \u4E7F ? LE -\u7684 ? de, de, dí, dì - */ + Bad pinyin data: \u4E7F ? LE + \u7684 ? 
de, de, dí, dì + */ static void fixChineseOverrides() throws IOException { @@ -1819,7 +1939,9 @@ static void fixChineseOverrides() throws IOException { final String pinyinPrefix = "Bad pinyin data: "; System.out.println("Reading " + fname); - final BufferedReader br = Utility.openReadFile(Settings.UnicodeTools.DATA_DIR + "dict/" + fname, Utility.UTF8); + final BufferedReader br = + Utility.openReadFile( + Settings.UnicodeTools.DATA_DIR + "dict/" + fname, Utility.UTF8); try { while (true) { line = Utility.readDataLine(br); @@ -1837,7 +1959,6 @@ static void fixChineseOverrides() throws IOException { } Utility.dot(counter++); - if (line.charAt(0) == '@') { continue; } @@ -1846,19 +1967,20 @@ static void fixChineseOverrides() throws IOException { } line = line.toLowerCase(); - //System.out.println(Default.ucd.getCode(line)); + // System.out.println(Default.ucd.getCode(line)); // skip code final int wordStart = line.indexOf('\t') + 1; final int wordEnd = line.indexOf('\t', wordStart); final String word = line.substring(wordStart, wordEnd).trim(); - final int defStart = wordEnd+1; + final int defStart = wordEnd + 1; int defEnd = line.indexOf(',', defStart); if (defEnd < 0) { defEnd = line.length(); } - String definition = fixCircumflex.transliterate(line.substring(defStart, defEnd).trim()); + String definition = + fixCircumflex.transliterate(line.substring(defStart, defEnd).trim()); final String notones = dropTones.transliterate(definition); if (definition.equals(notones)) { @@ -1872,7 +1994,8 @@ static void fixChineseOverrides() throws IOException { out.println(hex.transliterate(word) + "\t" + word + "\t" + definition); } } catch (final Exception e) { - throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e); + throw new ChainException( + "{0} Failed at {1}", new Object[] {new Integer(counter), line}, e); } finally { br.close(); } @@ -1881,8 +2004,6 @@ static void fixChineseOverrides() throws IOException { } } - - static Set 
overrideSet = new HashSet(); static void processEdict(String word, String definition, String line) { @@ -1905,8 +2026,7 @@ static void processEdict(String word, String definition, String line) { // find next CJK block // where CJK really means anything but kana final int type = find(word, kana, offset, offset2, word.length(), false, false); - if (type == UnicodeMatcher.U_MISMATCH) - { + if (type == UnicodeMatcher.U_MISMATCH) { break; // we are done. } pairList[pairCount][0] = offset[0]; @@ -1925,20 +2045,30 @@ static void processEdict(String word, String definition, String line) { if (pairCount < 1) { System.out.println("No Kanji on line, skipping"); - System.out.println(hex.transliterate(word) + " > " + hex.transliterate(definition) - + ", " + kanaToLatin.transliterate(definition)); + System.out.println( + hex.transliterate(word) + + " > " + + hex.transliterate(definition) + + ", " + + kanaToLatin.transliterate(definition)); return; } // Now generate the rules - if (DEBUG && pairCount > 1) { System.out.println("Paircount: " + pairCount); - System.out.println("\t" + hex.transliterate(word) + " > " + hex.transliterate(definition) + ", " + kanaToLatin.transliterate(definition)); + System.out.println( + "\t" + + hex.transliterate(word) + + " > " + + hex.transliterate(definition) + + ", " + + kanaToLatin.transliterate(definition)); } - pairList[pairCount][0] = word.length(); // to make the algorithm easier, we add a termination + pairList[pairCount][0] = + word.length(); // to make the algorithm easier, we add a termination int delta = 0; // the current difference in positions between the definition and the word for (int i = 0; i < pairCount; ++i) { @@ -1948,13 +2078,15 @@ static void processEdict(String word, String definition, String line) { System.out.println(start + ", " + limit + ", " + delta); } - // that part was easy. the hard part is figuring out where this corresponds to in the definition. + // that part was easy. 
the hard part is figuring out where this corresponds to in the + // definition. // For now, we use a simple mechanism. - // The word and the definition should match to this point, so we just use the start (offset by delta) + // The word and the definition should match to this point, so we just use the start + // (offset by delta) // We'll check just to be sure. - final int lastLimit = i == 0 ? 0 : pairList[i-1][1]; + final int lastLimit = i == 0 ? 0 : pairList[i - 1][1]; final int defStart = start + delta; @@ -1969,36 +2101,57 @@ static void processEdict(String word, String definition, String line) { if (!firstGood) { // Houston, we have a problem. Utility.fixDot(); - System.out.println("Suspect line: " + hex.transliterate(word) + " > " + hex.transliterate(definition) - + ", " + kanaToLatin.transliterate(definition)); - System.out.println("\tNo match for " + hex.transliterate(word.substring(lastLimit, start)) - + " at end of " + hex.transliterate(definition.substring(0, defStart))); + System.out.println( + "Suspect line: " + + hex.transliterate(word) + + " > " + + hex.transliterate(definition) + + ", " + + kanaToLatin.transliterate(definition)); + System.out.println( + "\tNo match for " + + hex.transliterate(word.substring(lastLimit, start)) + + " at end of " + + hex.transliterate(definition.substring(0, defStart))); break; // BAIL } // For the limit of the defintion, we get the intermediate portion of the word // then search for it in the definition. - // We could get tripped up if the end of the transliteration of the Kanji matched the start. + // We could get tripped up if the end of the transliteration of the Kanji matched the + // start. // If so, we should find out on the next pass. int defLimit; if (limit == word.length()) { defLimit = definition.length(); } else { - final String afterPart = word.substring(limit, pairList[i+1][0]); - defLimit = definition.indexOf(afterPart, defStart+1); // we assume the CJK is at least one! 
+ final String afterPart = word.substring(limit, pairList[i + 1][0]); + defLimit = + definition.indexOf( + afterPart, defStart + 1); // we assume the CJK is at least one! if (defLimit < 0) { final String afterPart2 = katakanatoHiragana.transliterate(afterPart); - defLimit = definition.indexOf(afterPart2, defStart+1); // we assume the CJK is at least one! + defLimit = + definition.indexOf( + afterPart2, defStart + 1); // we assume the CJK is at least one! } if (defLimit < 0) { // Houston, we have a problem. Utility.fixDot(); - System.out.println("Suspect line: " + hex.transliterate(word) + " > " + hex.transliterate(definition) - + ", " + kanaToLatin.transliterate(definition)); - System.out.println("\tNo match for " + hex.transliterate(afterPart) - + " in " + hex.transliterate(definition.substring(0, defStart+1))); + System.out.println( + "Suspect line: " + + hex.transliterate(word) + + " > " + + hex.transliterate(definition) + + ", " + + kanaToLatin.transliterate(definition)); + System.out.println( + "\tNo match for " + + hex.transliterate(afterPart) + + " in " + + hex.transliterate(definition.substring(0, defStart + 1))); } break; } @@ -2007,7 +2160,8 @@ static void processEdict(String word, String definition, String line) { defPart = kanaToLatin.transliterate(defPart); // FOR NOW, JUNK the context before!! 
- // String contextWord = word.substring(0, start) + "{" + word.substring(start, limit) + "}" + word.substring(limit); + // String contextWord = word.substring(0, start) + "{" + word.substring(start, limit) + + // "}" + word.substring(limit); String contextWord = word.substring(start, limit); if (limit != word.length()) { contextWord += "}" + word.substring(limit); @@ -2015,34 +2169,43 @@ static void processEdict(String word, String definition, String line) { addCheck(contextWord, defPart, line); if (DEBUG && pairCount > 1) { - System.out.println("\t" + hex.transliterate(contextWord) + " > " + hex.transliterate(defPart)); + System.out.println( + "\t" + hex.transliterate(contextWord) + " > " + hex.transliterate(defPart)); } delta = defLimit - limit; } - } // Useful Utilities? /** - * Returns the start of the first substring that matches m. - * Most arguments are the same as UnicodeMatcher.matches, except for offset[] - * @positive Use true if you want the first point that matches, and false if you want the first point that doesn't match. - * @offset On input, the starting position. On output, the start of the match position (not the end!!) + * Returns the start of the first substring that matches m. Most arguments are the same as + * UnicodeMatcher.matches, except for offset[] + * + * @positive Use true if you want the first point that matches, and false if you want the first + * point that doesn't match. + * @offset On input, the starting position. On output, the start of the match position (not the + * end!!) */ - static int find(Replaceable s, UnicodeMatcher m, int[] offset, int limit, boolean incremental, boolean positive) { + static int find( + Replaceable s, + UnicodeMatcher m, + int[] offset, + int limit, + boolean incremental, + boolean positive) { final int direction = offset[0] <= limit ? 1 : -1; - while (offset[0] != limit) { final int original = offset[0]; - final int type = m.matches(s, offset, limit, incremental); // if successful, changes offset. 
+ final int type = + m.matches(s, offset, limit, incremental); // if successful, changes offset. if (type == UnicodeMatcher.U_MISMATCH) { if (!positive) { return UnicodeMatcher.U_MATCH; } - offset[0] += direction; // used to skip to next code unit, in the positive case + offset[0] += direction; // used to skip to next code unit, in the positive case // !! This should be safe, and saves checking the length of the code point } else if (positive) { offset[0] = original; // reset to the start position!!! @@ -2053,11 +2216,19 @@ static int find(Replaceable s, UnicodeMatcher m, int[] offset, int limit, boolea } /** - * Returns the start/limit of the first substring that matches m. Most arguments are the same as find().
    - * Warning: if the search is backwards, then substringEnd will contain the start of the substring - * and offset will contain the limit of the substring. + * Returns the start/limit of the first substring that matches m. Most arguments are the same as + * find().
    + * Warning: if the search is backwards, then substringEnd will contain the start + * of the substring and offset will contain the limit of the substring. */ - static int find(Replaceable s, UnicodeMatcher m, int[] offset, int[] offset2, int limit, boolean incremental, boolean positive) { + static int find( + Replaceable s, + UnicodeMatcher m, + int[] offset, + int[] offset2, + int limit, + boolean incremental, + boolean positive) { final int type = find(s, m, offset, limit, incremental, positive); if (type == UnicodeMatcher.U_MISMATCH) { return type; @@ -2067,12 +2238,25 @@ static int find(Replaceable s, UnicodeMatcher m, int[] offset, int[] offset2, in return type; } - static int find(String ss, UnicodeMatcher m, int[] offset, int limit, boolean incremental, boolean positive) { + static int find( + String ss, + UnicodeMatcher m, + int[] offset, + int limit, + boolean incremental, + boolean positive) { // UGLY that we have to create a wrapper! return find(new ReplaceableString(ss), m, offset, limit, incremental, positive); } - static int find(String ss, UnicodeMatcher m, int[] offset, int[] offset2, int limit, boolean incremental, boolean positive) { + static int find( + String ss, + UnicodeMatcher m, + int[] offset, + int[] offset2, + int limit, + boolean incremental, + boolean positive) { // UGLY that we have to create a wrapper! 
return find(new ReplaceableString(ss), m, offset, offset2, limit, incremental, positive); } @@ -2099,10 +2283,10 @@ static void addCheck2(String word, String definition, String line) { return; } - if (pua.containsSome(word) ) { + if (pua.containsSome(word)) { Utility.fixDot(); System.out.println("PUA on: " + line); - } else if (numbers.containsAll(definition) ) { + } else if (numbers.containsAll(definition)) { Utility.fixDot(); System.out.println("Only numbers on: " + line); } else { @@ -2123,7 +2307,9 @@ static void readCDICT() throws IOException { System.out.println("Reading cdict.txt"); final String fname = "cdict.txt"; - final BufferedReader br = Utility.openReadFile(Settings.UnicodeTools.DATA_DIR + "dict/" + fname, Utility.UTF8); + final BufferedReader br = + Utility.openReadFile( + Settings.UnicodeTools.DATA_DIR + "dict/" + fname, Utility.UTF8); int counter = 0; final String[] pieces = new String[50]; String line = ""; @@ -2139,23 +2325,22 @@ static void readCDICT() throws IOException { } Utility.dot(counter++); final int tabPos = line.indexOf('['); - String word = line.substring(0,tabPos).trim(); + String word = line.substring(0, tabPos).trim(); word = Utility.replace(word, "\uFE4D", ""); word = Utility.replace(word, ".", ""); word = Utility.replace(word, "/", ""); word = Utility.replace(word, "(", ""); word = Utility.replace(word, ")", ""); - - final int tab2Pos = line.indexOf(']', tabPos+1); - final String pinyins = line.substring(tabPos+1, tab2Pos); + final int tab2Pos = line.indexOf(']', tabPos + 1); + final String pinyins = line.substring(tabPos + 1, tab2Pos); final int len = Utility.split(pinyins, ' ', pieces); if (word.length() != len) { log.println("Len mismatch: " + line); continue; } for (int i = 0; i < len; ++i) { - final String chr = word.substring(i, i+1); + final String chr = word.substring(i, i + 1); final String piece = digitToPinyin(pieces[i], line); @@ -2199,7 +2384,8 @@ static void readCDICT() throws IOException { } } catch (final 
Exception e) { - throw new ChainException("{0} Failed at {1}" , new Object []{new Integer(counter), line}, e); + throw new ChainException( + "{0} Failed at {1}", new Object[] {new Integer(counter), line}, e); } } @@ -2218,7 +2404,8 @@ static String digitToPinyin(String source, String line) { static void readUnihanData(String key) throws java.io.IOException { - final BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion(), true, Utility.UTF8); + final BufferedReader in = + Utility.openUnicodeFile("Unihan", Default.ucdVersion(), true, Utility.UTF8); final int count = 0; int lineCounter = 0; @@ -2239,14 +2426,14 @@ static void readUnihanData(String key) throws java.io.IOException { line = line.trim(); final int tabPos = line.indexOf('\t'); - final int tabPos2 = line.indexOf('\t', tabPos+1); + final int tabPos2 = line.indexOf('\t', tabPos + 1); final String scode = line.substring(2, tabPos).trim(); final int code = Integer.parseInt(scode, 16); - final String property = line.substring(tabPos+1, tabPos2).trim(); + final String property = line.substring(tabPos + 1, tabPos2).trim(); - String propertyValue = line.substring(tabPos2+1).trim(); + String propertyValue = line.substring(tabPos2 + 1).trim(); if (propertyValue.indexOf("U+") >= 0) { propertyValue = fromHexUnicode.transliterate(propertyValue); } @@ -2262,14 +2449,13 @@ static void readUnihanData(String key) throws java.io.IOException { if (key.equals("kMandarin") && property.equals("kHanyuPinlu")) { // U+64D4 kHanyuPinlu dan1(297), dan4(61), dan5(36) - final String[] piece = Utility.split(propertyValue,'('); + final String[] piece = Utility.split(propertyValue, '('); final String pinyin = digitToPinyin(piece[0], line); log.println(scode + "\t" + pinyin + "\t" + line); - kHanyuPinlu.put(Integer.parseInt(scode,16), pinyin); + kHanyuPinlu.put(Integer.parseInt(scode, 16), pinyin); } if (property.equals(key) - || key.equals("kJapaneseOn") && property.equals("kJapaneseKun") - ) { + || 
key.equals("kJapaneseOn") && property.equals("kJapaneseKun")) { storeDef(out, code, propertyValue, line); } } @@ -2280,7 +2466,7 @@ static void readUnihanData(String key) throws java.io.IOException { static void storeDef(PrintWriter out, int cp, String rawDefinition, String line) { // skip spaces & numbers at start int start; - for (start = 0;start < rawDefinition.length(); ++start) { + for (start = 0; start < rawDefinition.length(); ++start) { final char ch = rawDefinition.charAt(start); if (ch != ' ' && ch != '\t' && (ch < '0' || ch > '9')) { break; @@ -2302,7 +2488,7 @@ static void storeDef(PrintWriter out, int cp, String rawDefinition, String line) } // IF CHINESE or JAPANESE, stop at first space!!! - rawDefinition = rawDefinition.substring(start,end); + rawDefinition = rawDefinition.substring(start, end); if (type == DEFINITION) { storeDef2(out, cp, rawDefinition, line); @@ -2310,7 +2496,7 @@ static void storeDef(PrintWriter out, int cp, String rawDefinition, String line) if (rawDefinition.indexOf(' ') < 0) { storeDef2(out, cp, rawDefinition, line); } else { - final String [] pieces = Utility.split(rawDefinition, ' '); + final String[] pieces = Utility.split(rawDefinition, ' '); for (final String piece : pieces) { storeDef2(out, cp, piece, line); } @@ -2321,20 +2507,26 @@ static void storeDef(PrintWriter out, int cp, String rawDefinition, String line) static void storeDef2(PrintWriter out, int cp, String definition, String line) { if (type == CHINESE) { // since data are messed up, terminate after first digit - int end3 = findInString(definition, "12345")+1; + int end3 = findInString(definition, "12345") + 1; if (end3 == 0) { - log.println("Bad pinyin data: " + hex.transliterate(UTF16.valueOf(cp)) - + "\t" + UTF16.valueOf(cp) + "\t" + definition); + log.println( + "Bad pinyin data: " + + hex.transliterate(UTF16.valueOf(cp)) + + "\t" + + UTF16.valueOf(cp) + + "\t" + + definition); end3 = definition.length(); } definition = definition.substring(0, end3); 
definition = digitToPinyin(definition, line); - out2.println(Utility.hex(cp) + '\t' + UTF16.valueOf(cp) + "\t" + definition.toLowerCase()); + out2.println( + Utility.hex(cp) + '\t' + UTF16.valueOf(cp) + "\t" + definition.toLowerCase()); } if (type == DEFINITION) { - definition = removeMatched(definition,'(', ')', line); - definition = removeMatched(definition,'[', ']', line); + definition = removeMatched(definition, '(', ')', line); + definition = removeMatched(definition, '[', ']', line); definition = fixDefinition(definition, line); } definition = definition.trim(); @@ -2342,7 +2534,11 @@ static void storeDef2(PrintWriter out, int cp, String definition, String line) { if (definition.length() == 0) { Utility.fixDot(); - err.println("Zero value for " + Default.ucd().getCode(cp) + " on: " + hex.transliterate(line)); + err.println( + "Zero value for " + + Default.ucd().getCode(cp) + + " on: " + + hex.transliterate(line)); } else { addCheck(UTF16.valueOf(cp), definition, line); } @@ -2364,7 +2560,6 @@ static String fixDefinition(String definition, String rawDefinition) { return definition; } - // WARNING not supplemenatary-safe! 
static int findInString(String source, String chars) { @@ -2384,12 +2579,12 @@ static String removeMatched(String source, char start, char end, String original if (pos < 0) { break; } - int epos = source.indexOf(end, pos+1); + int epos = source.indexOf(end, pos + 1); if (epos < 0) { - epos = source.length()-1; + epos = source.length() - 1; log.println("Mismatches with " + start + ", " + end + ": " + originalLine); } - source = source.substring(0,pos) + source.substring(epos+1); + source = source.substring(0, pos) + source.substring(epos + 1); } return source; } @@ -2402,32 +2597,39 @@ static String removeMatched(String source, char start, char end, String original static StringBuffer handlePinyinTemp = new StringBuffer(); static final Transliterator hex = Transliterator.getInstance("[^\\u0020-\\u007F] hex"); - static final Transliterator quoteNonLetters = Transliterator.createFromRules("any-quotenonletters", - "([[\\u0020-\\u007E]-[:L:]-[\\'\\{\\}]-[0-9]]) > \\u005C $1; " - + "\\' > \\'\\';", + static final Transliterator quoteNonLetters = + Transliterator.createFromRules( + "any-quotenonletters", + "([[\\u0020-\\u007E]-[:L:]-[\\'\\{\\}]-[0-9]]) > \\u005C $1; " + + "\\' > \\'\\';", Transliterator.FORWARD); - static final Transliterator toSub = Transliterator.createFromRules("any-subscript", - " 0 > \u2080; " - + " 1 > \u2081; " - + " 2 > \u2082; " - + " 3 > \u2084; " - + " 4 > \u2084; " - + " 5 > \u2085; " - + " 6 > \u2086; " - + " 7 > \u2087; " - + " 8 > \u2088; " - + " 9 > \u2089; ", + static final Transliterator toSub = + Transliterator.createFromRules( + "any-subscript", + " 0 > \u2080; " + + " 1 > \u2081; " + + " 2 > \u2082; " + + " 3 > \u2084; " + + " 4 > \u2084; " + + " 5 > \u2085; " + + " 6 > \u2086; " + + " 7 > \u2087; " + + " 8 > \u2088; " + + " 9 > \u2089; ", Transliterator.FORWARD); - static final Transliterator kanaToLatin = Transliterator.createFromRules("any-subscript", - " $kata = [[:katakana:]\u30FC]; " - + "[:hiragana:] {} [:^hiragana:] > ' '; " - 
+ "$kata {} [^[:hiragana:]$kata] > ' '; " - + "::Katakana-Latin; " - + "::Hiragana-Latin;", + static final Transliterator kanaToLatin = + Transliterator.createFromRules( + "any-subscript", + " $kata = [[:katakana:]\u30FC]; " + + "[:hiragana:] {} [:^hiragana:] > ' '; " + + "$kata {} [^[:hiragana:]$kata] > ' '; " + + "::Katakana-Latin; " + + "::Hiragana-Latin;", Transliterator.FORWARD); - static final Transliterator katakanatoHiragana = Transliterator.getInstance("katakana-hiragana"); + static final Transliterator katakanatoHiragana = + Transliterator.getInstance("katakana-hiragana"); static final UnicodeSet kana = new UnicodeSet("[[:hiragana:][:katakana:]\u30FC]"); // since we are working in NFC, we don't worry about the combining marks. @@ -2443,6 +2645,7 @@ static void add(String ID, Transliterator t) { System.out.println("Registering: " + ID + ", " + t.toRules(true)); Transliterator.registerFactory(ID, singleton); } + @Override public Transliterator getInstance(String ID) { return (Transliterator) m.get(ID); @@ -2451,130 +2654,137 @@ public Transliterator getInstance(String ID) { static Transliterator digitPinyin_accentPinyin; - static Transliterator accentPinyin_digitPinyin = Transliterator.createFromRules("accentPinyin_digitPinyin", - "::NFD; " - + " ([\u0304\u0301\u030C\u0300\u0306]) ([[:Mark:][:Letter:]]+) > $2 | $1;" - + "\u0304 > '1'; \u0301 > '2'; \u030C > '3'; \u0300 > '4'; \u0306 > '3';" - + " ::NFC;", Transliterator.FORWARD); + static Transliterator accentPinyin_digitPinyin = + Transliterator.createFromRules( + "accentPinyin_digitPinyin", + "::NFD; " + + " ([\u0304\u0301\u030C\u0300\u0306]) ([[:Mark:][:Letter:]]+) > $2 | $1;" + + "\u0304 > '1'; \u0301 > '2'; \u030C > '3'; \u0300 > '4'; \u0306 > '3';" + + " ::NFC;", + Transliterator.FORWARD); - static Transliterator fixCircumflex = Transliterator.createFromRules("fix_circumflex", - "::NFD; \u0306 > \u030C; ::NFC;", Transliterator.FORWARD); + static Transliterator fixCircumflex = + 
Transliterator.createFromRules( + "fix_circumflex", "::NFD; \u0306 > \u030C; ::NFC;", Transliterator.FORWARD); - static Transliterator dropTones = Transliterator.createFromRules("drop_tones", - "::NFD; \u0304 > ; \u0301 > ; \u030C > ; \u0300 > ; \u0306 > ; ::NFC;", Transliterator.FORWARD); + static Transliterator dropTones = + Transliterator.createFromRules( + "drop_tones", + "::NFD; \u0304 > ; \u0301 > ; \u030C > ; \u0300 > ; \u0306 > ; ::NFC;", + Transliterator.FORWARD); static { - final String dt = "1 > \u0304;\n" - + "2 <> \u0301;\n" - + "3 <> \u030C;\n" - + "4 <> \u0300;\n" - + "5 <> ;"; - - final String dp = "# syllable is ...vowel+ consonant* number\n" - + "# 'a', 'e' are the preferred bases\n" - + "# otherwise 'o'\n" - + "# otherwise last vowel\n" - + "::NFC;\n" - + "$vowel = [aAeEiIoOuUüÜ];\n" - + "$consonant = [[a-z A-Z] - [$vowel]];\n" - + "$digit = [1-5];\n" - + "([aAeE]) ($vowel* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n" - + "([oO]) ([$vowel-[aeAE]]* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n" - + "($vowel) ($consonant*) ($digit) > $1 &digit-tone($3) $2;\n" - + "($digit) > &digit-tone($1);\n" - + "::NFC;\n"; - - final Transliterator at = Transliterator.createFromRules("digit-tone", dt, Transliterator.FORWARD); + final String dt = + "1 > \u0304;\n" + "2 <> \u0301;\n" + "3 <> \u030C;\n" + "4 <> \u0300;\n" + "5 <> ;"; + + final String dp = + "# syllable is ...vowel+ consonant* number\n" + + "# 'a', 'e' are the preferred bases\n" + + "# otherwise 'o'\n" + + "# otherwise last vowel\n" + + "::NFC;\n" + + "$vowel = [aAeEiIoOuUüÜ];\n" + + "$consonant = [[a-z A-Z] - [$vowel]];\n" + + "$digit = [1-5];\n" + + "([aAeE]) ($vowel* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n" + + "([oO]) ([$vowel-[aeAE]]* $consonant*) ($digit) > $1 &digit-tone($3) $2;\n" + + "($vowel) ($consonant*) ($digit) > $1 &digit-tone($3) $2;\n" + + "($digit) > &digit-tone($1);\n" + + "::NFC;\n"; + + final Transliterator at = + Transliterator.createFromRules("digit-tone", dt, 
Transliterator.FORWARD); System.out.println(at.transliterate("a1a2a3a4a5")); DummyFactory.add(at.getID(), at); - digitPinyin_accentPinyin = Transliterator.createFromRules("digit-pinyin", dp, Transliterator.FORWARD); - System.out.println(digitPinyin_accentPinyin.transliterate("an2 aon2 oan2 ion2 oin2 uin2 iun2")); - + digitPinyin_accentPinyin = + Transliterator.createFromRules("digit-pinyin", dp, Transliterator.FORWARD); + System.out.println( + digitPinyin_accentPinyin.transliterate("an2 aon2 oan2 ion2 oin2 uin2 iun2")); } /* - static String convertTones(String source, String debugLine) { - try { - result = new StringBuffer(); - main: - for (int i = 0; i < source.length(); ++i) { - ch = source.charAt(i); - switch (ch) { - case ':': - if (i > 0) { - char last = result.charAt(result.length()-1); - if (last == 'u') { - result.setCharAt(result.length()-1, 'ü'); - continue main; - } else if (last == 'U') { - result.setCharAt(result.length()-1, 'Ü'); - continue main; + static String convertTones(String source, String debugLine) { + try { + result = new StringBuffer(); + main: + for (int i = 0; i < source.length(); ++i) { + ch = source.charAt(i); + switch (ch) { + case ':': + if (i > 0) { + char last = result.charAt(result.length()-1); + if (last == 'u') { + result.setCharAt(result.length()-1, 'ü'); + continue main; + } else if (last == 'U') { + result.setCharAt(result.length()-1, 'Ü'); + continue main; + } } - } - break; - case '1': break; // skip character - case '2': case '3': case '4': case '5': - applyToPrecedingBase(result, ch-'0'); - break; - default: - result.append(ch); - break; + break; + case '1': break; // skip character + case '2': case '3': case '4': case '5': + applyToPrecedingBase(result, ch-'0'); + break; + default: + result.append(ch); + break; + } } } - } - source = source.trim(); - char ch = source.charAt(source.length()-1); - int num = (int)(ch-'1'); - if (num < 0 || num > 5) throw new Exception("none"); - handlePinyinTemp.setLength(0); - boolean gotIt 
= false; - boolean messageIfNoGotIt = true; + source = source.trim(); + char ch = source.charAt(source.length()-1); + int num = (int)(ch-'1'); + if (num < 0 || num > 5) throw new Exception("none"); + handlePinyinTemp.setLength(0); + boolean gotIt = false; + boolean messageIfNoGotIt = true; - for (int i = source.length()-2; i >= 0; --i) { - ch = source.charAt(i); - if (ch == ':') { - ch = 'Ü'; - --i; - } - if ('0' <= ch && ch <= '9') break; - if (ch != 'Ü' && (ch < 'A' || ch > 'Z')) { - Utility.fixDot(); - System.out.println("Warning: non-ASCII in " + hex.transliterate(source) + " (" + hex.transliterate(debugLine) + ")"); - break; - } - if (!gotIt) switch (ch) { - case 'A': ch = "AÁ\u0102À\u0100".charAt(num); gotIt = true; break; - case 'E': ch = "EÉ\u0114È\u0112".charAt(num); gotIt = true; break; - case 'I': ch = "IÍ\u012CÌ\u012A".charAt(num); gotIt = true; break; - case 'O': ch = "OÓ\u014EÒ\u014C".charAt(num); gotIt = true; break; - case 'U': ch = "UÚ\u016CÙ\u016A".charAt(num); gotIt = true; break; - case 'Ü': ch = "Ü\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break; + for (int i = source.length()-2; i >= 0; --i) { + ch = source.charAt(i); + if (ch == ':') { + ch = 'Ü'; + --i; + } + if ('0' <= ch && ch <= '9') break; + if (ch != 'Ü' && (ch < 'A' || ch > 'Z')) { + Utility.fixDot(); + System.out.println("Warning: non-ASCII in " + hex.transliterate(source) + " (" + hex.transliterate(debugLine) + ")"); + break; + } + if (!gotIt) switch (ch) { + case 'A': ch = "AÁ\u0102À\u0100".charAt(num); gotIt = true; break; + case 'E': ch = "EÉ\u0114È\u0112".charAt(num); gotIt = true; break; + case 'I': ch = "IÍ\u012CÌ\u012A".charAt(num); gotIt = true; break; + case 'O': ch = "OÓ\u014EÒ\u014C".charAt(num); gotIt = true; break; + case 'U': ch = "UÚ\u016CÙ\u016A".charAt(num); gotIt = true; break; + case 'Ü': ch = "Ü\u01D7\u01D9\u01DB\u01D5".charAt(num); gotIt = true; break; + } + handlePinyinTemp.insert(0,ch); } - handlePinyinTemp.insert(0,ch); - } - if (!gotIt && num > 0) { 
- handlePinyinTemp.append(" \u0301\u0306\u0300\u0304".charAt(num)); - if (messageIfNoGotIt) { - err.println("Missing vowel?: " + debugLine + " -> " + handlePinyinTemp - .toString()); + if (!gotIt && num > 0) { + handlePinyinTemp.append(" \u0301\u0306\u0300\u0304".charAt(num)); + if (messageIfNoGotIt) { + err.println("Missing vowel?: " + debugLine + " -> " + handlePinyinTemp + .toString()); + } } + source = handlePinyinTemp.toString().toLowerCase(); + } catch (Exception e) { + log.println("Bad line: " + debugLine); } - source = handlePinyinTemp.toString().toLowerCase(); - } catch (Exception e) { - log.println("Bad line: " + debugLine); + return source; } - return source; - } -/* -A and e trump all other vowels and always take the tone mark. -There are no Mandarin syllables that contain both a and e. -In the combination ou, o takes the mark. -In all other cases, the final vowel takes the mark. - */ + /* + A and e trump all other vowels and always take the tone mark. + There are no Mandarin syllables that contain both a and e. + In the combination ou, o takes the mark. + In all other cases, the final vowel takes the mark. + */ /* static String applyToPrecedingBase(StringBuffer result, int tone) { for (int i = result.length()-1; i >= 0; --i) { diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateNamedSequences.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateNamedSequences.java index ad82ca430..acb34dd92 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateNamedSequences.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateNamedSequences.java @@ -1,67 +1,75 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. 
* - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateNamedSequences.java,v $ + *

    $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateNamedSequences.java,v $ * - ******************************************************************************* + *

    ****************************************************************************** */ - package org.unicode.text.UCD; + +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; - import org.unicode.text.utility.Settings; import org.unicode.text.utility.UnicodeDataFile; -import org.unicode.text.utility.UnicodeDataFile.FileInfix; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; - public final class GenerateNamedSequences implements UCD_Types { static final boolean DEBUG = false; - static public String showVarGlyphs(String code0, String code1, String shape, String description) { + public static String showVarGlyphs( + String code0, String code1, String shape, String description) { if (DEBUG) { System.out.println(code0 + ", " + code1 + ", [" + shape + "]"); } String abbShape = ""; if (shape.length() != 0) { - abbShape = '-' + shape.substring(0,4); + abbShape = '-' + shape.substring(0, 4); if (description.indexOf("feminine") >= 0) { abbShape += "fem"; } } - return "U+" + code0 + "+U+" + code1 + "/" + shape
-                + ""; + return "U+"
+                + code0
+                + "+U+"
+                + code1
+                + "/"
+                + shape
+                + ""; } /* -# Field 0: the variation sequence -# Field 1: the description of the desired appearance -# Field 2: where the appearance is only different in in particular shaping environments -# this field lists them. The possible values are: isolated, initial, medial, final. -# If more than one is present, there are spaces between them. - */ - static public void generate(String filename2) throws IOException { - + # Field 0: the variation sequence + # Field 1: the description of the desired appearance + # Field 2: where the appearance is only different in in particular shaping environments + # this field lists them. The possible values are: isolated, initial, medial, final. + # If more than one is present, there are spaces between them. + */ + public static void generate(String filename2) throws IOException { // read the data and compose the table - String table = "

" + linkAndAnchor("s" + ruleNumber, ruleNumber) + ""); + final String ruleNumber = String.valueOf(ii + 1); + out.println( + "
" + + linkAndAnchor("s" + ruleNumber, ruleNumber) + + ""); printLine(out, extraSingleSamples.get(ii), true, true, rulesFound); out.println("
"; + String table = + "
Rep GlyphHex SequenceNameCopyable
"; final String[] splits = new String[4]; final String[] codes = new String[20]; final String[] shapes = new String[4]; - final BufferedReader in = Utility.openUnicodeFile(filename2, Default.ucdVersion(), true, Utility.LATIN1); + final BufferedReader in = + Utility.openUnicodeFile(filename2, Default.ucdVersion(), true, Utility.LATIN1); final Transliterator unicodexml = Transliterator.getInstance("hex/xml"); while (true) { String line = Utility.readDataLine(in); @@ -81,27 +89,41 @@ static public void generate(String filename2) throws IOException { if (codes[i].length() == 0) { continue; } - UTF16.append(codeBuffer, Integer.parseInt(codes[i],16)); + UTF16.append(codeBuffer, Integer.parseInt(codes[i], 16)); } final String codeWithHyphens = splits[1].replaceAll("\\s+", "-"); final String codeAlt = "U+" + splits[1].replaceAll("\\s", " U+"); final String codeString = unicodexml.transliterate(codeBuffer.toString()); - // 03E2 + // 03E2 - //table += "\n"; + // table += "\n"; String imageName = "images/U" + codeWithHyphens + ".gif"; if (splits[1].compareTo("1780") >= 0 && splits[1].compareTo("1800") < 0) { final String codeNoSpaces2 = splits[1].replaceAll("\\s", ""); imageName = "http://www.unicode.org/reports/tr28/images/" + codeNoSpaces2 + ".gif"; } - table += "" - + "" - + "" - + "" - + "" - + "\n"; + table += + "" + + "" + + "" + + "" + + "" + + "\n"; if (DEBUG) { System.out.println(splits[1] + "\t" + codeString); } @@ -112,10 +134,12 @@ static public void generate(String filename2) throws IOException { // now write out the results final String directory = "UCD/" + Default.ucd().getVersion() + "/extra/"; - final UnicodeDataFile outfile = UnicodeDataFile.openHTMLAndWriteHeader(directory, filename2) - .setSkipCopyright(Settings.SKIP_COPYRIGHT); + final UnicodeDataFile outfile = + UnicodeDataFile.openHTMLAndWriteHeader(directory, filename2) + .setSkipCopyright(Settings.SKIP_COPYRIGHT); - final PrintWriter out = outfile.out; // Utility.openPrintWriter(filename, 
Utility.LATIN1_UNIX); + final PrintWriter out = + outfile.out; // Utility.openPrintWriter(filename, Utility.LATIN1_UNIX); /* String[] batName = {""}; String mostRecent = UnicodeDataFile.generateBat(directory, filename, UnicodeDataFile.getFileSuffix(true), batName); @@ -129,14 +153,19 @@ static public void generate(String filename2) throws IOException { */ final String[] replacementList = { - "@revision@", Default.ucd().getVersion(), - //"@updateDirectory@", updateDirectory, - "@date@", Default.getDate(), - "@table@", table}; - - Utility.appendFile(Settings.SRC_UCD_DIR + "NamedSequences-Template.html", Utility.UTF8, out, replacementList); + "@revision@", Default.ucd().getVersion(), + // "@updateDirectory@", updateDirectory, + "@date@", Default.getDate(), + "@table@", table + }; + + Utility.appendFile( + Settings.SRC_UCD_DIR + "NamedSequences-Template.html", + Utility.UTF8, + out, + replacementList); outfile.close(); - //Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]); + // Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]); } } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateStandardizedVariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateStandardizedVariants.java index adb5e4801..0045bc7b8 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateStandardizedVariants.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateStandardizedVariants.java @@ -1,58 +1,66 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. 
* + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateStandardizedVariants.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateStandardizedVariants.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; + +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.VersionInfo; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; import java.util.HashMap; import java.util.Map; - import org.unicode.props.UnicodeProperty; import org.unicode.text.utility.Settings; import org.unicode.text.utility.UnicodeDataFile; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.VersionInfo; - public final class GenerateStandardizedVariants implements UCD_Types { static final boolean DEBUG = false; - static public String showVarGlyphs(String code0, String code1, String shape, String description) { + public static String showVarGlyphs( + String code0, String code1, String shape, String description) { if (DEBUG) { System.out.println(code0 + ", " + code1 + ", [" + shape + "]"); } String abbShape = ""; if (shape.length() != 0) { - abbShape = '-' + shape.substring(0,4); + abbShape = '-' + shape.substring(0, 4); if (description.indexOf("feminine") >= 0) { abbShape += "fem"; } } - return "U+" + code0 + "+U+" + code1 + "/" + shape
-                + ""; + return "U+"
+                + code0
+                + "+U+"
+                + code1
+                + "/"
+                + shape
+                + ""; } /* -# Field 0: the variation sequence -# Field 1: the description of the desired appearance -# Field 2: where the appearance is only different in in particular shaping environments -# this field lists them. The possible values are: isolated, initial, medial, final. -# If more than one is present, there are spaces between them. - */ - static public void generate() throws IOException { + # Field 0: the variation sequence + # Field 1: the description of the desired appearance + # Field 2: where the appearance is only different in in particular shaping environments + # this field lists them. The possible values are: isolated, initial, medial, final. + # If more than one is present, there are spaces between them. + */ + public static void generate() throws IOException { if (Default.ucdVersionInfo().compareTo(VersionInfo.getInstance(9)) >= 0) { // StandardizedVariants.html is obsolete since Unicode 9.0. return; @@ -60,7 +68,8 @@ static public void generate() throws IOException { // read the data and compose the table - String table = "

Rep GlyphHex SequenceNameCopyable
U+" + codes[0] + "
U+" + codes[0] + "
(" + codeAlt + ")
" - + splits[1] + "
" + splits[1] + "" + name + "" + codeString + "
("
+                            + codeAlt
+                            + ")
" + + splits[1] + + "
" + + splits[1] + + "" + + name + + "" + + codeString + + "
"; + String table = + "
Rep GlyphCharacter SequenceContextAlt GlyphDescription of variant appearance
"; final String[] splits = new String[4]; final String[] codes = new String[2]; @@ -70,7 +79,9 @@ static public void generate() throws IOException { final UnicodeProperty ui = tups.getProperty("Unified_Ideograph"); UnicodeSet uiSet = ui.getSet("Yes"); - final BufferedReader in = Utility.openUnicodeFile("StandardizedVariants", Default.ucdVersion(), true, Utility.LATIN1); + final BufferedReader in = + Utility.openUnicodeFile( + "StandardizedVariants", Default.ucdVersion(), true, Utility.LATIN1); while (true) { final String line = Utility.readDataLine(in); if (line == null) { @@ -80,20 +91,24 @@ static public void generate() throws IOException { continue; } - //nal int count = Utility.split(line, ';', splits); - //nal int codeCount = Utility.split(splits[0], ' ', codes); + // nal int count = Utility.split(line, ';', splits); + // nal int codeCount = Utility.split(splits[0], ' ', codes); final int code = Utility.codePointFromHex(codes[0]); if (uiSet.contains(code)) { continue; } - // 03E2 + // 03E2 final String glyphPart = "refglyph?24-" + codes[0]; final String substitute = FIX_GLYPH_PART.get(glyphPart); - table += "\n"; + table += + "\n"; table += "\n"; String shape = splits[2].trim(); @@ -132,9 +147,8 @@ static public void generate() throws IOException { final String version = ucd.getVersion(); final UnicodeDataFile outfile = UnicodeDataFile.openHTMLAndWriteHeader( - "UCD/" + version + '/', - "StandardizedVariants") - .setSkipCopyright(Settings.SKIP_COPYRIGHT); + "UCD/" + version + '/', "StandardizedVariants") + .setSkipCopyright(Settings.SKIP_COPYRIGHT); final PrintWriter out = outfile.out; @@ -143,17 +157,17 @@ static public void generate() throws IOException { String updateDirectory; String lastDirectory; String partialFilename; - if (ucd.getCompositeVersion() < 0x40100) { // version < 4.1.0 - updateDirectory = version.substring(0,lastDot) + "-Update"; - final int updateV = version.charAt(version.length()-1) - '0'; + if (ucd.getCompositeVersion() < 0x40100) { // 
version < 4.1.0 + updateDirectory = version.substring(0, lastDot) + "-Update"; + final int updateV = version.charAt(version.length() - 1) - '0'; if (updateV != 0) { - updateDirectory += (char)('1' + updateV); + updateDirectory += (char) ('1' + updateV); } if (DEBUG) { System.out.println("updateDirectory: " + updateDirectory); } partialFilename = "StandardizedVariants-" + ucd.getVersion(); - } else if (ucd.getCompositeVersion() == 0x40100) { // version == 4.1.0 + } else if (ucd.getCompositeVersion() == 0x40100) { // version == 4.1.0 updateDirectory = "4.1/ucd"; partialFilename = "StandardizedVariants"; } else { @@ -162,105 +176,110 @@ static public void generate() throws IOException { } lastDirectory = lastVersion + "/ucd"; - final String[] replacementList = { - "@revision@", Default.ucd().getVersion(), - "@updateDirectory@", updateDirectory, - "@lastDirectory@", lastDirectory, - "@filename@", partialFilename, - "@date@", Default.getDate(), - "@table@", table}; + "@revision@", Default.ucd().getVersion(), + "@updateDirectory@", updateDirectory, + "@lastDirectory@", lastDirectory, + "@filename@", partialFilename, + "@date@", Default.getDate(), + "@table@", table + }; - Utility.appendFile(Settings.SRC_UCD_DIR + "StandardizedVariants-Template.html", Utility.UTF8, out, replacementList); + Utility.appendFile( + Settings.SRC_UCD_DIR + "StandardizedVariants-Template.html", + Utility.UTF8, + out, + replacementList); outfile.close(); } - static Map FIX_GLYPH_PART = new HashMap(); + static Map FIX_GLYPH_PART = new HashMap(); + static { final String[][] HACKS = { - {"refglyph?24-25FB", "varglyph?24-25FB-FE0E"}, - {"refglyph?24-25FB", "varglyph?24-25FB-FE0E"}, - {"refglyph?24-25FC", "varglyph?24-25FC-FE0E"}, - {"refglyph?24-25FC", "varglyph?24-25FC-FE0E"}, - {"refglyph?24-25FD", "varglyph?24-25FD-FE0E"}, - {"refglyph?24-25FD", "varglyph?24-25FD-FE0E"}, - {"refglyph?24-25FE", "varglyph?24-25FE-FE0E"}, - {"refglyph?24-25FE", "varglyph?24-25FE-FE0E"}, - {"refglyph?24-2614", 
"varglyph?24-2614-FE0E"}, - {"refglyph?24-2614", "varglyph?24-2614-FE0E"}, - {"refglyph?24-2615", "varglyph?24-2615-FE0E"}, - {"refglyph?24-2615", "varglyph?24-2615-FE0E"}, - {"refglyph?24-267B", "varglyph?24-267B-FE0E"}, - {"refglyph?24-267B", "varglyph?24-267B-FE0E"}, - {"refglyph?24-267F", "varglyph?24-267F-FE0E"}, - {"refglyph?24-267F", "varglyph?24-267F-FE0E"}, - {"refglyph?24-2693", "varglyph?24-2693-FE0E"}, - {"refglyph?24-2693", "varglyph?24-2693-FE0E"}, - {"refglyph?24-26A0", "varglyph?24-26A0-FE0E"}, - {"refglyph?24-26A0", "varglyph?24-26A0-FE0E"}, - {"refglyph?24-26A1", "varglyph?24-26A1-FE0E"}, - {"refglyph?24-26A1", "varglyph?24-26A1-FE0E"}, - {"refglyph?24-26AA", "varglyph?24-26AA-FE0E"}, - {"refglyph?24-26AA", "varglyph?24-26AA-FE0E"}, - {"refglyph?24-26AB", "varglyph?24-26AB-FE0E"}, - {"refglyph?24-26AB", "varglyph?24-26AB-FE0E"}, - {"refglyph?24-26BD", "varglyph?24-26BD-FE0E"}, - {"refglyph?24-26BD", "varglyph?24-26BD-FE0E"}, - {"refglyph?24-26BE", "varglyph?24-26BE-FE0E"}, - {"refglyph?24-26BE", "varglyph?24-26BE-FE0E"}, - {"refglyph?24-26C4", "varglyph?24-26C4-FE0E"}, - {"refglyph?24-26C4", "varglyph?24-26C4-FE0E"}, - {"refglyph?24-26C5", "varglyph?24-26C5-FE0E"}, - {"refglyph?24-26C5", "varglyph?24-26C5-FE0E"}, - {"refglyph?24-26D4", "varglyph?24-26D4-FE0E"}, - {"refglyph?24-26D4", "varglyph?24-26D4-FE0E"}, - {"refglyph?24-26EA", "varglyph?24-26EA-FE0E"}, - {"refglyph?24-26EA", "varglyph?24-26EA-FE0E"}, - {"refglyph?24-26F2", "varglyph?24-26F2-FE0E"}, - {"refglyph?24-26F2", "varglyph?24-26F2-FE0E"}, - {"refglyph?24-26F3", "varglyph?24-26F3-FE0E"}, - {"refglyph?24-26F3", "varglyph?24-26F3-FE0E"}, - {"refglyph?24-26F5", "varglyph?24-26F5-FE0E"}, - {"refglyph?24-26F5", "varglyph?24-26F5-FE0E"}, - {"refglyph?24-26FA", "varglyph?24-26FA-FE0E"}, - {"refglyph?24-26FA", "varglyph?24-26FA-FE0E"}, - {"refglyph?24-26FD", "varglyph?24-26FD-FE0E"}, - {"refglyph?24-26FD", "varglyph?24-26FD-FE0E"}, - {"refglyph?24-2757", "varglyph?24-2757-FE0E"}, - 
{"refglyph?24-2757", "varglyph?24-2757-FE0E"}, - {"refglyph?24-2934", "varglyph?24-2934-FE0E"}, - {"refglyph?24-2934", "varglyph?24-2934-FE0E"}, - {"refglyph?24-2935", "varglyph?24-2935-FE0E"}, - {"refglyph?24-2935", "varglyph?24-2935-FE0E"}, - {"refglyph?24-2B05", "varglyph?24-2B05-FE0E"}, - {"refglyph?24-2B05", "varglyph?24-2B05-FE0E"}, - {"refglyph?24-2B06", "varglyph?24-2B06-FE0E"}, - {"refglyph?24-2B06", "varglyph?24-2B06-FE0E"}, - {"refglyph?24-2B07", "varglyph?24-2B07-FE0E"}, - {"refglyph?24-2B07", "varglyph?24-2B07-FE0E"}, - {"refglyph?24-2B1B", "varglyph?24-2B1B-FE0E"}, - {"refglyph?24-2B1B", "varglyph?24-2B1B-FE0E"}, - {"refglyph?24-2B1C", "varglyph?24-2B1C-FE0E"}, - {"refglyph?24-2B1C", "varglyph?24-2B1C-FE0E"}, - {"refglyph?24-2B50", "varglyph?24-2B50-FE0E"}, - {"refglyph?24-2B50", "varglyph?24-2B50-FE0E"}, - {"refglyph?24-2B55", "varglyph?24-2B55-FE0E"}, - {"refglyph?24-2B55", "varglyph?24-2B55-FE0E"}, - {"refglyph?24-303D", "varglyph?24-303D-FE0E"}, - {"refglyph?24-303D", "varglyph?24-303D-FE0E"}, - {"refglyph?24-3297", "varglyph?24-3297-FE0E"}, - {"refglyph?24-3297", "varglyph?24-3297-FE0E"}, - {"refglyph?24-3299", "varglyph?24-3299-FE0E"}, - {"refglyph?24-3299", "varglyph?24-3299-FE0E"}, - {"refglyph?24-1F004", "varglyph?24-1F004-FE0E"}, - {"refglyph?24-1F004", "varglyph?24-1F004-FE0E"}, - {"refglyph?24-1F17F", "varglyph?24-1F17F-FE0E"}, - {"refglyph?24-1F17F", "varglyph?24-1F17F-FE0E"}, - {"refglyph?24-1F21A", "varglyph?24-1F21A-FE0E"}, - {"refglyph?24-1F21A", "varglyph?24-1F21A-FE0E"}, - {"refglyph?24-1F22F", "varglyph?24-1F22F-FE0E"}, - {"refglyph?24-1F22F", "varglyph?24-1F22F-FE0E"}, + {"refglyph?24-25FB", "varglyph?24-25FB-FE0E"}, + {"refglyph?24-25FB", "varglyph?24-25FB-FE0E"}, + {"refglyph?24-25FC", "varglyph?24-25FC-FE0E"}, + {"refglyph?24-25FC", "varglyph?24-25FC-FE0E"}, + {"refglyph?24-25FD", "varglyph?24-25FD-FE0E"}, + {"refglyph?24-25FD", "varglyph?24-25FD-FE0E"}, + {"refglyph?24-25FE", "varglyph?24-25FE-FE0E"}, + {"refglyph?24-25FE", 
"varglyph?24-25FE-FE0E"}, + {"refglyph?24-2614", "varglyph?24-2614-FE0E"}, + {"refglyph?24-2614", "varglyph?24-2614-FE0E"}, + {"refglyph?24-2615", "varglyph?24-2615-FE0E"}, + {"refglyph?24-2615", "varglyph?24-2615-FE0E"}, + {"refglyph?24-267B", "varglyph?24-267B-FE0E"}, + {"refglyph?24-267B", "varglyph?24-267B-FE0E"}, + {"refglyph?24-267F", "varglyph?24-267F-FE0E"}, + {"refglyph?24-267F", "varglyph?24-267F-FE0E"}, + {"refglyph?24-2693", "varglyph?24-2693-FE0E"}, + {"refglyph?24-2693", "varglyph?24-2693-FE0E"}, + {"refglyph?24-26A0", "varglyph?24-26A0-FE0E"}, + {"refglyph?24-26A0", "varglyph?24-26A0-FE0E"}, + {"refglyph?24-26A1", "varglyph?24-26A1-FE0E"}, + {"refglyph?24-26A1", "varglyph?24-26A1-FE0E"}, + {"refglyph?24-26AA", "varglyph?24-26AA-FE0E"}, + {"refglyph?24-26AA", "varglyph?24-26AA-FE0E"}, + {"refglyph?24-26AB", "varglyph?24-26AB-FE0E"}, + {"refglyph?24-26AB", "varglyph?24-26AB-FE0E"}, + {"refglyph?24-26BD", "varglyph?24-26BD-FE0E"}, + {"refglyph?24-26BD", "varglyph?24-26BD-FE0E"}, + {"refglyph?24-26BE", "varglyph?24-26BE-FE0E"}, + {"refglyph?24-26BE", "varglyph?24-26BE-FE0E"}, + {"refglyph?24-26C4", "varglyph?24-26C4-FE0E"}, + {"refglyph?24-26C4", "varglyph?24-26C4-FE0E"}, + {"refglyph?24-26C5", "varglyph?24-26C5-FE0E"}, + {"refglyph?24-26C5", "varglyph?24-26C5-FE0E"}, + {"refglyph?24-26D4", "varglyph?24-26D4-FE0E"}, + {"refglyph?24-26D4", "varglyph?24-26D4-FE0E"}, + {"refglyph?24-26EA", "varglyph?24-26EA-FE0E"}, + {"refglyph?24-26EA", "varglyph?24-26EA-FE0E"}, + {"refglyph?24-26F2", "varglyph?24-26F2-FE0E"}, + {"refglyph?24-26F2", "varglyph?24-26F2-FE0E"}, + {"refglyph?24-26F3", "varglyph?24-26F3-FE0E"}, + {"refglyph?24-26F3", "varglyph?24-26F3-FE0E"}, + {"refglyph?24-26F5", "varglyph?24-26F5-FE0E"}, + {"refglyph?24-26F5", "varglyph?24-26F5-FE0E"}, + {"refglyph?24-26FA", "varglyph?24-26FA-FE0E"}, + {"refglyph?24-26FA", "varglyph?24-26FA-FE0E"}, + {"refglyph?24-26FD", "varglyph?24-26FD-FE0E"}, + {"refglyph?24-26FD", "varglyph?24-26FD-FE0E"}, + 
{"refglyph?24-2757", "varglyph?24-2757-FE0E"}, + {"refglyph?24-2757", "varglyph?24-2757-FE0E"}, + {"refglyph?24-2934", "varglyph?24-2934-FE0E"}, + {"refglyph?24-2934", "varglyph?24-2934-FE0E"}, + {"refglyph?24-2935", "varglyph?24-2935-FE0E"}, + {"refglyph?24-2935", "varglyph?24-2935-FE0E"}, + {"refglyph?24-2B05", "varglyph?24-2B05-FE0E"}, + {"refglyph?24-2B05", "varglyph?24-2B05-FE0E"}, + {"refglyph?24-2B06", "varglyph?24-2B06-FE0E"}, + {"refglyph?24-2B06", "varglyph?24-2B06-FE0E"}, + {"refglyph?24-2B07", "varglyph?24-2B07-FE0E"}, + {"refglyph?24-2B07", "varglyph?24-2B07-FE0E"}, + {"refglyph?24-2B1B", "varglyph?24-2B1B-FE0E"}, + {"refglyph?24-2B1B", "varglyph?24-2B1B-FE0E"}, + {"refglyph?24-2B1C", "varglyph?24-2B1C-FE0E"}, + {"refglyph?24-2B1C", "varglyph?24-2B1C-FE0E"}, + {"refglyph?24-2B50", "varglyph?24-2B50-FE0E"}, + {"refglyph?24-2B50", "varglyph?24-2B50-FE0E"}, + {"refglyph?24-2B55", "varglyph?24-2B55-FE0E"}, + {"refglyph?24-2B55", "varglyph?24-2B55-FE0E"}, + {"refglyph?24-303D", "varglyph?24-303D-FE0E"}, + {"refglyph?24-303D", "varglyph?24-303D-FE0E"}, + {"refglyph?24-3297", "varglyph?24-3297-FE0E"}, + {"refglyph?24-3297", "varglyph?24-3297-FE0E"}, + {"refglyph?24-3299", "varglyph?24-3299-FE0E"}, + {"refglyph?24-3299", "varglyph?24-3299-FE0E"}, + {"refglyph?24-1F004", "varglyph?24-1F004-FE0E"}, + {"refglyph?24-1F004", "varglyph?24-1F004-FE0E"}, + {"refglyph?24-1F17F", "varglyph?24-1F17F-FE0E"}, + {"refglyph?24-1F17F", "varglyph?24-1F17F-FE0E"}, + {"refglyph?24-1F21A", "varglyph?24-1F21A-FE0E"}, + {"refglyph?24-1F21A", "varglyph?24-1F21A-FE0E"}, + {"refglyph?24-1F22F", "varglyph?24-1F22F-FE0E"}, + {"refglyph?24-1F22F", "varglyph?24-1F22F-FE0E"}, }; for (final String[] pair : HACKS) { FIX_GLYPH_PART.put(pair[0], pair[1]); diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateStringPrep.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateStringPrep.java index 4ac8f5ba8..d135c05a0 100644 --- 
a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateStringPrep.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateStringPrep.java @@ -1,5 +1,14 @@ package org.unicode.text.UCD; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.IDNA; +import com.ibm.icu.text.StringPrepParseException; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; @@ -8,30 +17,18 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.TransliteratorUtilities; -import org.unicode.props.BagFormatter; import org.unicode.cldr.util.props.UnicodeLabel; +import org.unicode.props.BagFormatter; import org.unicode.text.UCD.TestData.RegexMatcher; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.IDNA; -import com.ibm.icu.text.StringPrepParseException; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.ULocale; - - class GenerateStringPrep implements UCD_Types { - public static void main (String[] args) throws IOException { - //checkChars(false); + public static void main(String[] args) throws IOException { + // checkChars(false); new GenerateStringPrep().genStringPrep(); System.out.println("Done"); } @@ -42,41 +39,46 @@ public static void main (String[] args) throws IOException { ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(""); ToolUnicodePropertySource ups32 = ToolUnicodePropertySource.make("3.2.0"); - //UnicodeSet id_continue = 
ups.getSet("ID_Continue=true"); + // UnicodeSet id_continue = ups.getSet("ID_Continue=true"); UnicodeSet xid_continue = ups.getSet("XID_Continue=Yes"); UnicodeSet wordChars = new UnicodeSet(); + { if (false) { wordChars.addAll(ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher())); wordChars.retainAll(ups.getSet("gc=Sk")); } - wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" + - " \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" + - " \\u055A \\u02B9 \\u02BA]")); - //wordChars.removeAll(xid_continue); + wordChars.addAll( + new UnicodeSet( + "[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" + + " \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0" + + " \\u055A \\u02B9 \\u02BA]")); + // wordChars.removeAll(xid_continue); } UnicodeSet patternProp = ups.getSet("Pattern_Syntax=Yes").removeAll(wordChars); UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement(); - UnicodeSet non_spacing = new UnicodeSet(ups.getSet("gc=Me")) - .addAll(ups.getSet("gc=Mn")) - .removeAll(ups.getSet("Default_Ignorable_Code_Point=Yes")); + UnicodeSet non_spacing = + new UnicodeSet(ups.getSet("gc=Me")) + .addAll(ups.getSet("gc=Mn")) + .removeAll(ups.getSet("Default_Ignorable_Code_Point=Yes")); UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars); - //UnicodeSet[] decompChars = new UnicodeSet[100]; + // UnicodeSet[] decompChars = new UnicodeSet[100]; UCD ucd = Default.ucd(); static Collator uca0 = Collator.getInstance(ULocale.ENGLISH); + { uca0.setStrength(Collator.IDENTICAL); } - static GenerateHanTransliterator.MultiComparator uca - = new GenerateHanTransliterator.MultiComparator(new Comparator[] { - uca0, new UTF16.StringComparator()}); - UnicodeSet bidiR = new UnicodeSet( - "[[:Bidi_Class=AL:][:Bidi_Class=R:]]"); + static GenerateHanTransliterator.MultiComparator uca = + new GenerateHanTransliterator.MultiComparator( + new Comparator[] {uca0, new 
UTF16.StringComparator()}); + + UnicodeSet bidiR = new UnicodeSet("[[:Bidi_Class=AL:][:Bidi_Class=R:]]"); UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]"); UnicodeSet hasNoUpper = new UnicodeSet(); @@ -86,10 +88,10 @@ public static void main (String[] args) throws IOException { UnicodeSet isCaseFolded = new UnicodeSet(); void genStringPrep() throws IOException { - //showScriptToBlock(); + // showScriptToBlock(); bf.setShowLiteral(TransliteratorUtilities.toHTMLControl); bf.setUnicodePropertyFactory(ups); - //bf.setValueSource(UnicodeLabel.NULL); + // bf.setValueSource(UnicodeLabel.NULL); if (false) { System.out.println("word chars: " + bf.showSetNames(wordChars)); @@ -129,13 +131,19 @@ void genStringPrep() throws IOException { System.out.println(bf.showSetNames(hasNoUpper)); Utility.fixDot(); - final PrintWriter htmlOut = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR, "idn-chars.html"); - final PrintWriter htmlOut2 = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR, "script-chars.html"); - PrintWriter textOut = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR, "idn-chars.txt"); + final PrintWriter htmlOut = + FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR, "idn-chars.html"); + final PrintWriter htmlOut2 = + FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR, "script-chars.html"); + PrintWriter textOut = + FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR, "idn-chars.txt"); textOut.println('\uFEFF'); textOut.println("For documentation, see idn-chars.html"); - Utility.appendFile("./org/unicode/text/UCD/idn-charsHeader.html", Utility.UTF8_WINDOWS, htmlOut, + Utility.appendFile( + "./org/unicode/text/UCD/idn-charsHeader.html", + Utility.UTF8_WINDOWS, + htmlOut, new String[] {"%date%", Default.getDate()}); /* out @@ -157,8 +165,7 @@ void genStringPrep() throws IOException { htmlOut2.println("
Rep GlyphCharacter SequenceContextAlt GlyphDescription of variant appearance
U+" + codes[0] + "
U+"
+                            + codes[0]
+                            + "" + splits[0] + "
"); for (int scriptCode = 0; scriptCode < coreChars.length; ++scriptCode) { - if (scriptCode == COMMON_SCRIPT - || scriptCode == INHERITED_SCRIPT) { + if (scriptCode == COMMON_SCRIPT || scriptCode == INHERITED_SCRIPT) { continue; } showCodes(htmlOut, textOut, scriptCode, htmlOut2); @@ -182,9 +189,10 @@ void genStringPrep() throws IOException { textOut.println(); textOut.println("# *** FOR REVIEW ***"); bf.setLabelSource(UnicodeLabel.NULL); - for (final Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) { + for (final Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); + it.hasNext(); ) { textOut.println(); - final String value = (String)it.next(); + final String value = (String) it.next(); bf.setValueSource(value); bf.showSetNames(textOut, suspect.keySet(value)); } @@ -193,28 +201,33 @@ void genStringPrep() throws IOException { bf = new BagFormatter(); bf.setUnicodePropertyFactory(ups); textOut.println(); - textOut.println("# *** Comparison of IDN with CF_NFKC_ID (case-folded, NFKC, XID), U3.2 only ***"); + textOut.println( + "# *** Comparison of IDN with CF_NFKC_ID (case-folded, NFKC, XID), U3.2 only ***"); final UnicodeSet U32 = ups32.getSet("gc=cn").complement(); - final UnicodeSet CF_NFKC_ID = new UnicodeSet(xid_continue).retainAll(isNFKC).retainAll(isCaseFolded).retainAll(U32); + final UnicodeSet CF_NFKC_ID = + new UnicodeSet(xid_continue) + .retainAll(isNFKC) + .retainAll(isCaseFolded) + .retainAll(U32); bf.showSetDifferences(textOut, "CF_NFKC_ID", CF_NFKC_ID, "IDN", idnaTypeSet[OK]); textOut.close(); - } - /** - * - */ + /** */ private void showScriptToBlock() { - final UnicodeMap scripts = ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap(); - final UnicodeMap blocks = ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap(); - final UnicodeMap.Composer myCompose = new UnicodeMap.Composer() { - @Override - public Object compose(int codepoint, String string, Object a, Object 
b) { - return a + "\t" + b; - } - }; + final UnicodeMap scripts = + ToolUnicodePropertySource.make("").getProperty("script").getUnicodeMap(); + final UnicodeMap blocks = + ToolUnicodePropertySource.make("").getProperty("block").getUnicodeMap(); + final UnicodeMap.Composer myCompose = + new UnicodeMap.Composer() { + @Override + public Object compose(int codepoint, String string, Object a, Object b) { + return a + "\t" + b; + } + }; final UnicodeMap sb = scripts.cloneAsThawed().composeWith(blocks, myCompose); - for (final Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext();) { + for (final Iterator it = sb.getAvailableValues(new TreeSet()).iterator(); it.hasNext(); ) { System.out.println(it.next()); } throw new IllegalArgumentException(); @@ -223,93 +236,89 @@ public Object compose(int codepoint, String string, Object a, Object b) { Map scriptToGif = CollectionUtilities.asMap(script_to_gif); static String[][] script_to_gif = { - - {"Common","common.gif"}, //Miscellaneous_Symbols - {"Inherited","combiningdiacritics.gif"}, //Combining_Diacritical_Marks - {"Arabic","arabic.gif"}, //Arabic - {"Armenian","armenian.gif"}, //Armenian - {"Bengali","bengali.gif"}, //Bengali - {"Bopomofo","bopomofo.gif"}, //Bopomofo - {"Braille","braillesymbols.gif"}, //Braille_Patterns - {"Buginese","buginese.gif"}, //Buginese - {"Buhid","buhid.gif"}, //Buhid - {"Canadian_Aboriginal","canadiansyllabics.gif"}, //Unified_Canadian_Aboriginal_Syllabics - {"Cherokee","cherokee.gif"}, //Cherokee - {"Coptic","coptic.gif"}, //Coptic - {"Cypriot","cypriot.gif"}, //Cypriot_Syllabary - {"Cyrillic","cyrillic.gif"}, //Cyrillic - {"Deseret","deseret.gif"}, //Deseret - {"Devanagari","devanagari.gif"}, //Devanagari - {"Ethiopic","ethiopic.gif"}, //Ethiopic - {"Georgian","georgian.gif"}, //Georgian - {"Glagolitic","glagolitic.gif"}, //Glagolitic - {"Gothic","gothic.gif"}, //Gothic - {"Greek","greek.gif"}, //Greek_and_Coptic - {"Gujarati","gujarati.gif"}, //Gujarati - 
{"Gurmukhi","gurmukhi.gif"}, //Gurmukhi - {"Han","cjkideographcompat.gif"}, //CJK_Compatibility_Ideographs - {"Han","kangxiradicals.gif"}, //Kangxi_Radicals - {"Hangul","hangulsyllables.gif"}, //Hangul_Syllables - {"Hanunoo","hanunoo.gif"}, //Hanunoo - {"Hebrew","hebrew.gif"}, //Hebrew - {"Hiragana","hiragana.gif"}, //Hiragana - {"Kannada","kannada.gif"}, //Kannada - {"Katakana","katakana.gif"}, //Katakana - {"Kharoshthi","kharoshthi.gif"}, //Kharoshthi - {"Khmer","khmer.gif"}, //Khmer - {"Lao","lao.gif"}, //Lao - {"Latin","latin.gif"}, //Basic_Latin - {"Limbu","limbu.gif"}, //Limbu - {"Linear_B","linearbsyllabary.gif"}, //Linear_B_Syllabary - {"Malayalam","malayalam.gif"}, //Malayalam - {"Mongolian","mongolian.gif"}, //Mongolian - {"Myanmar","myanmar.gif"}, //Myanmar - {"New_Tai_Lue","newtailu.gif"}, //New_Tai_Lue - {"Ogham","ogham.gif"}, //Ogham - {"Old_Italic","olditalic.gif"}, //Old_Italic - {"Old_Persian","oldpersiancuneiform.gif"}, //Old_Persian - {"Oriya","oriya.gif"}, //Oriya - {"Osmanya","osmanya.gif"}, //Osmanya - {"Runic","runic.gif"}, //Runic - {"Shavian","shavian.gif"}, //Shavian - {"Sinhala","sinhala.gif"}, //Sinhala - {"Syloti_Nagri","silotinagri.gif"}, //Syloti_Nagri - {"Syriac","syriac.gif"}, //Syriac - {"Tagalog","tagalog.gif"}, //Tagalog - {"Tagbanwa","tagbanwa.gif"}, //Tagbanwa - {"Tai_Le","taile.gif"}, //Tai_Le - {"Tamil","tamil.gif"}, //Tamil - {"Telugu","telugu.gif"}, //Telugu - {"Thaana","thaana.gif"}, //Thaana - {"Thai","thai.gif"}, //Thai - {"Tibetan","tibetan.gif"}, //Tibetan - {"Tifinagh","tifinagh.gif"}, //Tifinagh - {"Ugaritic","ugaritic.gif"}, //Ugaritic - {"Yi","yi.gif"}, //Yi_Syllables - + {"Common", "common.gif"}, // Miscellaneous_Symbols + {"Inherited", "combiningdiacritics.gif"}, // Combining_Diacritical_Marks + {"Arabic", "arabic.gif"}, // Arabic + {"Armenian", "armenian.gif"}, // Armenian + {"Bengali", "bengali.gif"}, // Bengali + {"Bopomofo", "bopomofo.gif"}, // Bopomofo + {"Braille", "braillesymbols.gif"}, // Braille_Patterns 
+ {"Buginese", "buginese.gif"}, // Buginese + {"Buhid", "buhid.gif"}, // Buhid + {"Canadian_Aboriginal", "canadiansyllabics.gif"}, // Unified_Canadian_Aboriginal_Syllabics + {"Cherokee", "cherokee.gif"}, // Cherokee + {"Coptic", "coptic.gif"}, // Coptic + {"Cypriot", "cypriot.gif"}, // Cypriot_Syllabary + {"Cyrillic", "cyrillic.gif"}, // Cyrillic + {"Deseret", "deseret.gif"}, // Deseret + {"Devanagari", "devanagari.gif"}, // Devanagari + {"Ethiopic", "ethiopic.gif"}, // Ethiopic + {"Georgian", "georgian.gif"}, // Georgian + {"Glagolitic", "glagolitic.gif"}, // Glagolitic + {"Gothic", "gothic.gif"}, // Gothic + {"Greek", "greek.gif"}, // Greek_and_Coptic + {"Gujarati", "gujarati.gif"}, // Gujarati + {"Gurmukhi", "gurmukhi.gif"}, // Gurmukhi + {"Han", "cjkideographcompat.gif"}, // CJK_Compatibility_Ideographs + {"Han", "kangxiradicals.gif"}, // Kangxi_Radicals + {"Hangul", "hangulsyllables.gif"}, // Hangul_Syllables + {"Hanunoo", "hanunoo.gif"}, // Hanunoo + {"Hebrew", "hebrew.gif"}, // Hebrew + {"Hiragana", "hiragana.gif"}, // Hiragana + {"Kannada", "kannada.gif"}, // Kannada + {"Katakana", "katakana.gif"}, // Katakana + {"Kharoshthi", "kharoshthi.gif"}, // Kharoshthi + {"Khmer", "khmer.gif"}, // Khmer + {"Lao", "lao.gif"}, // Lao + {"Latin", "latin.gif"}, // Basic_Latin + {"Limbu", "limbu.gif"}, // Limbu + {"Linear_B", "linearbsyllabary.gif"}, // Linear_B_Syllabary + {"Malayalam", "malayalam.gif"}, // Malayalam + {"Mongolian", "mongolian.gif"}, // Mongolian + {"Myanmar", "myanmar.gif"}, // Myanmar + {"New_Tai_Lue", "newtailu.gif"}, // New_Tai_Lue + {"Ogham", "ogham.gif"}, // Ogham + {"Old_Italic", "olditalic.gif"}, // Old_Italic + {"Old_Persian", "oldpersiancuneiform.gif"}, // Old_Persian + {"Oriya", "oriya.gif"}, // Oriya + {"Osmanya", "osmanya.gif"}, // Osmanya + {"Runic", "runic.gif"}, // Runic + {"Shavian", "shavian.gif"}, // Shavian + {"Sinhala", "sinhala.gif"}, // Sinhala + {"Syloti_Nagri", "silotinagri.gif"}, // Syloti_Nagri + {"Syriac", "syriac.gif"}, // 
Syriac + {"Tagalog", "tagalog.gif"}, // Tagalog + {"Tagbanwa", "tagbanwa.gif"}, // Tagbanwa + {"Tai_Le", "taile.gif"}, // Tai_Le + {"Tamil", "tamil.gif"}, // Tamil + {"Telugu", "telugu.gif"}, // Telugu + {"Thaana", "thaana.gif"}, // Thaana + {"Thai", "thai.gif"}, // Thai + {"Tibetan", "tibetan.gif"}, // Tibetan + {"Tifinagh", "tifinagh.gif"}, // Tifinagh + {"Ugaritic", "ugaritic.gif"}, // Ugaritic + {"Yi", "yi.gif"}, // Yi_Syllables }; UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT]; + { for (int i = 0; i < idnaTypeSet.length; ++i) { idnaTypeSet[i] = new UnicodeSet(); } } + static final int OK = 0, DELETED = 1, ILLEGAL = 2, REMAPPED = 3, IDNA_TYPE_LIMIT = 4; - /** - * - */ - static public int getIDNAType(int cp) { + /** */ + public static int getIDNAType(int cp) { inbuffer.setLength(0); UTF16.append(inbuffer, cp); try { - intermediate = IDNA.convertToASCII(inbuffer, - IDNA.DEFAULT); // USE_STD3_RULES + intermediate = IDNA.convertToASCII(inbuffer, IDNA.DEFAULT); // USE_STD3_RULES if (intermediate.length() == 0) { return DELETED; } - outbuffer = IDNA.convertToUnicode(intermediate, - IDNA.USE_STD3_RULES); + outbuffer = IDNA.convertToUnicode(intermediate, IDNA.USE_STD3_RULES); } catch (final StringPrepParseException e) { return ILLEGAL; } catch (final Exception e) { @@ -321,6 +330,7 @@ static public int getIDNAType(int cp) { } return OK; } + static StringBuffer inbuffer = new StringBuffer(); static StringBuffer intermediate, outbuffer; @@ -334,18 +344,23 @@ static public int getIDNAType(int cp) { * @param coreChars * @param decompChars */ - private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) { + private void showCodes( + PrintWriter htmlOut, PrintWriter textOut, int scriptCode, PrintWriter htmlOut2) { if (coreChars[scriptCode] == null) { return; } Default.ucd(); String script = UCD.getScriptID_fromIndex((byte) scriptCode); - script = Utility.getUnskeleton(script.toLowerCase(),true); + script = 
Utility.getUnskeleton(script.toLowerCase(), true); System.out.println(script); htmlOut.println(); - final String scriptLine = ""; + final String scriptLine = + ""; htmlOut.println(scriptLine); htmlOut2.println(scriptLine); textOut.println(); @@ -374,8 +389,7 @@ private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, final String name = Default.ucd().getName(it.codepoint); if (name.indexOf("MUSICAL SYMBOL") >= 0 || name.indexOf("DINGBA") >= 0 - || name.indexOf("RADICAL ") >= 0 - ) { + || name.indexOf("RADICAL ") >= 0) { cat = "XX"; } suspect.put(it.codepoint, cat); @@ -385,7 +399,14 @@ private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode, uca); } if (bicameralNoupper.size() != 0) { - printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode, uca); + printlnSet( + htmlOut, + textOut, + script, + "Atomic-no-uppercase", + bicameralNoupper, + scriptCode, + uca); } if (pattern.size() != 0) { printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode, uca); @@ -398,10 +419,24 @@ private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, } if (remappedIsNFKC.size() != 0) { - printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode, uca); + printlnSet( + htmlOut, + textOut, + script, + "IDN-Remapped-Case-Atomic", + remappedIsNFKC, + scriptCode, + uca); } if (remappedIsNFKCDecomp.size() != 0) { - printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-NFD-Decomposable", remappedIsNFKCDecomp, scriptCode, uca); + printlnSet( + htmlOut, + textOut, + script, + "IDN-Remapped-Case-NFD-Decomposable", + remappedIsNFKCDecomp, + scriptCode, + uca); } if (remapped.size() != 0) { printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode, uca); @@ -414,34 +449,47 @@ private void showCodes(PrintWriter htmlOut, PrintWriter textOut, int scriptCode, } 
} - private void showCodes(PrintWriter htmlOut, PrintWriter textOut, UnicodeSet uset) throws IOException { + private void showCodes(PrintWriter htmlOut, PrintWriter textOut, UnicodeSet uset) + throws IOException { Default.ucd(); String script = UCD.getScriptID_fromIndex(INHERITED_SCRIPT); - script = Utility.getUnskeleton(script.toLowerCase(),true); - final String scriptLine = ""; + script = Utility.getUnskeleton(script.toLowerCase(), true); + final String scriptLine = + ""; htmlOut.println(scriptLine); final UnicodeMap m = getPositions(); - for (final Iterator it = m.getAvailableValues(new TreeSet(uca)).iterator(); it.hasNext(); ) { + for (final Iterator it = m.getAvailableValues(new TreeSet(uca)).iterator(); + it.hasNext(); ) { final String type = (String) it.next(); final UnicodeSet current = m.keySet(type).retainAll(non_spacing); if (current.size() == 0) { continue; } - printlnSet(htmlOut, textOut, script, "Visible_Combining_Marks_" + type, current, INHERITED_SCRIPT, positionComparator); + printlnSet( + htmlOut, + textOut, + script, + "Visible_Combining_Marks_" + type, + current, + INHERITED_SCRIPT, + positionComparator); } } /** * @throws IOException - * */ private UnicodeMap getPositions() throws IOException { final UnicodeMap result = new UnicodeMap(); - final BufferedReader in = FileUtilities.openUTF8Reader(Settings.UnicodeTools.DATA_DIR + "confusables/", "positions.txt"); - String type="Undetermined"; + final BufferedReader in = + FileUtilities.openUTF8Reader( + Settings.UnicodeTools.DATA_DIR + "confusables/", "positions.txt"); + String type = "Undetermined"; while (true) { final String line = Utility.readDataLine(in); if (line == null) { @@ -456,23 +504,22 @@ private UnicodeMap getPositions() throws IOException { } final String[] pieces = Utility.split(line, ';'); final String code = Utility.fromHex(pieces[0]); - result.put(UTF16.charAt(code,0), type); + result.put(UTF16.charAt(code, 0), type); } return result; } - static Comparator positionComparator = 
new Comparator() { - @Override - public int compare(Object o1, Object o2) { - final String s1 = (String)o1; - final String s2 = (String)o2; - return Default.ucd().getName(s1).compareTo(Default.ucd().getName(s2)); - } - }; + static Comparator positionComparator = + new Comparator() { + @Override + public int compare(Object o1, Object o2) { + final String s1 = (String) o1; + final String s2 = (String) o2; + return Default.ucd().getName(s1).compareTo(Default.ucd().getName(s2)); + } + }; - /** - * - */ + /** */ private UnicodeSet extract(UnicodeSet other, UnicodeSet core) { final UnicodeSet decomp = new UnicodeSet(core).retainAll(other); core.removeAll(decomp); @@ -488,17 +535,32 @@ private UnicodeSet extract(UnicodeSet other, UnicodeSet core) { * @param comparator TODO * @param uca */ - private void printlnSet(PrintWriter htmlOut, PrintWriter textOut, - String script, String title, UnicodeSet unicodeset, int scriptCode, Comparator comparator) { + private void printlnSet( + PrintWriter htmlOut, + PrintWriter textOut, + String script, + String title, + UnicodeSet unicodeset, + int scriptCode, + Comparator comparator) { if (unicodeset == null) { return; } final int size = unicodeset.size(); - final String dir = unicodeset.containsSome(bidiR) - && unicodeset.containsNone(bidiL) ? " dir='rtl'" : ""; - htmlOut.println(""); + final String dir = + unicodeset.containsSome(bidiR) && unicodeset.containsNone(bidiL) + ? " dir='rtl'" + : ""; + htmlOut.println( + ""); htmlOut.print(""); + + // clear storage + data.clear(); + break; } } log.println("
Script: " + script + "
Script: " + + script + + "
Script: " + script + "
Script: " + + script + + "
" + title + " (" - + TestData.nf.format(size) + ")
" + + title + + " (" + + TestData.nf.format(size) + + ")
"); // categorization textOut.println(); @@ -509,14 +571,12 @@ private void printlnSet(PrintWriter htmlOut, PrintWriter textOut, usi.reset(unicodeset); while (usi.nextRange()) { if (usi.codepoint == usi.codepointEnd) { - htmlOut.print(formatCode(UTF16 - .valueOf(usi.codepoint))); + htmlOut.print(formatCode(UTF16.valueOf(usi.codepoint))); } else { - htmlOut.print(formatCode(UTF16 - .valueOf(usi.codepoint)) - + ".. " - + formatCode(UTF16 - .valueOf(usi.codepointEnd))); + htmlOut.print( + formatCode(UTF16.valueOf(usi.codepoint)) + + ".. " + + formatCode(UTF16.valueOf(usi.codepointEnd))); } } bf.showSetNames(textOut, unicodeset); @@ -527,13 +587,13 @@ private void printlnSet(PrintWriter htmlOut, PrintWriter textOut, final String x = usi.getString(); final boolean foo = reordered.add(x); if (!foo) { - throw new IllegalArgumentException("Collision with " - + Default.ucd().getCodeAndName(x)); + throw new IllegalArgumentException( + "Collision with " + Default.ucd().getCodeAndName(x)); } } - for (final Iterator it = reordered.iterator(); it.hasNext();) { + for (final Iterator it = reordered.iterator(); it.hasNext(); ) { final Object key = it.next(); - htmlOut.print(formatCode((String)key)); + htmlOut.print(formatCode((String) key)); } bf.showSetNames(textOut, reordered); } @@ -545,16 +605,18 @@ private void printlnSet(PrintWriter htmlOut, PrintWriter textOut, * @return */ private String formatCode(String string) { - final int cat = ucd.getCategory(UTF16.charAt(string,0)); + final int cat = ucd.getCategory(UTF16.charAt(string, 0)); String pad = "\u00A0", pad1 = pad; if (cat == Me || cat == Mn) { pad = "\u00A0\u00A0"; pad1 = "\u00A0\u00A0\u25cc"; } - return "" - + pad1 - + TransliteratorUtilities.toHTMLControl.transliterate(string) - + pad - + " "; + return "" + + pad1 + + TransliteratorUtilities.toHTMLControl.transliterate(string) + + pad + + " "; } } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateThaiBreaks.java 
b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateThaiBreaks.java index e86e0bf81..416fa55ee 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateThaiBreaks.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateThaiBreaks.java @@ -1,45 +1,44 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateThaiBreaks.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/GenerateThaiBreaks.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; + +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; - import org.unicode.text.utility.Utility; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - public class GenerateThaiBreaks { - public static void main(String [] args) throws IOException { - - final BufferedReader br = new BufferedReader( - new InputStreamReader( - new FileInputStream("c:\\icu4j\\src\\com\\ibm\\icu\\dev\\data\\thai6.ucs"), "UnicodeLittle")); + public static void main(String[] args) throws IOException { + + final BufferedReader br = + new BufferedReader( + new InputStreamReader( + new FileInputStream( + "c:\\icu4j\\src\\com\\ibm\\icu\\dev\\data\\thai6.ucs"), + "UnicodeLittle")); final PrintWriter out = null; try { final UnicodeSet ignorables = new UnicodeSet(); /* new UnicodeSet(0xE30, 0xE3A); - ignorables.add(0x0E40, 0x0E44); // add logical order exception - ignorables.add(0x0E47, 0x0E4E); - */ + ignorables.add(0x0E40, 0x0E44); // add logical order exception + ignorables.add(0x0E47, 0x0E4E); + */ ignorables.add(0, ' '); // add controls ignorables.add('.'); - final UnicodeSet initials = new UnicodeSet(); final UnicodeSet finals = new UnicodeSet(); final UnicodeSet medials = new UnicodeSet(); @@ -69,23 +68,23 @@ public static void main(String [] args) throws IOException { continue; } - initials.add(temp.substring(0,1)); - //initials.add(temp.substring(0,2)); - finals.add(temp.substring(temp.length()-1)); - //finals.add(temp.substring(temp.length()-1)); + initials.add(temp.substring(0, 1)); + // initials.add(temp.substring(0,2)); + finals.add(temp.substring(temp.length() - 1)); + // finals.add(temp.substring(temp.length()-1)); for (int i = 1; i < temp.length() - 1; ++i) { - 
//medials.add(temp.substring(i, i+2)); - medials.add(temp.substring(i, i+1)); + // medials.add(temp.substring(i, i+2)); + medials.add(temp.substring(i, i + 1)); } - //medials.add(temp.substring(temp.length() - 2, temp.length() - 1)); + // medials.add(temp.substring(temp.length() - 2, temp.length() - 1)); } System.out.println("initials size: " + initials.size()); System.out.println("finals size: " + finals.size()); System.out.println("medials size: " + medials.size()); - //out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS); + // out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS); // out.write('\uFEFF'); final UnicodeSet marks = new UnicodeSet("[[\u0e00-\u0e7f]&[[:mn:][:me:]]]"); @@ -93,7 +92,8 @@ public static void main(String [] args) throws IOException { final UnicodeSet all = new UnicodeSet(initials).addAll(medials).addAll(finals); - final UnicodeSet missingThai = new UnicodeSet("[[\u0e00-\u0e7f]-[:Cn:]]").removeAll(all); + final UnicodeSet missingThai = + new UnicodeSet("[[\u0e00-\u0e7f]-[:Cn:]]").removeAll(all); System.out.println("Never occur: " + missingThai.toPattern(true)); Utility.showSetNames("", missingThai, true, Default.ucd()); @@ -133,15 +133,22 @@ public static void main(String [] args) throws IOException { static class MyBreaker implements Utility.Breaker { @Override public String get(Object current, Object old) { - if (old == null || UTF16.charAt(current.toString(), 0) == UTF16.charAt(old.toString(), 0)) { + if (old == null + || UTF16.charAt(current.toString(), 0) == UTF16.charAt(old.toString(), 0)) { Default.ucd(); - return current.toString() + "(" + UCD.getCode(current.toString().substring(1)) + "))"; + return current.toString() + + "(" + + UCD.getCode(current.toString().substring(1)) + + "))"; } else { Default.ucd(); return "\n" + current + "(" + UCD.getCode(current.toString()) + "))"; } } + @Override - public boolean filter(Object current) { return true; } + public boolean filter(Object current) { + return 
true; + } } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateWholeScriptConfusables.java b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateWholeScriptConfusables.java index 2d9a3e79d..c8c4a6d58 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GenerateWholeScriptConfusables.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GenerateWholeScriptConfusables.java @@ -1,5 +1,7 @@ package org.unicode.text.UCD; +import com.ibm.icu.text.SpoofChecker; +import com.ibm.icu.text.UnicodeSet; import java.lang.reflect.Constructor; import java.lang.reflect.Field; import java.lang.reflect.Method; @@ -12,463 +14,481 @@ import java.util.Map.Entry; import java.util.stream.IntStream; -import com.ibm.icu.text.SpoofChecker; -import com.ibm.icu.text.UnicodeSet; - /** * A class to detect and generate strings that are whole-script confusable with input strings. - * - * The "public static void main" shows an example of this class's usage. The method - * {@link #getWholeScriptConfusables} is the primary public endpoint. - * - * This class uses reflection to access some internal machinery from SpoofChecker, so it is not + * + *

The "public static void main" shows an example of this class's usage. The method {@link + * #getWholeScriptConfusables} is the primary public endpoint. + * + *

This class uses reflection to access some internal machinery from SpoofChecker, so it is not * recommended for use in production-ready code. - * - * This script can be run stand-alone by passing a whitespace-separated list of words as the + * + *

This script can be run stand-alone by passing a whitespace-separated list of words as the * argument list. - * + * * @author Shane Carr * @see com.ibm.icu.text.SpoofChecker */ public class GenerateWholeScriptConfusables { - final ConfusableGraphNode base; - final SpoofChecker sc; - final SpoofCheckerWrapper scw; - - // as of Unicode 9, the longest confusable skeleton is 18 characters - final static int MAX_CONFUSABLE_STRING_LENGTH = 18; - - // -------------------------- - // Start demo code - // -------------------------- - - public static void main(String[] args) { - - // Ignore these four code points that have low-quality confusable skeletons and produce - // poor whole-script confusables - UnicodeSet ignorables = new UnicodeSet(); - ignorables.add('т'); - ignorables.add('ц'); - ignorables.add('τ'); - ignorables.add('к'); - - @SuppressWarnings("deprecation") - SpoofChecker sc = new SpoofChecker.Builder().setAllowedChars(SpoofChecker.RECOMMENDED) - .setChecks(SpoofChecker.CONFUSABLE).build(); - GenerateWholeScriptConfusables wsc = new GenerateWholeScriptConfusables(sc, ignorables); - // Loop over each argument as a word - List resultStrings = new ArrayList(); - List resultScripts = new ArrayList(); - for (String word : args) { - resultStrings.clear(); - resultScripts.clear(); - - // Compute the whole script confusables - wsc.getWholeScriptConfusables(word, resultStrings, resultScripts); - - // Print out the matches to the console - String matches = String.join(", ", - IntStream.range(0, resultStrings.size()) - .mapToObj(i -> (resultStrings.get(i) + " (" + resultScripts.get(i) + ")")) - .toArray(String[]::new)); - System.out.println(word + ": " + matches); - } - } - - // -------------------------- - // Start class definition - // -------------------------- - - /** - * Create a new WholeScriptConfusables instance and build the internal data structure. - * - * @param sc The SpoofChecker containing the data to use when building the data structure. 
- */ - public GenerateWholeScriptConfusables(SpoofChecker sc) { - this.sc = sc; - this.scw = new SpoofCheckerWrapper(sc); - this.base = build(null); - } - - /** - * Create a new WholeScriptConfusables instance and build the internal data structure. - * - * @param sc The SpoofChecker containing the data to use when building the data structure. - * @param ignorables A set of characters to omit from the internal data structure. Useful for - * ignoring specific entries from the confusables table. - */ - public GenerateWholeScriptConfusables(SpoofChecker sc, UnicodeSet ignorables) { - this.sc = sc; - this.scw = new SpoofCheckerWrapper(sc); - this.base = build(ignorables); - } - - /** - * Builds a confusable graph based on the provided SpoofData and allowed chars. - */ - private ConfusableGraphNode build(UnicodeSet ignorables) { - ConfusableGraphNode base = new ConfusableGraphNode(); - StringBuilder sb = new StringBuilder(); - UnicodeSet allowedChars = sc.getAllowedChars(); - - // Loop over all entries in the SpoofData - for (int i = 0; i < scw.spoofData.length(); i++) { - int codePoint = scw.spoofData.codePointAt(i); - - // Ignore entries that are not in the allowedChars set. - if (!allowedChars.contains(codePoint)) { - continue; - } - - // Ignore certain code points by the user's request - if (ignorables != null && ignorables.contains(codePoint)) { - continue; - } - - // Add this entry to the data structure. - sb.setLength(0); - scw.spoofData.appendValueTo(i, sb); - ConfusableGraphNode node = base; - for (int offset = 0; offset < sb.length();) { - int skeletonCodePoint = sb.codePointAt(offset); - offset += Character.charCount(skeletonCodePoint); - node = node.addAndGetTransition(skeletonCodePoint); - } - node.addCompletion(codePoint); - } - return base; - } - - /** - * Computes examples of whole script confusables for the given input string. 
Appends the example - * strings to the first output collection, and the corresponding set of scripts to the second - * output collection. - * - * @param input The string for which to compute whole script confusables. - * @param resultStrings The collection to which to append the whole script confusables. - * @param resultScripts The collection to which to append the sets of scripts corresponding to the - * whole script confusables. - */ - public void getWholeScriptConfusables(CharSequence input, Collection resultStrings, - Collection resultScripts) { - // Compute the skeleton string. - String skeleton = sc.getSkeleton(input); - - // Allocate space used during traversal. scriptsByIndex might allocate more space than - // needed since we need to allocate only for the number of code points. - ConfusableGraphNode[] currentNodes = new ConfusableGraphNode[MAX_CONFUSABLE_STRING_LENGTH]; - WholeScriptIterationSet[] scriptsByIndex = new WholeScriptIterationSet[skeleton.length() + 1]; - BitSet temp = SpoofCheckerWrapper.ScriptSet_new(); - - // For each code point in the skeleton string: - int index = 0; - for (int utf16Offset = 0; utf16Offset < skeleton.length(); index++) { - int skeletonCodePoint = skeleton.codePointAt(utf16Offset); - utf16Offset += Character.charCount(skeletonCodePoint); - - // Push the graph nodes forward. - // Will forget the node at the end of the array, if such a node exists. - for (int i = currentNodes.length - 1; i > 0; i--) { - ConfusableGraphNode prev = currentNodes[i - 1]; - currentNodes[i] = (prev == null ? null : prev.getTransition(skeletonCodePoint)); - } - - // Grab the new node for this skeleton char. - currentNodes[0] = base.getTransition(skeletonCodePoint); - - WholeScriptIterationSet next = new WholeScriptIterationSet(); - - // Perform transition using the skeleton character. 
- temp.clear(); - scw.getAugmentedScriptSet(skeletonCodePoint, temp); - next.addIntersectionsOf(scriptsByIndex[index], temp, skeletonCodePoint); - - // Perform transition using entries from the confusables table. - for (int i = 0; i < currentNodes.length; i++) { - ConfusableGraphNode node = currentNodes[i]; - if (node == null) { - continue; - } - UnicodeSet completions = node.getCompletions(); - if (completions == null) { - continue; + final ConfusableGraphNode base; + final SpoofChecker sc; + final SpoofCheckerWrapper scw; + + // as of Unicode 9, the longest confusable skeleton is 18 characters + static final int MAX_CONFUSABLE_STRING_LENGTH = 18; + + // -------------------------- + // Start demo code + // -------------------------- + + public static void main(String[] args) { + + // Ignore these four code points that have low-quality confusable skeletons and produce + // poor whole-script confusables + UnicodeSet ignorables = new UnicodeSet(); + ignorables.add('т'); + ignorables.add('ц'); + ignorables.add('τ'); + ignorables.add('к'); + + @SuppressWarnings("deprecation") + SpoofChecker sc = + new SpoofChecker.Builder() + .setAllowedChars(SpoofChecker.RECOMMENDED) + .setChecks(SpoofChecker.CONFUSABLE) + .build(); + GenerateWholeScriptConfusables wsc = new GenerateWholeScriptConfusables(sc, ignorables); + // Loop over each argument as a word + List resultStrings = new ArrayList(); + List resultScripts = new ArrayList(); + for (String word : args) { + resultStrings.clear(); + resultScripts.clear(); + + // Compute the whole script confusables + wsc.getWholeScriptConfusables(word, resultStrings, resultScripts); + + // Print out the matches to the console + String matches = + String.join( + ", ", + IntStream.range(0, resultStrings.size()) + .mapToObj( + i -> + (resultStrings.get(i) + + " (" + + resultScripts.get(i) + + ")")) + .toArray(String[]::new)); + System.out.println(word + ": " + matches); } - for (int j = 0; j < completions.size(); j++) { - int 
completionCodePoint = completions.charAt(j); - temp.clear(); - scw.getAugmentedScriptSet(completionCodePoint, temp); - next.addIntersectionsOf(scriptsByIndex[index - i], temp, completionCodePoint); - } - } - - scriptsByIndex[index + 1] = next; } - // Compute the scripts of the input string. - temp.clear(); - scw.getResolvedScriptSet(input, temp); - - // Add the possible whole scripts to the destination and return. - scriptsByIndex[index].extractNoOverlap(temp, resultScripts, resultStrings); - } - - /** - * Data structure used when generating whole script confusables. Contains a set of script sets, - * where each script set is mapped to a corresponding string. - */ - private static class WholeScriptIterationSet { - private Map map = new HashMap(); + // -------------------------- + // Start class definition + // -------------------------- /** - * Adds all BitSets that result from intersecting the entries in other1 with other2. If there - * are N entries in other1, this method will add between 0 and N entries to this instance. For - * each new entry, records the code point sequence consisting of the previous code point - * sequence from other1 (if available) and the new codePoint passed to this function. If other1 - * is null, this method behaves the same as add(other2, "", codePoint). + * Create a new WholeScriptConfusables instance and build the internal data structure. * - * For example, if: other1 = { { A }, { A, B }, { C } } other2 = { B, C } - * - * then calling this method will add the following sets: { { B }, { C } } + * @param sc The SpoofChecker containing the data to use when building the data structure. 
*/ - public void addIntersectionsOf(WholeScriptIterationSet other1, BitSet other2, int codePoint) { - // Trivial case - if (other1 == null) { - add(other2, "", codePoint); - return; - } - - // Compute and add intersections - BitSet temp = SpoofCheckerWrapper.ScriptSet_new(); - for (Entry entry : other1.map.entrySet()) { - temp.clear(); - temp.or(entry.getKey()); - temp.and(other2); - add(temp, entry.getValue(), codePoint); - } + public GenerateWholeScriptConfusables(SpoofChecker sc) { + this.sc = sc; + this.scw = new SpoofCheckerWrapper(sc); + this.base = build(null); } /** - * Adds the given BitSet to this set if it doesn't already exist. Never keeps a reference to the - * given BitSet. If the BitSet is added, constructs a new string consisting of the baseString - * plus the given codePoint, and associates it with the BitSet. + * Create a new WholeScriptConfusables instance and build the internal data structure. + * + * @param sc The SpoofChecker containing the data to use when building the data structure. + * @param ignorables A set of characters to omit from the internal data structure. Useful for + * ignoring specific entries from the confusables table. */ - public void add(BitSet ss, String baseString, int codePoint) { - if (!map.containsKey(ss)) { - BitSet copy = (BitSet) ss.clone(); - StringBuilder sb = new StringBuilder(baseString); - sb.appendCodePoint(codePoint); - map.put(copy, sb.toString()); - } + public GenerateWholeScriptConfusables(SpoofChecker sc, UnicodeSet ignorables) { + this.sc = sc; + this.scw = new SpoofCheckerWrapper(sc); + this.base = build(ignorables); + } + + /** Builds a confusable graph based on the provided SpoofData and allowed chars. 
*/ + private ConfusableGraphNode build(UnicodeSet ignorables) { + ConfusableGraphNode base = new ConfusableGraphNode(); + StringBuilder sb = new StringBuilder(); + UnicodeSet allowedChars = sc.getAllowedChars(); + + // Loop over all entries in the SpoofData + for (int i = 0; i < scw.spoofData.length(); i++) { + int codePoint = scw.spoofData.codePointAt(i); + + // Ignore entries that are not in the allowedChars set. + if (!allowedChars.contains(codePoint)) { + continue; + } + + // Ignore certain code points by the user's request + if (ignorables != null && ignorables.contains(codePoint)) { + continue; + } + + // Add this entry to the data structure. + sb.setLength(0); + scw.spoofData.appendValueTo(i, sb); + ConfusableGraphNode node = base; + for (int offset = 0; offset < sb.length(); ) { + int skeletonCodePoint = sb.codePointAt(offset); + offset += Character.charCount(skeletonCodePoint); + node = node.addAndGetTransition(skeletonCodePoint); + } + node.addCompletion(codePoint); + } + return base; } /** - * Identifies entries in this instance that have no overlap with the query, and unions all of - * those entries into the destination. + * Computes examples of whole script confusables for the given input string. Appends the example + * strings to the first output collection, and the corresponding set of scripts to the second + * output collection. * - * For example, if this set contained: { { A }, { A, B }, { B, C }, { D } } and query was { A, B - * } then { D } would be unioned into destination, as D is the only script contained in a BitSet - * that has an empty intersection with the query. - * - * For all entries added to destination, adds an example string to the given Collection. + * @param input The string for which to compute whole script confusables. + * @param resultStrings The collection to which to append the whole script confusables. 
+ * @param resultScripts The collection to which to append the sets of scripts corresponding to + * the whole script confusables. */ - public void extractNoOverlap(BitSet query, Collection resultScripts, - Collection samples) { - BitSet temp = SpoofCheckerWrapper.ScriptSet_new(); - for (Entry entry : map.entrySet()) { - BitSet ss = entry.getKey(); - if (ss.isEmpty()) { - continue; + public void getWholeScriptConfusables( + CharSequence input, + Collection resultStrings, + Collection resultScripts) { + // Compute the skeleton string. + String skeleton = sc.getSkeleton(input); + + // Allocate space used during traversal. scriptsByIndex might allocate more space than + // needed since we need to allocate only for the number of code points. + ConfusableGraphNode[] currentNodes = new ConfusableGraphNode[MAX_CONFUSABLE_STRING_LENGTH]; + WholeScriptIterationSet[] scriptsByIndex = + new WholeScriptIterationSet[skeleton.length() + 1]; + BitSet temp = SpoofCheckerWrapper.ScriptSet_new(); + + // For each code point in the skeleton string: + int index = 0; + for (int utf16Offset = 0; utf16Offset < skeleton.length(); index++) { + int skeletonCodePoint = skeleton.codePointAt(utf16Offset); + utf16Offset += Character.charCount(skeletonCodePoint); + + // Push the graph nodes forward. + // Will forget the node at the end of the array, if such a node exists. + for (int i = currentNodes.length - 1; i > 0; i--) { + ConfusableGraphNode prev = currentNodes[i - 1]; + currentNodes[i] = (prev == null ? null : prev.getTransition(skeletonCodePoint)); + } + + // Grab the new node for this skeleton char. + currentNodes[0] = base.getTransition(skeletonCodePoint); + + WholeScriptIterationSet next = new WholeScriptIterationSet(); + + // Perform transition using the skeleton character. + temp.clear(); + scw.getAugmentedScriptSet(skeletonCodePoint, temp); + next.addIntersectionsOf(scriptsByIndex[index], temp, skeletonCodePoint); + + // Perform transition using entries from the confusables table. 
+ for (int i = 0; i < currentNodes.length; i++) { + ConfusableGraphNode node = currentNodes[i]; + if (node == null) { + continue; + } + UnicodeSet completions = node.getCompletions(); + if (completions == null) { + continue; + } + for (int j = 0; j < completions.size(); j++) { + int completionCodePoint = completions.charAt(j); + temp.clear(); + scw.getAugmentedScriptSet(completionCodePoint, temp); + next.addIntersectionsOf(scriptsByIndex[index - i], temp, completionCodePoint); + } + } + + scriptsByIndex[index + 1] = next; } + + // Compute the scripts of the input string. temp.clear(); - temp.or(ss); - temp.and(query); - if (temp.isEmpty()) { - resultScripts.add((BitSet) ss.clone()); - samples.add(entry.getValue()); - } - } - } + scw.getResolvedScriptSet(input, temp); - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append(" ss : map.entrySet()) { - SpoofCheckerWrapper.ScriptSet_appendStringTo(ss.getKey(), sb); - sb.append(" "); - } - sb.append("}>"); - return sb.toString(); - } - } - - /** - * Data structure used for efficient lookup of confusable skeletons. Each node is associated with - * an entry from a confusable skeleton. Each node has some set of "transitions" (edges from this - * node to the next node in a skeleton) and "completions" (edges from this node back to the base - * node). The "transitions" edges each have a prototype character associated with them, and the - * "completions" edges each have a codepoint associated with them. - */ - private static class ConfusableGraphNode { - private Map edges; - private UnicodeSet completions; - - public void addCompletion(int confusableCodePoint) { - if (completions == null) { - completions = new UnicodeSet(); - } - completions.add(confusableCodePoint); + // Add the possible whole scripts to the destination and return. 
+ scriptsByIndex[index].extractNoOverlap(temp, resultScripts, resultStrings); } - public UnicodeSet getCompletions() { - return completions; - } + /** + * Data structure used when generating whole script confusables. Contains a set of script sets, + * where each script set is mapped to a corresponding string. + */ + private static class WholeScriptIterationSet { + private Map map = new HashMap(); + + /** + * Adds all BitSets that result from intersecting the entries in other1 with other2. If + * there are N entries in other1, this method will add between 0 and N entries to this + * instance. For each new entry, records the code point sequence consisting of the previous + * code point sequence from other1 (if available) and the new codePoint passed to this + * function. If other1 is null, this method behaves the same as add(other2, "", codePoint). + * + *

For example, if: other1 = { { A }, { A, B }, { C } } other2 = { B, C } + * + *

then calling this method will add the following sets: { { B }, { C } } + */ + public void addIntersectionsOf( + WholeScriptIterationSet other1, BitSet other2, int codePoint) { + // Trivial case + if (other1 == null) { + add(other2, "", codePoint); + return; + } + + // Compute and add intersections + BitSet temp = SpoofCheckerWrapper.ScriptSet_new(); + for (Entry entry : other1.map.entrySet()) { + temp.clear(); + temp.or(entry.getKey()); + temp.and(other2); + add(temp, entry.getValue(), codePoint); + } + } - public ConfusableGraphNode addAndGetTransition(int skeletonCodePoint) { - if (edges == null) { - edges = new HashMap(); - } - ConfusableGraphNode destination = edges.get(skeletonCodePoint); - if (destination == null) { - destination = new ConfusableGraphNode(); - edges.put(skeletonCodePoint, destination); - } - return destination; - } + /** + * Adds the given BitSet to this set if it doesn't already exist. Never keeps a reference to + * the given BitSet. If the BitSet is added, constructs a new string consisting of the + * baseString plus the given codePoint, and associates it with the BitSet. + */ + public void add(BitSet ss, String baseString, int codePoint) { + if (!map.containsKey(ss)) { + BitSet copy = (BitSet) ss.clone(); + StringBuilder sb = new StringBuilder(baseString); + sb.appendCodePoint(codePoint); + map.put(copy, sb.toString()); + } + } - public ConfusableGraphNode getTransition(int skeletonCodePoint) { - if (edges == null) { - return null; - } - return edges.get(skeletonCodePoint); - } - } - - /** - * A wrapper around SpoofChecker enabling access to various private methods inside. Exposes a - * public API similar to that of the original SpoofChecker and SpoofData. - * - * Uses reflection to access private members, meaning that this code might break from an ICU - * update and need to be updated. 
- */ - static class SpoofCheckerWrapper { - SpoofChecker sc; - SpoofData spoofData; - - // Initialize some useful constants - static Class SpoofChecker, SpoofData, ScriptSet; - static { - try { - SpoofChecker = Class.forName("com.ibm.icu.text.SpoofChecker"); - SpoofData = Class.forName("com.ibm.icu.text.SpoofChecker$SpoofData"); - ScriptSet = Class.forName("com.ibm.icu.text.SpoofChecker$ScriptSet"); - } catch (ClassNotFoundException | SecurityException e) { - e.printStackTrace(); - } - } + /** + * Identifies entries in this instance that have no overlap with the query, and unions all + * of those entries into the destination. + * + *

For example, if this set contained: { { A }, { A, B }, { B, C }, { D } } and query was + * { A, B } then { D } would be unioned into destination, as D is the only script contained + * in a BitSet that has an empty intersection with the query. + * + *

For all entries added to destination, adds an example string to the given Collection. + */ + public void extractNoOverlap( + BitSet query, Collection resultScripts, Collection samples) { + BitSet temp = SpoofCheckerWrapper.ScriptSet_new(); + for (Entry entry : map.entrySet()) { + BitSet ss = entry.getKey(); + if (ss.isEmpty()) { + continue; + } + temp.clear(); + temp.or(ss); + temp.and(query); + if (temp.isEmpty()) { + resultScripts.add((BitSet) ss.clone()); + samples.add(entry.getValue()); + } + } + } - public SpoofCheckerWrapper(SpoofChecker sc) { - this.sc = sc; - this.spoofData = new SpoofData(); + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(" ss : map.entrySet()) { + SpoofCheckerWrapper.ScriptSet_appendStringTo(ss.getKey(), sb); + sb.append(" "); + } + sb.append("}>"); + return sb.toString(); + } } - private Object invokeMethod(String methodName, Class[] parameterTypes, Object[] parameters) { - try { - // Get the method - Method method = SpoofChecker.getDeclaredMethod(methodName, parameterTypes); - method.setAccessible(true); + /** + * Data structure used for efficient lookup of confusable skeletons. Each node is associated + * with an entry from a confusable skeleton. Each node has some set of "transitions" (edges from + * this node to the next node in a skeleton) and "completions" (edges from this node back to the + * base node). The "transitions" edges each have a prototype character associated with them, and + * the "completions" edges each have a codepoint associated with them. 
+ */ + private static class ConfusableGraphNode { + private Map edges; + private UnicodeSet completions; + + public void addCompletion(int confusableCodePoint) { + if (completions == null) { + completions = new UnicodeSet(); + } + completions.add(confusableCodePoint); + } - // Call the method on the SpoofChecker object - return method.invoke(sc, parameters); + public UnicodeSet getCompletions() { + return completions; + } - } catch (Exception e) { - e.printStackTrace(); - return null; - } - } + public ConfusableGraphNode addAndGetTransition(int skeletonCodePoint) { + if (edges == null) { + edges = new HashMap(); + } + ConfusableGraphNode destination = edges.get(skeletonCodePoint); + if (destination == null) { + destination = new ConfusableGraphNode(); + edges.put(skeletonCodePoint, destination); + } + return destination; + } - private Object invokeSpoofDataMethod(String methodName, Class[] parameterTypes, - Object[] parameters) { - try { - // Get the method - Method method = SpoofData.getDeclaredMethod(methodName, parameterTypes); - method.setAccessible(true); - - // Get the SpoofData object - Field field = SpoofChecker.class.getDeclaredField("fSpoofData"); - field.setAccessible(true); - Object spoofData = field.get(sc); - - // Call the method on the SpoofData object - return method.invoke(spoofData, parameters); - - } catch (Exception e) { - e.printStackTrace(); - return null; - } + public ConfusableGraphNode getTransition(int skeletonCodePoint) { + if (edges == null) { + return null; + } + return edges.get(skeletonCodePoint); + } } - private static Object invokeScriptSetMethod(BitSet ss, String methodName, - Class[] parameterTypes, Object[] parameters) { - try { - // Get the method - Method method = ScriptSet.getDeclaredMethod(methodName, parameterTypes); - method.setAccessible(true); + /** + * A wrapper around SpoofChecker enabling access to various private methods inside. Exposes a + * public API similar to that of the original SpoofChecker and SpoofData. 
+ * + *

Uses reflection to access private members, meaning that this code might break from an ICU + * update and need to be updated. + */ + static class SpoofCheckerWrapper { + SpoofChecker sc; + SpoofData spoofData; + + // Initialize some useful constants + static Class SpoofChecker, SpoofData, ScriptSet; + + static { + try { + SpoofChecker = Class.forName("com.ibm.icu.text.SpoofChecker"); + SpoofData = Class.forName("com.ibm.icu.text.SpoofChecker$SpoofData"); + ScriptSet = Class.forName("com.ibm.icu.text.SpoofChecker$ScriptSet"); + } catch (ClassNotFoundException | SecurityException e) { + e.printStackTrace(); + } + } - // Call the method on the ScriptSet object - return method.invoke(ss, parameters); + public SpoofCheckerWrapper(SpoofChecker sc) { + this.sc = sc; + this.spoofData = new SpoofData(); + } - } catch (Exception e) { - e.printStackTrace(); - return null; - } - } + private Object invokeMethod( + String methodName, Class[] parameterTypes, Object[] parameters) { + try { + // Get the method + Method method = SpoofChecker.getDeclaredMethod(methodName, parameterTypes); + method.setAccessible(true); - // ----------------------------------------------------- - // Simulated interface to SpoofChecker private methods - // ----------------------------------------------------- - - public static BitSet ScriptSet_new() { - try { - Constructor constructor = ScriptSet.getDeclaredConstructor(new Class[] {}); - constructor.setAccessible(true); - return (BitSet) constructor.newInstance(); - } catch (Exception e) { - e.printStackTrace(); - return null; - } - } + // Call the method on the SpoofChecker object + return method.invoke(sc, parameters); - public static void ScriptSet_appendStringTo(BitSet ss, StringBuilder sb) { - invokeScriptSetMethod(ss, "appendStringTo", new Class[] {StringBuilder.class}, - new Object[] {sb}); - } + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } - public void getAugmentedScriptSet(int codePoint, BitSet result) { - 
invokeMethod("getAugmentedScriptSet", new Class[] {Integer.TYPE, ScriptSet}, - new Object[] {codePoint, result}); - } + private Object invokeSpoofDataMethod( + String methodName, Class[] parameterTypes, Object[] parameters) { + try { + // Get the method + Method method = SpoofData.getDeclaredMethod(methodName, parameterTypes); + method.setAccessible(true); + + // Get the SpoofData object + Field field = SpoofChecker.class.getDeclaredField("fSpoofData"); + field.setAccessible(true); + Object spoofData = field.get(sc); + + // Call the method on the SpoofData object + return method.invoke(spoofData, parameters); + + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } - public void getResolvedScriptSet(CharSequence input, BitSet result) { - // getAugmentedScriptSet/getResolvedScriptSet take a ScriptSet instead of a BitSet - invokeMethod("getResolvedScriptSet", new Class[] {CharSequence.class, ScriptSet}, - new Object[] {input, result}); - } + private static Object invokeScriptSetMethod( + BitSet ss, String methodName, Class[] parameterTypes, Object[] parameters) { + try { + // Get the method + Method method = ScriptSet.getDeclaredMethod(methodName, parameterTypes); + method.setAccessible(true); + + // Call the method on the ScriptSet object + return method.invoke(ss, parameters); - class SpoofData { - public int length() { - return (int) invokeSpoofDataMethod("length", new Class[] {}, new Object[] {}); - } + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } - public int codePointAt(int index) { - return (int) invokeSpoofDataMethod("codePointAt", new Class[] {Integer.TYPE}, - new Object[] {index}); - } + // ----------------------------------------------------- + // Simulated interface to SpoofChecker private methods + // ----------------------------------------------------- + + public static BitSet ScriptSet_new() { + try { + Constructor constructor = ScriptSet.getDeclaredConstructor(new Class[] {}); + 
constructor.setAccessible(true); + return (BitSet) constructor.newInstance(); + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } - public void appendValueTo(int index, StringBuilder sb) { - invokeSpoofDataMethod("appendValueTo", new Class[] {Integer.TYPE, StringBuilder.class}, - new Object[] {index, sb}); - } + public static void ScriptSet_appendStringTo(BitSet ss, StringBuilder sb) { + invokeScriptSetMethod( + ss, "appendStringTo", new Class[] {StringBuilder.class}, new Object[] {sb}); + } + + public void getAugmentedScriptSet(int codePoint, BitSet result) { + invokeMethod( + "getAugmentedScriptSet", + new Class[] {Integer.TYPE, ScriptSet}, + new Object[] {codePoint, result}); + } + + public void getResolvedScriptSet(CharSequence input, BitSet result) { + // getAugmentedScriptSet/getResolvedScriptSet take a ScriptSet instead of a BitSet + invokeMethod( + "getResolvedScriptSet", + new Class[] {CharSequence.class, ScriptSet}, + new Object[] {input, result}); + } + + class SpoofData { + public int length() { + return (int) invokeSpoofDataMethod("length", new Class[] {}, new Object[] {}); + } + + public int codePointAt(int index) { + return (int) + invokeSpoofDataMethod( + "codePointAt", new Class[] {Integer.TYPE}, new Object[] {index}); + } + + public void appendValueTo(int index, StringBuilder sb) { + invokeSpoofDataMethod( + "appendValueTo", + new Class[] {Integer.TYPE, StringBuilder.class}, + new Object[] {index, sb}); + } + } } - } } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/GetTypology.java b/unicodetools/src/main/java/org/unicode/text/UCD/GetTypology.java index 102eafcd7..82ba1a083 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/GetTypology.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/GetTypology.java @@ -1,5 +1,8 @@ package org.unicode.text.UCD; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; import 
java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; @@ -9,27 +12,23 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.UnicodeSetPrettyPrinter; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; - public class GetTypology { public static void main(String[] args) throws IOException { final UnicodeMap data2 = new UnicodeMap(); - final Map data = new TreeMap(); - final Map> uniqueUnordered = new HashMap(); + final Map data = new TreeMap(); + final Map> uniqueUnordered = new HashMap(); - final Map> labelToPrefixes = new TreeMap(); - final Map> toOriginals = new TreeMap(); + final Map> labelToPrefixes = new TreeMap(); + final Map> toOriginals = new TreeMap(); final String filename = "U52M09XXXX.lst"; - final BufferedReader br = FileUtilities.openUTF8Reader(Settings.UnicodeTools.UCD_DIR, filename); + final BufferedReader br = + FileUtilities.openUTF8Reader(Settings.UnicodeTools.UCD_DIR, filename); final StringBuilder name = new StringBuilder(); String nameString = null; @@ -39,7 +38,7 @@ public static void main(String[] args) throws IOException { break; } final String[] parts = line.split("\t"); - final int cp = Integer.parseInt(parts[0],16); + final int cp = Integer.parseInt(parts[0], 16); name.setLength(0); for (int i = 0; i < parts.length; ++i) { @@ -58,7 +57,7 @@ public static void main(String[] args) throws IOException { continue; } - part = part.substring(1, part.length()-1); + part = part.substring(1, part.length() - 1); final String original = part; part = part.replaceAll("[^\\-0-9A-Za-z]", "_"); Set canonicalized = toOriginals.get(part); @@ -90,7 +89,8 @@ public static void main(String[] args) throws IOException { } name.append(part); nameString = name.toString(); - final 
TreeSet nameSet = new TreeSet(Arrays.asList(nameString.toLowerCase().split("\\|"))); + final TreeSet nameSet = + new TreeSet(Arrays.asList(nameString.toLowerCase().split("\\|"))); final String unorderedName = CollectionUtilities.join(nameSet, "|").toLowerCase(); Set names = uniqueUnordered.get(unorderedName); if (names == null) { @@ -102,7 +102,9 @@ public static void main(String[] args) throws IOException { } br.close(); - final PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "/classification", "classification_analysis.txt"); + final PrintWriter out = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_DIR + "/classification", "classification_analysis.txt"); out.println("# Source:\t" + filename); int count; out.println(); @@ -122,7 +124,6 @@ public static void main(String[] args) throws IOException { out.println(); out.println("# Total:\t" + count); - count = 0; out.println(); out.println("@ Problems with label format"); @@ -141,7 +142,8 @@ public static void main(String[] args) throws IOException { out.println(); out.println("@ Labels with multiple prefixes"); - out.println("# These need to be examined to ensure consistent semantics of labels with the same prefix."); + out.println( + "# These need to be examined to ensure consistent semantics of labels with the same prefix."); out.println("# Format:"); out.println("# label ; count; characters"); out.println(); @@ -183,10 +185,14 @@ public static void main(String[] args) throws IOException { out.close(); } - static final UnicodeSet TO_QUOTE = new UnicodeSet("[[:z:][:me:][:mn:][:di:][:c:]-[\u0020]]").freeze(); + static final UnicodeSet TO_QUOTE = + new UnicodeSet("[[:z:][:me:][:mn:][:di:][:c:]-[\u0020]]").freeze(); static final UnicodeSetPrettyPrinter pp = new UnicodeSetPrettyPrinter().setToQuote(TO_QUOTE); - private static void showPrefixes(Map> labelToPrefixes, PrintWriter out, boolean singles) { + private static void showPrefixes( + Map> labelToPrefixes, + PrintWriter out, + boolean 
singles) { int count = 0; for (final String label : labelToPrefixes.keySet()) { final Map prefixes = labelToPrefixes.get(label); @@ -195,7 +201,14 @@ private static void showPrefixes(Map> labelToPre } for (final String prefix : prefixes.keySet()) { final UnicodeSet samples = prefixes.get(prefix); - out.println(label + " ; \t" + prefix + " ;\t" + samples.size() + " ;\t" + showUnicodeSet(samples)); + out.println( + label + + " ; \t" + + prefix + + " ;\t" + + samples.size() + + " ;\t" + + showUnicodeSet(samples)); count++; } } @@ -206,7 +219,7 @@ private static void showPrefixes(Map> labelToPre private static String showUnicodeSet(UnicodeSet samples) { String result = pp.format(samples); if (result.length() > 120) { - result = result.substring(0,120) + "…"; + result = result.substring(0, 120) + "…"; } return result; } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/IANANames.java b/unicodetools/src/main/java/org/unicode/text/UCD/IANANames.java index d05f2acdd..760860098 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/IANANames.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/IANANames.java @@ -1,16 +1,17 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/IANANames.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/IANANames.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.BufferedReader; import java.io.IOException; import java.util.HashMap; @@ -18,14 +19,9 @@ import java.util.Locale; import java.util.Map; import java.util.TreeMap; - import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - public class IANANames implements UCD_Types { private final Map aliasToBase = new TreeMap(); private final Map aliasToComment = new TreeMap(); @@ -54,10 +50,19 @@ public static void testSensitivity() throws IOException { final String base = iNames.getBase(alias); final String otherBase = iNames.getBase(other); if (!base.equals(otherBase)) { - System.out.println("Collision between: " + alias + " (" + base + ") and " - + other + " (" + otherBase + ")"); + System.out.println( + "Collision between: " + + alias + + " (" + + base + + ") and " + + other + + " (" + + otherBase + + ")"); } else { - System.out.println("Alias Variant: " + alias + " and " + other + " (" + base + ")"); + System.out.println( + "Alias Variant: " + alias + " and " + other + " (" + base + ")"); } } else { m.put(skeleton, alias); @@ -69,12 +74,15 @@ public static void testSensitivity() throws IOException { final UnicodeSetIterator usi = new UnicodeSetIterator(removed); while (usi.next()) { final char c = (char) usi.codepoint; // safe, can't be supplementary - System.out.println("0x" + usi.codepoint + "\t'" + c + "'\t" + UCharacter.getName(usi.codepoint)); + System.out.println( + "0x" + usi.codepoint + "\t'" + c + "'\t" + UCharacter.getName(usi.codepoint)); } } public IANANames() throws IOException { - final BufferedReader in = Utility.openReadFile(Settings.UnicodeTools.DATA_DIR + 
"IANA/character-sets.txt", Utility.LATIN1); + final BufferedReader in = + Utility.openReadFile( + Settings.UnicodeTools.DATA_DIR + "IANA/character-sets.txt", Utility.LATIN1); try { boolean atStart = true; String lastName = ""; @@ -97,7 +105,8 @@ public IANANames() throws IOException { if (line.startsWith("Name:") || line.startsWith("Alias:")) { lastName = add(line, lastName, counter); - } else if (line.startsWith("Source:") || line.startsWith("MIBenum:") + } else if (line.startsWith("Source:") + || line.startsWith("MIBenum:") || line.startsWith(" ")) { continue; } else if (line.equals("REFERENCES")) { @@ -117,7 +126,7 @@ private String add(String line, String baseName, int counter) { if (pos < 0) { throw new IllegalArgumentException("Bad line: " + counter + " '" + line + "'"); } - String alias = line.substring(pos+2).trim(); + String alias = line.substring(pos + 2).trim(); // get comment String comment = null; @@ -138,13 +147,28 @@ private String add(String line, String baseName, int counter) { if (baseName.equals(alias)) { System.out.println(); } - System.out.println("Adding " + alias + "\t=> " + baseName + (comment != null ? "\t(" + comment + ")" : "")); + System.out.println( + "Adding " + + alias + + "\t=> " + + baseName + + (comment != null ? 
"\t(" + comment + ")" : "")); } // check if it is stored already final String oldbaseName = (String) aliasToBase.get(alias); if (oldbaseName != null) { - System.out.println("Duplicate alias (" + alias + ", " + oldbaseName + ", " + baseName + "): " - + counter + " '" + line + "'"); + System.out.println( + "Duplicate alias (" + + alias + + ", " + + oldbaseName + + ", " + + baseName + + "): " + + counter + + " '" + + line + + "'"); } aliasToBase.put(alias, baseName); if (comment != null) { @@ -159,9 +183,7 @@ public Iterator getIterator() { return aliasToBase.keySet().iterator(); } - /** - * Returns the name for this alias, or "" if there is none - */ + /** Returns the name for this alias, or "" if there is none */ public String getBase(String alias) { return (String) aliasToBase.get(alias); } @@ -192,7 +214,7 @@ public static String removeNonAlphanumeric(String s, UnicodeSet removed) { } } } - //if (removedZero) System.out.println("Removed 0 from " + s + " => " + result); + // if (removedZero) System.out.println("Removed 0 from " + s + " => " + result); return result.toString(); } } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/IDNTester.java b/unicodetools/src/main/java/org/unicode/text/UCD/IDNTester.java index a3fd5440c..ec2e0780a 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/IDNTester.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/IDNTester.java @@ -1,19 +1,17 @@ package org.unicode.text.UCD; -import java.io.IOException; -import java.io.PrintWriter; - -import org.unicode.cldr.draft.FileUtilities; -import org.unicode.cldr.util.UnicodeSetPrettyPrinter; -import org.unicode.text.utility.Settings; -import org.unicode.text.utility.Utility; - import com.ibm.icu.text.Collator; import com.ibm.icu.text.IDNA; import com.ibm.icu.text.StringPrepParseException; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.ULocale; +import java.io.IOException; +import java.io.PrintWriter; +import 
org.unicode.cldr.draft.FileUtilities; +import org.unicode.cldr.util.UnicodeSetPrettyPrinter; +import org.unicode.text.utility.Settings; +import org.unicode.text.utility.Utility; public class IDNTester { static StringBuffer inbuffer = new StringBuffer(); @@ -26,7 +24,11 @@ public class IDNTester { static UnicodeSet IDOutput32 = new UnicodeSet(); static UnicodeSet IDInputOnly50 = new UnicodeSet(); static UnicodeSet IDOutput50 = new UnicodeSet(); - static UnicodeSetPrettyPrinter pp = new UnicodeSetPrettyPrinter().setOrdering(Collator.getInstance(ULocale.ROOT)).setSpaceComparator(Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY)); + static UnicodeSetPrettyPrinter pp = + new UnicodeSetPrettyPrinter() + .setOrdering(Collator.getInstance(ULocale.ROOT)) + .setSpaceComparator( + Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY)); static PrintWriter pw; public static void main(String[] args) throws IOException { @@ -74,13 +76,13 @@ private static void initialize() { final UnicodeSet oddballs = new UnicodeSet("[\u034F \u180B-\u180D \uFE00-\uFE0F _]"); final UCD U32 = UCD.make("3.2.0"); final Normalizer nfkc32 = new Normalizer(UCD_Types.NFKC, "3.2.0"); - final UCDProperty xid32 = DerivedProperty.make(UCD_Types.Mod_ID_Continue_NO_Cf,U32); + final UCDProperty xid32 = DerivedProperty.make(UCD_Types.Mod_ID_Continue_NO_Cf, U32); final UnicodeSet IDInput32 = xid32.getSet(); IDInput32.add('-').removeAll(oddballs); final UCD U50 = UCD.make("5.0.0"); final Normalizer nfkc50 = new Normalizer(UCD_Types.NFKC, "5.0.0"); - final UCDProperty xid50 = DerivedProperty.make(UCD_Types.Mod_ID_Continue_NO_Cf,U50); + final UCDProperty xid50 = DerivedProperty.make(UCD_Types.Mod_ID_Continue_NO_Cf, U50); final UnicodeSet IDInput50 = xid50.getSet(); IDInput50.add('-').removeAll(oddballs); @@ -105,7 +107,8 @@ private static void initialize() { initialized = true; } - private static void splitSet(UnicodeSet inputOnlySet, UnicodeSet outputSet, UCD ucd, Normalizer nfkc, int 
i) { + private static void splitSet( + UnicodeSet inputOnlySet, UnicodeSet outputSet, UCD ucd, Normalizer nfkc, int i) { if (i < 0x7F) { outputSet.add(i); return; @@ -125,20 +128,18 @@ private static void splitSet(UnicodeSet inputOnlySet, UnicodeSet outputSet, UCD inputOnlySet.add(i); } - static public int getIDNAType(int cp) { + public static int getIDNAType(int cp) { if (cp == '-') { return OK; } inbuffer.setLength(0); UTF16.append(inbuffer, cp); try { - intermediate = IDNA.convertToASCII(inbuffer, - IDNA.DEFAULT); // USE_STD3_RULES + intermediate = IDNA.convertToASCII(inbuffer, IDNA.DEFAULT); // USE_STD3_RULES if (intermediate.length() == 0) { return DELETED; } - outbuffer = IDNA.convertToUnicode(intermediate, - IDNA.USE_STD3_RULES); + outbuffer = IDNA.convertToUnicode(intermediate, IDNA.USE_STD3_RULES); } catch (final StringPrepParseException e) { return ILLEGAL; } catch (final Exception e) { @@ -150,5 +151,4 @@ static public int getIDNAType(int cp) { } return OK; } - } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/IdentifierInfo.java b/unicodetools/src/main/java/org/unicode/text/UCD/IdentifierInfo.java index 501bf8b92..3673ae0b2 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/IdentifierInfo.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/IdentifierInfo.java @@ -1,5 +1,15 @@ package org.unicode.text.UCD; +import com.google.common.base.Splitter; +import com.google.common.collect.HashMultimap; +import com.google.common.collect.Multimap; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.locale.XCldrStub.ImmutableSet; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.ICUException; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; @@ -14,14 +24,11 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; - import 
org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.draft.ScriptMetadata; import org.unicode.cldr.draft.ScriptMetadata.IdUsage; import org.unicode.cldr.draft.ScriptMetadata.Info; import org.unicode.props.BagFormatter; -import org.unicode.props.UnicodeProperty; -import org.unicode.props.UnicodeProperty.Factory; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.ScriptInfo; import org.unicode.props.UcdProperty; @@ -30,24 +37,16 @@ import org.unicode.props.UcdPropertyValues.General_Category_Values; import org.unicode.props.UcdPropertyValues.Identifier_Type_Values; import org.unicode.props.UcdPropertyValues.NFKC_Quick_Check_Values; +import org.unicode.props.UnicodeProperty; +import org.unicode.props.UnicodeProperty.Factory; import org.unicode.text.UCD.GenerateConfusables.FakeBreak; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.google.common.base.Splitter; -import com.google.common.collect.HashMultimap; -import com.google.common.collect.Multimap; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.locale.XCldrStub.ImmutableSet; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.ICUException; - public class IdentifierInfo { - private static final Set SINGLETON_INCLUSION = Collections.singleton(Identifier_Type.inclusion); + private static final Set SINGLETON_INCLUSION = + Collections.singleton(Identifier_Type.inclusion); private static final boolean MAIN_CODE = true; @@ -64,7 +63,8 @@ static IdentifierInfo getIdentifierInfo() { } return info; } catch (final Exception e) { - throw (RuntimeException) new IllegalArgumentException("Unable to access data").initCause(e); + throw (RuntimeException) + new IllegalArgumentException("Unable to access data").initCause(e); } } @@ -76,26 +76,23 @@ static IdentifierInfo getIdentifierInfo() { private static 
Integer MARK_ASCII = Integer.valueOf(10); static final String NOT_IN_XID = "not in XID+"; - static final IndexUnicodeProperties LATEST = IndexUnicodeProperties - .make(Default.ucdVersion()); - - private static final UnicodeMap GENERAL_CATEGORY = LATEST - .loadEnum(UcdProperty.General_Category, General_Category_Values.class); - - private static final UnicodeSet NON_CHARACTERS = GENERAL_CATEGORY - .getSet(General_Category_Values.Unassigned); - private static final UnicodeSet DEPRECATED = LATEST - .loadEnum(UcdProperty.Deprecated, Binary.class) - .getSet(Binary.Yes); - private static final UnicodeSet DEFAULT_IGNORABLE = LATEST - .loadEnum(UcdProperty.Default_Ignorable_Code_Point, Binary.class) - .getSet(Binary.Yes); - private static final UnicodeSet WHITESPACE = LATEST - .loadEnum(UcdProperty.White_Space, Binary.class) - .getSet(Binary.Yes); - private static final UnicodeSet NOT_NFKC = LATEST - .loadEnum(UcdProperty.NFKC_Quick_Check, NFKC_Quick_Check_Values.class) - .getSet(NFKC_Quick_Check_Values.No); + static final IndexUnicodeProperties LATEST = IndexUnicodeProperties.make(Default.ucdVersion()); + + private static final UnicodeMap GENERAL_CATEGORY = + LATEST.loadEnum(UcdProperty.General_Category, General_Category_Values.class); + + private static final UnicodeSet NON_CHARACTERS = + GENERAL_CATEGORY.getSet(General_Category_Values.Unassigned); + private static final UnicodeSet DEPRECATED = + LATEST.loadEnum(UcdProperty.Deprecated, Binary.class).getSet(Binary.Yes); + private static final UnicodeSet DEFAULT_IGNORABLE = + LATEST.loadEnum(UcdProperty.Default_Ignorable_Code_Point, Binary.class) + .getSet(Binary.Yes); + private static final UnicodeSet WHITESPACE = + LATEST.loadEnum(UcdProperty.White_Space, Binary.class).getSet(Binary.Yes); + private static final UnicodeSet NOT_NFKC = + LATEST.loadEnum(UcdProperty.NFKC_Quick_Check, NFKC_Quick_Check_Values.class) + .getSet(NFKC_Quick_Check_Values.No); // private final boolean mergeRanges = true; @@ -105,13 +102,15 @@ static 
IdentifierInfo getIdentifierInfo() { private UnicodeSet inputSet_lenient; private UnicodeSet nonstarting; UnicodeSet propNFKCSet; - //UnicodeSet notInXID; + // UnicodeSet notInXID; UnicodeSet xidPlus; private final UnicodeMap additions = new UnicodeMap<>(); private final UnicodeMap remap = new UnicodeMap<>(); - private final UnicodeMap removals = new UnicodeMap(); - private final UnicodeMap> identifierTypesMap = new UnicodeMap<>(); + private final UnicodeMap removals = + new UnicodeMap(); + private final UnicodeMap> identifierTypesMap = + new UnicodeMap<>(); private final UnicodeMap recastRemovals = new UnicodeMap(); private UnicodeMap reviews, removals2; @@ -122,7 +121,7 @@ static IdentifierInfo getIdentifierInfo() { public static void main(String[] args) throws IOException { final IdentifierInfo info = IdentifierInfo.getIdentifierInfo(); // show singletons - Multimap> singleToSets = HashMultimap.create(); + Multimap> singleToSets = HashMultimap.create(); for (Set value : info.identifierTypesMap.getAvailableValues()) { for (Identifier_Type v : value) { singleToSets.put(v, value); @@ -132,7 +131,9 @@ public static void main(String[] args) throws IOException { Collection> sets = singleToSets.get(value); System.out.println(value + ":\t " + CollectionUtilities.join(sets, " ")); } - System.out.println(info.identifierTypesMap.getSet(singleToSets.get(Identifier_Type.not_characters).iterator().next())); + System.out.println( + info.identifierTypesMap.getSet( + singleToSets.get(Identifier_Type.not_characters).iterator().next())); info.printIDNStuff(); } @@ -155,15 +156,19 @@ private IdentifierInfo() throws IOException { propNFKCSet = UPS.getSet("NFKC_QuickCheck=N").complement(); final UnicodeSet propXIDContinueSet = UPS.getSet("XID_Continue=Yes"); - //removals.putAll(propNFKCSet.complement(), PROHIBITED + "compat variant"); + // removals.putAll(propNFKCSet.complement(), PROHIBITED + "compat variant"); loadFileData(); - xidPlus = new 
UnicodeSet(propXIDContinueSet).addAll(additions.keySet()).retainAll(propNFKCSet); + xidPlus = + new UnicodeSet(propXIDContinueSet) + .addAll(additions.keySet()) + .retainAll(propNFKCSet); GenerateConfusables.getIdentifierSet(); - //notInXID = new UnicodeSet(IDNOutputSet).removeAll(xidPlus); - //removals.putAll(notInXID, PROHIBITED + NOT_IN_XID); - //UnicodeSet notNfkcXid = new UnicodeSet(xidPlus).removeAll(removals.keySet()).removeAll(propNFKCSet); - //removals.putAll(notNfkcXid, PROHIBITED + "compat variant"); + // notInXID = new UnicodeSet(IDNOutputSet).removeAll(xidPlus); + // removals.putAll(notInXID, PROHIBITED + NOT_IN_XID); + // UnicodeSet notNfkcXid = new + // UnicodeSet(xidPlus).removeAll(removals.keySet()).removeAll(propNFKCSet); + // removals.putAll(notNfkcXid, PROHIBITED + "compat variant"); removalSet = new UnicodeSet(); for (final IdentifierInfo.Identifier_Type value : removals.values()) { if (value.isRestricted()) { @@ -174,14 +179,16 @@ private IdentifierInfo() throws IOException { remainingOutputSet = new UnicodeSet(GenerateConfusables.IDNOutputSet).removeAll(removalSet); - final UnicodeSet remainingInputSet1 = new UnicodeSet(GenerateConfusables.IDNInputSet) - .removeAll(removalSet).removeAll(remainingOutputSet); + final UnicodeSet remainingInputSet1 = + new UnicodeSet(GenerateConfusables.IDNInputSet) + .removeAll(removalSet) + .removeAll(remainingOutputSet); final UnicodeSet remainingInputSet = new UnicodeSet(); final UnicodeSet specialRemove = new UnicodeSet(); // remove any others that don't normalize/case fold to something in // the output set - for (final UnicodeSetIterator usi = new UnicodeSetIterator( - remainingInputSet1); usi.next();) { + for (final UnicodeSetIterator usi = new UnicodeSetIterator(remainingInputSet1); + usi.next(); ) { final String nss = GenerateConfusables.getModifiedNKFC(usi.getString()); final String cf = DEFAULT_UCD.getCase(nss, UCD_Types.FULL, UCD_Types.FOLD); final String cf2 = GenerateConfusables.getModifiedNKFC(cf); 
@@ -193,30 +200,30 @@ private IdentifierInfo() throws IOException { } // filter out the items that are case foldings of items in output inputSet_strict = new UnicodeSet(); - for (final UnicodeSetIterator usi = new UnicodeSetIterator( - remainingInputSet); usi.next();) { + for (final UnicodeSetIterator usi = new UnicodeSetIterator(remainingInputSet); + usi.next(); ) { final String ss = usi.getString(); final String nss = GenerateConfusables.getModifiedNKFC(ss); final String cf = DEFAULT_UCD.getCase(ss, UCD_Types.FULL, UCD_Types.FOLD); if (DEBUG && (usi.codepoint == 0x2126 || usi.codepoint == 0x212B)) { System.out.println("check"); } - //> > 2126 ; retained-input-only-CF # (?) OHM SIGN - //> > 212B ; retained-input-only-CF # (?) ANGSTROM SIGN + // > > 2126 ; retained-input-only-CF # (?) OHM SIGN + // > > 212B ; retained-input-only-CF # (?) ANGSTROM SIGN - if (!remainingOutputSet.containsAll(nss) - && remainingOutputSet.containsAll(cf)) { + if (!remainingOutputSet.containsAll(nss) && remainingOutputSet.containsAll(cf)) { inputSet_strict.add(ss); } } // hack inputSet_strict.remove(0x03F4).remove(0x2126).remove(0x212B); - inputSet_lenient = new UnicodeSet(remainingInputSet) - .removeAll(inputSet_strict); - nonstarting = new UnicodeSet(remainingOutputSet).addAll( - remainingInputSet).retainAll(new UnicodeSet("[:M:]")); + inputSet_lenient = new UnicodeSet(remainingInputSet).removeAll(inputSet_strict); + nonstarting = + new UnicodeSet(remainingOutputSet) + .addAll(remainingInputSet) + .retainAll(new UnicodeSet("[:M:]")); reviews = new UnicodeMap<>(); - //reviews.putAll(removals); + // reviews.putAll(removals); for (final IdentifierInfo.Identifier_Type value : removals.values()) { reviews.putAll(removals.getSet(value), value.propertyFileFormat()); } @@ -240,22 +247,23 @@ private IdentifierInfo() throws IOException { lowerIsBetter.freeze(); // add special values: - //lowerIsBetter.putAll(new UnicodeSet("["), new Integer(0)); - - final UnicodeMap nonstartingmap = new 
UnicodeMap().putAll(nonstarting, - "nonstarting"); - final UnicodeMap.Composer composer = new UnicodeMap.Composer() { - @Override - public String compose(int codepoint, String string, String a, String b) { - if (a == null) { - return b; - } else if (b == null) { - return a; - } else { - return a.toString() + "-" + b.toString(); - } - } - }; + // lowerIsBetter.putAll(new UnicodeSet("["), new Integer(0)); + + final UnicodeMap nonstartingmap = + new UnicodeMap().putAll(nonstarting, "nonstarting"); + final UnicodeMap.Composer composer = + new UnicodeMap.Composer() { + @Override + public String compose(int codepoint, String string, String a, String b) { + if (a == null) { + return b; + } else if (b == null) { + return a; + } else { + return a.toString() + "-" + b.toString(); + } + } + }; reviews.composeWith(nonstartingmap, composer); reviews.putAll(new UnicodeSet(GenerateConfusables.IDNInputSet).complement(), ""); // final UnicodeMap.Composer composer2 = new UnicodeMap.Composer() { @@ -267,9 +275,10 @@ public String compose(int codepoint, String string, String a, String b) { // return "remap-to-" + Utility.hex(b.toString()); // } // }; - //reviews.composeWith(remap, composer2); + // reviews.composeWith(remap, composer2); removals2 = new UnicodeMap().putAll(recastRemovals); - removals2.putAll(UPS.getSet("XID_Continue=Yes").complement(), + removals2.putAll( + UPS.getSet("XID_Continue=Yes").complement(), Identifier_Status.restricted + " ; " + NOT_IN_XID); removals2.setMissing("future?"); @@ -289,15 +298,18 @@ public enum Identifier_Status { allowed("Allowed"), restricted("Restricted"); final String name; + private Identifier_Status(String name) { this.name = name; } + @Override public String toString() { return name; } + public static Identifier_Status fromString(String string) { - String rawReason = string.trim().replace("-","_").toLowerCase(Locale.ENGLISH); + String rawReason = string.trim().replace("-", "_").toLowerCase(Locale.ENGLISH); return valueOf(rawReason); } } @@ 
-305,7 +317,7 @@ public static Identifier_Status fromString(String string) { public enum Identifier_Type { recommended(Identifier_Type_Values.Recommended, Identifier_Status.allowed), inclusion(Identifier_Type_Values.Inclusion, Identifier_Status.allowed), - //aspirational(Identifier_Type_Values.Aspirational, Identifier_Status.restricted), + // aspirational(Identifier_Type_Values.Aspirational, Identifier_Status.restricted), limited_use(Identifier_Type_Values.Limited_Use, Identifier_Status.restricted), uncommon_use(Identifier_Type_Values.Uncommon_Use, Identifier_Status.restricted), technical(Identifier_Type_Values.Technical, Identifier_Status.restricted), @@ -329,10 +341,10 @@ private Identifier_Type(Identifier_Type_Values type, Identifier_Status identifie } public static IdentifierInfo.Identifier_Type fromString(String string) { - String rawReason = string.trim().replace("-","_").toLowerCase(Locale.ENGLISH); + String rawReason = string.trim().replace("-", "_").toLowerCase(Locale.ENGLISH); if (rawReason.equals("allowed")) { return Identifier_Type.recommended; - //rawReason = GenerateConfusables.recommended_scripts; + // rawReason = GenerateConfusables.recommended_scripts; } else if (rawReason.equals("historic")) { return Identifier_Type.obsolete; } else if (rawReason.equals("aspirational")) { @@ -351,17 +363,20 @@ public static IdentifierInfo.Identifier_Type fromString(String string) { public boolean isRestricted() { return this != Identifier_Type.inclusion && this != Identifier_Type.recommended; } + @Override public String toString() { return name; } + public String propertyFileFormat() { return identifierStatus + " ; " + name; } + public boolean replaceBy(IdentifierInfo.Identifier_Type possibleReplacement) { return compareTo(possibleReplacement) < 0 - // || this == historic && possibleReplacement == limited_use - ; // && this != historic; + // || this == historic && possibleReplacement == limited_use + ; // && this != historic; } static Splitter SPACE_SPLITTER = 
Splitter.on(' ').omitEmptyStrings().trimResults(); @@ -374,15 +389,16 @@ public static Set fromStringSet(String strings) { return results; } } - /** - * - */ + /** */ private void loadFileData() throws IOException { BufferedReader br; String line; // get all the removals. br = FileUtilities.openUTF8Reader(GenerateConfusables.indir, "removals.txt"); - removals.putAll(0,0x10FFFF, // new UnicodeSet("[^[:gc=cn:][:gc=co:][:gc=cs:][:gc=cc:]-[:whitespace:]]"), + removals.putAll( + 0, + 0x10FFFF, // new + // UnicodeSet("[^[:gc=cn:][:gc=co:][:gc=cs:][:gc=cc:]-[:whitespace:]]"), Identifier_Type.recommended); UnicodeSet sources = new UnicodeSet(); @@ -419,7 +435,9 @@ private void loadFileData() throws IOException { } final String codelist = pieces[0].trim(); if (UnicodeSet.resemblesPattern(pieces[0], 0)) { - sources = TestUnicodeInvariants.parseUnicodeSet(codelist); //.retainAll(allocated); + sources = + TestUnicodeInvariants.parseUnicodeSet( + codelist); // .retainAll(allocated); if (sources.contains("ᢰ")) { int x = 0; } @@ -449,8 +467,7 @@ private void loadFileData() throws IOException { addToRemovalSets(s, reasons); - if (oldReason == Identifier_Type.inclusion - || oldReason == reasons) { + if (oldReason == Identifier_Type.inclusion || oldReason == reasons) { continue; // always ok } if (override @@ -459,12 +476,22 @@ private void loadFileData() throws IOException { || reasons == Identifier_Type.inclusion) { removals.put(s, reasons); } else { - if (DEBUG) System.out.println("Skipping: " + Utility.hex(s) + " " + s + ", old: " + oldReason + " new: " + reasons); + if (DEBUG) + System.out.println( + "Skipping: " + + Utility.hex(s) + + " " + + s + + ", old: " + + oldReason + + " new: " + + reasons); } } } // if (reasons == Reason.recommended) { - // removals.putAll(sources, UNPROHIBITED + recommended_scripts); + // removals.putAll(sources, UNPROHIBITED + + // recommended_scripts); // } else if (reasons.equals("inclusion")) { // removals.putAll(sources, UNPROHIBITED + reasons); 
// } else { @@ -502,28 +529,26 @@ private void loadFileData() throws IOException { for (final String script : scripts) { String shortName = UcdPropertyValues.Script_Values.forName(script).getShortName(); Info scriptInfo = ScriptMetadata.getInfo(shortName); - final IdUsage idUsage = scriptInfo != null - ? scriptInfo.idUsage - : IdUsage.EXCLUSION; + final IdUsage idUsage = scriptInfo != null ? scriptInfo.idUsage : IdUsage.EXCLUSION; IdentifierInfo.Identifier_Type status; - switch(idUsage) { - // case ASPIRATIONAL: - // status = Identifier_Type.aspirational; - // break; - case LIMITED_USE: - status = Identifier_Type.limited_use; - break; - case EXCLUSION: - status = Identifier_Type.exclusion; - break; - case RECOMMENDED: - default: - status = null; - break; // do nothing; + switch (idUsage) { + // case ASPIRATIONAL: + // status = Identifier_Type.aspirational; + // break; + case LIMITED_USE: + status = Identifier_Type.limited_use; + break; + case EXCLUSION: + status = Identifier_Type.exclusion; + break; + case RECOMMENDED: + default: + status = null; + break; // do nothing; } if (status != null) { final UnicodeSet us = ScriptInfo.IDENTIFIER_INFO.getSetWith(script); - //final UnicodeSet us = new UnicodeSet().applyPropertyAlias("script", script); + // final UnicodeSet us = new UnicodeSet().applyPropertyAlias("script", script); for (final String s : us) { if (hasRecommendedScript.contains(s)) { continue; // skip those that have at least one recommended script @@ -532,19 +557,26 @@ private void loadFileData() throws IOException { final IdentifierInfo.Identifier_Type old = removals.get(s); if (old == null) { removals.put(s, status); - } else if (!old.equals(status)){ + } else if (!old.equals(status)) { if (old.replaceBy(status)) { - removalCollision.put(s, "REPLACING " + old + "\t!= (script metadata)\t" + status); + removalCollision.put( + s, "REPLACING " + old + "\t!= (script metadata)\t" + status); removals.put(s, status); } else { - removalCollision.put(s, "Retaining " + 
old + "\t!= (script metadata)\t" + status); + removalCollision.put( + s, "Retaining " + old + "\t!= (script metadata)\t" + status); } } } } } for (final String value : removalCollision.values()) { - if (DEBUG) System.out.println("*Removal Collision\t" + value + "\n\t" + removalCollision.getSet(value).toPattern(false)); + if (DEBUG) + System.out.println( + "*Removal Collision\t" + + value + + "\n\t" + + removalCollision.getSet(value).toPattern(false)); } removals.freeze(); @@ -553,26 +585,38 @@ private void loadFileData() throws IOException { // Clean up values by setting to singletons. ORDER is important!! identifierTypesMap.putAll(NOT_NFKC, Collections.singleton(Identifier_Type.not_nfkc)); - identifierTypesMap.putAll(DEFAULT_IGNORABLE, Collections.singleton(Identifier_Type.default_ignorable)); + identifierTypesMap.putAll( + DEFAULT_IGNORABLE, Collections.singleton(Identifier_Type.default_ignorable)); identifierTypesMap.putAll(DEPRECATED, Collections.singleton(Identifier_Type.deprecated)); - identifierTypesMap.putAll(NON_CHARACTERS, Collections.singleton(Identifier_Type.not_characters)); - identifierTypesMap.putAll(GENERAL_CATEGORY.getSet(General_Category_Values.Private_Use), Collections.singleton(Identifier_Type.not_characters)); - identifierTypesMap.putAll(GENERAL_CATEGORY.getSet(General_Category_Values.Surrogate), Collections.singleton(Identifier_Type.not_characters)); - UnicodeSet controlNonWhitespace = new UnicodeSet(GENERAL_CATEGORY.getSet(General_Category_Values.Control)) - .removeAll(WHITESPACE); - identifierTypesMap.putAll(controlNonWhitespace, Collections.singleton(Identifier_Type.not_characters)); + identifierTypesMap.putAll( + NON_CHARACTERS, Collections.singleton(Identifier_Type.not_characters)); + identifierTypesMap.putAll( + GENERAL_CATEGORY.getSet(General_Category_Values.Private_Use), + Collections.singleton(Identifier_Type.not_characters)); + identifierTypesMap.putAll( + GENERAL_CATEGORY.getSet(General_Category_Values.Surrogate), + 
Collections.singleton(Identifier_Type.not_characters)); + UnicodeSet controlNonWhitespace = + new UnicodeSet(GENERAL_CATEGORY.getSet(General_Category_Values.Control)) + .removeAll(WHITESPACE); + identifierTypesMap.putAll( + controlNonWhitespace, Collections.singleton(Identifier_Type.not_characters)); // restore inclusions UnicodeSet inclusions2 = identifierTypesMap.getSet(SINGLETON_INCLUSION); - System.out.println("Restoring inclusions: " + new UnicodeSet(inclusions).removeAll(inclusions2)); + System.out.println( + "Restoring inclusions: " + new UnicodeSet(inclusions).removeAll(inclusions2)); identifierTypesMap.putAll(inclusions, SINGLETON_INCLUSION); - identifierTypesMap.putAll(identifierTypesMap.getSet(null), Collections.singleton(Identifier_Type.recommended)); + identifierTypesMap.putAll( + identifierTypesMap.getSet(null), + Collections.singleton(Identifier_Type.recommended)); // make immutable // special hack for Exclusion + Obsolete!! for (Set value : identifierTypesMap.getAvailableValues()) { - if (value.contains(Identifier_Type.exclusion) && value.contains(Identifier_Type.obsolete)) { + if (value.contains(Identifier_Type.exclusion) + && value.contains(Identifier_Type.obsolete)) { UnicodeSet set = identifierTypesMap.getSet(value); EnumSet value2 = EnumSet.copyOf(value); value2.remove(Identifier_Type.obsolete); @@ -580,7 +624,7 @@ private void loadFileData() throws IOException { } } identifierTypesMap.freeze(); - //removals.putAll(getNonIICore(), PROHIBITED + "~IICore"); + // removals.putAll(getNonIICore(), PROHIBITED + "~IICore"); br.close(); // // get the word chars @@ -600,7 +644,8 @@ private void loadFileData() throws IOException { // pieces[2].trim(), 16))); // } else { // if (XIDContinueSet.contains(code)) { - // if (GenerateConfusables.DEBUG) System.out.println("Already in XID continue: " + // if (GenerateConfusables.DEBUG) System.out.println("Already in XID continue: + // " // + line); // continue; // } @@ -616,12 +661,13 @@ private void loadFileData() 
throws IOException { } - - private void addToRemovalSets(String codepoint, final IdentifierInfo.Identifier_Type identifierType) { + private void addToRemovalSets( + String codepoint, final IdentifierInfo.Identifier_Type identifierType) { Set oldSet = identifierTypesMap.get(codepoint); if (oldSet == null || identifierType == Identifier_Type.recommended) { identifierTypesMap.put(codepoint, Collections.singleton(identifierType)); - } else if (identifierType == Identifier_Type.inclusion && !oldSet.contains(Identifier_Type.recommended)) { + } else if (identifierType == Identifier_Type.inclusion + && !oldSet.contains(Identifier_Type.recommended)) { identifierTypesMap.put(codepoint, Collections.singleton(identifierType)); } else if (!oldSet.contains(identifierType)) { EnumSet newSet = EnumSet.copyOf(oldSet); @@ -630,7 +676,11 @@ private void addToRemovalSets(String codepoint, final IdentifierInfo.Identifier_ } } - enum Style {flat, byValue}; + enum Style { + flat, + byValue + }; + void printIDNStuff() throws IOException { printIdentifierTypes(Style.byValue); printIdentifierTypes(Style.flat); @@ -641,19 +691,23 @@ void printIDNStuff() throws IOException { writeIDReviewInternal(); } - /** - * - */ + /** */ private void writeIDReviewInternal() throws IOException { - final BagFormatter bf = GenerateConfusables.makeFormatter() - .setUnicodePropertyFactory(UPS) - .setLabelSource(null) - .setShowLiteral(GenerateConfusables.EXCAPE_FUNNY) - .setMergeRanges(true); - - final PrintWriter out = GenerateConfusables.openAndWriteHeader(GenerateConfusables.reformatedInternal, "review.txt", "Review List for IDN"); - // PrintWriter out = FileUtilities.openUTF8Writer(outdir, "review.txt"); - //reviews.putAll(UNASSIGNED, ""); + final BagFormatter bf = + GenerateConfusables.makeFormatter() + .setUnicodePropertyFactory(UPS) + .setLabelSource(null) + .setShowLiteral(GenerateConfusables.EXCAPE_FUNNY) + .setMergeRanges(true); + + final PrintWriter out = + GenerateConfusables.openAndWriteHeader( + 
GenerateConfusables.reformatedInternal, + "review.txt", + "Review List for IDN"); + // PrintWriter out = FileUtilities.openUTF8Writer(outdir, + // "review.txt"); + // reviews.putAll(UNASSIGNED, ""); // out.print("\uFEFF"); // out.println("# Review List for IDN"); // out.println("# $Revision: 1.32 $"); @@ -662,10 +716,11 @@ private void writeIDReviewInternal() throws IOException { final UnicodeSet fullSet = reviews.keySet("").complement(); - bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() { - }).set(reviews).setMain("Reviews", "GCB", - UnicodeProperty.ENUMERATED, "1.0")); - //bf.setMergeRanges(false); + bf.setValueSource( + (new UnicodeProperty.UnicodeMapProperty() {}) + .set(reviews) + .setMain("Reviews", "GCB", UnicodeProperty.ENUMERATED, "1.0")); + // bf.setMergeRanges(false); final FakeBreak fakeBreak = new FakeBreak(); bf.setRangeBreakSource(fakeBreak); @@ -673,33 +728,33 @@ private void writeIDReviewInternal() throws IOException { out.println("# Characters allowed in IDNA"); out.println(""); bf.showSetNames(out, new UnicodeSet(fullSet)); // .removeAll(bigSets) - //bf.setMergeRanges(true); + // bf.setMergeRanges(true); // out.println(""); // out.println("# Large Ranges"); // out.println(""); // bf.showSetNames(out, new UnicodeSet(fullSet).retainAll(bigSets)); out.println(""); out.println("# Characters disallowed in IDNA"); - out - .println("# The IDNA spec doesn't allow any of these characters,"); - out - .println("# so don't report any of them as being missing from the above list."); - out - .println("# Some possible future additions, once IDNA updates to Unicode 4.1, are given."); + out.println("# The IDNA spec doesn't allow any of these characters,"); + out.println("# so don't report any of them as being missing from the above list."); + out.println( + "# Some possible future additions, once IDNA updates to Unicode 4.1, are given."); out.println(""); - //bf.setRangeBreakSource(UnicodeLabel.NULL); - bf.setValueSource((new 
UnicodeProperty.UnicodeMapProperty() { - }).set(removals2).setMain("Removals", "GCB", - UnicodeProperty.ENUMERATED, "1.0")); - //bf.setValueSource(UnicodeLabel.NULL); - bf.showSetNames(out, new UnicodeSet(GenerateConfusables.IDNInputSet).complement() - .removeAll(GenerateConfusables.UNASSIGNED)); + // bf.setRangeBreakSource(UnicodeLabel.NULL); + bf.setValueSource( + (new UnicodeProperty.UnicodeMapProperty() {}) + .set(removals2) + .setMain("Removals", "GCB", UnicodeProperty.ENUMERATED, "1.0")); + // bf.setValueSource(UnicodeLabel.NULL); + bf.showSetNames( + out, + new UnicodeSet(GenerateConfusables.IDNInputSet) + .complement() + .removeAll(GenerateConfusables.UNASSIGNED)); out.close(); } - /** - * - */ + /** */ private void writeIDCharsInternal() throws IOException { final BagFormatter bf = GenerateConfusables.makeFormatter(); bf.setLabelSource(null); @@ -708,7 +763,11 @@ private void writeIDCharsInternal() throws IOException { final UnicodeSet letters = new UnicodeSet("[[:Alphabetic:][:Mark:][:Nd:]]"); - final PrintWriter out = GenerateConfusables.openAndWriteHeader(GenerateConfusables.reformatedInternal, "idnchars.txt", "Recommended Identifier Profiles for IDN"); + final PrintWriter out = + GenerateConfusables.openAndWriteHeader( + GenerateConfusables.reformatedInternal, + "idnchars.txt", + "Recommended Identifier Profiles for IDN"); out.println("# Allowed as output characters"); out.println(""); @@ -717,22 +776,22 @@ private void writeIDCharsInternal() throws IOException { showExtras(bf, remainingOutputSet, letters); /* - out.println(""); - - out.println(""); - out.println("# Input Characters"); - out.println(""); - bf.setValueSource("input"); - bf.showSetNames(out, inputSet_strict); - showExtras(bf, inputSet_strict, letters); - - out.println(""); - out.println("# Input Characters (lenient)"); - out.println(""); - bf.setValueSource("input-lenient"); - bf.showSetNames(out, inputSet_lenient); - showExtras(bf, inputSet_lenient, letters); - */ + out.println(""); + + 
out.println(""); + out.println("# Input Characters"); + out.println(""); + bf.setValueSource("input"); + bf.showSetNames(out, inputSet_strict); + showExtras(bf, inputSet_strict, letters); + + out.println(""); + out.println("# Input Characters (lenient)"); + out.println(""); + bf.setValueSource("input-lenient"); + bf.showSetNames(out, inputSet_lenient); + showExtras(bf, inputSet_lenient, letters); + */ out.println(""); out.println("# Not allowed at start of identifier"); @@ -740,22 +799,20 @@ private void writeIDCharsInternal() throws IOException { bf.setValueSource("nonstarting"); bf.showSetNames(out, nonstarting); - //out.println(""); + // out.println(""); - //showRemapped(out, "Characters remapped on input in GUIs -- Not required by profile!", remap); + // showRemapped(out, "Characters remapped on input in GUIs -- Not required by profile!", + // remap); out.close(); } - - /** - * - */ + /** */ private void showExtras(BagFormatter bf, UnicodeSet source, UnicodeSet letters) { final UnicodeSet extra = new UnicodeSet(source).removeAll(letters); if (extra.size() != 0) { final UnicodeSet fixed = new UnicodeSet(); - for (final UnicodeSetIterator it = new UnicodeSetIterator(extra); it.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(extra); it.next(); ) { if (!letters.containsAll(GenerateConfusables.NFKD.normalize(it.getString()))) { fixed.add(it.codepoint); } @@ -771,7 +828,9 @@ private void printModificationsInternal() throws IOException { bf.setMergeRanges(true); PrintWriter out; - // PrintWriter out = GenerateConfusables.openAndWriteHeader(GenerateConfusables.DRAFT_OUT, "xidmodifications.txt", "Security Profile for General Identifiers"); + // PrintWriter out = + // GenerateConfusables.openAndWriteHeader(GenerateConfusables.DRAFT_OUT, + // "xidmodifications.txt", "Security Profile for General Identifiers"); // /* PrintWriter out = FileUtilities.openUTF8Writer(outdir, "xidmodifications.txt"); // // out.println("# Security Profile for General 
Identifiers"); @@ -796,10 +855,13 @@ private void printModificationsInternal() throws IOException { // .set(recastRemovals) // .setMain("Removals", "GCB", UnicodeProperty.ENUMERATED, "1.0")); // - // final Set fullListing = new HashSet(Arrays.asList("technical limited-use historic discouraged obsolete".split("\\s+"))); - // // final Set sortedValues = new TreeSet(GenerateConfusables.UCAComparator); + // final Set fullListing = new HashSet(Arrays.asList("technical + // limited-use historic discouraged obsolete".split("\\s+"))); + // // final Set sortedValues = new + // TreeSet(GenerateConfusables.UCAComparator); // // sortedValues.addAll(recastRemovals.values()); - // // if (GenerateConfusables.DEBUG) System.out.println("Restriction Values: " + sortedValues); + // // if (GenerateConfusables.DEBUG) System.out.println("Restriction Values: " + // + sortedValues); // for (Identifier_Type value : Identifier_Type.values()) { // if (value == Identifier_Type.not_characters) { // continue; @@ -811,7 +873,8 @@ private void printModificationsInternal() throws IOException { // out.println(""); // out.println("#\tStatus/Type:\t" + value.name); // out.println(""); - // //bf.setMergeRanges(Collections.disjoint(fullListing, Arrays.asList(value.split("[\\s;]+")))); + // //bf.setMergeRanges(Collections.disjoint(fullListing, + // Arrays.asList(value.split("[\\s;]+")))); // //bf.setMergeRanges(value.propertyFileFormat()); // bf.showSetNames(out, uset); // } @@ -827,9 +890,14 @@ private void printModificationsInternal() throws IOException { // // out.close(); - out = GenerateConfusables.openAndWriteHeader(GenerateConfusables.reformatedInternal, "xidAllowed.txt", "Security Profile for General Identifiers"); + out = + GenerateConfusables.openAndWriteHeader( + GenerateConfusables.reformatedInternal, + "xidAllowed.txt", + "Security Profile for General Identifiers"); final UnicodeSet allowed = new UnicodeSet(xidPlus).removeAll(removals.keySet()); - final UnicodeSet cfAllowed = new 
UnicodeSet().addAll(allowed).retainAll(isCaseFolded).retainAll(propNFKCSet); + final UnicodeSet cfAllowed = + new UnicodeSet().addAll(allowed).retainAll(isCaseFolded).retainAll(propNFKCSet); allowed.removeAll(cfAllowed); bf.setValueSource("case_folded"); out.println("# XID characters allowed (no uppercase)"); @@ -843,42 +911,50 @@ private void printModificationsInternal() throws IOException { out.close(); final UnicodeMap someRemovals = new UnicodeMap<>(); - final UnicodeMap.Composer myComposer = new UnicodeMap.Composer() { - @Override - public String compose(int codePoint, String string, String a, String b) { - if (b == null) { - return null; - } - String x = (String)b; - // if (ALT) { - // if (!GenerateConfusables.IDNOutputSet.contains(codePoint)) { - // return "~IDNA"; - // } - // if (!xidPlus.contains(codePoint)) { - // return "~Unicode Identifier"; - // } - // } - if (x.startsWith(Identifier_Status.restricted.toString())) { - x = x.substring(Identifier_Status.restricted.toString().length()); - } - //if (!propNFKCSet.contains(codePoint)) x += "*"; - if (GenerateConfusables.GC_LOWERCASE.contains(codePoint)) { - final String upper = DEFAULT_UCD.getCase(codePoint, UCD_Types.FULL, UCD_Types.UPPER); - if (upper.equals(UTF16.valueOf(codePoint)) - && x.equals("technical symbol (phonetic)")) { - x = "technical symbol (phonetic with no uppercase)"; + final UnicodeMap.Composer myComposer = + new UnicodeMap.Composer() { + @Override + public String compose(int codePoint, String string, String a, String b) { + if (b == null) { + return null; + } + String x = (String) b; + // if (ALT) { + // if + // (!GenerateConfusables.IDNOutputSet.contains(codePoint)) { + // return "~IDNA"; + // } + // if (!xidPlus.contains(codePoint)) { + // return "~Unicode Identifier"; + // } + // } + if (x.startsWith(Identifier_Status.restricted.toString())) { + x = x.substring(Identifier_Status.restricted.toString().length()); + } + // if (!propNFKCSet.contains(codePoint)) x += "*"; + if 
(GenerateConfusables.GC_LOWERCASE.contains(codePoint)) { + final String upper = + DEFAULT_UCD.getCase(codePoint, UCD_Types.FULL, UCD_Types.UPPER); + if (upper.equals(UTF16.valueOf(codePoint)) + && x.equals("technical symbol (phonetic)")) { + x = "technical symbol (phonetic with no uppercase)"; + } + } + return x; } - } - return x; - } - }; + }; someRemovals.composeWith(recastRemovals, myComposer); - final UnicodeSet nonIDNA = new UnicodeSet(GenerateConfusables.IDNOutputSet).addAll(GenerateConfusables.IDNInputSet).complement(); + final UnicodeSet nonIDNA = + new UnicodeSet(GenerateConfusables.IDNOutputSet) + .addAll(GenerateConfusables.IDNInputSet) + .complement(); someRemovals.putAll(nonIDNA, "~IDNA"); someRemovals.putAll(new UnicodeSet(xidPlus).complement(), "~Unicode Identifier"); someRemovals.putAll(GenerateConfusables.UNASSIGNED, null); // clear extras - //someRemovals = removals; - out = FileUtilities.openUTF8Writer(GenerateConfusables.reformatedInternal, "draft-restrictions.txt"); + // someRemovals = removals; + out = + FileUtilities.openUTF8Writer( + GenerateConfusables.reformatedInternal, "draft-restrictions.txt"); out.println("# Characters restricted in domain names"); out.println("#"); out.println("# This file contains a draft list of characters for use in"); @@ -894,52 +970,56 @@ public String compose(int codePoint, String string, String a, String b) { out.println("# Notes:"); out.println("# - Characters are listed along with a reason for their removal."); out.println("# - Characters listed as ~IDNA are excluded at this point in domain names,"); - out.println("# in many cases because the international domain name specification does not contain"); - out.println("# characters beyond Unicode 3.2. At this point in time, feedback on those characters"); + out.println( + "# in many cases because the international domain name specification does not contain"); + out.println( + "# characters beyond Unicode 3.2. 
At this point in time, feedback on those characters"); out.println("# is not relevant."); out.println("# - Characters listed as ~Unicode Identifiers are restricted because they"); out.println("# do not fit the specification of identifiers given in"); out.println("# UAX #31: Identifier and Pattern Syntax"); out.println("# http://unicode.org/reports/tr31/"); - out.println("# - Characters listed as ~IICore are restricted because they are Ideographic,"); + out.println( + "# - Characters listed as ~IICore are restricted because they are Ideographic,"); out.println("# but not part of the IICore set defined by the IRG as the minimal set"); out.println("# of required ideographs for East Asian use."); bf.setRangeBreakSource(new GenerateConfusables.FakeBreak2()); if (MAIN_CODE) { final Set values = new TreeSet<>(someRemovals.getAvailableValues()); - for (final Iterator it = values.iterator(); it.hasNext();) { + for (final Iterator it = values.iterator(); it.hasNext(); ) { final String reason1 = (String) it.next(); bf.setValueSource(reason1); final UnicodeSet keySet = someRemovals.keySet(reason1); if (reason1.contains("recommended")) { if (DEBUG) System.out.println("Recommended: " + keySet.toPattern(false)); - UnicodeSet current = GenerateConfusables.AGE.getSet(GenerateConfusables.VERSION_PROP_VALUE); + UnicodeSet current = + GenerateConfusables.AGE.getSet(GenerateConfusables.VERSION_PROP_VALUE); if (DEBUG) System.out.println("Current: " + current.toPattern(false)); UnicodeSet newRecommended = new UnicodeSet(keySet).retainAll(current); for (String s : newRecommended) { // [:script=Phag:] ; historic # UAX31 T4 # Phags Pa - if (DEBUG) System.out.println(Utility.hex(s) - + "\t;\thistoric\t#\t" - + DEFAULT_UCD.getName(s)); + if (DEBUG) + System.out.println( + Utility.hex(s) + "\t;\thistoric\t#\t" + DEFAULT_UCD.getName(s)); } } out.println(""); bf.showSetNames(out, keySet); } } else { - bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() { - 
}).set(someRemovals).setMain("Removals", "GCB", - UnicodeProperty.ENUMERATED, "1.0")); + bf.setValueSource( + (new UnicodeProperty.UnicodeMapProperty() {}) + .set(someRemovals) + .setMain("Removals", "GCB", UnicodeProperty.ENUMERATED, "1.0")); bf.showSetNames(out, someRemovals.keySet()); } out.close(); } - private void printIdentifierTypes(Style status) throws IOException { final UnicodeMap tempMap = new UnicodeMap(); - final Map> sortingMap = new HashMap<>(); + final Map> sortingMap = new HashMap<>(); for (Set value : identifierTypesMap.values()) { if (value.contains(Identifier_Type.not_characters)) { continue; @@ -949,57 +1029,68 @@ private void printIdentifierTypes(Style status) throws IOException { sortingMap.put(valueString, value); tempMap.putAll(set, valueString); } - final Comparator tempComp = new Comparator() { - @Override - public int compare(String o1, String o2) { - Set set0 = sortingMap.get(o1); - Set set1 = sortingMap.get(o2); - return CollectionUtilities.compare(set0.iterator(), set1.iterator()); - } - - }; + final Comparator tempComp = + new Comparator() { + @Override + public int compare(String o1, String o2) { + Set set0 = sortingMap.get(o1); + Set set1 = sortingMap.get(o2); + return CollectionUtilities.compare(set0.iterator(), set1.iterator()); + } + }; final BagFormatter bf2 = GenerateConfusables.makeFormatter(); bf2.setMergeRanges(true); - final ToolUnicodePropertySource properties = ToolUnicodePropertySource.make(Default.ucdVersion()); + final ToolUnicodePropertySource properties = + ToolUnicodePropertySource.make(Default.ucdVersion()); final UnicodeProperty age = properties.getProperty("age"); bf2.setLabelSource(age); final String propName = "Identifier_Type"; - final String filename = status == Style.byValue ? 
"IdentifierType.txt" : "IdentifierTypeFlat.txt"; - try (PrintWriter out2 = GenerateConfusables.openAndWriteHeader(GenerateConfusables.GEN_SECURITY_DIR, - filename, "Security Profile for General Identifiers: " - + propName)) { - out2.println("# Format" - + "\n#" - + "\n# Field 0: code point" - + "\n# Field 1: set of Identifier_Type values (see Table 1 of http://www.unicode.org/reports/tr39)" - + "\n#" - + "\n# Any missing code points have the " + propName + " value Not_Character"); - - out2.println("#\n" - + "# For the purpose of regular expressions, the property " + propName + " is defined as\n" - + "# mapping each code point to a set of enumerated values.\n" - + "# The short name of " + propName + " is the same as the long name.\n" - + "# The possible values are:\n" - + "# Not_Character, Deprecated, Default_Ignorable, Not_NFKC, Not_XID,\n" - + "# Exclusion, Obsolete, Technical, Uncommon_Use, Limited_Use, Inclusion, Recommended\n" - + "# The short name of each value is the same as its long name.\n" - + "# The default property value for all Unicode code points U+0000..U+10FFFF\n" - + "# not mentioned in this data file is Not_Character.\n" - + "# As usual, sets are unordered, with no duplicate values.\n"); - - bf2.setValueSource((new UnicodeProperty.UnicodeMapProperty() { - }).set(tempMap).setMain(propName, "IDT", - UnicodeProperty.EXTENDED_MISC, "1.0")); + final String filename = + status == Style.byValue ? 
"IdentifierType.txt" : "IdentifierTypeFlat.txt"; + try (PrintWriter out2 = + GenerateConfusables.openAndWriteHeader( + GenerateConfusables.GEN_SECURITY_DIR, + filename, + "Security Profile for General Identifiers: " + propName)) { + out2.println( + "# Format" + + "\n#" + + "\n# Field 0: code point" + + "\n# Field 1: set of Identifier_Type values (see Table 1 of http://www.unicode.org/reports/tr39)" + + "\n#" + + "\n# Any missing code points have the " + + propName + + " value Not_Character"); + + out2.println( + "#\n" + + "# For the purpose of regular expressions, the property " + + propName + + " is defined as\n" + + "# mapping each code point to a set of enumerated values.\n" + + "# The short name of " + + propName + + " is the same as the long name.\n" + + "# The possible values are:\n" + + "# Not_Character, Deprecated, Default_Ignorable, Not_NFKC, Not_XID,\n" + + "# Exclusion, Obsolete, Technical, Uncommon_Use, Limited_Use, Inclusion, Recommended\n" + + "# The short name of each value is the same as its long name.\n" + + "# The default property value for all Unicode code points U+0000..U+10FFFF\n" + + "# not mentioned in this data file is Not_Character.\n" + + "# As usual, sets are unordered, with no duplicate values.\n"); + + bf2.setValueSource( + (new UnicodeProperty.UnicodeMapProperty() {}) + .set(tempMap) + .setMain(propName, "IDT", UnicodeProperty.EXTENDED_MISC, "1.0")); if (status == Style.byValue) { TreeSet sorted = new TreeSet<>(tempComp); sorted.addAll(tempMap.values()); for (String value : sorted) { out2.println(""); - out2.println("#\t" - + propName - + ":\t" + value); + out2.println("#\t" + propName + ":\t" + value); out2.println(""); bf2.showSetNames(out2, tempMap.getSet(value)); } @@ -1014,7 +1105,8 @@ private void printIdentifierStatus() throws IOException { final UnicodeMap tempMap = new UnicodeMap(); tempMap.putAll(0, 0x10FFFF, Identifier_Status.allowed.toString()); for (Set value : identifierTypesMap.values()) { - if 
(!value.contains(Identifier_Type.recommended) && !value.contains(Identifier_Type.inclusion)) { + if (!value.contains(Identifier_Type.recommended) + && !value.contains(Identifier_Type.inclusion)) { UnicodeSet set = identifierTypesMap.getSet(value); tempMap.putAll(set, Identifier_Status.restricted.toString()); } @@ -1022,33 +1114,46 @@ private void printIdentifierStatus() throws IOException { final BagFormatter bf2 = GenerateConfusables.makeFormatter(); bf2.setMergeRanges(true); - final ToolUnicodePropertySource properties = ToolUnicodePropertySource.make(Default.ucdVersion()); + final ToolUnicodePropertySource properties = + ToolUnicodePropertySource.make(Default.ucdVersion()); final UnicodeProperty age = properties.getProperty("age"); bf2.setLabelSource(age); final String propName = "Identifier_Status"; - try (PrintWriter out2 = GenerateConfusables.openAndWriteHeader(GenerateConfusables.GEN_SECURITY_DIR, - "IdentifierStatus.txt", "Security Profile for General Identifiers: " + propName)) { - out2.println("# Format" - + "\n#" - + "\n# Field 0: code point" - + "\n# Field 1: Identifier_Status value (see Table 1 of http://www.unicode.org/reports/tr39)" - + "\n#" - + "\n# Any missing code points have the " + propName + " value Restricted"); - - out2.println("#\n" - + "# For the purpose of regular expressions, the property " + propName + " is defined as\n" - + "# an enumerated property of code points.\n" - + "# The short name of " + propName + " is the same as the long name.\n" - + "# The possible values are:\n" - + "# Allowed, Restricted\n" - + "# The short name of each value is the same as its long name.\n" - + "# The default property value for all Unicode code points U+0000..U+10FFFF\n" - + "# not mentioned in this data file is Restricted.\n"); - - bf2.setValueSource((new UnicodeProperty.UnicodeMapProperty() { - }).set(tempMap).setMain(propName, "IDS", - UnicodeProperty.EXTENDED_MISC, "1.0")); + try (PrintWriter out2 = + GenerateConfusables.openAndWriteHeader( + 
GenerateConfusables.GEN_SECURITY_DIR, + "IdentifierStatus.txt", + "Security Profile for General Identifiers: " + propName)) { + out2.println( + "# Format" + + "\n#" + + "\n# Field 0: code point" + + "\n# Field 1: Identifier_Status value (see Table 1 of http://www.unicode.org/reports/tr39)" + + "\n#" + + "\n# Any missing code points have the " + + propName + + " value Restricted"); + + out2.println( + "#\n" + + "# For the purpose of regular expressions, the property " + + propName + + " is defined as\n" + + "# an enumerated property of code points.\n" + + "# The short name of " + + propName + + " is the same as the long name.\n" + + "# The possible values are:\n" + + "# Allowed, Restricted\n" + + "# The short name of each value is the same as its long name.\n" + + "# The default property value for all Unicode code points U+0000..U+10FFFF\n" + + "# not mentioned in this data file is Restricted.\n"); + + bf2.setValueSource( + (new UnicodeProperty.UnicodeMapProperty() {}) + .set(tempMap) + .setMain(propName, "IDS", UnicodeProperty.EXTENDED_MISC, "1.0")); for (Identifier_Status value : Identifier_Status.values()) { if (value == Identifier_Status.restricted) { diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/IntMap.java b/unicodetools/src/main/java/org/unicode/text/UCD/IntMap.java index 10304f300..42cb73631 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/IntMap.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/IntMap.java @@ -1,17 +1,15 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. 
* + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/IntMap.java,v $ - * $Date: 2007-02-11 08:15:09 $ - * $Revision: 1.2 $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/IntMap.java,v $ $Date: 2007-02-11 + * 08:15:09 $ $Revision: 1.2 $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; + import java.util.HashMap; public class IntMap { diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/ListNFComplete.java b/unicodetools/src/main/java/org/unicode/text/UCD/ListNFComplete.java index 23301617c..f75de5683 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/ListNFComplete.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/ListNFComplete.java @@ -1,4 +1,5 @@ package org.unicode.text.UCD; + public class ListNFComplete { // find all the characters that are @@ -11,8 +12,8 @@ public class ListNFComplete { // Example: a-breve might satisfy a-d, but if you // add an ogonek it changes to a-ogonek + breve - public static void main (String[] args) { - //Normalizer nfd = new Normalizer(Normalizer.NFD); + public static void main(String[] args) { + // Normalizer nfd = new Normalizer(Normalizer.NFD); } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MLStreamWriter.java b/unicodetools/src/main/java/org/unicode/text/UCD/MLStreamWriter.java index e0aa6c493..db794c716 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/MLStreamWriter.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/MLStreamWriter.java @@ -1,14 +1,12 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/MLStreamWriter.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/MLStreamWriter.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; import java.io.PrintWriter; @@ -19,39 +17,40 @@ public class MLStreamWriter extends Writer { public static final String copyright = "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved."; - public MLStreamWriter (PrintWriter output, boolean HTML) { + public MLStreamWriter(PrintWriter output, boolean HTML) { out = output; isHTML = HTML; } - public MLStreamWriter (PrintWriter output) { - this(output,true); + public MLStreamWriter(PrintWriter output) { + this(output, true); } public MLStreamWriter el(String elementName) { closeIfOpen(); print('<', AFTER); - print(elementName, elementName.equals("!--") ? AFTER+FORCE : AFTER); + print(elementName, elementName.equals("!--") ? AFTER + FORCE : AFTER); stack.add(elementName); inElement = true; return this; } private MLStreamWriter closeIfOpen() { - if (inElement && !"!--".equals(stack.get(stack.size()-1))) { - print('>',BEFORE+FORCE); + if (inElement && !"!--".equals(stack.get(stack.size() - 1))) { + print('>', BEFORE + FORCE); } inElement = false; return this; } - final public MLStreamWriter cel(String elementName) { + public final MLStreamWriter cel(String elementName) { return cl().tx(elementName); } public MLStreamWriter at(String attributeName, String attributeValue) { if (!inElement) { - throw new IllegalArgumentException("attribute \"" + attributeName + "\" not in element"); + throw new IllegalArgumentException( + "attribute \"" + attributeName + "\" not in element"); } print(' ', BOTH); print(attributeName, AFTER); @@ -91,49 +90,53 @@ public MLStreamWriter tx(String text) { return this; } - final public MLStreamWriter tx(char text) { + public final MLStreamWriter tx(char text) { return tx(String.valueOf(text)); } - final public MLStreamWriter tx(int text) { + public final MLStreamWriter tx(int text) { return tx(String.valueOf(text)); } - final public MLStreamWriter tx16(String text) { + public 
final MLStreamWriter tx16(String text) { return tx(hex(text)); } - final public MLStreamWriter tx16(char text) { + public final MLStreamWriter tx16(char text) { return tx(hex(text)); } - final public MLStreamWriter tx16(int text) { + public final MLStreamWriter tx16(int text) { return tx(hex(text)); } public MLStreamWriter cl(String closingElement) { closeIfOpen(); - final String lastElement = (String)stack.remove(stack.size()-1); + final String lastElement = (String) stack.remove(stack.size() - 1); if (closingElement != null && !closingElement.equals(lastElement)) { - throw new IllegalArgumentException("mismatch when closing \"" + closingElement - + "\", current active element is \"" + lastElement + "\""); + throw new IllegalArgumentException( + "mismatch when closing \"" + + closingElement + + "\", current active element is \"" + + lastElement + + "\""); } - if (lastElement.equals("!--")) {// hack for XML/HTML - print("-->",BEFORE+FORCE); + if (lastElement.equals("!--")) { // hack for XML/HTML + print("-->", BEFORE + FORCE); } else { print("',BEFORE); + print('>', BEFORE); } return this; } - final public MLStreamWriter cl() { + public final MLStreamWriter cl() { return cl(null); } public MLStreamWriter closeAllElements() { - for (int i = stack.size()-1; i >= 0; --i) { + for (int i = stack.size() - 1; i >= 0; --i) { cl(null); } return this; @@ -161,7 +164,7 @@ public void flush() { // Utility methods - final public MLStreamWriter cell(String ch, String type, String codepoint, String cat) { + public final MLStreamWriter cell(String ch, String type, String codepoint, String cat) { if (codepoint == null) { codepoint = ch; } @@ -169,8 +172,8 @@ final public MLStreamWriter cell(String ch, String type, String codepoint, Strin if (dotpos == -1) { el(type); } else { - el(type.substring(0,dotpos)); - at("class",type.substring(dotpos+1)); + el(type.substring(0, dotpos)); + at("class", type.substring(dotpos + 1)); } /* if (color == -1) { @@ -190,42 +193,39 @@ final public 
MLStreamWriter cell(String ch, String type, String codepoint, Strin return this; } - final public MLStreamWriter cell(String ch) { - return cell(ch,"td",null,null); + public final MLStreamWriter cell(String ch) { + return cell(ch, "td", null, null); } - final public MLStreamWriter cell(String ch, String type) { - return cell(ch,type,null,null); + public final MLStreamWriter cell(String ch, String type) { + return cell(ch, type, null, null); } - final public MLStreamWriter cell(String ch, String type, String codepoint) { - return cell(ch,type,codepoint,null); + public final MLStreamWriter cell(String ch, String type, String codepoint) { + return cell(ch, type, codepoint, null); } - static public String hex(int i, int width) { + public static String hex(int i, int width) { final String result = Long.toString(i & 0xFFFFFFFFL, 16).toUpperCase(); - return "00000000".substring(result.length(),width) + result; + return "00000000".substring(result.length(), width) + result; } - /** - * Supplies a zero-padded hex representation of an integer (without 0x) - */ - static public String hex(int i) { - return hex(i,8); + /** Supplies a zero-padded hex representation of an integer (without 0x) */ + public static String hex(int i) { + return hex(i, 8); } - /** - * Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u) - */ - static public String hex(char i) { - return hex(i,4); + /** Supplies a zero-padded hex representation of a Unicode character (without 0x, \\u) */ + public static String hex(char i) { + return hex(i, 4); } /** * Supplies a zero-padded hex representation of a Unicode String (without 0x, \\u) - *@param sep can be used to give a sequence, e.g. hex("ab", ",") gives "0061,0062" + * + * @param sep can be used to give a sequence, e.g. 
hex("ab", ",") gives "0061,0062" */ - static public String hex(String s, String sep) { + public static String hex(String s, String sep) { final StringBuffer result = new StringBuffer(); for (int i = 0; i < s.length(); ++i) { if (i != 0) { @@ -236,13 +236,20 @@ static public String hex(String s, String sep) { return result.toString(); } - static public String hex(String s) { - return hex(s," "); + public static String hex(String s) { + return hex(s, " "); } - public void author(String name, String url) { - el("font").at("size","-3").tx("[").el("a").at("href",url).tx(name).cl("a").el("script").el("!--"); + el("font") + .at("size", "-3") + .tx("[") + .el("a") + .at("href", url) + .tx(name) + .cl("a") + .el("script") + .el("!--"); tx("document.write(', ', document.lastModified);"); cl("!--").cl("script").tx("]").cl("font"); } @@ -258,14 +265,14 @@ public void author(String name, String url) { int maxLineLength = 60; // later, add better line end management, indenting - static final int NONE=0, BEFORE=1, AFTER=2, BOTH=3, FORCE = 4; // chosen for bits!! + static final int NONE = 0, BEFORE = 1, AFTER = 2, BOTH = 3, FORCE = 4; // chosen for bits!! 
final void print(String s) { - print(s,NONE); + print(s, NONE); } final void print(char c) { - print(c,NONE); + print(c, NONE); } final void print(String s, int doesBreak) { @@ -305,41 +312,43 @@ public String quoted(String source) { final StringBuffer result = new StringBuffer(); for (int i = 0; i < source.length(); ++i) { final char ch = source.charAt(i); - switch(ch) { - case '\'': - if (!isHTML) { - result.append("'"); - } else { + switch (ch) { + case '\'': + if (!isHTML) { + result.append("'"); + } else { + result.append(ch); + } + break; + case '\"': + result.append("""); + break; + case '<': + result.append("<"); + break; + case '&': + result.append("&"); + break; + case '>': + result.append(">"); + break; + case '\n': + case '\r': + case '\t': result.append(ch); - } - break; - case '\"': - result.append("""); - break; - case '<': - result.append("<"); - break; - case '&': - result.append("&"); - break; - case '>': - result.append(">"); - break; - case '\n': case '\r': case '\t': - result.append(ch); - break; - default: if (ch < ' ' // do surrogates later - || ch >= '\u007F' && ch <= '\u009F' - || ch >= '\uD800' && ch <= '\uDFFF' - || ch >= '\uFFFE') { - result.append('\uFFFD'); - } else { - result.append(ch); - } - break; + break; + default: + if (ch < ' ' // do surrogates later + || ch >= '\u007F' && ch <= '\u009F' + || ch >= '\uD800' && ch <= '\uDFFF' + || ch >= '\uFFFE') { + result.append('\uFFFD'); + } else { + result.append(ch); + } + break; } } return result.toString(); } - -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/Main.java b/unicodetools/src/main/java/org/unicode/text/UCD/Main.java index 48572f401..f8a532178 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/Main.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/Main.java @@ -1,15 +1,14 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business 
Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/Main.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/UCD/Main.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.UCD; + import org.unicode.text.utility.CallArgs; import org.unicode.text.utility.DirectoryIterator; import org.unicode.text.utility.FastBinarySearch; @@ -34,7 +33,7 @@ public final class Main implements UCD_Types { "DerivedAge", "StandardizedVariants", "HangulSyllableType", - //"OtherDerivedProperties", + // "OtherDerivedProperties", }; static final String[] EXTRACTED_FILES = { @@ -51,18 +50,15 @@ public final class Main implements UCD_Types { "DerivedNumericValues", }; - static final String[] ALL_FILES = { - "Core", "Extracted" - }; + static final String[] ALL_FILES = {"Core", "Extracted"}; - public static void main (String[] args) throws Exception { + public static void main(String[] args) throws Exception { System.out.println("*** Start *** " + Default.getDate()); try { for (int i = 0; i < args.length; ++i) { final String arg = args[i]; - if (arg.charAt(0) == '#') - { + if (arg.charAt(0) == '#') { return; // skip rest of line } @@ -73,19 +69,19 @@ public static void main (String[] args) throws Exception { // Expand string arguments if (arg.equalsIgnoreCase("ALL")) { - args = Utility.append(ALL_FILES, Utility.subarray(args, i+1)); + args = Utility.append(ALL_FILES, Utility.subarray(args, i + 1)); i = -1; continue; } if (arg.equalsIgnoreCase("CORE")) { - args = Utility.append(CORE_FILES, Utility.subarray(args, i+1)); + args = Utility.append(CORE_FILES, Utility.subarray(args, i + 1)); i = -1; continue; } if (arg.equalsIgnoreCase("EXTRACTED")) { - args = Utility.append(EXTRACTED_FILES, Utility.subarray(args, i+1)); + args = Utility.append(EXTRACTED_FILES, Utility.subarray(args, i + 1)); i = -1; continue; } @@ -106,7 +102,7 @@ public static void main (String[] args) throws Exception { VerifyUCD.checkAgainstUInfo(); } else if (arg.equalsIgnoreCase("build")) { - ConvertUCD.main(new String[]{Default.ucdVersion()}); + ConvertUCD.main(new String[] 
{Default.ucdVersion()}); } else if (arg.equalsIgnoreCase("statistics")) { VerifyUCD.statistics(); } else if (arg.equalsIgnoreCase("NFSkippable")) { @@ -182,9 +178,9 @@ public static void main (String[] args) throws Exception { } else if (arg.equalsIgnoreCase("GenerateThaiBreaks")) { GenerateThaiBreaks.main(null); } else if (arg.equalsIgnoreCase("TestData")) { - TestData.main(new String[]{args[++i]}); + TestData.main(new String[] {args[++i]}); } else if (arg.equalsIgnoreCase("MakeUnicodeFiles")) { - MakeUnicodeFiles.main(new String[]{}); + MakeUnicodeFiles.main(new String[] {}); } else if (arg.equalsIgnoreCase("checkScripts")) { VerifyUCD.checkScripts(); } else if (arg.equalsIgnoreCase("IdentifierTest")) { @@ -197,7 +193,8 @@ public static void main (String[] args) throws Exception { DirectoryIterator.test(); } else if (arg.equalsIgnoreCase("testnameuniqueness")) { TestNameUniqueness.checkNameList(); - //else if (arg.equalsIgnoreCase("checkDifferences")) GenerateData.checkDifferences("3.2.0"); + // else if (arg.equalsIgnoreCase("checkDifferences")) + // GenerateData.checkDifferences("3.2.0"); } else if (arg.equalsIgnoreCase("Compare14652")) { Compare14652.main(null); } else if (arg.equalsIgnoreCase("StandardizedVariants")) { @@ -214,98 +211,98 @@ public static void main (String[] args) throws Exception { GenerateCaseFolding.generateSpecialCasing(false); /* } else if (arg.equalsIgnoreCase("CompositionExclusions")) { - GenerateData.generateCompExclusions(); + GenerateData.generateCompExclusions(); - } else if (arg.equalsIgnoreCase("DerivedAge")) { - GenerateData.generateAge("DerivedData/", "DerivedAge"); + } else if (arg.equalsIgnoreCase("DerivedAge")) { + GenerateData.generateAge("DerivedData/", "DerivedAge"); - } else if (arg.equalsIgnoreCase("backwardsCompat")) { - GenerateData.backwardsCompat("DerivedData/extracted/", "Compatibility_ID_START", - new int[] {ID_Start, ID_Continue_NO_Cf, Mod_ID_Start, Mod_ID_Continue_NO_Cf}); + } else if 
(arg.equalsIgnoreCase("backwardsCompat")) { + GenerateData.backwardsCompat("DerivedData/extracted/", "Compatibility_ID_START", + new int[] {ID_Start, ID_Continue_NO_Cf, Mod_ID_Start, Mod_ID_Continue_NO_Cf}); - } else if (arg.equalsIgnoreCase("DerivedCoreProperties")) { - GenerateData.generateDerived(DERIVED_CORE, true, GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedCoreProperties"); + } else if (arg.equalsIgnoreCase("DerivedCoreProperties")) { + GenerateData.generateDerived(DERIVED_CORE, true, GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedCoreProperties"); - } else if (arg.equalsIgnoreCase("DerivedNormalizationProps")) { - GenerateData.generateDerived(DERIVED_NORMALIZATION, true, GenerateData.HEADER_DERIVED, "DerivedData/", - "DerivedNormalizationProps" ); + } else if (arg.equalsIgnoreCase("DerivedNormalizationProps")) { + GenerateData.generateDerived(DERIVED_NORMALIZATION, true, GenerateData.HEADER_DERIVED, "DerivedData/", + "DerivedNormalizationProps" ); - } else if (arg.equalsIgnoreCase("NormalizationTest")) { - GenerateData.writeNormalizerTestSuite("DerivedData/", "NormalizationTest"); + } else if (arg.equalsIgnoreCase("NormalizationTest")) { + GenerateData.writeNormalizerTestSuite("DerivedData/", "NormalizationTest"); - } else if (arg.equalsIgnoreCase("PropertyAliases")) { - GenerateData.generatePropertyAliases(); + } else if (arg.equalsIgnoreCase("PropertyAliases")) { + GenerateData.generatePropertyAliases(); - } else if (arg.equalsIgnoreCase("PropList")) { - GenerateData.generateVerticalSlice(BINARY_PROPERTIES + White_space, BINARY_PROPERTIES + NEXT_ENUM, - GenerateData.HEADER_EXTEND, "DerivedData/", "PropList"); + } else if (arg.equalsIgnoreCase("PropList")) { + GenerateData.generateVerticalSlice(BINARY_PROPERTIES + White_space, BINARY_PROPERTIES + NEXT_ENUM, + GenerateData.HEADER_EXTEND, "DerivedData/", "PropList"); - } else if (arg.equalsIgnoreCase("Scripts")) { - GenerateData.generateVerticalSlice(SCRIPT+1, SCRIPT + NEXT_ENUM, - 
GenerateData.HEADER_SCRIPTS, "DerivedData/", "Scripts"); - // OTHER TESTING + } else if (arg.equalsIgnoreCase("Scripts")) { + GenerateData.generateVerticalSlice(SCRIPT+1, SCRIPT + NEXT_ENUM, + GenerateData.HEADER_SCRIPTS, "DerivedData/", "Scripts"); + // OTHER TESTING - } else if (arg.equalsIgnoreCase("OtherDerivedProperties")) { - //mask = Utility.setBits(0, NFC_Leading, NFC_Resulting); - GenerateData.generateDerived((byte)(ALL & ~DERIVED_CORE & ~DERIVED_NORMALIZATION), false, GenerateData.HEADER_DERIVED, "OtherData/", "OtherDerivedProperties"); + } else if (arg.equalsIgnoreCase("OtherDerivedProperties")) { + //mask = Utility.setBits(0, NFC_Leading, NFC_Resulting); + GenerateData.generateDerived((byte)(ALL & ~DERIVED_CORE & ~DERIVED_NORMALIZATION), false, GenerateData.HEADER_DERIVED, "OtherData/", "OtherDerivedProperties"); - } else if (arg.equalsIgnoreCase("AllBinary")) { - GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES + NEXT_ENUM, - GenerateData.HEADER_EXTEND, "OtherDerived/", "AllBinary"); + } else if (arg.equalsIgnoreCase("AllBinary")) { + GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES + NEXT_ENUM, + GenerateData.HEADER_EXTEND, "OtherDerived/", "AllBinary"); - } else if (arg.equalsIgnoreCase("DerivedGeneralCategoryTEST")) { - GenerateData.generateVerticalSlice(CATEGORY+29, CATEGORY+32, GenerateData.HEADER_DERIVED, - "DerivedData/", "DerivedGeneralCategory" ); + } else if (arg.equalsIgnoreCase("DerivedGeneralCategoryTEST")) { + GenerateData.generateVerticalSlice(CATEGORY+29, CATEGORY+32, GenerateData.HEADER_DERIVED, + "DerivedData/", "DerivedGeneralCategory" ); - } else if (arg.equalsIgnoreCase("listDifferences")) { - CompareProperties.listDifferences(); + } else if (arg.equalsIgnoreCase("listDifferences")) { + CompareProperties.listDifferences(); - } else if (arg.equalsIgnoreCase("partition")) { - CompareProperties.partition(); + } else if (arg.equalsIgnoreCase("partition")) { + CompareProperties.partition(); 
- } else if (arg.equalsIgnoreCase("propertyStatistics")) { - CompareProperties.statistics(); + } else if (arg.equalsIgnoreCase("propertyStatistics")) { + CompareProperties.statistics(); - } else if (arg.equalsIgnoreCase("listAccents")) { - GenerateData.listCombiningAccents(); + } else if (arg.equalsIgnoreCase("listAccents")) { + GenerateData.listCombiningAccents(); - } else if (arg.equalsIgnoreCase("listGreekVowels")) { - GenerateData.listGreekVowels(); + } else if (arg.equalsIgnoreCase("listGreekVowels")) { + GenerateData.listGreekVowels(); - } else if (arg.equalsIgnoreCase("listKatakana")) { - GenerateData.listKatakana(); - */ + } else if (arg.equalsIgnoreCase("listKatakana")) { + GenerateData.listKatakana(); + */ /* - } else if (arg.equalsIgnoreCase("DerivedFullNormalization")) { - mask = Utility.setBits(0, DerivedProperty.GenNFD, DerivedProperty.GenNFKC); - GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedFullNormalization" ); - } else if (arg.equalsIgnoreCase("caseignorable")) { - mask = Utility.setBits(0, DerivedProperty.Other_Case_Ignorable, DerivedProperty.Type_i); - GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "OtherData/", "CaseIgnorable" ); - } else if (arg.equalsIgnoreCase("nfunsafestart")) { - mask = Utility.setBits(0, NFD_UnsafeStart, NFKC_UnsafeStart); - GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "OtherData/", "NFUnsafeStart"); - */ + } else if (arg.equalsIgnoreCase("DerivedFullNormalization")) { + mask = Utility.setBits(0, DerivedProperty.GenNFD, DerivedProperty.GenNFKC); + GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedFullNormalization" ); + } else if (arg.equalsIgnoreCase("caseignorable")) { + mask = Utility.setBits(0, DerivedProperty.Other_Case_Ignorable, DerivedProperty.Type_i); + GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "OtherData/", "CaseIgnorable" ); + } else if (arg.equalsIgnoreCase("nfunsafestart")) 
{ + mask = Utility.setBits(0, NFD_UnsafeStart, NFKC_UnsafeStart); + GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "OtherData/", "NFUnsafeStart"); + */ } else { - CallArgs.call(new String[]{arg}, classPrefix); + CallArgs.call(new String[] {arg}, classPrefix); } + // checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F"); + // checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD"); - //checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F"); - //checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD"); - - - //GenerateData.generateDerived(Utility.setBits(0, DerivedProperty.PropMath, DerivedProperty.Mod_ID_Continue_NO_Cf), + // GenerateData.generateDerived(Utility.setBits(0, DerivedProperty.PropMath, + // DerivedProperty.Mod_ID_Continue_NO_Cf), // GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedPropData2" ); - //GenerateData.generateVerticalSlice(SCRIPT, SCRIPT+1, "ScriptCommon" ); - //listStrings("LowerCase" , 0,0); - //GenerateData.generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedData/", "DerivedPropData1" ); + // GenerateData.generateVerticalSlice(SCRIPT, SCRIPT+1, "ScriptCommon" ); + // listStrings("LowerCase" , 0,0); + // GenerateData.generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, + // "DerivedData/", "DerivedPropData1" ); // AGE stuff - //UCD ucd = UCD.make(); - //System.out.println(ucd.getAgeID(0x61)); - //System.out.println(ucd.getAgeID(0x2FA1D)); + // UCD ucd = UCD.make(); + // System.out.println(ucd.getAgeID(0x61)); + // System.out.println(ucd.getAgeID(0x2FA1D)); // } diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java index d5c6e455a..301a0d517 100644 --- a/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java +++ b/unicodetools/src/main/java/org/unicode/text/UCD/MakeUnicodeFiles.java @@ -1,5 +1,14 @@ package org.unicode.text.UCD; +import 
com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R3; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.VersionInfo; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; @@ -20,17 +29,16 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.Tabber; +import org.unicode.cldr.util.props.UnicodeLabel; import org.unicode.props.BagFormatter; import org.unicode.props.DefaultValues; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; -import org.unicode.cldr.util.props.UnicodeLabel; -import org.unicode.props.UnicodeProperty; import org.unicode.props.UcdPropertyValues.Bidi_Class_Values; import org.unicode.props.UcdPropertyValues.Block_Values; +import org.unicode.props.UnicodeProperty; import org.unicode.text.UCD.MakeUnicodeFiles.Format.PrintStyle; import org.unicode.text.utility.ChainException; import org.unicode.text.utility.Settings; @@ -38,16 +46,6 @@ import org.unicode.text.utility.Utility; import org.unicode.tools.Segmenter; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R3; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; -import com.ibm.icu.util.VersionInfo; - public class MakeUnicodeFiles { static boolean DEBUG = false; @@ -59,7 +57,6 @@ public static void main(String[] args) throws IOException { static class Format { public static Format theFormat = new Format(); // singleton - Map printStyleMap = new TreeMap(UnicodeProperty.PROPERTY_COMPARATOR); static PrintStyle DEFAULT_PRINT_STYLE = new 
PrintStyle(); @@ -72,7 +69,7 @@ static class Format { UnicodeProperty.MapFilter hackMapFilter; String[] filesToDo; - private Format(){ + private Format() { build(); } @@ -146,20 +143,19 @@ String parse(String options) { longValueHeading = afterEquals(piece); } else if (piece.startsWith("skipValue=")) { if (skipUnassigned != null) { - throw new IllegalArgumentException("Can't have both skipUnassigned and skipValue"); + throw new IllegalArgumentException( + "Can't have both skipUnassigned and skipValue"); } skipValue = afterEquals(piece); } else if (piece.startsWith("skipUnassigned=")) { if (skipValue != null) { - throw new IllegalArgumentException("Can't have both skipUnassigned and skipValue"); + throw new IllegalArgumentException( + "Can't have both skipUnassigned and skipValue"); } skipUnassigned = afterEquals(piece); } else if (piece.length() != 0) { throw new IllegalArgumentException( - "Illegal PrintStyle Parameter: " - + piece - + " in " - + pieces[0]); + "Illegal PrintStyle Parameter: " + piece + " in " + pieces[0]); } } return pieces[0]; @@ -172,7 +168,8 @@ private boolean afterEqualsBoolean(String piece) { } else if (value.equalsIgnoreCase("false")) { return false; } - throw new IllegalArgumentException("Value in <" + piece + "> must be 'true' or 'false'"); + throw new IllegalArgumentException( + "Value in <" + piece + "> must be 'true' or 'false'"); } @Override @@ -189,7 +186,8 @@ public String toString() { } else { value = obj.toString(); } - } catch (final Exception e) {} + } catch (final Exception e) { + } result += "\t" + myField.getName() + "=<" + value + ">\n"; } return result; @@ -200,8 +198,7 @@ void addValueComments(String property, String value, String comments) { if (DEBUG) { showPVC(property, value, comments); } - Map valueToComments = - propertyToValueToComments.get(property); + Map valueToComments = propertyToValueToComments.get(property); if (valueToComments == null) { valueToComments = new TreeMap(); propertyToValueToComments.put(property, 
valueToComments); @@ -219,19 +216,25 @@ private void showPVC(String property, String value, String comments) { + ">, Value: <" + value + ">, Comments: <" - + comments + ">"); + + comments + + ">"); } String getValueComments(String property, String value) { - final Map valueToComments = - propertyToValueToComments.get(property); + final Map valueToComments = propertyToValueToComments.get(property); String result = null; if (valueToComments != null) { result = valueToComments.get(value); } if (DEBUG) { - System.out.println("Getting Property: <" + property + ">, Value: <" - + value + ">, Comment: <" + result + ">"); + System.out.println( + "Getting Property: <" + + property + + ">, Value: <" + + value + + ">, Comment: <" + + result + + ">"); } return result; } @@ -258,7 +261,9 @@ static String afterWhitespace(String source) { private void build() { BufferedReader br = null; try { - br = Utility.openReadFile(Settings.SRC_UCD_DIR + "MakeUnicodeFiles.txt", Utility.UTF8); + br = + Utility.openReadFile( + Settings.SRC_UCD_DIR + "MakeUnicodeFiles.txt", Utility.UTF8); String file = null, property = null, value = "", comments = ""; while (true) { String line = br.readLine(); @@ -300,9 +305,9 @@ private void build() { Default.setYear(lineValue); } else if (line.startsWith("File:")) { final int p2 = lineValue.lastIndexOf('/'); - file = lineValue.substring(p2+1); + file = lineValue.substring(p2 + 1); if (p2 >= 0) { - fileToDirectory.put(file, lineValue.substring(0,p2+1)); + fileToDirectory.put(file, lineValue.substring(0, p2 + 1)); } property = null; } else if (line.startsWith("Property:")) { @@ -338,7 +343,8 @@ private void build() { private void write() { final TreeMap fileoptions = new TreeMap(); - for (final Iterator it = fileToPropertySet.keySet().iterator(); it.hasNext();) { + for (final Iterator it = fileToPropertySet.keySet().iterator(); + it.hasNext(); ) { final String key = it.next(); if (DEBUG) { System.out.println(); @@ -349,7 +355,7 @@ private void write() { 
System.out.println("SPECIAL"); continue; } - for (final Iterator pIt = propList2.iterator(); pIt.hasNext();) { + for (final Iterator pIt = propList2.iterator(); pIt.hasNext(); ) { final String prop = pIt.next(); final String options = fileoptions.get(prop); if (DEBUG) { @@ -363,7 +369,7 @@ private void write() { if (vc == null) { continue; } - for (final Iterator it2 = vc.keySet().iterator(); it2.hasNext();) { + for (final Iterator it2 = vc.keySet().iterator(); it2.hasNext(); ) { final String value = it2.next(); final String comment = vc.get(value); if (DEBUG) { @@ -389,9 +395,11 @@ private void addPropertyToFile(String filename, String property) { } properties.add(property); } + public List getPropertiesFromFile(String filename) { return fileToPropertySet.get(filename); } + public Set getFiles() { return fileToPropertySet.keySet(); } @@ -400,7 +408,8 @@ public Set getFiles() { public static void generateFile() throws IOException { for (final String element : Format.theFormat.filesToDo) { final String fileNamePattern = element.trim(); - final Matcher matcher = Pattern.compile(fileNamePattern, Pattern.CASE_INSENSITIVE).matcher(""); + final Matcher matcher = + Pattern.compile(fileNamePattern, Pattern.CASE_INSENSITIVE).matcher(""); final Iterator it = Format.theFormat.getFiles().iterator(); boolean gotOne = false; while (it.hasNext()) { @@ -412,8 +421,7 @@ public static void generateFile() throws IOException { gotOne = true; } if (!gotOne) { - throw new IllegalArgumentException( - "Non-matching file name: " + fileNamePattern); + throw new IllegalArgumentException("Non-matching file name: " + fileNamePattern); } } } @@ -431,52 +439,57 @@ public static void generateFile(String filename) throws IOException { } else if (filename.contains("ScriptNfkc")) { generateScriptNfkc(filename); } else { - switch(filename) { - case "unihan": - writeUnihan(outputDir + "unihan/"); - break; - case "NormalizationTest": - GenerateData.writeNormalizerTestSuite(outputDir, 
"NormalizationTest"); - break; - case "BidiTest": - doBidiTest(filename); - break; - case "CaseFolding": - GenerateCaseFolding.makeCaseFold(false); - break; - case "SpecialCasing": - GenerateCaseFolding.generateSpecialCasing(false); - break; - case "StandardizedVariants": - GenerateStandardizedVariants.generate(); - break; - case "GraphemeBreakTest": - new GenerateBreakTest.GenerateGraphemeBreakTest( - Default.ucd(), Segmenter.Target.FOR_UCD).run(); - break; - case "WordBreakTest": - new GenerateBreakTest.GenerateWordBreakTest( - Default.ucd(), Segmenter.Target.FOR_UCD).run(); - break; - case "LineBreakTest": - new GenerateBreakTest.GenerateLineBreakTest( - Default.ucd(), Segmenter.Target.FOR_UCD).run(); - break; - case "SentenceBreakTest": - new GenerateBreakTest.GenerateSentenceBreakTest( - Default.ucd(), Segmenter.Target.FOR_UCD).run(); - break; - case "GraphemeBreakTest-cldr": - new GenerateBreakTest.GenerateGraphemeBreakTest( - Default.ucd(), Segmenter.Target.FOR_CLDR).run(); - break; - case "DerivedName": - case "DerivedLabel": - generateDerivedName(filename); - break; - default: - generatePropertyFile(filename); - break; + switch (filename) { + case "unihan": + writeUnihan(outputDir + "unihan/"); + break; + case "NormalizationTest": + GenerateData.writeNormalizerTestSuite(outputDir, "NormalizationTest"); + break; + case "BidiTest": + doBidiTest(filename); + break; + case "CaseFolding": + GenerateCaseFolding.makeCaseFold(false); + break; + case "SpecialCasing": + GenerateCaseFolding.generateSpecialCasing(false); + break; + case "StandardizedVariants": + GenerateStandardizedVariants.generate(); + break; + case "GraphemeBreakTest": + new GenerateBreakTest.GenerateGraphemeBreakTest( + Default.ucd(), Segmenter.Target.FOR_UCD) + .run(); + break; + case "WordBreakTest": + new GenerateBreakTest.GenerateWordBreakTest( + Default.ucd(), Segmenter.Target.FOR_UCD) + .run(); + break; + case "LineBreakTest": + new GenerateBreakTest.GenerateLineBreakTest( + Default.ucd(), 
Segmenter.Target.FOR_UCD) + .run(); + break; + case "SentenceBreakTest": + new GenerateBreakTest.GenerateSentenceBreakTest( + Default.ucd(), Segmenter.Target.FOR_UCD) + .run(); + break; + case "GraphemeBreakTest-cldr": + new GenerateBreakTest.GenerateGraphemeBreakTest( + Default.ucd(), Segmenter.Target.FOR_CLDR) + .run(); + break; + case "DerivedName": + case "DerivedLabel": + generateDerivedName(filename); + break; + default: + generatePropertyFile(filename); + break; } } } @@ -486,8 +499,7 @@ private static void generateDerivedName(String filename) throws IOException { final String dir = Format.theFormat.fileToDirectory.get(filename); final UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader( - "UCD/" + Default.ucdVersion() + '/' + dir, - filename); + "UCD/" + Default.ucdVersion() + '/' + dir, filename); final PrintWriter pw = udf.out; Format.theFormat.printFileComments(pw, filename); final UCD ucd = Default.ucd(); @@ -511,19 +523,21 @@ private static void generateDerivedName(String filename) throws IOException { names.put(i, name); } names.freeze(); - UnicodeLabel nameProp = new UnicodeProperty.UnicodeMapProperty() - .set(names) - .setMain("X", "X", UnicodeProperty.STRING, Default.ucdVersion()); + UnicodeLabel nameProp = + new UnicodeProperty.UnicodeMapProperty() + .set(names) + .setMain("X", "X", UnicodeProperty.STRING, Default.ucdVersion()); final BagFormatter bf = new BagFormatter(); bf.setHexValue(false) - .setMergeRanges(true) - .setNameSource(null) - .setLabelSource(null) - .setShowCount(false) - .setValueSource(nameProp) - .setRangeBreakSource(new UnicodeLabel.Constant("")) // prevent breaking on category boundaries - .showSetNames(pw, UnicodeSet.ALL_CODE_POINTS); + .setMergeRanges(true) + .setNameSource(null) + .setLabelSource(null) + .setShowCount(false) + .setValueSource(nameProp) + .setRangeBreakSource( + new UnicodeLabel.Constant("")) // prevent breaking on category boundaries + .showSetNames(pw, UnicodeSet.ALL_CODE_POINTS); pw.println(); 
pw.println("# EOF"); @@ -534,14 +548,14 @@ private static void generateScriptNfkc(String filename) throws IOException { final String dir = Format.theFormat.fileToDirectory.get(filename); final UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader( - "UCD/" + Default.ucdVersion() + '/' + dir, - filename); + "UCD/" + Default.ucdVersion() + '/' + dir, filename); final PrintWriter pw = udf.out; Format.theFormat.printFileComments(pw, filename); final UCD ucd = Default.ucd(); final BitSet normScripts = new BitSet(); - final UnicodeMap> results = new UnicodeMap>(); + final UnicodeMap> results = + new UnicodeMap>(); for (int i = 0; i <= 0x10FFFF; ++i) { final byte dt = ucd.getDecompositionType(i); if (dt == UCD_Types.NONE) { @@ -552,18 +566,29 @@ private static void generateScriptNfkc(String filename) throws IOException { final BitSet scripts = ucd.getScripts(norm, normScripts); scripts.clear(UCD_Types.COMMON_SCRIPT); scripts.clear(UCD_Types.INHERITED_SCRIPT); - final int expectedCount = script == UCD_Types.COMMON_SCRIPT || script == UCD_Types.INHERITED_SCRIPT ? 0 : 1; + final int expectedCount = + script == UCD_Types.COMMON_SCRIPT || script == UCD_Types.INHERITED_SCRIPT + ? 
0 + : 1; if (scripts.cardinality() != expectedCount) { - results.put(i, Row.of(Character.codePointCount(norm, 0, norm.length()),UCD.getScriptID_fromIndex(script, UCD_Types.LONG), ucd.getScriptIDs(norm, " ", UCD_Types.LONG))); + results.put( + i, + Row.of( + Character.codePointCount(norm, 0, norm.length()), + UCD.getScriptID_fromIndex(script, UCD_Types.LONG), + ucd.getScriptIDs(norm, " ", UCD_Types.LONG))); } } results.freeze(); - final BagFormatter bf = new BagFormatter(ToolUnicodePropertySource.make(Default.ucdVersion())); + final BagFormatter bf = + new BagFormatter(ToolUnicodePropertySource.make(Default.ucdVersion())); pw.println(""); - for (final R3 value : results.values(new TreeSet>())) { + for (final R3 value : + results.values(new TreeSet>())) { final UnicodeSet uset = results.getSet(value); - pw.println("#\t" + value.get1() + "\t=>\t" + value.get2() + "\t" + uset.toPattern(false)); + pw.println( + "#\t" + value.get1() + "\t=>\t" + value.get2() + "\t" + uset.toPattern(false)); pw.println(""); pw.println(bf.showSetNames(uset)); } @@ -573,9 +598,7 @@ private static void generateScriptNfkc(String filename) throws IOException { private static void doBidiTest(String filename) throws IOException { final UnicodeDataFile udf = - UnicodeDataFile.openAndWriteHeader( - "UCD/" + Default.ucdVersion() + '/', - filename); + UnicodeDataFile.openAndWriteHeader("UCD/" + Default.ucdVersion() + '/', filename); final PrintWriter pw = udf.out; Format.theFormat.printFileComments(pw, filename); org.unicode.bidi.BidiConformanceTestBuilder.write(pw); @@ -594,19 +617,28 @@ private static void writeUnihan(String directory) throws IOException { for (final String propName : props.keySet()) { final UnicodeDataFile udf = - UnicodeDataFile.openAndWriteHeader(directory, propName).setSkipCopyright(Settings.SKIP_COPYRIGHT); + UnicodeDataFile.openAndWriteHeader(directory, propName) + .setSkipCopyright(Settings.SKIP_COPYRIGHT); final PrintWriter pw = udf.out; final BagFormatter bf = new 
BagFormatter(); bf.setHexValue(false) - .setMergeRanges(true) - .setNameSource(null) - .setLabelSource(null) - .setShowCount(false); - - final UnicodeProperty prop = new UnicodeProperty.UnicodeMapProperty().set(props.get(propName)).setMain(propName, propName, UnicodeProperty.STRING, Default.ucdVersion()); + .setMergeRanges(true) + .setNameSource(null) + .setLabelSource(null) + .setShowCount(false); + + final UnicodeProperty prop = + new UnicodeProperty.UnicodeMapProperty() + .set(props.get(propName)) + .setMain( + propName, + propName, + UnicodeProperty.STRING, + Default.ucdVersion()); final String name = prop.getName(); - System.out.println("Property: " + name + "; " + UnicodeProperty.getTypeName(prop.getType())); + System.out.println( + "Property: " + name + "; " + UnicodeProperty.getTypeName(prop.getType())); pw.println(); pw.println(SEPARATOR); pw.println(); @@ -617,16 +649,17 @@ private static void writeUnihan(String directory) throws IOException { if (map.getAvailableValues().size() < 100) { writeEnumeratedValues(pw, bf, unassigned, prop, ps); } else { - bf.setValueSource(prop) - .showSetNames(pw,new UnicodeSet(0,0x10FFFF)); + bf.setValueSource(prop).showSetNames(pw, new UnicodeSet(0, 0x10FFFF)); } } } - private static Map> getUnihanProps() { - final Map> unihanProps = new TreeMap>(); + private static Map> getUnihanProps() { + final Map> unihanProps = + new TreeMap>(); try { - final BufferedReader in = Utility.openUnicodeFile("Unihan", Default.ucdVersion(), true, Utility.UTF8); + final BufferedReader in = + Utility.openUnicodeFile("Unihan", Default.ucdVersion(), true, Utility.UTF8); int lineCounter = 0; while (true) { Utility.dot(++lineCounter); @@ -644,9 +677,9 @@ private static Map> getUnihanProps() { line = line.trim(); final int tabPos = line.indexOf('\t'); - final int tabPos2 = line.indexOf('\t', tabPos+1); + final int tabPos2 = line.indexOf('\t', tabPos + 1); - final String property = line.substring(tabPos+1, tabPos2).trim(); + final String property = 
line.substring(tabPos + 1, tabPos2).trim(); UnicodeMap result = unihanProps.get(property); if (result == null) { unihanProps.put(property, result = new UnicodeMap()); @@ -654,7 +687,7 @@ private static Map> getUnihanProps() { final String scode = line.substring(2, tabPos).trim(); final int code = Integer.parseInt(scode, 16); - final String propertyValue = line.substring(tabPos2+1).trim(); + final String propertyValue = line.substring(tabPos2 + 1).trim(); result.put(code, propertyValue); } in.close(); @@ -670,28 +703,25 @@ private static Map> getUnihanProps() { public static void generateAliasFile(String filename) throws IOException { final UnicodeDataFile udf = - UnicodeDataFile.openAndWriteHeader( - "UCD/" + Default.ucdVersion() + '/', - filename). - setSkipCopyright(Settings.SKIP_COPYRIGHT); + UnicodeDataFile.openAndWriteHeader("UCD/" + Default.ucdVersion() + '/', filename) + .setSkipCopyright(Settings.SKIP_COPYRIGHT); final PrintWriter pw = udf.out; - final UnicodeProperty.Factory ups - = ToolUnicodePropertySource.make(Default.ucdVersion()); + final UnicodeProperty.Factory ups = ToolUnicodePropertySource.make(Default.ucdVersion()); final TreeSet sortedSet = new TreeSet(CASELESS_COMPARATOR); - final Tabber.MonoTabber mt = (Tabber.MonoTabber) new Tabber.MonoTabber() - .add(25,Tabber.LEFT) - .add(30,Tabber.LEFT); + final Tabber.MonoTabber mt = + (Tabber.MonoTabber) + new Tabber.MonoTabber().add(25, Tabber.LEFT).add(30, Tabber.LEFT); int count = 0; for (int i = UnicodeProperty.LIMIT_TYPE - 1; i >= UnicodeProperty.BINARY; --i) { if ((i & UnicodeProperty.EXTENDED_MASK) != 0) { continue; } - final List list = ups.getAvailableNames(1< list = ups.getAvailableNames(1 << i); + // if (list.size() == 0) continue; sortedSet.clear(); final StringBuffer buffer = new StringBuffer(); - for (final Iterator it = list.iterator(); it.hasNext();) { + for (final Iterator it = list.iterator(); it.hasNext(); ) { final String propAlias = it.next(); final UnicodeProperty up = 
ups.getProperty(propAlias); @@ -705,7 +735,7 @@ public static void generateAliasFile(String filename) throws IOException { } else { buffer.setLength(0); boolean isFirst = true; - for (final Iterator it2 = aliases.iterator(); it2.hasNext();) { + for (final Iterator it2 = aliases.iterator(); it2.hasNext(); ) { if (isFirst) { isFirst = false; } else { @@ -733,7 +763,7 @@ public static void generateAliasFile(String filename) throws IOException { pw.println(SEPARATOR); pw.println("# " + UnicodeProperty.getTypeName(i) + " Properties"); pw.println(SEPARATOR); - for (final Iterator it = sortedSet.iterator(); it.hasNext();) { + for (final Iterator it = sortedSet.iterator(); it.hasNext(); ) { pw.println(it.next()); count++; } @@ -747,33 +777,37 @@ public static void generateAliasFile(String filename) throws IOException { } static String[] specialMisc = { - //"isc\t; ISO_Comment", - //"na1\t; Unicode_1_Name", - //"URS\t; Unicode_Radical_Stroke" + // "isc\t; ISO_Comment", + // "na1\t; Unicode_1_Name", + // "URS\t; Unicode_Radical_Stroke" }; static String[] specialString = { - "dm\t; Decomposition_Mapping", - "lc\t; Lowercase_Mapping", - //"scc\t; Special_Case_Condition", - //"sfc\t; Simple_Case_Folding", - "slc\t; Simple_Lowercase_Mapping", - "stc\t; Simple_Titlecase_Mapping", - "suc\t; Simple_Uppercase_Mapping", - "tc\t; Titlecase_Mapping", - "uc\t; Uppercase_Mapping"}; + "dm\t; Decomposition_Mapping", + "lc\t; Lowercase_Mapping", + // "scc\t; Special_Case_Condition", + // "sfc\t; Simple_Case_Folding", + "slc\t; Simple_Lowercase_Mapping", + "stc\t; Simple_Titlecase_Mapping", + "suc\t; Simple_Uppercase_Mapping", + "tc\t; Titlecase_Mapping", + "uc\t; Uppercase_Mapping" + }; static String[] specialGC = { - "gc\t;\tC\t;\tOther\t# Cc | Cf | Cn | Co | Cs", - "gc\t;\tL\t;\tLetter\t# Ll | Lm | Lo | Lt | Lu", - "gc\t;\tLC\t;\tCased_Letter\t# Ll | Lt | Lu", - "gc\t;\tM\t;\tMark\t;\tCombining_Mark\t# Mc | Me | Mn", - "gc\t;\tN\t;\tNumber\t# Nd | Nl | No", - 
"gc\t;\tP\t;\tPunctuation\t;\tpunct\t# Pc | Pd | Pe | Pf | Pi | Po | Ps", - "gc\t;\tS\t;\tSymbol\t# Sc | Sk | Sm | So", - "gc\t;\tZ\t;\tSeparator\t# Zl | Zp | Zs"}; - - static final RuleBasedCollator CASELESS_COMPARATOR = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); + "gc\t;\tC\t;\tOther\t# Cc | Cf | Cn | Co | Cs", + "gc\t;\tL\t;\tLetter\t# Ll | Lm | Lo | Lt | Lu", + "gc\t;\tLC\t;\tCased_Letter\t# Ll | Lt | Lu", + "gc\t;\tM\t;\tMark\t;\tCombining_Mark\t# Mc | Me | Mn", + "gc\t;\tN\t;\tNumber\t# Nd | Nl | No", + "gc\t;\tP\t;\tPunctuation\t;\tpunct\t# Pc | Pd | Pe | Pf | Pi | Po | Ps", + "gc\t;\tS\t;\tSymbol\t# Sc | Sk | Sm | So", + "gc\t;\tZ\t;\tSeparator\t# Zl | Zp | Zs" + }; + + static final RuleBasedCollator CASELESS_COMPARATOR = + (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); + static { CASELESS_COMPARATOR.setNumericCollation(true); CASELESS_COMPARATOR.freeze(); @@ -781,49 +815,61 @@ public static void generateAliasFile(String filename) throws IOException { public static void generateValueAliasFile(String filename) throws IOException { String outputDir = "UCD/" + Default.ucdVersion() + '/'; - final UnicodeDataFile udf = UnicodeDataFile.openAndWriteHeader(outputDir, filename).setSkipCopyright(Settings.SKIP_COPYRIGHT); - final UnicodeDataFile diff = UnicodeDataFile.openAndWriteHeader(outputDir + "extra/", "diff"); + final UnicodeDataFile udf = + UnicodeDataFile.openAndWriteHeader(outputDir, filename) + .setSkipCopyright(Settings.SKIP_COPYRIGHT); + final UnicodeDataFile diff = + UnicodeDataFile.openAndWriteHeader(outputDir + "extra/", "diff"); final PrintWriter pw = udf.out; final PrintWriter diffOut = diff.out; Format.theFormat.printFileComments(pw, filename); - final UnicodeProperty.Factory toolFactory = ToolUnicodePropertySource.make(Default.ucdVersion()); - final UnicodeProperty.Factory lastFactory = ToolUnicodePropertySource.make(Utility.getPreviousUcdVersion(Default.ucdVersion())); - final UnicodeSet lastDefined = new 
UnicodeSet(lastFactory.getSet("gc=cn")).complement().freeze(); + final UnicodeProperty.Factory toolFactory = + ToolUnicodePropertySource.make(Default.ucdVersion()); + final UnicodeProperty.Factory lastFactory = + ToolUnicodePropertySource.make(Utility.getPreviousUcdVersion(Default.ucdVersion())); + final UnicodeSet lastDefined = + new UnicodeSet(lastFactory.getSet("gc=cn")).complement().freeze(); final BagFormatter bf = new BagFormatter(toolFactory); final BagFormatter bfdiff = new BagFormatter(toolFactory); final StringBuffer buffer = new StringBuffer(); final Set sortedSet = new TreeSet(CASELESS_COMPARATOR); - //gc ; C ; Other # Cc | Cf | Cn | Co | Cs + // gc ; C ; Other # Cc | Cf | Cn | Co | Cs // 123456789012345678901234567890123 // sc ; Arab ; Arabic - final Tabber.MonoTabber mt2 = (Tabber.MonoTabber) new Tabber.MonoTabber() - .add(3,Tabber.LEFT) - .add(2,Tabber.LEFT) // ; - .add(33,Tabber.LEFT) - .add(2,Tabber.LEFT) // ; - .add(33,Tabber.LEFT) - .add(2,Tabber.LEFT) // ; - .add(33,Tabber.LEFT); + final Tabber.MonoTabber mt2 = + (Tabber.MonoTabber) + new Tabber.MonoTabber() + .add(3, Tabber.LEFT) + .add(2, Tabber.LEFT) // ; + .add(33, Tabber.LEFT) + .add(2, Tabber.LEFT) // ; + .add(33, Tabber.LEFT) + .add(2, Tabber.LEFT) // ; + .add(33, Tabber.LEFT); // ccc; 216; ATAR ; Attached_Above_Right - final Tabber.MonoTabber mt3 = (Tabber.MonoTabber) new Tabber.MonoTabber() - .add(3,Tabber.LEFT) - .add(2,Tabber.LEFT) // ; - .add(3,Tabber.RIGHT) - .add(2,Tabber.LEFT) // ; - .add(27,Tabber.LEFT) - .add(2,Tabber.LEFT) // ; - .add(33,Tabber.LEFT) - .add(2,Tabber.LEFT) // ; - .add(33,Tabber.LEFT); - - //final Set skipNames = new HashSet(Arrays.asList("Lowercase_Mapping", "Uppercase_Mapping", "Titlecase_Mapping")); - - for (final Iterator it = toolFactory.getAvailableNames().iterator(); it.hasNext();) { + final Tabber.MonoTabber mt3 = + (Tabber.MonoTabber) + new Tabber.MonoTabber() + .add(3, Tabber.LEFT) + .add(2, Tabber.LEFT) // ; + .add(3, Tabber.RIGHT) + .add(2, 
Tabber.LEFT) // ; + .add(27, Tabber.LEFT) + .add(2, Tabber.LEFT) // ; + .add(33, Tabber.LEFT) + .add(2, Tabber.LEFT) // ; + .add(33, Tabber.LEFT); + + // final Set skipNames = new HashSet(Arrays.asList("Lowercase_Mapping", + // "Uppercase_Mapping", "Titlecase_Mapping")); + + for (final Iterator it = toolFactory.getAvailableNames().iterator(); + it.hasNext(); ) { final String propName = it.next(); final UnicodeProperty up = toolFactory.getProperty(propName); final int type = up.getType(); @@ -839,7 +885,11 @@ public static void generateValueAliasFile(String filename) throws IOException { final boolean isJamoShortName = propName.equals("Jamo_Short_Name"); final boolean isJoiningGroup = propName.equals("Joining_Group"); - if (isJamoShortName || ((1< l = up.getValueAliases(value); // HACK @@ -865,13 +915,13 @@ public static void generateValueAliasFile(String filename) throws IOException { } else if (l.size() == 2 && propName.equals("Decomposition_Type")) { l.add(0, l.get(0)); // double up } - if (UnicodeProperty.equalNames(value,"Cyrillic_Supplement")) { + if (UnicodeProperty.equalNames(value, "Cyrillic_Supplement")) { l.add("Cyrillic_Supplementary"); } buffer.setLength(0); buffer.append(shortProp); - for (final Iterator it3 = l.iterator(); it3.hasNext();) { + for (final Iterator it3 = l.iterator(); it3.hasNext(); ) { buffer.append("\t; \t" + it3.next()); } @@ -945,7 +995,6 @@ public static void generateValueAliasFile(String filename) throws IOException { // bfdiff.showSetNames(diffOut, newValues); // } } - } pw.println(); pw.println("# EOF"); @@ -953,9 +1002,15 @@ public static void generateValueAliasFile(String filename) throws IOException { diff.close(); } - private static void printDefaultValueComment(PrintWriter pw, String propName, UnicodeProperty up, boolean showPropName, String defaultValue) { + private static void printDefaultValueComment( + PrintWriter pw, + String propName, + UnicodeProperty up, + boolean showPropName, + String defaultValue) { if 
(Default.ucd().isAllocated(0xE0000)) { - throw new IllegalArgumentException("The paradigm 'default value' code point needs fixing!"); + throw new IllegalArgumentException( + "The paradigm 'default value' code point needs fixing!"); } if (defaultValue != null) { // ok @@ -964,8 +1019,7 @@ private static void printDefaultValueComment(PrintWriter pw, String propName, Un || propName.equals("Name") || propName.equals("Unicode_Radical_Stroke") || propName.equals("Unicode_1_Name") - || propName.equals("Jamo_Short_Name") - ) { + || propName.equals("Jamo_Short_Name")) { defaultValue = ""; } else if (propName.equals("Script_Extensions")) { defaultValue = ""); + System.out.println(""); } - private static void extract(Multimap characterToRelation, String oldLabel, UnicodeSet unicodeSet, String newLabel) { + private static void extract( + Multimap characterToRelation, + String oldLabel, + UnicodeSet unicodeSet, + String newLabel) { Collection old = characterToRelation.get(oldLabel); if (old.isEmpty()) { throw new IllegalArgumentException(); @@ -177,13 +199,13 @@ private static void extract(Multimap characterToRelation, String characterToRelation.remove(oldLabel, s); } } - + private static void getSc() { CurrencyMetaInfo metaInfo = CurrencyMetaInfo.getInstance(); UnicodeSet currencySymbol = new UnicodeSet("[[:sc:]元円圓圓圆]"); Counter2 symbolToGdp = new Counter2<>(); - Multimap currencyToInfo = TreeMultimap.create(); - Multimap symbolToInfo = TreeMultimap.create(); + Multimap currencyToInfo = TreeMultimap.create(); + Multimap symbolToInfo = TreeMultimap.create(); SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(); for (ULocale locale : ULocale.getAvailableLocales()) { String region = locale.getCountry(); @@ -199,8 +221,9 @@ private static void getSc() { } } PopulationData pd = sdi.getPopulationDataForTerritory(region); - CurrencyFilter filter = CurrencyMetaInfo.CurrencyFilter.onRegion(region) - .withDateRange(new Date().getTime(), Long.MAX_VALUE); + CurrencyFilter filter 
= + CurrencyMetaInfo.CurrencyFilter.onRegion(region) + .withDateRange(new Date().getTime(), Long.MAX_VALUE); for (String currency : metaInfo.currencies(filter)) { Currency c = Currency.getInstance(currency); String symbol = c.getSymbol(locale); @@ -219,20 +242,29 @@ private static void getSc() { for (Entry> key : currencyToInfo.asMap().entrySet()) { System.out.println(key.getKey() + "\t" + key.getValue()); } - + UnicodeSet missing = new UnicodeSet(currencySymbol); for (String key : symbolToGdp.getKeysetSortedByCount(false)) { missing.remove(key); - System.out.println(key + "\t" + UCharacter.getName(key,"+") + "\t" + symbolToGdp.getCount(key) + "\t" + symbolToInfo.get(key)); - for (String s : CldrUtility.ifNull(codepointToNfkcs.get(key), Collections.emptySet())) { - System.out.println(s + "\t" + UCharacter.getName(s,"+")); + System.out.println( + key + + "\t" + + UCharacter.getName(key, "+") + + "\t" + + symbolToGdp.getCount(key) + + "\t" + + symbolToInfo.get(key)); + for (String s : + CldrUtility.ifNull(codepointToNfkcs.get(key), Collections.emptySet())) { + System.out.println(s + "\t" + UCharacter.getName(s, "+")); missing.remove(s); } } for (String key : missing) { - System.out.println(key + "\t" + UCharacter.getName(key,"+")); - for (String s : CldrUtility.ifNull(codepointToNfkcs.get(key), Collections.emptySet())) { - System.out.println(s + "\t" + UCharacter.getName(s,"+")); + System.out.println(key + "\t" + UCharacter.getName(key, "+")); + for (String s : + CldrUtility.ifNull(codepointToNfkcs.get(key), Collections.emptySet())) { + System.out.println(s + "\t" + UCharacter.getName(s, "+")); } } } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/GenerateRadicals.java b/unicodetools/src/main/java/org/unicode/text/tools/GenerateRadicals.java index ef0fece85..6e326d330 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/GenerateRadicals.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/GenerateRadicals.java @@ -1,5 +1,14 @@ package 
org.unicode.text.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.impl.Row.R5; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -11,7 +20,6 @@ import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; - import org.unicode.cldr.util.CldrUtility; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; @@ -26,33 +34,26 @@ import org.unicode.tools.IdsFileData; import org.unicode.tools.RadicalEnum; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.impl.Row.R5; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Normalizer2; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; - public class GenerateRadicals { public static final Normalizer2 NFKC = Normalizer2.getNFKCInstance(); public static final Normalizer2 NFC = Normalizer2.getNFKCInstance(); - private static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(Settings.latestVersion); + private static final IndexUnicodeProperties iup = + IndexUnicodeProperties.make(Settings.latestVersion); - interface Collector { + interface Collector { void add(T item); + S get(); } - static class StringCollector implements Collector { + static class StringCollector implements Collector { private StringBuilder b = new StringBuilder(); private String separator; StringCollector(String separator) { this.separator = separator; } + @Override public void add(String item) { if (b.length() != 0) { @@ -66,10 +67,10 @@ public String get() { String result = b.toString(); b.setLength(0); return result; - } + } } - 
public static S transform(String s, UnicodeMap map, Collector collector) { + public static S transform(String s, UnicodeMap map, Collector collector) { for (int cp : CharSequences.codePoints(s)) { T value = map.get(cp); collector.add(value); @@ -86,20 +87,23 @@ public Data(RadicalEnum a, Integer b, String c, String d, String e) { public static void main(String[] args) { UnicodeMap names = iup.load(UcdProperty.Name); - UnicodeMap gc = iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); + UnicodeMap gc = + iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); UnicodeMap cjkRadicals = iup.load(UcdProperty.CJK_Radical); UnicodeSet radical = iup.loadEnum(UcdProperty.Radical, Binary.class).getSet(Binary.Yes); - UnicodeMap blocks = iup.loadEnum(UcdProperty.Block, UcdPropertyValues.Block_Values.class); - UnicodeSet interest = new UnicodeSet() - .addAll(blocks.getSet(Block_Values.CJK_Strokes)) - .removeAll(gc.getSet(General_Category_Values.Unassigned) - ); - - Relation cjkRadicalToIdeograph = Relation.of(new HashMap(), LinkedHashSet.class); + UnicodeMap blocks = + iup.loadEnum(UcdProperty.Block, UcdPropertyValues.Block_Values.class); + UnicodeSet interest = + new UnicodeSet() + .addAll(blocks.getSet(Block_Values.CJK_Strokes)) + .removeAll(gc.getSet(General_Category_Values.Unassigned)); + + Relation cjkRadicalToIdeograph = + Relation.of(new HashMap(), LinkedHashSet.class); for (Entry entry : cjkRadicals.entrySet()) { cjkRadicalToIdeograph.put(RadicalEnum.fromString(entry.getValue()), entry.getKey()); } - Map radicalToUnified = new HashMap<>(); + Map radicalToUnified = new HashMap<>(); for (Entry> entry : cjkRadicalToIdeograph.keyValuesSet()) { List values = new ArrayList<>(entry.getValue()); String item0 = values.get(0); @@ -114,7 +118,6 @@ public static void main(String[] args) { StringCollector sc = new StringCollector("+"); ComparisonNormalizer cnorm = ComparisonNormalizer.getSimple(); - RuleBasedCollator col = 
(RuleBasedCollator) Collator.getInstance(ULocale.ROOT); col.setNumericCollation(true); col.freeze(); @@ -136,7 +139,8 @@ public static void main(String[] args) { } for (RadicalEnum radical1 : radicals) { - Relation otherToCause = Relation.of(new HashMap(), LinkedHashSet.class); + Relation otherToCause = + Relation.of(new HashMap(), LinkedHashSet.class); addOther(cp, radicalToUnified.get(cp), "CJKRadicals.txt", otherToCause); addOther(cp, NFKC.normalize(cp), "NFKC", otherToCause); addOther(cp, cnorm.get(cp), "UCA", otherToCause); @@ -147,7 +151,13 @@ public static void main(String[] args) { } else { for (Entry> entry : otherToCause.keyValuesSet()) { - sorted.add(new Data(radical1, cp.charAt(0) >= 0x2F00 ? 0 : 1, cp, entry.getKey(), Common.COMMA_JOINER.join(entry.getValue()))); + sorted.add( + new Data( + radical1, + cp.charAt(0) >= 0x2F00 ? 0 : 1, + cp, + entry.getKey(), + Common.COMMA_JOINER.join(entry.getValue()))); } } } @@ -169,34 +179,56 @@ public static void main(String[] args) { String other = entry.get3(); soFar.add(other); String cause = entry.get4(); - - System.out.println(Utility.hex(cp) + ";" - + "\t" + Common.COMMA_JOINER.join(getTotalStrokes(other)) + ";" - + "\t" + radNum + ";" - + "\t" + Utility.hex(other) - + ";\t#\t" + cp + "\t→\t" + other - + "\t" + names.get(cp) - + ";\t" + cause - ); + + System.out.println( + Utility.hex(cp) + + ";" + + "\t" + + Common.COMMA_JOINER.join(getTotalStrokes(other)) + + ";" + + "\t" + + radNum + + ";" + + "\t" + + Utility.hex(other) + + ";\t#\t" + + cp + + "\t→\t" + + other + + "\t" + + names.get(cp) + + ";\t" + + cause); } doFinal(last, soFar); } private static List getTotalStrokes(String other) { - List totalStrokes = CldrUtility.ifNull( - other.isEmpty() ? null : IdsFileData.TOTAL_STROKES.get(other), + List totalStrokes = + CldrUtility.ifNull( + other.isEmpty() ? 
null : IdsFileData.TOTAL_STROKES.get(other), Collections.singletonList(0)); return totalStrokes; } private static void doFinal(RadicalEnum radical1, Set soFar) { - Set unicodeItems1 = normalizeSet(new LinkedHashSet<>(CldrUtility.ifNull(UNICODE_RADICALS.get(radical1), Collections.emptySet()))); + Set unicodeItems1 = + normalizeSet( + new LinkedHashSet<>( + CldrUtility.ifNull( + UNICODE_RADICALS.get(radical1), + Collections.emptySet()))); unicodeItems1.removeAll(soFar); - + Set unicodeItems = unicodeItems1; showOthers(radical1, unicodeItems, "kRSUnicode"); - Set adobeItems = normalizeSet(new LinkedHashSet<>(CldrUtility.ifNull(ADOBE_RADICALS.get(radical1), Collections.emptySet()))); + Set adobeItems = + normalizeSet( + new LinkedHashSet<>( + CldrUtility.ifNull( + ADOBE_RADICALS.get(radical1), + Collections.emptySet()))); adobeItems.removeAll(soFar); adobeItems.removeAll(unicodeItems); showOthers(radical1, adobeItems, "kRSAdobe"); @@ -204,14 +236,24 @@ private static void doFinal(RadicalEnum radical1, Set soFar) { private static void showOthers(RadicalEnum radical1, Set unicodeItems, String cause) { for (String other : unicodeItems) { - System.out.println("#? " - + "\t" + Common.COMMA_JOINER.join(getTotalStrokes(other)) + ";" - + "\t" + radical1 + ";" - + "\t" + Utility.hex(other) - + ";\t#\t" + "?" + "\t→\t" + other - + "\t" + "?" - + ";\t" + cause - ); + System.out.println( + "#? " + + "\t" + + Common.COMMA_JOINER.join(getTotalStrokes(other)) + + ";" + + "\t" + + radical1 + + ";" + + "\t" + + Utility.hex(other) + + ";\t#\t" + + "?" + + "\t→\t" + + other + + "\t" + + "?" 
+ + ";\t" + + cause); } } @@ -229,16 +271,19 @@ private static String show(Set unicodeItems) { if (b.length() != 0) { b.append(", "); } - List totalStrokes = CldrUtility.ifNull(IdsFileData.TOTAL_STROKES.get(s), Collections.singletonList(0)); + List totalStrokes = + CldrUtility.ifNull( + IdsFileData.TOTAL_STROKES.get(s), Collections.singletonList(0)); - b.append(Utility.hex(s) - + " (" + s - + "/" + Common.COMMA_JOINER.join(totalStrokes) + ")"); - }; + b.append( + Utility.hex(s) + " (" + s + "/" + Common.COMMA_JOINER.join(totalStrokes) + ")"); + } + ; return b.toString(); } - private static void addOther(String cp, Set mapped, String string, Relation otherToCause) { + private static void addOther( + String cp, Set mapped, String string, Relation otherToCause) { if (mapped != null) { for (String other : mapped) { addOther(cp, other, string, otherToCause); @@ -246,20 +291,24 @@ private static void addOther(String cp, Set mapped, String string, Relat } } - private static String addOther(String cp, String mapped, String cause, Relation otherToCause) { + private static String addOther( + String cp, String mapped, String cause, Relation otherToCause) { String other = mapped; if (other == null) { return other; } other = NFC.normalize(other); if (!cp.equals(other)) { - otherToCause.put(other,cause); + otherToCause.put(other, cause); } return other; } - static final Relation ADOBE_RADICALS = Relation.of(new HashMap(), TreeSet.class); - static final Relation UNICODE_RADICALS = Relation.of(new HashMap(), TreeSet.class); + static final Relation ADOBE_RADICALS = + Relation.of(new HashMap(), TreeSet.class); + static final Relation UNICODE_RADICALS = + Relation.of(new HashMap(), TreeSet.class); + static { Matcher m = Common.ADOBE_RS_MATCHER.matcher(""); UnicodeMap> adobeRadicalStroke = iup.loadSet(UcdProperty.kRSAdobe_Japan1_6); diff --git a/unicodetools/src/main/java/org/unicode/text/tools/GenerateSubtagNames.java 
b/unicodetools/src/main/java/org/unicode/text/tools/GenerateSubtagNames.java index 764025c22..b45e7b60b 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/GenerateSubtagNames.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/GenerateSubtagNames.java @@ -8,7 +8,6 @@ import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; - import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.StandardCodes; @@ -44,7 +43,8 @@ private static Map generate() { StandardCodes sc = StandardCodes.make(); CLDRFile english = config.getEnglish(); Set CODE_OK = new HashSet(Arrays.asList("QO", "UK", "ZZ")); - for (Entry>> entry : StandardCodes.getEnumLstreg().entrySet()) { + for (Entry>> entry : + StandardCodes.getEnumLstreg().entrySet()) { LstrType type = entry.getKey(); int cldrType = -1; switch (type) { @@ -73,7 +73,9 @@ private static Map generate() { continue; } String description = fieldToValue.get(LstrField.Description); - if (description != null && description.equalsIgnoreCase("Private use") && !CODE_OK.contains(code)) { + if (description != null + && description.equalsIgnoreCase("Private use") + && !CODE_OK.contains(code)) { continue; } if (seen.containsKey(code)) { diff --git a/unicodetools/src/main/java/org/unicode/text/tools/GifSequenceWriter.java b/unicodetools/src/main/java/org/unicode/text/tools/GifSequenceWriter.java index 1e1d511e9..3c4956131 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/GifSequenceWriter.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/GifSequenceWriter.java @@ -15,7 +15,6 @@ import java.io.File; import java.io.IOException; import java.util.Iterator; - import javax.imageio.IIOException; import javax.imageio.IIOImage; import javax.imageio.ImageIO; @@ -34,62 +33,45 @@ public class GifSequenceWriter { /** * Creates a new GifSequenceWriter - * - * @param outputStream - * the ImageOutputStream to be written to - * @param imageType - * one of 
the imageTypes specified in BufferedImage - * @param timeBetweenFramesMS - * the time between frames in miliseconds - * @param loopContinuously - * wether the gif should loop repeatedly - * @throws IIOException - * if no gif ImageWriters are found - * + * + * @param outputStream the ImageOutputStream to be written to + * @param imageType one of the imageTypes specified in BufferedImage + * @param timeBetweenFramesMS the time between frames in miliseconds + * @param loopContinuously wether the gif should loop repeatedly + * @throws IIOException if no gif ImageWriters are found * @author Elliot Kroo (elliot[at]kroo[dot]net) */ public GifSequenceWriter( ImageOutputStream outputStream, int imageType, int timeBetweenFramesMS, - boolean loopContinuously) throws IIOException, IOException { + boolean loopContinuously) + throws IIOException, IOException { // my method to create a writer gifWriter = getWriter(); imageWriteParam = gifWriter.getDefaultWriteParam(); ImageTypeSpecifier imageTypeSpecifier = ImageTypeSpecifier.createFromBufferedImageType(imageType); - imageMetaData = - gifWriter.getDefaultImageMetadata(imageTypeSpecifier, - imageWriteParam); + imageMetaData = gifWriter.getDefaultImageMetadata(imageTypeSpecifier, imageWriteParam); String metaFormatName = imageMetaData.getNativeMetadataFormatName(); - IIOMetadataNode root = (IIOMetadataNode) - imageMetaData.getAsTree(metaFormatName); + IIOMetadataNode root = (IIOMetadataNode) imageMetaData.getAsTree(metaFormatName); - IIOMetadataNode graphicsControlExtensionNode = getNode( - root, - "GraphicControlExtension"); + IIOMetadataNode graphicsControlExtensionNode = getNode(root, "GraphicControlExtension"); graphicsControlExtensionNode.setAttribute("disposalMethod", "none"); graphicsControlExtensionNode.setAttribute("userInputFlag", "FALSE"); + graphicsControlExtensionNode.setAttribute("transparentColorFlag", "FALSE"); graphicsControlExtensionNode.setAttribute( - "transparentColorFlag", - "FALSE"); - 
graphicsControlExtensionNode.setAttribute( - "delayTime", - Integer.toString(timeBetweenFramesMS / 10)); - graphicsControlExtensionNode.setAttribute( - "transparentColorIndex", - "0"); + "delayTime", Integer.toString(timeBetweenFramesMS / 10)); + graphicsControlExtensionNode.setAttribute("transparentColorIndex", "0"); IIOMetadataNode commentsNode = getNode(root, "CommentExtensions"); commentsNode.setAttribute("CommentExtension", "Created by MAH"); - IIOMetadataNode appEntensionsNode = getNode( - root, - "ApplicationExtensions"); + IIOMetadataNode appEntensionsNode = getNode(root, "ApplicationExtensions"); IIOMetadataNode child = new IIOMetadataNode("ApplicationExtension"); @@ -98,8 +80,7 @@ public GifSequenceWriter( int loop = loopContinuously ? 0 : 1; - child.setUserObject(new byte[] { 0x1, (byte) (loop & 0xFF), (byte) - ((loop >> 8) & 0xFF) }); + child.setUserObject(new byte[] {0x1, (byte) (loop & 0xFF), (byte) ((loop >> 8) & 0xFF)}); appEntensionsNode.appendChild(child); imageMetaData.setFromTree(metaFormatName, root); @@ -110,29 +91,22 @@ public GifSequenceWriter( } public void writeToSequence(RenderedImage img) throws IOException { - gifWriter.writeToSequence( - new IIOImage( - img, - null, - imageMetaData), - imageWriteParam); + gifWriter.writeToSequence(new IIOImage(img, null, imageMetaData), imageWriteParam); } /** - * Close this GifSequenceWriter object. This does not close the underlying - * stream, just finishes off the GIF. + * Close this GifSequenceWriter object. This does not close the underlying stream, just finishes + * off the GIF. */ public void close() throws IOException { gifWriter.endWriteSequence(); } /** - * Returns the first available GIF ImageWriter using - * ImageIO.getImageWritersBySuffix("gif"). - * + * Returns the first available GIF ImageWriter using ImageIO.getImageWritersBySuffix("gif"). 
+ * * @return a GIF ImageWriter object - * @throws IIOException - * if no GIF image writers are returned + * @throws IIOException if no GIF image writers are returned */ private static ImageWriter getWriter() throws IIOException { Iterator iter = ImageIO.getImageWritersBySuffix("gif"); @@ -144,24 +118,17 @@ private static ImageWriter getWriter() throws IIOException { } /** - * Returns an existing child node, or creates and returns a new child node - * (if the requested node does not exist). - * - * @param rootNode - * the IIOMetadataNode to search for the child node. - * @param nodeName - * the name of the child node. - * - * @return the child node, if found or a new node created with the given - * name. + * Returns an existing child node, or creates and returns a new child node (if the requested + * node does not exist). + * + * @param rootNode the IIOMetadataNode to search for the child node. + * @param nodeName the name of the child node. + * @return the child node, if found or a new node created with the given name. 
*/ - private static IIOMetadataNode getNode( - IIOMetadataNode rootNode, - String nodeName) { + private static IIOMetadataNode getNode(IIOMetadataNode rootNode, String nodeName) { int nNodes = rootNode.getLength(); for (int i = 0; i < nNodes; i++) { - if (rootNode.item(i).getNodeName().compareToIgnoreCase(nodeName) - == 0) { + if (rootNode.item(i).getNodeName().compareToIgnoreCase(nodeName) == 0) { return ((IIOMetadataNode) rootNode.item(i)); } } @@ -171,24 +138,21 @@ private static IIOMetadataNode getNode( } /** - * public GifSequenceWriter( BufferedOutputStream outputStream, int - * imageType, int timeBetweenFramesMS, boolean loopContinuously) { + * public GifSequenceWriter( BufferedOutputStream outputStream, int imageType, int + * timeBetweenFramesMS, boolean loopContinuously) { */ - public static void main(String[] args) throws Exception { if (args.length > 1) { // grab the output image type from the first image in the sequence BufferedImage firstImage = ImageIO.read(new File(args[0])); // create a new BufferedOutputStream with the last argument - ImageOutputStream output = - new FileImageOutputStream(new File(args[args.length - 1])); + ImageOutputStream output = new FileImageOutputStream(new File(args[args.length - 1])); // create a gif sequence with the type of the first image, 1 second // between frames, which loops continuously GifSequenceWriter writer = - new GifSequenceWriter(output, firstImage.getType(), 1, - false); + new GifSequenceWriter(output, firstImage.getType(), 1, false); // write out the first image to our sequence... 
writer.writeToSequence(firstImage); @@ -200,9 +164,7 @@ public static void main(String[] args) throws Exception { writer.close(); output.close(); } else { - System.out - .println( - "Usage: java GifSequenceWriter [list of gif files] [output file]"); + System.out.println("Usage: java GifSequenceWriter [list of gif files] [output file]"); } } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/tools/IcannMsr.java b/unicodetools/src/main/java/org/unicode/text/tools/IcannMsr.java index dc5cec9c5..6dd475ee4 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/IcannMsr.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/IcannMsr.java @@ -1,5 +1,13 @@ package org.unicode.text.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.EntryRange; +import com.ibm.icu.util.ULocale; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; @@ -11,7 +19,6 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.Pair; import org.unicode.cldr.util.XMLFileReader; @@ -26,15 +33,6 @@ import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.EntryRange; -import com.ibm.icu.util.ULocale; - public class IcannMsr { private static final String ICANN_DIR = Settings.Output.GEN_DIR + "icann/"; @@ -43,19 +41,23 @@ public class IcannMsr { private static final String XML_DATA = "msr-2-wle-rules-13apr15-en.xml"; private static 
final String DELTA_LIST = "MSR-2.0v5Delta.lst"; - static final SetMaker SM = new SetMaker() { - public Set make() { - return new LinkedHashSet(); - } - }; - static final SetMaker ITM = new SetMaker() { - public EnumSet make() { - return EnumSet.noneOf(Identifier_Type.class); - } - }; - static final UnicodeRelation DATA = new UnicodeRelation(SM); - static final UnicodeRelation DATA2TYPE = new UnicodeRelation(ITM); - static final UnicodeRelation REMAP = new UnicodeRelation(SM); + static final SetMaker SM = + new SetMaker() { + public Set make() { + return new LinkedHashSet(); + } + }; + static final SetMaker ITM = + new SetMaker() { + public EnumSet make() { + return EnumSet.noneOf(Identifier_Type.class); + } + }; + static final UnicodeRelation DATA = new UnicodeRelation(SM); + static final UnicodeRelation DATA2TYPE = + new UnicodeRelation(ITM); + static final UnicodeRelation REMAP = new UnicodeRelation(SM); + static { UnicodeSet found = new UnicodeSet(); boolean comment = false; @@ -93,7 +95,8 @@ public EnumSet make() { List> data = new ArrayList<>(); // try { - // BufferedReader f = FileUtilities.openFile(ICANN_DIR, "msr-wle-rules-04dec14-en.xml"); + // BufferedReader f = FileUtilities.openFile(ICANN_DIR, + // "msr-wle-rules-04dec14-en.xml"); // int line = f.read(); // System.out.println(Utility.hex(line)); // } catch (IOException e) { @@ -101,26 +104,26 @@ public EnumSet make() { Matcher first = Pattern.compile("@first-cp=\"([0-9A-Fa-f]+)\"").matcher(""); Matcher last = Pattern.compile("@last-cp=\"([0-9A-Fa-f]+)\"").matcher(""); Matcher only = Pattern.compile("@cp=\"([0-9A-Fa-f]+)\"").matcher(""); - XMLFileReader.loadPathValues(ICANN_DIR - + XML_DATA, data, false); + XMLFileReader.loadPathValues(ICANN_DIR + XML_DATA, data, false); for (Pair datum : data) { String path = datum.getFirst(); - // (//lgr/data/range[@first-cp="20E0E"][@last-cp="20E0F"][@tag="sc:Hani"][@ref="4 ZH IIC"],) + // (//lgr/data/range[@first-cp="20E0E"][@last-cp="20E0F"][@tag="sc:Hani"][@ref="4 
ZH + // IIC"],) // (//lgr/data/char[@cp="20731"][@tag="sc:Hani"][@ref="4 ZH IIC"],) if (path.startsWith("//lgr/data/char")) { only.reset(path).find(); - int onlyCp = Integer.parseInt(only.group(1),16); + int onlyCp = Integer.parseInt(only.group(1), 16); addIfEmpty(onlyCp); } else if (path.startsWith("//lgr/data/range")) { first.reset(path).find(); - int firstCp = Integer.parseInt(first.group(1),16); + int firstCp = Integer.parseInt(first.group(1), 16); last.reset(path).find(); - int lastCp = Integer.parseInt(last.group(1),16); + int lastCp = Integer.parseInt(last.group(1), 16); for (int i = firstCp; i <= lastCp; ++i) { addIfEmpty(i); } } else { - //System.out.println("Skipping " + datum); + // System.out.println("Skipping " + datum); } } DATA.freeze(); @@ -161,9 +164,8 @@ private static Identifier_Type getIdentifierType(int cp, String dataLine) { Identifier_Type identifierType = Identifier_Type.fromString(firstWord); return identifierType; } catch (Exception e) { - throw new IllegalArgumentException(Utility.hex(cp) - + "\t" + getName(cp) - + "\t" + dataLine); + throw new IllegalArgumentException( + Utility.hex(cp) + "\t" + getName(cp) + "\t" + dataLine); } } @@ -188,13 +190,17 @@ public static void main(String[] args) { UnicodeMap> cldrChars = CLDRCharacterUtility.getCLDRCharacters(); UnicodeMap> diff = new UnicodeMap<>(); - for (EntryRange x : new UnicodeSet("[[:age=6.3:]-[[:nd:][:cn:][:co:][:cs:][:cwcf:]]]").ranges()) { + for (EntryRange x : + new UnicodeSet("[[:age=6.3:]-[[:nd:][:cn:][:co:][:cs:][:cwcf:]]]").ranges()) { for (int i = x.codepoint; i <= x.codepointEnd; ++i) { Set unicodeSet = xidMod.get(i); Identifier_Type unicode = unicodeSet.iterator().next(); Identifier_Status unicodeStatus = unicode.identifierStatus; Set icann = DATA2TYPE.get(i); - Identifier_Status icannStatus = icann == null ? Identifier_Status.restricted : icann.iterator().next().identifierStatus; + Identifier_Status icannStatus = + icann == null + ? 
Identifier_Status.restricted + : icann.iterator().next().identifierStatus; if (unicodeStatus != icannStatus) { Identifier_Type icann1 = icann == null ? null : icann.iterator().next(); diff.put(i, Pair.of(unicode, icann1)); @@ -204,24 +210,27 @@ public static void main(String[] args) { showValues("diff.txt", diff); } - static final Comparator> IDPAIR = new Comparator>() { + static final Comparator> IDPAIR = + new Comparator>() { - @Override - public int compare(Pair o1, Pair o2) { - Identifier_Type o11 = o1.getFirst(); - Identifier_Type o21 = o2.getFirst(); - int diff = o11.compareTo(o21); - if (diff != 0) return diff; - Identifier_Type o12 = o1.getSecond(); - Identifier_Type o22 = o2.getSecond(); - if (o12 == null) { - return o22 == null ? 0 : -1; - } else if (o22 == null) { - return 1; - } - return o12.compareTo(o22); - } - }; + @Override + public int compare( + Pair o1, + Pair o2) { + Identifier_Type o11 = o1.getFirst(); + Identifier_Type o21 = o2.getFirst(); + int diff = o11.compareTo(o21); + if (diff != 0) return diff; + Identifier_Type o12 = o1.getSecond(); + Identifier_Type o22 = o2.getSecond(); + if (o12 == null) { + return o22 == null ? 
0 : -1; + } else if (o22 == null) { + return 1; + } + return o12.compareTo(o22); + } + }; private static boolean equalsSet(T a, Set b) { if (b == null) { @@ -231,12 +240,14 @@ private static boolean equalsSet(T a, Set b) { } static final Collator COL = Collator.getInstance(ULocale.ROOT); + static { COL.setStrength(Collator.IDENTICAL); } private static void showValues(String file, UnicodeRelation remap2, T skipValue) { - try (PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "icann/", file)) { + try (PrintWriter out = + FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "icann/", file)) { TreeSet sorted = new TreeSet(remap2.values()); for (T value : sorted) { UnicodeSet set = remap2.getKeys(value); @@ -250,7 +261,9 @@ private static void showValues(String file, UnicodeRelation remap2, T ski throw new RuntimeException(e); } } - static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(GenerateEnums.ENUM_VERSION); + + static final IndexUnicodeProperties iup = + IndexUnicodeProperties.make(GenerateEnums.ENUM_VERSION); static final UnicodeMap Script_Extensions = iup.load(UcdProperty.Script_Extensions); private static void showSortedSet(PrintWriter out, UnicodeSet set, Identifier_Type _type) { @@ -263,13 +276,13 @@ private static void showSortedSet(PrintWriter out, UnicodeSet set, Identifier_Ty UnicodeSet inScript = new UnicodeSet(ss).retainAll(set); for (EntryRange x : inScript.ranges()) { show(out, _type, x.codepoint, x.codepointEnd); -// if (x.codepoint == x.codepointEnd) { -// continue; -// } else if (x.codepoint + 1 == x.codepointEnd) { -// show(out, "", x.codepointEnd); -// } else { -// show(out, "..", x.codepointEnd); -// } + // if (x.codepoint == x.codepointEnd) { + // continue; + // } else if (x.codepoint + 1 == x.codepointEnd) { + // show(out, "", x.codepointEnd); + // } else { + // show(out, "..", x.codepointEnd); + // } } out.println(); } @@ -278,18 +291,24 @@ private static void showSortedSet(PrintWriter out, UnicodeSet 
set, Identifier_Ty // 1F54F; uncommon-use # BOWL OF HYGIEIA // 0138 ĸ Ll Latin LATIN SMALL LETTER KRA private static void show(PrintWriter out, Identifier_Type _type, int cpStart, int cpEnd) { - out.println(Utility.hex(cpStart) - + (cpStart == cpEnd ? "\t" : ".." + Utility.hex(cpEnd)) - + " ; " + (_type == null ? "???" : _type) - + "\t # " + (_type == null ? "" : "according to MSR 5 ") - + "( " + UTF16.valueOf(cpStart) - + (cpStart == cpEnd ? "" : ".." + UTF16.valueOf(cpEnd)) - + " ) [" + getGc(cpStart) - + (cpStart == cpEnd ? "" : "..") - + ", " + Script_Extensions.get(cpStart) - + (cpStart == cpEnd ? "" : "..") - + "] " + getNames(cpStart, cpEnd) - ); + out.println( + Utility.hex(cpStart) + + (cpStart == cpEnd ? "\t" : ".." + Utility.hex(cpEnd)) + + " ; " + + (_type == null ? "???" : _type) + + "\t # " + + (_type == null ? "" : "according to MSR 5 ") + + "( " + + UTF16.valueOf(cpStart) + + (cpStart == cpEnd ? "" : ".." + UTF16.valueOf(cpEnd)) + + " ) [" + + getGc(cpStart) + + (cpStart == cpEnd ? "" : "..") + + ", " + + Script_Extensions.get(cpStart) + + (cpStart == cpEnd ? 
"" : "..") + + "] " + + getNames(cpStart, cpEnd)); } private static String getNames(int cpStart, int cpEnd) { @@ -310,11 +329,16 @@ private static String getName(int cpStart) { } private static String getGc(int cpStart) { - return UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(cpStart), UProperty.NameChoice.SHORT); + return UCharacter.getPropertyValueName( + UProperty.GENERAL_CATEGORY, + UCharacter.getType(cpStart), + UProperty.NameChoice.SHORT); } - private static void showValues(String file, UnicodeMap> diff) { - try (PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "icann/", file)) { + private static void showValues( + String file, UnicodeMap> diff) { + try (PrintWriter out = + FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "icann/", file)) { Set> sorted = new TreeSet<>(IDPAIR); sorted.addAll(diff.values()); @@ -322,17 +346,19 @@ private static void showValues(String file, UnicodeMap] ;", // "$hash = \\u0023 ;", // "$question = \\u003F ;", // "$semi = \\u003B ;", }; - + enum ScanStatus { - /** - * The scan made it to the end, and there is a valid token. - */ - AT_END, - /** - * The scan didn't make it to the end, but there is a valid token. - */ - VALID, - STOPPED, - INVALID} - + /** The scan made it to the end, and there is a valid token. */ + AT_END, + /** The scan didn't make it to the end, but there is a valid token. */ + VALID, + STOPPED, + INVALID + } + private class ScanData { String source; int start; @@ -67,6 +63,7 @@ private class ScanData { private abstract class Scanner { /** * Scans the input for a valid token. Returns true if scanned to the end. 
+ * * @param input * @param index * @return @@ -76,33 +73,39 @@ private abstract class Scanner { static class TldMatcher { static final List TLDS = Arrays.asList("com", "org", "de", "香港"); + boolean matches(String s, int limit) { for (String item : TLDS) { - if (s.regionMatches(limit-item.length(), item, 0, item.length())) { + if (s.regionMatches(limit - item.length(), item, 0, item.length())) { return true; } } return false; } } - + private class UrlScanner extends Scanner { private static final char FRAGMENT_START = '#'; private static final char QUERY_START = '?'; private static final char PATH_START = '/'; - Scanner schemeScanner = new SimpleScanner(new UnicodeSet("[-+.a-zA-Z0-9]"), UnicodeSet.EMPTY); + Scanner schemeScanner = + new SimpleScanner(new UnicodeSet("[-+.a-zA-Z0-9]"), UnicodeSet.EMPTY); Scanner domainScanner = new DomainScanner(); Scanner pathScanner = new SimpleScanner(new UnicodeSet(NORMAL).add(PATH_START), INCLUSIONS); - Scanner queryScanner = new SimpleScanner(new UnicodeSet(NORMAL).add(QUERY_START), INCLUSIONS); - Scanner fragmentScanner = new SimpleScanner(new UnicodeSet(NORMAL).add(FRAGMENT_START), INCLUSIONS); + Scanner queryScanner = + new SimpleScanner(new UnicodeSet(NORMAL).add(QUERY_START), INCLUSIONS); + Scanner fragmentScanner = + new SimpleScanner(new UnicodeSet(NORMAL).add(FRAGMENT_START), INCLUSIONS); TldMatcher tldMatcher = new TldMatcher(); + public ScanStatus scan(ScanData input) { int start = input.lastValidLimit; ScanStatus status = schemeScanner.scan(input); if (status == ScanStatus.AT_END) { return status; } - if (input.source.regionMatches(input.lastValidLimit, SCHEME_END, 0, SCHEME_END.length())) { + if (input.source.regionMatches( + input.lastValidLimit, SCHEME_END, 0, SCHEME_END.length())) { // we have a scheme, look for a domain input.lastValidLimit += SCHEME_END.length(); status = domainScanner.scan(input); @@ -122,13 +125,16 @@ public ScanStatus scan(ScanData input) { return status; } } + private class SimpleScanner 
extends Scanner { private UnicodeSet ok; private UnicodeSet softBreak; + public SimpleScanner(UnicodeSet ok, UnicodeSet softBreak) { this.ok = ok.freeze(); this.softBreak = softBreak.freeze(); } + public ScanStatus scan(ScanData data) { int cp; int codePointLength; @@ -146,17 +152,17 @@ public ScanStatus scan(ScanData data) { return ScanStatus.AT_END; } } - + class DomainScanner extends SimpleScanner { DomainScanner() { super(NORMAL, DOTS); } - } - + private static final String REGEX_STRING; private static final Pattern REGEX; + static { UnicodeRegex unicodeRegex = new UnicodeRegex(); REGEX_STRING = unicodeRegex.compileBnf(Arrays.asList(REGEX_SOURCE)); @@ -165,7 +171,7 @@ class DomainScanner extends SimpleScanner { private Matcher matcher = REGEX.matcher(""); private IDNA idna = IDNA.getUTS46Instance(0); - + public Linkifier reset(String source) { matcher.reset(source); return this; @@ -175,11 +181,11 @@ public boolean find() { return matcher.find(); } - public int start () { + public int start() { return matcher.start(0); } - public int end () { + public int end() { return matcher.end(0); } @@ -188,7 +194,7 @@ private String findMismatch(String source) { } public static void main(String[] args) { - + Pattern pat = Pattern.compile("(?\\w+) (?\\d+)"); Matcher m = pat.matcher("TEST 123"); m.matches(); @@ -196,36 +202,35 @@ public static void main(String[] args) { System.out.println("2" + "\t" + m.group(2)); System.out.println("login" + "\t" + m.group("login")); System.out.println("id" + "\t" + m.group("id")); - - + String[] tests = { - "google.com", - "xn--a.com", - "google.com./", - "google.com./abc/def", - "αβγ。com/δεζ?η=θ?&ι=κ#λμ_?#ξ", - "http://google.com/", - "αβγ@δεζ.com", - "mailto:αβγ@δεζ.com", - "file:///c:/WINDOWS/clock.avi", - "http://be.wikipedia.org/wiki/Вікіпедыя:Артыкулы,_якія_мусяць_быць_у_кожнай_Вікіпедыі", - "http://bpy.wikipedia.org/wiki/উইকিপিডিয়া:থানা_থকিসে_নিবন্ধ", - "http://bug.wikipedia.org/wiki/Wikipedia:Daftar_artikel_ᨆᨊᨛᨂ_ᨅᨔ_ᨄᨑᨛᨒᨘ_ᨂᨛᨀ", - 
"http://fa.wikipedia.org/wiki/ویکی‌پدیا:فهرست_نوشتارهایی_که_هر_ویکی‌پدیا_باید_بدارد", - "http://gu.wikipedia.org/wiki/વિકિપીડિયા:દરેક_ભાષાના_વિકિપીડિયામાં_હોય_એવા_પ્રારંભિક_લેખોની_યાદી", - "http://hi.wikipedia.org/wiki/विकिपीडिया:कुछ_प्रारंभिक_लेख_जो_कि_हर_भाषा_के_विकिपीडिया_में_होने_चाहिए", - "http://kn.wikipedia.org/wiki/ವಿಕಿಪೀಡಿಯ:ಅಗತ್ಯ_ಲೇಖನಗಳು", - "http://ml.wikipedia.org/wiki/വിക്കിപീഡിയ:അവശ്യലേഖനങ്ങള്‍", - "http://mr.wikipedia.org/wiki/विकिपीडिया:लेख_संपादन_स्पर्धा/लेखांची_यादी", - "http://zh-min-nan.wikipedia.org/wiki/Wikipedia:Só͘-ū_ê_Wikipedia_pán-pún_lóng_èng-kai_ū_ê_bûn-chiuⁿ", - "http://new.wikipedia.org/wiki/विकिपिडियाःहलिमसफूया_ख्यःत", - "http://os.wikipedia.org/wiki/Википеди:Алы_æвзагыл_дæр_чи_хъуамæ_уа,_уыцы_статьятæ", - "http://ru.wikipedia.org/wiki/Википедия:Список_статей,_которые_должны_быть_во_всех_языковых_версиях", - "http://ta.wikipedia.org/wiki/Wikipedia:அனைத்து_மொழி_விக்கிபீடியாக்களிலும்_இருக்க_வேண்டிய_கட்டுரைகளின்_பட்டியல்", - "http://te.wikipedia.org/wiki/Wikipedia:వికీపీడియాలో_తప్పకుండా_ఉండవలసిన_వ్యాసాలు", - "http://th.wikipedia.org/wiki/วิกิพีเดีย:รายชื่อบทความที่วิกิพีเดียทุกภาษาควรมี", - "http://uk.wikipedia.org/wiki/Вікіпедія:Статті,_які_повинні_бути_у_всіх_вікіпедіях", - "http://yi.wikipedia.org/wiki/װיקיפּעדיע:וויכטיגע_ארטיקלן" + "google.com", + "xn--a.com", + "google.com./", + "google.com./abc/def", + "αβγ。com/δεζ?η=θ?&ι=κ#λμ_?#ξ", + "http://google.com/", + "αβγ@δεζ.com", + "mailto:αβγ@δεζ.com", + "file:///c:/WINDOWS/clock.avi", + "http://be.wikipedia.org/wiki/Вікіпедыя:Артыкулы,_якія_мусяць_быць_у_кожнай_Вікіпедыі", + "http://bpy.wikipedia.org/wiki/উইকিপিডিয়া:থানা_থকিসে_নিবন্ধ", + "http://bug.wikipedia.org/wiki/Wikipedia:Daftar_artikel_ᨆᨊᨛᨂ_ᨅᨔ_ᨄᨑᨛᨒᨘ_ᨂᨛᨀ", + "http://fa.wikipedia.org/wiki/ویکی‌پدیا:فهرست_نوشتارهایی_که_هر_ویکی‌پدیا_باید_بدارد", + "http://gu.wikipedia.org/wiki/વિકિપીડિયા:દરેક_ભાષાના_વિકિપીડિયામાં_હોય_એવા_પ્રારંભિક_લેખોની_યાદી", + 
"http://hi.wikipedia.org/wiki/विकिपीडिया:कुछ_प्रारंभिक_लेख_जो_कि_हर_भाषा_के_विकिपीडिया_में_होने_चाहिए", + "http://kn.wikipedia.org/wiki/ವಿಕಿಪೀಡಿಯ:ಅಗತ್ಯ_ಲೇಖನಗಳು", + "http://ml.wikipedia.org/wiki/വിക്കിപീഡിയ:അവശ്യലേഖനങ്ങള്‍", + "http://mr.wikipedia.org/wiki/विकिपीडिया:लेख_संपादन_स्पर्धा/लेखांची_यादी", + "http://zh-min-nan.wikipedia.org/wiki/Wikipedia:Só͘-ū_ê_Wikipedia_pán-pún_lóng_èng-kai_ū_ê_bûn-chiuⁿ", + "http://new.wikipedia.org/wiki/विकिपिडियाःहलिमसफूया_ख्यःत", + "http://os.wikipedia.org/wiki/Википеди:Алы_æвзагыл_дæр_чи_хъуамæ_уа,_уыцы_статьятæ", + "http://ru.wikipedia.org/wiki/Википедия:Список_статей,_которые_должны_быть_во_всех_языковых_версиях", + "http://ta.wikipedia.org/wiki/Wikipedia:அனைத்து_மொழி_விக்கிபீடியாக்களிலும்_இருக்க_வேண்டிய_கட்டுரைகளின்_பட்டியல்", + "http://te.wikipedia.org/wiki/Wikipedia:వికీపీడియాలో_తప్పకుండా_ఉండవలసిన_వ్యాసాలు", + "http://th.wikipedia.org/wiki/วิกิพีเดีย:รายชื่อบทความที่วิกิพีเดียทุกภาษาควรมี", + "http://uk.wikipedia.org/wiki/Вікіпедія:Статті,_які_повинні_бути_у_всіх_вікіпедіях", + "http://yi.wikipedia.org/wiki/װיקיפּעדיע:וויכטיגע_ארטיקלן" }; Linkifier linkifier = new Linkifier(); System.out.println(CollectionUtilities.join(REGEX_SOURCE, "\n\t")); @@ -239,19 +244,28 @@ public static void main(String[] args) { } else { int start = linkifier.start(); int end = linkifier.end(); - final int fromEnd = source.length()-end; + final int fromEnd = source.length() - end; if (start != 1 || fromEnd != 1) { found = false; } - System.out.println(found + "\t" + start + "\t" + fromEnd - + "\t«" + source.substring(0,start) - + "❴❴❴" + source.substring(start, end) - + "❵❵❵" + source.substring(end, source.length()) - + "»"); + System.out.println( + found + + "\t" + + start + + "\t" + + fromEnd + + "\t«" + + source.substring(0, start) + + "❴❴❴" + + source.substring(start, end) + + "❵❵❵" + + source.substring(end, source.length()) + + "»"); for (String name : GROUPS) { final String group = linkifier.matcher.group(name); if (group != null) { - 
System.out.println("\t" + name + " " + group + "\t" + linkifier.check(name, group)); + System.out.println( + "\t" + name + " " + group + "\t" + linkifier.check(name, group)); } } } @@ -259,15 +273,15 @@ public static void main(String[] args) { } private String check(String groupName, String group) { - switch (groupName) { - case "domain": + switch (groupName) { + case "domain": Info info = new Info(); StringBuilder result = idna.nameToUnicode(group, new StringBuilder(), info); if (info.hasErrors()) { result.append("\tErrors: ").append(info.getErrors().toString()); } return result.toString(); - } - return ""; + } + return ""; } } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/Linkifier2.java b/unicodetools/src/main/java/org/unicode/text/tools/Linkifier2.java index 5d42c43c6..edca4faa3 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/Linkifier2.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/Linkifier2.java @@ -1,31 +1,41 @@ package org.unicode.text.tools; +import com.ibm.icu.text.IDNA; +import com.ibm.icu.text.IDNA.Info; +import com.ibm.icu.text.UnicodeSet; import java.nio.charset.Charset; import java.text.ParsePosition; import java.util.Arrays; import java.util.List; -import com.ibm.icu.text.IDNA; -import com.ibm.icu.text.IDNA.Info; -import com.ibm.icu.text.UnicodeSet; - public class Linkifier2 { static final List SCHEMES = Arrays.asList("http://", "https://"); static final List TLDS = Arrays.asList("com", "org", "de", "香港"); - + static final UnicodeSet SKIPPED_ASCII = new UnicodeSet("[\\ \"#%<>\\[-\\^`\\{-\\}]").freeze(); - static final UnicodeSet URL_CODE_POINT = new UnicodeSet("[^[:NChar:][:Cc:][:Cs:]]").removeAll(SKIPPED_ASCII); + static final UnicodeSet URL_CODE_POINT = + new UnicodeSet("[^[:NChar:][:Cc:][:Cs:]]").removeAll(SKIPPED_ASCII); // see https://url.spec.whatwg.org/#url-code-points - static final UnicodeSet INCLUSIONS = new UnicodeSet("[, ; \\: ! ¡ ¿ . 
· ' \" @ * \\\\ \\& \\u200C \\u200D]").freeze(); + static final UnicodeSet INCLUSIONS = + new UnicodeSet("[, ; \\: ! ¡ ¿ . · ' \" @ * \\\\ \\& \\u200C \\u200D]").freeze(); + static { System.out.println(URL_CODE_POINT); System.out.println(INCLUSIONS); } + static final UnicodeSet DOTS = new UnicodeSet("[..。。]").freeze(); - static final UnicodeSet ALWAYS_BAD = new UnicodeSet("[\\p{Cn}\\p{Cs}\\p{Cc}\\p{Deprecated}\\p{bidi_control}-[\\u061C\\u200E\\u200F]]").freeze(); - static final UnicodeSet NORMAL = new UnicodeSet("[\\p{L}\\p{N}\\p{M}\\p{S}\\p{Pd}\\p{Pc}%]").removeAll(ALWAYS_BAD).freeze(); - static final UnicodeSet OK_ESCAPED = new UnicodeSet("[\\p{di}\\p{Cf}\\p{Co}]").removeAll(ALWAYS_BAD).freeze(); + static final UnicodeSet ALWAYS_BAD = + new UnicodeSet( + "[\\p{Cn}\\p{Cs}\\p{Cc}\\p{Deprecated}\\p{bidi_control}-[\\u061C\\u200E\\u200F]]") + .freeze(); + static final UnicodeSet NORMAL = + new UnicodeSet("[\\p{L}\\p{N}\\p{M}\\p{S}\\p{Pd}\\p{Pc}%]") + .removeAll(ALWAYS_BAD) + .freeze(); + static final UnicodeSet OK_ESCAPED = + new UnicodeSet("[\\p{di}\\p{Cf}\\p{Co}]").removeAll(ALWAYS_BAD).freeze(); private static String[] GROUPS = {"scheme", "domain", "path", "query", "fragment"}; private static String[] REGEX_SOURCE = { @@ -38,7 +48,7 @@ public class Linkifier2 { "$query = (?: $percentEncodedUtf8Char | $char | [/?])* ;", "$fragment = (?: $percentEncodedUtf8Char | $char | [/?\\x{23}])* ;", "$percentEncodedUtf8Char = ( %\\p{XDigit}\\p{XDigit} )+ ;", - "$char = [\\p{L}\\p{N}\\p{M}\\p{S}\\p{Pd}\\p{Pc}$inclusionChar&[^$exclusionChar]] ;", // [^\\p{C}\\p{Z}] ;", // + "$char = [\\p{L}\\p{N}\\p{M}\\p{S}\\p{Pd}\\p{Pc}$inclusionChar&[^$exclusionChar]] ;", // [^\\p{C}\\p{Z}] ;", // "$inclusionChar = [, ; \\: ! ¡ ¿ . 
· ' \" @ * \\\\ \\& % \\u200C \\u200D];", "$exclusionChar = [<>] ;", // "$hash = \\u0023 ;", @@ -47,12 +57,15 @@ public class Linkifier2 { }; enum Status { - VALID, - INVALID} + VALID, + INVALID + } - private static abstract class Scanner { + private abstract static class Scanner { /** - * Scans the input for a valid token. returns x > start if there is a token extends to x. Otherwise -1 + * Scans the input for a valid token. returns x > start if there is a token extends to x. + * Otherwise -1 + * * @param input * @param index * @return @@ -62,12 +75,14 @@ private static abstract class Scanner { static class MatchEnd { private List source; + MatchEnd(List source) { this.source = source; } + int matches(String s, int limit) { for (String item : source) { - if (s.regionMatches(limit-item.length(), item, 0, item.length())) { + if (s.regionMatches(limit - item.length(), item, 0, item.length())) { return item.length(); } } @@ -77,9 +92,11 @@ int matches(String s, int limit) { static class MatchStart { private List source; + MatchStart(List source) { this.source = source; } + int matches(String s, int start) { for (String item : source) { if (s.regionMatches(start, item, 0, item.length())) { @@ -95,9 +112,18 @@ private static class UrlScanner extends Scanner { private static final char QUERY_START = '?'; private static final char PATH_START = '/'; Scanner domainScanner = new DomainScanner(); - Scanner pathScanner = new SimpleScanner(new UnicodeSet(NORMAL).add(PATH_START), OK_ESCAPED, INCLUSIONS); - Scanner queryScanner = new SimpleScanner(new UnicodeSet(NORMAL).add(PATH_START).add(QUERY_START), OK_ESCAPED, INCLUSIONS); - Scanner fragmentScanner = new SimpleScanner(new UnicodeSet(NORMAL).add(PATH_START).add(QUERY_START).add(FRAGMENT_START), OK_ESCAPED, INCLUSIONS); + Scanner pathScanner = + new SimpleScanner(new UnicodeSet(NORMAL).add(PATH_START), OK_ESCAPED, INCLUSIONS); + Scanner queryScanner = + new SimpleScanner( + new 
UnicodeSet(NORMAL).add(PATH_START).add(QUERY_START), + OK_ESCAPED, + INCLUSIONS); + Scanner fragmentScanner = + new SimpleScanner( + new UnicodeSet(NORMAL).add(PATH_START).add(QUERY_START).add(FRAGMENT_START), + OK_ESCAPED, + INCLUSIONS); MatchEnd tldMatcher = new MatchEnd(TLDS); MatchStart schemeMatcher = new MatchStart(SCHEMES); @@ -106,7 +132,7 @@ public Status scan(String input, ParsePosition pos) { int schemeMatch = schemeMatcher.matches(input, start); if (schemeMatch > 0) { // we have a scheme, look for a domain - pos.setIndex(start+schemeMatch); + pos.setIndex(start + schemeMatch); Status result = domainScanner.scan(input, pos); if (result != Status.VALID) { return result; @@ -126,7 +152,10 @@ public Status scan(String input, ParsePosition pos) { int current = pos.getIndex(); cp = current < input.length() ? input.codePointAt(current) : 0; if (cp == PATH_START) { - overallResult = pathScanner.scan(input, pos) == Status.INVALID ? Status.INVALID : overallResult; + overallResult = + pathScanner.scan(input, pos) == Status.INVALID + ? Status.INVALID + : overallResult; int newPos = pos.getIndex(); if (newPos != current) { current = newPos; @@ -134,7 +163,10 @@ public Status scan(String input, ParsePosition pos) { } } if (cp == QUERY_START) { - overallResult = queryScanner.scan(input, pos) == Status.INVALID ? Status.INVALID : overallResult; + overallResult = + queryScanner.scan(input, pos) == Status.INVALID + ? Status.INVALID + : overallResult; int newPos = pos.getIndex(); if (newPos != current) { current = newPos; @@ -142,7 +174,10 @@ public Status scan(String input, ParsePosition pos) { } } if (cp == FRAGMENT_START) { - overallResult = fragmentScanner.scan(input, pos) == Status.INVALID ? Status.INVALID : overallResult; + overallResult = + fragmentScanner.scan(input, pos) == Status.INVALID + ? 
Status.INVALID + : overallResult; int newPos = pos.getIndex(); if (newPos != current) { current = newPos; @@ -157,6 +192,7 @@ private static class SimpleScanner extends Scanner { private UnicodeSet ok; private UnicodeSet softBreak; private UnicodeSet okEscaped; + public SimpleScanner(UnicodeSet ok, UnicodeSet okEscaped, UnicodeSet softBreak) { checkOverlap("ALWAYS_BAD.containsSome(ok)", ALWAYS_BAD, ok); checkOverlap("ALWAYS_BAD.containsSome(okEscaped)", ALWAYS_BAD, okEscaped); @@ -166,11 +202,13 @@ public SimpleScanner(UnicodeSet ok, UnicodeSet okEscaped, UnicodeSet softBreak) this.okEscaped = okEscaped.freeze(); this.softBreak = softBreak.freeze(); } + public void checkOverlap(String title, UnicodeSet a, UnicodeSet b) { if (a.containsSome(b)) { throw new IllegalArgumentException(title + ":\t" + new UnicodeSet(a).retainAll(b)); } } + public Status scan(String input, ParsePosition pos) { int start = pos.getIndex(); Status result = Status.VALID; @@ -178,7 +216,7 @@ public Status scan(String input, ParsePosition pos) { int codePointLength; int current = start; int i = start; - for (;i < input.length(); i += codePointLength) { + for (; i < input.length(); i += codePointLength) { cp = input.codePointAt(i); codePointLength = Character.charCount(cp); if (cp == '%') { @@ -197,6 +235,7 @@ public Status scan(String input, ParsePosition pos) { pos.setIndex(current); return start == current ? 
Status.INVALID : result; } + private final Charset UTF8_CHARSET = Charset.forName("UTF-8"); private String getEscaped(String input, int startPos, ParsePosition endPos) { @@ -207,21 +246,21 @@ private String getEscaped(String input, int startPos, ParsePosition endPos) { for (int i = startPos; i < input.length(); ++i) { char c = input.charAt(i); switch (state) { - case 0: - if (c != '%') break; - bytes[++byteNumber] = 0; - ++state; - continue; - case 1: case 2: - if (c < '0' || c > 'f' || c > '9' && c < 'A' || c > 'F' && c < 'a') { - error = true; - } - bytes[byteNumber] *= 16; - bytes[byteNumber] += (c < 'A') ? c - '0' : (c & 0xF) + 9; - ++state; - continue; - case 3: // error - + case 0: + if (c != '%') break; + bytes[++byteNumber] = 0; + ++state; + continue; + case 1: + case 2: + if (c < '0' || c > 'f' || c > '9' && c < 'A' || c > 'F' && c < 'a') { + error = true; + } + bytes[byteNumber] *= 16; + bytes[byteNumber] += (c < 'A') ? c - '0' : (c & 0xF) + 9; + ++state; + continue; + case 3: // error } } String s = new String(bytes, UTF8_CHARSET); @@ -230,9 +269,12 @@ private String getEscaped(String input, int startPos, ParsePosition endPos) { } return s; } + @Override public String toString() { - return new UnicodeSet(ok).complement().complement() + "/" + new UnicodeSet(softBreak).complement().complement(); + return new UnicodeSet(ok).complement().complement() + + "/" + + new UnicodeSet(softBreak).complement().complement(); } } @@ -240,9 +282,11 @@ static class DomainScanner extends SimpleScanner { private IDNA idna = IDNA.getUTS46Instance(0); Info info = new Info(); StringBuilder dest = new StringBuilder(); + DomainScanner() { super(NORMAL, OK_ESCAPED, DOTS); } + @Override public Status scan(String input, ParsePosition pos) { // scan, then do syntax check @@ -251,46 +295,43 @@ public Status scan(String input, ParsePosition pos) { if (result != Status.VALID) { return result; } - StringBuilder unicode = idna.nameToUnicode(input.substring(start, pos.getIndex()), dest, 
info); - return info.hasErrors() - ? Status.INVALID - : Status.VALID - ; + StringBuilder unicode = + idna.nameToUnicode(input.substring(start, pos.getIndex()), dest, info); + return info.hasErrors() ? Status.INVALID : Status.VALID; } } - public static void main(String[] args) { String[] tests = { - "google.com", - "google\uE0000.com", - "google%F3%A0%80%80.com", - "xn--a.com", - "google.com./", - "google.com./abc/def", - "αβγ。com/δεζ?η=θ?&ι=κ#λμ_?#ξ", - "http://google.com/", - "αβγ@δεζ.com", - "mailto:αβγ@δεζ.com", - "file:///c:/WINDOWS/clock.avi", - "http://be.wikipedia.org/wiki/Вікіпедыя:Артыкулы,_якія_мусяць_быць_у_кожнай_Вікіпедыі", - "http://bpy.wikipedia.org/wiki/উইকিপিডিয়া:থানা_থকিসে_নিবন্ধ", - "http://bug.wikipedia.org/wiki/Wikipedia:Daftar_artikel_ᨆᨊᨛᨂ_ᨅᨔ_ᨄᨑᨛᨒᨘ_ᨂᨛᨀ", - "http://fa.wikipedia.org/wiki/ویکی‌پدیا:فهرست_نوشتارهایی_که_هر_ویکی‌پدیا_باید_بدارد", - "http://gu.wikipedia.org/wiki/વિકિપીડિયા:દરેક_ભાષાના_વિકિપીડિયામાં_હોય_એવા_પ્રારંભિક_લેખોની_યાદી", - "http://hi.wikipedia.org/wiki/विकिपीडिया:कुछ_प्रारंभिक_लेख_जो_कि_हर_भाषा_के_विकिपीडिया_में_होने_चाहिए", - "http://kn.wikipedia.org/wiki/ವಿಕಿಪೀಡಿಯ:ಅಗತ್ಯ_ಲೇಖನಗಳು", - "http://ml.wikipedia.org/wiki/വിക്കിപീഡിയ:അവശ്യലേഖനങ്ങള്‍", - "http://mr.wikipedia.org/wiki/विकिपीडिया:लेख_संपादन_स्पर्धा/लेखांची_यादी", - "http://zh-min-nan.wikipedia.org/wiki/Wikipedia:Só͘-ū_ê_Wikipedia_pán-pún_lóng_èng-kai_ū_ê_bûn-chiuⁿ", - "http://new.wikipedia.org/wiki/विकिपिडियाःहलिमसफूया_ख्यःत", - "http://os.wikipedia.org/wiki/Википеди:Алы_æвзагыл_дæр_чи_хъуамæ_уа,_уыцы_статьятæ", - "http://ru.wikipedia.org/wiki/Википедия:Список_статей,_которые_должны_быть_во_всех_языковых_версиях", - "http://ta.wikipedia.org/wiki/Wikipedia:அனைத்து_மொழி_விக்கிபீடியாக்களிலும்_இருக்க_வேண்டிய_கட்டுரைகளின்_பட்டியல்", - "http://te.wikipedia.org/wiki/Wikipedia:వికీపీడియాలో_తప్పకుండా_ఉండవలసిన_వ్యాసాలు", - "http://th.wikipedia.org/wiki/วิกิพีเดีย:รายชื่อบทความที่วิกิพีเดียทุกภาษาควรมี", - 
"http://uk.wikipedia.org/wiki/Вікіпедія:Статті,_які_повинні_бути_у_всіх_вікіпедіях", - "http://yi.wikipedia.org/wiki/װיקיפּעדיע:וויכטיגע_ארטיקלן" + "google.com", + "google\uE0000.com", + "google%F3%A0%80%80.com", + "xn--a.com", + "google.com./", + "google.com./abc/def", + "αβγ。com/δεζ?η=θ?&ι=κ#λμ_?#ξ", + "http://google.com/", + "αβγ@δεζ.com", + "mailto:αβγ@δεζ.com", + "file:///c:/WINDOWS/clock.avi", + "http://be.wikipedia.org/wiki/Вікіпедыя:Артыкулы,_якія_мусяць_быць_у_кожнай_Вікіпедыі", + "http://bpy.wikipedia.org/wiki/উইকিপিডিয়া:থানা_থকিসে_নিবন্ধ", + "http://bug.wikipedia.org/wiki/Wikipedia:Daftar_artikel_ᨆᨊᨛᨂ_ᨅᨔ_ᨄᨑᨛᨒᨘ_ᨂᨛᨀ", + "http://fa.wikipedia.org/wiki/ویکی‌پدیا:فهرست_نوشتارهایی_که_هر_ویکی‌پدیا_باید_بدارد", + "http://gu.wikipedia.org/wiki/વિકિપીડિયા:દરેક_ભાષાના_વિકિપીડિયામાં_હોય_એવા_પ્રારંભિક_લેખોની_યાદી", + "http://hi.wikipedia.org/wiki/विकिपीडिया:कुछ_प्रारंभिक_लेख_जो_कि_हर_भाषा_के_विकिपीडिया_में_होने_चाहिए", + "http://kn.wikipedia.org/wiki/ವಿಕಿಪೀಡಿಯ:ಅಗತ್ಯ_ಲೇಖನಗಳು", + "http://ml.wikipedia.org/wiki/വിക്കിപീഡിയ:അവശ്യലേഖനങ്ങള്‍", + "http://mr.wikipedia.org/wiki/विकिपीडिया:लेख_संपादन_स्पर्धा/लेखांची_यादी", + "http://zh-min-nan.wikipedia.org/wiki/Wikipedia:Só͘-ū_ê_Wikipedia_pán-pún_lóng_èng-kai_ū_ê_bûn-chiuⁿ", + "http://new.wikipedia.org/wiki/विकिपिडियाःहलिमसफूया_ख्यःत", + "http://os.wikipedia.org/wiki/Википеди:Алы_æвзагыл_дæр_чи_хъуамæ_уа,_уыцы_статьятæ", + "http://ru.wikipedia.org/wiki/Википедия:Список_статей,_которые_должны_быть_во_всех_языковых_версиях", + "http://ta.wikipedia.org/wiki/Wikipedia:அனைத்து_மொழி_விக்கிபீடியாக்களிலும்_இருக்க_வேண்டிய_கட்டுரைகளின்_பட்டியல்", + "http://te.wikipedia.org/wiki/Wikipedia:వికీపీడియాలో_తప్పకుండా_ఉండవలసిన_వ్యాసాలు", + "http://th.wikipedia.org/wiki/วิกิพีเดีย:รายชื่อบทความที่วิกิพีเดียทุกภาษาควรมี", + "http://uk.wikipedia.org/wiki/Вікіпедія:Статті,_які_повинні_бути_у_всіх_вікіпедіях", + "http://yi.wikipedia.org/wiki/װיקיפּעדיע:וויכטיגע_ארטיקלן" }; UrlScanner linkifier = new UrlScanner(); for (String test : tests) { @@ 
-304,12 +345,20 @@ public static void main(String[] args) { if (result != Status.VALID) { continue; } - final int fromEnd = source.length()-end; - System.out.println(end + "\t" + start + "\t" + fromEnd - + "\t«" + source.substring(0,start) - + "❴❴❴" + source.substring(start, end) - + "❵❵❵" + source.substring(end, source.length()) - + "»"); + final int fromEnd = source.length() - end; + System.out.println( + end + + "\t" + + start + + "\t" + + fromEnd + + "\t«" + + source.substring(0, start) + + "❴❴❴" + + source.substring(start, end) + + "❵❵❵" + + source.substring(end, source.length()) + + "»"); start = end - 1; } } @@ -317,7 +366,7 @@ public static void main(String[] args) { // private String check(String groupName, String group) { // switch (groupName) { - // case "domain": + // case "domain": // Info info = new Info(); // StringBuilder result = idna.nameToUnicode(group, new StringBuilder(), info); // if (info.hasErrors()) { diff --git a/unicodetools/src/main/java/org/unicode/text/tools/MakeEmojiTable.java b/unicodetools/src/main/java/org/unicode/text/tools/MakeEmojiTable.java index 62bef8d8f..eaee901f8 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/MakeEmojiTable.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/MakeEmojiTable.java @@ -1,26 +1,33 @@ package org.unicode.text.tools; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.UTF16; import java.io.IOException; import java.io.PrintWriter; import java.util.Locale; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.tools.emoji.ChartUtilities; import org.unicode.tools.emoji.Emoji; import org.unicode.tools.emoji.EmojiData; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.UTF16; - public class MakeEmojiTable { public static void main(String[] args) throws IOException { final String outFileName = "emoji-glyphs.html"; PrintWriter out = FileUtilities.openUTF8Writer(Emoji.TR51_INTERNAL_DIR, outFileName); - ChartUtilities.writeHeader(outFileName, out, 
"recommended glyphs", null, false, "

" + "" + "

\n", Emoji.DATA_DIR_PRODUCTION, Emoji.TR51_HTML); + ChartUtilities.writeHeader( + outFileName, + out, + "recommended glyphs", + null, + false, + "

" + "" + "

\n", + Emoji.DATA_DIR_PRODUCTION, + Emoji.TR51_HTML); out.println(""); boolean first = true; boolean firstText = true; - out.println(""); + out.println( + ""); for (String line : FileUtilities.in(MakeEmojiTable.class, "emojiGlyphs.txt")) { line = line.trim(); if (line.isEmpty()) { @@ -34,14 +41,30 @@ public static void main(String[] args) throws IOException { String chars = UTF16.valueOf(hex); String hexUpper = Utility.hex(hex); String hexLower = hexUpper.toLowerCase(Locale.ROOT); - out.println("" - + "" - + "" - + "" - + "" - + "" - + "" + + "" + + "" + + "" + + "" + + "" + + ""); + String ch = (String) data.get("c"); + ch = fixHack(ch); + String name = (String) data.get("n"); + if (name == null) { + name = ""; + } + String props = (String) data.get("xs"); + if (props == null) { + props = "\u00A0"; + } + String gc = (String) data.get("gc"); + if (gc == null) { + gc = "Lo"; + } - // clear storage - data.clear(); - break; + // split tables + final int code = UTF16.charAt(ch, 0); + if ((topByte & ~0x1F) != (code & ~0x1F)) { + log.println("
CodeRefAppleAndr.NameRemarks
CodeRefAppleAndr.NameRemarks
" + hexUpper + "" + chars + "" + chars + "" + chars + "" + EmojiData.EMOJI_DATA.getName(chars) + "" - ); + out.println( + "
" + + hexUpper + + ""
+                                + chars
+                                + ""
+                                + chars
+                                + ""
+                                + chars
+                                + "" + + EmojiData.EMOJI_DATA.getName(chars) + + ""); firstText = true; } else { if (firstText) { diff --git a/unicodetools/src/main/java/org/unicode/text/tools/NamesListPrint.java b/unicodetools/src/main/java/org/unicode/text/tools/NamesListPrint.java index 8586b93f8..8967d2be3 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/NamesListPrint.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/NamesListPrint.java @@ -1,9 +1,12 @@ package org.unicode.text.tools; +import com.google.common.base.Objects; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.util.HashSet; import java.util.Map.Entry; import java.util.Set; - import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.Pair; import org.unicode.cldr.util.With; @@ -14,11 +17,6 @@ import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.google.common.base.Objects; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - public class NamesListPrint { public static void main(String[] args) { @@ -40,7 +38,14 @@ public static void main(String[] args) { compare(nl.subheadComments, nl2.subheadComments, output, "?*"); for (String k : output) { - System.out.println(Utility.hex(k) + "\t" + k + "\t" + Default.ucd().getName(k) + "\n" + output.getValue(k)); + System.out.println( + Utility.hex(k) + + "\t" + + k + + "\t" + + Default.ucd().getName(k) + + "\n" + + output.getValue(k)); } if (true) { return; @@ -48,7 +53,6 @@ public static void main(String[] args) { print(nl); } - private static void printall(NamesList nl, int max) { String lastBlock = null; String lastSubhead = null; @@ -58,11 +62,10 @@ private static void printall(NamesList nl, int max) { if (count++ > max) break; lastBlock = showValue(nl.blockTitles.get(item), lastBlock, item); lastSubhead = showValue(nl.subheads.get(item), 
lastSubhead, item); - lastSubheadComments = showValue(nl.subheadComments.get(item), lastSubheadComments, item); - System.out.println(Utility.hex(item) - + "\t" + item - + "\t" + Default.ucd().getName(item) - ); + lastSubheadComments = + showValue(nl.subheadComments.get(item), lastSubheadComments, item); + System.out.println( + Utility.hex(item) + "\t" + item + "\t" + Default.ucd().getName(item)); for (Comment comment : Comment.values()) { Set commentLines = nl.getItem(comment, item); if (commentLines != null) { @@ -71,7 +74,6 @@ private static void printall(NamesList nl, int max) { } } } - } } @@ -86,15 +88,15 @@ private static String showValue(String newValue, String lastBlock, String item) private static void compare( UnicodeRelation subheads, - UnicodeRelation subheads2, + UnicodeRelation subheads2, UnicodeMap output, String sep) { UnicodeSet all = new UnicodeSet(subheads.keySet()).addAll(subheads2.keySet()); - Set> seen = new HashSet(); + Set> seen = new HashSet(); for (String cp : all) { - String v1 = CldrUtility.ifNull(subheads.get(cp),"").toString(); - String v2 = CldrUtility.ifNull(subheads2.get(cp),"").toString(); - //if (v1 == null || v2 == null) continue; + String v1 = CldrUtility.ifNull(subheads.get(cp), "").toString(); + String v2 = CldrUtility.ifNull(subheads2.get(cp), "").toString(); + // if (v1 == null || v2 == null) continue; if (!Objects.equal(v1, v2)) { Pair pair = Pair.of(v1, v2); if (seen.contains(pair)) { @@ -102,25 +104,31 @@ private static void compare( } seen.add(pair); String old = output.get(cp); - output.put(cp, (old == null ? "" : old + "\n") - + sep + "\t" + v1 - + "\n≠" - + sep + "\t" + v2); + output.put( + cp, + (old == null ? 
"" : old + "\n") + + sep + + "\t" + + v1 + + "\n≠" + + sep + + "\t" + + v2); } } } private static void compare( UnicodeMap subheads, - UnicodeMap subheads2, + UnicodeMap subheads2, UnicodeMap output, String sep) { UnicodeSet all = new UnicodeSet(subheads.keySet()).addAll(subheads2.keySet()); - Set> seen = new HashSet(); + Set> seen = new HashSet(); for (String cp : all) { - String v1 = CldrUtility.ifNull(subheads.get(cp),"").toString(); - String v2 = CldrUtility.ifNull(subheads2.get(cp),"").toString(); - //if (v1 == null || v2 == null) continue; + String v1 = CldrUtility.ifNull(subheads.get(cp), "").toString(); + String v2 = CldrUtility.ifNull(subheads2.get(cp), "").toString(); + // if (v1 == null || v2 == null) continue; if (!Objects.equal(v1, v2)) { Pair pair = Pair.of(v1, v2); if (seen.contains(pair)) { @@ -128,15 +136,20 @@ private static void compare( } seen.add(pair); String old = output.get(cp); - output.put(cp, (old == null ? "" : old + "\n") - + sep + "\t" + v1 - + "\n≠" - + sep + "\t" + v2); + output.put( + cp, + (old == null ? 
"" : old + "\n") + + sep + + "\t" + + v1 + + "\n≠" + + sep + + "\t" + + v2); } } } - public static void print(NamesList nl) { String lastSubheadComment = null; String lastSubhead = null; @@ -144,13 +157,13 @@ public static void print(NamesList nl) { for (Entry fileComment : nl.fileComments.keyValueSet()) { System.out.println(fileComment.getKey() + "\t" + fileComment.getValue()); } - UnicodeSet all = new UnicodeSet() - .addAll(nl.informalAliases.keySet()) - .addAll(nl.informalComments.keySet()) - .addAll(nl.informalXrefs.keySet()) - .addAll(nl.subheads.keySet()) - .addAll(nl.subheadComments.keySet()) - ; + UnicodeSet all = + new UnicodeSet() + .addAll(nl.informalAliases.keySet()) + .addAll(nl.informalComments.keySet()) + .addAll(nl.informalXrefs.keySet()) + .addAll(nl.subheads.keySet()) + .addAll(nl.subheadComments.keySet()); for (String key : all) { final int keyCodePoint = key.codePointAt(0); @@ -158,21 +171,25 @@ public static void print(NamesList nl) { if (!block.equals(lastblock)) { if (block != null && !block.equals("No_Block")) { UnicodeSet set = Default.ucd().getBlockSet(block, new UnicodeSet()); - System.out.print("\n======\n" - + Utility.hex(set.getRangeStart(0)) - + "\t" + block.replace('_', ' ') - + "\t" + Utility.hex(set.getRangeStart(1)) - + "\n"); + System.out.print( + "\n======\n" + + Utility.hex(set.getRangeStart(0)) + + "\t" + + block.replace('_', ' ') + + "\t" + + Utility.hex(set.getRangeStart(1)) + + "\n"); } lastblock = block; } lastSubhead = showChangedItem(nl.subheads, keyCodePoint, lastSubhead); - lastSubheadComment = showChangedItem(nl.subheadComments, keyCodePoint, lastSubheadComment); + lastSubheadComment = + showChangedItem(nl.subheadComments, keyCodePoint, lastSubheadComment); String realName = Default.ucd().getName(keyCodePoint); - - System.out.println(Utility.hex(key) + "\t" + NamesList.CODE.transform(key) + "\t" + realName); + System.out.println( + Utility.hex(key) + "\t" + NamesList.CODE.transform(key) + "\t" + realName); Set 
informalComment = nl.informalComments.get(key); Set informalXref = nl.informalXrefs.get(key); display(key, nl.informalAliases, Comment.alias); @@ -199,15 +216,23 @@ public static void print(NamesList nl) { } } - private static void display(String key, UnicodeRelation informalAliases, Comment alias) { + private static void display( + String key, UnicodeRelation informalAliases, Comment alias) { Set values = informalAliases.get(key); if (values != null) { for (String value : values) { if (alias == Comment.xref) { for (int cp : With.codePointArray(value)) { String realName = Default.ucd().getName(cp); - System.out.println("\t\t\t" + alias.displaySymbol + "\t" - + Utility.hex(cp) + " " + NamesList.CODE.transform(UTF16.valueOf(cp)) + " " + realName); + System.out.println( + "\t\t\t" + + alias.displaySymbol + + "\t" + + Utility.hex(cp) + + " " + + NamesList.CODE.transform(UTF16.valueOf(cp)) + + " " + + realName); } } else { for (String s : value.split("\n")) { @@ -218,8 +243,8 @@ private static void display(String key, UnicodeRelation informalAliases, } } - public static String showChangedItem(UnicodeMap map, final int keyCodePoint, - String lastSubhead) { + public static String showChangedItem( + UnicodeMap map, final int keyCodePoint, String lastSubhead) { String subhead = map.get(keyCodePoint); if (!Objects.equal(subhead, lastSubhead)) { if (subhead != null) { @@ -229,5 +254,4 @@ public static String showChangedItem(UnicodeMap map, final int keyCodePo } return lastSubhead; } - } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/NotoCoverage.java b/unicodetools/src/main/java/org/unicode/text/tools/NotoCoverage.java index 282a64bac..b711fc5f7 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/NotoCoverage.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/NotoCoverage.java @@ -1,46 +1,48 @@ package org.unicode.text.tools; +import com.google.common.base.Splitter; +import com.ibm.icu.dev.util.UnicodeMap; +import 
com.ibm.icu.text.UnicodeSet; import java.util.List; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.text.utility.Utility; import org.unicode.tools.emoji.Emoji; -import com.google.common.base.Splitter; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; - public class NotoCoverage { static final UnicodeMap DATA = new UnicodeMap<>(); + static { - DATA.putAll(new UnicodeSet( - "[\\x{20e3}\\x{2122}\\x{231a}\\x{231b}\\x{23e9}\\x{23ea}\\x{23eb}\\x{23ec}\\x{23f0}\\x{23f3}\\x{2601}\\x{2611}\\x{2614}\\x{2615}\\x{261d}\\x{263a}\\x{2648}\\x{2649}\\x{264a}\\x{264b}\\x{264c}\\x{264d}\\x{264e}\\x{264f}\\x{2650}\\x{2651}\\x{2652}\\x{2653}\\x{267b}\\x{267f}\\x{2693}\\x{26a0}\\x{26a1}\\x{26bd}\\x{26be}\\x{26c4}\\x{26c5}\\x{26ce}\\x{26d4}\\x{26ea}\\x{26f2}\\x{26f3}\\x{26f5}\\x{26fa}\\x{26fd}\\x{2705}\\x{270a}\\x{270b}\\x{270c}\\x{2728}\\x{274c}\\x{274e}\\x{2753}\\x{2754}\\x{2755}\\x{2795}\\x{2796}\\x{2797}\\x{27b0}\\x{27bf}\\x{3030}\\x{303d}\\x{1f004}\\x{1f0cf}\\x{1f170}\\x{1f171}\\x{1f17e}\\x{1f17f}\\x{1f18e}\\x{1f191}\\x{1f192}\\x{1f193}\\x{1f194}\\x{1f195}\\x{1f196}\\x{1f197}\\x{1f198}\\x{1f199}\\x{1f19a}\\x{1f1e6}\\x{1f1e7}\\x{1f1e8}\\x{1f1e9}\\x{1f1ea}\\x{1f1eb}\\x{1f1ec}\\x{1f1ed}\\x{1f1ee}\\x{1f1ef}\\x{1f1f0}\\x{1f1f1}\\x{1f1f2}\\x{1f1f3}\\x{1f1f4}\\x{1f1f5}\\x{1f1f6}\\x{1f1f7}\\x{1f1f8}\\x{1f1f9}\\x{1f1fa}\\x{1f1fb}\\x{1f1fc}\\x{1f1fd}\\x{1f1fe}\\x{1f1ff}\\x{1f201}\\x{1f202}\\x{1f21a}\\x{1f22f}\\x{1f232}\\x{1f233}\\x{1f234}\\x{1f235}\\x{1f236}\\x{1f237}\\x{1f238}\\x{1f239}\\x{1f23a}\\x{1f250}\\x{1f251}\\x{1f300}\\x{1f301}\\x{1f302}\\x{1f303}\\x{1f304}\\x{1f305}\\x{1f306}\\x{1f307}\\x{1f308}\\x{1f309}\\x{1f30a}\\x{1f30b}\\x{1f30c}\\x{1f30d}\\x{1f30e}\\x{1f30f}\\x{1f310}\\x{1f311}\\x{1f312}\\x{1f313}\\x{1f314}\\x{1f315}\\x{1f316}\\x{1f317}\\x{1f318}\\x{1f319}\\x{1f31a}\\x{1f31b}\\x{1f31c}\\x{1f31d}\\x{1f31e}\\x{1f31f}\\x{1f320}\\x{1f330}\\x{1f331}\\x{1f332}\\x{1f333}\\x{1f334}\\x{1f335}\\x{1f337}\\x{1f338}\\x{1f339}\\x{1f33a}\\x{1f33b
}\\x{1f33c}\\x{1f33d}\\x{1f33e}\\x{1f33f}\\x{1f340}\\x{1f341}\\x{1f342}\\x{1f343}\\x{1f344}\\x{1f345}\\x{1f346}\\x{1f347}\\x{1f348}\\x{1f349}\\x{1f34a}\\x{1f34b}\\x{1f34c}\\x{1f34d}\\x{1f34e}\\x{1f34f}\\x{1f350}\\x{1f351}\\x{1f352}\\x{1f353}\\x{1f354}\\x{1f355}\\x{1f356}\\x{1f357}\\x{1f358}\\x{1f359}\\x{1f35a}\\x{1f35b}\\x{1f35c}\\x{1f35d}\\x{1f35e}\\x{1f35f}\\x{1f360}\\x{1f361}\\x{1f362}\\x{1f363}\\x{1f364}\\x{1f365}\\x{1f366}\\x{1f367}\\x{1f368}\\x{1f369}\\x{1f36a}\\x{1f36b}\\x{1f36c}\\x{1f36d}\\x{1f36e}\\x{1f36f}\\x{1f370}\\x{1f371}\\x{1f372}\\x{1f373}\\x{1f374}\\x{1f375}\\x{1f376}\\x{1f377}\\x{1f378}\\x{1f379}\\x{1f37a}\\x{1f37b}\\x{1f37c}\\x{1f380}\\x{1f381}\\x{1f382}\\x{1f383}\\x{1f384}\\x{1f385}\\x{1f386}\\x{1f387}\\x{1f388}\\x{1f389}\\x{1f38a}\\x{1f38b}\\x{1f38c}\\x{1f38d}\\x{1f38e}\\x{1f38f}\\x{1f390}\\x{1f391}\\x{1f392}\\x{1f393}\\x{1f3a0}\\x{1f3a1}\\x{1f3a2}\\x{1f3a3}\\x{1f3a4}\\x{1f3a5}\\x{1f3a6}\\x{1f3a7}\\x{1f3a8}\\x{1f3a9}\\x{1f3aa}\\x{1f3ab}\\x{1f3ac}\\x{1f3ad}\\x{1f3ae}\\x{1f3af}\\x{1f3b0}\\x{1f3b1}\\x{1f3b2}\\x{1f3b3}\\x{1f3b4}\\x{1f3b5}\\x{1f3b6}\\x{1f3b7}\\x{1f3b8}\\x{1f3b9}\\x{1f3ba}\\x{1f3bb}\\x{1f3bc}\\x{1f3bd}\\x{1f3be}\\x{1f3bf}\\x{1f3c0}\\x{1f3c1}\\x{1f3c2}\\x{1f3c3}\\x{1f3c4}\\x{1f3c6}\\x{1f3c7}\\x{1f3c8}\\x{1f3c9}\\x{1f3ca}\\x{1f3e0}\\x{1f3e1}\\x{1f3e2}\\x{1f3e3}\\x{1f3e4}\\x{1f3e5}\\x{1f3e6}\\x{1f3e7}\\x{1f3e8}\\x{1f3e9}\\x{1f3ea}\\x{1f3eb}\\x{1f3ec}\\x{1f3ed}\\x{1f3ee}\\x{1f3ef}\\x{1f3f0}\\x{1f400}\\x{1f401}\\x{1f402}\\x{1f403}\\x{1f404}\\x{1f405}\\x{1f406}\\x{1f407}\\x{1f408}\\x{1f409}\\x{1f40a}\\x{1f40b}\\x{1f40c}\\x{1f40d}\\x{1f40e}\\x{1f40f}\\x{1f410}\\x{1f411}\\x{1f412}\\x{1f413}\\x{1f414}\\x{1f415}\\x{1f416}\\x{1f417}\\x{1f418}\\x{1f419}\\x{1f41a}\\x{1f41b}\\x{1f41c}\\x{1f41d}\\x{1f41e}\\x{1f41f}\\x{1f420}\\x{1f421}\\x{1f422}\\x{1f423}\\x{1f424}\\x{1f425}\\x{1f426}\\x{1f427}\\x{1f428}\\x{1f429}\\x{1f42a}\\x{1f42b}\\x{1f42c}\\x{1f42d}\\x{1f42e}\\x{1f42f}\\x{1f430}\\x{1f431}\\x{1f432}\\x{1f433}\\x{1f434}\\x{1f435}\\x{1f436}\\x{1f437
}\\x{1f438}\\x{1f439}\\x{1f43a}\\x{1f43b}\\x{1f43c}\\x{1f43d}\\x{1f43e}\\x{1f440}\\x{1f442}\\x{1f443}\\x{1f444}\\x{1f445}\\x{1f446}\\x{1f447}\\x{1f448}\\x{1f449}\\x{1f44a}\\x{1f44b}\\x{1f44c}\\x{1f44d}\\x{1f44e}\\x{1f44f}\\x{1f450}\\x{1f451}\\x{1f452}\\x{1f453}\\x{1f454}\\x{1f455}\\x{1f456}\\x{1f457}\\x{1f458}\\x{1f459}\\x{1f45a}\\x{1f45b}\\x{1f45c}\\x{1f45d}\\x{1f45e}\\x{1f45f}\\x{1f460}\\x{1f461}\\x{1f462}\\x{1f463}\\x{1f464}\\x{1f465}\\x{1f466}\\x{1f467}\\x{1f468}\\x{1f469}\\x{1f46a}\\x{1f46b}\\x{1f46c}\\x{1f46d}\\x{1f46e}\\x{1f46f}\\x{1f470}\\x{1f471}\\x{1f472}\\x{1f473}\\x{1f474}\\x{1f475}\\x{1f476}\\x{1f477}\\x{1f478}\\x{1f479}\\x{1f47a}\\x{1f47b}\\x{1f47c}\\x{1f47d}\\x{1f47e}\\x{1f47f}\\x{1f480}\\x{1f481}\\x{1f482}\\x{1f483}\\x{1f484}\\x{1f485}\\x{1f486}\\x{1f487}\\x{1f488}\\x{1f489}\\x{1f48a}\\x{1f48b}\\x{1f48c}\\x{1f48d}\\x{1f48e}\\x{1f48f}\\x{1f490}\\x{1f491}\\x{1f492}\\x{1f493}\\x{1f494}\\x{1f495}\\x{1f496}\\x{1f497}\\x{1f498}\\x{1f499}\\x{1f49a}\\x{1f49b}\\x{1f49c}\\x{1f49d}\\x{1f49e}\\x{1f49f}\\x{1f4a0}\\x{1f4a1}\\x{1f4a2}\\x{1f4a3}\\x{1f4a4}\\x{1f4a5}\\x{1f4a6}\\x{1f4a7}\\x{1f4a8}\\x{1f4a9}\\x{1f4aa}\\x{1f4ab}\\x{1f4ac}\\x{1f4ad}\\x{1f4ae}\\x{1f4af}\\x{1f4b0}\\x{1f4b1}\\x{1f4b2}\\x{1f4b3}\\x{1f4b4}\\x{1f4b5}\\x{1f4b6}\\x{1f4b7}\\x{1f4b8}\\x{1f4b9}\\x{1f4ba}\\x{1f4bb}\\x{1f4bc}\\x{1f4bd}\\x{1f4be}\\x{1f4bf}\\x{1f4c0}\\x{1f4c1}\\x{1f4c2}\\x{1f4c3}\\x{1f4c4}\\x{1f4c5}\\x{1f4c6}\\x{1f4c7}\\x{1f4c8}\\x{1f4c9}\\x{1f4ca}\\x{1f4cb}\\x{1f4cc}\\x{1f4cd}\\x{1f4ce}\\x{1f4cf}\\x{1f4d0}\\x{1f4d1}\\x{1f4d2}\\x{1f4d3}\\x{1f4d4}\\x{1f4d5}\\x{1f4d6}\\x{1f4d7}\\x{1f4d8}\\x{1f4d9}\\x{1f4da}\\x{1f4db}\\x{1f4dc}\\x{1f4dd}\\x{1f4de}\\x{1f4df}\\x{1f4e0}\\x{1f4e1}\\x{1f4e2}\\x{1f4e3}\\x{1f4e4}\\x{1f4e5}\\x{1f4e6}\\x{1f4e7}\\x{1f4e8}\\x{1f4e9}\\x{1f4ea}\\x{1f4eb}\\x{1f4ec}\\x{1f4ed}\\x{1f4ee}\\x{1f4ef}\\x{1f4f0}\\x{1f4f1}\\x{1f4f2}\\x{1f4f3}\\x{1f4f4}\\x{1f4f5}\\x{1f4f6}\\x{1f4f7}\\x{1f4f9}\\x{1f4fa}\\x{1f4fb}\\x{1f4fc}\\x{1f500}\\x{1f501}\\x{1f502}\\x{1f503}\\x{1f504}\\x{1f505
}\\x{1f506}\\x{1f507}\\x{1f508}\\x{1f509}\\x{1f50a}\\x{1f50b}\\x{1f50c}\\x{1f50d}\\x{1f50e}\\x{1f50f}\\x{1f510}\\x{1f511}\\x{1f512}\\x{1f513}\\x{1f514}\\x{1f515}\\x{1f516}\\x{1f517}\\x{1f518}\\x{1f519}\\x{1f51a}\\x{1f51b}\\x{1f51c}\\x{1f51d}\\x{1f51e}\\x{1f51f}\\x{1f520}\\x{1f521}\\x{1f522}\\x{1f523}\\x{1f524}\\x{1f525}\\x{1f526}\\x{1f527}\\x{1f528}\\x{1f529}\\x{1f52a}\\x{1f52b}\\x{1f52c}\\x{1f52d}\\x{1f52e}\\x{1f52f}\\x{1f530}\\x{1f531}\\x{1f532}\\x{1f533}\\x{1f534}\\x{1f535}\\x{1f536}\\x{1f537}\\x{1f538}\\x{1f539}\\x{1f53a}\\x{1f53b}\\x{1f53c}\\x{1f53d}\\x{1f550}\\x{1f551}\\x{1f552}\\x{1f553}\\x{1f554}\\x{1f555}\\x{1f556}\\x{1f557}\\x{1f558}\\x{1f559}\\x{1f55a}\\x{1f55b}\\x{1f55c}\\x{1f55d}\\x{1f55e}\\x{1f55f}\\x{1f560}\\x{1f561}\\x{1f562}\\x{1f563}\\x{1f564}\\x{1f565}\\x{1f566}\\x{1f567}\\x{1f5fb}\\x{1f5fc}\\x{1f5fd}\\x{1f5fe}\\x{1f5ff}\\x{1f600}\\x{1f601}\\x{1f602}\\x{1f603}\\x{1f604}\\x{1f605}\\x{1f606}\\x{1f607}\\x{1f608}\\x{1f609}\\x{1f60a}\\x{1f60b}\\x{1f60c}\\x{1f60d}\\x{1f60e}\\x{1f60f}\\x{1f610}\\x{1f611}\\x{1f612}\\x{1f613}\\x{1f614}\\x{1f615}\\x{1f616}\\x{1f617}\\x{1f618}\\x{1f619}\\x{1f61a}\\x{1f61b}\\x{1f61c}\\x{1f61d}\\x{1f61e}\\x{1f61f}\\x{1f620}\\x{1f621}\\x{1f622}\\x{1f623}\\x{1f624}\\x{1f625}\\x{1f626}\\x{1f627}\\x{1f628}\\x{1f629}\\x{1f62a}\\x{1f62b}\\x{1f62c}\\x{1f62d}\\x{1f62e}\\x{1f62f}\\x{1f630}\\x{1f631}\\x{1f632}\\x{1f633}\\x{1f634}\\x{1f635}\\x{1f636}\\x{1f637}\\x{1f638}\\x{1f639}\\x{1f63a}\\x{1f63b}\\x{1f63c}\\x{1f63d}\\x{1f63e}\\x{1f63f}\\x{1f640}\\x{1f645}\\x{1f646}\\x{1f647}\\x{1f648}\\x{1f649}\\x{1f64a}\\x{1f64b}\\x{1f64c}\\x{1f64d}\\x{1f64e}\\x{1f64f}\\x{1f680}\\x{1f681}\\x{1f682}\\x{1f683}\\x{1f684}\\x{1f685}\\x{1f686}\\x{1f687}\\x{1f688}\\x{1f689}\\x{1f68a}\\x{1f68b}\\x{1f68c}\\x{1f68d}\\x{1f68e}\\x{1f68f}\\x{1f690}\\x{1f691}\\x{1f692}\\x{1f693}\\x{1f694}\\x{1f695}\\x{1f696}\\x{1f697}\\x{1f698}\\x{1f699}\\x{1f69a}\\x{1f69b}\\x{1f69c}\\x{1f69d}\\x{1f69e}\\x{1f69f}\\x{1f6a0}\\x{1f6a1}\\x{1f6a2}\\x{1f6a3}\\x{1f6a4}\\x{1f6a5}\\x{1f6a6
}\\x{1f6a7}\\x{1f6a8}\\x{1f6a9}\\x{1f6aa}\\x{1f6ab}\\x{1f6ac}\\x{1f6ad}\\x{1f6ae}\\x{1f6af}\\x{1f6b0}\\x{1f6b1}\\x{1f6b2}\\x{1f6b3}\\x{1f6b4}\\x{1f6b5}\\x{1f6b6}\\x{1f6b7}\\x{1f6b8}\\x{1f6b9}\\x{1f6ba}\\x{1f6bb}\\x{1f6bc}\\x{1f6bd}\\x{1f6be}\\x{1f6bf}\\x{1f6c0}\\x{1f6c1}\\x{1f6c2}\\x{1f6c3}\\x{1f6c4}\\x{1f6c5}\\x{fe4e5}\\x{fe4e6}\\x{fe4e7}\\x{fe4e8}\\x{fe4e9}\\x{fe4ea}\\x{fe4eb}\\x{fe4ec}\\x{fe4ed}\\x{fe4ee}\\x{fe82c}\\x{fe82e}\\x{fe82f}\\x{fe830}\\x{fe831}\\x{fe832}\\x{fe833}\\x{fe834}\\x{fe835}\\x{fe836}\\x{fe837}]" - ) - .removeAll(new UnicodeSet("[:M:]")), "Emoji-Color"); + DATA.putAll( + new UnicodeSet( + "[\\x{20e3}\\x{2122}\\x{231a}\\x{231b}\\x{23e9}\\x{23ea}\\x{23eb}\\x{23ec}\\x{23f0}\\x{23f3}\\x{2601}\\x{2611}\\x{2614}\\x{2615}\\x{261d}\\x{263a}\\x{2648}\\x{2649}\\x{264a}\\x{264b}\\x{264c}\\x{264d}\\x{264e}\\x{264f}\\x{2650}\\x{2651}\\x{2652}\\x{2653}\\x{267b}\\x{267f}\\x{2693}\\x{26a0}\\x{26a1}\\x{26bd}\\x{26be}\\x{26c4}\\x{26c5}\\x{26ce}\\x{26d4}\\x{26ea}\\x{26f2}\\x{26f3}\\x{26f5}\\x{26fa}\\x{26fd}\\x{2705}\\x{270a}\\x{270b}\\x{270c}\\x{2728}\\x{274c}\\x{274e}\\x{2753}\\x{2754}\\x{2755}\\x{2795}\\x{2796}\\x{2797}\\x{27b0}\\x{27bf}\\x{3030}\\x{303d}\\x{1f004}\\x{1f0cf}\\x{1f170}\\x{1f171}\\x{1f17e}\\x{1f17f}\\x{1f18e}\\x{1f191}\\x{1f192}\\x{1f193}\\x{1f194}\\x{1f195}\\x{1f196}\\x{1f197}\\x{1f198}\\x{1f199}\\x{1f19a}\\x{1f1e6}\\x{1f1e7}\\x{1f1e8}\\x{1f1e9}\\x{1f1ea}\\x{1f1eb}\\x{1f1ec}\\x{1f1ed}\\x{1f1ee}\\x{1f1ef}\\x{1f1f0}\\x{1f1f1}\\x{1f1f2}\\x{1f1f3}\\x{1f1f4}\\x{1f1f5}\\x{1f1f6}\\x{1f1f7}\\x{1f1f8}\\x{1f1f9}\\x{1f1fa}\\x{1f1fb}\\x{1f1fc}\\x{1f1fd}\\x{1f1fe}\\x{1f1ff}\\x{1f201}\\x{1f202}\\x{1f21a}\\x{1f22f}\\x{1f232}\\x{1f233}\\x{1f234}\\x{1f235}\\x{1f236}\\x{1f237}\\x{1f238}\\x{1f239}\\x{1f23a}\\x{1f250}\\x{1f251}\\x{1f300}\\x{1f301}\\x{1f302}\\x{1f303}\\x{1f304}\\x{1f305}\\x{1f306}\\x{1f307}\\x{1f308}\\x{1f309}\\x{1f30a}\\x{1f30b}\\x{1f30c}\\x{1f30d}\\x{1f30e}\\x{1f30f}\\x{1f310}\\x{1f311}\\x{1f312}\\x{1f313}\\x{1f314}\\x{1f315}\\x{1f316}\\x{1f317}\\
x{1f318}\\x{1f319}\\x{1f31a}\\x{1f31b}\\x{1f31c}\\x{1f31d}\\x{1f31e}\\x{1f31f}\\x{1f320}\\x{1f330}\\x{1f331}\\x{1f332}\\x{1f333}\\x{1f334}\\x{1f335}\\x{1f337}\\x{1f338}\\x{1f339}\\x{1f33a}\\x{1f33b}\\x{1f33c}\\x{1f33d}\\x{1f33e}\\x{1f33f}\\x{1f340}\\x{1f341}\\x{1f342}\\x{1f343}\\x{1f344}\\x{1f345}\\x{1f346}\\x{1f347}\\x{1f348}\\x{1f349}\\x{1f34a}\\x{1f34b}\\x{1f34c}\\x{1f34d}\\x{1f34e}\\x{1f34f}\\x{1f350}\\x{1f351}\\x{1f352}\\x{1f353}\\x{1f354}\\x{1f355}\\x{1f356}\\x{1f357}\\x{1f358}\\x{1f359}\\x{1f35a}\\x{1f35b}\\x{1f35c}\\x{1f35d}\\x{1f35e}\\x{1f35f}\\x{1f360}\\x{1f361}\\x{1f362}\\x{1f363}\\x{1f364}\\x{1f365}\\x{1f366}\\x{1f367}\\x{1f368}\\x{1f369}\\x{1f36a}\\x{1f36b}\\x{1f36c}\\x{1f36d}\\x{1f36e}\\x{1f36f}\\x{1f370}\\x{1f371}\\x{1f372}\\x{1f373}\\x{1f374}\\x{1f375}\\x{1f376}\\x{1f377}\\x{1f378}\\x{1f379}\\x{1f37a}\\x{1f37b}\\x{1f37c}\\x{1f380}\\x{1f381}\\x{1f382}\\x{1f383}\\x{1f384}\\x{1f385}\\x{1f386}\\x{1f387}\\x{1f388}\\x{1f389}\\x{1f38a}\\x{1f38b}\\x{1f38c}\\x{1f38d}\\x{1f38e}\\x{1f38f}\\x{1f390}\\x{1f391}\\x{1f392}\\x{1f393}\\x{1f3a0}\\x{1f3a1}\\x{1f3a2}\\x{1f3a3}\\x{1f3a4}\\x{1f3a5}\\x{1f3a6}\\x{1f3a7}\\x{1f3a8}\\x{1f3a9}\\x{1f3aa}\\x{1f3ab}\\x{1f3ac}\\x{1f3ad}\\x{1f3ae}\\x{1f3af}\\x{1f3b0}\\x{1f3b1}\\x{1f3b2}\\x{1f3b3}\\x{1f3b4}\\x{1f3b5}\\x{1f3b6}\\x{1f3b7}\\x{1f3b8}\\x{1f3b9}\\x{1f3ba}\\x{1f3bb}\\x{1f3bc}\\x{1f3bd}\\x{1f3be}\\x{1f3bf}\\x{1f3c0}\\x{1f3c1}\\x{1f3c2}\\x{1f3c3}\\x{1f3c4}\\x{1f3c6}\\x{1f3c7}\\x{1f3c8}\\x{1f3c9}\\x{1f3ca}\\x{1f3e0}\\x{1f3e1}\\x{1f3e2}\\x{1f3e3}\\x{1f3e4}\\x{1f3e5}\\x{1f3e6}\\x{1f3e7}\\x{1f3e8}\\x{1f3e9}\\x{1f3ea}\\x{1f3eb}\\x{1f3ec}\\x{1f3ed}\\x{1f3ee}\\x{1f3ef}\\x{1f3f0}\\x{1f400}\\x{1f401}\\x{1f402}\\x{1f403}\\x{1f404}\\x{1f405}\\x{1f406}\\x{1f407}\\x{1f408}\\x{1f409}\\x{1f40a}\\x{1f40b}\\x{1f40c}\\x{1f40d}\\x{1f40e}\\x{1f40f}\\x{1f410}\\x{1f411}\\x{1f412}\\x{1f413}\\x{1f414}\\x{1f415}\\x{1f416}\\x{1f417}\\x{1f418}\\x{1f419}\\x{1f41a}\\x{1f41b}\\x{1f41c}\\x{1f41d}\\x{1f41e}\\x{1f41f}\\x{1f420}\\x{1f421}\\x{1f422}\\x{1f423}\\
x{1f424}\\x{1f425}\\x{1f426}\\x{1f427}\\x{1f428}\\x{1f429}\\x{1f42a}\\x{1f42b}\\x{1f42c}\\x{1f42d}\\x{1f42e}\\x{1f42f}\\x{1f430}\\x{1f431}\\x{1f432}\\x{1f433}\\x{1f434}\\x{1f435}\\x{1f436}\\x{1f437}\\x{1f438}\\x{1f439}\\x{1f43a}\\x{1f43b}\\x{1f43c}\\x{1f43d}\\x{1f43e}\\x{1f440}\\x{1f442}\\x{1f443}\\x{1f444}\\x{1f445}\\x{1f446}\\x{1f447}\\x{1f448}\\x{1f449}\\x{1f44a}\\x{1f44b}\\x{1f44c}\\x{1f44d}\\x{1f44e}\\x{1f44f}\\x{1f450}\\x{1f451}\\x{1f452}\\x{1f453}\\x{1f454}\\x{1f455}\\x{1f456}\\x{1f457}\\x{1f458}\\x{1f459}\\x{1f45a}\\x{1f45b}\\x{1f45c}\\x{1f45d}\\x{1f45e}\\x{1f45f}\\x{1f460}\\x{1f461}\\x{1f462}\\x{1f463}\\x{1f464}\\x{1f465}\\x{1f466}\\x{1f467}\\x{1f468}\\x{1f469}\\x{1f46a}\\x{1f46b}\\x{1f46c}\\x{1f46d}\\x{1f46e}\\x{1f46f}\\x{1f470}\\x{1f471}\\x{1f472}\\x{1f473}\\x{1f474}\\x{1f475}\\x{1f476}\\x{1f477}\\x{1f478}\\x{1f479}\\x{1f47a}\\x{1f47b}\\x{1f47c}\\x{1f47d}\\x{1f47e}\\x{1f47f}\\x{1f480}\\x{1f481}\\x{1f482}\\x{1f483}\\x{1f484}\\x{1f485}\\x{1f486}\\x{1f487}\\x{1f488}\\x{1f489}\\x{1f48a}\\x{1f48b}\\x{1f48c}\\x{1f48d}\\x{1f48e}\\x{1f48f}\\x{1f490}\\x{1f491}\\x{1f492}\\x{1f493}\\x{1f494}\\x{1f495}\\x{1f496}\\x{1f497}\\x{1f498}\\x{1f499}\\x{1f49a}\\x{1f49b}\\x{1f49c}\\x{1f49d}\\x{1f49e}\\x{1f49f}\\x{1f4a0}\\x{1f4a1}\\x{1f4a2}\\x{1f4a3}\\x{1f4a4}\\x{1f4a5}\\x{1f4a6}\\x{1f4a7}\\x{1f4a8}\\x{1f4a9}\\x{1f4aa}\\x{1f4ab}\\x{1f4ac}\\x{1f4ad}\\x{1f4ae}\\x{1f4af}\\x{1f4b0}\\x{1f4b1}\\x{1f4b2}\\x{1f4b3}\\x{1f4b4}\\x{1f4b5}\\x{1f4b6}\\x{1f4b7}\\x{1f4b8}\\x{1f4b9}\\x{1f4ba}\\x{1f4bb}\\x{1f4bc}\\x{1f4bd}\\x{1f4be}\\x{1f4bf}\\x{1f4c0}\\x{1f4c1}\\x{1f4c2}\\x{1f4c3}\\x{1f4c4}\\x{1f4c5}\\x{1f4c6}\\x{1f4c7}\\x{1f4c8}\\x{1f4c9}\\x{1f4ca}\\x{1f4cb}\\x{1f4cc}\\x{1f4cd}\\x{1f4ce}\\x{1f4cf}\\x{1f4d0}\\x{1f4d1}\\x{1f4d2}\\x{1f4d3}\\x{1f4d4}\\x{1f4d5}\\x{1f4d6}\\x{1f4d7}\\x{1f4d8}\\x{1f4d9}\\x{1f4da}\\x{1f4db}\\x{1f4dc}\\x{1f4dd}\\x{1f4de}\\x{1f4df}\\x{1f4e0}\\x{1f4e1}\\x{1f4e2}\\x{1f4e3}\\x{1f4e4}\\x{1f4e5}\\x{1f4e6}\\x{1f4e7}\\x{1f4e8}\\x{1f4e9}\\x{1f4ea}\\x{1f4eb}\\x{1f4ec}\\x{1f4ed}\\
x{1f4ee}\\x{1f4ef}\\x{1f4f0}\\x{1f4f1}\\x{1f4f2}\\x{1f4f3}\\x{1f4f4}\\x{1f4f5}\\x{1f4f6}\\x{1f4f7}\\x{1f4f9}\\x{1f4fa}\\x{1f4fb}\\x{1f4fc}\\x{1f500}\\x{1f501}\\x{1f502}\\x{1f503}\\x{1f504}\\x{1f505}\\x{1f506}\\x{1f507}\\x{1f508}\\x{1f509}\\x{1f50a}\\x{1f50b}\\x{1f50c}\\x{1f50d}\\x{1f50e}\\x{1f50f}\\x{1f510}\\x{1f511}\\x{1f512}\\x{1f513}\\x{1f514}\\x{1f515}\\x{1f516}\\x{1f517}\\x{1f518}\\x{1f519}\\x{1f51a}\\x{1f51b}\\x{1f51c}\\x{1f51d}\\x{1f51e}\\x{1f51f}\\x{1f520}\\x{1f521}\\x{1f522}\\x{1f523}\\x{1f524}\\x{1f525}\\x{1f526}\\x{1f527}\\x{1f528}\\x{1f529}\\x{1f52a}\\x{1f52b}\\x{1f52c}\\x{1f52d}\\x{1f52e}\\x{1f52f}\\x{1f530}\\x{1f531}\\x{1f532}\\x{1f533}\\x{1f534}\\x{1f535}\\x{1f536}\\x{1f537}\\x{1f538}\\x{1f539}\\x{1f53a}\\x{1f53b}\\x{1f53c}\\x{1f53d}\\x{1f550}\\x{1f551}\\x{1f552}\\x{1f553}\\x{1f554}\\x{1f555}\\x{1f556}\\x{1f557}\\x{1f558}\\x{1f559}\\x{1f55a}\\x{1f55b}\\x{1f55c}\\x{1f55d}\\x{1f55e}\\x{1f55f}\\x{1f560}\\x{1f561}\\x{1f562}\\x{1f563}\\x{1f564}\\x{1f565}\\x{1f566}\\x{1f567}\\x{1f5fb}\\x{1f5fc}\\x{1f5fd}\\x{1f5fe}\\x{1f5ff}\\x{1f600}\\x{1f601}\\x{1f602}\\x{1f603}\\x{1f604}\\x{1f605}\\x{1f606}\\x{1f607}\\x{1f608}\\x{1f609}\\x{1f60a}\\x{1f60b}\\x{1f60c}\\x{1f60d}\\x{1f60e}\\x{1f60f}\\x{1f610}\\x{1f611}\\x{1f612}\\x{1f613}\\x{1f614}\\x{1f615}\\x{1f616}\\x{1f617}\\x{1f618}\\x{1f619}\\x{1f61a}\\x{1f61b}\\x{1f61c}\\x{1f61d}\\x{1f61e}\\x{1f61f}\\x{1f620}\\x{1f621}\\x{1f622}\\x{1f623}\\x{1f624}\\x{1f625}\\x{1f626}\\x{1f627}\\x{1f628}\\x{1f629}\\x{1f62a}\\x{1f62b}\\x{1f62c}\\x{1f62d}\\x{1f62e}\\x{1f62f}\\x{1f630}\\x{1f631}\\x{1f632}\\x{1f633}\\x{1f634}\\x{1f635}\\x{1f636}\\x{1f637}\\x{1f638}\\x{1f639}\\x{1f63a}\\x{1f63b}\\x{1f63c}\\x{1f63d}\\x{1f63e}\\x{1f63f}\\x{1f640}\\x{1f645}\\x{1f646}\\x{1f647}\\x{1f648}\\x{1f649}\\x{1f64a}\\x{1f64b}\\x{1f64c}\\x{1f64d}\\x{1f64e}\\x{1f64f}\\x{1f680}\\x{1f681}\\x{1f682}\\x{1f683}\\x{1f684}\\x{1f685}\\x{1f686}\\x{1f687}\\x{1f688}\\x{1f689}\\x{1f68a}\\x{1f68b}\\x{1f68c}\\x{1f68d}\\x{1f68e}\\x{1f68f}\\x{1f690}\\x{1f691}\\x{1f692}\\
x{1f693}\\x{1f694}\\x{1f695}\\x{1f696}\\x{1f697}\\x{1f698}\\x{1f699}\\x{1f69a}\\x{1f69b}\\x{1f69c}\\x{1f69d}\\x{1f69e}\\x{1f69f}\\x{1f6a0}\\x{1f6a1}\\x{1f6a2}\\x{1f6a3}\\x{1f6a4}\\x{1f6a5}\\x{1f6a6}\\x{1f6a7}\\x{1f6a8}\\x{1f6a9}\\x{1f6aa}\\x{1f6ab}\\x{1f6ac}\\x{1f6ad}\\x{1f6ae}\\x{1f6af}\\x{1f6b0}\\x{1f6b1}\\x{1f6b2}\\x{1f6b3}\\x{1f6b4}\\x{1f6b5}\\x{1f6b6}\\x{1f6b7}\\x{1f6b8}\\x{1f6b9}\\x{1f6ba}\\x{1f6bb}\\x{1f6bc}\\x{1f6bd}\\x{1f6be}\\x{1f6bf}\\x{1f6c0}\\x{1f6c1}\\x{1f6c2}\\x{1f6c3}\\x{1f6c4}\\x{1f6c5}\\x{fe4e5}\\x{fe4e6}\\x{fe4e7}\\x{fe4e8}\\x{fe4e9}\\x{fe4ea}\\x{fe4eb}\\x{fe4ec}\\x{fe4ed}\\x{fe4ee}\\x{fe82c}\\x{fe82e}\\x{fe82f}\\x{fe830}\\x{fe831}\\x{fe832}\\x{fe833}\\x{fe834}\\x{fe835}\\x{fe836}\\x{fe837}]") + .removeAll(new UnicodeSet("[:M:]")), + "Emoji-Color"); Splitter lineSplitter = Splitter.onPattern("\\.\\.|;").trimResults(); for (String line : FileUtilities.in(Emoji.class, "notoCoverage.txt")) { // 2F83B ; NotoSansCJKjp-Black // 2F83F..2F840 ; NotoSansCJKjp-Black List items = lineSplitter.splitToList(line); String codePoint = Utility.fromHex(items.get(0)); - switch(items.size()) { - case 3: - String codePointEnd = Utility.fromHex(items.get(1)); - DATA.putAll(codePoint.codePointAt(0), codePointEnd.codePointAt(0), items.get(1)); - break; - case 2: - DATA.put(codePoint, items.get(1)); - break; - default: - throw new IllegalArgumentException(); + switch (items.size()) { + case 3: + String codePointEnd = Utility.fromHex(items.get(1)); + DATA.putAll( + codePoint.codePointAt(0), codePointEnd.codePointAt(0), items.get(1)); + break; + case 2: + DATA.put(codePoint, items.get(1)); + break; + default: + throw new IllegalArgumentException(); } - } DATA.freeze(); } + public static UnicodeMap getData() { return DATA; } + public static boolean isCovered(int cp) { return DATA.containsKey(cp); } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/OldEmojiProcessing.java b/unicodetools/src/main/java/org/unicode/text/tools/OldEmojiProcessing.java index 
c9730b371..3094e5934 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/OldEmojiProcessing.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/OldEmojiProcessing.java @@ -1,22 +1,21 @@ package org.unicode.text.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Row; import java.util.HashMap; import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.TreeSet; - import org.unicode.text.utility.Utility; import org.unicode.tools.emoji.Emoji; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Row; - public class OldEmojiProcessing { static final Set ANDROID_IMAGES = new TreeSet<>(); - static final Map,Integer> ANDROID_REMAP = new HashMap<>(); + static final Map, Integer> ANDROID_REMAP = new HashMap<>(); static final UnicodeMap ANDROID_REMAP_VALUES = new UnicodeMap(); + static { addAndroidRemap("🇨🇳", 0xFE4ED); // cn addAndroidRemap("🇩🇪", 0xFE4E8); // de @@ -30,7 +29,8 @@ public class OldEmojiProcessing { addAndroidRemap("🇺🇸", 0xFE4E6); // us addAndroidRemap("#⃣", 0xFE82C); for (int i = 1; i <= 9; ++i) { - addAndroidRemap((char)('0' + i) + "" + Emoji.ENCLOSING_KEYCAP, 0xFE82D + i); // 1 => U+FE82E + addAndroidRemap( + (char) ('0' + i) + "" + Emoji.ENCLOSING_KEYCAP, 0xFE82D + i); // 1 => U+FE82E } addAndroidRemap("0⃣", 0xFE837); } @@ -38,9 +38,10 @@ public class OldEmojiProcessing { public static Integer addAndroidRemap(String real, int replacement) { ANDROID_REMAP_VALUES.put(replacement, real); int first = real.codePointAt(0); - return ANDROID_REMAP.put(Row.of(first, real.codePointAt(Character.charCount(first))), replacement); + return ANDROID_REMAP.put( + Row.of(first, real.codePointAt(Character.charCount(first))), replacement); } - + public static String androidPng(int firstCodepoint, int secondCodepoint, boolean first) { if (secondCodepoint == Emoji.ENCLOSING_KEYCAP) { int debug = 0; @@ -56,18 +57,22 @@ public static String androidPng(int firstCodepoint, int secondCodepoint, boolean 
return null; } } - String filename = "android/emoji_u" + Utility.hex(first ? firstCodepoint : secondCodepoint).toLowerCase(Locale.ENGLISH) + ".png"; + String filename = + "android/emoji_u" + + Utility.hex(first ? firstCodepoint : secondCodepoint) + .toLowerCase(Locale.ENGLISH) + + ".png"; ANDROID_IMAGES.add(filename); return filename; } - //Collator.getInstance(ULocale.ENGLISH); + // Collator.getInstance(ULocale.ENGLISH); // static final Map REMAP_FLAGS = new HashMap(); // static { // addFlagRemap("BL", "FR"); // addFlagRemap("BV", "NO"); // addFlagRemap("GF", "FR"); - // addFlagRemap("HM", "AU"); + // addFlagRemap("HM", "AU"); // addFlagRemap("MF", "FR"); // addFlagRemap("RE", "FR"); // addFlagRemap("SJ", "NO"); @@ -79,11 +84,9 @@ public static String androidPng(int firstCodepoint, int secondCodepoint, boolean // public static void addFlagRemap(String originalCountry, String replacementCountry) { // REMAP_FLAGS.put(originalCountry, replacementCountry); // System.out.println( - // Emoji.buildFileName(Emoji.getHexFromFlagCode(originalCountry),"_") + // Emoji.buildFileName(Emoji.getHexFromFlagCode(originalCountry),"_") // + " => " // + Emoji.buildFileName(Emoji.getHexFromFlagCode(replacementCountry),"_")); // } - - } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/PropertyChanges.java b/unicodetools/src/main/java/org/unicode/text/tools/PropertyChanges.java index 14d7428c5..3eb2e2b78 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/PropertyChanges.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/PropertyChanges.java @@ -1,18 +1,16 @@ package org.unicode.text.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; import java.util.Arrays; import java.util.LinkedHashSet; import java.util.Objects; import java.util.Set; - import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues; import 
org.unicode.props.UcdPropertyValues.Age_Values; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; - public class PropertyChanges { public static void main(String[] args) { checkProperties(); @@ -26,8 +24,8 @@ static void checkProperties() { ages.add(Age_Values.V10_0); System.out.println(ages); - - UnicodeMap ageMap = latest.loadEnum(UcdProperty.Age, UcdPropertyValues.Age_Values.class); + UnicodeMap ageMap = + latest.loadEnum(UcdProperty.Age, UcdPropertyValues.Age_Values.class); for (UcdProperty prop : UcdProperty.values()) { if (prop == UcdProperty.Age) { continue; @@ -62,7 +60,8 @@ public Changes(int sameItems, int changedItems) { this.changedItems = changedItems; } - static Changes getChanges(UnicodeMap oldMap, UnicodeMap newMap, UnicodeSet oldChars) { + static Changes getChanges( + UnicodeMap oldMap, UnicodeMap newMap, UnicodeSet oldChars) { int sameItems = 0; int changedItems = 0; for (String c : oldChars) { @@ -78,12 +77,13 @@ static Changes getChanges(UnicodeMap oldMap, UnicodeMap newMap, } } - // private static void checkDates() { // long base = new Date(2017-1900, 0, 15).getTime(); // for (String locale : Arrays.asList("en", "zh")) { - // DateFormat df = DateFormat.getInstanceForSkeleton("hCCC", ULocale.forLanguageTag(locale)); - // System.out.println("locale: " + locale + "\tpattern: " + ((SimpleDateFormat) df).toPattern()); + // DateFormat df = DateFormat.getInstanceForSkeleton("hCCC", + // ULocale.forLanguageTag(locale)); + // System.out.println("locale: " + locale + "\tpattern: " + ((SimpleDateFormat) + // df).toPattern()); // //DateFormat df = new SimpleDateFormat("h BBBB", ULocale.forLanguageTag(locale)); // for (int hour = 0; hour < 24; ++hour) { // String formatted = df.format(base+hour*3600000); diff --git a/unicodetools/src/main/java/org/unicode/text/tools/RecommendedSetGenerator.java b/unicodetools/src/main/java/org/unicode/text/tools/RecommendedSetGenerator.java index f989ba4cf..3029d802f 100644 --- 
a/unicodetools/src/main/java/org/unicode/text/tools/RecommendedSetGenerator.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/RecommendedSetGenerator.java @@ -1,136 +1,137 @@ package org.unicode.text.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; import java.util.Set; - import org.unicode.text.UCD.IdentifierInfo.Identifier_Status; import org.unicode.text.UCD.IdentifierInfo.Identifier_Type; import org.unicode.text.utility.Settings; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; - /** * Generates the recommended UnicodeSet according to UTS 39. Used for updating the constant in ICU * SpoofChecker. - * + * * @author Shane Carr * @see com.ibm.icu.text.SpoofChecker */ public class RecommendedSetGenerator { - public static void main(String[] args) { - Sets sets = generateSet(); - System.out.println("# inclusion: \n" + sets.inclusion.toString()); - System.out.println("\n# recommended: \n" + sets.recommended.toString()); - System.out.println("\n\nJava Version:\n\n"); - System.out.println(uniSetToCodeString(sets.inclusion, "INCLUSION", true)); - System.out.println(uniSetToCodeString(sets.recommended, "RECOMMENDED", true)); - System.out.println("\n\nC++ Version:\n\n"); - System.out.println(uniSetToCodeString(sets.inclusion, "inclusionPat", false)); - System.out.println(uniSetToCodeString(sets.recommended, "recommendedPat", false)); - } - - public static String uniSetToCodeString(UnicodeSet uniset, String varName, boolean isJava) { - String str = uniset.toString().replace("\\", "\\\\"); - StringBuilder result = new StringBuilder(); - if (isJava) { - result.append(" public static final UnicodeSet " + varName + " = new UnicodeSet("); - } else { - result.append(" static const char16_t *" + varName + " ="); + public static void main(String[] args) { + Sets sets = generateSet(); + System.out.println("# inclusion: \n" + sets.inclusion.toString()); + System.out.println("\n# recommended: \n" + 
sets.recommended.toString()); + System.out.println("\n\nJava Version:\n\n"); + System.out.println(uniSetToCodeString(sets.inclusion, "INCLUSION", true)); + System.out.println(uniSetToCodeString(sets.recommended, "RECOMMENDED", true)); + System.out.println("\n\nC++ Version:\n\n"); + System.out.println(uniSetToCodeString(sets.inclusion, "inclusionPat", false)); + System.out.println(uniSetToCodeString(sets.recommended, "recommendedPat", false)); } - for (int i = 0; i < str.length();) { - // split into short lines - int end = i + 75; - if (end > str.length()) { - end = str.length(); - } - // break before an escape, not in the middle - // 11 = "\\\\U0010FFFF".length() - int min = end - 11; - if (min < i) { min = i; } - char nextChar = 0; - for (int j = end; min < j;) { - char c = str.charAt(--j); - if (c == '\\') { - if ((nextChar == 'u' && (end - j) >= 6) || (nextChar == 'U' && (end - j) >= 10)) { - // The escape sequence is completely on this line. - } else { - // Truncate before double escape. - if (i < j && str.charAt(j - 1) == '\\') { - --j; + + public static String uniSetToCodeString(UnicodeSet uniset, String varName, boolean isJava) { + String str = uniset.toString().replace("\\", "\\\\"); + StringBuilder result = new StringBuilder(); + if (isJava) { + result.append(" public static final UnicodeSet " + varName + " = new UnicodeSet("); + } else { + result.append(" static const char16_t *" + varName + " ="); + } + for (int i = 0; i < str.length(); ) { + // split into short lines + int end = i + 75; + if (end > str.length()) { + end = str.length(); + } + // break before an escape, not in the middle + // 11 = "\\\\U0010FFFF".length() + int min = end - 11; + if (min < i) { + min = i; } - // Do not truncate to nothing. 
- if (i < j) { - end = j; + char nextChar = 0; + for (int j = end; min < j; ) { + char c = str.charAt(--j); + if (c == '\\') { + if ((nextChar == 'u' && (end - j) >= 6) + || (nextChar == 'U' && (end - j) >= 10)) { + // The escape sequence is completely on this line. + } else { + // Truncate before double escape. + if (i < j && str.charAt(j - 1) == '\\') { + --j; + } + // Do not truncate to nothing. + if (i < j) { + end = j; + } + } + break; + } + nextChar = c; } - } - break; + String line = str.substring(i, end); + if (isJava) { + result.append("\n " + (i == 0 ? "\"" : "+ \"") + line + '"'); + } else { + result.append("\n u\"" + line + '"'); + } + i = end; } - nextChar = c; - } - String line = str.substring(i, end); - if (isJava) { - result.append("\n " + (i == 0 ? "\"" : "+ \"") + line + '"'); - } else { - result.append("\n u\"" + line + '"'); - } - i = end; + result.append(isJava ? "\n ).freeze();\n" : ";\n"); + return result.toString(); } - result.append(isJava ? "\n ).freeze();\n" : ";\n"); - return result.toString(); - } - public static Sets generateSet() { - String path = Settings.UnicodeTools.getDataPathStringForLatestVersion("security"); - XIDModifications inst = new XIDModifications(path); + public static Sets generateSet() { + String path = Settings.UnicodeTools.getDataPathStringForLatestVersion("security"); + XIDModifications inst = new XIDModifications(path); - // Compute sets based on status - UnicodeSet allowedS = new UnicodeSet(); - UnicodeSet restrictedS = new UnicodeSet(); - UnicodeMap statuses = inst.getStatus(); - for (String range : statuses) { - Identifier_Status status = statuses.get(range); - if (status == Identifier_Status.allowed) { - allowedS.add(range); - } else { - restrictedS.add(range); - } - } - allowedS.freeze(); - restrictedS.freeze(); + // Compute sets based on status + UnicodeSet allowedS = new UnicodeSet(); + UnicodeSet restrictedS = new UnicodeSet(); + UnicodeMap statuses = inst.getStatus(); + for (String range : statuses) { + 
Identifier_Status status = statuses.get(range); + if (status == Identifier_Status.allowed) { + allowedS.add(range); + } else { + restrictedS.add(range); + } + } + allowedS.freeze(); + restrictedS.freeze(); - // Compute sets based on types - UnicodeSet recommendedT = new UnicodeSet(); - UnicodeSet inclusionT = new UnicodeSet(); - UnicodeSet restrictedT = new UnicodeSet(); - UnicodeMap> typeses = inst.getType(); - for (String range : typeses) { - Set types = typeses.get(range); - if (types.contains(Identifier_Type.inclusion)) { - inclusionT.add(range); - } else if (types.contains(Identifier_Type.recommended)) { - recommendedT.add(range); - } else { - restrictedT.add(range); - } - } - recommendedT.freeze(); - inclusionT.freeze(); - restrictedT.freeze(); - assert restrictedS.equals(restrictedT); + // Compute sets based on types + UnicodeSet recommendedT = new UnicodeSet(); + UnicodeSet inclusionT = new UnicodeSet(); + UnicodeSet restrictedT = new UnicodeSet(); + UnicodeMap> typeses = inst.getType(); + for (String range : typeses) { + Set types = typeses.get(range); + if (types.contains(Identifier_Type.inclusion)) { + inclusionT.add(range); + } else if (types.contains(Identifier_Type.recommended)) { + recommendedT.add(range); + } else { + restrictedT.add(range); + } + } + recommendedT.freeze(); + inclusionT.freeze(); + restrictedT.freeze(); + assert restrictedS.equals(restrictedT); - // ALLOWED should be the union of RECOMMENDED and INCLUSION. - UnicodeSet allowed = recommendedT.cloneAsThawed().addAll(inclusionT).freeze(); - assert allowedS.equals(allowed); + // ALLOWED should be the union of RECOMMENDED and INCLUSION. 
+ UnicodeSet allowed = recommendedT.cloneAsThawed().addAll(inclusionT).freeze(); + assert allowedS.equals(allowed); - // Return value - Sets result = new Sets(); - result.inclusion = inclusionT; - result.recommended = recommendedT; - return result; - } + // Return value + Sets result = new Sets(); + result.inclusion = inclusionT; + result.recommended = recommendedT; + return result; + } - public static class Sets { - public UnicodeSet recommended; - public UnicodeSet inclusion; - } + public static class Sets { + public UnicodeSet recommended; + public UnicodeSet inclusion; + } } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/RegexBuilder.java b/unicodetools/src/main/java/org/unicode/text/tools/RegexBuilder.java index e77944278..c5e119436 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/RegexBuilder.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/RegexBuilder.java @@ -1,23 +1,22 @@ package org.unicode.text.tools; -import java.util.Locale; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.unicode.tools.emoji.Emoji; -import org.unicode.tools.emoji.EmojiData; - import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.CharSequences; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSet.EntryRange; +import java.util.Locale; public class RegexBuilder { - public enum Style {CODEPOINT_REGEX, CHAR_REGEX} - public static final UnicodeSet NEEDS_ESCAPE = new UnicodeSet("[[:di:][:Me:][:Mn:][:c:]]") -// .add(0x1F1E6,0x1F1FF) - .freeze(); + public enum Style { + CODEPOINT_REGEX, + CHAR_REGEX + } + + public static final UnicodeSet NEEDS_ESCAPE = + new UnicodeSet("[[:di:][:Me:][:Mn:][:c:]]") + // .add(0x1F1E6,0x1F1FF) + .freeze(); public static StringBuilder showSet(UnicodeSet us, StringBuilder output) { if (us.size() == 1) { @@ -30,7 +29,8 @@ public static StringBuilder showSet(UnicodeSet us, StringBuilder output) { if (count > 0) { if (count != 1) { 
output.append('-'); - }; + } + ; showChar(e.codepointEnd, output); } } @@ -59,8 +59,8 @@ public static StringBuilder showString(String s, StringBuilder output) { public static StringBuilder showChar(int cp, StringBuilder output) { if (NEEDS_ESCAPE.contains(cp)) { output.append("\\x{") - .append(Integer.toHexString(cp).toUpperCase(Locale.ROOT)) - .append("}"); + .append(Integer.toHexString(cp).toUpperCase(Locale.ROOT)) + .append("}"); } else { output.appendCodePoint(cp); } @@ -79,7 +79,8 @@ private class NodeF { public String toString() { return "〔" + plainSet + "/" + plainMap + "/" + optionalMap + "〕"; - }; + } + ; NodeF(Node source) { UnicodeSet finals = source.finals; @@ -89,16 +90,16 @@ public String toString() { plainSet = new UnicodeSet(finals).removeAll(multiKeys).freeze(); optionalMap = new UnicodeMap().putAll(continues).retainAll(finals).freeze(); plainMap = new UnicodeMap().putAll(continues).removeAll(finals).freeze(); - } + @Override public boolean equals(Object obj) { NodeF that = (NodeF) obj; - return that.plainSet.equals(plainSet) + return that.plainSet.equals(plainSet) && that.plainMap.equals(plainMap) - && that.optionalMap.equals(optionalMap) - ; + && that.optionalMap.equals(optionalMap); } + @Override public int hashCode() { int result = plainSet.hashCode(); @@ -123,12 +124,13 @@ UnicodeMap deepCopy(UnicodeMap continues) { return result.freeze(); } - private StringBuilder print(int depth, boolean showLevel, boolean optional, StringBuilder output) { + private StringBuilder print( + int depth, boolean showLevel, boolean optional, StringBuilder output) { - int mapItemCount = countItems(plainMap) - + countItems(optionalMap); + int mapItemCount = countItems(plainMap) + countItems(optionalMap); int setCount = plainSet.isEmpty() ? 
0 : 1; - final boolean needsParen = (mapItemCount + setCount) > 1 || optional && mapItemCount != 0; + final boolean needsParen = + (mapItemCount + setCount) > 1 || optional && mapItemCount != 0; if (needsParen) { output.append('('); @@ -153,7 +155,13 @@ private StringBuilder print(int depth, boolean showLevel, boolean optional, Stri return output; } - private boolean print(UnicodeMap plainMap, int depth, boolean showLevel, StringBuilder output, boolean first, boolean optional) { + private boolean print( + UnicodeMap plainMap, + int depth, + boolean showLevel, + StringBuilder output, + boolean first, + boolean optional) { for (NodeF n : plainMap.values()) { if (first) { first = false; @@ -165,15 +173,12 @@ private boolean print(UnicodeMap plainMap, int depth, boolean showLevel, } UnicodeSet us = plainMap.getSet(n); showSet(us, output); - n.print(depth+1, false, optional, output); + n.print(depth + 1, false, optional, output); } return first; } - - } - // TODO fix hashcode // TODO make strings be empty not null private static int countItems(UnicodeMap unicodeMap) { @@ -223,7 +228,7 @@ private void add(int[] list, int pos, Node data2) { if (node2 == null) { data2.continues.put(cp, node2 = new Node()); } - add(list, pos+1, node2); + add(list, pos + 1, node2); } } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/RenameFiles.java b/unicodetools/src/main/java/org/unicode/text/tools/RenameFiles.java index bc39ff993..08ee22be9 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/RenameFiles.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/RenameFiles.java @@ -1,5 +1,8 @@ package org.unicode.text.tools; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.Output; import java.io.File; import java.io.IOException; import java.nio.file.FileSystem; @@ -10,15 +13,10 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.util.RegexUtilities; 
import org.unicode.text.utility.Utility; import org.unicode.tools.emoji.Emoji; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.Output; - public class RenameFiles { // First set the source accordingly. @@ -27,7 +25,7 @@ public class RenameFiles { // Then set PREVIEW_ONLY to true to check that the right changes are done, // then to false to do them. private static final boolean PREVIEW_ONLY = false; - + private static final boolean RECURSIVE = true; // Modify the dir, regex, filter, and output-platform as needed @@ -35,42 +33,45 @@ public class RenameFiles { // Set PREVIEW to false, and run for real enum Choice { - fix_emoji_u("/Users/markdavis/Documents/workspace/unicodetools/data/images/svg", + fix_emoji_u( + "/Users/markdavis/Documents/workspace/unicodetools/data/images/svg", "emoji_u.*", - "emoji_u(?.*)\\.svg", - "emoji"), + "emoji_u(?.*)\\.svg", + "emoji"), flags( - "/Users/markdavis/Downloads/svg-flags", + "/Users/markdavis/Downloads/svg-flags", null, - "(?[A-Z]+)([-](?[A-Z]+))?\\.svg", + "(?[A-Z]+)([-](?[A-Z]+))?\\.svg", "emoji"), emojipedia( - "/Users/markdavis/Downloads/Emojipedia 11.0 Sample Images 72px", + "/Users/markdavis/Downloads/Emojipedia 11.0 Sample Images 72px", null, - "x?(?[-_A-Za-z0-9]+)?\\.png", + "x?(?[-_A-Za-z0-9]+)?\\.png", "emojipedia"), samsung( - "/Users/markdavis/Downloads/Samsung_Emoji_72x72_0322", + "/Users/markdavis/Downloads/Samsung_Emoji_72x72_0322", "^.*[^s].png$", - "samsung_(?[-_A-Za-z0-9]+)\\.png", + "samsung_(?[-_A-Za-z0-9]+)\\.png", "samsung"), emojione( - "/Users/markdavis/Downloads/joypixels_72", ///Users/markdavis/Downloads/png_72 + "/Users/markdavis/Downloads/joypixels_72", /// Users/markdavis/Downloads/png_72 "^.*.png$", - "joypixels_(?[-_A-Za-z0-9]+)\\.png", + "joypixels_(?[-_A-Za-z0-9]+)\\.png", "emojione"), twitter( - "/Users/markdavis/Downloads/twitter", //"/Users/markdavis/Downloads/72x72" + "/Users/markdavis/Downloads/twitter", // "/Users/markdavis/Downloads/72x72" 
"^(?!\\.).*.png$", - "(?:twitter[-_])?(?[-_A-Za-z0-9]+)\\.png", - "twitter"), - android("/Users/markdavis/Downloads/ExtractedEmojis", // uni1f1e6_uni1f1e8.png + "(?:twitter[-_])?(?[-_A-Za-z0-9]+)\\.png", + "twitter"), + android( + "/Users/markdavis/Downloads/ExtractedEmojis", // uni1f1e6_uni1f1e8.png "^(?!\\.).*.png$", - "(uni|emoji_)?(?[a-fA-F0-9]+(_[a-fA-F0-9]+)*)\\.png", + "(uni|emoji_)?(?[a-fA-F0-9]+(_[a-fA-F0-9]+)*)\\.png", "android"), - cldr("/Users/markdavis/eclipse-workspace/unicode-draft/reports/tr51/images/cldr", // uni1f1e6_uni1f1e8.png + cldr( + "/Users/markdavis/eclipse-workspace/unicode-draft/reports/tr51/images/cldr", // uni1f1e6_uni1f1e8.png "^(?!\\\\.).*.png$", - "(?:proposed[-_])?(?[-_A-Za-z0-9]+)\\.png", + "(?:proposed[-_])?(?[-_A-Za-z0-9]+)\\.png", "emoji"), ; final String sourceDir; @@ -86,32 +87,28 @@ enum Choice { } } - - // FileMatch - //"(?[A-Z]+)([-](?[A-Z]+))?\\.svg" - //"(?[a-zA-Z]+)[-_](?[-0-9a-fA-F_]+)\\.png" + // "(?[A-Z]+)([-](?[A-Z]+))?\\.svg" + // "(?[a-zA-Z]+)[-_](?[-0-9a-fA-F_]+)\\.png" // "([-0-9a-fA-F_]+)\\.png" // twitter // "(?:[a-zA-Z]+|emoji_thumbnail)?(?:_[xu])?([-0-9a-fA-F_]+)\\.png" // anything else - //"proposed_(?:x)?(.*)\\.png"; + // "proposed_(?:x)?(.*)\\.png"; // U+270C,U+1F3FC_256.png - - private static final int HEX_ADDITION = + private static final int HEX_ADDITION = // 0x10000 - 0x100000; 0; // sourceDir - //"/Users/markdavis/Downloads/svg-flags" - //"/Users/markdavis/Documents/workspace/unicode-draft/reports/tr51/images/proposed" + // "/Users/markdavis/Downloads/svg-flags" + // "/Users/markdavis/Documents/workspace/unicode-draft/reports/tr51/images/proposed" // Settings.BASE_DIRECTORY + "Google Drive/workspace/DATA/emoji/twitter/" // Settings.UNICODE_DRAFT_DIRECTORY + "/reports/tr51/images/" + OUTPUT_PLATFORM_PREFIX - private static final Pattern REMOVE_FROM_HEX = Pattern.compile("_fe0f"); public static void main(String[] args) throws IOException { - final Matcher m = choice.fileMatch; + final Matcher m = 
choice.fileMatch; final File dir = new File(choice.sourceDir); if (!dir.exists()) { throw new IllegalArgumentException("Missing dir: " + dir); @@ -138,14 +135,13 @@ private static void process(File f, Matcher m, Output count) { String name = f.getName(); String path = f.getPath(); String parent = f.getParent(); - if (name.startsWith(".") - || name.endsWith(" (1).png") - || name.endsWith(" 2.png") + if (name.startsWith(".") + || name.endsWith(" (1).png") + || name.endsWith(" 2.png") || name.contains("_x") || name.endsWith(".gif") || name.endsWith(".jpg") - || parent.endsWith("/other") - ) { + || parent.endsWith("/other")) { return; } try { @@ -153,69 +149,83 @@ private static void process(File f, Matcher m, Output count) { return; } if (!m.reset(name).matches()) { - throw new IllegalArgumentException(RegexUtilities.showMismatch(m, name) + "\nHex: " + Utility.hex(name, " ")); + throw new IllegalArgumentException( + RegexUtilities.showMismatch(m, name) + "\nHex: " + Utility.hex(name, " ")); } String suffix = ".png"; String oldHex; - switch(choice) { - case flags: { - final String country = m.group("name"); - final String subdivision = m.group("codes"); - if (subdivision == null) { - oldHex = Emoji.getHexFromFlagCode(country); - } else { - oldHex = Emoji.getHexFromSubdivision(country+subdivision); - } - suffix = ".svg"; - } - case fix_emoji_u: { - oldHex = Utility.fromHex(m.group("hex").replaceAll("[_]", " ")); - suffix = ".svg"; - break; - } - case android: { - final String oldName = m.group("codes").replaceAll("(_|uni)+", " ").trim(); - oldHex = Utility.fromHex(oldName, false, 2); - // HACK: uni2640_uni200d_uni1f9b8_uni1f3fb.png should be uni1f9b8_uni1f3fb _uni2640_uni200d.png -// if (SUPERS.containsSome(oldHex) && !SUPERS.contains(oldHex.codePointAt(0)) { -// -// } - break; - } - default: { - final String oldName = m.group("codes").replaceAll("[-_,]", " ").trim(); - oldHex = Utility.fromHex(oldName, false, 2); - // HACK for J. 
- // final String oldPrefix = m.group("name"); - // if (oldPrefix != null) { - // // curlyhair | 1f3fb-200d-2640-fe0f = curly + zwj + woman - // oldHex = oldHex.replace("\ufe0f", "").replace("\u200d", ""); - // // => 1f3fb-200d-2640 - // int first = oldHex.endsWith("\u2640") ? 0x1F469 : oldHex.endsWith("\u2642") ? 0x1F468 : -1; - // oldHex = oldHex.substring(0, oldHex.length()-1); - // // => 1f3fb-200d - // int last; - // switch(oldPrefix) { - // case "curlyhair": last = 0x1F9B1; break; - // case "nohair": last = 0x1F9B2; break; - // case "redhair": last = 0x1F9B0; break; - // case "whitehair": last = 0x1F9B3; break; - // default: throw new IllegalArgumentException("bad hair day"); - // } - // // 1F469 1F3FB 200D 1F9B1 = woman, light skin, zwj curly - // oldHex = UTF16.valueOf(first) + oldHex + UTF16.valueOf(0x200d) + UTF16.valueOf(last); - // } - if (HEX_ADDITION != 0) { - oldHex = UTF16.valueOf(HEX_ADDITION + oldHex.codePointAt(0)); - } - } + switch (choice) { + case flags: + { + final String country = m.group("name"); + final String subdivision = m.group("codes"); + if (subdivision == null) { + oldHex = Emoji.getHexFromFlagCode(country); + } else { + oldHex = Emoji.getHexFromSubdivision(country + subdivision); + } + suffix = ".svg"; + } + case fix_emoji_u: + { + oldHex = Utility.fromHex(m.group("hex").replaceAll("[_]", " ")); + suffix = ".svg"; + break; + } + case android: + { + final String oldName = m.group("codes").replaceAll("(_|uni)+", " ").trim(); + oldHex = Utility.fromHex(oldName, false, 2); + // HACK: uni2640_uni200d_uni1f9b8_uni1f3fb.png should be uni1f9b8_uni1f3fb + // _uni2640_uni200d.png + // if (SUPERS.containsSome(oldHex) && + // !SUPERS.contains(oldHex.codePointAt(0)) { + // + // } + break; + } + default: + { + final String oldName = m.group("codes").replaceAll("[-_,]", " ").trim(); + oldHex = Utility.fromHex(oldName, false, 2); + // HACK for J. 
+ // final String oldPrefix = m.group("name"); + // if (oldPrefix != null) { + // // curlyhair | 1f3fb-200d-2640-fe0f = curly + zwj + + // woman + // oldHex = oldHex.replace("\ufe0f", + // "").replace("\u200d", ""); + // // => 1f3fb-200d-2640 + // int first = oldHex.endsWith("\u2640") ? 0x1F469 : + // oldHex.endsWith("\u2642") ? 0x1F468 : -1; + // oldHex = oldHex.substring(0, oldHex.length()-1); + // // => 1f3fb-200d + // int last; + // switch(oldPrefix) { + // case "curlyhair": last = 0x1F9B1; break; + // case "nohair": last = 0x1F9B2; break; + // case "redhair": last = 0x1F9B0; break; + // case "whitehair": last = 0x1F9B3; break; + // default: throw new IllegalArgumentException("bad hair + // day"); + // } + // // 1F469 1F3FB 200D 1F9B1 = woman, light skin, zwj + // curly + // oldHex = UTF16.valueOf(first) + oldHex + + // UTF16.valueOf(0x200d) + UTF16.valueOf(last); + // } + if (HEX_ADDITION != 0) { + oldHex = UTF16.valueOf(HEX_ADDITION + oldHex.codePointAt(0)); + } + } } String newHex = Utility.hex(oldHex, "_").toLowerCase(Locale.ENGLISH); newHex = REMOVE_FROM_HEX.matcher(newHex).replaceAll(""); - //Emoji.buildFileName(Emoji.getHexFromFlagCode(m.group(1)), "_") + // Emoji.buildFileName(Emoji.getHexFromFlagCode(m.group(1)), "_") - final String prefix = choice.outputPlatformPrefix == null ? m.group(1) : choice.outputPlatformPrefix; + final String prefix = + choice.outputPlatformPrefix == null ? 
m.group(1) : choice.outputPlatformPrefix; String newName = prefix + "_" + newHex + suffix; if (newName.equals(name)) { return; @@ -227,10 +237,14 @@ private static void process(File f, Matcher m, Output count) { return; } FileSystem dfs = FileSystems.getDefault(); - Path oldPath = dfs.getPath(path); - Path foo = Files.move(oldPath, oldPath.resolveSibling(newName), StandardCopyOption.ATOMIC_MOVE); + Path oldPath = dfs.getPath(path); + Path foo = + Files.move( + oldPath, + oldPath.resolveSibling(newName), + StandardCopyOption.ATOMIC_MOVE); } catch (Exception e) { - throw new IllegalArgumentException(parent,e); + throw new IllegalArgumentException(parent, e); } } } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/ScriptPopulation.java b/unicodetools/src/main/java/org/unicode/text/tools/ScriptPopulation.java index 940acd495..2e4803e68 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/ScriptPopulation.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/ScriptPopulation.java @@ -1,10 +1,20 @@ package org.unicode.text.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.lang.UScript.ScriptUsage; +import com.ibm.icu.text.DecimalFormat; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; import java.util.BitSet; import java.util.HashMap; import java.util.Map; import java.util.Set; - import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.Counter; import org.unicode.cldr.util.Counter2; @@ -14,21 +24,7 @@ import org.unicode.text.tools.ScriptPopulation.Category.Extra; import org.unicode.tools.emoji.EmojiData; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UScript; -import 
com.ibm.icu.lang.UScript.ScriptUsage; -import com.ibm.icu.text.DecimalFormat; -import com.ibm.icu.text.Normalizer2; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; - -/** - * See CharacterFrequency for the base data - */ +/** See CharacterFrequency for the base data */ class ScriptPopulation { static CLDRConfig testInfo = CLDRConfig.getInstance(); static SupplementalDataInfo supplemental = testInfo.getSupplementalDataInfo(); @@ -38,27 +34,28 @@ class ScriptPopulation { private static final boolean SHOW_DECTILES = false; static final EmojiData EMOJI_DATA = EmojiData.EMOJI_DATA; - private static final UnicodeSet SINGLETONS_WITHOUT_DEFECTIVES = new UnicodeSet() - .addAll(EMOJI_DATA.getSingletonsWithoutDefectives()) - .removeAll("©®♥™") - .freeze(); + private static final UnicodeSet SINGLETONS_WITHOUT_DEFECTIVES = + new UnicodeSet() + .addAll(EMOJI_DATA.getSingletonsWithoutDefectives()) + .removeAll("©®♥™") + .freeze(); // define a category to be: // explicit script // main general category public static void main(String[] args) { -// checkCommon(); -// if (true) return; + // checkCommon(); + // if (true) return; // getLanguageInfo(); // LanguageTagParser ltp = new LanguageTagParser(); // LikelySubtags likely = new LikelySubtags(supplemental); // Counter2 scriptPopulation = new Counter2<>(); Counter2 scriptFrequency = new Counter2<>(); Counter scriptCount = new Counter<>(); - Counter2> notoScriptFrequency = new Counter2<>(); - Counter> notoScriptCount = new Counter<>(); -// Counter2 rawScriptFrequency = new Counter2<>(); + Counter2> notoScriptFrequency = new Counter2<>(); + Counter> notoScriptCount = new Counter<>(); + // Counter2 rawScriptFrequency = new Counter2<>(); Counter freq = CharacterFrequency.getCodePointCounter("mul", true); @SuppressWarnings("unchecked") @@ -78,29 +75,30 @@ public static void main(String[] args) { // notoScriptFrequency.add(pair, 0d); // } - for 
(int cp = 0; cp <= 0x10FFFF; ++cp){ + for (int cp = 0; cp <= 0x10FFFF; ++cp) { long frequency = freq.get(cp); - //rawScriptFrequency.add(UScript.getScript(cp), (double) frequency); + // rawScriptFrequency.add(UScript.getScript(cp), (double) frequency); // quick approximate normalization - int i = UCharacter.foldCase(cp,true); + int i = UCharacter.foldCase(cp, true); String str = NFC.normalize(UTF16.valueOf(i)); if (1 == UTF16.countCodePoint(str)) { i = str.codePointAt(0); } int scriptNum = Category.getCategory(i); -// if (scriptNum == Category.Extra.Private.ordinal()) { -// continue; -// } - scriptFrequency.add(scriptNum, (double)frequency); + // if (scriptNum == Category.Extra.Private.ordinal()) { + // continue; + // } + scriptFrequency.add(scriptNum, (double) frequency); scriptCount.add(scriptNum, 1); - categoryToTopItems[scriptNum].add(i, (double)frequency); + categoryToTopItems[scriptNum].add(i, (double) frequency); boolean isNoto = NotoCoverage.isCovered(cp); - notoScriptFrequency.add(Pair.of(isNoto, scriptNum), (double)frequency); + notoScriptFrequency.add(Pair.of(isNoto, scriptNum), (double) frequency); notoScriptCount.add(Pair.of(isNoto, scriptNum), 1); - (isNoto ? notoCategoryToTopItems[scriptNum] : nonotoCategoryToTopItems[scriptNum]).add(i, (double)frequency); + (isNoto ? 
notoCategoryToTopItems[scriptNum] : nonotoCategoryToTopItems[scriptNum]) + .add(i, (double) frequency); } // make sure all existing scripts have at least 1 for (int script = Category.OFFSET; script < Category.CODE_LIMIT; ++script) { @@ -110,7 +108,7 @@ public static void main(String[] args) { ScriptUsage usage = Category.getUsage(script); if (usage != ScriptUsage.NOT_ENCODED && scriptFrequency.getCount(script) == 0.0d) { scriptFrequency.add(script, 1.0d); // fake unknown scripts - //System.out.println("Adding" + Category.getName(script)); + // System.out.println("Adding" + Category.getName(script)); } } DecimalFormat nf = (DecimalFormat) NumberFormat.getInstance(ULocale.ENGLISH); @@ -118,21 +116,26 @@ public static void main(String[] args) { nf.setMinimumSignificantDigits(3); double totalFreq = scriptFrequency.getTotal().doubleValue(); int count = 0; - System.out.println("№\t-log(%)\tCount\tScript (*Cat)\tUAX31 Status\t1st\t2nd\t3rd\t4th\t5th\t6th\t7th\t8th\t9th\t10th\t11th\t12th\t13th\t14th\t15th\t16th\t17th\t18th\t19th\t20th\t…"); + System.out.println( + "№\t-log(%)\tCount\tScript (*Cat)\tUAX31 Status\t1st\t2nd\t3rd\t4th\t5th\t6th\t7th\t8th\t9th\t10th\t11th\t12th\t13th\t14th\t15th\t16th\t17th\t18th\t19th\t20th\t…"); for (Integer category : scriptFrequency.getKeysetSortedByCount(false, null)) { if (category == Extra.Format.ordinal() - //|| category == Extra.Unknown.ordinal() - //|| category == Extra.Private.ordinal() + // || category == Extra.Unknown.ordinal() + // || category == Extra.Private.ordinal() || category == Extra.Control.ordinal() - || category == Extra.Whitespace.ordinal() - ) { + || category == Extra.Whitespace.ordinal()) { continue; } Double frequ = scriptFrequency.getCount(category); - System.out.print(++count - + (true ? "\t" + nf.format(Math.log10(totalFreq/frequ)) : "") - + "\t" + scriptCount.get(category) - + "\t" + Category.getName(category) + "\t" + Category.getUsageName(category)); + System.out.print( + ++count + + (true ? 
"\t" + nf.format(Math.log10(totalFreq / frequ)) : "") + + "\t" + + scriptCount.get(category) + + "\t" + + Category.getName(category) + + "\t" + + Category.getUsageName(category)); int max = 20; Counter2 topItems = categoryToTopItems[category]; showTop(max, topItems, frequ); @@ -140,10 +143,12 @@ public static void main(String[] args) { } System.out.println("\nNOTO\n"); - System.out.println("№\t-log(%)\tCount\tNoto?\tScript (*Cat)\tUAX31 Status\t1st\t2nd\t3rd\t4th\t5th\t6th\t7th\t8th\t9th\t10th\t11th\t12th\t13th\t14th\t15th\t16th\t17th\t18th\t19th\t20th\t…"); + System.out.println( + "№\t-log(%)\tCount\tNoto?\tScript (*Cat)\tUAX31 Status\t1st\t2nd\t3rd\t4th\t5th\t6th\t7th\t8th\t9th\t10th\t11th\t12th\t13th\t14th\t15th\t16th\t17th\t18th\t19th\t20th\t…"); count = 0; - for (Pair entry : notoScriptFrequency.getKeysetSortedByCount(false, null)) { + for (Pair entry : + notoScriptFrequency.getKeysetSortedByCount(false, null)) { Double frequ = notoScriptFrequency.getCount(entry); long countItems = notoScriptCount.getCount(entry); Boolean inNoto = entry.getFirst(); @@ -153,25 +158,31 @@ public static void main(String[] args) { || category == Extra.Unknown.ordinal() || category == Extra.Private.ordinal() || category == Extra.Control.ordinal() - || category == Extra.Whitespace.ordinal() - ) { + || category == Extra.Whitespace.ordinal()) { continue; } - System.out.print(++count - + (true ? "\t" + nf.format(Math.log(totalFreq/frequ)) : "") - + "\t" + countItems - + "\t" + noto - + "\t" + Category.getName(category) - + "\t" + Category.getUsageName(category)); + System.out.print( + ++count + + (true ? "\t" + nf.format(Math.log(totalFreq / frequ)) : "") + + "\t" + + countItems + + "\t" + + noto + + "\t" + + Category.getName(category) + + "\t" + + Category.getUsageName(category)); int max = 20; - Counter2 topItems = inNoto ? notoCategoryToTopItems[category] : nonotoCategoryToTopItems[category]; + Counter2 topItems = + inNoto ? 
notoCategoryToTopItems[category] : nonotoCategoryToTopItems[category]; showTop(max, topItems, frequ); System.out.println(); } if (true) return; - // Relation languagesWithoutScripts = Relation.of(new TreeMap>(), TreeSet.class); + // Relation languagesWithoutScripts = Relation.of(new + // TreeMap>(), TreeSet.class); // for (String language : supplemental.getLanguagesForTerritoriesPopulationData()) { // boolean usedSil = false; // String script = ltp.set(language).getScript(); @@ -201,7 +212,8 @@ public static void main(String[] args) { // } // int scriptNum = UScript.getCodeFromName(script); // for (String territory : supplemental.getTerritoriesForPopulationData(language)) { - // PopulationData pop = supplemental.getLanguageAndTerritoryPopulationData(language, territory); + // PopulationData pop = supplemental.getLanguageAndTerritoryPopulationData(language, + // territory); // double population = pop.getLiteratePopulation(); // scriptPopulation.add(scriptNum, population); // if (scriptNum == UScript.HIRAGANA) { @@ -210,10 +222,10 @@ public static void main(String[] args) { // } else if (bopomofo) { // scriptPopulation.add(UScript.BOPOMOFO, population); // } else if (scriptNum == UScript.UNKNOWN || usedSil) { - // languagesWithoutScripts.put(testInfo.getEnglish().getName(language), - // language - // + "\t" + script - // + "\t" + territory + // languagesWithoutScripts.put(testInfo.getEnglish().getName(language), + // language + // + "\t" + script + // + "\t" + territory // + "\t" + (long)population // + "\t" + Category.getUsage(scriptNum) // ); @@ -239,7 +251,8 @@ public static void main(String[] args) { // if (charCount < 1) { // charCount = 1; // } - // System.out.println(Category.getName(i) + "\t" + usage + "\t" + count + "\t" + charCount); + // System.out.println(Category.getName(i) + "\t" + usage + "\t" + count + "\t" + + // charCount); // } // BitSet bitset = new BitSet(); // UnicodeMap fixedScripts = new UnicodeMap<>(); @@ -256,13 +269,14 @@ public static void 
main(String[] args) { // } // String scriptNames = getScriptNames(bitset, " ", false); // String scriptName = Category.getName(sc); - // //System.out.println(scriptName + "\t" + scriptNames + "\t" + Utility.hex(i) + "\t" + UCharacter.getName(i)); + // //System.out.println(scriptName + "\t" + scriptNames + "\t" + Utility.hex(i) + "\t" + // + UCharacter.getName(i)); // if (sc == UScript.COMMON || sc == UScript.INHERITED) { // int single = getBest(bitset); // if (single > 0) { - // fixedScripts.put(i, - // scriptName - // + ";" + Category.getName(single) + // fixedScripts.put(i, + // scriptName + // + ";" + Category.getName(single) // + ";" + scriptNames); // } // continue; @@ -275,200 +289,208 @@ public static void main(String[] args) { // showScripts(fixedScripts); } - private static final Map REMAP_SCRIPT = new HashMap<>(); + private static final Map REMAP_SCRIPT = new HashMap<>(); + static { - REMAP_SCRIPT.put("Hant","Hani"); - REMAP_SCRIPT.put("Hans","Hani"); - REMAP_SCRIPT.put("Jpan","Kana"); - REMAP_SCRIPT.put("Hira","Kana"); - REMAP_SCRIPT.put("Kore","Hang"); + REMAP_SCRIPT.put("Hant", "Hani"); + REMAP_SCRIPT.put("Hans", "Hani"); + REMAP_SCRIPT.put("Jpan", "Kana"); + REMAP_SCRIPT.put("Hira", "Kana"); + REMAP_SCRIPT.put("Kore", "Hang"); } - -// private static void checkCommon() { -// Counter[] scriptToOthers = new Counter[UScript.CODE_LIMIT]; -// Counter totalScriptFrequency = new Counter<>(); -// Map> cpToScriptFrequency = new TreeMap<>(); -// for (String lang : CharacterFrequency.getLanguagesWithCounter()) { -// if (lang.equals("mul")) continue; -// ULocale max = ULocale.addLikelySubtags(new ULocale(lang)); -// String script = max.getScript(); -// String temp = REMAP_SCRIPT.get(script); -// if (temp != null) { -// script = temp; -// } -// -// int scriptCode = UScript.getCodeFromName(script); -// //System.out.println(lang + "\t=>\t" + max + "\t" + scriptCode + "\t" + UScript.getName(scriptCode)); -// if (scriptCode == UScript.KAITHI) continue; -// Counter data 
= scriptToOthers[scriptCode]; -// if (data == null) { -// scriptToOthers[scriptCode] = data = new Counter(); -// } -// -// Counter freq = CharacterFrequency.getCodePointCounter(lang, true); -// for (int cp : freq.keySet()) { -// long value = freq.get(cp); -// int cpScript = getScript(cp); -// data.add(cpScript, value); -// -// // data per language-script -// totalScriptFrequency.add(scriptCode, value); -// -// Extra extra = getGeneralCategory(cp); -// if (extra != Extra.Punctuation && extra != Extra.Letter && extra != Extra.Mark) continue; -// if (cpScript == UScript.COMMON || cpScript == UScript.INHERITED) { -// Counter counter = cpToScriptFrequency.get(cp); -// if (counter == null) { -// cpToScriptFrequency.put(cp, counter = new Counter()); -// } -// counter.add(scriptCode, value); -// } -// } -// } -// // normalize -// Map> normcpToScriptFrequency = new TreeMap>(); -// for (Entry> entry : cpToScriptFrequency.entrySet()) { -// Counter scriptFrequency = entry.getValue(); -// Counter2 normscriptFrequency = new Counter2(); -// normcpToScriptFrequency.put(entry.getKey(), normscriptFrequency); -// for (R2 x : scriptFrequency.getEntrySetSortedByCount(false, null)) { -// int script = x.get1(); -// double count = x.get0(); -// normscriptFrequency.add(script, count/totalScriptFrequency.get(script)); -// } -// } -// if (false) for (Extra extra : Extra.values()) { -// for (Entry> entry : cpToScriptFrequency.entrySet()) { -// int cp = entry.getKey(); -// if (getGeneralCategory(cp) != extra) continue; -// System.out.print(extra + "\tU+" + Utility.hex(cp) + "\t" + UCharacter.getName(cp)); -// int max = 5; -// Counter scriptFrequency = entry.getValue(); -// double total = scriptFrequency.getTotal(); -// for (R2 x : scriptFrequency.getEntrySetSortedByCount(false, null)) { -// if (--max < 0) break; -// int script = x.get1(); -// long count = x.get0(); -// double proportion = count/total; -// System.out.print("\t" + UScript.getShortName(script) -// + "\t" + proportion); -// } -// 
System.out.println(); -// } -// } -// System.out.println("\nNormalized\n"); -// for (Extra extra : Extra.values()) { -// for (Entry> entry : normcpToScriptFrequency.entrySet()) { -// int cp = entry.getKey(); -// if (getGeneralCategory(cp) != extra) continue; -// System.out.print(extra + "\tU+" + Utility.hex(cp) + "\t" + UCharacter.getName(cp)); -// int max = 5; -// Counter2 scriptFrequency = entry.getValue(); -// double total = scriptFrequency.getTotal().doubleValue(); -// for (Integer script : scriptFrequency.getKeysetSortedByCount(false, null)) { -// if (--max < 0) break; -// double count = scriptFrequency.getCount(script); -// double proportion = count/total; -// System.out.print("\t" + UScript.getShortName(script) -// + "\t" + proportion); -// } -// System.out.println(); -// } -// } -// -// -//// for (int value : remapped.values()) { -//// System.out.println(UScript.getName(value) + "\t" + remapped.getSet(value).toPattern(false)); -//// } -// -// // filter out those below threshold -// BitSet keep = new BitSet(); -// double threshold = 0.00001; -// for (int scriptCode = 0 ; scriptCode < scriptToOthers.length; ++scriptCode) { -// Counter data = scriptToOthers[scriptCode]; -// if (data == null) { -// continue; -// } -// double total = data.getTotal(); -// for (int scriptCode2 = 0 ; scriptCode2 < scriptToOthers.length; ++scriptCode2) { -// if (data.get(scriptCode2)/total > threshold) { -// keep.set(scriptCode2); -// } -// } -// } -// -// System.out.print("\t"); -// for (int scriptCode = 0 ; scriptCode < scriptToOthers.length; ++scriptCode) { -// if (!keep.get(scriptCode)) continue; -// System.out.print("\t" + UScript.getShortName(scriptCode)); -// } -// System.out.println(); -// -// for (int scriptCode = 0 ; scriptCode < scriptToOthers.length; ++scriptCode) { -// Counter data = scriptToOthers[scriptCode]; -// if (data == null) { -// continue; -// } -// double total = data.getTotal(); -// System.out.print(UScript.getShortName(scriptCode) + "\t" + total); -// for 
(int scriptCode2 = 0 ; scriptCode2 < scriptToOthers.length; ++scriptCode2) { -// if (!keep.get(scriptCode2)) continue; -// System.out.print("\t" + (data.get(scriptCode2)/total)); -// } -// System.out.println(); -// } -// } - + // private static void checkCommon() { + // Counter[] scriptToOthers = new Counter[UScript.CODE_LIMIT]; + // Counter totalScriptFrequency = new Counter<>(); + // Map> cpToScriptFrequency = new TreeMap<>(); + // for (String lang : CharacterFrequency.getLanguagesWithCounter()) { + // if (lang.equals("mul")) continue; + // ULocale max = ULocale.addLikelySubtags(new ULocale(lang)); + // String script = max.getScript(); + // String temp = REMAP_SCRIPT.get(script); + // if (temp != null) { + // script = temp; + // } + // + // int scriptCode = UScript.getCodeFromName(script); + // //System.out.println(lang + "\t=>\t" + max + "\t" + scriptCode + "\t" + + // UScript.getName(scriptCode)); + // if (scriptCode == UScript.KAITHI) continue; + // Counter data = scriptToOthers[scriptCode]; + // if (data == null) { + // scriptToOthers[scriptCode] = data = new Counter(); + // } + // + // Counter freq = CharacterFrequency.getCodePointCounter(lang, true); + // for (int cp : freq.keySet()) { + // long value = freq.get(cp); + // int cpScript = getScript(cp); + // data.add(cpScript, value); + // + // // data per language-script + // totalScriptFrequency.add(scriptCode, value); + // + // Extra extra = getGeneralCategory(cp); + // if (extra != Extra.Punctuation && extra != Extra.Letter && extra != + // Extra.Mark) continue; + // if (cpScript == UScript.COMMON || cpScript == UScript.INHERITED) { + // Counter counter = cpToScriptFrequency.get(cp); + // if (counter == null) { + // cpToScriptFrequency.put(cp, counter = new Counter()); + // } + // counter.add(scriptCode, value); + // } + // } + // } + // // normalize + // Map> normcpToScriptFrequency = new TreeMap>(); + // for (Entry> entry : cpToScriptFrequency.entrySet()) { + // Counter scriptFrequency = 
entry.getValue(); + // Counter2 normscriptFrequency = new Counter2(); + // normcpToScriptFrequency.put(entry.getKey(), normscriptFrequency); + // for (R2 x : scriptFrequency.getEntrySetSortedByCount(false, null)) + // { + // int script = x.get1(); + // double count = x.get0(); + // normscriptFrequency.add(script, count/totalScriptFrequency.get(script)); + // } + // } + // if (false) for (Extra extra : Extra.values()) { + // for (Entry> entry : cpToScriptFrequency.entrySet()) { + // int cp = entry.getKey(); + // if (getGeneralCategory(cp) != extra) continue; + // System.out.print(extra + "\tU+" + Utility.hex(cp) + "\t" + + // UCharacter.getName(cp)); + // int max = 5; + // Counter scriptFrequency = entry.getValue(); + // double total = scriptFrequency.getTotal(); + // for (R2 x : scriptFrequency.getEntrySetSortedByCount(false, + // null)) { + // if (--max < 0) break; + // int script = x.get1(); + // long count = x.get0(); + // double proportion = count/total; + // System.out.print("\t" + UScript.getShortName(script) + // + "\t" + proportion); + // } + // System.out.println(); + // } + // } + // System.out.println("\nNormalized\n"); + // for (Extra extra : Extra.values()) { + // for (Entry> entry : normcpToScriptFrequency.entrySet()) + // { + // int cp = entry.getKey(); + // if (getGeneralCategory(cp) != extra) continue; + // System.out.print(extra + "\tU+" + Utility.hex(cp) + "\t" + + // UCharacter.getName(cp)); + // int max = 5; + // Counter2 scriptFrequency = entry.getValue(); + // double total = scriptFrequency.getTotal().doubleValue(); + // for (Integer script : scriptFrequency.getKeysetSortedByCount(false, null)) { + // if (--max < 0) break; + // double count = scriptFrequency.getCount(script); + // double proportion = count/total; + // System.out.print("\t" + UScript.getShortName(script) + // + "\t" + proportion); + // } + // System.out.println(); + // } + // } + // + // + //// for (int value : remapped.values()) { + //// 
System.out.println(UScript.getName(value) + "\t" + + // remapped.getSet(value).toPattern(false)); + //// } + // + // // filter out those below threshold + // BitSet keep = new BitSet(); + // double threshold = 0.00001; + // for (int scriptCode = 0 ; scriptCode < scriptToOthers.length; ++scriptCode) { + // Counter data = scriptToOthers[scriptCode]; + // if (data == null) { + // continue; + // } + // double total = data.getTotal(); + // for (int scriptCode2 = 0 ; scriptCode2 < scriptToOthers.length; ++scriptCode2) { + // if (data.get(scriptCode2)/total > threshold) { + // keep.set(scriptCode2); + // } + // } + // } + // + // System.out.print("\t"); + // for (int scriptCode = 0 ; scriptCode < scriptToOthers.length; ++scriptCode) { + // if (!keep.get(scriptCode)) continue; + // System.out.print("\t" + UScript.getShortName(scriptCode)); + // } + // System.out.println(); + // + // for (int scriptCode = 0 ; scriptCode < scriptToOthers.length; ++scriptCode) { + // Counter data = scriptToOthers[scriptCode]; + // if (data == null) { + // continue; + // } + // double total = data.getTotal(); + // System.out.print(UScript.getShortName(scriptCode) + "\t" + total); + // for (int scriptCode2 = 0 ; scriptCode2 < scriptToOthers.length; ++scriptCode2) { + // if (!keep.get(scriptCode2)) continue; + // System.out.print("\t" + (data.get(scriptCode2)/total)); + // } + // System.out.println(); + // } + // } // NOT THREADSAFE -// static IdentifierInfo identifierInfo = new IdentifierInfo(); + // static IdentifierInfo identifierInfo = new IdentifierInfo(); static final StringBuilder buffer = new StringBuilder(); static UnicodeMap remapped = new UnicodeMap<>(); -// private static int getScript(int cp) { -// int cpScript = UScript.getScript(cp); -// if (cpScript == UScript.HIRAGANA) { -// cpScript = UScript.KATAKANA; -// } -// if (cpScript == UScript.COMMON || cpScript == UScript.INHERITED) { -// Integer cached = remapped.get(cp); -// if (cached != null) { -// return cached; -// } -// -// 
buffer.setLength(0); -// buffer.appendCodePoint(cp); -// String normalized = nfkc.normalize(buffer); -// identifierInfo.setIdentifier(normalized); -// BitSet scripts = identifierInfo.getScripts(); -// scripts.clear(UScript.UNKNOWN); // ignore -// -// // favor Latin -// int temp; -// if (scripts.get(UScript.LATIN)) { -// temp = UScript.LATIN; -// } else { -// temp = scripts.nextSetBit(0); -// if (temp < 0) { -// for (BitSet alternates : identifierInfo.getAlternates()) { -// if (scripts.get(UScript.LATIN)) { -// temp = UScript.LATIN; -// break; -// } else { -// temp = scripts.nextSetBit(0); -// break; -// } -// } -// } -// } -// if (temp > 0) { -// cpScript = temp; -// } -// remapped.put(cp, cpScript); -// } -// return cpScript; -// } + // private static int getScript(int cp) { + // int cpScript = UScript.getScript(cp); + // if (cpScript == UScript.HIRAGANA) { + // cpScript = UScript.KATAKANA; + // } + // if (cpScript == UScript.COMMON || cpScript == UScript.INHERITED) { + // Integer cached = remapped.get(cp); + // if (cached != null) { + // return cached; + // } + // + // buffer.setLength(0); + // buffer.appendCodePoint(cp); + // String normalized = nfkc.normalize(buffer); + // identifierInfo.setIdentifier(normalized); + // BitSet scripts = identifierInfo.getScripts(); + // scripts.clear(UScript.UNKNOWN); // ignore + // + // // favor Latin + // int temp; + // if (scripts.get(UScript.LATIN)) { + // temp = UScript.LATIN; + // } else { + // temp = scripts.nextSetBit(0); + // if (temp < 0) { + // for (BitSet alternates : identifierInfo.getAlternates()) { + // if (scripts.get(UScript.LATIN)) { + // temp = UScript.LATIN; + // break; + // } else { + // temp = scripts.nextSetBit(0); + // break; + // } + // } + // } + // } + // if (temp > 0) { + // cpScript = temp; + // } + // remapped.put(cp, cpScript); + // } + // return cpScript; + // } private static void showTop(int max, Counter2 topItems, Double frequ) { Set sorted = topItems.getKeysetSortedByCount(false, null); @@ 
-481,10 +503,9 @@ private static void showTop(int max, Counter2 topItems, Double frequ) { if (codePoint == '=' || codePoint == '"' || codePoint == '\'' || codePoint == '+') { str = '\'' + str; } - System.out.print( - (SHOW_FREQ ? "\t" + cFreq : "") - + "\t" + str); - }; + System.out.print((SHOW_FREQ ? "\t" + cFreq : "") + "\t" + str); + } + ; if (max < 0) { System.out.print("\t…"); if (SHOW_DECTILES) { @@ -508,41 +529,67 @@ private static void showTop(int max, Counter2 topItems, Double frequ) { } } - static final UnicodeSet SHOULD_BE_SYMBOL = new UnicodeSet("[@ * \\& # % ‰ ‱ † ‡ ※]").freeze(); // PRI 228 + static final UnicodeSet SHOULD_BE_SYMBOL = + new UnicodeSet("[@ * \\& # % ‰ ‱ † ‡ ※]").freeze(); // PRI 228 static final UnicodeSet SHOULD_BE_GREEK = new UnicodeSet("[ℼ µℽ ʹ ̓ ̈́]").freeze(); static final UnicodeSet SHOULD_BE_COPTIC = new UnicodeSet("[\uFE24-\uFE26]").freeze(); static final UnicodeSet SHOULD_BE_DEVA = new UnicodeSet("[᳓ ᳩ-ᳬᳮ-ᳱ ᳵ ᳶ]").freeze(); - static final UnicodeSet MAKE_FORMAT_FOR_CHART = new UnicodeSet("[" - + "[:variationselector:]" - + "[\u034F]" // grapheme joiner - +"]").freeze(); - static final UnicodeSet MAKE_SYMBOL_FOR_CHART = new UnicodeSet("[" - + "[:mark:]" - + "&[:block=Musical Symbols:]" - + "[\\x{101FD}]" // Phaistos Disc - +"]").freeze(); - static final UnicodeSet latinMark = new UnicodeSet("[[:scx=common:][:scx=inherited:]&[:mark:]]") - .removeAll(MAKE_FORMAT_FOR_CHART) - .removeAll(MAKE_SYMBOL_FOR_CHART) - .removeAll(SHOULD_BE_GREEK) - .removeAll(SHOULD_BE_COPTIC) - .removeAll(SHOULD_BE_DEVA).freeze(); - static final UnicodeSet latinLetter = new UnicodeSet("[[:scx=common:][:scx=inherited:]&[:letter:]]") - .removeAll(MAKE_SYMBOL_FOR_CHART) - .removeAll(MAKE_FORMAT_FOR_CHART) - .removeAll(SHOULD_BE_GREEK) - .removeAll(SHOULD_BE_COPTIC) - .removeAll(SHOULD_BE_DEVA); - static final UnicodeSet SHOULD_BE_LATIN = new UnicodeSet(latinMark).addAll(latinLetter).freeze(); - static final UnicodeSet SHOULD_BE_HAN = new UnicodeSet("[" - + 
"[:East_Asian_Width=Fullwidth:]" - + "\\p{Block=Counting Rod Numerals}" - + "-[:cn:]]").freeze(); + static final UnicodeSet MAKE_FORMAT_FOR_CHART = + new UnicodeSet( + "[" + + "[:variationselector:]" + + "[\u034F]" // grapheme joiner + + "]") + .freeze(); + static final UnicodeSet MAKE_SYMBOL_FOR_CHART = + new UnicodeSet( + "[" + + "[:mark:]" + + "&[:block=Musical Symbols:]" + + "[\\x{101FD}]" // Phaistos Disc + + "]") + .freeze(); + static final UnicodeSet latinMark = + new UnicodeSet("[[:scx=common:][:scx=inherited:]&[:mark:]]") + .removeAll(MAKE_FORMAT_FOR_CHART) + .removeAll(MAKE_SYMBOL_FOR_CHART) + .removeAll(SHOULD_BE_GREEK) + .removeAll(SHOULD_BE_COPTIC) + .removeAll(SHOULD_BE_DEVA) + .freeze(); + static final UnicodeSet latinLetter = + new UnicodeSet("[[:scx=common:][:scx=inherited:]&[:letter:]]") + .removeAll(MAKE_SYMBOL_FOR_CHART) + .removeAll(MAKE_FORMAT_FOR_CHART) + .removeAll(SHOULD_BE_GREEK) + .removeAll(SHOULD_BE_COPTIC) + .removeAll(SHOULD_BE_DEVA); + static final UnicodeSet SHOULD_BE_LATIN = + new UnicodeSet(latinMark).addAll(latinLetter).freeze(); + static final UnicodeSet SHOULD_BE_HAN = + new UnicodeSet( + "[" + + "[:East_Asian_Width=Fullwidth:]" + + "\\p{Block=Counting Rod Numerals}" + + "-[:cn:]]") + .freeze(); static final UnicodeSet SHOULD_BE_KANA = new UnicodeSet("[・]").freeze(); static final UnicodeSet SHOULD_BE_PUNCTUATION = new UnicodeSet("[`´]").freeze(); static class Category { - enum Extra {Unknown, Whitespace, Letter, Mark, Numeric, Control, Format, Punctuation, Symbol, Emoji, Private}; + enum Extra { + Unknown, + Whitespace, + Letter, + Mark, + Numeric, + Control, + Format, + Punctuation, + Symbol, + Emoji, + Private + }; private static final Extra[] ITEMS = Extra.values(); private static final int OFFSET = ITEMS.length; @@ -551,40 +598,40 @@ enum Extra {Unknown, Whitespace, Letter, Mark, Numeric, Control, Format, Punctua private static String getUsageName(Integer category) { ScriptUsage usage = Category.getUsage(category); - return 
usage == ScriptUsage.UNKNOWN ? "N/A" : UCharacter.toTitleCase(usage.toString(), null); + return usage == ScriptUsage.UNKNOWN + ? "N/A" + : UCharacter.toTitleCase(usage.toString(), null); } public static String getName(int category) { - return category < OFFSET + return category < OFFSET ? "*General " + ITEMS[category].toString() - : category - OFFSET == UScript.HIRAGANA + : category - OFFSET == UScript.HIRAGANA ? "Kana" - : UScript.getName(category - OFFSET); + : UScript.getName(category - OFFSET); } public static ScriptUsage getUsage(int category) { - return category < OFFSET + return category < OFFSET ? UScript.ScriptUsage.UNKNOWN - : UScript.getUsage(category - OFFSET); + : UScript.getUsage(category - OFFSET); } private static boolean isVariant(int category) { int script = category - Category.OFFSET; - return - script == UScript.SIMPLIFIED_HAN + return script == UScript.SIMPLIFIED_HAN || script == UScript.TRADITIONAL_HAN || script == UScript.UNKNOWN || script == UScript.JAPANESE || script == UScript.KOREAN || script == UScript.COMMON || script == UScript.INHERITED - || script == UScript.KATAKANA - ; + || script == UScript.KATAKANA; } static final BitSet temp = new BitSet(); static final StringBuilder buffer = new StringBuilder(); - + public static int getCategory(int cp) { if (UCharacter.isWhitespace(cp)) { return Extra.Whitespace.ordinal(); @@ -592,7 +639,8 @@ public static int getCategory(int cp) { if (SINGLETONS_WITHOUT_DEFECTIVES.contains(cp)) { return Extra.Emoji.ordinal(); } - int defaultIgnorable = UCharacter.getIntPropertyValue(cp, UProperty.DEFAULT_IGNORABLE_CODE_POINT); + int defaultIgnorable = + UCharacter.getIntPropertyValue(cp, UProperty.DEFAULT_IGNORABLE_CODE_POINT); if (defaultIgnorable != 0) { return Extra.Format.ordinal(); } @@ -620,11 +668,15 @@ public static int getCategory(int cp) { temp.clear(); int script = UScript.getScript(cp); - if (script != UScript.UNKNOWN && script != UScript.COMMON && script != UScript.INHERITED) { + if (script != 
UScript.UNKNOWN + && script != UScript.COMMON + && script != UScript.INHERITED) { return fixScript(script); } script = getBestScript(cp); - if (script != UScript.UNKNOWN && script != UScript.COMMON && script != UScript.INHERITED) { + if (script != UScript.UNKNOWN + && script != UScript.COMMON + && script != UScript.INHERITED) { return fixScript(script); } buffer.setLength(0); @@ -632,12 +684,13 @@ public static int getCategory(int cp) { String nfkcForm = nfkc.normalize(buffer); if (UTF16.countCodePoint(nfkcForm) == 1) { script = getBestScript(nfkcForm.codePointAt(0)); - if (script != UScript.UNKNOWN && script != UScript.COMMON && script != UScript.INHERITED) { + if (script != UScript.UNKNOWN + && script != UScript.COMMON + && script != UScript.INHERITED) { return fixScript(script); } } - // now do category if (SHOULD_BE_SYMBOL.contains(cp)) { @@ -673,13 +726,13 @@ private static int fixScript(int script) { private static int getBest(BitSet bitset) { int best = -1; ScriptUsage bestUsage = null; - for (int i = bitset.nextSetBit(0); i >= 0; i = bitset.nextSetBit(i+1)) { + for (int i = bitset.nextSetBit(0); i >= 0; i = bitset.nextSetBit(i + 1)) { ScriptUsage usage = UScript.getUsage(i); if (bestUsage == null || usage.compareTo(bestUsage) > 0) { best = i; bestUsage = usage; } - } + } return best; } } @@ -690,21 +743,23 @@ private static int getBest(BitSet bitset) { // String[] parts = s.split(";"); // if (!lastS.equals(s)) { // System.out.println("\n# " + - // "old-sc=" + parts[0] + // "old-sc=" + parts[0] // + ", " + - // "new-sc=" + parts[1] + // "new-sc=" + parts[1] // + ", " + // "scx={" + parts[2] + "}"); // lastS = s; // } - // for (UnicodeSetIterator it = new UnicodeSetIterator(fixedScripts.getSet(s)); it.nextRange();) { + // for (UnicodeSetIterator it = new UnicodeSetIterator(fixedScripts.getSet(s)); + // it.nextRange();) { // if (it.codepoint != it.codepointEnd) { - // System.out.println(Utility.hex(it.codepoint) + ".." 
+ Utility.hex(it.codepointEnd) - // + " ;\t" + parts[0] + "\t# " - // + UCharacter.getName(it.codepoint) + ".." + UCharacter.getName(it.codepointEnd)); + // System.out.println(Utility.hex(it.codepoint) + ".." + Utility.hex(it.codepointEnd) + // + " ;\t" + parts[0] + "\t# " + // + UCharacter.getName(it.codepoint) + ".." + + // UCharacter.getName(it.codepointEnd)); // } else { - // System.out.println(Utility.hex(it.codepoint) - // + " ;\t" + parts[0] + "\t# " + // System.out.println(Utility.hex(it.codepoint) + // + " ;\t" + parts[0] + "\t# " // + UCharacter.getName(it.codepoint)); // } // } @@ -725,7 +780,7 @@ private static int getBest(BitSet bitset) { // private static void getLanguageInfo() { // supplemental.getLanguages(); - // for (String s : + // for (String s : // "sq hy az my ka kk km lo mk mn ne si ky pa uz".split(" ")) { // PopulationData data = supplemental.getBaseLanguagePopulationData(s); // if (data == null) { @@ -753,60 +808,60 @@ private static Extra getGeneralCategory(int cp) { int category = UCharacter.getType(cp); switch (category) { - case UCharacter.UPPERCASE_LETTER : - case UCharacter.LOWERCASE_LETTER : - case UCharacter.TITLECASE_LETTER : - case UCharacter.MODIFIER_LETTER : - case UCharacter.MODIFIER_SYMBOL : - case UCharacter.OTHER_LETTER : - return Extra.Letter; - - case UCharacter.NON_SPACING_MARK : - case UCharacter.ENCLOSING_MARK : - case UCharacter.COMBINING_SPACING_MARK : - return UCharacter.getIntPropertyValue(cp, UProperty.VARIATION_SELECTOR) != 0 ? Extra.Format : Extra.Mark; - - case UCharacter.DECIMAL_DIGIT_NUMBER : - case UCharacter.LETTER_NUMBER : - case UCharacter.OTHER_NUMBER : - return Extra.Numeric; - - case UCharacter.SPACE_SEPARATOR: - case UCharacter.LINE_SEPARATOR : - case UCharacter.PARAGRAPH_SEPARATOR : - return Extra.Whitespace; - - case UCharacter.CONTROL : - return UCharacter.isWhitespace(cp) ? 
Extra.Whitespace : Extra.Control; - - case UCharacter.FORMAT : - return Extra.Format; - - case UCharacter.UNASSIGNED: - case UCharacter.SURROGATE : - return Extra.Unknown; - - case UCharacter.PRIVATE_USE : - return Extra.Private; - - case UCharacter.DASH_PUNCTUATION : - case UCharacter.START_PUNCTUATION : - case UCharacter.END_PUNCTUATION : - case UCharacter.CONNECTOR_PUNCTUATION : - case UCharacter.OTHER_PUNCTUATION : - case UCharacter.INITIAL_PUNCTUATION : - case UCharacter.FINAL_PUNCTUATION : - return Extra.Punctuation; - - case UCharacter.MATH_SYMBOL : - case UCharacter.CURRENCY_SYMBOL : - case UCharacter.OTHER_SYMBOL : - return Extra.Symbol; - - default: - throw new IllegalArgumentException(); + case UCharacter.UPPERCASE_LETTER: + case UCharacter.LOWERCASE_LETTER: + case UCharacter.TITLECASE_LETTER: + case UCharacter.MODIFIER_LETTER: + case UCharacter.MODIFIER_SYMBOL: + case UCharacter.OTHER_LETTER: + return Extra.Letter; + + case UCharacter.NON_SPACING_MARK: + case UCharacter.ENCLOSING_MARK: + case UCharacter.COMBINING_SPACING_MARK: + return UCharacter.getIntPropertyValue(cp, UProperty.VARIATION_SELECTOR) != 0 + ? Extra.Format + : Extra.Mark; + + case UCharacter.DECIMAL_DIGIT_NUMBER: + case UCharacter.LETTER_NUMBER: + case UCharacter.OTHER_NUMBER: + return Extra.Numeric; + + case UCharacter.SPACE_SEPARATOR: + case UCharacter.LINE_SEPARATOR: + case UCharacter.PARAGRAPH_SEPARATOR: + return Extra.Whitespace; + + case UCharacter.CONTROL: + return UCharacter.isWhitespace(cp) ? 
Extra.Whitespace : Extra.Control; + + case UCharacter.FORMAT: + return Extra.Format; + + case UCharacter.UNASSIGNED: + case UCharacter.SURROGATE: + return Extra.Unknown; + + case UCharacter.PRIVATE_USE: + return Extra.Private; + + case UCharacter.DASH_PUNCTUATION: + case UCharacter.START_PUNCTUATION: + case UCharacter.END_PUNCTUATION: + case UCharacter.CONNECTOR_PUNCTUATION: + case UCharacter.OTHER_PUNCTUATION: + case UCharacter.INITIAL_PUNCTUATION: + case UCharacter.FINAL_PUNCTUATION: + return Extra.Punctuation; + + case UCharacter.MATH_SYMBOL: + case UCharacter.CURRENCY_SYMBOL: + case UCharacter.OTHER_SYMBOL: + return Extra.Symbol; + + default: + throw new IllegalArgumentException(); } } - - } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/ShowCharacterFrequency.java b/unicodetools/src/main/java/org/unicode/text/tools/ShowCharacterFrequency.java index 0a2be415f..ca5b76e59 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/ShowCharacterFrequency.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/ShowCharacterFrequency.java @@ -1,44 +1,45 @@ package org.unicode.text.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.util.Locale; - import org.unicode.cldr.util.Counter; import org.unicode.draft.CharacterFrequency; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues.General_Category_Values; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - public class ShowCharacterFrequency { static final Counter freq = CharacterFrequency.getCodePointCounter("mul", true); static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(); - static final UnicodeMap cat = iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); + static final 
UnicodeMap cat = + iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); public static void main(String[] args) { - UnicodeSet proposal = new UnicodeSet("[– — · « » § † • ° ℗ ← → ↑ ↓ ⇅ ⇆ ∆-∈ × √ ∞ ∩ ∪ ≡ ⊂ ▲ ▼ ◊ ○ ● ◯ ↕ ↔ ▶ ◀ © ® ™ £ ¥ € ₹ ₽ ² ³ µ" - + "‘ ’ “ ” (-* ∋ ⁻ ∖ ⊃ ⊆ ⊇ ⁰ ¹" - + "]"); - UnicodeSet punctSym = new UnicodeSet(proposal) - .addAll(cat.getSet(General_Category_Values.Close_Punctuation)) - .addAll(cat.getSet(General_Category_Values.Connector_Punctuation)) - .addAll(cat.getSet(General_Category_Values.Open_Punctuation)) - .addAll(cat.getSet(General_Category_Values.Final_Punctuation)) - .addAll(cat.getSet(General_Category_Values.Dash_Punctuation)) - .addAll(cat.getSet(General_Category_Values.Initial_Punctuation)) - .addAll(cat.getSet(General_Category_Values.Currency_Symbol)) - .addAll(cat.getSet(General_Category_Values.Math_Symbol)) - .addAll(cat.getSet(General_Category_Values.Modifier_Symbol)) - .addAll(cat.getSet(General_Category_Values.Math_Symbol)) - .removeAll(new UnicodeSet(0,0x7F)) - ; + UnicodeSet proposal = + new UnicodeSet( + "[– — · « » § † • ° ℗ ← → ↑ ↓ ⇅ ⇆ ∆-∈ × √ ∞ ∩ ∪ ≡ ⊂ ▲ ▼ ◊ ○ ● ◯ ↕ ↔ ▶ ◀ © ® ™ £ ¥ € ₹ ₽ ² ³ µ" + + "‘ ’ “ ” (-* ∋ ⁻ ∖ ⊃ ⊆ ⊇ ⁰ ¹" + + "]"); + UnicodeSet punctSym = + new UnicodeSet(proposal) + .addAll(cat.getSet(General_Category_Values.Close_Punctuation)) + .addAll(cat.getSet(General_Category_Values.Connector_Punctuation)) + .addAll(cat.getSet(General_Category_Values.Open_Punctuation)) + .addAll(cat.getSet(General_Category_Values.Final_Punctuation)) + .addAll(cat.getSet(General_Category_Values.Dash_Punctuation)) + .addAll(cat.getSet(General_Category_Values.Initial_Punctuation)) + .addAll(cat.getSet(General_Category_Values.Currency_Symbol)) + .addAll(cat.getSet(General_Category_Values.Math_Symbol)) + .addAll(cat.getSet(General_Category_Values.Modifier_Symbol)) + .addAll(cat.getSet(General_Category_Values.Math_Symbol)) + .removeAll(new UnicodeSet(0, 0x7F)); double factor = show("broad-ASCII", punctSym, 
100, proposal); - //show("original", proposal, 100, factor); + // show("original", proposal, 100, factor); } private static double show(String title, UnicodeSet proposal, int maxCount, UnicodeSet keep) { @@ -48,7 +49,7 @@ private static double show(String title, UnicodeSet proposal, int maxCount, Unic int cp = s.codePointAt(0); nfreq.add(cp, freq.getCount(cp)); } - double factor = 100d/nfreq.getTotal(); + double factor = 100d / nfreq.getTotal(); for (R2 s : nfreq.getEntrySetSortedByCount(false, null)) { final Integer cp = s.get1(); @@ -57,8 +58,16 @@ private static double show(String title, UnicodeSet proposal, int maxCount, Unic continue; } } - double count = s.get0()*factor; - System.out.println(UTF16.valueOf(cp) + "\t" + count + "%" + "\t" + cat.get(cp) + "\t" + iup.getName(cp).toLowerCase(Locale.ROOT)); + double count = s.get0() * factor; + System.out.println( + UTF16.valueOf(cp) + + "\t" + + count + + "%" + + "\t" + + cat.get(cp) + + "\t" + + iup.getName(cp).toLowerCase(Locale.ROOT)); } return factor; } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/ShowCharacters.java b/unicodetools/src/main/java/org/unicode/text/tools/ShowCharacters.java index 8e1a82af0..c9e2cf8db 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/ShowCharacters.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/ShowCharacters.java @@ -1,5 +1,12 @@ package org.unicode.text.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R3; +import com.ibm.icu.impl.Row.R5; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.PrintWriter; import java.util.Arrays; import java.util.HashSet; @@ -7,15 +14,14 @@ import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; - import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.With; -import org.unicode.props.UnicodeProperty; -import 
org.unicode.props.UnicodeProperty.RegexMatcher; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues; import org.unicode.props.UcdPropertyValues.Age_Values; +import org.unicode.props.UnicodeProperty; +import org.unicode.props.UnicodeProperty.RegexMatcher; import org.unicode.text.UCD.Default; import org.unicode.text.UCD.ToolUnicodePropertySource; import org.unicode.text.utility.Settings; @@ -24,24 +30,20 @@ import org.unicode.tools.emoji.EmojiData; import org.unicode.tools.emoji.GenerateEmojiData; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R3; -import com.ibm.icu.impl.Row.R5; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - public class ShowCharacters { static IndexUnicodeProperties iup = IndexUnicodeProperties.make("9.0"); static UnicodeMap names = iup.load(UcdProperty.Name); - static UnicodeMap ages = iup.loadEnum(UcdProperty.Age, UcdPropertyValues.Age_Values.class); + static UnicodeMap ages = + iup.loadEnum(UcdProperty.Age, UcdPropertyValues.Age_Values.class); static CandidateData CD = CandidateData.getInstance(); public static void main(String[] args) { show("New", CD.keySet()); System.out.println(); - show("Gendered", new UnicodeSet("[👲 👳 💂 🎅 👯 👰 🕴 ☃ ⛄ \\U0001F57A \\U0001F934 \\U0001F936 \\U0001F935]")); + show( + "Gendered", + new UnicodeSet( + "[👲 👳 💂 🎅 👯 👰 🕴 ☃ ⛄ \\U0001F57A \\U0001F934 \\U0001F936 \\U0001F935]")); // use same definition as for data files, but generate simpler list show("Emoji_Flag_Base", GenerateEmojiData.flagBase); @@ -50,26 +52,44 @@ public static void main(String[] args) { show("Emoji_Direction_Base", GenerateEmojiData.directionBase); UnicodeSet mods = EmojiData.EMOJI_DATA.getModifierBases(); - UnicodeSet modsWithVS = new UnicodeSet(mods).retainAll(EmojiData.EMOJI_DATA.getEmojiWithVariants()); - UnicodeSet modsWithVSAndDefaultEmoji = new 
UnicodeSet(modsWithVS) - .removeAll(EmojiData.EMOJI_DATA.getEmojiPresentationSet()); + UnicodeSet modsWithVS = + new UnicodeSet(mods).retainAll(EmojiData.EMOJI_DATA.getEmojiWithVariants()); + UnicodeSet modsWithVSAndDefaultEmoji = + new UnicodeSet(modsWithVS) + .removeAll(EmojiData.EMOJI_DATA.getEmojiPresentationSet()); show("basesWithVSAndDefaultEmoji", modsWithVSAndDefaultEmoji); - UnicodeSet modsWithVSAndDefaultText = new UnicodeSet(modsWithVS) - .retainAll(EmojiData.EMOJI_DATA.getTextPresentationSet()); + UnicodeSet modsWithVSAndDefaultText = + new UnicodeSet(modsWithVS).retainAll(EmojiData.EMOJI_DATA.getTextPresentationSet()); show("basesWithVSAndDefaultText", modsWithVSAndDefaultText); } private static void show(String prop, UnicodeSet unicodeSet) { - System.out.println("# All omitted code points have " + prop + "=No\n" - + "# @missing: 0000..10FFFF ; " + prop + " ; No\n"); + System.out.println( + "# All omitted code points have " + + prop + + "=No\n" + + "# @missing: 0000..10FFFF ; " + + prop + + " ; No\n"); for (String s : unicodeSet) { - System.out.println(Utility.hex(s) + " ;\t" + prop - + "\t# " + getAge(s) - + " (" + s + ") " - + getName(s)); + System.out.println( + Utility.hex(s) + + " ;\t" + + prop + + "\t# " + + getAge(s) + + " (" + + s + + ") " + + getName(s)); } - System.out.println("# total:\t" + unicodeSet.size() + "\n# uset: \t" + unicodeSet.toPattern(false) + "\n"); + System.out.println( + "# total:\t" + + unicodeSet.size() + + "\n# uset: \t" + + unicodeSet.toPattern(false) + + "\n"); } private static String getAge(String s) { @@ -78,12 +98,12 @@ private static String getAge(String s) { } private static String getName(String s) { - return CldrUtility.ifNull(CD.getName(s),names.get(s)); + return CldrUtility.ifNull(CD.getName(s), names.get(s)); } public void test(String[] args) { final ToolUnicodePropertySource pSource = ToolUnicodePropertySource.make(null); - final Map,UnicodeSet> data = new TreeMap(); + final Map, UnicodeSet> data = new 
TreeMap(); final UnicodeProperty subhead = pSource.getProperty("subhead"); final UnicodeProperty name = pSource.getProperty("name"); final UnicodeProperty block = pSource.getProperty("block"); @@ -95,31 +115,37 @@ public void test(String[] args) { final UnicodeProperty dt = pSource.getProperty("dt"); final UnicodeProperty alphabetic = pSource.getProperty("alphabetic"); - final UnicodeSet source1 = new UnicodeSet() - .addAll(pSource.getSet("gc=Lm")) - .addAll(pSource.getSet("dt=super")) - .addAll(pSource.getSet("dt=sub")) - .freeze(); + final UnicodeSet source1 = + new UnicodeSet() + .addAll(pSource.getSet("gc=Lm")) + .addAll(pSource.getSet("dt=super")) + .addAll(pSource.getSet("dt=sub")) + .freeze(); final RegexMatcher regexMatcher = new RegexMatcher(); - final UnicodeSet source2 = new UnicodeSet() - // .addAll(subhead.getSet(regexMatcher.set("(?i)enclosed"))) - .addAll(block.getSet(regexMatcher.set("(?i)enclosed"))) - .addAll(name.getSet(regexMatcher.set("(?i)(circled|parenthesized|squared)"))) - .addAll(dt.getSet("Circled")) - .removeAll(gc.getSet("Cn")) - .removeAll(gc.getSet("sm")) - .removeAll(gc.getSet("Lo")) - .removeAll(gc.getSet("Po")) - .freeze(); + final UnicodeSet source2 = + new UnicodeSet() + // .addAll(subhead.getSet(regexMatcher.set("(?i)enclosed"))) + .addAll(block.getSet(regexMatcher.set("(?i)enclosed"))) + .addAll( + name.getSet( + regexMatcher.set("(?i)(circled|parenthesized|squared)"))) + .addAll(dt.getSet("Circled")) + .removeAll(gc.getSet("Cn")) + .removeAll(gc.getSet("sm")) + .removeAll(gc.getSet("Lo")) + .removeAll(gc.getSet("Po")) + .freeze(); // [:subhead=/(?i)Enclosed/:][:name=/CIRCLED/:][:name=/PARENTHESIZED/:]\p{name=/(?i)squared/}\p{block=/(?i)enclosed/}-\p{cn}-\p{sm}-\p{Lo}-\p{Po} final UnicodeSet source = args.length == 0 ? 
source1 : source2; final UnicodeSet okScripts = new UnicodeSet(); - for (final String okScript : Arrays.asList("Zyyy", "Armn", "Copt", "Dest", "Glag", "Cyrl", "Grek", "Geor", "Latn")) { + for (final String okScript : + Arrays.asList( + "Zyyy", "Armn", "Copt", "Dest", "Glag", "Cyrl", "Grek", "Geor", "Latn")) { okScripts.addAll(script.getSet(okScript)); } okScripts.freeze(); @@ -128,20 +154,17 @@ public void test(String[] args) { final int ch = s.codePointAt(0); final String nfkdForm = Default.nfkd().normalize(s); final String scriptValue = getScript(s, script); - if (okScripts != null && !okScripts.containsSome(s) && !okScripts.containsSome(nfkdForm)) { + if (okScripts != null + && !okScripts.containsSome(s) + && !okScripts.containsSome(nfkdForm)) { continue; } final String dscriptValue = getScript(nfkdForm, script); final String casing = alphabeticStatus(s, lower, upper, alphabetic); final String dcasing = alphabeticStatus(nfkdForm, lower, upper, alphabetic); - final R5 row = Row.of( - scriptValue, - dscriptValue, - casing, - dcasing, - gc.getValue(ch,true) - ); + final R5 row = + Row.of(scriptValue, dscriptValue, casing, dcasing, gc.getValue(ch, true)); UnicodeSet us = data.get(row); if (us == null) { data.put(row, us = new UnicodeSet()); @@ -149,40 +172,62 @@ public void test(String[] args) { us.add(ch); } - final PrintWriter log = Utility.openPrintWriter(Settings.Output.GEN_DIR, "showChars.html", Utility.UTF8_WINDOWS); + final PrintWriter log = + Utility.openPrintWriter( + Settings.Output.GEN_DIR, "showChars.html", Utility.UTF8_WINDOWS); log.println(""); log.println(""); - log.println("" - +"" - +"" - +"" - +"" - +"" - +"" - +"" - +"" - +"" - +"" - ); + log.println( + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + ""); int count = 0; - for (final Entry, UnicodeSet> entry : data.entrySet()) { + for (final Entry, UnicodeSet> entry : + data.entrySet()) { final R5 row = entry.getKey(); - for (final UnicodeSetIterator it = new 
UnicodeSetIterator(entry.getValue()); it.nextRange();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(entry.getValue()); + it.nextRange(); ) { final R3 chardata = getCharData(it); final int start = count; count += (it.codepointEnd - it.codepoint + 1); - log.println("" - +"" - +"" - +"" - +"" - +"" - +"" - +"" - +"" - +"" - +"" - ); + log.println( + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + ""); } } log.println("
CountSCdSCL/U/AdL/U/AGCCodeCharName
CountSCdSCL/U/AdL/U/AGCCodeCharName
'"+(count-start)+""+row.get0()+""+row.get1()+""+row.get2()+""+row.get3()+""+row.get4()+"'"+chardata.get0()+""+chardata.get1()+""+chardata.get2()+"
'" + + (count - start) + + "" + + row.get0() + + "" + + row.get1() + + "" + + row.get2() + + "" + + row.get3() + + "" + + row.get4() + + "'" + + chardata.get0() + + "" + + chardata.get1() + + "" + + chardata.get2() + + "
"); @@ -195,7 +240,7 @@ public void test(String[] args) { private static String getScript(String string, UnicodeProperty script) { final Set results = new HashSet(); for (final int ch : With.codePointArray(string)) { - results.add( script.getValue(ch,true)); + results.add(script.getValue(ch, true)); } // remove common. If results are unique, return. Otherwise return common results.remove("Zyyy"); @@ -203,21 +248,24 @@ private static String getScript(String string, UnicodeProperty script) { return results.size() == 1 ? results.iterator().next() : "Zyyy"; } - - private static String alphabeticStatus(String string, UnicodeProperty lower, UnicodeProperty upper, UnicodeProperty alphabetic) { + private static String alphabeticStatus( + String string, + UnicodeProperty lower, + UnicodeProperty upper, + UnicodeProperty alphabetic) { final Set results = new HashSet(); for (final int ch : With.codePointArray(string)) { boolean didOne = false; - if (lower.getValue(ch,true).equals("Y")) { + if (lower.getValue(ch, true).equals("Y")) { results.add("l"); didOne = true; } - if (upper.getValue(ch,true).equals("Y")) { + if (upper.getValue(ch, true).equals("Y")) { results.add("U"); didOne = true; } - if (alphabetic.getValue(ch,true).equals("Y")) { + if (alphabetic.getValue(ch, true).equals("Y")) { results.add("A"); didOne = true; } @@ -231,16 +279,18 @@ private static String alphabeticStatus(String string, UnicodeProperty lower, Uni private static R3 getCharData(UnicodeSetIterator it) { if (it.codepoint == it.codepointEnd) { - return Row.of(Utility.hex(it.codepoint), + return Row.of( + Utility.hex(it.codepoint), UTF16.valueOf(it.codepoint), Default.ucd().getName(it.codepoint)); } else { final String sep = it.codepointEnd == it.codepoint + 1 ? 
"," : "…"; - return Row.of(Utility.hex(it.codepoint) + sep + Utility.hex(it.codepointEnd), + return Row.of( + Utility.hex(it.codepoint) + sep + Utility.hex(it.codepointEnd), UTF16.valueOf(it.codepoint) + sep + UTF16.valueOf(it.codepointEnd), - Default.ucd().getName(it.codepoint) + sep + Default.ucd().getName(it.codepointEnd) - ); + Default.ucd().getName(it.codepoint) + + sep + + Default.ucd().getName(it.codepointEnd)); } - } } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/ShowPatternSyntax.java b/unicodetools/src/main/java/org/unicode/text/tools/ShowPatternSyntax.java index 8186db510..a5403d9e1 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/ShowPatternSyntax.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/ShowPatternSyntax.java @@ -1,17 +1,16 @@ package org.unicode.text.tools; +import com.ibm.icu.text.UnicodeSet; import org.unicode.props.BagFormatter; import org.unicode.text.UCD.ToolUnicodePropertySource; -import com.ibm.icu.text.UnicodeSet; - public class ShowPatternSyntax { public static void main(String[] args) { final ToolUnicodePropertySource source = ToolUnicodePropertySource.make(""); final UnicodeSet syntax = source.getSet("Pattern_Syntax=true"); final UnicodeSet unassigned = source.getSet("gc=Cn"); final UnicodeSet unassignedSyntax = new UnicodeSet(syntax).retainAll(unassigned); - //UnicodeSet spanned = Utility.addDontCareSpans(unassignedSyntax, syntax); + // UnicodeSet spanned = Utility.addDontCareSpans(unassignedSyntax, syntax); final BagFormatter bf = new BagFormatter(); bf.setLabelSource(source.getProperty("Block")); bf.setNameSource(null); diff --git a/unicodetools/src/main/java/org/unicode/text/tools/ShowUnicodeGrowth.java b/unicodetools/src/main/java/org/unicode/text/tools/ShowUnicodeGrowth.java index a60326ef7..d07a2f94b 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/ShowUnicodeGrowth.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/ShowUnicodeGrowth.java @@ -1,8 +1,8 @@ 
package org.unicode.text.tools; +import com.ibm.icu.text.UnicodeSet; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.util.Counter; import org.unicode.text.UCD.Default; import org.unicode.text.UCD.TestData; @@ -10,12 +10,25 @@ import org.unicode.text.UCD.UCD; import org.unicode.text.UCD.UCD_Types; -import com.ibm.icu.text.UnicodeSet; - public class ShowUnicodeGrowth { private static final int LATEST = UCD_Types.AGE_VERSIONS.length - 1; - enum Type {format, whitespace, number, punctuation, symbol, mark, hangul, han, other_letter, surrogate, private_use, noncharacter, unassigned}; + enum Type { + format, + whitespace, + number, + punctuation, + symbol, + mark, + hangul, + han, + other_letter, + surrogate, + private_use, + noncharacter, + unassigned + }; + static UCD ucd = UCD.make(UCD_Types.AGE_VERSIONS[LATEST]); public static void main(String[] args) { @@ -55,14 +68,16 @@ static void foo() { final Counter counter = new Counter(); final ToolUnicodePropertySource toolSource52 = ToolUnicodePropertySource.make("5.2.0"); final ToolUnicodePropertySource toolSource51 = ToolUnicodePropertySource.make("5.1.0"); - final UnicodeSet current = new UnicodeSet(toolSource51.getSet("gc=cn")).removeAll(toolSource52.getSet("gc=cn")); + final UnicodeSet current = + new UnicodeSet(toolSource51.getSet("gc=cn")) + .removeAll(toolSource52.getSet("gc=cn")); for (final String s : current) { final int cp = s.codePointAt(0); final String script = Default.ucd().getScriptID(cp); if (script.equals("COMMON") || script.equals("INHERITED")) { final String category = Default.ucd().getCategoryID(cp); - counter.add(""+category.charAt(0) + "*", 1); + counter.add("" + category.charAt(0) + "*", 1); continue; } counter.add(script, 1); @@ -85,54 +100,54 @@ private static Type getType(int i) { return Type.noncharacter; } else { switch (ucd.getCategory(i)) { - case UCD_Types.UNASSIGNED: - case UCD_Types.UPPERCASE_LETTER: - case UCD_Types.LOWERCASE_LETTER: - case 
UCD_Types.TITLECASE_LETTER: - case UCD_Types.MODIFIER_LETTER: - case UCD_Types.OTHER_LETTER: - final short script = ucd.getScript(i); - if (script == UCD_Types.HANGUL_SCRIPT) { - return Type.hangul; - } else if (script == UCD_Types.HAN_SCRIPT) { - return Type.han; - } else { - return Type.other_letter; - } + case UCD_Types.UNASSIGNED: + case UCD_Types.UPPERCASE_LETTER: + case UCD_Types.LOWERCASE_LETTER: + case UCD_Types.TITLECASE_LETTER: + case UCD_Types.MODIFIER_LETTER: + case UCD_Types.OTHER_LETTER: + final short script = ucd.getScript(i); + if (script == UCD_Types.HANGUL_SCRIPT) { + return Type.hangul; + } else if (script == UCD_Types.HAN_SCRIPT) { + return Type.han; + } else { + return Type.other_letter; + } - case UCD_Types.NON_SPACING_MARK: - case UCD_Types.ENCLOSING_MARK: - case UCD_Types.COMBINING_SPACING_MARK: - return Type.mark; - case UCD_Types.DECIMAL_DIGIT_NUMBER: - case UCD_Types.LETTER_NUMBER: - case UCD_Types.OTHER_NUMBER: - return Type.number; - case UCD_Types.SPACE_SEPARATOR: - case UCD_Types.LINE_SEPARATOR: - case UCD_Types.PARAGRAPH_SEPARATOR: - return Type.whitespace; - case UCD_Types.CONTROL: - case UCD_Types.FORMAT: - return Type.format; - case UCD_Types.UNUSED_CATEGORY: - case UCD_Types.PRIVATE_USE: - return Type.private_use; - case UCD_Types.SURROGATE: - return Type.surrogate; - case UCD_Types.DASH_PUNCTUATION: - case UCD_Types.START_PUNCTUATION: - case UCD_Types.END_PUNCTUATION: - case UCD_Types.CONNECTOR_PUNCTUATION: - case UCD_Types.OTHER_PUNCTUATION: - case UCD_Types.INITIAL_PUNCTUATION: - case UCD_Types.FINAL_PUNCTUATION: - return Type.punctuation; - case UCD_Types.MATH_SYMBOL: - case UCD_Types.CURRENCY_SYMBOL: - case UCD_Types.MODIFIER_SYMBOL: - case UCD_Types.OTHER_SYMBOL: - return Type.symbol; + case UCD_Types.NON_SPACING_MARK: + case UCD_Types.ENCLOSING_MARK: + case UCD_Types.COMBINING_SPACING_MARK: + return Type.mark; + case UCD_Types.DECIMAL_DIGIT_NUMBER: + case UCD_Types.LETTER_NUMBER: + case UCD_Types.OTHER_NUMBER: + return 
Type.number; + case UCD_Types.SPACE_SEPARATOR: + case UCD_Types.LINE_SEPARATOR: + case UCD_Types.PARAGRAPH_SEPARATOR: + return Type.whitespace; + case UCD_Types.CONTROL: + case UCD_Types.FORMAT: + return Type.format; + case UCD_Types.UNUSED_CATEGORY: + case UCD_Types.PRIVATE_USE: + return Type.private_use; + case UCD_Types.SURROGATE: + return Type.surrogate; + case UCD_Types.DASH_PUNCTUATION: + case UCD_Types.START_PUNCTUATION: + case UCD_Types.END_PUNCTUATION: + case UCD_Types.CONNECTOR_PUNCTUATION: + case UCD_Types.OTHER_PUNCTUATION: + case UCD_Types.INITIAL_PUNCTUATION: + case UCD_Types.FINAL_PUNCTUATION: + return Type.punctuation; + case UCD_Types.MATH_SYMBOL: + case UCD_Types.CURRENCY_SYMBOL: + case UCD_Types.MODIFIER_SYMBOL: + case UCD_Types.OTHER_SYMBOL: + return Type.symbol; } } return null; diff --git a/unicodetools/src/main/java/org/unicode/text/tools/SimplifiedAndTraditional.java b/unicodetools/src/main/java/org/unicode/text/tools/SimplifiedAndTraditional.java index 8ec2900d0..ee2e257e0 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/SimplifiedAndTraditional.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/SimplifiedAndTraditional.java @@ -1,5 +1,11 @@ package org.unicode.text.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.ULocale; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; @@ -9,7 +15,6 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.util.UnicodeSetPrettyPrinter; import org.unicode.cldr.util.XEquivalenceClass; import org.unicode.props.IndexUnicodeProperties; @@ -17,13 +22,6 @@ import org.unicode.text.UCD.Default; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.UTF16; 
-import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.ULocale; - public class SimplifiedAndTraditional { public static void main(String[] args) { new SimplifiedAndTraditional().showSimpVsTrad(args); @@ -44,7 +42,7 @@ private UnicodeSet showKeyset(String propertyName, UnicodeSet filter) { } private void run3(String[] args) { - final Map mandarin = new TreeMap<>(); + final Map mandarin = new TreeMap<>(); IndexUnicodeProperties iup = IndexUnicodeProperties.make(Default.ucd().getVersionInfo()); final UnicodeMap mand = iup.load(UcdProperty.kMandarin); for (final String value : (Collection) mand.getAvailableValues()) { @@ -70,8 +68,10 @@ private void run3(String[] args) { } static UnicodeSetPrettyPrinter PRETTY = - new UnicodeSetPrettyPrinter().setOrdering(Collator.getInstance(ULocale.ROOT)). - setSpaceComparator(Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY)); + new UnicodeSetPrettyPrinter() + .setOrdering(Collator.getInstance(ULocale.ROOT)) + .setSpaceComparator( + Collator.getInstance(ULocale.ROOT).setStrength2(Collator.PRIMARY)); private void showSimpVsTrad(String[] args) { IndexUnicodeProperties iup = IndexUnicodeProperties.make(Default.ucd().getVersionInfo()); @@ -83,39 +83,49 @@ private void showSimpVsTrad(String[] args) { final UnicodeSet overlap = new UnicodeSet(simpOnly).retainAll(tradOnly); simpOnly.removeAll(overlap); tradOnly.removeAll(overlap); - System.out.println("UnicodeSet simpOnly = new UnicodeSet(\"" + simpOnly.toPattern(false) + "\");"); - System.out.println("UnicodeSet tradOnly = new UnicodeSet(\"" + tradOnly.toPattern(false) + "\");"); + System.out.println( + "UnicodeSet simpOnly = new UnicodeSet(\"" + simpOnly.toPattern(false) + "\");"); + System.out.println( + "UnicodeSet tradOnly = new UnicodeSet(\"" + tradOnly.toPattern(false) + "\");"); final XEquivalenceClass equivalences = new XEquivalenceClass("?"); System.out.println("*** Data Problems ***"); System.out.println(); - 
for (final UnicodeSetIterator it = new UnicodeSetIterator(simp2trad.keySet()); it.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(simp2trad.keySet()); + it.next(); ) { final String source = it.getString(); final String targetOptions = getVariant(simp2trad, it.codepoint); int cp; for (int i = 0; i < targetOptions.length(); i += UTF16.getCharCount(cp)) { - String target = new StringBuilder().appendCodePoint(cp = UTF16.charAt(targetOptions,i)).toString(); + String target = + new StringBuilder() + .appendCodePoint(cp = UTF16.charAt(targetOptions, i)) + .toString(); if (source.equals(target)) { target = target + "*"; } equivalences.add(source, target, "→T", "T←"); } } - for (final UnicodeSetIterator it = new UnicodeSetIterator(trad2simp.keySet()); it.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(trad2simp.keySet()); + it.next(); ) { final String source = it.getString(); final String targetOptions = getVariant(trad2simp, it.codepoint); int cp; for (int i = 0; i < targetOptions.length(); i += UTF16.getCharCount(cp)) { - final String target = new StringBuilder().appendCodePoint(cp = UTF16.charAt(targetOptions,i)).toString(); + final String target = + new StringBuilder() + .appendCodePoint(cp = UTF16.charAt(targetOptions, i)) + .toString(); String source2 = source; if (source.equals(target)) { source2 = source2 + "*"; } equivalences.add(source2, target, "→S", "S←"); } - //equivalences.add(it.getString(), getVariant(trad2simp, it.codepoint), "→S", "S←"); + // equivalences.add(it.getString(), getVariant(trad2simp, it.codepoint), "→S", "S←"); } System.out.println("*** Simple Pairs ***"); @@ -129,7 +139,8 @@ private void showSimpVsTrad(String[] args) { continue; } final ArrayList list = new ArrayList(equivSet); - final String reasonString = equivalences.getReasons(list.get(0), list.get(1)).toString(); + final String reasonString = + equivalences.getReasons(list.get(0), list.get(1)).toString(); // S↔T if 
(reasonString.equals("[[[S←, →T]]]")) { System.out.println(list.get(0) + "\tS↔T\t" + list.get(1)); @@ -172,9 +183,9 @@ private void showSimpVsTrad(String[] args) { continue; } String reasonString = reason.toString(); - reasonString = reasonString.substring(1,reasonString.length()-1); + reasonString = reasonString.substring(1, reasonString.length() - 1); if (item2.endsWith("*")) { - item2 = item2.substring(0,item2.length()-1); + item2 = item2.substring(0, item2.length() - 1); } String line; if (reasonString.equals("S←, →T")) { @@ -214,17 +225,18 @@ private void showSimpVsTrad(String[] args) { final UnicodeSet simpAndTrad = new UnicodeSet(simp).retainAll(trad); System.out.println("Characters that are both Simp & Trad: " + PRETTY.format(simpAndTrad)); System.out.println(); - System.out.println("Characters that are both Simp & Trad - Dual: " + PRETTY.format(simpAndTrad.removeAll(dual))); + System.out.println( + "Characters that are both Simp & Trad - Dual: " + + PRETTY.format(simpAndTrad.removeAll(dual))); - if (true) - { + if (true) { return; // ============================== } System.out.println("x →T y & x →S z"); final UnicodeSet both = new UnicodeSet(simp2trad.keySet()).retainAll(trad2simp.keySet()); - for (final UnicodeSetIterator it = new UnicodeSetIterator(both); it.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(both); it.next(); ) { System.out.println(it.getString() + "\t→T\t" + getVariant(simp2trad, it.codepoint)); System.out.println(it.getString() + "\t→S\t" + getVariant(trad2simp, it.codepoint)); System.out.println(); @@ -276,7 +288,8 @@ private Set subtractStar(Set equivSet) { return result; } - private String showLine(String item, String relation, String item2, UnicodeSet simp, UnicodeSet trad) { + private String showLine( + String item, String relation, String item2, UnicodeSet simp, UnicodeSet trad) { String line; line = (item + relation + item2); simp.addAll(item); @@ -284,9 +297,15 @@ private String showLine(String item, String 
relation, String item2, UnicodeSet s return line; } - private void addItems(UnicodeMap simp2trad, UnicodeMap trad2simp, List output, - Set seen, boolean isTrad2Simp, Set buffered) { - for (final UnicodeSetIterator it = new UnicodeSetIterator(simp2trad.keySet()); it.next();) { + private void addItems( + UnicodeMap simp2trad, + UnicodeMap trad2simp, + List output, + Set seen, + boolean isTrad2Simp, + Set buffered) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(simp2trad.keySet()); + it.next(); ) { final String string = it.getString(); if (seen.contains(string)) { continue; diff --git a/unicodetools/src/main/java/org/unicode/text/tools/StringTree.java b/unicodetools/src/main/java/org/unicode/text/tools/StringTree.java index be09d21cd..ca1018357 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/StringTree.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/StringTree.java @@ -1,5 +1,12 @@ package org.unicode.text.tools; +import com.google.common.collect.LinkedHashMultimap; +import com.google.common.collect.Multimap; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.dev.util.UnicodeMap.EntryRange; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -12,43 +19,46 @@ import java.util.Map.Entry; import java.util.Set; import java.util.regex.Pattern; - import org.unicode.cldr.util.XEquivalenceMap; import org.unicode.tools.emoji.EmojiData; -import com.google.common.collect.LinkedHashMultimap; -import com.google.common.collect.Multimap; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.dev.util.UnicodeMap.EntryRange; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - - public class StringTree { - static final Transliterator SHOW = Transliterator.createFromRules("foo", 
"([[:c:][:z:][:di:][:M:]-[\\ \\x0A]]) > &hex/perl($1);", Transliterator.FORWARD); + static final Transliterator SHOW = + Transliterator.createFromRules( + "foo", + "([[:c:][:z:][:di:][:M:]-[\\ \\x0A]]) > &hex/perl($1);", + Transliterator.FORWARD); public abstract static class CPNode> implements Iterable> { public static final int NO_VALUE = Integer.MIN_VALUE; - /** Not complete comparator !! **/ - public static final Comparator DEPTH_FIRST = new Comparator() { - @Override - public int compare(CPNode o1, CPNode o2) { - return o1.depth() - o2.depth(); - } - }; + /** Not complete comparator !! * */ + public static final Comparator DEPTH_FIRST = + new Comparator() { + @Override + public int compare(CPNode o1, CPNode o2) { + return o1.depth() - o2.depth(); + } + }; + + public abstract int getValue(); - abstract public int getValue(); public boolean hasValue() { return getValue() != NO_VALUE; } - abstract public boolean childless(); - abstract public int childCount(); - abstract public Iterator> iterator(); - abstract public Set values(); - abstract public UnicodeSet getSet(T value); - abstract public T get(int cp); - abstract public int depth(); + + public abstract boolean childless(); + + public abstract int childCount(); + + public abstract Iterator> iterator(); + + public abstract Set values(); + + public abstract UnicodeSet getSet(T value); + + public abstract T get(int cp); + + public abstract int depth(); @Override public String toString() { @@ -75,59 +85,63 @@ void toString(StringBuilder result, String indent) { if (!entry.value.childless()) { entry.value.toString(result, indent + " "); } - } + } } + public abstract UnicodeSet keySet(); } - static final Comparator CPNODE_COMPARATOR = new Comparator() { - @Override - public int compare(CPNode o1, CPNode o2) { - if (o1 == o2) { - return 0; - } - int diff; + static final Comparator CPNODE_COMPARATOR = + new Comparator() { + @Override + public int compare(CPNode o1, CPNode o2) { + if (o1 == o2) { + return 0; + } + 
int diff; - if (0 != (diff = o1.getValue() - o2.getValue())) { - return diff; - } - int childCount1 = o1.childCount(); - if (0 != (diff = childCount1 - o2.childCount())) { - return diff; - } - if (childCount1 == 0) { - return 0; - } - for (Iterator> it1 = o1.iterator(), it2 = o2.iterator(); - it1.hasNext();) { - EntryRange range1 = it1.next(), range2 = it2.next(); - // This is more complicated. - // We treat null values as less than everything else. - // so the key is that when we find the lowest range - if (0 != (range1.codepoint - range2.codepoint)) { - return diff; - } - if (0 != (this.compare(range1.value, range2.value))) { - return diff; - } - // this is more complicated. So that it works properly we have to - // compare the value at minEnd+1 - if (0 != (range2.codepointEnd - range1.codepointEnd)) { - return diff; + if (0 != (diff = o1.getValue() - o2.getValue())) { + return diff; + } + int childCount1 = o1.childCount(); + if (0 != (diff = childCount1 - o2.childCount())) { + return diff; + } + if (childCount1 == 0) { + return 0; + } + for (Iterator> it1 = o1.iterator(), it2 = o2.iterator(); + it1.hasNext(); ) { + EntryRange range1 = it1.next(), range2 = it2.next(); + // This is more complicated. + // We treat null values as less than everything else. + // so the key is that when we find the lowest range + if (0 != (range1.codepoint - range2.codepoint)) { + return diff; + } + if (0 != (this.compare(range1.value, range2.value))) { + return diff; + } + // this is more complicated. 
So that it works properly we have to + // compare the value at minEnd+1 + if (0 != (range2.codepointEnd - range1.codepointEnd)) { + return diff; + } + } + return 0; } - } - return 0; - } - }; + }; static class CpWrapper { - final public CPNode item; + public final CPNode item; + public CpWrapper(CPNode item) { this.item = item; } + @Override public boolean equals(Object obj) { - return equal(item, ((CpWrapper)obj).item); + return equal(item, ((CpWrapper) obj).item); } @Override @@ -155,7 +169,7 @@ public static boolean equal(CPNode o1, CPNode o2) { if (!set1.equals(set2)) { return false; } - for (UnicodeSetIterator range1 = new UnicodeSetIterator(set1); range1.next();) { + for (UnicodeSetIterator range1 = new UnicodeSetIterator(set1); range1.next(); ) { CPNode value1 = o1.get(range1.codepoint); for (int cp = range1.codepoint; cp <= range1.codepointEnd; ++cp) { if (!equal(value1, o2.get(cp))) { @@ -170,7 +184,7 @@ public static boolean equal(CPNode o1, CPNode o2) { static class ImmutableCPNode extends CPNode { static final UnicodeMap EMPTY = new UnicodeMap().freeze(); final int stringValue; - final private UnicodeMap data; + private final UnicodeMap data; private ImmutableCPNode(int stringValue, UnicodeMap data) { super(); @@ -179,10 +193,11 @@ private ImmutableCPNode(int stringValue, UnicodeMap data) { } public static ImmutableCPNode copy(CPNodeBuilder source) { - return copy(source, new HashMap()); + return copy(source, new HashMap()); } - public static ImmutableCPNode copy(CPNodeBuilder source, Map cache) { + public static ImmutableCPNode copy( + CPNodeBuilder source, Map cache) { // need special map CpWrapper wrapped = new CpWrapper(source); ImmutableCPNode result = cache.get(wrapped); @@ -253,11 +268,11 @@ public int depth() { public UnicodeSet keySet() { return data.keySet(); } + @Override public ImmutableCPNode get(int cp) { return data.get(cp); } - } static class CPNodeBuilder extends CPNode { @@ -346,10 +361,12 @@ public int depth() { } return 1 + childMax; 
} + @Override public UnicodeSet keySet() { return data.keySet(); } + @Override public CPNodeBuilder get(int cp) { return data.get(cp); @@ -426,13 +443,13 @@ private static void getRegex(CPNode baseNode, StringBuilder result) { } } - static private void addCodePoint(StringBuilder result, UnicodeSet singles) { + private static void addCodePoint(StringBuilder result, UnicodeSet singles) { if (singles.size() == 1) { addCodePoint(result, singles.charAt(0)); return; } result.append('['); - for (UnicodeSetIterator entry = new UnicodeSetIterator(singles); entry.nextRange();) { + for (UnicodeSetIterator entry = new UnicodeSetIterator(singles); entry.nextRange(); ) { addCodePoint(result, entry.codepoint); if (entry.codepoint != entry.codepointEnd) { result.append('-'); @@ -442,14 +459,14 @@ static private void addCodePoint(StringBuilder result, UnicodeSet singles) { result.append(']'); } - static private StringBuilder addCodePoint(StringBuilder result, int cp) { + private static StringBuilder addCodePoint(StringBuilder result, int cp) { switch (cp) { - case '*' : - case '#' : - case '|' : - case '\\' : - result.append('\\'); - break; + case '*': + case '#': + case '|': + case '\\': + result.append('\\'); + break; } return result.appendCodePoint(cp); } @@ -458,18 +475,20 @@ static private StringBuilder addCodePoint(StringBuilder result, int cp) { public static void main(String[] args) { Collection tests = Arrays.asList("a", "xyz", "bc", "bce", "bcd", "p", "q", "r"); check(tests); - HashSet tests2 = EmojiData.EMOJI_DATA.getAllEmojiWithoutDefectives().addAllTo(new HashSet()); + HashSet tests2 = + EmojiData.EMOJI_DATA.getAllEmojiWithoutDefectives().addAllTo(new HashSet()); LinkedHashSet tests3 = new LinkedHashSet<>(); LinkedHashSet tests4 = new LinkedHashSet<>(); UnicodeSet starters = new UnicodeSet("[🏴]"); // # - //UnicodeSet starters = new UnicodeSet("[#*0-9©®‼⁉☝⛹✌-✍🕴⛹🖐👻👽✊-✋🎅🏂]"); - tests2.forEach(s -> { - String fixed = EmojiData.EMOJI_DATA.addEmojiVariants(s); - if 
(starters.matchesAt(fixed, 0) >= 0) { - tests3.add(fixed); - } - tests4.add(fixed); - }); + // UnicodeSet starters = new UnicodeSet("[#*0-9©®‼⁉☝⛹✌-✍🕴⛹🖐👻👽✊-✋🎅🏂]"); + tests2.forEach( + s -> { + String fixed = EmojiData.EMOJI_DATA.addEmojiVariants(s); + if (starters.matchesAt(fixed, 0) >= 0) { + tests3.add(fixed); + } + tests4.add(fixed); + }); check(tests3); ImmutableCPNode full = check(tests4); partition(full); @@ -501,15 +520,16 @@ private static ImmutableCPNode check(Collection tests) { static void partition(ImmutableCPNode s) { System.out.println("Partition: "); - UnicodeMap ri = new UnicodeMap() - .putAll(new UnicodeSet("[:regional_indicator:]"), "RI") - .putAll(new UnicodeSet("[:block=tags:]"), "TAG") - .freeze(); + UnicodeMap ri = + new UnicodeMap() + .putAll(new UnicodeSet("[:regional_indicator:]"), "RI") + .putAll(new UnicodeSet("[:block=tags:]"), "TAG") + .freeze(); Object fake = new Object(); Multimap basePartition = LinkedHashMultimap.create(); addPartitions(s, ri, fake, basePartition); XEquivalenceMap, String> partition = new XEquivalenceMap<>(); - for ( Entry> entry : basePartition.asMap().entrySet()) { + for (Entry> entry : basePartition.asMap().entrySet()) { partition.add(entry.getKey(), (Set) entry.getValue()); } @@ -528,7 +548,7 @@ static void partition(ImmutableCPNode s) { for (Object target : targets) { String targetString; if (target instanceof ImmutableCPNode) { - String pattern = RegexBuilder.getRegex((ImmutableCPNode)target); + String pattern = RegexBuilder.getRegex((ImmutableCPNode) target); targetString = SHOW.transform(pattern); } else { targetString = target.toString(); @@ -536,21 +556,28 @@ static void partition(ImmutableCPNode s) { targetSet.add(targetString); } - System.out.println(key.size() - + "\t" + targetSet.size() - + "\t" + SHOW.transform(key.toPattern(false)) - + "\t" + targetSet - ); + System.out.println( + key.size() + + "\t" + + targetSet.size() + + "\t" + + SHOW.transform(key.toPattern(false)) + + "\t" + + targetSet); } } - 
private static void addPartitions(ImmutableCPNode s, UnicodeMap ri, Object fake, Multimap basePartition) { + private static void addPartitions( + ImmutableCPNode s, + UnicodeMap ri, + Object fake, + Multimap basePartition) { for (EntryRange entry : s) { for (int cp = entry.codepoint; cp <= entry.codepointEnd; ++cp) { String special = ri.get(cp); if (special != null) { basePartition.put(cp, special); - } else if (entry.value.childless()){ + } else if (entry.value.childless()) { basePartition.put(cp, "TERM"); } else { basePartition.put(cp, entry.value); diff --git a/unicodetools/src/main/java/org/unicode/text/tools/TransformFile.java b/unicodetools/src/main/java/org/unicode/text/tools/TransformFile.java index c95502c05..8555e0d57 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/TransformFile.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/TransformFile.java @@ -1,10 +1,9 @@ package org.unicode.text.tools; -import org.unicode.cldr.draft.FileUtilities; -import org.unicode.tools.emoji.CandidateData; - import com.ibm.icu.text.Transform; import com.ibm.icu.text.Transliterator; +import org.unicode.cldr.draft.FileUtilities; +import org.unicode.tools.emoji.CandidateData; public class TransformFile { public static void main(String[] args) { @@ -18,7 +17,8 @@ public static void main(String[] args) { } rules.append(line); } - Transform trans = Transliterator.createFromRules("foo", rules.toString(), Transliterator.FORWARD); + Transform trans = + Transliterator.createFromRules("foo", rules.toString(), Transliterator.FORWARD); int countChanged = 0; for (String line : FileUtilities.in(class1, sourceFile)) { diff --git a/unicodetools/src/main/java/org/unicode/text/tools/UnicodeSetTree.java b/unicodetools/src/main/java/org/unicode/text/tools/UnicodeSetTree.java index 6a56fcd90..b232d69db 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/UnicodeSetTree.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/UnicodeSetTree.java @@ -1,22 
+1,23 @@ package org.unicode.text.tools; +import com.ibm.icu.text.UnicodeSet; import java.util.Comparator; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; import java.util.TreeMap; -import com.ibm.icu.text.UnicodeSet; - public class UnicodeSetTree { static boolean SHOW = false; static class Node { final UnicodeSet parent; final Set children = new LinkedHashSet<>(); + Node(UnicodeSet set) { parent = set; } + public boolean add(UnicodeSet entry) { return add(new Node(entry)); } @@ -35,9 +36,11 @@ public boolean add(Node entry) { if (!didAdd) { // only add as child if not in any children already if (SHOW) { System.out.println( - (parent == null ? "null" : parent.toPattern(false)) - + "\t" + show(children) - + "\t" + entry.parent.toPattern(false)); + (parent == null ? "null" : parent.toPattern(false)) + + "\t" + + show(children) + + "\t" + + entry.parent.toPattern(false)); } children.add(entry); } @@ -56,22 +59,24 @@ private String show(Set children2) { } return b.toString(); } + @Override public String toString() { return parent + " =>\n" + children; } } - private static final Comparator LONGEST = new Comparator() { - @Override - public int compare(UnicodeSet o1, UnicodeSet o2) { - return o1.compareTo(o2,UnicodeSet.ComparisonStyle.LONGER_FIRST); - } - }; + private static final Comparator LONGEST = + new Comparator() { + @Override + public int compare(UnicodeSet o1, UnicodeSet o2) { + return o1.compareTo(o2, UnicodeSet.ComparisonStyle.LONGER_FIRST); + } + }; final Node base = new Node(new UnicodeSet()); - final Map data = new TreeMap(LONGEST); + final Map data = new TreeMap(LONGEST); static interface Merger { T merge(T a, T b); @@ -87,7 +92,8 @@ UnicodeSetTree add(T name, UnicodeSet set) { T old = data.get(set); if (old != null) { name = merger.merge(old, name); - }; + } + ; data.put(set, name); base.parent.addAll(set); return this; @@ -112,6 +118,7 @@ public T get(UnicodeSet key) { static interface Visitor { public void show(UnicodeSetTree 
tree, Node node, int indent); + public void showRemainder(UnicodeSet remainder, int indent); } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/VerifyIdna.java b/unicodetools/src/main/java/org/unicode/text/tools/VerifyIdna.java index 0c40a1a4f..c397ab3e9 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/VerifyIdna.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/VerifyIdna.java @@ -1,5 +1,11 @@ package org.unicode.text.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.BufferedReader; import java.io.IOException; import java.text.ParsePosition; @@ -7,38 +13,36 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; -import org.unicode.props.BagFormatter; import org.unicode.jsp.ICUPropertyFactory; +import org.unicode.props.BagFormatter; import org.unicode.props.UnicodeProperty; import org.unicode.text.UCD.Default; import org.unicode.text.UCD.ToolUnicodePropertySource; import org.unicode.text.UCD.UCD_Types; import org.unicode.text.utility.Settings; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - public class VerifyIdna { static final boolean USE_ICU = true; // 011B ; PVALID # LATIN SMALL LETTER E WITH CARON - static final Matcher DATALINE = Pattern.compile( - "([0-9a-fA-F]{4,6})" + - "(?:\\.\\.([0-9a-fA-F]{4,6}))?" + - "\\s*;\\s*" + - "(PVALID|DISALLOWED|UNASSIGNED|CONTEXTJ|CONTEXTO)" + - "\\s*#\\s*" + - "(.*)").matcher(""); + static final Matcher DATALINE = + Pattern.compile( + "([0-9a-fA-F]{4,6})" + + "(?:\\.\\.([0-9a-fA-F]{4,6}))?" 
+ + "\\s*;\\s*" + + "(PVALID|DISALLOWED|UNASSIGNED|CONTEXTJ|CONTEXTO)" + + "\\s*#\\s*" + + "(.*)") + .matcher(""); public enum IdnaType { - PVALID, DISALLOWED, UNASSIGNED, CONTEXTJ, CONTEXTO + PVALID, + DISALLOWED, + UNASSIGNED, + CONTEXTJ, + CONTEXTO } public static void main(String[] args) throws IOException { @@ -65,13 +69,13 @@ public static void main(String[] args) throws IOException { System.out.println("\tequal"); } else { System.out.println("Difference in Contents"); - //System.out.println(bf.showSetDifferences("pat", patItems, "alt", altItems)); + // System.out.println(bf.showSetDifferences("pat", patItems, "alt", altItems)); final UnicodeSet patDiffAlt = new UnicodeSet(patItems).removeAll(altItems); System.out.println("\tpat-alt:\n" + bf.showSetNames(patDiffAlt)); final UnicodeSet altDiffPat = new UnicodeSet(altItems).removeAll(patItems); - System.out.println("\talt-pat:\n" + bf.showSetNames(altDiffPat)); - //System.out.println("\tpat:\t" + patItems); - //System.out.println("\talt:\t" + altItems); + System.out.println("\talt-pat:\n" + bf.showSetNames(altDiffPat)); + // System.out.println("\tpat:\t" + patItems); + // System.out.println("\talt:\t" + altItems); } System.out.println(); } @@ -84,34 +88,62 @@ private static UnicodeMap getAlternativeDerivation() { final UnicodeMap result = new UnicodeMap(); final UnicodeSet foo = parseUnicodeSet("[[:gc=cn:]-[:NChar:]]"); - System.out.println("A\t" + parseUnicodeSet("[" + - "[[:gc=Ll:][:gc=Lt:][:gc=Lu:][:gc=Lo:][:gc=Lm:][:gc=Mn:][:gc=Mc:][:gc=Nd:]]" + // A - restrict to only letters, marks, numbers - "]").complement().complement()); + System.out.println( + "A\t" + + parseUnicodeSet( + "[" + + "[[:gc=Ll:][:gc=Lt:][:gc=Lu:][:gc=Lo:][:gc=Lm:][:gc=Mn:][:gc=Mc:][:gc=Nd:]]" + + // A - restrict to only letters, marks, numbers + "]") + .complement() + .complement()); - result.putAll(parseUnicodeSet("[\\u0000-\\U0010FFFF]"), IdnaType.DISALLOWED); // Assume disallowed unless we set otherwise - 
result.putAll(parseUnicodeSet("[[:gc=cn:]-[:NChar:]]"), IdnaType.UNASSIGNED); // J - unassigned code points // -[:NChar:] - //parseUnicodeSet("[[:gc=cn:]]"); - result.putAll(parseUnicodeSet("[" + - "[[:gc=Ll:][:gc=Lt:][:gc=Lu:][:gc=Lo:][:gc=Lm:][:gc=Mn:][:gc=Mc:][:gc=Nd:]]" + // A - restrict to only letters, marks, numbers - "-[[:^isCaseFolded:]]" + // B - minus characters unstable under NFKC & casefolding - "-[:di:]" + // C - minus default-ignorables - "-[[:block=Combining_Diacritical_Marks_for_Symbols:]" + // D minus exceptional block exclusions - "[:block=Musical_Symbols:]" + - "[:block=Ancient_Greek_Musical_Notation:]" + - "[:block=Phaistos_Disc:]]" + // x - "[\\u3007]" + // x - "]"), IdnaType.PVALID); - result.putAll(parseUnicodeSet("[" + - "[\u002D\u00B7\u02B9\u0375\u0483\u05F3\u05F4\u3005\u303B\u30FB]" + // F.2 - exceptional contextual characters - "[:gc=cf:]" + // I - other Cf characters (should be omitted) - "]"), IdnaType.CONTEXTO); - result.putAll(parseUnicodeSet("[:join_control:]"), IdnaType.CONTEXTJ); // H - join controls + result.putAll( + parseUnicodeSet("[\\u0000-\\U0010FFFF]"), + IdnaType.DISALLOWED); // Assume disallowed unless we set otherwise + result.putAll( + parseUnicodeSet("[[:gc=cn:]-[:NChar:]]"), + IdnaType.UNASSIGNED); // J - unassigned code points // -[:NChar:] + // parseUnicodeSet("[[:gc=cn:]]"); + result.putAll( + parseUnicodeSet( + "[" + + "[[:gc=Ll:][:gc=Lt:][:gc=Lu:][:gc=Lo:][:gc=Lm:][:gc=Mn:][:gc=Mc:][:gc=Nd:]]" + + // A - restrict to only letters, marks, numbers + "-[[:^isCaseFolded:]]" + + // B - minus characters unstable under NFKC & casefolding + "-[:di:]" + + // C - minus default-ignorables + "-[[:block=Combining_Diacritical_Marks_for_Symbols:]" + + // D minus exceptional block exclusions + "[:block=Musical_Symbols:]" + + "[:block=Ancient_Greek_Musical_Notation:]" + + "[:block=Phaistos_Disc:]]" + + // x + "[\\u3007]" + + // x + "]"), + IdnaType.PVALID); + result.putAll( + parseUnicodeSet( + "[" + + 
"[\u002D\u00B7\u02B9\u0375\u0483\u05F3\u05F4\u3005\u303B\u30FB]" + + // F.2 - exceptional contextual characters + "[:gc=cf:]" + + // I - other Cf characters (should be omitted) + "]"), + IdnaType.CONTEXTO); + result.putAll(parseUnicodeSet("[:join_control:]"), IdnaType.CONTEXTJ); // H - join controls result.freeze(); return result; } private static UnicodeMap getPatriksMapping() throws IOException { - final BufferedReader in = FileUtilities.openReader(Settings.UnicodeTools.DATA_DIR + "/IDN/", "draft-faltstrom-idnabis-tables-05.txt", "ascii"); + final BufferedReader in = + FileUtilities.openReader( + Settings.UnicodeTools.DATA_DIR + "/IDN/", + "draft-faltstrom-idnabis-tables-05.txt", + "ascii"); boolean inTable = false; final UnicodeMap patrik = new UnicodeMap(); int count = 0; @@ -134,7 +166,9 @@ private static UnicodeMap getPatriksMapping() throws IOException { continue; } line = line.trim(); - if (line.length() == 0 || line.startsWith("Faltstrom") || line.startsWith("Internet-Draft")) { + if (line.length() == 0 + || line.startsWith("Faltstrom") + || line.startsWith("Internet-Draft")) { continue; } // we now have real data @@ -143,8 +177,8 @@ private static UnicodeMap getPatriksMapping() throws IOException { continue; } final int startChar = Integer.parseInt(DATALINE.group(1), 16); - final int endChar = DATALINE.group(2) == null ? startChar : Integer.parseInt(DATALINE - .group(2), 16); + final int endChar = + DATALINE.group(2) == null ? 
startChar : Integer.parseInt(DATALINE.group(2), 16); final IdnaType idnaType = IdnaType.valueOf(DATALINE.group(3)); patrik.putAll(startChar, endChar, idnaType); } @@ -161,16 +195,19 @@ public PropertySymbolTable(UnicodeProperty.Factory unicodePropertyFactory) { this.unicodePropertyFactory = unicodePropertyFactory; final UnicodeProperty gc = unicodePropertyFactory.getProperty("gc"); final UnicodeProperty ideo = unicodePropertyFactory.getProperty("ideographic"); - final UnicodeSet invariant = new UnicodeSet() - .addAll(gc.getSet("cc")) - .addAll(gc.getSet("cn")) - .addAll(gc.getSet("co")) - .addAll(gc.getSet("cs")) - //.addAll(ideo.getSet("t")) - ; + final UnicodeSet invariant = + new UnicodeSet() + .addAll(gc.getSet("cc")) + .addAll(gc.getSet("cn")) + .addAll(gc.getSet("co")) + .addAll(gc.getSet("cs")) + // .addAll(ideo.getSet("t")) + ; isCaseFolded = new UnicodeSet(invariant); - for (final UnicodeSetIterator it = new UnicodeSetIterator(new UnicodeSet(invariant).complement()); it.nextRange();) { + for (final UnicodeSetIterator it = + new UnicodeSetIterator(new UnicodeSet(invariant).complement()); + it.nextRange(); ) { for (int cp = it.codepoint; cp <= it.codepointEnd; ++cp) { final String s = UTF16.valueOf(cp); if (getNormalizedCaseFolded(getNormalizedCaseFolded(s)).equals(s)) { @@ -182,15 +219,17 @@ public PropertySymbolTable(UnicodeProperty.Factory unicodePropertyFactory) { private String getNormalizedCaseFolded(String s) { if (USE_ICU) { - return Normalizer.normalize(UCharacter.foldCase(s,true), Normalizer.COMPOSE_COMPAT); + return Normalizer.normalize( + UCharacter.foldCase(s, true), Normalizer.COMPOSE_COMPAT); } else { - return Default.nfkc().normalize(Default.ucd().getCase(s, UCD_Types.FULL, UCD_Types.FOLD)); + return Default.nfkc() + .normalize(Default.ucd().getCase(s, UCD_Types.FULL, UCD_Types.FOLD)); } } @Override - public boolean applyPropertyAlias(String propertyName, - String propertyValue, UnicodeSet result) { + public boolean applyPropertyAlias( + 
String propertyName, String propertyValue, UnicodeSet result) { // String trimmedPropertyValue = propertyValue.trim(); // if (trimmedPropertyValue.startsWith("/") && trimmedPropertyValue.endsWith("/")) { // Matcher matcher = Pattern.compile( @@ -230,7 +269,8 @@ public boolean applyPropertyAlias(String propertyName, private UnicodeSet getSet(String propertyName, String propertyValue) { return unicodePropertyFactory.getSet(propertyName + "=" + propertyValue); } - }; + } + ; static PropertySymbolTable myXSymbolTable = null; @@ -246,14 +286,18 @@ public static UnicodeSet parseUnicodeSet(String input) { final ParsePosition parsePosition = new ParsePosition(0); final UnicodeSet result = new UnicodeSet(input, parsePosition, myXSymbolTable); if (parsePosition.getIndex() != input.length()) { - throw new IllegalArgumentException("Additional characters past the end of the set, at " - + parsePosition.getIndex() + ", ..." - + input.substring(Math.max(0, parsePosition.getIndex() - 10), parsePosition.getIndex()) - + "|" - + input.substring(parsePosition.getIndex(), Math.min(input.length(), parsePosition.getIndex() + 10)) - ); + throw new IllegalArgumentException( + "Additional characters past the end of the set, at " + + parsePosition.getIndex() + + ", ..." 
+ + input.substring( + Math.max(0, parsePosition.getIndex() - 10), + parsePosition.getIndex()) + + "|" + + input.substring( + parsePosition.getIndex(), + Math.min(input.length(), parsePosition.getIndex() + 10))); } return result; } - } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/VerifyUCD.java b/unicodetools/src/main/java/org/unicode/text/tools/VerifyUCD.java index 74a9d1691..1ba6179fa 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/VerifyUCD.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/VerifyUCD.java @@ -1,5 +1,9 @@ package org.unicode.text.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.IOException; import java.util.Arrays; import java.util.Collection; @@ -9,12 +13,11 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.util.Log; import org.unicode.cldr.util.Tabber; import org.unicode.cldr.util.TransliteratorUtilities; -import org.unicode.props.BagFormatter; import org.unicode.cldr.util.props.UnicodeLabel; +import org.unicode.props.BagFormatter; import org.unicode.props.UnicodeProperty; import org.unicode.text.UCD.Default; import org.unicode.text.UCD.Normalizer; @@ -26,11 +29,6 @@ import org.unicode.tools.Segmenter; import org.unicode.tools.Segmenter.Builder; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - public class VerifyUCD { private static final boolean FULL = false; @@ -38,14 +36,18 @@ public class VerifyUCD { public static void main(String[] args) throws IOException { // System.out.println(new File(UCD_Types.BASE_DIR).getCanonicalPath()); final String x = Default.ucd().getCase("\u0130", UCD_Types.FULL, UCD_Types.LOWER); - final String y = Default.ucd().getCase(Default.nfd().normalize("\u0130"), UCD_Types.FULL, 
UCD_Types.LOWER); + final String y = + Default.ucd() + .getCase( + Default.nfd().normalize("\u0130"), UCD_Types.FULL, UCD_Types.LOWER); Log.setLog(Settings.Output.GEN_DIR + "verifyUCD.html"); - Log.logln(""); + Log.logln( + ""); Log.logln("UCD Canonical Check"); Log.getLog().println("

L2/06-386R2

"); Log.logln("

UCD Canonical Check

"); - //Log.logln("

" + new java.util.Date() + "

"); + // Log.logln("

" + new java.util.Date() + "

"); final String property = System.getProperty("method"); try { @@ -67,7 +69,7 @@ public static void main(String[] args) throws IOException { } } - static final int COMBINING_MASK = (1<*Characters with Bidi_Mirrored=True containing one or more marks*"); - bf.showSetNames(Log.getLog(),contents); + bf.showSetNames(Log.getLog(), contents); } - private static void addMarks(UnicodeSet contents, UnicodeSet marks, int codepoint, Normalizer normalizer) { + private static void addMarks( + UnicodeSet contents, UnicodeSet marks, int codepoint, Normalizer normalizer) { if (marks.containsSome(normalizer.normalize(codepoint))) { contents.add(codepoint); } @@ -108,75 +111,125 @@ public static void doEquivalenceOfProperties(Equivalence equivalence) { Log.logln("

" + equivalence + " Equivalence of Properties

"); final ToolUnicodePropertySource ups = ToolUnicodePropertySource.make(Default.ucdVersion()); - final UnicodeSet setToTest = ups.getSet(equivalence == Equivalence.CANONICAL ? "NFDQuickCheck=No" : "NFKDQuickCheck=No"); + final UnicodeSet setToTest = + ups.getSet( + equivalence == Equivalence.CANONICAL + ? "NFDQuickCheck=No" + : "NFKDQuickCheck=No"); final TreeSet properties = new TreeSet(); - final Set availablePropertyNames = new TreeSet(ups.getAvailableNames(UnicodeProperty.BINARY_MASK - | UnicodeProperty.ENUMERATED_OR_CATALOG_MASK - | (1< removals = new TreeSet(Arrays.asList(new String[] { "Name", - "Unicode_1_Name", "East_Asian_Width", - "IdnOutput", - - "Simple_Case_Folding", - "Simple_Titlecase_Mapping", "Simple_Lowercase_Mapping", - "Simple_Uppercase_Mapping", - /* - "Titlecase_Mapping", "Lowercase_Mapping", - "Uppercase_Mapping", - */ - "Case_Stable", - - "Decomposition_Mapping", - "Age", "Composition_Exclusion", "Canonical_Combining_Class", "Pattern_Syntax", "Pattern_White_Space", "Expands_On_NFC", "Expands_On_NFD", - "Expands_On_NFKC", "Expands_On_NFKD", "Block", "Decomposition_Type", "Deprecated", "Full_Composition_Exclusion", - "NFC_Quick_Check", "Unified_Ideograph", "NFD_Quick_Check", "NFKC_Quick_Check", "NFKD_Quick_Check", - "Other_Alphabetic", "Other_Default_Ignorable_Code_Point", "Other_Grapheme_Extend", "Other_ID_Continue", "Other_ID_Start", "Other_Lowercase", "Other_Math", "Other_Uppercase" - })); + final Set availablePropertyNames = + new TreeSet( + ups.getAvailableNames( + UnicodeProperty.BINARY_MASK + | UnicodeProperty.ENUMERATED_OR_CATALOG_MASK + | (1 << UnicodeProperty.NUMERIC + | UnicodeProperty.STRING_OR_MISC_MASK))); + final Set removals = + new TreeSet( + Arrays.asList( + new String[] { + "Name", + "Unicode_1_Name", + "East_Asian_Width", + "IdnOutput", + "Simple_Case_Folding", + "Simple_Titlecase_Mapping", + "Simple_Lowercase_Mapping", + "Simple_Uppercase_Mapping", + /* + "Titlecase_Mapping", "Lowercase_Mapping", + "Uppercase_Mapping", + 
*/ + "Case_Stable", + "Decomposition_Mapping", + "Age", + "Composition_Exclusion", + "Canonical_Combining_Class", + "Pattern_Syntax", + "Pattern_White_Space", + "Expands_On_NFC", + "Expands_On_NFD", + "Expands_On_NFKC", + "Expands_On_NFKD", + "Block", + "Decomposition_Type", + "Deprecated", + "Full_Composition_Exclusion", + "NFC_Quick_Check", + "Unified_Ideograph", + "NFD_Quick_Check", + "NFKC_Quick_Check", + "NFKD_Quick_Check", + "Other_Alphabetic", + "Other_Default_Ignorable_Code_Point", + "Other_Grapheme_Extend", + "Other_ID_Continue", + "Other_ID_Start", + "Other_Lowercase", + "Other_Math", + "Other_Uppercase" + })); if (equivalence == Equivalence.COMPATIBLITY) { removals.addAll(Arrays.asList("XID_Start", "XID_Continue", "ID_Start", "ID_Continue")); } removals.retainAll(availablePropertyNames); - final UnicodeSet forceNFC = new UnicodeSet() - .addAll(ups.getSet("Hangul_Syllable_Type=LV_Syllable")) - .addAll(ups.getSet("Hangul_Syllable_Type=LVT_Syllable")) - .addAll(ups.getSet("General_Category=Titlecase_Letter")) - .addAll("\u1B3B\u1B3D\u1B43\u0CC0\u0CC7\u0CC8\u0CCA\u0CCB") - ; - final Set singleCharOnly = new TreeSet(Arrays.asList(new String[] { - "ASCII_Hex_Digit", "Hex_Digit", "Bidi_Mirroring_Glyph", "Soft_Dotted"})); - - //System.out.println("Other:\t" + ups.getAvailableNames(UnicodeProperty.STRING_OR_MISC_MASK)); - //removals.addAll(ups.getAvailableNames(UnicodeProperty.STRING_OR_MISC_MASK)); + final UnicodeSet forceNFC = + new UnicodeSet() + .addAll(ups.getSet("Hangul_Syllable_Type=LV_Syllable")) + .addAll(ups.getSet("Hangul_Syllable_Type=LVT_Syllable")) + .addAll(ups.getSet("General_Category=Titlecase_Letter")) + .addAll("\u1B3B\u1B3D\u1B43\u0CC0\u0CC7\u0CC8\u0CCA\u0CCB"); + final Set singleCharOnly = + new TreeSet( + Arrays.asList( + new String[] { + "ASCII_Hex_Digit", + "Hex_Digit", + "Bidi_Mirroring_Glyph", + "Soft_Dotted" + })); + + // System.out.println("Other:\t" + + // ups.getAvailableNames(UnicodeProperty.STRING_OR_MISC_MASK)); + // 
removals.addAll(ups.getAvailableNames(UnicodeProperty.STRING_OR_MISC_MASK)); availablePropertyNames.removeAll(removals); Log.getLog().println(""); - Log.getLog().println(""); + Log.getLog() + .println(""); Log.getLog().println(""); Log.getLog().println("
Testing:" + availablePropertyNames + "
Testing:" + availablePropertyNames + "
Skipping:" + removals + "

"); Log.logln("
"); final UnicodeMap results = new UnicodeMap(); - final Map sidewaysResults = new TreeMap(); + final Map sidewaysResults = new TreeMap(); // http://demo.icu-project.org/icu-bin/ubrowse?go=2224 - for (final UnicodeSetIterator it = new UnicodeSetIterator(setToTest); it.next();) { + for (final UnicodeSetIterator it = new UnicodeSetIterator(setToTest); it.next(); ) { final int codepoint = it.codepoint; - final String normalized = (forceNFC.contains(codepoint) - ? equivalence == Equivalence.CANONICAL ? Default.nfc() : Default.nfkc() - : equivalence == Equivalence.CANONICAL ? Default.nfd() : Default.nfkd()).normalize(codepoint); + final String normalized = + (forceNFC.contains(codepoint) + ? equivalence == Equivalence.CANONICAL + ? Default.nfc() + : Default.nfkc() + : equivalence == Equivalence.CANONICAL + ? Default.nfd() + : Default.nfkd()) + .normalize(codepoint); properties.clear(); for (final String propertyName : availablePropertyNames) { - if (UTF16.hasMoreCodePointsThan(normalized,1) && singleCharOnly.contains(propertyName)) { + if (UTF16.hasMoreCodePointsThan(normalized, 1) + && singleCharOnly.contains(propertyName)) { continue; } final UnicodeProperty up = ups.getProperty(propertyName); - final boolean isStringProp = ((1< sidewaysResults, final Equivalence - equivalence) { - final UnicodeLabel nameLabel = new UnicodeLabel() { - @Override - public String getValue(int codepoint, boolean isShort) { - final String nfd = (equivalence == Equivalence.CANONICAL ? 
Default.nfd() : Default.nfkd()).normalize(codepoint); - return Default.ucd().getCodeAndName(codepoint,UCD_Types.NORMAL,TransliteratorUtilities.toHTMLControl) - + "\t\u2192\t" - + Default.ucd().getCodeAndName(nfd,UCD_Types.NORMAL,TransliteratorUtilities.toHTMLControl); - } - }; + enum Equivalence { + PLAIN, + CANONICAL, + COMPATIBLITY + }; + + private static void showProperties( + UnicodeMap results, + Map sidewaysResults, + final Equivalence equivalence) { + final UnicodeLabel nameLabel = + new UnicodeLabel() { + @Override + public String getValue(int codepoint, boolean isShort) { + final String nfd = + (equivalence == Equivalence.CANONICAL + ? Default.nfd() + : Default.nfkd()) + .normalize(codepoint); + return Default.ucd() + .getCodeAndName( + codepoint, + UCD_Types.NORMAL, + TransliteratorUtilities.toHTMLControl) + + "\t\u2192\t" + + Default.ucd() + .getCodeAndName( + nfd, + UCD_Types.NORMAL, + TransliteratorUtilities.toHTMLControl); + } + }; final BagFormatter bf = new BagFormatter(); bf.setNameSource(nameLabel); bf.setTabber(new Tabber.HTMLTabber()); @@ -239,9 +324,14 @@ public String getValue(int codepoint, boolean isShort) { Log.logln("

" + "By Property" + "

"); for (final String propName : sidewaysResults.keySet()) { final UnicodeMap map = sidewaysResults.get(propName); - bf.setValueSource((new UnicodeProperty.UnicodeMapProperty() { - }).set(map).setMain(propName + "_diff", propName + "_diff", - UnicodeProperty.EXTENDED_STRING, "1.0")); + bf.setValueSource( + (new UnicodeProperty.UnicodeMapProperty() {}) + .set(map) + .setMain( + propName + "_diff", + propName + "_diff", + UnicodeProperty.EXTENDED_STRING, + "1.0")); Log.logln("

" + propName + "

"); Log.logln(bf.showSetNames(map.keySet())); @@ -249,15 +339,25 @@ public String getValue(int codepoint, boolean isShort) { } private static String caseMapping(String source, String propertyName) { - final byte operation = propertyName.contains("Uppercase") ? UCD_Types.UPPER - : propertyName.contains("Lowercase") ? UCD_Types.LOWER - : propertyName.contains("Titlecase") ? UCD_Types.TITLE - : UCD_Types.FOLD; + final byte operation = + propertyName.contains("Uppercase") + ? UCD_Types.UPPER + : propertyName.contains("Lowercase") + ? UCD_Types.LOWER + : propertyName.contains("Titlecase") + ? UCD_Types.TITLE + : UCD_Types.FOLD; final byte style = propertyName.contains("Simple") ? UCD_Types.SIMPLE : UCD_Types.FULL; - return Default.ucd().getCase(source,style, operation); + return Default.ucd().getCase(source, style, operation); } - private static void addPropertyDifference(Map sidewaysResults, TreeSet properties, int codePoint, String propName, Object value1, Object value2) { + private static void addPropertyDifference( + Map sidewaysResults, + TreeSet properties, + int codePoint, + String propName, + Object value1, + Object value2) { properties.add(propName + "=" + value1 + "\u2260" + value2); UnicodeMap umap = sidewaysResults.get(propName); if (umap == null) { @@ -267,7 +367,7 @@ private static void addPropertyDifference(Map sidewaysResult } private static Object getValue(UnicodeProperty up, int codepoint) { - final int type = 1<Characters that break within their NFD form."); - for (final String type : new String[] {"GraphemeClusterBreak", "WordBreak", "LineBreak", "SentenceBreak"}) { - final Builder segBuilder = Segmenter.make(ToolUnicodePropertySource.make(Default.ucdVersion()), type); + for (final String type : + new String[] {"GraphemeClusterBreak", "WordBreak", "LineBreak", "SentenceBreak"}) { + final Builder segBuilder = + Segmenter.make(ToolUnicodePropertySource.make(Default.ucdVersion()), type); final Segmenter seg = segBuilder.make(); // quick test @@ -352,15 
+457,18 @@ static public void checkSegmentation() { final boolean b = seg.breaksAt(nfd, i); if (b) { seg.breaksAt(nfd, i); - Log.logln("

Failure with " + Default.ucd().getCodeAndName(cp) - + " => " + Default.ucd().getCodeAndName(nfd) - + " @ " + i - + "

"); + Log.logln( + "

Failure with " + + Default.ucd().getCodeAndName(cp) + + " => " + + Default.ucd().getCodeAndName(nfd) + + " @ " + + i + + "

"); } } } Log.logln("

Failures: " + failures + "

"); } } - } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/VerifyXmlUcd.java b/unicodetools/src/main/java/org/unicode/text/tools/VerifyXmlUcd.java index a8bdb1de1..2a8dae447 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/VerifyXmlUcd.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/VerifyXmlUcd.java @@ -1,5 +1,8 @@ package org.unicode.text.tools; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -14,7 +17,6 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.util.XMLFileReader; import org.unicode.cldr.util.XPathParts; import org.unicode.jsp.ICUPropertyFactory; @@ -24,10 +26,6 @@ import org.unicode.text.UCD.ToolUnicodePropertySource; import org.unicode.text.utility.Settings; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - public class VerifyXmlUcd { public static final boolean USE_ICU = false; public static final boolean ABBREVIATED = true; @@ -35,14 +33,17 @@ public class VerifyXmlUcd { static Factory getFactory() { if (factory == null) { - factory = USE_ICU ? ICUPropertyFactory.make() : ToolUnicodePropertySource.make(Default.ucdVersion()); + factory = + USE_ICU + ? ICUPropertyFactory.make() + : ToolUnicodePropertySource.make(Default.ucdVersion()); } return factory; } public static void main(String[] args) throws IOException { try { - //checkRegex(); + // checkRegex(); // TODO: There is no xml folder inside .../unicodetools/data/ucd/ // Instead, there is a ucdxml folder parallel to ucd. // Is this class obsolete? 
@@ -63,7 +64,9 @@ private static void checkRegex() { if (patternString == null) { continue; } - final Matcher matcher = Pattern.compile(patternString.replace("\\U0010FFFF","\uDBFF\uDFFF")).matcher(""); + final Matcher matcher = + Pattern.compile(patternString.replace("\\U0010FFFF", "\uDBFF\uDFFF")) + .matcher(""); System.out.format("Testing %s (%s) with /%s/\n", shortName, prop, patternString); try { Collection values = property.getAvailableValues(); @@ -91,36 +94,37 @@ private static void checkRegex() { } } } - static Map PROP_REGEX = new TreeMap(); + + static Map PROP_REGEX = new TreeMap(); + static { final String cp = "[\\u0000-\\U0010FFFF]"; // "[\\x{0}-\\x{10FFFF}]"; final String name = "[A-Z0-9]+(([-\\ ]|\\ -|-\\ )[A-Z0-9]+)*"; final String bname = "[a-zA-Z0-9]+([_\\ ][a-zA-Z0-9]+)*"; - PROP_REGEX.put("nv", "-?[0-9]+\\.[0-9]+"); // * nv ; Numeric_Value - PROP_REGEX.put("bmg", cp); // bmg ; Bidi_Mirroring_Glyph - PROP_REGEX.put("cf", cp+"+"); // cf ; Case_Folding - PROP_REGEX.put("dm", cp+"+"); // dm ; Decomposition_Mapping - PROP_REGEX.put("FC_NFKC", cp+"+"); // FC_NFKC ; FC_NFKC_Closure - PROP_REGEX.put("lc", cp+"+"); // lc ; Lowercase_Mapping - PROP_REGEX.put("scc", cp+"*"); // scc ; Special_Case_Condition - PROP_REGEX.put("sfc", cp); // sfc ; Simple_Case_Folding - PROP_REGEX.put("slc", cp); // slc ; Simple_Lowercase_Mapping - PROP_REGEX.put("stc", cp); // stc ; Simple_Titlecase_Mapping - PROP_REGEX.put("suc", cp); // suc ; Simple_Uppercase_Mapping - PROP_REGEX.put("tc", cp+"+"); // tc ; Titlecase_Mapping - PROP_REGEX.put("uc", cp+"+"); // uc ; Uppercase_Mapping - PROP_REGEX.put("isc", name); // isc ; ISO_Comment - PROP_REGEX.put("na", "("+name + "|\\)?"); // na ; Name - PROP_REGEX.put("na1","("+name + "(\\ \\((CR|FF|LF|NEL)\\))?)?"); // na1 ; Unicode_1_Name - //PROP_REGEX.put("bmg", ".*"); // URS ; Unicode_Radical_Stroke - PROP_REGEX.put("age", "([0-9]+\\.[0-9]|unassigned)"); // age ; Age - PROP_REGEX.put("blk", bname); // blk ; Block - 
PROP_REGEX.put("sc", bname); // sc ; Script + PROP_REGEX.put("nv", "-?[0-9]+\\.[0-9]+"); // * nv ; Numeric_Value + PROP_REGEX.put("bmg", cp); // bmg ; Bidi_Mirroring_Glyph + PROP_REGEX.put("cf", cp + "+"); // cf ; Case_Folding + PROP_REGEX.put("dm", cp + "+"); // dm ; Decomposition_Mapping + PROP_REGEX.put("FC_NFKC", cp + "+"); // FC_NFKC ; FC_NFKC_Closure + PROP_REGEX.put("lc", cp + "+"); // lc ; Lowercase_Mapping + PROP_REGEX.put("scc", cp + "*"); // scc ; Special_Case_Condition + PROP_REGEX.put("sfc", cp); // sfc ; Simple_Case_Folding + PROP_REGEX.put("slc", cp); // slc ; Simple_Lowercase_Mapping + PROP_REGEX.put("stc", cp); // stc ; Simple_Titlecase_Mapping + PROP_REGEX.put("suc", cp); // suc ; Simple_Uppercase_Mapping + PROP_REGEX.put("tc", cp + "+"); // tc ; Titlecase_Mapping + PROP_REGEX.put("uc", cp + "+"); // uc ; Uppercase_Mapping + PROP_REGEX.put("isc", name); // isc ; ISO_Comment + PROP_REGEX.put("na", "(" + name + "|\\)?"); // na ; Name + PROP_REGEX.put( + "na1", + "(" + name + "(\\ \\((CR|FF|LF|NEL)\\))?)?"); // na1 ; Unicode_1_Name + // PROP_REGEX.put("bmg", ".*"); // URS ; Unicode_Radical_Stroke + PROP_REGEX.put("age", "([0-9]+\\.[0-9]|unassigned)"); // age ; Age + PROP_REGEX.put("blk", bname); // blk ; Block + PROP_REGEX.put("sc", bname); // sc ; Script } - - - private static void testFile(String file) throws IOException { System.out.format("\nTesting: %s\n\n", file); final File file2 = new File(file); @@ -131,15 +135,15 @@ private static void testFile(String file) throws IOException { final XMLFileReader xmlFileReader = new XMLFileReader(); final MySimpleHandler handler = new MySimpleHandler(); xmlFileReader.setHandler(handler); - xmlFileReader.read(file,XMLFileReader.CONTENT_HANDLER,false); + xmlFileReader.read(file, XMLFileReader.CONTENT_HANDLER, false); handler.showSummary(); } static Set codepoints = getSet("char", "reserved", "noncharacter", "surrogate"); static Set skipProperties = getSet("cp", "first-cp", "last-cp"); - //static Set hexValue 
= getSet("slc", "bmg", "lc", "stc", "suc", "tc", "uc"); + // static Set hexValue = getSet("slc", "bmg", "lc", "stc", "suc", "tc", "uc"); - static Set getSet(String ... strings) { + static Set getSet(String... strings) { return Collections.unmodifiableSet(new HashSet(Arrays.asList(strings))); } @@ -155,7 +159,7 @@ static class MySimpleHandler extends XMLFileReader.SimpleHandler { private final Set unhandledElements = new HashSet(); MySimpleHandler() { - final String test = factory.getProperty("lb").getValue(0x10EA0,true); + final String test = factory.getProperty("lb").getValue(0x10EA0, true); System.out.println("test: " + test); } @@ -174,12 +178,14 @@ public void showSummary() { core.add(p.getFirstNameAlias()); } showDifference("core", core, "xml_core", accummulatedProperties); - showDifference("blocks", Default.ucd().getBlockNames(), "xml_blocks", accummulatedBlocks); + showDifference( + "blocks", Default.ucd().getBlockNames(), "xml_blocks", accummulatedBlocks); System.out.format("unhandled elements: %s\n", unhandledElements); System.out.format("skipped properties: %s\n", bogusPropertiesSkipped); } - private void showDifference(String title1, Collection core, String title2, Collection accummulatedProperties) { + private void showDifference( + String title1, Collection core, String title2, Collection accummulatedProperties) { final Set core_minus_xml = new HashSet(core); core_minus_xml.removeAll(accummulatedProperties); final Set xml_minus_core = new HashSet(accummulatedProperties); @@ -198,7 +204,8 @@ public void handlePathValue(String path, String value) { return; } if (value.length() != 0) { - throw new IllegalArgumentException(String.format("non-empty value: %s, %s", value, path)); + throw new IllegalArgumentException( + String.format("non-empty value: %s, %s", value, path)); } XPathParts.getFrozenInstance(path); final String finalElement = parser.getElement(-1); @@ -208,42 +215,47 @@ public void handlePathValue(String path, String value) { return; } Map 
attributes = parser.getAttributes(-1); - final Map groupAttributes = parser.getElement(-2).equals("group") ? parser.getAttributes(-2) : null; + final Map groupAttributes = + parser.getElement(-2).equals("group") ? parser.getAttributes(-2) : null; if (groupAttributes != null) { groupAttributes.putAll(attributes); // add, possibly overriding attributes = groupAttributes; } checkAttributes(path, finalElement, attributes); } else if (finalElement.equals("block")) { - final Map attributes = parser.getAttributes(-1); - final int cpStart = Integer.parseInt(attributes.get("first-cp"),16); - final int cpEnd = Integer.parseInt(attributes.get("last-cp"),16); + final Map attributes = parser.getAttributes(-1); + final int cpStart = Integer.parseInt(attributes.get("first-cp"), 16); + final int cpEnd = Integer.parseInt(attributes.get("last-cp"), 16); final String blockName = attributes.get("name"); accummulatedBlocks.add( org.unicode.text.utility.Utility.getUnskeleton(blockName, true)); final UnicodeSet xmlBlock = new UnicodeSet(cpStart, cpEnd); - final UnicodeSet toolBlock = Default.ucd().getBlockSet(blockName, new UnicodeSet()); + final UnicodeSet toolBlock = + Default.ucd().getBlockSet(blockName, new UnicodeSet()); if (!xmlBlock.equals(toolBlock)) { - System.out.format("blocks differ: %s, %s != %s\n", blockName, xmlBlock, toolBlock); + System.out.format( + "blocks differ: %s, %s != %s\n", blockName, xmlBlock, toolBlock); } } else { unhandledElements.add(finalElement); } } catch (final Exception e) { - throw (RuntimeException) new IllegalArgumentException("Failure at: " + path).initCause(e); + throw (RuntimeException) + new IllegalArgumentException("Failure at: " + path).initCause(e); } } private void checkAttributes(String path, String element, Map attributes) { - // get the start and end of the range. Would be slightly simpler if first-cp were just cp. + // get the start and end of the range. Would be slightly simpler if first-cp were just + // cp. 
String cpStartString = attributes.get("cp"); String cpEndString = null; if (cpStartString == null) { cpStartString = attributes.get("first-cp"); cpEndString = attributes.get("last-cp"); } - final int cpStart = Integer.parseInt(cpStartString,16); - int cpEnd = cpEndString == null ? cpStart : Integer.parseInt(cpEndString,16); + final int cpStart = Integer.parseInt(cpStartString, 16); + int cpEnd = cpEndString == null ? cpStart : Integer.parseInt(cpEndString, 16); if (ABBREVIATED && !element.equals("char")) { if (cpEnd > cpStart + 10) { cpEnd = cpStart + 10; @@ -271,14 +283,13 @@ void checkAttributes(String path, int cp, Map attributes) { // fix up names value = value.replace("#", Utility.hex(cp)); // hack for bad names: - //value = value.replace("CJK UNIFIED IDEOGRAPHS-", "CJK UNIFIED IDEOGRAPH-"); + // value = value.replace("CJK UNIFIED IDEOGRAPHS-", "CJK UNIFIED IDEOGRAPH-"); } final UnicodeProperty toolProperty = factory.getProperty(property); final String toolValue = matchEricsValues(cp, property, toolProperty); - if (toolValue == null) - { + if (toolValue == null) { continue; // for now } @@ -291,15 +302,25 @@ void checkAttributes(String path, int cp, Map attributes) { System.out.println(path); okSoFar = false; } - System.out.println("cp=" + Utility.hex(cp) + ", " + property + "=<" + value + "> (" + originalValue + ") != <" + toolValue + ">"); + System.out.println( + "cp=" + + Utility.hex(cp) + + ", " + + property + + "=<" + + value + + "> (" + + originalValue + + ") != <" + + toolValue + + ">"); } } } } - static Set defaultSame = getSet("dm", "bmg", - "cf", "lc", "tc", "uc", - "sfc", "slc", "stc", "suc"); + static Set defaultSame = + getSet("dm", "bmg", "cf", "lc", "tc", "uc", "sfc", "slc", "stc", "suc"); static Set bogusProperties = getSet("dm", "ccf", "jamo", "fcf", "tcf", "isc"); static Set bogusPropertiesSkipped = new HashSet(); @@ -324,8 +345,7 @@ private String matchEricsValues(int cp, String property, UnicodeProperty toolPro return USE_ICU ? 
null : "MISSING"; } String toolValue = toolProperty.getValue(cp, true); - if (toolValue == null) - { + if (toolValue == null) { return USE_ICU ? null : ""; // for ICU, only test a subset } final int type = toolProperty.getType(); @@ -339,12 +359,13 @@ private String matchEricsValues(int cp, String property, UnicodeProperty toolPro // return null; // } - //if (type == UnicodeProperty.STRING) { + // if (type == UnicodeProperty.STRING) { // UCD marks no change with "". I reflect the full value, Eric doesn't. // however, this is tricky, so I'm still playing with it to get them to match up. - //if (UTF16.valueOf(cp).equals(toolValue)) { - //toolValue = "#"; - // } else if (property.equals("lc") || property.equals("uc") || property.equals("tc")) { + // if (UTF16.valueOf(cp).equals(toolValue)) { + // toolValue = "#"; + // } else if (property.equals("lc") || property.equals("uc") || + // property.equals("tc")) { // String basePropertyName = property.equals("tc") ? "uc" : "s" + property; // UnicodeProperty baseProperty = factory.getProperty(basePropertyName); // String baseValue = baseProperty.getValue(cp, true); @@ -353,8 +374,8 @@ private String matchEricsValues(int cp, String property, UnicodeProperty toolPro // } else if (property.equals("tc")){ // // } - //} - //} + // } + // } // differences in format @@ -371,11 +392,11 @@ private String matchEricsValues(int cp, String property, UnicodeProperty toolPro } } - if (//property.endsWith("_QC") - // these are uppercase in in PropertyValueAliases - //|| - property.equals("dt")) - // these are not uppercase in in PropertyValueAliases, so Eric is right + if ( // property.endsWith("_QC") + // these are uppercase in in PropertyValueAliases + // || + property.equals("dt")) + // these are not uppercase in in PropertyValueAliases, so Eric is right { toolValue = toolValue.toLowerCase(Locale.ENGLISH); } @@ -388,15 +409,16 @@ private String matchEricsValues(int cp, String property, UnicodeProperty toolPro } else { toolValue = 
org.unicode.text.utility.Utility.hex(toolValue, " "); } - //if (property.equals("FC_NFKC")) toolValue = toolValue.toLowerCase(); // but formatting problem here, should be uppercase + // if (property.equals("FC_NFKC")) toolValue = toolValue.toLowerCase(); // but + // formatting problem here, should be uppercase } if (property.equals("nv")) { - // I'm using the format in http://unicode.org/Public/UNIDATA/extracted/DerivedNumericValues.txt + // I'm using the format in + // http://unicode.org/Public/UNIDATA/extracted/DerivedNumericValues.txt toolValue = dumbFraction(toolValue); } - return toolValue; } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/WordFrequency.java b/unicodetools/src/main/java/org/unicode/text/tools/WordFrequency.java index e51cb3434..01ec56903 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/WordFrequency.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/WordFrequency.java @@ -1,5 +1,9 @@ package org.unicode.text.tools; +import com.google.common.base.Splitter; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.UnicodeSet; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; @@ -8,20 +12,15 @@ import java.util.Locale; import java.util.Map; import java.util.Set; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.Counter; import org.unicode.text.utility.Settings; import org.unicode.tools.emoji.EmojiAnnotations; -import com.google.common.base.Splitter; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.UnicodeSet; - public class WordFrequency { - static Map data = new HashMap<>(); + static Map data = new HashMap<>(); static long total; + static { Splitter tab = Splitter.on("\t").trimResults(); for (String line : FileUtilities.in(Settings.Output.GEN_DIR + "words/", "freq.txt")) { @@ -35,9 +34,11 @@ public class WordFrequency { } data = Collections.unmodifiableMap(data); } 
+ public static Long getFrequency(String word) { return data.get(word); } + public static void main(String[] args) { if (args.length == 0) { args = new String[] {"animal", "food", "sport"}; @@ -51,8 +52,14 @@ public static void main(String[] args) { for (R2 entry : info.fileData.getEntrySetSortedByCount(false, null)) { final String item = entry.get1(); UnicodeSet us = EmojiAnnotations.ANNOTATIONS_TO_CHARS.getUnicodeSet(item); - System.out.println(arg + "\t" + item + "\t" + nf.format(entry.get0() / info.total) - + "\t" + (us == null || us.size() == 0 ? "" : us.toPattern(false))); + System.out.println( + arg + + "\t" + + item + + "\t" + + nf.format(entry.get0() / info.total) + + "\t" + + (us == null || us.size() == 0 ? "" : us.toPattern(false))); } } } @@ -62,31 +69,115 @@ static class Info { double total = 0; void addFile(String fileName) { - for (String line2 : FileUtilities.in(WordFrequency.class, "words/" + fileName + ".txt")) { + for (String line2 : + FileUtilities.in(WordFrequency.class, "words/" + fileName + ".txt")) { add(line2); } } - static final Set SKIP = new HashSet<>(Arrays.asList("love", - "open", "water", "baby", "print", "heart", "face", - "oh", "nature", "eye", "feet", "lady", "object", "peace", - "mouth", "shell", "smile", "romance", - "kiss", "joy", "flying", "sad", "cry", "nose", "tropical", - "surprised", "tears", "smiling", - "tear", "creature", "spiral", "smiley", "grin", "hump", - "ironic", "weary", "hatching", "flipper", - "grinning", "extraterrestrial", "wry", "spouting", "pouting", - "hot", "travel", "red", "green", "box", "cup", "human", "bar", - "french", "glass", "japanese", "square", "birthday", "sweet", - "ball", "plant", "soft", "delicious", "stick", "celebration", - "shaved", "yum", "um", "fried", "cooked", "slice", "roasted", - "sliced", "swirl", "frying", "steaming", "savouring", - "clinking", - "not", "no", "game", "car", "american", "entertainment", - "person", "weight", "shirt", "vehicle", "ice", "flag", "pool", - "mountain", 
"hole", "eight", "prohibited", "prize", "pole", - "forbidden", "hoop", "sash", "checkered")); - + static final Set SKIP = + new HashSet<>( + Arrays.asList( + "love", + "open", + "water", + "baby", + "print", + "heart", + "face", + "oh", + "nature", + "eye", + "feet", + "lady", + "object", + "peace", + "mouth", + "shell", + "smile", + "romance", + "kiss", + "joy", + "flying", + "sad", + "cry", + "nose", + "tropical", + "surprised", + "tears", + "smiling", + "tear", + "creature", + "spiral", + "smiley", + "grin", + "hump", + "ironic", + "weary", + "hatching", + "flipper", + "grinning", + "extraterrestrial", + "wry", + "spouting", + "pouting", + "hot", + "travel", + "red", + "green", + "box", + "cup", + "human", + "bar", + "french", + "glass", + "japanese", + "square", + "birthday", + "sweet", + "ball", + "plant", + "soft", + "delicious", + "stick", + "celebration", + "shaved", + "yum", + "um", + "fried", + "cooked", + "slice", + "roasted", + "sliced", + "swirl", + "frying", + "steaming", + "savouring", + "clinking", + "not", + "no", + "game", + "car", + "american", + "entertainment", + "person", + "weight", + "shirt", + "vehicle", + "ice", + "flag", + "pool", + "mountain", + "hole", + "eight", + "prohibited", + "prize", + "pole", + "forbidden", + "hoop", + "sash", + "checkered")); + public void addCharAnnotations(String arg) { UnicodeSet chars = EmojiAnnotations.ANNOTATIONS_TO_CHARS.getUnicodeSet(arg); for (String cp : chars) { @@ -105,9 +196,9 @@ public void add(String word) { } Long count = getFrequency(line); if (count != null) { - fileData.add(line,count); + fileData.add(line, count); total += count; - //System.out.println(line + ", " + count); + // System.out.println(line + ", " + count); } } } diff --git a/unicodetools/src/main/java/org/unicode/text/tools/XIDModifications.java b/unicodetools/src/main/java/org/unicode/text/tools/XIDModifications.java index 0b370e605..7dec54db8 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/XIDModifications.java 
+++ b/unicodetools/src/main/java/org/unicode/text/tools/XIDModifications.java @@ -1,18 +1,16 @@ package org.unicode.text.tools; +import com.google.common.collect.ImmutableSet; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.util.VersionInfo; import java.io.File; import java.util.Collections; import java.util.Set; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.text.UCD.IdentifierInfo.Identifier_Status; import org.unicode.text.UCD.IdentifierInfo.Identifier_Type; import org.unicode.text.utility.Settings; -import com.google.common.collect.ImmutableSet; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.util.VersionInfo; - public class XIDModifications { private UnicodeMap identifierStatus = new UnicodeMap<>(); private UnicodeMap> identifierType = new UnicodeMap<>(); @@ -24,7 +22,7 @@ public XIDModifications(String directory) { finalPart = Settings.latestVersion; } VersionInfo version = VersionInfo.getInstance(finalPart); - identifierStatus.putAll(0,0x10FFFF, Identifier_Status.restricted); + identifierStatus.putAll(0, 0x10FFFF, Identifier_Status.restricted); if (version.getMajor() == 9) { // Version 9 IdentifierType.txt: // Any missing values have the value: IdentifierType={Recommended} @@ -32,7 +30,8 @@ public XIDModifications(String directory) { } else { // Version 10+ IdentifierType.txt: // Any missing code points have the IdentifierType value Not_Character - identifierType.putAll(0, 0x10FFFF, Collections.singleton(Identifier_Type.not_characters)); + identifierType.putAll( + 0, 0x10FFFF, Collections.singleton(Identifier_Type.not_characters)); } if (version.getMajor() <= 8) { new MyReader().process(directory, "xidmodifications.txt"); @@ -47,6 +46,7 @@ public XIDModifications(String directory) { public UnicodeMap getStatus() { return identifierStatus; } + public UnicodeMap> getType() { return identifierType; } @@ -62,6 +62,7 @@ protected boolean handleLine(int lineCount, int start, int end, String[] items) return true; } } + 
private class MyReaderStatus extends FileUtilities.SemiFileReader { @Override protected boolean handleLine(int lineCount, int start, int end, String[] items) { @@ -74,7 +75,8 @@ private class MyReader extends FileUtilities.SemiFileReader { @Override protected boolean handleLine(int lineCount, int start, int end, String[] items) { identifierStatus.putAll(start, end, Identifier_Status.fromString(items[1])); - identifierType.putAll(start, end, Collections.singleton(Identifier_Type.fromString(items[2]))); + identifierType.putAll( + start, end, Collections.singleton(Identifier_Type.fromString(items[2]))); return true; } } diff --git a/unicodetools/src/main/java/org/unicode/text/utility/Birelation.java b/unicodetools/src/main/java/org/unicode/text/utility/Birelation.java index c0197c0ab..03d662db6 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/Birelation.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/Birelation.java @@ -1,5 +1,6 @@ package org.unicode.text.utility; +import com.ibm.icu.impl.Relation; import java.util.Comparator; import java.util.HashMap; import java.util.LinkedHashSet; @@ -7,32 +8,32 @@ import java.util.Map.Entry; import java.util.Set; -import com.ibm.icu.impl.Relation; - -public class Birelation { - private final Relation keyToValues; - private final Relation valueToKeys; +public class Birelation { + private final Relation keyToValues; + private final Relation valueToKeys; - public Birelation(Map> map1, - Map> map2, - Class valueSetCreator, - Class keySetCreator, + public Birelation( + Map> map1, + Map> map2, + Class valueSetCreator, + Class keySetCreator, Comparator valueSetComparator, - Comparator keySetComparator - ) { + Comparator keySetComparator) { keyToValues = Relation.of(map1, valueSetCreator, valueSetComparator); valueToKeys = Relation.of(map2, keySetCreator, keySetComparator); } - public static Birelation of( - Map> map1, - Map> map2, - Class setCreator1, - Class setCreator2, + + public static Birelation of( + 
Map> map1, + Map> map2, + Class setCreator1, + Class setCreator2, Comparator setComparator1, - Comparator setComparator2 - ) { - return new Birelation(map1, map2, setCreator1, setCreator2, setComparator1, setComparator2); + Comparator setComparator2) { + return new Birelation( + map1, map2, setCreator1, setCreator2, setComparator1, setComparator2); } + public Birelation add(K key, V value) { keyToValues.put(key, value); valueToKeys.put(value, key); @@ -56,30 +57,39 @@ public Birelation removeKey(K key) { keyToValues.removeAll(key); return this; } + public Set getValues(K key) { return keyToValues.get(key); } + public Set getKeys(V value) { return valueToKeys.get(value); } + public Set>> keyValuesSet() { return keyToValues.keyValuesSet(); } + public Set>> valueKeysSet() { return valueToKeys.keyValuesSet(); } + public void freeze() { keyToValues.freeze(); valueToKeys.freeze(); } + public Set keySet() { return keyToValues.keySet(); } + public Set valuesSet() { return valueToKeys.keySet(); } - public Relation,K> getValuesToKeys() { - Relation,K> result = Relation.of(new HashMap,Set>(), LinkedHashSet.class); + + public Relation, K> getValuesToKeys() { + Relation, K> result = + Relation.of(new HashMap, Set>(), LinkedHashSet.class); for (Entry> entry : keyToValues.keyValuesSet()) { K key = entry.getKey(); Set values = entry.getValue(); @@ -87,4 +97,4 @@ public Relation,K> getValuesToKeys() { } return result; } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/CallArgs.java b/unicodetools/src/main/java/org/unicode/text/utility/CallArgs.java index 76be36b18..3e6ea2e01 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/CallArgs.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/CallArgs.java @@ -2,7 +2,6 @@ import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; - import org.unicode.props.BagFormatter; public class CallArgs { @@ -14,22 +13,21 @@ public static String 
getPrefix(Class c) { if (pos < 0) { return ""; } - return prefix.substring(0,pos+1); + return prefix.substring(0, pos + 1); } public static void call(String[] args, String prefix) throws Exception { for (final String arg2 : args) { String arg = arg2; - if (arg.startsWith("#")) - { + if (arg.startsWith("#")) { break; // comments out rest of line } String[] methodArgs = null; final int par = arg.indexOf('('); if (par >= 0) { - methodArgs = Utility.split(arg.substring(par+1, arg.length()-1),','); - arg = arg.substring(0,par); + methodArgs = Utility.split(arg.substring(par + 1, arg.length() - 1), ','); + arg = arg.substring(0, par); } final int pos = arg.indexOf('.'); Method method = null; @@ -37,8 +35,8 @@ public static void call(String[] args, String prefix) throws Exception { String methodName = ""; if (pos >= 0) { - className = prefix + arg.substring(0,pos); - methodName = arg.substring(pos+1); + className = prefix + arg.substring(0, pos); + methodName = arg.substring(pos + 1); method = tryMethod(className, methodName, methodArgs); } else { method = tryMethod(className, arg, methodArgs); @@ -49,12 +47,14 @@ public static void call(String[] args, String prefix) throws Exception { } } if (method == null) { - throw new IllegalArgumentException("Bad parameter: " + className + ", " + methodName); + throw new IllegalArgumentException( + "Bad parameter: " + className + ", " + methodName); } System.out.println(method.getName() + "\t" + bf.join(methodArgs)); - method.invoke(null,methodArgs); + method.invoke(null, methodArgs); } } + private static Method tryMethod(String className, String methodName, String[] methodArgs) throws IllegalAccessException, InvocationTargetException { try { @@ -66,7 +66,7 @@ private static Method tryMethod(String className, String methodName, String[] me parameterTypes[i] = String.class; } } - return foo.getDeclaredMethod(methodName,parameterTypes); + return foo.getDeclaredMethod(methodName, parameterTypes); } catch (final Exception e) { return 
null; } diff --git a/unicodetools/src/main/java/org/unicode/text/utility/ChainException.java b/unicodetools/src/main/java/org/unicode/text/utility/ChainException.java index e3e6c48a8..641be8cd4 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/ChainException.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/ChainException.java @@ -1,30 +1,27 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/ChainException.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/ChainException.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; - import java.text.MessageFormat; public class ChainException extends RuntimeException { Object[] keyData; String messageFormat; - //Exception chain; + // Exception chain; - public ChainException (String messageFormat, Object[] objects) { + public ChainException(String messageFormat, Object[] objects) { this.messageFormat = messageFormat; keyData = objects == null ? null : (Object[]) objects.clone(); } - public ChainException (String messageFormat, Object[] objects, Exception chainedException) { + public ChainException(String messageFormat, Object[] objects, Exception chainedException) { this.messageFormat = messageFormat; keyData = objects == null ? null : (Object[]) objects.clone(); initCause(chainedException); @@ -49,4 +46,3 @@ public String getMessage() { return main + chainMsg; } } - diff --git a/unicodetools/src/main/java/org/unicode/text/utility/CompactByteArray.java b/unicodetools/src/main/java/org/unicode/text/utility/CompactByteArray.java index 5e0231c97..d59a11b70 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/CompactByteArray.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/CompactByteArray.java @@ -1,14 +1,12 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/CompactByteArray.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/CompactByteArray.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; /* @@ -45,27 +43,24 @@ import java.io.Serializable; /** + * Provides a compact way to store information that is indexed by Unicode values, such as character + * properties, types, keyboard values, etc. only for internal use for now. Made public for + * discussion purposes. * - * Provides a compact way to store information that is indexed by Unicode - * values, such as character properties, types, keyboard values, etc. - * only for internal use for now. Made public for discussion purposes. - * - * @see CompactIntArray - * @see CompactShortArray - * @version %I% %G% - * @author Helena Shih + * @see CompactIntArray + * @see CompactShortArray + * @version %I% %G% + * @author Helena Shih */ public final class CompactByteArray implements Serializable { + public static final int UNICODECOUNT = 65536; - public static final int UNICODECOUNT =65536; - - public CompactByteArray() - { - this((byte)0); + public CompactByteArray() { + this((byte) 0); } - public CompactByteArray(byte defaultValue) - { + + public CompactByteArray(byte defaultValue) { int i; values = new byte[UNICODECOUNT]; indices = new short[INDEXCOUNT]; @@ -73,30 +68,28 @@ public CompactByteArray(byte defaultValue) values[i] = defaultValue; } for (i = 0; i < INDEXCOUNT; ++i) { - indices[i] = (short)(i<= newValues.length+BLOCKCOUNT)) { + if ((index < 0) || (index >= newValues.length + BLOCKCOUNT)) { throw new IllegalArgumentException(); } } indices = indexArray; values = newValues; isCompact = true; - } + } - public void writeArrays(PrintWriter output) - { + public void writeArrays(PrintWriter output) { int i; output.println("package org.unicode.text.unicode;"); output.println("import org.unicode.text.collections.*;"); @@ -134,21 +127,19 @@ public void writeArrays(PrintWriter output) } public byte elementAt(char index) // parameterized on byte - { - return (values[(indices[index >>> BLOCKSHIFT] & 
0xFFFF) + - (index & BLOCKMASK)]); + { + return (values[(indices[index >>> BLOCKSHIFT] & 0xFFFF) + (index & BLOCKMASK)]); } // Set automatically expands the array if it is compacted. // parameterized on value (byte) - public void setElementAt(char index, byte value) - { + public void setElementAt(char index, byte value) { if (isCompact) { expand(); } values[index] = value; } - public void setElementAt(char start, char end, byte value) - { + + public void setElementAt(char start, char end, byte value) { int i; if (isCompact) { expand(); @@ -163,47 +154,43 @@ public void setElementAt(char start, char end, byte value) // If values stored in the array tend to repeat in cycles of, say, 16, // then using that will be faster than cycle = 1, and get almost the // same compression. cycle is hardcoded as BLOCKCOUNT now. - public void compact() - { + public void compact() { if (isCompact == false) { - char[] tempIndex; - int tempIndexCount; - byte[] tempArray; - short iBlock, iIndex; + char[] tempIndex; + int tempIndexCount; + byte[] tempArray; + short iBlock, iIndex; // make temp storage, larger than we need tempIndex = new char[UNICODECOUNT]; // set up first block. tempIndexCount = BLOCKCOUNT; for (iIndex = 0; iIndex < BLOCKCOUNT; ++iIndex) { - tempIndex[iIndex] = (char)iIndex; - }; // endfor (iIndex = 0; .....) - indices[0] = (short)0; + tempIndex[iIndex] = (char) iIndex; + } + ; // endfor (iIndex = 0; .....) 
+ indices[0] = (short) 0; // for each successive block, find out its first position // in the compacted array for (iBlock = 1; iBlock < INDEXCOUNT; ++iBlock) { - int newCount, firstPosition, block; - block = iBlock< DEBUGSMALLLIMIT) { break; } } - firstPosition = FindOverlappingPosition( block, tempIndex, - tempIndexCount ); + firstPosition = FindOverlappingPosition(block, tempIndex, tempIndexCount); newCount = firstPosition + BLOCKCOUNT; if (newCount > tempIndexCount) { - for (iIndex = (short)tempIndexCount; - iIndex < newCount; - ++iIndex) { - tempIndex[iIndex] = (char) - (iIndex - firstPosition + block); + for (iIndex = (short) tempIndexCount; iIndex < newCount; ++iIndex) { + tempIndex[iIndex] = (char) (iIndex - firstPosition + block); } // endfor (iIndex = tempIndexCount....) tempIndexCount = newCount; } // endif (newCount > tempIndexCount) - indices[iBlock] = (short)firstPosition; + indices[iBlock] = (short) firstPosition; } // endfor (iBlock = 1.....) // now allocate and copy the items into the array @@ -217,17 +204,16 @@ public void compact() } // endif (isCompact != false) } // Expanded takes the array back to a 65536 element array - public void expand() - { + public void expand() { int i; if (isCompact) { - byte[] tempArray; + byte[] tempArray; tempArray = new byte[UNICODECOUNT]; for (i = 0; i < UNICODECOUNT; ++i) { - tempArray[i] = elementAt((char)i); + tempArray[i] = elementAt((char) i); } for (i = 0; i < INDEXCOUNT; ++i) { - indices[i] = (short)(i< : " + - ((indices[i] >= 0) ? - indices[i] : - indices[i] + UNICODECOUNT)); + for (i = start; i < count; ++i) { + System.out.println( + i + " -> : " + ((indices[i] >= 0) ? 
indices[i] : indices[i] + UNICODECOUNT)); } System.out.println(); } - public void printPlainArray(int start,int count, char[] tempIndex) - { + + public void printPlainArray(int start, int count, char[] tempIndex) { int iIndex; - if (tempIndex != null) - { - for (iIndex = start; iIndex < start + count; ++iIndex) - { - System.out.print(" " + (int)values[tempIndex[iIndex]]); + if (tempIndex != null) { + for (iIndex = start; iIndex < start + count; ++iIndex) { + System.out.print(" " + (int) values[tempIndex[iIndex]]); } - } - else - { - for (iIndex = start; iIndex < start + count; ++iIndex) - { - System.out.print(" " + (int)values[iIndex]); + } else { + for (iIndex = start; iIndex < start + count; ++iIndex) { + System.out.print(" " + (int) values[iIndex]); } } System.out.println(" Range: start " + start + " , count " + count); } // # of elements in the indexed array - public short capacity() - { - return (short)values.length; + public short capacity() { + return (short) values.length; } - public int storage() - { + public int storage() { return values.length * 1 + indices.length * 2 + 12; } - private byte[] getArray() - { + private byte[] getArray() { return values; } - private int - FindOverlappingPosition(int start, char[] tempIndex, int tempIndexCount) - { + + private int FindOverlappingPosition(int start, char[] tempIndex, int tempIndexCount) { int i; short j; short currentCount; @@ -293,9 +266,9 @@ private byte[] getArray() printPlainArray(0, tempIndexCount, tempIndex); } for (i = 0; i < tempIndexCount; i += BLOCKCOUNT) { - currentCount = (short)BLOCKCOUNT; + currentCount = (short) BLOCKCOUNT; if (i + BLOCKCOUNT > tempIndexCount) { - currentCount = (short)(tempIndexCount - i); + currentCount = (short) (tempIndexCount - i); } for (j = 0; j < currentCount; ++j) { if (values[start + j] != values[tempIndex[i + j]]) { @@ -315,18 +288,20 @@ private byte[] getArray() } return i; } - private static final int DEBUGSHOWOVERLAPLIMIT = 100; - private static final boolean 
DEBUGTRACE = false; - private static final boolean DEBUGSMALL = false; - private static final boolean DEBUGOVERLAP = false; - private static final int DEBUGSMALLLIMIT = 30000; - private static final int BLOCKSHIFT =6; - private static final int BLOCKCOUNT =(1< short (char parameterized short) + private static final int DEBUGSHOWOVERLAPLIMIT = 100; + private static final boolean DEBUGTRACE = false; + private static final boolean DEBUGSMALL = false; + private static final boolean DEBUGOVERLAP = false; + private static final int DEBUGSMALLLIMIT = 30000; + private static final int BLOCKSHIFT = 6; + private static final int BLOCKCOUNT = (1 << BLOCKSHIFT); + private static final int INDEXSHIFT = (16 - BLOCKSHIFT); + private static final int INDEXCOUNT = (1 << INDEXSHIFT); + private static final int BLOCKMASK = BLOCKCOUNT - 1; + + private byte[] values; // char -> short (char parameterized short) private final short indices[]; private boolean isCompact; -}; +} +; diff --git a/unicodetools/src/main/java/org/unicode/text/utility/CompactShortArray.java b/unicodetools/src/main/java/org/unicode/text/utility/CompactShortArray.java index fc09a05e7..5da763be3 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/CompactShortArray.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/CompactShortArray.java @@ -1,17 +1,14 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. 
* + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/CompactShortArray.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/CompactShortArray.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; - /* * %W% %E% * @@ -42,55 +39,47 @@ * */ -import java.io.Serializable; - import com.ibm.icu.util.CompactCharArray; +import java.io.Serializable; /** - * class CompactATypeArray : use only on primitive data types - * Provides a compact way to store information that is indexed by Unicode - * values, such as character properties, types, keyboard values, etc.This - * is very useful when you have a block of Unicode data that contains - * significant values while the rest of the Unicode data is unused in the - * application or when you have a lot of redundance, such as where all 21,000 - * Han ideographs have the same value. However, lookup is much faster than a - * hash table. - * A compact array of any primitive data type serves two purposes: + * class CompactATypeArray : use only on primitive data types Provides a compact way to store + * information that is indexed by Unicode values, such as character properties, types, keyboard + * values, etc.This is very useful when you have a block of Unicode data that contains significant + * values while the rest of the Unicode data is unused in the application or when you have a lot of + * redundance, such as where all 21,000 Han ideographs have the same value. However, lookup is much + * faster than a hash table. A compact array of any primitive data type serves two purposes: + * *

    - *
  • Fast access of the indexed values. - *
  • Smaller memory footprint. + *
  • Fast access of the indexed values. + *
  • Smaller memory footprint. *
- * A compact array is composed of a index array and value array. The index - * array contains the indicies of Unicode characters to the value array. - * @see CompactByteArray - * @see CompactIntArray - * @see CompactCharArray - * @see CompactStringArray - * @version %I% %G% - * @author Helena Shih + * + * A compact array is composed of a index array and value array. The index array contains the + * indicies of Unicode characters to the value array. + * + * @see CompactByteArray + * @see CompactIntArray + * @see CompactCharArray + * @see CompactStringArray + * @version %I% %G% + * @author Helena Shih */ public final class CompactShortArray implements Serializable { + /** The total number of Unicode characters. */ + public static final int UNICODECOUNT = 65536; - /** - * The total number of Unicode characters. - */ - public static final int UNICODECOUNT =65536; - - /** - * Default constructor for CompactShortArray, the default value of the - * compact array is 0. - */ - public CompactShortArray() - { - this((short)0); + /** Default constructor for CompactShortArray, the default value of the compact array is 0. */ + public CompactShortArray() { + this((short) 0); } /** * Constructor for CompactShortArray. + * * @param defaultValue the default value of the compact array. */ - public CompactShortArray(short defaultValue) - { + public CompactShortArray(short defaultValue) { int i; values = new short[UNICODECOUNT]; indices = new short[INDEXCOUNT]; @@ -98,50 +87,50 @@ public CompactShortArray(short defaultValue) values[i] = defaultValue; } for (i = 0; i < INDEXCOUNT; ++i) { - indices[i] = (short)(i<= newValues.length+BLOCKCOUNT)) { + if ((index < 0) || (index >= newValues.length + BLOCKCOUNT)) { throw new IllegalArgumentException("Index out of bounds."); } } indices = indexArray; values = newValues; - } + } /** * Get the mapped value of a Unicode character. 
+ * * @param index the character to get the mapped value with * @return the mapped value of the given character */ public short elementAt(char index) // parameterized on short - { - return (values[(indices[index >> BLOCKSHIFT] & 0xFFFF) - + (index & BLOCKMASK)]); + { + return (values[(indices[index >> BLOCKSHIFT] & 0xFFFF) + (index & BLOCKMASK)]); } /** - * Set a new value for a Unicode character. - * Set automatically expands the array if it is compacted. + * Set a new value for a Unicode character. Set automatically expands the array if it is + * compacted. + * * @param index the character to set the mapped value with * @param value the new mapped value */ - public void setElementAt(char index, short value) - { + public void setElementAt(char index, short value) { if (isCompact) { expand(); } @@ -149,12 +138,12 @@ public void setElementAt(char index, short value) } /** * Set new values for a range of Unicode character. + * * @param start the starting offset of the range * @param end the ending offset of the range * @param value the new mapped value */ - public void setElementAt(char start, char end, short value) - { + public void setElementAt(char start, char end, short value) { int i; if (isCompact) { expand(); @@ -163,50 +152,44 @@ public void setElementAt(char start, char end, short value) values[i] = value; } } - /** - *Compact the array. - */ - public void compact() - { + /** Compact the array. */ + public void compact() { if (isCompact == false) { - char[] tempIndex; - int tempIndexCount; - short[] tempArray; - short iBlock, iIndex; + char[] tempIndex; + int tempIndexCount; + short[] tempArray; + short iBlock, iIndex; // make temp storage, larger than we need tempIndex = new char[UNICODECOUNT]; // set up first block. tempIndexCount = BLOCKCOUNT; for (iIndex = 0; iIndex < BLOCKCOUNT; ++iIndex) { - tempIndex[iIndex] = (char)iIndex; - }; // endfor (iIndex = 0; .....) 
- indices[0] = (short)0; + tempIndex[iIndex] = (char) iIndex; + } + ; // endfor (iIndex = 0; .....) + indices[0] = (short) 0; // for each successive block, find out its first position // in the compacted array for (iBlock = 1; iBlock < INDEXCOUNT; ++iBlock) { - int newCount, firstPosition, block; - block = iBlock< DEBUGSMALLLIMIT) { break; } } - firstPosition = FindOverlappingPosition(block, tempIndex, - tempIndexCount); + firstPosition = FindOverlappingPosition(block, tempIndex, tempIndexCount); newCount = firstPosition + BLOCKCOUNT; if (newCount > tempIndexCount) { - for (iIndex = (short)tempIndexCount; - iIndex < newCount; - ++iIndex) { - tempIndex[iIndex] - = (char)(iIndex - firstPosition + block); + for (iIndex = (short) tempIndexCount; iIndex < newCount; ++iIndex) { + tempIndex[iIndex] = (char) (iIndex - firstPosition + block); } // endfor (iIndex = tempIndexCount....) tempIndexCount = newCount; } // endif (newCount > tempIndexCount) - indices[iBlock] = (short)firstPosition; + indices[iBlock] = (short) firstPosition; } // endfor (iBlock = 1.....) // now allocate and copy the items into the array @@ -219,85 +202,76 @@ public void compact() isCompact = true; } // endif (isCompact != false) } - /** For internal use only. Do not modify the result, the behavior of - * modified results are undefined. + /** + * For internal use only. Do not modify the result, the behavior of modified results are + * undefined. */ - public short getIndexArray()[] - { + public short getIndexArray()[] { return indices; - } - /** For internal use only. Do not modify the result, the behavior of - * modified results are undefined. + } + /** + * For internal use only. Do not modify the result, the behavior of modified results are + * undefined. 
*/ - public short getStringArray()[] - { + public short getStringArray()[] { return values; - } + } // -------------------------------------------------------------- // package private // -------------------------------------------------------------- - void writeArrays() - { + void writeArrays() { int i; - final int cnt = ((values.length > 0) ? values.length : - (values.length + UNICODECOUNT)); + final int cnt = ((values.length > 0) ? values.length : (values.length + UNICODECOUNT)); System.out.println("{"); - for (i = 0; i < INDEXCOUNT-1; i++) - { - System.out.print("(short)" + ((getIndexArrayValue(i) >= 0) ? - (int)getIndexArrayValue(i) : - (int)(getIndexArrayValue(i)+UNICODECOUNT)) + ", "); + for (i = 0; i < INDEXCOUNT - 1; i++) { + System.out.print( + "(short)" + + ((getIndexArrayValue(i) >= 0) + ? (int) getIndexArrayValue(i) + : (int) (getIndexArrayValue(i) + UNICODECOUNT)) + + ", "); if (i != 0) { if (i % 10 == 0) { System.out.println(); } } } - System.out.println("(short)" + - ((getIndexArrayValue(INDEXCOUNT-1) >= 0) ? - (int)getIndexArrayValue(i) : - (int)(getIndexArrayValue(i)+UNICODECOUNT)) + - " }"); + System.out.println( + "(short)" + + ((getIndexArrayValue(INDEXCOUNT - 1) >= 0) + ? (int) getIndexArrayValue(i) + : (int) (getIndexArrayValue(i) + UNICODECOUNT)) + + " }"); System.out.println("{"); - for (i = 0; i < cnt-1; i++) - { - System.out.print("(short)" + (int)getArrayValue(i) + ", "); + for (i = 0; i < cnt - 1; i++) { + System.out.print("(short)" + (int) getArrayValue(i) + ", "); if (i != 0) { if (i % 10 == 0) { System.out.println(); } } } - System.out.println("(short)" + (int)getArrayValue(cnt-1) + " }"); + System.out.println("(short)" + (int) getArrayValue(cnt - 1) + " }"); } // Print char Array : Debug only - void printIndex(short start, short count) - { + void printIndex(short start, short count) { int i; - for (i = start; i < count; ++i) - { - System.out.println(i + " -> : " + - ((indices[i] >= 0) ? 
- indices[i] : - indices[i] + UNICODECOUNT)); + for (i = start; i < count; ++i) { + System.out.println( + i + " -> : " + ((indices[i] >= 0) ? indices[i] : indices[i] + UNICODECOUNT)); } System.out.println(); } - void printPlainArray(int start,int count, char[] tempIndex) - { + + void printPlainArray(int start, int count, char[] tempIndex) { int iIndex; - if (tempIndex != null) - { - for (iIndex = start; iIndex < start + count; ++iIndex) - { - System.out.print(" " + (int)getArrayValue(tempIndex[iIndex])); + if (tempIndex != null) { + for (iIndex = start; iIndex < start + count; ++iIndex) { + System.out.print(" " + (int) getArrayValue(tempIndex[iIndex])); } - } - else - { - for (iIndex = start; iIndex < start + count; ++iIndex) - { - System.out.print(" " + (int)getArrayValue(iIndex)); + } else { + for (iIndex = start; iIndex < start + count; ++iIndex) { + System.out.print(" " + (int) getArrayValue(iIndex)); } } System.out.println(" Range: start " + start + " , count " + count); @@ -305,20 +279,17 @@ void printPlainArray(int start,int count, char[] tempIndex) // -------------------------------------------------------------- // private // -------------------------------------------------------------- - /** - * Expanding takes the array back to a 65536 element array. - */ - private void expand() - { + /** Expanding takes the array back to a 65536 element array. 
*/ + private void expand() { int i; if (isCompact) { short[] tempArray; tempArray = new short[UNICODECOUNT]; for (i = 0; i < UNICODECOUNT; ++i) { - tempArray[i] = elementAt((char)i); + tempArray[i] = elementAt((char) i); } for (i = 0; i < INDEXCOUNT; ++i) { - indices[i] = (short)(i< tempIndexCount) { - currentCount = (short)(tempIndexCount - i); + currentCount = (short) (tempIndexCount - i); } for (j = 0; j < currentCount; ++j) { if (values[start + j] != values[tempIndex[i + j]]) { @@ -378,18 +346,19 @@ private short getIndexArrayValue(int n) return i; } - private static final int DEBUGSHOWOVERLAPLIMIT = 100; - private static final boolean DEBUGTRACE = false; - private static final boolean DEBUGSMALL = false; - private static final boolean DEBUGOVERLAP = false; - private static final int DEBUGSMALLLIMIT = 30000; - private static final int BLOCKSHIFT =7; - private static final int BLOCKCOUNT =(1< short (char parameterized short) + private short values[]; // char -> short (char parameterized short) private final short indices[]; private boolean isCompact; -}; +} +; diff --git a/unicodetools/src/main/java/org/unicode/text/utility/ComparisonNormalizer.java b/unicodetools/src/main/java/org/unicode/text/utility/ComparisonNormalizer.java index bfe0d6b48..51a9ec393 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/ComparisonNormalizer.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/ComparisonNormalizer.java @@ -1,14 +1,5 @@ package org.unicode.text.utility; -import java.util.Collection; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Map.Entry; -import java.util.Set; -import java.util.TreeSet; - -import org.unicode.cldr.util.MultiComparator; - import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.impl.Relation; import com.ibm.icu.lang.UCharacter; @@ -21,14 +12,22 @@ import com.ibm.icu.text.UTF16.StringComparator; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.ULocale; +import java.util.Collection; 
+import java.util.Comparator; +import java.util.HashMap; +import java.util.Map.Entry; +import java.util.Set; +import java.util.TreeSet; +import org.unicode.cldr.util.MultiComparator; public class ComparisonNormalizer { public static final Normalizer2 nfkc_cf = Normalizer2.getNFKCCasefoldInstance(); private UnicodeMap map = new UnicodeMap<>(); - public ComparisonNormalizer(Transform transform, Comparator bestFirst) { + public ComparisonNormalizer(Transform transform, Comparator bestFirst) { UnicodeSet chars = new UnicodeSet("[^[:Cn:][:Co:][:Cs:]]"); - Relation mapping = Relation.of(new HashMap>(), TreeSet.class, bestFirst); + Relation mapping = + Relation.of(new HashMap>(), TreeSet.class, bestFirst); for (String cp : chars) { mapping.put(transform.transform(cp), cp); String cpN = nfkc_cf.normalize(cp); @@ -63,6 +62,7 @@ public ComparisonNormalizer(Transform transform, Comparator { private final RuleBasedCollator uca_raw; + public RawCollationKeyTransform(RuleBasedCollator uca_raw) { this.uca_raw = uca_raw; } + @Override public RawCollationKey transform(String source) { RawCollationKey result = uca_raw.getRawCollationKey(source, new RawCollationKey()); if (result.size != result.bytes.length) { int size = result.size; byte[] bytes = result.releaseBytes(); - result.append(bytes, 0, size); // make the capacity = size, to work around hashCode bug. + result.append( + bytes, 0, size); // make the capacity = size, to work around hashCode bug. 
} return result; } - } - public static final Comparator CODEPOINT = new StringComparator(true, false, StringComparator.FOLD_CASE_DEFAULT); - public static final RuleBasedCollator UCA_SECONDARY_DECOMPOSING = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); + public static final Comparator CODEPOINT = + new StringComparator(true, false, StringComparator.FOLD_CASE_DEFAULT); + public static final RuleBasedCollator UCA_SECONDARY_DECOMPOSING = + (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); + static { UCA_SECONDARY_DECOMPOSING.setStrength(Collator.SECONDARY); UCA_SECONDARY_DECOMPOSING.setDecomposition(Collator.CANONICAL_DECOMPOSITION); UCA_SECONDARY_DECOMPOSING.freeze(); } - - public static ComparisonNormalizer getSimple() { - Comparator combiningLast = new Comparator() { - @Override - public int compare(String o1, String o2) { - boolean c1 = !o1.isEmpty() && isCombining(o1.codePointAt(0)); - boolean c2 = !o2.isEmpty() && isCombining(o2.codePointAt(0)); - return c1 == c2 ? 0 - : c1 ? 1 : -1; // is NOT combining is better - } - - private boolean isCombining(int cp) { - int t1 = UCharacter.getType(cp); - return t1 == UCharacter.NON_SPACING_MARK || t1 == UCharacter.ENCLOSING_MARK || t1 == UCharacter.COMBINING_SPACING_MARK; - } - }; - - Comparator isNFKC_CF = new Comparator() { - @Override - public int compare(String o1, String o2) { - boolean c1 = nfkc_cf.isNormalized(o1); - boolean c2 = nfkc_cf.isNormalized(o2); - return c1 == c2 ? 0 - : c1 ? -1 : 1; // isNormalized is better - } - }; - - - Comparator secondarySet = new Comparator() { - @Override - public int compare(String o1, String o2) { - boolean c1 = !o1.isEmpty() && secondaryData.containsAll(o1); - boolean c2 = !o1.isEmpty() && secondaryData.containsAll(o2); - return c1 == c2 ? 0 - : c1 ? 
1 : -1; // NOT secondary is better - } - }; - + Comparator combiningLast = + new Comparator() { + @Override + public int compare(String o1, String o2) { + boolean c1 = !o1.isEmpty() && isCombining(o1.codePointAt(0)); + boolean c2 = !o2.isEmpty() && isCombining(o2.codePointAt(0)); + return c1 == c2 ? 0 : c1 ? 1 : -1; // is NOT combining is better + } - Comparator firstIsBetterTarget = new MultiComparator( - isNFKC_CF, - combiningLast, - secondarySet, - (Comparator)(Comparator)UCA_SECONDARY_DECOMPOSING, - CODEPOINT); - Transform transform = new RawCollationKeyTransform(UCA_SECONDARY_DECOMPOSING); + private boolean isCombining(int cp) { + int t1 = UCharacter.getType(cp); + return t1 == UCharacter.NON_SPACING_MARK + || t1 == UCharacter.ENCLOSING_MARK + || t1 == UCharacter.COMBINING_SPACING_MARK; + } + }; + + Comparator isNFKC_CF = + new Comparator() { + @Override + public int compare(String o1, String o2) { + boolean c1 = nfkc_cf.isNormalized(o1); + boolean c2 = nfkc_cf.isNormalized(o2); + return c1 == c2 ? 0 : c1 ? -1 : 1; // isNormalized is better + } + }; + + Comparator secondarySet = + new Comparator() { + @Override + public int compare(String o1, String o2) { + boolean c1 = !o1.isEmpty() && secondaryData.containsAll(o1); + boolean c2 = !o1.isEmpty() && secondaryData.containsAll(o2); + return c1 == c2 ? 0 : c1 ? 
1 : -1; // NOT secondary is better + } + }; + + Comparator firstIsBetterTarget = + new MultiComparator( + isNFKC_CF, + combiningLast, + secondarySet, + (Comparator) (Comparator) UCA_SECONDARY_DECOMPOSING, + CODEPOINT); + Transform transform = + new RawCollationKeyTransform(UCA_SECONDARY_DECOMPOSING); ComparisonNormalizer norm = new ComparisonNormalizer(transform, firstIsBetterTarget); return norm; } - // private static void checkProblem(final RuleBasedCollator uca_raw, Comparator ucaFull) { + // private static void checkProblem(final RuleBasedCollator uca_raw, Comparator + // ucaFull) { // RawCollationKey a = uca_raw.getRawCollationKey("०", new RawCollationKey()); // RawCollationKey b = uca_raw.getRawCollationKey("𐒠", new RawCollationKey()); // boolean eq = a.equals(b); // int comp = a.compareTo(b); - // Relation mapping = Relation.of(new HashMap>(), TreeSet.class, ucaFull); + // Relation mapping = Relation.of(new + // HashMap>(), TreeSet.class, ucaFull); // mapping.put(a, "०"); // mapping.put(b, "𐒠"); // System.out.println(a.hashCode()); @@ -204,18 +211,21 @@ public UnicodeSet getSet(String value) { public static void main(String[] args) { ComparisonNormalizer norm = getSimple(); - Comparator ucaFull = new MultiComparator((Comparator)(Comparator)UCA_SECONDARY_DECOMPOSING, CODEPOINT); + Comparator ucaFull = + new MultiComparator( + (Comparator) (Comparator) UCA_SECONDARY_DECOMPOSING, CODEPOINT); TreeSet sorted = new TreeSet<>(ucaFull); sorted.addAll(norm.values()); -// String z1 = norm.map.get('०'); -// String z2 = norm.map.get("𐒠"); + // String z1 = norm.map.get('०'); + // String z2 = norm.map.get("𐒠"); StringBuilder key = new StringBuilder(); for (String target : sorted) { UnicodeSet sources = norm.getSet(target); - CollationElementIterator it = UCA_SECONDARY_DECOMPOSING.getCollationElementIterator(target); + CollationElementIterator it = + UCA_SECONDARY_DECOMPOSING.getCollationElementIterator(target); key.setLength(0); for (int ce = it.next(); ce != 
CollationElementIterator.NULLORDER; ce = it.next()) { if (key.length() != 0) { @@ -223,22 +233,31 @@ public static void main(String[] args) { } int primary = CollationElementIterator.primaryOrder(ce); int secondary = CollationElementIterator.secondaryOrder(ce); - key.append(Utility.hex(primary) + ":" + Utility.hex(secondary,2)); + key.append(Utility.hex(primary) + ":" + Utility.hex(secondary, 2)); } System.out.println( - "\n#" + key - + "\t" + Utility.hex(target) - + ";\t# ( " + target + " )" - + "\t" + (target.isEmpty() ? "" : UCharacter.getExtendedName(target.codePointAt(0))) - ); + "\n#" + + key + + "\t" + + Utility.hex(target) + + ";\t# ( " + + target + + " )" + + "\t" + + (target.isEmpty() + ? "" + : UCharacter.getExtendedName(target.codePointAt(0)))); for (String source : sources) { System.out.println( - Utility.hex(source) - + ";\t" + Utility.hex(target) - + ";\t# ( " + source + " )" - + "\t" + UCharacter.getExtendedName(source.codePointAt(0)) - ); + Utility.hex(source) + + ";\t" + + Utility.hex(target) + + ";\t# ( " + + source + + " )" + + "\t" + + UCharacter.getExtendedName(source.codePointAt(0))); } } } diff --git a/unicodetools/src/main/java/org/unicode/text/utility/DifferTest.java b/unicodetools/src/main/java/org/unicode/text/utility/DifferTest.java index babb38282..6c8ee30b1 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/DifferTest.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/DifferTest.java @@ -1,19 +1,16 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. 
* + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/DifferTest.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/DifferTest.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; import org.unicode.cldr.util.Differ; - public class DifferTest { public static final String copyright = "Copyright (C) 2000, IBM Corp. and others. All Rights Reserved."; @@ -22,7 +19,7 @@ static final void main(String[] args) { // for testing final String[] as = {"a", "b", "20D4", "0344", "20D5", "20D6", "20D7", "20D8", "20D9"}; final String[] bs = {"a", "b", "20D4", "20D5", "0344", "20D6", "20D7", "20D8", "20D9"}; - final Differ differ = new Differ(100,30); + final Differ differ = new Differ(100, 30); int max = as.length; if (max < bs.length) { max = bs.length; @@ -39,7 +36,16 @@ static final void main(String[] args) { // for testing final int aCount = differ.getACount(); final int bCount = differ.getBCount(); if (aCount != 0 || bCount != 0) { - System.out.println("a: " + differ.getALine(-1) + " " + differ.getA(-1) + "\t" + "b: " + differ.getBLine(-1) + " " + differ.getB(-1)); + System.out.println( + "a: " + + differ.getALine(-1) + + " " + + differ.getA(-1) + + "\t" + + "b: " + + differ.getBLine(-1) + + " " + + differ.getB(-1)); if (aCount != 0) { for (int i = 0; i < aCount; ++i) { @@ -54,10 +60,19 @@ static final void main(String[] args) { // for testing System.out.println("b: " + differ.getBLine(i) + " " + differ.getB(i)); } } - System.out.println("a: " + differ.getALine(aCount) + " " + differ.getA(aCount) + "\t" + "b: " + differ.getBLine(bCount) + " " + differ.getB(bCount)); + System.out.println( + "a: " + + differ.getALine(aCount) + + " " + + differ.getA(aCount) + + "\t" + + "b: " + + differ.getBLine(bCount) + + " " + + differ.getB(bCount)); } System.out.println("----"); - //differ.flush(); + // differ.flush(); } } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/DirectoryIterator.java b/unicodetools/src/main/java/org/unicode/text/utility/DirectoryIterator.java index 3a64aca07..fd7548275 100644 
--- a/unicodetools/src/main/java/org/unicode/text/utility/DirectoryIterator.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/DirectoryIterator.java @@ -10,7 +10,6 @@ import java.util.Set; import java.util.TreeSet; - public class DirectoryIterator { private File baseDirectory; @@ -18,13 +17,13 @@ public class DirectoryIterator { private DirectoryIterator subdirectory = null; private FileFilter filter = null; - static private Comparator reverseComparator = + private static Comparator reverseComparator = new Comparator() { - @Override - public int compare(Object a, Object b) { - return ((Comparable) b).compareTo(a); - } - }; + @Override + public int compare(Object a, Object b) { + return ((Comparable) b).compareTo(a); + } + }; public DirectoryIterator(File directory, FileFilter filter) { setDirectory(directory); @@ -70,8 +69,7 @@ public FileFilter getFilter() { return filter; } - /** Returns null when done - */ + /** Returns null when done */ public File next() { File file = null; while (true) { @@ -85,7 +83,7 @@ public File next() { if (!fileList.hasNext()) { return null; } - final String filestr = (String)fileList.next(); + final String filestr = (String) fileList.next(); file = new File(baseDirectory, filestr); if (file.isDirectory()) { subdirectory = new DirectoryIterator(file, filter); @@ -100,10 +98,8 @@ public File next() { } } - /** - * Returns the part before any '.' or '-' in the file name, without directory - */ - static public String getRoot(File f) { + /** Returns the part before any '.' 
or '-' in the file name, without directory */ + public static String getRoot(File f) { final String s = f.getName(); int dotPos = s.indexOf('.'); if (dotPos < 0) { @@ -116,35 +112,41 @@ static public String getRoot(File f) { if (dotPos < dashPos) { dashPos = dotPos; } - return s.substring(0,dashPos); + return s.substring(0, dashPos); } public static class RootFileFilter implements FileFilter { String root; + public RootFileFilter(String root) { setRoot(root); } + public void setRoot(String root) { this.root = root; } + public String getRoot() { return root; } + @Override public boolean accept(File f) { return DirectoryIterator.getRoot(f).equals("DerivedCoreProperties"); } + public String getDescription() { return "Root is '" + root + "'"; } + @Override public String toString() { return getDescription(); } - }; - + } + ; - static public void test() { + public static void test() { final File testDir = new File(Settings.UnicodeTools.UCD_DIR); DirectoryIterator di; @@ -172,9 +174,10 @@ static public void test() { } } - static public boolean isAlmostIdentical(File file1, File file2, boolean show) throws IOException { - final BufferedReader br1 = new BufferedReader(new FileReader(file1), 32*1024); - final BufferedReader br2 = new BufferedReader(new FileReader(file2), 32*1024); + public static boolean isAlmostIdentical(File file1, File file2, boolean show) + throws IOException { + final BufferedReader br1 = new BufferedReader(new FileReader(file1), 32 * 1024); + final BufferedReader br2 = new BufferedReader(new FileReader(file2), 32 * 1024); try { for (int lineCount = 0; ; ++lineCount) { final String line1 = br1.readLine(); @@ -215,5 +218,4 @@ static public boolean isAlmostIdentical(File file1, File file2, boolean show) th br2.close(); } } - } diff --git a/unicodetools/src/main/java/org/unicode/text/utility/DualWriter.java b/unicodetools/src/main/java/org/unicode/text/utility/DualWriter.java index 5eaa520dd..7d37f8d9b 100644 --- 
a/unicodetools/src/main/java/org/unicode/text/utility/DualWriter.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/DualWriter.java @@ -1,35 +1,33 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/DualWriter.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/DualWriter.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; import java.io.IOException; import java.io.Writer; -final public class DualWriter extends Writer { +public final class DualWriter extends Writer { private static final String copyright = "(C) Copyright IBM Corp. 1998 - All Rights Reserved"; // Abstract class for writing to character streams. // The only methods that a subclass must implement are // write(char[], int, int), flush(), and close(). - private boolean autoflush ; + private boolean autoflush; private final Writer a; private final Writer b; - public DualWriter (Writer a, Writer b) { + public DualWriter(Writer a, Writer b) { this.a = a; this.b = b; } - public DualWriter (Writer a, Writer b, boolean autoFlush) { + public DualWriter(Writer a, Writer b, boolean autoFlush) { this.a = a; this.b = b; autoflush = autoFlush; @@ -44,9 +42,7 @@ public boolean getAutoFlush() { } @Override - public void write(char cbuf[], - int off, - int len) throws IOException { + public void write(char cbuf[], int off, int len) throws IOException { a.write(cbuf, off, len); b.write(cbuf, off, len); if (autoflush) { diff --git a/unicodetools/src/main/java/org/unicode/text/utility/EnumBase.java b/unicodetools/src/main/java/org/unicode/text/utility/EnumBase.java index e6acb52d2..dedee4a5e 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/EnumBase.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/EnumBase.java @@ -1,38 +1,34 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. 
* + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/EnumBase.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/EnumBase.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; import java.util.ArrayList; import java.util.List; /** - Used for generating fake enums. These can be compared with ==, - used in for statements, etc. - Subclasses will be of the form: -

-    static public class MyEnum extends EnumBase {
-        public static MyEnum
-            ZEROED = (MyEnum) makeNext(new MyEnum(), "ZEROED"),
-            SHIFTED = (MyEnum) makeNext(new MyEnum(), "SHIFTED"),
-            NON_IGNORABLE = (MyEnum) makeNext(new MyEnum(), "NON_IGNORABLE");
-        public MyEnum next() { return (MyEnum) internalNext(); }
-    }
-    
+ * Used for generating fake enums. These can be compared with ==, used in for statements, etc. + * Subclasses will be of the form: + * + *
+ * static public class MyEnum extends EnumBase {
+ * public static MyEnum
+ * ZEROED = (MyEnum) makeNext(new MyEnum(), "ZEROED"),
+ * SHIFTED = (MyEnum) makeNext(new MyEnum(), "SHIFTED"),
+ * NON_IGNORABLE = (MyEnum) makeNext(new MyEnum(), "NON_IGNORABLE");
+ * public MyEnum next() { return (MyEnum) internalNext(); }
+ * }
+ * 
*/ - public class EnumBase implements Comparable { - /** For use in collections - */ + /** For use in collections */ @Override public int compareTo(Object other) { final EnumBase that = (EnumBase) other; @@ -48,7 +44,7 @@ public int hashCode() { @Override public String toString() { - return (String)uniqueNames.get(value); + return (String) uniqueNames.get(value); } ////////////////// @@ -57,8 +53,7 @@ public String toString() { private static List uniqueList = new ArrayList(); private static List uniqueNames = new ArrayList(); - /** For use in for(..) statements - */ + /** For use in for(..) statements */ public Object internalNext() { final int temp = value + 1; if (temp >= uniqueList.size()) { @@ -71,10 +66,8 @@ public Object internalNext() { return result; } - /** - * For constructing the enums the first time - */ - static protected EnumBase makeNext(EnumBase result, String name) { + /** For constructing the enums the first time */ + protected static EnumBase makeNext(EnumBase result, String name) { try { result.value = uniqueList.size(); uniqueList.add(result); @@ -94,4 +87,3 @@ protected final int getValue() { protected EnumBase() {} } - diff --git a/unicodetools/src/main/java/org/unicode/text/utility/FastBinarySearch.java b/unicodetools/src/main/java/org/unicode/text/utility/FastBinarySearch.java index fe656cd67..1279593a0 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/FastBinarySearch.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/FastBinarySearch.java @@ -1,35 +1,26 @@ - /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. 
All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/FastBinarySearch.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/FastBinarySearch.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; +import com.ibm.icu.text.NumberFormat; import java.util.Arrays; import java.util.Random; -import com.ibm.icu.text.NumberFormat; - /** - * Quick & Dirty test program for fast (unrolled) binary search - * Should use new PerfTest once that is done, although since there is no object - * creation the numbers should be fairly reliable. + * Quick & Dirty test program for fast (unrolled) binary search Should use new PerfTest once that is + * done, although since there is no object creation the numbers should be fairly reliable. */ +public final class FastBinarySearch { -final public class FastBinarySearch { - - /** - * Testing - */ - - static public void test() { + /** Testing */ + public static void test() { perfTest(100, 100); // warmup // try different combinations of data size and iterations @@ -63,7 +54,7 @@ static void perfTest(int dataSize, int iterations) { fbs.setData(myData, myData.length); // produce probe data - final int[] probe = new int[myData.length*2]; + final int[] probe = new int[myData.length * 2]; for (int i = 0; i < probe.length; ++i) { probe[i] = (int) (random.nextDouble() * myData.length * 3); } @@ -82,7 +73,7 @@ static void perfTest(int dataSize, int iterations) { } } endTime = System.currentTimeMillis(); - baseTime = time = (endTime - startTime)*1000/totalIterations; + baseTime = time = (endTime - startTime) * 1000 / totalIterations; System.out.println("Basic; time=" + time + " microsecs/call"); startTime = System.currentTimeMillis(); @@ -92,8 +83,9 @@ static void perfTest(int dataSize, int iterations) { } } endTime = System.currentTimeMillis(); - time = (endTime - startTime)*1000/totalIterations; - System.out.println("Fast; time=" + time + " microsecs/call\t" + percent.format(time/baseTime-1)); + time = (endTime - startTime) * 1000 / totalIterations; + System.out.println( + "Fast; time=" + time + " microsecs/call\t" + 
percent.format(time / baseTime - 1)); startTime = System.currentTimeMillis(); for (int testCount = 0; testCount < iterations; ++testCount) { @@ -102,11 +94,14 @@ static void perfTest(int dataSize, int iterations) { } } endTime = System.currentTimeMillis(); - time = (endTime - startTime)*1000/totalIterations; - System.out.println("Compact; time=" + time + " microsecs/call\t" + percent.format(time/baseTime-1)); + time = (endTime - startTime) * 1000 / totalIterations; + System.out.println( + "Compact; time=" + + time + + " microsecs/call\t" + + percent.format(time / baseTime - 1)); } - static void validityTest() { final Random random = new Random(123456789L); final int[] myData = new int[50]; @@ -116,11 +111,11 @@ static void validityTest() { // produce test case double ran = random.nextDouble(); - //System.out.println(ran); - final int myCount = 2+ (int) (ran * (myData.length - 2)); + // System.out.println(ran); + final int myCount = 2 + (int) (ran * (myData.length - 2)); for (int i = 0; i < myCount; ++i) { ran = random.nextDouble(); - //System.out.println(ran); + // System.out.println(ran); myData[i] = (int) (ran * myData.length * 3); } System.out.println("Trial " + testCount + ", len: " + myCount); @@ -137,7 +132,7 @@ static void validityTest() { System.out.println(fbs); } System.out.println("Error: probe=" + i + ", brute=" + brute + ", fast=" + fast); - fast = fbs.highestIndexLEQ(i); // do again with debugger + fast = fbs.highestIndexLEQ(i); // do again with debugger ok = false; } } @@ -147,10 +142,7 @@ static void validityTest() { } } - /** - * Set the data to be scanned. It must be in sorted order. - */ - + /** Set the data to be scanned. It must be in sorted order. 
*/ public void setData(int data[], int count) { this.data = data.clone(); // clone for safety @@ -158,10 +150,7 @@ public void setData(int data[], int count) { this.count = count; } - /** - * Basic binary search - */ - + /** Basic binary search */ private final int findCodePoint(int c) { // Return the smallest i such that c < list[i]. Assume // list[len - 1] == HIGH and that c is legal (0..HIGH-1). @@ -172,7 +161,7 @@ private final int findCodePoint(int c) { int hi = count - 1; // invariant: c >= list[lo] // invariant: c < list[hi] - for (;;) { + for (; ; ) { final int i = (lo + hi) >>> 1; if (i == lo) { return hi; @@ -186,10 +175,9 @@ private final int findCodePoint(int c) { } /** - * @return greatest index whose value is less than or equal to the searchValue. - * If there is no such index, then -1 is returned + * @return greatest index whose value is less than or equal to the searchValue. If there is no + * such index, then -1 is returned */ - public int bruteForce(int searchValue) { int i = count; while (--i >= 0 && data[i] > searchValue) {} @@ -197,10 +185,9 @@ public int bruteForce(int searchValue) { } /** - * @return greatest index such that data[index] <= searchValue - * If there is no such index (e.g. searchValue < data[0]), then -1 is returned + * @return greatest index such that data[index] <= searchValue If there is no such index (e.g. 
+ * searchValue < data[0]), then -1 is returned */ - public int highestIndexLEQ(int searchValue) { if (!isValid) { @@ -223,114 +210,145 @@ public int highestIndexLEQ(int searchValue) { // The invariant AFTER each line is that data[low] < searchValue <= data[high] switch (power) { - //case 31: if (searchValue < data[temp = high-0x40000000]) high = temp; // no unsigned int in Java - case 30: if (searchValue < data[temp = high-0x20000000]) { - high = temp; - } - case 29: if (searchValue < data[temp = high-0x10000000]) { - high = temp; - } + // case 31: if (searchValue < data[temp = high-0x40000000]) high = temp; // no + // unsigned int in Java + case 30: + if (searchValue < data[temp = high - 0x20000000]) { + high = temp; + } + case 29: + if (searchValue < data[temp = high - 0x10000000]) { + high = temp; + } - case 28: if (searchValue < data[temp = high- 0x8000000]) { - high = temp; - } - case 27: if (searchValue < data[temp = high- 0x4000000]) { - high = temp; - } - case 26: if (searchValue < data[temp = high- 0x2000000]) { - high = temp; - } - case 25: if (searchValue < data[temp = high- 0x1000000]) { - high = temp; - } + case 28: + if (searchValue < data[temp = high - 0x8000000]) { + high = temp; + } + case 27: + if (searchValue < data[temp = high - 0x4000000]) { + high = temp; + } + case 26: + if (searchValue < data[temp = high - 0x2000000]) { + high = temp; + } + case 25: + if (searchValue < data[temp = high - 0x1000000]) { + high = temp; + } - case 24: if (searchValue < data[temp = high- 0x800000]) { - high = temp; - } - case 23: if (searchValue < data[temp = high- 0x400000]) { - high = temp; - } - case 22: if (searchValue < data[temp = high- 0x200000]) { - high = temp; - } - case 21: if (searchValue < data[temp = high- 0x100000]) { - high = temp; - } + case 24: + if (searchValue < data[temp = high - 0x800000]) { + high = temp; + } + case 23: + if (searchValue < data[temp = high - 0x400000]) { + high = temp; + } + case 22: + if (searchValue < data[temp = high - 
0x200000]) { + high = temp; + } + case 21: + if (searchValue < data[temp = high - 0x100000]) { + high = temp; + } - case 20: if (searchValue < data[temp = high- 0x80000]) { - high = temp; - } - case 19: if (searchValue < data[temp = high- 0x40000]) { - high = temp; - } - case 18: if (searchValue < data[temp = high- 0x20000]) { - high = temp; - } - case 17: if (searchValue < data[temp = high- 0x10000]) { - high = temp; - } + case 20: + if (searchValue < data[temp = high - 0x80000]) { + high = temp; + } + case 19: + if (searchValue < data[temp = high - 0x40000]) { + high = temp; + } + case 18: + if (searchValue < data[temp = high - 0x20000]) { + high = temp; + } + case 17: + if (searchValue < data[temp = high - 0x10000]) { + high = temp; + } - case 16: if (searchValue < data[temp = high- 0x8000]) { - high = temp; - } - case 15: if (searchValue < data[temp = high- 0x4000]) { - high = temp; - } - case 14: if (searchValue < data[temp = high- 0x2000]) { - high = temp; - } - case 13: if (searchValue < data[temp = high- 0x1000]) { - high = temp; - } + case 16: + if (searchValue < data[temp = high - 0x8000]) { + high = temp; + } + case 15: + if (searchValue < data[temp = high - 0x4000]) { + high = temp; + } + case 14: + if (searchValue < data[temp = high - 0x2000]) { + high = temp; + } + case 13: + if (searchValue < data[temp = high - 0x1000]) { + high = temp; + } - case 12: if (searchValue < data[temp = high- 0x800]) { - high = temp; - } - case 11: if (searchValue < data[temp = high- 0x400]) { - high = temp; - } - case 10: if (searchValue < data[temp = high- 0x200]) { - high = temp; - } - case 9: if (searchValue < data[temp = high- 0x100]) { - high = temp; - } + case 12: + if (searchValue < data[temp = high - 0x800]) { + high = temp; + } + case 11: + if (searchValue < data[temp = high - 0x400]) { + high = temp; + } + case 10: + if (searchValue < data[temp = high - 0x200]) { + high = temp; + } + case 9: + if (searchValue < data[temp = high - 0x100]) { + high = temp; + } - 
case 8: if (searchValue < data[temp = high- 0x80]) { - high = temp; - } - case 7: if (searchValue < data[temp = high- 0x40]) { - high = temp; - } - case 6: if (searchValue < data[temp = high- 0x20]) { - high = temp; - } - case 5: if (searchValue < data[temp = high- 0x10]) { - high = temp; - } + case 8: + if (searchValue < data[temp = high - 0x80]) { + high = temp; + } + case 7: + if (searchValue < data[temp = high - 0x40]) { + high = temp; + } + case 6: + if (searchValue < data[temp = high - 0x20]) { + high = temp; + } + case 5: + if (searchValue < data[temp = high - 0x10]) { + high = temp; + } - case 4: if (searchValue < data[temp = high- 0x8]) { - high = temp; - } - case 3: if (searchValue < data[temp = high- 0x4]) { - high = temp; - } - case 2: if (searchValue < data[temp = high- 0x2]) { - high = temp; - } - case 1: if (searchValue < data[temp = high- 0x1]) { - high = temp; - } + case 4: + if (searchValue < data[temp = high - 0x8]) { + high = temp; + } + case 3: + if (searchValue < data[temp = high - 0x4]) { + high = temp; + } + case 2: + if (searchValue < data[temp = high - 0x2]) { + high = temp; + } + case 1: + if (searchValue < data[temp = high - 0x1]) { + high = temp; + } } if (high == topOfHigh && searchValue >= data[high]) { return high; } - return high-1; + return high - 1; } - // NOTE: on some machines the above may not be optimal, if the size of the function - // forces code out of the cache. For that case, it would be better for program in a loop, like the following + // forces code out of the cache. For that case, it would be better for program in a loop, like + // the following public int highestIndexLEQ2(int searchValue) { @@ -340,19 +358,17 @@ public int highestIndexLEQ2(int searchValue) { int temp; int high = searchValue < data[topOfLow] ? 
topOfLow : topOfHigh; for (int delta = deltaStart; delta != 0; delta >>= 1) { - if (searchValue < data[temp = high-delta]) { + if (searchValue < data[temp = high - delta]) { high = temp; } } if (high == topOfHigh && searchValue >= data[high]) { return high; } - return high-1; + return high - 1; } - /** - * For debugging - */ + /** For debugging */ @Override public String toString() { String result = "["; @@ -369,7 +385,6 @@ public String toString() { return result; } - // ================ Privates ================ // data @@ -385,12 +400,12 @@ private void validate() { } // find greatest power of 2 less than or equal to count - for (power = exp2.length-1; power > 0 && exp2[power] > count; power--) {} + for (power = exp2.length - 1; power > 0 && exp2[power] > count; power--) {} // determine the starting points topOfLow = exp2[power] - 1; topOfHigh = count - 1; - deltaStart = exp2[power-1]; + deltaStart = exp2[power - 1]; isValid = true; } @@ -410,4 +425,4 @@ private void validate() { 0x1000000, 0x2000000, 0x4000000, 0x8000000, 0x10000000, 0x20000000 // , 0x40000000 // no unsigned int in Java }; -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/FastBinarySearchTest.java b/unicodetools/src/main/java/org/unicode/text/utility/FastBinarySearchTest.java index 93ee8e426..7c65f4a06 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/FastBinarySearchTest.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/FastBinarySearchTest.java @@ -12,42 +12,36 @@ package org.unicode.text.utility; -public class FastBinarySearchTest -{ - static boolean printResult(FastIntBinarySearch search, int value) - { +public class FastBinarySearchTest { + static boolean printResult(FastIntBinarySearch search, int value) { final int ndx = search.findIndex(value); final int data[] = search.getData(); String errorString = null; if (ndx < 0) { if (!(ndx < data[0])) { - errorString = "" + value +" < "+data[0]; + errorString = "" + 
value + " < " + data[0]; } - } - else if (ndx < data.length -1) { - if (!(data[ndx] <= value && value < data[ndx+1])) { - errorString = "" + data[ndx]+"<="+value+"<"+data[ndx+1]; + } else if (ndx < data.length - 1) { + if (!(data[ndx] <= value && value < data[ndx + 1])) { + errorString = "" + data[ndx] + "<=" + value + "<" + data[ndx + 1]; } - } - else { + } else { if (!(data[ndx] <= value)) { - errorString = ""+data[ndx]+"<"+value; + errorString = "" + data[ndx] + "<" + value; } } if (errorString != null) { - System.out.println("ERROR: findIndex("+value+") => "+ndx+" "+errorString); + System.out.println("ERROR: findIndex(" + value + ") => " + ndx + " " + errorString); return false; - } - else { + } else { return true; } } - static void test(int testArray[]) - { + static void test(int testArray[]) { boolean passed = true; final FastIntBinarySearch search = new FastIntBinarySearch(testArray); - for (int i = -1; passed && i < testArray[testArray.length-1]+2; i++) { + for (int i = -1; passed && i < testArray[testArray.length - 1] + 2; i++) { passed = passed && printResult(search, i); } if (passed) { @@ -57,18 +51,20 @@ static void test(int testArray[]) } } - // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + // 16, 17 public static int testArray1[] = {1}; public static int testArray2[] = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 19, 19, 25, 27, 29, 31}; public static int testArray3[] = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 19, 19, 25, 27, 29}; - public static int testArray4[] = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 19, 19, 25, 27, 29, 31, 33}; + public static int testArray4[] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 19, 19, 25, 27, 29, 31, 33 + }; - public static void main(String args[]) - { + public static void main(String args[]) { System.out.println("running 4 tests..."); test(testArray1); test(testArray2); test(testArray3); test(testArray4); } -} \ No newline at end of file +} diff --git 
a/unicodetools/src/main/java/org/unicode/text/utility/FastByteBinarySearch.java b/unicodetools/src/main/java/org/unicode/text/utility/FastByteBinarySearch.java index b378ac3da..ac042a24e 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/FastByteBinarySearch.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/FastByteBinarySearch.java @@ -12,119 +12,115 @@ package org.unicode.text.utility; -public class FastByteBinarySearch -{ +public class FastByteBinarySearch { private final byte dataArray[]; private int auxStart; private int power; - private static final int exp2[] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072 }; + private static final int exp2[] = { + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072 + }; - public FastByteBinarySearch(byte data[]) - { + public FastByteBinarySearch(byte data[]) { if (data.length < 1) { throw new IllegalArgumentException(); } - if (data.length >= exp2[exp2.length-1]) { + if (data.length >= exp2[exp2.length - 1]) { throw new IllegalArgumentException(); } dataArray = data; - for (power = exp2.length-1; power > 0 && dataArray.length < exp2[power]; power--) {} + for (power = exp2.length - 1; power > 0 && dataArray.length < exp2[power]; power--) {} if (exp2[power] != dataArray.length) { auxStart = dataArray.length - exp2[power]; - } - else { + } else { auxStart = 0; } } - public byte[] getData() - { + public byte[] getData() { return dataArray; } - public int findIndex(int value) - { - int index = exp2[power]-1; + public int findIndex(int value) { + int index = exp2[power] - 1; if (value >= dataArray[auxStart]) { index += auxStart; } switch (power) { - case 17: - if (value < dataArray[index-65536]) { - index -= 65536; - } - case 16: - if (value < dataArray[index-32768]) { - index -= 32768; - } - case 15: - if (value < dataArray[index-16384]) { - index -= 16384; - } - case 14: - if (value < dataArray[index-8192]) { - index 
-= 8192; - } - case 13: - if (value < dataArray[index-4096]) { - index -= 4096; - } - case 12: - if (value < dataArray[index-2048]) { - index -= 2048; - } - case 11: - if (value < dataArray[index-1024]) { - index -= 1024; - } - case 10: - if (value < dataArray[index-512]) { - index -= 512; - } - case 9: - if (value < dataArray[index-256]) { - index -= 256; - } - case 8: - if (value < dataArray[index-128]) { - index -= 128; - } - case 7: - if (value < dataArray[index-64]) { - index -= 64; - } - case 6: - if (value < dataArray[index-32]) { - index -= 32; - } - case 5: - if (value < dataArray[index-16]) { - index -= 16; - } - case 4: - if (value < dataArray[index-8]) { - index -= 8; - } - case 3: - if (value < dataArray[index-4]) { - index -= 4; - } - case 2: - if (value < dataArray[index-2]) { - index -= 2; - } - case 1: - if (value < dataArray[index-1]) { - index -= 1; - } - case 0: - if (value < dataArray[index]) { - index -= 1; - } + case 17: + if (value < dataArray[index - 65536]) { + index -= 65536; + } + case 16: + if (value < dataArray[index - 32768]) { + index -= 32768; + } + case 15: + if (value < dataArray[index - 16384]) { + index -= 16384; + } + case 14: + if (value < dataArray[index - 8192]) { + index -= 8192; + } + case 13: + if (value < dataArray[index - 4096]) { + index -= 4096; + } + case 12: + if (value < dataArray[index - 2048]) { + index -= 2048; + } + case 11: + if (value < dataArray[index - 1024]) { + index -= 1024; + } + case 10: + if (value < dataArray[index - 512]) { + index -= 512; + } + case 9: + if (value < dataArray[index - 256]) { + index -= 256; + } + case 8: + if (value < dataArray[index - 128]) { + index -= 128; + } + case 7: + if (value < dataArray[index - 64]) { + index -= 64; + } + case 6: + if (value < dataArray[index - 32]) { + index -= 32; + } + case 5: + if (value < dataArray[index - 16]) { + index -= 16; + } + case 4: + if (value < dataArray[index - 8]) { + index -= 8; + } + case 3: + if (value < dataArray[index - 4]) { + 
index -= 4; + } + case 2: + if (value < dataArray[index - 2]) { + index -= 2; + } + case 1: + if (value < dataArray[index - 1]) { + index -= 1; + } + case 0: + if (value < dataArray[index]) { + index -= 1; + } } return index; } - -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/FastCharBinarySearch.java b/unicodetools/src/main/java/org/unicode/text/utility/FastCharBinarySearch.java index 08c919e5f..1ecb54c32 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/FastCharBinarySearch.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/FastCharBinarySearch.java @@ -12,119 +12,115 @@ package org.unicode.text.utility; - -public class FastCharBinarySearch -{ +public class FastCharBinarySearch { private final char dataArray[]; private int auxStart; private int power; - private static final int exp2[] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072 }; + private static final int exp2[] = { + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072 + }; - public FastCharBinarySearch(char data[]) - { + public FastCharBinarySearch(char data[]) { if (data.length < 1) { throw new IllegalArgumentException(); } - if (data.length >= exp2[exp2.length-1]) { + if (data.length >= exp2[exp2.length - 1]) { throw new IllegalArgumentException(); } dataArray = data; - for (power = exp2.length-1; power > 0 && dataArray.length < exp2[power]; power--) {} + for (power = exp2.length - 1; power > 0 && dataArray.length < exp2[power]; power--) {} if (exp2[power] != dataArray.length) { auxStart = dataArray.length - exp2[power]; - } - else { + } else { auxStart = 0; } } - public char[] getData() - { + public char[] getData() { return dataArray; } - public int findIndex(int value) - { - int index = exp2[power]-1; + public int findIndex(int value) { + int index = exp2[power] - 1; if (value >= dataArray[auxStart]) { index += auxStart; } switch (power) { 
- case 17: - if (value < dataArray[index-65536]) { - index -= 65536; - } - case 16: - if (value < dataArray[index-32768]) { - index -= 32768; - } - case 15: - if (value < dataArray[index-16384]) { - index -= 16384; - } - case 14: - if (value < dataArray[index-8192]) { - index -= 8192; - } - case 13: - if (value < dataArray[index-4096]) { - index -= 4096; - } - case 12: - if (value < dataArray[index-2048]) { - index -= 2048; - } - case 11: - if (value < dataArray[index-1024]) { - index -= 1024; - } - case 10: - if (value < dataArray[index-512]) { - index -= 512; - } - case 9: - if (value < dataArray[index-256]) { - index -= 256; - } - case 8: - if (value < dataArray[index-128]) { - index -= 128; - } - case 7: - if (value < dataArray[index-64]) { - index -= 64; - } - case 6: - if (value < dataArray[index-32]) { - index -= 32; - } - case 5: - if (value < dataArray[index-16]) { - index -= 16; - } - case 4: - if (value < dataArray[index-8]) { - index -= 8; - } - case 3: - if (value < dataArray[index-4]) { - index -= 4; - } - case 2: - if (value < dataArray[index-2]) { - index -= 2; - } - case 1: - if (value < dataArray[index-1]) { - index -= 1; - } - case 0: - if (value < dataArray[index]) { - index -= 1; - } + case 17: + if (value < dataArray[index - 65536]) { + index -= 65536; + } + case 16: + if (value < dataArray[index - 32768]) { + index -= 32768; + } + case 15: + if (value < dataArray[index - 16384]) { + index -= 16384; + } + case 14: + if (value < dataArray[index - 8192]) { + index -= 8192; + } + case 13: + if (value < dataArray[index - 4096]) { + index -= 4096; + } + case 12: + if (value < dataArray[index - 2048]) { + index -= 2048; + } + case 11: + if (value < dataArray[index - 1024]) { + index -= 1024; + } + case 10: + if (value < dataArray[index - 512]) { + index -= 512; + } + case 9: + if (value < dataArray[index - 256]) { + index -= 256; + } + case 8: + if (value < dataArray[index - 128]) { + index -= 128; + } + case 7: + if (value < dataArray[index - 64]) 
{ + index -= 64; + } + case 6: + if (value < dataArray[index - 32]) { + index -= 32; + } + case 5: + if (value < dataArray[index - 16]) { + index -= 16; + } + case 4: + if (value < dataArray[index - 8]) { + index -= 8; + } + case 3: + if (value < dataArray[index - 4]) { + index -= 4; + } + case 2: + if (value < dataArray[index - 2]) { + index -= 2; + } + case 1: + if (value < dataArray[index - 1]) { + index -= 1; + } + case 0: + if (value < dataArray[index]) { + index -= 1; + } } return index; } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/FastIntBinarySearch.java b/unicodetools/src/main/java/org/unicode/text/utility/FastIntBinarySearch.java index e04a9f817..5fc54640c 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/FastIntBinarySearch.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/FastIntBinarySearch.java @@ -12,119 +12,115 @@ package org.unicode.text.utility; -public class FastIntBinarySearch -{ +public class FastIntBinarySearch { private final int dataArray[]; private int auxStart; private int power; - private static final int exp2[] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072 }; + private static final int exp2[] = { + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072 + }; - public FastIntBinarySearch(int data[]) - { + public FastIntBinarySearch(int data[]) { if (data.length < 1) { throw new IllegalArgumentException(); } - if (data.length >= exp2[exp2.length-1]) { + if (data.length >= exp2[exp2.length - 1]) { throw new IllegalArgumentException(); } dataArray = data; - for (power = exp2.length-1; power > 0 && dataArray.length < exp2[power]; power--) {} + for (power = exp2.length - 1; power > 0 && dataArray.length < exp2[power]; power--) {} if (exp2[power] != dataArray.length) { auxStart = dataArray.length - exp2[power]; - } - else { + } else { auxStart = 0; } } - public int[] getData() - 
{ + public int[] getData() { return dataArray; } - public int findIndex(int value) - { - int index = exp2[power]-1; + public int findIndex(int value) { + int index = exp2[power] - 1; if (value >= dataArray[auxStart]) { index += auxStart; } switch (power) { - case 17: - if (value < dataArray[index-65536]) { - index -= 65536; - } - case 16: - if (value < dataArray[index-32768]) { - index -= 32768; - } - case 15: - if (value < dataArray[index-16384]) { - index -= 16384; - } - case 14: - if (value < dataArray[index-8192]) { - index -= 8192; - } - case 13: - if (value < dataArray[index-4096]) { - index -= 4096; - } - case 12: - if (value < dataArray[index-2048]) { - index -= 2048; - } - case 11: - if (value < dataArray[index-1024]) { - index -= 1024; - } - case 10: - if (value < dataArray[index-512]) { - index -= 512; - } - case 9: - if (value < dataArray[index-256]) { - index -= 256; - } - case 8: - if (value < dataArray[index-128]) { - index -= 128; - } - case 7: - if (value < dataArray[index-64]) { - index -= 64; - } - case 6: - if (value < dataArray[index-32]) { - index -= 32; - } - case 5: - if (value < dataArray[index-16]) { - index -= 16; - } - case 4: - if (value < dataArray[index-8]) { - index -= 8; - } - case 3: - if (value < dataArray[index-4]) { - index -= 4; - } - case 2: - if (value < dataArray[index-2]) { - index -= 2; - } - case 1: - if (value < dataArray[index-1]) { - index -= 1; - } - case 0: - if (value < dataArray[index]) { - index -= 1; - } + case 17: + if (value < dataArray[index - 65536]) { + index -= 65536; + } + case 16: + if (value < dataArray[index - 32768]) { + index -= 32768; + } + case 15: + if (value < dataArray[index - 16384]) { + index -= 16384; + } + case 14: + if (value < dataArray[index - 8192]) { + index -= 8192; + } + case 13: + if (value < dataArray[index - 4096]) { + index -= 4096; + } + case 12: + if (value < dataArray[index - 2048]) { + index -= 2048; + } + case 11: + if (value < dataArray[index - 1024]) { + index -= 1024; + } + 
case 10: + if (value < dataArray[index - 512]) { + index -= 512; + } + case 9: + if (value < dataArray[index - 256]) { + index -= 256; + } + case 8: + if (value < dataArray[index - 128]) { + index -= 128; + } + case 7: + if (value < dataArray[index - 64]) { + index -= 64; + } + case 6: + if (value < dataArray[index - 32]) { + index -= 32; + } + case 5: + if (value < dataArray[index - 16]) { + index -= 16; + } + case 4: + if (value < dataArray[index - 8]) { + index -= 8; + } + case 3: + if (value < dataArray[index - 4]) { + index -= 4; + } + case 2: + if (value < dataArray[index - 2]) { + index -= 2; + } + case 1: + if (value < dataArray[index - 1]) { + index -= 1; + } + case 0: + if (value < dataArray[index]) { + index -= 1; + } } return index; } - -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/FastLongBinarySearch.java b/unicodetools/src/main/java/org/unicode/text/utility/FastLongBinarySearch.java index 7d632e259..4620b56d8 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/FastLongBinarySearch.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/FastLongBinarySearch.java @@ -12,119 +12,115 @@ package org.unicode.text.utility; - -public class FastLongBinarySearch -{ +public class FastLongBinarySearch { private final long dataArray[]; private int auxStart; private int power; - private static final int exp2[] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072 }; + private static final int exp2[] = { + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072 + }; - public FastLongBinarySearch(long data[]) - { + public FastLongBinarySearch(long data[]) { if (data.length < 1) { throw new IllegalArgumentException(); } - if (data.length >= exp2[exp2.length-1]) { + if (data.length >= exp2[exp2.length - 1]) { throw new IllegalArgumentException(); } dataArray = data; - for (power = exp2.length-1; power > 0 && 
dataArray.length < exp2[power]; power--) {} + for (power = exp2.length - 1; power > 0 && dataArray.length < exp2[power]; power--) {} if (exp2[power] != dataArray.length) { auxStart = dataArray.length - exp2[power]; - } - else { + } else { auxStart = 0; } } - public long[] getData() - { + public long[] getData() { return dataArray; } - public int findIndex(int value) - { - int index = exp2[power]-1; + public int findIndex(int value) { + int index = exp2[power] - 1; if (value >= dataArray[auxStart]) { index += auxStart; } switch (power) { - case 17: - if (value < dataArray[index-65536]) { - index -= 65536; - } - case 16: - if (value < dataArray[index-32768]) { - index -= 32768; - } - case 15: - if (value < dataArray[index-16384]) { - index -= 16384; - } - case 14: - if (value < dataArray[index-8192]) { - index -= 8192; - } - case 13: - if (value < dataArray[index-4096]) { - index -= 4096; - } - case 12: - if (value < dataArray[index-2048]) { - index -= 2048; - } - case 11: - if (value < dataArray[index-1024]) { - index -= 1024; - } - case 10: - if (value < dataArray[index-512]) { - index -= 512; - } - case 9: - if (value < dataArray[index-256]) { - index -= 256; - } - case 8: - if (value < dataArray[index-128]) { - index -= 128; - } - case 7: - if (value < dataArray[index-64]) { - index -= 64; - } - case 6: - if (value < dataArray[index-32]) { - index -= 32; - } - case 5: - if (value < dataArray[index-16]) { - index -= 16; - } - case 4: - if (value < dataArray[index-8]) { - index -= 8; - } - case 3: - if (value < dataArray[index-4]) { - index -= 4; - } - case 2: - if (value < dataArray[index-2]) { - index -= 2; - } - case 1: - if (value < dataArray[index-1]) { - index -= 1; - } - case 0: - if (value < dataArray[index]) { - index -= 1; - } + case 17: + if (value < dataArray[index - 65536]) { + index -= 65536; + } + case 16: + if (value < dataArray[index - 32768]) { + index -= 32768; + } + case 15: + if (value < dataArray[index - 16384]) { + index -= 16384; + } + case 
14: + if (value < dataArray[index - 8192]) { + index -= 8192; + } + case 13: + if (value < dataArray[index - 4096]) { + index -= 4096; + } + case 12: + if (value < dataArray[index - 2048]) { + index -= 2048; + } + case 11: + if (value < dataArray[index - 1024]) { + index -= 1024; + } + case 10: + if (value < dataArray[index - 512]) { + index -= 512; + } + case 9: + if (value < dataArray[index - 256]) { + index -= 256; + } + case 8: + if (value < dataArray[index - 128]) { + index -= 128; + } + case 7: + if (value < dataArray[index - 64]) { + index -= 64; + } + case 6: + if (value < dataArray[index - 32]) { + index -= 32; + } + case 5: + if (value < dataArray[index - 16]) { + index -= 16; + } + case 4: + if (value < dataArray[index - 8]) { + index -= 8; + } + case 3: + if (value < dataArray[index - 4]) { + index -= 4; + } + case 2: + if (value < dataArray[index - 2]) { + index -= 2; + } + case 1: + if (value < dataArray[index - 1]) { + index -= 1; + } + case 0: + if (value < dataArray[index]) { + index -= 1; + } } return index; } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/FastShortBinarySearch.java b/unicodetools/src/main/java/org/unicode/text/utility/FastShortBinarySearch.java index 1f384bef8..28fa2608f 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/FastShortBinarySearch.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/FastShortBinarySearch.java @@ -12,119 +12,115 @@ package org.unicode.text.utility; - -public class FastShortBinarySearch -{ +public class FastShortBinarySearch { private final short dataArray[]; private int auxStart; private int power; - private static final int exp2[] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072 }; + private static final int exp2[] = { + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072 + }; - public FastShortBinarySearch(short data[]) - { + public 
FastShortBinarySearch(short data[]) { if (data.length < 1) { throw new IllegalArgumentException(); } - if (data.length >= exp2[exp2.length-1]) { + if (data.length >= exp2[exp2.length - 1]) { throw new IllegalArgumentException(); } dataArray = data; - for (power = exp2.length-1; power > 0 && dataArray.length < exp2[power]; power--) {} + for (power = exp2.length - 1; power > 0 && dataArray.length < exp2[power]; power--) {} if (exp2[power] != dataArray.length) { auxStart = dataArray.length - exp2[power]; - } - else { + } else { auxStart = 0; } } - public short[] getData() - { + public short[] getData() { return dataArray; } - public int findIndex(int value) - { - int index = exp2[power]-1; + public int findIndex(int value) { + int index = exp2[power] - 1; if (value >= dataArray[auxStart]) { index += auxStart; } switch (power) { - case 17: - if (value < dataArray[index-65536]) { - index -= 65536; - } - case 16: - if (value < dataArray[index-32768]) { - index -= 32768; - } - case 15: - if (value < dataArray[index-16384]) { - index -= 16384; - } - case 14: - if (value < dataArray[index-8192]) { - index -= 8192; - } - case 13: - if (value < dataArray[index-4096]) { - index -= 4096; - } - case 12: - if (value < dataArray[index-2048]) { - index -= 2048; - } - case 11: - if (value < dataArray[index-1024]) { - index -= 1024; - } - case 10: - if (value < dataArray[index-512]) { - index -= 512; - } - case 9: - if (value < dataArray[index-256]) { - index -= 256; - } - case 8: - if (value < dataArray[index-128]) { - index -= 128; - } - case 7: - if (value < dataArray[index-64]) { - index -= 64; - } - case 6: - if (value < dataArray[index-32]) { - index -= 32; - } - case 5: - if (value < dataArray[index-16]) { - index -= 16; - } - case 4: - if (value < dataArray[index-8]) { - index -= 8; - } - case 3: - if (value < dataArray[index-4]) { - index -= 4; - } - case 2: - if (value < dataArray[index-2]) { - index -= 2; - } - case 1: - if (value < dataArray[index-1]) { - index -= 1; - } 
- case 0: - if (value < dataArray[index]) { - index -= 1; - } + case 17: + if (value < dataArray[index - 65536]) { + index -= 65536; + } + case 16: + if (value < dataArray[index - 32768]) { + index -= 32768; + } + case 15: + if (value < dataArray[index - 16384]) { + index -= 16384; + } + case 14: + if (value < dataArray[index - 8192]) { + index -= 8192; + } + case 13: + if (value < dataArray[index - 4096]) { + index -= 4096; + } + case 12: + if (value < dataArray[index - 2048]) { + index -= 2048; + } + case 11: + if (value < dataArray[index - 1024]) { + index -= 1024; + } + case 10: + if (value < dataArray[index - 512]) { + index -= 512; + } + case 9: + if (value < dataArray[index - 256]) { + index -= 256; + } + case 8: + if (value < dataArray[index - 128]) { + index -= 128; + } + case 7: + if (value < dataArray[index - 64]) { + index -= 64; + } + case 6: + if (value < dataArray[index - 32]) { + index -= 32; + } + case 5: + if (value < dataArray[index - 16]) { + index -= 16; + } + case 4: + if (value < dataArray[index - 8]) { + index -= 8; + } + case 3: + if (value < dataArray[index - 4]) { + index -= 4; + } + case 2: + if (value < dataArray[index - 2]) { + index -= 2; + } + case 1: + if (value < dataArray[index - 1]) { + index -= 1; + } + case 0: + if (value < dataArray[index]) { + index -= 1; + } } return index; } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/FastUnicodeSet.java b/unicodetools/src/main/java/org/unicode/text/utility/FastUnicodeSet.java index 2131fea8f..43c0ff86e 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/FastUnicodeSet.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/FastUnicodeSet.java @@ -6,10 +6,11 @@ public final class FastUnicodeSet { static final int index1Length = 272; static final int index2Length = 64; /** - * Structured as a simple trie. The last level is a long (64 bits). 
It is accessed by - * taking successive parts of the codepoint + * Structured as a simple trie. The last level is a long (64 bits). It is accessed by taking + * successive parts of the codepoint */ private final long[][] data = new long[272][64]; + private int size = 0; public FastUnicodeSet(UnicodeSet source) { @@ -36,7 +37,7 @@ public boolean contains(int codePoint) { index1 >>= 6; final int index2 = index1 & 0x3F; // middle 6 bits index1 >>= 6; // top 9 bits - return 0 != (data[index1][index2] & (1L<>= 6; - final int index2 = index1 & 0x3F; // middle 6 bits - index1 >>= 6; // top 11 bits - data[index1][index2] |= 1L<>= 6; // top 11 bits + data[index1][index2] |= 1L << index3; } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/FastUnicodeSetTest.java b/unicodetools/src/main/java/org/unicode/text/utility/FastUnicodeSetTest.java index 9cb6e118d..f38335979 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/FastUnicodeSetTest.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/FastUnicodeSetTest.java @@ -5,12 +5,18 @@ public class FastUnicodeSetTest { - static final UnicodeSet[] testList = {new UnicodeSet("[:whitespace:]"), new UnicodeSet("[a-f]"), new UnicodeSet("[:alphabetic:]"), new UnicodeSet("[[:alphabetic:][\\uFFFD]]"), new UnicodeSet("[:cn:]")}; + static final UnicodeSet[] testList = { + new UnicodeSet("[:whitespace:]"), + new UnicodeSet("[a-f]"), + new UnicodeSet("[:alphabetic:]"), + new UnicodeSet("[[:alphabetic:][\\uFFFD]]"), + new UnicodeSet("[:cn:]") + }; public static void main(String[] args) { new FastUnicodeSetTest().TestContains(); // [[:alphabetic:][\uFFFD]] containsAll [:cn:] - UnicodeSet lastSet = testList[testList.length-1]; + UnicodeSet lastSet = testList[testList.length - 1]; if (System.getProperty("DO_QUICK") != null) { for (final UnicodeSet set : testList) { @@ -24,11 +30,17 @@ public static void main(String[] args) { } } - lastSet = testList[testList.length-1]; + 
lastSet = testList[testList.length - 1]; FastUnicodeSet lastAlt = new FastUnicodeSet(lastSet); for (final UnicodeSet set : testList) { System.out.println(); - System.out.println("Set:\t" + set + "\tsize:\t" + set.size() + "\tranges:\t" + set.getRangeCount()); + System.out.println( + "Set:\t" + + set + + "\tsize:\t" + + set.size() + + "\tranges:\t" + + set.getRangeCount()); final FastUnicodeSet alt = new FastUnicodeSet(set); verify(set, lastSet, alt, lastAlt); verify(set, set, alt, alt); @@ -64,28 +76,26 @@ public void TestContains() { final boolean equals = i == j; if (containsNone != x.containsNone(y)) { x.containsNone(y); // repeat for debugging - errln("FAILED: " + x + " containsSome " + y); + errln("FAILED: " + x + " containsSome " + y); } if (containsAll != x.containsAll(y)) { x.containsAll(y); // repeat for debugging - errln("FAILED: " + x + " containsAll " + y); + errln("FAILED: " + x + " containsAll " + y); } if (equals != x.equals(y)) { x.equals(y); // repeat for debugging - errln("FAILED: " + x + " equals " + y); + errln("FAILED: " + x + " equals " + y); } } } } - /** - * Convert a bitmask to a UnicodeSet. - */ + /** Convert a bitmask to a UnicodeSet. 
*/ FastUnicodeSet bitsToSet(int a) { final UnicodeSet result = new UnicodeSet(); for (int i = 0; i < 32; ++i) { - if ((a & (1<= 0; --i) { for (int j = 0; j < 0x10FFFF; ++j) { result ^= set.contains(j); @@ -185,7 +197,7 @@ private static boolean timeContains(UnicodeSet set, FastUnicodeSet alt, int iter } final double lastDelta = timer.getDelta(); - for (timer.reset(); timer.insufficient();) { + for (timer.reset(); timer.insufficient(); ) { for (long i = timer.iterations(); i >= 0; --i) { for (int j = 0; j < 0x10FFFF; ++j) { result ^= alt.contains(j); @@ -198,29 +210,54 @@ private static boolean timeContains(UnicodeSet set, FastUnicodeSet alt, int iter return result; } - private static void show(String set, String relation, String x, String value, double lastDelta, double delta) { - System.out.println(set + "\t" + relation + "\t" + x + "\t" + value + "\told:\t" + lastDelta + "\tnew:\t" + delta + "\t" + percent.format(delta/lastDelta)); + private static void show( + String set, String relation, String x, String value, double lastDelta, double delta) { + System.out.println( + set + + "\t" + + relation + + "\t" + + x + + "\t" + + value + + "\told:\t" + + lastDelta + + "\tnew:\t" + + delta + + "\t" + + percent.format(delta / lastDelta)); } private static final NumberFormat percent = NumberFormat.getPercentInstance(); - private static boolean timeContainsAll(UnicodeSet set, UnicodeSet set2, FastUnicodeSet alt, FastUnicodeSet alt2, int iterations) { + private static boolean timeContainsAll( + UnicodeSet set, + UnicodeSet set2, + FastUnicodeSet alt, + FastUnicodeSet alt2, + int iterations) { boolean result = false; - for (timer.reset(); timer.insufficient();) { + for (timer.reset(); timer.insufficient(); ) { for (long i = timer.iterations(); i >= 0; --i) { result ^= set.containsAll(set2); } } final double lastDelta = timer.getDelta(); - for (timer.reset(); timer.insufficient();) { + for (timer.reset(); timer.insufficient(); ) { for (long i = timer.iterations(); i >= 0; --i) 
{ result ^= alt.containsAll(alt2); } } final double delta = timer.getDelta(); - show(set.toString(), "containsAll", set2.toString(), String.valueOf(set.containsAll(set2)), lastDelta, delta); + show( + set.toString(), + "containsAll", + set2.toString(), + String.valueOf(set.containsAll(set2)), + lastDelta, + delta); return result; } @@ -240,7 +277,8 @@ private static boolean timeContainsAll(UnicodeSet set, UnicodeSet set2, int iter // } // } // double delta = timer.getDelta(); - // show(set.toString(), "containsAll", set2.toString(), String.valueOf(set.containsAll(set2)), lastDelta, delta); + // show(set.toString(), "containsAll", set2.toString(), + // String.valueOf(set.containsAll(set2)), lastDelta, delta); return result; } @@ -260,48 +298,70 @@ private static boolean timeContainsNone(UnicodeSet set, UnicodeSet set2, int ite // } // } // double delta = timer.getDelta(); - // show(set.toString(), "containsNone", set2.toString(), String.valueOf(set.containsNone(set2)), lastDelta, delta); + // show(set.toString(), "containsNone", set2.toString(), + // String.valueOf(set.containsNone(set2)), lastDelta, delta); return result; } - - private static boolean timeContainsNone(UnicodeSet set, UnicodeSet set2, FastUnicodeSet alt, FastUnicodeSet alt2, int iterations) { + private static boolean timeContainsNone( + UnicodeSet set, + UnicodeSet set2, + FastUnicodeSet alt, + FastUnicodeSet alt2, + int iterations) { boolean result = false; - for (timer.reset(); timer.insufficient();) { + for (timer.reset(); timer.insufficient(); ) { for (long i = timer.iterations(); i >= 0; --i) { result ^= set.containsNone(set2); } } final double lastDelta = timer.getDelta(); - for (timer.reset(); timer.insufficient();) { + for (timer.reset(); timer.insufficient(); ) { for (long i = timer.iterations(); i >= 0; --i) { result ^= alt.containsNone(alt2); } } final double delta = timer.getDelta(); - show(set.toString(), "containsNone", set2.toString(), String.valueOf(set.containsNone(set2)), lastDelta, 
delta); + show( + set.toString(), + "containsNone", + set2.toString(), + String.valueOf(set.containsNone(set2)), + lastDelta, + delta); return result; } - private static boolean timeEquals(UnicodeSet set, UnicodeSet set2, FastUnicodeSet alt, FastUnicodeSet alt2, int iterations) { + private static boolean timeEquals( + UnicodeSet set, + UnicodeSet set2, + FastUnicodeSet alt, + FastUnicodeSet alt2, + int iterations) { boolean result = false; - for (timer.reset(); timer.insufficient();) { + for (timer.reset(); timer.insufficient(); ) { for (long i = timer.iterations(); i >= 0; --i) { result ^= set.equals(set2); } } final double lastDelta = timer.getDelta(); - for (timer.reset(); timer.insufficient();) { + for (timer.reset(); timer.insufficient(); ) { for (long i = timer.iterations(); i >= 0; --i) { result ^= alt.equals(alt2); } } final double delta = timer.getDelta(); - show(set.toString(), "equals", set2.toString(), String.valueOf(set.equals(set2)), lastDelta, delta); + show( + set.toString(), + "equals", + set2.toString(), + String.valueOf(set.equals(set2)), + lastDelta, + delta); return result; } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/FileLineIterator.java b/unicodetools/src/main/java/org/unicode/text/utility/FileLineIterator.java index 452e3398b..bd7376801 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/FileLineIterator.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/FileLineIterator.java @@ -1,14 +1,12 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. 
* + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/FileLineIterator.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/FileLineIterator.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; import java.io.BufferedReader; @@ -16,16 +14,16 @@ import java.io.IOException; /** - * Opens a file, and iterates through the lines in the file. - * Options allow trimming and comment handling, and splitting + * Opens a file, and iterates through the lines in the file. Options allow trimming and comment + * handling, and splitting */ public class FileLineIterator { - static public final char NOTCHAR = '\uFFFF'; + public static final char NOTCHAR = '\uFFFF'; // public writable public boolean doCounter = true; public int lineLimit = Integer.MAX_VALUE; - public char commentChar = '#'; // NOTCHAR if no comments + public char commentChar = '#'; // NOTCHAR if no comments public boolean showFilename = true; // public readable @@ -36,9 +34,7 @@ public class FileLineIterator { private BufferedReader br = null; private Utility.Encoding encoding = Utility.UTF8; - /** - * Open the file for reading. If useGenDir is set, use the normal generation directory - */ + /** Open the file for reading. If useGenDir is set, use the normal generation directory */ public void open(String filename, Utility.Encoding encoding) throws IOException { if (showFilename) { Utility.fixDot(); @@ -49,7 +45,8 @@ public void open(String filename, Utility.Encoding encoding) throws IOException } /** - * Fetch a non-zero-length line from the file, stripping comments & using counter, according to settings. + * Fetch a non-zero-length line from the file, stripping comments & using counter, according to + * settings. 
*/ public String read() throws IOException { while (true) { @@ -65,7 +62,10 @@ public String read() throws IOException { } // drop BOM - if (encoding == Utility.UTF8 && counter == 0 && cleanedLine.length() > 0 && cleanedLine.charAt(0) == 0xFEFF) { + if (encoding == Utility.UTF8 + && counter == 0 + && cleanedLine.length() > 0 + && cleanedLine.charAt(0) == 0xFEFF) { cleanedLine = cleanedLine.substring(1); } diff --git a/unicodetools/src/main/java/org/unicode/text/utility/IcuUnicodeNormalizerFactory.java b/unicodetools/src/main/java/org/unicode/text/utility/IcuUnicodeNormalizerFactory.java index 545fe2e89..86051daa6 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/IcuUnicodeNormalizerFactory.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/IcuUnicodeNormalizerFactory.java @@ -6,31 +6,29 @@ */ package org.unicode.text.utility; -import org.unicode.text.utility.UnicodeTransform.Type; - import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.Normalizer2; +import org.unicode.text.utility.UnicodeTransform.Type; /** * @author markdavis - * */ public class IcuUnicodeNormalizerFactory implements UnicodeTransform.Factory { public UnicodeTransform getInstance(Type type) { switch (type) { - case NFC: - return new IcuUnicodeNormalizer(Normalizer2.getNFCInstance()); - case NFKC: - return new IcuUnicodeNormalizer(Normalizer2.getNFKCInstance()); - case NFD: - return new IcuUnicodeNormalizer(Normalizer2.getNFDInstance()); - case NFKD: - return new IcuUnicodeNormalizer(Normalizer2.getNFKDInstance()); - case CASEFOLD: - return new CaseFolder(); - default: - throw new IllegalArgumentException(); + case NFC: + return new IcuUnicodeNormalizer(Normalizer2.getNFCInstance()); + case NFKC: + return new IcuUnicodeNormalizer(Normalizer2.getNFKCInstance()); + case NFD: + return new IcuUnicodeNormalizer(Normalizer2.getNFDInstance()); + case NFKD: + return new IcuUnicodeNormalizer(Normalizer2.getNFKDInstance()); + case CASEFOLD: + return new CaseFolder(); + 
default: + throw new IllegalArgumentException(); } } diff --git a/unicodetools/src/main/java/org/unicode/text/utility/IndentWriter.java b/unicodetools/src/main/java/org/unicode/text/utility/IndentWriter.java index b36f90629..0a5582e06 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/IndentWriter.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/IndentWriter.java @@ -1,14 +1,12 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/IndentWriter.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/IndentWriter.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; import java.io.IOException; @@ -25,31 +23,38 @@ public IndentWriter(Writer writer) { width = 30000; separator = " "; } - public IndentWriter(OutputStream writer, String encoding) - throws UnsupportedEncodingException{ + + public IndentWriter(OutputStream writer, String encoding) throws UnsupportedEncodingException { this.writer = new OutputStreamWriter(writer, encoding); width = 30000; separator = " "; } + public void setSeparator(String separator) { this.separator = separator; } + public String getSeparator() { return separator; } + public void setWidth(int width) { this.width = width; } + public int getWidth() { return width; } + public void indentBy(int indentDelta) throws IOException { indent += indentDelta; flush(); } + public void setIndent(int indent) { this.indent = indent; } + public int getIndent() { return indent; } @@ -70,16 +75,19 @@ public void write(String string) throws IOException { */ public void write(int indent, String string) throws IOException { setIndent(indent); - write(string,0,string.length()); + write(string, 0, string.length()); } + public void writeln(int indent, String string) throws IOException { write(indent, string); flushLine(); } + public void writeln(String string) throws IOException { write(string); flushLine(); } + public void writeln() throws IOException { flushLine(); } @@ -98,7 +106,7 @@ public void write(char cbuf[], int off, int len) throws IOException { public void flushLine() throws IOException { if (buffer.length() != 0) { // indent - writer.write(" ",0,bufferIndent); + writer.write(" ", 0, bufferIndent); writer.write(buffer.toString()); writer.write(EOL); buffer.setLength(0); @@ -116,6 +124,7 @@ public void close() throws IOException { flush(); writer.close(); } + private final Writer writer; private final StringBuffer buffer = new StringBuffer(200); private int width; @@ -123,6 +132,7 @@ public void 
close() throws IOException { private int bufferIndent; private String separator; private static String EOL; + static { // gets platform-specific eol final StringWriter foo = new StringWriter(); final PrintWriter fii = new PrintWriter(foo); @@ -130,4 +140,4 @@ public void close() throws IOException { fii.flush(); EOL = foo.toString(); } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/IntStack.java b/unicodetools/src/main/java/org/unicode/text/utility/IntStack.java index f9268efc7..6224d1b64 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/IntStack.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/IntStack.java @@ -1,14 +1,12 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/IntStack.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/IntStack.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; // ============================================================= @@ -42,8 +40,8 @@ public int length() { public IntStack push(int value) { if (top >= values.length) { // must grow? - final int[] temp = new int[values.length*2]; - System.arraycopy(values,0,temp,0,values.length); + final int[] temp = new int[values.length * 2]; + System.arraycopy(values, 0, temp, 0, values.length); values = temp; } values[top++] = value; @@ -151,6 +149,7 @@ public String toString() { /** * Copy items from the stack into a buffer, and return number of items copied. + * * @param limit * @param start * @param buffer @@ -158,7 +157,7 @@ public String toString() { * @return */ public int extractInto(int start, int limit, int[] buffer, int bufferStart) { - final int len = limit-start; + final int len = limit - start; System.arraycopy(values, first + start, buffer, bufferStart, len); return len; } diff --git a/unicodetools/src/main/java/org/unicode/text/utility/LengthFirstComparator.java b/unicodetools/src/main/java/org/unicode/text/utility/LengthFirstComparator.java index 94d2ce508..3fa77f18f 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/LengthFirstComparator.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/LengthFirstComparator.java @@ -1,14 +1,12 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. 
* + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/LengthFirstComparator.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/LengthFirstComparator.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; import java.util.Comparator; @@ -26,4 +24,4 @@ public int compare(Object a, Object b) { } return as.compareTo(bs); } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/Main.java b/unicodetools/src/main/java/org/unicode/text/utility/Main.java index 55bca15df..adaed5118 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/Main.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/Main.java @@ -1,50 +1,56 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/Main.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/Main.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; - public class Main { - static public class CollatorStyle extends EnumBase { + public static class CollatorStyle extends EnumBase { public static CollatorStyle - ZEROED = (CollatorStyle) makeNext(new CollatorStyle(), "ZEROED"), - SHIFTED = (CollatorStyle) makeNext(new CollatorStyle(), "SHIFTED"), - NON_IGNORABLE = (CollatorStyle) makeNext(new CollatorStyle(), "NON_IGNORABLE"); - public CollatorStyle next() { return (CollatorStyle) internalNext(); } + ZEROED = (CollatorStyle) makeNext(new CollatorStyle(), "ZEROED"), + SHIFTED = (CollatorStyle) makeNext(new CollatorStyle(), "SHIFTED"), + NON_IGNORABLE = (CollatorStyle) makeNext(new CollatorStyle(), "NON_IGNORABLE"); + + public CollatorStyle next() { + return (CollatorStyle) internalNext(); + } + private CollatorStyle() {} } - static public class NormalizerType extends EnumBase { - public static NormalizerType - NFC = (NormalizerType) makeNext(new NormalizerType(), "NFC"), - NFD = (NormalizerType) makeNext(new NormalizerType(), "NFD"), - NFKC = (NormalizerType) makeNext(new NormalizerType(), "NFKC"), - NFKD = (NormalizerType) makeNext(new NormalizerType(), "NFKD"); - public NormalizerType next() { return (NormalizerType) internalNext(); } + public static class NormalizerType extends EnumBase { + public static NormalizerType NFC = (NormalizerType) makeNext(new NormalizerType(), "NFC"), + NFD = (NormalizerType) makeNext(new NormalizerType(), "NFD"), + NFKC = (NormalizerType) makeNext(new NormalizerType(), "NFKC"), + NFKD = (NormalizerType) makeNext(new NormalizerType(), "NFKD"); + + public NormalizerType next() { + return (NormalizerType) internalNext(); + } + private NormalizerType() {} } - static public class Length extends EnumBase { - public static Length - SHORT = (Length) makeNext(new Length(), "SHORT"), - NORMAL = (Length) makeNext(new Length(), "NORMAL"), - LONG = (Length) makeNext(new Length(), 
"LONG"); - public Length next() { return (Length) internalNext(); } + public static class Length extends EnumBase { + public static Length SHORT = (Length) makeNext(new Length(), "SHORT"), + NORMAL = (Length) makeNext(new Length(), "NORMAL"), + LONG = (Length) makeNext(new Length(), "LONG"); + + public Length next() { + return (Length) internalNext(); + } + private Length() {} } - static public void main (String[] args) { - for (final String arg : args) { - } + public static void main(String[] args) { + for (final String arg : args) {} if (true) { return; } @@ -58,5 +64,4 @@ static public void main (String[] args) { final NormalizerType foo = new NormalizerType(); } - -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/OldEquivalenceClass.java b/unicodetools/src/main/java/org/unicode/text/utility/OldEquivalenceClass.java index 6ee9a15e0..6acdfecb6 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/OldEquivalenceClass.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/OldEquivalenceClass.java @@ -1,17 +1,14 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/OldEquivalenceClass.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/OldEquivalenceClass.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; - import java.util.Collection; import java.util.HashMap; import java.util.HashSet; @@ -24,22 +21,23 @@ public class OldEquivalenceClass { static final boolean DEBUG = false; /** - * Takes a many:many relation between source and value. - * Produces equivalence class. - * Two sources are in the same equivalence class any time they share the same value. + * Takes a many:many relation between source and value. Produces equivalence class. Two sources + * are in the same equivalence class any time they share the same value. */ // associated with each value, we keep a set of sources. // whenever we add a pair, we see if any sets collide. - // associated with each set of sources, we keep a representative Whenever we add to the set, if we + // associated with each set of sources, we keep a representative Whenever we add to the set, if + // we // Map sourceToEquiv = new TreeMap(); + Map valueToRepresentativeSource = new HashMap(); Map forcedMerge = new HashMap(); /** * @return true if made a difference */ - String itemSeparator; + int places; boolean hex; @@ -56,16 +54,15 @@ public OldEquivalenceClass(String itemSeparator, int places, boolean hex) { public boolean add(Object source, Object value) { boolean result = false; Object repSource = valueToRepresentativeSource.get(value); - Set equivSet = (Set)sourceToEquiv.get(source); - Set fm = (Set)forcedMerge.get(source); + Set equivSet = (Set) sourceToEquiv.get(source); + Set fm = (Set) forcedMerge.get(source); if (fm == null) { fm = new TreeSet(); forcedMerge.put(source, fm); } if (DEBUG) { - System.out.println("+Source " + source - + ", value: " + value); + System.out.println("+Source " + source + ", value: " + value); } if (repSource == null && equivSet == null) { equivSet = new TreeSet(); @@ -87,8 +84,15 @@ public boolean add(Object source, Object value) { result = true; if (DEBUG) { - 
System.out.println("Merging (" + repSource + ") " + toString(repEquiv) - + " + (" + source + ") " + toString(equivSet)); + System.out.println( + "Merging (" + + repSource + + ") " + + toString(repEquiv) + + " + (" + + source + + ") " + + toString(equivSet)); } // merge!! // put all items from equivSet into repEquiv @@ -98,7 +102,7 @@ public boolean add(Object source, Object value) { Iterator it = repEquiv.iterator(); while (it.hasNext()) { final Object n = it.next(); - fm = (Set)forcedMerge.get(n); + fm = (Set) forcedMerge.get(n); fm.add(value); } @@ -116,25 +120,24 @@ public boolean add(Object source, Object value) { it = toReplace.iterator(); while (it.hasNext()) { final Object otherSource = it.next(); - sourceToEquiv.put(otherSource,repEquiv); + sourceToEquiv.put(otherSource, repEquiv); } equivSet = repEquiv; // for debugging } } if (DEBUG) { - System.out.println("--- repSource: " + repSource - + ", equivSet: " + equivSet); + System.out.println("--- repSource: " + repSource + ", equivSet: " + equivSet); } return result; } @Override - public String toString () { + public String toString() { final StringBuffer result = new StringBuffer(); // make a set to skip duplicates final Iterator it = new HashSet(sourceToEquiv.values()).iterator(); while (it.hasNext()) { - toString((Set)it.next(), result, forcedMerge); + toString((Set) it.next(), result, forcedMerge); } return result.toString(); } @@ -146,17 +149,19 @@ private class MyIterator implements Iterator { public boolean hasNext() { return it.hasNext(); } + @Override public Object next() { return sourceToEquiv.get(it.next()); } + @Override public void remove() { throw new IllegalArgumentException("can't remove"); } } - public Iterator getSetIterator () { + public Iterator getSetIterator() { return new MyIterator(); } @@ -166,7 +171,7 @@ private String toString(Object s) { } if (s instanceof Collection) { final StringBuffer sb = new StringBuffer(); - toString((Collection)s, sb, null); + toString((Collection) s, sb, 
null); return sb.toString(); } if (hex && s instanceof Number) { @@ -199,5 +204,4 @@ private void toString(Collection s, StringBuffer sb, Map valueToRep) { } sb.append('}'); } - -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/Pair.java b/unicodetools/src/main/java/org/unicode/text/utility/Pair.java index 9e14f3b78..5b5ad1ea3 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/Pair.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/Pair.java @@ -1,21 +1,19 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/Pair.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/Pair.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; public final class Pair implements java.lang.Comparable, Cloneable { public Comparable first, second; - public Pair (Comparable first, Comparable second) { + public Pair(Comparable first, Comparable second) { this.first = first; this.second = second; } @@ -28,7 +26,7 @@ public int hashCode() { @Override public boolean equals(Object other) { try { - final Pair that = (Pair)other; + final Pair that = (Pair) other; return first.equals(that.first) && second.equals(that.second); } catch (final Exception e) { return false; @@ -37,7 +35,7 @@ public boolean equals(Object other) { @Override public int compareTo(Object other) { - final Pair that = (Pair)other; + final Pair that = (Pair) other; final int trial = first.compareTo(that.first); if (trial != 0) { return trial; @@ -56,7 +54,10 @@ public Object clone() { @Override public String toString() { - return '(' + (first == null ? "null" : first.toString()) - + ',' + (second == null ? "null" : second.toString()) + ')'; + return '(' + + (first == null ? "null" : first.toString()) + + ',' + + (second == null ? "null" : second.toString()) + + ')'; } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/PoorMansEnum.java b/unicodetools/src/main/java/org/unicode/text/utility/PoorMansEnum.java index 67726b9fa..df8156856 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/PoorMansEnum.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/PoorMansEnum.java @@ -1,12 +1,11 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. 
* - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/PoorMansEnum.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/PoorMansEnum.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ /* Goal for enum is: @@ -39,11 +38,9 @@ public String toString() { // for subclassers - protected PoorMansEnum() { - } + protected PoorMansEnum() {} - /** Utility for subclasses - */ + /** Utility for subclasses */ protected static class EnumStore { private final List int2Id = new ArrayList(); private final Map string2Id = new HashMap(); @@ -104,4 +101,4 @@ public PoorMansEnum get(String name) { return (PoorMansEnum) string2Id.get(name); } } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/SampleEnum.java b/unicodetools/src/main/java/org/unicode/text/utility/SampleEnum.java index 8865e09a7..f8e69f349 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/SampleEnum.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/SampleEnum.java @@ -15,39 +15,53 @@ import java.util.Set; import java.util.TreeSet; -/** Sample Poor-Man's Enum. - * To use as a template, copy and +/** + * Sample Poor-Man's Enum. To use as a template, copy and + * *

    - *
  • replace all instances of "SampleEnum" by your enum's name
  • - *
  • change the enum values to your values
  • - *
  • set any aliases (or remove that section)
  • + *
  • replace all instances of "SampleEnum" by your enum's name + *
  • change the enum values to your values + *
  • set any aliases (or remove that section) *
*/ public final class SampleEnum extends PoorMansEnum { private static PoorMansEnum.EnumStore store = new PoorMansEnum.EnumStore(); - public static final SampleEnum - ALPHA = add("The"), - BETA = add("Quick"), - GAMMA = add("Brown"), - - FIRST = ALPHA; + public static final SampleEnum ALPHA = add("The"), + BETA = add("Quick"), + GAMMA = add("Brown"), + FIRST = ALPHA; static { store.addAlias(ALPHA, "A"); } /* Boilerplate */ - public SampleEnum next() { return (SampleEnum) next; } - public void getAliases(Collection output) { store.getAliases(this, output); } - public static SampleEnum get(String s) { return (SampleEnum) store.get(s); } - public static SampleEnum get(int v) { return (SampleEnum) store.get(v); } - public static int getMax() { return store.getMax(); } + public SampleEnum next() { + return (SampleEnum) next; + } - private SampleEnum() {} - private static SampleEnum add(String name) { return (SampleEnum) store.add(new SampleEnum(), name);} + public void getAliases(Collection output) { + store.getAliases(this, output); + } + public static SampleEnum get(String s) { + return (SampleEnum) store.get(s); + } + public static SampleEnum get(int v) { + return (SampleEnum) store.get(v); + } + + public static int getMax() { + return store.getMax(); + } + + private SampleEnum() {} + + private static SampleEnum add(String name) { + return (SampleEnum) store.add(new SampleEnum(), name); + } /* just for testing */ public static void test() { @@ -61,7 +75,7 @@ public static void test() { // String to int final Iterator it = s.iterator(); while (it.hasNext()) { - final String n = (String)it.next(); + final String n = (String) it.next(); System.out.println(n + ", " + SampleEnum.get(n).toInt()); } @@ -72,6 +86,4 @@ public static void test() { System.out.println(current.toInt() + ", " + current + ", " + s); } } - - -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/Settings.java 
b/unicodetools/src/main/java/org/unicode/text/utility/Settings.java index 7ebbd7960..155669b1e 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/Settings.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/Settings.java @@ -1,31 +1,29 @@ package org.unicode.text.utility; +import com.ibm.icu.util.VersionInfo; import java.io.File; import java.io.FileNotFoundException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Locale; - import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.CldrUtility; -import com.ibm.icu.util.VersionInfo; - public class Settings { // TODO Many of these settings are crufty and need revision. // https://unicode-org.atlassian.net/browse/CLDR-14335 "Rationalize CLDR constants" // is for fixing these and CLDR. - // TODO: Why do we sometimes use CldrUtility.getPath() which normalizes paths via java.nio.file.Paths + // TODO: Why do we sometimes use CldrUtility.getPath() which normalizes paths via + // java.nio.file.Paths // and sometimes Utility.fixFileName() which normalizes paths via java.io.File? // Are they equivalent for our purposes? - /** - * Used for the default version. - */ + /** Used for the default version. */ public static final String latestVersion = "15.0.0"; + public static final String lastVersion = "14.0.0"; // last released version private static final String TRIMMED_LATEST_VERSION = trimVersion(latestVersion); @@ -40,9 +38,8 @@ public class Settings { "skip".equalsIgnoreCase(CldrUtility.getProperty("copyright", "skip")); /** - * Removes one or more trailing ".0" substrings, and/or one trailing "-Update". - * For quick comparison of version strings and folder names - * without building VersionInfo objects. + * Removes one or more trailing ".0" substrings, and/or one trailing "-Update". For quick + * comparison of version strings and folder names without building VersionInfo objects. 
*/ private static final String trimVersion(String version) { int length = trimmedVersionLength(version); @@ -54,7 +51,9 @@ private static final int trimmedVersionLength(String version) { if (version.endsWith("-Update")) { length -= "-Update".length(); } - while (length >= 2 && version.charAt(length - 2) == '.' && version.charAt(length - 1) == '0') { + while (length >= 2 + && version.charAt(length - 2) == '.' + && version.charAt(length - 1) == '0') { length -= 2; } return length; @@ -63,8 +62,8 @@ private static final int trimmedVersionLength(String version) { /** Ignores trailing ".0" and "-Update". */ private static final boolean isLatestVersion(String version) { int length = trimmedVersionLength(version); - return length == TRIMMED_LATEST_VERSION.length() && - TRIMMED_LATEST_VERSION.regionMatches(0, version, 0, length); + return length == TRIMMED_LATEST_VERSION.length() + && TRIMMED_LATEST_VERSION.regionMatches(0, version, 0, length); } private static final Path getPath(Path parentPath, String relativeDir, String version) { @@ -123,50 +122,50 @@ private static final String getRequiredPathAndFix(String key) { public static final class CLDR { private static final String CLDR_REPO_DIR = CLDRPaths.BASE_DIRECTORY; - public static final String SVN_DIRECTORY = CldrUtility.getPath( - CldrUtility.getProperty("SVN_DIR", CLDR_REPO_DIR + "/../")); - public static final String AUX_DIRECTORY = CldrUtility.getPath( - CldrUtility.getProperty("CLDR_TMP_DIR", - CldrUtility.getPath(SVN_DIRECTORY, "cldr-aux/"))); - public static final String UCD_DATA_DIRECTORY = CldrUtility.getPath( - SVN_DIRECTORY + "unicodetools/unicodetools/data/"); - public static final String BASE_DIRECTORY = Utility.fixFileName( - CldrUtility.getProperty("BASE_DIRECTORY", SVN_DIRECTORY + "../")) + "/"; + public static final String SVN_DIRECTORY = + CldrUtility.getPath(CldrUtility.getProperty("SVN_DIR", CLDR_REPO_DIR + "/../")); + public static final String AUX_DIRECTORY = + CldrUtility.getPath( + 
CldrUtility.getProperty( + "CLDR_TMP_DIR", CldrUtility.getPath(SVN_DIRECTORY, "cldr-aux/"))); + public static final String UCD_DATA_DIRECTORY = + CldrUtility.getPath(SVN_DIRECTORY + "unicodetools/unicodetools/data/"); + public static final String BASE_DIRECTORY = + Utility.fixFileName( + CldrUtility.getProperty("BASE_DIRECTORY", SVN_DIRECTORY + "../")) + + "/"; } public static final class UnicodeTools { /** - * The root of the unicodetools repo. - * Contains the UnicodeJsps and unicodetools folders etc. + * The root of the unicodetools repo. Contains the UnicodeJsps and unicodetools folders etc. */ public static final String UNICODETOOLS_REPO_DIR = getRequiredPathAndFix("UNICODETOOLS_REPO_DIR"); - // TODO: Try to make this private; see https://github.com/unicode-org/unicodetools/issues/159 + // TODO: Try to make this private; see + // https://github.com/unicode-org/unicodetools/issues/159 // Call sites should use more specific paths. public static final String UNICODETOOLS_DIR = UNICODETOOLS_REPO_DIR + "unicodetools/"; - /** - * Use this for files such as org/unicode/Whatever.java - */ + /** Use this for files such as org/unicode/Whatever.java */ public static final String UNICODETOOLS_JAVA_DIR = UNICODETOOLS_DIR + "src/main/java/"; - /** - * Use this for package-relative data, such as org/unicode/SomeData.txt - */ + /** Use this for package-relative data, such as org/unicode/SomeData.txt */ public static final String UNICODETOOLS_RSRC_DIR = UNICODETOOLS_DIR + "src/main/resources/"; + public static final String UNICODEJSPS_DIR = UNICODETOOLS_REPO_DIR + "UnicodeJsps/"; public static final String DATA_DIR = UNICODETOOLS_DIR + "data/"; public static final Path DATA_PATH = Paths.get(DATA_DIR); public static final String UCD_DIR = DATA_DIR + "ucd/"; private static final Path UCD_PATH = DATA_PATH.resolve("ucd"); - // TODO: IDN_DIR is used, but there is no .../data/IDN/ folder. Should this be .../data/idna/ ? 
+ // TODO: IDN_DIR is used, but there is no .../data/IDN/ folder. Should this be + // .../data/idna/ ? public static final String IDN_DIR = DATA_DIR + "IDN/"; // TODO: DICT_DIR is used, but there is no .../data/dict/ folder. ?? public static final String DICT_DIR = DATA_DIR + "dict/"; /** - * Returns a path to the Unicode Tools source data plus the relativeDir and version. - * Falls back from latestVersion to "dev" as needed. - * Appends "-Update" to a "ucd" version as needed. - * Returns null if the folder does not exist. + * Returns a path to the Unicode Tools source data plus the relativeDir and version. Falls + * back from latestVersion to "dev" as needed. Appends "-Update" to a "ucd" version as + * needed. Returns null if the folder does not exist. */ public static final Path getDataPath(String relativeDir, String version) { return getPath(DATA_PATH, relativeDir, version); @@ -174,39 +173,32 @@ public static final Path getDataPath(String relativeDir, String version) { /** * Returns a path string to the Unicode Tools source data plus the relativeDir and version. - * Falls back from latestVersion to "dev" as needed. - * Appends "-Update" to a "ucd" version as needed. - * Returns null if the folder does not exist. + * Falls back from latestVersion to "dev" as needed. Appends "-Update" to a "ucd" version as + * needed. Returns null if the folder does not exist. */ public static final String getDataPathString(String relativeDir, String version) { return getPathString(DATA_PATH, relativeDir, version); } /** - * Returns a path to the Unicode Tools source data - * plus the relativeDir and latest version. - * Falls back from latestVersion to "dev" as needed. - * Appends "-Update" to a "ucd" version as needed. - * Returns null if the folder does not exist. + * Returns a path to the Unicode Tools source data plus the relativeDir and latest version. + * Falls back from latestVersion to "dev" as needed. Appends "-Update" to a "ucd" version as + * needed. 
Returns null if the folder does not exist. */ public static final Path getDataPathForLatestVersion(String relativeDir) { return getPath(DATA_PATH, relativeDir, latestVersion); } /** - * Returns a path string to the Unicode Tools source data - * plus the relativeDir and latest version. - * Falls back from latestVersion to "dev" as needed. - * Appends "-Update" to a "ucd" version as needed. - * Returns null if the folder does not exist. + * Returns a path string to the Unicode Tools source data plus the relativeDir and latest + * version. Falls back from latestVersion to "dev" as needed. Appends "-Update" to a "ucd" + * version as needed. Returns null if the folder does not exist. */ public static final String getDataPathStringForLatestVersion(String relativeDir) { return getPathString(DATA_PATH, relativeDir, latestVersion); } - /** - * Constants representing data subdirectories - */ + /** Constants representing data subdirectories */ public enum DataDir { SECURITY, UCD, @@ -215,14 +207,16 @@ public enum DataDir { /** * This dir as a Path + * * @return */ public Path asPath() { - return DATA_PATH.resolve(name().toLowerCase(Locale.ROOT)); + return DATA_PATH.resolve(name().toLowerCase(Locale.ROOT)); } /** * This dir as a Path to the version subdir + * * @param forVersion * @return */ @@ -233,18 +227,16 @@ public Path asPath(VersionInfo forVersion) { /** * Map a version number to a string. + * * @param version * @return */ public String versionToString(VersionInfo version) { StringBuilder sb = new StringBuilder(); - sb.append(version.getMajor()) - .append(".") - .append(version.getMinor()); + sb.append(version.getMajor()).append(".").append(version.getMinor()); if (this != EMOJI) { // 13.1, 14.0 - sb.append(".") - .append(version.getMilli()); + sb.append(".").append(version.getMilli()); } // else: 14.0.0 return sb.toString(); } @@ -258,38 +250,34 @@ public static final class Images { public static final class Output { /** The root of where we write output files. 
Most go into a "Generated" sub-folder. */ - public static final String GEN_DIR = - getRequiredPathAndFix("UNICODETOOLS_GEN_DIR"); + public static final String GEN_DIR = getRequiredPathAndFix("UNICODETOOLS_GEN_DIR"); + public static final String BIN_DIR = GEN_DIR + "BIN/"; public static final String GEN_UCD_DIR = GEN_DIR + "UCD/"; public static final String GEN_UCA_DIR = GEN_DIR + "UCA/"; - /** - * Make sure the output dirs exist - */ + /** Make sure the output dirs exist */ public static void ensureOutputDirs() throws FileNotFoundException { - if (!(new File(GEN_DIR)).isDirectory()) { - throw new FileNotFoundException("Not a directory: UNICODETOOLS_GEN_DIR=" + GEN_DIR); - } - ensureOutputDir(GEN_DIR); - ensureOutputDir(GEN_UCD_DIR); - ensureOutputDir(GEN_UCA_DIR); - ensureOutputDir(BIN_DIR); + if (!(new File(GEN_DIR)).isDirectory()) { + throw new FileNotFoundException("Not a directory: UNICODETOOLS_GEN_DIR=" + GEN_DIR); + } + ensureOutputDir(GEN_DIR); + ensureOutputDir(GEN_UCD_DIR); + ensureOutputDir(GEN_UCA_DIR); + ensureOutputDir(BIN_DIR); } + public static void ensureOutputDir(String dir) { - if(new File(dir).mkdirs()) { - System.err.println("# mkdir " + dir); - } + if (new File(dir).mkdirs()) { + System.err.println("# mkdir " + dir); + } } } - public static final String SRC_DIR = Utility.fixFileName(UnicodeTools.UNICODETOOLS_RSRC_DIR+"org/unicode/text") + "/"; - /** - * Used for data files - */ + public static final String SRC_DIR = + Utility.fixFileName(UnicodeTools.UNICODETOOLS_RSRC_DIR + "org/unicode/text") + "/"; + /** Used for data files */ public static final String SRC_UCA_DIR = SRC_DIR + "UCA/"; - /** - * Used for data files - */ + /** Used for data files */ public static final String SRC_UCD_DIR = SRC_DIR + "UCD/"; } diff --git a/unicodetools/src/main/java/org/unicode/text/utility/TempPrintWriter.java b/unicodetools/src/main/java/org/unicode/text/utility/TempPrintWriter.java index 171f93533..57fdfaf57 100644 --- 
a/unicodetools/src/main/java/org/unicode/text/utility/TempPrintWriter.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/TempPrintWriter.java @@ -6,21 +6,22 @@ import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; - import org.unicode.cldr.draft.FileUtilities; public class TempPrintWriter extends PrintWriter { static { } + final String filename; - - public TempPrintWriter (String dir, String filename, String encoding) throws IOException { + + public TempPrintWriter(String dir, String filename, String encoding) throws IOException { super(getBuffer(dir, filename, encoding)); this.filename = filename; throw new IllegalArgumentException("USE org.unicode.tools.emoji."); } - private static BufferedWriter getBuffer(String dirString, String filename, String encoding) throws IOException { + private static BufferedWriter getBuffer(String dirString, String filename, String encoding) + throws IOException { File file = File.createTempFile(filename, null, new File(dirString)); if (FileUtilities.SHOW_FILES) { System.out.println("Creating File: " + file.getCanonicalPath()); @@ -31,10 +32,7 @@ private static BufferedWriter getBuffer(String dirString, String filename, Strin parent.mkdirs(); } return new BufferedWriter( - new OutputStreamWriter( - new FileOutputStream(file), - encoding), - 4*1024); + new OutputStreamWriter(new FileOutputStream(file), encoding), 4 * 1024); } @Override diff --git a/unicodetools/src/main/java/org/unicode/text/utility/TestUtility.java b/unicodetools/src/main/java/org/unicode/text/utility/TestUtility.java index bd353465c..25923bbbc 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/TestUtility.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/TestUtility.java @@ -1,16 +1,19 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. 
* - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/TestUtility.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/TestUtility.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.dev.util.UnicodeMapIterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; @@ -30,57 +33,50 @@ import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; - import org.unicode.cldr.util.Counter; import org.unicode.jsp.ICUPropertyFactory; -import org.unicode.props.UnicodeProperty; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; +import org.unicode.props.UnicodeProperty; import org.unicode.text.UCD.Default; import org.unicode.unused.DataInputCompressor; import org.unicode.unused.DataOutputCompressor; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.dev.util.UnicodeMapIterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - public class TestUtility { /* - static public class MyEnum extends EnumBase { - public static MyEnum - ZEROED = (MyEnum) makeNext(myEnum.getClass()), - SHIFTED = (MyEnum) makeNext(), - NON_IGNORABLE = (MyEnum) makeNext(), - - FIRST_ENUM = ZEROED, - LAST_ENUM = NON_IGNORABLE; - public MyEnum next(int value) { - return (MyEnum) internalNext(value); - } - protected MyEnum() {} - } - */ + static public class MyEnum extends EnumBase { + public static MyEnum + ZEROED = (MyEnum) makeNext(myEnum.getClass()), + SHIFTED = (MyEnum) makeNext(), + NON_IGNORABLE = (MyEnum) makeNext(), + + FIRST_ENUM = ZEROED, + LAST_ENUM = NON_IGNORABLE; + public MyEnum next(int value) { + return (MyEnum) internalNext(value); + } + protected MyEnum() {} + } + */ static final boolean USE_FILE = true; static final boolean DEBUG = false; - static public void main(String[] args) 
throws Exception { + public static void main(String[] args) throws Exception { tryFileUnicodeProperty(); check(); final int iterations = 1; - //testStreamCompressor(); + // testStreamCompressor(); UnicodeMap umap = new UnicodeMap(); - umap.put(0,"abcdefg"); + umap.put(0, "abcdefg"); if (false) { for (int i = 0; i < 256; ++i) { - umap.put(i, String.valueOf(i&0xF0)); + umap.put(i, String.valueOf(i & 0xF0)); } } int total = testUnicodeMapSerialization(1, iterations, "dummy", umap); - //if (true) return; - //UnicodeLabel ul; + // if (true) return; + // UnicodeLabel ul; final ICUPropertyFactory p = ICUPropertyFactory.make(); total = 0; @@ -88,14 +84,16 @@ static public void main(String[] args) throws Exception { final Matcher nameMatch = Pattern.compile("Name").matcher(""); final UnicodeProperty gc = p.getProperty("General_Category"); - final UnicodeSet checkSet = gc.getSet("Cn").addAll(gc.getSet("Co")).addAll(gc.getSet("Cs")).complement(); + final UnicodeSet checkSet = + gc.getSet("Cn").addAll(gc.getSet("Co")).addAll(gc.getSet("Cs")).complement(); final UnicodeSetIterator checkSetIterator = new UnicodeSetIterator(checkSet); final UnicodeProperty hangulSyllableType = p.getProperty("Hangul_Syllable_Type"); - final UnicodeSet hangulSyllable = hangulSyllableType.getSet("LVT_Syllable").addAll(hangulSyllableType.getSet("LV_Syllable")); + final UnicodeSet hangulSyllable = + hangulSyllableType + .getSet("LVT_Syllable") + .addAll(hangulSyllableType.getSet("LV_Syllable")); - - for (final Iterator pnames = p.getAvailableNames().iterator(); pnames - .hasNext();) { + for (final Iterator pnames = p.getAvailableNames().iterator(); pnames.hasNext(); ) { final String pname = (String) pnames.next(); if (!nameMatch.reset(pname).matches()) { continue; @@ -118,7 +116,7 @@ static public void main(String[] args) throws Exception { continue; } umap.put(i, value); - //System.out.println("Adding " + Utility.hex(i) + ", " + Utility.hex(value)); + // System.out.println("Adding " + Utility.hex(i) 
+ ", " + Utility.hex(value)); } } else { final UnicodeProperty sampleProp = p.getProperty(pname); @@ -142,10 +140,9 @@ static public void main(String[] args) throws Exception { System.out.println("Done"); } - static void check() throws IOException, ClassNotFoundException { final UnicodeMap m = new UnicodeMap(); - m.put(1,"abc"); + m.put(1, "abc"); final ByteArrayOutputStream out = new ByteArrayOutputStream(); final ObjectOutputStream oos = new ObjectOutputStream(out); oos.writeBoolean(true); @@ -165,9 +162,7 @@ static void check() throws IOException, ClassNotFoundException { ois.close(); } - /** - * - */ + /** */ private static boolean equals(int i, String value) { final int len = value.length(); if (len < 0 || len > 2) { @@ -179,13 +174,12 @@ private static boolean equals(int i, String value) { if (i <= 0xFFFF) { return false; } - return i == UTF16.charAt(value,0); + return i == UTF16.charAt(value, 0); } - /** - * - */ - private static void testHanProp(int iterations, int total, String pname, String type) throws IOException, ClassNotFoundException { + /** */ + private static void testHanProp(int iterations, int total, String pname, String type) + throws IOException, ClassNotFoundException { System.out.println(); IndexUnicodeProperties iup = IndexUnicodeProperties.make(Default.ucd().getVersionInfo()); final UnicodeMap umap = iup.load(UcdProperty.forString(pname)); @@ -199,9 +193,10 @@ private static void testHanProp(int iterations, int total, String pname, String static String outdir = outdircore + "4.1.0/"; /** * @param pname - * */ - private static int testUnicodeMapSerialization(int iterations, int total, String pname, UnicodeMap umap) throws IOException, ClassNotFoundException { + private static int testUnicodeMapSerialization( + int iterations, int total, String pname, UnicodeMap umap) + throws IOException, ClassNotFoundException { System.out.print("\tValue Count:\t" + umap.getAvailableValues().size()); final String filename = outdir + pname + ".bin"; @@ 
-214,28 +209,27 @@ private static int testUnicodeMapSerialization(int iterations, int total, String } out = new GZIPOutputStream(out); final ObjectOutputStream oos = new ObjectOutputStream(out); - //Random rand = new Random(); + // Random rand = new Random(); /* if (false) { - oos.writeObject(umap); - oos.close(); - buffer = baout.toByteArray(); - in = new ByteArrayInputStream(buffer, 0, baout.size()); - ois = new ObjectInputStream(in); - reverseMap = (UnicodeMap) ois.readObject(); - } - */ + oos.writeObject(umap); + oos.close(); + buffer = baout.toByteArray(); + in = new ByteArrayInputStream(buffer, 0, baout.size()); + ois = new ObjectInputStream(in); + reverseMap = (UnicodeMap) ois.readObject(); + } + */ // UnicodeMap.StreamCompressor sc = new UnicodeMap.StreamCompressor(); // int test = (int)Math.abs(rand.nextGaussian()*100000); // System.out.print(Integer.toString(test, 16).toUpperCase()); // sc.writeInt(out, test); // out.close(); - //oos.writeBoolean(true); - //oos.writeUTF("abcdefg"); + // oos.writeBoolean(true); + // oos.writeUTF("abcdefg"); oos.writeObject(umap); oos.close(); - long size; byte[] buffer; if (USE_FILE) { @@ -247,8 +241,7 @@ private static int testUnicodeMapSerialization(int iterations, int total, String System.out.println(showBuffer(buffer, size)); } } - System.out.print("\t"+"Size:\t" + size); - + System.out.print("\t" + "Size:\t" + size); // only measure read time UnicodeMap reverseMap = null; @@ -258,15 +251,15 @@ private static int testUnicodeMapSerialization(int iterations, int total, String if (USE_FILE) { in = new FileInputStream(filename); } else { - in = new ByteArrayInputStream(buffer, 0, (int)size); + in = new ByteArrayInputStream(buffer, 0, (int) size); } in = new GZIPInputStream(in); // int x = sc.readInt(in); // if (x != test) System.out.println("Failure"); // System.out.println("\t=> " + Integer.toString(x, 16).toUpperCase()); final ObjectInputStream ois = new ObjectInputStream(in); - //System.out.println(ois.readBoolean()); - 
//System.out.println(ois.readUTF()); + // System.out.println(ois.readBoolean()); + // System.out.println(ois.readUTF()); try { reverseMap = (UnicodeMap) ois.readObject(); @@ -287,14 +280,17 @@ private static int testUnicodeMapSerialization(int iterations, int total, String if (UnicodeMap.areEqual(main, rev)) { continue; } - System.out.println(Utility.hex(i) + "\t'" + main + "',\t'" - + rev + "'"); + System.out.println(Utility.hex(i) + "\t'" + main + "',\t'" + rev + "'"); } } - //out.toByteArray(); + // out.toByteArray(); total += size; - System.out.print("\tTime:\t" + (end - start) / (iterations * 1.0) - + "\tmsecs (raw:\t" + ((end - start) / 1000.0) + "\tsecs)"); + System.out.print( + "\tTime:\t" + + (end - start) / (iterations * 1.0) + + "\tmsecs (raw:\t" + + ((end - start) / 1000.0) + + "\tsecs)"); /* with Vanilla Serialization * Size: 24131 * Time: 1.9488 msecs (raw: 9.744 secs) @@ -314,30 +310,33 @@ private static int testUnicodeMapSerialization(int iterations, int total, String return total; } - /** - * - */ + /** */ private static String showBuffer(byte[] buffer, long size) { final StringBuffer result = new StringBuffer(); for (int j = 0; j < size; ++j) { if (j != 0) { result.append(' '); } - result.append(Utility.hex(buffer[j]&0xFF,2)); + result.append(Utility.hex(buffer[j] & 0xFF, 2)); } return result.toString(); } - /** - * - */ + /** */ private static void testStreamCompressor() throws IOException { final Object[] tests = { - UTF16.valueOf(0x10FFFF),"\u1234", "abc", - new Long(-3), new Long(12345), - new Short(Short.MAX_VALUE), new Short(Short.MIN_VALUE), - new Integer(Integer.MAX_VALUE), new Integer(Integer.MIN_VALUE), - new Long(Long.MIN_VALUE), new Long(Long.MAX_VALUE)}; + UTF16.valueOf(0x10FFFF), + "\u1234", + "abc", + new Long(-3), + new Long(12345), + new Short(Short.MAX_VALUE), + new Short(Short.MIN_VALUE), + new Integer(Integer.MAX_VALUE), + new Integer(Integer.MIN_VALUE), + new Long(Long.MIN_VALUE), + new Long(Long.MAX_VALUE) + }; for (int i = 
0; i < tests.length; ++i) { final Object source = tests[i]; @@ -349,9 +348,9 @@ private static void testStreamCompressor() throws IOException { final DataOutputCompressor sc = new DataOutputCompressor(out2); long y = 0; if (source instanceof String) { - sc.writeUTF((String)source); + sc.writeUTF((String) source); } else { - y = ((Number)source).longValue(); + y = ((Number) source).longValue(); sc.writeLong(y); } out2.close(); @@ -367,35 +366,40 @@ private static void testStreamCompressor() throws IOException { long x = 0; if (isString) { result = isc.readUTF(); - System.out.println(i + "\t" + source - + "\t" + result - + (source.equals(result) ? "\tSuccess" : "\tBitter Failure")); + System.out.println( + i + + "\t" + + source + + "\t" + + result + + (source.equals(result) ? "\tSuccess" : "\tBitter Failure")); } else { x = isc.readLong(); result = new Long(x); - System.out.println(i + "\t" + y - + x - + "\t" + Utility.hex(y) - + "\t" + Utility.hex(x) - + (x == y ? "\tSuccess" : "\tBitter Failure")); + System.out.println( + i + + "\t" + + y + + x + + "\t" + + Utility.hex(y) + + "\t" + + Utility.hex(x) + + (x == y ? 
"\tSuccess" : "\tBitter Failure")); } in2.close(); } } - /** - * - */ + /** */ private static void showBytes(byte[] buffer, int len) { for (int i = 0; i < len; ++i) { - System.out.print(Utility.hex(buffer[i]&0xFF,2) + " "); + System.out.print(Utility.hex(buffer[i] & 0xFF, 2) + " "); } } - /** - * - */ + /** */ private static UnicodeMap fixNameMap(BreakIterator bk, UnicodeMap umap) { final UnicodeMap temp = new UnicodeMap(); final Counter counter = new Counter(); @@ -431,23 +435,22 @@ private static UnicodeMap fixNameMap(BreakIterator bk, UnicodeMap umap) { for (final String key : counter) { final long c = counter.getCount(key); running += c; - System.out.println(count++ + "\t" + c + "\t" + running - + "\t" + key); + System.out.println(count++ + "\t" + c + "\t" + running + "\t" + key); } - for (final UnicodeMapIterator it2 = new UnicodeMapIterator( - temp); it2.nextRange();) { - System.out.println(Utility.hex(it2.codepoint) + "\t" - + Utility.hex(it2.codepointEnd) + "\t" - + it2.value); + for (final UnicodeMapIterator it2 = new UnicodeMapIterator(temp); it2.nextRange(); ) { + System.out.println( + Utility.hex(it2.codepoint) + + "\t" + + Utility.hex(it2.codepointEnd) + + "\t" + + it2.value); } } umap = temp; return umap; } - /** - * - */ + /** */ private static void tryFileUnicodeProperty() { final UnicodeProperty.Factory factory = FileUnicodeProperty.Factory.make("4.1.0"); System.out.println(factory.getAvailableNames()); @@ -467,11 +470,12 @@ private FileUnicodeProperty(File file, String version) { this.file = file; this.version = version; final String base = file.getName(); - setName(base.substring(0, base.length()-4)); // subtract .bin + setName(base.substring(0, base.length() - 4)); // subtract .bin } public static class Factory extends UnicodeProperty.Factory { private Factory() {} + public static Factory make(String version) { final Factory result = new Factory(); final File f = new File(outdircore + version + "/"); @@ -504,12 +508,10 @@ protected String 
_getValue(int codepoint) { if (map == null) { make(); } - return (String)map.getValue(codepoint); + return (String) map.getValue(codepoint); } - /** - * - */ + /** */ private void make() { try { final InputStream in = new FileInputStream(file.getCanonicalPath()); @@ -517,7 +519,7 @@ private void make() { map = (UnicodeMap) ois.readObject(); ois.close(); } catch (final Exception e) { - throw (InternalError)new InternalError("Can't create property").initCause(e); + throw (InternalError) new InternalError("Can't create property").initCause(e); } } diff --git a/unicodetools/src/main/java/org/unicode/text/utility/UTF16Plus.java b/unicodetools/src/main/java/org/unicode/text/utility/UTF16Plus.java index bf5f4d7b9..6d36b6f32 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/UTF16Plus.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/UTF16Plus.java @@ -1,14 +1,12 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/UTF16Plus.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/UTF16Plus.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; public final class UTF16Plus { diff --git a/unicodetools/src/main/java/org/unicode/text/utility/UTF32.java b/unicodetools/src/main/java/org/unicode/text/utility/UTF32.java index bf293ce5c..4ae4f0bbc 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/UTF32.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/UTF32.java @@ -1,27 +1,25 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/UTF32.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/UTF32.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; import com.ibm.icu.text.UTF16; /** - * Utility class for dealing with UTF-16 strings and code points. - * Provides only methods that are not available in Java itself, nor in ICU. + * Utility class for dealing with UTF-16 strings and code points. Provides only methods that are not + * available in Java itself, nor in ICU. */ public final class UTF32 { /** - * Determines whether the code point is a surrogate. - * TODO: Propose again to widen UTF16.isSurrogate(char) to take an int. - * Or maybe add UTF16.isSurrogateCodePoint(int) or isCodePointSurrogate(int). + * Determines whether the code point is a surrogate. TODO: Propose again to widen + * UTF16.isSurrogate(char) to take an int. Or maybe add UTF16.isSurrogateCodePoint(int) or + * isCodePointSurrogate(int). * * @return true iff the input character is a surrogate. * @param ch the input character. @@ -31,12 +29,14 @@ public static boolean isSurrogate(int char32) { } /** - * Convenience method corresponding to String.valueOf(char). It returns a one or two char string containing - * the UTF-32 value. If the input value can't be converted, it substitutes the replacement character U+FFFD. + * Convenience method corresponding to String.valueOf(char). It returns a one or two char string + * containing the UTF-32 value. If the input value can't be converted, it substitutes the + * replacement character U+FFFD. * * @return string value of char32 * @param ch the input character. - * @deprecated Try to use UTF16.valueOf(char32), but that throws an exception for illegal code points. + * @deprecated Try to use UTF16.valueOf(char32), but that throws an exception for illegal code + * points. */ @Deprecated public static String valueOf32(int char32) { @@ -47,8 +47,7 @@ public static String valueOf32(int char32) { } } - /** - * Prevent instance from being created. - */ + /** Prevent instance from being created. 
*/ private UTF32() {} -}; +} +; diff --git a/unicodetools/src/main/java/org/unicode/text/utility/UTF8StreamReader.java b/unicodetools/src/main/java/org/unicode/text/utility/UTF8StreamReader.java index 07d88bfe0..dc033c67c 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/UTF8StreamReader.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/UTF8StreamReader.java @@ -1,14 +1,12 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/UTF8StreamReader.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/UTF8StreamReader.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; import java.io.IOException; @@ -17,14 +15,16 @@ /** * Utility class that writes UTF8.
- * Main purpose is to supplant OutputStreamWriter(x, "UTF8"), since that has serious errors. - *
+ * Main purpose is to supplant OutputStreamWriter(x, "UTF8"), since that has serious errors.
* Example of Usage: + * *

  * PrintWriter log = new PrintWriter(
  *   new UTF8StreamWriter(new FileOutputStream(fileName), 32*1024));
  * 
- * NB: unsynchronized for simplicity and speed. The same object must NOT be used in multiple threads. + * + * NB: unsynchronized for simplicity and speed. The same object must NOT be used in multiple + * threads. */ // TODO: Fix case of surrogate pair crossing input buffer boundary @@ -44,32 +44,31 @@ public UTF8StreamReader(InputStream stream, int buffersize) { private static final int MAGIC = 0x10000 + ((0 - 0xD800) << 10) + (0 - 0xDC00); private final byte[] bBuffer; // do a bit of buffering ourselves for efficiency - private int - bIndex = 0, - bEnd = 0, - bRemaining = 0, - currentPoint = 0, - lastPoint, - shortestFormTest = 0; + private int bIndex = 0, + bEnd = 0, + bRemaining = 0, + currentPoint = 0, + lastPoint, + shortestFormTest = 0; private char cCarry = 0; private static final byte[] BYTES_REMAINING = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0- - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1- - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2- - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3- - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4- - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 5- - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6- - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 7- - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, // 8- - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, // 9- - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, // A- - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, // B- - -1,-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C- - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D- - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // E- - 3, 3, 3, 3, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1 // F- + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0- + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1- + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2- + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3- + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, // 4- + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 5- + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 6- + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 7- + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 8- + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 9- + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // A- + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // B- + -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // C- + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // D- + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // E- + 3, 3, 3, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // F- }; @Override @@ -116,74 +115,83 @@ public int read(char cbuf[], int off, int len) throws IOException { int b = bBuffer[bIndex++] & 0xFF; switch (bRemaining) { - // First Byte case - case 0: - bRemaining = BYTES_REMAINING[b]; - switch (bRemaining) { + // First Byte case case 0: - cbuf[cIndex++] = (char) (lastPoint = b); - break; - case 1: - currentPoint = b & 0x1F; - shortestFormTest = 0x80; + bRemaining = BYTES_REMAINING[b]; + switch (bRemaining) { + case 0: + cbuf[cIndex++] = (char) (lastPoint = b); + break; + case 1: + currentPoint = b & 0x1F; + shortestFormTest = 0x80; + break; + case 2: + currentPoint = b & 0xF; + shortestFormTest = 0x800; + break; + case 3: + currentPoint = b & 0x7; + shortestFormTest = 0x10000; + break; + default: + throw new IllegalArgumentException("illegal lead code unit: " + b); + } break; + + // Trailing bytes case 2: - currentPoint = b & 0xF; - shortestFormTest = 0x800; - break; case 3: - currentPoint = b & 0x7; - shortestFormTest = 0x10000; + b ^= 0x80; + if (b > 0x3F) { + throw new IllegalArgumentException( + "illegal trail code unit: " + (b ^ 0x80)); + } + currentPoint = (currentPoint << 6) | b; + --bRemaining; break; - default: - throw new IllegalArgumentException("illegal lead code unit: " + b); - } - break; - // Trailing bytes 
- case 2: case 3: - b ^= 0x80; - if (b > 0x3F) { - throw new IllegalArgumentException("illegal trail code unit: " + (b ^ 0x80)); - } - currentPoint = (currentPoint << 6) | b; - --bRemaining; - break; - - // Last trailing byte, time to assemble - case 1: - b ^= 0x80; - if (b > 0x3F) { - throw new IllegalArgumentException("illegal trail code unit: " + (b ^ 0x80)); - } - currentPoint = (currentPoint << 6) | b; - --bRemaining; + // Last trailing byte, time to assemble + case 1: + b ^= 0x80; + if (b > 0x3F) { + throw new IllegalArgumentException( + "illegal trail code unit: " + (b ^ 0x80)); + } + currentPoint = (currentPoint << 6) | b; + --bRemaining; - // we have gotten the code, so check and stash it + // we have gotten the code, so check and stash it - if (currentPoint < shortestFormTest) { - throw new IllegalArgumentException("illegal sequence, not shortest form: " + currentPoint); - } - if (checkIrregular && 0xD800 <= lastPoint && lastPoint <= 0xDC00 - && 0xDC00 <= currentPoint && currentPoint <= 0xDFFF) { - throw new IllegalArgumentException("irregular sequence, surrogate pair: " + currentPoint); - } - lastPoint = currentPoint; - if (currentPoint >= 0x10000) { - if (currentPoint > 0x10FFFF) { - throw new IllegalArgumentException("illegal code point, too large: " + currentPoint); + if (currentPoint < shortestFormTest) { + throw new IllegalArgumentException( + "illegal sequence, not shortest form: " + currentPoint); } - currentPoint -= 0x10000; - cbuf[cIndex++] = (char)(0xD800 + (currentPoint >> 10)); - currentPoint = 0xDC00 + (currentPoint & 0x3FF); - if (cIndex >= cEnd) { - cCarry = (char)currentPoint; - return cIndex - off; + if (checkIrregular + && 0xD800 <= lastPoint + && lastPoint <= 0xDC00 + && 0xDC00 <= currentPoint + && currentPoint <= 0xDFFF) { + throw new IllegalArgumentException( + "irregular sequence, surrogate pair: " + currentPoint); } - } - cbuf[cIndex++] = (char)currentPoint; - currentPoint = 0; - break; + lastPoint = currentPoint; + if 
(currentPoint >= 0x10000) { + if (currentPoint > 0x10FFFF) { + throw new IllegalArgumentException( + "illegal code point, too large: " + currentPoint); + } + currentPoint -= 0x10000; + cbuf[cIndex++] = (char) (0xD800 + (currentPoint >> 10)); + currentPoint = 0xDC00 + (currentPoint & 0x3FF); + if (cIndex >= cEnd) { + cCarry = (char) currentPoint; + return cIndex - off; + } + } + cbuf[cIndex++] = (char) currentPoint; + currentPoint = 0; + break; } } return cIndex - off; diff --git a/unicodetools/src/main/java/org/unicode/text/utility/UTF8StreamWriter.java b/unicodetools/src/main/java/org/unicode/text/utility/UTF8StreamWriter.java index d496d9549..8658b71a6 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/UTF8StreamWriter.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/UTF8StreamWriter.java @@ -1,29 +1,30 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/UTF8StreamWriter.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/UTF8StreamWriter.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; + import java.io.IOException; import java.io.OutputStream; import java.io.Writer; /** * Utility class that writes UTF8.
- * Main purpose is to supplant OutputStreamWriter(x, "UTF8"), since that has serious errors. - *
+ * Main purpose is to supplant OutputStreamWriter(x, "UTF8"), since that has serious errors.
* Example of Usage: + * *

  * PrintWriter log = new PrintWriter(
  *   new UTF8StreamWriter(new FileOutputStream(fileName), 32*1024));
  * 
- * NB: unsynchronized for simplicity and speed. The same object must NOT be used in multiple threads. + * + * NB: unsynchronized for simplicity and speed. The same object must NOT be used in multiple + * threads. */ // TODO: Fix case of surrogate pair crossing input buffer boundary @@ -54,14 +55,10 @@ public UTF8StreamWriter(OutputStream stream, int buffersize, boolean removeCR, b Latin1 = latin1; } - private static final int - NEED_2_BYTES = 1<<7, - NEED_3_BYTES = 1<<(2*5 + 1), - NEED_4_BYTES = 1<<(3*5 + 1); - - private static final int - TRAILING_BOTTOM_MASK = 0x3F, - TRAILING_TOP = 0x80; + private static final int NEED_2_BYTES = 1 << 7, + NEED_3_BYTES = 1 << (2 * 5 + 1), + NEED_4_BYTES = 1 << (3 * 5 + 1); + private static final int TRAILING_BOTTOM_MASK = 0x3F, TRAILING_TOP = 0x80; private static final int MAGIC = 0x10000 + ((0 - 0xD800) << 10) + (0 - 0xDC00); @@ -81,16 +78,15 @@ public final void write(char[] buffer, int cStart, int cLength) throws IOExcepti final int utf32 = buffer[cStart++]; - if (utf32 == 0x0D && removeCR) - { + if (utf32 == 0x0D && removeCR) { continue; // skip write } if (Latin1) { if (utf32 > 0xFF) { - bBuffer[bIndex++] = (byte)'?'; + bBuffer[bIndex++] = (byte) '?'; } else { - bBuffer[bIndex++] = (byte)utf32; + bBuffer[bIndex++] = (byte) utf32; } continue; } @@ -123,14 +119,14 @@ private final void writeCodePoint(int utf32) { // convert to bytes if (utf32 < NEED_2_BYTES) { - bBuffer[bIndex++] = (byte)utf32; + bBuffer[bIndex++] = (byte) utf32; return; } // Find out how many bytes we need to write // At this point, it is at least 2. 
- //int count; + // int count; int backIndex; int firstByteMark; if (utf32 < NEED_3_BYTES) { @@ -139,19 +135,20 @@ private final void writeCodePoint(int utf32) { } else if (utf32 < NEED_4_BYTES) { backIndex = bIndex += 3; firstByteMark = 0xE0; - bBuffer[--backIndex] = (byte)(TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK)); + bBuffer[--backIndex] = (byte) (TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK)); utf32 >>= 6; } else { backIndex = bIndex += 4; firstByteMark = 0xF0; - bBuffer[--backIndex] = (byte)(TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK)); + bBuffer[--backIndex] = (byte) (TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK)); utf32 >>= 6; - bBuffer[--backIndex] = (byte)(TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK)); - utf32 >>= 6; - }; - bBuffer[--backIndex] = (byte)(TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK)); + bBuffer[--backIndex] = (byte) (TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK)); + utf32 >>= 6; + } + ; + bBuffer[--backIndex] = (byte) (TRAILING_TOP | (utf32 & TRAILING_BOTTOM_MASK)); utf32 >>= 6; - bBuffer[--backIndex] = (byte)(firstByteMark | utf32); + bBuffer[--backIndex] = (byte) (firstByteMark | utf32); } private void internalFlush() throws IOException { diff --git a/unicodetools/src/main/java/org/unicode/text/utility/UnicodeDataFile.java b/unicodetools/src/main/java/org/unicode/text/utility/UnicodeDataFile.java index e8052b30f..3d93d445d 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/UnicodeDataFile.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/UnicodeDataFile.java @@ -3,9 +3,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintWriter; - import org.unicode.text.UCD.Default; -import org.unicode.text.UCD.MakeUnicodeFiles; import org.unicode.text.utility.Utility.RuntimeIOException; public class UnicodeDataFile { @@ -13,22 +11,28 @@ public class UnicodeDataFile { private String newFile; private String mostRecent; private String filename; - private UnicodeDataFile(){}; + + 
private UnicodeDataFile() {} + ; + private String fileType = ".txt"; private boolean skipCopyright = true; - public static UnicodeDataFile openAndWriteHeader(String directory, String filename) throws IOException { + public static UnicodeDataFile openAndWriteHeader(String directory, String filename) + throws IOException { return new UnicodeDataFile(directory, filename, false); } - public static UnicodeDataFile openHTMLAndWriteHeader(String directory, String filename) throws IOException { + public static UnicodeDataFile openHTMLAndWriteHeader(String directory, String filename) + throws IOException { return new UnicodeDataFile(directory, filename, true); } private UnicodeDataFile(String directory, String filename, boolean isHTML) throws IOException { fileType = isHTML ? ".html" : ".txt"; // When we still generated files with version infixes, the following line was: - // newSuffix = FileInfix.fromFlags(Settings.BUILD_FOR_COMPARE, true).getFileSuffix(fileType); + // newSuffix = FileInfix.fromFlags(Settings.BUILD_FOR_COMPARE, + // true).getFileSuffix(fileType); // newFile = directory + filename + newSuffix; newFile = directory + filename + fileType; out = Utility.openPrintWriterGenDir(newFile, Utility.UTF8_UNIX); @@ -36,26 +40,36 @@ private UnicodeDataFile(String directory, String filename, boolean isHTML) throw // exist somewhere in the input data folder: // It will look through all versioned folders and their subfolders, // and eventually fail to find such a file. - // We skip this when we won't look at what we find, or when we know that we won't find anything. + // We skip this when we won't look at what we find, or when we know that we won't find + // anything. // For known pure output files, we could use a different constructor, or add a parameter. boolean skipRecentFile = // close() will not even look at mostRecent. - Settings.BUILD_FOR_COMPARE || - // These folders exist only in the tools output, not in the tools input. 
- directory.endsWith("/cldr/") || - directory.endsWith("/extra/"); - mostRecent = skipRecentFile ? null : - Utility.getMostRecentUnicodeDataFile( - UnicodeDataFile.fixFile(filename), Default.ucd().getVersion(), true, true, fileType); + Settings.BUILD_FOR_COMPARE + || + // These folders exist only in the tools output, not in the tools input. + directory.endsWith("/cldr/") + || directory.endsWith("/extra/"); + mostRecent = + skipRecentFile + ? null + : Utility.getMostRecentUnicodeDataFile( + UnicodeDataFile.fixFile(filename), + Default.ucd().getVersion(), + true, + true, + fileType); this.filename = filename; if (!isHTML) { out.println(Utility.getDataHeader(filename + FileInfix.plain.getFileSuffix(".txt"))); - out.println("#\n# Unicode Character Database" - + "\n# For documentation, see https://www.unicode.org/reports/tr44/"); + out.println( + "#\n# Unicode Character Database" + + "\n# For documentation, see https://www.unicode.org/reports/tr44/"); } try { - Utility.appendFile(Settings.SRC_UCD_DIR + filename + "Header" + fileType, Utility.UTF8_UNIX, out); + Utility.appendFile( + Settings.SRC_UCD_DIR + filename + "Header" + fileType, Utility.UTF8_UNIX, out); } catch (final RuntimeIOException e) { if (!(e.getCause() instanceof FileNotFoundException)) { throw e; @@ -77,7 +91,8 @@ public void close() throws IOException { } out.close(); if (!Settings.BUILD_FOR_COMPARE) { - Utility.renameIdentical(mostRecent, Utility.getOutputName(newFile), null, skipCopyright); + Utility.renameIdentical( + mostRecent, Utility.getOutputName(newFile), null, skipCopyright); } } @@ -87,11 +102,12 @@ public void close() throws IOException { * plain version infix (ArabicShaping-11.0.0.txt)
*/ public enum FileInfix { - none, plain; + none, + plain; public String getFileSuffix(String fileType) { if (this == none) { - return fileType; // avoid string concatenation + return fileType; // avoid string concatenation } return "-" + Default.ucd().getVersion() + fileType; } @@ -105,22 +121,21 @@ private static FileInfix suppressVersion(boolean suppress) { } } - - //Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names + // Remove "d1" from DerivedJoiningGroup-3.1.0d1.txt type names public static String fixFile(String s) { final int len = s.length(); if (!s.endsWith(".txt")) { return s; } - if (s.charAt(len-6) != 'd') { + if (s.charAt(len - 6) != 'd') { return s; } - final char c = s.charAt(len-5); + final char c = s.charAt(len - 5); if (c != 'X' && (c < '0' || '9' < c)) { return s; } - s = s.substring(0,len-6) + s.substring(len-4); + s = s.substring(0, len - 6) + s.substring(len - 4); System.out.println("Fixing File Name: " + s); return s; } @@ -134,4 +149,3 @@ public UnicodeDataFile setSkipCopyright(boolean skipCopyright) { return this; } } - diff --git a/unicodetools/src/main/java/org/unicode/text/utility/UnicodeMapInt.java b/unicodetools/src/main/java/org/unicode/text/utility/UnicodeMapInt.java index 35cbdbb9f..52a620314 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/UnicodeMapInt.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/UnicodeMapInt.java @@ -62,9 +62,9 @@ public int get(int cp) { return data[findIndex(cp) - 1]; } - *//** - * Returns the set of all characters that have the given value - *//* + */ + /** Returns the set of all characters that have the given value */ + /* public UnicodeSet getMatch(int value) { UnicodeSet result = new UnicodeSet(); for (int i = 0; i < len; ++i) { @@ -73,7 +73,9 @@ public UnicodeSet getMatch(int value) { return result; } - *//** Finds the least index with a value greater than cp *//* + */ + /** Finds the least index with a value greater than cp */ + /* private int findIndex( int 
cp) { if (cp > 0x10FFFF) throw new ArrayIndexOutOfBoundsException("Code point too large: " + cp); // out of bounds! int i = -1; diff --git a/unicodetools/src/main/java/org/unicode/text/utility/UnicodeSetParser.java b/unicodetools/src/main/java/org/unicode/text/utility/UnicodeSetParser.java index 8933edd85..577b7985f 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/UnicodeSetParser.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/UnicodeSetParser.java @@ -1,71 +1,69 @@ package org.unicode.text.utility; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - import com.ibm.icu.impl.UnicodeRegex; import com.ibm.icu.text.UnicodeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * Lenient parsing of Unicode values. + * *
 set   := range (sep? range)+
  * range := code rangSep code | code rangeSep code
- * code  := 
+ * code  :=
  *          literal     – non-ASCII, non-whitespace
  *        | U+XXXX..    – must not be followed by X (\b)
  *        | XXXX        – no a-f, must not be followed by X (\b)
- *        | \\uXXXX 
+ *        | \\uXXXX
  *        | \\UXXXXXX
- *        | \\x{X...} 
+ *        | \\x{X...}
  *        | \\u{X...}
  * rSep  := {whitespace}* ..? {whitespace}*
  * sep   := {whitespace}* ,? {whitespace}*
- * + * * @author markdavis */ public class UnicodeSetParser { - private static final Pattern HEX_PATTERN = Pattern.compile( - UnicodeRegex.fix("((?:\\h|[:di:])+|,)") - + "|(\\.\\.|-)" - + "|([A-F0-9]{4,6})(?![A-F0-9])" // code points - + "|U\\+([a-fA-F0-9]{4,6})(?![a-fA-F0-9])" - + "|\\\\x([a-fA-F0-9]{2})" - + "|\\\\u([a-fA-F0-9]{4})" - + "|\\\\U([a-fA-F0-9]{6})" - + "|\\\\u\\{([a-fA-F0-9]{1,6})\\}" - + "|\\\\x\\{([a-fA-F0-9]{1,6})\\}" - + "|([^\\h\\v\\x{20}-\\x{7F}])" // any non-space, ASCII - ); - - private static final Pattern HEX_PATTERN_ANY = Pattern.compile( - UnicodeRegex.fix("((?:\\h|[:di:])+|,)") - + "|(\\.\\.|-)" - + "|([A-F0-9]{4,6})(?![A-F0-9])" // code points - + "|U\\+([a-fA-F0-9]{4,6})(?![a-fA-F0-9])" - + "|\\\\x([a-fA-F0-9]{2})" - + "|\\\\u([a-fA-F0-9]{4})" - + "|\\\\U([a-fA-F0-9]{6})" - + "|\\\\u\\{([a-fA-F0-9]{1,6})\\}" - + "|\\\\x\\{([a-fA-F0-9]{1,6})\\}" - + "|([^\\h])" // any non-space, ASCII - ); + private static final Pattern HEX_PATTERN = + Pattern.compile( + UnicodeRegex.fix("((?:\\h|[:di:])+|,)") + + "|(\\.\\.|-)" + + "|([A-F0-9]{4,6})(?![A-F0-9])" // code points + + "|U\\+([a-fA-F0-9]{4,6})(?![a-fA-F0-9])" + + "|\\\\x([a-fA-F0-9]{2})" + + "|\\\\u([a-fA-F0-9]{4})" + + "|\\\\U([a-fA-F0-9]{6})" + + "|\\\\u\\{([a-fA-F0-9]{1,6})\\}" + + "|\\\\x\\{([a-fA-F0-9]{1,6})\\}" + + "|([^\\h\\v\\x{20}-\\x{7F}])" // any non-space, ASCII + ); + + private static final Pattern HEX_PATTERN_ANY = + Pattern.compile( + UnicodeRegex.fix("((?:\\h|[:di:])+|,)") + + "|(\\.\\.|-)" + + "|([A-F0-9]{4,6})(?![A-F0-9])" // code points + + "|U\\+([a-fA-F0-9]{4,6})(?![a-fA-F0-9])" + + "|\\\\x([a-fA-F0-9]{2})" + + "|\\\\u([a-fA-F0-9]{4})" + + "|\\\\U([a-fA-F0-9]{6})" + + "|\\\\u\\{([a-fA-F0-9]{1,6})\\}" + + "|\\\\x\\{([a-fA-F0-9]{1,6})\\}" + + "|([^\\h])" // any non-space, ASCII + ); static final int SEP = 1, RANGE = 2, START = 3, ANY = 10; final Pattern hexPattern; - + public UnicodeSetParser(boolean allowAny) { hexPattern = allowAny ? 
HEX_PATTERN_ANY : HEX_PATTERN; } /** - * Convert a string with a mixture of hex and normal characters. - * Anything like the following is converted from hex to chars - * and all spaces are removed - * hexChar = \b[A-F0-9]{4,6}\b - * | U+[a-fA-F0-9]{4,6} - * | \\u[a-fA-F0-9]{4} - * | \\U[a-fA-F0-9]{6} - * | \\u{[a-fA-F0-9]{1,6} + * Convert a string with a mixture of hex and normal characters. Anything like the following is + * converted from hex to chars and all spaces are removed hexChar = \b[A-F0-9]{4,6}\b | + * U+[a-fA-F0-9]{4,6} | \\u[a-fA-F0-9]{4} | \\U[a-fA-F0-9]{6} | \\u{[a-fA-F0-9]{1,6} + * * @param hexOrChars * @return */ @@ -79,7 +77,11 @@ public UnicodeSet parse(String hexOrChars, UnicodeSet target) { while (hex.find()) { if (hex.start() != lastOffset) { // skipped something, fail - throw new IllegalArgumentException("Unexpected characters at " + lastOffset + ": " + hexOrChars.substring(lastOffset, hex.start())); + throw new IllegalArgumentException( + "Unexpected characters at " + + lastOffset + + ": " + + hexOrChars.substring(lastOffset, hex.start())); } lastOffset = hex.end(); if (hex.group(SEP) != null) { @@ -87,7 +89,8 @@ public UnicodeSet parse(String hexOrChars, UnicodeSet target) { } if (hex.group(RANGE) != null) { if (lastCp < 0) { - throw new IllegalArgumentException("Illegal range at" + lastOffset + ": " + hex.group(0)); + throw new IllegalArgumentException( + "Illegal range at" + lastOffset + ": " + hex.group(0)); } range = true; continue; @@ -98,9 +101,13 @@ public UnicodeSet parse(String hexOrChars, UnicodeSet target) { int num = i == ANY ? 
group.codePointAt(0) : Integer.parseInt(group, 16); if (range) { if (lastCp >= num) { - throw new IllegalArgumentException("Second of range must be greater, at " + lastOffset + ": " + hex.group(0)); + throw new IllegalArgumentException( + "Second of range must be greater, at " + + lastOffset + + ": " + + hex.group(0)); } - target.add(lastCp+1, num); + target.add(lastCp + 1, num); range = false; lastCp = -1; } else { @@ -112,7 +119,11 @@ public UnicodeSet parse(String hexOrChars, UnicodeSet target) { } } if (lastOffset != hexOrChars.length()) { - throw new IllegalArgumentException("Unexpected characters at " + lastOffset + ": " + hexOrChars.substring(lastOffset, hex.start())); + throw new IllegalArgumentException( + "Unexpected characters at " + + lastOffset + + ": " + + hexOrChars.substring(lastOffset, hex.start())); } return target; } @@ -125,7 +136,11 @@ public StringBuilder parseString(String hexOrChars, StringBuilder target) { while (hex.find()) { if (hex.start() != lastOffset || hex.group(RANGE) != null) { // skipped something, fail - throw new IllegalArgumentException("Unexpected characters at " + lastOffset + ": " + hexOrChars.substring(lastOffset, hex.start())); + throw new IllegalArgumentException( + "Unexpected characters at " + + lastOffset + + ": " + + hexOrChars.substring(lastOffset, hex.start())); } lastOffset = hex.end(); if (hex.group(SEP) != null) { @@ -144,7 +159,11 @@ public StringBuilder parseString(String hexOrChars, StringBuilder target) { } } if (lastOffset != hexOrChars.length()) { - throw new IllegalArgumentException("Unexpected characters at " + lastOffset + ": " + hexOrChars.substring(lastOffset, hex.start())); + throw new IllegalArgumentException( + "Unexpected characters at " + + lastOffset + + ": " + + hexOrChars.substring(lastOffset, hex.start())); } return target; } diff --git a/unicodetools/src/main/java/org/unicode/text/utility/UnicodeTransform.java b/unicodetools/src/main/java/org/unicode/text/utility/UnicodeTransform.java index 
dbcd4f211..4b984cb43 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/UnicodeTransform.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/UnicodeTransform.java @@ -10,19 +10,24 @@ import com.ibm.icu.text.UTF16; /** - * Simple wrapping for normalizer that allows for both the standard ICU normalizer, and one built directly from the UCD. + * Simple wrapping for normalizer that allows for both the standard ICU normalizer, and one built + * directly from the UCD. */ -public abstract class UnicodeTransform implements Transform { +public abstract class UnicodeTransform implements Transform { public enum Type { - NFD, NFC, NFKD, NFKC, CASEFOLD + NFD, + NFC, + NFKD, + NFKC, + CASEFOLD } - + public interface Factory { public UnicodeTransform getInstance(Type type); } - + private static Factory factory = new IcuUnicodeNormalizerFactory(); - + public static synchronized Factory getFactory() { return factory; } @@ -34,26 +39,19 @@ public static synchronized void setFactory(Factory factory) { public static synchronized UnicodeTransform getInstance(Type type) { return factory.getInstance(type); } - + public abstract String transform(String source); - - /** - * Can be overridden for performance. - */ + + /** Can be overridden for performance. */ public boolean isTransformed(String source) { return source.equals(transform(source)); } - /** - * Can be overridden for performance. - */ + /** Can be overridden for performance. */ public String transform(int source) { return transform(UTF16.valueOf(source)); } - /** - * Can be overridden for performance. - */ + /** Can be overridden for performance. 
*/ public boolean isTransformed(int source) { return isTransformed(UTF16.valueOf(source)); } } - diff --git a/unicodetools/src/main/java/org/unicode/text/utility/Utility.java b/unicodetools/src/main/java/org/unicode/text/utility/Utility.java index deeb46f4c..1e35ef740 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/Utility.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/Utility.java @@ -1,16 +1,23 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/Utility.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/Utility.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Replaceable; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeMatcher; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.VersionInfo; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -24,7 +31,6 @@ import java.io.UnsupportedEncodingException; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -35,23 +41,12 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; - import org.unicode.props.UnicodeProperty; import org.unicode.text.UCD.Default; import org.unicode.text.UCD.UCD; import org.unicode.text.UCD.UCD_Types; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Replaceable; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeMatcher; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.VersionInfo; - -public final class Utility implements UCD_Types { // COMMON UTILITIES +public final class Utility implements UCD_Types { // COMMON UTILITIES private static final boolean SHOW_SEARCH_PATH = false; @@ -107,7 +102,7 @@ public static long setBits(long source, int start, int end) { start = end; end = temp; } - long bmstart = (1L << (start+1)) - 1; + long bmstart = (1L << (start + 1)) - 1; final long bmend = (1L << end) - 1; bmstart &= ~bmend; return source |= bmstart; @@ -123,7 +118,7 @@ public static long clearBits(long source, int start, int end) { start = end; end = temp; } - int bmstart = (1 << (start+1)) - 1; 
+ int bmstart = (1 << (start + 1)) - 1; final int bmend = (1 << end) - 1; bmstart &= ~bmend; return source &= ~bmstart; @@ -148,8 +143,8 @@ public static int find(String source, String[] target, boolean skeletonize) { // These routines use the Java functions, because they only need to act on ASCII. /** - * Removes space, _, and lowercases. - * Calls ICU UnicodeProperty.toSkeleton() and may add Unicode tool specific overrides. + * Removes space, _, and lowercases. Calls ICU UnicodeProperty.toSkeleton() and may add Unicode + * tool specific overrides. */ public static String getSkeleton(String source) { return UnicodeProperty.toSkeleton(source); @@ -179,16 +174,16 @@ public static String getSkeleton(String source) { // private static StringBuffer skeletonBuffer = new StringBuffer(); + /** Sutton SignWriting really does want to be in CamelCase without underscore. */ + private static final String Signwriting = "Signwriting"; /** - * Sutton SignWriting really does want to be in CamelCase without underscore. + * @see Signwriting */ - private static final String Signwriting = "Signwriting"; - /** @see Signwriting */ private static final String Sign_Writing = "Sign_Writing"; /** - * Changes space, - into _, inserts _ between lower and UPPER. - * Calls ICU UnicodeProperty.regularize() and adds Unicode tool specific overrides. + * Changes space, - into _, inserts _ between lower and UPPER. Calls ICU + * UnicodeProperty.regularize() and adds Unicode tool specific overrides. 
*/ public static String getUnskeleton(String source, boolean titlecaseStart) { String result = UnicodeProperty.regularize(source, titlecaseStart); @@ -229,37 +224,40 @@ public static String getUnskeleton(String source, boolean titlecaseStart) { */ } - public static int lookup(String value, String[] values, String[] altValues, boolean skeletonize, int maxValue) { - if ((values.length - 1) > maxValue || (altValues != null && (altValues.length - 1) > maxValue)) { - throw new IllegalArgumentException("values or altValues too long for maxValue " + maxValue); + public static int lookup( + String value, String[] values, String[] altValues, boolean skeletonize, int maxValue) { + if ((values.length - 1) > maxValue + || (altValues != null && (altValues.length - 1) > maxValue)) { + throw new IllegalArgumentException( + "values or altValues too long for maxValue " + maxValue); } int result = Utility.find(value, values, skeletonize); if (result < 0 && altValues != null) { result = Utility.find(value, altValues, skeletonize); } if (result < 0) { - throw new ChainException("Could not find \"{0}\" in table [{1}] nor in [{2}]", - new Object [] { value, Arrays.asList(values), Arrays.asList(altValues) }); + throw new ChainException( + "Could not find \"{0}\" in table [{1}] nor in [{2}]", + new Object[] {value, Arrays.asList(values), Arrays.asList(altValues)}); } return result; } public static byte lookup(String source, String[] target, boolean skeletonize) { - return (byte)lookup(source, target, null, skeletonize, Byte.MAX_VALUE); + return (byte) lookup(source, target, null, skeletonize, Byte.MAX_VALUE); } public static short lookupShort(String source, String[] target, boolean skeletonize) { - return (short)lookup(source, target, null, skeletonize, Short.MAX_VALUE); + return (short) lookup(source, target, null, skeletonize, Short.MAX_VALUE); } - public static short lookupShort(String source, String[] target, String[] altValues, boolean skeletonize) { - return (short)lookup(source, 
target, altValues, skeletonize, Short.MAX_VALUE); + public static short lookupShort( + String source, String[] target, String[] altValues, boolean skeletonize) { + return (short) lookup(source, target, altValues, skeletonize, Short.MAX_VALUE); } - /** - * Supplies a zero-padded hex representation of an integer (without 0x) - */ - static public String hex(long i, int places) { + /** Supplies a zero-padded hex representation of an integer (without 0x) */ + public static String hex(long i, int places) { if (i == Long.MIN_VALUE) { return "-8000000000000000"; } @@ -269,7 +267,7 @@ static public String hex(long i, int places) { } String result = Long.toString(i, 16).toUpperCase(); if (result.length() < places) { - result = "0000000000000000".substring(result.length(),places) + result; + result = "0000000000000000".substring(result.length(), places) + result; } if (negative) { return '-' + result; @@ -278,15 +276,15 @@ static public String hex(long i, int places) { } public static String hex(long ch) { - return hex(ch,4); + return hex(ch, 4); } public static String hex(byte ch) { - return hex(ch & 0xFF,2); + return hex(ch & 0xFF, 2); } public static String hex(char ch) { - return hex(ch & 0xFFFF,4); + return hex(ch & 0xFFFF, 4); } public static String hex(Object s) { @@ -306,7 +304,7 @@ public static String hex(Object o, int places, String separator) { return ""; } if (o instanceof Number) { - return hex(((Number)o).longValue(), places); + return hex(((Number) o).longValue(), places); } final String s = o.toString(); @@ -324,7 +322,7 @@ public static String hex(Object o, int places, String separator) { public static String hex(byte[] o, int start, int end, String separator) { final StringBuffer result = new StringBuffer(); - //int ch; + // int ch; for (int i = start; i < end; ++i) { if (i != 0) { result.append(separator); @@ -345,10 +343,7 @@ public static String hex(char[] o, int start, int end, String separator) { return result.toString(); } - /** - * Returns a string 
containing count copies of s. - * If count <= 0, returns "". - */ + /** Returns a string containing count copies of s. If count <= 0, returns "". */ public static String repeat(String s, int count) { if (count <= 0) { return ""; @@ -356,7 +351,7 @@ public static String repeat(String s, int count) { if (count == 1) { return s; } - final StringBuffer result = new StringBuffer(count*s.length()); + final StringBuffer result = new StringBuffer(count * s.length()); for (int i = 0; i < count; ++i) { result.append(s); } @@ -385,17 +380,17 @@ public static float floatFrom(String p) { if (fract == -1) { return Float.valueOf(p).floatValue(); } - final String q = p.substring(0,fract); + final String q = p.substring(0, fract); float num = 0; if (q.length() != 0) { num = Integer.parseInt(q); } - p = p.substring(fract+1,p.length()); + p = p.substring(fract + 1, p.length()); float den = 0; if (p.length() != 0) { den = Integer.parseInt(p); } - return num/den; + return num / den; } public static double doubleFrom(String p) { @@ -406,17 +401,17 @@ public static double doubleFrom(String p) { if (fract == -1) { return Double.valueOf(p).doubleValue(); } - final String q = p.substring(0,fract); + final String q = p.substring(0, fract); double num = 0; if (q.length() != 0) { num = Integer.parseInt(q); } - p = p.substring(fract+1,p.length()); + p = p.substring(fract + 1, p.length()); double den = 0; if (p.length() != 0) { den = Integer.parseInt(p); } - return num/den; + return num / den; } public static int codePointFromHex(String p) { @@ -432,7 +427,7 @@ public static String fromHex(String p) { } public static String fromHex(String p, boolean acceptChars) { - return fromHex(p,acceptChars,4); + return fromHex(p, acceptChars, 4); } public static String fromHex(String p, boolean acceptChars, int minHex) { @@ -440,18 +435,36 @@ public static String fromHex(String p, boolean acceptChars, int minHex) { int value = 0; int count = 0; main: - for (int i = 0; i < p.length(); ++i) { - final char ch 
= p.charAt(i); - int digit = 0; - switch (ch) { - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + for (int i = 0; i < p.length(); ++i) { + final char ch = p.charAt(i); + int digit = 0; + switch (ch) { + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': digit = ch - 'a' + 10; break; - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': digit = ch - 'A' + 10; break; - case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': - case '8': case '9': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': digit = ch - '0'; break; default: @@ -461,26 +474,34 @@ public static String fromHex(String p, boolean acceptChars, int minHex) { if (count >= minHex && count <= 6) { output.appendCodePoint(value); } else if (count != 0) { - output.append(p.substring(i-count, i)); // TODO fix supplementary characters + output.append( + p.substring( + i - count, i)); // TODO fix supplementary characters } count = 0; value = 0; output.appendCodePoint((int) ch); continue main; } - throw new ChainException("bad hex value: ‘{0}’ at position {1} in \"{2}\"", + throw new ChainException( + "bad hex value: ‘{0}’ at position {1} in \"{2}\"", new Object[] {String.valueOf(ch), new Integer(i), p}); } // fall through!! 
- case 'U': case 'u': case '+': // for the U+ case + case 'U': + case 'u': + case '+': // for the U+ case - case ' ': case ',': case ';': // do SPACE here, just for speed + case ' ': + case ',': + case ';': // do SPACE here, just for speed if (count != 0) { if (count < minHex || count > 6) { if (acceptChars) { - output.append(p.substring(i-count, i)); + output.append(p.substring(i - count, i)); } else { - throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"", + throw new ChainException( + "bad hex value: '{0}' at position {1} in \"{2}\"", new Object[] {String.valueOf(ch), new Integer(i), p}); } } else { @@ -490,21 +511,23 @@ public static String fromHex(String p, boolean acceptChars, int minHex) { count = 0; value = 0; continue main; - } - value <<= 4; - value += digit; - if (value > 0x10FFFF) { - throw new ChainException("Character code too large: '{0}' at position {1} in \"{2}\"", - new Object[] {String.valueOf(ch), new Integer(i), p}); - } - count++; } + value <<= 4; + value += digit; + if (value > 0x10FFFF) { + throw new ChainException( + "Character code too large: '{0}' at position {1} in \"{2}\"", + new Object[] {String.valueOf(ch), new Integer(i), p}); + } + count++; + } if (count != 0) { if (count < minHex || count > 6) { if (acceptChars) { return p; } else { - throw new ChainException("bad hex value: '{0}' at position {1} in \"{2}\"", + throw new ChainException( + "bad hex value: '{0}' at position {1} in \"{2}\"", new Object[] {"EOS", new Integer(p.length()), p}); } } else { @@ -516,21 +539,21 @@ public static String fromHex(String p, boolean acceptChars, int minHex) { return output.toString(); } - public static final class Position { public int start, limit; } /** * Finds the next position in the text that matches. + * * @param divider A UnicodeMatcher, such as a UnicodeSet. * @text obvious * @offset starting offset * @output start and limit of the piece found. 
If the return is false, then start,limit = length * @return true iff match found */ - public static boolean next(UnicodeMatcher matcher, Replaceable text, int offset, - Position output) { + public static boolean next( + UnicodeMatcher matcher, Replaceable text, int offset, Position output) { final int[] io = new int[1]; // TODO replace later; extra object creation final int limit = text.length(); // don't worry about surrogates; matcher will handle @@ -549,14 +572,15 @@ public static boolean next(UnicodeMatcher matcher, Replaceable text, int offset, /** * Finds the next position in the text that matches. + * * @param divider A UnicodeMatcher, such as a UnicodeSet. * @text obvious * @offset starting offset * @output start and limit of the piece found. If the return is false, then start,limit = 0 * @return true iff match found */ - public static boolean previous(UnicodeMatcher matcher, Replaceable text, int offset, - Position output) { + public static boolean previous( + UnicodeMatcher matcher, Replaceable text, int offset, Position output) { final int[] io = new int[1]; // TODO replace later; extra object creation final int limit = 0; // don't worry about surrogates; matcher will handle @@ -574,11 +598,11 @@ public static boolean previous(UnicodeMatcher matcher, Replaceable text, int off } /** - * Splits a string containing divider into pieces, storing in output - * and returns the number of pieces. The string does not have to be terminated: - * the segment after the last divider is returned in the last output element. - * Thus if the string has no dividers, then the whole string is returned in output[0] - * with a return value of 1. + * Splits a string containing divider into pieces, storing in output and returns the number of + * pieces. The string does not have to be terminated: the segment after the last divider is + * returned in the last output element. 
Thus if the string has no dividers, then the whole + * string is returned in output[0] with a return value of 1. + * * @param divider A UnicodeMatcher, such as a UnicodeSet. * @param s the text to be divided * @param output where the resulting pieces go @@ -586,7 +610,7 @@ public static boolean previous(UnicodeMatcher matcher, Replaceable text, int off */ public static int split(UnicodeMatcher divider, Replaceable text, Position[] output) { int index = 0; - for (int offset = 0;; offset = output[index-1].limit) { + for (int offset = 0; ; offset = output[index - 1].limit) { if (output[index] == null) { output[index] = new Position(); } @@ -598,8 +622,8 @@ public static int split(UnicodeMatcher divider, Replaceable text, Position[] out } /** - * Splits a string containing divider into pieces, storing in output - * and returns the number of pieces. + * Splits a string containing divider into pieces, storing in output and returns the number of + * pieces. */ public static int split(String s, char divider, String[] output, boolean trim) { try { @@ -608,15 +632,15 @@ public static int split(String s, char divider, String[] output, boolean trim) { int i; for (i = 0; i < s.length(); ++i) { if (s.charAt(i) == divider) { - String temp = s.substring(last,i); + String temp = s.substring(last, i); if (trim) { temp = temp.trim(); } output[current++] = temp; - last = i+1; + last = i + 1; } } - String temp = s.substring(last,i); + String temp = s.substring(last, i); if (trim) { temp = temp.trim(); } @@ -632,10 +656,11 @@ public static int split(String s, char divider, String[] output, boolean trim) { } public static String[] split(String s, char divider) { - return split(s,divider,false); + return split(s, divider, false); } + public static int split(String s, char divider, String[] output) { - return split(s,divider,output,false); + return split(s, divider, output, false); } public static String[] split(String s, char divider, boolean trim) { @@ -645,20 +670,20 @@ public static 
String[] split(String s, char divider, boolean trim) { } public static String[] extract(String[] source, int start, int limit) { - final String[] result = new String[limit-start]; + final String[] result = new String[limit - start]; System.arraycopy(source, start, result, 0, limit - start); return result; } /* - public static String quoteJava(String s) { - StringBuffer result = new StringBuffer(); - for (int i = 0; i < s.length(); ++i) { - result.append(quoteJava(s.charAt(i))); - } - return result.toString(); - } - */ + public static String quoteJava(String s) { + StringBuffer result = new StringBuffer(); + for (int i = 0; i < s.length(); ++i) { + result.append(quoteJava(s.charAt(i))); + } + return result.toString(); + } + */ public static String quoteJavaString(String s) { if (s == null) { return "null"; @@ -674,54 +699,90 @@ public static String quoteJavaString(String s) { public static String quoteJava(int c) { switch (c) { - case '\\': - return "\\\\"; - case '"': - return "\\\""; - case '\r': - return "\\r"; - case '\n': - return "\\n"; - default: - if (c >= 0x20 && c <= 0x7E) { - return String.valueOf((char)c); - } else if (Character.isSupplementaryCodePoint(c)) { - return "\\u" + hex(Character.highSurrogate(c),4) + "\\u" + hex(Character.lowSurrogate(c),4); - } else { - return "\\u" + hex((char)c,4); - } + case '\\': + return "\\\\"; + case '"': + return "\\\""; + case '\r': + return "\\r"; + case '\n': + return "\\n"; + default: + if (c >= 0x20 && c <= 0x7E) { + return String.valueOf((char) c); + } else if (Character.isSupplementaryCodePoint(c)) { + return "\\u" + + hex(Character.highSurrogate(c), 4) + + "\\u" + + hex(Character.lowSurrogate(c), 4); + } else { + return "\\u" + hex((char) c, 4); + } } } public static String quoteXML(int c, boolean HTML) { switch (c) { - case '<': return "<"; - case '>': return ">"; - case '&': return "&"; - case '\'': if (!HTML) { - return "'"; - } - break; - case '"': return """; - - // fix controls, since XML can't handle - 
- // also do this for 09, 0A, and 0D, so we can see them. - case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07: - case 0x08: case 0x09: case 0x0A: case 0x0B: case 0x0C: case 0x0D: case 0x0E: case 0x0F: - case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17: - case 0x18: case 0x19: case 0x1A: case 0x1B: case 0x1C: case 0x1D: case 0x1E: case 0x1F: - case 0x7F: - - // fix noncharacters, since XML can't handle - case 0xFFFE: case 0xFFFF: - - return HTML ? '#' + hex(c,4) : ""; + case '<': + return "<"; + case '>': + return ">"; + case '&': + return "&"; + case '\'': + if (!HTML) { + return "'"; + } + break; + case '"': + return """; + + // fix controls, since XML can't handle + + // also do this for 09, 0A, and 0D, so we can see them. + case 0x00: + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x08: + case 0x09: + case 0x0A: + case 0x0B: + case 0x0C: + case 0x0D: + case 0x0E: + case 0x0F: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1A: + case 0x1B: + case 0x1C: + case 0x1D: + case 0x1E: + case 0x1F: + case 0x7F: + + // fix noncharacters, since XML can't handle + case 0xFFFE: + case 0xFFFF: + return HTML ? '#' + hex(c, 4) : ""; } // fix surrogates, since XML can't handle if (UTF32.isSurrogate(c)) { - return HTML ? '#' + hex(c,4) : ""; + return HTML ? 
'#' + hex(c, 4) : ""; } if (c <= 0x7E) { @@ -734,7 +795,7 @@ public static String quoteXML(int c, boolean HTML) { } */ - return "&#x" + hex(c,1) + ";"; + return "&#x" + hex(c, 1) + ";"; } public static String quoteXML(String source, boolean HTML) { @@ -780,7 +841,8 @@ public static int compare(byte[] a, int aStart, int aEnd, byte[] b, int bStart, return (aEnd - aStart) - (bEnd - bStart); } - public static int compareUnsigned(byte[] a, int aStart, int aEnd, byte[] b, int bStart, int bEnd) { + public static int compareUnsigned( + byte[] a, int aStart, int aEnd, byte[] b, int bStart, int bEnd) { while (aStart < aEnd && bStart < bEnd) { final int diff = (a[aStart++] & 0xFF) - (b[bStart++] & 0xFF); if (diff != 0) { @@ -793,8 +855,8 @@ public static int compareUnsigned(byte[] a, int aStart, int aEnd, byte[] b, int public static final class NumericComparator implements Comparator { public static final NumericComparator INSTANCE = new NumericComparator(); // kn turns on numeric ordering: "10" > "9" - private final Collator coll = Collator.getInstance( - Locale.forLanguageTag("und-u-kn")).freeze(); + private final Collator coll = + Collator.getInstance(Locale.forLanguageTag("und-u-kn")).freeze(); @Override public int compare(String s1, String s2) { @@ -802,9 +864,7 @@ public int compare(String s1, String s2) { } } - /** - * Joins an array together, using divider between the pieces - */ + /** Joins an array together, using divider between the pieces */ public static String join(int[] array, String divider) { String result = "{"; for (int i = 0; i < array.length; ++i) { @@ -814,7 +874,7 @@ public static String join(int[] array, String divider) { result += array[i]; } return result + "}"; -} + } public static String join(long[] array, String divider) { String result = "{"; @@ -825,42 +885,43 @@ public static String join(long[] array, String divider) { result += array[i]; } return result + "}"; -} + } public static final String[] searchPath = { - // "EXTRAS" + 
(FIX_FOR_NEW_VERSION == 0 ? "" : ""), - "15.0.0", - "14.0.0", - "13.1.0", // TODO: there is no Unicode 13.1, see https://github.com/unicode-org/unicodetools/issues/100 - "13.0.0", - "12.1.0", - "12.0.0", - "11.0.0", - "10.0.0", - "9.0.0", - "8.0.0", - "7.0.0", - "6.3.0", - "6.2.0", - "6.1.0", - "6.0.0", - "5.2.0", - "5.1.0", - "5.0.0", - "4.1.0", - "4.0.1", - "4.0.0", - "3.2.0", - "3.1.1", - "3.1.0", - "3.0.1", - "3.0.0", - "2.1.9", - "2.1.8", - "2.1.5", - "2.1.2", - "2.0.0", - "1.1.0", + // "EXTRAS" + (FIX_FOR_NEW_VERSION == 0 ? "" : ""), + "15.0.0", + "14.0.0", + "13.1.0", // TODO: there is no Unicode 13.1, see + // https://github.com/unicode-org/unicodetools/issues/100 + "13.0.0", + "12.1.0", + "12.0.0", + "11.0.0", + "10.0.0", + "9.0.0", + "8.0.0", + "7.0.0", + "6.3.0", + "6.2.0", + "6.1.0", + "6.0.0", + "5.2.0", + "5.1.0", + "5.0.0", + "4.1.0", + "4.0.1", + "4.0.0", + "3.2.0", + "3.1.1", + "3.1.0", + "3.0.1", + "3.0.0", + "2.1.9", + "2.1.8", + "2.1.5", + "2.1.2", + "2.0.0", + "1.1.0", }; /*public static PrintWriter openPrintWriter(String filename) throws IOException { @@ -872,31 +933,46 @@ public static final class Encoding extends PoorMansEnum { private static PoorMansEnum.EnumStore store = new PoorMansEnum.EnumStore(); /* Boilerplate */ - public Encoding next() { return (Encoding) next; } - public void getAliases(Collection output) { store.getAliases(this, output); } - public static Encoding get(String s) { return (Encoding) store.get(s); } - public static Encoding get(int v) { return (Encoding) store.get(v); } - public static int getMax() { return store.getMax(); } + public Encoding next() { + return (Encoding) next; + } + + public void getAliases(Collection output) { + store.getAliases(this, output); + } + + public static Encoding get(String s) { + return (Encoding) store.get(s); + } + + public static Encoding get(int v) { + return (Encoding) store.get(v); + } + + public static int getMax() { + return store.getMax(); + } private Encoding() {} - private 
static Encoding add(String name) { return (Encoding) store.add(new Encoding(), name);} - } - public static final Encoding - LATIN1_UNIX = Encoding.add("LATIN1_UNIX"), - LATIN1_WINDOWS = Encoding.add("LATIN1_WINDOWS"), - UTF8_UNIX = Encoding.add("UTF8_UNIX"), - UTF8_WINDOWS = Encoding.add("UTF8_WINDOWS"), + private static Encoding add(String name) { + return (Encoding) store.add(new Encoding(), name); + } + } - //UTF8 = Encoding.add("UTF8"), // for read-only - //LATIN1 = Encoding.add("LATIN1"), // for read-only + public static final Encoding LATIN1_UNIX = Encoding.add("LATIN1_UNIX"), + LATIN1_WINDOWS = Encoding.add("LATIN1_WINDOWS"), + UTF8_UNIX = Encoding.add("UTF8_UNIX"), + UTF8_WINDOWS = Encoding.add("UTF8_WINDOWS"), - // read-only (platform doesn't matter, since it is only line-end) + // UTF8 = Encoding.add("UTF8"), // for read-only + // LATIN1 = Encoding.add("LATIN1"), // for read-only - UTF8 = UTF8_WINDOWS, - LATIN1 = LATIN1_WINDOWS, + // read-only (platform doesn't matter, since it is only line-end) - FIRST = LATIN1_UNIX; + UTF8 = UTF8_WINDOWS, + LATIN1 = LATIN1_WINDOWS, + FIRST = LATIN1_UNIX; /* public static final Encoding @@ -917,23 +993,21 @@ public static PrintWriter openPrintWriter(String filename, Encoding options) { // Normally use false, false. 
// But for UCD files use true, true // Or if they are UTF8, use true, false - public static PrintWriter openPrintWriter(String directory, String filename, Encoding options) { + public static PrintWriter openPrintWriter(String directory, String filename, Encoding options) { try { - final File file; - if (directory.equals("")) - file = new File(filename); - else - file = new File(directory, filename); - Utility.fixDot(); + final File file; + if (directory.equals("")) file = new File(filename); + else file = new File(directory, filename); + Utility.fixDot(); System.out.println("\nCreating File: " + file.getCanonicalPath()); final File parent = new File(file.getParent()); - //System.out.println("Creating File: "+ parent); + // System.out.println("Creating File: "+ parent); parent.mkdirs(); return new PrintWriter( new UTF8StreamWriter( new FileOutputStream(file), - 32*1024, + 32 * 1024, options == LATIN1_UNIX || options == UTF8_UNIX, options == LATIN1_UNIX || options == LATIN1_WINDOWS)); } catch (final IOException e) { @@ -951,10 +1025,16 @@ public static void print(PrintWriter pw, Collection c, String separator) { public interface Breaker { public String get(Object current, Object old); + public boolean filter(Object current); // true is keep } - public static void printMapOfCollection(PrintWriter pw, Map c, String mainSeparator, String itemSeparator, String subseparator) { + public static void printMapOfCollection( + PrintWriter pw, + Map c, + String mainSeparator, + String itemSeparator, + String subseparator) { final Iterator it = c.keySet().iterator(); boolean first = true; final Object last = null; @@ -998,7 +1078,8 @@ public static int print(PrintWriter pw, Collection c, String separator, Breaker return count; } - public static void print(PrintWriter pw, Map c, String pairSeparator, String separator, Breaker b) { + public static void print( + PrintWriter pw, Map c, String pairSeparator, String separator, Breaker b) { final Iterator it = c.keySet().iterator(); 
boolean first = true; Object last = null; @@ -1024,9 +1105,11 @@ public static void print(PrintWriter pw, Map c, String pairSeparator, String sep public static class RuntimeIOException extends RuntimeException { private static final long serialVersionUID = 2982482977979580522L; + public RuntimeIOException() { super(); } + public RuntimeIOException(Exception e) { super(e); } @@ -1049,7 +1132,7 @@ public static BufferedReader openReadFile(String filename, Encoding encoding) { } else { isr = new InputStreamReader(fis); } - final BufferedReader br = new BufferedReader(isr, 32*1024); + final BufferedReader br = new BufferedReader(isr, 32 * 1024); return br; } @@ -1062,7 +1145,7 @@ public static void addCount(Map m, Object key, int count) { m.put(key, new Integer(oldCount.intValue() + count)); } - public static void addToSet(Map> m, K key, V value) { + public static void addToSet(Map> m, K key, V value) { Set set = m.get(key); if (set == null) { set = new TreeSet(); @@ -1107,16 +1190,20 @@ public static String readDataLine(BufferedReader br, int[] count) throws IOExcep } line = line.trim(); } catch (final Exception e) { - throw new ChainException("Line \"{0}\", \"{1}\"", new String[] {originalLine, line}, e); + throw new ChainException( + "Line \"{0}\", \"{1}\"", new String[] {originalLine, line}, e); } return line; } - public static void appendFile(String filename, Encoding encoding, PrintWriter output) throws IOException { + public static void appendFile(String filename, Encoding encoding, PrintWriter output) + throws IOException { appendFile(filename, encoding, output, null); } - public static void appendFile(String filename, Encoding encoding, PrintWriter output, String[] replacementList) throws IOException { + public static void appendFile( + String filename, Encoding encoding, PrintWriter output, String[] replacementList) + throws IOException { final BufferedReader br = openReadFile(filename, encoding); /* FileInputStream fis = new FileInputStream(filename); @@ 
-1130,15 +1217,19 @@ public static void appendFile(String filename, Encoding encoding, PrintWriter ou } if (replacementList != null) { for (int i = 0; i < replacementList.length; i += 2) { - line = replace(line, replacementList[i], replacementList[i+1]); + line = replace(line, replacementList[i], replacementList[i + 1]); } } output.println(line); } } - /** If contents(newFile) ≠ contents(oldFile), rename newFile to old. Otherwise delete newfile. Return true if replaced. **/ - public static boolean replaceDifferentOrDelete(String oldFile, String newFile, boolean skipCopyright) throws IOException { + /** + * If contents(newFile) ≠ contents(oldFile), rename newFile to old. Otherwise delete newfile. + * Return true if replaced. * + */ + public static boolean replaceDifferentOrDelete( + String oldFile, String newFile, boolean skipCopyright) throws IOException { final File oldFile2 = new File(oldFile); if (oldFile2.exists()) { final String lines[] = new String[2]; @@ -1149,14 +1240,25 @@ public static boolean replaceDifferentOrDelete(String oldFile, String newFile, b } System.out.println("Found difference in : " + oldFile + ", " + newFile); final int diff = compare(lines[0], lines[1]); - System.out.println(" File1: '" + lines[0].substring(0,diff) + "', '" + lines[0].substring(diff) + "'"); - System.out.println(" File2: '" + lines[1].substring(0,diff) + "', '" + lines[1].substring(diff) + "'"); + System.out.println( + " File1: '" + + lines[0].substring(0, diff) + + "', '" + + lines[0].substring(diff) + + "'"); + System.out.println( + " File2: '" + + lines[1].substring(0, diff) + + "', '" + + lines[1].substring(diff) + + "'"); } new File(newFile).renameTo(oldFile2); return true; } - public static boolean renameIdentical(String file1, String file2, String batFile, boolean skipCopyright) throws IOException { + public static boolean renameIdentical( + String file1, String file2, String batFile, boolean skipCopyright) throws IOException { if (file1 == null) { 
System.out.println("Null file"); return false; @@ -1173,20 +1275,31 @@ public static boolean renameIdentical(String file1, String file2, String batFile fixDot(); System.out.println("Found difference in : " + file1 + ", " + file2); final int diff = compare(lines[0], lines[1]); - System.out.println(" File1: '" + lines[0].substring(0,diff) + "', '" + lines[0].substring(diff) + "'"); - System.out.println(" File2: '" + lines[1].substring(0,diff) + "', '" + lines[1].substring(diff) + "'"); + System.out.println( + " File1: '" + + lines[0].substring(0, diff) + + "', '" + + lines[0].substring(diff) + + "'"); + System.out.println( + " File2: '" + + lines[1].substring(0, diff) + + "', '" + + lines[1].substring(diff) + + "'"); return false; } } - public static boolean filesAreIdentical(String file1, String file2, boolean skipCopyright, String[] lines) throws IOException { + public static boolean filesAreIdentical( + String file1, String file2, boolean skipCopyright, String[] lines) throws IOException { if (file1 == null) { lines[0] = null; lines[1] = null; return false; } - final BufferedReader br1 = new BufferedReader(new FileReader(file1), 32*1024); - final BufferedReader br2 = new BufferedReader(new FileReader(file2), 32*1024); + final BufferedReader br1 = new BufferedReader(new FileReader(file1), 32 * 1024); + final BufferedReader br2 = new BufferedReader(new FileReader(file2), 32 * 1024); String line1 = ""; String line2 = ""; try { @@ -1232,7 +1345,8 @@ static void renameIdentical(String file2) { } } - static String getLineWithoutFluff(BufferedReader br1, boolean first, boolean skipCopyright) throws IOException { + static String getLineWithoutFluff(BufferedReader br1, boolean first, boolean skipCopyright) + throws IOException { while (true) { String line1 = br1.readLine(); if (line1 == null) { @@ -1272,8 +1386,7 @@ static String getLineWithoutFluff(BufferedReader br1, boolean first, boolean ski } } - /** Returns -1 if strings are equal; otherwise the position they are 
different at - */ + /** Returns -1 if strings are equal; otherwise the position they are different at */ public static int compare(String a, String b) { int len = a.length(); if (len > b.length()) { @@ -1290,17 +1403,21 @@ public static int compare(String a, String b) { return -1; } - public static void copyTextFile(String filename, Encoding encoding, String newName, String[] replacementList) throws IOException { + public static void copyTextFile( + String filename, Encoding encoding, String newName, String[] replacementList) + throws IOException { final PrintWriter out = Utility.openPrintWriter(newName, UTF8_WINDOWS); appendFile(filename, encoding, out, replacementList); out.close(); } - public static void copyTextFile(String filename, Encoding encoding, String newName) throws IOException { + public static void copyTextFile(String filename, Encoding encoding, String newName) + throws IOException { copyTextFile(filename, encoding, newName, null); } - public static BufferedReader openUnicodeFile(String filename, String version, boolean show, Encoding encoding) { + public static BufferedReader openUnicodeFile( + String filename, String version, boolean show, Encoding encoding) { final String name = getMostRecentUnicodeDataFile(filename, version, true, show); if (name == null) { return null; @@ -1308,16 +1425,20 @@ public static BufferedReader openUnicodeFile(String filename, String version, bo return openReadFile(name, encoding); // new BufferedReader(new FileReader(name),32*1024); } - public static String getMostRecentUnicodeDataFile(String filename, String version, - boolean acceptLatest, boolean show) { + public static String getMostRecentUnicodeDataFile( + String filename, String version, boolean acceptLatest, boolean show) { return getMostRecentUnicodeDataFile(filename, version, acceptLatest, show, ".txt"); } - public static String getMostRecentUnicodeDataFile(String filename, String versionString, - boolean acceptLatest, boolean show, String fileType) { + 
public static String getMostRecentUnicodeDataFile( + String filename, + String versionString, + boolean acceptLatest, + boolean show, + String fileType) { // get all the files in the directory - VersionInfo version = versionString.isEmpty() ? - null : VersionInfo.getInstance(versionString); + VersionInfo version = + versionString.isEmpty() ? null : VersionInfo.getInstance(versionString); final int compValue = acceptLatest ? 0 : 1; Set tries = show ? new LinkedHashSet() : null; String result = null; @@ -1364,27 +1485,34 @@ public static String getMostRecentUnicodeDataFile(String filename, String versio } } if (show && !tries.isEmpty()) { - System.out.println("\tTried: '" + "(" - + CollectionUtilities.join(tries, "|") + ")" - + File.separator + filename + "*" + fileType + "'"); + System.out.println( + "\tTried: '" + + "(" + + CollectionUtilities.join(tries, "|") + + ")" + + File.separator + + filename + + "*" + + fileType + + "'"); } return result; } private static String getEmojiVersion(VersionInfo versionInfo) { int major = versionInfo.getMajor(); - switch(major) { - case 10: - return "6.0"; - case 9: - return "4.0"; - case 8: - return "3.0"; - default: - if (major > 10) { - return versionInfo.getVersionString(2, 2); - } - break; + switch (major) { + case 10: + return "6.0"; + case 9: + return "4.0"; + case 8: + return "3.0"; + default: + if (major > 10) { + return versionInfo.getVersionString(2, 2); + } + break; } return null; } @@ -1400,21 +1528,25 @@ public static Set getDirectoryContentsLastFirst(File directory) { throw new IllegalArgumentException(e); } } - final Set result = new TreeSet<>(new Comparator() { - @Override - public int compare(String a, String b) { - return b.compareTo(a); - } - }); + final Set result = + new TreeSet<>( + new Comparator() { + @Override + public int compare(String a, String b) { + return b.compareTo(a); + } + }); result.addAll(java.util.Arrays.asList(directory.list())); return result; } - public static String searchDirectory(File 
directory, String filename, boolean show) throws IOException { + public static String searchDirectory(File directory, String filename, boolean show) + throws IOException { return searchDirectory(directory, filename, show, ".txt"); } - public static String searchDirectory(File directory, String filename, boolean show, String fileType) { + public static String searchDirectory( + File directory, String filename, boolean show, String fileType) { // Before looking for the file, does the directory even exist? Path dirPath = directory.toPath(); if (!Files.exists(dirPath)) { @@ -1481,9 +1613,7 @@ public static void writeHtmlHeader(PrintWriter log, String title) { log.println(""); } - /** - * Replaces all occurrences of piece with replacement, and returns new String - */ + /** Replaces all occurrences of piece with replacement, and returns new String */ public static String replace(String source, String piece, String replacement) { if (source == null || source.length() < piece.length()) { return source; @@ -1494,7 +1624,8 @@ public static String replace(String source, String piece, String replacement) { if (pos < 0) { return source; } - source = source.substring(0,pos) + replacement + source.substring(pos + piece.length()); + source = + source.substring(0, pos) + replacement + source.substring(pos + piece.length()); pos += replacement.length(); } } @@ -1531,52 +1662,84 @@ public static String getStack() { public static String getUnicodeImage(int cp) { final String code = hex(cp, 4); - return "U+" + code + ""; + return "U+"
+                + code
+                + ""; } static PrintWriter showSetNamesPw; - public static void showSetDifferences(String name1, UnicodeSet set1, String name2, UnicodeSet set2, boolean separateLines, UCD ucd) { + public static void showSetDifferences( + String name1, + UnicodeSet set1, + String name2, + UnicodeSet set2, + boolean separateLines, + UCD ucd) { if (showSetNamesPw == null) { showSetNamesPw = new PrintWriter(System.out); } - showSetDifferences(showSetNamesPw, name1, set1, name2, set2, separateLines, false, null, ucd); + showSetDifferences( + showSetNamesPw, name1, set1, name2, set2, separateLines, false, null, ucd); } - public static void showSetDifferences(PrintWriter pw, String name1, UnicodeSet set1, String name2, UnicodeSet set2, - boolean separateLines, boolean withChar, UnicodeMap names, UCD ucd) { + public static void showSetDifferences( + PrintWriter pw, + String name1, + UnicodeSet set1, + String name2, + UnicodeSet set2, + boolean separateLines, + boolean withChar, + UnicodeMap names, + UCD ucd) { UnicodeSet temp = new UnicodeSet(set1).removeAll(set2); pw.println(); pw.println("In " + name1 + ", but not in " + name2 + ": "); - showSetNames(pw, "\t", temp, separateLines, false, withChar, names, ucd); + showSetNames(pw, "\t", temp, separateLines, false, withChar, names, ucd); temp = new UnicodeSet(set2).removeAll(set1); pw.println(); pw.println("Not in " + name1 + ", but in " + name2 + ": "); - showSetNames(pw, "\t", temp, separateLines, false, withChar, names, ucd); + showSetNames(pw, "\t", temp, separateLines, false, withChar, names, ucd); temp = new UnicodeSet(set2).retainAll(set1); pw.println(); pw.println("In both " + name1 + " and " + name2 + ": "); - pw.println(temp.size() == 0 ? "" : ""+ temp); + pw.println(temp.size() == 0 ? 
"" : "" + temp); pw.flush(); // showSetNames(pw, "\t", temp, false, false, withChar, names, ucd); } public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, UCD ucd) { - showSetNames(prefix, set, separateLines, false, false, ucd); + showSetNames(prefix, set, separateLines, false, false, ucd); } - public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, boolean IDN, UCD ucd) { - showSetNames(prefix, set, separateLines, IDN, false, ucd); + public static void showSetNames( + String prefix, UnicodeSet set, boolean separateLines, boolean IDN, UCD ucd) { + showSetNames(prefix, set, separateLines, IDN, false, ucd); } - public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, UCD ucd) { - showSetNames( pw, prefix, set, separateLines, IDN, false, null, ucd); + public static void showSetNames( + PrintWriter pw, + String prefix, + UnicodeSet set, + boolean separateLines, + boolean IDN, + UCD ucd) { + showSetNames(pw, prefix, set, separateLines, IDN, false, null, ucd); } - public static void showSetNames(String prefix, UnicodeSet set, boolean separateLines, boolean IDN, boolean withChar, UCD ucd) { + public static void showSetNames( + String prefix, + UnicodeSet set, + boolean separateLines, + boolean IDN, + boolean withChar, + UCD ucd) { if (showSetNamesPw == null) { showSetNamesPw = new PrintWriter(System.out); } @@ -1585,8 +1748,15 @@ public static void showSetNames(String prefix, UnicodeSet set, boolean separateL static java.text.NumberFormat nf = java.text.NumberFormat.getInstance(); - public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, boolean separateLines, boolean IDN, - boolean withChar, UnicodeMap names, UCD ucd) { + public static void showSetNames( + PrintWriter pw, + String prefix, + UnicodeSet set, + boolean separateLines, + boolean IDN, + boolean withChar, + UnicodeMap names, + UCD ucd) { if (set.size() == 0) { 
pw.println(prefix + ""); pw.flush(); @@ -1597,39 +1767,54 @@ public static void showSetNames(PrintWriter pw, String prefix, UnicodeSet set, b for (int i = 0; i < count; ++i) { final int start = set.getRangeStart(i); final int end = set.getRangeEnd(i); - if (separateLines || (IDN && isSeparateLineIDN(start,end,ucd))) { + if (separateLines || (IDN && isSeparateLineIDN(start, end, ucd))) { for (int cp = start; cp <= end; ++cp) { if (!IDN) { - pw.println(prefix + UCD.getCode(cp) - + "\t# " - + (useHTML ? "(" + getUnicodeImage(cp) + ") " : "") - + (withChar && (cp >= 0x20) ? "(" + UTF16.valueOf(cp) + ") " : "") - + (names != null ? names.getValue(cp) + " " : "") - + ucd.getName(cp) - + (useHTML ? "
" : "")); + pw.println( + prefix + + UCD.getCode(cp) + + "\t# " + + (useHTML ? "(" + getUnicodeImage(cp) + ") " : "") + + (withChar && (cp >= 0x20) + ? "(" + UTF16.valueOf(cp) + ") " + : "") + + (names != null ? names.getValue(cp) + " " : "") + + ucd.getName(cp) + + (useHTML ? "
" : "")); } else { - pw.println(prefix + Utility.hex(cp,4) + "; " + ucd.getName(cp)); + pw.println(prefix + Utility.hex(cp, 4) + "; " + ucd.getName(cp)); } } } else { if (!IDN) { - pw.println(prefix + UCD.getCode(start) - + ((start != end) ? (".." + UCD.getCode(end)) : "") - + "\t# " - + (withChar && (start >= 0x20) ? " (" + UTF16.valueOf(start) - + ((start != end) ? (".." + UTF16.valueOf(end)) : "") + ") " : "") - + ucd.getName(start) + ((start != end) ? (".." + ucd.getName(end)) : "") - ); + pw.println( + prefix + + UCD.getCode(start) + + ((start != end) ? (".." + UCD.getCode(end)) : "") + + "\t# " + + (withChar && (start >= 0x20) + ? " (" + + UTF16.valueOf(start) + + ((start != end) + ? (".." + UTF16.valueOf(end)) + : "") + + ") " + : "") + + ucd.getName(start) + + ((start != end) ? (".." + ucd.getName(end)) : "")); } else { - pw.println(prefix + Utility.hex(start,4) - + ((start != end) ? ("-" + Utility.hex(end,4)) : "") - + (ucd.isAssigned(start) - ? "; " + ucd.getName(start) + ((start != end) - ? ("-" + ucd.getName(end)) - : "") - : "") - ); + pw.println( + prefix + + Utility.hex(start, 4) + + ((start != end) ? ("-" + Utility.hex(end, 4)) : "") + + (ucd.isAssigned(start) + ? "; " + + ucd.getName(start) + + ((start != end) + ? 
("-" + ucd.getName(end)) + : "") + : "")); } } } @@ -1645,7 +1830,8 @@ private static boolean isSeparateLineIDN(int cp, UCD ucd) { if (cat == UCD_Types.Cn) { return false; } - if (ucd.getCategory(cp) == UCD_Types.Cc && !ucd.getBinaryProperty(cp, UCD_Types.White_space)) { + if (ucd.getCategory(cp) == UCD_Types.Cc + && !ucd.getBinaryProperty(cp, UCD_Types.White_space)) { return false; } return true; @@ -1655,7 +1841,8 @@ private static boolean isSeparateLineIDN(int start, int end, UCD ucd) { return (isSeparateLineIDN(start, ucd) || isSeparateLineIDN(end, ucd)); } - public static Transliterator createFromFile(String fileName, int direction, Transliterator pretrans) throws IOException { + public static Transliterator createFromFile( + String fileName, int direction, Transliterator pretrans) throws IOException { final StringBuffer buffer = new StringBuffer(); final FileLineIterator fli = new FileLineIterator(); fli.open(fileName, Utility.UTF8); @@ -1700,7 +1887,7 @@ public static Transliterator createFromFile(String fileName, int direction, Tran if (pos >= 0) { id = id.substring(0, pos); } - //System.out.println(buffer); + // System.out.println(buffer); return Transliterator.createFromRules(id, buffer.toString(), direction); } @@ -1735,27 +1922,37 @@ public static String generateDateLine() { } public static String getDataHeader(String filename) { - return "# " + filename + return "# " + + filename + (Settings.BUILD_FOR_COMPARE ? "" : "\n" + generateDateLine()) - + "\n# © " + Default.getYear() + " Unicode®, Inc." + + "\n# © " + + Default.getYear() + + " Unicode®, Inc." + "\n# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries." 
- + "\n# For terms of use, see https://www.unicode.org/terms_of_use.html" - ; + + "\n# For terms of use, see https://www.unicode.org/terms_of_use.html"; } - public static String getBaseDataHeaderWithVersionText(String filename, int trNumber, String title, String versionText) { + public static String getBaseDataHeaderWithVersionText( + String filename, int trNumber, String title, String versionText) { if (!filename.endsWith(".txt")) { filename = filename + ".txt"; } return getDataHeader(filename) + "\n#" - + "\n# " + title + " for UTS #" + trNumber - + "\n# " + versionText + + "\n# " + + title + + " for UTS #" + + trNumber + + "\n# " + + versionText + "\n#" - + "\n# For documentation and usage, see https://www.unicode.org/reports/tr" + trNumber + + "\n# For documentation and usage, see https://www.unicode.org/reports/tr" + + trNumber + "\n#"; } - public static String getBaseDataHeader(String filename, int trNumber, String title, String version) { + + public static String getBaseDataHeader( + String filename, int trNumber, String title, String version) { String versionText = "Version: " + version; return getBaseDataHeaderWithVersionText(filename, trNumber, title, versionText); } diff --git a/unicodetools/src/main/java/org/unicode/text/utility/UtilityBase.java b/unicodetools/src/main/java/org/unicode/text/utility/UtilityBase.java index f8835a2c7..52db08e95 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/UtilityBase.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/UtilityBase.java @@ -1,13 +1,12 @@ package org.unicode.text.utility; +import com.ibm.icu.text.UTF16; import org.unicode.text.UCD.Default; import org.unicode.text.UCD.DerivedProperty; import org.unicode.text.UCD.UCDProperty; import org.unicode.text.UCD.UCD_Types; -import com.ibm.icu.text.UTF16; - -public class UtilityBase implements UCD_Types { +public class UtilityBase implements UCD_Types { public static String getDisplay(int cp) { String result = UTF16.valueOf(cp); @@ -28,16 
+27,16 @@ public static String getDisplay(int cp) { } static UCDProperty defaultIgnorable = null; - - public static final String HTML_HEAD = "\n" - + "\n" - + "\n" - + "\n" - + "\n"; + public static final String HTML_HEAD = + "\n" + + "\n" + + "\n" + + "\n" + + "\n"; } diff --git a/unicodetools/src/main/java/org/unicode/text/utility/XMLParse.java b/unicodetools/src/main/java/org/unicode/text/utility/XMLParse.java index 222c74b33..72c6963f0 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/XMLParse.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/XMLParse.java @@ -1,83 +1,74 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/XMLParse.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/XMLParse.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; /** - * Very dumb XML parser, designed for restricted environment where transmitter is guaranteed - * to limit types of XML files generated. + * Very dumb XML parser, designed for restricted environment where transmitter is guaranteed to + * limit types of XML files generated. * - * RESTRICTIONS - * Requires document to be well-formed. Doesn't properly signal errors if it is not. - * No DTDs, !DOCTYPE, !ATTLIST, !ELEMENT, ![, !NOTATION, !ENTITY, CDATA - * No processing instructions - * Does do character references, lt, gt, amp, apos, quot - * The encoding is specified by the user, by using the right Reader - * On creation, you supply a buffer for the textual elements. Use a buffer that is as large - * as the largest possible piece of text (e.g. attribute value or element text) in the file. + *

RESTRICTIONS Requires document to be well-formed. Doesn't properly signal errors if it is not. + * No DTDs, !DOCTYPE, !ATTLIST, !ELEMENT, ![, !NOTATION, !ENTITY, CDATA No processing instructions + * Does do character references, lt, gt, amp, apos, quot The encoding is specified by the user, by + * using the right Reader On creation, you supply a buffer for the textual elements. Use a buffer + * that is as large as the largest possible piece of text (e.g. attribute value or element text) in + * the file. * * @author Mark Davis */ +import com.ibm.icu.text.UTF16; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.Reader; -import com.ibm.icu.text.UTF16; - public final class XMLParse implements XMLParseTypes { - /** Create a parser. - */ + /** Create a parser. */ public XMLParse(Reader stream, char[] buffer) { this.stream = stream; this.buffer = buffer; } - /** Create a parser. - */ + /** Create a parser. */ public XMLParse(String fileName, char[] buffer) throws FileNotFoundException { - stream = new BufferedReader(new FileReader(fileName),32*1024); + stream = new BufferedReader(new FileReader(fileName), 32 * 1024); this.buffer = buffer; } - /** Get the textual value associated with this item. - * Only valid for ELEMENT_TAG*, ATTRIBUTE*, TEXT. + /** + * Get the textual value associated with this item. Only valid for ELEMENT_TAG*, ATTRIBUTE*, + * TEXT. */ public String getValue() { return String.valueOf(buffer, 0, bufferCount); } - /** Get length of the textual value associated with this item. - * Only valid for ELEMENT_TAG*, ATTRIBUTE*, TEXT. + /** + * Get length of the textual value associated with this item. Only valid for ELEMENT_TAG*, + * ATTRIBUTE*, TEXT. */ public int getValueCount() { return bufferCount; } - /** Get the buffer that was passed in on creation. - */ + /** Get the buffer that was passed in on creation. 
*/ public char[] getValueArray() { return buffer; } - /** Get the "kind" of the last item (see XMLParseTypes) - */ + /** Get the "kind" of the last item (see XMLParseTypes) */ public int getKind() { return kind; } - /** Get the next element, returning a "Kind" (see XMLParseTypes) - */ - + /** Get the next element, returning a "Kind" (see XMLParseTypes) */ public byte next() { char c = '\u0000'; @@ -99,43 +90,55 @@ public byte next() { // can be classed as IDENTIFIER switch (c) { - case ' ': case '\r': case '\n': case '\t': - type = ' '; - break; - case '<': case '>': case '#': case ';': case '/': case '\'': case '"': - case '=': case '?': case '!': case '-': - type = c; - break; - case '&': // CR, either numerical or lt, gt, quot, amp, apos - - // gather characters - - int crCount = 0; - while (true) { - c = (char) stream.read(); - if (c == ';') { - break; + case ' ': + case '\r': + case '\n': + case '\t': + type = ' '; + break; + case '<': + case '>': + case '#': + case ';': + case '/': + case '\'': + case '"': + case '=': + case '?': + case '!': + case '-': + type = c; + break; + case '&': // CR, either numerical or lt, gt, quot, amp, apos + + // gather characters + + int crCount = 0; + while (true) { + c = (char) stream.read(); + if (c == ';') { + break; + } + crBuffer[crCount++] = c; } - crBuffer[crCount++] = c; - } - // parse it, and break into two pieces if necessary + // parse it, and break into two pieces if necessary - int x = parseCR(crBuffer, crCount); - c = (char)x; - if (x > 0xFFFF) { // Supplementary - x -= 0x10000; - c = (char) (0xD800 + (x >> 10)); - bufferChar = (char) (0xDC00 + (x & 0x3FF)); - } + int x = parseCR(crBuffer, crCount); + c = (char) x; + if (x > 0xFFFF) { // Supplementary + x -= 0x10000; + c = (char) (0xD800 + (x >> 10)); + bufferChar = (char) (0xDC00 + (x & 0x3FF)); + } - // Since we assume validity, any CRs are not syntax characters + // Since we assume validity, any CRs are not syntax characters - type = IDENTIFIER; // everything 
else - break; - default: - type = IDENTIFIER; // everything else - break; + type = IDENTIFIER; // everything else + break; + default: + type = IDENTIFIER; // everything else + break; } } catch (final Exception e) { c = '\uFFFF'; @@ -147,154 +150,154 @@ public byte next() { System.out.println(c + ", " + type + ", " + stateNames[state]); } switch (state) { - case IN_TEXT: - if (type == '<') { - state = START_ELEMENT; - if (bufferCount != 0) { - kind = TEXT; - return kind; + case IN_TEXT: + if (type == '<') { + state = START_ELEMENT; + if (bufferCount != 0) { + kind = TEXT; + return kind; + } + break; } + buffer[bufferCount++] = c; break; - } - buffer[bufferCount++] = c; - break; - case START_ELEMENT: // must be either '/' or more than one ID char - bufferCount = 0; - switch (type) { - case '/': - elementType = ELEMENT_TAG_SLASH; - state = IN_ELEMENT; + case START_ELEMENT: // must be either '/' or more than one ID char + bufferCount = 0; + switch (type) { + case '/': + elementType = ELEMENT_TAG_SLASH; + state = IN_ELEMENT; + break; + case '!': + buffer[bufferCount++] = c; + elementType = ELEMENT_TAG_COMMENT; + state = IN_COMMENT; + break; + case '?': + elementType = ELEMENT_TAG_QUESTION; + state = IN_ELEMENT; + break; + default: + elementType = ELEMENT_TAG; + buffer[bufferCount++] = c; + state = IN_ELEMENT; + break; + } break; - case '!': + case IN_COMMENT: buffer[bufferCount++] = c; - elementType = ELEMENT_TAG_COMMENT; - state = IN_COMMENT; + if (type == '-') { + state = IN_COMMENT2; + } else { + state = IN_COMMENT; + } break; - case '?': - elementType = ELEMENT_TAG_QUESTION; - state = IN_ELEMENT; + case IN_COMMENT2: + buffer[bufferCount++] = c; + if (type == '-') { + state = IN_COMMENT3; + } else { + state = IN_COMMENT; + } break; - default: - elementType = ELEMENT_TAG; + case IN_COMMENT3: + if (type == '>') { + kind = ELEMENT_TAG_COMMENT; + bufferChar = c; + state = IN_ATTRIBUTES; + elementType = END_ELEMENT_COMMENT; + return kind; + } else if (type != '-') { + 
state = IN_COMMENT; + } buffer[bufferCount++] = c; - state = IN_ELEMENT; break; - } - break; - case IN_COMMENT: - buffer[bufferCount++] = c; - if (type == '-') { - state = IN_COMMENT2; - } else { - state = IN_COMMENT; - } - break; - case IN_COMMENT2: - buffer[bufferCount++] = c; - if (type == '-') { - state = IN_COMMENT3; - } else { - state = IN_COMMENT; - } - break; - case IN_COMMENT3: - if (type == '>') { - kind = ELEMENT_TAG_COMMENT; - bufferChar = c; - state = IN_ATTRIBUTES; - elementType = END_ELEMENT_COMMENT; - return kind; - } else if (type != '-') { - state = IN_COMMENT; - } - buffer[bufferCount++] = c; - break; - case IN_ELEMENT: - if (type != IDENTIFIER) { - state = IN_ATTRIBUTES; - kind = elementType; - elementType = END_ELEMENT; - bufferChar = c; - return kind; - } - buffer[bufferCount++] = c; - break; - case IN_ATTRIBUTES: - bufferCount = 0; - if (type == '/') { - elementType = END_ELEMENT_SLASH; - } else if (type == '?') { - elementType = END_ELEMENT_QUESTION; - } else if (type == '>') { - state = IN_TEXT; - kind = elementType; - return kind; - } else if (type == IDENTIFIER) { - state = IN_ATTR; + case IN_ELEMENT: + if (type != IDENTIFIER) { + state = IN_ATTRIBUTES; + kind = elementType; + elementType = END_ELEMENT; + bufferChar = c; + return kind; + } buffer[bufferCount++] = c; break; - } - break; - case IN_ATTR: - if (type != IDENTIFIER) { - state = START_VALUE; - kind = ATTRIBUTE_TAG; - return kind; - } - buffer[bufferCount++] = c; - break; - case START_VALUE: // must have * = ( ' | " ) - if (type == '\'' || type == '"') { - lastQuote = c; - state = IN_VALUE; + case IN_ATTRIBUTES: bufferCount = 0; - } - break; - case IN_VALUE: // only terminated by lastQuote - if (type == lastQuote) { - state = IN_ATTRIBUTES; - kind = ATTRIBUTE_VALUE; - return kind; - } - buffer[bufferCount++] = c; - break; + if (type == '/') { + elementType = END_ELEMENT_SLASH; + } else if (type == '?') { + elementType = END_ELEMENT_QUESTION; + } else if (type == '>') { + state = 
IN_TEXT; + kind = elementType; + return kind; + } else if (type == IDENTIFIER) { + state = IN_ATTR; + buffer[bufferCount++] = c; + break; + } + break; + case IN_ATTR: + if (type != IDENTIFIER) { + state = START_VALUE; + kind = ATTRIBUTE_TAG; + return kind; + } + buffer[bufferCount++] = c; + break; + case START_VALUE: // must have * = ( ' | " ) + if (type == '\'' || type == '"') { + lastQuote = c; + state = IN_VALUE; + bufferCount = 0; + } + break; + case IN_VALUE: // only terminated by lastQuote + if (type == lastQuote) { + state = IN_ATTRIBUTES; + kind = ATTRIBUTE_VALUE; + return kind; + } + buffer[bufferCount++] = c; + break; } } return DONE; } - /** Utility for doing XML quotes. Flags control which characters are handled and how. - * (see XMLParseTypes for values) + /** + * Utility for doing XML quotes. Flags control which characters are handled and how. (see + * XMLParseTypes for values) */ - public static String quote(int c) { return quote(c, 0); } - /** Utility for doing XML quotes. Flags control which characters are handled and how. - * (see XMLParseTypes for values) + /** + * Utility for doing XML quotes. Flags control which characters are handled and how. (see + * XMLParseTypes for values) */ - public static String quote(int c, int flags) { final String result = quoteGuts(c, flags); if (result != null) { return result; } - return String.valueOf((char)c); + return String.valueOf((char) c); } - /** Utility for doing XML quotes. Flags control which characters are handled and how. - * (see XMLParseTypes for values) + /** + * Utility for doing XML quotes. Flags control which characters are handled and how. (see + * XMLParseTypes for values) */ - public static String quote(String source) { return quote(source, 0); } - /** Utility for doing XML quotes. Flags control which characters are handled and how. - * (see XMLParseTypes for values) + /** + * Utility for doing XML quotes. Flags control which characters are handled and how. 
(see + * XMLParseTypes for values) */ - public static String quote(String source, int flags) { final StringBuffer result = new StringBuffer(); String temp; @@ -307,17 +310,15 @@ public static String quote(String source, int flags) { if (temp != null) { result.append(temp); } else if (c <= 0xFFFF) { - result.append((char)c); - } - else { - result.append(source.substring(i-1,i+1)); // surrogates + result.append((char) c); + } else { + result.append(source.substring(i - 1, i + 1)); // surrogates } } return result.toString(); } - /** Parses inside of CR. buffer should not contain the initial '&', or final ';' - */ + /** Parses inside of CR. buffer should not contain the initial '&', or final ';' */ static int parseCR(char[] crBuffer, int crCount) { int c; int start = 0; @@ -325,38 +326,42 @@ static int parseCR(char[] crBuffer, int crCount) { return -1; } switch (crBuffer[start++]) { - case 'l': c = '<'; break; // lt - case 'g': c = '>'; break; // gt - case 'q': c = '"'; break; // quot - case 'a': // &, ' - if (crCount > start && crBuffer[start] == 'm') { - c = '&'; - } else { - c = '\''; - } - break; - case '#': - int radix = 10; - if (crCount > start && crBuffer[start] == 'x') { - radix = 16; - ++start; - } - // Simple code for now. Could be sped up. - c = Integer.parseInt(String.valueOf(crBuffer,start,crCount-start), radix); - break; - default: - c = -1; + case 'l': + c = '<'; + break; // lt + case 'g': + c = '>'; + break; // gt + case 'q': + c = '"'; + break; // quot + case 'a': // &, ' + if (crCount > start && crBuffer[start] == 'm') { + c = '&'; + } else { + c = '\''; + } + break; + case '#': + int radix = 10; + if (crCount > start && crBuffer[start] == 'x') { + radix = 16; + ++start; + } + // Simple code for now. Could be sped up. 
+ c = Integer.parseInt(String.valueOf(crBuffer, start, crCount - start), radix); + break; + default: + c = -1; } return c; } - /** Utility for doing hex, padding with zeros - */ - - static public String hex(long i, int places) { + /** Utility for doing hex, padding with zeros */ + public static String hex(long i, int places) { String result = Long.toString(i, 16).toUpperCase(); if (result.length() < places) { - result = "0000000000000000".substring(result.length(),places) + result; + result = "0000000000000000".substring(result.length(), places) + result; } return result; } @@ -377,73 +382,133 @@ static public String hex(long i, int places) { private char lastQuote; private char bufferChar; - private static final byte IN_TEXT = 0, START_ELEMENT = 1, IN_ELEMENT = 2, - IN_ATTR = 3, START_VALUE = 4, IN_VALUE = 5, IN_ATTRIBUTES = 6, - IN_COMMENT = 7, IN_COMMENT2 = 8, IN_COMMENT3 = 9; - - private static final String[] stateNames = {"IN_TEXT", "START_ELEMENT", "IN_ELEMENT", - "IN_ATTR", "START_VALUE", "IN_VALUE", "IN_ATTRIBUTES", - "IN_COMMENT", "IN_COMMENT2", "IN_COMMENT3"}; + private static final byte IN_TEXT = 0, + START_ELEMENT = 1, + IN_ELEMENT = 2, + IN_ATTR = 3, + START_VALUE = 4, + IN_VALUE = 5, + IN_ATTRIBUTES = 6, + IN_COMMENT = 7, + IN_COMMENT2 = 8, + IN_COMMENT3 = 9; + + private static final String[] stateNames = { + "IN_TEXT", + "START_ELEMENT", + "IN_ELEMENT", + "IN_ATTR", + "START_VALUE", + "IN_VALUE", + "IN_ATTRIBUTES", + "IN_COMMENT", + "IN_COMMENT2", + "IN_COMMENT3" + }; private static final char IDENTIFIER = 'a'; - private static String quoteGuts(int c, int flags) { String prefix = "&"; switch (c) { - case '<': return "<"; - case '>': return ">"; - case '&': return "&"; - case '\'': return "'"; - case '"': return """; - - // Optionally fix TAB, CR, LF - - case 0x09: case 0x0A: case 0x0D: - if ((flags & QUOTE_TABCRLF) == 0) { - return null; - } - break; - - // Fix controls, non-characters, since XML can't handle - - case 0x00: case 0x01: case 0x02: case 
0x03: case 0x04: case 0x05: case 0x06: case 0x07: - case 0x08: case 0x0B: case 0x0C: case 0x0E: case 0x0F: - case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17: - case 0x18: case 0x19: case 0x1A: case 0x1B: case 0x1C: case 0x1D: case 0x1E: case 0x1F: - case 0x7F: - case 0xFFFE: case 0xFFFF: - prefix = ""; - break; - - // Optionally fix IE Bug characters + case '<': + return "<"; + case '>': + return ">"; + case '&': + return "&"; + case '\'': + return "'"; + case '"': + return """; + + // Optionally fix TAB, CR, LF + + case 0x09: + case 0x0A: + case 0x0D: + if ((flags & QUOTE_TABCRLF) == 0) { + return null; + } + break; - case 0xFF00: case 0xFF01: case 0xFF02: case 0xFF03: case 0xFF04: case 0xFF05: case 0xFF06: case 0xFF07: - case 0xFFF8: case 0xFFF9: case 0xFFFA: case 0xFFFB: case 0xFFFC: case 0xFFFD: - if ((flags & QUOTE_IEBUG) == 0) { - return null; - } - prefix = ""; - break; + // Fix controls, non-characters, since XML can't handle + + case 0x00: + case 0x01: + case 0x02: + case 0x03: + case 0x04: + case 0x05: + case 0x06: + case 0x07: + case 0x08: + case 0x0B: + case 0x0C: + case 0x0E: + case 0x0F: + case 0x10: + case 0x11: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + case 0x16: + case 0x17: + case 0x18: + case 0x19: + case 0x1A: + case 0x1B: + case 0x1C: + case 0x1D: + case 0x1E: + case 0x1F: + case 0x7F: + case 0xFFFE: + case 0xFFFF: + prefix = ""; + break; - default: - if (c <= 0x7E) { // don't quote other ASCII - if ((flags & QUOTE_ASCII) == 0) { + // Optionally fix IE Bug characters + + case 0xFF00: + case 0xFF01: + case 0xFF02: + case 0xFF03: + case 0xFF04: + case 0xFF05: + case 0xFF06: + case 0xFF07: + case 0xFFF8: + case 0xFFF9: + case 0xFFFA: + case 0xFFFB: + case 0xFFFC: + case 0xFFFD: + if ((flags & QUOTE_IEBUG) == 0) { return null; } - } else if (0xD800 <= c && c <= 0xDFFF) {// fix surrogates, since XML can't handle - prefix = ""; - } else if (c > 0xFFFF && (flags & QUOTE_IEBUG) != 0) { prefix = ""; - } 
else if ((flags & QUOTE_NON_ASCII) == 0) { - return null; - } - break; + break; + + default: + if (c <= 0x7E) { // don't quote other ASCII + if ((flags & QUOTE_ASCII) == 0) { + return null; + } + } else if (0xD800 <= c && c <= 0xDFFF) { // fix surrogates, since XML can't handle + prefix = ""; + } else if (c > 0xFFFF && (flags & QUOTE_IEBUG) != 0) { + prefix = ""; + } else if ((flags & QUOTE_NON_ASCII) == 0) { + return null; + } + break; } if ((flags & QUOTE_DECIMAL) == 0) { - return prefix + "#x" + hex(c,1) + ";"; + return prefix + "#x" + hex(c, 1) + ";"; } else { return prefix + "#" + Integer.toString(c) + ";"; } } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/XMLParseTypes.java b/unicodetools/src/main/java/org/unicode/text/utility/XMLParseTypes.java index d52077695..e8d365af5 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/XMLParseTypes.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/XMLParseTypes.java @@ -1,48 +1,55 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/XMLParseTypes.java,v $ - * $Date: 2007-02-11 08:15:09 $ - * $Revision: 1.3 $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/XMLParseTypes.java,v $ $Date: + * 2007-02-11 08:15:09 $ $Revision: 1.3 $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; -/** Interface of values for use with XMLParse. - * Others classes can "implements" this also, to avoid typing XMLParseTypes.XXX +/** + * Interface of values for use with XMLParse. Others classes can "implements" this also, to avoid + * typing XMLParseTypes.XXX */ public interface XMLParseTypes { - /** Kind values, for XMLParse.getKind(), next() - */ - public static final byte - DONE = 0, - ELEMENT_TAG = 1, ELEMENT_TAG_SLASH = 2, ELEMENT_TAG_COMMENT = 3, ELEMENT_TAG_QUESTION = 4, - END_ELEMENT = 5, END_ELEMENT_SLASH = 6, END_ELEMENT_COMMENT = 7, END_ELEMENT_QUESTION = 8, - ATTRIBUTE_TAG = 9, ATTRIBUTE_VALUE = 10, - TEXT = 11; + /** Kind values, for XMLParse.getKind(), next() */ + public static final byte DONE = 0, + ELEMENT_TAG = 1, + ELEMENT_TAG_SLASH = 2, + ELEMENT_TAG_COMMENT = 3, + ELEMENT_TAG_QUESTION = 4, + END_ELEMENT = 5, + END_ELEMENT_SLASH = 6, + END_ELEMENT_COMMENT = 7, + END_ELEMENT_QUESTION = 8, + ATTRIBUTE_TAG = 9, + ATTRIBUTE_VALUE = 10, + TEXT = 11; - /** Flag masks for XMLParse.quote(x, flags). Use '|' to combine - */ - public static final byte - QUOTE_NON_ASCII = 1, - QUOTE_ASCII = 2, - QUOTE_IEBUG = 4, - QUOTE_TABCRLF = 8, - QUOTE_DECIMAL = 16; + /** Flag masks for XMLParse.quote(x, flags). 
Use '|' to combine */ + public static final byte QUOTE_NON_ASCII = 1, + QUOTE_ASCII = 2, + QUOTE_IEBUG = 4, + QUOTE_TABCRLF = 8, + QUOTE_DECIMAL = 16; - /** For Debugging - */ + /** For Debugging */ static final String[] kindNames = { "DONE", - "ELEMENT_TAG", "ELEMENT_TAG_SLASH", "ELEMENT_TAG_COMMENT", "ELEMENT_TAG_QUESTION", - "END_ELEMENT", "END_ELEMENT_SLASH", "END_ELEMENT_COMMENT", "END_ELEMENT_QUESTION", - "ATTRIBUTE_TAG", "ATTRIBUTE_VALUE", + "ELEMENT_TAG", + "ELEMENT_TAG_SLASH", + "ELEMENT_TAG_COMMENT", + "ELEMENT_TAG_QUESTION", + "END_ELEMENT", + "END_ELEMENT_SLASH", + "END_ELEMENT_COMMENT", + "END_ELEMENT_QUESTION", + "ATTRIBUTE_TAG", + "ATTRIBUTE_VALUE", "TEXT", }; -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/text/utility/testParser.java b/unicodetools/src/main/java/org/unicode/text/utility/testParser.java index 271557a7e..1c73254d3 100644 --- a/unicodetools/src/main/java/org/unicode/text/utility/testParser.java +++ b/unicodetools/src/main/java/org/unicode/text/utility/testParser.java @@ -1,18 +1,16 @@ /** - ******************************************************************************* - * Copyright (C) 1996-2001, International Business Machines Corporation and * - * others. All Rights Reserved. * - ******************************************************************************* + * ****************************************************************************** Copyright (C) + * 1996-2001, International Business Machines Corporation and * others. All Rights Reserved. * + * ****************************************************************************** * - * $Source: /home/cvsroot/unicodetools/org/unicode/text/utility/testParser.java,v $ + *

$Source: /home/cvsroot/unicodetools/org/unicode/text/utility/testParser.java,v $ * - ******************************************************************************* + *

****************************************************************************** */ - package org.unicode.text.utility; -/** Simple Test program for XMLParse - */ +/** Simple Test program for XMLParse */ +import com.ibm.icu.text.UTF16; import java.io.BufferedWriter; import java.io.FileOutputStream; import java.io.OutputStreamWriter; @@ -20,19 +18,17 @@ import java.util.Map; import java.util.TreeMap; -import com.ibm.icu.text.UTF16; - - public class testParser implements XMLParseTypes { // TODO: There is no such UNIDATA folder. Is this class obsolete? public static final String BASE_DIR = Settings.UnicodeTools.UCD_DIR + "/UNIDATA 3.0.1/"; public static final boolean VERBOSE = false; - private static final String testFile = BASE_DIR + "UCD-Main.xml"; // "test.xml"; // BASE_DIR + "UCD-Main.xml"; + private static final String testFile = + BASE_DIR + "UCD-Main.xml"; // "test.xml"; // BASE_DIR + "UCD-Main.xml"; - public static void main (String[] args) throws Exception { - //test1(); - //test2(); + public static void main(String[] args) throws Exception { + // test1(); + // test2(); test3(); } @@ -44,28 +40,52 @@ public static void test1() throws Exception { break; } final String value = xml.getValue(); - final int quoteFlags = QUOTE_IEBUG | QUOTE_NON_ASCII | (kind != TEXT ? QUOTE_TABCRLF : 0); + final int quoteFlags = + QUOTE_IEBUG | QUOTE_NON_ASCII | (kind != TEXT ? 
QUOTE_TABCRLF : 0); final String qValue = XMLParse.quote(value, quoteFlags); if (VERBOSE) { System.out.println(kindNames[kind] + ", \"" + value + "\", \"" + qValue + "\""); } else { switch (kind) { - case ELEMENT_TAG: System.out.print('<' + qValue); break; - case ELEMENT_TAG_SLASH: System.out.print(""); break; - case END_ELEMENT_COMMENT: System.out.print(">"); break; - case END_ELEMENT_SLASH: System.out.print("/>"); break; - case END_ELEMENT_QUESTION: System.out.print("?>"); break; + case END_ELEMENT: + System.out.print(">"); + break; + case END_ELEMENT_COMMENT: + System.out.print(">"); + break; + case END_ELEMENT_SLASH: + System.out.print("/>"); + break; + case END_ELEMENT_QUESTION: + System.out.print("?>"); + break; - case ATTRIBUTE_TAG: System.out.print(" " + qValue + "="); break; - case ATTRIBUTE_VALUE: System.out.print("\"" + qValue + "\""); break; + case ATTRIBUTE_TAG: + System.out.print(" " + qValue + "="); + break; + case ATTRIBUTE_VALUE: + System.out.print("\"" + qValue + "\""); + break; - case TEXT: System.out.print(qValue); break; + case TEXT: + System.out.print(qValue); + break; - default: throw new Exception("Unknown KIND"); + default: + throw new Exception("Unknown KIND"); } } } @@ -75,27 +95,31 @@ public static void test1() throws Exception { static void test2() throws Exception { - final PrintWriter log = Utility.openPrintWriterGenDir("log/UCD-Extract.html", Utility.UTF8_WINDOWS); + final PrintWriter log = + Utility.openPrintWriterGenDir("log/UCD-Extract.html", Utility.UTF8_WINDOWS); - //int fieldCount = 4; - //int width = 100/fieldCount; - //int first = width + 100 - width*fieldCount; + // int fieldCount = 4; + // int width = 100/fieldCount; + // int first = width + 100 - width*fieldCount; try { - log.println(""); + log.println( + ""); log.println(""); log.println("Extract from UCD"); log.println(""); - final String tableHead = "" - + "" - + "" - + "" - + "" - + ""; + final String tableHead = + "
CodeCharGCPropsName
" + + "" + + "" + + "" + + "" + + ""; log.println(tableHead); final XMLParse xml = new XMLParse(BASE_DIR + "UCD-Main.xml", new char[1000]); @@ -112,72 +136,79 @@ static void test2() throws Exception { } final String value = xml.getValue(); switch (kind) { - case ELEMENT_TAG: - recordingChar = value.equals("e"); - break; - - case ATTRIBUTE_TAG: - if (!recordingChar) { + case ELEMENT_TAG: + recordingChar = value.equals("e"); break; - } - lastTag = value; - break; - case ATTRIBUTE_VALUE: - if (!recordingChar) { + case ATTRIBUTE_TAG: + if (!recordingChar) { + break; + } + lastTag = value; break; - } - data.put(lastTag, value); - break; - case END_ELEMENT: - case END_ELEMENT_SLASH: - if (!recordingChar) { + case ATTRIBUTE_VALUE: + if (!recordingChar) { + break; + } + data.put(lastTag, value); break; - } - recordingChar = false; - // get data - - String ch = (String)data.get("c"); - ch = fixHack(ch); - String name = (String)data.get("n"); - if (name == null) { - name = ""; - } - String props = (String)data.get("xs"); - if (props == null) { - props = "\u00A0"; - } - String gc = (String)data.get("gc"); - if (gc == null) { - gc = "Lo"; - } - - // split tables - final int code = UTF16.charAt(ch, 0); - if ((topByte & ~0x1F) != (code & ~0x1F)) { - log.println("
CodeCharGCPropsName

"); - log.println(tableHead); - topByte = code; - if ((printByte & ~0xFF) != (code & ~0xFF)) { - System.out.println("Printing table for " + XMLParse.hex(topByte,2)); - printByte = code; + case END_ELEMENT: + case END_ELEMENT_SLASH: + if (!recordingChar) { + break; } - } + recordingChar = false; - // draw line + // get data - log.println("

" + XMLParse.hex(code,4) + - "" + XMLParse.quote(ch,NORMAL_QUOTE) + - "" + XMLParse.quote(gc,NORMAL_QUOTE) + - "" + XMLParse.quote(props,NORMAL_QUOTE) + - "" + XMLParse.quote(name,NORMAL_QUOTE) + "

"); + log.println(tableHead); + topByte = code; + if ((printByte & ~0xFF) != (code & ~0xFF)) { + System.out.println( + "Printing table for " + XMLParse.hex(topByte, 2)); + printByte = code; + } + } + // draw line + + log.println( + "
" + + XMLParse.hex(code, 4) + + "" + + XMLParse.quote(ch, NORMAL_QUOTE) + + "" + + XMLParse.quote(gc, NORMAL_QUOTE) + + "" + + XMLParse.quote(props, NORMAL_QUOTE) + + "" + + XMLParse.quote(name, NORMAL_QUOTE) + + "
"); @@ -187,30 +218,32 @@ static void test2() throws Exception { } static void test3() throws Exception { - final PrintWriter log = new PrintWriter(new BufferedWriter( - new OutputStreamWriter( - new FileOutputStream(BASE_DIR + "CaseFoldingDraft3.txt"), - "UTF8"), - 32*1024)); + final PrintWriter log = + new PrintWriter( + new BufferedWriter( + new OutputStreamWriter( + new FileOutputStream(BASE_DIR + "CaseFoldingDraft3.txt"), + "UTF8"), + 32 * 1024)); try { collect(log, "Other_Math"); - collect (log, "Other_Alphabetic"); - collect (log, "Other_Composite"); - //int fieldCount = 4; - //int width = 100/fieldCount; - //int first = width + 100 - width*fieldCount; + collect(log, "Other_Alphabetic"); + collect(log, "Other_Composite"); + // int fieldCount = 4; + // int width = 100/fieldCount; + // int first = width + 100 - width*fieldCount; } finally { log.close(); } } - static final void collect(PrintWriter log, String prop) throws Exception { + static final void collect(PrintWriter log, String prop) throws Exception { final XMLParse xml = new XMLParse(BASE_DIR + "UCD-Main.xml", new char[1000]); - //boolean recordingChar = false; - //int topByte = 0; - //int printByte = 0; - //Map data = new TreeMap(); + // boolean recordingChar = false; + // int topByte = 0; + // int printByte = 0; + // Map data = new TreeMap(); String lastTag = ""; String lastChar = ""; String lastName = ""; @@ -230,67 +263,83 @@ static final void collect(PrintWriter log, String prop) throws Exception { } final String value = xml.getValue(); switch (kind) { - case ATTRIBUTE_TAG: - lastTag = value; - break; + case ATTRIBUTE_TAG: + lastTag = value; + break; - case ATTRIBUTE_VALUE: - if (lastTag.equals("c")) { - lastChar = value; - } else if (lastTag.equals("n")) { - lastName = value; - } else if (lastTag.equals("gc")) { - lastCat = value; - } else if (lastTag.equals("xs") && value.indexOf(prop) >= 0) { - lastChar = fixHack(lastChar); - final int ch = UTF16.charAt(lastChar, 0); - if (ch == endChar + 1) { 
- endChar = ch; - } else { - //FDD0; FDEF; Noncharacter_Code_Point; # XX; 32; - if (endChar >= 0) { - log.println(Utility.hex(startChar, 4) + "; " - + (endChar == startChar ? " " : Utility.hex(endChar, 4)) - + "; " + prop - + "; # " + startCat - + "; " + (endChar-startChar+1) - + "; " + startName - + (endChar == startChar ? "" : "...")); + case ATTRIBUTE_VALUE: + if (lastTag.equals("c")) { + lastChar = value; + } else if (lastTag.equals("n")) { + lastName = value; + } else if (lastTag.equals("gc")) { + lastCat = value; + } else if (lastTag.equals("xs") && value.indexOf(prop) >= 0) { + lastChar = fixHack(lastChar); + final int ch = UTF16.charAt(lastChar, 0); + if (ch == endChar + 1) { + endChar = ch; + } else { + // FDD0; FDEF; Noncharacter_Code_Point; # XX; 32; + if (endChar >= 0) { + log.println( + Utility.hex(startChar, 4) + + "; " + + (endChar == startChar + ? " " + : Utility.hex(endChar, 4)) + + "; " + + prop + + "; # " + + startCat + + "; " + + (endChar - startChar + 1) + + "; " + + startName + + (endChar == startChar ? "" : "...")); + } + startChar = endChar = ch; + startName = lastName; + startCat = lastCat; } - startChar = endChar = ch; - startName = lastName; - startCat = lastCat; } - } - break; + break; } } if (endChar >= 0) { - log.println(Utility.hex(startChar, 4) + "; " - + (endChar == startChar ? " " : Utility.hex(endChar, 4)) - + "; " + prop - + "; # " + startCat - + "; " + (endChar-startChar+1) - + "; " + startName - + (endChar == startChar ? "" : "...")); + log.println( + Utility.hex(startChar, 4) + + "; " + + (endChar == startChar ? " " : Utility.hex(endChar, 4)) + + "; " + + prop + + "; # " + + startCat + + "; " + + (endChar - startChar + 1) + + "; " + + startName + + (endChar == startChar ? 
"" : "...")); } } static void test4() throws Exception { - final PrintWriter log = new PrintWriter(new BufferedWriter( - new OutputStreamWriter( - new FileOutputStream(BASE_DIR + "CaseFoldingDraft3.txt"), - "UTF8"), - 32*1024)); - - //int fieldCount = 4; - //int width = 100/fieldCount; - //int first = width + 100 - width*fieldCount; + final PrintWriter log = + new PrintWriter( + new BufferedWriter( + new OutputStreamWriter( + new FileOutputStream(BASE_DIR + "CaseFoldingDraft3.txt"), + "UTF8"), + 32 * 1024)); + + // int fieldCount = 4; + // int width = 100/fieldCount; + // int first = width + 100 - width*fieldCount; try { final XMLParse xml = new XMLParse(BASE_DIR + "UCD-Main.xml", new char[1000]); boolean recordingChar = false; - //int topByte = 0; - //int printByte = 0; + // int topByte = 0; + // int printByte = 0; final Map data = new TreeMap(); String lastTag = ""; @@ -304,71 +353,85 @@ static void test4() throws Exception { } final String value = xml.getValue(); switch (kind) { - case ELEMENT_TAG: - recordingChar = value.equals("e"); - break; - - case ATTRIBUTE_TAG: - if (!recordingChar) { + case ELEMENT_TAG: + recordingChar = value.equals("e"); break; - } - lastTag = value; - break; - case ATTRIBUTE_VALUE: - if (!recordingChar) { + case ATTRIBUTE_TAG: + if (!recordingChar) { + break; + } + lastTag = value; break; - } - data.put(lastTag, value); - break; - case END_ELEMENT: - case END_ELEMENT_SLASH: - if (!recordingChar) { + case ATTRIBUTE_VALUE: + if (!recordingChar) { + break; + } + data.put(lastTag, value); break; - } - recordingChar = false; - // get data + case END_ELEMENT: + case END_ELEMENT_SLASH: + if (!recordingChar) { + break; + } + recordingChar = false; - String ch = (String)data.get("c"); - ch = fixHack(ch); + // get data - String name = (String)data.get("n"); - if (name == null) { - name = ""; - } + String ch = (String) data.get("c"); + ch = fixHack(ch); - String lc = (String)data.get("lc"); - if (lc == null) { - lc = ch; - } + String name = 
(String) data.get("n"); + if (name == null) { + name = ""; + } - String fc = (String)data.get("fc"); - if (fc == null) { - fc = (String)data.get("sl"); - } - if (fc == null) { - fc = lc; - } + String lc = (String) data.get("lc"); + if (lc == null) { + lc = ch; + } - if (fc.equals(ch)) { - continue; - } + String fc = (String) data.get("fc"); + if (fc == null) { + fc = (String) data.get("sl"); + } + if (fc == null) { + fc = lc; + } - if (fc.length() == 1) { - log.println(Utility.hex(ch, " ") + "; C; " + Utility.hex(fc, " ") + "; # " + name); - } else { - log.println(Utility.hex(ch, " ") + "; F; " + Utility.hex(fc, " ") + "; # " + name); - if (!lc.equals(ch)) { - log.println(Utility.hex(ch, " ") + "; S; " + Utility.hex(lc, " ") + "; # " + name); + if (fc.equals(ch)) { + continue; } - } - // clear storage - data.clear(); - break; + if (fc.length() == 1) { + log.println( + Utility.hex(ch, " ") + + "; C; " + + Utility.hex(fc, " ") + + "; # " + + name); + } else { + log.println( + Utility.hex(ch, " ") + + "; F; " + + Utility.hex(fc, " ") + + "; # " + + name); + if (!lc.equals(ch)) { + log.println( + Utility.hex(ch, " ") + + "; S; " + + Utility.hex(lc, " ") + + "; # " + + name); + } + } + // clear storage + data.clear(); + break; } } } finally { @@ -384,14 +447,14 @@ static final String fixHack(String s) { final char c = s.charAt(i); if (position > 0) { if (c == ';') { - final int x = Integer.parseInt(s.substring(position,i),16); + final int x = Integer.parseInt(s.substring(position, i), 16); result.append(UTF32.valueOf32(x)); position = -1; } } else { if (last == '#' && c == 'x') { - result.setLength(result.length()-1); // remove '#' - position = i+1; + result.setLength(result.length() - 1); // remove '#' + position = i + 1; } else { result.append(c); } diff --git a/unicodetools/src/main/java/org/unicode/tools/AacCheck.java b/unicodetools/src/main/java/org/unicode/tools/AacCheck.java index 0b0b754af..2501a4d1a 100644 --- 
a/unicodetools/src/main/java/org/unicode/tools/AacCheck.java +++ b/unicodetools/src/main/java/org/unicode/tools/AacCheck.java @@ -1,13 +1,12 @@ package org.unicode.tools; -import org.unicode.tools.emoji.Emoji; -import org.unicode.tools.emoji.EmojiData; - import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSet.EntryRange; import com.ibm.icu.util.Output; +import org.unicode.tools.emoji.Emoji; +import org.unicode.tools.emoji.EmojiData; public class AacCheck { // Error messages @@ -19,8 +18,8 @@ public class AacCheck { /** * input is a list of space-delimited lists of hex values, such as:
* AacCheck 61
- * AacCheck "1F468 200D 2764" 200D 1F468 - * + * AacCheck "1F468 200D 2764" 200D 1F468 + * * @param args * @return error code, after printing message */ @@ -29,16 +28,23 @@ public static void main(String[] args) { EmojiData.EMOJI_DATA.getName("👨"); for (EntryRange range : ALLOWED.ranges()) { if (range.codepoint == range.codepointEnd) { - System.out.println(Utility.hex(range.codepoint) - + " ; " + UCharacter.getName(range.codepoint)); + System.out.println( + Utility.hex(range.codepoint) + + " ; " + + UCharacter.getName(range.codepoint)); } else { - System.out.println(Utility.hex(range.codepoint) + ".." + Utility.hex(range.codepointEnd) - + " ; " + UCharacter.getName(range.codepoint) + ".." + UCharacter.getName(range.codepointEnd)); + System.out.println( + Utility.hex(range.codepoint) + + ".." + + Utility.hex(range.codepointEnd) + + " ; " + + UCharacter.getName(range.codepoint) + + ".." + + UCharacter.getName(range.codepointEnd)); } } - for (String cps: ALLOWED.strings()) { - System.out.println(Utility.hex(cps) - + " ; " + EmojiData.EMOJI_DATA.getName(cps)); + for (String cps : ALLOWED.strings()) { + System.out.println(Utility.hex(cps) + " ; " + EmojiData.EMOJI_DATA.getName(cps)); } } else { Output message = new Output<>(); @@ -65,8 +71,9 @@ public static int process(Output messageOut, String... input) { } int cp = -1; try { - cp = Integer.parseInt(arg, 16); - } catch (Exception e) {} // fall through with -1 + cp = Integer.parseInt(arg, 16); + } catch (Exception e) { + } // fall through with -1 if (cp < 0 || cp > 0x10FFFF) { messageOut.value = ";Error: bad codepoint: " + arg; @@ -80,7 +87,7 @@ public static int process(Output messageOut, String... 
input) { } // now strip trailing VS characters // we can work with 16-bit chars because there is no overlap - for (int i = filtered.length()-1; i > 0; --i) { + for (int i = filtered.length() - 1; i > 0; --i) { if (EMOJI_VARIATION_SELECTORS.contains(filtered.charAt(i))) { filtered.setLength(i); } else { @@ -113,18 +120,21 @@ public static int process(Output messageOut, String... input) { return OK; } - private static final UnicodeSet EMOJI_VARIATION_SELECTORS = new UnicodeSet("[\uFE0F\uFE0E]") - .freeze(); + private static final UnicodeSet EMOJI_VARIATION_SELECTORS = + new UnicodeSet("[\uFE0F\uFE0E]").freeze(); - private static final UnicodeSet IGNORE = new UnicodeSet("[[:z:][:di:]]") - .remove(ZWJ) -// .removeAll(EMOJI_VARIATION_SELECTORS) - .freeze(); + private static final UnicodeSet IGNORE = + new UnicodeSet("[[:z:][:di:]]") + .remove(ZWJ) + // .removeAll(EMOJI_VARIATION_SELECTORS) + .freeze(); static final UnicodeSet ALLOWED = new UnicodeSet(); - static { - UnicodeSet temp = new UnicodeSet("[^[:c:][:z:][:di:][࿕-࿘ 卍 卐]]") - .addAll(EmojiData.EMOJI_DATA.getChars()); + + static { + UnicodeSet temp = + new UnicodeSet("[^[:c:][:z:][:di:][࿕-࿘ 卍 卐]]") + .addAll(EmojiData.EMOJI_DATA.getChars()); // remove the variation selectors for (String s : temp) { if (s.contains(Emoji.EMOJI_VARIANT_STRING)) { @@ -140,37 +150,66 @@ public static int process(Output messageOut, String... 
input) { // // keycaps // + "[{#⃣}{*⃣}{0⃣}{1⃣}{2⃣}{3⃣}{4⃣}{5⃣}{6⃣}{7⃣}{8⃣}{9⃣}]" // // flags - // + "[{🇦🇨}{🇦🇩}{🇦🇪}{🇦🇫}{🇦🇬}{🇦🇮}{🇦🇱}{🇦🇲}{🇦🇴}{🇦🇶}{🇦🇷}{🇦🇸}{🇦🇹}{🇦🇺}{🇦🇼}{🇦🇽}{🇦🇿}{🇧🇦}{🇧🇧}{🇧🇩}{🇧🇪}{🇧🇫}" - // + "{🇧🇬}{🇧🇭}{🇧🇮}{🇧🇯}{🇧🇱}{🇧🇲}{🇧🇳}{🇧🇴}{🇧🇶}{🇧🇷}{🇧🇸}{🇧🇹}{🇧🇻}{🇧🇼}{🇧🇾}{🇧🇿}{🇨🇦}{🇨🇨}{🇨🇩}{🇨🇫}{🇨🇬}{🇨🇭}" - // + "{🇨🇮}{🇨🇰}{🇨🇱}{🇨🇲}{🇨🇳}{🇨🇴}{🇨🇵}{🇨🇷}{🇨🇺}{🇨🇻}{🇨🇼}{🇨🇽}{🇨🇾}{🇨🇿}{🇩🇪}{🇩🇬}{🇩🇯}{🇩🇰}{🇩🇲}{🇩🇴}{🇩🇿}{🇪🇦}" - // + "{🇪🇨}{🇪🇪}{🇪🇬}{🇪🇭}{🇪🇷}{🇪🇸}{🇪🇹}{🇪🇺}{🇫🇮}{🇫🇯}{🇫🇰}{🇫🇲}{🇫🇴}{🇫🇷}{🇬🇦}{🇬🇧}{🇬🇩}{🇬🇪}{🇬🇫}{🇬🇬}{🇬🇭}{🇬🇮}" - // + "{🇬🇱}{🇬🇲}{🇬🇳}{🇬🇵}{🇬🇶}{🇬🇷}{🇬🇸}{🇬🇹}{🇬🇺}{🇬🇼}{🇬🇾}{🇭🇰}{🇭🇲}{🇭🇳}{🇭🇷}{🇭🇹}{🇭🇺}{🇮🇨}{🇮🇩}{🇮🇪}{🇮🇱}{🇮🇲}" - // + "{🇮🇳}{🇮🇴}{🇮🇶}{🇮🇷}{🇮🇸}{🇮🇹}{🇯🇪}{🇯🇲}{🇯🇴}{🇯🇵}{🇰🇪}{🇰🇬}{🇰🇭}{🇰🇮}{🇰🇲}{🇰🇳}{🇰🇵}{🇰🇷}{🇰🇼}{🇰🇾}{🇰🇿}{🇱🇦}" - // + "{🇱🇧}{🇱🇨}{🇱🇮}{🇱🇰}{🇱🇷}{🇱🇸}{🇱🇹}{🇱🇺}{🇱🇻}{🇱🇾}{🇲🇦}{🇲🇨}{🇲🇩}{🇲🇪}{🇲🇫}{🇲🇬}{🇲🇭}{🇲🇰}{🇲🇱}{🇲🇲}{🇲🇳}{🇲🇴}" - // + "{🇲🇵}{🇲🇶}{🇲🇷}{🇲🇸}{🇲🇹}{🇲🇺}{🇲🇻}{🇲🇼}{🇲🇽}{🇲🇾}{🇲🇿}{🇳🇦}{🇳🇨}{🇳🇪}{🇳🇫}{🇳🇬}{🇳🇮}{🇳🇱}{🇳🇴}{🇳🇵}{🇳🇷}{🇳🇺}" - // + "{🇳🇿}{🇴🇲}{🇵🇦}{🇵🇪}{🇵🇫}{🇵🇬}{🇵🇭}{🇵🇰}{🇵🇱}{🇵🇲}{🇵🇳}{🇵🇷}{🇵🇸}{🇵🇹}{🇵🇼}{🇵🇾}{🇶🇦}{🇷🇪}{🇷🇴}{🇷🇸}{🇷🇺}{🇷🇼}" - // + "{🇸🇦}{🇸🇧}{🇸🇨}{🇸🇩}{🇸🇪}{🇸🇬}{🇸🇭}{🇸🇮}{🇸🇯}{🇸🇰}{🇸🇱}{🇸🇲}{🇸🇳}{🇸🇴}{🇸🇷}{🇸🇸}{🇸🇹}{🇸🇻}{🇸🇽}{🇸🇾}{🇸🇿}{🇹🇦}" - // + "{🇹🇨}{🇹🇩}{🇹🇫}{🇹🇬}{🇹🇭}{🇹🇯}{🇹🇰}{🇹🇱}{🇹🇲}{🇹🇳}{🇹🇴}{🇹🇷}{🇹🇹}{🇹🇻}{🇹🇼}{🇹🇿}{🇺🇦}{🇺🇬}{🇺🇲}{🇺🇸}{🇺🇾}{🇺🇿}" - // + "{🇻🇦}{🇻🇨}{🇻🇪}{🇻🇬}{🇻🇮}{🇻🇳}{🇻🇺}{🇼🇫}{🇼🇸}{🇽🇰}{🇾🇪}{🇾🇹}{🇿🇦}{🇿🇲}{🇿🇼}]" + // + + // "[{🇦🇨}{🇦🇩}{🇦🇪}{🇦🇫}{🇦🇬}{🇦🇮}{🇦🇱}{🇦🇲}{🇦🇴}{🇦🇶}{🇦🇷}{🇦🇸}{🇦🇹}{🇦🇺}{🇦🇼}{🇦🇽}{🇦🇿}{🇧🇦}{🇧🇧}{🇧🇩}{🇧🇪}{🇧🇫}" + // + + // "{🇧🇬}{🇧🇭}{🇧🇮}{🇧🇯}{🇧🇱}{🇧🇲}{🇧🇳}{🇧🇴}{🇧🇶}{🇧🇷}{🇧🇸}{🇧🇹}{🇧🇻}{🇧🇼}{🇧🇾}{🇧🇿}{🇨🇦}{🇨🇨}{🇨🇩}{🇨🇫}{🇨🇬}{🇨🇭}" + // + + // "{🇨🇮}{🇨🇰}{🇨🇱}{🇨🇲}{🇨🇳}{🇨🇴}{🇨🇵}{🇨🇷}{🇨🇺}{🇨🇻}{🇨🇼}{🇨🇽}{🇨🇾}{🇨🇿}{🇩🇪}{🇩🇬}{🇩🇯}{🇩🇰}{🇩🇲}{🇩🇴}{🇩🇿}{🇪🇦}" + // + + // "{🇪🇨}{🇪🇪}{🇪🇬}{🇪🇭}{🇪🇷}{🇪🇸}{🇪🇹}{🇪🇺}{🇫🇮}{🇫🇯}{🇫🇰}{🇫🇲}{🇫🇴}{🇫🇷}{🇬🇦}{🇬🇧}{🇬🇩}{🇬🇪}{🇬🇫}{🇬🇬}{🇬🇭}{🇬🇮}" + // + + // "{🇬🇱}{🇬🇲}{🇬🇳}{🇬🇵}{🇬🇶}{🇬🇷}{🇬🇸}{🇬🇹}{🇬🇺}{🇬🇼}{🇬🇾}{🇭🇰}{🇭🇲}{🇭🇳}{🇭🇷}{🇭🇹}{🇭🇺}{🇮🇨}{🇮🇩}{🇮🇪}{🇮🇱}{🇮🇲}" + // + + // "{🇮🇳}{🇮🇴}{🇮🇶}{🇮🇷}{🇮🇸}{🇮🇹}{🇯🇪}{🇯🇲}{🇯🇴}{🇯🇵}{🇰🇪}{🇰🇬}{🇰🇭}{🇰🇮}{🇰🇲}{🇰🇳}{🇰🇵}{🇰🇷}{🇰🇼}{🇰🇾}{🇰🇿}{🇱🇦}" + // + + // "{🇱🇧}{🇱🇨}{🇱🇮}{🇱🇰}{🇱🇷}{🇱🇸}{🇱🇹}{🇱🇺}{🇱🇻}{🇱🇾}{🇲🇦}{🇲🇨}{🇲🇩}{🇲🇪}{🇲🇫}{🇲🇬}{🇲🇭}{🇲🇰}{🇲🇱}{🇲🇲}{🇲🇳}{🇲🇴}" + // + + // 
"{🇲🇵}{🇲🇶}{🇲🇷}{🇲🇸}{🇲🇹}{🇲🇺}{🇲🇻}{🇲🇼}{🇲🇽}{🇲🇾}{🇲🇿}{🇳🇦}{🇳🇨}{🇳🇪}{🇳🇫}{🇳🇬}{🇳🇮}{🇳🇱}{🇳🇴}{🇳🇵}{🇳🇷}{🇳🇺}" + // + + // "{🇳🇿}{🇴🇲}{🇵🇦}{🇵🇪}{🇵🇫}{🇵🇬}{🇵🇭}{🇵🇰}{🇵🇱}{🇵🇲}{🇵🇳}{🇵🇷}{🇵🇸}{🇵🇹}{🇵🇼}{🇵🇾}{🇶🇦}{🇷🇪}{🇷🇴}{🇷🇸}{🇷🇺}{🇷🇼}" + // + + // "{🇸🇦}{🇸🇧}{🇸🇨}{🇸🇩}{🇸🇪}{🇸🇬}{🇸🇭}{🇸🇮}{🇸🇯}{🇸🇰}{🇸🇱}{🇸🇲}{🇸🇳}{🇸🇴}{🇸🇷}{🇸🇸}{🇸🇹}{🇸🇻}{🇸🇽}{🇸🇾}{🇸🇿}{🇹🇦}" + // + + // "{🇹🇨}{🇹🇩}{🇹🇫}{🇹🇬}{🇹🇭}{🇹🇯}{🇹🇰}{🇹🇱}{🇹🇲}{🇹🇳}{🇹🇴}{🇹🇷}{🇹🇹}{🇹🇻}{🇹🇼}{🇹🇿}{🇺🇦}{🇺🇬}{🇺🇲}{🇺🇸}{🇺🇾}{🇺🇿}" + // + + // "{🇻🇦}{🇻🇨}{🇻🇪}{🇻🇬}{🇻🇮}{🇻🇳}{🇻🇺}{🇼🇫}{🇼🇸}{🇽🇰}{🇾🇪}{🇾🇹}{🇿🇦}{🇿🇲}{🇿🇼}]" // // modifier sequences - // + "[{☝🏻}{☝🏼}{☝🏽}{☝🏾}{☝🏿}{⛹🏻}{⛹🏼}{⛹🏽}{⛹🏾}{⛹🏿}{✊🏻}{✊🏼}{✊🏽}{✊🏾}{✊🏿}{✋🏻}{✋🏼}{✋🏽}{✋🏾}{✋🏿}" - // + "{✌🏻}{✌🏼}{✌🏽}{✌🏾}{✌🏿}{✍🏻}{✍🏼}{✍🏽}{✍🏾}{✍🏿}{🎅🏻}{🎅🏼}{🎅🏽}{🎅🏾}{🎅🏿}{🏃🏻}{🏃🏼}{🏃🏽}{🏃🏾}{🏃🏿}" - // + "{🏄🏻}{🏄🏼}{🏄🏽}{🏄🏾}{🏄🏿}{🏊🏻}{🏊🏼}{🏊🏽}{🏊🏾}{🏊🏿}{🏋🏻}{🏋🏼}{🏋🏽}{🏋🏾}{🏋🏿}{👂🏻}{👂🏼}{👂🏽}{👂🏾}{👂🏿}" - // + "{👃🏻}{👃🏼}{👃🏽}{👃🏾}{👃🏿}{👆🏻}{👆🏼}{👆🏽}{👆🏾}{👆🏿}{👇🏻}{👇🏼}{👇🏽}{👇🏾}{👇🏿}{👈🏻}{👈🏼}{👈🏽}{👈🏾}{👈🏿}" - // + "{👉🏻}{👉🏼}{👉🏽}{👉🏾}{👉🏿}{👊🏻}{👊🏼}{👊🏽}{👊🏾}{👊🏿}{👋🏻}{👋🏼}{👋🏽}{👋🏾}{👋🏿}{👌🏻}{👌🏼}{👌🏽}{👌🏾}{👌🏿}" - // + "{👍🏻}{👍🏼}{👍🏽}{👍🏾}{👍🏿}{👎🏻}{👎🏼}{👎🏽}{👎🏾}{👎🏿}{👏🏻}{👏🏼}{👏🏽}{👏🏾}{👏🏿}{👐🏻}{👐🏼}{👐🏽}{👐🏾}{👐🏿}" - // + "{👦🏻}{👦🏼}{👦🏽}{👦🏾}{👦🏿}{👧🏻}{👧🏼}{👧🏽}{👧🏾}{👧🏿}{👨🏻}{👨🏼}{👨🏽}{👨🏾}{👨🏿}{👩🏻}{👩🏼}{👩🏽}{👩🏾}{👩🏿}" - // + "{👮🏻}{👮🏼}{👮🏽}{👮🏾}{👮🏿}{👰🏻}{👰🏼}{👰🏽}{👰🏾}{👰🏿}{👱🏻}{👱🏼}{👱🏽}{👱🏾}{👱🏿}{👲🏻}{👲🏼}{👲🏽}{👲🏾}{👲🏿}" - // + "{👳🏻}{👳🏼}{👳🏽}{👳🏾}{👳🏿}{👴🏻}{👴🏼}{👴🏽}{👴🏾}{👴🏿}{👵🏻}{👵🏼}{👵🏽}{👵🏾}{👵🏿}{👶🏻}{👶🏼}{👶🏽}{👶🏾}{👶🏿}" - // + "{👷🏻}{👷🏼}{👷🏽}{👷🏾}{👷🏿}{👸🏻}{👸🏼}{👸🏽}{👸🏾}{👸🏿}{👼🏻}{👼🏼}{👼🏽}{👼🏾}{👼🏿}{💁🏻}{💁🏼}{💁🏽}{💁🏾}{💁🏿}" - // + "{💂🏻}{💂🏼}{💂🏽}{💂🏾}{💂🏿}{💃🏻}{💃🏼}{💃🏽}{💃🏾}{💃🏿}{💅🏻}{💅🏼}{💅🏽}{💅🏾}{💅🏿}{💆🏻}{💆🏼}{💆🏽}{💆🏾}{💆🏿}" - // + "{💇🏻}{💇🏼}{💇🏽}{💇🏾}{💇🏿}{💪🏻}{💪🏼}{💪🏽}{💪🏾}{💪🏿}{🕵🏻}{🕵🏼}{🕵🏽}{🕵🏾}{🕵🏿}{🖐🏻}{🖐🏼}{🖐🏽}{🖐🏾}{🖐🏿}" - // + "{🖕🏻}{🖕🏼}{🖕🏽}{🖕🏾}{🖕🏿}{🖖🏻}{🖖🏼}{🖖🏽}{🖖🏾}{🖖🏿}{🙅🏻}{🙅🏼}{🙅🏽}{🙅🏾}{🙅🏿}{🙆🏻}{🙆🏼}{🙆🏽}{🙆🏾}{🙆🏿}" - // + "{🙇🏻}{🙇🏼}{🙇🏽}{🙇🏾}{🙇🏿}{🙋🏻}{🙋🏼}{🙋🏽}{🙋🏾}{🙋🏿}{🙌🏻}{🙌🏼}{🙌🏽}{🙌🏾}{🙌🏿}{🙍🏻}{🙍🏼}{🙍🏽}{🙍🏾}{🙍🏿}" - // + "{🙎🏻}{🙎🏼}{🙎🏽}{🙎🏾}{🙎🏿}{🙏🏻}{🙏🏼}{🙏🏽}{🙏🏾}{🙏🏿}{🚣🏻}{🚣🏼}{🚣🏽}{🚣🏾}{🚣🏿}{🚴🏻}{🚴🏼}{🚴🏽}{🚴🏾}{🚴🏿}" - // + "{🚵🏻}{🚵🏼}{🚵🏽}{🚵🏾}{🚵🏿}{🚶🏻}{🚶🏼}{🚶🏽}{🚶🏾}{🚶🏿}{🛀🏻}{🛀🏼}{🛀🏽}{🛀🏾}{🛀🏿}{🤘🏻}{🤘🏼}{🤘🏽}{🤘🏾}{🤘🏿}]" + // + + // 
"[{☝🏻}{☝🏼}{☝🏽}{☝🏾}{☝🏿}{⛹🏻}{⛹🏼}{⛹🏽}{⛹🏾}{⛹🏿}{✊🏻}{✊🏼}{✊🏽}{✊🏾}{✊🏿}{✋🏻}{✋🏼}{✋🏽}{✋🏾}{✋🏿}" + // + + // "{✌🏻}{✌🏼}{✌🏽}{✌🏾}{✌🏿}{✍🏻}{✍🏼}{✍🏽}{✍🏾}{✍🏿}{🎅🏻}{🎅🏼}{🎅🏽}{🎅🏾}{🎅🏿}{🏃🏻}{🏃🏼}{🏃🏽}{🏃🏾}{🏃🏿}" + // + + // "{🏄🏻}{🏄🏼}{🏄🏽}{🏄🏾}{🏄🏿}{🏊🏻}{🏊🏼}{🏊🏽}{🏊🏾}{🏊🏿}{🏋🏻}{🏋🏼}{🏋🏽}{🏋🏾}{🏋🏿}{👂🏻}{👂🏼}{👂🏽}{👂🏾}{👂🏿}" + // + + // "{👃🏻}{👃🏼}{👃🏽}{👃🏾}{👃🏿}{👆🏻}{👆🏼}{👆🏽}{👆🏾}{👆🏿}{👇🏻}{👇🏼}{👇🏽}{👇🏾}{👇🏿}{👈🏻}{👈🏼}{👈🏽}{👈🏾}{👈🏿}" + // + + // "{👉🏻}{👉🏼}{👉🏽}{👉🏾}{👉🏿}{👊🏻}{👊🏼}{👊🏽}{👊🏾}{👊🏿}{👋🏻}{👋🏼}{👋🏽}{👋🏾}{👋🏿}{👌🏻}{👌🏼}{👌🏽}{👌🏾}{👌🏿}" + // + + // "{👍🏻}{👍🏼}{👍🏽}{👍🏾}{👍🏿}{👎🏻}{👎🏼}{👎🏽}{👎🏾}{👎🏿}{👏🏻}{👏🏼}{👏🏽}{👏🏾}{👏🏿}{👐🏻}{👐🏼}{👐🏽}{👐🏾}{👐🏿}" + // + + // "{👦🏻}{👦🏼}{👦🏽}{👦🏾}{👦🏿}{👧🏻}{👧🏼}{👧🏽}{👧🏾}{👧🏿}{👨🏻}{👨🏼}{👨🏽}{👨🏾}{👨🏿}{👩🏻}{👩🏼}{👩🏽}{👩🏾}{👩🏿}" + // + + // "{👮🏻}{👮🏼}{👮🏽}{👮🏾}{👮🏿}{👰🏻}{👰🏼}{👰🏽}{👰🏾}{👰🏿}{👱🏻}{👱🏼}{👱🏽}{👱🏾}{👱🏿}{👲🏻}{👲🏼}{👲🏽}{👲🏾}{👲🏿}" + // + + // "{👳🏻}{👳🏼}{👳🏽}{👳🏾}{👳🏿}{👴🏻}{👴🏼}{👴🏽}{👴🏾}{👴🏿}{👵🏻}{👵🏼}{👵🏽}{👵🏾}{👵🏿}{👶🏻}{👶🏼}{👶🏽}{👶🏾}{👶🏿}" + // + + // "{👷🏻}{👷🏼}{👷🏽}{👷🏾}{👷🏿}{👸🏻}{👸🏼}{👸🏽}{👸🏾}{👸🏿}{👼🏻}{👼🏼}{👼🏽}{👼🏾}{👼🏿}{💁🏻}{💁🏼}{💁🏽}{💁🏾}{💁🏿}" + // + + // "{💂🏻}{💂🏼}{💂🏽}{💂🏾}{💂🏿}{💃🏻}{💃🏼}{💃🏽}{💃🏾}{💃🏿}{💅🏻}{💅🏼}{💅🏽}{💅🏾}{💅🏿}{💆🏻}{💆🏼}{💆🏽}{💆🏾}{💆🏿}" + // + + // "{💇🏻}{💇🏼}{💇🏽}{💇🏾}{💇🏿}{💪🏻}{💪🏼}{💪🏽}{💪🏾}{💪🏿}{🕵🏻}{🕵🏼}{🕵🏽}{🕵🏾}{🕵🏿}{🖐🏻}{🖐🏼}{🖐🏽}{🖐🏾}{🖐🏿}" + // + + // "{🖕🏻}{🖕🏼}{🖕🏽}{🖕🏾}{🖕🏿}{🖖🏻}{🖖🏼}{🖖🏽}{🖖🏾}{🖖🏿}{🙅🏻}{🙅🏼}{🙅🏽}{🙅🏾}{🙅🏿}{🙆🏻}{🙆🏼}{🙆🏽}{🙆🏾}{🙆🏿}" + // + + // "{🙇🏻}{🙇🏼}{🙇🏽}{🙇🏾}{🙇🏿}{🙋🏻}{🙋🏼}{🙋🏽}{🙋🏾}{🙋🏿}{🙌🏻}{🙌🏼}{🙌🏽}{🙌🏾}{🙌🏿}{🙍🏻}{🙍🏼}{🙍🏽}{🙍🏾}{🙍🏿}" + // + + // "{🙎🏻}{🙎🏼}{🙎🏽}{🙎🏾}{🙎🏿}{🙏🏻}{🙏🏼}{🙏🏽}{🙏🏾}{🙏🏿}{🚣🏻}{🚣🏼}{🚣🏽}{🚣🏾}{🚣🏿}{🚴🏻}{🚴🏼}{🚴🏽}{🚴🏾}{🚴🏿}" + // + + // "{🚵🏻}{🚵🏼}{🚵🏽}{🚵🏾}{🚵🏿}{🚶🏻}{🚶🏼}{🚶🏽}{🚶🏾}{🚶🏿}{🛀🏻}{🛀🏼}{🛀🏽}{🛀🏾}{🛀🏿}{🤘🏻}{🤘🏼}{🤘🏽}{🤘🏾}{🤘🏿}]" // // zwj sequences - // + "[{👁‍🗨}{👨‍❤️‍👨}{👨‍❤️‍💋‍👨}{👨‍👨‍👦}{👨‍👨‍👦‍👦}{👨‍👨‍👧}{👨‍👨‍👧‍👦}{👨‍👨‍👧‍👧}{👨‍👩‍👦}{👨‍👩‍👦‍👦}{👨‍👩‍👧}{👨‍👩‍👧‍👦}{👨‍👩‍👧‍👧}{👩‍❤️‍👨}{👩‍❤️‍👩}{👩‍❤️‍💋‍👨}{👩‍❤️‍💋‍👩}{👩‍👩‍👦}{👩‍👩‍👦‍👦}{👩‍👩‍👧}{👩‍👩‍👧‍👦}{👩‍👩‍👧‍👧}]" + // + + // "[{👁‍🗨}{👨‍❤️‍👨}{👨‍❤️‍💋‍👨}{👨‍👨‍👦}{👨‍👨‍👦‍👦}{👨‍👨‍👧}{👨‍👨‍👧‍👦}{👨‍👨‍👧‍👧}{👨‍👩‍👦}{👨‍👩‍👦‍👦}{👨‍👩‍👧}{👨‍👩‍👧‍👦}{👨‍👩‍👧‍👧}{👩‍❤️‍👨}{👩‍❤️‍👩}{👩‍❤️‍💋‍👨}{👩‍❤️‍💋‍👩}{👩‍👩‍👦}{👩‍👩‍👦‍👦}{👩‍👩‍👧}{👩‍👩‍👧‍👦}{👩‍👩‍👧‍👧}]" // + "]" // // TODO, add NamedSequences-8.0.0.txt, or at least Tamil // 
).freeze(); diff --git a/unicodetools/src/main/java/org/unicode/tools/AacOrder.java b/unicodetools/src/main/java/org/unicode/tools/AacOrder.java index 2d999b167..5dbe72516 100644 --- a/unicodetools/src/main/java/org/unicode/tools/AacOrder.java +++ b/unicodetools/src/main/java/org/unicode/tools/AacOrder.java @@ -1,9 +1,15 @@ package org.unicode.tools; +import com.google.common.collect.ImmutableSet; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.EntryRange; +import com.ibm.icu.util.VersionInfo; import java.io.IOException; import java.io.PrintWriter; import java.util.Set; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.PropertyValueSets; @@ -21,14 +27,6 @@ import org.unicode.tools.emoji.EmojiDataSourceCombined; import org.unicode.tools.emoji.EmojiOrder; -import com.google.common.collect.ImmutableSet; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.EntryRange; -import com.ibm.icu.util.VersionInfo; - public class AacOrder { private static final VersionInfo VERSION = Emoji.VERSION14; @@ -36,17 +34,20 @@ public class AacOrder { private static final CandidateData CANDIDATE_DATA = CandidateData.getInstance(); - private static final EmojiDataSource EMOJI_DATA = new EmojiDataSourceCombined(EmojiData.EMOJI_DATA); + private static final EmojiDataSource EMOJI_DATA = + new EmojiDataSourceCombined(EmojiData.EMOJI_DATA); private static final EmojiOrder ORDER = EmojiOrder.of(VERSION); static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(UCD_VERSION); static final UnicodeMap names = iup.load(UcdProperty.Name); - static final UnicodeMap gencat = iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); - static 
final UnicodeSet DI = iup.loadEnum(UcdProperty.Default_Ignorable_Code_Point, Binary.class).getSet(Binary.Yes); + static final UnicodeMap gencat = + iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); + static final UnicodeSet DI = + iup.loadEnum(UcdProperty.Default_Ignorable_Code_Point, Binary.class).getSet(Binary.Yes); static final UnicodeMap AGE = iup.loadEnum(UcdProperty.Age, Age_Values.class); - static final UnicodeSet EMOJI = new UnicodeSet(); + static { UnicodeSet temp = EMOJI_DATA.getAllEmojiWithoutDefectives(); // new UnicodeSet() @@ -69,29 +70,31 @@ public class AacOrder { EMOJI.freeze(); } - static final UnicodeSet ALLOWED = new UnicodeSet(0,0x10FFFF); + static final UnicodeSet ALLOWED = new UnicodeSet(0, 0x10FFFF); + static { // + "[^[:c:][:z:][:di:][࿕-࿘ 卍 卐]]" + emoji_sequences - for (General_Category_Values v : ImmutableSet.builder() - .addAll(PropertyValueSets.CONTROL) - .addAll(PropertyValueSets.SEPARATOR) - .build()) { + for (General_Category_Values v : + ImmutableSet.builder() + .addAll(PropertyValueSets.CONTROL) + .addAll(PropertyValueSets.SEPARATOR) + .build()) { ALLOWED.removeAll(gencat.getSet(v)); } - ALLOWED - .removeAll(DI) - .removeAll(new UnicodeSet("[࿕-࿘ 卍 卐]")) // special exceptions - .addAll(EMOJI) - .freeze(); + ALLOWED.removeAll(DI) + .removeAll(new UnicodeSet("[࿕-࿘ 卍 卐]")) // special exceptions + .addAll(EMOJI) + .freeze(); } - static final Set SORTED_ALL_CHARS_SET - = EmojiOrder.sort(ORDER.codepointCompare, ALLOWED); + static final Set SORTED_ALL_CHARS_SET = + EmojiOrder.sort(ORDER.codepointCompare, ALLOWED); /** * First arg is output directory. + * * @param args - * @throws IOException + * @throws IOException */ public static void main(String[] args) throws IOException { UnicodeSet Not10 = AGE.getSet(Age_Values.Unassigned); @@ -104,14 +107,15 @@ public static void main(String[] args) throws IOException { throw new IllegalArgumentException("Bad size"); } String outputDir = args.length == 1 ? 
args[0] : Settings.Output.GEN_DIR + "consortium/"; - try (PrintWriter outRanges = FileUtilities.openUTF8Writer(outputDir, "aac-order-ranges.txt"); - PrintWriter outEach = FileUtilities.openUTF8Writer(outputDir, "aac-order.txt") - ) { + try (PrintWriter outRanges = + FileUtilities.openUTF8Writer(outputDir, "aac-order-ranges.txt"); + PrintWriter outEach = FileUtilities.openUTF8Writer(outputDir, "aac-order.txt")) { outRanges.println("# Format: codepoint/range/string ; index ; name (if emoji)"); - outEach.println("# Format: codepoint/string ; name (if emoji)" - + "\n# Compute the index while reading the file:" - + "\n# For each single codepoint or string, add one; " - + "\n# For each range, add the number of items in the range"); + outEach.println( + "# Format: codepoint/string ; name (if emoji)" + + "\n# Compute the index while reading the file:" + + "\n# For each single codepoint or string, add one; " + + "\n# For each range, add the number of items in the range"); Range range = new Range(outRanges, true); Range rangeNone = new Range(outEach, false); for (String s : SORTED_ALL_CHARS_SET) { @@ -137,7 +141,8 @@ public static void main(String[] args) throws IOException { // writeUs(outputDir, extra9, "aac-extra-emoji-U9.txt"); } - private static void writeUs(String outputDir, UnicodeSet unicodeSet, String filename) throws IOException { + private static void writeUs(String outputDir, UnicodeSet unicodeSet, String filename) + throws IOException { try (PrintWriter out = FileUtilities.openUTF8Writer(outputDir, filename)) { out.print("UnicodeSet EMOJI_ALLOWED = new UnicodeSet("); int totalCodePoints = 0; @@ -146,10 +151,22 @@ private static void writeUs(String outputDir, UnicodeSet unicodeSet, String file for (EntryRange entry : unicodeSet.ranges()) { out.print(separator); separator = ","; - out.println("\n// (" + UTF16.valueOf(entry.codepoint) + ") " + getName(entry.codepoint) + - (entry.codepointEnd == entry.codepoint ? 
"" : - "\t..\t(" + UTF16.valueOf(entry.codepointEnd) + ") " + getName(entry.codepointEnd))); - out.print("0x" + Integer.toHexString(entry.codepoint) + ",0x" + Integer.toHexString(entry.codepointEnd)); + out.println( + "\n// (" + + UTF16.valueOf(entry.codepoint) + + ") " + + getName(entry.codepoint) + + (entry.codepointEnd == entry.codepoint + ? "" + : "\t..\t(" + + UTF16.valueOf(entry.codepointEnd) + + ") " + + getName(entry.codepointEnd))); + out.print( + "0x" + + Integer.toHexString(entry.codepoint) + + ",0x" + + Integer.toHexString(entry.codepointEnd)); totalCodePoints += (entry.codepointEnd - entry.codepoint) + 1; } out.println(")"); @@ -189,30 +206,39 @@ public Range(PrintWriter out, boolean doIndexes) { this.out = out; this.doIndexes = doIndexes; } + public void flush() { if (last >= 0) { if (first == last) { - out.println(Utility.hex(first) - + (doIndexes ? " ; " + firstIndex : "") - + (EMOJI.contains(first) ? "; \t" + getName(first) : "")); - } else if (EMOJI.containsSome(first,last)) { + out.println( + Utility.hex(first) + + (doIndexes ? " ; " + firstIndex : "") + + (EMOJI.contains(first) ? "; \t" + getName(first) : "")); + } else if (EMOJI.containsSome(first, last)) { for (int cp = first; cp <= last; ++cp) { - out.println(Utility.hex(cp) - + (doIndexes ? " ; " + firstIndex : "") - + "; \t" + getName(cp)); + out.println( + Utility.hex(cp) + + (doIndexes ? " ; " + firstIndex : "") + + "; \t" + + getName(cp)); ++firstIndex; } } else { - out.println(Utility.hex(first) + ".." + Utility.hex(last) - + (doIndexes ? " ; " + firstIndex : "")); + out.println( + Utility.hex(first) + + ".." + + Utility.hex(last) + + (doIndexes ? " ; " + firstIndex : "")); } } last = -2; } + private String getName(String s) { String name = EMOJI_DATA.getName(s); - return name != null ? name : UCharacter.getName(s,"+"); + return name != null ? 
name : UCharacter.getName(s, "+"); } + private String getName(int s) { String name = null; try { @@ -221,16 +247,18 @@ private String getName(int s) { } return name != null ? name : UCharacter.getName(s); } + public void add(String s) { ++currentIndex; int current = s.codePointAt(0); if (UCharacter.charCount(current) != s.length()) { flush(); - out.println(Utility.hex(s) - + (doIndexes ? " ; " + currentIndex : "") - + (EMOJI.contains(first) ? " ;\t" + getName(s) : "")); + out.println( + Utility.hex(s) + + (doIndexes ? " ; " + currentIndex : "") + + (EMOJI.contains(first) ? " ;\t" + getName(s) : "")); } else { - if (current == last+1) { + if (current == last + 1) { last = current; } else { flush(); diff --git a/unicodetools/src/main/java/org/unicode/tools/CharacterCategories.java b/unicodetools/src/main/java/org/unicode/tools/CharacterCategories.java index c4d3edf7d..5cc4d36bc 100644 --- a/unicodetools/src/main/java/org/unicode/tools/CharacterCategories.java +++ b/unicodetools/src/main/java/org/unicode/tools/CharacterCategories.java @@ -1,34 +1,34 @@ package org.unicode.tools; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.SpanCondition; +import com.ibm.icu.util.ULocale; import java.util.Collections; import java.util.Comparator; import java.util.Map; import java.util.Map.Entry; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.props.PropertyValueSets; import org.unicode.props.UcdPropertyValues.General_Category_Values; import org.unicode.props.UcdPropertyValues.Script_Values; import org.unicode.text.utility.Utility; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.SpanCondition; -import com.ibm.icu.util.ULocale; - public class CharacterCategories { static final Comparator COLLATOR; + static { Collator COLLATOR1 = 
Collator.getInstance(ULocale.forLanguageTag("und-u-co-emoji")); COLLATOR1.setStrength(Collator.IDENTICAL); Comparator stringComparator = new UTF16.StringComparator(true, false, 0); - COLLATOR = new MultiComparator( - (Comparator) COLLATOR1, - (Comparator) stringComparator); + COLLATOR = + new MultiComparator( + (Comparator) COLLATOR1, (Comparator) stringComparator); } + private static final int LINE_WIDTH = 50; private static final Map data = new TreeMap<>(COLLATOR); static final UnicodeSet nonspacing = new UnicodeSet("[[:Mn:][:Me:]]").freeze(); @@ -38,7 +38,7 @@ public class CharacterCategories { UnicodeSet currentSet = null; UnicodeSet ASCII_ID = new UnicodeSet("[_:/\\&A-Za-z0-9]").freeze(); UnicodeSet UNASSIGNED = new UnicodeSet("[:cn:]").freeze(); - + for (String line : FileUtilities.in(CharacterCategories.class, "characterCategories.txt")) { if (line.isEmpty()) { continue; @@ -54,7 +54,7 @@ public class CharacterCategories { } line = line.trim(); if (line.startsWith("#")) { - //comment + // comment } else if (UnicodeSet.resemblesPattern(line, 0)) { currentSet.addAll(new UnicodeSet(line)); } else if (line.startsWith("-") && UnicodeSet.resemblesPattern(line, 1)) { @@ -64,23 +64,36 @@ public class CharacterCategories { } } data.put("Format_&_Whitespace:Whitespace", new UnicodeSet("[:Whitespace:]")); - data.put("Format_&_Whitespace:Format", new UnicodeSet("[[:Cf:][:di:][:Variation_Selector:][:block=Ideographic_Description_Characters:]-[:Whitespace:]]")); + data.put( + "Format_&_Whitespace:Format", + new UnicodeSet( + "[[:Cf:][:di:][:Variation_Selector:][:block=Ideographic_Description_Characters:]-[:Whitespace:]]")); data.put("Symbols:Emoji", new UnicodeSet("[:emoji:]")); - data.put("Symbols:Currency_Symbols", new UnicodeSet(FixedProps.FixedGeneralCategory.getSet(General_Category_Values.Currency_Symbol))); // "[:sc:]" + data.put( + "Symbols:Currency_Symbols", + new UnicodeSet( + FixedProps.FixedGeneralCategory.getSet( + General_Category_Values.Currency_Symbol))); 
// "[:sc:]" data.put("Symbols:Non-Spacing", new UnicodeSet(nonspacing)); - UnicodeSet common = FixedProps.FixedScriptExceptions.getSet(Collections.singleton(Script_Values.Common)); - UnicodeSet inherited = FixedProps.FixedScriptExceptions.getSet(Collections.singleton(Script_Values.Inherited)); + UnicodeSet common = + FixedProps.FixedScriptExceptions.getSet( + Collections.singleton(Script_Values.Common)); + UnicodeSet inherited = + FixedProps.FixedScriptExceptions.getSet( + Collections.singleton(Script_Values.Inherited)); UnicodeSet control = FixedProps.FixedGeneralCategory.getSet(PropertyValueSets.CONTROL); - UnicodeSet punctuation = FixedProps.FixedGeneralCategory.getSet(PropertyValueSets.PUNCTUATION); - UnicodeSet missing = new UnicodeSet(common).addAll(inherited).removeAll(control).removeAll(punctuation); - -// "[" -// + "[:scx=common:]" -// + "[:scx=inherited:]" -// + "-[:C:]" -// + "-[:p:]" -// + "]"); + UnicodeSet punctuation = + FixedProps.FixedGeneralCategory.getSet(PropertyValueSets.PUNCTUATION); + UnicodeSet missing = + new UnicodeSet(common).addAll(inherited).removeAll(control).removeAll(punctuation); + + // "[" + // + "[:scx=common:]" + // + "[:scx=inherited:]" + // + "-[:C:]" + // + "-[:p:]" + // + "]"); UnicodeSet punc = new UnicodeSet(punctuation); for (Entry entry : data.entrySet()) { UnicodeSet us = entry.getValue(); @@ -92,9 +105,10 @@ public class CharacterCategories { data.put("ZSymbol_missing", missing.freeze()); data.put("Punctuation:Other", punc.freeze()); } - + public static void main(String[] args) { - UnicodeSet invisible = new UnicodeSet("[[:c:][:z:][:whitespace:][:di:][:Variation_Selector:]]").freeze(); + UnicodeSet invisible = + new UnicodeSet("[[:c:][:z:][:whitespace:][:di:][:Variation_Selector:]]").freeze(); StringBuilder b = new StringBuilder(); TreeSet sorted = new TreeSet(COLLATOR); for (Entry entry : data.entrySet()) { @@ -111,8 +125,8 @@ public static void main(String[] args) { count = 0; } if (invisible.contains(cp)) { - 
b.append(" \\x{").append(Utility.hex(cp,1)).append("}"); - count+=8; + b.append(" \\x{").append(Utility.hex(cp, 1)).append("}"); + count += 8; } else if (nonspacing.contains(cp)) { b.append(" ").append(cp); count++; diff --git a/unicodetools/src/main/java/org/unicode/tools/CheckEmojiProps.java b/unicodetools/src/main/java/org/unicode/tools/CheckEmojiProps.java index 75fde1f9a..f87119081 100644 --- a/unicodetools/src/main/java/org/unicode/tools/CheckEmojiProps.java +++ b/unicodetools/src/main/java/org/unicode/tools/CheckEmojiProps.java @@ -1,5 +1,9 @@ package org.unicode.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.EntryRange; import org.unicode.cldr.util.Counter; import org.unicode.props.GenerateEnums; import org.unicode.props.IndexUnicodeProperties; @@ -8,36 +12,32 @@ import org.unicode.tools.emoji.Emoji; import org.unicode.tools.emoji.EmojiData; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.EntryRange; - public class CheckEmojiProps { - static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(GenerateEnums.ENUM_VERSION); + static final IndexUnicodeProperties iup = + IndexUnicodeProperties.make(GenerateEnums.ENUM_VERSION); static final UnicodeMap General_Category = iup.load(UcdProperty.General_Category); static final UnicodeMap Name = iup.load(UcdProperty.Name); - static final UnicodeMap Grapheme_Cluster_Break = iup.load(UcdProperty.Grapheme_Cluster_Break); + static final UnicodeMap Grapheme_Cluster_Break = + iup.load(UcdProperty.Grapheme_Cluster_Break); static final UnicodeMap Word_Break = iup.load(UcdProperty.Word_Break); static final UnicodeMap Line_Break = iup.load(UcdProperty.Line_Break); static final EmojiData data = EmojiData.of(Emoji.VERSION_LAST_RELEASED); public static void main(String[] args) { - - final UnicodeSet 
specials = new UnicodeSet("[[:block=tags:]-[:cn:]-[:deprecated:]]") - .add(0x200D) - .add(0x20E3) - .add(0x20E0) - .add(0xFE0E) - .add(0xFE0F) - ; + + final UnicodeSet specials = + new UnicodeSet("[[:block=tags:]-[:cn:]-[:deprecated:]]") + .add(0x200D) + .add(0x20E3) + .add(0x20E0) + .add(0xFE0E) + .add(0xFE0F); showSet("Specials", specials); - + final UnicodeSet flags = new UnicodeSet(0x1F1E6, 0x1F1FF); showSet("Flags", flags); - final UnicodeSet zwjs = new UnicodeSet(); for (String s : data.getZwjSequencesAll()) { boolean haveZwj = false; @@ -54,14 +54,14 @@ public static void main(String[] args) { showSet("Modifiers", data.MODIFIERS); showSet("Modifier_Bases", data.getModifierBases()); showSet("After_Joiners", zwjs); - - final UnicodeSet others = new UnicodeSet(data.getSingletonsWithDefectives()) - .removeAll(specials) - .removeAll(flags) - .removeAll(data.MODIFIERS) - .removeAll(data.getModifierBases()) - .removeAll(zwjs) - ; + + final UnicodeSet others = + new UnicodeSet(data.getSingletonsWithDefectives()) + .removeAll(specials) + .removeAll(flags) + .removeAll(data.MODIFIERS) + .removeAll(data.getModifierBases()) + .removeAll(zwjs); showSet("Others", others); } @@ -70,9 +70,14 @@ private static void showSet(String string, final UnicodeSet all) { System.out.println("\n" + string); UnicodeMap m = new UnicodeMap(); for (String s : all) { - m.put(s, "GCB=" + Grapheme_Cluster_Break.get(s) - + "; WB=" + Word_Break.get(s) - + "; LB=" + Line_Break.get(s)); + m.put( + s, + "GCB=" + + Grapheme_Cluster_Break.get(s) + + "; WB=" + + Word_Break.get(s) + + "; LB=" + + Line_Break.get(s)); } Counter c = new Counter(); for (String item : m.getAvailableValues()) { @@ -93,17 +98,19 @@ private static void showSet(String string, final UnicodeSet all) { System.out.print(", "); } System.out.print(Utility.hex(s.codepoint)); - if (s.codepoint!=s.codepointEnd) { + if (s.codepoint != s.codepointEnd) { System.out.print(".." 
+ Utility.hex(s.codepointEnd)); } continue; } boolean single = s.codepointEnd == s.codepoint; - System.out.println("\t" + Utility.hex(s.codepoint) - + (single ? "" : ".." + Utility.hex(s.codepointEnd)) - + "\t" + Name.get(s.codepoint) - + (single ? "" : ".." + Name.get(s.codepointEnd)) - ); + System.out.println( + "\t" + + Utility.hex(s.codepoint) + + (single ? "" : ".." + Utility.hex(s.codepointEnd)) + + "\t" + + Name.get(s.codepoint) + + (single ? "" : ".." + Name.get(s.codepointEnd))); } if (!firstMax) { System.out.println(); @@ -124,7 +131,7 @@ private static void showRange(StringBuilder result, EntryRange range) { result.append(", "); } result.append(Utility.hex(range.codepoint)); - if (range.codepoint!=range.codepointEnd) { + if (range.codepoint != range.codepointEnd) { result.append(".." + Utility.hex(range.codepointEnd)); } } diff --git a/unicodetools/src/main/java/org/unicode/tools/CheckSimpTrad.java b/unicodetools/src/main/java/org/unicode/tools/CheckSimpTrad.java index 30e619efa..2e88cab36 100644 --- a/unicodetools/src/main/java/org/unicode/tools/CheckSimpTrad.java +++ b/unicodetools/src/main/java/org/unicode/tools/CheckSimpTrad.java @@ -13,10 +13,7 @@ public static void main(String[] args) { cp = Character.codePointAt(test, i); String sVar = iup.getResolvedValue(UcdProperty.kSimplifiedVariant, cp); String tVar = iup.getResolvedValue(UcdProperty.kTraditionalVariant, cp); - System.out.println(Utility.hex(cp) - + "\tsVar:\t" + sVar - + "\ttVar:\t" + tVar - ); + System.out.println(Utility.hex(cp) + "\tsVar:\t" + sVar + "\ttVar:\t" + tVar); } ; } diff --git a/unicodetools/src/main/java/org/unicode/tools/CollatorEquivalences.java b/unicodetools/src/main/java/org/unicode/tools/CollatorEquivalences.java index c564857df..5bd64e25b 100644 --- a/unicodetools/src/main/java/org/unicode/tools/CollatorEquivalences.java +++ b/unicodetools/src/main/java/org/unicode/tools/CollatorEquivalences.java @@ -1,5 +1,18 @@ package org.unicode.tools; +import 
com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.CollationElementIterator; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UTF16.StringComparator; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; @@ -13,31 +26,16 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.util.MultiComparator; import org.unicode.cldr.util.With; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.text.CollationElementIterator; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Normalizer2; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UTF16.StringComparator; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; - public class CollatorEquivalences { private static final Normalizer2 nfc = Normalizer2.getNFCInstance(); private static final Normalizer2 nfkccf = Normalizer2.getNFKCCasefoldInstance(); public static void main(String[] args) throws IOException { showMappings(equiv); - //System.out.println(remapped.toPattern(false)); + // System.out.println(remapped.toPattern(false)); } static final class RawKey implements Comparable { @@ -85,14 +83,17 @@ public int compareTo(RawKey o) { } } } + @Override public int hashCode() { return key.hashCode(); } + @Override public boolean equals(Object obj) { - return key.equals(((RawKey)obj).key); + return key.equals(((RawKey) obj).key); } + @Override public String toString() { StringBuilder b = new 
StringBuilder("["); @@ -111,7 +112,8 @@ public String toString() { static final UnicodeSet LETTER = new UnicodeSet("[:L:]").freeze(); static final UnicodeSet KATAKANA = new UnicodeSet("[:sc=Katakana:]").freeze(); static final UnicodeSet KATAKANA_SMALL = new UnicodeSet("[ァィゥェォッャュョヮヵヶㇰ-ㇿァ-ッ]").freeze(); - static final UnicodeSet ChangesWithNfkccf = new UnicodeSet("[:Changes_When_NFKC_Casefolded:]").freeze(); + static final UnicodeSet ChangesWithNfkccf = + new UnicodeSet("[:Changes_When_NFKC_Casefolded:]").freeze(); static { RuleBasedCollator temp = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); @@ -120,62 +122,82 @@ public String toString() { temp.freeze(); UCA_SECONDARY_ONLY = temp; - Comparator LONGER_FIRST = new Comparator() { - @Override - public int compare(String o1, String o2) { - return o1.equals(o2) ? 0 - : o1.isEmpty() ? -1 - :o2.isEmpty() ? 1 - : o2.codePointCount(0, o2.length()) - o1.codePointCount(0, o1.length()); - } - }; - Comparator REGULAR_FIRST = new Comparator() { - @Override - public int compare(String o1, String o2) { - return (LETTER.containsAll(o1) ? 0 : 1) - (LETTER.containsAll(o2) ? 0 : 1); - } - }; - - Comparator NFKCCF_FIRST = new Comparator() { - @Override - public int compare(String o1, String o2) { - return (ChangesWithNfkccf.containsNone(o1) ? 0 : 1) - (ChangesWithNfkccf.containsNone(o2) ? 0 : 1); - } - }; + Comparator LONGER_FIRST = + new Comparator() { + @Override + public int compare(String o1, String o2) { + return o1.equals(o2) + ? 0 + : o1.isEmpty() + ? -1 + : o2.isEmpty() + ? 1 + : o2.codePointCount(0, o2.length()) + - o1.codePointCount(0, o1.length()); + } + }; + Comparator REGULAR_FIRST = + new Comparator() { + @Override + public int compare(String o1, String o2) { + return (LETTER.containsAll(o1) ? 0 : 1) - (LETTER.containsAll(o2) ? 0 : 1); + } + }; - Comparator KANA_FIRST = new Comparator() { - @Override - public int compare(String o1, String o2) { - int order1 = (KATAKANA_SMALL.containsAll(o1) ? 
1 : KATAKANA.containsAll(o1) ? 0 : 2); - int order2 = (KATAKANA_SMALL.containsAll(o2) ? 1 : KATAKANA.containsAll(o2) ? 0 : 2); - if (order1 != order2) { - int debug = 0; - } - return order1 - order2; - } - }; + Comparator NFKCCF_FIRST = + new Comparator() { + @Override + public int compare(String o1, String o2) { + return (ChangesWithNfkccf.containsNone(o1) ? 0 : 1) + - (ChangesWithNfkccf.containsNone(o2) ? 0 : 1); + } + }; + + Comparator KANA_FIRST = + new Comparator() { + @Override + public int compare(String o1, String o2) { + int order1 = + (KATAKANA_SMALL.containsAll(o1) + ? 1 + : KATAKANA.containsAll(o1) ? 0 : 2); + int order2 = + (KATAKANA_SMALL.containsAll(o2) + ? 1 + : KATAKANA.containsAll(o2) ? 0 : 2); + if (order1 != order2) { + int debug = 0; + } + return order1 - order2; + } + }; RuleBasedCollator temp2 = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); temp2.setDecomposition(Collator.CANONICAL_DECOMPOSITION); temp2.freeze(); - Comparator CODEPOINT = new StringComparator(true, false, StringComparator.FOLD_CASE_DEFAULT); - BEST_IS_LEAST = new MultiComparator( - LONGER_FIRST, - KANA_FIRST, - //NFKCCF_FIRST, - (Comparator)(Comparator) temp2, - REGULAR_FIRST, - CODEPOINT); + Comparator CODEPOINT = + new StringComparator(true, false, StringComparator.FOLD_CASE_DEFAULT); + BEST_IS_LEAST = + new MultiComparator( + LONGER_FIRST, + KANA_FIRST, + // NFKCCF_FIRST, + (Comparator) (Comparator) temp2, + REGULAR_FIRST, + CODEPOINT); } public static UnicodeMap COLLATION_MAP = new UnicodeMap(); - static Relation equiv = Relation.of(new TreeMap>(), TreeSet.class, BEST_IS_LEAST); + static Relation equiv = + Relation.of(new TreeMap>(), TreeSet.class, BEST_IS_LEAST); static Set failed = new LinkedHashSet<>(); - static { + static { for (int i = 0; i <= 0x10FFFF; ++i) { int gc = UCharacter.getIntPropertyValue(i, UProperty.GENERAL_CATEGORY); - if (gc == UCharacter.UNASSIGNED || gc == UCharacter.PRIVATE_USE || gc == UCharacter.SURROGATE) { + if (gc == 
UCharacter.UNASSIGNED + || gc == UCharacter.PRIVATE_USE + || gc == UCharacter.SURROGATE) { continue; } String s = UTF16.valueOf(i); @@ -187,21 +209,21 @@ public int compare(String o1, String o2) { StringBuilder b = new StringBuilder(); Set moreStrings = new HashSet<>(); combos: - for (RawKey rawKey : equiv.keySet()) { - if (rawKey.size() < 2) continue; - b.setLength(0); - for (Integer temp : rawKey.key){ - RawKey singleKey = new RawKey(temp); - Set items = equiv.get(singleKey); - if (items == null) { - failed.add(singleKey); - //System.out.println("Failed to map " + rawKey + "\t" + equiv.get(rawKey)); - continue combos; - } - b.append(items.iterator().next()); + for (RawKey rawKey : equiv.keySet()) { + if (rawKey.size() < 2) continue; + b.setLength(0); + for (Integer temp : rawKey.key) { + RawKey singleKey = new RawKey(temp); + Set items = equiv.get(singleKey); + if (items == null) { + failed.add(singleKey); + // System.out.println("Failed to map " + rawKey + "\t" + equiv.get(rawKey)); + continue combos; } - moreStrings.add(nfc.normalize(b.toString())); + b.append(items.iterator().next()); } + moreStrings.add(nfc.normalize(b.toString())); + } // we found something, try adding // might be redundant, but we don't care. 
@@ -230,7 +252,8 @@ public int compare(String o1, String o2) { private static void showMappings(Relation equiv) { UnicodeSet remapped = new UnicodeSet(); - System.out.println("Source Hex\tTarget Hex\t(Source→Target)\tCollation Key (P/S)\tSource Name → Target Name"); + System.out.println( + "Source Hex\tTarget Hex\t(Source→Target)\tCollation Key (P/S)\tSource Name → Target Name"); for (Entry> entry : equiv.keyValuesSet()) { RawKey rawKey = entry.getKey(); Set equivalentItems = entry.getValue(); @@ -242,8 +265,10 @@ private static void showMappings(Relation equiv) { // private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze(); // private static final UnicodeSet NSM = new UnicodeSet("[[:Mn:][:Me:]]").freeze(); - // private static final UnicodeSet COMMON = new UnicodeSet("[[:scx=Common:]-[:Block=Counting Rod Numerals:]]").freeze(); - // private static final UnicodeSet SKIP = new UnicodeSet("[\\u0C01\\u0020 ः\u20DD\u0982\\p{Block=Musical Symbols}" + // private static final UnicodeSet COMMON = new UnicodeSet("[[:scx=Common:]-[:Block=Counting + // Rod Numerals:]]").freeze(); + // private static final UnicodeSet SKIP = new UnicodeSet("[\\u0C01\\u0020 + // ः\u20DD\u0982\\p{Block=Musical Symbols}" // + "[:sc=Hiragana:]" // + "[:sc=Katakana:]" // + "]").freeze(); @@ -266,10 +291,20 @@ static String showItems(RawKey rawKey, Set equivalentItems, UnicodeSet r } private static void showMapping(RawKey rawKey, String source, String target) { - System.out.println(Utility.hex(source) + " ;\t" + Utility.hex(target,4," ") - + " #\t(" + source + "→" + target + ")\t" - + rawKey + "\t" - + getName(source," + ") + " → " + getName(target," + ")); + System.out.println( + Utility.hex(source) + + " ;\t" + + Utility.hex(target, 4, " ") + + " #\t(" + + source + + "→" + + target + + ")\t" + + rawKey + + "\t" + + getName(source, " + ") + + " → " + + getName(target, " + ")); } private static String getName(String best, String separator) { @@ -298,5 +333,4 @@ private static String 
getBest(Set fixed) { } return fixed.iterator().next(); } - } diff --git a/unicodetools/src/main/java/org/unicode/tools/CollatorEquivalencesNew.java b/unicodetools/src/main/java/org/unicode/tools/CollatorEquivalencesNew.java index c887f429e..362c6301d 100644 --- a/unicodetools/src/main/java/org/unicode/tools/CollatorEquivalencesNew.java +++ b/unicodetools/src/main/java/org/unicode/tools/CollatorEquivalencesNew.java @@ -1,5 +1,14 @@ package org.unicode.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.impl.Utility; +// import com.ibm.icu.text.CollationElementIterator; +// import com.ibm.icu.text.Collator; +// import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UTF16.StringComparator; +import com.ibm.icu.text.UnicodeSet; import java.io.IOException; import java.util.Comparator; import java.util.HashSet; @@ -8,7 +17,6 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.util.MultiComparator; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.PropertyValueSets; @@ -20,41 +28,45 @@ import org.unicode.text.UCD.Default; import org.unicode.text.UCD.Normalizer; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.impl.Utility; -//import com.ibm.icu.text.CollationElementIterator; -//import com.ibm.icu.text.Collator; -//import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UTF16.StringComparator; -import com.ibm.icu.text.UnicodeSet; - public class CollatorEquivalencesNew { - private static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(Default.ucdVersion()); - static UnicodeSet.XSymbolTable NO_PROPS = new UnicodeSet.XSymbolTable() { - @Override - public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { - throw new IllegalArgumentException("Don't use any ICU 
Unicode Properties! " + propertyName + "=" + propertyValue); - }; - }; + private static final IndexUnicodeProperties iup = + IndexUnicodeProperties.make(Default.ucdVersion()); + static UnicodeSet.XSymbolTable NO_PROPS = + new UnicodeSet.XSymbolTable() { + @Override + public boolean applyPropertyAlias( + String propertyName, String propertyValue, UnicodeSet result) { + throw new IllegalArgumentException( + "Don't use any ICU Unicode Properties! " + + propertyName + + "=" + + propertyValue); + } + ; + }; + static { UnicodeSet.setDefaultXSymbolTable(NO_PROPS); } - static interface BaseCollatorKey extends Comparable { - } + static interface BaseCollatorKey extends Comparable {} + static interface BaseCollator extends Comparator { BaseCollatorKey getSortKey(String c); } - private static final UnicodeMap SC = iup.loadEnum(UcdProperty.Script, Script_Values.class); - private static final UnicodeMap GC = iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); - private static final UnicodeSet CN_CS_CO = PropertyValueSets.getSet(GC, - General_Category_Values.Unassigned, - General_Category_Values.Surrogate, - General_Category_Values.Private_Use); - static final UnicodeSet NOT_BEST = iup.loadEnumSet(UcdProperty.Block, Block_Values.CJK_Radicals_Supplement); + private static final UnicodeMap SC = + iup.loadEnum(UcdProperty.Script, Script_Values.class); + private static final UnicodeMap GC = + iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); + private static final UnicodeSet CN_CS_CO = + PropertyValueSets.getSet( + GC, + General_Category_Values.Unassigned, + General_Category_Values.Surrogate, + General_Category_Values.Private_Use); + static final UnicodeSet NOT_BEST = + iup.loadEnumSet(UcdProperty.Block, Block_Values.CJK_Radicals_Supplement); static final UnicodeSet LETTER = PropertyValueSets.getSet(GC, PropertyValueSets.LETTER); static final UnicodeSet KATAKANA = SC.getSet(Script_Values.Katakana); static final UnicodeSet 
KATAKANA_SMALL = new UnicodeSet("[ァィゥェォッャュョヮヵヶㇰ-ㇿァ-ッ]").freeze(); @@ -64,36 +76,43 @@ static interface BaseCollator extends Comparator { public static void main(String[] args) throws IOException { showMappings(equiv); - //System.out.println(remapped.toPattern(false)); + // System.out.println(remapped.toPattern(false)); } - private static final org.unicode.text.UCA.UCA uca_raw = org.unicode.text.UCA.UCA.buildCollator(null); - private static final org.unicode.text.UCA.UCA uca_level2Only = org.unicode.text.UCA.UCA.buildCollator(null); + private static final org.unicode.text.UCA.UCA uca_raw = + org.unicode.text.UCA.UCA.buildCollator(null); + private static final org.unicode.text.UCA.UCA uca_level2Only = + org.unicode.text.UCA.UCA.buildCollator(null); + static { uca_level2Only.setStrength(2); } - static final BaseCollator MyCollator = new BaseCollator() { - final class MyBaseCollatorKey implements BaseCollatorKey { - final String sortKey; - MyBaseCollatorKey(String a) { - sortKey = uca_level2Only.getSortKey(a); - } - @Override - public int compareTo(BaseCollatorKey o) { - return sortKey.compareTo(((MyBaseCollatorKey) o).sortKey); - } - } - @Override - public int compare(String a, String b) { - return getSortKey(a).compareTo(getSortKey(b)); - } + static final BaseCollator MyCollator = + new BaseCollator() { + final class MyBaseCollatorKey implements BaseCollatorKey { + final String sortKey; - @Override - public BaseCollatorKey getSortKey(String c) { - return new MyBaseCollatorKey(c); - } - }; + MyBaseCollatorKey(String a) { + sortKey = uca_level2Only.getSortKey(a); + } + + @Override + public int compareTo(BaseCollatorKey o) { + return sortKey.compareTo(((MyBaseCollatorKey) o).sortKey); + } + } + + @Override + public int compare(String a, String b) { + return getSortKey(a).compareTo(getSortKey(b)); + } + + @Override + public BaseCollatorKey getSortKey(String c) { + return new MyBaseCollatorKey(c); + } + }; // static final class RawKey implements BaseCollatorKey { // final 
List key; @@ -106,7 +125,8 @@ public BaseCollatorKey getSortKey(String c) { // String norm = nfkccf.normalize(s); // List key = new ArrayList<>(); // CollationElementIterator it = uca.getCollationElementIterator(norm); - // for (int ce = it.next(); ce != CollationElementIterator.NULLORDER; ce = it.next()) { + // for (int ce = it.next(); ce != CollationElementIterator.NULLORDER; ce = it.next()) + // { // int cePS = ce >>> 8; // only primary/secondary differences // if (cePS != 0) { // key.add(cePS); @@ -165,47 +185,59 @@ public BaseCollatorKey getSortKey(String c) { static final Comparator BEST_IS_LEAST; static { - Comparator LONGER_FIRST = new Comparator() { - @Override - public int compare(String o1, String o2) { - return o1.equals(o2) ? 0 - : o1.isEmpty() ? -1 - :o2.isEmpty() ? 1 - : o2.codePointCount(0, o2.length()) - o1.codePointCount(0, o1.length()); - } - }; - Comparator REGULAR_FIRST = new Comparator() { - @Override - public int compare(String o1, String o2) { - return (LETTER.containsAll(o1) ? 0 : 1) - (LETTER.containsAll(o2) ? 0 : 1); - } - }; - Comparator KANA_FIRST = new Comparator() { - @Override - public int compare(String o1, String o2) { - int order1 = (KATAKANA_SMALL.containsAll(o1) ? 1 : KATAKANA.containsAll(o1) ? 0 : 2); - int order2 = (KATAKANA_SMALL.containsAll(o2) ? 1 : KATAKANA.containsAll(o2) ? 0 : 2); - if (order1 != order2) { - int debug = 0; - } - return order1 - order2; - } - }; + Comparator LONGER_FIRST = + new Comparator() { + @Override + public int compare(String o1, String o2) { + return o1.equals(o2) + ? 0 + : o1.isEmpty() + ? -1 + : o2.isEmpty() + ? 1 + : o2.codePointCount(0, o2.length()) + - o1.codePointCount(0, o1.length()); + } + }; + Comparator REGULAR_FIRST = + new Comparator() { + @Override + public int compare(String o1, String o2) { + return (LETTER.containsAll(o1) ? 0 : 1) - (LETTER.containsAll(o2) ? 
0 : 1); + } + }; + Comparator KANA_FIRST = + new Comparator() { + @Override + public int compare(String o1, String o2) { + int order1 = + (KATAKANA_SMALL.containsAll(o1) + ? 1 + : KATAKANA.containsAll(o1) ? 0 : 2); + int order2 = + (KATAKANA_SMALL.containsAll(o2) + ? 1 + : KATAKANA.containsAll(o2) ? 0 : 2); + if (order1 != order2) { + int debug = 0; + } + return order1 - order2; + } + }; - Comparator CODEPOINT = new StringComparator(true, false, StringComparator.FOLD_CASE_DEFAULT); - BEST_IS_LEAST = new MultiComparator( - LONGER_FIRST, - KANA_FIRST, - uca_raw, - REGULAR_FIRST, - CODEPOINT); + Comparator CODEPOINT = + new StringComparator(true, false, StringComparator.FOLD_CASE_DEFAULT); + BEST_IS_LEAST = + new MultiComparator( + LONGER_FIRST, KANA_FIRST, uca_raw, REGULAR_FIRST, CODEPOINT); } public static UnicodeMap COLLATION_MAP = new UnicodeMap<>(); - static Relation equiv = Relation.of(new TreeMap>(), TreeSet.class, BEST_IS_LEAST); + static Relation equiv = + Relation.of(new TreeMap>(), TreeSet.class, BEST_IS_LEAST); static Set failed = new LinkedHashSet<>(); - static { + static { for (int i = 0; i <= 0x10FFFF; ++i) { if (CN_CS_CO.contains(i)) { continue; @@ -219,22 +251,23 @@ public int compare(String o1, String o2) { StringBuilder b = new StringBuilder(); Set moreStrings = new HashSet<>(); combos: - for (BaseCollatorKey rawKey : equiv.keySet()) { - // if (rawKey.size() < 2) continue; - b.setLength(0); - // TODO -// for (Integer temp : rawKey.key){ -// RawKey singleKey = new RawKey(temp); -// Set items = equiv.get(singleKey); -// if (items == null) { -// failed.add(singleKey); -// //System.out.println("Failed to map " + rawKey + "\t" + equiv.get(rawKey)); -// continue combos; -// } -// b.append(items.iterator().next()); -// } - moreStrings.add(nfc.normalize(b.toString())); - } + for (BaseCollatorKey rawKey : equiv.keySet()) { + // if (rawKey.size() < 2) continue; + b.setLength(0); + // TODO + // for (Integer temp : rawKey.key){ + // RawKey singleKey = new 
RawKey(temp); + // Set items = equiv.get(singleKey); + // if (items == null) { + // failed.add(singleKey); + // //System.out.println("Failed to map " + rawKey + "\t" + + // equiv.get(rawKey)); + // continue combos; + // } + // b.append(items.iterator().next()); + // } + moreStrings.add(nfc.normalize(b.toString())); + } // we found something, try adding // might be redundant, but we don't care. @@ -263,7 +296,8 @@ public int compare(String o1, String o2) { private static void showMappings(Relation equiv) { UnicodeSet remapped = new UnicodeSet(); - System.out.println("Source Hex\tTarget Hex\t(Source→Target)\tCollation Key (P/S)\tSource Name → Target Name"); + System.out.println( + "Source Hex\tTarget Hex\t(Source→Target)\tCollation Key (P/S)\tSource Name → Target Name"); for (Entry> entry : equiv.keyValuesSet()) { BaseCollatorKey rawKey = entry.getKey(); Set equivalentItems = entry.getValue(); @@ -275,13 +309,16 @@ private static void showMappings(Relation equiv) { // private static final UnicodeSet DIGIT = new UnicodeSet("[0-9]").freeze(); // private static final UnicodeSet NSM = new UnicodeSet("[[:Mn:][:Me:]]").freeze(); - // private static final UnicodeSet COMMON = new UnicodeSet("[[:scx=Common:]-[:Block=Counting Rod Numerals:]]").freeze(); - // private static final UnicodeSet SKIP = new UnicodeSet("[\\u0C01\\u0020 ः\u20DD\u0982\\p{Block=Musical Symbols}" + // private static final UnicodeSet COMMON = new UnicodeSet("[[:scx=Common:]-[:Block=Counting + // Rod Numerals:]]").freeze(); + // private static final UnicodeSet SKIP = new UnicodeSet("[\\u0C01\\u0020 + // ः\u20DD\u0982\\p{Block=Musical Symbols}" // + "[:sc=Hiragana:]" // + "[:sc=Katakana:]" // + "]").freeze(); - static String showItems(BaseCollatorKey rawKey, Set equivalentItems, UnicodeSet remapped) { + static String showItems( + BaseCollatorKey rawKey, Set equivalentItems, UnicodeSet remapped) { if (equivalentItems.size() > 1) { String best = getBest(equivalentItems); for (String s : equivalentItems) { @@ 
-299,10 +336,20 @@ static String showItems(BaseCollatorKey rawKey, Set equivalentItems, Uni } private static void showMapping(BaseCollatorKey rawKey, String source, String target) { - System.out.println(Utility.hex(source) + " ;\t" + Utility.hex(target,4," ") - + " #\t(" + source + "→" + target + ")\t" - + rawKey + "\t" - + iup.getName(source, " + ") + " → " + iup.getName(target, " + ")); + System.out.println( + Utility.hex(source) + + " ;\t" + + Utility.hex(target, 4, " ") + + " #\t(" + + source + + "→" + + target + + ")\t" + + rawKey + + "\t" + + iup.getName(source, " + ") + + " → " + + iup.getName(target, " + ")); } private static String getBest(Set fixed) { @@ -318,5 +365,4 @@ private static String getBest(Set fixed) { } return fixed.iterator().next(); } - } diff --git a/unicodetools/src/main/java/org/unicode/tools/Common.java b/unicodetools/src/main/java/org/unicode/tools/Common.java index ec474aea2..76a9bec77 100644 --- a/unicodetools/src/main/java/org/unicode/tools/Common.java +++ b/unicodetools/src/main/java/org/unicode/tools/Common.java @@ -1,9 +1,8 @@ package org.unicode.tools; -import java.util.regex.Pattern; - import com.google.common.base.Joiner; import com.google.common.base.Splitter; +import java.util.regex.Pattern; public class Common { @@ -16,5 +15,6 @@ public class Common { public static final Joiner SPACE_JOINER = Joiner.on(' '); public static final Joiner COMMA_JOINER = Joiner.on(", "); public static final Joiner CRLF_JOINER = Joiner.on('\n'); - public static final Pattern ADOBE_RS_MATCHER = Pattern.compile("[CV]\\+[0-9]{1,5}\\+([1-9][0-9]{0,2})\\.([1-9][0-9]?)\\.([0-9]{1,2})"); + public static final Pattern ADOBE_RS_MATCHER = + Pattern.compile("[CV]\\+[0-9]{1,5}\\+([1-9][0-9]{0,2})\\.([1-9][0-9]?)\\.([0-9]{1,2})"); } diff --git a/unicodetools/src/main/java/org/unicode/tools/CompareUnicodeSets.java b/unicodetools/src/main/java/org/unicode/tools/CompareUnicodeSets.java index 0a24af510..ff87dd21c 100644 --- 
a/unicodetools/src/main/java/org/unicode/tools/CompareUnicodeSets.java +++ b/unicodetools/src/main/java/org/unicode/tools/CompareUnicodeSets.java @@ -1,17 +1,20 @@ package org.unicode.tools; -import java.util.Objects; - import com.ibm.icu.text.UnicodeSet; +import java.util.Objects; public class CompareUnicodeSets { -public static void main(String[] args) { - UnicodeSet a = new UnicodeSet("[ا أ آ ب پ ت ٹ ث ج چ ح خ د ڈ ذ ر ڑ ز ژ س ش ص ض ط ظ ع غ ف ق ک گ ل م ن ں و ؤ ہ ۂ ھ ء ی ئ ے ة ه]"); - UnicodeSet b = new UnicodeSet("[ا ب پ ت ٹ ث ج چ ح خ د ڈ ذ ر ڑ ز ژ س ش ص ض ط ظ ع غ ف ق ک گ ل م ن و ہ ھ ء ی ے]"); - System.out.println("old:\t" + a.toPattern(false)); - System.out.println("new:\t" + b.toPattern(false)); - System.out.println("old-only:\t" + new UnicodeSet(a).removeAll(b).toPattern(false)); - System.out.println("new-only:\t" + new UnicodeSet(b).removeAll(a).toPattern(false)); - Objects.equals(3, 3); -} + public static void main(String[] args) { + UnicodeSet a = + new UnicodeSet( + "[ا أ آ ب پ ت ٹ ث ج چ ح خ د ڈ ذ ر ڑ ز ژ س ش ص ض ط ظ ع غ ف ق ک گ ل م ن ں و ؤ ہ ۂ ھ ء ی ئ ے ة ه]"); + UnicodeSet b = + new UnicodeSet( + "[ا ب پ ت ٹ ث ج چ ح خ د ڈ ذ ر ڑ ز ژ س ش ص ض ط ظ ع غ ف ق ک گ ل م ن و ہ ھ ء ی ے]"); + System.out.println("old:\t" + a.toPattern(false)); + System.out.println("new:\t" + b.toPattern(false)); + System.out.println("old-only:\t" + new UnicodeSet(a).removeAll(b).toPattern(false)); + System.out.println("new-only:\t" + new UnicodeSet(b).removeAll(a).toPattern(false)); + Objects.equals(3, 3); + } } diff --git a/unicodetools/src/main/java/org/unicode/tools/Confusables.java b/unicodetools/src/main/java/org/unicode/tools/Confusables.java index 43bed1720..ce6afed0d 100644 --- a/unicodetools/src/main/java/org/unicode/tools/Confusables.java +++ b/unicodetools/src/main/java/org/unicode/tools/Confusables.java @@ -1,5 +1,11 @@ package org.unicode.tools; +import com.google.common.base.Splitter; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UTF16; +import 
com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.Freezable; +import com.ibm.icu.util.ICUException; import java.io.IOException; import java.io.PrintWriter; import java.util.EnumMap; @@ -8,7 +14,6 @@ import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.CldrUtility; import org.unicode.props.UcdProperty; @@ -16,41 +21,36 @@ import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.google.common.base.Splitter; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.Freezable; -import com.ibm.icu.util.ICUException; - /** - * Class that encapsulates the data from confusables.txt. It currently generates its own whole-script confusable data, - * because of omissions in the Unicode data file. - * @author markdavis + * Class that encapsulates the data from confusables.txt. It currently generates its own + * whole-script confusable data, because of omissions in the Unicode data file. * + * @author markdavis */ public class Confusables { public static final Splitter SEMI = Splitter.on(';').trimResults(); - /** type of confusable data. Only the MA data is used; the rest is deprecated. + /** + * type of confusable data. Only the MA data is used; the rest is deprecated. 
+ * * @author markdavis */ public enum Style { -// SL, -// SA, -// ML, - MA} - -// private static class Data { -// final Style style; -// final String result; -// public Data(Style style, String result) { -// this.style = style; -// this.result = result; -// } -// } - - + // SL, + // SA, + // ML, + MA + } + + // private static class Data { + // final Style style; + // final String result; + // public Data(Style style, String result) { + // this.style = style; + // this.result = result; + // } + // } + /** * @return the style2map */ @@ -60,30 +60,34 @@ public UnicodeMap getRawMapToRepresentative(Style style) { /** * Get the mapping from character to representative confusable. + * * @return the char2data */ public UnicodeMap> getChar2data() { return char2data; } - final private Map> style2RawMapToRepresentative; - final private UnicodeSet hasConfusable = new UnicodeSet(); - final private UnicodeMap> char2data = new UnicodeMap>(); + private final Map> style2RawMapToRepresentative; + private final UnicodeSet hasConfusable = new UnicodeSet(); + private final UnicodeMap> char2data = + new UnicodeMap>(); - final private Map> scriptToScriptToCodepointToUnicodeSet; + private final Map> + scriptToScriptToCodepointToUnicodeSet; - /** - * Mapping from codepoint to name. - */ - public static final UnicodeMap CODEPOINT_TO_NAME = ScriptDetector.IUP.load(UcdProperty.Name); + /** Mapping from codepoint to name. */ + public static final UnicodeMap CODEPOINT_TO_NAME = + ScriptDetector.IUP.load(UcdProperty.Name); /** * Create confusables data from a directory—not cached! 
+ * * @param directory */ - public Confusables (String directory) { + public Confusables(String directory) { try { - EnumMap> _style2RawMapToRepresentative = new EnumMap>(Style.class); + EnumMap> _style2RawMapToRepresentative = + new EnumMap>(Style.class); for (String line : FileUtilities.in(directory, "confusables.txt")) { if (line.startsWith("\uFEFF")) { line = line.substring(1); @@ -93,7 +97,7 @@ public Confusables (String directory) { continue; } int hashPos = line.indexOf('#'); - line = line.substring(0,hashPos).trim(); + line = line.substring(0, hashPos).trim(); if (line.isEmpty()) { continue; } @@ -106,24 +110,30 @@ public Confusables (String directory) { Style style = Style.valueOf(parts[2]); addConfusable(style, source, target, _style2RawMapToRepresentative); -// if (CharSequences.getSingleCodePoint(target) != Integer.MAX_VALUE) { -// addConfusable(style, target, source, _style2RawMapToRepresentative); -// } + // if (CharSequences.getSingleCodePoint(target) != Integer.MAX_VALUE) + // { + // addConfusable(style, target, source, + // _style2RawMapToRepresentative); + // } } - style2RawMapToRepresentative = CldrUtility.protectCollection(_style2RawMapToRepresentative); + style2RawMapToRepresentative = + CldrUtility.protectCollection(_style2RawMapToRepresentative); char2data.freeze(); hasConfusable.freeze(); // patch, because the file doesn't contain X => common/inherited or the targetSet - - UnicodeMap codepointToRepresentativeConfusable = style2RawMapToRepresentative.get(Style.MA); - Map> _scriptToScriptToCodepointToUnicodeSet = new EnumMap<>(Script_Values.class); + + UnicodeMap codepointToRepresentativeConfusable = + style2RawMapToRepresentative.get(Style.MA); + Map> + _scriptToScriptToCodepointToUnicodeSet = new EnumMap<>(Script_Values.class); // get the equivalence classes final ScriptDetector scriptDetector = new ScriptDetector(); for (String representative : codepointToRepresentativeConfusable.values()) { - UnicodeSet equivalents = new 
UnicodeSet(codepointToRepresentativeConfusable.getSet(representative)) - .add(representative); + UnicodeSet equivalents = + new UnicodeSet(codepointToRepresentativeConfusable.getSet(representative)) + .add(representative); for (String a : equivalents) { for (String b : equivalents) { if (a.equals(b)) { @@ -136,23 +146,35 @@ public Confusables (String directory) { } Set aScripts = ScriptDetector.getScriptExtensions(aSingle); for (Script_Values aScript : aScripts) { - Set bScripts = scriptDetector.set(b).getSingleSetOrNull(); + Set bScripts = + scriptDetector.set(b).getSingleSetOrNull(); if (bScripts == null) { continue; // not a single set of scripts } for (Script_Values bScript : bScripts) { - addToMap(aScript, bScript, aSingle, b, _scriptToScriptToCodepointToUnicodeSet); + addToMap( + aScript, + bScript, + aSingle, + b, + _scriptToScriptToCodepointToUnicodeSet); } } } } } - scriptToScriptToCodepointToUnicodeSet = CldrUtility.protectCollection(_scriptToScriptToCodepointToUnicodeSet); + scriptToScriptToCodepointToUnicodeSet = + CldrUtility.protectCollection(_scriptToScriptToCodepointToUnicodeSet); } catch (Exception e) { throw new RuntimeException(e); } } - private void addConfusable(Style style, String source, String target, EnumMap> _style2map) { + + private void addConfusable( + Style style, + String source, + String target, + EnumMap> _style2map) { UnicodeMap map = _style2map.get(style); if (map == null) { _style2map.put(style, map = new UnicodeMap<>()); @@ -168,21 +190,25 @@ private void addConfusable(Style style, String source, String target, EnumMap> getScriptToScriptToCodepointToUnicodeSet() { + public Map> + getScriptToScriptToCodepointToUnicodeSet() { return scriptToScriptToCodepointToUnicodeSet; } - /** * Return single code point if a is one; otherwise -1; + * * @param a * @return */ private int getSingleCodePoint(String a) { - return a.length() == 1 || a.length() == 2 && Character.isHighSurrogate(a.charAt(0)) ? 
a.codePointAt(0) : -1; + return a.length() == 1 || a.length() == 2 && Character.isHighSurrogate(a.charAt(0)) + ? a.codePointAt(0) + : -1; } - private void addToMap(Map> _scriptToScriptToUnicodeSet, + private void addToMap( + Map> _scriptToScriptToUnicodeSet, int aSingle, Script_Values sourceScript, Script_Values targetScript) { @@ -198,8 +224,11 @@ private void addToMap(Map> _script uset.add(aSingle); } - private void addToMap(Script_Values sourceScript, - Script_Values targetScript, int aSingle, String b, + private void addToMap( + Script_Values sourceScript, + Script_Values targetScript, + int aSingle, + String b, Map> map) { Map map2 = map.get(sourceScript); if (map2 == null) { @@ -212,7 +241,6 @@ private void addToMap(Script_Values sourceScript, map3.add(aSingle, b); } - // /** // * Return the script of the string, or null if there is not a unique one. // * Only uses Script property for now. @@ -223,7 +251,8 @@ private void addToMap(Script_Values sourceScript, // Script_Values result = null; // boolean haveCommon = false; // for (UnicodeSet.EntryRange range : source.ranges()) { - // for (int codepoint = range.codepoint; codepoint <= range.codepointEnd; ++codepoint) { + // for (int codepoint = range.codepoint; codepoint <= range.codepointEnd; + // ++codepoint) { // Script_Values current = Confusables.CODEPOINT_TO_SCRIPT.get(codepoint); // if (current == Script_Values.Common || current == Script_Values.Inherited) { // haveCommon = true; @@ -238,11 +267,15 @@ private void addToMap(Script_Values sourceScript, // } /** - * Return whole-script confusables data. Augments the Unicode data by adding the set of characters mapped to. + * Return whole-script confusables data. Augments the Unicode data by adding the set of + * characters mapped to. 
+ * * @return null if no match for script1+script2 */ - public CodepointToConfusables getCharsToConfusables(Script_Values sourceScript, Script_Values targetScript) { - Map map1 = scriptToScriptToCodepointToUnicodeSet.get(sourceScript); + public CodepointToConfusables getCharsToConfusables( + Script_Values sourceScript, Script_Values targetScript) { + Map map1 = + scriptToScriptToCodepointToUnicodeSet.get(sourceScript); if (map1 == null) { return null; } @@ -252,14 +285,16 @@ public CodepointToConfusables getCharsToConfusables(Script_Values sourceScript, /** * A map from codepoints to sets of characters. Encapsulated to make it easier to manage. + * * @author markdavis */ - public static class CodepointToConfusables implements Iterable>, Freezable { + public static class CodepointToConfusables + implements Iterable>, Freezable { boolean isFrozen; Map data = new TreeMap<>(); @Override - public Iterator> iterator() { + public Iterator> iterator() { return data.entrySet().iterator(); } @@ -303,6 +338,7 @@ public UnicodeSet keySet() { /** * Returns all the characters that have some confusable. + * * @return */ public UnicodeSet getCharsWithConfusables() { @@ -311,15 +347,17 @@ public UnicodeSet getCharsWithConfusables() { /** * Prints out the whole-script confusable data. 
+ * * @param out */ public void print(Appendable out) { try { - for (Entry> scriptToCodepointToUnicodeSet - : scriptToScriptToCodepointToUnicodeSet.entrySet()) { + for (Entry> + scriptToCodepointToUnicodeSet : + scriptToScriptToCodepointToUnicodeSet.entrySet()) { String sourceScript = scriptToCodepointToUnicodeSet.getKey().getShortName(); - for (Entry codepointToUnicodeSet - : scriptToCodepointToUnicodeSet.getValue().entrySet()) { + for (Entry codepointToUnicodeSet : + scriptToCodepointToUnicodeSet.getValue().entrySet()) { String targetScript = codepointToUnicodeSet.getKey().getShortName(); UnicodeMap temp = new UnicodeMap(); for (Entry value : codepointToUnicodeSet.getValue()) { @@ -329,15 +367,23 @@ public void print(Appendable out) { UnicodeSet keys = temp.getSet(values); for (UnicodeSet.EntryRange range : keys.ranges()) { final boolean single = range.codepointEnd == range.codepoint; - out.append(Utility.hex(range.codepoint) - + (single ? "\t\t" : ".." + Utility.hex(range.codepointEnd)) - + ";\t" + sourceScript - + ";\t" + targetScript - + "; A; " + values.toPattern(false) - + "\t# ( " + UTF16.valueOf(range.codepoint) + " ) " + CODEPOINT_TO_NAME.get(range.codepoint) - + (single ? "" : "...") - + "\n" - ); + out.append( + Utility.hex(range.codepoint) + + (single + ? "\t\t" + : ".." + Utility.hex(range.codepointEnd)) + + ";\t" + + sourceScript + + ";\t" + + targetScript + + "; A; " + + values.toPattern(false) + + "\t# ( " + + UTF16.valueOf(range.codepoint) + + " ) " + + CODEPOINT_TO_NAME.get(range.codepoint) + + (single ? "" : "...") + + "\n"); } } } @@ -348,13 +394,14 @@ public void print(Appendable out) { } } - /** - * Write out the whole-script confusables data. - */ + /** Write out the whole-script confusables data. 
*/ public static void main(String[] args) throws IOException { - final String securityDir = Settings.UnicodeTools.getDataPathStringForLatestVersion("security"); + final String securityDir = + Settings.UnicodeTools.getDataPathStringForLatestVersion("security"); final Confusables CONFUSABLES = new Confusables(securityDir); - try (PrintWriter pw = FileUtilities.openUTF8Writer(Settings.Output.GEN_UCD_DIR, "confusablesWholeScript.txt")) { + try (PrintWriter pw = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_UCD_DIR, "confusablesWholeScript.txt")) { CONFUSABLES.print(pw); pw.flush(); } diff --git a/unicodetools/src/main/java/org/unicode/tools/CopyPropsToUnicodeJsp.java b/unicodetools/src/main/java/org/unicode/tools/CopyPropsToUnicodeJsp.java index 36888bab4..9aaa0d900 100644 --- a/unicodetools/src/main/java/org/unicode/tools/CopyPropsToUnicodeJsp.java +++ b/unicodetools/src/main/java/org/unicode/tools/CopyPropsToUnicodeJsp.java @@ -1,5 +1,7 @@ package org.unicode.tools; +import com.google.common.collect.ImmutableSet; +import com.ibm.icu.util.VersionInfo; import java.io.File; import java.io.IOException; import java.nio.file.CopyOption; @@ -10,25 +12,31 @@ import java.util.Collections; import java.util.EnumSet; import java.util.Set; - import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.PropertyStatus; import org.unicode.props.UcdProperty; import org.unicode.text.utility.Settings; -import com.google.common.collect.ImmutableSet; -import com.ibm.icu.util.VersionInfo; - public class CopyPropsToUnicodeJsp { public static void main(String[] args) throws IOException { IndexUnicodeProperties latest = IndexUnicodeProperties.make(); VersionInfo ucdVersion = latest.getUcdVersion(); System.out.println("Copying Props for " + ucdVersion + " into JSP"); String fromDir = Settings.Output.BIN_DIR + ucdVersion + "/"; - String toDir = Settings.UnicodeTools.UNICODEJSPS_DIR + "src/main/resources/org/unicode/jsp/props/"; - //overwrite existing file, if exists - 
CopyOption[] options = new CopyOption[] {StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.COPY_ATTRIBUTES}; - Set kExceptions = ImmutableSet.of("kAccountingNumeric.bin", "kOtherNumeric.bin", "kPrimaryNumeric.bin", "kSimplifiedVariant.bin", "kTraditionalVariant.bin"); + String toDir = + Settings.UnicodeTools.UNICODEJSPS_DIR + "src/main/resources/org/unicode/jsp/props/"; + // overwrite existing file, if exists + CopyOption[] options = + new CopyOption[] { + StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.COPY_ATTRIBUTES + }; + Set kExceptions = + ImmutableSet.of( + "kAccountingNumeric.bin", + "kOtherNumeric.bin", + "kPrimaryNumeric.bin", + "kSimplifiedVariant.bin", + "kTraditionalVariant.bin"); for (String name : new File(fromDir).list()) { if (!name.endsWith(".bin")) { @@ -43,7 +51,7 @@ public static void main(String[] args) throws IOException { System.out.println("Retaining2 " + name); } } - String pname = name.substring(0, name.length()-4); + String pname = name.substring(0, name.length() - 4); UcdProperty prop = UcdProperty.forString(pname); EnumSet status = PropertyStatus.getPropertyStatusSet(prop); diff --git a/unicodetools/src/main/java/org/unicode/tools/DraftUtils.java b/unicodetools/src/main/java/org/unicode/tools/DraftUtils.java index 6e6872414..a2980fa39 100644 --- a/unicodetools/src/main/java/org/unicode/tools/DraftUtils.java +++ b/unicodetools/src/main/java/org/unicode/tools/DraftUtils.java @@ -4,14 +4,12 @@ /** * Stuff used by the 'draft' class that doesn't belong in CLDR core. - * @author srl * + * @author srl */ public class DraftUtils { - /** - * This actually refers into the unicodetools project. - */ - public static final String UCD_DIRECTORY = Settings.UnicodeTools.getDataPathStringForLatestVersion("ucd"); - + /** This actually refers into the unicodetools project. 
*/ + public static final String UCD_DIRECTORY = + Settings.UnicodeTools.getDataPathStringForLatestVersion("ucd"); } diff --git a/unicodetools/src/main/java/org/unicode/tools/ExtendedPictographic.java b/unicodetools/src/main/java/org/unicode/tools/ExtendedPictographic.java index f73b75dec..367cd4b30 100644 --- a/unicodetools/src/main/java/org/unicode/tools/ExtendedPictographic.java +++ b/unicodetools/src/main/java/org/unicode/tools/ExtendedPictographic.java @@ -1,12 +1,15 @@ package org.unicode.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.EntryRange; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.util.EnumSet; import java.util.List; import java.util.Set; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.tool.Option; import org.unicode.cldr.tool.Option.Options; @@ -23,29 +26,28 @@ import org.unicode.tools.emoji.Emoji; import org.unicode.tools.emoji.EmojiData; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.EntryRange; - class ExtendedPictographic { static final EmojiData emojiData = EmojiData.of(Emoji.VERSION11); - + static final UnicodeSet GLUE_AFTER_ZWJ = new UnicodeSet(); static String HEADER; static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(Settings.latestVersion); static final UnicodeMap age = iup.loadEnum(UcdProperty.Age); static final UnicodeMap names = iup.load(UcdProperty.Name); - static final UnicodeMap gencat = iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); + static final UnicodeMap gencat = + iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); static final UnicodeSet Cn = gencat.getSet(General_Category_Values.Unassigned); - static final UnicodeMap blocks = iup.loadEnum(UcdProperty.Block, Block_Values.class); + static 
final UnicodeMap blocks = + iup.loadEnum(UcdProperty.Block, Block_Values.class); static PrintWriter out = null; static void load(File file) throws IOException { StringBuilder header = new StringBuilder(); boolean inHeader = true; - for (String line : FileUtilities.in(file.getParent(), file.getName())) { // Settings.DATA_DIR + "cldr/","ExtendedPictographic.txt") + for (String line : + FileUtilities.in(file.getParent(), file.getName())) { // Settings.DATA_DIR + + // "cldr/","ExtendedPictographic.txt") // U+02704 ; Glue_After_Zwj # ✄ WHITE SCISSORS if (line.startsWith("#") || line.isEmpty()) { if (inHeader) { @@ -64,19 +66,23 @@ static void load(File file) throws IOException { if (pos < 0) { codePoint = codePointEnd = Utility.fromHex(f0).codePointAt(0); } else { - codePoint = Utility.fromHex(f0.substring(0,pos)).codePointAt(0); - codePointEnd = Utility.fromHex(f0.substring(pos+2)).codePointAt(0); + codePoint = Utility.fromHex(f0.substring(0, pos)).codePointAt(0); + codePointEnd = Utility.fromHex(f0.substring(pos + 2)).codePointAt(0); } - GLUE_AFTER_ZWJ.add(codePoint,codePointEnd); + GLUE_AFTER_ZWJ.add(codePoint, codePointEnd); } GLUE_AFTER_ZWJ.freeze(); HEADER = header.toString(); } enum MyOptions { - destination(new Params().setHelp("File to read and change.") - .setMatch(".*") - .setDefault(CLDRPaths.COMMON_DIRECTORY + "properties/ExtendedPictographic.txt")), + destination( + new Params() + .setHelp("File to read and change.") + .setMatch(".*") + .setDefault( + CLDRPaths.COMMON_DIRECTORY + + "properties/ExtendedPictographic.txt")), normal(new Params().setHelp("Generate the CLDR file (default option)")), list(new Params().setHelp("List the extended pictographs and emoji")), operations(new Params().setHelp("List the operations")), @@ -85,15 +91,19 @@ enum MyOptions { // BOILERPLATE TO COPY final Option option; + private MyOptions(Params params) { option = new Option(this, params); } + private static Options myOptions = new Options(); + static { for (MyOptions option : 
MyOptions.values()) { myOptions.add(option, option.option); } } + private static Set parse(String[] args, boolean showArguments) { return myOptions.parse(MyOptions.values()[0], args, true); } @@ -106,14 +116,15 @@ public static void main(String[] args) throws IOException { MyOptions.parse(args, true); File file = new File(MyOptions.destination.option.getValue()); load(file); - + try (PrintWriter _out = FileUtilities.openUTF8Writer(file.getParent(), file.getName())) { out = _out; boolean doNormal = true; - final UnicodeSet picto = new UnicodeSet(GLUE_AFTER_ZWJ) - .addAll(EmojiData.EMOJI_DATA.getSingletonsWithoutDefectives()) - .freeze(); + final UnicodeSet picto = + new UnicodeSet(GLUE_AFTER_ZWJ) + .addAll(EmojiData.EMOJI_DATA.getSingletonsWithoutDefectives()) + .freeze(); if (MyOptions.list.option.doesOccur()) { out.println(picto.toPattern(false)); @@ -127,14 +138,15 @@ public static void main(String[] args) throws IOException { doNormal = false; } - if (MyOptions.operations.option.doesOccur()) { - UnicodeSet ops = new UnicodeSet("[[:s:][:p:]-[:sc:]-[:xidcontinue:]-[:nfkcqc=n:]&[:scx=Common:]]") - .removeAll(picto); + UnicodeSet ops = + new UnicodeSet( + "[[:s:][:p:]-[:sc:]-[:xidcontinue:]-[:nfkcqc=n:]&[:scx=Common:]]") + .removeAll(picto); // showAge(ops, iup, age, true); for (String cp : ops) { - out.println(showCodePoint(iup, null, cp.codePointAt(0))); + out.println(showCodePoint(iup, null, cp.codePointAt(0))); } doNormal = false; } @@ -153,11 +165,17 @@ public static void main(String[] args) throws IOException { UnicodeSet blockSet = blocks.getSet(block); UnicodeSet emojiInBlock = new UnicodeSet(blockSet).retainAll(emoji); - UnicodeSet gazInBlock = new UnicodeSet(blockSet).retainAll(ExtendedPictographic.GLUE_AFTER_ZWJ); + UnicodeSet gazInBlock = + new UnicodeSet(blockSet).retainAll(ExtendedPictographic.GLUE_AFTER_ZWJ); UnicodeSet gazInBlockNoCn = new UnicodeSet(gazInBlock).removeAll(Cn); UnicodeSet gazInBlockCn = new UnicodeSet(gazInBlock).retainAll(Cn); - 
UnicodeSet cnInBlock = new UnicodeSet(blockSet).retainAll(Cn).removeAll(gazInBlock); - UnicodeSet otherInBlock = new UnicodeSet(blockSet).removeAll(emojiInBlock).removeAll(cnInBlock).removeAll(gazInBlock); + UnicodeSet cnInBlock = + new UnicodeSet(blockSet).retainAll(Cn).removeAll(gazInBlock); + UnicodeSet otherInBlock = + new UnicodeSet(blockSet) + .removeAll(emojiInBlock) + .removeAll(cnInBlock) + .removeAll(gazInBlock); showNonEmpty("emoji", emojiInBlock, true); showNonEmpty("EP", gazInBlock, true); @@ -182,13 +200,28 @@ public static void main(String[] args) throws IOException { private static void showValue(int cp, final UcdProperty prop, final Class classIn) { Named value = (Named) iup.loadEnum(prop, classIn).get(cp); - out.println(Utility.hex(cp) + " " + names.get(cp) - + " → " + prop.getShortName() + "=" + value.getShortName() + "\t" + prop + "=" + value); + out.println( + Utility.hex(cp) + + " " + + names.get(cp) + + " → " + + prop.getShortName() + + "=" + + value.getShortName() + + "\t" + + prop + + "=" + + value); } private static void showNonEmpty(String title, UnicodeSet emojiInBlock, boolean includeUS) { if (!emojiInBlock.isEmpty()) { - out.println("# " + title + "=" + emojiInBlock.size() + (includeUS ? "\t: " + emojiInBlock.toPattern(false) : "")); + out.println( + "# " + + title + + "=" + + emojiInBlock.size() + + (includeUS ? "\t: " + emojiInBlock.toPattern(false) : "")); } } @@ -196,12 +229,14 @@ private static void showRanges(UnicodeSet gazInBlock, boolean includeSetName) { for (UnicodeSet.EntryRange range : gazInBlock.ranges()) { out.println( printRange(range.codepoint, range.codepointEnd) - + "\t; ExtendedPictographic" - + " #\t" - + (includeSetName ? - new UnicodeSet(range.codepoint, range.codepointEnd).toPattern(false) - + "\t" + getNames(range.codepoint, range.codepointEnd) : "GC=Cn") - ); + + "\t; ExtendedPictographic" + + " #\t" + + (includeSetName + ? 
new UnicodeSet(range.codepoint, range.codepointEnd) + .toPattern(false) + + "\t" + + getNames(range.codepoint, range.codepointEnd) + : "GC=Cn")); } } @@ -220,11 +255,14 @@ private static String getName(int codepoint) { } private static String printRange(int start, int end) { - return "U+" + Utility.hex(start) - + (start == end ? "" : "..U+" + Utility.hex(end)); + return "U+" + Utility.hex(start) + (start == end ? "" : "..U+" + Utility.hex(end)); } - private static void showAge(final UnicodeSet picto, IndexUnicodeProperties iup, UnicodeMap age, boolean all) { + private static void showAge( + final UnicodeSet picto, + IndexUnicodeProperties iup, + UnicodeMap age, + boolean all) { UnicodeMap mm = new UnicodeMap<>(); for (String s : picto) { final Age_Values currentAge = age.get(s); @@ -241,7 +279,7 @@ private static void showAge(final UnicodeSet picto, IndexUnicodeProperties iup, out.println(showCodePoint(iup, v, range.codepoint)); } else if (all) { for (int i = range.codepoint; i <= range.codepointEnd; ++i) { - out.println(showCodePoint(iup, v, i)); + out.println(showCodePoint(iup, v, i)); } } else { out.println(showRange(iup, v, range.codepoint, range.codepointEnd)); @@ -251,18 +289,28 @@ private static void showAge(final UnicodeSet picto, IndexUnicodeProperties iup, } } - private static String showRange(IndexUnicodeProperties iup, Age_Values v, int cpStart, int cpEnd) { - return Utility.hex(cpStart) + ".." + Utility.hex(cpEnd) - + (v == null ? "" : "; " + v.getShortName()) - + " # " - + UTF16.valueOf(cpStart) + ".." + UTF16.valueOf(cpEnd) + "; " - + iup.getName(cpStart) + ".." + iup.getName(cpEnd); + private static String showRange( + IndexUnicodeProperties iup, Age_Values v, int cpStart, int cpEnd) { + return Utility.hex(cpStart) + + ".." + + Utility.hex(cpEnd) + + (v == null ? "" : "; " + v.getShortName()) + + " # " + + UTF16.valueOf(cpStart) + + ".." + + UTF16.valueOf(cpEnd) + + "; " + + iup.getName(cpStart) + + ".." 
+ + iup.getName(cpEnd); } + private static String showCodePoint(IndexUnicodeProperties iup, Age_Values v, int cp) { return Utility.hex(cp) + (v == null ? "" : "; " + v.getShortName()) - + " # " - + UTF16.valueOf(cp) + "; " + + " # " + + UTF16.valueOf(cp) + + "; " + iup.getName(cp); } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/tools/FixNcrs.java b/unicodetools/src/main/java/org/unicode/tools/FixNcrs.java index b437e3008..89e93be1e 100644 --- a/unicodetools/src/main/java/org/unicode/tools/FixNcrs.java +++ b/unicodetools/src/main/java/org/unicode/tools/FixNcrs.java @@ -1,40 +1,39 @@ package org.unicode.tools; +import com.ibm.icu.text.Transliterator; import java.io.IOException; import java.io.PrintWriter; - import org.unicode.cldr.draft.FileUtilities; -import org.unicode.cldr.util.TransliteratorUtilities; - -import com.ibm.icu.text.Transliterator; public class FixNcrs { // TODO add command line support later public static void main(String[] args) throws IOException { -// String relativeFileName = FileUtilities.getRelativeFileName(FixNcrs.class, "entity.txt/../"); -// TransliteratorUtilities.registerTransliteratorFromFile(relativeFileName, "entity"); + // String relativeFileName = FileUtilities.getRelativeFileName(FixNcrs.class, + // "entity.txt/../"); + // TransliteratorUtilities.registerTransliteratorFromFile(relativeFileName, + // "entity"); Transliterator te = Transliterator.getInstance("hex-any/xml; hex-any/xml10"); String dir = "/Users/markdavis/eclipse-workspace/unicode-draft/emoji/"; - + try (PrintWriter out = FileUtilities.openUTF8Writer(dir, "frequency2.html")) { for (String line : FileUtilities.in(dir, "frequency.html")) { String original = line; if (line.contains("&")) { - String newLine = te.transform(line) - .replace("‍", "\u200D") - .replace("…", "\u2026") - .replace("♠", "\u2660") - .replace("¼", "\u00BC") - .replace("½", "\u00BD") - .replace(" ", "\u00A0") - .replace("♥", "\u2665") - .replace("♣", "\u2663") - 
.replace("↔", "\u2194") - .replace("♦", "\u2666") - ; + String newLine = + te.transform(line) + .replace("‍", "\u200D") + .replace("…", "\u2026") + .replace("♠", "\u2660") + .replace("¼", "\u00BC") + .replace("½", "\u00BD") + .replace(" ", "\u00A0") + .replace("♥", "\u2665") + .replace("♣", "\u2663") + .replace("↔", "\u2194") + .replace("♦", "\u2666"); // System.out.println(original + " => " + line); - line = newLine; + line = newLine; } if (line.contains("&")) { System.out.println(original + " => " + line); diff --git a/unicodetools/src/main/java/org/unicode/tools/FixedProps.java b/unicodetools/src/main/java/org/unicode/tools/FixedProps.java index 4d77ef9a4..9bf9ce253 100644 --- a/unicodetools/src/main/java/org/unicode/tools/FixedProps.java +++ b/unicodetools/src/main/java/org/unicode/tools/FixedProps.java @@ -1,5 +1,11 @@ package org.unicode.tools; +import com.google.common.base.Splitter; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.EntryRange; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; @@ -12,7 +18,6 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.draft.ScriptMetadata; import org.unicode.cldr.draft.ScriptMetadata.IdUsage; @@ -33,26 +38,24 @@ import org.unicode.text.utility.UnicodeSetParser; import org.unicode.text.utility.Utility; -import com.google.common.base.Splitter; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.EntryRange; - - public class FixedProps { - private static final Set SINGLETON_INHERITED = singleton(Script_Values.Inherited); + private static final Set SINGLETON_INHERITED = + singleton(Script_Values.Inherited); private static final 
Set SINGLETON_COMMON = singleton(Script_Values.Common); private static final String VERSION = "9.0.0"; private static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(VERSION); - private static final UnicodeMap AGE = iup.loadEnum(UcdProperty.Age, Age_Values.class); - //private static final UnicodeMap sc = iup.loadEnum(UcdProperty.Script, Script_Values.class); - private static final UnicodeMap> scx = iup.loadEnumSet(UcdProperty.Script_Extensions, Script_Values.class); + private static final UnicodeMap AGE = + iup.loadEnum(UcdProperty.Age, Age_Values.class); + // private static final UnicodeMap sc = iup.loadEnum(UcdProperty.Script, + // Script_Values.class); + private static final UnicodeMap> scx = + iup.loadEnumSet(UcdProperty.Script_Extensions, Script_Values.class); private static final UnicodeMap name = iup.load(UcdProperty.Name); private static final UnicodeMap emoji = iup.loadEnum(UcdProperty.Emoji, Binary.class); - private static final UnicodeMap dt = iup.loadEnum(UcdProperty.Decomposition_Type, Decomposition_Type_Values.class); - private static final UnicodeMap gc = iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); + private static final UnicodeMap dt = + iup.loadEnum(UcdProperty.Decomposition_Type, Decomposition_Type_Values.class); + private static final UnicodeMap gc = + iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); private static final Splitter semi = Splitter.on(';').trimResults(); private static final Splitter hash = Splitter.on('#').trimResults(); @@ -65,8 +68,8 @@ public static final class FixedNfkd { private static final UnicodeMap fixNfkd = new UnicodeMap<>(); private static final UnicodeMap fixDt = new UnicodeMap<>(); private static final UnicodeSet changes = new UnicodeSet(); - static { + static { UnicodeSetParser hp = new UnicodeSetParser(true); UnicodeSet sourceRanges = new UnicodeSet(); StringBuilder targetString = new StringBuilder(); @@ -94,11 +97,11 @@ public static final 
class FixedNfkd { } } // close the set - UnicodeSet include = new UnicodeSet(0,0x10ffff) - .removeAll(gc.getSet(General_Category_Values.Unassigned)) - .removeAll(gc.getSet(General_Category_Values.Surrogate)) - .removeAll(gc.getSet(General_Category_Values.Private_Use)) - ; + UnicodeSet include = + new UnicodeSet(0, 0x10ffff) + .removeAll(gc.getSet(General_Category_Values.Unassigned)) + .removeAll(gc.getSet(General_Category_Values.Surrogate)) + .removeAll(gc.getSet(General_Category_Values.Private_Use)); boolean madeChange; do { madeChange = false; // have to repeat? @@ -117,10 +120,12 @@ public static final class FixedNfkd { fixNfkd.freeze(); fixDt.freeze(); } + public static String normalize(String source) { String result = fixNfkd.transform(source); return result != null ? result : nfkd.normalize(source); } + public static boolean isNormalized(int source) { return !changes.contains(source); } @@ -129,10 +134,11 @@ public static boolean isNormalized(int source) { public static final class FixedGeneralCategory { static final UnicodeMap generalCategoryRev = new UnicodeMap<>(); private static final UnicodeSet changes = new UnicodeSet(); + static { generalCategoryRev.putAll(gc); EnumSet temp = EnumSet.noneOf(General_Category_Values.class); - UnicodeSet allButOther = new UnicodeSet(0,0x10FFFF); + UnicodeSet allButOther = new UnicodeSet(0, 0x10FFFF); for (General_Category_Values value : PropertyValueSets.CONTROL) { allButOther.removeAll(gc.getSet(value)); } @@ -145,7 +151,8 @@ public static final class FixedGeneralCategory { continue; } // 〔S〕 - EnumSet gcSet = getSetValues(nfkded, generalCategoryRev, temp); + EnumSet gcSet = + getSetValues(nfkded, generalCategoryRev, temp); if (gcSet.size() == 1) { General_Category_Values first = gcSet.iterator().next(); if (first != generalCategoryRev.get(s)) { @@ -157,15 +164,19 @@ public static final class FixedGeneralCategory { generalCategoryRev.freeze(); changes.freeze(); } + public static General_Category_Values get(String source) 
{ return generalCategoryRev.get(source); } + public static General_Category_Values get(int source) { return generalCategoryRev.get(source); } + public static UnicodeSet getSet(General_Category_Values gcv) { return generalCategoryRev.getSet(gcv); } + public static UnicodeSet getSet(Collection gcv) { UnicodeSet punctuation = new UnicodeSet(); for (General_Category_Values v : gcv) { @@ -179,14 +190,16 @@ public static final class FixedScriptExceptions { static final UnicodeMap> scriptRev = new UnicodeMap<>(); static final UnicodeMap reasons = new UnicodeMap<>(); static final UnicodeSet changes = new UnicodeSet(); + static { Splitter semi = Splitter.on(';').trimResults(); scriptRev.putAll(scx); // add mixed cases - UnicodeSet inheritedAndCommon = new UnicodeSet(scx.getSet(SINGLETON_COMMON)) - .addAll(scx.getSet(SINGLETON_INHERITED)) - .freeze(); + UnicodeSet inheritedAndCommon = + new UnicodeSet(scx.getSet(SINGLETON_COMMON)) + .addAll(scx.getSet(SINGLETON_INHERITED)) + .freeze(); Normalizer nfkd = new Normalizer(Normalizer.NFKD, VERSION); EnumSet temp = EnumSet.noneOf(General_Category_Values.class); @@ -205,8 +218,8 @@ public static final class FixedScriptExceptions { Set> altern = IDENTIFIER_INFO.getAlternates(); scripts.remove(Script_Values.Common); scripts.remove(Script_Values.Inherited); - if (altern.size() == 0 - && scripts.size() == 1 + if (altern.size() == 0 + && scripts.size() == 1 && !scripts.equals(scriptRev.get(s))) { scriptRev.put(s, singleton(scripts)); reasons.put(s, "ScriptDecomp"); @@ -232,8 +245,7 @@ public static final class FixedScriptExceptions { Script_Values scriptValue = Script_Values.forName(scriptCodeName); Set old = scriptRev.get(cp); - if (!old.equals(SINGLETON_COMMON) - && !old.equals(SINGLETON_INHERITED)) { + if (!old.equals(SINGLETON_COMMON) && !old.equals(SINGLETON_INHERITED)) { continue; } final Set scriptSingleton = singleton(scriptValue); @@ -249,27 +261,30 @@ public static final class FixedScriptExceptions { reasons.freeze(); 
changes.freeze(); } + public static Set get(String source) { return scriptRev.get(source); } + public static Set get(int source) { return scriptRev.get(source); } + public static UnicodeSet getSet(Set values) { return scriptRev.getSet(values); } + public static Collection> values() { return scriptRev.values(); } } - - private static Set singleton(Script_Values script_Values) { return Collections.singleton(script_Values); } - private static > U getSetValues(String source, UnicodeMap map, U target) { + private static > U getSetValues( + String source, UnicodeMap map, U target) { target.clear(); for (int cp : CharSequences.codePoints(source)) { T result = map.get(cp); @@ -295,32 +310,46 @@ private static Script_Values getScript(final String scriptCode) { Script_Values scriptValue = Script_Values.forName(scriptCode); if (scriptValue == null) { switch (scriptCode) { - case "Punc": scriptValue = Script_Values.Japanese; break; - case "Symm": scriptValue = Script_Values.Math_Symbols; break; - case "Syme": scriptValue = Script_Values.Emoji_Symbols; break; - case "Symo": scriptValue = Script_Values.Other_Symbols; break; - default: throw new IllegalArgumentException(); + case "Punc": + scriptValue = Script_Values.Japanese; + break; + case "Symm": + scriptValue = Script_Values.Math_Symbols; + break; + case "Syme": + scriptValue = Script_Values.Emoji_Symbols; + break; + case "Symo": + scriptValue = Script_Values.Other_Symbols; + break; + default: + throw new IllegalArgumentException(); } } return scriptValue; } + private static String show(Script_Values s) { switch (s) { - case Japanese: return "Punc"; - // case Korean: return "Zmth"; - // case Han_with_Bopomofo: return "Zsye"; - // case Katakana_Or_Hiragana: return "Zsym"; - default: return s.getShortName(); + case Japanese: + return "Punc"; + // case Korean: return "Zmth"; + // case Han_with_Bopomofo: return "Zsye"; + // case Katakana_Or_Hiragana: return "Zsym"; + default: + return s.getShortName(); } } private static String 
showLong(Script_Values s) { switch (s) { - case Japanese: return "Punctuation"; - // case Korean: return "Math Symbol"; - // case Han_with_Bopomofo: return "Emoji"; - // case Katakana_Or_Hiragana: return "Other Symbol"; - default: return s.toString(); + case Japanese: + return "Punctuation"; + // case Korean: return "Math Symbol"; + // case Han_with_Bopomofo: return "Emoji"; + // case Katakana_Or_Hiragana: return "Other Symbol"; + default: + return s.toString(); } } @@ -357,19 +386,21 @@ public static void main(String[] args) throws IOException { } temp.put(s, hex + "\t;\t" + dtv.getShortName()); literalTemp.put(s, newValue + "\t;\t" + dtv.getShortName()); - } + } try (PrintWriter out = FileUtilities.openUTF8Writer(GEN_FIXED_DIR, "FixedNfkd.txt")) { showmap(out, FixedNfkd.changes, temp, literalTemp); } - try (PrintWriter out = FileUtilities.openUTF8Writer(GEN_FIXED_DIR, "FixedNfkdDiff.txt")) { + try (PrintWriter out = + FileUtilities.openUTF8Writer(GEN_FIXED_DIR, "FixedNfkdDiff.txt")) { showmap(out, diff.keySet(), diff, literalDiff); } } System.out.println("\n# Fixed GC.\n"); { - final UnicodeMap gc = iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); + final UnicodeMap gc = + iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); UnicodeMap diff = new UnicodeMap<>(); for (String s : FixedGeneralCategory.changes) { final General_Category_Values oldValue = gc.get(s); @@ -379,7 +410,11 @@ public static void main(String[] args) throws IOException { } } try (PrintWriter out = FileUtilities.openUTF8Writer(GEN_FIXED_DIR, "FixedGc.txt")) { - showmap(out, FixedGeneralCategory.changes, FixedGeneralCategory.generalCategoryRev, FixedGeneralCategory.generalCategoryRev); + showmap( + out, + FixedGeneralCategory.changes, + FixedGeneralCategory.generalCategoryRev, + FixedGeneralCategory.generalCategoryRev); } try (PrintWriter out = FileUtilities.openUTF8Writer(GEN_FIXED_DIR, "FixedGcDiff.txt")) { showmap(out, diff.keySet(), diff, diff); 
@@ -388,7 +423,8 @@ public static void main(String[] args) throws IOException { System.out.println("\n# Fixed Script.\n"); { - final UnicodeMap> scx = iup.loadEnumSet(UcdProperty.Script_Extensions, Script_Values.class); + final UnicodeMap> scx = + iup.loadEnumSet(UcdProperty.Script_Extensions, Script_Values.class); UnicodeMap diff = new UnicodeMap<>(); for (String s : FixedScriptExceptions.changes) { final Set oldValue = scx.get(s); @@ -398,18 +434,22 @@ public static void main(String[] args) throws IOException { } } try (PrintWriter out = FileUtilities.openUTF8Writer(GEN_FIXED_DIR, "FixedScx.txt")) { - showmap(out, FixedScriptExceptions.changes, FixedScriptExceptions.scriptRev, FixedScriptExceptions.scriptRev); + showmap( + out, + FixedScriptExceptions.changes, + FixedScriptExceptions.scriptRev, + FixedScriptExceptions.scriptRev); } - try (PrintWriter out = FileUtilities.openUTF8Writer(GEN_FIXED_DIR, "FixedScxDiff.txt")) { + try (PrintWriter out = + FileUtilities.openUTF8Writer(GEN_FIXED_DIR, "FixedScxDiff.txt")) { showmap(out, diff.keySet(), diff, diff); } } - // System.out.println("\n# Changed by Ken's data.\n"); // showMap(FixedScriptExceptions.kensRev); - //showOld(); + // showOld(); showGrowthTable(); } @@ -423,15 +463,22 @@ private static void showOld() { // showMap(FixedScriptExceptions.diff); System.out.println("\n# Remaining Common+Inherited - spaces, controls, etc.\n"); - UnicodeSet us = new UnicodeSet(FixedScriptExceptions.getSet(SINGLETON_COMMON)) - .addAll(FixedScriptExceptions.scriptRev.getSet(SINGLETON_INHERITED)) - .removeAll(FixedGeneralCategory.getSet(General_Category_Values.Space_Separator)) - .removeAll(FixedGeneralCategory.getSet(General_Category_Values.Line_Separator)) - .removeAll(FixedGeneralCategory.getSet(General_Category_Values.Paragraph_Separator)) - .removeAll(FixedGeneralCategory.getSet(General_Category_Values.Format)) - .removeAll(FixedGeneralCategory.getSet(General_Category_Values.Control)) - 
.removeAll(iup.loadEnum(UcdProperty.Variation_Selector, Binary.class).getSet(Binary.Yes)) - ; + UnicodeSet us = + new UnicodeSet(FixedScriptExceptions.getSet(SINGLETON_COMMON)) + .addAll(FixedScriptExceptions.scriptRev.getSet(SINGLETON_INHERITED)) + .removeAll( + FixedGeneralCategory.getSet( + General_Category_Values.Space_Separator)) + .removeAll( + FixedGeneralCategory.getSet(General_Category_Values.Line_Separator)) + .removeAll( + FixedGeneralCategory.getSet( + General_Category_Values.Paragraph_Separator)) + .removeAll(FixedGeneralCategory.getSet(General_Category_Values.Format)) + .removeAll(FixedGeneralCategory.getSet(General_Category_Values.Control)) + .removeAll( + iup.loadEnum(UcdProperty.Variation_Selector, Binary.class) + .getSet(Binary.Yes)); for (General_Category_Values value : General_Category_Values.values()) { UnicodeSet us2 = new UnicodeSet(FixedGeneralCategory.getSet(value)).retainAll(us); if (us2.size() != 0) { @@ -443,10 +490,8 @@ private static void showOld() { UnicodeSet a = emoji.getSet(Binary.Yes); UnicodeSet b = FixedScriptExceptions.getSet(singleton(Script_Values.Emoji_Symbols)); - UnicodeSet[] diffs = new UnicodeSet[] { - new UnicodeSet(a).removeAll(b), - new UnicodeSet(b).removeAll(a) - }; + UnicodeSet[] diffs = + new UnicodeSet[] {new UnicodeSet(a).removeAll(b), new UnicodeSet(b).removeAll(a)}; String[] names = {"Emoji-Zsye", "Zsye-Emoji"}; int count = 0; for (UnicodeSet value : diffs) { @@ -471,13 +516,13 @@ private static void showGrowthTable() { } System.out.print("Script\tID Usage"); - for (Age_Values av : Age_Values.values()){ + for (Age_Values av : Age_Values.values()) { if (av == av.Unassigned) { continue; } System.out.print("\t" + av.getShortName()); } - for (Age_Values av : Age_Values.values()){ + for (Age_Values av : Age_Values.values()) { if (av == av.Unassigned) { continue; } @@ -493,23 +538,23 @@ private static void showGrowthTable() { } System.out.print(showLong(script) + "\t"); Info info = 
ScriptMetadata.getInfo(show(script)); - switch(info == null ? IdUsage.RECOMMENDED : info.idUsage) { - case EXCLUSION: - System.out.print("Historic"); - break; - case ASPIRATIONAL: - case LIMITED_USE: - System.out.print("Limited Use"); - break; - case RECOMMENDED: - System.out.print("Recommended"); - break; - case UNKNOWN: - default: - break; + switch (info == null ? IdUsage.RECOMMENDED : info.idUsage) { + case EXCLUSION: + System.out.print("Historic"); + break; + case ASPIRATIONAL: + case LIMITED_USE: + System.out.print("Limited Use"); + break; + case RECOMMENDED: + System.out.print("Recommended"); + break; + case UNKNOWN: + default: + break; } - for (Age_Values av : Age_Values.values()){ + for (Age_Values av : Age_Values.values()) { us = AGE.getSet(av); UnicodeSet result = new UnicodeSet(values).retainAll(us); System.out.print("\t" + result.size()); @@ -526,8 +571,12 @@ private static void showMapSorted(UnicodeMap unicodeMap) { showSet(value, us); } } - - private static void showmap(PrintWriter printWriter, UnicodeSet inclusion, UnicodeMap unicodeMap, UnicodeMap literalMap) { + + private static void showmap( + PrintWriter printWriter, + UnicodeSet inclusion, + UnicodeMap unicodeMap, + UnicodeMap literalMap) { Splitter tabSplitter = Splitter.on('\t'); if (literalMap == null) { literalMap = unicodeMap; @@ -553,7 +602,7 @@ private static void showmap(PrintWriter printWriter, UnicodeSet inclusion, U } Tabber tabber = new Tabber.MonoTabber(); for (int i = 0; i < maxItems; ++i) { - tabber.add(max[i]+1, Tabber.LEFT); + tabber.add(max[i] + 1, Tabber.LEFT); } for (String line : temp) { printWriter.println(tabber.process(line)); @@ -564,30 +613,49 @@ private static void showmap(PrintWriter printWriter, UnicodeSet inclusion, U } private static String getLineValues(final String key, final T value, final T literalValue) { - return Utility.hex(key) - + "\t;\t" + value - + "\t # (" + key + " → " + literalValue.toString().replace('\t',' ') + ")\t" + return Utility.hex(key) + + 
"\t;\t" + + value + + "\t # (" + + key + + " → " + + literalValue.toString().replace('\t', ' ') + + ")\t" + name.get(key); } private static void showSet(String value, UnicodeSet us) { - Tabber tabber = new Tabber.MonoTabber() - .add(19, Tabber.LEFT) - .add(value.length() + 1, Tabber.LEFT) - ; + Tabber tabber = + new Tabber.MonoTabber().add(19, Tabber.LEFT).add(value.length() + 1, Tabber.LEFT); for (EntryRange range : us.ranges()) { if (range.codepoint == range.codepointEnd) { - System.out.println(tabber.process( - "U+" + Utility.hex(range.codepoint) - + " ;\t" + value - + "\t # (" + UTF16.valueOf(range.codepoint) + ") " - + name.get(range.codepoint))); + System.out.println( + tabber.process( + "U+" + + Utility.hex(range.codepoint) + + " ;\t" + + value + + "\t # (" + + UTF16.valueOf(range.codepoint) + + ") " + + name.get(range.codepoint))); } else { - System.out.println(tabber.process( - "U+" + Utility.hex(range.codepoint) + "..U+" + Utility.hex(range.codepointEnd) - + " ;\t" + value - + "\t # (" + UTF16.valueOf(range.codepoint) + ".." + UTF16.valueOf(range.codepointEnd) + ") " - + name.get(range.codepoint) + ".." + name.get(range.codepointEnd))); + System.out.println( + tabber.process( + "U+" + + Utility.hex(range.codepoint) + + "..U+" + + Utility.hex(range.codepointEnd) + + " ;\t" + + value + + "\t # (" + + UTF16.valueOf(range.codepoint) + + ".." + + UTF16.valueOf(range.codepointEnd) + + ") " + + name.get(range.codepoint) + + ".." 
+ + name.get(range.codepointEnd))); } } System.out.println("# TOTAL code points:\t" + us.size() + "\n"); diff --git a/unicodetools/src/main/java/org/unicode/tools/GenerateNormalizeForMatch.java b/unicodetools/src/main/java/org/unicode/tools/GenerateNormalizeForMatch.java index 709c24669..5b71893db 100644 --- a/unicodetools/src/main/java/org/unicode/tools/GenerateNormalizeForMatch.java +++ b/unicodetools/src/main/java/org/unicode/tools/GenerateNormalizeForMatch.java @@ -1,5 +1,18 @@ package org.unicode.tools; +import com.google.common.base.Objects; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMap.Builder; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UTF16.StringComparator; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.EntryRange; import java.io.IOException; import java.io.PrintWriter; import java.util.Collection; @@ -13,7 +26,6 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.With; @@ -32,57 +44,59 @@ import org.unicode.text.utility.Settings; import org.unicode.tools.NormalizeForMatch.SpecialReason; -import com.google.common.base.Objects; -import com.google.common.base.Splitter; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableMap.Builder; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UTF16.StringComparator; -import com.ibm.icu.text.UnicodeSet; 
-import com.ibm.icu.text.UnicodeSet.EntryRange; - public class GenerateNormalizeForMatch { - private static final String dir = Settings.CLDR.BASE_DIRECTORY + "Google Drive/workspace/DATA/frequency/"; + private static final String dir = + Settings.CLDR.BASE_DIRECTORY + "Google Drive/workspace/DATA/frequency/"; private static final String GOOGLE_FOLDING_TXT = "google_folding.txt"; private static final Pattern SPACES = Pattern.compile("[,\\s]+"); - private static final Comparator CODEPOINT = new StringComparator(true, false, StringComparator.FOLD_CASE_DEFAULT); + private static final Comparator CODEPOINT = + new StringComparator(true, false, StringComparator.FOLD_CASE_DEFAULT); private static final Comparator UCA; static { org.unicode.text.UCA.UCA uca_raw = org.unicode.text.UCA.UCA.buildCollator(null); // uca_raw.setDecomposition(Collator.CANONICAL_DECOMPOSITION); - UCA = new MultiComparator((Comparator)(Comparator) uca_raw, CODEPOINT); + UCA = new MultiComparator((Comparator) (Comparator) uca_raw, CODEPOINT); } + private static final UnicodeMap COLLATION_MAP = CollatorEquivalences.COLLATION_MAP; - private static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(Default.ucdVersion()); + private static final IndexUnicodeProperties iup = + IndexUnicodeProperties.make(Default.ucdVersion()); + + static UnicodeSet.XSymbolTable NO_PROPS = + new UnicodeSet.XSymbolTable() { + @Override + public boolean applyPropertyAlias( + String propertyName, String propertyValue, UnicodeSet result) { + throw new IllegalArgumentException( + "Don't use any ICU Unicode Properties! " + + propertyName + + "=" + + propertyValue); + } + ; + }; - static UnicodeSet.XSymbolTable NO_PROPS = new UnicodeSet.XSymbolTable() { - @Override - public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { - throw new IllegalArgumentException("Don't use any ICU Unicode Properties! 
" + propertyName + "=" + propertyValue); - }; - }; static { UnicodeSet.setDefaultXSymbolTable(NO_PROPS); } - private static final UnicodeMap GC = iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); - private static final UnicodeMap SC = iup.loadEnum(UcdProperty.Script, Script_Values.class); + private static final UnicodeMap GC = + iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); + private static final UnicodeMap SC = + iup.loadEnum(UcdProperty.Script, Script_Values.class); private static final UnicodeMap cpToName = iup.load(UcdProperty.Name); private static final UnicodeSet NO_NAME = cpToName.getSet(null); - private static final UnicodeSet DI = iup.loadEnumSet(UcdProperty.Default_Ignorable_Code_Point, Binary.Yes); + private static final UnicodeSet DI = + iup.loadEnumSet(UcdProperty.Default_Ignorable_Code_Point, Binary.Yes); private static final UnicodeMap cpToNFKCCF = iup.load(UcdProperty.NFKC_Casefold); private static final UnicodeMap cpToNFKC = new UnicodeMap<>(); + static { Normalizer nfkc = Default.nfkc(); for (int i = 0; i <= 0x10FFFF; ++i) { @@ -92,15 +106,21 @@ public boolean applyPropertyAlias(String propertyName, String propertyValue, Uni } cpToNFKC.freeze(); } + private static final UnicodeMap cpToLower = iup.load(UcdProperty.Lowercase_Mapping); - private static final UnicodeMap cpToSimpleLower = iup.load(UcdProperty.Simple_Lowercase_Mapping); + private static final UnicodeMap cpToSimpleLower = + iup.load(UcdProperty.Simple_Lowercase_Mapping); private static final UnicodeSet UNASSIGNED = GC.getSet(General_Category_Values.Unassigned); - private static final UnicodeMap DT = iup.loadEnum(UcdProperty.Decomposition_Type, Decomposition_Type_Values.class); - private static final UnicodeMap AGE = iup.loadEnum(UcdProperty.Age, Age_Values.class); - private static final UnicodeMap BLOCK = iup.loadEnum(UcdProperty.Block, Block_Values.class); + private static final UnicodeMap DT = + 
iup.loadEnum(UcdProperty.Decomposition_Type, Decomposition_Type_Values.class); + private static final UnicodeMap AGE = + iup.loadEnum(UcdProperty.Age, Age_Values.class); + private static final UnicodeMap BLOCK = + iup.loadEnum(UcdProperty.Block, Block_Values.class); private static final UnicodeSet TAGS = BLOCK.getSet(Block_Values.Tags); - private static final UnicodeSet NFKCCF_SET = iup.loadEnumSet(UcdProperty.Changes_When_NFKC_Casefolded, Binary.Yes); + private static final UnicodeSet NFKCCF_SET = + iup.loadEnumSet(UcdProperty.Changes_When_NFKC_Casefolded, Binary.Yes); private static final Normalizer nfc = Default.nfc(); // Results @@ -109,35 +129,42 @@ public boolean applyPropertyAlias(String propertyName, String propertyValue, Uni private static final UnicodeMap TRIAL_BASE = new UnicodeMap<>(); private static final UnicodeMap> REASONS = new UnicodeMap<>(); private static final UnicodeMap> REASONS_BASE = new UnicodeMap<>(); - private static final NormalizeForMatch ADDITIONS_TO_NFKCCF = NormalizeForMatch.load(null, "XNFKCCF-Curated.txt", true); + private static final NormalizeForMatch ADDITIONS_TO_NFKCCF = + NormalizeForMatch.load(null, "XNFKCCF-Curated.txt", true); private static final NormalizeForMatch ADDITIONS_TO_NFKC = - NormalizeForMatch.load(Settings.UnicodeTools.DATA_DIR + "cldr/", "NFXC-Curated.txt", true); - - - private static final UnicodeSet HANGUL_COMPAT_minus_DI_CN - = new UnicodeSet(iup.loadEnumSet(UcdProperty.Block, Block_Values.Hangul_Compatibility_Jamo)) - .removeAll(DI) - .removeAll(UNASSIGNED) - .freeze(); - - private static final UnicodeSet CN_CS_CO = PropertyValueSets.getSet(GC, - General_Category_Values.Unassigned, - General_Category_Values.Surrogate, - General_Category_Values.Private_Use); - //"[[:Cn:][:Cs:][:Co:]]").freeze(); // -[:di:] - - private static final UnicodeSet SPECIAL_DECOMP_TYPES = PropertyValueSets.getSet(DT, - Decomposition_Type_Values.Square, - Decomposition_Type_Values.Fraction); + NormalizeForMatch.load( + 
Settings.UnicodeTools.DATA_DIR + "cldr/", "NFXC-Curated.txt", true); + + private static final UnicodeSet HANGUL_COMPAT_minus_DI_CN = + new UnicodeSet( + iup.loadEnumSet( + UcdProperty.Block, Block_Values.Hangul_Compatibility_Jamo)) + .removeAll(DI) + .removeAll(UNASSIGNED) + .freeze(); + + private static final UnicodeSet CN_CS_CO = + PropertyValueSets.getSet( + GC, + General_Category_Values.Unassigned, + General_Category_Values.Surrogate, + General_Category_Values.Private_Use); + // "[[:Cn:][:Cs:][:Co:]]").freeze(); // -[:di:] + + private static final UnicodeSet SPECIAL_DECOMP_TYPES = + PropertyValueSets.getSet( + DT, Decomposition_Type_Values.Square, Decomposition_Type_Values.Fraction); // new UnicodeSet("[" // + "[:dt=Square:]" // + "[:dt=Fraction:]" // + "]") - private static final UnicodeSet NOCHANGE_DECOMP_TYPES = PropertyValueSets.getSet(DT, - Decomposition_Type_Values.Super, - Decomposition_Type_Values.Sub, - Decomposition_Type_Values.Vertical); + private static final UnicodeSet NOCHANGE_DECOMP_TYPES = + PropertyValueSets.getSet( + DT, + Decomposition_Type_Values.Super, + Decomposition_Type_Values.Sub, + Decomposition_Type_Values.Vertical); // new UnicodeSet("[" // + "[:dt=Super:]" // + "[:dt=Sub:]" @@ -150,16 +177,20 @@ public boolean applyPropertyAlias(String propertyName, String propertyValue, Uni // + "]") // .freeze(); - private static final UnicodeSet COMBINING = PropertyValueSets.getSet(GC, PropertyValueSets.MARK); + private static final UnicodeSet COMBINING = + PropertyValueSets.getSet(GC, PropertyValueSets.MARK); // new UnicodeSet("[:m:]").freeze(); - private static final UnicodeSet HIRAGANA = SC.getSet(Script_Values.Hiragana); // new UnicodeSet("[:sc=Hiragana:]").freeze(); - private static final UnicodeSet DECIMAL = PropertyValueSets.getSet(GC, PropertyValueSets.NUMBER); + private static final UnicodeSet HIRAGANA = + SC.getSet(Script_Values.Hiragana); // new UnicodeSet("[:sc=Hiragana:]").freeze(); + private static final UnicodeSet DECIMAL = + 
PropertyValueSets.getSet(GC, PropertyValueSets.NUMBER); // new UnicodeSet("[:N:]").freeze(); - private static final Map NAME_TO_CP; + private static final Map NAME_TO_CP; + static { - Builder builder = ImmutableMap.builder(); + Builder builder = ImmutableMap.builder(); for (EntryRange entry : NO_NAME.ranges()) { for (int cp = entry.codepoint; cp < entry.codepointEnd; ++cp) { final String name = iup.getName(UTF16.valueOf(cp), " + "); @@ -173,7 +204,8 @@ public boolean applyPropertyAlias(String propertyName, String propertyValue, Uni } } // add fake numbers that aren't handled with the number hack above - // see also http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:name = / NUMBER /:]&[:scx=common:] + // see also http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:name = / NUMBER + // /:]&[:scx=common:] builder.put("NUMBER SIXTY", "60"); builder.put("NUMBER SEVENTY", "70"); builder.put("NUMBER EIGHTY", "80"); @@ -191,38 +223,64 @@ public static void main(String[] args) throws IOException { compareTrial(false); compareTrial(true); -// UnicodeMap trialWithoutCase = new UnicodeMap<>(); -// for (String s : TRIAL.keySet()) { -// String trial = TRIAL.get(s); -// String lower = Default.ucd().getCase(s, UCD_Types.FULL, UCD_Types.LOWER); -// if (!trial.equals(lower)) { -// trialWithoutCase.put(s, trial); -// } -// } -// trialWithoutCase.freeze(); - showSimpleData(TRIAL, REASONS, "XNFKCCF-NFKCCF.txt", "# Cases where XNFKCCF differs from NFKCCF.", cpToNFKCCF); - showSimpleData(TRIAL_BASE, REASONS_BASE, "NFXC-NFKC.txt", "# Cases where NFXC differs from NFKC.", cpToNFKC); - showSimpleData(TRIAL_BASE, REASONS_BASE, "NFXC-Curated.txt", "# Curated file of exceptions", null); - - - showSimpleData(TRIAL, REASONS, "XNFKCCF-NFKC2.txt", "# Cases where XNFKCCF differs from NFKC.", cpToNFKC); - - showSimpleData(N4M, REASONS, "N4M-XNFKCCF.txt", "# Cases where N4M differs from XNFKCCF", TRIAL); + // UnicodeMap trialWithoutCase = new UnicodeMap<>(); + // for (String s : 
TRIAL.keySet()) { + // String trial = TRIAL.get(s); + // String lower = Default.ucd().getCase(s, UCD_Types.FULL, UCD_Types.LOWER); + // if (!trial.equals(lower)) { + // trialWithoutCase.put(s, trial); + // } + // } + // trialWithoutCase.freeze(); + showSimpleData( + TRIAL, + REASONS, + "XNFKCCF-NFKCCF.txt", + "# Cases where XNFKCCF differs from NFKCCF.", + cpToNFKCCF); + showSimpleData( + TRIAL_BASE, + REASONS_BASE, + "NFXC-NFKC.txt", + "# Cases where NFXC differs from NFKC.", + cpToNFKC); + showSimpleData( + TRIAL_BASE, REASONS_BASE, "NFXC-Curated.txt", "# Curated file of exceptions", null); + + showSimpleData( + TRIAL, + REASONS, + "XNFKCCF-NFKC2.txt", + "# Cases where XNFKCCF differs from NFKC.", + cpToNFKC); + + showSimpleData( + N4M, REASONS, "N4M-XNFKCCF.txt", "# Cases where N4M differs from XNFKCCF", TRIAL); NormalizeForMatch curated = NormalizeForMatch.load(null, "XNFKCCF-Curated.txt", true); - - showSimpleData(curated.getSourceToTarget(), curated.getSourceToReason(), "XNFKCCF-Curated.txt", "# Curated file of exceptions", null); - NormalizeForMatch newCurated = NormalizeForMatch.load(Settings.UnicodeTools.DATA_DIR + "n4m/9.0.0/", "XNFKCCF-Curated.txt", true); + + showSimpleData( + curated.getSourceToTarget(), + curated.getSourceToReason(), + "XNFKCCF-Curated.txt", + "# Curated file of exceptions", + null); + NormalizeForMatch newCurated = + NormalizeForMatch.load( + Settings.UnicodeTools.DATA_DIR + "n4m/9.0.0/", "XNFKCCF-Curated.txt", true); checkNewCurated(curated, newCurated); - // private static final NormalizeForMatch ADDITIONS_TO_NFKCCF = NormalizeForMatch.load("XNFKCCF-Curated.txt"); + // private static final NormalizeForMatch ADDITIONS_TO_NFKCCF = + // NormalizeForMatch.load("XNFKCCF-Curated.txt"); computeCandidateFile(Age_Values.V10_0); - //if (true) return; + // if (true) return; // printData(); // showItemsIn(new UnicodeSet(N4M.keySet()).addAll(TRIAL.keySet())); } private static void checkNewCurated(NormalizeForMatch curated, NormalizeForMatch 
newCurated) { - UnicodeSet sources = new UnicodeSet(curated.getSourceToTarget().keySet()).addAll(newCurated.getSourceToTarget().keySet()); + UnicodeSet sources = + new UnicodeSet(curated.getSourceToTarget().keySet()) + .addAll(newCurated.getSourceToTarget().keySet()); int diffCount = 0; for (String s : sources) { String t1 = curated.getSourceToTarget().get(s); @@ -232,11 +290,13 @@ private static void checkNewCurated(NormalizeForMatch curated, NormalizeForMatch diffCount++; } } - System.out.println("Total diff from old to new Curated: " + diffCount + " out of " + sources.size()); + System.out.println( + "Total diff from old to new Curated: " + diffCount + " out of " + sources.size()); } private static void findExtraCaps() { - HashMap nameToCp = cpToName.addInverseTo(new HashMap()); + HashMap nameToCp = + cpToName.addInverseTo(new HashMap()); for (Entry entry : cpToName.entrySet()) { String cp = entry.getKey(); String lower = cpToSimpleLower.get(cp); @@ -249,33 +309,48 @@ private static void findExtraCaps() { int otherFirst = other.getRangeStart(0); final String otherCp = UTF16.valueOf(otherFirst); final String cpNkfccf = CldrUtility.ifNull(cpToNFKCCF.get(cp), cp); - final String otherCpNfkccf = CldrUtility.ifNull(cpToNFKCCF.get(otherCp), otherCp); - System.out.println((cpNkfccf.equals(otherCpNfkccf) ? "=" : "≠") - + "\t" + cp - + "\t" + Utility.hex(cp, 4, " ") - + "\t" + otherCp - + "\t" + Utility.hex(otherCp, 4, " ") - + "\t" + cpNkfccf - + "\t" + otherCpNfkccf - + "\t" + name - + "\t" + cpToName.get(otherFirst)); + final String otherCpNfkccf = + CldrUtility.ifNull(cpToNFKCCF.get(otherCp), otherCp); + System.out.println( + (cpNkfccf.equals(otherCpNfkccf) ? 
"=" : "≠") + + "\t" + + cp + + "\t" + + Utility.hex(cp, 4, " ") + + "\t" + + otherCp + + "\t" + + Utility.hex(otherCp, 4, " ") + + "\t" + + cpNkfccf + + "\t" + + otherCpNfkccf + + "\t" + + name + + "\t" + + cpToName.get(otherFirst)); } } } } private static void compareTrial(boolean ucaOnly) throws IOException { - try (PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "n4m/", "XNFKCCF" + (ucaOnly ? "-comp-uca" : "-comp") - + ".txt")) { - UnicodeSet interest = new UnicodeSet(N4M.keySet()) - .addAll(TRIAL.keySet()) - .addAll(NFKCCF_SET) - .addAll(COLLATION_MAP.keySet()) - .retainAll(new UnicodeSet(0,0x10FFFF)) - .removeAll(UNASSIGNED); - - out.println("#Code\tAge\tGC\tDT\tName\tN4M\tTrial\tNFKCCF\tUCA\tΔ\tΔ\tΔ\tN4M Hex\tTrial Hex\tNFKCCF Hex\tUCA Hex\tTrial Reasons"); - TreeSet> sorted = new TreeSet<>(); + try (PrintWriter out = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_DIR + "n4m/", + "XNFKCCF" + (ucaOnly ? "-comp-uca" : "-comp") + ".txt")) { + UnicodeSet interest = + new UnicodeSet(N4M.keySet()) + .addAll(TRIAL.keySet()) + .addAll(NFKCCF_SET) + .addAll(COLLATION_MAP.keySet()) + .retainAll(new UnicodeSet(0, 0x10FFFF)) + .removeAll(UNASSIGNED); + + out.println( + "#Code\tAge\tGC\tDT\tName\tN4M\tTrial\tNFKCCF\tUCA\tΔ\tΔ\tΔ\tN4M Hex\tTrial Hex\tNFKCCF Hex\tUCA Hex\tTrial Reasons"); + TreeSet> sorted = + new TreeSet<>(); for (String s : interest) { String nfkccf = Normalizer3.NFKCCF.normalize(s); String n4m = N4M.get(s); @@ -290,8 +365,10 @@ private static void compareTrial(boolean ucaOnly) throws IOException { if (colEquiv == null) { colEquiv = s; } - final boolean trial_n4m_nfkccfEqual = Objects.equal(trial, n4m) && Objects.equal(trial, nfkccf); - if (trial_n4m_nfkccfEqual && Objects.equal(trial, colEquiv)) { // all equal, we don't care + final boolean trial_n4m_nfkccfEqual = + Objects.equal(trial, n4m) && Objects.equal(trial, nfkccf); + if (trial_n4m_nfkccfEqual + && Objects.equal(trial, colEquiv)) { // all equal, we don't care 
continue; } if (ucaOnly != trial_n4m_nfkccfEqual) { @@ -299,30 +376,51 @@ private static void compareTrial(boolean ucaOnly) throws IOException { } String reasons = CollectionUtilities.join(REASONS.get(s), " + "); final int cp = s.codePointAt(0); - String line = "'" + Utility.hex(s) - + "\t'" + AGE.get(cp).getShortName() // UCharacter.getAge(cp).getVersionString(2, 2) - + "\t'" + GC.get(cp).getNames().getShortName() - + "\t'" + DT.get(cp).getNames().getShortName() - + "\t" + getName(s) - + "\t'" + showEmpty(n4m) - + "\t'" + showEmpty(trial) - + "\t'" + showEmpty(nfkccf) - + "\t'" + showEmpty(colEquiv) - + "\t" + (Objects.equal(n4m, trial) ? "" : "Tr≠N4M") - + "\t" + (Objects.equal(trial,nfkccf) ? "" : "Tr≠NF") - + "\t" + (Objects.equal(trial, colEquiv) ? "" : "Tr≠UCA") - + "\t'" + Utility.hex(n4m, 4, " ") - + "\t'" + Utility.hex(trial, 4, " ") - + "\t'" + Utility.hex(nfkccf, 4, " ") - + "\t'" + Utility.hex(colEquiv, 4, " ") - + "\t" + (reasons == null ? "" : reasons); - Row.R4 row = Row.of( - 100-GC.get(cp).ordinal(), - DT.get(cp), - (Objects.equal(n4m, trial) ? "a" : "b") + - (Objects.equal(trial,nfkccf) ? "a" : "b") + - (Objects.equal(trial, colEquiv) ? "a" : "b"), - line); + String line = + "'" + + Utility.hex(s) + + "\t'" + + AGE.get(cp) + .getShortName() // UCharacter.getAge(cp).getVersionString(2, + // 2) + + "\t'" + + GC.get(cp).getNames().getShortName() + + "\t'" + + DT.get(cp).getNames().getShortName() + + "\t" + + getName(s) + + "\t'" + + showEmpty(n4m) + + "\t'" + + showEmpty(trial) + + "\t'" + + showEmpty(nfkccf) + + "\t'" + + showEmpty(colEquiv) + + "\t" + + (Objects.equal(n4m, trial) ? "" : "Tr≠N4M") + + "\t" + + (Objects.equal(trial, nfkccf) ? "" : "Tr≠NF") + + "\t" + + (Objects.equal(trial, colEquiv) ? "" : "Tr≠UCA") + + "\t'" + + Utility.hex(n4m, 4, " ") + + "\t'" + + Utility.hex(trial, 4, " ") + + "\t'" + + Utility.hex(nfkccf, 4, " ") + + "\t'" + + Utility.hex(colEquiv, 4, " ") + + "\t" + + (reasons == null ? 
"" : reasons); + Row.R4 row = + Row.of( + 100 - GC.get(cp).ordinal(), + DT.get(cp), + (Objects.equal(n4m, trial) ? "a" : "b") + + (Objects.equal(trial, nfkccf) ? "a" : "b") + + (Objects.equal(trial, colEquiv) ? "a" : "b"), + line); sorted.add(row); } for (Row.R4 row : sorted) { @@ -347,16 +445,17 @@ private static String showEmpty(String source) { private static final String TEST_NAME_START = "NEGATIVE CIRCLED NUMBER"; private static final boolean SIMPLE = true; - private static void computeCandidateFile(Age_Values age) throws IOException { - UnicodeMap setToCheck = new UnicodeMap<>(); + UnicodeMap setToCheck = new UnicodeMap<>(); UnicodeRelation reasons = new UnicodeRelation<>(); - Matcher nameCheck = Pattern.compile( - "FINAL|MEDIAL|INITIAL" - + "|WIDE|WIDTH|NARROW|CIRCLE" - + "|SQUARE|CUBE|CAPITAL|OVER|NEGATIVE" - + "|RADICAL|INPUT SYMBOL" - + "|PARENTHESIS|PARENTHESIZED|BRACKET").matcher(""); + Matcher nameCheck = + Pattern.compile( + "FINAL|MEDIAL|INITIAL" + + "|WIDE|WIDTH|NARROW|CIRCLE" + + "|SQUARE|CUBE|CAPITAL|OVER|NEGATIVE" + + "|RADICAL|INPUT SYMBOL" + + "|PARENTHESIS|PARENTHESIZED|BRACKET") + .matcher(""); for (String source : AGE.getSet(age)) { String nfkccf = Normalizer3.NFKCCF.normalize(source); if (!source.equals(nfkccf)) { @@ -386,16 +485,22 @@ private static void computeCandidateFile(Age_Values age) throws IOException { } enum Status { - different, missing, extra, same; + different, + missing, + extra, + same; + static Status get(String source, String oldTarget, String newTarget) { - return oldTarget == null ? Status.extra - : newTarget == null ? Status.missing - : oldTarget.equals(newTarget) ? Status.same - : Status.different; + return oldTarget == null + ? Status.extra + : newTarget == null + ? Status.missing + : oldTarget.equals(newTarget) ? Status.same : Status.different; } } - private static void removeString(final String name, String cp, boolean hack, final String... 
stringsToFind) { + private static void removeString( + final String name, String cp, boolean hack, final String... stringsToFind) { int finalPos = name.indexOf(stringsToFind[0]); if (finalPos >= 0) { String newName = name; @@ -404,7 +509,10 @@ private static void removeString(final String name, String cp, boolean hack, fin } String otherCode = NAME_TO_CP.get(newName); if (otherCode != null) { - final String target = HANGUL_COMPAT_minus_DI_CN.contains(otherCode) ? otherCode : Normalizer3.NFKCCF.normalize(otherCode); + final String target = + HANGUL_COMPAT_minus_DI_CN.contains(otherCode) + ? otherCode + : Normalizer3.NFKCCF.normalize(otherCode); if (!Normalizer3.NFKCCF.normalize(cp).equals(target)) { X_FILE.put(cp, target); } @@ -417,14 +525,22 @@ private static void removeString(final String name, String cp, boolean hack, fin } } - private static void showSimpleData(UnicodeMap mapping, UnicodeMap reasons2, String filename, String header, - UnicodeMap skipIfSame) throws IOException { - try (PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "n4m/", filename)) { + private static void showSimpleData( + UnicodeMap mapping, + UnicodeMap reasons2, + String filename, + String header, + UnicodeMap skipIfSame) + throws IOException { + try (PrintWriter out = + FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "n4m/", filename)) { out.println(header); out.println("# Source; \tTarget; \tOther; \tReason(s); \tComments"); - UnicodeSet trialWithoutReason = new UnicodeSet(mapping.keySet()).removeAll(reasons2.keySet()); + UnicodeSet trialWithoutReason = + new UnicodeSet(mapping.keySet()).removeAll(reasons2.keySet()); if (!trialWithoutReason.isEmpty()) { - throw new IllegalArgumentException("Unexplained difference between TRIAL and REASONS: " + trialWithoutReason); + throw new IllegalArgumentException( + "Unexplained difference between TRIAL and REASONS: " + trialWithoutReason); } final Set values = reasons2.values(); Comparator comp = null; @@ -435,12 
+551,20 @@ private static void showSimpleData(UnicodeMap mapping, UnicodeMap sorted.addAll(values); for (T reason : sorted) { UnicodeSet set = reasons2.getSet(reason); - String reasons = reason instanceof Set ? CollectionUtilities.join((Set)reason, " + ") : reason.toString(); + String reasons = + reason instanceof Set + ? CollectionUtilities.join((Set) reason, " + ") + : reason.toString(); showSimpleSet(out, set, mapping, reasons, skipIfSame); } if (skipIfSame != null) { - showSimpleSet(out, new UnicodeSet(mapping.keySet()).complement(), mapping, "other", skipIfSame); + showSimpleSet( + out, + new UnicodeSet(mapping.keySet()).complement(), + mapping, + "other", + skipIfSame); } } // for (Entry entry : TRIAL.entrySet()) { @@ -454,14 +578,20 @@ private static void showSimpleData(UnicodeMap mapping, UnicodeMap // if (source.contains(DEBUG_PRINT)) { // int debug = 0; // } - // System.out.println(Utility.hex(source) + ";\t" + Utility.hex(target, 4, " ") + // System.out.println(Utility.hex(source) + ";\t" + Utility.hex(target, 4, " ") // + "\t # ( " + source + " → " + target + " )\t" - // + UCharacter.getName(source, " + ") + " → " + UCharacter.getName(target, " + ") + // + UCharacter.getName(source, " + ") + " → " + + // UCharacter.getName(target, " + ") // + "\t" + reason); // } } - private static void showSimpleSet(PrintWriter out, UnicodeSet set, UnicodeMap mapping, String reason, UnicodeMap skipIfSame) { + private static void showSimpleSet( + PrintWriter out, + UnicodeSet set, + UnicodeMap mapping, + String reason, + UnicodeMap skipIfSame) { boolean first = true; for (String source : set) { String target = mapping.get(source); @@ -482,13 +612,25 @@ private static void showSimpleSet(PrintWriter out, UnicodeSet set, UnicodeMa out.println("\n#@override reason=" + reason + "\n"); first = false; } - out.println(Utility.hex(source) - + ";\t" + Utility.hex(target, 4, " ") - + (skipIfSame == null ? 
"" : ";\t" + Utility.hex(toFilterIfSame, 4, " ")) - + ";\t" + reason - + "\t #" + getStringValues(AGE, source, ", ") - + "\t ( " + source + " → " + target + " )\t" - + getName(source) + " → " + getName(target)); + out.println( + Utility.hex(source) + + ";\t" + + Utility.hex(target, 4, " ") + + (skipIfSame == null + ? "" + : ";\t" + Utility.hex(toFilterIfSame, 4, " ")) + + ";\t" + + reason + + "\t #" + + getStringValues(AGE, source, ", ") + + "\t ( " + + source + + " → " + + target + + " )\t" + + getName(source) + + " → " + + getName(target)); } } @@ -498,17 +640,23 @@ private static void printData() { /** * Here we try to reverse engineer the derivation, starting with NFKCCasefold + * * @param normalizer3 TODO * @param additions TODO * @param trial TODO * @param reasons TODO */ - private static void computeTrial(Normalizer3 normalizer3, NormalizeForMatch additions, UnicodeMap trial, UnicodeMap> reasons) { + private static void computeTrial( + Normalizer3 normalizer3, + NormalizeForMatch additions, + UnicodeMap trial, + UnicodeMap> reasons) { UnicodeSet skipIfInMultiCodepointDecomp = new UnicodeSet("[\\u0020<>]"); UnicodeMap toSuper = new UnicodeMap<>(); - for (String s : DT.getSet(Decomposition_Type_Values.Super)) { // new UnicodeSet("[:dt=Super:]") + for (String s : + DT.getSet(Decomposition_Type_Values.Super)) { // new UnicodeSet("[:dt=Super:]") String normal = Normalizer3.NFKCCF.normalize(s); if (Nd.contains(normal)) { toSuper.put(normal, s); @@ -519,100 +667,111 @@ private static void computeTrial(Normalizer3 normalizer3, NormalizeForMatch addi final char SEPARATOR = ' '; long out = System.nanoTime(); main: - for (int cp = 0; cp <= 0x10FFFF; ++cp) { - // Unassigned or strange Cx → no change + for (int cp = 0; cp <= 0x10FFFF; ++cp) { + // Unassigned or strange Cx → no change - if (CN_CS_CO.contains(cp)) { - continue main; - } + if (CN_CS_CO.contains(cp)) { + continue main; + } - if (cp==0x33AE) { - int debug = 0; - } + if (cp == 0x33AE) { + int debug = 0; + } - 
String source = UTF16.valueOf(cp); - String nfkccf = normalizer3.normalize(source); - String target = source; - Set reason = new LinkedHashSet<>(); + String source = UTF16.valueOf(cp); + String nfkccf = normalizer3.normalize(source); + String target = source; + Set reason = new LinkedHashSet<>(); - subloop: { - if (HANGUL_COMPAT_minus_DI_CN.contains(cp)) { - reason.add(SpecialReason.retain_hangul); - break subloop; - } + subloop: + { + if (HANGUL_COMPAT_minus_DI_CN.contains(cp)) { + reason.add(SpecialReason.retain_hangul); + break subloop; + } - String remapped = null; - remapped = additions.getSourceToTarget().get(source); - if (remapped != null) { - target = remapped; - reason.add(additions.getSourceToReason().get(source)); - break subloop; - } + String remapped = null; + remapped = additions.getSourceToTarget().get(source); + if (remapped != null) { + target = remapped; + reason.add(additions.getSourceToReason().get(source)); + break subloop; + } - // decomposition type = squared, fraction → Map to NFKC - // if the target ends with a digit, and there are no other digits, superscript the last - // if there is more than one cp in the target, surround by separators. - // if (SPECIAL_DECOMP_TYPES.contains(cp)) { - // target = nfkccf; - // reason = "07. 
DT_SQUARE_FRACTION"; - // int lastCp = target.codePointBefore(target.length()); - // String mod = toSuper.get(lastCp); - // if (mod != null) { - // String prefix = target.substring(0,target.length() - Character.charCount(lastCp)); - // if (Nd.containsNone(prefix)) { - // target = prefix + mod; - // // System.out.println(Utility.hex(source) + "; " + Utility.hex(target, 4, " ") + " # " + source + " → " + target); - // reason += " for superscript-numbers"; - // } - // } - // break subloop; - // } - - // decomposition type = super, sub → do not map, stop - if (NOCHANGE_DECOMP_TYPES.contains(cp)) { - reason.add(SpecialReason.forString("retain_"+DT.get(cp))); // "9 skip certain types"; - break subloop; - } - if (TAGS.contains(cp)) { - reason.add(SpecialReason.retain_tags); // "9 skip certain types"; - break subloop; - } - // Get NFKC_CF mapping - - target = nfkccf; - - // length(value) ≠1 && contains any of " ", "(", ".", ",", "〔" → no change (discard mapping) - - if (target.codePointCount(0, target.length()) > 1 - && skipIfInMultiCodepointDecomp.containsSome(target)) { - reason.add(SpecialReason.retain_sequences_with_exclusions); - // ("14 Skip decomp contains «" - // + new UnicodeSet().addAll(target).retainAll(skipIfInMultiCodepointDecomp) - // + "» (and isn't singleton)"); - target=source; - } else if (!reasons.containsKey(cp)) { - // if we don't have a reason, it is because of NFKC_CF, so add that reason. - reason.add(SpecialReason.forString("nfkccf_" + DT.get(cp))); // "16. NFKC_CF-" + DT.get(cp); - } + // decomposition type = squared, fraction → Map to NFKC + // if the target ends with a digit, and there are no other digits, superscript the + // last + // if there is more than one cp in the target, surround by separators. + // if (SPECIAL_DECOMP_TYPES.contains(cp)) { + // target = nfkccf; + // reason = "07. 
DT_SQUARE_FRACTION"; + // int lastCp = target.codePointBefore(target.length()); + // String mod = toSuper.get(lastCp); + // if (mod != null) { + // String prefix = target.substring(0,target.length() - + // Character.charCount(lastCp)); + // if (Nd.containsNone(prefix)) { + // target = prefix + mod; + // // + // System.out.println(Utility.hex(source) + "; " + Utility.hex(target, 4, " ") + " # + // " + source + " → " + target); + // reason += " for superscript-numbers"; + // } + // } + // break subloop; + // } + + // decomposition type = super, sub → do not map, stop + if (NOCHANGE_DECOMP_TYPES.contains(cp)) { + reason.add( + SpecialReason.forString( + "retain_" + DT.get(cp))); // "9 skip certain types"; + break subloop; } - if (target.contains("\u2044") || target.contains("\u2215")) { - target = target.replace('\u2044', '/').replace('\u2215', '/'); // fraction slash #15 - reason.add(SpecialReason.fix_slash); // " + fix fraction slash"; + if (TAGS.contains(cp)) { + reason.add(SpecialReason.retain_tags); // "9 skip certain types"; + break subloop; } - - if (!target.equals("/") && target.contains("/")) { - target = SEPARATOR + target + SEPARATOR; - //System.out.println("«" + target + "»"); - reason.add(SpecialReason.add_separator); - //reason += " + add separator"; + // Get NFKC_CF mapping + + target = nfkccf; + + // length(value) ≠1 && contains any of " ", "(", ".", ",", "〔" → no change + // (discard mapping) + + if (target.codePointCount(0, target.length()) > 1 + && skipIfInMultiCodepointDecomp.containsSome(target)) { + reason.add(SpecialReason.retain_sequences_with_exclusions); + // ("14 Skip decomp contains «" + // + new + // UnicodeSet().addAll(target).retainAll(skipIfInMultiCodepointDecomp) + // + "» (and isn't singleton)"); + target = source; + } else if (!reasons.containsKey(cp)) { + // if we don't have a reason, it is because of NFKC_CF, so add that reason. + reason.add( + SpecialReason.forString( + "nfkccf_" + DT.get(cp))); // "16. 
NFKC_CF-" + DT.get(cp); } + } + if (target.contains("\u2044") || target.contains("\u2215")) { + target = target.replace('\u2044', '/').replace('\u2215', '/'); // fraction slash #15 + reason.add(SpecialReason.fix_slash); // " + fix fraction slash"; + } - target = nfc.normalize(target); // just in case!! - if (!source.equals(target)) { - trial.put(cp, target); - } - reasons.put(cp, reason); + if (!target.equals("/") && target.contains("/")) { + target = SEPARATOR + target + SEPARATOR; + // System.out.println("«" + target + "»"); + reason.add(SpecialReason.add_separator); + // reason += " + add separator"; + } + + target = nfc.normalize(target); // just in case!! + if (!source.equals(target)) { + trial.put(cp, target); } + reasons.put(cp, reason); + } // Recurse on trial while (true) { UnicodeMap delta = new UnicodeMap(); @@ -626,7 +785,8 @@ private static void computeTrial(Normalizer3 normalizer3, NormalizeForMatch addi removals.add(source); } else { delta.put(source, newTarget); - LinkedHashSet reason = new LinkedHashSet<>(reasons.get(source)); + LinkedHashSet reason = + new LinkedHashSet<>(reasons.get(source)); reason.add(SpecialReason.recursion); reasons.put(source, reason); } @@ -634,12 +794,12 @@ private static void computeTrial(Normalizer3 normalizer3, NormalizeForMatch addi } if (delta.isEmpty()) break; trial.putAll(delta); - //System.out.println("# Recursion " + delta); + // System.out.println("# Recursion " + delta); } trial.freeze(); reasons.freeze(); long out2 = System.nanoTime(); - System.out.println((out2-out)/1000000000.0 + " sec"); + System.out.println((out2 - out) / 1000000000.0 + " sec"); } private static UnicodeMap gatherData() { @@ -660,7 +820,8 @@ private static UnicodeMap gatherData() { private static void showMapping(NormalizeForMatch sourceMap, Normalizer3 nfkccf2) { UnicodeSet changed = new UnicodeSet(); - System.out.println("#source ; target ; nfkccf (if ≠) ; uca equiv (if ≠) # (source→target) names"); + System.out.println( + "#source ; 
target ; nfkccf (if ≠) ; uca equiv (if ≠) # (source→target) names"); for (Entry x : sourceMap.getSourceToTarget().entrySet()) { String source = x.getKey(); String target = x.getValue(); @@ -674,15 +835,24 @@ private static void showMapping(NormalizeForMatch sourceMap, Normalizer3 nfkccf2 } changed.add(source); - System.out.println(Utility.hex(source) - + " ;\t" + Utility.hex(target,4," ") - + " ;\t" + (target.equals(nfkccfResult) ? "" : Utility.hex(nfkccfResult,4," ")) - + " ;\t" + (target.equals(colEquiv) ? "" : Utility.hex(colEquiv,4," ")) - + " #" + getStringValues(AGE, source, ", ") - + "\t(" + source + "→" + target + ")\t" - + getName(source," + ") + " → " + getName(target," + ") - ); - + System.out.println( + Utility.hex(source) + + " ;\t" + + Utility.hex(target, 4, " ") + + " ;\t" + + (target.equals(nfkccfResult) ? "" : Utility.hex(nfkccfResult, 4, " ")) + + " ;\t" + + (target.equals(colEquiv) ? "" : Utility.hex(colEquiv, 4, " ")) + + " #" + + getStringValues(AGE, source, ", ") + + "\t(" + + source + + "→" + + target + + ")\t" + + getName(source, " + ") + + " → " + + getName(target, " + ")); } System.out.println("# Total: " + changed.size()); System.out.println("# " + changed.toPattern(false)); @@ -712,13 +882,20 @@ private static void showMapping(NormalizeForMatch sourceMap, Normalizer3 nfkccf2 } changed.add(source); - System.out.println(Utility.hex(source) - + " ;\t" + Utility.hex(target,4," ") - + " #" + getStringValues(AGE, source, ", ") - + "\t(" + source + "→" + target + ")\t" - + getName(source," + ") + " → " + getName(target," + ") - ); - + System.out.println( + Utility.hex(source) + + " ;\t" + + Utility.hex(target, 4, " ") + + " #" + + getStringValues(AGE, source, ", ") + + "\t(" + + source + + "→" + + target + + ")\t" + + getName(source, " + ") + + " → " + + getName(target, " + ")); } System.out.println("# Total: " + changed.size()); System.out.println("# " + changed.toPattern(false)); @@ -752,7 +929,8 @@ private static String getName(String best, 
String separator) { // private static void showItemsIn(UnicodeSet combined) { // - // Set> sorted = new TreeSet<>(); + // Set> sorted = new TreeSet<>(); // Counter> counter = new Counter<>(); // for (String source : combined) { // // Skip anything ≥ Unicode 8.0 @@ -769,25 +947,28 @@ private static String getName(String best, String separator) { // } // // String reason = REASONS.get(source); - // General_Category_Values generalCategory = GC.get(sourceCodePoint); // UCharacter.getIntPropertyValue(sourceCodePoint, UProperty.GENERAL_CATEGORY); - // Decomposition_Type_Values decompType = DT.get(sourceCodePoint); // int decompType = UCharacter.getIntPropertyValue(sourceCodePoint, UProperty.DECOMPOSITION_TYPE); + // General_Category_Values generalCategory = GC.get(sourceCodePoint); // + // UCharacter.getIntPropertyValue(sourceCodePoint, UProperty.GENERAL_CATEGORY); + // Decomposition_Type_Values decompType = DT.get(sourceCodePoint); // int decompType + // = UCharacter.getIntPropertyValue(sourceCodePoint, UProperty.DECOMPOSITION_TYPE); // // Age age = ageValue.compareTo(Age_Values.V5_1) >= 0 ? Age.from51to70 // : Age.before51; // - // Difference difference = n4mValue == null ? Difference.trial_only - // : trialValue == null ? Difference.n4m_only + // Difference difference = n4mValue == null ? Difference.trial_only + // : trialValue == null ? Difference.n4m_only // : Difference.different; // // String nfkccfValue = NFKCCF.normalize(source); // if (nfkccfValue.equals(source)) { // nfkccfValue = null; // below, null means no change // } - // final R5 row + // final R5 row // = Row.of(age, difference, decompType, generalCategory, ageValue.getShortName() // // ageValue.getVersionString(2, 2) // + SEP + source - // + SEP + hex(source) + // + SEP + hex(source) // + SEP + hex(n4mValue) // + SEP + hex(trialValue) // + SEP + (Objects.equal(nfkccfValue,trialValue) ? 
"≣" : hex(nfkccfValue)) @@ -800,10 +981,10 @@ private static String getName(String best, String separator) { // // Age lastAge = null; // Difference lastDifference = null; - // System.out.println("#AgeCat" - // + SEP + "Type of difference" - // + SEP + "Decomp type" - // + SEP + "General Category" + // System.out.println("#AgeCat" + // + SEP + "Type of difference" + // + SEP + "Decomp type" + // + SEP + "General Category" // + SEP + "Version" // + SEP + "Source" // + SEP + "Hex" @@ -814,7 +995,8 @@ private static String getName(String best, String separator) { // + SEP + "Name of Source" // ); // - // for (R5 item : sorted) { + // for (R5 + // item : sorted) { // final Age age = item.get0(); // final Difference difference = item.get1(); // final String decompType = item.get2().name(); @@ -825,7 +1007,8 @@ private static String getName(String best, String separator) { // lastAge = age; // lastDifference = difference; // } - // System.out.println(age + SEP + difference + SEP + decompType + SEP + cat + SEP + info); + // System.out.println(age + SEP + difference + SEP + decompType + SEP + cat + SEP + + // info); // } // System.out.println(); // for (R2 key : counter.getKeysetSortedByKey()) { @@ -834,7 +1017,8 @@ private static String getName(String best, String separator) { // } private static String hex(String n4mValue) { - return n4mValue == null ? "" : n4mValue.isEmpty() - ? "delete" : "U+" + Utility.hex(n4mValue,4,", U+"); + return n4mValue == null + ? "" + : n4mValue.isEmpty() ? 
"delete" : "U+" + Utility.hex(n4mValue, 4, ", U+"); } } diff --git a/unicodetools/src/main/java/org/unicode/tools/GeneratePickerData.java b/unicodetools/src/main/java/org/unicode/tools/GeneratePickerData.java index 6c3b05580..832cbfeed 100644 --- a/unicodetools/src/main/java/org/unicode/tools/GeneratePickerData.java +++ b/unicodetools/src/main/java/org/unicode/tools/GeneratePickerData.java @@ -1,5 +1,17 @@ package org.unicode.tools; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.LocaleData; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; @@ -29,7 +41,6 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.CharacterListCompressor; import org.unicode.cldr.draft.Compacter; import org.unicode.cldr.draft.ScriptMetadata; @@ -42,20 +53,10 @@ import org.unicode.text.utility.Settings; import org.unicode.tools.GeneratePickerData.CategoryTable.Separation; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.Normalizer; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.LocaleData; -import com.ibm.icu.util.ULocale; - -@CLDRTool(alias = "generate-picker-data", description = "Generate draft.PickerData content", hidden = "generator for draft data") +@CLDRTool( + alias = "generate-picker-data", + description = 
"Generate draft.PickerData content", + hidden = "generator for draft data") class GeneratePickerData { static final boolean DEBUG = true; @@ -80,76 +81,104 @@ class GeneratePickerData { private static final String EAST_ASIAN = "Other East Asian Scripts"; - static final UnicodeSet COMPATIBILITY = (UnicodeSet) ScriptCategories2.parseUnicodeSet("[[:nfkcqc=n:]-[:Lm:]]") - .removeAll(ScriptCategories2.IPA).removeAll(ScriptCategories2.IPA_EXTENSIONS).freeze(); + static final UnicodeSet COMPATIBILITY = + (UnicodeSet) + ScriptCategories2.parseUnicodeSet("[[:nfkcqc=n:]-[:Lm:]]") + .removeAll(ScriptCategories2.IPA) + .removeAll(ScriptCategories2.IPA_EXTENSIONS) + .freeze(); - private static final UnicodeSet PRIVATE_USE = (UnicodeSet) new UnicodeSet("[:private use:]").freeze(); + private static final UnicodeSet PRIVATE_USE = + (UnicodeSet) new UnicodeSet("[:private use:]").freeze(); - static final UnicodeSet SKIP = (UnicodeSet) ScriptCategories2.parseUnicodeSet("[[:cn:][:cs:][:co:][:cc:]\uFFFC]") - .addAll(ScriptCategories2.DEPRECATED_NEW).freeze(); - private static final UnicodeSet KNOWN_DUPLICATES = (UnicodeSet) ScriptCategories2.parseUnicodeSet("[:Nd:]").freeze(); + static final UnicodeSet SKIP = + (UnicodeSet) + ScriptCategories2.parseUnicodeSet("[[:cn:][:cs:][:co:][:cc:]\uFFFC]") + .addAll(ScriptCategories2.DEPRECATED_NEW) + .freeze(); + private static final UnicodeSet KNOWN_DUPLICATES = + (UnicodeSet) ScriptCategories2.parseUnicodeSet("[:Nd:]").freeze(); public static final UnicodeSet HISTORIC = ScriptCategories2.ARCHAIC; // public static final UnicodeSet UNCOMMON = (UnicodeSet) new // UnicodeSet(ScriptCategories.ARCHAIC).addAll(COMPATIBILITY).freeze(); - private static final UnicodeSet NAMED_CHARACTERS = (UnicodeSet) new UnicodeSet( - "[[:Z:][:default_ignorable_code_point:][:Pd:][:cf:]]").removeAll(SKIP).freeze(); - private static final UnicodeSet MODERN_JAMO = (UnicodeSet) new UnicodeSet( - "[\u1100-\u1112 \u1161-\u1175 \u11A8-\u11C2]").removeAll(SKIP).freeze(); - - 
private static final UnicodeSet HST_L = (UnicodeSet) ScriptCategories2.parseUnicodeSet("[:HST=L:]").freeze(); - private static final UnicodeSet single = (UnicodeSet) ScriptCategories2.parseUnicodeSet( - "[[:HST=L:][:HST=V:][:HST=T:]]").freeze(); - private static final UnicodeSet syllable = (UnicodeSet) ScriptCategories2.parseUnicodeSet("[[:HST=LV:][:HST=LVT:]]") - .freeze(); - private static final UnicodeSet all = (UnicodeSet) new UnicodeSet(single).addAll(syllable).freeze(); + private static final UnicodeSet NAMED_CHARACTERS = + (UnicodeSet) + new UnicodeSet("[[:Z:][:default_ignorable_code_point:][:Pd:][:cf:]]") + .removeAll(SKIP) + .freeze(); + private static final UnicodeSet MODERN_JAMO = + (UnicodeSet) + new UnicodeSet("[\u1100-\u1112 \u1161-\u1175 \u11A8-\u11C2]") + .removeAll(SKIP) + .freeze(); + + private static final UnicodeSet HST_L = + (UnicodeSet) ScriptCategories2.parseUnicodeSet("[:HST=L:]").freeze(); + private static final UnicodeSet single = + (UnicodeSet) + ScriptCategories2.parseUnicodeSet("[[:HST=L:][:HST=V:][:HST=T:]]").freeze(); + private static final UnicodeSet syllable = + (UnicodeSet) ScriptCategories2.parseUnicodeSet("[[:HST=LV:][:HST=LVT:]]").freeze(); + private static final UnicodeSet all = + (UnicodeSet) new UnicodeSet(single).addAll(syllable).freeze(); static RuleBasedCollator UCA_BASE = (RuleBasedCollator) Collator.getInstance(Locale.ENGLISH); static { UCA_BASE.setNumericCollation(true); } - public static final Comparator CODE_POINT_ORDER = new UTF16.StringComparator(true, false, 0); + public static final Comparator CODE_POINT_ORDER = + new UTF16.StringComparator(true, false, 0); static Comparator UCA = new MultilevelComparator(UCA_BASE, CODE_POINT_ORDER); - static Comparator buttonComparator = new MultilevelComparator( - // new UnicodeSetInclusionFirst(ScriptCategories.parseUnicodeSet("[:ascii:]")), - // new UnicodeSetInclusionFirst(ScriptCategories.parseUnicodeSet("[[:Letter:]&[:^NFKC_QuickCheck=N:]]")), - new 
UnicodeSetInclusionFirst(ScriptCategories2.parseUnicodeSet("[[:Letter:]-[:Lm:]]")), - new UnicodeSetInclusionFirst(ScriptCategories2.parseUnicodeSet("[:Lm:]")), UCA_BASE, CODE_POINT_ORDER); - - static Comparator LinkedHashSetComparator = new Comparator() { - public int compare(String arg0, String arg1) { - throw new IllegalArgumentException(); // only used to signal usage - } - }; - - static Comparator ListComparator = new Comparator() { - public int compare(String arg0, String arg1) { - throw new IllegalArgumentException(); // only used to signal usage - } - }; - - public static final Comparator SORT_ALWAYS = CODE_POINT_ORDER; // null for piecemeal sorting, ENGLISH for - // UCA + static Comparator buttonComparator = + new MultilevelComparator( + // new UnicodeSetInclusionFirst(ScriptCategories.parseUnicodeSet("[:ascii:]")), + // new + // UnicodeSetInclusionFirst(ScriptCategories.parseUnicodeSet("[[:Letter:]&[:^NFKC_QuickCheck=N:]]")), + new UnicodeSetInclusionFirst( + ScriptCategories2.parseUnicodeSet("[[:Letter:]-[:Lm:]]")), + new UnicodeSetInclusionFirst(ScriptCategories2.parseUnicodeSet("[:Lm:]")), + UCA_BASE, + CODE_POINT_ORDER); + + static Comparator LinkedHashSetComparator = + new Comparator() { + public int compare(String arg0, String arg1) { + throw new IllegalArgumentException(); // only used to signal usage + } + }; - static Comparator subCategoryComparator = new Comparator() { - public int compare(String o1, String o2) { - boolean a = o1.startsWith(ARCHAIC_MARKER); - boolean b = o2.startsWith(ARCHAIC_MARKER); - if (a != b) { - return a ? 1 : -1; - } - a = o1.startsWith(COMPAT_MARKER); - b = o2.startsWith(COMPAT_MARKER); - if (a != b) { - return a ? 
1 : -1; - } - return UCA.compare(o1, o2); - } - }; + static Comparator ListComparator = + new Comparator() { + public int compare(String arg0, String arg1) { + throw new IllegalArgumentException(); // only used to signal usage + } + }; + + public static final Comparator SORT_ALWAYS = + CODE_POINT_ORDER; // null for piecemeal sorting, ENGLISH for + // UCA + + static Comparator subCategoryComparator = + new Comparator() { + public int compare(String o1, String o2) { + boolean a = o1.startsWith(ARCHAIC_MARKER); + boolean b = o2.startsWith(ARCHAIC_MARKER); + if (a != b) { + return a ? 1 : -1; + } + a = o1.startsWith(COMPAT_MARKER); + b = o2.startsWith(COMPAT_MARKER); + if (a != b) { + return a ? 1 : -1; + } + return UCA.compare(o1, o2); + } + }; static CategoryTable CATEGORYTABLE = new CategoryTable(); static Subheader subheader; @@ -158,13 +187,14 @@ public int compare(String o1, String o2) { static Renamer renamer; private static PrintWriter renamingLog; - final static Options myOptions = new Options(); + static final Options myOptions = new Options(); enum MyOptions { output(".*", Settings.Output.GEN_DIR + "picker/", "output data directory"), unicodedata(".*", DraftUtils.UCD_DIRECTORY, "Unicode Data directory"), verbose(null, null, "verbose debugging messages"), - korean(null, null, "generate korean hangul defectives instead"), ; + korean(null, null, "generate korean hangul defectives instead"), + ; // boilerplate final Option option; @@ -180,16 +210,21 @@ public static void main(String[] args) throws Exception { generateHangulDefectives(); return; } - outputDirectory = new File(MyOptions.output.option.getValue()).getCanonicalPath() + File.separator; - unicodeDataDirectory = new File(MyOptions.unicodedata.option.getValue()).getCanonicalPath() + File.separator; + outputDirectory = + new File(MyOptions.output.option.getValue()).getCanonicalPath() + File.separator; + unicodeDataDirectory = + new File(MyOptions.unicodedata.option.getValue()).getCanonicalPath() + + 
File.separator; renamingLog = getFileWriter(outputDirectory, "renamingLog.txt"); renamer = new Renamer("GeneratePickerData.txt"); if (DEBUG) - System.out.println("Whitespace? " - + ScriptCategories2.parseUnicodeSet("[:z:]").equals(ScriptCategories2.parseUnicodeSet("[:whitespace:]"))); + System.out.println( + "Whitespace? " + + ScriptCategories2.parseUnicodeSet("[:z:]") + .equals(ScriptCategories2.parseUnicodeSet("[:whitespace:]"))); buildMainTable(); addEmojiCharacters(); @@ -218,8 +253,13 @@ public static void main(String[] args) throws Exception { } throw new Exception(ERROR_COUNT.size() + " errors above!"); } - System.out.println("Compression\t" + Compacter.getTotalOld() + ",\t" + Compacter.getTotalNew() + ",\t" - + (Compacter.getTotalNew() / Compacter.getTotalOld())); + System.out.println( + "Compression\t" + + Compacter.getTotalOld() + + ",\t" + + Compacter.getTotalNew() + + ",\t" + + (Compacter.getTotalNew() / Compacter.getTotalOld())); System.out.println("DONE"); } @@ -241,7 +281,7 @@ public static void writeCategories() throws FileNotFoundException, IOException { } out.close(); } - + public static void writeCategories2() throws FileNotFoundException, IOException { PrintWriter out = getFileWriter(outputDirectory, "categories2.txt"); for (Entry> catData : CategoryTable.categoryTable.entrySet()) { @@ -256,10 +296,14 @@ public static void writeCategories2() throws FileNotFoundException, IOException continue; } final UnicodeSet uset = new UnicodeSet().addAll(subData.getValue().strings); - out.println(main + " ;\t" + sub + " ;\t" - + uset.size() + " ;\t" - + uset.toPattern(false) - ); + out.println( + main + + " ;\t" + + sub + + " ;\t" + + uset.size() + + " ;\t" + + uset.toPattern(false)); } } out.close(); @@ -289,18 +333,43 @@ private static void buildMainTable() throws IOException { addSymbols(); - addProperty("General_Category", "Category", buttonComparator, - ScriptCategories2.parseUnicodeSet("[[:script=common:][:script=inherited:][:N:]" + "-[:letter:]" - + 
"-[:default_ignorable_code_point:]" + "-[:cf:]" + "-[:whitespace:]" + "-[:So:]" + - // "-[[:M:]-[:script=common:]-[:script=inherited:]]" + - "]")); + addProperty( + "General_Category", + "Category", + buttonComparator, + ScriptCategories2.parseUnicodeSet( + "[[:script=common:][:script=inherited:][:N:]" + + "-[:letter:]" + + "-[:default_ignorable_code_point:]" + + "-[:cf:]" + + "-[:whitespace:]" + + "-[:So:]" + + + // "-[[:M:]-[:script=common:]-[:script=inherited:]]" + + "]")); - CATEGORYTABLE.add("Format & Whitespace", true, "Whitespace", buttonComparator, Separation.AUTOMATIC, - ScriptCategories2.parseUnicodeSet("[:whitespace:]")); - CATEGORYTABLE.add("Format & Whitespace", true, "Format", buttonComparator, Separation.AUTOMATIC, - ScriptCategories2.parseUnicodeSet("[:cf:]")); - CATEGORYTABLE.add("Format & Whitespace", true, "Other", buttonComparator, Separation.AUTOMATIC, - ScriptCategories2.parseUnicodeSet("[[:default_ignorable_code_point:]-[:cf:]-[:whitespace:]]")); + CATEGORYTABLE.add( + "Format & Whitespace", + true, + "Whitespace", + buttonComparator, + Separation.AUTOMATIC, + ScriptCategories2.parseUnicodeSet("[:whitespace:]")); + CATEGORYTABLE.add( + "Format & Whitespace", + true, + "Format", + buttonComparator, + Separation.AUTOMATIC, + ScriptCategories2.parseUnicodeSet("[:cf:]")); + CATEGORYTABLE.add( + "Format & Whitespace", + true, + "Other", + buttonComparator, + Separation.AUTOMATIC, + ScriptCategories2.parseUnicodeSet( + "[[:default_ignorable_code_point:]-[:cf:]-[:whitespace:]]")); addLatin(); Set EuropeanMinusLatin = new TreeSet(ScriptCategories2.EUROPEAN); @@ -323,26 +392,33 @@ private static void addHan() throws IOException { UnicodeSet others = ScriptCategories2.parseUnicodeSet("[:script=Han:]"); // find base values - for (int radicalStrokes : RadicalStroke.SINGLETON.radStrokesToRadToRemainingStrokes.keySet()) { + for (int radicalStrokes : + RadicalStroke.SINGLETON.radStrokesToRadToRemainingStrokes.keySet()) { // String mainCat = null; - Map> 
char2RemStrokes2Set = RadicalStroke.SINGLETON.radStrokesToRadToRemainingStrokes - .get(radicalStrokes); + Map> char2RemStrokes2Set = + RadicalStroke.SINGLETON.radStrokesToRadToRemainingStrokes.get(radicalStrokes); for (String radical : char2RemStrokes2Set.keySet()) { Map remStrokes2Set = char2RemStrokes2Set.get(radical); for (int remStrokes : remStrokes2Set.keySet()) { int radicalChar = ScriptCategories2.getRadicalNum2char(radical); - String mainCat = "Han " + (radicalStrokes > 10 ? "11..17" : String.valueOf(radicalStrokes)) - + "-Stroke Radicals"; + String mainCat = + "Han " + + (radicalStrokes > 10 + ? "11..17" + : String.valueOf(radicalStrokes)) + + "-Stroke Radicals"; String subCat = UTF16.valueOf(radicalChar); // if (DEBUG) System.out.println(radical + " => " + radicalToChar.get(radical)); // String radChar = getRadicalName(radicalToChar, radical); // String subCat = radChar + " Han"; // try { - // String radical2 = radical.endsWith("'") ? radical.substring(0, radical.length() - 1) : radical; + // String radical2 = radical.endsWith("'") ? 
radical.substring(0, + // radical.length() - 1) : radical; // int x = Integer.parseInt(radical2); // int base = (x / 20) * 20; // int top = base + 19; - // mainCat = "CJK (Han) " + getRadicalName(radicalToChar, Math.max(base,1)) + " - " + + // mainCat = "CJK (Han) " + getRadicalName(radicalToChar, Math.max(base,1)) + " + // - " + // getRadicalName(radicalToChar, Math.min(top,214)); // } catch (Exception e) {} // if (mainCat == null) { @@ -353,7 +429,9 @@ private static void addHan() throws IOException { final UnicodeSet values = remStrokes2Set.get(remStrokes); // close over NFKC - for (UnicodeSetIterator it = new UnicodeSetIterator(RadicalStroke.SINGLETON.remainder); it.next();) { + for (UnicodeSetIterator it = + new UnicodeSetIterator(RadicalStroke.SINGLETON.remainder); + it.next(); ) { String nfkc = Normalizer.normalize(it.codepoint, Normalizer.NFKC); if (values.contains(nfkc)) { values.add(it.codepoint); @@ -361,24 +439,43 @@ private static void addHan() throws IOException { } others.removeAll(values); - UnicodeSet normal = new UnicodeSet(values).removeAll(GeneratePickerData.HISTORIC) - .removeAll(GeneratePickerData.COMPATIBILITY).removeAll(GeneratePickerData.UNCOMMON_HAN); - GeneratePickerData.CATEGORYTABLE.add(mainCat, true, subCat, - GeneratePickerData.LinkedHashSetComparator, Separation.AUTOMATIC, normal); + UnicodeSet normal = + new UnicodeSet(values) + .removeAll(GeneratePickerData.HISTORIC) + .removeAll(GeneratePickerData.COMPATIBILITY) + .removeAll(GeneratePickerData.UNCOMMON_HAN); + GeneratePickerData.CATEGORYTABLE.add( + mainCat, + true, + subCat, + GeneratePickerData.LinkedHashSetComparator, + Separation.AUTOMATIC, + normal); values.removeAll(normal); - GeneratePickerData.CATEGORYTABLE.add(mainCat, true, "Other", - GeneratePickerData.LinkedHashSetComparator, Separation.AUTOMATIC, values); + GeneratePickerData.CATEGORYTABLE.add( + mainCat, + true, + "Other", + GeneratePickerData.LinkedHashSetComparator, + Separation.AUTOMATIC, + values); } } } - 
GeneratePickerData.CATEGORYTABLE.add("Han - Other", true, "Other", GeneratePickerData.LinkedHashSetComparator, - Separation.AUTOMATIC, others); + GeneratePickerData.CATEGORYTABLE.add( + "Han - Other", + true, + "Other", + GeneratePickerData.LinkedHashSetComparator, + Separation.AUTOMATIC, + others); GeneratePickerData.UNCOMMON_HAN.removeAll(RadicalStroke.SINGLETON.iiCoreSet); - } - static UnicodeSet LATIN = (UnicodeSet) ScriptCategories2.parseUnicodeSet("[:script=Latin:]").freeze(); + static UnicodeSet LATIN = + (UnicodeSet) ScriptCategories2.parseUnicodeSet("[:script=Latin:]").freeze(); static Set SKIP_LOCALES = new HashSet(); + static { SKIP_LOCALES.add("kl"); SKIP_LOCALES.add("eo"); @@ -403,16 +500,26 @@ private static void addLatin() { } UnicodeSet diff = new UnicodeSet(exemplarSet).removeAll(LATIN); if (!diff.isEmpty()) { - System.out.println(loc + " Latin: " + new UnicodeSet(exemplarSet).retainAll(LATIN).toPattern(false)); + System.out.println( + loc + + " Latin: " + + new UnicodeSet(exemplarSet).retainAll(LATIN).toPattern(false)); while (!diff.isEmpty()) { String first = diff.iterator().next(); int script = UScript.getScript(first.codePointAt(0)); - UnicodeSet scriptSet = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script) - .retainAll(diff).add(first); + UnicodeSet scriptSet = + new UnicodeSet() + .applyIntPropertyValue(UProperty.SCRIPT, script) + .retainAll(diff) + .add(first); diff.removeAll(scriptSet).remove(first); if (script != UScript.INHERITED) { - System.out.println(loc + " Latin with : " + UScript.getName(script) + ", " - + scriptSet.toPattern(false)); + System.out.println( + loc + + " Latin with : " + + UScript.getName(script) + + ", " + + scriptSet.toPattern(false)); } } } @@ -422,31 +529,55 @@ private static void addLatin() { closeOver(closed).retainAll(ScriptCategories2.parseUnicodeSet("[[:L:][:M:]-[:nfkcqc=n:]]")); System.out.println("Exemplars: " + closed); - final UnicodeSet common = ScriptCategories2 - 
.parseUnicodeSet("[[aáàăâấầåäãąāảạậ æ b c ćčç dđð eéèêếềểěëėęēệ ə f ƒ gğ h iíìî ïįīị ı j-lľļł m nńňñņ oóòô ốồổöőõøơớờởợọộ œ p-rř s śšş tťţ uúùûůüűųūủưứữ ựụ v-yýÿ zźžż þ]]"); + final UnicodeSet common = + ScriptCategories2.parseUnicodeSet( + "[[aáàăâấầåäãąāảạậ æ b c ćčç dđð eéèêếềểěëėęēệ ə f ƒ gğ h iíìî ïįīị ı j-lľļł m nńňñņ oóòô ốồổöőõøơớờởợọộ œ p-rř s śšş tťţ uúùûůüűųūủưứữ ựụ v-yýÿ zźžż þ]]"); closeOver(common).retainAll(ScriptCategories2.parseUnicodeSet("[[:L:][:M:]-[:nfkcqc=n:]]")); System.out.println("Common: " + common); exemplars.retainAll(ScriptCategories2.parseUnicodeSet("[[:L:][:M:]-[:nfkcqc=n:]]")); - CATEGORYTABLE.add("Latin", true, "Common", buttonComparator, Separation.ALL_ORDINARY, exemplars); - CATEGORYTABLE.add("Latin", true, "Phonetics (IPA)", buttonComparator, Separation.ALL_ORDINARY, - ScriptCategories2.IPA); - CATEGORYTABLE.add("Latin", true, "Phonetics (X-IPA)", buttonComparator, Separation.ALL_ORDINARY, - ScriptCategories2.IPA_EXTENSIONS); - String flipped = "ɒdɔbɘᎸǫʜiꞁʞlmnoqpɿƨƚuvwxʏƹ؟" + "AᙠƆᗡƎꟻᎮHIႱᐴᏗMИOꟼϘЯƧTUVWXYƸ" + "ɐqɔpǝɟɓɥɪſʞ1ɯuodbɹsʇnʌʍxʎz¿" - + "∀ᙠƆᗡƎℲ⅁HIΓᐴ⅂ꟽNOԀÓᴚƧ⊥ȠɅM⅄Z"; - CATEGORYTABLE.add("Latin", true, "Flipped/Mirrored", ListComparator, Separation.ALL_ORDINARY, flipped); CATEGORYTABLE.add( - "Latin", - true, - "Other", - buttonComparator, - Separation.AUTOMATIC, - ScriptCategories2.parseUnicodeSet("[:script=Latin:]").removeAll(ScriptCategories2.SCRIPT_CHANGED) - .addAll(ScriptCategories2.SCRIPT_NEW.get("Latin")).removeAll(ScriptCategories2.IPA) - .removeAll(ScriptCategories2.IPA_EXTENSIONS).removeAll(exemplars)); + "Latin", true, "Common", buttonComparator, Separation.ALL_ORDINARY, exemplars); + CATEGORYTABLE.add( + "Latin", + true, + "Phonetics (IPA)", + buttonComparator, + Separation.ALL_ORDINARY, + ScriptCategories2.IPA); + CATEGORYTABLE.add( + "Latin", + true, + "Phonetics (X-IPA)", + buttonComparator, + Separation.ALL_ORDINARY, + ScriptCategories2.IPA_EXTENSIONS); + String flipped = + "ɒdɔbɘᎸǫʜiꞁʞlmnoqpɿƨƚuvwxʏƹ؟" + + 
"AᙠƆᗡƎꟻᎮHIႱᐴᏗMИOꟼϘЯƧTUVWXYƸ" + + "ɐqɔpǝɟɓɥɪſʞ1ɯuodbɹsʇnʌʍxʎz¿" + + "∀ᙠƆᗡƎℲ⅁HIΓᐴ⅂ꟽNOԀÓᴚƧ⊥ȠɅM⅄Z"; + CATEGORYTABLE.add( + "Latin", + true, + "Flipped/Mirrored", + ListComparator, + Separation.ALL_ORDINARY, + flipped); + CATEGORYTABLE.add( + "Latin", + true, + "Other", + buttonComparator, + Separation.AUTOMATIC, + ScriptCategories2.parseUnicodeSet("[:script=Latin:]") + .removeAll(ScriptCategories2.SCRIPT_CHANGED) + .addAll(ScriptCategories2.SCRIPT_NEW.get("Latin")) + .removeAll(ScriptCategories2.IPA) + .removeAll(ScriptCategories2.IPA_EXTENSIONS) + .removeAll(exemplars)); } private static UnicodeSet closeOver(UnicodeSet closed) { @@ -466,12 +597,18 @@ private static UnicodeSet closeOver(UnicodeSet closed) { private static void addAndNoteNew(ULocale title, UnicodeSet toAddTo, final UnicodeSet toAdd) { flatten(toAdd); if (toAddTo.containsAll(toAdd)) return; - System.out.println("Adding Common\t" + title.getDisplayName() + "\t" + title.toString() + "\t" - + new UnicodeSet(toAdd).removeAll(toAddTo).toPattern(false)); + System.out.println( + "Adding Common\t" + + title.getDisplayName() + + "\t" + + title.toString() + + "\t" + + new UnicodeSet(toAdd).removeAll(toAddTo).toPattern(false)); toAddTo.addAll(toAdd); } - private static void writeMainFile(String directory, String categoryTable) throws IOException, FileNotFoundException { + private static void writeMainFile(String directory, String categoryTable) + throws IOException, FileNotFoundException { PrintWriter out = getFileWriter(directory, "CharData.java"); out.println("package org.unicode.cldr.draft.picker;"); out.println("public class CharData {"); @@ -484,10 +621,13 @@ private static void writeMainFile(String directory, String categoryTable) throws out.close(); } - static PrintWriter getFileWriter(String directory, String filename) throws IOException, FileNotFoundException { + static PrintWriter getFileWriter(String directory, String filename) + throws IOException, FileNotFoundException { File f = new File(directory, 
filename); System.out.println("Writing: " + f.getCanonicalFile()); - PrintWriter out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(f), Charset.forName("UTF-8"))); + PrintWriter out = + new PrintWriter( + new OutputStreamWriter(new FileOutputStream(f), Charset.forName("UTF-8"))); return out; } @@ -504,51 +644,78 @@ static PrintWriter getFileWriter(String directory, String filename) throws IOExc // } private static void addSymbols() { - final UnicodeSet symbolsMinusScripts = ScriptCategories2 - .parseUnicodeSet("[[[:script=common:][:script=inherited:]]&[[:S:][:Letter:]]]"); + final UnicodeSet symbolsMinusScripts = + ScriptCategories2.parseUnicodeSet( + "[[[:script=common:][:script=inherited:]]&[[:S:][:Letter:]]]"); if (true) { System.out.println("***Contains:" + symbolsMinusScripts.contains(0x3192)); } final UnicodeSet math = ScriptCategories2.parseUnicodeSet("[:math:]"); - final UnicodeSet superscripts = ScriptCategories2.parseUnicodeSet("[[:dt=super:]-[:block=kanbun:]]"); + final UnicodeSet superscripts = + ScriptCategories2.parseUnicodeSet("[[:dt=super:]-[:block=kanbun:]]"); final UnicodeSet subscripts = ScriptCategories2.parseUnicodeSet("[:dt=sub:]"); - UnicodeSet skip = new UnicodeSet().addAll(math).addAll(superscripts).addAll(subscripts) - .retainAll(COMPATIBILITY); + UnicodeSet skip = + new UnicodeSet() + .addAll(math) + .addAll(superscripts) + .addAll(subscripts) + .retainAll(COMPATIBILITY); - for (int i = UCharacter.getIntPropertyMinValue(UProperty.GENERAL_CATEGORY); i <= UCharacter - .getIntPropertyMaxValue(UProperty.GENERAL_CATEGORY); ++i) { - String valueAlias = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, i, - UProperty.NameChoice.LONG); + for (int i = UCharacter.getIntPropertyMinValue(UProperty.GENERAL_CATEGORY); + i <= UCharacter.getIntPropertyMaxValue(UProperty.GENERAL_CATEGORY); + ++i) { + String valueAlias = + UCharacter.getPropertyValueName( + UProperty.GENERAL_CATEGORY, i, UProperty.NameChoice.LONG); UnicodeSet temp 
= new UnicodeSet(); ScriptCategories2.applyPropertyAlias("General Category", valueAlias, temp); - for (UnicodeSetIterator it = new UnicodeSetIterator(temp.retainAll(symbolsMinusScripts).removeAll(skip)); it - .next();) { - String block = UCharacter.getStringPropertyValue(UProperty.BLOCK, it.codepoint, - UProperty.NameChoice.LONG).toString(); - CATEGORYTABLE.add("Symbol", true, block + "@" + valueAlias, buttonComparator, Separation.AUTOMATIC, - it.codepoint, it.codepoint); + for (UnicodeSetIterator it = + new UnicodeSetIterator( + temp.retainAll(symbolsMinusScripts).removeAll(skip)); + it.next(); ) { + String block = + UCharacter.getStringPropertyValue( + UProperty.BLOCK, it.codepoint, UProperty.NameChoice.LONG) + .toString(); + CATEGORYTABLE.add( + "Symbol", + true, + block + "@" + valueAlias, + buttonComparator, + Separation.AUTOMATIC, + it.codepoint, + it.codepoint); } } CATEGORYTABLE.add("Symbol", true, "Math", CODE_POINT_ORDER, Separation.ALL_ORDINARY, math); - CATEGORYTABLE.add("Symbol", true, "Superscript", buttonComparator, Separation.ALL_ORDINARY, superscripts); - CATEGORYTABLE.add("Symbol", true, "Subscript", buttonComparator, Separation.ALL_ORDINARY, subscripts); - + CATEGORYTABLE.add( + "Symbol", + true, + "Superscript", + buttonComparator, + Separation.ALL_ORDINARY, + superscripts); + CATEGORYTABLE.add( + "Symbol", true, "Subscript", buttonComparator, Separation.ALL_ORDINARY, subscripts); } private static void generateHangulDefectives() { for (int atomic = 0; atomic < 2; ++atomic) { for (int modern = 0; modern < 2; ++modern) { - for (char c : new char[] { 'L', 'V', 'T' }) { + for (char c : new char[] {'L', 'V', 'T'}) { UnicodeSet uset = new UnicodeSet(); - for (UnicodeSetIterator it = new UnicodeSetIterator(ScriptCategories2.parseUnicodeSet("[:HST=" + c - + ":]")); it.next();) { + for (UnicodeSetIterator it = + new UnicodeSetIterator( + ScriptCategories2.parseUnicodeSet("[:HST=" + c + ":]")); + it.next(); ) { if 
(UCharacter.getName(it.codepoint).contains("FILLER")) continue; String s = it.getString(); String d = MKD.transform(s); - if (s.equals(d) == (atomic == 1) && MODERN_JAMO.contains(it.codepoint) == (modern == 1)) { + if (s.equals(d) == (atomic == 1) + && MODERN_JAMO.contains(it.codepoint) == (modern == 1)) { uset.add(it.codepoint); } } @@ -582,7 +749,7 @@ private static void generateHangulDefectives() { System.out.println("testing roundtrip"); // test roundtrip - for (UnicodeSetIterator it = new UnicodeSetIterator(all); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(all); it.next(); ) { final String a = it.getString(); String b = MKD.transform(a); String c = MKC.transform(b); @@ -595,8 +762,10 @@ private static void generateHangulDefectives() { Map decomp2comp = new HashMap(); System.out.println("find defectives"); - for (UnicodeSetIterator it = new UnicodeSetIterator(ScriptCategories2.parseUnicodeSet("[:script=Hangul:]")); it - .next();) { + for (UnicodeSetIterator it = + new UnicodeSetIterator( + ScriptCategories2.parseUnicodeSet("[:script=Hangul:]")); + it.next(); ) { final String comp = it.getString(); String decomp = MKD.transform(comp); decomp2comp.put(decomp, comp); @@ -615,19 +784,19 @@ private static void generateHangulDefectives() { System.out.println("testing single+all"); - for (UnicodeSetIterator it = new UnicodeSetIterator(single); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(single); it.next(); ) { final String a = it.getString(); System.out.println(a); - for (UnicodeSetIterator it2 = new UnicodeSetIterator(all); it2.next();) { + for (UnicodeSetIterator it2 = new UnicodeSetIterator(all); it2.next(); ) { final String b = it2.getString(); checkPair(a, b, count); } } System.out.println("testing syllable+single"); - for (UnicodeSetIterator it = new UnicodeSetIterator(syllable); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(syllable); it.next(); ) { final String a = it.getString(); 
System.out.println(a); - for (UnicodeSetIterator it2 = new UnicodeSetIterator(single); it2.next();) { + for (UnicodeSetIterator it2 = new UnicodeSetIterator(single); it2.next(); ) { final String b = it2.getString(); checkPair(a, b, count); } @@ -653,16 +822,29 @@ private static void checkPair(final String a, final String b, int[] count) { } public static String codeAndName(String comp) { - return CldrUtility.toHex(comp, false) + "(" + comp + ")" + UCharacter.getExtendedName(comp.codePointAt(0)); + return CldrUtility.toHex(comp, false) + + "(" + + comp + + ")" + + UCharacter.getExtendedName(comp.codePointAt(0)); } private static void addHangul() { - for (UnicodeSetIterator it = new UnicodeSetIterator(ScriptCategories2.parseUnicodeSet("[:script=Hangul:]") - .removeAll(SKIP)); it.next();) { + for (UnicodeSetIterator it = + new UnicodeSetIterator( + ScriptCategories2.parseUnicodeSet("[:script=Hangul:]") + .removeAll(SKIP)); + it.next(); ) { String str = it.getString(); if (ScriptCategories2.ARCHAIC.contains(it.codepoint)) { - CATEGORYTABLE.add("Hangul", true, "Archaic Hangul", buttonComparator, Separation.AUTOMATIC, - it.codepoint, it.codepoint); + CATEGORYTABLE.add( + "Hangul", + true, + "Archaic Hangul", + buttonComparator, + Separation.AUTOMATIC, + it.codepoint, + it.codepoint); continue; } String s = MKKD.transform(str); @@ -670,25 +852,49 @@ private static void addHangul() { if (decompCodePoint1 == '(') { decompCodePoint1 = s.codePointAt(1); } - if (!HST_L.contains(decompCodePoint1) || it.codepoint == 0x115F || it.codepoint == 0x1160) { - CATEGORYTABLE.add("Hangul", true, "Other", buttonComparator, Separation.AUTOMATIC, it.codepoint); + if (!HST_L.contains(decompCodePoint1) + || it.codepoint == 0x115F + || it.codepoint == 0x1160) { + CATEGORYTABLE.add( + "Hangul", + true, + "Other", + buttonComparator, + Separation.AUTOMATIC, + it.codepoint); continue; } if (COMPATIBILITY.contains(it.codepoint)) { - CATEGORYTABLE - .add("Hangul", true, "Compatibility", 
buttonComparator, Separation.AUTOMATIC, it.codepoint); + CATEGORYTABLE.add( + "Hangul", + true, + "Compatibility", + buttonComparator, + Separation.AUTOMATIC, + it.codepoint); continue; } - CATEGORYTABLE.add("Hangul", true, - UTF16.valueOf(decompCodePoint1) + " " + UCharacter.getExtendedName(decompCodePoint1), buttonComparator, - Separation.AUTOMATIC, it.codepoint); + CATEGORYTABLE.add( + "Hangul", + true, + UTF16.valueOf(decompCodePoint1) + + " " + + UCharacter.getExtendedName(decompCodePoint1), + buttonComparator, + Separation.AUTOMATIC, + it.codepoint); } } private static String buildNames() { StringBuilder result = new StringBuilder(); - for (UnicodeSetIterator it = new UnicodeSetIterator(NAMED_CHARACTERS); it.next();) { - result.append("{\"" + it.getString() + "\",\"" + UCharacter.getExtendedName(it.codepoint) + "\"},\n"); + for (UnicodeSetIterator it = new UnicodeSetIterator(NAMED_CHARACTERS); it.next(); ) { + result.append( + "{\"" + + it.getString() + + "\",\"" + + UCharacter.getExtendedName(it.codepoint) + + "\"},\n"); } return result.toString(); } @@ -730,16 +936,26 @@ public String toString() { static class CategoryTable { enum Separation { - AUTOMATIC, ALL_UNCOMMON, ALL_HISTORIC, ALL_COMPATIBILITY, ALL_ORDINARY - } - - static Map> categoryTable = // new TreeMap>(ENGLISH); // - new LinkedHashMap>(); - - public void add(String category, boolean sortSubcategory, String subcategory, Comparator sortValues, - Separation separateOld, UnicodeSet values) { - for (UnicodeSetIterator it = new UnicodeSetIterator(values); it.next();) { + AUTOMATIC, + ALL_UNCOMMON, + ALL_HISTORIC, + ALL_COMPATIBILITY, + ALL_ORDINARY + } + + static Map> + categoryTable = // new TreeMap>(ENGLISH); // + new LinkedHashMap>(); + + public void add( + String category, + boolean sortSubcategory, + String subcategory, + Comparator sortValues, + Separation separateOld, + UnicodeSet values) { + for (UnicodeSetIterator it = new UnicodeSetIterator(values); it.next(); ) { add(category, 
sortSubcategory, subcategory, sortValues, separateOld, it.codepoint); } } @@ -748,7 +964,9 @@ public Collection getLocalizations() { TreeSet result = new TreeSet(); result.add("variation selector-PLACEHOLDER"); for (String cp : new UnicodeSet(NAMED_CHARACTERS).removeAll(SKIPNAMES)) { - addNames(UCharacter.toLowerCase(UCharacter.getExtendedName(cp.codePointAt(0))), result); + addNames( + UCharacter.toLowerCase(UCharacter.getExtendedName(cp.codePointAt(0))), + result); } for (String category : categoryTable.keySet()) { addNames(category, result); @@ -790,23 +1008,45 @@ private void addNames(String name, Collection result) { result.add(name); } - public void add(String category, boolean sortSubcategory, String subcategory, Comparator sortValues, - Separation separateOld, String values) { + public void add( + String category, + boolean sortSubcategory, + String subcategory, + Comparator sortValues, + Separation separateOld, + String values) { int cp; for (int i = 0; i < values.length(); i += UTF16.getCharCount(cp)) { - add(category, sortSubcategory, subcategory, sortValues, separateOld, cp = values.charAt(i)); - } - } - - public void add(String category, boolean sortSubcategory, String subcategory, Comparator sortValues, - Separation separateOld, int startCodePoint, int endCodePoint) { + add( + category, + sortSubcategory, + subcategory, + sortValues, + separateOld, + cp = values.charAt(i)); + } + } + + public void add( + String category, + boolean sortSubcategory, + String subcategory, + Comparator sortValues, + Separation separateOld, + int startCodePoint, + int endCodePoint) { for (int i = startCodePoint; i <= endCodePoint; ++i) { add(category, sortSubcategory, subcategory, sortValues, separateOld, i); } } - public void add(String category, boolean sortSubcategory, String subcategory, Comparator sortValues, - Separation separateOld, int codepoint) { + public void add( + String category, + boolean sortSubcategory, + String subcategory, + Comparator sortValues, + 
Separation separateOld, + int codepoint) { // if (ADD_SUBHEAD.contains(codepoint)) String subhead = subheader.getSubheader(codepoint); @@ -826,18 +1066,18 @@ public void add(String category, boolean sortSubcategory, String subcategory, Co } } switch (separateOld) { - case ALL_HISTORIC: - prefix = ARCHAIC_MARKER; - sortValues = CODE_POINT_ORDER; - break; - case ALL_COMPATIBILITY: - prefix = COMPAT_MARKER; - sortValues = CODE_POINT_ORDER; - break; - case ALL_UNCOMMON: - prefix = LESS_COMMON_MARKER; - sortValues = CODE_POINT_ORDER; - break; + case ALL_HISTORIC: + prefix = ARCHAIC_MARKER; + sortValues = CODE_POINT_ORDER; + break; + case ALL_COMPATIBILITY: + prefix = COMPAT_MARKER; + sortValues = CODE_POINT_ORDER; + break; + case ALL_UNCOMMON: + prefix = LESS_COMMON_MARKER; + sortValues = CODE_POINT_ORDER; + break; } SimplePair names = renamer.rename(category, prefix + subcategory); @@ -850,9 +1090,14 @@ public void add(String category, boolean sortSubcategory, String subcategory, Co CATEGORYTABLE.add2(mainCategory, sortSubcategory, subCategory, sortValues, codepoint); } - private void add2(String category, boolean sortSubcategory, String subcategory, Comparator sortValues, - int codePoint) { - GeneratePickerData.USet oldValue = getValues(category, sortSubcategory, subcategory, sortValues); + private void add2( + String category, + boolean sortSubcategory, + String subcategory, + Comparator sortValues, + int codePoint) { + GeneratePickerData.USet oldValue = + getValues(category, sortSubcategory, subcategory, sortValues); if (!SKIP.contains(codePoint)) { oldValue.strings.add(UTF16.valueOf(codePoint)); } @@ -862,7 +1107,9 @@ public void removeAll(String category, String subcategory, UnicodeSet values) { Map sub = addMainCategory(category); GeneratePickerData.USet oldValue = sub.get(subcategory); if (oldValue != null) { - System.out.println(oldValue.strings.removeAll(addAllToCollection(values, new HashSet()))); + System.out.println( + oldValue.strings.removeAll( + 
addAllToCollection(values, new HashSet()))); } } @@ -887,20 +1134,25 @@ public void removeAll(UnicodeSet values) { // } // } - public void remove(String category, String subcategory, int startCodePoint, int endCodePoint) { + public void remove( + String category, String subcategory, int startCodePoint, int endCodePoint) { removeAll(category, subcategory, new UnicodeSet(startCodePoint, endCodePoint)); } public Map addMainCategory(String mainCategory) { Map sub = categoryTable.get(mainCategory); if (sub == null) { - categoryTable.put(mainCategory, sub = new TreeMap(UCA)); + categoryTable.put( + mainCategory, sub = new TreeMap(UCA)); } return sub; } - private GeneratePickerData.USet getValues(String category, boolean sortSubcategory, String subcategory, - Comparator sortValues) { + private GeneratePickerData.USet getValues( + String category, + boolean sortSubcategory, + String subcategory, + Comparator sortValues) { Map sub = addMainCategory(category); GeneratePickerData.USet oldValue = sub.get(subcategory); if (oldValue == null) { @@ -918,14 +1170,20 @@ public String toString() { static final UnicodeSet DEPRECATED = new UnicodeSet("[:deprecated:]").freeze(); static final UnicodeSet CONTROLS = new UnicodeSet("[[:cc:]]").freeze(); - public String toString(boolean displayData, String localDataDirectory) throws FileNotFoundException, - IOException { - UnicodeSet missing = new UnicodeSet(0, 0x10FFFF).removeAll(Typology.SKIP) - .removeAll(DEPRECATED) - .removeAll(CONTROLS); + public String toString(boolean displayData, String localDataDirectory) + throws FileNotFoundException, IOException { + UnicodeSet missing = + new UnicodeSet(0, 0x10FFFF) + .removeAll(Typology.SKIP) + .removeAll(DEPRECATED) + .removeAll(CONTROLS); PrintWriter htmlChart = getFileWriter(localDataDirectory, "index.html"); - writeHtmlHeader(htmlChart, localDataDirectory, null, "main", - "p {font-size:100%; margin:0; margin-left:1em; text-indent:-1em;}"); + writeHtmlHeader( + htmlChart, + 
localDataDirectory, + null, + "main", + "p {font-size:100%; margin:0; margin-left:1em; text-indent:-1em;}"); writePageIndex(htmlChart, categoryTable.keySet()); int totalChars = 0, totalCompressed = 0; @@ -934,11 +1192,13 @@ public String toString(boolean displayData, String localDataDirectory) throws Fi StringBuilder result = new StringBuilder(); for (String category : categoryTable.keySet()) { Map sub = categoryTable.get(category); - htmlChart = openChart(htmlChart, localDataDirectory, category, categoryTable.keySet()); + htmlChart = + openChart(htmlChart, localDataDirectory, category, categoryTable.keySet()); result.append("{{\"" + category + "\"},\n"); // clean up results - for (Iterator subcategoryIterator = sub.keySet().iterator(); subcategoryIterator.hasNext();) { + for (Iterator subcategoryIterator = sub.keySet().iterator(); + subcategoryIterator.hasNext(); ) { String subcategory = subcategoryIterator.next(); GeneratePickerData.USet valueChars = sub.get(subcategory); if (valueChars.strings.isEmpty()) { @@ -956,21 +1216,38 @@ public String toString(boolean displayData, String localDataDirectory) throws Fi labelString.append(" ‧ "); } UnicodeSet labelSet = Typology.getSet(s); - labelString.append(getUnicodeSetUrl(s, labelSet) + percentSuperscript(set, labelSet)); + labelString.append( + getUnicodeSetUrl(s, labelSet) + percentSuperscript(set, labelSet)); } - htmlChart.println("" - + - // "" + category + "" + - "" - + fixHtml(fixCategoryName(subcategory)) + "" + "" - + valueChars.strings.size() + "" + "" + fixHtml(valueChars.strings) + "" - + "\n" + "" + "" + labelString + "" + ""); - String valueCharsString = addResult(result, valueChars, category, subcategory, displayData); + htmlChart.println( + "" + + + // "" + category + "" + + "" + + fixHtml(fixCategoryName(subcategory)) + + "" + + "" + + valueChars.strings.size() + + "" + + "" + + fixHtml(valueChars.strings) + + "" + + "\n" + + "" + + "" + + labelString + + "" + + ""); + String valueCharsString = + 
addResult(result, valueChars, category, subcategory, displayData); totalChars += utf8Length(valueChars.strings); totalCompressed += utf8Length(valueCharsString); // if (valueChars.set.size() > 1000) { - // System.out.println("//Big class: " + category + MAIN_SUB_SEPARATOR + subcategory + + // System.out.println("//Big class: " + category + MAIN_SUB_SEPARATOR + + // subcategory + // MAIN_SUBSUB_SEPARATOR + valueChars.set.size()); // } UnicodeSet dups = new UnicodeSet(soFar); @@ -1027,13 +1304,18 @@ private String percentSuperscript(UnicodeSet set, UnicodeSet labelSet) { if (set.containsAll(labelSet)) return ""; UnicodeSet inSet = new UnicodeSet(labelSet).retainAll(set); UnicodeSet outSet = new UnicodeSet(labelSet).removeAll(set); - String result = " " + getUnicodeSetUrl(String.valueOf(inSet.size()), inSet) + ":" - + getUnicodeSetUrl(String.valueOf(outSet.size()), outSet) + ""; + String result = + " " + + getUnicodeSetUrl(String.valueOf(inSet.size()), inSet) + + ":" + + getUnicodeSetUrl(String.valueOf(outSet.size()), outSet) + + ""; return result; } private String getUnicodeSetUrl(UnicodeSet set) { - return "http://unicode.org/cldr/utility/list-unicodeset.jsp?a=" + fixURL(set.toPattern(false)); + return "http://unicode.org/cldr/utility/list-unicodeset.jsp?a=" + + fixURL(set.toPattern(false)); } private String fixURL(String string) { @@ -1057,17 +1339,22 @@ private String fixHtml(Collection strings) { return result.toString(); } - private PrintWriter openChart(PrintWriter htmlChart, String localDataDirectory, - String category, Set set) - throws IOException, FileNotFoundException { + private PrintWriter openChart( + PrintWriter htmlChart, String localDataDirectory, String category, Set set) + throws IOException, FileNotFoundException { if (htmlChart != null) { htmlChart = writeHtmlFooterAndClose(htmlChart); } if (category != null) { String fileNameFromCategory = fileNameFromCategory(category); htmlChart = getFileWriter(localDataDirectory, fileNameFromCategory); - 
htmlChart = writeHtmlHeader(htmlChart, localDataDirectory, category, "main", - "table, th, td {border-collapse:collapse; border:1px solid blue;}"); + htmlChart = + writeHtmlHeader( + htmlChart, + localDataDirectory, + category, + "main", + "table, th, td {border-collapse:collapse; border:1px solid blue;}"); writeCategoryH1(htmlChart, category); htmlChart.println("

Index

"); htmlChart.println(""); @@ -1081,21 +1368,33 @@ private PrintWriter writeHtmlFooterAndClose(PrintWriter htmlChart) { return null; } - private PrintWriter writeHtmlHeader(PrintWriter htmlChart, String localDataDirectory, String category, - String baseTarget, String styles) throws IOException { - htmlChart.println("\n\n" - + "\n" - + "Picker Data\n" + "\n" - + "\n" - + (styles == null ? "" : "\n") + "\n" - + "\n"); + private PrintWriter writeHtmlHeader( + PrintWriter htmlChart, + String localDataDirectory, + String category, + String baseTarget, + String styles) + throws IOException { + htmlChart.println( + "\n\n" + + "\n" + + "Picker Data\n" + + "\n" + + "\n" + + (styles == null + ? "" + : "\n") + + "\n" + + "\n"); return htmlChart; } private String fileNameFromCategory(String category) { - return "PickerData_" + fixCategoryName(category) - .replace(' ', '_') - .replace("&", "and") + ".html"; + return "PickerData_" + + fixCategoryName(category).replace(' ', '_').replace("&", "and") + + ".html"; } private void writePageIndex(PrintWriter htmlChart, Set set) { @@ -1103,7 +1402,12 @@ private void writePageIndex(PrintWriter htmlChart, Set set) { htmlChart.println("\n

 

(" + new Date() + ")

"); } @@ -1112,8 +1416,12 @@ private void writeCategoryH1(PrintWriter htmlChart, String category) { htmlChart.println("

" + fixCategoryName(category) + "

"); } - private String addResult(StringBuilder result, GeneratePickerData.USet valueChars, String category, - String subcategory, boolean doDisplayData) { + private String addResult( + StringBuilder result, + GeneratePickerData.USet valueChars, + String category, + String subcategory, + boolean doDisplayData) { subcategory = fixCategoryName(subcategory); category = fixCategoryName(category); @@ -1122,8 +1430,16 @@ private String addResult(StringBuilder result, GeneratePickerData.USet valueChar try { valueCharsString = valueChars.toString(); } catch (IllegalArgumentException e) { - System.out.println("/*" + size + "*/" + " " + category + MAIN_SUB_SEPARATOR + subcategory + "\t" - + valueChars.strings); + System.out.println( + "/*" + + size + + "*/" + + " " + + category + + MAIN_SUB_SEPARATOR + + subcategory + + "\t" + + valueChars.strings); throw e; } final int length = valueCharsString.length(); @@ -1131,12 +1447,34 @@ private String addResult(StringBuilder result, GeneratePickerData.USet valueChar for (String s : valueChars.strings) { valueSet.add(s); } - final String quoteFixedvalueCharsString = valueCharsString.replace("\\", "\\\\").replace("\"", "\\\""); - result.append("/*" + size + "," + length + "*/" + " {\"" + subcategory + "\",\"" - + quoteFixedvalueCharsString + "\"},\n"); + final String quoteFixedvalueCharsString = + valueCharsString.replace("\\", "\\\\").replace("\"", "\\\""); + result.append( + "/*" + + size + + "," + + length + + "*/" + + " {\"" + + subcategory + + "\",\"" + + quoteFixedvalueCharsString + + "\"},\n"); if (doDisplayData) { - System.out.println("/*" + size + "," + length + "*/" + " " + category + MAIN_SUB_SEPARATOR - + subcategory + "\t" + valueSet.toPattern(false) + ", " + CldrUtility.toHex(valueCharsString, true)); + System.out.println( + "/*" + + size + + "," + + length + + "*/" + + " " + + category + + MAIN_SUB_SEPARATOR + + subcategory + + "\t" + + valueSet.toPattern(false) + + ", " + + CldrUtility.toHex(valueCharsString, true)); } 
return valueCharsString; } @@ -1154,13 +1492,17 @@ private String fixCategoryName(String subcategory) { if (DEBUG) System.out.println("Skip: " + SKIP); } - private static void addProperty(String propertyAlias, String title, Comparator sort, UnicodeSet retain) { + private static void addProperty( + String propertyAlias, String title, Comparator sort, UnicodeSet retain) { int propEnum = UCharacter.getPropertyEnum(propertyAlias); // get all the value strings, sorted UnicodeSet valueChars = new UnicodeSet(); - for (int i = UCharacter.getIntPropertyMinValue(propEnum); i <= UCharacter.getIntPropertyMaxValue(propEnum); ++i) { - String valueAlias = UCharacter.getPropertyValueName(propEnum, i, UProperty.NameChoice.LONG); + for (int i = UCharacter.getIntPropertyMinValue(propEnum); + i <= UCharacter.getIntPropertyMaxValue(propEnum); + ++i) { + String valueAlias = + UCharacter.getPropertyValueName(propEnum, i, UProperty.NameChoice.LONG); if (valueAlias.contains("Symbol")) { System.out.println("Skipping " + valueAlias); continue; @@ -1170,24 +1512,35 @@ private static void addProperty(String propertyAlias, String title, Comparator sort, - Set propertyValues) { + private static void addProperty( + String propertyAlias, + String title, + Comparator sort, + Set propertyValues) { // get all the value strings, sorted UnicodeSet valueChars = new UnicodeSet(); for (String valueAlias : propertyValues) { @@ -1202,15 +1555,20 @@ private static void addProperty(String propertyAlias, String title, Comparator sortItems(Comparator sort, String propertyAlias, String valueAlias) { + private static Comparator sortItems( + Comparator sort, String propertyAlias, String valueAlias) { if (valueAlias.equals("Decimal_Number") && propertyAlias.equals("General_Category")) { return null; } @@ -1252,7 +1614,6 @@ public int compare(T arg0, T arg1) { } return 0; } - } static class UnicodeSetInclusionFirst> implements Comparator { @@ -1267,13 +1628,13 @@ public int compare(T arg0, T arg1) { boolean a1 = 
included.containsAll(arg1.toString()); return a0 == a1 ? arg0.compareTo(arg1) : a0 ? -1 : 1; } - } public static Set ERROR_COUNT = new LinkedHashSet(); /** * Provide a simple list of strings + * * @param source * @return */ @@ -1294,13 +1655,14 @@ public static String simpleList(Iterable source) { } return b.toString(); } - + static class USet { Collection strings; /** - * A few choices. As a plain list, as a LinkedHashSet, sorted by code point, or sorted by specific comparator - * + * A few choices. As a plain list, as a LinkedHashSet, sorted by code point, or sorted by + * specific comparator + * * @param sorted */ public USet(Comparator sorted) { @@ -1322,7 +1684,8 @@ public String toString() { // for (String s : set) { // set2.add(s); // } - // //if (DEBUG) System.out.println("Sorted " + value + ": " + valueChars.size() + ", " + valueChars); + // //if (DEBUG) System.out.println("Sorted " + value + ": " + valueChars.size() + ", " + + // valueChars); // if (set2.isEmpty()) { // return null; // } @@ -1341,30 +1704,43 @@ public String toString() { Set ba = new LinkedHashSet(reversal); ba.removeAll(original); System.out.println("FAILED!!!!"); - IllegalArgumentException e = new IllegalArgumentException("Failed with: " + original + "\n" - + "Range String: " + Compacter.getInternalRangeString(strings) + "\n" - + "In original but not restored: " + ab + "\n" + "In restored but not original: " + ba + "\n" - + "Returned range string: " + CharacterListCompressor.base88DecodeList(result.toString()) - // CharacterListCompressor.base88Decode(in); - ); + IllegalArgumentException e = + new IllegalArgumentException( + "Failed with: " + + original + + "\n" + + "Range String: " + + Compacter.getInternalRangeString(strings) + + "\n" + + "In original but not restored: " + + ab + + "\n" + + "In restored but not original: " + + ba + + "\n" + + "Returned range string: " + + CharacterListCompressor.base88DecodeList( + result.toString()) + // CharacterListCompressor.base88Decode(in); + ); 
e.printStackTrace(System.err); ERROR_COUNT.add(e); } return result.toString(); } - } /** - * Modifies Unicode set to flatten the strings. Eg [abc{da}] => [abcd] Returns the set for chaining. - * + * Modifies Unicode set to flatten the strings. Eg [abc{da}] => [abcd] Returns the set for + * chaining. + * * @param exemplar1 * @return */ public static UnicodeSet flatten(UnicodeSet exemplar1) { UnicodeSet result = new UnicodeSet(); boolean gotString = false; - for (UnicodeSetIterator it = new UnicodeSetIterator(exemplar1); it.nextRange();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(exemplar1); it.nextRange(); ) { if (it.codepoint == UnicodeSetIterator.IS_STRING) { result.addAll(it.string); gotString = true; @@ -1376,186 +1752,627 @@ public static UnicodeSet flatten(UnicodeSet exemplar1) { return exemplar1; } - static String MKD_RULES = "\u1101 > \u1100\u1100;" + "\u1104 > \u1103\u1103;" + "\u1108 > \u1107\u1107;" - + "\u110A > \u1109\u1109;" + "\u110D > \u110C\u110C;" + "\u1113 > \u1102\u1100;" + "\u1114 > \u1102\u1102;" - + "\u1115 > \u1102\u1103;" + "\u1116 > \u1102\u1107;" + "\u1117 > \u1103\u1100;" + "\u1118 > \u1105\u1102;" - + "\u1119 > \u1105\u1105;" + "\u111A > \u1105\u1112;" + "\u111B > \u1105\u110B;" + "\u111C > \u1106\u1107;" - + "\u111D > \u1106\u110B;" + "\u111E > \u1107\u1100;" + "\u111F > \u1107\u1102;" + "\u1120 > \u1107\u1103;" - + "\u1121 > \u1107\u1109;" + "\u1122 > \u1107\u1109\u1100;" + "\u1123 > \u1107\u1109\u1103;" - + "\u1124 > \u1107\u1109\u1107;" + "\u1125 > \u1107\u1109\u1109;" + "\u1126 > \u1107\u1109\u110C;" - + "\u1127 > \u1107\u110C;" + "\u1128 > \u1107\u110E;" + "\u1129 > \u1107\u1110;" + "\u112A > \u1107\u1111;" - + "\u112B > \u1107\u110B;" + "\u112C > \u1107\u1107\u110B;" + "\u112D > \u1109\u1100;" - + "\u112E > \u1109\u1102;" + "\u112F > \u1109\u1103;" + "\u1130 > \u1109\u1105;" + "\u1131 > \u1109\u1106;" - + "\u1132 > \u1109\u1107;" + "\u1133 > \u1109\u1107\u1100;" + "\u1134 > \u1109\u1109\u1109;" - + "\u1135 > 
\u1109\u110B;" + "\u1136 > \u1109\u110C;" + "\u1137 > \u1109\u110E;" + "\u1138 > \u1109\u110F;" - + "\u1139 > \u1109\u1110;" + "\u113A > \u1109\u1111;" + "\u113B > \u1109\u1112;" + "\u113D > \u113C\u113C;" - + "\u113F > \u113E\u113E;" + "\u1141 > \u110B\u1100;" + "\u1142 > \u110B\u1103;" + "\u1143 > \u110B\u1106;" - + "\u1144 > \u110B\u1107;" + "\u1145 > \u110B\u1109;" + "\u1146 > \u110B\u1140;" + "\u1147 > \u110B\u110B;" - + "\u1148 > \u110B\u110C;" + "\u1149 > \u110B\u110E;" + "\u114A > \u110B\u1110;" + "\u114B > \u110B\u1111;" - + "\u114D > \u110C\u110B;" + "\u114F > \u114E\u114E;" + "\u1151 > \u1150\u1150;" + "\u1152 > \u110E\u110F;" - + "\u1153 > \u110E\u1112;" + "\u1156 > \u1111\u1107;" + "\u1157 > \u1111\u110B;" + "\u1158 > \u1112\u1112;" - + "\u115A > \u1100\u1103;" + "\u115B > \u1102\u1109;" + "\u115C > \u1102\u110C;" + "\u115D > \u1102\u1112;" - + "\u115E > \u1103\u1105;" + "\uA960 > \u1103\u1106;" + "\uA961 > \u1103\u1107;" + "\uA962 > \u1103\u1109;" - + "\uA963 > \u1103\u110C;" + "\uA964 > \u1105\u1100;" + "\uA965 > \u1105\u1100\u1100;" - + "\uA966 > \u1105\u1103;" + "\uA967 > \u1105\u1103\u1103;" + "\uA968 > \u1105\u1106;" - + "\uA969 > \u1105\u1107;" + "\uA96A > \u1105\u1107\u1107;" + "\uA96B > \u1105\u1107\u110B;" - + "\uA96C > \u1105\u1109;" + "\uA96D > \u1105\u110C;" + "\uA96E > \u1105\u110F;" + "\uA96F > \u1106\u1100;" - + "\uA970 > \u1106\u1103;" + "\uA971 > \u1106\u1109;" + "\uA972 > \u1107\u1109\u1110;" - + "\uA973 > \u1107\u110F;" + "\uA974 > \u1107\u1112;" + "\uA975 > \u1109\u1109\u1107;" - + "\uA976 > \u110B\u1105;" + "\uA977 > \u110B\u1112;" + "\uA978 > \u110C\u110C\u1112;" - + "\uA979 > \u1110\u1110;" + "\uA97A > \u1111\u1112;" + "\uA97B > \u1112\u1109;" + "\uA97C > \u1159\u1159;" - + "\u1162 > \u1161\u1175;" + "\u1164 > \u1163\u1175;" + "\u1166 > \u1165\u1175;" + "\u1168 > \u1167\u1175;" - + "\u116A > \u1169\u1161;" + "\u116B > \u1169\u1161\u1175;" + "\u116C > \u1169\u1175;" - + "\u116F > \u116E\u1165;" + "\u1170 > \u116E\u1165\u1175;" + 
"\u1171 > \u116E\u1175;" - + "\u1174 > \u1173\u1175;" + "\u1176 > \u1161\u1169;" + "\u1177 > \u1161\u116E;" + "\u1178 > \u1163\u1169;" - + "\u1179 > \u1163\u116D;" + "\u117A > \u1165\u1169;" + "\u117B > \u1165\u116E;" + "\u117C > \u1165\u1173;" - + "\u117D > \u1167\u1169;" + "\u117E > \u1167\u116E;" + "\u117F > \u1169\u1165;" - + "\u1180 > \u1169\u1165\u1175;" + "\u1181 > \u1169\u1167\u1175;" + "\u1182 > \u1169\u1169;" - + "\u1183 > \u1169\u116E;" + "\u1184 > \u116D\u1163;" + "\u1185 > \u116D\u1163\u1175;" - + "\u1186 > \u116D\u1167;" + "\u1187 > \u116D\u1169;" + "\u1188 > \u116D\u1175;" + "\u1189 > \u116E\u1161;" - + "\u118A > \u116E\u1161\u1175;" + "\u118B > \u116E\u1165\u1173;" + "\u118C > \u116E\u1167\u1175;" - + "\u118D > \u116E\u116E;" + "\u118E > \u1172\u1161;" + "\u118F > \u1172\u1165;" - + "\u1190 > \u1172\u1165\u1175;" + "\u1191 > \u1172\u1167;" + "\u1192 > \u1172\u1167\u1175;" - + "\u1193 > \u1172\u116E;" + "\u1194 > \u1172\u1175;" + "\u1195 > \u1173\u116E;" + "\u1196 > \u1173\u1173;" - + "\u1197 > \u1173\u1175\u116E;" + "\u1198 > \u1175\u1161;" + "\u1199 > \u1175\u1163;" - + "\u119A > \u1175\u1169;" + "\u119B > \u1175\u116E;" + "\u119C > \u1175\u1173;" + "\u119D > \u1175\u119E;" - + "\u119F > \u119E\u1165;" + "\u11A0 > \u119E\u116E;" + "\u11A1 > \u119E\u1175;" + "\u11A2 > \u119E\u119E;" - + "\u11A3 > \u1161\u1173;" + "\u11A4 > \u1163\u116E;" + "\u11A5 > \u1167\u1163;" + "\u11A6 > \u1169\u1163;" - + "\u11A7 > \u1169\u1163\u1175;" + "\uD7B0 > \u1169\u1167;" + "\uD7B1 > \u1169\u1169\u1175;" - + "\uD7B2 > \u116D\u1161;" + "\uD7B3 > \u116D\u1161\u1175;" + "\uD7B4 > \u116D\u1165;" - + "\uD7B5 > \u116E\u1167;" + "\uD7B6 > \u116E\u1175\u1175;" + "\uD7B7 > \u1172\u1161\u1175;" - + "\uD7B8 > \u1172\u1169;" + "\uD7B9 > \u1173\u1161;" + "\uD7BA > \u1173\u1165;" - + "\uD7BB > \u1173\u1165\u1175;" + "\uD7BC > \u1173\u1169;" + "\uD7BD > \u1175\u1163\u1169;" - + "\uD7BE > \u1175\u1163\u1175;" + "\uD7BF > \u1175\u1167;" + "\uD7C0 > \u1175\u1167\u1175;" - + "\uD7C1 > 
\u1175\u1169\u1175;" + "\uD7C2 > \u1175\u116D;" + "\uD7C3 > \u1175\u1172;" - + "\uD7C4 > \u1175\u1175;" + "\uD7C5 > \u119E\u1161;" + "\uD7C6 > \u119E\u1165\u1175;" - + "\u11A9 > \u11A8\u11A8;" + "\u11AA > \u11A8\u11BA;" + "\u11AC > \u11AB\u11BD;" + "\u11AD > \u11AB\u11C2;" - + "\u11B0 > \u11AF\u11A8;" + "\u11B1 > \u11AF\u11B7;" + "\u11B2 > \u11AF\u11B8;" + "\u11B3 > \u11AF\u11BA;" - + "\u11B4 > \u11AF\u11C0;" + "\u11B5 > \u11AF\u11C1;" + "\u11B6 > \u11AF\u11C2;" + "\u11B9 > \u11B8\u11BA;" - + "\u11BB > \u11BA\u11BA;" + "\u11C3 > \u11A8\u11AF;" + "\u11C4 > \u11A8\u11BA\u11A8;" - + "\u11C5 > \u11AB\u11A8;" + "\u11C6 > \u11AB\u11AE;" + "\u11C7 > \u11AB\u11BA;" + "\u11C8 > \u11AB\u11EB;" - + "\u11C9 > \u11AB\u11C0;" + "\u11CA > \u11AE\u11A8;" + "\u11CB > \u11AE\u11AF;" - + "\u11CC > \u11AF\u11A8\u11BA;" + "\u11CD > \u11AF\u11AB;" + "\u11CE > \u11AF\u11AE;" - + "\u11CF > \u11AF\u11AE\u11C2;" + "\u11D0 > \u11AF\u11AF;" + "\u11D1 > \u11AF\u11B7\u11A8;" - + "\u11D2 > \u11AF\u11B7\u11BA;" + "\u11D3 > \u11AF\u11B8\u11BA;" + "\u11D4 > \u11AF\u11B8\u11C2;" - + "\u11D5 > \u11AF\u11B8\u11BC;" + "\u11D6 > \u11AF\u11BA\u11BA;" + "\u11D7 > \u11AF\u11EB;" - + "\u11D8 > \u11AF\u11BF;" + "\u11D9 > \u11AF\u11F9;" + "\u11DA > \u11B7\u11A8;" + "\u11DB > \u11B7\u11AF;" - + "\u11DC > \u11B7\u11B8;" + "\u11DD > \u11B7\u11BA;" + "\u11DE > \u11B7\u11BA\u11BA;" - + "\u11DF > \u11B7\u11EB;" + "\u11E0 > \u11B7\u11BE;" + "\u11E1 > \u11B7\u11C2;" + "\u11E2 > \u11B7\u11BC;" - + "\u11E3 > \u11B8\u11AF;" + "\u11E4 > \u11B8\u11C1;" + "\u11E5 > \u11B8\u11C2;" + "\u11E6 > \u11B8\u11BC;" - + "\u11E7 > \u11BA\u11A8;" + "\u11E8 > \u11BA\u11AE;" + "\u11E9 > \u11BA\u11AF;" + "\u11EA > \u11BA\u11B8;" - + "\u11EC > \u11BC\u11A8;" + "\u11ED > \u11BC\u11A8\u11A8;" + "\u11EE > \u11BC\u11BC;" - + "\u11EF > \u11BC\u11BF;" + "\u11F1 > \u11F0\u11BA;" + "\u11F2 > \u11F0\u11EB;" + "\u11F3 > \u11C1\u11B8;" - + "\u11F4 > \u11C1\u11BC;" + "\u11F5 > \u11C2\u11AB;" + "\u11F6 > \u11C2\u11AF;" + "\u11F7 > \u11C2\u11B7;" - + 
"\u11F8 > \u11C2\u11B8;" + "\u11FA > \u11A8\u11AB;" + "\u11FB > \u11A8\u11B8;" + "\u11FC > \u11A8\u11BE;" - + "\u11FD > \u11A8\u11BF;" + "\u11FE > \u11A8\u11C2;" + "\u11FF > \u11AB\u11AB;" + "\uD7CB > \u11AB\u11AF;" - + "\uD7CC > \u11AB\u11BE;" + "\uD7CD > \u11AE\u11AE;" + "\uD7CE > \u11AE\u11AE\u11B8;" - + "\uD7CF > \u11AE\u11B8;" + "\uD7D0 > \u11AE\u11BA;" + "\uD7D1 > \u11AE\u11BA\u11A8;" - + "\uD7D2 > \u11AE\u11BD;" + "\uD7D3 > \u11AE\u11BE;" + "\uD7D4 > \u11AE\u11C0;" - + "\uD7D5 > \u11AF\u11A8\u11A8;" + "\uD7D6 > \u11AF\u11A8\u11C2;" + "\uD7D7 > \u11AF\u11AF\u11BF;" - + "\uD7D8 > \u11AF\u11B7\u11C2;" + "\uD7D9 > \u11AF\u11B8\u11AE;" + "\uD7DA > \u11AF\u11B8\u11C1;" - + "\uD7DB > \u11AF\u11F0;" + "\uD7DC > \u11AF\u11F9\u11C2;" + "\uD7DD > \u11AF\u11BC;" - + "\uD7DE > \u11B7\u11AB;" + "\uD7DF > \u11B7\u11AB\u11AB;" + "\uD7E0 > \u11B7\u11B7;" - + "\uD7E1 > \u11B7\u11B8\u11BA;" + "\uD7E2 > \u11B7\u11BD;" + "\uD7E3 > \u11B8\u11AE;" - + "\uD7E4 > \u11B8\u11AF\u11C1;" + "\uD7E5 > \u11B8\u11B7;" + "\uD7E6 > \u11B8\u11B8;" - + "\uD7E7 > \u11B8\u11BA\u11AE;" + "\uD7E8 > \u11B8\u11BD;" + "\uD7E9 > \u11B8\u11BE;" - + "\uD7EA > \u11BA\u11B7;" + "\uD7EB > \u11BA\u11B8\u11BC;" + "\uD7EC > \u11BA\u11BA\u11A8;" - + "\uD7ED > \u11BA\u11BA\u11AE;" + "\uD7EE > \u11BA\u11EB;" + "\uD7EF > \u11BA\u11BD;" - + "\uD7F0 > \u11BA\u11BE;" + "\uD7F1 > \u11BA\u11C0;" + "\uD7F2 > \u11BA\u11C2;" + "\uD7F3 > \u11EB\u11B8;" - + "\uD7F4 > \u11EB\u11B8\u11BC;" + "\uD7F5 > \u11F0\u11B7;" + "\uD7F6 > \u11F0\u11C2;" - + "\uD7F7 > \u11BD\u11B8;" + "\uD7F8 > \u11BD\u11B8\u11B8;" + "\uD7F9 > \u11BD\u11BD;" - + "\uD7FA > \u11C1\u11BA;" + "\uD7FB > \u11C1\u11C0;"; + static String MKD_RULES = + "\u1101 > \u1100\u1100;" + + "\u1104 > \u1103\u1103;" + + "\u1108 > \u1107\u1107;" + + "\u110A > \u1109\u1109;" + + "\u110D > \u110C\u110C;" + + "\u1113 > \u1102\u1100;" + + "\u1114 > \u1102\u1102;" + + "\u1115 > \u1102\u1103;" + + "\u1116 > \u1102\u1107;" + + "\u1117 > \u1103\u1100;" + + "\u1118 > \u1105\u1102;" + 
+ "\u1119 > \u1105\u1105;" + + "\u111A > \u1105\u1112;" + + "\u111B > \u1105\u110B;" + + "\u111C > \u1106\u1107;" + + "\u111D > \u1106\u110B;" + + "\u111E > \u1107\u1100;" + + "\u111F > \u1107\u1102;" + + "\u1120 > \u1107\u1103;" + + "\u1121 > \u1107\u1109;" + + "\u1122 > \u1107\u1109\u1100;" + + "\u1123 > \u1107\u1109\u1103;" + + "\u1124 > \u1107\u1109\u1107;" + + "\u1125 > \u1107\u1109\u1109;" + + "\u1126 > \u1107\u1109\u110C;" + + "\u1127 > \u1107\u110C;" + + "\u1128 > \u1107\u110E;" + + "\u1129 > \u1107\u1110;" + + "\u112A > \u1107\u1111;" + + "\u112B > \u1107\u110B;" + + "\u112C > \u1107\u1107\u110B;" + + "\u112D > \u1109\u1100;" + + "\u112E > \u1109\u1102;" + + "\u112F > \u1109\u1103;" + + "\u1130 > \u1109\u1105;" + + "\u1131 > \u1109\u1106;" + + "\u1132 > \u1109\u1107;" + + "\u1133 > \u1109\u1107\u1100;" + + "\u1134 > \u1109\u1109\u1109;" + + "\u1135 > \u1109\u110B;" + + "\u1136 > \u1109\u110C;" + + "\u1137 > \u1109\u110E;" + + "\u1138 > \u1109\u110F;" + + "\u1139 > \u1109\u1110;" + + "\u113A > \u1109\u1111;" + + "\u113B > \u1109\u1112;" + + "\u113D > \u113C\u113C;" + + "\u113F > \u113E\u113E;" + + "\u1141 > \u110B\u1100;" + + "\u1142 > \u110B\u1103;" + + "\u1143 > \u110B\u1106;" + + "\u1144 > \u110B\u1107;" + + "\u1145 > \u110B\u1109;" + + "\u1146 > \u110B\u1140;" + + "\u1147 > \u110B\u110B;" + + "\u1148 > \u110B\u110C;" + + "\u1149 > \u110B\u110E;" + + "\u114A > \u110B\u1110;" + + "\u114B > \u110B\u1111;" + + "\u114D > \u110C\u110B;" + + "\u114F > \u114E\u114E;" + + "\u1151 > \u1150\u1150;" + + "\u1152 > \u110E\u110F;" + + "\u1153 > \u110E\u1112;" + + "\u1156 > \u1111\u1107;" + + "\u1157 > \u1111\u110B;" + + "\u1158 > \u1112\u1112;" + + "\u115A > \u1100\u1103;" + + "\u115B > \u1102\u1109;" + + "\u115C > \u1102\u110C;" + + "\u115D > \u1102\u1112;" + + "\u115E > \u1103\u1105;" + + "\uA960 > \u1103\u1106;" + + "\uA961 > \u1103\u1107;" + + "\uA962 > \u1103\u1109;" + + "\uA963 > \u1103\u110C;" + + "\uA964 > \u1105\u1100;" + + "\uA965 > \u1105\u1100\u1100;" + + 
"\uA966 > \u1105\u1103;" + + "\uA967 > \u1105\u1103\u1103;" + + "\uA968 > \u1105\u1106;" + + "\uA969 > \u1105\u1107;" + + "\uA96A > \u1105\u1107\u1107;" + + "\uA96B > \u1105\u1107\u110B;" + + "\uA96C > \u1105\u1109;" + + "\uA96D > \u1105\u110C;" + + "\uA96E > \u1105\u110F;" + + "\uA96F > \u1106\u1100;" + + "\uA970 > \u1106\u1103;" + + "\uA971 > \u1106\u1109;" + + "\uA972 > \u1107\u1109\u1110;" + + "\uA973 > \u1107\u110F;" + + "\uA974 > \u1107\u1112;" + + "\uA975 > \u1109\u1109\u1107;" + + "\uA976 > \u110B\u1105;" + + "\uA977 > \u110B\u1112;" + + "\uA978 > \u110C\u110C\u1112;" + + "\uA979 > \u1110\u1110;" + + "\uA97A > \u1111\u1112;" + + "\uA97B > \u1112\u1109;" + + "\uA97C > \u1159\u1159;" + + "\u1162 > \u1161\u1175;" + + "\u1164 > \u1163\u1175;" + + "\u1166 > \u1165\u1175;" + + "\u1168 > \u1167\u1175;" + + "\u116A > \u1169\u1161;" + + "\u116B > \u1169\u1161\u1175;" + + "\u116C > \u1169\u1175;" + + "\u116F > \u116E\u1165;" + + "\u1170 > \u116E\u1165\u1175;" + + "\u1171 > \u116E\u1175;" + + "\u1174 > \u1173\u1175;" + + "\u1176 > \u1161\u1169;" + + "\u1177 > \u1161\u116E;" + + "\u1178 > \u1163\u1169;" + + "\u1179 > \u1163\u116D;" + + "\u117A > \u1165\u1169;" + + "\u117B > \u1165\u116E;" + + "\u117C > \u1165\u1173;" + + "\u117D > \u1167\u1169;" + + "\u117E > \u1167\u116E;" + + "\u117F > \u1169\u1165;" + + "\u1180 > \u1169\u1165\u1175;" + + "\u1181 > \u1169\u1167\u1175;" + + "\u1182 > \u1169\u1169;" + + "\u1183 > \u1169\u116E;" + + "\u1184 > \u116D\u1163;" + + "\u1185 > \u116D\u1163\u1175;" + + "\u1186 > \u116D\u1167;" + + "\u1187 > \u116D\u1169;" + + "\u1188 > \u116D\u1175;" + + "\u1189 > \u116E\u1161;" + + "\u118A > \u116E\u1161\u1175;" + + "\u118B > \u116E\u1165\u1173;" + + "\u118C > \u116E\u1167\u1175;" + + "\u118D > \u116E\u116E;" + + "\u118E > \u1172\u1161;" + + "\u118F > \u1172\u1165;" + + "\u1190 > \u1172\u1165\u1175;" + + "\u1191 > \u1172\u1167;" + + "\u1192 > \u1172\u1167\u1175;" + + "\u1193 > \u1172\u116E;" + + "\u1194 > \u1172\u1175;" + + "\u1195 > 
\u1173\u116E;" + + "\u1196 > \u1173\u1173;" + + "\u1197 > \u1173\u1175\u116E;" + + "\u1198 > \u1175\u1161;" + + "\u1199 > \u1175\u1163;" + + "\u119A > \u1175\u1169;" + + "\u119B > \u1175\u116E;" + + "\u119C > \u1175\u1173;" + + "\u119D > \u1175\u119E;" + + "\u119F > \u119E\u1165;" + + "\u11A0 > \u119E\u116E;" + + "\u11A1 > \u119E\u1175;" + + "\u11A2 > \u119E\u119E;" + + "\u11A3 > \u1161\u1173;" + + "\u11A4 > \u1163\u116E;" + + "\u11A5 > \u1167\u1163;" + + "\u11A6 > \u1169\u1163;" + + "\u11A7 > \u1169\u1163\u1175;" + + "\uD7B0 > \u1169\u1167;" + + "\uD7B1 > \u1169\u1169\u1175;" + + "\uD7B2 > \u116D\u1161;" + + "\uD7B3 > \u116D\u1161\u1175;" + + "\uD7B4 > \u116D\u1165;" + + "\uD7B5 > \u116E\u1167;" + + "\uD7B6 > \u116E\u1175\u1175;" + + "\uD7B7 > \u1172\u1161\u1175;" + + "\uD7B8 > \u1172\u1169;" + + "\uD7B9 > \u1173\u1161;" + + "\uD7BA > \u1173\u1165;" + + "\uD7BB > \u1173\u1165\u1175;" + + "\uD7BC > \u1173\u1169;" + + "\uD7BD > \u1175\u1163\u1169;" + + "\uD7BE > \u1175\u1163\u1175;" + + "\uD7BF > \u1175\u1167;" + + "\uD7C0 > \u1175\u1167\u1175;" + + "\uD7C1 > \u1175\u1169\u1175;" + + "\uD7C2 > \u1175\u116D;" + + "\uD7C3 > \u1175\u1172;" + + "\uD7C4 > \u1175\u1175;" + + "\uD7C5 > \u119E\u1161;" + + "\uD7C6 > \u119E\u1165\u1175;" + + "\u11A9 > \u11A8\u11A8;" + + "\u11AA > \u11A8\u11BA;" + + "\u11AC > \u11AB\u11BD;" + + "\u11AD > \u11AB\u11C2;" + + "\u11B0 > \u11AF\u11A8;" + + "\u11B1 > \u11AF\u11B7;" + + "\u11B2 > \u11AF\u11B8;" + + "\u11B3 > \u11AF\u11BA;" + + "\u11B4 > \u11AF\u11C0;" + + "\u11B5 > \u11AF\u11C1;" + + "\u11B6 > \u11AF\u11C2;" + + "\u11B9 > \u11B8\u11BA;" + + "\u11BB > \u11BA\u11BA;" + + "\u11C3 > \u11A8\u11AF;" + + "\u11C4 > \u11A8\u11BA\u11A8;" + + "\u11C5 > \u11AB\u11A8;" + + "\u11C6 > \u11AB\u11AE;" + + "\u11C7 > \u11AB\u11BA;" + + "\u11C8 > \u11AB\u11EB;" + + "\u11C9 > \u11AB\u11C0;" + + "\u11CA > \u11AE\u11A8;" + + "\u11CB > \u11AE\u11AF;" + + "\u11CC > \u11AF\u11A8\u11BA;" + + "\u11CD > \u11AF\u11AB;" + + "\u11CE > \u11AF\u11AE;" + + "\u11CF > 
\u11AF\u11AE\u11C2;" + + "\u11D0 > \u11AF\u11AF;" + + "\u11D1 > \u11AF\u11B7\u11A8;" + + "\u11D2 > \u11AF\u11B7\u11BA;" + + "\u11D3 > \u11AF\u11B8\u11BA;" + + "\u11D4 > \u11AF\u11B8\u11C2;" + + "\u11D5 > \u11AF\u11B8\u11BC;" + + "\u11D6 > \u11AF\u11BA\u11BA;" + + "\u11D7 > \u11AF\u11EB;" + + "\u11D8 > \u11AF\u11BF;" + + "\u11D9 > \u11AF\u11F9;" + + "\u11DA > \u11B7\u11A8;" + + "\u11DB > \u11B7\u11AF;" + + "\u11DC > \u11B7\u11B8;" + + "\u11DD > \u11B7\u11BA;" + + "\u11DE > \u11B7\u11BA\u11BA;" + + "\u11DF > \u11B7\u11EB;" + + "\u11E0 > \u11B7\u11BE;" + + "\u11E1 > \u11B7\u11C2;" + + "\u11E2 > \u11B7\u11BC;" + + "\u11E3 > \u11B8\u11AF;" + + "\u11E4 > \u11B8\u11C1;" + + "\u11E5 > \u11B8\u11C2;" + + "\u11E6 > \u11B8\u11BC;" + + "\u11E7 > \u11BA\u11A8;" + + "\u11E8 > \u11BA\u11AE;" + + "\u11E9 > \u11BA\u11AF;" + + "\u11EA > \u11BA\u11B8;" + + "\u11EC > \u11BC\u11A8;" + + "\u11ED > \u11BC\u11A8\u11A8;" + + "\u11EE > \u11BC\u11BC;" + + "\u11EF > \u11BC\u11BF;" + + "\u11F1 > \u11F0\u11BA;" + + "\u11F2 > \u11F0\u11EB;" + + "\u11F3 > \u11C1\u11B8;" + + "\u11F4 > \u11C1\u11BC;" + + "\u11F5 > \u11C2\u11AB;" + + "\u11F6 > \u11C2\u11AF;" + + "\u11F7 > \u11C2\u11B7;" + + "\u11F8 > \u11C2\u11B8;" + + "\u11FA > \u11A8\u11AB;" + + "\u11FB > \u11A8\u11B8;" + + "\u11FC > \u11A8\u11BE;" + + "\u11FD > \u11A8\u11BF;" + + "\u11FE > \u11A8\u11C2;" + + "\u11FF > \u11AB\u11AB;" + + "\uD7CB > \u11AB\u11AF;" + + "\uD7CC > \u11AB\u11BE;" + + "\uD7CD > \u11AE\u11AE;" + + "\uD7CE > \u11AE\u11AE\u11B8;" + + "\uD7CF > \u11AE\u11B8;" + + "\uD7D0 > \u11AE\u11BA;" + + "\uD7D1 > \u11AE\u11BA\u11A8;" + + "\uD7D2 > \u11AE\u11BD;" + + "\uD7D3 > \u11AE\u11BE;" + + "\uD7D4 > \u11AE\u11C0;" + + "\uD7D5 > \u11AF\u11A8\u11A8;" + + "\uD7D6 > \u11AF\u11A8\u11C2;" + + "\uD7D7 > \u11AF\u11AF\u11BF;" + + "\uD7D8 > \u11AF\u11B7\u11C2;" + + "\uD7D9 > \u11AF\u11B8\u11AE;" + + "\uD7DA > \u11AF\u11B8\u11C1;" + + "\uD7DB > \u11AF\u11F0;" + + "\uD7DC > \u11AF\u11F9\u11C2;" + + "\uD7DD > \u11AF\u11BC;" + + "\uD7DE > 
\u11B7\u11AB;" + + "\uD7DF > \u11B7\u11AB\u11AB;" + + "\uD7E0 > \u11B7\u11B7;" + + "\uD7E1 > \u11B7\u11B8\u11BA;" + + "\uD7E2 > \u11B7\u11BD;" + + "\uD7E3 > \u11B8\u11AE;" + + "\uD7E4 > \u11B8\u11AF\u11C1;" + + "\uD7E5 > \u11B8\u11B7;" + + "\uD7E6 > \u11B8\u11B8;" + + "\uD7E7 > \u11B8\u11BA\u11AE;" + + "\uD7E8 > \u11B8\u11BD;" + + "\uD7E9 > \u11B8\u11BE;" + + "\uD7EA > \u11BA\u11B7;" + + "\uD7EB > \u11BA\u11B8\u11BC;" + + "\uD7EC > \u11BA\u11BA\u11A8;" + + "\uD7ED > \u11BA\u11BA\u11AE;" + + "\uD7EE > \u11BA\u11EB;" + + "\uD7EF > \u11BA\u11BD;" + + "\uD7F0 > \u11BA\u11BE;" + + "\uD7F1 > \u11BA\u11C0;" + + "\uD7F2 > \u11BA\u11C2;" + + "\uD7F3 > \u11EB\u11B8;" + + "\uD7F4 > \u11EB\u11B8\u11BC;" + + "\uD7F5 > \u11F0\u11B7;" + + "\uD7F6 > \u11F0\u11C2;" + + "\uD7F7 > \u11BD\u11B8;" + + "\uD7F8 > \u11BD\u11B8\u11B8;" + + "\uD7F9 > \u11BD\u11BD;" + + "\uD7FA > \u11C1\u11BA;" + + "\uD7FB > \u11C1\u11C0;"; static final String MKC_RULES = // "::MKD;"+ - "\u1107\u1109\u1100 > \u1122;" + "\u1107\u1109\u1103 > \u1123;" + "\u1107\u1109\u1107 > \u1124;" - + "\u1107\u1109\u1109 > \u1125;" + "\u1107\u1109\u110C > \u1126;" + "\u1107\u1107\u110B > \u112C;" - + "\u1109\u1107\u1100 > \u1133;" + "\u1109\u1109\u1109 > \u1134;" + "\u1169\u1161\u1175 > \u116B;" - + "\u116E\u1165\u1175 > \u1170;" + "\u1169\u1165\u1175 > \u1180;" + "\u1169\u1167\u1175 > \u1181;" - + "\u116D\u1163\u1175 > \u1185;" + "\u116E\u1161\u1175 > \u118A;" + "\u116E\u1165\u1173 > \u118B;" - + "\u116E\u1167\u1175 > \u118C;" + "\u1172\u1165\u1175 > \u1190;" + "\u1172\u1167\u1175 > \u1192;" - + "\u1173\u1175\u116E > \u1197;" + "\u1169\u1163\u1175 > \u11A7;" + "\u11A8\u11BA\u11A8 > \u11C4;" - + "\u11AF\u11A8\u11BA > \u11CC;" + "\u11AF\u11AE\u11C2 > \u11CF;" + "\u11AF\u11B7\u11A8 > \u11D1;" - + "\u11AF\u11B7\u11BA > \u11D2;" + "\u11AF\u11B8\u11BA > \u11D3;" + "\u11AF\u11B8\u11C2 > \u11D4;" - + "\u11AF\u11B8\u11BC > \u11D5;" + "\u11AF\u11BA\u11BA > \u11D6;" + "\u11B7\u11BA\u11BA > \u11DE;" - + "\u11BC\u11A8\u11A8 > \u11ED;" 
+ "\u1105\u1100\u1100 > \uA965;" + "\u1105\u1103\u1103 > \uA967;" - + "\u1105\u1107\u1107 > \uA96A;" + "\u1105\u1107\u110B > \uA96B;" + "\u1107\u1109\u1110 > \uA972;" - + "\u1109\u1109\u1107 > \uA975;" + "\u110C\u110C\u1112 > \uA978;" + "\u1169\u1169\u1175 > \uD7B1;" - + "\u116D\u1161\u1175 > \uD7B3;" + "\u116E\u1175\u1175 > \uD7B6;" + "\u1172\u1161\u1175 > \uD7B7;" - + "\u1173\u1165\u1175 > \uD7BB;" + "\u1175\u1163\u1169 > \uD7BD;" + "\u1175\u1163\u1175 > \uD7BE;" - + "\u1175\u1167\u1175 > \uD7C0;" + "\u1175\u1169\u1175 > \uD7C1;" + "\u119E\u1165\u1175 > \uD7C6;" - + "\u11AE\u11AE\u11B8 > \uD7CE;" + "\u11AE\u11BA\u11A8 > \uD7D1;" + "\u11AF\u11A8\u11A8 > \uD7D5;" - + "\u11AF\u11A8\u11C2 > \uD7D6;" + "\u11AF\u11AF\u11BF > \uD7D7;" + "\u11AF\u11B7\u11C2 > \uD7D8;" - + "\u11AF\u11B8\u11AE > \uD7D9;" + "\u11AF\u11B8\u11C1 > \uD7DA;" + "\u11AF\u11F9\u11C2 > \uD7DC;" - + "\u11B7\u11AB\u11AB > \uD7DF;" + "\u11B7\u11B8\u11BA > \uD7E1;" + "\u11B8\u11AF\u11C1 > \uD7E4;" - + "\u11B8\u11BA\u11AE > \uD7E7;" + "\u11BA\u11B8\u11BC > \uD7EB;" + "\u11BA\u11BA\u11A8 > \uD7EC;" - + "\u11BA\u11BA\u11AE > \uD7ED;" + "\u11EB\u11B8\u11BC > \uD7F4;" + "\u11BD\u11B8\u11B8 > \uD7F8;" - + "\u1100\u1100 > \u1101;" + "\u1103\u1103 > \u1104;" + "\u1107\u1107 > \u1108;" + "\u1109\u1109 > \u110A;" - + "\u110C\u110C > \u110D;" + "\u1102\u1100 > \u1113;" + "\u1102\u1102 > \u1114;" + "\u1102\u1103 > \u1115;" - + "\u1102\u1107 > \u1116;" + "\u1103\u1100 > \u1117;" + "\u1105\u1102 > \u1118;" + "\u1105\u1105 > \u1119;" - + "\u1105\u1112 > \u111A;" + "\u1105\u110B > \u111B;" + "\u1106\u1107 > \u111C;" + "\u1106\u110B > \u111D;" - + "\u1107\u1100 > \u111E;" + "\u1107\u1102 > \u111F;" + "\u1107\u1103 > \u1120;" + "\u1107\u1109 > \u1121;" - + "\u1107\u110C > \u1127;" + "\u1107\u110E > \u1128;" + "\u1107\u1110 > \u1129;" + "\u1107\u1111 > \u112A;" - + "\u1107\u110B > \u112B;" + "\u1109\u1100 > \u112D;" + "\u1109\u1102 > \u112E;" + "\u1109\u1103 > \u112F;" - + "\u1109\u1105 > \u1130;" + "\u1109\u1106 > 
\u1131;" + "\u1109\u1107 > \u1132;" + "\u1109\u110B > \u1135;" - + "\u1109\u110C > \u1136;" + "\u1109\u110E > \u1137;" + "\u1109\u110F > \u1138;" + "\u1109\u1110 > \u1139;" - + "\u1109\u1111 > \u113A;" + "\u1109\u1112 > \u113B;" + "\u113C\u113C > \u113D;" + "\u113E\u113E > \u113F;" - + "\u110B\u1100 > \u1141;" + "\u110B\u1103 > \u1142;" + "\u110B\u1106 > \u1143;" + "\u110B\u1107 > \u1144;" - + "\u110B\u1109 > \u1145;" + "\u110B\u1140 > \u1146;" + "\u110B\u110B > \u1147;" + "\u110B\u110C > \u1148;" - + "\u110B\u110E > \u1149;" + "\u110B\u1110 > \u114A;" + "\u110B\u1111 > \u114B;" + "\u110C\u110B > \u114D;" - + "\u114E\u114E > \u114F;" + "\u1150\u1150 > \u1151;" + "\u110E\u110F > \u1152;" + "\u110E\u1112 > \u1153;" - + "\u1111\u1107 > \u1156;" + "\u1111\u110B > \u1157;" + "\u1112\u1112 > \u1158;" + "\u1100\u1103 > \u115A;" - + "\u1102\u1109 > \u115B;" + "\u1102\u110C > \u115C;" + "\u1102\u1112 > \u115D;" + "\u1103\u1105 > \u115E;" - + "\u1161\u1175 > \u1162;" + "\u1163\u1175 > \u1164;" + "\u1165\u1175 > \u1166;" + "\u1167\u1175 > \u1168;" - + "\u1169\u1161 > \u116A;" + "\u1169\u1175 > \u116C;" + "\u116E\u1165 > \u116F;" + "\u116E\u1175 > \u1171;" - + "\u1173\u1175 > \u1174;" + "\u1161\u1169 > \u1176;" + "\u1161\u116E > \u1177;" + "\u1163\u1169 > \u1178;" - + "\u1163\u116D > \u1179;" + "\u1165\u1169 > \u117A;" + "\u1165\u116E > \u117B;" + "\u1165\u1173 > \u117C;" - + "\u1167\u1169 > \u117D;" + "\u1167\u116E > \u117E;" + "\u1169\u1165 > \u117F;" + "\u1169\u1169 > \u1182;" - + "\u1169\u116E > \u1183;" + "\u116D\u1163 > \u1184;" + "\u116D\u1167 > \u1186;" + "\u116D\u1169 > \u1187;" - + "\u116D\u1175 > \u1188;" + "\u116E\u1161 > \u1189;" + "\u116E\u116E > \u118D;" + "\u1172\u1161 > \u118E;" - + "\u1172\u1165 > \u118F;" + "\u1172\u1167 > \u1191;" + "\u1172\u116E > \u1193;" + "\u1172\u1175 > \u1194;" - + "\u1173\u116E > \u1195;" + "\u1173\u1173 > \u1196;" + "\u1175\u1161 > \u1198;" + "\u1175\u1163 > \u1199;" - + "\u1175\u1169 > \u119A;" + "\u1175\u116E > \u119B;" + 
"\u1175\u1173 > \u119C;" + "\u1175\u119E > \u119D;" - + "\u119E\u1165 > \u119F;" + "\u119E\u116E > \u11A0;" + "\u119E\u1175 > \u11A1;" + "\u119E\u119E > \u11A2;" - + "\u1161\u1173 > \u11A3;" + "\u1163\u116E > \u11A4;" + "\u1167\u1163 > \u11A5;" + "\u1169\u1163 > \u11A6;" - + "\u11A8\u11A8 > \u11A9;" + "\u11A8\u11BA > \u11AA;" + "\u11AB\u11BD > \u11AC;" + "\u11AB\u11C2 > \u11AD;" - + "\u11AF\u11A8 > \u11B0;" + "\u11AF\u11B7 > \u11B1;" + "\u11AF\u11B8 > \u11B2;" + "\u11AF\u11BA > \u11B3;" - + "\u11AF\u11C0 > \u11B4;" + "\u11AF\u11C1 > \u11B5;" + "\u11AF\u11C2 > \u11B6;" + "\u11B8\u11BA > \u11B9;" - + "\u11BA\u11BA > \u11BB;" + "\u11A8\u11AF > \u11C3;" + "\u11AB\u11A8 > \u11C5;" + "\u11AB\u11AE > \u11C6;" - + "\u11AB\u11BA > \u11C7;" + "\u11AB\u11EB > \u11C8;" + "\u11AB\u11C0 > \u11C9;" + "\u11AE\u11A8 > \u11CA;" - + "\u11AE\u11AF > \u11CB;" + "\u11AF\u11AB > \u11CD;" + "\u11AF\u11AE > \u11CE;" + "\u11AF\u11AF > \u11D0;" - + "\u11AF\u11EB > \u11D7;" + "\u11AF\u11BF > \u11D8;" + "\u11AF\u11F9 > \u11D9;" + "\u11B7\u11A8 > \u11DA;" - + "\u11B7\u11AF > \u11DB;" + "\u11B7\u11B8 > \u11DC;" + "\u11B7\u11BA > \u11DD;" + "\u11B7\u11EB > \u11DF;" - + "\u11B7\u11BE > \u11E0;" + "\u11B7\u11C2 > \u11E1;" + "\u11B7\u11BC > \u11E2;" + "\u11B8\u11AF > \u11E3;" - + "\u11B8\u11C1 > \u11E4;" + "\u11B8\u11C2 > \u11E5;" + "\u11B8\u11BC > \u11E6;" + "\u11BA\u11A8 > \u11E7;" - + "\u11BA\u11AE > \u11E8;" + "\u11BA\u11AF > \u11E9;" + "\u11BA\u11B8 > \u11EA;" + "\u11BC\u11A8 > \u11EC;" - + "\u11BC\u11BC > \u11EE;" + "\u11BC\u11BF > \u11EF;" + "\u11F0\u11BA > \u11F1;" + "\u11F0\u11EB > \u11F2;" - + "\u11C1\u11B8 > \u11F3;" + "\u11C1\u11BC > \u11F4;" + "\u11C2\u11AB > \u11F5;" + "\u11C2\u11AF > \u11F6;" - + "\u11C2\u11B7 > \u11F7;" + "\u11C2\u11B8 > \u11F8;" + "\u11A8\u11AB > \u11FA;" + "\u11A8\u11B8 > \u11FB;" - + "\u11A8\u11BE > \u11FC;" + "\u11A8\u11BF > \u11FD;" + "\u11A8\u11C2 > \u11FE;" + "\u11AB\u11AB > \u11FF;" - + "\u1103\u1106 > \uA960;" + "\u1103\u1107 > \uA961;" + "\u1103\u1109 > 
\uA962;" + "\u1103\u110C > \uA963;" - + "\u1105\u1100 > \uA964;" + "\u1105\u1103 > \uA966;" + "\u1105\u1106 > \uA968;" + "\u1105\u1107 > \uA969;" - + "\u1105\u1109 > \uA96C;" + "\u1105\u110C > \uA96D;" + "\u1105\u110F > \uA96E;" + "\u1106\u1100 > \uA96F;" - + "\u1106\u1103 > \uA970;" + "\u1106\u1109 > \uA971;" + "\u1107\u110F > \uA973;" + "\u1107\u1112 > \uA974;" - + "\u110B\u1105 > \uA976;" + "\u110B\u1112 > \uA977;" + "\u1110\u1110 > \uA979;" + "\u1111\u1112 > \uA97A;" - + "\u1112\u1109 > \uA97B;" + "\u1159\u1159 > \uA97C;" + "\u1169\u1167 > \uD7B0;" + "\u116D\u1161 > \uD7B2;" - + "\u116D\u1165 > \uD7B4;" + "\u116E\u1167 > \uD7B5;" + "\u1172\u1169 > \uD7B8;" + "\u1173\u1161 > \uD7B9;" - + "\u1173\u1165 > \uD7BA;" + "\u1173\u1169 > \uD7BC;" + "\u1175\u1167 > \uD7BF;" + "\u1175\u116D > \uD7C2;" - + "\u1175\u1172 > \uD7C3;" + "\u1175\u1175 > \uD7C4;" + "\u119E\u1161 > \uD7C5;" + "\u11AB\u11AF > \uD7CB;" - + "\u11AB\u11BE > \uD7CC;" + "\u11AE\u11AE > \uD7CD;" + "\u11AE\u11B8 > \uD7CF;" + "\u11AE\u11BA > \uD7D0;" - + "\u11AE\u11BD > \uD7D2;" + "\u11AE\u11BE > \uD7D3;" + "\u11AE\u11C0 > \uD7D4;" + "\u11AF\u11F0 > \uD7DB;" - + "\u11AF\u11BC > \uD7DD;" + "\u11B7\u11AB > \uD7DE;" + "\u11B7\u11B7 > \uD7E0;" + "\u11B7\u11BD > \uD7E2;" - + "\u11B8\u11AE > \uD7E3;" + "\u11B8\u11B7 > \uD7E5;" + "\u11B8\u11B8 > \uD7E6;" + "\u11B8\u11BD > \uD7E8;" - + "\u11B8\u11BE > \uD7E9;" + "\u11BA\u11B7 > \uD7EA;" + "\u11BA\u11EB > \uD7EE;" + "\u11BA\u11BD > \uD7EF;" - + "\u11BA\u11BE > \uD7F0;" + "\u11BA\u11C0 > \uD7F1;" + "\u11BA\u11C2 > \uD7F2;" + "\u11EB\u11B8 > \uD7F3;" - + "\u11F0\u11B7 > \uD7F5;" + "\u11F0\u11C2 > \uD7F6;" + "\u11BD\u11B8 > \uD7F7;" + "\u11BD\u11BD > \uD7F9;" - + "\u11C1\u11BA > \uD7FA;" + "\u11C1\u11C0 > \uD7FB;"; - - static final Transliterator MKD = Transliterator.createFromRules("MKD", "::NFD;" + MKD_RULES, - Transliterator.FORWARD); - static final Transliterator MKKD = Transliterator.createFromRules("MKD", "::NFKD;" + MKD_RULES, - Transliterator.FORWARD); - 
static final Transliterator MKC = Transliterator.createFromRules("MKC", "::NFD;" + MKD_RULES + "::null;" - + MKC_RULES + "::NFC;", Transliterator.FORWARD); + "\u1107\u1109\u1100 > \u1122;" + + "\u1107\u1109\u1103 > \u1123;" + + "\u1107\u1109\u1107 > \u1124;" + + "\u1107\u1109\u1109 > \u1125;" + + "\u1107\u1109\u110C > \u1126;" + + "\u1107\u1107\u110B > \u112C;" + + "\u1109\u1107\u1100 > \u1133;" + + "\u1109\u1109\u1109 > \u1134;" + + "\u1169\u1161\u1175 > \u116B;" + + "\u116E\u1165\u1175 > \u1170;" + + "\u1169\u1165\u1175 > \u1180;" + + "\u1169\u1167\u1175 > \u1181;" + + "\u116D\u1163\u1175 > \u1185;" + + "\u116E\u1161\u1175 > \u118A;" + + "\u116E\u1165\u1173 > \u118B;" + + "\u116E\u1167\u1175 > \u118C;" + + "\u1172\u1165\u1175 > \u1190;" + + "\u1172\u1167\u1175 > \u1192;" + + "\u1173\u1175\u116E > \u1197;" + + "\u1169\u1163\u1175 > \u11A7;" + + "\u11A8\u11BA\u11A8 > \u11C4;" + + "\u11AF\u11A8\u11BA > \u11CC;" + + "\u11AF\u11AE\u11C2 > \u11CF;" + + "\u11AF\u11B7\u11A8 > \u11D1;" + + "\u11AF\u11B7\u11BA > \u11D2;" + + "\u11AF\u11B8\u11BA > \u11D3;" + + "\u11AF\u11B8\u11C2 > \u11D4;" + + "\u11AF\u11B8\u11BC > \u11D5;" + + "\u11AF\u11BA\u11BA > \u11D6;" + + "\u11B7\u11BA\u11BA > \u11DE;" + + "\u11BC\u11A8\u11A8 > \u11ED;" + + "\u1105\u1100\u1100 > \uA965;" + + "\u1105\u1103\u1103 > \uA967;" + + "\u1105\u1107\u1107 > \uA96A;" + + "\u1105\u1107\u110B > \uA96B;" + + "\u1107\u1109\u1110 > \uA972;" + + "\u1109\u1109\u1107 > \uA975;" + + "\u110C\u110C\u1112 > \uA978;" + + "\u1169\u1169\u1175 > \uD7B1;" + + "\u116D\u1161\u1175 > \uD7B3;" + + "\u116E\u1175\u1175 > \uD7B6;" + + "\u1172\u1161\u1175 > \uD7B7;" + + "\u1173\u1165\u1175 > \uD7BB;" + + "\u1175\u1163\u1169 > \uD7BD;" + + "\u1175\u1163\u1175 > \uD7BE;" + + "\u1175\u1167\u1175 > \uD7C0;" + + "\u1175\u1169\u1175 > \uD7C1;" + + "\u119E\u1165\u1175 > \uD7C6;" + + "\u11AE\u11AE\u11B8 > \uD7CE;" + + "\u11AE\u11BA\u11A8 > \uD7D1;" + + "\u11AF\u11A8\u11A8 > \uD7D5;" + + "\u11AF\u11A8\u11C2 > \uD7D6;" + + "\u11AF\u11AF\u11BF > 
\uD7D7;" + + "\u11AF\u11B7\u11C2 > \uD7D8;" + + "\u11AF\u11B8\u11AE > \uD7D9;" + + "\u11AF\u11B8\u11C1 > \uD7DA;" + + "\u11AF\u11F9\u11C2 > \uD7DC;" + + "\u11B7\u11AB\u11AB > \uD7DF;" + + "\u11B7\u11B8\u11BA > \uD7E1;" + + "\u11B8\u11AF\u11C1 > \uD7E4;" + + "\u11B8\u11BA\u11AE > \uD7E7;" + + "\u11BA\u11B8\u11BC > \uD7EB;" + + "\u11BA\u11BA\u11A8 > \uD7EC;" + + "\u11BA\u11BA\u11AE > \uD7ED;" + + "\u11EB\u11B8\u11BC > \uD7F4;" + + "\u11BD\u11B8\u11B8 > \uD7F8;" + + "\u1100\u1100 > \u1101;" + + "\u1103\u1103 > \u1104;" + + "\u1107\u1107 > \u1108;" + + "\u1109\u1109 > \u110A;" + + "\u110C\u110C > \u110D;" + + "\u1102\u1100 > \u1113;" + + "\u1102\u1102 > \u1114;" + + "\u1102\u1103 > \u1115;" + + "\u1102\u1107 > \u1116;" + + "\u1103\u1100 > \u1117;" + + "\u1105\u1102 > \u1118;" + + "\u1105\u1105 > \u1119;" + + "\u1105\u1112 > \u111A;" + + "\u1105\u110B > \u111B;" + + "\u1106\u1107 > \u111C;" + + "\u1106\u110B > \u111D;" + + "\u1107\u1100 > \u111E;" + + "\u1107\u1102 > \u111F;" + + "\u1107\u1103 > \u1120;" + + "\u1107\u1109 > \u1121;" + + "\u1107\u110C > \u1127;" + + "\u1107\u110E > \u1128;" + + "\u1107\u1110 > \u1129;" + + "\u1107\u1111 > \u112A;" + + "\u1107\u110B > \u112B;" + + "\u1109\u1100 > \u112D;" + + "\u1109\u1102 > \u112E;" + + "\u1109\u1103 > \u112F;" + + "\u1109\u1105 > \u1130;" + + "\u1109\u1106 > \u1131;" + + "\u1109\u1107 > \u1132;" + + "\u1109\u110B > \u1135;" + + "\u1109\u110C > \u1136;" + + "\u1109\u110E > \u1137;" + + "\u1109\u110F > \u1138;" + + "\u1109\u1110 > \u1139;" + + "\u1109\u1111 > \u113A;" + + "\u1109\u1112 > \u113B;" + + "\u113C\u113C > \u113D;" + + "\u113E\u113E > \u113F;" + + "\u110B\u1100 > \u1141;" + + "\u110B\u1103 > \u1142;" + + "\u110B\u1106 > \u1143;" + + "\u110B\u1107 > \u1144;" + + "\u110B\u1109 > \u1145;" + + "\u110B\u1140 > \u1146;" + + "\u110B\u110B > \u1147;" + + "\u110B\u110C > \u1148;" + + "\u110B\u110E > \u1149;" + + "\u110B\u1110 > \u114A;" + + "\u110B\u1111 > \u114B;" + + "\u110C\u110B > \u114D;" + + "\u114E\u114E > 
\u114F;" + + "\u1150\u1150 > \u1151;" + + "\u110E\u110F > \u1152;" + + "\u110E\u1112 > \u1153;" + + "\u1111\u1107 > \u1156;" + + "\u1111\u110B > \u1157;" + + "\u1112\u1112 > \u1158;" + + "\u1100\u1103 > \u115A;" + + "\u1102\u1109 > \u115B;" + + "\u1102\u110C > \u115C;" + + "\u1102\u1112 > \u115D;" + + "\u1103\u1105 > \u115E;" + + "\u1161\u1175 > \u1162;" + + "\u1163\u1175 > \u1164;" + + "\u1165\u1175 > \u1166;" + + "\u1167\u1175 > \u1168;" + + "\u1169\u1161 > \u116A;" + + "\u1169\u1175 > \u116C;" + + "\u116E\u1165 > \u116F;" + + "\u116E\u1175 > \u1171;" + + "\u1173\u1175 > \u1174;" + + "\u1161\u1169 > \u1176;" + + "\u1161\u116E > \u1177;" + + "\u1163\u1169 > \u1178;" + + "\u1163\u116D > \u1179;" + + "\u1165\u1169 > \u117A;" + + "\u1165\u116E > \u117B;" + + "\u1165\u1173 > \u117C;" + + "\u1167\u1169 > \u117D;" + + "\u1167\u116E > \u117E;" + + "\u1169\u1165 > \u117F;" + + "\u1169\u1169 > \u1182;" + + "\u1169\u116E > \u1183;" + + "\u116D\u1163 > \u1184;" + + "\u116D\u1167 > \u1186;" + + "\u116D\u1169 > \u1187;" + + "\u116D\u1175 > \u1188;" + + "\u116E\u1161 > \u1189;" + + "\u116E\u116E > \u118D;" + + "\u1172\u1161 > \u118E;" + + "\u1172\u1165 > \u118F;" + + "\u1172\u1167 > \u1191;" + + "\u1172\u116E > \u1193;" + + "\u1172\u1175 > \u1194;" + + "\u1173\u116E > \u1195;" + + "\u1173\u1173 > \u1196;" + + "\u1175\u1161 > \u1198;" + + "\u1175\u1163 > \u1199;" + + "\u1175\u1169 > \u119A;" + + "\u1175\u116E > \u119B;" + + "\u1175\u1173 > \u119C;" + + "\u1175\u119E > \u119D;" + + "\u119E\u1165 > \u119F;" + + "\u119E\u116E > \u11A0;" + + "\u119E\u1175 > \u11A1;" + + "\u119E\u119E > \u11A2;" + + "\u1161\u1173 > \u11A3;" + + "\u1163\u116E > \u11A4;" + + "\u1167\u1163 > \u11A5;" + + "\u1169\u1163 > \u11A6;" + + "\u11A8\u11A8 > \u11A9;" + + "\u11A8\u11BA > \u11AA;" + + "\u11AB\u11BD > \u11AC;" + + "\u11AB\u11C2 > \u11AD;" + + "\u11AF\u11A8 > \u11B0;" + + "\u11AF\u11B7 > \u11B1;" + + "\u11AF\u11B8 > \u11B2;" + + "\u11AF\u11BA > \u11B3;" + + "\u11AF\u11C0 > \u11B4;" + + "\u11AF\u11C1 
> \u11B5;" + + "\u11AF\u11C2 > \u11B6;" + + "\u11B8\u11BA > \u11B9;" + + "\u11BA\u11BA > \u11BB;" + + "\u11A8\u11AF > \u11C3;" + + "\u11AB\u11A8 > \u11C5;" + + "\u11AB\u11AE > \u11C6;" + + "\u11AB\u11BA > \u11C7;" + + "\u11AB\u11EB > \u11C8;" + + "\u11AB\u11C0 > \u11C9;" + + "\u11AE\u11A8 > \u11CA;" + + "\u11AE\u11AF > \u11CB;" + + "\u11AF\u11AB > \u11CD;" + + "\u11AF\u11AE > \u11CE;" + + "\u11AF\u11AF > \u11D0;" + + "\u11AF\u11EB > \u11D7;" + + "\u11AF\u11BF > \u11D8;" + + "\u11AF\u11F9 > \u11D9;" + + "\u11B7\u11A8 > \u11DA;" + + "\u11B7\u11AF > \u11DB;" + + "\u11B7\u11B8 > \u11DC;" + + "\u11B7\u11BA > \u11DD;" + + "\u11B7\u11EB > \u11DF;" + + "\u11B7\u11BE > \u11E0;" + + "\u11B7\u11C2 > \u11E1;" + + "\u11B7\u11BC > \u11E2;" + + "\u11B8\u11AF > \u11E3;" + + "\u11B8\u11C1 > \u11E4;" + + "\u11B8\u11C2 > \u11E5;" + + "\u11B8\u11BC > \u11E6;" + + "\u11BA\u11A8 > \u11E7;" + + "\u11BA\u11AE > \u11E8;" + + "\u11BA\u11AF > \u11E9;" + + "\u11BA\u11B8 > \u11EA;" + + "\u11BC\u11A8 > \u11EC;" + + "\u11BC\u11BC > \u11EE;" + + "\u11BC\u11BF > \u11EF;" + + "\u11F0\u11BA > \u11F1;" + + "\u11F0\u11EB > \u11F2;" + + "\u11C1\u11B8 > \u11F3;" + + "\u11C1\u11BC > \u11F4;" + + "\u11C2\u11AB > \u11F5;" + + "\u11C2\u11AF > \u11F6;" + + "\u11C2\u11B7 > \u11F7;" + + "\u11C2\u11B8 > \u11F8;" + + "\u11A8\u11AB > \u11FA;" + + "\u11A8\u11B8 > \u11FB;" + + "\u11A8\u11BE > \u11FC;" + + "\u11A8\u11BF > \u11FD;" + + "\u11A8\u11C2 > \u11FE;" + + "\u11AB\u11AB > \u11FF;" + + "\u1103\u1106 > \uA960;" + + "\u1103\u1107 > \uA961;" + + "\u1103\u1109 > \uA962;" + + "\u1103\u110C > \uA963;" + + "\u1105\u1100 > \uA964;" + + "\u1105\u1103 > \uA966;" + + "\u1105\u1106 > \uA968;" + + "\u1105\u1107 > \uA969;" + + "\u1105\u1109 > \uA96C;" + + "\u1105\u110C > \uA96D;" + + "\u1105\u110F > \uA96E;" + + "\u1106\u1100 > \uA96F;" + + "\u1106\u1103 > \uA970;" + + "\u1106\u1109 > \uA971;" + + "\u1107\u110F > \uA973;" + + "\u1107\u1112 > \uA974;" + + "\u110B\u1105 > \uA976;" + + "\u110B\u1112 > \uA977;" + + 
"\u1110\u1110 > \uA979;" + + "\u1111\u1112 > \uA97A;" + + "\u1112\u1109 > \uA97B;" + + "\u1159\u1159 > \uA97C;" + + "\u1169\u1167 > \uD7B0;" + + "\u116D\u1161 > \uD7B2;" + + "\u116D\u1165 > \uD7B4;" + + "\u116E\u1167 > \uD7B5;" + + "\u1172\u1169 > \uD7B8;" + + "\u1173\u1161 > \uD7B9;" + + "\u1173\u1165 > \uD7BA;" + + "\u1173\u1169 > \uD7BC;" + + "\u1175\u1167 > \uD7BF;" + + "\u1175\u116D > \uD7C2;" + + "\u1175\u1172 > \uD7C3;" + + "\u1175\u1175 > \uD7C4;" + + "\u119E\u1161 > \uD7C5;" + + "\u11AB\u11AF > \uD7CB;" + + "\u11AB\u11BE > \uD7CC;" + + "\u11AE\u11AE > \uD7CD;" + + "\u11AE\u11B8 > \uD7CF;" + + "\u11AE\u11BA > \uD7D0;" + + "\u11AE\u11BD > \uD7D2;" + + "\u11AE\u11BE > \uD7D3;" + + "\u11AE\u11C0 > \uD7D4;" + + "\u11AF\u11F0 > \uD7DB;" + + "\u11AF\u11BC > \uD7DD;" + + "\u11B7\u11AB > \uD7DE;" + + "\u11B7\u11B7 > \uD7E0;" + + "\u11B7\u11BD > \uD7E2;" + + "\u11B8\u11AE > \uD7E3;" + + "\u11B8\u11B7 > \uD7E5;" + + "\u11B8\u11B8 > \uD7E6;" + + "\u11B8\u11BD > \uD7E8;" + + "\u11B8\u11BE > \uD7E9;" + + "\u11BA\u11B7 > \uD7EA;" + + "\u11BA\u11EB > \uD7EE;" + + "\u11BA\u11BD > \uD7EF;" + + "\u11BA\u11BE > \uD7F0;" + + "\u11BA\u11C0 > \uD7F1;" + + "\u11BA\u11C2 > \uD7F2;" + + "\u11EB\u11B8 > \uD7F3;" + + "\u11F0\u11B7 > \uD7F5;" + + "\u11F0\u11C2 > \uD7F6;" + + "\u11BD\u11B8 > \uD7F7;" + + "\u11BD\u11BD > \uD7F9;" + + "\u11C1\u11BA > \uD7FA;" + + "\u11C1\u11C0 > \uD7FB;"; + + static final Transliterator MKD = + Transliterator.createFromRules("MKD", "::NFD;" + MKD_RULES, Transliterator.FORWARD); + static final Transliterator MKKD = + Transliterator.createFromRules("MKD", "::NFKD;" + MKD_RULES, Transliterator.FORWARD); + static final Transliterator MKC = + Transliterator.createFromRules( + "MKC", + "::NFD;" + MKD_RULES + "::null;" + MKC_RULES + "::NFC;", + Transliterator.FORWARD); // static final String MKDP_RULES = // MKD_RULES + @@ -1574,18 +2391,32 @@ public static UnicodeSet flatten(UnicodeSet exemplar1) { // "::NFD;"+ MKDP_RULES + "::null;" + MKCP_RULES + "::NFC;", // 
Transliterator.FORWARD); - static Pattern IS_ARCHAIC = Pattern.compile("(Obsolete|Ancient|Archaic|Medieval|New Testament|\\bUPA\\b)", - Pattern.CASE_INSENSITIVE); - - public static final UnicodeSet ADD_SUBHEAD = (UnicodeSet) ScriptCategories2 - .parseUnicodeSet("[[:S:][:P:][:M:]&[[:script=common:][:script=inherited:]]-[:nfkdqc=n:]]") - .removeAll(ScriptCategories2.ARCHAIC).freeze(); - static UnicodeSet UNCOMMON_HAN = ScriptCategories2.parseUnicodeSet("[" + "[:script=han:]" - + "-[:block=CJK Unified Ideographs:]" + "-[:block=CJK Symbols And Punctuation:]" - + "-[:block=CJK Radicals Supplement:]" + "-[:block=Ideographic Description Characters:]" - + "-[:block=CJK Strokes:]" + "-[:script=hiragana:]" + "-[:script=katakana:]" + "-[〇]" + "]"); // we'll alter - // below to remove - // iicore + static Pattern IS_ARCHAIC = + Pattern.compile( + "(Obsolete|Ancient|Archaic|Medieval|New Testament|\\bUPA\\b)", + Pattern.CASE_INSENSITIVE); + + public static final UnicodeSet ADD_SUBHEAD = + (UnicodeSet) + ScriptCategories2.parseUnicodeSet( + "[[:S:][:P:][:M:]&[[:script=common:][:script=inherited:]]-[:nfkdqc=n:]]") + .removeAll(ScriptCategories2.ARCHAIC) + .freeze(); + static UnicodeSet UNCOMMON_HAN = + ScriptCategories2.parseUnicodeSet( + "[" + + "[:script=han:]" + + "-[:block=CJK Unified Ideographs:]" + + "-[:block=CJK Symbols And Punctuation:]" + + "-[:block=CJK Radicals Supplement:]" + + "-[:block=Ideographic Description Characters:]" + + "-[:block=CJK Strokes:]" + + "-[:script=hiragana:]" + + "-[:script=katakana:]" + + "-[〇]" + + "]"); // we'll alter + // below to remove + // iicore static class Renamer { static class MatchData { @@ -1628,14 +2459,15 @@ void getRenameData(String filename) throws IOException { int breaker = line.indexOf(">"); String source = line.substring(0, breaker).trim(); String target = line.substring(breaker + 1).trim(); - renameTable.put(Pattern.compile(source, Pattern.CASE_INSENSITIVE).matcher(""), new MatchData( - source, target)); + renameTable.put( + 
Pattern.compile(source, Pattern.CASE_INSENSITIVE).matcher(""), + new MatchData(source, target)); } catch (Exception e) { - throw (RuntimeException) new IllegalArgumentException("Problem with: " + line).initCause(e); + throw (RuntimeException) + new IllegalArgumentException("Problem with: " + line).initCause(e); } } in.close(); - } // static final String[] RENAME_TABLE = { @@ -1643,11 +2475,13 @@ void getRenameData(String filename) throws IOException { // ".*Category:(.*) - (.*)>$1:$2", // ".*Category:([^ ]*)[ ](.*)>$2:$1", // ".*Category:(.*)>$1:Miscellaneous", - // "Symbol:Latin 1 Supplement - Latin-1 punctuation and symbols > Symbol:Latin-1 punctuation and symbols", + // "Symbol:Latin 1 Supplement - Latin-1 punctuation and symbols > Symbol:Latin-1 punctuation + // and symbols", // "Mark:(.*) > General Diacritic:$1", // "Symbol:(.*) - (.*(arrows|harpoons).*) > Arrows:$2", // "Symbol:Control Pictures.*>Symbol:Control Pictures", - // "Symbol:(Box Drawing|Block Elements|Geometric Shapes|Miscellaneous Symbols And Arrows).*>Symbol:Geometric Shapes", + // "Symbol:(Box Drawing|Block Elements|Geometric Shapes|Miscellaneous Symbols And + // Arrows).*>Symbol:Geometric Shapes", // "Symbol:(.*) Tiles.*>Symbol:Tiles and Dominoes", // "Symbol:.*Musical.*>Symbol:Musical Symbols", // "Symbol:Tai Xuan Jing Symbols.*>Symbol:Tai Xuan Jing Symbols", @@ -1691,7 +2525,8 @@ void getRenameData(String filename) throws IOException { // String target = row.substring(breaker+1).trim(); // renameTable.put(Pattern.compile(source,Pattern.CASE_INSENSITIVE).matcher(""), target); // } catch (Exception e) { - // throw (RuntimeException) new IllegalArgumentException("Problem with: " + row).initCause(e); + // throw (RuntimeException) new IllegalArgumentException("Problem with: " + + // row).initCause(e); // } // } // } @@ -1710,7 +2545,7 @@ SimplePair rename(String maincategory, String subcategory) { if (true) System.out.println(); } String indent = ""; - for (int count = 0;; ++count) { + for (int 
count = 0; ; ++count) { boolean didMatch = false; for (Matcher m : renameTable.keySet()) { if (m.reset(lookup).matches()) { @@ -1722,8 +2557,15 @@ SimplePair rename(String maincategory, String subcategory) { if (lookup.equals(newName)) { continue; } - renamingLog.println(indent + lookup + "\t=>\t" + newName + "\t // " + newNames.source + " > " - + newNames.target); + renamingLog.println( + indent + + lookup + + "\t=>\t" + + newName + + "\t // " + + newNames.source + + " > " + + newNames.target); lookup = newName; indent += "\t"; newNames.used = true; @@ -1759,7 +2601,7 @@ public void showUnusedRules() { } public static > U addAllToCollection(UnicodeSet input, U output) { - for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next(); ) { output.add(it.getString()); } return output; @@ -1777,7 +2619,8 @@ private static void addManualCorrections(String fileName) throws IOException { } String components[] = line.split(";"); if (components.length != 4) { - throw new IOException("Invalid line: <" + line + "> - Expecting 4 ';' separated components"); + throw new IOException( + "Invalid line: <" + line + "> - Expecting 4 ';' separated components"); } UnicodeSet set = new UnicodeSet(components[3]); String subCategory = components[1]; @@ -1788,19 +2631,28 @@ private static void addManualCorrections(String fileName) throws IOException { } if (components[2].equals("Add")) { - CATEGORYTABLE.add(components[0], false, subCategory, buttonComparator, Separation.ALL_ORDINARY, set); - } - else if (components[2].equals("Remove")) { + CATEGORYTABLE.add( + components[0], + false, + subCategory, + buttonComparator, + Separation.ALL_ORDINARY, + set); + } else if (components[2].equals("Remove")) { CATEGORYTABLE.removeAll(components[0], subCategory, set); } else { - throw new IOException("Invalid operation: <" + components[2] + "> - Expecting one of {Add,Remove}"); + throw new IOException( + "Invalid 
operation: <" + + components[2] + + "> - Expecting one of {Add,Remove}"); } } } private static void addEmojiCharacters() throws IOException { - File emojiSources = new File(unicodeDataDirectory + "/EmojiSources.txt"); // Needs fixing for release vs - // non-released directory + File emojiSources = + new File(unicodeDataDirectory + "/EmojiSources.txt"); // Needs fixing for release vs + // non-released directory FileInputStream fis = new FileInputStream(emojiSources); BufferedReader in = new BufferedReader(new InputStreamReader(fis, "UTF-8")); UnicodeSet emojiCharacters = new UnicodeSet(); @@ -1821,11 +2673,18 @@ private static void addEmojiCharacters() throws IOException { emojiCharacters.add(codepoint); } in.close(); - CATEGORYTABLE.add("Symbol", false, "Emoji", buttonComparator, Separation.ALL_ORDINARY, emojiCharacters); + CATEGORYTABLE.add( + "Symbol", + false, + "Emoji", + buttonComparator, + Separation.ALL_ORDINARY, + emojiCharacters); } - public static > U removeAllFromCollection(UnicodeSet input, U output) { - for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next();) { + public static > U removeAllFromCollection( + UnicodeSet input, U output) { + for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next(); ) { output.remove(it.getString()); } return output; diff --git a/unicodetools/src/main/java/org/unicode/tools/GeneratePickerData2.java b/unicodetools/src/main/java/org/unicode/tools/GeneratePickerData2.java index 4ef1233cb..1b0954185 100644 --- a/unicodetools/src/main/java/org/unicode/tools/GeneratePickerData2.java +++ b/unicodetools/src/main/java/org/unicode/tools/GeneratePickerData2.java @@ -4,7 +4,7 @@ import org.unicode.cldr.draft.ScriptMetadata.Info; public class GeneratePickerData2 { - + enum Patterns { all, category_list, @@ -17,7 +17,7 @@ enum Patterns { scripts, strokes, } - + enum Categories { // general limited_use, @@ -50,7 +50,7 @@ enum Categories { ideographic_desc_characters, consonantal_jamo, vocalic_jamo, - + 
variant_forms, small_form_variant, full_width_form_variant, @@ -124,6 +124,7 @@ enum Categories { travel_places, weather, } + public static void main(String[] args) { for (String scriptName : ScriptMetadata.getScripts()) { Info info = ScriptMetadata.getInfo(scriptName); diff --git a/unicodetools/src/main/java/org/unicode/tools/GenerateRadicalEnum.java b/unicodetools/src/main/java/org/unicode/tools/GenerateRadicalEnum.java index 98e7bf1a3..6ef0b71fb 100644 --- a/unicodetools/src/main/java/org/unicode/tools/GenerateRadicalEnum.java +++ b/unicodetools/src/main/java/org/unicode/tools/GenerateRadicalEnum.java @@ -1,18 +1,16 @@ package org.unicode.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.util.ULocale; import java.util.List; import java.util.Map.Entry; import java.util.TreeSet; - import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.text.utility.Settings; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.util.ULocale; - public class GenerateRadicalEnum { public static void main(String[] args) { final IndexUnicodeProperties iup = IndexUnicodeProperties.make(Settings.latestVersion); @@ -29,7 +27,7 @@ public static void main(String[] args) { for (String item : sorted) { boolean prime = item.endsWith("'"); if (prime) { - item = item.substring(0,item.length()-1); + item = item.substring(0, item.length() - 1); } System.out.println("R" + item + (prime ? 
"a" : "") + ","); } diff --git a/unicodetools/src/main/java/org/unicode/tools/GenerateXIDModSets.java b/unicodetools/src/main/java/org/unicode/tools/GenerateXIDModSets.java index 471cf3363..a25fb7c8c 100644 --- a/unicodetools/src/main/java/org/unicode/tools/GenerateXIDModSets.java +++ b/unicodetools/src/main/java/org/unicode/tools/GenerateXIDModSets.java @@ -1,12 +1,9 @@ package org.unicode.tools; -import java.util.Locale; - -import org.unicode.jsp.XIDModifications; - import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.text.UnicodeSet; - +import java.util.Locale; +import org.unicode.jsp.XIDModifications; public class GenerateXIDModSets { public static void main(String[] args) { @@ -22,18 +19,26 @@ public static void main(String[] args) { private static void showSet(String title, UnicodeSet set) { title = title.toUpperCase(Locale.ENGLISH).replace('-', '_'); final String possibleBridge = "[[:Cn:][:nfkcqc=n:][:XIDC=n:]]"; - final String compact = getCompact(set, possibleBridge, true, 60); // "[[:Cn:][:nfkcqc=n:][:XID_Continue=n:]]"); - System.out.println("public static final UnicodeSet " + title + " = new UnicodeSet(" + compact + ");"); + final String compact = + getCompact( + set, + possibleBridge, + true, + 60); // "[[:Cn:][:nfkcqc=n:][:XID_Continue=n:]]"); + System.out.println( + "public static final UnicodeSet " + title + " = new UnicodeSet(" + compact + ");"); } - private static String getCompact(UnicodeSet original, String possibleBridge, boolean escape, int width) { + private static String getCompact( + UnicodeSet original, String possibleBridge, boolean escape, int width) { final String originalString = original.toPattern(escape); String s = originalString; if (!possibleBridge.isEmpty()) { final UnicodeSet dontCare = new UnicodeSet(possibleBridge); if (dontCare.containsNone(original)) { final UnicodeSet compact = new UnicodeSet(original).addBridges(dontCare); - final String compactString = "[" + compact.toPattern(escape) + "-" + possibleBridge + "]"; + 
final String compactString = + "[" + compact.toPattern(escape) + "-" + possibleBridge + "]"; if (compactString.length() < originalString.length()) { @@ -50,7 +55,7 @@ private static String getCompact(UnicodeSet original, String possibleBridge, boo s = s.replace("\\", "\\\\"); final StringBuilder b = new StringBuilder("\"["); - for (int pos = 0;;) { + for (int pos = 0; ; ) { int nextBreakPoint = pos + width; if (s.length() < nextBreakPoint) { if (b.length() > 0) { @@ -67,10 +72,9 @@ private static String getCompact(UnicodeSet original, String possibleBridge, boo if (b.length() > 1) { b.append("\"\n+ \""); } - b.append(s.substring(pos,nextBreakPoint)); + b.append(s.substring(pos, nextBreakPoint)); pos = nextBreakPoint; } return b.toString(); } - } diff --git a/unicodetools/src/main/java/org/unicode/tools/GetSIUnitTranslations.java b/unicodetools/src/main/java/org/unicode/tools/GetSIUnitTranslations.java index d8a6f4e60..281365efc 100644 --- a/unicodetools/src/main/java/org/unicode/tools/GetSIUnitTranslations.java +++ b/unicodetools/src/main/java/org/unicode/tools/GetSIUnitTranslations.java @@ -1,7 +1,6 @@ package org.unicode.tools; import java.util.TreeMap; - import org.unicode.cldr.tool.LanguageCodeConverter; import org.unicode.cldr.util.CLDRConfig; import org.unicode.cldr.util.CLDRFile; @@ -11,36 +10,41 @@ import org.unicode.cldr.util.XPathParts; public class GetSIUnitTranslations { -public static void main(String[] args) { - Factory factory = CLDRConfig.getInstance().getCldrFactory(); - for (String locale : "af sq am ar hy as az eu be bn bs bg my ca zh-HK zh-CN zh-TW hr cs da nl en-GB et fa fil fi fr fr-CA gl ka de el gu iw hi hu is id it ja kn kk km ko ky lo lv lt mk ms ml mr mn ne no or pl pt-BR pt-PT pa ro ru sr si sk sl es es-419 sw sv ta te th tr uk ur uz vi zu rm".split(" ")) { - String cldrLocale = LanguageCodeConverter.GOOGLE_CLDR.getOrDefault(locale, locale.replace('-', '_')); - //System.out.println("# " + cldrLocale + "\t" + locale); - CLDRFile cldrFile 
= factory.make(cldrLocale, true); - M3 cm = ChainedMap.of(new TreeMap(), new TreeMap(), String.class); - for (String path : cldrFile) { - //ldml/units/unitLength[@type="short"]/unit[@type="digital-petabyte"]/unitPattern[@count="one"] - if (!path.startsWith("//ldml/units/unitLength[@type=\"short\"]")) { - continue; - } - XPathParts parts = XPathParts.getFrozenInstance(path); - if (!"unitPattern".equals(parts.getElement(-1))) { - continue; - } - String type = parts.getAttributeValue(-2, "type"); - if (!type.contains("byte")) { - continue; + public static void main(String[] args) { + Factory factory = CLDRConfig.getInstance().getCldrFactory(); + for (String locale : + "af sq am ar hy as az eu be bn bs bg my ca zh-HK zh-CN zh-TW hr cs da nl en-GB et fa fil fi fr fr-CA gl ka de el gu iw hi hu is id it ja kn kk km ko ky lo lv lt mk ms ml mr mn ne no or pl pt-BR pt-PT pa ro ru sr si sk sl es es-419 sw sv ta te th tr uk ur uz vi zu rm" + .split(" ")) { + String cldrLocale = + LanguageCodeConverter.GOOGLE_CLDR.getOrDefault( + locale, locale.replace('-', '_')); + // System.out.println("# " + cldrLocale + "\t" + locale); + CLDRFile cldrFile = factory.make(cldrLocale, true); + M3 cm = + ChainedMap.of(new TreeMap(), new TreeMap(), String.class); + for (String path : cldrFile) { + // ldml/units/unitLength[@type="short"]/unit[@type="digital-petabyte"]/unitPattern[@count="one"] + if (!path.startsWith("//ldml/units/unitLength[@type=\"short\"]")) { + continue; + } + XPathParts parts = XPathParts.getFrozenInstance(path); + if (!"unitPattern".equals(parts.getElement(-1))) { + continue; + } + String type = parts.getAttributeValue(-2, "type"); + if (!type.contains("byte")) { + continue; + } + String count = parts.getAttributeValue(-1, "count"); + if (!count.equals("other")) { + continue; + } + String value = cldrFile.getStringValue(path); + value = value.replace("{0}", "").trim(); + String typeName = type.substring("digital-".length()); + System.out.println(locale + "\t" + typeName + 
"\t" + value); + // cm.put(locale, type, value); } - String count = parts.getAttributeValue(-1, "count"); - if (!count.equals("other")) { - continue; - } - String value = cldrFile.getStringValue(path); - value = value.replace("{0}", "").trim(); - String typeName = type.substring("digital-".length()); - System.out.println(locale + "\t" + typeName + "\t" + value); - //cm.put(locale, type, value); } } } -} diff --git a/unicodetools/src/main/java/org/unicode/tools/Ids.java b/unicodetools/src/main/java/org/unicode/tools/Ids.java index b6759cf05..d959b51aa 100644 --- a/unicodetools/src/main/java/org/unicode/tools/Ids.java +++ b/unicodetools/src/main/java/org/unicode/tools/Ids.java @@ -1,4 +1,19 @@ package org.unicode.tools; + +import com.google.common.collect.ComparisonChain; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.dev.util.UnicodeMap.EntryRange; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ICUException; +import com.ibm.icu.util.Output; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; @@ -15,7 +30,6 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Matcher; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.ChainedMap; import org.unicode.cldr.util.ChainedMap.M3; @@ -29,47 +43,48 @@ import org.unicode.props.UcdPropertyValues.General_Category_Values; import org.unicode.text.utility.Settings; -import com.google.common.collect.ComparisonChain; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.dev.util.UnicodeMap.EntryRange; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.impl.Row; -import 
com.ibm.icu.impl.Row.R2; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ICUException; -import com.ibm.icu.util.Output; - - public class Ids { private static final boolean DEBUG = false; - private static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(Settings.latestVersion); - - private static final UnicodeMap GC_PROPERTY = iup.loadEnum(UcdProperty.General_Category, UcdPropertyValues.General_Category_Values.class); - private static final UnicodeSet UNASSIGNED = GC_PROPERTY.getSet(General_Category_Values.Unassigned); - - private static final UnicodeMap BLOCK_PROPERTY = iup.loadEnum(UcdProperty.Block, UcdPropertyValues.Block_Values.class); - private static final UnicodeSet KANGXI_BLOCK = new UnicodeSet(BLOCK_PROPERTY.getSet(Block_Values.Kangxi_Radicals)) - .removeAll(UNASSIGNED).freeze(); - static final UnicodeSet CJK_Radicals_Supplement_BLOCK = new UnicodeSet(BLOCK_PROPERTY.getSet(Block_Values.CJK_Radicals_Supplement)) - .removeAll(UNASSIGNED).freeze(); - private static final UnicodeSet CJK_STROKES_BLOCK = new UnicodeSet(BLOCK_PROPERTY.getSet(Block_Values.CJK_Strokes)) - .removeAll(UNASSIGNED).freeze(); - private static final UnicodeSet RADICAL_OR_STROKE = new UnicodeSet(KANGXI_BLOCK) - .addAll(CJK_Radicals_Supplement_BLOCK) - .addAll(CJK_STROKES_BLOCK); - - private static final UnicodeMap> radicalStroke = iup.loadList(UcdProperty.kRSUnicode); - static final UnicodeMap> kTotalStrokes = iup.loadIntList(UcdProperty.kTotalStrokes); - private static final UnicodeMap> adobeRadicalStroke = iup.loadSet(UcdProperty.kRSAdobe_Japan1_6); + private static final IndexUnicodeProperties iup = + IndexUnicodeProperties.make(Settings.latestVersion); + + private static final UnicodeMap GC_PROPERTY = + iup.loadEnum( + UcdProperty.General_Category, UcdPropertyValues.General_Category_Values.class); + private static 
final UnicodeSet UNASSIGNED = + GC_PROPERTY.getSet(General_Category_Values.Unassigned); + + private static final UnicodeMap BLOCK_PROPERTY = + iup.loadEnum(UcdProperty.Block, UcdPropertyValues.Block_Values.class); + private static final UnicodeSet KANGXI_BLOCK = + new UnicodeSet(BLOCK_PROPERTY.getSet(Block_Values.Kangxi_Radicals)) + .removeAll(UNASSIGNED) + .freeze(); + static final UnicodeSet CJK_Radicals_Supplement_BLOCK = + new UnicodeSet(BLOCK_PROPERTY.getSet(Block_Values.CJK_Radicals_Supplement)) + .removeAll(UNASSIGNED) + .freeze(); + private static final UnicodeSet CJK_STROKES_BLOCK = + new UnicodeSet(BLOCK_PROPERTY.getSet(Block_Values.CJK_Strokes)) + .removeAll(UNASSIGNED) + .freeze(); + private static final UnicodeSet RADICAL_OR_STROKE = + new UnicodeSet(KANGXI_BLOCK) + .addAll(CJK_Radicals_Supplement_BLOCK) + .addAll(CJK_STROKES_BLOCK); + + private static final UnicodeMap> radicalStroke = + iup.loadList(UcdProperty.kRSUnicode); + static final UnicodeMap> kTotalStrokes = + iup.loadIntList(UcdProperty.kTotalStrokes); + private static final UnicodeMap> adobeRadicalStroke = + iup.loadSet(UcdProperty.kRSAdobe_Japan1_6); private static final UnicodeMap numericRadicalStroke; - static final M3 USTROKE = ChainedMap.of(new TreeMap(), new TreeMap(), UnicodeSet.class); + static final M3 USTROKE = + ChainedMap.of(new TreeMap(), new TreeMap(), UnicodeSet.class); + static { numericRadicalStroke = new UnicodeMap<>(); for (Entry> entry : radicalStroke.entrySet()) { @@ -78,9 +93,10 @@ public class Ids { String rad = parts.get(0); int radInt; boolean alt = rad.endsWith("'"); - radInt = Integer.parseInt(alt ? rad.substring(0,rad.length()-1) : rad); + radInt = Integer.parseInt(alt ? rad.substring(0, rad.length() - 1) : rad); final int remStrokes = Integer.parseInt(parts.get(1)); - numericRadicalStroke.put(entry.getKey(), radInt*10000 + (alt ? 1000 : 0) + remStrokes); + numericRadicalStroke.put( + entry.getKey(), radInt * 10000 + (alt ? 
1000 : 0) + remStrokes); if (remStrokes == 0) { UnicodeSet uset = USTROKE.get(radInt, alt); if (uset == null) { @@ -96,17 +112,16 @@ public class Ids { entry3.getValue().freeze(); } } - } - static final Map kRSJapaneseRadicals = loadRS(UcdProperty.kRSJapanese); - static final Map kRSKanWaRadicals = loadRS(UcdProperty.kRSKanWa); - static final Map kRSKoreanRadicals = loadRS(UcdProperty.kRSKorean); - static final Map kRSKangXiRadicals = loadRS(UcdProperty.kRSKangXi); + static final Map kRSJapaneseRadicals = loadRS(UcdProperty.kRSJapanese); + static final Map kRSKanWaRadicals = loadRS(UcdProperty.kRSKanWa); + static final Map kRSKoreanRadicals = loadRS(UcdProperty.kRSKorean); + static final Map kRSKangXiRadicals = loadRS(UcdProperty.kRSKangXi); - private static Map loadRS(UcdProperty simpleRadicalStroke) { + private static Map loadRS(UcdProperty simpleRadicalStroke) { UnicodeMap rs2 = iup.load(simpleRadicalStroke); - Map result = new TreeMap(); + Map result = new TreeMap(); for (EntryRange entry : rs2.entryRanges()) { String rsItem = entry.value; if (rsItem.contains(" ") || rsItem.contains("|")) { @@ -125,65 +140,68 @@ private static Map loadRS(UcdProperty simpleRadicalStroke) return Collections.unmodifiableMap(result); } - static final Comparator UNIHAN = new Comparator() { - @Override - public int compare(String o1, String o2) { - int diff = compare2(o1, o2); - if (diff == 0 && !o1.equals(o2)) { - compare2(o1, o2); - throw new IllegalAccessError(); - } - return diff; - } - public int compare2(String o1, String o2) { - int cp1; - int cp2; - int i1 = 0; - int i2 = 0; - while (i1 < o1.length() && i2 < o2.length()) { - cp1 = o1.codePointAt(i1++); - cp2 = o2.codePointAt(i2++); - if (cp1 != cp2) { - Integer rs1 = numericRadicalStroke.get(cp1); - Integer rs2 = numericRadicalStroke.get(cp2); - if (rs1 == null) { - if (rs2 != null) { - return -1; - } - } else { // ≠ null - if (rs2 == null) { - return 1; - } else { // ≠ null - int diff = rs1 - rs2; - if (diff != 0) { - return 
diff; - } - } + static final Comparator UNIHAN = + new Comparator() { + @Override + public int compare(String o1, String o2) { + int diff = compare2(o1, o2); + if (diff == 0 && !o1.equals(o2)) { + compare2(o1, o2); + throw new IllegalAccessError(); } - if (KANGXI_BLOCK.contains(cp1)) { - if (!KANGXI_BLOCK.contains(cp2)) { - return -1; + return diff; + } + + public int compare2(String o1, String o2) { + int cp1; + int cp2; + int i1 = 0; + int i2 = 0; + while (i1 < o1.length() && i2 < o2.length()) { + cp1 = o1.codePointAt(i1++); + cp2 = o2.codePointAt(i2++); + if (cp1 != cp2) { + Integer rs1 = numericRadicalStroke.get(cp1); + Integer rs2 = numericRadicalStroke.get(cp2); + if (rs1 == null) { + if (rs2 != null) { + return -1; + } + } else { // ≠ null + if (rs2 == null) { + return 1; + } else { // ≠ null + int diff = rs1 - rs2; + if (diff != 0) { + return diff; + } + } + } + if (KANGXI_BLOCK.contains(cp1)) { + if (!KANGXI_BLOCK.contains(cp2)) { + return -1; + } + } else if (KANGXI_BLOCK.contains(cp2)) { + return 1; + } + return cp1 - cp2; } - } else if (KANGXI_BLOCK.contains(cp2)) { - return 1; + if (cp1 > 0xFFFF) ++i1; + if (cp2 > 0xFFFF) ++i2; } - return cp1 - cp2; + return o1.length() - o2.length(); } - if (cp1 > 0xFFFF) ++i1; - if (cp2 > 0xFFFF) ++i2; - } - return o1.length() - o2.length(); - } - - }; - //private static final UnicodeMap totalStrokes = iup.load(UcdProperty.kTotalStrokes); + }; + // private static final UnicodeMap totalStrokes = iup.load(UcdProperty.kTotalStrokes); /** - * http://www.unicode.org/reports/tr38/#kRSAdobe_Japan1_6 - * radical, rstrokes, remaining => Unicode => remaining - + * http://www.unicode.org/reports/tr38/#kRSAdobe_Japan1_6 radical, rstrokes, remaining => + * Unicode => remaining */ - static final M4 ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET = ChainedMap.of(new TreeMap(), new TreeMap(), new TreeMap(), UnicodeSet.class); + static final M4 + ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET = + ChainedMap.of(new TreeMap(), new 
TreeMap(), new TreeMap(), UnicodeSet.class); + static { Matcher m = Common.ADOBE_RS_MATCHER.matcher(""); for (Entry> entry : adobeRadicalStroke.entrySet()) { @@ -195,14 +213,18 @@ public int compare2(String o1, String o2) { int radical = Integer.parseInt(m.group(1)); int rstrokes = Integer.parseInt(m.group(2)); int remaining = Integer.parseInt(m.group(3)); - UnicodeSet map = ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET.get(radical, rstrokes, remaining); + UnicodeSet map = + ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET.get( + radical, rstrokes, remaining); if (map == null) { - ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET.put(radical, rstrokes, remaining, map = new UnicodeSet()); + ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET.put( + radical, rstrokes, remaining, map = new UnicodeSet()); } map.add(source); } } - for (Entry>> entry : ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET) { + for (Entry>> entry : + ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET) { for (Entry> entry2 : entry.getValue().entrySet()) { for (Entry entry3 : entry2.getValue().entrySet()) { entry3.getValue().freeze(); @@ -212,8 +234,9 @@ public int compare2(String o1, String o2) { } private static final UnicodeMap unicodeToRadical; - static final Relation rawRadToUnicode; - static final Relation radToUnicode; + static final Relation rawRadToUnicode; + static final Relation radToUnicode; + static { UnicodeMap> unicodeToRadicalRaw = iup.loadList(UcdProperty.CJK_Radical); @@ -225,7 +248,8 @@ public int compare2(String o1, String o2) { unicodeToRadical = new UnicodeMap<>(); // Add extra Adobe radicals first - for (Entry>> entry : ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET) { + for (Entry>> entry : + ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET) { final int radical = entry.getKey(); final String radString = String.valueOf(radical); for (Entry> entry2 : entry.getValue().entrySet()) { @@ -256,7 +280,7 @@ public int compare2(String o1, String o2) { } private static void fillRadical(UnicodeMap> 
cjkRadicalRaw) { - Relation unicodeToRsRadicals; + Relation unicodeToRsRadicals; for (EntryRange> entry : radicalStroke.entryRanges()) { for (int cp = entry.codepoint; cp <= entry.codepointEnd; ++cp) { for (String rs : entry.value) { @@ -279,21 +303,24 @@ private static void fillRadical(UnicodeMap> cjkRadicalRaw) { for (Entry> entry : cjkRadicalRaw.entrySet()) { if (!unicodeToRadical.containsKey(entry.getKey())) { unicodeToRadical.put(entry.getKey(), entry.getValue().iterator().next()); - //System.out.println("Missing:\t" + entry.getKey() + "\t" + entry.getValue() + "\t" + UCharacter.getName(entry.getKey(),"+")); + // System.out.println("Missing:\t" + entry.getKey() + "\t" + entry.getValue() + "\t" + // + UCharacter.getName(entry.getKey(),"+")); } } unicodeToRadical.freeze(); } - private static final UnicodeSet IDS = new UnicodeSet("[[:IDS_Binary_Operator:][:IDS_Trinary_Operator:]]").freeze(); + private static final UnicodeSet IDS = + new UnicodeSet("[[:IDS_Binary_Operator:][:IDS_Trinary_Operator:]]").freeze(); private static final UnicodeSet STROKES = new UnicodeSet("[㇀-㇣]").freeze(); - private static final UnicodeSet EXT = new UnicodeSet(0xE000,0xEF00).freeze(); - private static final UnicodeSet EXT_E = new UnicodeSet(0x2B820,0x2CEA1).freeze(); - private static final UnicodeSet IDEOGRAPHIC = new UnicodeSet("[[:Ideographic:]-[:Block=CJK_Symbols_And_Punctuation:]]").freeze(); + private static final UnicodeSet EXT = new UnicodeSet(0xE000, 0xEF00).freeze(); + private static final UnicodeSet EXT_E = new UnicodeSet(0x2B820, 0x2CEA1).freeze(); + private static final UnicodeSet IDEOGRAPHIC = + new UnicodeSet("[[:Ideographic:]-[:Block=CJK_Symbols_And_Punctuation:]]").freeze(); static final UnicodeSet RADICAL = new UnicodeSet("[:Radical:]").freeze(); private static final class Positioning implements Comparable { - private static final Positioning BASE = new Positioning(0,0,1,1); + private static final Positioning BASE = new Positioning(0, 0, 1, 1); final double x1; final 
double y1; final double x2; @@ -307,7 +334,7 @@ public Positioning(double x, double y, double x2, double y2) { } public Positioning times(Positioning other) { - // a..b * c..d : a + (b-a * c) .. a + (b-a * d) + // a..b * c..d : a + (b-a * c) .. a + (b-a * d) final double width = x2 - x1; final double x1n = x1 + width * other.x1; final double x2n = x1 + width * other.x2; @@ -316,26 +343,28 @@ public Positioning times(Positioning other) { final double y1n = y1 + height * other.y1; final double y2n = y1 + height * other.y2; - return new Positioning( - x1n, - y1n, - x2n, - y2n); + return new Positioning(x1n, y1n, x2n, y2n); } @Override public String toString() { - return "{" + (int)(100*x1) - + ", " + (int)(100*y1) - + "; " + (int)(100*x2) - + ", " + (int)(100*y2) + return "{" + + (int) (100 * x1) + + ", " + + (int) (100 * y1) + + "; " + + (int) (100 * x2) + + ", " + + (int) (100 * y2) + "}"; } + @Override public boolean equals(Object obj) { - Positioning other = (Positioning)obj; + Positioning other = (Positioning) obj; return x1 == other.x1 && x2 == other.x2 && y1 == other.y1 && y2 == other.y2; } + @Override public int hashCode() { return Objects.hash(x1, x2, y1, y2); @@ -371,80 +400,120 @@ public String toString() { } } - private static void add(int codepoint, String sample, String sampleDecomp, Positioning... part) { + private static void add( + int codepoint, String sample, String sampleDecomp, Positioning... 
part) { final IdsData value = new IdsData(sample, sampleDecomp, Arrays.asList(part)); IDS_INFO.put(codepoint, value); System.out.println(UTF16.valueOf(codepoint) + ", " + value); } private static final UnicodeMap IDS_INFO = new UnicodeMap<>(); - static { + static { IDS_INFO.putAll(IDEOGRAPHIC, IdsData.IDEO); IDS_INFO.putAll(RADICAL, IdsData.IDEO); IDS_INFO.putAll(STROKES, IdsData.IDEO); IDS_INFO.putAll(EXT, IdsData.IDEO); IDS_INFO.putAll(EXT_E, IdsData.IDEO); - double ZERO = 0d, ALL = 1d, - HALF = 0.5d, - THIRD = 1/3d, TWO_THIRDS = 2/3d, - QUARTER = 1/4d, THREE_QUARTERS = 3/4d, - ALMOST_ZERO = 1/12d, ALMOST_ALL = 11/12d - ; - - add(0x2ff0, "㐖", "⿰吉乚", - new Positioning(ZERO, ZERO, HALF, ALL), + double ZERO = 0d, + ALL = 1d, + HALF = 0.5d, + THIRD = 1 / 3d, + TWO_THIRDS = 2 / 3d, + QUARTER = 1 / 4d, + THREE_QUARTERS = 3 / 4d, + ALMOST_ZERO = 1 / 12d, + ALMOST_ALL = 11 / 12d; + + add( + 0x2ff0, + "㐖", + "⿰吉乚", + new Positioning(ZERO, ZERO, HALF, ALL), new Positioning(HALF, ZERO, ALL, ALL)); // ⿱ U+2FF1 IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW - add(0x2ff1, "㐀", "⿱卝一", - new Positioning(ZERO, ZERO, ALL, HALF), + add( + 0x2ff1, + "㐀", + "⿱卝一", + new Positioning(ZERO, ZERO, ALL, HALF), new Positioning(ZERO, HALF, ALL, ALL)); // ⿲ U+2FF2 IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT - add(0x2ff2, "㣠", "⿲彳丨冬", - new Positioning(ZERO, ZERO, THIRD, ALL), - new Positioning(THIRD, ZERO, TWO_THIRDS, ALL), + add( + 0x2ff2, + "㣠", + "⿲彳丨冬", + new Positioning(ZERO, ZERO, THIRD, ALL), + new Positioning(THIRD, ZERO, TWO_THIRDS, ALL), new Positioning(TWO_THIRDS, ZERO, ALL, ALL)); // ⿳ U+2FF3 IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW - add(0x2ff3, "㞿", "⿳山土乂", - new Positioning(ZERO, ZERO, ALL, THIRD), - new Positioning(ZERO, THIRD, ALL, TWO_THIRDS), + add( + 0x2ff3, + "㞿", + "⿳山土乂", + new Positioning(ZERO, ZERO, ALL, THIRD), + new Positioning(ZERO, THIRD, ALL, TWO_THIRDS), new Positioning(ZERO, TWO_THIRDS, ALL, ALL)); // ⿴ U+2FF4 IDEOGRAPHIC 
DESCRIPTION CHARACTER FULL SURROUND - add(0x2ff4, "囝", "⿴囗子", - new Positioning(ZERO, ZERO, ALL, ALL), + add( + 0x2ff4, + "囝", + "⿴囗子", + new Positioning(ZERO, ZERO, ALL, ALL), new Positioning(QUARTER, QUARTER, THREE_QUARTERS, THREE_QUARTERS)); // ⿵ U+2FF5 IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE - add(0x2ff5, "悶", "⿵門心", - new Positioning(ZERO, ZERO, ALL, ALL), + add( + 0x2ff5, + "悶", + "⿵門心", + new Positioning(ZERO, ZERO, ALL, ALL), new Positioning(THIRD, THIRD, TWO_THIRDS, ALL)); // ⿶ U+2FF6 IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM BELOW - add(0x2ff6, "𠙶", "⿶凵了", - new Positioning(ZERO, ZERO, ALL, ALL), + add( + 0x2ff6, + "𠙶", + "⿶凵了", + new Positioning(ZERO, ZERO, ALL, ALL), new Positioning(THIRD, ZERO, TWO_THIRDS, TWO_THIRDS)); // ⿷ U+2FF7 IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LEFT - add(0x2ff7, "𠤭", "⿷匚人", - new Positioning(ZERO, ZERO, ALL, ALL), + add( + 0x2ff7, + "𠤭", + "⿷匚人", + new Positioning(ZERO, ZERO, ALL, ALL), new Positioning(THIRD, THIRD, ALL, TWO_THIRDS)); // ⿸ U+2FF8 IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER LEFT - add(0x2ff8, "産", "⿸产生", - new Positioning(ZERO, ZERO, ALMOST_ALL, ALMOST_ALL), + add( + 0x2ff8, + "産", + "⿸产生", + new Positioning(ZERO, ZERO, ALMOST_ALL, ALMOST_ALL), new Positioning(HALF, HALF, ALL, ALL)); // ⿹ U+2FF9 IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER RIGHT - add(0x2ff9, "甸", "⿹勹田", - new Positioning(ZERO, ZERO, ALL, ALL), + add( + 0x2ff9, + "甸", + "⿹勹田", + new Positioning(ZERO, ZERO, ALL, ALL), new Positioning(ZERO, HALF, HALF, ALL)); // ⿺ U+2FFA IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER LEFT - add(0x2ffa, "䆪", "⿺光空", - new Positioning(ZERO, ALMOST_ZERO, ALMOST_ALL, ALL), + add( + 0x2ffa, + "䆪", + "⿺光空", + new Positioning(ZERO, ALMOST_ZERO, ALMOST_ALL, ALL), new Positioning(HALF, ZERO, ALL, HALF)); // ⿻ U+2FFB IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID - add(0x2ffb, "𠆥", "⿻人丿", - new Positioning(ZERO, ZERO, ALMOST_ALL, ALMOST_ALL), + add( + 0x2ffb, + "𠆥", + 
"⿻人丿", + new Positioning(ZERO, ZERO, ALMOST_ALL, ALMOST_ALL), new Positioning(ALMOST_ZERO, ALMOST_ZERO, ALL, ALL)); } - //㿂 ⿸疒⿰⿱山王攵 + // 㿂 ⿸疒⿰⿱山王攵 private static final String FLIPPED = "或止虎"; private static final String MIRRORED = "𦣞正止臣"; @@ -459,6 +528,7 @@ public boolean equals(Object obj) { CpPart other = (CpPart) obj; return codepoint == other.codepoint && part == other.part; } + @Override public int hashCode() { return Objects.hash(codepoint, part); @@ -475,24 +545,25 @@ public String getColor() { float c = color * 0x3; final int dr, dg, db; if (c <= 1) { - dr = Math.round(c*0xFF); - dg = Math.round((1-c)*0xFF); + dr = Math.round(c * 0xFF); + dg = Math.round((1 - c) * 0xFF); db = 0; } else if (c <= 2) { dr = 0; - dg = Math.round((c-1)*0xFF); - db = Math.round((2-c)*0xFF); + dg = Math.round((c - 1) * 0xFF); + db = Math.round((2 - c) * 0xFF); } else { dg = 0; - db = Math.round((c-2)*0xFF); - dr = Math.round((3-c)*0xFF); + db = Math.round((c - 2) * 0xFF); + dr = Math.round((3 - c) * 0xFF); } - long v = 0x1000000L | (dr<<16) | (dg <<8) | db; + long v = 0x1000000L | (dr << 16) | (dg << 8) | db; String rgb = Long.toHexString(v); - return "#" + rgb.substring(rgb.length()-6); + return "#" + rgb.substring(rgb.length() - 6); } - public static List parse(String sourceChar, String source, Output questionable) { + public static List parse( + String sourceChar, String source, Output questionable) { if (DEBUG) System.out.println(source); ArrayList result = new ArrayList(); if (source.contains("〢")) { // HACK @@ -502,20 +573,35 @@ public static List parse(String sourceChar, String source, Output result, Output questionable) { + private static int parse( + String sourceChar, + int depth, + Positioning position, + int pos, + int[] codePoints, + ArrayList result, + Output questionable) { if (sourceChar.equals("𠃻")) { int debug = 0; } @@ -525,16 +611,20 @@ private static int parse(String sourceChar, int depth, Positioning position, int if (ids == null || ids == IdsData.IDEO) { 
String radical = unicodeToRadical.get(lead); if (radical != null) { - result.add(new CpPart(lead, position, pos/(float)codePoints.length)); + result.add(new CpPart(lead, position, pos / (float) codePoints.length)); return pos; } - throw new IllegalArgumentException("Error: no IDS/Radical/Stroke at " + (pos-1)); + throw new IllegalArgumentException("Error: no IDS/Radical/Stroke at " + (pos - 1)); } - if (DEBUG) System.out.println(Utility.repeat("\t",depth) + UTF16.valueOf(lead) + " => " + ids.part); + if (DEBUG) + System.out.println( + Utility.repeat("\t", depth) + UTF16.valueOf(lead) + " => " + ids.part); for (final Positioning subpart : ids.part) { final Positioning combo = position.times(subpart); int codePoint = codePoints[pos++]; - if (DEBUG) System.out.println(Utility.repeat("\t",depth) + UTF16.valueOf(codePoint) + " & " + combo); + if (DEBUG) + System.out.println( + Utility.repeat("\t", depth) + UTF16.valueOf(codePoint) + " & " + combo); IdsData partData = IDS_INFO.get(codePoint); while (partData == null && codePoint == '?') { @@ -544,77 +634,116 @@ private static int parse(String sourceChar, int depth, Positioning position, int } if (partData == null) { - switch(codePoint) { - default: { - if (IDS_HACK.contains(codePoint)) { - codePoint = Special.addSpecialX(sourceChar, UTF16.valueOf(codePoint)); - partData = IDS_INFO.get(codePoint); - } - break; - } - // ⿱&CDP-8B5E;廾[UG] ⿱&CDP-88F0;廾[T] - case '&': { - StringBuilder sb = new StringBuilder(); - while (true) { - int codePoint2 = codePoints[pos++]; - if (codePoint2 == ';') { + switch (codePoint) { + default: + { + if (IDS_HACK.contains(codePoint)) { + codePoint = + Special.addSpecialX( + sourceChar, UTF16.valueOf(codePoint)); + partData = IDS_INFO.get(codePoint); + } + break; + } + // ⿱&CDP-8B5E;廾[UG] ⿱&CDP-88F0;廾[T] + case '&': + { + StringBuilder sb = new StringBuilder(); + while (true) { + int codePoint2 = codePoints[pos++]; + if (codePoint2 == ';') { + break; + } + sb.appendCodePoint(codePoint2); + } + 
final String otherString = sb.toString(); + String value = MACROS.get(otherString); + codePoint = + Special.addSpecialX( + sourceChar, value == null ? otherString : value); + partData = IDS_INFO.get(codePoint); + break; + } + case '{': + { + int first = codePoints[pos++] - 0x30; + if (first < 0 || first > 9) { + throw new IllegalArgumentException( + "Error: unexpected character " + + Utility.hex(codePoint) + + " at " + + (pos - 1)); + } + int second = codePoints[pos++] - 0x30; + if (second < 0 || second > 9) { + throw new IllegalArgumentException( + "Error: unexpected character " + + Utility.hex(codePoint) + + " at " + + (pos - 1)); + } + int third = codePoints[pos++]; + if (third != '}') { + throw new IllegalArgumentException( + "Error: unexpected character " + + Utility.hex(codePoint) + + " at " + + (pos - 1)); + } + codePoint = 0xE000 + first * 10 + second; + Special.addToSpecial(codePoint, sourceChar); + partData = IDS_INFO.get(codePoint); + // } else if (codePoint == '?') { + // codePoint = 0xE07F; + // partData = IDS_INFO.get(codePoint); + break; + } + case '↔': + { + codePoint = codePoints[pos++]; + int mirrored = MIRRORED.indexOf(codePoint); + if (mirrored >= 0) { + partData = IDS_INFO.get(0xE040 + mirrored); + Special.addSpecial( + 0xE040 + mirrored, + sourceChar, + "mirrored " + UTF16.valueOf(codePoint)); + } + break; + } + case '↷': + { + codePoint = codePoints[pos++]; + int mirrored = FLIPPED.indexOf(codePoint); + if (mirrored >= 0) { + partData = IDS_INFO.get(0xE050 + mirrored); + Special.addSpecial( + 0xE050 + mirrored, + sourceChar, + "rotated " + UTF16.valueOf(codePoint)); + } break; } - sb.appendCodePoint(codePoint2); - } - final String otherString = sb.toString(); - String value = MACROS.get(otherString); - codePoint = Special.addSpecialX(sourceChar, value == null ? 
otherString : value); - partData = IDS_INFO.get(codePoint); - break; - } - case '{': { - int first = codePoints[pos++] - 0x30; - if (first < 0 || first > 9) { - throw new IllegalArgumentException("Error: unexpected character " + Utility.hex(codePoint) + " at " + (pos-1)); - } - int second = codePoints[pos++] - 0x30; - if (second < 0 || second > 9) { - throw new IllegalArgumentException("Error: unexpected character " + Utility.hex(codePoint) + " at " + (pos-1)); - } - int third = codePoints[pos++]; - if (third != '}') { - throw new IllegalArgumentException("Error: unexpected character " + Utility.hex(codePoint) + " at " + (pos-1)); - } - codePoint = 0xE000 + first * 10 + second; - Special.addToSpecial(codePoint, sourceChar); - partData = IDS_INFO.get(codePoint); - // } else if (codePoint == '?') { - // codePoint = 0xE07F; - // partData = IDS_INFO.get(codePoint); - break; - } - case '↔': { - codePoint = codePoints[pos++]; - int mirrored = MIRRORED.indexOf(codePoint); - if (mirrored >= 0) { - partData = IDS_INFO.get(0xE040 + mirrored); - Special.addSpecial(0xE040 + mirrored, sourceChar, "mirrored " + UTF16.valueOf(codePoint)); - } - break; - } - case '↷': { - codePoint = codePoints[pos++]; - int mirrored = FLIPPED.indexOf(codePoint); - if (mirrored >= 0) { - partData = IDS_INFO.get(0xE050 + mirrored); - Special.addSpecial(0xE050 + mirrored, sourceChar, "rotated " + UTF16.valueOf(codePoint)); - } - break; - } } } if (partData == null) { - throw new IllegalArgumentException("Error: unexpected character " + Utility.hex(codePoint) + " at " + (pos-1)); + throw new IllegalArgumentException( + "Error: unexpected character " + + Utility.hex(codePoint) + + " at " + + (pos - 1)); } else if (partData == IdsData.IDEO) { - result.add(new CpPart(codePoint, combo, pos/(float)codePoints.length)); + result.add(new CpPart(codePoint, combo, pos / (float) codePoints.length)); } else { - pos = parse(sourceChar, depth + 1, combo, pos-1, codePoints, result, questionable); + pos = + parse( + 
sourceChar, + depth + 1, + combo, + pos - 1, + codePoints, + result, + questionable); } } return pos; @@ -627,20 +756,38 @@ public String toString() { public String svgRect(String color, boolean showRect) { return "" - + (showRect ? - "\n" : "") - + "" + UTF16.valueOf(codepoint) + "\n"; + + (showRect + ? "\n" + : "") + + "" + + UTF16.valueOf(codepoint) + + "\n"; } + @Override public int compareTo(CpPart o) { int diff; @@ -677,7 +824,7 @@ public static int addSpecialX(String sourceChar, String description) { } else { codepoint = cp; } - int codepointBase = codepoint-0xE000; + int codepointBase = codepoint - 0xE000; Special special = specials.get(codepointBase); if (specials.get(codepointBase) == null) { special = new Special(description); @@ -698,7 +845,7 @@ public static void addSpecial(int codepoint, String sourceChar, String descripti } public static void addSpecial(String line) { - int codepoint = (line.charAt(1)-0x30)*10+(line.charAt(2)-0x30); + int codepoint = (line.charAt(1) - 0x30) * 10 + (line.charAt(2) - 0x30); if (specials.get(codepoint) != null) { throw new IllegalArgumentException("special collision"); } @@ -708,29 +855,38 @@ public static void addSpecial(String line) { } public static void addToSpecial(int codePoint, String sourceChar) { - Special special = specials.get(codePoint-0xE000); + Special special = specials.get(codePoint - 0xE000); special.samples.add(sourceChar); } static void listSpecials() throws IOException { - try (PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "ids/", "specials.html"); - ) { + try (PrintWriter out = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_DIR + "ids/", "specials.html"); ) { showHeader(out); - out.println("
"); + out.println( + ""); for (Entry entry : specials.entrySet()) { - out.println(""); - + out.println( + ""); } showFooter(out); - //System.out.println("items:\t" + count); + // System.out.println("items:\t" + count); } } + @Override public String toString() { return description + "\t" + samples.toPattern(false); @@ -751,18 +907,20 @@ public static void main(String[] args) throws IOException { showRadicalMissing(); Special.listSpecials(); - // for (int i : CharSequences.codePoints(FLIPPED)) { - // System.out.println("FLIPPED:\t" + UTF16.valueOf(i) + "\t" + IDS_DATA.get(i) + "\t" + radicalStroke.get(i)); + // System.out.println("FLIPPED:\t" + UTF16.valueOf(i) + "\t" + IDS_DATA.get(i) + + // "\t" + radicalStroke.get(i)); // } // for (int i : CharSequences.codePoints(MIRRORED)) { - // System.out.println("MIRRORED:\t" + UTF16.valueOf(i) + "\t" + IDS_DATA.get(i) + "\t" + radicalStroke.get(i)); + // System.out.println("MIRRORED:\t" + UTF16.valueOf(i) + "\t" + IDS_DATA.get(i) + + // "\t" + radicalStroke.get(i)); // } // for (Entry entry : IDS_DATA.entrySet()) { // String source = entry.getKey(); // final String idsSource = entry.getValue().idsSource; // if (idsSource.contains("↔") || idsSource.contains("↷")) { - // System.out.println(source + "\t" + idsSource + "\t" + radicalStroke.get(source)); + // System.out.println(source + "\t" + idsSource + "\t" + + // radicalStroke.get(source)); // } // } } @@ -811,45 +969,64 @@ private static void showCjkRadicals() { private static void showRadicalMissing() throws IOException { // Relation, String> invert = Relation.of(new HashMap(), HashSet.class); - try (PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "ids/", "radicalMissing.html"); - ) { - showHeader(out); + try (PrintWriter out = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_DIR + "ids/", "radicalMissing.html"); ) { + showHeader(out); int count = 0; - out.println(""); + out.println( + ""); main: - for (Entry entry : IDS_RECURSIVE.entrySet()) { 
- // invert.put(entry.getValue().parts, entry.getKey()); - final String source = entry.getKey(); - final List rs2 = radicalStroke.get(source); - if (rs2 == null) { - System.out.println("showRadicalMissing Skipping " + source); - continue; + for (Entry entry : IDS_RECURSIVE.entrySet()) { + // invert.put(entry.getValue().parts, entry.getKey()); + final String source = entry.getKey(); + final List rs2 = radicalStroke.get(source); + if (rs2 == null) { + System.out.println("showRadicalMissing Skipping " + source); + continue; + } + String rs = rs2.get(0); + String idsSource = entry.getValue().idsSource; + Set rads = radToUnicode.get(Common.DOT_SPLITTER.splitToList(rs).get(0)); + for (String rad : rads) { + if (idsSource.contains(rad)) { + continue main; } - String rs = rs2.get(0); - String idsSource = entry.getValue().idsSource; - Set rads = radToUnicode.get(Common.DOT_SPLITTER.splitToList(rs).get(0)); - for (String rad : rads) { - if (idsSource.contains(rad)) { + CharacterIds decomp = IDS_RECURSIVE.get(rad); + if (decomp != null) { + String rad2 = decomp.idsSource; + if (idsSource.contains(rad2)) { continue main; } - CharacterIds decomp = IDS_RECURSIVE.get(rad); - if (decomp != null) { - String rad2 = decomp.idsSource; - if (idsSource.contains(rad2)) { - continue main; - } - } } - //System.out.println("Rec. Decomp doesn't contain radical: " + entry.getKey() + "\t" + rs + "\t" + rads + "\t" + idsSource); - out.println("" - + "" - + "" - + "" - + "" - + "" - + ""); - } + // System.out.println("Rec. 
Decomp doesn't contain radical: " + entry.getKey() + + // "\t" + rs + "\t" + rads + "\t" + idsSource); + out.println( + "" + + "" + + "" + + "" + + "" + + "" + + ""); + } // for (Entry, Set> entry : invert.keyValuesSet()) { // if (entry.getValue().size() > 1) { // System.out.println(entry); @@ -877,27 +1054,49 @@ private static String getDecomp(String idsSource) { return b.toString(); } - static void showParseErrors(M3 problems, String fileName) throws IOException { - try (PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "ids/", fileName); - ) { + static void showParseErrors(M3 problems, String fileName) + throws IOException { + try (PrintWriter out = + FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "ids/", fileName); ) { showHeader(out); - out.println(""); + out.println( + ""); Output> radChar = new Output<>(); - //System.out.println("Failed to parse: "); + // System.out.println("Failed to parse: "); int count = 0; for (Entry> entry : problems) { for (Entry entry2 : entry.getValue().entrySet()) { - //System.out.println(entry.getKey() + "\t" + entry2.getKey() + "\t" + entry2.getValue()); + // System.out.println(entry.getKey() + "\t" + entry2.getKey() + "\t" + + // entry2.getValue()); final String key = entry2.getKey(); List rad = getRS(key, radChar); - out.println("" - + "" - + "" - + "" - + "" - + "" - + "" - + ""); + out.println( + "" + + "" + + "" + + "" + + "" + + "" + + "" + + ""); } } showFooter(out); @@ -920,22 +1119,24 @@ private static List getRS(final String key, Output> radicalC private static void showRadicalCompare() throws IOException { - try (PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "ids/", "radicalCompare.html"); - ) { + try (PrintWriter out = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_DIR + "ids/", "radicalCompare.html"); ) { showHeader(out); - Map,Set>> sorted = new TreeMap<>(); + Map, Set>> sorted = new TreeMap<>(); for (Entry> entry : radToUnicode.keyValuesSet()) { final 
String key = entry.getKey(); final double clean = cleanRadical(key); sorted.put(clean, Row.of(entry.getValue(), rawRadToUnicode.get(key))); } - out.println("" - + "" - + "" - + "" - + "" - + "" - + ""); + out.println( + "" + + "" + + "" + + "" + + "" + + "" + + ""); int count = 0; Set adobeItems = new TreeSet<>(); @@ -946,13 +1147,15 @@ private static void showRadicalCompare() throws IOException { Set rad = rad2.get0(); final Set raw = rad2.get1(); double doubleRadical = key.doubleValue(); - int intRadical = (int)doubleRadical; + int intRadical = (int) doubleRadical; final boolean alt = intRadical != doubleRadical; - String samples = alt ? "" : getSamples((int)key.doubleValue()); + String samples = alt ? "" : getSamples((int) key.doubleValue()); String key2 = intRadical + (alt ? "'" : ""); - final Set cjkRad = alt ? Collections.EMPTY_SET : IdsFileData.radToCjkRad.get(intRadical); + final Set cjkRad = + alt ? Collections.EMPTY_SET : IdsFileData.radToCjkRad.get(intRadical); UnicodeSet RSUnicode = USTROKE.get(intRadical, alt); - M3 adobe = ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET.get(intRadical); + M3 adobe = + ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET.get(intRadical); adobeItems.clear(); if (!alt) { for (Entry> entry2 : adobe) { @@ -967,15 +1170,30 @@ private static void showRadicalCompare() throws IOException { } } } - out.println("" - + "" - + "" - + "" - + "" - + "" - + "" - + "" - + ""); + out.println( + "" + + "" + + "" + + "" + + "" + + "" + + "" + + "" + + ""); } showFooter(out); System.out.println("items:\t" + count); @@ -984,21 +1202,22 @@ private static void showRadicalCompare() throws IOException { private static void showRadicalCompareTxt() throws IOException { - try (PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "ids/", "radicalCompare.txt"); - ) { - out.println("# Additional Radical Mappings (beyond CJKRadicals.txt)\n" - + "#\n" - + "# The sources are:\n" - + "# • CJK_Radicals.txt\n" - + "# • kRS* with zero remaining 
strokes;\n" - + "# • Nameslist annotations (for CJK Radicals Supplement)\n" - + "# • idsCjkRadicals.txt (*draft* extra items)\n" - + "#\n" - + "# Format:\n" - + "# Code ; Rad. №; Strokes # (char) character-name ;\tsources\n" - + "#" - ); - for (Entry entry : RadicalData.entrySet()) { + try (PrintWriter out = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_DIR + "ids/", "radicalCompare.txt"); ) { + out.println( + "# Additional Radical Mappings (beyond CJKRadicals.txt)\n" + + "#\n" + + "# The sources are:\n" + + "# • CJK_Radicals.txt\n" + + "# • kRS* with zero remaining strokes;\n" + + "# • Nameslist annotations (for CJK Radicals Supplement)\n" + + "# • idsCjkRadicals.txt (*draft* extra items)\n" + + "#\n" + + "# Format:\n" + + "# Code ; Rad. №; Strokes # (char) character-name ;\tsources\n" + + "#"); + for (Entry entry : RadicalData.entrySet()) { entry.getValue().print(out); } } @@ -1006,22 +1225,24 @@ private static void showRadicalCompareTxt() throws IOException { private static class Wikiwand { static UnicodeMap cjkRadSupToIdeo = new UnicodeMap<>(); + static { for (String line : FileUtilities.in(Ids.class, "wikiwand.txt")) { if (line.startsWith("#")) { continue; } List row = Common.SEMI_SPLITTER.splitToList(line); - // U+2ED6 (11990) ; ⻖ ; CJK RADICAL MOUND TWO ; CJK-Radikal 170 Hügel, 2. Form (links) = 阝 (U+961D) + // U+2ED6 (11990) ; ⻖ ; CJK RADICAL MOUND TWO ; CJK-Radikal 170 Hügel, 2. 
Form + // (links) = 阝 (U+961D) String cjkRadSup = row.get(1); String target = row.get(3); // = 阝 ( int equals = target.lastIndexOf('→'); - int paren = target.indexOf('(',equals); + int paren = target.indexOf('(', equals); if (equals == -1 || paren == -1) { throw new ICUException(target); } - String cp = target.substring(equals+1, paren).trim(); + String cp = target.substring(equals + 1, paren).trim(); if (UTF16.countCodePoint(cp) != 1) { throw new ICUException(target); } @@ -1029,11 +1250,13 @@ private static class Wikiwand { } cjkRadSupToIdeo.freeze(); } + static boolean check(Set items) { for (String item : items) { String v = cjkRadSupToIdeo.get(item); if (v != null && !items.contains(v)) { - System.out.println("Wikiwand: " + items + " don't contain " + v + " from " + item); + System.out.println( + "Wikiwand: " + items + " don't contain " + v + " from " + item); return false; } } @@ -1049,8 +1272,17 @@ static String check(String key2, Set items, Output cjkRad) { continue; } if (!items.contains(v)) { - System.out.println("Radical " + key2 + ", Nameslist Annotations for " + item + " " + UCharacter.getName(item, ", ") - + ": " + items + " doesn't contain " + v); + System.out.println( + "Radical " + + key2 + + ", Nameslist Annotations for " + + item + + " " + + UCharacter.getName(item, ", ") + + ": " + + items + + " doesn't contain " + + v); } cjkRad.value = item; return v; @@ -1059,8 +1291,11 @@ static String check(String key2, Set items, Output cjkRad) { } } - - private static void addItems(UnicodeSet sourceItems, String reason, Set sortedChars, Relation reasonMap) { + private static void addItems( + UnicodeSet sourceItems, + String reason, + Set sortedChars, + Relation reasonMap) { if (sourceItems != null && !sourceItems.isEmpty()) { sourceItems.addAllTo(sortedChars); for (String s : sourceItems) { @@ -1069,7 +1304,11 @@ private static void addItems(UnicodeSet sourceItems, String reason, Set } } - private static void addItems(Set sourceItems, String reason, Set 
sortedChars, Relation reasons) { + private static void addItems( + Set sourceItems, + String reason, + Set sortedChars, + Relation reasons) { if (sourceItems != null && !sourceItems.isEmpty()) { sortedChars.addAll(sourceItems); for (String s : sourceItems) { @@ -1078,8 +1317,12 @@ private static void addItems(Set sourceItems, String reason, Set } } - private static void addRadicals(int intRadical, Map radicalSource, - Set sortedChars, String reason, Relation reasons) { + private static void addRadicals( + int intRadical, + Map radicalSource, + Set sortedChars, + String reason, + Relation reasons) { UnicodeSet us = radicalSource.get(intRadical); if (us != null) { us.addAllTo(sortedChars); @@ -1090,7 +1333,8 @@ private static void addRadicals(int intRadical, Map radical } private static String getSamples(int radical) { - M3 data = ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET.get(radical); + M3 data = + ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET.get(radical); int count = 0; StringBuilder samples = new StringBuilder(); for (final Entry> entry2 : data) { @@ -1122,9 +1366,10 @@ private static String getSamples(int radical) { return samples.toString(); } - private static void showMainList(String type, UnicodeMap idsDataMap, int itemsPerFile) throws IOException { + private static void showMainList( + String type, UnicodeMap idsDataMap, int itemsPerFile) throws IOException { TreeSet sorted = new TreeSet<>(UNIHAN); - //System.out.println("IDS_DATA.keySet(): " + IDS_DATA.keySet().size()); + // System.out.println("IDS_DATA.keySet(): " + IDS_DATA.keySet().size()); idsDataMap.keySet().addAllTo(sorted); int count = 0; int oldFileCount = -1; @@ -1137,10 +1382,14 @@ private static void showMainList(String type, UnicodeMap idsDataMa showFooter(out); out.close(); } - out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "ids/", "ids" + type + fileCount + ".html"); + out = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_DIR + "ids/", + "ids" + type + fileCount + 
".html"); oldFileCount = fileCount; showHeader(out); - out.println(""); + out.println( + ""); } show(++count, out, s.codePointAt(0), entry.idsSource, entry.parts); } @@ -1150,9 +1399,11 @@ private static void showMainList(String type, UnicodeMap idsDataMa } private static void showConfusables() throws IOException { - try (PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "ids/", "cjkConfusableCandidates.txt"); - ) { - Relation invert = Relation.of(new TreeMap>(), TreeSet.class, UNIHAN); + try (PrintWriter out = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_DIR + "ids/", "cjkConfusableCandidates.txt"); ) { + Relation invert = + Relation.of(new TreeMap>(), TreeSet.class, UNIHAN); for (EntryRange entry : IDS_RECURSIVE.entryRanges()) { if (entry.string != null) { invert.put(entry.value.getComponents(), entry.string); @@ -1171,7 +1422,14 @@ private static void showConfusables() throws IOException { if (UNIHAN.compare(item1, item2) <= 0) { continue; } - out.println(item1 + " ; " + item2 + " # " + Utility.hex(item1) + " ; " + Utility.hex(item2)); + out.println( + item1 + + " ; " + + item2 + + " # " + + Utility.hex(item1) + + " ; " + + Utility.hex(item2)); ++count; } } @@ -1181,25 +1439,27 @@ private static void showConfusables() throws IOException { } } - - private static void showFooter(PrintWriter out) { out.println("
" + "key" - + "" + "description" - + "" + "samples" - + "
" + + "key" + + "" + + "description" + + "" + + "samples" + + "
" + entry.getKey() - + "" + entry.getValue().description - + "" + entry.getValue().samples.toPattern(false) - + "
" + + entry.getKey() + + "" + + entry.getValue().description + + "" + + entry.getValue().samples.toPattern(false) + + "
CountSourceIDSSource Radical + RS(.0)Source RS
CountSourceIDSSource Radical + RS(.0)Source RS
" + ++count + "" + source + "" + rs + "" + Common.SPACE_JOINER.join(rads) + "" + idsSource + "
" + + ++count + + "" + + source + + "" + + rs + + "" + + Common.SPACE_JOINER.join(rads) + + "" + + idsSource + + "
CountReason for failureSourceIDSSource Radical + RS(.0)Source RS
CountReason for failureSourceIDSSource Radical + RS(.0)Source RS
" + ++count + "" + entry.getKey() + "" + key + "" + entry2.getValue() + "" + Common.SPACE_JOINER.join(radChar.value) + "" + rad + "
" + + ++count + + "" + + entry.getKey() + + "" + + key + + "" + + entry2.getValue() + + "" + + Common.SPACE_JOINER.join(radChar.value) + + "" + + rad + + "
Radical#CJKRadicals.txt:F1RSUnicode(.0)CJK_Rad Block?RSAdobe(.0)RSAdobe DetailsName
Radical#CJKRadicals.txt:F1RSUnicode(.0)CJK_Rad Block?RSAdobe(.0)RSAdobe DetailsName
" + key2 + "" + raw.iterator().next() + "" + (RSUnicode == null ? "" : Common.SPACE_JOINER.join(RSUnicode)) + "" + (cjkRad == null ? "" : Common.SPACE_JOINER.join(cjkRad)) + "" + Common.SPACE_JOINER.join(adobeItems) + "" + samples + "" + UCharacter.getName(raw.iterator().next(), ",") + "
" + + key2 + + "" + + raw.iterator().next() + + "" + + (RSUnicode == null ? "" : Common.SPACE_JOINER.join(RSUnicode)) + + "" + + (cjkRad == null ? "" : Common.SPACE_JOINER.join(cjkRad)) + + "" + + Common.SPACE_JOINER.join(adobeItems) + + "" + + samples + + "" + + UCharacter.getName(raw.iterator().next(), ",") + + "
CountSourceIDS App. Pos.IDSApp. Pos.
CountSourceIDS App. Pos.IDSApp. Pos.
"); } private static void showHeader(PrintWriter out) { - out.println(""); + out.println( + "
"); } private static final class CharacterIds implements Comparable { @@ -1222,6 +1482,7 @@ public CharacterIds(String sourceChar, String idsSource) { confusableList = temp2.toString(); this.questionable = questionable.value; } + public String getComponents() { StringBuilder result = new StringBuilder(); for (CpPart part : parts) { @@ -1229,10 +1490,12 @@ public String getComponents() { } return result.toString(); } + @Override public String toString() { return idsSource + "\t" + parts; } + @Override public int compareTo(CharacterIds other) { int diff = confusableList.compareTo(other.confusableList); @@ -1243,7 +1506,7 @@ public int compareTo(CharacterIds other) { } } - public static class MyTreeMap extends TreeMap { + public static class MyTreeMap extends TreeMap { public MyTreeMap() { super((Comparator) UNIHAN); } @@ -1251,8 +1514,12 @@ public MyTreeMap() { private static final UnicodeMap IDS_DATA = new UnicodeMap<>(); // private static final UnicodeMap IDS2_DATA = new UnicodeMap(); - private static final M3 failures = ChainedMap.of(new TreeMap(), new MyTreeMap(), String.class); - private static final M3 missing = ChainedMap.of(new TreeMap(), new MyTreeMap(), String.class); + private static final M3 failures = + ChainedMap.of( + new TreeMap(), new MyTreeMap(), String.class); + private static final M3 missing = + ChainedMap.of( + new TreeMap(), new MyTreeMap(), String.class); static String SOURCE_IDS = "ids.txt"; // "babelstoneIds.txt"; static UnicodeSet IDS_HACK = new UnicodeSet("[△ 々 ① ⑩-⑲ ② ⑳ ③-⑨ ℓ α い キ サ よ 〇 〢 \\&]").freeze(); @@ -1260,8 +1527,8 @@ public MyTreeMap() { static final Map MACROS = getMacros(); private static void load() { - String [] biggestCp = new String[50]; - CharacterIds [] biggest = new CharacterIds[50]; + String[] biggestCp = new String[50]; + CharacterIds[] biggest = new CharacterIds[50]; Counter counter = new Counter<>(); for (String line : FileUtilities.in(Settings.Output.GEN_DIR + "ids/", SOURCE_IDS)) { @@ -1285,8 +1552,8 @@ private 
static void load() { if (source.startsWith("&")) { // CDP-854B &CDP-854B; ⿻冂从 continue; -// int cp = Special.addSpecialX("?", source); -// source = UTF16.valueOf(cp); + // int cp = Special.addSpecialX("?", source); + // source = UTF16.valueOf(cp); } try { if (ids.equals(source)) { @@ -1318,16 +1585,15 @@ private static void load() { System.out.println(counter); for (Entry entry : IDS_DATA.entrySet()) { String decomp = getDecomp(entry.getValue().idsSource); - //System.out.println(entry.getKey() + "\t" + decomp); + // System.out.println(entry.getKey() + "\t" + decomp); try { IDS_RECURSIVE.put(entry.getKey(), new CharacterIds(entry.getKey(), decomp)); } catch (Exception e) { - //failures.put(e.getMessage(), entry.getKey(), decomp); + // failures.put(e.getMessage(), entry.getKey(), decomp); } } IDS_RECURSIVE.freeze(); - // UnicodeSet missing = new UnicodeSet(IDEOGRAPHIC) // .removeAll(identicals) // .removeAll(IDS_DATA.keySet()) @@ -1342,18 +1608,21 @@ private static void load() { // System.out.println("Failed to parse: "); // for (Entry> entry : failures) { // for (Entry entry2 : entry.getValue().entrySet()) { - // System.out.println(entry.getKey() + "\t" + entry2.getKey() + "\t" + entry2.getValue()); + // System.out.println(entry.getKey() + "\t" + entry2.getKey() + "\t" + + // entry2.getValue()); // } // } // for (Entry entry : cjkRadical.entrySet()) { // String key = entry.getKey(); // String totalStrokeCount = totalStrokes.get(key); // double value = cleanRadical(entry.getValue()); - // System.out.println(entry.getKey() + "\t" + value + "\t" + UCharacter.getName(entry.getKey(), "+") + "\t" + totalStrokeCount); + // System.out.println(entry.getKey() + "\t" + value + "\t" + + // UCharacter.getName(entry.getKey(), "+") + "\t" + totalStrokeCount); // } // for (String s : STROKES) { // String totalStrokeCount = totalStrokes.get(s); - // System.out.println(s + "\t" + "?" 
+ "\t" + UCharacter.getName(s, "+") + "\t" + totalStrokeCount); + // System.out.println(s + "\t" + "?" + "\t" + UCharacter.getName(s, "+") + "\t" + + // totalStrokeCount); // } } @@ -1362,7 +1631,8 @@ private static void load() { // for (String s : missingSorted) { // final Set rs = radicalStroke.get(s); // if (rs == null) { - // System.out.println(title + "\t" + s + "\t" + "null" + "\t" + "null" + "\t" + totalStrokes.get(s)); + // System.out.println(title + "\t" + s + "\t" + "null" + "\t" + "null" + "\t" + + // totalStrokes.get(s)); // continue; // } // // if (rs.contains("|")) { @@ -1371,23 +1641,23 @@ private static void load() { // for (String rsItem : rs) { // List rsAlts = VBAR_SPLITTER.splitToList(rsItem); // List rsArray = DOT_SPLITTER.splitToList(rsAlts.get(0)); - // System.out.println(title + "\t" + s + "\t" + cleanRadical(rsArray.get(0)) + "\t" + rsArray.get(1) + "\t" + totalStrokes.get(s)); + // System.out.println(title + "\t" + s + "\t" + cleanRadical(rsArray.get(0)) + + // "\t" + rsArray.get(1) + "\t" + totalStrokes.get(s)); // } // } // } - private static Map getMacros() { - Relation temp = Relation.of(new HashMap(), HashSet.class); + private static Map getMacros() { + Relation temp = Relation.of(new HashMap(), HashSet.class); // fetch the data for (String line : FileUtilities.in(Settings.Output.GEN_DIR + "ids/", SOURCE_IDS)) { // CDP-8DA8 &CDP-8DA8; ⿻廿木 ⿱丗木 ⿱𠀍木 // U+3022 〢 ⿰丨丨 // CDP-8DBA &CDP-8DBA; &CDP-8DBA; - if (line.startsWith(";") || line.startsWith("#") + if (line.startsWith(";") + || line.startsWith("#") || line.compareTo("U+34") > 0 - || (line.compareTo("U+2000") > 0 - && line.charAt(7) == '\t') - ) { + || (line.compareTo("U+2000") > 0 && line.charAt(7) == '\t')) { continue; } List parts = Common.SPACE_SPLITTER.splitToList(line); @@ -1399,11 +1669,11 @@ private static Map getMacros() { for (int i = 2; i < parts.size(); ++i) { String trial = parts.get(i); int bracket = trial.indexOf('['); - temp.put(base, bracket < 0 ? 
trial : trial.substring(0,bracket)); + temp.put(base, bracket < 0 ? trial : trial.substring(0, bracket)); } } // now try to reduce - Map result = new HashMap<>(); + Map result = new HashMap<>(); while (true) { boolean added = false; for (Entry entry : temp.entrySet()) { @@ -1448,21 +1718,21 @@ static double cleanRadical(String string) { double increment = 0; if (string.endsWith("'")) { increment = 0.5; - string = string.substring(0,string.length()-1); + string = string.substring(0, string.length() - 1); } return Double.parseDouble(string) + increment; } private static String clean(String idsSource) { - int start = 0; + int start = 0; int end = idsSource.length(); if (idsSource.startsWith("^")) { start = 1; } if (idsSource.endsWith("$")) { - end = end-1; + end = end - 1; } - String x = idsSource.substring(start,end); + String x = idsSource.substring(start, end); return x; } @@ -1470,37 +1740,42 @@ private static String clean(String idsSource) { // show(out, codepoint, source, CpPart.parse(source)); // } - private static void show(int count, PrintWriter out, int codepoint, String source, List breakdown) { + private static void show( + int count, PrintWriter out, int codepoint, String source, List breakdown) { if (DEBUG) System.out.println(UTF16.valueOf(codepoint) + "\t" + source); - out.println("" - + "\n" - + "\n" - + "\n" - + "\n" - + "" - + "" - ); + out.println( + "" + + "\n" + + "\n" + + "\n" + + "\n" + + "" + + ""); } private static String show(List data) { - StringBuilder b = new StringBuilder(""); + StringBuilder b = new StringBuilder(""); int count = 0; for (CpPart part : data) { b.append(part.svgRect(part.getColor(), false)); // b.append("
" + // + "style='" + part.html() + "'>" // + codePoint + ":" + ++count + "
\n"); ++count; } @@ -1508,28 +1783,28 @@ private static String show(List data) { } // Confusable /* - ⼖ U+2F16 KANGXI RADICAL HIDING ENCLOSURE - 匚 U+531A CJK UNIFIED IDEOGRAPH-531A - 匸 U+5338 CJK UNIFIED IDEOGRAPH-5338 - - ⼢ U+2F22 KANGXI RADICAL GO SLOWLY - 夂 U+5902 CJK UNIFIED IDEOGRAPH-5902 - 夊 U+590A CJK UNIFIED IDEOGRAPH-590A - ⽇ U+2F47 KANGXI RADICAL SUN - 日 U+65E5 CJK UNIFIED IDEOGRAPH-65E5 - 曰 U+66F0 CJK UNIFIED IDEOGRAPH-66F0 - - ⽒ U+2F52 KANGXI RADICAL CLAN - 氏 U+6C0F CJK UNIFIED IDEOGRAPH-6C0F - 氐 U+6C10 CJK UNIFIED IDEOGRAPH-6C10 - - 辶 U+8FB6 CJK UNIFIED IDEOGRAPH-8FB6 - 辶 U+FA66 CJK COMPATIBILITY IDEOGRAPH-FA66 - - ⿈ U+2FC8 KANGXI RADICAL YELLOW - 黃 U+9EC3 CJK UNIFIED IDEOGRAPH-9EC3 - 黄 U+9EC4 CJK UNIFIED IDEOGRAPH-9EC4 - */ + ⼖ U+2F16 KANGXI RADICAL HIDING ENCLOSURE + 匚 U+531A CJK UNIFIED IDEOGRAPH-531A + 匸 U+5338 CJK UNIFIED IDEOGRAPH-5338 + + ⼢ U+2F22 KANGXI RADICAL GO SLOWLY + 夂 U+5902 CJK UNIFIED IDEOGRAPH-5902 + 夊 U+590A CJK UNIFIED IDEOGRAPH-590A + ⽇ U+2F47 KANGXI RADICAL SUN + 日 U+65E5 CJK UNIFIED IDEOGRAPH-65E5 + 曰 U+66F0 CJK UNIFIED IDEOGRAPH-66F0 + + ⽒ U+2F52 KANGXI RADICAL CLAN + 氏 U+6C0F CJK UNIFIED IDEOGRAPH-6C0F + 氐 U+6C10 CJK UNIFIED IDEOGRAPH-6C10 + + 辶 U+8FB6 CJK UNIFIED IDEOGRAPH-8FB6 + 辶 U+FA66 CJK COMPATIBILITY IDEOGRAPH-FA66 + + ⿈ U+2FC8 KANGXI RADICAL YELLOW + 黃 U+9EC3 CJK UNIFIED IDEOGRAPH-9EC3 + 黄 U+9EC4 CJK UNIFIED IDEOGRAPH-9EC4 + */ static { buildConfusableRadicals(); load(); diff --git a/unicodetools/src/main/java/org/unicode/tools/IdsFileData.java b/unicodetools/src/main/java/org/unicode/tools/IdsFileData.java index d4169f3a0..de98567ac 100644 --- a/unicodetools/src/main/java/org/unicode/tools/IdsFileData.java +++ b/unicodetools/src/main/java/org/unicode/tools/IdsFileData.java @@ -1,14 +1,5 @@ package org.unicode.tools; -import java.util.List; -import java.util.Set; -import java.util.TreeMap; -import java.util.TreeSet; - -import org.unicode.cldr.draft.FileUtilities; -import org.unicode.text.utility.Settings; -import 
org.unicode.text.utility.Utility; - import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableList.Builder; import com.ibm.icu.dev.util.UnicodeMap; @@ -16,11 +7,20 @@ import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.ICUException; +import java.util.List; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; +import org.unicode.cldr.draft.FileUtilities; +import org.unicode.text.utility.Settings; +import org.unicode.text.utility.Utility; public class IdsFileData { public static final UnicodeMap cjkStrokeToExamples = new UnicodeMap(); - public static final Relation radToCjkRad = Relation.of(new TreeMap>(), TreeSet.class); - public static final Relation cjkRadToRad = Relation.of(new TreeMap>(), TreeSet.class); + public static final Relation radToCjkRad = + Relation.of(new TreeMap>(), TreeSet.class); + public static final Relation cjkRadToRad = + Relation.of(new TreeMap>(), TreeSet.class); public static final UnicodeMap cjkRadSupToIdeo = new UnicodeMap<>(); public static final UnicodeMap> TOTAL_STROKES = new UnicodeMap<>(); @@ -43,11 +43,11 @@ public class IdsFileData { TOTAL_STROKES.put(cp, ilb.build()); } TOTAL_STROKES.freeze(); - + for (String line : FileUtilities.in(Ids.class, "n3063StrokeExamples.txt")) { int hashPos = line.indexOf('#'); if (hashPos >= 0) { - line= line.substring(0, hashPos).trim(); + line = line.substring(0, hashPos).trim(); } if (line.isEmpty()) { continue; @@ -62,7 +62,7 @@ public class IdsFileData { for (String line : FileUtilities.in(Ids.class, "idsCjkRadicals.txt")) { int hashPos = line.indexOf('#'); if (hashPos >= 0) { - line= line.substring(0, hashPos).trim(); + line = line.substring(0, hashPos).trim(); } if (line.isEmpty()) { continue; @@ -93,4 +93,3 @@ public class IdsFileData { cjkRadSupToIdeo.freeze(); } } - diff --git a/unicodetools/src/main/java/org/unicode/tools/ListProps.java b/unicodetools/src/main/java/org/unicode/tools/ListProps.java 
index a715bb498..98657ebad 100644 --- a/unicodetools/src/main/java/org/unicode/tools/ListProps.java +++ b/unicodetools/src/main/java/org/unicode/tools/ListProps.java @@ -1,5 +1,13 @@ package org.unicode.tools; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableMultimap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Multimap; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.UnicodeSet; import java.io.File; import java.util.ArrayList; import java.util.Collection; @@ -7,149 +15,154 @@ import java.util.EnumSet; import java.util.LinkedHashSet; import java.util.Set; - -import org.unicode.props.UnicodeProperty; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.PropertyStatus; import org.unicode.props.PropertyStatus.PropertyScope; import org.unicode.props.PropertyType; import org.unicode.props.UcdProperty; +import org.unicode.props.UnicodeProperty; import org.unicode.props.ValueCardinality; import org.unicode.text.utility.Settings; -import com.google.common.base.Splitter; -import com.google.common.collect.ImmutableMultimap; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Multimap; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.UnicodeSet; - public class ListProps { private static final String BIN_PROPS = Settings.Output.BIN_DIR; - private static final UcdProperty DEBUG_LIST_VALUES = null ; // UcdProperty.Confusable_MA; + private static final UcdProperty DEBUG_LIST_VALUES = null; // UcdProperty.Confusable_MA; private static String ONLY_PROP = null; // "Emoji"; static final boolean ONLY_JSP = true; - public static final Set SKIP_JSP_STATUS = ImmutableSet.of( - PropertyStatus.Deprecated, - PropertyStatus.Obsolete, - PropertyStatus.Stabilized, - 
PropertyStatus.Contributory, - PropertyStatus.Internal - ); + public static final Set SKIP_JSP_STATUS = + ImmutableSet.of( + PropertyStatus.Deprecated, + PropertyStatus.Obsolete, + PropertyStatus.Stabilized, + PropertyStatus.Contributory, + PropertyStatus.Internal); public static void main(String[] args) { IndexUnicodeProperties latest = IndexUnicodeProperties.make(); System.out.println(ListProps.class.getName() + ": UCD: " + latest.getUcdVersion()); new File(BIN_PROPS).mkdirs(); -// if (true) { -// Set sc = EnumSet.copyOf(latest.loadEnum(UcdProperty.Script, Script_Values.class).values()); -// sc.remove(Script_Values.Inherited); -// sc.remove(Script_Values.Common); -// sc.remove(Script_Values.Unknown); -// System.out.println(sc.size()); -// return; -// } -// if (true) { -// UnicodeSet ep = latest.loadEnum(UcdProperty.Extended_Pictographic, UcdPropertyValues.Binary.class).getSet(Binary.Yes); -// UnicodeSet em = latest.loadEnum(UcdProperty.Emoji, UcdPropertyValues.Binary.class).getSet(Binary.Yes); -// UnicodeSet combined = new UnicodeSet(ep).addAll(em).freeze(); -// PropertyLister pl = new PropertyLister(latest); -// System.out.println( -// pl.listSet(combined, -// UcdProperty.Extended_Pictographic.toString(), -// new StringBuilder())); -// return; -// } + // if (true) { + // Set sc = EnumSet.copyOf(latest.loadEnum(UcdProperty.Script, + // Script_Values.class).values()); + // sc.remove(Script_Values.Inherited); + // sc.remove(Script_Values.Common); + // sc.remove(Script_Values.Unknown); + // System.out.println(sc.size()); + // return; + // } + // if (true) { + // UnicodeSet ep = latest.loadEnum(UcdProperty.Extended_Pictographic, + // UcdPropertyValues.Binary.class).getSet(Binary.Yes); + // UnicodeSet em = latest.loadEnum(UcdProperty.Emoji, + // UcdPropertyValues.Binary.class).getSet(Binary.Yes); + // UnicodeSet combined = new UnicodeSet(ep).addAll(em).freeze(); + // PropertyLister pl = new PropertyLister(latest); + // System.out.println( + // pl.listSet(combined, + 
// UcdProperty.Extended_Pictographic.toString(), + // new StringBuilder())); + // return; + // } PropertyType lastType = null; Set skipped = new LinkedHashSet<>(); Set failures = new LinkedHashSet<>(); Set unknownScope = new LinkedHashSet<>(); main: - for (UcdProperty item : UcdProperty.values()) { - String propName = item.toString(); - if (ONLY_PROP != null && !propName.contains(ONLY_PROP)) { - continue; - } - PropertyType type = item.getType(); - ValueCardinality cardinality = item.getCardinality(); - if (type != lastType) { - //System.out.println("\n" + type + "\n"); - lastType = type; - } - EnumSet status = PropertyStatus.getPropertyStatusSet(item); - - UnicodeMap map = latest.load(item); - Set values = map.values(); - - PropertyScope scope = PropertyStatus.getScope(propName); - String itemInfo = item - + "\tType:\t" + type - + "\tStatus:\t"+ CollectionUtilities.join(status, ", ") - + "\tCard:\t" + cardinality - + "\tDefVal:\t" + IndexUnicodeProperties.getDefaultValue(item) - + "\tScope:\t" + scope - + "\tOrigin:\t" + PropertyStatus.getOrigin(propName) - + "\tValues:\t" + clip(values) - ; - if (scope == PropertyScope.Unknown) { - unknownScope.add(propName); - } - if (item == DEBUG_LIST_VALUES) { - for (String value : map.values()) { - UnicodeSet uset = map.getSet(value); - System.out.println(value + "\t" + Utility.hex(value) + "\t" + uset.toPattern(false)); - } + for (UcdProperty item : UcdProperty.values()) { + String propName = item.toString(); + if (ONLY_PROP != null && !propName.contains(ONLY_PROP)) { + continue; + } + PropertyType type = item.getType(); + ValueCardinality cardinality = item.getCardinality(); + if (type != lastType) { + // System.out.println("\n" + type + "\n"); + lastType = type; + } + EnumSet status = PropertyStatus.getPropertyStatusSet(item); + + UnicodeMap map = latest.load(item); + Set values = map.values(); + + PropertyScope scope = PropertyStatus.getScope(propName); + String itemInfo = + item + + "\tType:\t" + + type + + 
"\tStatus:\t" + + CollectionUtilities.join(status, ", ") + + "\tCard:\t" + + cardinality + + "\tDefVal:\t" + + IndexUnicodeProperties.getDefaultValue(item) + + "\tScope:\t" + + scope + + "\tOrigin:\t" + + PropertyStatus.getOrigin(propName) + + "\tValues:\t" + + clip(values); + if (scope == PropertyScope.Unknown) { + unknownScope.add(propName); + } + if (item == DEBUG_LIST_VALUES) { + for (String value : map.values()) { + UnicodeSet uset = map.getSet(value); + System.out.println( + value + "\t" + Utility.hex(value) + "\t" + uset.toPattern(false)); } + } - if (ONLY_JSP) { - if (!Collections.disjoint(status, SKIP_JSP_STATUS)) { - skipped.add(itemInfo); - continue main; - } - if (propName.startsWith("k")) { - switch (type) { + if (ONLY_JSP) { + if (!Collections.disjoint(status, SKIP_JSP_STATUS)) { + skipped.add(itemInfo); + continue main; + } + if (propName.startsWith("k")) { + switch (type) { case Miscellaneous: case String: if (item == UcdProperty.kSimplifiedVariant - || item == UcdProperty.kTraditionalVariant) { + || item == UcdProperty.kTraditionalVariant) { break; } skipped.add(itemInfo); continue main; - default: break; - } + default: + break; } } - System.out.println("➕\t" + itemInfo); - try { - - UnicodeProperty uprop = latest.getProperty(propName); - - Set enums = item.getEnums(); - if (enums != null) { - Set flatValues = flattenValues(values); - Set enumStrings = getStrings(enums); - Collection exceptions = propExceptions.get(propName); - if (exceptions != null) { - enumStrings.removeAll(exceptions); - } - if (!enumStrings.equals(flatValues)) { - System.out.println("\t" + "≠ VALUES!!!\t" + showDiff("enums", enumStrings, "values", flatValues)); - failures.add(propName); - } - for (String pval : uprop.getAvailableValues()) { - uprop.getValueAliases(pval); - } + } + System.out.println("➕\t" + itemInfo); + try { + + UnicodeProperty uprop = latest.getProperty(propName); + + Set enums = item.getEnums(); + if (enums != null) { + Set flatValues = 
flattenValues(values); + Set enumStrings = getStrings(enums); + Collection exceptions = propExceptions.get(propName); + if (exceptions != null) { + enumStrings.removeAll(exceptions); + } + if (!enumStrings.equals(flatValues)) { + System.out.println( + "\t" + + "≠ VALUES!!!\t" + + showDiff("enums", enumStrings, "values", flatValues)); + failures.add(propName); + } + for (String pval : uprop.getAvailableValues()) { + uprop.getValueAliases(pval); } - latest.internalStoreCachedMap(BIN_PROPS, item, map); - } catch (Exception e) { - e.printStackTrace(); } + latest.internalStoreCachedMap(BIN_PROPS, item, map); + } catch (Exception e) { + e.printStackTrace(); } + } for (String skip : skipped) { System.out.println("➖\t" + skip); } @@ -171,26 +184,42 @@ private static Set flattenValues(Set values) { return result; } - static Multimap propExceptions = ImmutableMultimap.builder() - .putAll("Script", "Katakana_Or_Hiragana, Japanese, Korean, Han_with_Bopomofo, Math_Symbols, Emoji_Symbols, Other_Symbols, Unwritten".split(", ")) - .putAll("Script_Extensions", "Katakana_Or_Hiragana, Japanese, Korean, Han_with_Bopomofo, Math_Symbols, Emoji_Symbols, Other_Symbols, Unwritten".split(", ")) - .putAll("Canonical_Combining_Class", "CCC133, Attached_Below_Left".split(", ")) - .putAll("General_Category", "Other, Letter, Cased_Letter, Mark, Number, Punctuation, Symbol, Separator".split(", ")) - .putAll("Identifier_Type", "Aspirational".split(", ")) - .putAll("Word_Break", "E_Base_GAZ, Glue_After_Zwj, E_Base, E_Modifier".split(", ")) - .putAll("Grapheme_Cluster_Break", "E_Base_GAZ, Glue_After_Zwj, E_Base, E_Modifier".split(", ")) - .build(); + static Multimap propExceptions = + ImmutableMultimap.builder() + .putAll( + "Script", + "Katakana_Or_Hiragana, Japanese, Korean, Han_with_Bopomofo, Math_Symbols, Emoji_Symbols, Other_Symbols, Unwritten" + .split(", ")) + .putAll( + "Script_Extensions", + "Katakana_Or_Hiragana, Japanese, Korean, Han_with_Bopomofo, Math_Symbols, Emoji_Symbols, 
Other_Symbols, Unwritten" + .split(", ")) + .putAll("Canonical_Combining_Class", "CCC133, Attached_Below_Left".split(", ")) + .putAll( + "General_Category", + "Other, Letter, Cased_Letter, Mark, Number, Punctuation, Symbol, Separator" + .split(", ")) + .putAll("Identifier_Type", "Aspirational".split(", ")) + .putAll( + "Word_Break", + "E_Base_GAZ, Glue_After_Zwj, E_Base, E_Modifier".split(", ")) + .putAll( + "Grapheme_Cluster_Break", + "E_Base_GAZ, Glue_After_Zwj, E_Base, E_Modifier".split(", ")) + .build(); private static String showDiff(String as, Set a, String bs, Set b) { // TODO Auto-generated method stub - return as + " - " + bs + ": " + diff(a,b) + "; " + bs + " - " + as + ": " + diff(b,a); + return as + " - " + bs + ": " + diff(a, b) + "; " + bs + " - " + as + ": " + diff(b, a); } + private static Set diff(Collection a, Collection b) { Set result = new LinkedHashSet(); result.addAll(a); result.removeAll(b); return result; } + private static Set getStrings(Collection enums) { Set result = new LinkedHashSet(); for (Object item : enums) { @@ -198,7 +227,10 @@ private static Set getStrings(Collection enums) { } return result; } + private static String clip(Collection availableValues) { - return availableValues.size() > 24 ? new ArrayList(availableValues).subList(0, 23) + ", …" : availableValues.toString(); + return availableValues.size() > 24 + ? new ArrayList(availableValues).subList(0, 23) + ", …" + : availableValues.toString(); } } diff --git a/unicodetools/src/main/java/org/unicode/tools/MultiComparator.java b/unicodetools/src/main/java/org/unicode/tools/MultiComparator.java index 93e2b47bb..052de478d 100644 --- a/unicodetools/src/main/java/org/unicode/tools/MultiComparator.java +++ b/unicodetools/src/main/java/org/unicode/tools/MultiComparator.java @@ -6,12 +6,12 @@ public class MultiComparator implements Comparator { private Comparator[] comparators; @SafeVarargs - public > MultiComparator (U... comparators) { + public > MultiComparator(U... 
comparators) { this.comparators = comparators; } /* Lexigraphic compare. Returns the first difference - * @return zero if equal. Otherwise +/- (i+1) + * @return zero if equal. Otherwise +/- (i+1) * where i is the index of the first comparator finding a difference * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) */ diff --git a/unicodetools/src/main/java/org/unicode/tools/NormalizeForMatch.java b/unicodetools/src/main/java/org/unicode/tools/NormalizeForMatch.java index 690451668..83ee4cd8a 100644 --- a/unicodetools/src/main/java/org/unicode/tools/NormalizeForMatch.java +++ b/unicodetools/src/main/java/org/unicode/tools/NormalizeForMatch.java @@ -1,28 +1,26 @@ package org.unicode.tools; +import com.google.common.base.Splitter; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.util.ICUUncheckedIOException; import java.io.BufferedReader; import java.io.IOException; import java.util.List; import java.util.Locale; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.text.utility.Utility; -import com.google.common.base.Splitter; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.util.ICUUncheckedIOException; - public class NormalizeForMatch { public enum SpecialReason { - digraph, - final_form, - fixed_superscript, + digraph, + final_form, + fixed_superscript, missing_sequence, missing_enclosed, missing_case, - missing_nfkc, - fixed_nfkc, - radical, + missing_nfkc, + fixed_nfkc, + radical, retain_cf, retain_hangul, retain_tags, @@ -45,9 +43,10 @@ public enum SpecialReason { nfkccf_wide, nfkccf_medial, nfkccf_small, - fix_slash, - add_separator, + fix_slash, + add_separator, recursion; + static final SpecialReason forString(String s) { s = s.toLowerCase(Locale.ROOT); if (s.equals("retain_superscript")) { @@ -71,7 +70,8 @@ public UnicodeMap getSourceToReason() { return sourceToReason; } - public NormalizeForMatch(UnicodeMap sourceToTarget2, UnicodeMap sourceToReason2) { + public NormalizeForMatch( + UnicodeMap 
sourceToTarget2, UnicodeMap sourceToReason2) { sourceToTarget = sourceToTarget2.freeze(); sourceToReason = sourceToReason2.freeze(); } @@ -89,8 +89,9 @@ public static NormalizeForMatch load(String directory, String file, boolean acce UnicodeMap sourceToTarget = new UnicodeMap<>(); UnicodeMap sourceToReason = new UnicodeMap<>(); NormalizeForMatch.SpecialReason overrideReason = null; - try (BufferedReader in = directory == null - ? FileUtilities.openFile(NormalizeForMatch.class, file) + try (BufferedReader in = + directory == null + ? FileUtilities.openFile(NormalizeForMatch.class, file) : FileUtilities.openFile(directory, file)) { while (true) { String line = in.readLine(); @@ -113,7 +114,10 @@ public static NormalizeForMatch load(String directory, String file, boolean acce boolean debug = true; } String target = Utility.fromHex(parts.get(1), acceptRawChars); - SpecialReason reason = parts.size() > 2 ? NormalizeForMatch.SpecialReason.forString(parts.get(2)) : overrideReason; + SpecialReason reason = + parts.size() > 2 + ? NormalizeForMatch.SpecialReason.forString(parts.get(2)) + : overrideReason; sourceToTarget.put(source, target); sourceToReason.put(source, reason); @@ -133,11 +137,14 @@ public static NormalizeForMatch load(String directory, String file, boolean acce // + (target == null ? 
"" : "; " + Utility.hex(target, " ") // + "; " + latest.getName(target, " + "))); // } - // UnicodeSet newCharsOnly = latest.loadEnum(UcdProperty.Age, Age_Values.class).getSet(Age_Values.V9_0); + // UnicodeSet newCharsOnly = latest.loadEnum(UcdProperty.Age, + // Age_Values.class).getSet(Age_Values.V9_0); // UnicodeMap NFKC_Casefold = latest.load(UcdProperty.NFKC_Casefold); - // UnicodeMap gc = latest.loadEnum(UcdProperty.General_Category, UcdPropertyValues.General_Category_Values.class); + // UnicodeMap gc = latest.loadEnum(UcdProperty.General_Category, + // UcdPropertyValues.General_Category_Values.class); - // for (General_Category_Values gcv : UcdPropertyValues.General_Category_Values.values()) { + // for (General_Category_Values gcv : UcdPropertyValues.General_Category_Values.values()) + // { // boolean first = true; // for (String s : newCharsOnly) { // if (gc.get(s) != gcv) { @@ -152,7 +159,7 @@ public static NormalizeForMatch load(String directory, String file, boolean acce // first = false; // } // String target = curated.getSourceToTarget().get(s); - // //String target = NFKC_Casefold.get(s); + // //String target = NFKC_Casefold.get(s); // System.out.println(Utility.hex(s) + "; " + latest.getName(s, " + ") // + (target == null ? 
"" : "; " + Utility.hex(target, " ") // + "; " + latest.getName(target, " + "))); diff --git a/unicodetools/src/main/java/org/unicode/tools/NormalizeForMatchDiff.java b/unicodetools/src/main/java/org/unicode/tools/NormalizeForMatchDiff.java index 1e9466fbf..8864b7993 100644 --- a/unicodetools/src/main/java/org/unicode/tools/NormalizeForMatchDiff.java +++ b/unicodetools/src/main/java/org/unicode/tools/NormalizeForMatchDiff.java @@ -1,59 +1,70 @@ package org.unicode.tools; -import org.unicode.text.utility.Settings; -import org.unicode.text.utility.Utility; - import com.google.common.base.Objects; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.UnicodeSet; +import org.unicode.text.utility.Settings; +import org.unicode.text.utility.Utility; public class NormalizeForMatchDiff { public static void main(String[] args) { - NormalizeForMatch production = NormalizeForMatch.load( - "/Users/markdavis/Google Drive/workspace/Generated/n4m-old/","xnfkccf_curated.txt"); - NormalizeForMatch sourceDirectory = NormalizeForMatch.load( - null,"XNFKCCF-Curated.txt"); - NormalizeForMatch dir90 = NormalizeForMatch.load( - Settings.UnicodeTools.DATA_DIR + "n4m/9.0.0/", "XNFKCCF-Curated.txt"); - NormalizeForMatch gen = NormalizeForMatch.load( - "/Users/markdavis/Google Drive/workspace/Generated/n4m/", "XNFKCCF-Curated.txt"); - - UnicodeSet keys = new UnicodeSet() - .addAll(production.getSourceToTarget()) - .addAll(sourceDirectory.getSourceToTarget()) - .addAll(dir90.getSourceToTarget()) - .addAll(gen.getSourceToTarget()) - ; - + NormalizeForMatch production = + NormalizeForMatch.load( + "/Users/markdavis/Google Drive/workspace/Generated/n4m-old/", + "xnfkccf_curated.txt"); + NormalizeForMatch sourceDirectory = NormalizeForMatch.load(null, "XNFKCCF-Curated.txt"); + NormalizeForMatch dir90 = + NormalizeForMatch.load( + Settings.UnicodeTools.DATA_DIR + "n4m/9.0.0/", "XNFKCCF-Curated.txt"); + NormalizeForMatch gen = + NormalizeForMatch.load( + "/Users/markdavis/Google 
Drive/workspace/Generated/n4m/", + "XNFKCCF-Curated.txt"); + + UnicodeSet keys = + new UnicodeSet() + .addAll(production.getSourceToTarget()) + .addAll(sourceDirectory.getSourceToTarget()) + .addAll(dir90.getSourceToTarget()) + .addAll(gen.getSourceToTarget()); + System.out.println( Utility.hex("Key") - + "\t" + "pValue" - + "\t" + "sValue" - + "\t" + "dValue9" - + "\t" + "Age" - + "\t" + "Name" - ); + + "\t" + + "pValue" + + "\t" + + "sValue" + + "\t" + + "dValue9" + + "\t" + + "Age" + + "\t" + + "Name"); for (String key : keys) { String pValue = production.getSourceToTarget().get(key); String sValue = sourceDirectory.getSourceToTarget().get(key); String dValue9 = dir90.getSourceToTarget().get(key); String gValue = gen.getSourceToTarget().get(key); - if (Objects.equal(pValue, sValue) + if (Objects.equal(pValue, sValue) && Objects.equal(pValue, dValue9) - && Objects.equal(pValue, gValue) - ) { + && Objects.equal(pValue, gValue)) { continue; } System.out.println( Utility.hex(key) - + "\t" + pValue - + "\t" + sValue - + "\t" + dValue9 - + "\t" + gValue - + "\t" + UCharacter.getAge(key.codePointAt(0)) - + "\t" + UCharacter.getName(key.codePointAt(0)) - ); + + "\t" + + pValue + + "\t" + + sValue + + "\t" + + dValue9 + + "\t" + + gValue + + "\t" + + UCharacter.getAge(key.codePointAt(0)) + + "\t" + + UCharacter.getName(key.codePointAt(0))); } } } diff --git a/unicodetools/src/main/java/org/unicode/tools/Normalizer3.java b/unicodetools/src/main/java/org/unicode/tools/Normalizer3.java index b896b4339..42d7fcd28 100644 --- a/unicodetools/src/main/java/org/unicode/tools/Normalizer3.java +++ b/unicodetools/src/main/java/org/unicode/tools/Normalizer3.java @@ -1,29 +1,32 @@ package org.unicode.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.Transform; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.text.UCD.Default; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.Transform; - 
public abstract class Normalizer3 implements Transform { - private static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(Default.ucdVersion()); + private static final IndexUnicodeProperties iup = + IndexUnicodeProperties.make(Default.ucdVersion()); private static final UnicodeMap NFKC_Casefold = iup.load(UcdProperty.NFKC_Casefold); public String normalize(String source) { return transform(source); } - public static final Normalizer3 NFKCCF = new Normalizer3() { - @Override - public String transform(String source) { - return Default.nfc().normalize(NFKC_Casefold.transform(source)); - } - }; - public static final Normalizer3 NFKC = new Normalizer3() { - @Override - public String transform(String source) { - return Default.nfkc().normalize(source); - } - }; -} \ No newline at end of file + + public static final Normalizer3 NFKCCF = + new Normalizer3() { + @Override + public String transform(String source) { + return Default.nfc().normalize(NFKC_Casefold.transform(source)); + } + }; + public static final Normalizer3 NFKC = + new Normalizer3() { + @Override + public String transform(String source) { + return Default.nfkc().normalize(source); + } + }; +} diff --git a/unicodetools/src/main/java/org/unicode/tools/Quick.java b/unicodetools/src/main/java/org/unicode/tools/Quick.java index d69b38b34..dda018140 100644 --- a/unicodetools/src/main/java/org/unicode/tools/Quick.java +++ b/unicodetools/src/main/java/org/unicode/tools/Quick.java @@ -1,8 +1,9 @@ package org.unicode.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; import java.util.EnumSet; import java.util.Set; - import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.PropertyNames.Named; import org.unicode.props.UcdProperty; @@ -13,24 +14,26 @@ import org.unicode.text.utility.Utility; import org.unicode.tools.emoji.EmojiData; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; - public class Quick { static final 
IndexUnicodeProperties iup = IndexUnicodeProperties.make(Settings.latestVersion); static final UnicodeMap names = iup.load(UcdProperty.Name); - static final UnicodeMap blocks = iup.loadEnum(UcdProperty.Block, Block_Values.class); + static final UnicodeMap blocks = + iup.loadEnum(UcdProperty.Block, Block_Values.class); static final IndexUnicodeProperties iupOld = IndexUnicodeProperties.make(Settings.lastVersion); static final EmojiData emojiData = EmojiData.EMOJI_DATA; - static final UnicodeMap gencat = iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); + static final UnicodeMap gencat = + iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); static final UnicodeSet Cn = gencat.getSet(General_Category_Values.Unassigned); - + public static void main(String[] args) { showValue(0xE0041, UcdProperty.Word_Break, UcdPropertyValues.Word_Break_Values.class); showValue(0xE0041, UcdProperty.Line_Break, UcdPropertyValues.Line_Break_Values.class); - showValue(0xE0041, UcdProperty.Grapheme_Cluster_Break, UcdPropertyValues.Grapheme_Cluster_Break_Values.class); + showValue( + 0xE0041, + UcdProperty.Grapheme_Cluster_Break, + UcdPropertyValues.Grapheme_Cluster_Break_Values.class); if (true) return; - + Set emojiBlocks = EnumSet.noneOf(Block_Values.class); UnicodeSet emoji = emojiData.getSingletonsWithoutDefectives(); for (String s : new UnicodeSet(emoji).addAll(ExtendedPictographic.GLUE_AFTER_ZWJ)) { @@ -41,14 +44,19 @@ public static void main(String[] args) { System.out.println(ExtendedPictographic.HEADER); for (Block_Values block : emojiBlocks) { System.out.println("# " + block); - + UnicodeSet blockSet = blocks.getSet(block); UnicodeSet emojiInBlock = new UnicodeSet(blockSet).retainAll(emoji); - UnicodeSet gazInBlock = new UnicodeSet(blockSet).retainAll(ExtendedPictographic.GLUE_AFTER_ZWJ); + UnicodeSet gazInBlock = + new UnicodeSet(blockSet).retainAll(ExtendedPictographic.GLUE_AFTER_ZWJ); UnicodeSet gazInBlockNoCn = new 
UnicodeSet(gazInBlock).removeAll(Cn); UnicodeSet gazInBlockCn = new UnicodeSet(gazInBlock).retainAll(Cn); UnicodeSet cnInBlock = new UnicodeSet(blockSet).retainAll(Cn).removeAll(gazInBlock); - UnicodeSet otherInBlock = new UnicodeSet(blockSet).removeAll(emojiInBlock).removeAll(cnInBlock).removeAll(gazInBlock); + UnicodeSet otherInBlock = + new UnicodeSet(blockSet) + .removeAll(emojiInBlock) + .removeAll(cnInBlock) + .removeAll(gazInBlock); showNonEmpty("emoji", emojiInBlock, true); showNonEmpty("gaz", gazInBlock, true); @@ -71,31 +79,50 @@ public static void main(String[] args) { private static void showValue(int cp, final UcdProperty prop, final Class classIn) { Named value = (Named) iup.loadEnum(prop, classIn).get(cp); - System.out.println(Utility.hex(cp) + " " + names.get(cp) - + " → " + prop.getShortName() + "=" + value.getShortName() + "\t" + prop + "=" + value); + System.out.println( + Utility.hex(cp) + + " " + + names.get(cp) + + " → " + + prop.getShortName() + + "=" + + value.getShortName() + + "\t" + + prop + + "=" + + value); } private static void showRanges(UnicodeSet gazInBlock, boolean includeSetName) { for (UnicodeSet.EntryRange range : gazInBlock.ranges()) { System.out.println( printRange(range.codepoint, range.codepointEnd) - + "\t; Glue_After_Zwj" - + " #\t" - + (includeSetName ? - new UnicodeSet(range.codepoint, range.codepointEnd).toPattern(false) - + "\t" + getNames(range.codepoint, range.codepointEnd) : "GC=Cn") - ); + + "\t; Glue_After_Zwj" + + " #\t" + + (includeSetName + ? new UnicodeSet(range.codepoint, range.codepointEnd) + .toPattern(false) + + "\t" + + getNames(range.codepoint, range.codepointEnd) + : "GC=Cn")); } } private static void showNonEmpty(String title, UnicodeSet emojiInBlock, boolean includeUS) { if (!emojiInBlock.isEmpty()) { - System.out.println("# " + title + "=" + emojiInBlock.size() + (includeUS ? 
"\t: " + emojiInBlock.toPattern(false) : "")); + System.out.println( + "# " + + title + + "=" + + emojiInBlock.size() + + (includeUS ? "\t: " + emojiInBlock.toPattern(false) : "")); } } - // UnicodeMap linebreak = iup.loadEnum(UcdProperty.Line_Break, Line_Break_Values.class); - // UnicodeMap linebreakOld = iupOld.loadEnum(UcdProperty.Line_Break, Line_Break_Values.class); + // UnicodeMap linebreak = iup.loadEnum(UcdProperty.Line_Break, + // Line_Break_Values.class); + // UnicodeMap linebreakOld = iupOld.loadEnum(UcdProperty.Line_Break, + // Line_Break_Values.class); // UnicodeSet ID = linebreak.getSet(Line_Break_Values.Ideographic); // UnicodeSet IdOld = linebreakOld.getSet(Line_Break_Values.Ideographic); // UnicodeSet IdCn = new UnicodeSet(ID) @@ -131,7 +158,6 @@ private static String getName(int codepoint) { } private static String printRange(int start, int end) { - return "U+" + Utility.hex(start) - + (start == end ? "" : "..U+" + Utility.hex(end)); + return "U+" + Utility.hex(start) + (start == end ? 
"" : "..U+" + Utility.hex(end)); } } diff --git a/unicodetools/src/main/java/org/unicode/tools/RadicalData.java b/unicodetools/src/main/java/org/unicode/tools/RadicalData.java index d6888b5dd..4fc3de88a 100644 --- a/unicodetools/src/main/java/org/unicode/tools/RadicalData.java +++ b/unicodetools/src/main/java/org/unicode/tools/RadicalData.java @@ -1,5 +1,14 @@ package org.unicode.tools; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.impl.Row; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ICUUncheckedIOException; +import com.ibm.icu.util.Output; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; @@ -12,25 +21,15 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.util.ChainedMap.M3; import org.unicode.cldr.util.Tabber; import org.unicode.tools.Ids.Nameslist; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.impl.Row; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ICUUncheckedIOException; -import com.ibm.icu.util.Output; - class RadicalData { private final String radical; private List strokeCounts; - private final Relation codeToReason = Relation.of(new TreeMap>(Ids.UNIHAN), LinkedHashSet.class); + private final Relation codeToReason = + Relation.of(new TreeMap>(Ids.UNIHAN), LinkedHashSet.class); public RadicalData(String _radical) { radical = _radical; @@ -39,9 +38,11 @@ public RadicalData(String _radical) { public Set getChars() { return codeToReason.keySet(); } + public List getStrokeCounts() { return strokeCounts; } + public void addItem(String string, String reason) { codeToReason.put(string, reason); } @@ -50,32 +51,41 @@ public 
void print(Appendable out) { if (strokeCounts == null) { throw new IllegalArgumentException("Must call finish first"); } - Tabber tabber = new Tabber.MonoTabber() - .add(8, Tabber.LEFT) - .add(5, Tabber.RIGHT) - .add(9, Tabber.RIGHT) - .add(50, Tabber.LEFT) - .add(60, Tabber.LEFT) - ; + Tabber tabber = + new Tabber.MonoTabber() + .add(8, Tabber.LEFT) + .add(5, Tabber.RIGHT) + .add(9, Tabber.RIGHT) + .add(50, Tabber.LEFT) + .add(60, Tabber.LEFT); try { for (Entry> entry : codeToReason.keyValuesSet()) { String codePoint = entry.getKey(); final Set reasons = entry.getValue(); - //multipleRadicals.put(codePoint, key2); + // multipleRadicals.put(codePoint, key2); List strokes = Ids.kTotalStrokes.get(codePoint); List strokes2 = Ids.RADICAL.contains(codePoint) ? getStrokeCounts() : null; out.append( tabber.process( Utility.hex(codePoint) - + " ;\t" + radical - + " ;\t" + (strokes != null ? CollectionUtilities.join(strokes, "/") - : strokes2 != null ? CollectionUtilities.join(strokes2, "/") - : "?") - + "\t # (" + codePoint + ") " + UCharacter.getName(codePoint, ", ") - + " ;\t" + (reasons == null ? "" : CollectionUtilities.join(reasons, ", ")) - + "\n") - ); + + " ;\t" + + radical + + " ;\t" + + (strokes != null + ? CollectionUtilities.join(strokes, "/") + : strokes2 != null + ? CollectionUtilities.join(strokes2, "/") + : "?") + + "\t # (" + + codePoint + + ") " + + UCharacter.getName(codePoint, ", ") + + " ;\t" + + (reasons == null + ? 
"" + : CollectionUtilities.join(reasons, ", ")) + + "\n")); } out.append("\n"); } catch (IOException e) { @@ -95,7 +105,8 @@ private void addItems(Set sourceItems, String reason) { } } - private void addRadicals(int intRadical, Map radicalSource, String reason) { + private void addRadicals( + int intRadical, Map radicalSource, String reason) { UnicodeSet us = radicalSource.get(intRadical); if (us != null) { codeToReason.putAll(us.addAllTo(new HashSet()), reason); @@ -115,13 +126,15 @@ public void finish() { strokeCounts = Collections.unmodifiableList(new ArrayList<>(_strokeCounts)); RadicalDataCache.put(radical, this); } + static Map RadicalDataCache = new HashMap<>(); + static Set> entrySet() { return RadicalDataCache.entrySet(); } - + static { - Map,Set>> sorted = new TreeMap<>(); + Map, Set>> sorted = new TreeMap<>(); for (Entry> entry : Ids.radToUnicode.keyValuesSet()) { final String key = entry.getKey(); final double clean = Ids.cleanRadical(key); @@ -129,26 +142,30 @@ static Set> entrySet() { } int count = 0; Set adobeItems = new TreeSet<>(); - //Set sortedChars = new TreeSet<>(UNIHAN); + // Set sortedChars = new TreeSet<>(UNIHAN); UnicodeSet missingCjkRadicals = new UnicodeSet(Ids.CJK_Radicals_Supplement_BLOCK); Output cjkRadValue = new Output(); - Relation multipleRadicals = Relation.of(new TreeMap>(Ids.UNIHAN), TreeSet.class); + Relation multipleRadicals = + Relation.of(new TreeMap>(Ids.UNIHAN), TreeSet.class); for (Entry, Set>> entry : sorted.entrySet()) { ++count; - //Relation reasonMap = Relation.of(new HashMap(), LinkedHashSet.class); + // Relation reasonMap = Relation.of(new HashMap(), + // LinkedHashSet.class); final Double key = entry.getKey(); final R2, Set> rad2 = entry.getValue(); final Set raw = rad2.get1(); double doubleRadical = key.doubleValue(); - int intRadical = (int)doubleRadical; + int intRadical = (int) doubleRadical; final boolean alt = intRadical != doubleRadical; String key2 = intRadical + (alt ? 
"'" : ""); RadicalData radicalData = new RadicalData(key2); - final Set cjkRad = alt ? Collections.EMPTY_SET : IdsFileData.radToCjkRad.get(intRadical); + final Set cjkRad = + alt ? Collections.EMPTY_SET : IdsFileData.radToCjkRad.get(intRadical); UnicodeSet RSUnicode = Ids.USTROKE.get(intRadical, alt); - M3 adobe = Ids.ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET.get(intRadical); + M3 adobe = + Ids.ADOBE_RADICAL_STROKESINRADICAL_REMAINDER_USET.get(intRadical); adobeItems.clear(); if (!alt) { for (Entry> entry2 : adobe) { @@ -166,8 +183,10 @@ static Set> entrySet() { radicalData.addItems(raw, "CJKRadicals.txt"); radicalData.addItems(RSUnicode, "kRSUnicode"); - Ids.radToUnicode.get(intRadical+"'"); - boolean hasAltRadical = Ids.USTROKE.get(intRadical, true) != null || Ids.radToUnicode.get(intRadical+"'") != null; + Ids.radToUnicode.get(intRadical + "'"); + boolean hasAltRadical = + Ids.USTROKE.get(intRadical, true) != null + || Ids.radToUnicode.get(intRadical + "'") != null; if (!alt && !hasAltRadical) { radicalData.addRadicals(intRadical, Ids.kRSKangXiRadicals, "kRSKangXi"); @@ -182,7 +201,7 @@ static Set> entrySet() { } missingCjkRadicals.removeAll(radicalData.getChars()); - //Wikiwand.check(sortedChars); + // Wikiwand.check(sortedChars); String extra = Nameslist.check(key2, radicalData.getChars(), cjkRadValue); if (extra != null) { radicalData.addItem(extra, "Nameslist"); @@ -192,13 +211,17 @@ static Set> entrySet() { // for (String codePoint : sortedChars) { // multipleRadicals.put(codePoint, key2); - // final Set reasons = reasonMap.get(codePoint.codePointAt(0)); + // final Set reasons = + // reasonMap.get(codePoint.codePointAt(0)); // List strokes = kTotalStrokes.get(codePoint); // out.println(Utility.hex(codePoint) // + " ;\t" + key2 - // + " ; " + (strokes == null ? "?" : CollectionUtilities.join(strokes, ", ")) - // + " \t# (" + codePoint + ") " + UCharacter.getName(codePoint, ", ") - // + " ;\t" + (reasons == null ? 
"" : CollectionUtilities.join(reasons, ", ")) + // + " ; " + (strokes == null ? "?" : + // CollectionUtilities.join(strokes, ", ")) + // + " \t# (" + codePoint + ") " + + // UCharacter.getName(codePoint, ", ") + // + " ;\t" + (reasons == null ? "" : + // CollectionUtilities.join(reasons, ", ")) // ); // } // out.println(); @@ -210,8 +233,7 @@ static Set> entrySet() { final Set radicals = entry.getValue(); if (radicals.size() > 1) { System.out.println(codepoint + " ??? " + radicals); - } + } } } - -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/tools/RadicalEnum.java b/unicodetools/src/main/java/org/unicode/tools/RadicalEnum.java index 4848659fd..fcf14b5ff 100644 --- a/unicodetools/src/main/java/org/unicode/tools/RadicalEnum.java +++ b/unicodetools/src/main/java/org/unicode/tools/RadicalEnum.java @@ -241,22 +241,25 @@ public enum RadicalEnum { R213, R213a, R214; + @Override public String toString() { String result = name(); if (result.endsWith("a")) { - return result.substring(1, result.length()-1) + "'"; + return result.substring(1, result.length() - 1) + "'"; } else { return result.substring(1, result.length()); } } + public static RadicalEnum fromString(String source) { if (source.endsWith("'")) { - return valueOf("R" + source.substring(0, source.length()-1) + "a"); + return valueOf("R" + source.substring(0, source.length() - 1) + "a"); } else { return valueOf("R" + source); } } + public static RadicalEnum fromInt(int source) { return valueOf("R" + source); } diff --git a/unicodetools/src/main/java/org/unicode/tools/RadicalStroke.java b/unicodetools/src/main/java/org/unicode/tools/RadicalStroke.java index 5ebb7c468..a1340dfca 100644 --- a/unicodetools/src/main/java/org/unicode/tools/RadicalStroke.java +++ b/unicodetools/src/main/java/org/unicode/tools/RadicalStroke.java @@ -1,5 +1,7 @@ package org.unicode.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; import java.io.BufferedReader; import 
java.io.FileReader; import java.io.IOException; @@ -8,15 +10,11 @@ import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.CodePoints; import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.PatternCache; import org.unicode.draft.ScriptCategories2; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; - public class RadicalStroke { // U+3433 kRSUnicode 9.3 private static Pattern RAD_STROKE = PatternCache.get("U\\+([A-Z0-9]+)\\s+kRSUnicode\\s+(.*)"); @@ -42,14 +40,18 @@ private RadicalStroke() { Matcher radStrokeMatcher = RAD_STROKE.matcher(""); Matcher radDataMatcher = RAD_DATA.matcher(""); Matcher iiCore = IICORE.matcher(""); - radStrokesToRadToRemainingStrokes = new TreeMap>>(); - remainder = ScriptCategories2.parseUnicodeSet("[:script=Han:]").removeAll(GeneratePickerData.SKIP); + radStrokesToRadToRemainingStrokes = + new TreeMap>>(); + remainder = + ScriptCategories2.parseUnicodeSet("[:script=Han:]") + .removeAll(GeneratePickerData.SKIP); String dataDir = DraftUtils.UCD_DIRECTORY + "/Unihan/"; - BufferedReader in = new BufferedReader( - new FileReader( - Subheader.getFileNameFromPattern( - dataDir, "Unihan_RadicalStrokeCounts.*\\.txt"))); + BufferedReader in = + new BufferedReader( + new FileReader( + Subheader.getFileNameFromPattern( + dataDir, "Unihan_RadicalStrokeCounts.*\\.txt"))); while (true) { String line = in.readLine(); @@ -69,10 +71,12 @@ private RadicalStroke() { Integer radicalChar = ScriptCategories2.getRadicalNum2char(radical); if (radicalChar == null) { in.close(); - throw new IllegalArgumentException("No radical value for <" + radical + ">"); + throw new IllegalArgumentException( + "No radical value for <" + radical + ">"); } charToRadical.put(cp, radicalChar); - int radicalStrokes = ScriptCategories2.RADICAL_CHAR2STROKES.get(radicalChar); + int radicalStrokes = + ScriptCategories2.RADICAL_CHAR2STROKES.get(radicalChar); int remainingStrokes = 
Integer.parseInt(radDataMatcher.group(2)); charToTotalStrokes.put(cp, radicalStrokes + remainingStrokes); charToRemainingStrokes.put(cp, remainingStrokes); @@ -80,10 +84,15 @@ private RadicalStroke() { // if (radical.startsWith("211")) { // System.out.println(line); // } - // String baseRadical = radical.endsWith("'") ? radical.substring(0, radical.length()-1) : + // String baseRadical = radical.endsWith("'") ? radical.substring(0, + // radical.length()-1) : // radical; - RadicalStroke.mapToUnicodeSetAdd(radStrokesToRadToRemainingStrokes, radicalStrokes, radical, - remainingStrokes, cp); + RadicalStroke.mapToUnicodeSetAdd( + radStrokesToRadToRemainingStrokes, + radicalStrokes, + radical, + remainingStrokes, + cp); remainder.remove(cp); // if (radDataMatcher.group(2).equals("0") && radical.endsWith("'")) { // String radicalString = Normalizer.normalize(cp, Normalizer.NFKC); @@ -91,7 +100,8 @@ private RadicalStroke() { // if (old == null) { // radicalToChar.put(radical, radicalString); // } else if (!radicalString.equals(old)) { - // System.out.println("Duplicate radical: " + line + " with " + radicalString + " and " + old); + // System.out.println("Duplicate radical: " + line + " with " + + // radicalString + " and " + old); // } // } } @@ -107,7 +117,8 @@ private RadicalStroke() { charToRemainingStrokes.freeze(); charToRadical.freeze(); - radStrokesToRadToRemainingStrokes = CldrUtility.protectCollection(radStrokesToRadToRemainingStrokes); + radStrokesToRadToRemainingStrokes = + CldrUtility.protectCollection(radStrokesToRadToRemainingStrokes); // UnicodeSet temp = new UnicodeSet(); // for (UnicodeSetIterator it = new @@ -115,24 +126,33 @@ private RadicalStroke() { // temp.add(it.codepoint); // if (temp.size() >= 800) { // int code = temp.charAt(0); - // CATEGORYTABLE.add("Han (CJK)", false, UTF16.valueOf(code) + " Han " + toHex(code, false), false, temp); + // CATEGORYTABLE.add("Han (CJK)", false, UTF16.valueOf(code) + " Han " + toHex(code, + // false), false, 
temp); // temp.clear(); // } // } // if (temp.size() > 0) { // int code = temp.charAt(0); - // CATEGORYTABLE.add("Han (CJK)", false, UTF16.valueOf(code) + " Han " + toHex(code, false), false, temp); + // CATEGORYTABLE.add("Han (CJK)", false, UTF16.valueOf(code) + " Han " + toHex(code, + // false), false, temp); // } } catch (IOException e) { throw new IllegalArgumentException(e); } } - static void mapToUnicodeSetAdd(Map>> index, - int radicalStrokes, String radicalChar, int remainingStrokes, int cp) { + static void mapToUnicodeSetAdd( + Map>> index, + int radicalStrokes, + String radicalChar, + int remainingStrokes, + int cp) { Map> subIndex = index.get(radicalStrokes); if (subIndex == null) { - index.put(radicalStrokes, subIndex = new TreeMap>(GeneratePickerData.UCA)); + index.put( + radicalStrokes, + subIndex = + new TreeMap>(GeneratePickerData.UCA)); } Map uset = subIndex.get(radicalChar); if (uset == null) { @@ -145,43 +165,43 @@ static void mapToUnicodeSetAdd(Map RadicalStrokeComparator = new Comparator() { - CodePoints cps1 = new CodePoints(""); - CodePoints cps2 = new CodePoints(""); - - public int compare(CharSequence o1, CharSequence o2) { - cps1.reset(o1); - cps2.reset(o2); - boolean n1 = cps1.next(); - boolean n2 = cps2.next(); - // shorter strings are less - if (!n1) { - return n2 ? -1 : 0; - } else if (!n2) { - return 1; - } - int cp1 = cps1.getCodePoint(); - int cp2 = cps2.getCodePoint(); - - // lower stroke counts are less (null counts as zero) - Integer s1 = SINGLETON.charToTotalStrokes.get(cp1); - Integer s2 = SINGLETON.charToTotalStrokes.get(cp2); - if (s1 == null && s2 == null) { - // no info, return codepoint order - return cp1 - cp2; - } - int ss1 = s1 == null ? 0 : s1; - int ss2 = s2 == null ? 
0 : s2; - if (ss1 < ss2) return -1; - if (ss1 > ss2) return 1; - - Integer r1 = SINGLETON.charToRadical.get(cp1); - Integer r2 = SINGLETON.charToRadical.get(cp2); - if (r1 < r2) return -1; - if (r1 > r2) return 1; - // no other diff, return codepoint order - return cp1 - cp2; - } - }; - + static Comparator RadicalStrokeComparator = + new Comparator() { + CodePoints cps1 = new CodePoints(""); + CodePoints cps2 = new CodePoints(""); + + public int compare(CharSequence o1, CharSequence o2) { + cps1.reset(o1); + cps2.reset(o2); + boolean n1 = cps1.next(); + boolean n2 = cps2.next(); + // shorter strings are less + if (!n1) { + return n2 ? -1 : 0; + } else if (!n2) { + return 1; + } + int cp1 = cps1.getCodePoint(); + int cp2 = cps2.getCodePoint(); + + // lower stroke counts are less (null counts as zero) + Integer s1 = SINGLETON.charToTotalStrokes.get(cp1); + Integer s2 = SINGLETON.charToTotalStrokes.get(cp2); + if (s1 == null && s2 == null) { + // no info, return codepoint order + return cp1 - cp2; + } + int ss1 = s1 == null ? 0 : s1; + int ss2 = s2 == null ? 
0 : s2; + if (ss1 < ss2) return -1; + if (ss1 > ss2) return 1; + + Integer r1 = SINGLETON.charToRadical.get(cp1); + Integer r2 = SINGLETON.charToRadical.get(cp2); + if (r1 < r2) return -1; + if (r1 > r2) return 1; + // no other diff, return codepoint order + return cp1 - cp2; + } + }; } diff --git a/unicodetools/src/main/java/org/unicode/tools/ScriptDetector.java b/unicodetools/src/main/java/org/unicode/tools/ScriptDetector.java index 56b849c43..640df0649 100644 --- a/unicodetools/src/main/java/org/unicode/tools/ScriptDetector.java +++ b/unicodetools/src/main/java/org/unicode/tools/ScriptDetector.java @@ -1,45 +1,47 @@ package org.unicode.tools; +import com.google.common.base.Joiner; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.text.UnicodeSet; import java.util.Collections; import java.util.EnumSet; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.Set; - import org.unicode.props.GenerateEnums; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues; import org.unicode.props.UcdPropertyValues.Script_Values; -import com.google.common.base.Joiner; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.text.UnicodeSet; - /** - * Detect the (fixed) scripts of a character or string. Note that the values are "fixed" from what is in the UCD: + * Detect the (fixed) scripts of a character or string. Note that the values are "fixed" from what + * is in the UCD: + * *
    - *
  • All Inherited become Common
  • - *
  • All characters with Han add Japanese (Jpan) and Korean (Kore), and remove Bopomofo, Katakana, and Hiragana.
  • - *
  • All characters with Hangul add Korean (Kore) and remove Katakana, and Hiragana.
  • - *
  • All characters with Katakana or Hiragana add Japanese, and remove Katakana, and Hiragana.
  • + *
  • All Inherited become Common + *
  • All characters with Han add Japanese (Jpan) and Korean (Kore), and remove Bopomofo, + * Katakana, and Hiragana. + *
  • All characters with Hangul add Korean (Kore) and remove Katakana, and Hiragana. + *
  • All characters with Katakana or Hiragana add Japanese, and remove Katakana, and Hiragana. *
- * @author markdavis * + * @author markdavis */ public final class ScriptDetector { public static final Joiner JOINER_COMMA_SPACE = Joiner.on(", "); - public static final IndexUnicodeProperties IUP = IndexUnicodeProperties.make(GenerateEnums.ENUM_VERSION); - private static final Set INHERITED_SET = Collections.singleton(Script_Values.Inherited); + public static final IndexUnicodeProperties IUP = + IndexUnicodeProperties.make(GenerateEnums.ENUM_VERSION); + private static final Set INHERITED_SET = + Collections.singleton(Script_Values.Inherited); public static final Set COMMON_SET = Collections.singleton(Script_Values.Common); - - private static final UnicodeMap> FIXED_CODEPOINT_TO_SCRIPTS = fix(); -// public static final UnicodeSet COMMON_OR_INHERITED -// = new UnicodeSet(ScriptDetector._CODEPOINT_TO_SCRIPTS.getSet(ScriptDetector.COMMON_SET)) -// .addAll(ScriptDetector._CODEPOINT_TO_SCRIPTS.getSet(ScriptDetector.INHERITED_SET)) -// .freeze(); + private static final UnicodeMap> FIXED_CODEPOINT_TO_SCRIPTS = fix(); + // public static final UnicodeSet COMMON_OR_INHERITED + // = new UnicodeSet(ScriptDetector._CODEPOINT_TO_SCRIPTS.getSet(ScriptDetector.COMMON_SET)) + // .addAll(ScriptDetector._CODEPOINT_TO_SCRIPTS.getSet(ScriptDetector.INHERITED_SET)) + // .freeze(); private boolean isCommon; private final EnumSet singleScripts = EnumSet.noneOf(Script_Values.class); @@ -47,7 +49,9 @@ public final class ScriptDetector { private final HashSet> toRemove = new HashSet>(); /** - * Sets the source string, which causes it to be analyzed. Afterwards getAll(), size(), etc. can be called. + * Sets the source string, which causes it to be analyzed. Afterwards getAll(), size(), etc. can + * be called. 
+ * * @param source * @return */ @@ -99,11 +103,13 @@ public ScriptDetector set(String source) { } private static UnicodeMap> fix() { - final UnicodeMap> codepointToScripts - = ScriptDetector.IUP.loadEnumSet(UcdProperty.Script_Extensions, UcdPropertyValues.Script_Values.class); - final EnumSet KANA = EnumSet.of(Script_Values.Hangul, Script_Values.Hiragana, Script_Values.Katakana); + final UnicodeMap> codepointToScripts = + ScriptDetector.IUP.loadEnumSet( + UcdProperty.Script_Extensions, UcdPropertyValues.Script_Values.class); + final EnumSet KANA = + EnumSet.of(Script_Values.Hangul, Script_Values.Hiragana, Script_Values.Katakana); - UnicodeMap>result = new UnicodeMap<>(); + UnicodeMap> result = new UnicodeMap<>(); for (Set scriptValueSet : codepointToScripts.values()) { UnicodeSet uset = codepointToScripts.getSet(scriptValueSet); for (UnicodeSet.EntryRange range : uset.ranges()) { @@ -118,8 +124,8 @@ private static UnicodeMap> fix() { return result; } - - private static Set fix(Set scriptValueSet, Set KANA) { + private static Set fix( + Set scriptValueSet, Set KANA) { EnumSet temp = null; if (scriptValueSet.equals(ScriptDetector.INHERITED_SET)) { scriptValueSet = ScriptDetector.COMMON_SET; @@ -145,31 +151,37 @@ private static Set fix(Set scriptValueSet, Set getSingleSetOrNull() { - return getCombinations().isEmpty() ? singleScripts - : !singleScripts.isEmpty() ? null - : getCombinations().size() > 1 ? null - : getCombinations().iterator().next(); + return getCombinations().isEmpty() + ? singleScripts + : !singleScripts.isEmpty() + ? null + : getCombinations().size() > 1 ? null : getCombinations().iterator().next(); } + @Override public String toString() { - return singleScripts.isEmpty() ? JOINER_COMMA_SPACE.join(getCombinations()) - : getCombinations().isEmpty() ? JOINER_COMMA_SPACE.join(singleScripts) - : JOINER_COMMA_SPACE.join(singleScripts) + ", " + JOINER_COMMA_SPACE.join(getCombinations()); + return singleScripts.isEmpty() + ? 
JOINER_COMMA_SPACE.join(getCombinations()) + : getCombinations().isEmpty() + ? JOINER_COMMA_SPACE.join(singleScripts) + : JOINER_COMMA_SPACE.join(singleScripts) + + ", " + + JOINER_COMMA_SPACE.join(getCombinations()); } /** - * Return all of the set of sets of script values in the source string. - * The set is minimized. It will only be empty if the set() value was the empty string. + * Return all of the set of sets of script values in the source string. The set is minimized. It + * will only be empty if the set() value was the empty string. + * * @return */ public Set> getAll() { @@ -184,16 +196,12 @@ public Set> getAll() { return result; } - /** - * Return the characters having the given script value set. - */ + /** Return the characters having the given script value set. */ public static UnicodeSet getCharactersForScriptExtensions(Set scriptValueSet) { return ScriptDetector.FIXED_CODEPOINT_TO_SCRIPTS.getSet(scriptValueSet); } - /** - * Return the set of script values for the code point. - */ + /** Return the set of script values for the code point. */ public static Set getScriptExtensions(int codepoint) { return ScriptDetector.FIXED_CODEPOINT_TO_SCRIPTS.get(codepoint); } @@ -205,17 +213,13 @@ public boolean isCommon() { return isCommon; } - /** - * Same as getAll, but filters to only singleton sets. - */ + /** Same as getAll, but filters to only singleton sets. */ public EnumSet getSingleScripts() { return singleScripts; } - /** - * Same as getAll, but filters out singleton sets. - */ + /** Same as getAll, but filters out singleton sets. 
*/ public HashSet> getCombinations() { return combinations; } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/tools/Segmenter.java b/unicodetools/src/main/java/org/unicode/tools/Segmenter.java index 83722fdc0..505e1069e 100644 --- a/unicodetools/src/main/java/org/unicode/tools/Segmenter.java +++ b/unicodetools/src/main/java/org/unicode/tools/Segmenter.java @@ -8,6 +8,18 @@ package org.unicode.tools; +import com.google.common.collect.ImmutableMultimap; +import com.google.common.collect.LinkedHashMultimap; +import com.google.common.collect.Multimap; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.dev.util.UnicodeMap.Composer; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.XSymbolTable; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.ULocale; import java.io.IOException; import java.io.PrintWriter; import java.text.ParsePosition; @@ -23,7 +35,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.RegexUtilities; import org.unicode.cldr.util.TransliteratorUtilities; @@ -33,32 +44,20 @@ import org.unicode.text.utility.Settings; import org.unicode.tools.Segmenter.Rule.Breaks; -import com.google.common.collect.ImmutableMultimap; -import com.google.common.collect.LinkedHashMultimap; -import com.google.common.collect.Multimap; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.dev.util.UnicodeMap.Composer; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.XSymbolTable; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.ULocale; - -/** - * Ordered list of rules, with variables resolved 
before building. Use Builder to make. - */ +/** Ordered list of rules, with variables resolved before building. Use Builder to make. */ public class Segmenter { public enum Target { FOR_UCD, FOR_CLDR } + public static final int REGEX_FLAGS = Pattern.COMMENTS | Pattern.MULTILINE | Pattern.DOTALL; - public static final Pattern IDENTIFIER_PATTERN = Pattern.compile("[$][\\p{Alnum}_]+", REGEX_FLAGS); + public static final Pattern IDENTIFIER_PATTERN = + Pattern.compile("[$][\\p{Alnum}_]+", REGEX_FLAGS); /** - * If not null, masks off the character properties so the UnicodeSets are easier to use when debugging. + * If not null, masks off the character properties so the UnicodeSets are easier to use when + * debugging. */ public static UnicodeSet DEBUG_REDUCE_SET_SIZE = null; // new // UnicodeSet("[\\u0000-\\u00FF\\u0300-\\u03FF\\u2000-\\u20FF]"); @@ -77,7 +76,7 @@ private Segmenter(Target target) { this.target = target; } - static public interface CodePointShower { + public static interface CodePointShower { String show(int codePoint); } @@ -86,7 +85,8 @@ public static Builder make(UnicodeProperty.Factory propFactory, String type) { } public static Builder make(UnicodeProperty.Factory propFactory, String type, Target target) { - String sourceFileName = target == Target.FOR_CLDR ? "SegmenterCldr.txt" : "SegmenterDefault.txt"; + String sourceFileName = + target == Target.FOR_CLDR ? "SegmenterCldr.txt" : "SegmenterDefault.txt"; Builder b = new Builder(propFactory, target); // quick and dirty cache of file lines, so we don't hit file multiple times. 
@@ -107,7 +107,11 @@ public static Builder make(UnicodeProperty.Factory propFactory, String type, Tar continue; } if (key == null) { - throw new IllegalArgumentException("Missing @ type in rule syntax, " + sourceFileName + " line=" + lineCount); + throw new IllegalArgumentException( + "Missing @ type in rule syntax, " + + sourceFileName + + " line=" + + lineCount); } data.put(key, line); } @@ -115,7 +119,8 @@ public static Builder make(UnicodeProperty.Factory propFactory, String type, Tar } Collection lines = data.get(type); if (lines == null) { - throw new IllegalArgumentException("Missing type=" + type + " in file " + sourceFileName); + throw new IllegalArgumentException( + "Missing type=" + type + " in file " + sourceFileName); } for (String line : lines) { b.addLine(line); @@ -123,23 +128,23 @@ public static Builder make(UnicodeProperty.Factory propFactory, String type, Tar return b; } - static final Map> FILE_CACHE = new ConcurrentHashMap<>(); + static final Map> FILE_CACHE = new ConcurrentHashMap<>(); - /** - * Certain rules are generated, and have artificial numbers - */ - public static final double NOBREAK_SUPPLEMENTARY = 0.1, BREAK_SOT = 0.2, BREAK_EOT = 0.3, BREAK_ANY = 999; - /** - * Convenience for formatting doubles - */ + /** Certain rules are generated, and have artificial numbers */ + public static final double NOBREAK_SUPPLEMENTARY = 0.1, + BREAK_SOT = 0.2, + BREAK_EOT = 0.3, + BREAK_ANY = 999; + /** Convenience for formatting doubles */ public static NumberFormat nf = NumberFormat.getInstance(ULocale.ENGLISH); + static { nf.setMinimumFractionDigits(0); } /** - * Does the rule list give a break at this point? - * Also sets the rule number that matches, for return by getBreakRule. + * Does the rule list give a break at this point? Also sets the rule number that matches, for + * return by getBreakRule. 
* * @param text * @param position @@ -158,13 +163,15 @@ public boolean breaksAt(CharSequence text, int position) { return true; } // don't break in middle of surrogate - if (UTF16.isLeadSurrogate(text.charAt(position - 1)) && UTF16.isTrailSurrogate(text.charAt(position))) { + if (UTF16.isLeadSurrogate(text.charAt(position - 1)) + && UTF16.isTrailSurrogate(text.charAt(position))) { breakRule = NOBREAK_SUPPLEMENTARY; return false; } for (int i = 0; i < rules.size(); ++i) { Rule rule = rules.get(i); - if (DEBUG_AT_RULE_CONTAINING != null && rule.toString().contains(DEBUG_AT_RULE_CONTAINING)) { + if (DEBUG_AT_RULE_CONTAINING != null + && rule.toString().contains(DEBUG_AT_RULE_CONTAINING)) { System.out.println(" !#$@543 Debug"); } Breaks result = rule.matches(text, position); @@ -208,9 +215,7 @@ public double getBreakRule() { return breakRule; } - /** - * Debugging aid - */ + /** Debugging aid */ public String toString() { return toString(false); } @@ -224,24 +229,19 @@ public String toString(boolean showResolved) { return result; } - /** - * A rule that determines the status of an offset. - */ + /** A rule that determines the status of an offset. */ public static class Rule { - /** - * Status of a breaking rule - */ + /** Status of a breaking rule */ public enum Breaks { - UNKNOWN_BREAK, BREAK, NO_BREAK + UNKNOWN_BREAK, + BREAK, + NO_BREAK }; /** - * @param before - * pattern for the text after the offset. All variables must be resolved. - * @param result - * the break status to return when the rule is invoked - * @param after - * pattern for the text before the offset. All variables must be resolved. + * @param before pattern for the text after the offset. All variables must be resolved. + * @param result the break status to return when the rule is invoked + * @param after pattern for the text before the offset. All variables must be resolved. 
* @param line */ public Rule(String before, Breaks result, String after, String line) { @@ -254,18 +254,26 @@ public Rule(String before, Breaks result, String after, String line) { } catch (PatternSyntaxException e) { // Format: Unclosed character class near index 927 int index = e.getIndex(); - throw (RuntimeException) new IllegalArgumentException("On <" + line + ">, Can't parse: " - + parsing.substring(0, index) - + "<<<>>>" + parsing.substring(index)) - .initCause(e); + throw (RuntimeException) + new IllegalArgumentException( + "On <" + + line + + ">, Can't parse: " + + parsing.substring(0, index) + + "<<<>>>" + + parsing.substring(index)) + .initCause(e); } catch (RuntimeException e) { // Unclosed character class near index 927 - throw (RuntimeException) new IllegalArgumentException("On <" + line + ">, Can't parse: " + parsing) - .initCause(e); + throw (RuntimeException) + new IllegalArgumentException("On <" + line + ">, Can't parse: " + parsing) + .initCause(e); } name = line; - resolved = Utility.escape(before) + (result == Breaks.NO_BREAK ? " \u00D7 " : " \u00F7 ") - + Utility.escape(after); + resolved = + Utility.escape(before) + + (result == Breaks.NO_BREAK ? " \u00D7 " : " \u00F7 ") + + Utility.escape(after); // COMMENTS allows whitespace } @@ -284,9 +292,7 @@ public Breaks matches(CharSequence text, int position) { return breaks; } - /** - * Debugging aid - */ + /** Debugging aid */ public String toString() { return toString(false); } @@ -308,45 +314,38 @@ public String toString(boolean showResolved) { private Breaks breaks; } - /** - * utility, since we are using Java 1.4 - */ + /** utility, since we are using Java 1.4 */ static boolean matchAfter(Matcher matcher, CharSequence text, int position) { return matcher.reset(text.subSequence(position, text.length())).lookingAt(); } /** - * utility, since we are using Java 1.4 - * depends on the pattern having been built with .* - * not very efficient, works for testing and the best we can do. 
+ * utility, since we are using Java 1.4 depends on the pattern having been built with .* not + * very efficient, works for testing and the best we can do. */ static boolean matchBefore(Matcher matcher, CharSequence text, int position) { return matcher.reset(text.subSequence(0, position)).matches(); } - /** - * Separate the builder for clarity - */ - - /** - * Sort the longest strings first. Used for variable lists. - */ - static Comparator LONGEST_STRING_FIRST = new Comparator() { - public int compare(String s0, String s1) { - int len0 = s0.length(); - int len1 = s1.length(); - if (len0 < len1) return 1; // longest first - if (len0 > len1) return -1; - // lengths equal, use string order - return s0.compareTo(s1); - } - }; + /** Separate the builder for clarity */ + + /** Sort the longest strings first. Used for variable lists. */ + static Comparator LONGEST_STRING_FIRST = + new Comparator() { + public int compare(String s0, String s1) { + int len0 = s0.length(); + int len1 = s1.length(); + if (len0 < len1) return 1; // longest first + if (len0 > len1) return -1; + // lengths equal, use string order + return s0.compareTo(s1); + } + }; /** - * Used to build RuleLists. Can be used to do inheritance, since (a) adding a variable overrides any previous value, - * and - * any variables used in its value are resolved before adding, and (b) adding a rule sorts/overrides according to - * numeric value. + * Used to build RuleLists. Can be used to do inheritance, since (a) adding a variable overrides + * any previous value, and any variables used in its value are resolved before adding, and (b) + * adding a rule sorts/overrides according to numeric value. 
*/ public static class Builder { private final UnicodeProperty.Factory propFactory; @@ -369,7 +368,8 @@ public Builder(UnicodeProperty.Factory factory, Target target) { // copied to make independent of ICU4J internals private class MyXSymbolTable extends UnicodeSet.XSymbolTable { - public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { + public boolean applyPropertyAlias( + String propertyName, String propertyValue, UnicodeSet result) { UnicodeProperty prop = propFactory.getProperty(propertyName); if (prop == null) { if (propertyValue.isEmpty()) { @@ -386,7 +386,8 @@ public boolean applyPropertyAlias(String propertyName, String propertyValue, Uni // For example, as long as we require "gc=Cn" and don't handle "Cn" here, // falling back to built-in ICU data means that we get gc=Cn ranges from ICU // rather than from the current Unicode beta. - throw new IllegalArgumentException("Segmenter.MyXSymbolTable: Unknown property " + propertyName); + throw new IllegalArgumentException( + "Segmenter.MyXSymbolTable: Unknown property " + propertyName); } // Binary properties: // \p{Extended_Pictographic} is equivalent with \p{Extended_Pictographic=Yes} @@ -398,7 +399,10 @@ public boolean applyPropertyAlias(String propertyName, String propertyValue, Uni if (x.isEmpty()) { // didn't find anything System.out.println( - "Segmenter.MyXSymbolTable: !Empty! " + propertyName + "=" + propertyValue); + "Segmenter.MyXSymbolTable: !Empty! " + + propertyName + + "=" + + propertyValue); } return true; // mark that we handled it even if there are no results. 
} @@ -407,16 +411,21 @@ public boolean applyPropertyAlias(String propertyName, String propertyValue, Uni public String toString(String testName, String indent) { StringBuffer result = new StringBuffer(); - result.append(indent + "").append(Utility.LINE_SEPARATOR); + result.append(indent + "") + .append(Utility.LINE_SEPARATOR); result.append(indent + "\t").append(Utility.LINE_SEPARATOR); for (int i = 0; i < rawVariables.size(); ++i) { - result.append(indent + "\t\t").append(rawVariables.get(i)).append(Utility.LINE_SEPARATOR); + result.append(indent + "\t\t") + .append(rawVariables.get(i)) + .append(Utility.LINE_SEPARATOR); } result.append(indent + "\t").append(Utility.LINE_SEPARATOR); result.append(indent + "\t").append(Utility.LINE_SEPARATOR); - for (Iterator it = xmlRules.keySet().iterator(); it.hasNext();) { + for (Iterator it = xmlRules.keySet().iterator(); it.hasNext(); ) { Double key = it.next(); - result.append(indent + "\t\t").append(xmlRules.get(key)).append(Utility.LINE_SEPARATOR); + result.append(indent + "\t\t") + .append(xmlRules.get(key)) + .append(Utility.LINE_SEPARATOR); } for (String comment : lastComments) { result.append(indent + "\t\t").append(comment).append(Utility.LINE_SEPARATOR); @@ -427,10 +436,10 @@ public String toString(String testName, String indent) { } /** - * Add a line. If contains a =, is a variable definition. - * Otherwise, is of the form nn) rule, where nn is the number of the rule. - * For now, pretty lame parsing, because we can't easily determine whether =, etc is part of the regex or not. - * So any 'real' =, etc in a regex must be expressed with unicode escapes, \\u.... + * Add a line. If contains a =, is a variable definition. Otherwise, is of the form nn) + * rule, where nn is the number of the rule. For now, pretty lame parsing, because we can't + * easily determine whether =, etc is part of the regex or not. So any 'real' =, etc in a + * regex must be expressed with unicode escapes, \\u.... 
* * @param line * @return @@ -450,7 +459,9 @@ public boolean addLine(String line) { } int relationPosition = line.indexOf('='); if (relationPosition >= 0) { - addVariable(line.substring(0, relationPosition).trim(), line.substring(relationPosition + 1).trim()); + addVariable( + line.substring(0, relationPosition).trim(), + line.substring(relationPosition + 1).trim()); return false; } relationPosition = line.indexOf(')'); @@ -466,12 +477,17 @@ public boolean addLine(String line) { if (relationPosition < 0) { relationPosition = line.indexOf('\u00D7'); if (relationPosition < 0) { - throw new IllegalArgumentException("Couldn't find =, \u00F7, or \u00D7 on line: " + line); + throw new IllegalArgumentException( + "Couldn't find =, \u00F7, or \u00D7 on line: " + line); } breaks = Segmenter.Rule.Breaks.NO_BREAK; } - addRule(order, line.substring(0, relationPosition).trim(), breaks, line.substring(relationPosition + 1) - .trim(), line); + addRule( + order, + line.substring(0, relationPosition).trim(), + breaks, + line.substring(relationPosition + 1).trim(), + line); return true; } @@ -486,7 +502,6 @@ public boolean addLine(String line) { * @param value * @return */ - static class MyComposer extends UnicodeMap.Composer { public String compose(int codePoint, String string, String a, String b) { if (a == null) return b; @@ -504,11 +519,16 @@ Builder addVariable(String name, String value) { rawVariables.addAll(lastComments); lastComments.clear(); } - rawVariables.add("" - + TransliteratorUtilities.toXML.transliterate(value) + ""); + rawVariables.add( + "" + + TransliteratorUtilities.toXML.transliterate(value) + + ""); if (!identifierMatcher.reset(name).matches()) { String show = RegexUtilities.showMismatch(identifierMatcher, name); - throw new IllegalArgumentException("Variable name must be $id: '" + name + "' — " + show); + throw new IllegalArgumentException( + "Variable name must be $id: '" + name + "' — " + show); } value = replaceVariables(value); if (!name.endsWith("_")) { 
@@ -517,15 +537,20 @@ Builder addVariable(String name, String value) { UnicodeSet valueSet = new UnicodeSet(value, parsePosition, symbolTable); if (parsePosition.getIndex() != value.length()) { if (SHOW_SAMPLES) - System.out.println(parsePosition.getIndex() + ", " + value.length() - + " -- No samples for: " + name + " = " + value); + System.out.println( + parsePosition.getIndex() + + ", " + + value.length() + + " -- No samples for: " + + name + + " = " + + value); } else if (valueSet.size() == 0) { if (SHOW_SAMPLES) System.out.println("Empty -- No samples for: " + name + " = " + value); } else { String name2 = name; - if (name2.startsWith("$")) - name2 = name2.substring(1); + if (name2.startsWith("$")) name2 = name2.substring(1); composeWith(samples, valueSet, name2, myComposer); if (SHOW_SAMPLES) { System.out.println("Samples for: " + name + " = " + value); @@ -550,9 +575,12 @@ Builder addVariable(String name, String value) { return this; } - public static UnicodeMap composeWith(UnicodeMap target, UnicodeSet set, String value, + public static UnicodeMap composeWith( + UnicodeMap target, + UnicodeSet set, + String value, Composer composer) { - for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next(); ) { int i = it.codepoint; String v1 = target.getValue(i); String v3 = composer.compose(i, null, v1, value); @@ -602,13 +630,22 @@ Builder addRule(Double order, String before, Breaks breaks, String after, String throw new IllegalArgumentException("Duplicate numbers for rules: " + order); } htmlRules.put(order, TransliteratorUtilities.toHTML.transliterate(line)); - xmlRules.put(order, " " + TransliteratorUtilities.toXML.transliterate(line) + " "); + xmlRules.put( + order, + " " + + TransliteratorUtilities.toXML.transliterate(line) + + " "); if (after.contains("[^$OLetter")) { System.out.println("!@#$31 Debug"); } - rules.put(order, new Segmenter.Rule(replaceVariables(before), breaks, 
replaceVariables(after), line)); + rules.put( + order, + new Segmenter.Rule( + replaceVariables(before), breaks, replaceVariables(after), line)); return this; } @@ -627,7 +664,8 @@ public Segmenter make() { } // ============== internals =================== - private Map variables = new TreeMap(LONGEST_STRING_FIRST); // sorted by length, + private Map variables = + new TreeMap(LONGEST_STRING_FIRST); // sorted by length, // longest first, to // make substitution // easy @@ -638,10 +676,9 @@ public Map getProcessedRules() { } /** - * A workhorse. Replaces all variable references: anything of the form $id. - * Flags an error if anything of that form is not a variable. - * Since we are using Java regex, the properties support - * are extremely week. So replace them by literals. + * A workhorse. Replaces all variable references: anything of the form $id. Flags an error + * if anything of that form is not a variable. Since we are using Java regex, the properties + * support are extremely week. So replace them by literals. 
* * @param input * @return @@ -650,19 +687,24 @@ private String replaceVariables(String input) { // to do, optimize String result = input; int position = -1; - main: while (true) { + main: + while (true) { position = result.indexOf('$', position); if (position < 0) break; for (String name : variables.keySet()) { if (result.regionMatches(position, name, 0, name.length())) { String value = variables.get(name); - result = result.substring(0, position) + value + result.substring(position + name.length()); + result = + result.substring(0, position) + + value + + result.substring(position + name.length()); position += value.length(); // don't allow overlap continue main; } } if (identifierMatcher.reset(result.substring(position)).lookingAt()) { - throw new IllegalArgumentException("Illegal variable at: '" + result.substring(position) + "'"); + throw new IllegalArgumentException( + "Illegal variable at: '" + result.substring(position) + "'"); } } // replace properties @@ -672,8 +714,13 @@ private String replaceVariables(String input) { parsePosition.setIndex(i); UnicodeSet temp = new UnicodeSet(result, parsePosition, symbolTable); String insert = getInsertablePattern(temp); - result = result.substring(0, i) + insert + result.substring(parsePosition.getIndex()); - i += insert.length() - 1; // skip over inserted stuff; -1 since the loop will add + result = + result.substring(0, i) + + insert + + result.substring(parsePosition.getIndex()); + i += + insert.length() + - 1; // skip over inserted stuff; -1 since the loop will add } } return result; @@ -702,32 +749,38 @@ private String getInsertablePattern(UnicodeSet temp) { String result = toPattern(temp, JavaRegexShower); // double check the pattern!! 
UnicodeSet reversal = new UnicodeSet(result); - if (!reversal.equals(temp)) throw new IllegalArgumentException("Failure on UnicodeSet print"); + if (!reversal.equals(temp)) + throw new IllegalArgumentException("Failure on UnicodeSet print"); return result; } - static UnicodeSet JavaRegex_uxxx = new UnicodeSet( - "[[[:White_Space:][:defaultignorablecodepoint:]#]&[\\u0000-\\uFFFF]]"); // hack to fix # in Java - static UnicodeSet JavaRegex_slash = new UnicodeSet("[[:Pattern_White_Space:]" + - "\\[\\]\\-\\^\\&\\\\\\{\\}\\$\\:]"); - static CodePointShower JavaRegexShower = new CodePointShower() { - public String show(int codePoint) { - if (JavaRegex_uxxx.contains(codePoint)) { - if (codePoint > 0xFFFF) { - return "\\u" + Utility.hex(UTF16.getLeadSurrogate(codePoint)) - + "\\u" + Utility.hex(UTF16.getTrailSurrogate(codePoint)); + static UnicodeSet JavaRegex_uxxx = + new UnicodeSet( + "[[[:White_Space:][:defaultignorablecodepoint:]#]&[\\u0000-\\uFFFF]]"); // hack to fix # in Java + static UnicodeSet JavaRegex_slash = + new UnicodeSet("[[:Pattern_White_Space:]" + "\\[\\]\\-\\^\\&\\\\\\{\\}\\$\\:]"); + static CodePointShower JavaRegexShower = + new CodePointShower() { + public String show(int codePoint) { + if (JavaRegex_uxxx.contains(codePoint)) { + if (codePoint > 0xFFFF) { + return "\\u" + + Utility.hex(UTF16.getLeadSurrogate(codePoint)) + + "\\u" + + Utility.hex(UTF16.getTrailSurrogate(codePoint)); + } + return "\\u" + Utility.hex(codePoint); + } + if (JavaRegex_slash.contains(codePoint)) + return "\\" + UTF16.valueOf(codePoint); + return UTF16.valueOf(codePoint); } - return "\\u" + Utility.hex(codePoint); - } - if (JavaRegex_slash.contains(codePoint)) return "\\" + UTF16.valueOf(codePoint); - return UTF16.valueOf(codePoint); - } - }; + }; private static String toPattern(UnicodeSet temp, CodePointShower shower) { StringBuffer result = new StringBuffer(); result.append('['); - for (UnicodeSetIterator it = new UnicodeSetIterator(temp); it.nextRange();) { + for 
(UnicodeSetIterator it = new UnicodeSetIterator(temp); it.nextRange(); ) { // three cases: single, adjacent, range int first = it.codepoint; result.append(shower.show(first++)); @@ -769,479 +822,465 @@ public UnicodeMap getSamples() { // TODO: delete? move elsewhere? // Only used in main() to write to some files. Out of sync with SegmenterDefault.txt. private static final String[][] cannedRules = { - { - "GraphemeClusterBreak", - "$CR=\\p{Grapheme_Cluster_Break=CR}", - "$LF=\\p{Grapheme_Cluster_Break=LF}", - "$Control=\\p{Grapheme_Cluster_Break=Control}", - "$Extend=\\p{Grapheme_Cluster_Break=Extend}", - "$ZWJ=\\p{Grapheme_Cluster_Break=ZWJ}", - "$RI=\\p{Grapheme_Cluster_Break=Regional_Indicator}", - "$Prepend=\\p{Grapheme_Cluster_Break=Prepend}", - "$SpacingMark=\\p{Grapheme_Cluster_Break=SpacingMark}", - "$L=\\p{Grapheme_Cluster_Break=L}", - "$V=\\p{Grapheme_Cluster_Break=V}", - "$T=\\p{Grapheme_Cluster_Break=T}", - "$LV=\\p{Grapheme_Cluster_Break=LV}", - "$LVT=\\p{Grapheme_Cluster_Break=LVT}", - - "$Virama=[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Virama}]", - "$LinkingConsonant=[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Consonant}]", - - // "$E_Base=\\p{Grapheme_Cluster_Break=E_Base}", - // "$E_Modifier=\\p{Grapheme_Cluster_Break=E_Modifier}", - - "$ExtPict=\\p{Extended_Pictographic}", - "$ExtCccZwj=[[$Extend-\\p{ccc=0}] $ZWJ]", - //"$EBG=\\p{Grapheme_Cluster_Break=E_Base_GAZ}", - //"$Glue_After_Zwj=\\p{Grapheme_Cluster_Break=Glue_After_Zwj}", - - "# Rules", - "# Break at the start and end of text, unless the text is empty.", - "# Do not break between a CR and LF. 
Otherwise, break before and after controls.", - "3) $CR \u00D7 $LF", - "4) ( $Control | $CR | $LF ) \u00F7", - "5) \u00F7 ( $Control | $CR | $LF )", - "# Do not break Hangul syllable sequences.", - "6) $L \u00D7 ( $L | $V | $LV | $LVT )", - "7) ( $LV | $V ) \u00D7 ( $V | $T )", - "8) ( $LVT | $T) \u00D7 $T", - "# Do not break before extending characters or ZWJ.", - //"9) \u00D7 ($Extend | $ZWJ | $Virama)", - "9) \u00D7 ($Extend | $ZWJ)", - "# Only for extended grapheme clusters: Do not break before SpacingMarks, or after Prepend characters.", - "9.1) \u00D7 $SpacingMark", - "9.2) $Prepend \u00D7", - "9.3) $LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* \u00D7 $LinkingConsonant", - "# Do not break within emoji modifier sequences or emoji zwj sequences.", - //"10) $E_Base $Extend* × $E_Modifier", - "11) $ExtPict $Extend* $ZWJ × $ExtPict", - "# Do not break within emoji flag sequences. That is, do not break between regional indicator (RI) symbols if there is an odd number of RI characters before the break point.", - "12) ^ ($RI $RI)* $RI × $RI", - "13) [^$RI] ($RI $RI)* $RI × $RI", - "# Otherwise, break everywhere.", - }, - { - "LineBreak", - "# Variables", - "$AI=\\p{Line_Break=Ambiguous}", - "$AL=\\p{Line_Break=Alphabetic}", - "$B2=\\p{Line_Break=Break_Both}", - "$BA=\\p{Line_Break=Break_After}", - "$BB=\\p{Line_Break=Break_Before}", - "$BK=\\p{Line_Break=Mandatory_Break}", - "$CB=\\p{Line_Break=Contingent_Break}", - "$CL=\\p{Line_Break=Close_Punctuation}", - "$CP=\\p{Line_Break=CP}", - "$CM1=\\p{Line_Break=Combining_Mark}", - "$CR=\\p{Line_Break=Carriage_Return}", - "$EX=\\p{Line_Break=Exclamation}", - "$GL=\\p{Line_Break=Glue}", - "$H2=\\p{Line_Break=H2}", - "$H3=\\p{Line_Break=H3}", - "$HL=\\p{Line_Break=HL}", - "$HY=\\p{Line_Break=Hyphen}", - "$ID=\\p{Line_Break=Ideographic}", - "$IN=\\p{Line_Break=Inseparable}", - "$IS=\\p{Line_Break=Infix_Numeric}", - "$JL=\\p{Line_Break=JL}", - "$JT=\\p{Line_Break=JT}", - "$JV=\\p{Line_Break=JV}", - 
"$LF=\\p{Line_Break=Line_Feed}", - "$NL=\\p{Line_Break=Next_Line}", - "$NS=\\p{Line_Break=Nonstarter}", - "$NU=\\p{Line_Break=Numeric}", - "$OP=\\p{Line_Break=Open_Punctuation}", - "$PO=\\p{Line_Break=Postfix_Numeric}", - "$PR=\\p{Line_Break=Prefix_Numeric}", - "$QU=\\p{Line_Break=Quotation}", - "$SA=\\p{Line_Break=Complex_Context}", - "$SG=\\p{Line_Break=Surrogate}", - "$SP=\\p{Line_Break=Space}", - "$SY=\\p{Line_Break=Break_Symbols}", - "$WJ=\\p{Line_Break=Word_Joiner}", - "$XX=\\p{Line_Break=Unknown}", - "$ZW=\\p{Line_Break=ZWSpace}", - "$CJ=\\p{Line_Break=Conditional_Japanese_Starter}", - "$RI=\\p{Line_Break=Regional_Indicator}", - - "$EB=\\p{Line_Break=E_Base}", - "$EM=\\p{Line_Break=E_Modifier}", - "$ZWJ_O=\\p{Line_Break=ZWJ}", - "$ZWJ=\\p{Line_Break=ZWJ}", - - "# Macros", - "$CM=[$CM1 $ZWJ]", - - "# LB 1 Assign a line breaking class to each code point of the input. ", - "# Resolve AI, CB, SA, SG, and XX into other line breaking classes depending on criteria outside the scope of this algorithm.", - "# NOTE: CB is ok to fall through, but must handle others here.", - // "show $AL", - "$AL=[$AI $AL $SG $XX $SA]", - "$NS=[$NS $CJ]", - // "show $AL", - // "$oldAL=$AL", // for debugging - "# WARNING: Fixes for Rule 9", - "# Treat X (CM|ZWJ* as if it were X.", - "# Where X is any line break class except SP, BK, CR, LF, NL or ZW.", - "$X=$CM*", - "# Macros", - "$Spec1_=[$SP $BK $CR $LF $NL $ZW]", - "$Spec2_=[^ $SP $BK $CR $LF $NL $ZW]", - "$Spec3a_=[^ $SP $BA $HY $CM]", - "$Spec3b_=[^ $BA $HY $CM]", - "$Spec4_=[^ $NU $CM]", - - "$AI=($AI $X)", - "$AL=($AL $X)", - "$B2=($B2 $X)", - "$BA=($BA $X)", - "$BB=($BB $X)", - "$CB=($CB $X)", - "$CL=($CL $X)", - "$CP=($CP $X)", - "$CM=($CM $X)", - //"$CM=($CM $X)", - "$EX=($EX $X)", - "$GL=($GL $X)", - "$H2=($H2 $X)", - "$H3=($H3 $X)", - "$HL=($HL $X)", - "$HY=($HY $X)", - "$ID=($ID $X)", - "$IN=($IN $X)", - "$IS=($IS $X)", - "$JL=($JL $X)", - "$JT=($JT $X)", - "$JV=($JV $X)", - "$NS=($NS $X)", - "$NU=($NU $X)", - "$OP=($OP 
$X)", - "$PO=($PO $X)", - "$PR=($PR $X)", - "$QU=($QU $X)", - "$SA=($SA $X)", - "$SG=($SG $X)", - "$SY=($SY $X)", - "$WJ=($WJ $X)", - "$XX=($XX $X)", - "$RI=($RI $X)", - - "$EB=($EB $X)", - "$EM=($EM $X)", - "$ZWJ=($ZWJ $X)", - - "# OUT OF ORDER ON PURPOSE", - "# LB 10 Treat any remaining combining mark as AL.", - "$AL=($AL | ^ $CM | (?<=$Spec1_) $CM)", - - "# Rules", - - "# LB 4 Always break after hard line breaks (but never between CR and LF).", - "4) $BK \u00F7", - "# LB 5 Treat CR followed by LF, as well as CR, LF and NL as hard line breaks.", - "5.01) $CR \u00D7 $LF", - "5.02) $CR \u00F7", - "5.03) $LF \u00F7", - "5.04) $NL \u00F7", - "# LB 6 Do not break before hard line breaks.", - "6) \u00D7 ( $BK | $CR | $LF | $NL )", - "# LB 7 Do not break before spaces or zero-width space.", - "7.01) \u00D7 $SP", - "7.02) \u00D7 $ZW", - "# LB 8 Break before any character following a zero-width space, even if one or more spaces intervene.", - "8) $ZW $SP* \u00F7", - "# LB 8a Don't break between ZWJ and IDs (for use in Emoji ZWJ sequences)", - "8.1) $ZWJ_O \u00D7", - "# LB 9 Do not break a combining character sequence; treat it as if it has the LB class of the base character", - "# in all of the following rules. (Where X is any line break class except SP, BK, CR, LF, NL or ZW.)", - "9) $Spec2_ \u00D7 $CM", - "#WARNING: this is done by modifying the variable values for all but SP.... 
That is, $AL is really ($AI $CM*)!", - "# LB 11 Do not break before or after WORD JOINER and related characters.", - "11.01) \u00D7 $WJ", - "11.02) $WJ \u00D7", - "# LB 12 Do not break after NBSP and related characters.", - // "12.01) [^$SP] \u00D7 $GL", - "12) $GL \u00D7", - "12.1) $Spec3a_ \u00D7 $GL", - "12.2) $Spec3b_ $CM+ \u00D7 $GL", - "12.3) ^ $CM+ \u00D7 $GL", - - "# LB 13 Do not break before \u2018]\u2019 or \u2018!\u2019 or \u2018;\u2019 or \u2018/\u2019, even after spaces.", - "# Using customization 7.", - "13.01) \u00D7 $EX", - "13.02) $Spec4_ \u00D7 ($CL | $CP | $IS | $SY)", - "13.03) $Spec4_ $CM+ \u00D7 ($CL | $CP | $IS | $SY)", - "13.04) ^ $CM+ \u00D7 ($CL | $CP | $IS | $SY)", - // "13.03) $Spec4_ \u00D7 $IS", - // "13.04) $Spec4_ \u00D7 $SY", - "#LB 14 Do not break after \u2018[\u2019, even after spaces.", - "14) $OP $SP* \u00D7", - "# LB 15 Do not break within \u2018\"[\u2019, even with intervening spaces.", - "15) $QU $SP* \u00D7 $OP", - "# LB 16 Do not break between closing punctuation and a nonstarter (lb=NS), even with intervening spaces.", - "16) ($CL | $CP) $SP* \u00D7 $NS", - "# LB 17 Do not break within \u2018\u2014\u2014\u2019, even with intervening spaces.", - "17) $B2 $SP* \u00D7 $B2", - "# LB 18 Break after spaces.", - "18) $SP \u00F7", - "# LB 19 Do not break before or after \u2018\"\u2019.", - "19.01) \u00D7 $QU", - "19.02) $QU \u00D7", - "# LB 20 Break before and after unresolved CB.", - "20.01) \u00F7 $CB", - "20.02) $CB \u00F7", - "# LB 21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana and other non-starters, or after acute accents.", - "21.01) \u00D7 $BA", - "21.02) \u00D7 $HY", - "21.03) \u00D7 $NS", - "21.04) $BB \u00D7", - "# LB 21a Don't break after Hebrew + Hyphen.", - "21.1) $HL ($HY | $BA) \u00D7", - "# LB 21b Don’t break between Solidus and Hebrew letters.", - "21.2) $SY × $HL", - "# LB 22 Do not break between two ellipses, or between letters, numbers or exclamations and ellipsis.", - // 
"show $AL", - "22.01) ($AL | $HL) \u00D7 $IN", - "22.02) $EX \u00D7 $IN", - "22.03) ($ID | $EB | $EM) \u00D7 $IN", - "22.04) $IN \u00D7 $IN", - "22.05) $NU \u00D7 $IN", - "# LB 23 Do not break between digits and letters.", - //"23.01) ($ID | $EB | $EM) \u00D7 $PO", - "23.02) ($AL | $HL) \u00D7 $NU", - "23.03) $NU \u00D7 ($AL | $HL)", - "# LB 24 Do not break between prefix and letters or ideographs.", - "23.12) $PR \u00D7 ($ID | $EB | $EM)", - "23.13) ($ID | $EB | $EM) \u00D7 $PO", - "# LB24 Do not break between numeric prefix/postfix and letters, or between letters and prefix/postfix.", - "24.02) ($PR | $PO) \u00D7 ($AL | $HL)", - "24.03) ($AL | $HL) \u00D7 ($PR | $PO)", - "# Using customization 7", - "# LB Alternative: ( PR | PO) ? ( OP | HY ) ? NU (NU | SY | IS) * (CL | CP) ? ( PR | PO) ?", - "# Insert \u00D7 every place it could go. However, make sure that at least one thing is concrete, otherwise would cause $NU to not break before or after ", - "25.01) ($PR | $PO) \u00D7 ( $OP | $HY )? $NU", - "25.02) ( $OP | $HY ) \u00D7 $NU", - "25.03) $NU \u00D7 ($NU | $SY | $IS)", - "25.04) $NU ($NU | $SY | $IS)* \u00D7 ($NU | $SY | $IS | $CL | $CP)", - "25.05) $NU ($NU | $SY | $IS)* ($CL | $CP)? 
\u00D7 ($PO | $PR)", - - "#LB 26 Do not break a Korean syllable.", - "26.01) $JL \u00D7 $JL | $JV | $H2 | $H3", - "26.02) $JV | $H2 \u00D7 $JV | $JT", - "26.03) $JT | $H3 \u00D7 $JT", - "# LB 27 Treat a Korean Syllable Block the same as ID.", - "27.01) $JL | $JV | $JT | $H2 | $H3 \u00D7 $PO", - "27.02) $PR \u00D7 $JL | $JV | $JT | $H2 | $H3", - "# LB 28 Do not break between alphabetics (\"at\").", - "28) ($AL | $HL) \u00D7 ($AL | $HL)", - "# LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").", - "29) $IS \u00D7 ($AL | $HL)", - "# LB 30 Do not break between letters, numbers or ordinary symbols and opening or closing punctuation.", - "30.01) ($AL | $HL | $NU) \u00D7 $OP", - "30.02) $CP \u00D7 ($AL | $HL | $NU)", - "# LB 30a Break between two Regional Indicators if and only if there is an even number of them before the point being considered.", - "30.11) ^ ($RI $RI)* $RI × $RI", - "30.12) [^$RI] ($RI $RI)* $RI × $RI", - "30.13) $RI ÷ $RI", - "30.2) $EB × $EM", - }, - { - "SentenceBreak", - "$CR=\\p{Sentence_Break=CR}", - "$LF=\\p{Sentence_Break=LF}", - "$Extend=\\p{Sentence_Break=Extend}", - "$Format=\\p{Sentence_Break=Format}", - "$Sep=\\p{Sentence_Break=Sep}", - "$Sp=\\p{Sentence_Break=Sp}", - "$Lower=\\p{Sentence_Break=Lower}", - "$Upper=\\p{Sentence_Break=Upper}", - "$OLetter=\\p{Sentence_Break=OLetter}", - "$Numeric=\\p{Sentence_Break=Numeric}", - "$ATerm=\\p{Sentence_Break=ATerm}", - "$STerm=\\p{Sentence_Break=STerm}", - "$Close=\\p{Sentence_Break=Close}", - "$SContinue=\\p{Sentence_Break=SContinue}", - "$Any=.", - // "# subtract Format from Control, since we don't want to break before/after", - // "$Control=[$Control-$Format]", - "# Expresses the negation in rule 8; can't do this with normal regex, but works with UnicodeSet, which is all we need.", - // "$NotStuff=[^$OLetter $Upper $Lower $Sep]", - // "# $ATerm and $Sterm are temporary, to match ICU until UTC decides.", - - "# WARNING: For Rule 5, now add format and extend to everything 
but Sep, Format, and Extend", - "$FE=[$Format $Extend]", - "# Special rules", - "$NotPreLower_=[^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm]", - // "$NotSep_=[^ $Sep $CR $LF]", - - // "$FE=$Extend* $Format*", - "$Sp=($Sp $FE*)", - "$Lower=($Lower $FE*)", - "$Upper=($Upper $FE*)", - "$OLetter=($OLetter $FE*)", - "$Numeric=($Numeric $FE*)", - "$ATerm=($ATerm $FE*)", - "$STerm=($STerm $FE*)", - "$Close=($Close $FE*)", - "$SContinue=($SContinue $FE*)", - - "# Macros", - "$ParaSep = ($Sep | $CR | $LF)", - "$SATerm = ($STerm | $ATerm)", - - "# Rules", - "# Break at the start and end of text, unless the text is empty.", - "# Do not break within CRLF.", - "3) $CR \u00D7 $LF", - "# Break after paragraph separators.", - "4) $ParaSep \u00F7", - // "3.4) ( $Control | $CR | $LF ) \u00F7", - // "3.5) \u00F7 ( $Control | $CR | $LF )", - "# Ignore Format and Extend characters, except after sot, ParaSep, and within CRLF. (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend)", - "# WARNING: Implemented as don't break before format (except after linebreaks),", - "# AND add format and extend in all variables definitions that appear after this point!", - // "3.91) [^$Control | $CR | $LF] \u00D7 $Extend", - "5) \u00D7 [$Format $Extend]", - "# Do not break after full stop in certain contexts. 
[See note below.]", - "# Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter,", - "# is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase.", - "# For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.", - "6) $ATerm \u00D7 $Numeric", - "7) ($Upper | $Lower) $ATerm \u00D7 $Upper", - "8) $ATerm $Close* $Sp* \u00D7 $NotPreLower_* $Lower", - "8.1) $SATerm $Close* $Sp* \u00D7 ($SContinue | $SATerm)", - "# Break after sentence terminators, but include closing punctuation, trailing spaces, and any paragraph separator. [See note below.] Include closing punctuation, trailing spaces, and (optionally) a paragraph separator.", - "9) $SATerm $Close* \u00D7 ( $Close | $Sp | $ParaSep )", - "# Note the fix to $Sp*, $Sep?", - "10) $SATerm $Close* $Sp* \u00D7 ( $Sp | $ParaSep )", - "11) $SATerm $Close* $Sp* $ParaSep? \u00F7", - "#Otherwise, do not break", - "998) \u00D7 $Any", - }, { - "WordBreak", - "$CR=\\p{Word_Break=CR}", - "$LF=\\p{Word_Break=LF}", - "$Newline=\\p{Word_Break=Newline}", - // "$Control=\\p{Word_Break=Control}", - "$Extend=\\p{Word_Break=Extend}", - // "$NEWLINE=[$CR $LF \\u0085 \\u000B \\u000C \\u2028 \\u2029]", - // "$Sep=\\p{Sentence_Break=Sep}", - "# Now normal variables", - "$Format=\\p{Word_Break=Format}", - "$Katakana=\\p{Word_Break=Katakana}", - "$ALetter=\\p{Word_Break=ALetter}", - "$MidLetter=\\p{Word_Break=MidLetter}", - "$MidNum=\\p{Word_Break=MidNum}", - "$MidNumLet=\\p{Word_Break=MidNumLet}", - "$Numeric=\\p{Word_Break=Numeric}", - "$ExtendNumLet=\\p{Word_Break=ExtendNumLet}", - "$RI=\\p{Word_Break=Regional_Indicator}", - "$Hebrew_Letter=\\p{Word_Break=Hebrew_Letter}", - "$Double_Quote=\\p{Word_Break=Double_Quote}", - "$Single_Quote=\\p{Word_Break=Single_Quote}", - - // "$E_Base=\\p{Word_Break=E_Base}", - // "$E_Modifier=\\p{Word_Break=E_Modifier}", - "$ZWJ=\\p{Word_Break=ZWJ}", - 
"$ExtPict=\\p{Extended_Pictographic}", - - //"$EBG=\\p{Word_Break=E_Base_GAZ}", - //"$Glue_After_Zwj=\\p{Word_Break=Glue_After_Zwj}", - - "$WSegSpace=\\p{Word_Break=WSegSpace}", - - "# Macros", - - "$AHLetter=($ALetter | $Hebrew_Letter)", - "$MidNumLetQ=($MidNumLet | $Single_Quote)", - - "# WARNING: For Rule 4: Fixes for GC, Format", - // "# Subtract Format from Control, since we don't want to break before/after", - // "$Control=[$Control-$Format]", - "# Add format and extend to everything", - "$FE=[$Format $Extend $ZWJ]", - "# Special rules", - "$NotBreak_=[^ $Newline $CR $LF ]", - // "$FE= ($Extend | $Format)*", - "$Katakana=($Katakana $FE*)", - "$ALetter=($ALetter $FE*)", - "$MidLetter=($MidLetter $FE*)", - "$MidNum=($MidNum $FE*)", - "$MidNumLet=($MidNumLet $FE*)", - "$Numeric=($Numeric $FE*)", - "$ExtendNumLet=($ExtendNumLet $FE*)", - "$RI=($RI $FE*)", - "$Hebrew_Letter=($Hebrew_Letter $FE*)", - "$Double_Quote=($Double_Quote $FE*)", - "$Single_Quote=($Single_Quote $FE*)", - - // "$E_Base=($E_Base $FE*)", - // "$E_Modifier=($E_Modifier $FE*)", - //"$ZWJ=($ZWJ $FE*)", don't do this one! - //"$Glue_After_Zwj=($Glue_After_Zwj $FE*)", - //"$EBG=($EBG $FE*)", - - "$AHLetter=($AHLetter $FE*)", - "$MidNumLetQ=($MidNumLetQ $FE*)", - - "# Rules", - - "# Break at the start and end of text, unless the text is empty.", - "# Do not break within CRLF.", - "3) $CR \u00D7 $LF", - "# Otherwise break before and after Newlines (including CR and LF)", - "3.1) ($Newline | $CR | $LF) \u00F7", - "3.2) \u00F7 ($Newline | $CR | $LF)", - "# Do not break within emoji zwj sequences.", - "3.3) $ZWJ × $ExtPict", - "3.4) $WSegSpace × $WSegSpace", - - // "3.4) ( $Control | $CR | $LF ) \u00F7", - // "3.5) \u00F7 ( $Control | $CR | $LF )", - // "3.9) \u00D7 $Extend", - // "3.91) [^$Control | $CR | $LF] \u00D7 $Extend", - "# Ignore Format and Extend characters, except after sot, CR, LF, and Newline. (See Section 6.2, Replacing Ignore Rules.) 
This also has the effect of: Any × (Format | Extend)", - "# WARNING: Implemented as don't break before format (except after linebreaks),", - "# AND add format and extend in all variables definitions that appear after this point!", - // "4) \u00D7 [$Format $Extend]", - "4) $NotBreak_ \u00D7 [$Format $Extend $ZWJ]", - "# Vanilla rules", - "# Do not break between most letters.", - "5) $AHLetter \u00D7 $AHLetter", - "# Do not break letters across certain punctuation.", - "6) $AHLetter \u00D7 ($MidLetter | $MidNumLetQ) $AHLetter", - "7) $AHLetter ($MidLetter | $MidNumLetQ) \u00D7 $AHLetter", - "7.1) $Hebrew_Letter × $Single_Quote", - "7.2) $Hebrew_Letter × $Double_Quote $Hebrew_Letter", - "7.3) $Hebrew_Letter $Double_Quote × $Hebrew_Letter", - "# Do not break within sequences of digits, or digits adjacent to letters (“3a”, or “A3”).", - "8) $Numeric \u00D7 $Numeric", - "9) $AHLetter \u00D7 $Numeric", - "10) $Numeric \u00D7 $AHLetter", - "# Do not break within sequences, such as “3.2” or “3,456.789”.", - "11) $Numeric ($MidNum | $MidNumLetQ) \u00D7 $Numeric", - "12) $Numeric \u00D7 ($MidNum | $MidNumLetQ) $Numeric", - "# Do not break between Katakana.", - "13) $Katakana \u00D7 $Katakana", - "# Do not break from extenders.", - "13.1) ($AHLetter | $Numeric | $Katakana | $ExtendNumLet) \u00D7 $ExtendNumLet", - "13.2) $ExtendNumLet \u00D7 ($AHLetter | $Numeric | $Katakana)", - - //"# Do not break within emoji modifier sequences.", - //"14) $E_Base × $E_Modifier", - - "# Do not break within emoji flag sequences. 
That is, do not break between regional indicator (RI) symbols if there is an odd number of RI characters before the break point.", - "15) ^ ($RI $RI)* $RI × $RI", - "16) [^$RI] ($RI $RI)* $RI × $RI", - "# Otherwise, break everywhere (including around ideographs).", - } }; + { + "GraphemeClusterBreak", + "$CR=\\p{Grapheme_Cluster_Break=CR}", + "$LF=\\p{Grapheme_Cluster_Break=LF}", + "$Control=\\p{Grapheme_Cluster_Break=Control}", + "$Extend=\\p{Grapheme_Cluster_Break=Extend}", + "$ZWJ=\\p{Grapheme_Cluster_Break=ZWJ}", + "$RI=\\p{Grapheme_Cluster_Break=Regional_Indicator}", + "$Prepend=\\p{Grapheme_Cluster_Break=Prepend}", + "$SpacingMark=\\p{Grapheme_Cluster_Break=SpacingMark}", + "$L=\\p{Grapheme_Cluster_Break=L}", + "$V=\\p{Grapheme_Cluster_Break=V}", + "$T=\\p{Grapheme_Cluster_Break=T}", + "$LV=\\p{Grapheme_Cluster_Break=LV}", + "$LVT=\\p{Grapheme_Cluster_Break=LVT}", + "$Virama=[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Virama}]", + "$LinkingConsonant=[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&\\p{Indic_Syllabic_Category=Consonant}]", + + // "$E_Base=\\p{Grapheme_Cluster_Break=E_Base}", + // "$E_Modifier=\\p{Grapheme_Cluster_Break=E_Modifier}", + + "$ExtPict=\\p{Extended_Pictographic}", + "$ExtCccZwj=[[$Extend-\\p{ccc=0}] $ZWJ]", + // "$EBG=\\p{Grapheme_Cluster_Break=E_Base_GAZ}", + // "$Glue_After_Zwj=\\p{Grapheme_Cluster_Break=Glue_After_Zwj}", + + "# Rules", + "# Break at the start and end of text, unless the text is empty.", + "# Do not break between a CR and LF. 
Otherwise, break before and after controls.", + "3) $CR \u00D7 $LF", + "4) ( $Control | $CR | $LF ) \u00F7", + "5) \u00F7 ( $Control | $CR | $LF )", + "# Do not break Hangul syllable sequences.", + "6) $L \u00D7 ( $L | $V | $LV | $LVT )", + "7) ( $LV | $V ) \u00D7 ( $V | $T )", + "8) ( $LVT | $T) \u00D7 $T", + "# Do not break before extending characters or ZWJ.", + // "9) \u00D7 ($Extend | $ZWJ | $Virama)", + "9) \u00D7 ($Extend | $ZWJ)", + "# Only for extended grapheme clusters: Do not break before SpacingMarks, or after Prepend characters.", + "9.1) \u00D7 $SpacingMark", + "9.2) $Prepend \u00D7", + "9.3) $LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* \u00D7 $LinkingConsonant", + "# Do not break within emoji modifier sequences or emoji zwj sequences.", + // "10) $E_Base $Extend* × $E_Modifier", + "11) $ExtPict $Extend* $ZWJ × $ExtPict", + "# Do not break within emoji flag sequences. That is, do not break between regional indicator (RI) symbols if there is an odd number of RI characters before the break point.", + "12) ^ ($RI $RI)* $RI × $RI", + "13) [^$RI] ($RI $RI)* $RI × $RI", + "# Otherwise, break everywhere.", + }, + { + "LineBreak", + "# Variables", + "$AI=\\p{Line_Break=Ambiguous}", + "$AL=\\p{Line_Break=Alphabetic}", + "$B2=\\p{Line_Break=Break_Both}", + "$BA=\\p{Line_Break=Break_After}", + "$BB=\\p{Line_Break=Break_Before}", + "$BK=\\p{Line_Break=Mandatory_Break}", + "$CB=\\p{Line_Break=Contingent_Break}", + "$CL=\\p{Line_Break=Close_Punctuation}", + "$CP=\\p{Line_Break=CP}", + "$CM1=\\p{Line_Break=Combining_Mark}", + "$CR=\\p{Line_Break=Carriage_Return}", + "$EX=\\p{Line_Break=Exclamation}", + "$GL=\\p{Line_Break=Glue}", + "$H2=\\p{Line_Break=H2}", + "$H3=\\p{Line_Break=H3}", + "$HL=\\p{Line_Break=HL}", + "$HY=\\p{Line_Break=Hyphen}", + "$ID=\\p{Line_Break=Ideographic}", + "$IN=\\p{Line_Break=Inseparable}", + "$IS=\\p{Line_Break=Infix_Numeric}", + "$JL=\\p{Line_Break=JL}", + "$JT=\\p{Line_Break=JT}", + "$JV=\\p{Line_Break=JV}", + 
"$LF=\\p{Line_Break=Line_Feed}", + "$NL=\\p{Line_Break=Next_Line}", + "$NS=\\p{Line_Break=Nonstarter}", + "$NU=\\p{Line_Break=Numeric}", + "$OP=\\p{Line_Break=Open_Punctuation}", + "$PO=\\p{Line_Break=Postfix_Numeric}", + "$PR=\\p{Line_Break=Prefix_Numeric}", + "$QU=\\p{Line_Break=Quotation}", + "$SA=\\p{Line_Break=Complex_Context}", + "$SG=\\p{Line_Break=Surrogate}", + "$SP=\\p{Line_Break=Space}", + "$SY=\\p{Line_Break=Break_Symbols}", + "$WJ=\\p{Line_Break=Word_Joiner}", + "$XX=\\p{Line_Break=Unknown}", + "$ZW=\\p{Line_Break=ZWSpace}", + "$CJ=\\p{Line_Break=Conditional_Japanese_Starter}", + "$RI=\\p{Line_Break=Regional_Indicator}", + "$EB=\\p{Line_Break=E_Base}", + "$EM=\\p{Line_Break=E_Modifier}", + "$ZWJ_O=\\p{Line_Break=ZWJ}", + "$ZWJ=\\p{Line_Break=ZWJ}", + "# Macros", + "$CM=[$CM1 $ZWJ]", + "# LB 1 Assign a line breaking class to each code point of the input. ", + "# Resolve AI, CB, SA, SG, and XX into other line breaking classes depending on criteria outside the scope of this algorithm.", + "# NOTE: CB is ok to fall through, but must handle others here.", + // "show $AL", + "$AL=[$AI $AL $SG $XX $SA]", + "$NS=[$NS $CJ]", + // "show $AL", + // "$oldAL=$AL", // for debugging + "# WARNING: Fixes for Rule 9", + "# Treat X (CM|ZWJ* as if it were X.", + "# Where X is any line break class except SP, BK, CR, LF, NL or ZW.", + "$X=$CM*", + "# Macros", + "$Spec1_=[$SP $BK $CR $LF $NL $ZW]", + "$Spec2_=[^ $SP $BK $CR $LF $NL $ZW]", + "$Spec3a_=[^ $SP $BA $HY $CM]", + "$Spec3b_=[^ $BA $HY $CM]", + "$Spec4_=[^ $NU $CM]", + "$AI=($AI $X)", + "$AL=($AL $X)", + "$B2=($B2 $X)", + "$BA=($BA $X)", + "$BB=($BB $X)", + "$CB=($CB $X)", + "$CL=($CL $X)", + "$CP=($CP $X)", + "$CM=($CM $X)", + // "$CM=($CM $X)", + "$EX=($EX $X)", + "$GL=($GL $X)", + "$H2=($H2 $X)", + "$H3=($H3 $X)", + "$HL=($HL $X)", + "$HY=($HY $X)", + "$ID=($ID $X)", + "$IN=($IN $X)", + "$IS=($IS $X)", + "$JL=($JL $X)", + "$JT=($JT $X)", + "$JV=($JV $X)", + "$NS=($NS $X)", + "$NU=($NU $X)", + "$OP=($OP $X)", + 
"$PO=($PO $X)", + "$PR=($PR $X)", + "$QU=($QU $X)", + "$SA=($SA $X)", + "$SG=($SG $X)", + "$SY=($SY $X)", + "$WJ=($WJ $X)", + "$XX=($XX $X)", + "$RI=($RI $X)", + "$EB=($EB $X)", + "$EM=($EM $X)", + "$ZWJ=($ZWJ $X)", + "# OUT OF ORDER ON PURPOSE", + "# LB 10 Treat any remaining combining mark as AL.", + "$AL=($AL | ^ $CM | (?<=$Spec1_) $CM)", + "# Rules", + "# LB 4 Always break after hard line breaks (but never between CR and LF).", + "4) $BK \u00F7", + "# LB 5 Treat CR followed by LF, as well as CR, LF and NL as hard line breaks.", + "5.01) $CR \u00D7 $LF", + "5.02) $CR \u00F7", + "5.03) $LF \u00F7", + "5.04) $NL \u00F7", + "# LB 6 Do not break before hard line breaks.", + "6) \u00D7 ( $BK | $CR | $LF | $NL )", + "# LB 7 Do not break before spaces or zero-width space.", + "7.01) \u00D7 $SP", + "7.02) \u00D7 $ZW", + "# LB 8 Break before any character following a zero-width space, even if one or more spaces intervene.", + "8) $ZW $SP* \u00F7", + "# LB 8a Don't break between ZWJ and IDs (for use in Emoji ZWJ sequences)", + "8.1) $ZWJ_O \u00D7", + "# LB 9 Do not break a combining character sequence; treat it as if it has the LB class of the base character", + "# in all of the following rules. (Where X is any line break class except SP, BK, CR, LF, NL or ZW.)", + "9) $Spec2_ \u00D7 $CM", + "#WARNING: this is done by modifying the variable values for all but SP.... 
That is, $AL is really ($AI $CM*)!", + "# LB 11 Do not break before or after WORD JOINER and related characters.", + "11.01) \u00D7 $WJ", + "11.02) $WJ \u00D7", + "# LB 12 Do not break after NBSP and related characters.", + // "12.01) [^$SP] \u00D7 $GL", + "12) $GL \u00D7", + "12.1) $Spec3a_ \u00D7 $GL", + "12.2) $Spec3b_ $CM+ \u00D7 $GL", + "12.3) ^ $CM+ \u00D7 $GL", + "# LB 13 Do not break before \u2018]\u2019 or \u2018!\u2019 or \u2018;\u2019 or \u2018/\u2019, even after spaces.", + "# Using customization 7.", + "13.01) \u00D7 $EX", + "13.02) $Spec4_ \u00D7 ($CL | $CP | $IS | $SY)", + "13.03) $Spec4_ $CM+ \u00D7 ($CL | $CP | $IS | $SY)", + "13.04) ^ $CM+ \u00D7 ($CL | $CP | $IS | $SY)", + // "13.03) $Spec4_ \u00D7 $IS", + // "13.04) $Spec4_ \u00D7 $SY", + "#LB 14 Do not break after \u2018[\u2019, even after spaces.", + "14) $OP $SP* \u00D7", + "# LB 15 Do not break within \u2018\"[\u2019, even with intervening spaces.", + "15) $QU $SP* \u00D7 $OP", + "# LB 16 Do not break between closing punctuation and a nonstarter (lb=NS), even with intervening spaces.", + "16) ($CL | $CP) $SP* \u00D7 $NS", + "# LB 17 Do not break within \u2018\u2014\u2014\u2019, even with intervening spaces.", + "17) $B2 $SP* \u00D7 $B2", + "# LB 18 Break after spaces.", + "18) $SP \u00F7", + "# LB 19 Do not break before or after \u2018\"\u2019.", + "19.01) \u00D7 $QU", + "19.02) $QU \u00D7", + "# LB 20 Break before and after unresolved CB.", + "20.01) \u00F7 $CB", + "20.02) $CB \u00F7", + "# LB 21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana and other non-starters, or after acute accents.", + "21.01) \u00D7 $BA", + "21.02) \u00D7 $HY", + "21.03) \u00D7 $NS", + "21.04) $BB \u00D7", + "# LB 21a Don't break after Hebrew + Hyphen.", + "21.1) $HL ($HY | $BA) \u00D7", + "# LB 21b Don’t break between Solidus and Hebrew letters.", + "21.2) $SY × $HL", + "# LB 22 Do not break between two ellipses, or between letters, numbers or exclamations and ellipsis.", + // 
"show $AL", + "22.01) ($AL | $HL) \u00D7 $IN", + "22.02) $EX \u00D7 $IN", + "22.03) ($ID | $EB | $EM) \u00D7 $IN", + "22.04) $IN \u00D7 $IN", + "22.05) $NU \u00D7 $IN", + "# LB 23 Do not break between digits and letters.", + // "23.01) ($ID | $EB | $EM) \u00D7 $PO", + "23.02) ($AL | $HL) \u00D7 $NU", + "23.03) $NU \u00D7 ($AL | $HL)", + "# LB 24 Do not break between prefix and letters or ideographs.", + "23.12) $PR \u00D7 ($ID | $EB | $EM)", + "23.13) ($ID | $EB | $EM) \u00D7 $PO", + "# LB24 Do not break between numeric prefix/postfix and letters, or between letters and prefix/postfix.", + "24.02) ($PR | $PO) \u00D7 ($AL | $HL)", + "24.03) ($AL | $HL) \u00D7 ($PR | $PO)", + "# Using customization 7", + "# LB Alternative: ( PR | PO) ? ( OP | HY ) ? NU (NU | SY | IS) * (CL | CP) ? ( PR | PO) ?", + "# Insert \u00D7 every place it could go. However, make sure that at least one thing is concrete, otherwise would cause $NU to not break before or after ", + "25.01) ($PR | $PO) \u00D7 ( $OP | $HY )? $NU", + "25.02) ( $OP | $HY ) \u00D7 $NU", + "25.03) $NU \u00D7 ($NU | $SY | $IS)", + "25.04) $NU ($NU | $SY | $IS)* \u00D7 ($NU | $SY | $IS | $CL | $CP)", + "25.05) $NU ($NU | $SY | $IS)* ($CL | $CP)? 
\u00D7 ($PO | $PR)", + "#LB 26 Do not break a Korean syllable.", + "26.01) $JL \u00D7 $JL | $JV | $H2 | $H3", + "26.02) $JV | $H2 \u00D7 $JV | $JT", + "26.03) $JT | $H3 \u00D7 $JT", + "# LB 27 Treat a Korean Syllable Block the same as ID.", + "27.01) $JL | $JV | $JT | $H2 | $H3 \u00D7 $PO", + "27.02) $PR \u00D7 $JL | $JV | $JT | $H2 | $H3", + "# LB 28 Do not break between alphabetics (\"at\").", + "28) ($AL | $HL) \u00D7 ($AL | $HL)", + "# LB 29 Do not break between numeric punctuation and alphabetics (\"e.g.\").", + "29) $IS \u00D7 ($AL | $HL)", + "# LB 30 Do not break between letters, numbers or ordinary symbols and opening or closing punctuation.", + "30.01) ($AL | $HL | $NU) \u00D7 $OP", + "30.02) $CP \u00D7 ($AL | $HL | $NU)", + "# LB 30a Break between two Regional Indicators if and only if there is an even number of them before the point being considered.", + "30.11) ^ ($RI $RI)* $RI × $RI", + "30.12) [^$RI] ($RI $RI)* $RI × $RI", + "30.13) $RI ÷ $RI", + "30.2) $EB × $EM", + }, + { + "SentenceBreak", + "$CR=\\p{Sentence_Break=CR}", + "$LF=\\p{Sentence_Break=LF}", + "$Extend=\\p{Sentence_Break=Extend}", + "$Format=\\p{Sentence_Break=Format}", + "$Sep=\\p{Sentence_Break=Sep}", + "$Sp=\\p{Sentence_Break=Sp}", + "$Lower=\\p{Sentence_Break=Lower}", + "$Upper=\\p{Sentence_Break=Upper}", + "$OLetter=\\p{Sentence_Break=OLetter}", + "$Numeric=\\p{Sentence_Break=Numeric}", + "$ATerm=\\p{Sentence_Break=ATerm}", + "$STerm=\\p{Sentence_Break=STerm}", + "$Close=\\p{Sentence_Break=Close}", + "$SContinue=\\p{Sentence_Break=SContinue}", + "$Any=.", + // "# subtract Format from Control, since we don't want to break before/after", + // "$Control=[$Control-$Format]", + "# Expresses the negation in rule 8; can't do this with normal regex, but works with UnicodeSet, which is all we need.", + // "$NotStuff=[^$OLetter $Upper $Lower $Sep]", + // "# $ATerm and $Sterm are temporary, to match ICU until UTC decides.", + + "# WARNING: For Rule 5, now add format and extend to everything 
but Sep, Format, and Extend", + "$FE=[$Format $Extend]", + "# Special rules", + "$NotPreLower_=[^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm]", + // "$NotSep_=[^ $Sep $CR $LF]", + + // "$FE=$Extend* $Format*", + "$Sp=($Sp $FE*)", + "$Lower=($Lower $FE*)", + "$Upper=($Upper $FE*)", + "$OLetter=($OLetter $FE*)", + "$Numeric=($Numeric $FE*)", + "$ATerm=($ATerm $FE*)", + "$STerm=($STerm $FE*)", + "$Close=($Close $FE*)", + "$SContinue=($SContinue $FE*)", + "# Macros", + "$ParaSep = ($Sep | $CR | $LF)", + "$SATerm = ($STerm | $ATerm)", + "# Rules", + "# Break at the start and end of text, unless the text is empty.", + "# Do not break within CRLF.", + "3) $CR \u00D7 $LF", + "# Break after paragraph separators.", + "4) $ParaSep \u00F7", + // "3.4) ( $Control | $CR | $LF ) \u00F7", + // "3.5) \u00F7 ( $Control | $CR | $LF )", + "# Ignore Format and Extend characters, except after sot, ParaSep, and within CRLF. (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend)", + "# WARNING: Implemented as don't break before format (except after linebreaks),", + "# AND add format and extend in all variables definitions that appear after this point!", + // "3.91) [^$Control | $CR | $LF] \u00D7 $Extend", + "5) \u00D7 [$Format $Extend]", + "# Do not break after full stop in certain contexts. 
[See note below.]", + "# Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter,", + "# is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase.", + "# For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.", + "6) $ATerm \u00D7 $Numeric", + "7) ($Upper | $Lower) $ATerm \u00D7 $Upper", + "8) $ATerm $Close* $Sp* \u00D7 $NotPreLower_* $Lower", + "8.1) $SATerm $Close* $Sp* \u00D7 ($SContinue | $SATerm)", + "# Break after sentence terminators, but include closing punctuation, trailing spaces, and any paragraph separator. [See note below.] Include closing punctuation, trailing spaces, and (optionally) a paragraph separator.", + "9) $SATerm $Close* \u00D7 ( $Close | $Sp | $ParaSep )", + "# Note the fix to $Sp*, $Sep?", + "10) $SATerm $Close* $Sp* \u00D7 ( $Sp | $ParaSep )", + "11) $SATerm $Close* $Sp* $ParaSep? \u00F7", + "#Otherwise, do not break", + "998) \u00D7 $Any", + }, + { + "WordBreak", + "$CR=\\p{Word_Break=CR}", + "$LF=\\p{Word_Break=LF}", + "$Newline=\\p{Word_Break=Newline}", + // "$Control=\\p{Word_Break=Control}", + "$Extend=\\p{Word_Break=Extend}", + // "$NEWLINE=[$CR $LF \\u0085 \\u000B \\u000C \\u2028 \\u2029]", + // "$Sep=\\p{Sentence_Break=Sep}", + "# Now normal variables", + "$Format=\\p{Word_Break=Format}", + "$Katakana=\\p{Word_Break=Katakana}", + "$ALetter=\\p{Word_Break=ALetter}", + "$MidLetter=\\p{Word_Break=MidLetter}", + "$MidNum=\\p{Word_Break=MidNum}", + "$MidNumLet=\\p{Word_Break=MidNumLet}", + "$Numeric=\\p{Word_Break=Numeric}", + "$ExtendNumLet=\\p{Word_Break=ExtendNumLet}", + "$RI=\\p{Word_Break=Regional_Indicator}", + "$Hebrew_Letter=\\p{Word_Break=Hebrew_Letter}", + "$Double_Quote=\\p{Word_Break=Double_Quote}", + "$Single_Quote=\\p{Word_Break=Single_Quote}", + + // "$E_Base=\\p{Word_Break=E_Base}", + // "$E_Modifier=\\p{Word_Break=E_Modifier}", + "$ZWJ=\\p{Word_Break=ZWJ}", + 
"$ExtPict=\\p{Extended_Pictographic}", + + // "$EBG=\\p{Word_Break=E_Base_GAZ}", + // "$Glue_After_Zwj=\\p{Word_Break=Glue_After_Zwj}", + + "$WSegSpace=\\p{Word_Break=WSegSpace}", + "# Macros", + "$AHLetter=($ALetter | $Hebrew_Letter)", + "$MidNumLetQ=($MidNumLet | $Single_Quote)", + "# WARNING: For Rule 4: Fixes for GC, Format", + // "# Subtract Format from Control, since we don't want to break before/after", + // "$Control=[$Control-$Format]", + "# Add format and extend to everything", + "$FE=[$Format $Extend $ZWJ]", + "# Special rules", + "$NotBreak_=[^ $Newline $CR $LF ]", + // "$FE= ($Extend | $Format)*", + "$Katakana=($Katakana $FE*)", + "$ALetter=($ALetter $FE*)", + "$MidLetter=($MidLetter $FE*)", + "$MidNum=($MidNum $FE*)", + "$MidNumLet=($MidNumLet $FE*)", + "$Numeric=($Numeric $FE*)", + "$ExtendNumLet=($ExtendNumLet $FE*)", + "$RI=($RI $FE*)", + "$Hebrew_Letter=($Hebrew_Letter $FE*)", + "$Double_Quote=($Double_Quote $FE*)", + "$Single_Quote=($Single_Quote $FE*)", + + // "$E_Base=($E_Base $FE*)", + // "$E_Modifier=($E_Modifier $FE*)", + // "$ZWJ=($ZWJ $FE*)", don't do this one! + // "$Glue_After_Zwj=($Glue_After_Zwj $FE*)", + // "$EBG=($EBG $FE*)", + + "$AHLetter=($AHLetter $FE*)", + "$MidNumLetQ=($MidNumLetQ $FE*)", + "# Rules", + "# Break at the start and end of text, unless the text is empty.", + "# Do not break within CRLF.", + "3) $CR \u00D7 $LF", + "# Otherwise break before and after Newlines (including CR and LF)", + "3.1) ($Newline | $CR | $LF) \u00F7", + "3.2) \u00F7 ($Newline | $CR | $LF)", + "# Do not break within emoji zwj sequences.", + "3.3) $ZWJ × $ExtPict", + "3.4) $WSegSpace × $WSegSpace", + + // "3.4) ( $Control | $CR | $LF ) \u00F7", + // "3.5) \u00F7 ( $Control | $CR | $LF )", + // "3.9) \u00D7 $Extend", + // "3.91) [^$Control | $CR | $LF] \u00D7 $Extend", + "# Ignore Format and Extend characters, except after sot, CR, LF, and Newline. (See Section 6.2, Replacing Ignore Rules.) 
This also has the effect of: Any × (Format | Extend)", + "# WARNING: Implemented as don't break before format (except after linebreaks),", + "# AND add format and extend in all variables definitions that appear after this point!", + // "4) \u00D7 [$Format $Extend]", + "4) $NotBreak_ \u00D7 [$Format $Extend $ZWJ]", + "# Vanilla rules", + "# Do not break between most letters.", + "5) $AHLetter \u00D7 $AHLetter", + "# Do not break letters across certain punctuation.", + "6) $AHLetter \u00D7 ($MidLetter | $MidNumLetQ) $AHLetter", + "7) $AHLetter ($MidLetter | $MidNumLetQ) \u00D7 $AHLetter", + "7.1) $Hebrew_Letter × $Single_Quote", + "7.2) $Hebrew_Letter × $Double_Quote $Hebrew_Letter", + "7.3) $Hebrew_Letter $Double_Quote × $Hebrew_Letter", + "# Do not break within sequences of digits, or digits adjacent to letters (“3a”, or “A3”).", + "8) $Numeric \u00D7 $Numeric", + "9) $AHLetter \u00D7 $Numeric", + "10) $Numeric \u00D7 $AHLetter", + "# Do not break within sequences, such as “3.2” or “3,456.789”.", + "11) $Numeric ($MidNum | $MidNumLetQ) \u00D7 $Numeric", + "12) $Numeric \u00D7 ($MidNum | $MidNumLetQ) $Numeric", + "# Do not break between Katakana.", + "13) $Katakana \u00D7 $Katakana", + "# Do not break from extenders.", + "13.1) ($AHLetter | $Numeric | $Katakana | $ExtendNumLet) \u00D7 $ExtendNumLet", + "13.2) $ExtendNumLet \u00D7 ($AHLetter | $Numeric | $Katakana)", + + // "# Do not break within emoji modifier sequences.", + // "14) $E_Base × $E_Modifier", + + "# Do not break within emoji flag sequences. 
That is, do not break between regional indicator (RI) symbols if there is an odd number of RI characters before the break point.", + "15) ^ ($RI $RI)* $RI × $RI", + "16) [^$RI] ($RI $RI)* $RI × $RI", + "# Otherwise, break everywhere (including around ideographs).", + } + }; public static void main(String[] args) throws IOException { for (int i = 0; i < cannedRules.length; ++i) { String type = cannedRules[i][0]; boolean hadHash = false; - try (PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "segmentation/", type + "Rules.txt")) { + try (PrintWriter out = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_DIR + "segmentation/", type + "Rules.txt")) { out.println("# Segmentation rules for " + type); out.println("#"); out.println("# Character Classes"); @@ -1264,31 +1303,38 @@ public static void main(String[] args) throws IOException { } } - try (PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR + "cldr/segmentation/", "rootAddon.xml")) { - out.println("\n" - + "\n" - + "\n" - + "\n" - + "\t\n" - + "\t\t\n" - + "\t\t\n" - + "\t\n" - + "\t"); - for (final String type : new String[] {"GraphemeClusterBreak", "LineBreak", "SentenceBreak", "WordBreak"}) { - final Builder segBuilder = Segmenter.make(ToolUnicodePropertySource.make(Default.ucdVersion()), type); - out.print(segBuilder.toString(type,"\t\t")); + try (PrintWriter out = + FileUtilities.openUTF8Writer( + Settings.Output.GEN_DIR + "cldr/segmentation/", "rootAddon.xml")) { + out.println( + "\n" + + "\n" + + "\n" + + "\n" + + "\t\n" + + "\t\t\n" + + "\t\t\n" + + "\t\n" + + "\t"); + for (final String type : + new String[] { + "GraphemeClusterBreak", "LineBreak", "SentenceBreak", "WordBreak" + }) { + final Builder segBuilder = + Segmenter.make(ToolUnicodePropertySource.make(Default.ucdVersion()), type); + out.print(segBuilder.toString(type, "\t\t")); if (type.equals("")) { - out.print("\t\t\t\n" - + "\t\t\t\t\n" - + "\t\t\t\n"); + out.print( + "\t\t\t\n" + + "\t\t\t\t\n" + 
+ "\t\t\t\n"); } } - out.println("\t\n" - + ""); + out.println("\t\n" + ""); } } } diff --git a/unicodetools/src/main/java/org/unicode/tools/ShowScriptCategories.java b/unicodetools/src/main/java/org/unicode/tools/ShowScriptCategories.java index 3d74c2b31..f1073e22b 100644 --- a/unicodetools/src/main/java/org/unicode/tools/ShowScriptCategories.java +++ b/unicodetools/src/main/java/org/unicode/tools/ShowScriptCategories.java @@ -1,12 +1,16 @@ package org.unicode.tools; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.text.LocaleDisplayNames; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; - import org.unicode.cldr.draft.ScriptMetadata; import org.unicode.cldr.draft.ScriptMetadata.IdUsage; import org.unicode.cldr.draft.ScriptMetadata.Info; @@ -19,26 +23,23 @@ import org.unicode.props.UcdPropertyValues.Script_Values; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.text.LocaleDisplayNames; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ULocale; - public class ShowScriptCategories { static final SupplementalDataInfo SDI = CLDRConfig.getInstance().getSupplementalDataInfo(); static final Relation core = SDI.getContainmentCore(); static final LocaleDisplayNames ldn = LocaleDisplayNames.getInstance(ULocale.ENGLISH); - public static final IndexUnicodeProperties IUP = IndexUnicodeProperties.make(GenerateEnums.ENUM_VERSION); - public static UnicodeMap SCRIPT = IUP.loadEnum(UcdProperty.Script, Script_Values.class); + public static final IndexUnicodeProperties IUP = + IndexUnicodeProperties.make(GenerateEnums.ENUM_VERSION); + public static UnicodeMap SCRIPT = + IUP.loadEnum(UcdProperty.Script, Script_Values.class); public static UnicodeMap VERSION = 
IUP.loadEnum(UcdProperty.Age, Age_Values.class); public static UnicodeMap NAME = IUP.load(UcdProperty.Name); public static void main(String[] args) { - Map> rows = getRows(Collections.singletonList("001"), "001", new TreeMap<>()); -// for (Entry> s : rows.entrySet()) { -// System.out.println(s.getKey() + "\t" + s.getValue()); -// } + Map> rows = + getRows(Collections.singletonList("001"), "001", new TreeMap<>()); + // for (Entry> s : rows.entrySet()) { + // System.out.println(s.getKey() + "\t" + s.getValue()); + // } for (Script_Values scriptv : SCRIPT.values()) { if (scriptv == Script_Values.Unknown) { continue; @@ -55,50 +56,63 @@ public static void main(String[] args) { String subcontinent = row == null ? "ZZ" : row.get(2); int size = SCRIPT.getSet(scriptv).size(); - - final String continentString = ldn.regionDisplayName(continent) + " (" + continent + ")"; - final String subcontinentString = ldn.regionDisplayName(subcontinent) + " (" + subcontinent + ")"; + + final String continentString = + ldn.regionDisplayName(continent) + " (" + continent + ")"; + final String subcontinentString = + ldn.regionDisplayName(subcontinent) + " (" + subcontinent + ")"; final String countryString = ldn.regionDisplayName(country) + " (" + country + ")"; String scriptName = ldn.scriptDisplayName(script); if (scriptName.equals(script)) { scriptName = scriptv.toString().replace('_', ' '); } - final String usageName = script.equals("Brai") || script.equals("Sgnw") ? "Symbol" : usageName(info.idUsage); - System.out.println( - (continent.equals("142") - ? subcontinentString - : continentString) - + "\t" + usageName - + "\t" + scriptName - + "\t" + scriptv.toString() - + "\t" + script - + "\t" + size - + "\t" + continentString - + "\t" + subcontinentString - + "\t" + countryString - ); + final String usageName = + script.equals("Brai") || script.equals("Sgnw") + ? "Symbol" + : usageName(info.idUsage); + System.out.println( + (continent.equals("142") ? 
subcontinentString : continentString) + + "\t" + + usageName + + "\t" + + scriptName + + "\t" + + scriptv.toString() + + "\t" + + script + + "\t" + + size + + "\t" + + continentString + + "\t" + + subcontinentString + + "\t" + + countryString); } System.out.println(); - for (String s : new UnicodeSet(VERSION.getSet(Age_Values.V9_0)).retainAll(SCRIPT.getSet(Script_Values.Common))) { + for (String s : + new UnicodeSet(VERSION.getSet(Age_Values.V9_0)) + .retainAll(SCRIPT.getSet(Script_Values.Common))) { System.out.println("U+" + Utility.hex(s) + "\t" + NAME.get(s)); } } private static String usageName(IdUsage idUsage) { - switch(idUsage) { - case ASPIRATIONAL: - case LIMITED_USE: - return "Limited Use"; - case EXCLUSION: - return "Historic"; - case RECOMMENDED: - case UNKNOWN: - default: - return "Modern"; + switch (idUsage) { + case ASPIRATIONAL: + case LIMITED_USE: + return "Limited Use"; + case EXCLUSION: + return "Historic"; + case RECOMMENDED: + case UNKNOWN: + default: + return "Modern"; } } - private static Map> getRows(List list, String territoryCode, Map> target) { + private static Map> getRows( + List list, String territoryCode, Map> target) { Set contained = core.get(territoryCode); if (contained == null) { target.put(territoryCode, Collections.unmodifiableList(list)); diff --git a/unicodetools/src/main/java/org/unicode/tools/Subheader.java b/unicodetools/src/main/java/org/unicode/tools/Subheader.java index 5f2153f2d..d40c8f9dd 100644 --- a/unicodetools/src/main/java/org/unicode/tools/Subheader.java +++ b/unicodetools/src/main/java/org/unicode/tools/Subheader.java @@ -1,8 +1,12 @@ -/** - * - */ +/** */ package org.unicode.tools; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UTF16.StringComparator; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; @@ 
-17,16 +21,8 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Matcher; - import org.unicode.cldr.util.PatternCache; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UTF16.StringComparator; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; - class Subheader { Matcher isArchaic = GeneratePickerData.IS_ARCHAIC.matcher(""); Matcher subheadMatcher = PatternCache.get("(@+)\\s+(.*)").matcher(""); @@ -44,7 +40,8 @@ class Subheader { // if (false) { // if (GeneratePickerData.DEBUG) // System.out.println("*** Fixing plurals"); - // for (java.util.Iterator it = subblock2UnicodeSet.keySet().iterator(); it.hasNext();) { + // for (java.util.Iterator it = subblock2UnicodeSet.keySet().iterator(); + // it.hasNext();) { // String subblock = it.next(); // final String pluralSubblock = subblock + "s"; // UnicodeSet plural = subblock2UnicodeSet.get(pluralSubblock); @@ -61,12 +58,15 @@ class Subheader { for (String subblock : subblock2UnicodeSet.keySet()) { final UnicodeSet uset = subblock2UnicodeSet.get(subblock); - for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.next();) { + for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.next(); ) { codePoint2Subblock.put(it.codepoint, subblock); - String block = UCharacter - .getStringPropertyValue(UProperty.BLOCK, it.codepoint, UProperty.NameChoice.LONG).toString() - .replace('_', ' ').intern(); + String block = + UCharacter.getStringPropertyValue( + UProperty.BLOCK, it.codepoint, UProperty.NameChoice.LONG) + .toString() + .replace('_', ' ') + .intern(); Set set = block2subblock.get(block); if (set == null) { @@ -81,7 +81,9 @@ class Subheader { set.add(block); String name = UCharacter.getExtendedName(it.codepoint); - if (isArchaic.reset(block).find() || isArchaic.reset(subblock).find() || isArchaic.reset(name).find()) { + if (isArchaic.reset(block).find() + || 
isArchaic.reset(subblock).find() + || isArchaic.reset(name).find()) { archaicSubblock.add(it.codepoint); } } @@ -92,19 +94,34 @@ class Subheader { private void writeBlockInfo(String outputDirectory) throws IOException, FileNotFoundException { System.out.println("***Block/Subblock start"); - PrintWriter out = GeneratePickerData.getFileWriter(outputDirectory, "blocks_subblocks.html"); + PrintWriter out = + GeneratePickerData.getFileWriter(outputDirectory, "blocks_subblocks.html"); htmlHeader(out); - out.println("
"); + out.println( + ""); for (String block : block2subblock.keySet()) { final Set set = block2subblock.get(block); for (String subblock2 : set) { - out.println(""); + out.println( + ""); } } out.println("
" + count + "" + UTF16.valueOf(codepoint) + "\n" - + show(breakdown) - + "" + source + "" + CollectionUtilities.join(breakdown, "
") + "
" + + count + + "" + + UTF16.valueOf(codepoint) + + "\n" + + show(breakdown) + + "" + + source + + "" + + CollectionUtilities.join(breakdown, "
") + + "
" + "Block" + "" + "Notes" + "" + "Subblock" + "
" + + "Block" + + "" + + "Notes" + + "" + + "Subblock" + + "
" + block + "" + - (subblock2.equalsIgnoreCase(block) || subblock2.equalsIgnoreCase(block + "s") ? "duplicate" : "") + - (set.size() < 2 ? " singleton" : "") - + "\u00a0" - + "" + subblock2 + - "
" + + block + + "" + + (subblock2.equalsIgnoreCase(block) + || subblock2.equalsIgnoreCase(block + "s") + ? "duplicate" + : "") + + (set.size() < 2 ? " singleton" : "") + + "\u00a0" + + "" + + subblock2 + + "
"); @@ -114,7 +131,14 @@ private void writeBlockInfo(String outputDirectory) throws IOException, FileNotF out = GeneratePickerData.getFileWriter(outputDirectory, "subblocks_blocks.html"); htmlHeader(out); - out.println("" + "Subblock" + "" + "Notes" + "" + "Blocks" + ""); + out.println( + "" + + "Subblock" + + "" + + "Notes" + + "" + + "Blocks" + + ""); StringComparator caseless = new UTF16.StringComparator(true, true, 0); TreeSet tests = new TreeSet(caseless); tests.addAll(subblock2block.keySet()); @@ -123,10 +147,14 @@ private void writeBlockInfo(String outputDirectory) throws IOException, FileNotF final String first = set.iterator().next(); String otherString = String.valueOf(set); otherString = otherString.substring(1, otherString.length() - 1) + '\u00a0'; - out.println("" + subblock2 - + "" + getComments(subblock2, tests) - + "" + otherString - + ""); + out.println( + "" + + subblock2 + + "" + + getComments(subblock2, tests) + + "" + + otherString + + ""); } System.out.println("***Block/Subblock end"); out.close(); @@ -135,34 +163,30 @@ private void writeBlockInfo(String outputDirectory) throws IOException, FileNotF private String getComments(String subblock2, Set keySet) { if (keySet.contains(subblock2 + "s") - || keySet.contains("Additional " + subblock2) - || keySet.contains("Additional " + subblock2 + "s") - || keySet.contains("Other " + subblock2) - || keySet.contains("Other " + subblock2 + "s") - || keySet.contains("Miscellaneous " + subblock2) - || keySet.contains("Miscellaneous " + subblock2 + "s")) return "has-longer"; + || keySet.contains("Additional " + subblock2) + || keySet.contains("Additional " + subblock2 + "s") + || keySet.contains("Other " + subblock2) + || keySet.contains("Other " + subblock2 + "s") + || keySet.contains("Miscellaneous " + subblock2) + || keySet.contains("Miscellaneous " + subblock2 + "s")) return "has-longer"; return "\u00a0"; } private void htmlHeader(PrintWriter out) { - out.println("" - + - "" - + - "" + - "" + - "" - ); 
+ out.println( + "" + + "" + + "" + + "" + + "
"); } - private String getDataFromFile(String dir, String filenameRegex) throws FileNotFoundException, IOException { + private String getDataFromFile(String dir, String filenameRegex) + throws FileNotFoundException, IOException { String subblock = "?"; File actualName = getFileNameFromPattern(dir, filenameRegex); BufferedReader in = new BufferedReader(new FileReader(actualName)); @@ -194,8 +218,13 @@ public static File getFileNameFromPattern(String directory, String filenameRegex } String[] files = dir.list(new RegexFileFilter(filenameRegex)); if (files.length != 1) { - throw new IllegalArgumentException("Not a unique match for : " + dir.getCanonicalPath() + " / " - + filenameRegex + " : " + Arrays.asList(files)); + throw new IllegalArgumentException( + "Not a unique match for : " + + dir.getCanonicalPath() + + " / " + + filenameRegex + + " : " + + Arrays.asList(files)); } return new File(directory, files[0]); } catch (IOException e) { @@ -222,4 +251,4 @@ public void set(String regex) { String getSubheader(int codepoint) { return codePoint2Subblock.get(codepoint); } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/tools/TestSegments.java b/unicodetools/src/main/java/org/unicode/tools/TestSegments.java index 41e768a14..e748ff9c6 100644 --- a/unicodetools/src/main/java/org/unicode/tools/TestSegments.java +++ b/unicodetools/src/main/java/org/unicode/tools/TestSegments.java @@ -6,57 +6,51 @@ */ package org.unicode.tools; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.Log; +import org.unicode.jsp.ICUPropertyFactory; import 
org.unicode.props.RandomStringGenerator; import org.unicode.props.UnicodeProperty; import org.unicode.tools.Segmenter.Rule.Breaks; -import com.ibm.icu.text.BreakIterator; -import com.ibm.icu.text.RuleBasedBreakIterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - -import org.unicode.jsp.ICUPropertyFactory; - /** - * Quick class for testing proposed syntax for Segments. - * TODO doesn't yet handle supplementaries. It looks like even Java 5 won't help, since it doesn't have syntax for them. - * Will have to change [...X-Y] into ([...] | X1 [Y1-\uDFFF] | [X2-X3][\uDC00-\uDFFF] | X4[\uD800-Y2) - * where the X1,Y1 is the first surrogate pair, and X4,Y2 is the last (2nd and 3rd ranges are only if X4 != X2). + * Quick class for testing proposed syntax for Segments. TODO doesn't yet handle supplementaries. It + * looks like even Java 5 won't help, since it doesn't have syntax for them. Will have to change + * [...X-Y] into ([...] | X1 [Y1-\uDFFF] | [X2-X3][\uDC00-\uDFFF] | X4[\uD800-Y2) where the X1,Y1 is + * the first surrogate pair, and X4,Y2 is the last (2nd and 3rd ranges are only if X4 != X2). * * @author davis */ - public class TestSegments { private static final boolean TESTING = true; static String indent = "\t\t"; // static String indent = ""; - /** - * Shows the rule that caused the result at each offset. - */ + /** Shows the rule that caused the result at each offset. 
*/ private static final boolean DEBUG_SHOW_MATCHES = false; + private static final boolean SHOW_RULE_LIST = false; private static final int monkeyLimit = 1000, monkeyStringCount = 10; -// private static final Matcher flagItems = PatternCache.get( -// "[$](BK|CR|LF|CM|NL|WJ|ZW|GL|SP|CB)").matcher(""); + // private static final Matcher flagItems = PatternCache.get( + // "[$](BK|CR|LF|CM|NL|WJ|ZW|GL|SP|CB)").matcher(""); /** * Quick test of features for debugging * - * @param args - * unused + * @param args unused * @throws IOException */ public static void main(String[] args) throws IOException { @@ -71,7 +65,7 @@ public static void main(String[] args) throws IOException { Log.println("\t"); if (args.length == 0) - args = new String[] { "GraphemeClusterBreak", "LineBreak", "SentenceBreak", "WordBreak" }; + args = new String[] {"GraphemeClusterBreak", "LineBreak", "SentenceBreak", "WordBreak"}; List testChoice = Arrays.asList(args); UnicodeProperty.Factory propFactory = ICUPropertyFactory.make(); @@ -123,7 +117,8 @@ public static void main(String[] args) throws IOException { showingBreaks += '|'; } if (DEBUG_SHOW_MATCHES && rl.getBreakRule() >= 0) { - showingBreaks += "\u00AB" + Segmenter.nf.format(rl.getBreakRule()) + "\u00BB"; + showingBreaks += + "\u00AB" + Segmenter.nf.format(rl.getBreakRule()) + "\u00BB"; } if (k < line.length()) showingBreaks += line.charAt(k); } @@ -144,7 +139,8 @@ private static void debugRule(Segmenter.Builder rb) { String testStr = "\uA80D/\u0745\u2026"; for (int k = 0; k < testStr.length(); ++k) { boolean inside = oldALSet.contains(testStr.charAt(k)); - System.out.println(k + ": " + inside + com.ibm.icu.impl.Utility.escape("" + testStr.charAt(k))); + System.out.println( + k + ": " + inside + com.ibm.icu.impl.Utility.escape("" + testStr.charAt(k))); } Breaks m = rule.matches(testStr, 3); } @@ -189,10 +185,18 @@ private static void doCompare(UnicodeProperty.Factory factory, Segmenter rl, Str System.out.println(); gotDot = false; } - 
System.out.println(line + "\tMismatch at Line\t" + i - + ",\toffset\t" + j - + ",\twith Rule\t" + rl.getBreakRule() - + ":\t" + (icuBreakResults ? "ICU Breaks, CLDR Doesn't" : "ICU Doesn't, CLDR Breaks")); + System.out.println( + line + + "\tMismatch at Line\t" + + i + + ",\toffset\t" + + j + + ",\twith Rule\t" + + rl.getBreakRule() + + ":\t" + + (icuBreakResults + ? "ICU Breaks, CLDR Doesn't" + : "ICU Doesn't, CLDR Breaks")); System.out.println(showResults(test, j, rsg, icuBreakResults)); rl.breaksAt(test, j); // for debugging } @@ -216,35 +220,45 @@ static boolean equalStatus(int[] status1, int len1, int[] status2, int len2) { return true; } - private static String showResults(String test, int j, RandomStringGenerator rsg, boolean icuBreakResults) { + private static String showResults( + String test, int j, RandomStringGenerator rsg, boolean icuBreakResults) { StringBuffer results = new StringBuffer(); int cp; for (int i = 0; i < test.length(); i += UTF16.getCharCount(cp)) { if (i == j) - results.append(icuBreakResults ? "<" + CldrUtility.LINE_SEPARATOR + "$ >" : "<" - + CldrUtility.LINE_SEPARATOR + "@ >"); + results.append( + icuBreakResults + ? "<" + CldrUtility.LINE_SEPARATOR + "$ >" + : "<" + CldrUtility.LINE_SEPARATOR + "@ >"); cp = UTF16.charAt(test, i); - results.append("[" + rsg.getValue(cp) + ":" + com.ibm.icu.impl.Utility.escape(UTF16.valueOf(cp)) + "]"); + results.append( + "[" + + rsg.getValue(cp) + + ":" + + com.ibm.icu.impl.Utility.escape(UTF16.valueOf(cp)) + + "]"); } if (test.length() == j) - results.append(icuBreakResults ? "<" + CldrUtility.LINE_SEPARATOR + "$ >" : "<" - + CldrUtility.LINE_SEPARATOR + "@ >"); + results.append( + icuBreakResults + ? 
"<" + CldrUtility.LINE_SEPARATOR + "$ >" + : "<" + CldrUtility.LINE_SEPARATOR + "@ >"); return results.toString(); } - /** - * For quickly checking regex syntax implications in Java - */ + /** For quickly checking regex syntax implications in Java */ private static boolean quickCheck() { String[][] rtests = { { - ".*" + new UnicodeSet("[\\p{Grapheme_Cluster_Break=LVT}]").complement().complement(), + ".*" + + new UnicodeSet("[\\p{Grapheme_Cluster_Break=LVT}]") + .complement() + .complement(), "\u001E\uC237\u1123\n\uC91B" - }, { - "(?<=a)b", "ab" - }, { - "[$]\\p{Alpha}\\p{Alnum}*", "$Letter" - } }; + }, + {"(?<=a)b", "ab"}, + {"[$]\\p{Alpha}\\p{Alnum}*", "$Letter"} + }; for (int i = 0; i < rtests.length; ++i) { Matcher m = Pattern.compile(rtests[i][0], Segmenter.REGEX_FLAGS).matcher(""); m.reset(rtests[i][1]); @@ -255,14 +269,7 @@ private static boolean quickCheck() { } static final String[][] tests = { - { - "QuickCheck", - "1) \u00F7 b", - "2) \u00D7 .", - "0.5) a \u00D7", - "test", - "abcbdb" - }, + {"QuickCheck", "1) \u00F7 b", "2) \u00D7 .", "0.5) a \u00D7", "test", "abcbdb"}, { "QuickCheck2", "$Letter=\\p{Alphabetic}", @@ -272,31 +279,30 @@ private static boolean quickCheck() { "test", "The quick 100 brown foxes." }, - { - "GraphemeClusterBreak", - "test", - "The qui\u0300ck 100 brown foxes.", - "compareGrapheme" - }, + {"GraphemeClusterBreak", "test", "The qui\u0300ck 100 brown foxes.", "compareGrapheme"}, { "LineBreak", "test", "\uCD40\u1185", "http://www.cs.tut.fi/%7Ejkorpela/html/nobr.html?abcd=high&hijk=low#anchor", - "T\u0300he qui\u0300ck 100.1 brown" + CldrUtility.LINE_SEPARATOR - + "\u0300foxes. And the beginning. \"Hi?\" Nope! or not.", + "T\u0300he qui\u0300ck 100.1 brown" + + CldrUtility.LINE_SEPARATOR + + "\u0300foxes. And the beginning. \"Hi?\" Nope! or not.", "compareLine" }, { "SentenceBreak", "test", - "T\u0300he qui\u0300ck 100.1 brown" + CldrUtility.LINE_SEPARATOR - + "\u0300foxes. And the beginning. \"Hi?\" Nope! 
or not.", + "T\u0300he qui\u0300ck 100.1 brown" + + CldrUtility.LINE_SEPARATOR + + "\u0300foxes. And the beginning. \"Hi?\" Nope! or not.", "compareSentence" - }, { + }, + { "WordBreak", "test", "T\u0300he qui\u0300ck 100.1 brown" + CldrUtility.LINE_SEPARATOR + "\u0300foxes.", "compareWord" - } }; + } + }; } diff --git a/unicodetools/src/main/java/org/unicode/tools/Unilex.java b/unicodetools/src/main/java/org/unicode/tools/Unilex.java index 6d6c90725..725d85369 100644 --- a/unicodetools/src/main/java/org/unicode/tools/Unilex.java +++ b/unicodetools/src/main/java/org/unicode/tools/Unilex.java @@ -1,28 +1,5 @@ package org.unicode.tools; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Iterator; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeMap; -import java.util.TreeSet; -import java.util.function.Consumer; - -import org.unicode.cldr.draft.FileUtilities; -import org.unicode.cldr.util.Counter; -import org.unicode.cldr.util.With; -import org.unicode.props.IndexUnicodeProperties; -import org.unicode.props.UcdProperty; -import org.unicode.props.UcdPropertyValues.General_Category_Values; -import org.unicode.text.utility.Utility; - import com.google.common.base.CharMatcher; import com.google.common.base.Objects; import com.google.common.base.Splitter; @@ -46,13 +23,37 @@ import com.ibm.icu.util.ICUUncheckedIOException; import com.ibm.icu.util.StringTrieBuilder.Option; import com.ibm.icu.util.ULocale; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; +import 
java.util.function.Consumer; +import org.unicode.cldr.draft.FileUtilities; +import org.unicode.cldr.util.Counter; +import org.unicode.cldr.util.With; +import org.unicode.props.IndexUnicodeProperties; +import org.unicode.props.UcdProperty; +import org.unicode.props.UcdPropertyValues.General_Category_Values; +import org.unicode.text.utility.Utility; public class Unilex { static final Splitter TAB = Splitter.on('\t'); - static final String DATADIR = "/Users/markdavis/Google Drive/workspace/DATA/unilex/"; - static final String GENDIR = "/Users/markdavis/Google Drive/workspace/Generated/unilex/"; - static final UnicodeSet IPA_VOWELS = new UnicodeSet("[a æ ɐ-ɒ e ə ɛ ɘ ɜ ɞ ɤ i ɪ ɨ oø œ ɶ ɔ ɵ u ʉ ɯ ʊ ʌ y ʏ]").freeze(); - static final UnicodeSet VOWELS = new UnicodeSet("[AEIOUYaeiouyÀ-ÆÈ-ÏÒ-Ö Ø-Ýà-æè-ïò-öø-ýÿ-ąĒ-ě Ĩ-İŌ-œŨ-ųŶ-ŸƆƏƐƗƜƟ-ơƯ-Ʊ Ǎ-ǜǞ-ǣǪ-ǭǺ-ȏȔ-ȗȦ-ȳɄɅɐ-ɒ ɔɘəɛɜɞɤɨɪɯɵɶʉʊʌʏḀḁḔ-ḝ Ḭ-ḯṌ-ṓṲ-ṻẎẏẙẠ-ỹÅⱭⱯⱰꞫꞮ]"); + static final String DATADIR = "/Users/markdavis/Google Drive/workspace/DATA/unilex/"; + static final String GENDIR = "/Users/markdavis/Google Drive/workspace/Generated/unilex/"; + static final UnicodeSet IPA_VOWELS = + new UnicodeSet("[a æ ɐ-ɒ e ə ɛ ɘ ɜ ɞ ɤ i ɪ ɨ oø œ ɶ ɔ ɵ u ʉ ɯ ʊ ʌ y ʏ]").freeze(); + static final UnicodeSet VOWELS = + new UnicodeSet( + "[AEIOUYaeiouyÀ-ÆÈ-ÏÒ-Ö Ø-Ýà-æè-ïò-öø-ýÿ-ąĒ-ě Ĩ-İŌ-œŨ-ųŶ-ŸƆƏƐƗƜƟ-ơƯ-Ʊ Ǎ-ǜǞ-ǣǪ-ǭǺ-ȏȔ-ȗȦ-ȳɄɅɐ-ɒ ɔɘəɛɜɞɤɨɪɯɵɶʉʊʌʏḀḁḔ-ḝ Ḭ-ḯṌ-ṓṲ-ṻẎẏẙẠ-ỹÅⱭⱯⱰꞫꞮ]"); private static final Long ZERO = new Long(0); private static final Long ONE = new Long(1); private static Normalizer2 NFC = Normalizer2.getNFCInstance(); @@ -64,42 +65,51 @@ public static String cleanTerm(String source) { // TODO clean IPA də.ˈˈpɥi => dəˈpɥi, etc. public static class Frequency { - private final Map data; + private final Map data; private final Multimap value2keys; public Long get(String key) { Long result = data.get(key); return result == null ? 
ZERO : result; } + public int size() { return data.size(); } + public Set keySet() { return data.keySet(); } + public Set valueSet() { return value2keys.keySet(); } + public Set getKeys(Long value) { return (Set) value2keys.get(value); } - private Frequency(Map data) { + private Frequency(Map data) { this.data = data; - TreeMultimap inverted = Multimaps.invertFrom(Multimaps.forMap(data), TreeMultimap.create()); + TreeMultimap inverted = + Multimaps.invertFrom( + Multimaps.forMap(data), TreeMultimap.create()); value2keys = ImmutableSetMultimap.copyOf(inverted); } - public static Frequency create(String locale) { - locale = locale.replace("-fonxsamp",""); - Map temp = new TreeMap<>(); - processFields(DATADIR + "frequency", locale+".txt", parts -> { - if (parts.size() != 2) { - throw new IllegalArgumentException("Wrong number of items: " + parts); - } - temp.put(cleanTerm(parts.get(0)), Long.parseLong(parts.get(1))); - } - ); + public static Frequency create(String locale) { + locale = locale.replace("-fonxsamp", ""); + + Map temp = new TreeMap<>(); + processFields( + DATADIR + "frequency", + locale + ".txt", + parts -> { + if (parts.size() != 2) { + throw new IllegalArgumentException("Wrong number of items: " + parts); + } + temp.put(cleanTerm(parts.get(0)), Long.parseLong(parts.get(1))); + }); return new Frequency(ImmutableMap.copyOf(temp)); } } @@ -107,66 +117,77 @@ public static Frequency create(String locale) { static final Transliterator XSampa_IPA = Transliterator.getInstance("XSampa-IPA"); public static class Pronunciation { - private final Map data; - private final Map rawToIpa; + private final Map data; + private final Map rawToIpa; private final Multimap value2keys; public int size() { return data.size(); } + public String get(Object key) { return data.get(key); } + public Set keySet() { return data.keySet(); } + public Set valueSet() { return value2keys.keySet(); } + public Set getKeys(String value) { return (Set) value2keys.get(value); } - private 
Pronunciation(Map data, Map rawToIpa) { + private Pronunciation(Map data, Map rawToIpa) { this.data = data; this.rawToIpa = rawToIpa; - - TreeMultimap inverted = Multimaps.invertFrom(Multimaps.forMap(data), TreeMultimap.create()); + + TreeMultimap inverted = + Multimaps.invertFrom( + Multimaps.forMap(data), TreeMultimap.create()); value2keys = ImmutableSetMultimap.copyOf(inverted); } + public static Pronunciation create(String locale) { - Map temp = new TreeMap<>(); - Map _rawToIpa = new TreeMap<>(); + Map temp = new TreeMap<>(); + Map _rawToIpa = new TreeMap<>(); boolean isXsampa = locale.contains("fonxsamp"); - processFields(DATADIR + "pronunciation", locale+".txt", parts -> { - String source = cleanTerm(parts.get(0)); - switch(parts.size()) { - case 2: - String target = parts.get(1); - _rawToIpa.put(source, target); - if (isXsampa) { - target = XSampa_IPA.transform(target - .replace('\'', 'ˈ') - .replace('-', '.') - .replace(',', 'ˌ') - ); - } - target = target.replace(".ˈ", "ˈ").replace(".ˌ", "ˌ"); - temp.put(source, target); - break; - case 0: - case 1: - break; - default: - throw new IllegalArgumentException("Wrong number of items: " + parts); - } - } - ); + processFields( + DATADIR + "pronunciation", + locale + ".txt", + parts -> { + String source = cleanTerm(parts.get(0)); + switch (parts.size()) { + case 2: + String target = parts.get(1); + _rawToIpa.put(source, target); + if (isXsampa) { + target = + XSampa_IPA.transform( + target.replace('\'', 'ˈ') + .replace('-', '.') + .replace(',', 'ˌ')); + } + target = target.replace(".ˈ", "ˈ").replace(".ˌ", "ˌ"); + temp.put(source, target); + break; + case 0: + case 1: + break; + default: + throw new IllegalArgumentException( + "Wrong number of items: " + parts); + } + }); return new Pronunciation(ImmutableMap.copyOf(temp), ImmutableMap.copyOf(_rawToIpa)); } } - private static void processFields(String directory, String file, Consumer> processor) { + private static void processFields( + String directory, String 
file, Consumer> processor) { boolean firstNonEmpty = true; for (String line : FileUtilities.in(directory, file)) { if (line.isEmpty() || line.startsWith("#")) { @@ -184,7 +205,7 @@ private static void processFields(String directory, String file, Consumer
  • combined = new LinkedHashSet<>(xsamp.data.keySet()); combined.addAll(plain.keySet()); Set extra = new LinkedHashSet<>(); @@ -194,17 +215,20 @@ public static void main(String[] args) { if (Objects.equal(xsampValue, plainValue)) { continue; } - String plainValue2 = plainValue == null ? null : plainValue - .replace("ɐ̯", "ɐ") - .replace("ʊ̯", "ʊ") - .replace("ʏ̯", "ʏ") - .replace("ɪ̯", "ɪ") - //.replace("ʊ̯̯", "ʊ") - .replace("t͡s", "ts") - .replace("t͡ʃ", "tʃ") - .replace("p͡f", "pf") - // - ; + String plainValue2 = + plainValue == null + ? null + : plainValue + .replace("ɐ̯", "ɐ") + .replace("ʊ̯", "ʊ") + .replace("ʏ̯", "ʏ") + .replace("ɪ̯", "ɪ") + // .replace("ʊ̯̯", "ʊ") + .replace("t͡s", "ts") + .replace("t͡ʃ", "tʃ") + .replace("p͡f", "pf") + // + ; if (plainValue != null && !plainValue2.contains("ˈ")) { plainValue2 = "ˈ" + plainValue2; } @@ -258,12 +282,12 @@ private static Pronunciation getData(String locale) { ++i; } } - + out.println("\n#1a. Odd syllable breaks"); pronGraphemes.print(out, 2, "Pronunciation Graphemes"); - //termGraphemes.print(out, 3, "Pronunciation Term Graphemes"); + // termGraphemes.print(out, 3, "Pronunciation Term Graphemes"); // show the pronunciations in descending frequency order i = 0; @@ -290,10 +314,10 @@ private static Pronunciation getData(String locale) { lines.add(0 + "\t" + fkey + "\t" + p); i++; } - + CharsTrie onsetCharTrie = freqGraphemes.termOnsets.computeStringTrie(); CharsTrie codaCharTrie = freqGraphemes.termCodas.computeStringTrie(); - + check("ʃta", onsetCharTrie, codaCharTrie); freqGraphemes.print(out, 3, "Term Graphemes"); @@ -311,7 +335,7 @@ private static Pronunciation getData(String locale) { private static Set reduceCase(Set set) { Set result = new LinkedHashSet<>(set); - for (Iterator it = result.iterator(); it.hasNext();) { + for (Iterator it = result.iterator(); it.hasNext(); ) { String item = it.next(); String lc = UCharacter.toLowerCase(item); // TODO locale sensitive if (!lc.equals(item) && 
set.contains(lc)) { @@ -322,9 +346,10 @@ private static Set reduceCase(Set set) { } static final IndexUnicodeProperties iup = IndexUnicodeProperties.make(); - static final UnicodeMap cat = iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); + static final UnicodeMap cat = + iup.loadEnum(UcdProperty.General_Category, General_Category_Values.class); - static public final class GraphemeList implements Iterable { + public static final class GraphemeList implements Iterable { private List graphemes = new ArrayList<>(); private transient StringBuilder item = new StringBuilder(); @@ -332,37 +357,38 @@ public void fill(String source) { graphemes.clear(); for (int cp : With.codePointArray(source)) { switch (cat.get(cp)) { - default: - // case Uppercase_Letter: - // case Lowercase_Letter: - // case Other_Letter: - // case Titlecase_Letter: - if (item.length() != 0 && item.charAt(item.length()-1) == '͡') { - // special - } else { - flush(source); - } - item.appendCodePoint(cp); - break; - case Enclosing_Mark: - case Nonspacing_Mark: - case Spacing_Mark: - item.appendCodePoint(cp); - break; - case Modifier_Letter: - case Modifier_Symbol: - if (cp == 'ˈ' || cp == 'ˌ' || cp == '.') { // singleton - flush(source); + default: + // case Uppercase_Letter: + // case Lowercase_Letter: + // case Other_Letter: + // case Titlecase_Letter: + if (item.length() != 0 && item.charAt(item.length() - 1) == '͡') { + // special + } else { + flush(source); + } item.appendCodePoint(cp); - flush(source); - } else { + break; + case Enclosing_Mark: + case Nonspacing_Mark: + case Spacing_Mark: item.appendCodePoint(cp); - } - break; + break; + case Modifier_Letter: + case Modifier_Symbol: + if (cp == 'ˈ' || cp == 'ˌ' || cp == '.') { // singleton + flush(source); + item.appendCodePoint(cp); + flush(source); + } else { + item.appendCodePoint(cp); + } + break; } } flush(source); } + private void flush(String source) { if (item.length() != 0) { String string = item.toString(); @@ 
-370,6 +396,7 @@ private void flush(String source) { item.setLength(0); } } + @Override public Iterator iterator() { return graphemes.iterator(); @@ -378,16 +405,16 @@ public Iterator iterator() { static class FrequencyAndSamples { private Counter targetMap = new Counter<>(); - private Map sampleMap = new HashMap<>(); + private Map sampleMap = new HashMap<>(); private void add(String string, Long frequency, String sampleString) { Long oldFreq = targetMap.get(string); - if (oldFreq == null || oldFreq <= frequency) { + if (oldFreq == null || oldFreq <= frequency) { targetMap.add(string, frequency); sampleMap.put(string, sampleString); } } - + public CharsTrie computeStringTrie() { CharsTrieBuilder b = new CharsTrieBuilder(); int item = 0; @@ -397,17 +424,18 @@ public CharsTrie computeStringTrie() { return b.build(Option.FAST); } } - - static final Splitter SYLLABLE_SPLITTER = Splitter.on(CharMatcher.anyOf(".")).omitEmptyStrings().trimResults(); - + + static final Splitter SYLLABLE_SPLITTER = + Splitter.on(CharMatcher.anyOf(".")).omitEmptyStrings().trimResults(); + static boolean check(String source, CharsTrie onset, CharsTrie coda) { -/* -Result result=current(); -for(each c in s) -if(!result.hasNext()) return Result.NO_MATCH; -result=next(c); -return result; -*/ + /* + Result result=current(); + for(each c in s) + if(!result.hasNext()) return Result.NO_MATCH; + result=next(c); + return result; + */ for (String part : SYLLABLE_SPLITTER.split(source)) { int sLimit = part.length(); Result result = onset.current(); @@ -417,20 +445,19 @@ static boolean check(String source, CharsTrie onset, CharsTrie coda) { break; } char ch = part.charAt(start); - result=onset.next(ch); + result = onset.next(ch); } System.out.println("onset: " + result + ", " + start); switch (result) { - case FINAL_VALUE: - case INTERMEDIATE_VALUE: - System.out.println("value: " + onset.getValue()); + case FINAL_VALUE: + case INTERMEDIATE_VALUE: + System.out.println("value: " + onset.getValue()); } } 
return true; } - - static public final class GraphemeCount { + public static final class GraphemeCount { FrequencyAndSamples source = new FrequencyAndSamples(); private Counter target = new Counter<>(); @@ -449,7 +476,7 @@ static public final class GraphemeCount { public GraphemeCount(Comparator collator) { this.collator = (Comparator) collator; } - + void add(Long frequency, String source, boolean toLower, String... samples) { // if (frequency.equals(ZERO)) { // frequency = ONE; @@ -474,7 +501,8 @@ void add(Long frequency, String source, boolean toLower, String... samples) { termOnset.append(grapheme); } } - // capture the rime (vowel & everything after), but flush whenever we transition from consonant to vowel + // capture the rime (vowel & everything after), but flush whenever we transition + // from consonant to vowel if (isVowel) { termCoda.setLength(0); if (!wasVowel) { @@ -492,9 +520,9 @@ void add(Long frequency, String source, boolean toLower, String... samples) { termRimes.add(clean(termRime), frequency, sampleString); } } - + private String clean(CharSequence source) { - return source.toString().replace("ˈ","").replace(".",""); + return source.toString().replace("ˈ", "").replace(".", ""); } private void flush(String grapheme, Long frequency, String sampleString) { @@ -518,28 +546,41 @@ void print(PrintWriter out, int num, String title) { out.println("\n#" + num + "a." + title); out.println("#Frequency\tGrapheme\tHex\tSample"); for (R2 item : entries()) { - out.println(item.get0() - + "\t" + item.get1() - + "\t" + Utility.hex(item.get1(), " ") - + "\t" + getSample(item.get1())); + out.println( + item.get0() + + "\t" + + item.get1() + + "\t" + + Utility.hex(item.get1(), " ") + + "\t" + + getSample(item.get1())); } out.println("\n#" + num + "b." 
+ title); out.println("#Frequency\tOnset\tHex"); - for (R2 item : termOnsets.targetMap.getEntrySetSortedByCount(false, collator)) { - out.println(item.get0() - + "\t" + item.get1() - + "\t" + Utility.hex(item.get1(), " ") - + "\t" + termOnsets.sampleMap.get(item.get1())); + for (R2 item : + termOnsets.targetMap.getEntrySetSortedByCount(false, collator)) { + out.println( + item.get0() + + "\t" + + item.get1() + + "\t" + + Utility.hex(item.get1(), " ") + + "\t" + + termOnsets.sampleMap.get(item.get1())); } out.println("\n#" + num + "c." + title); out.println("#Frequency\tCoda\tHex"); - for (R2 item : termCodas.targetMap.getEntrySetSortedByCount(false, collator)) { - out.println(item.get0() - + "\t" + item.get1() - + "\t" + Utility.hex(item.get1(), " ") - + "\t" + termCodas.sampleMap.get(item.get1())); + for (R2 item : + termCodas.targetMap.getEntrySetSortedByCount(false, collator)) { + out.println( + item.get0() + + "\t" + + item.get1() + + "\t" + + Utility.hex(item.get1(), " ") + + "\t" + + termCodas.sampleMap.get(item.get1())); } } - } } diff --git a/unicodetools/src/main/java/org/unicode/tools/UpdateJspFiles.java b/unicodetools/src/main/java/org/unicode/tools/UpdateJspFiles.java index 9a3c4a87a..7be631ca9 100644 --- a/unicodetools/src/main/java/org/unicode/tools/UpdateJspFiles.java +++ b/unicodetools/src/main/java/org/unicode/tools/UpdateJspFiles.java @@ -1,5 +1,6 @@ package org.unicode.tools; +import com.ibm.icu.util.VersionInfo; import java.io.IOException; import java.io.PrintWriter; import java.nio.file.CopyOption; @@ -7,24 +8,24 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.props.IndexUnicodeProperties; import org.unicode.text.tools.GenerateSubtagNames; import org.unicode.text.utility.Settings; -import com.ibm.icu.util.VersionInfo; - public class UpdateJspFiles { - static CopyOption[] options = new CopyOption[] 
{StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.COPY_ATTRIBUTES}; - static final Path JSP_RESOURCE_DATA = Paths.get(Settings.UnicodeTools.UNICODEJSPS_DIR, - "src/main/resources/org/unicode/jsp/"); + static CopyOption[] options = + new CopyOption[] { + StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.COPY_ATTRIBUTES + }; + static final Path JSP_RESOURCE_DATA = + Paths.get(Settings.UnicodeTools.UNICODEJSPS_DIR, "src/main/resources/org/unicode/jsp/"); static final Path UNICODE_TOOLS_DIR = Paths.get(Settings.UnicodeTools.UNICODETOOLS_DIR); static final Path TRIM_PARENT = Paths.get(Settings.UnicodeTools.UNICODETOOLS_REPO_DIR); static final String trim(Path p) { - if(p.startsWith(TRIM_PARENT)) { - return "{...}/"+p.subpath(TRIM_PARENT.getNameCount(), p.getNameCount()).toString(); + if (p.startsWith(TRIM_PARENT)) { + return "{...}/" + p.subpath(TRIM_PARENT.getNameCount(), p.getNameCount()).toString(); } else { return p.toString(); } @@ -35,7 +36,11 @@ public static void main(String args[]) throws IOException { IndexUnicodeProperties latest = IndexUnicodeProperties.make(); VersionInfo ucdVersion = latest.getUcdVersion(); - System.out.println("Updating all JSP files for " + ucdVersion + " into " + TRIM_PARENT.toAbsolutePath()); + System.out.println( + "Updating all JSP files for " + + ucdVersion + + " into " + + TRIM_PARENT.toAbsolutePath()); copyTextFiles(ucdVersion); @@ -54,58 +59,67 @@ public static void main(String args[]) throws IOException { private static void copyTextFiles(VersionInfo fromVersion) throws IOException { System.out.println("1. 
Copying text files from " + fromVersion); - copyTextFiles(fromVersion, Settings.UnicodeTools.DataDir.SECURITY, + copyTextFiles( + fromVersion, + Settings.UnicodeTools.DataDir.SECURITY, "confusables.txt", "IdentifierStatus.txt", "IdentifierType.txt"); - copyTextFiles(fromVersion, Settings.UnicodeTools.DataDir.UCD, + copyTextFiles( + fromVersion, + Settings.UnicodeTools.DataDir.UCD, "NameAliases.txt", "NamesList.txt", "ScriptExtensions.txt", "StandardizedVariants.txt"); - copyTextFiles(fromVersion, Settings.UnicodeTools.DataDir.IDNA, - "IdnaMappingTable.txt"); - copyTextFiles(fromVersion, Settings.UnicodeTools.DataDir.EMOJI, + copyTextFiles(fromVersion, Settings.UnicodeTools.DataDir.IDNA, "IdnaMappingTable.txt"); + copyTextFiles( + fromVersion, + Settings.UnicodeTools.DataDir.EMOJI, "emoji-sequences.txt", "emoji-zwj-sequences.txt"); System.err.println("TODO: "); - } - private static void copyTextFiles(VersionInfo fromVersion, - Settings.UnicodeTools.DataDir dir, String... filenames) throws IOException { + private static void copyTextFiles( + VersionInfo fromVersion, Settings.UnicodeTools.DataDir dir, String... filenames) + throws IOException { final Path targDir = JSP_RESOURCE_DATA; copyTextFiles(targDir, fromVersion, dir, filenames); } - private static void copyTextFiles(final Path targDir, VersionInfo fromVersion, - Settings.UnicodeTools.DataDir dir, String... filenames) throws IOException { + private static void copyTextFiles( + final Path targDir, + VersionInfo fromVersion, + Settings.UnicodeTools.DataDir dir, + String... 
filenames) + throws IOException { final Path srcDir = dir.asPath(fromVersion); - System.out.println(" Copy from " + dir.name() + - " copying from " + trim(srcDir)); + System.out.println(" Copy from " + dir.name() + " copying from " + trim(srcDir)); for (final String file : filenames) { final Path srcFile = srcDir.resolve(file); if (!srcFile.toFile().canRead()) { - throw new IllegalArgumentException(dir.name()+"/"+file+" not readable: " + - srcFile.toAbsolutePath()); + throw new IllegalArgumentException( + dir.name() + "/" + file + " not readable: " + srcFile.toAbsolutePath()); } copyFile(srcFile, targDir.resolve(file)); } } - private static void copyTextFiles(Path srcDir, Path targDir, String... filenames) throws IOException { + + private static void copyTextFiles(Path srcDir, Path targDir, String... filenames) + throws IOException { System.out.println(" Copying from " + trim(srcDir) + " to " + trim(targDir)); for (final String file : filenames) { final Path srcFile = srcDir.resolve(file); if (!srcFile.toFile().canRead()) { - throw new IllegalArgumentException("Not readable: " + - srcFile.toAbsolutePath()); + throw new IllegalArgumentException("Not readable: " + srcFile.toAbsolutePath()); } copyFile(srcFile, targDir.resolve(file)); } } private static void copyFile(final Path srcFile, final Path targFile) throws IOException { - if(srcFile.getFileName().equals(targFile.getFileName())) { + if (srcFile.getFileName().equals(targFile.getFileName())) { System.out.println(trim(targFile) + " <-- " + trim(srcFile.getParent())); } else { System.out.println(trim(targFile) + " <-- " + trim(srcFile)); @@ -115,26 +129,29 @@ private static void copyFile(final Path srcFile, final Path targFile) throws IOE private static void generateSubtagNames() throws IOException { System.out.println("Generating " + GenerateSubtagNames.SUBTAG_NAMES_TXT); - try ( - PrintWriter pw = FileUtilities.openUTF8Writer(JSP_RESOURCE_DATA.toFile(), - GenerateSubtagNames.SUBTAG_NAMES_TXT); - ) { + try 
(PrintWriter pw = + FileUtilities.openUTF8Writer( + JSP_RESOURCE_DATA.toFile(), GenerateSubtagNames.SUBTAG_NAMES_TXT); ) { int count = GenerateSubtagNames.generate(pw); - System.out.println("Wrote " + count + " entries to " + GenerateSubtagNames.SUBTAG_NAMES_TXT); + System.out.println( + "Wrote " + count + " entries to " + GenerateSubtagNames.SUBTAG_NAMES_TXT); } - } private static void copyOtherProps(VersionInfo fromVersion) throws IOException { - copyTextFiles(UNICODE_TOOLS_DIR.resolve("org/unicode/props"), // TODO: will break with mavenize - JSP_RESOURCE_DATA.resolve("data"), + copyTextFiles( + UNICODE_TOOLS_DIR.resolve("org/unicode/props"), // TODO: will break with mavenize + JSP_RESOURCE_DATA.resolve("data"), "ExtraPropertyAliases.txt", "ExtraPropertyValueAliases.txt"); - // Nota Bene! These aren't in the earlier list, becaause they are in the /data and not /ucd dir - copyTextFiles(JSP_RESOURCE_DATA.resolve("data"), fromVersion, Settings.UnicodeTools.DataDir.UCD, + // Nota Bene! These aren't in the earlier list, becaause they are in the /data and not /ucd + // dir + copyTextFiles( + JSP_RESOURCE_DATA.resolve("data"), + fromVersion, + Settings.UnicodeTools.DataDir.UCD, "PropertyAliases.txt", "PropertyValueAliases.txt"); - } } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/BirthInfo.java b/unicodetools/src/main/java/org/unicode/tools/emoji/BirthInfo.java index 347b614fc..8d6fa08ef 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/BirthInfo.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/BirthInfo.java @@ -1,28 +1,27 @@ package org.unicode.tools.emoji; +import com.google.common.collect.ImmutableMap; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.VersionInfo; import java.util.Collections; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.tools.emoji.Emoji.CharSource; 
-import com.google.common.collect.ImmutableMap; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.VersionInfo; - public class BirthInfo implements Comparable { private static final VersionInfo ZERO_VERSION = VersionInfo.getInstance(0); - private static final BirthInfo MISSING = new BirthInfo(-1,ZERO_VERSION); + private static final BirthInfo MISSING = new BirthInfo(-1, ZERO_VERSION); public BirthInfo(int year, VersionInfo versionInfo) { super(); this.year = year; - this.emojiVersionInfo = versionInfo; + this.emojiVersionInfo = versionInfo; } + public final int year; public final VersionInfo emojiVersionInfo; @@ -30,10 +29,12 @@ public BirthInfo(int year, VersionInfo versionInfo) { public int compareTo(BirthInfo o) { return emojiVersionInfo.compareTo(o.emojiVersionInfo); } + @Override public boolean equals(Object obj) { return 0 == compareTo((BirthInfo) obj); } + @Override public int hashCode() { return emojiVersionInfo.hashCode(); @@ -41,7 +42,7 @@ public int hashCode() { @Override public String toString() { - return "v"+emojiVersionInfo.getVersionString(2, 2) + " (" + year + ")"; + return "v" + emojiVersionInfo.getVersionString(2, 2) + " (" + year + ")"; } static void checkYears() { @@ -51,10 +52,11 @@ static void checkYears() { System.out.println(value + "\t" + set.size() + "\t" + set.toPattern(false)); } } + static final UnicodeMap birthYear = new UnicodeMap(); static Map yearToEmoji; static Map emojiVersionToEmoji; - //static final UnicodeMap birthYearWithVarians = new UnicodeMap(); + // static final UnicodeMap birthYearWithVarians = new UnicodeMap(); public static BirthInfo getBirthInfo(String s) { UnicodeMap years = getBirthInfoMap(); @@ -80,8 +82,9 @@ private static synchronized void buildYears() { UnicodeMap _years = new UnicodeMap<>(); UnicodeMap _emojiVersionToEmoji = new UnicodeMap<>(); -// Collection output = new TreeSet<>(Collections.reverseOrder()); // latest first -// VersionInfo firstVersion = 
null; + // Collection output = new TreeSet<>(Collections.reverseOrder()); + // // latest first + // VersionInfo firstVersion = null; EmojiData beta = EmojiData.EMOJI_DATA_BETA; for (String s : beta.getAllEmojiWithDefectives()) { @@ -91,7 +94,8 @@ private static synchronized void buildYears() { String withoutVariants = EmojiData.removeEmojiVariants(s); String withVariants = beta.addEmojiVariants(s); // if single code point, remove var - if (Character.charCount(withoutVariants.codePointAt(0)) == withoutVariants.length()) { + if (Character.charCount(withoutVariants.codePointAt(0)) + == withoutVariants.length()) { s = withoutVariants; } if (birthYear.containsKey(s)) { @@ -110,17 +114,19 @@ private static synchronized void buildYears() { if (sources.contains(CharSource.JCarrier)) { year = 2010; versionInfo = Emoji.VERSION0_6; - } else if (sources.contains(CharSource.ARIB) || sources.contains(CharSource.WDings)) { + } else if (sources.contains(CharSource.ARIB) + || sources.contains(CharSource.WDings)) { year = 2014; versionInfo = Emoji.VERSION0_7; } else { - for (Entry entry : Emoji.EMOJI_VERSION_TO_YEAR.entrySet()) { + for (Entry entry : + Emoji.EMOJI_VERSION_TO_YEAR.entrySet()) { versionInfo = entry.getKey(); EmojiData data = EmojiData.of(versionInfo); if (data.getAllEmojiWithDefectives().contains(s)) { year = entry.getValue(); // if (firstVersion == null) { - // firstVersion = versionInfo; + // firstVersion = versionInfo; // } // if (versionInfo == firstVersion) { // year = 2015; @@ -128,19 +134,25 @@ private static synchronized void buildYears() { // // // handle specially // // get the ages of all the components - // Collection items = Emoji.getValues(s, Emoji.VERSION_ENUM, output); - // Age_Values ageValue = output.iterator().next(); // output is latest first - // // TODO: have E0.1, E0.2 ... 
for years between 2010 and 2014 - // long date = VersionToAge.ucd.getLongDate(ageValue); + // Collection items = + // Emoji.getValues(s, Emoji.VERSION_ENUM, output); + // Age_Values ageValue = + // output.iterator().next(); // output is latest first + // // TODO: have E0.1, E0.2 ... for years + // between 2010 and 2014 + // long date = + // VersionToAge.ucd.getLongDate(ageValue); // year = new Date(date).getYear()+1900; - // if (year < 2010) { // && !Emoji.isSingleCodePoint(s) - // // keycaps, etc. came in with Japanese + // if (year < 2010) { // && + // !Emoji.isSingleCodePoint(s) + // // keycaps, etc. came in with + // Japanese // year = 2010; // versionInfo = Emoji.VERSION1; // } else { // int debug = 0; // } - // } + // } break; } } @@ -166,9 +178,10 @@ private static synchronized void buildYears() { // } } birthYear.freeze(); - //birthYearWithVariants.freeze(); + // birthYearWithVariants.freeze(); TreeMap _years2 = new TreeMap<>(Collections.reverseOrder()); - TreeMap _emojiVersionToEmoji2 = new TreeMap<>(Collections.reverseOrder()); + TreeMap _emojiVersionToEmoji2 = + new TreeMap<>(Collections.reverseOrder()); _years.addInverseTo(_years2); _emojiVersionToEmoji.addInverseTo(_emojiVersionToEmoji2); // protect @@ -184,34 +197,40 @@ private static synchronized void buildYears() { } /** * Return the year values, from largest to smallest + * * @return */ public static Set years() { return yearToEmoji.keySet(); } + public static Set versions() { return emojiVersionToEmoji.keySet(); } + public static UnicodeSet getSetForYears(int year2) { return yearToEmoji.get(year2); } + public static UnicodeSet getSetForVersion(VersionInfo version) { return emojiVersionToEmoji.get(version); } + public static int getYear(String s) { - BirthInfo data = getBirthInfo(s); + BirthInfo data = getBirthInfo(s); return data == null ? 
-1 : data.year; } + public static VersionInfo getVersionInfo(String s) { - BirthInfo data = getBirthInfo(s); + BirthInfo data = getBirthInfo(s); return data == null ? ZERO_VERSION : data.emojiVersionInfo; } + public static int getYear(VersionInfo versionInfo) { Integer year = Emoji.EMOJI_VERSION_TO_YEAR.get(versionInfo); return year == null ? -1 : year; } - public static void main(String[] args) { getBirthInfoMap(); for (CharSource charSource : Emoji.CharSource.values()) { @@ -223,7 +242,8 @@ public static void main(String[] args) { } for (VersionInfo version : BirthInfo.versions()) { final UnicodeSet setForVersion = BirthInfo.getSetForVersion(version); - System.out.println(version + "\t" + setForVersion.size() + "\t" + setForVersion.toPattern(false)); + System.out.println( + version + "\t" + setForVersion.size() + "\t" + setForVersion.toPattern(false)); } } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/CandidateAnnotations.java b/unicodetools/src/main/java/org/unicode/tools/emoji/CandidateAnnotations.java index 20fcbfe21..83fbc5d94 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/CandidateAnnotations.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/CandidateAnnotations.java @@ -1,24 +1,23 @@ package org.unicode.tools.emoji; +import com.ibm.icu.dev.util.CollectionUtilities; import java.util.Arrays; import java.util.Locale; import java.util.Set; import java.util.TreeSet; - import org.unicode.text.utility.Utility; import org.unicode.tools.emoji.CandidateData.Quarter; -import com.ibm.icu.dev.util.CollectionUtilities; - public class CandidateAnnotations { public static void main(String[] args) { final CandidateData cd = CandidateData.getInstance(); final TreeSet sorted = cd.keySet().addAllTo(new TreeSet(cd.comparator)); // Internal sheet - // U+1F600 =image(C1,4,36,36) http://unicode.org/reports/tr51/images/android/android_1f600.png + // U+1F600 =image(C1,4,36,36) + // 
http://unicode.org/reports/tr51/images/android/android_1f600.png // Native template // U+1F471 =vlookup(A1,Internal!A:B,2,0) person with blond hair sarı saçlı adam - + System.out.println("Internal"); int row = 0; for (String s : sorted) { @@ -26,10 +25,15 @@ public static void main(String[] args) { if (q.isFuture()) continue; ++row; final String hex = Utility.hex(s).toLowerCase(Locale.ENGLISH); - System.out.println("U+" + hex - + "\t=image(C" + row + ",4,36,36)" - + "\thttp://unicode.org/draft/reports/tr51/images/android/android_" + hex + ".png" - ); + System.out.println( + "U+" + + hex + + "\t=image(C" + + row + + ",4,36,36)" + + "\thttp://unicode.org/draft/reports/tr51/images/android/android_" + + hex + + ".png"); } System.out.println("\n\nTemplate"); @@ -43,12 +47,18 @@ public static void main(String[] args) { final String name = cd.getName(s).toLowerCase(Locale.ENGLISH); final Set annotations = new TreeSet<>(cd.getAnnotations(s)); annotations.addAll(Arrays.asList(name.split(" "))); - System.out.println("U+" + hex - + "\t=vlookup(A" + row + ",Internal!A:B,2,0)" - + "\t" + name - + "\t" + "" - + "\t" + CollectionUtilities.join(annotations, " | ") - ); + System.out.println( + "U+" + + hex + + "\t=vlookup(A" + + row + + ",Internal!A:B,2,0)" + + "\t" + + name + + "\t" + + "" + + "\t" + + CollectionUtilities.join(annotations, " | ")); } } } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/CandidateData.java b/unicodetools/src/main/java/org/unicode/tools/emoji/CandidateData.java index 69a1a710f..85cb8da64 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/CandidateData.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/CandidateData.java @@ -1,5 +1,23 @@ package org.unicode.tools.emoji; +import com.google.common.base.Joiner; +import com.google.common.base.Objects; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableList.Builder; +import 
com.google.common.collect.LinkedHashMultimap; +import com.google.common.collect.Multimap; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.DateFormat; +import com.ibm.icu.text.Transform; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.SpanCondition; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.VersionInfo; import java.io.File; import java.text.Collator; import java.util.ArrayList; @@ -16,9 +34,7 @@ import java.util.Map.Entry; import java.util.Set; import java.util.TreeSet; -import java.util.function.Function; import java.util.function.Predicate; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.CldrUtility; import org.unicode.props.IndexUnicodeProperties; @@ -33,29 +49,9 @@ import org.unicode.tools.emoji.CountEmoji.Category; import org.unicode.tools.emoji.EmojiOrder.MajorGroup; -import com.google.common.base.Joiner; -import com.google.common.base.Objects; -import com.google.common.base.Splitter; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableList.Builder; -import com.google.common.collect.ImmutableMultimap; -import com.google.common.collect.LinkedHashMultimap; -import com.google.common.collect.Multimap; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.DateFormat; -import com.ibm.icu.text.Transform; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.SpanCondition; -import com.ibm.icu.util.ULocale; -import com.ibm.icu.util.VersionInfo; - /** - * Provides data for candidates, reading the file candidateData.txt. 
- * Note: At the end of a release, before the Draft Candidates are retired, run CandidateData.java to get the proposals for those + * Provides data for candidates, reading the file candidateData.txt. Note: At the end of a release, + * before the Draft Candidates are retired, run CandidateData.java to get the proposals for those * candidates, and add to the end of proposalData.txt */ public class CandidateData implements Transform, EmojiDataSource { @@ -64,8 +60,9 @@ public class CandidateData implements Transform, EmojiDataSource private static boolean DEBUG = CldrUtility.getProperty("CandidateData:DEBUG", false); // TODO Replace after values by using emojiOrdering. - private static final UnicodeSet ZWJ_SET = new UnicodeSet(Emoji.JOINER,Emoji.JOINER); - private static final Splitter SPLITTER_COMMA = Splitter.on(',').trimResults().omitEmptyStrings(); + private static final UnicodeSet ZWJ_SET = new UnicodeSet(Emoji.JOINER, Emoji.JOINER); + private static final Splitter SPLITTER_COMMA = + Splitter.on(',').trimResults().omitEmptyStrings(); private static final Joiner JOIN_COMMA = Joiner.on(", "); static final Splitter barSplit = Splitter.on('|').trimResults().omitEmptyStrings(); static final Splitter equalSplit = Splitter.on('=').trimResults(); @@ -74,16 +71,27 @@ public class CandidateData implements Transform, EmojiDataSource public enum Quarter { _RELEASED, - _2015Q1, _2015Q2, _2015Q3, _2015Q4, - _2016Q1, _2016Q2, _2016Q3, _2016Q4, - _2017Q1, _2017Q2, _2017Q3, _2017Q4 - ; + _2015Q1, + _2015Q2, + _2015Q3, + _2015Q4, + _2016Q1, + _2016Q2, + _2016Q3, + _2016Q4, + _2017Q1, + _2017Q2, + _2017Q3, + _2017Q4; + public boolean isFuture() { return compareTo(_2016Q1) >= 0; } + static Quarter fromString(String item) { - return valueOf('_'+item); + return valueOf('_' + item); } + public String toString() { return name().substring(1); } @@ -92,14 +100,17 @@ public String toString() { public enum Status { Final_Candidate("Code points are final."), // final code points 
Draft_Candidate("Code points are draft."), // draft code points - Provisional_Candidate("Temporary IDs are assigned, not code points."); // no code points + Provisional_Candidate("Temporary IDs are assigned, not code points."); // no code points public final String comment; + private Status(String _comment) { - comment= _comment; + comment = _comment; } + public static Status fromString(String string) { return valueOf(string.replace(' ', '_')); } + public String toString() { return name().replace('_', ' '); } @@ -114,12 +125,13 @@ public String toString() { // boolean Emoji_Gender_Base; // } - public static SetMaker SORTED_TREESET_MAKER = new SetMaker() { - @Override - public Set make() { - return new TreeSet(Collator.getInstance(Locale.ENGLISH)); - } - }; + public static SetMaker SORTED_TREESET_MAKER = + new SetMaker() { + @Override + public Set make() { + return new TreeSet(Collator.getInstance(Locale.ENGLISH)); + } + }; // TODO change to have a CandidateDatum object with this information, instead of separate maps private final List order; @@ -149,7 +161,8 @@ public Set make() { private Map> existingToDraftCandidatesAfter = new HashMap<>(); - static final UnicodeSet SEQUENCE_MAKER = new UnicodeSet().add(Emoji.JOINER).add(EmojiData.MODIFIERS).freeze(); + static final UnicodeSet SEQUENCE_MAKER = + new UnicodeSet().add(Emoji.JOINER).add(EmojiData.MODIFIERS).freeze(); static final CandidateData SINGLE = new CandidateData("candidateData.txt"); @@ -164,7 +177,9 @@ private CandidateData(String sourceFile) { String proposalItem = null; Status status = null; - date = new File(FileUtilities.getRelativeFileName(CandidateData.class, sourceFile)).lastModified(); + date = + new File(FileUtilities.getRelativeFileName(CandidateData.class, sourceFile)) + .lastModified(); for (String line : FileUtilities.in(CandidateData.class, sourceFile)) { line = line.trim(); try { @@ -178,7 +193,8 @@ private CandidateData(String sourceFile) { if (TEST_STRING.equals(source)) { int debug = 0; } 
- addCombosWithGenderAndSkin(source); // fix old source. we do it here so we know the properties + addCombosWithGenderAndSkin( + source); // fix old source. we do it here so we know the properties source = Utility.fromHex(line); @@ -198,7 +214,9 @@ private CandidateData(String sourceFile) { addAfter(source, afterItem); check(allCharacters, after); - proposal.put(source.replace(Emoji.EMOJI_VARIANT_STRING,""), ProposalData.cleanProposalString(proposalItem)); + proposal.put( + source.replace(Emoji.EMOJI_VARIANT_STRING, ""), + ProposalData.cleanProposalString(proposalItem)); String afterString = "> " + afterItem; Age_Values age = Emoji.VERSION_ENUM.get(afterItem.codePointAt(0)); if (age.compareTo(Age_Values.V10_0) >= 0) { @@ -220,80 +238,84 @@ private CandidateData(String sourceFile) { } else { // must be category int equalPos = line.indexOf('='); - String leftSide = equalPos < 0 ? line : line.substring(0,equalPos).trim(); - String rightSide = equalPos < 0 ? null : line.substring(equalPos+1).trim(); - switch(leftSide) { - - // go before character - case "Status": - status = CandidateData.Status.fromString(rightSide); - break; - case "Quarter": - quarter = CandidateData.Quarter.fromString(rightSide); - break; - case "After": - afterItem = rightSide; - if (afterItem.equals("🧑‍🦰")) { - int debug = 0; - } - category = EmojiOrder.STD_ORDER.getCategory(afterItem); - break; - case "Proposal": - proposalItem = rightSide; - break; - - // go after character - case "Name": - final String name = rightSide; - if (name.contains("|")) { - throw new IllegalArgumentException("Name with | on " + line); - } - names.put(source, name.toLowerCase(Locale.ENGLISH)); - names.put(source.replaceAll(Emoji.EMOJI_VARIANT_STRING, ""), name.toLowerCase(Locale.ENGLISH)); - break; - case "UName": - String oldName = names.get(source); - if (!oldName.equalsIgnoreCase(rightSide)) { - final String uname = rightSide.toUpperCase(Locale.ROOT); - if (uname.contains("|")) { - throw new 
IllegalArgumentException("UName with | on " + line); + String leftSide = equalPos < 0 ? line : line.substring(0, equalPos).trim(); + String rightSide = equalPos < 0 ? null : line.substring(equalPos + 1).trim(); + switch (leftSide) { + + // go before character + case "Status": + status = CandidateData.Status.fromString(rightSide); + break; + case "Quarter": + quarter = CandidateData.Quarter.fromString(rightSide); + break; + case "After": + afterItem = rightSide; + if (afterItem.equals("🧑‍🦰")) { + int debug = 0; } - unames.put(source, uname); - } - break; - case "Keywords": - if (rightSide.contains("dengue")) { - int debug = 0; - } - if (rightSide.contains(",")) { - System.err.println("Keywords contain: " + rightSide); - } - List cleanKeywords = barSplit.splitToList(rightSide); - for (String item : cleanKeywords) { - if (item.isEmpty()) { - throw new IllegalArgumentException("Empty keyword on " + line); + category = EmojiOrder.STD_ORDER.getCategory(afterItem); + break; + case "Proposal": + proposalItem = rightSide; + break; + + // go after character + case "Name": + final String name = rightSide; + if (name.contains("|")) { + throw new IllegalArgumentException("Name with | on " + line); } - // if (!item.equals(item.toLowerCase(Locale.ENGLISH))) { - // System.err.println("Warning: Cased Keyword on " + line); - // } - } - annotations.addAll(source, cleanKeywords); - break; - case "Emoji_Modifier_Base": - addAttribute(source, emoji_Modifier_Base, "∈ modifier_base"); - break; - case "Emoji_Gender_Base": - addAttribute(source, emoji_Gender_Base, "∈ gender_base"); - break; - case "Emoji_Component": - addAttribute(source, emoji_Component, "∈ component"); - break; - case "Comment": - addComment(source, rightSide); - break; - - default: - throw new IllegalArgumentException(line); + names.put(source, name.toLowerCase(Locale.ENGLISH)); + names.put( + source.replaceAll(Emoji.EMOJI_VARIANT_STRING, ""), + name.toLowerCase(Locale.ENGLISH)); + break; + case "UName": + String oldName 
= names.get(source); + if (!oldName.equalsIgnoreCase(rightSide)) { + final String uname = rightSide.toUpperCase(Locale.ROOT); + if (uname.contains("|")) { + throw new IllegalArgumentException("UName with | on " + line); + } + unames.put(source, uname); + } + break; + case "Keywords": + if (rightSide.contains("dengue")) { + int debug = 0; + } + if (rightSide.contains(",")) { + System.err.println("Keywords contain: " + rightSide); + } + List cleanKeywords = barSplit.splitToList(rightSide); + for (String item : cleanKeywords) { + if (item.isEmpty()) { + throw new IllegalArgumentException("Empty keyword on " + line); + } + // if + // (!item.equals(item.toLowerCase(Locale.ENGLISH))) { + // System.err.println("Warning: Cased + // Keyword on " + line); + // } + } + annotations.addAll(source, cleanKeywords); + break; + case "Emoji_Modifier_Base": + addAttribute(source, emoji_Modifier_Base, "∈ modifier_base"); + break; + case "Emoji_Gender_Base": + addAttribute(source, emoji_Gender_Base, "∈ gender_base"); + break; + case "Emoji_Component": + addAttribute(source, emoji_Component, "∈ component"); + break; + case "Comment": + addComment(source, rightSide); + break; + + default: + throw new IllegalArgumentException(line); } } } catch (Exception e) { @@ -301,13 +323,15 @@ private CandidateData(String sourceFile) { } } - addCombosWithGenderAndSkin(source); // fix last source. We do it here so we know the properties + addCombosWithGenderAndSkin( + source); // fix last source. 
We do it here so we know the properties addHackName("handshake", "\uD83E\uDD1D", "\uDBC6\uDD03", "\u200D\uDBC6\uDD04"); // allCharacters.addAll(singleCharacters); // just to be sure - UnicodeSet duplicates = new UnicodeSet(EmojiData.EMOJI_DATA_RELEASED.getAllEmojiWithDefectives()) - .retainAll(allCharacters) - .addAll(Emoji.EXCLUSIONS); + UnicodeSet duplicates = + new UnicodeSet(EmojiData.EMOJI_DATA_RELEASED.getAllEmojiWithDefectives()) + .retainAll(allCharacters) + .addAll(Emoji.EXCLUSIONS); // allCharacters.removeAll(singleCharacters); allCharacters.removeAll(duplicates).freeze(); @@ -331,14 +355,15 @@ private CandidateData(String sourceFile) { singleCharacters.addAll(allCharacters).removeAllStrings().freeze(); - Multimap _existingToDraftCandidatesAfter = LinkedHashMultimap.create(); + Multimap _existingToDraftCandidatesAfter = LinkedHashMultimap.create(); for (Entry entry : after.entrySet()) { if (statuses.getValue(entry.getKey()) == Status.Draft_Candidate) { _existingToDraftCandidatesAfter.put(entry.getValue(), entry.getKey()); } } // ImmutableMultimaps have values as lists, so rework - for (Entry> entry : _existingToDraftCandidatesAfter.asMap().entrySet()) { + for (Entry> entry : + _existingToDraftCandidatesAfter.asMap().entrySet()) { existingToDraftCandidatesAfter.put(entry.getKey(), (Set) entry.getValue()); } existingToDraftCandidatesAfter = ImmutableMap.copyOf(existingToDraftCandidatesAfter); @@ -352,24 +377,32 @@ private CandidateData(String sourceFile) { if (!provisional.contains(s)) { allNonProvisional.add(s); } - if (s.contains(Emoji.JOINER_STR + Emoji.FEMALE + Emoji.EMOJI_VARIANT_STRING) || s.contains(Emoji.JOINER_STR + Emoji.MALE + Emoji.EMOJI_VARIANT_STRING)) { - takesSign.add(s.substring(0, s.length()-(Emoji.JOINER_STR + Emoji.FEMALE + Emoji.EMOJI_VARIANT_STRING).length())); + if (s.contains(Emoji.JOINER_STR + Emoji.FEMALE + Emoji.EMOJI_VARIANT_STRING) + || s.contains(Emoji.JOINER_STR + Emoji.MALE + Emoji.EMOJI_VARIANT_STRING)) { + takesSign.add( + 
s.substring( + 0, + s.length() + - (Emoji.JOINER_STR + + Emoji.FEMALE + + Emoji.EMOJI_VARIANT_STRING) + .length())); } - } UnicodeMap ages = Emoji.LATEST.loadEnum(UcdProperty.Age, Age_Values.class); - Age_Values minAge = Age_Values.forName(Emoji.VERSION_LAST_RELEASED_UNICODE.getVersionString(2, 2)); + Age_Values minAge = + Age_Values.forName(Emoji.VERSION_LAST_RELEASED_UNICODE.getVersionString(2, 2)); EmojiData releasedData = EmojiData.of(Emoji.VERSION_LAST_RELEASED); for (String s : allCharacters) { // if not single code point, we don't care int first = CharSequences.getSingleCodePoint(s); if (first == Integer.MAX_VALUE) { - continue; + continue; } // if a character is in the released emoji data, we use its value if (releasedData.getAllEmojiWithDefectives().contains(s)) { if (!releasedData.getEmojiPresentationSet().contains(s)) { - textPresentation.add(s); + textPresentation.add(s); } continue; } @@ -404,10 +437,10 @@ private CandidateData(String sourceFile) { } } - /** - * We need to add special names. Rather than enhance the data-driving algorithm for a one-off, + /** + * We need to add special names. Rather than enhance the data-driving algorithm for a one-off, * it is easier to do in code. - * */ + */ private void addHackName(String name, String singleton, String prefix, String suffix) { StringBuilder b = new StringBuilder(); for (String tone1 : EmojiData.MODIFIERS) { @@ -475,7 +508,7 @@ private static boolean checkData(CandidateData instance) { // check that old emoji have emoji VS // TODO - if (Emoji.GENDER_MARKERS.containsSome(item) + if (Emoji.GENDER_MARKERS.containsSome(item) || EmojiData.MODIFIERS.containsSome(item) || Emoji.MAN_OR_WOMAN_OR_ADULT.containsSome(item)) { continue; @@ -484,11 +517,11 @@ private static boolean checkData(CandidateData instance) { String name = instance.getName(item); Set keywords = instance.getAnnotations(item); if (keywords.size() > 6) { - System.err.println("Too many keywords? 
(" + keywords.size() - + "): " + name + ": " + keywords); + System.err.println( + "Too many keywords? (" + keywords.size() + "): " + name + ": " + keywords); } else if (keywords.size() < 1) { - System.err.println("Too few keywords? (" + keywords.size() - + "): " + name + ": " + keywords); + System.err.println( + "Too few keywords? (" + keywords.size() + "): " + name + ": " + keywords); } if (item.contains(Emoji.JOINER_STR)) { continue; @@ -501,9 +534,10 @@ private static boolean checkData(CandidateData instance) { if (cname == null) { cname = name.toUpperCase(Locale.ROOT); } - String uname = iup.getName(item," + "); + String uname = iup.getName(item, " + "); if (!uname.equals(cname)) { - System.err.println(Utility.hex(item) + " — Names differ UCD: " + uname + "\t≠\tCLDR:" + cname); + System.err.println( + Utility.hex(item) + " — Names differ UCD: " + uname + "\t≠\tCLDR:" + cname); result = false; } } @@ -526,11 +560,13 @@ private void addCombosWithGenderAndSkin(String source) { int debug = 0; } - boolean hasModifierBase = emoji_Modifier_Base.containsSome(source) - || EmojiData.EMOJI_DATA_BETA.getModifierBasesRgi().containsSome(source); + boolean hasModifierBase = + emoji_Modifier_Base.containsSome(source) + || EmojiData.EMOJI_DATA_BETA.getModifierBasesRgi().containsSome(source); UnicodeSet all_Emoji_Modifier_Base = null; String fromNames = names.get(source); - // disable this, so that all gender variants come from the file instead of being manufactured. + // disable this, so that all gender variants come from the file instead of being + // manufactured. 
// if (hasModifierBase) { // // find the point where it occurs; not efficient but we don't care // all_Emoji_Modifier_Base = new UnicodeSet(emoji_Modifier_Base) @@ -538,7 +574,8 @@ private void addCombosWithGenderAndSkin(String source) { // .remove("🤝") // special hack to remove skin color // .freeze(); // - // addCombos(source, fromNames, "", source, "", ": ", all_Emoji_Modifier_Base, "", ""); + // addCombos(source, fromNames, "", source, "", ": ", all_Emoji_Modifier_Base, + // "", ""); // } int single = UnicodeSet.getSingleCodePoint(source); @@ -548,7 +585,8 @@ private void addCombosWithGenderAndSkin(String source) { boolean isGenderBase = emoji_Gender_Base.contains(source); if (isGenderBase) { - boolean isMultiPerson = EmojiData.EMOJI_DATA_BETA.getMultiPersonGroupings().contains(source); + boolean isMultiPerson = + EmojiData.EMOJI_DATA_BETA.getMultiPersonGroupings().contains(source); for (String gen : Emoji.GENDER_MARKERS) { String genSuffix = Emoji.JOINER_STR + gen + Emoji.EMOJI_VARIANT_STRING; @@ -569,37 +607,64 @@ private void addCombosWithGenderAndSkin(String source) { String newSource = source + genSuffix; addCombo(source, sourceName, newSource, ""); if (hasModifierBase) { - addCombos(newSource, sourceName, "", newSource, "", ": ", all_Emoji_Modifier_Base, "", ""); + addCombos( + newSource, + sourceName, + "", + newSource, + "", + ": ", + all_Emoji_Modifier_Base, + "", + ""); // // for (String mod : EmojiData.MODIFIERS) { - // addCombo(source, sourceName, source + mod + genSuffix, ": " + EmojiData.EMOJI_DATA_BETA.getName(mod)); + // addCombo(source, sourceName, source + mod + genSuffix, + // ": " + EmojiData.EMOJI_DATA_BETA.getName(mod)); // } } } } // if (isGenderBase && hasModifierBase) { - // addComment(source, "Combinations of gender and skin-tone produce 17 more emoji sequences."); + // addComment(source, "Combinations of gender and skin-tone produce 17 more emoji + // sequences."); // } else if (isGenderBase) { - // addComment(source, "Combinations of 
gender and skin-tone produce 2 more emoji sequences."); + // addComment(source, "Combinations of gender and skin-tone produce 2 more emoji + // sequences."); // } else if (hasModifierBase) { - // addComment(source, "Combinations of gender and skin-tone produce 5 more emoji sequences."); + // addComment(source, "Combinations of gender and skin-tone produce 5 more emoji + // sequences."); // } // Comment=There will be 55 emoji sequences with combinations of gender and skin-tone } - private void addCombos(String source, String sourceName, String combined, String remainder, String nameSuffix, - String separator, UnicodeSet forSplitting, String lastBase, String lastMod) { - if (SHOW_COMBOS) System.out.println( - "source: " + source - + ", combined: " + combined - + ", remainder: " + remainder - + ", nameSuffix: " + nameSuffix - + ", separator: " + separator - ); + private void addCombos( + String source, + String sourceName, + String combined, + String remainder, + String nameSuffix, + String separator, + UnicodeSet forSplitting, + String lastBase, + String lastMod) { + if (SHOW_COMBOS) + System.out.println( + "source: " + + source + + ", combined: " + + combined + + ", remainder: " + + remainder + + ", nameSuffix: " + + nameSuffix + + ", separator: " + + separator); int start = forSplitting.span(remainder, SpanCondition.NOT_CONTAINED); if (start == remainder.length()) { - // addCombo(source, source + mod + genSuffix, genPrefix, ": " + EmojiData.EMOJI_DATA_BETA.getName(mod)); + // addCombo(source, source + mod + genSuffix, genPrefix, ": " + + // EmojiData.EMOJI_DATA_BETA.getName(mod)); addCombo(source, sourceName, combined + remainder, nameSuffix); return; } @@ -614,8 +679,16 @@ private void addCombos(String source, String sourceName, String combined, String if (base.equals(lastBase) && mod.compareTo(lastMod) == 0) { continue; } - addCombos(source, sourceName, combined + remainder.substring(0, end) + mod, - remainder.substring(end), nameSuffix + separator + 
EmojiData.EMOJI_DATA_BETA.getName(mod), ", ", forSplitting, base, mod); + addCombos( + source, + sourceName, + combined + remainder.substring(0, end) + mod, + remainder.substring(end), + nameSuffix + separator + EmojiData.EMOJI_DATA_BETA.getName(mod), + ", ", + forSplitting, + base, + mod); } } @@ -628,12 +701,16 @@ private void addAttribute(String source, UnicodeSet unicodeSet, String title) { private void addCombo(String source, String sourceName, String combo, String nameSuffix) { String newName = sourceName + nameSuffix; - if (SHOW_COMBOS) System.out.println("*addCombo: " - + "cp: " + source - + ", combo: " + combo - + ", newName: " + newName - ); - //System.out.println("Adding: " + newName); + if (SHOW_COMBOS) + System.out.println( + "*addCombo: " + + "cp: " + + source + + ", combo: " + + combo + + ", newName: " + + newName); + // System.out.println("Adding: " + newName); allCharacters.add(combo); names.put(combo, newName); Status status = statuses.get(source); @@ -650,70 +727,73 @@ private void addCombo(String source, String sourceName, String combo, String nam setCategoryAndSuborder(combo, categories.get(source)); // if (Emoji.HAIR_PIECES.containsSome(cp)) { // HACK - // names.put(combo, - // EmojiData.EMOJI_DATA.getName(UTF16.valueOf(Character.codePointAt(combo, 0))) + // names.put(combo, + // + // EmojiData.EMOJI_DATA.getName(UTF16.valueOf(Character.codePointAt(combo, 0))) // + ": " + // getName(Character.codePointBefore(combo, combo.length()))); // } } - public final Comparator comparator = new Comparator() { - @Override - public int compare(String o1, String o2) { - if ("🛼".equals(o1) || "🛼".equals(o1)) { - int debug = 0; - } - if (o1 == o2) { - return 0; - } + public final Comparator comparator = + new Comparator() { + @Override + public int compare(String o1, String o2) { + if ("🛼".equals(o1) || "🛼".equals(o1)) { + int debug = 0; + } + if (o1 == o2) { + return 0; + } - // if both items have "real" collation data, use that. 
- int r1 = EmojiOrder.STD_ORDER.mapCollator.getOrdering(o1); - int r2 = EmojiOrder.STD_ORDER.mapCollator.getOrdering(o2); - if (r1 >= 0 && r2 >= 0) { - return EmojiOrder.STD_ORDER.codepointCompare.compare(o1, o2); - } + // if both items have "real" collation data, use that. + int r1 = EmojiOrder.STD_ORDER.mapCollator.getOrdering(o1); + int r2 = EmojiOrder.STD_ORDER.mapCollator.getOrdering(o2); + if (r1 >= 0 && r2 >= 0) { + return EmojiOrder.STD_ORDER.codepointCompare.compare(o1, o2); + } - // if there are no after values, then neither item is in CandidateData - String after1 = after.get(o1); - String after2 = after.get(o2); - if (after1 == null && after1 == null) { - return EmojiOrder.STD_ORDER.codepointCompare.compare(o1, o2); - } + // if there are no after values, then neither item is in CandidateData + String after1 = after.get(o1); + String after2 = after.get(o2); + if (after1 == null && after1 == null) { + return EmojiOrder.STD_ORDER.codepointCompare.compare(o1, o2); + } - // // this getCategory falls back to the full emojit set. - // String cat1 = getCategory(o1); - // int catOrder1 = EmojiOrder.STD_ORDER.getGroupOrder(cat1); - // - // String cat2 = getCategory(o2); - // int catOrder2 = EmojiOrder.STD_ORDER.getGroupOrder(cat2); - // if (catOrder1 != catOrder2) { - // return catOrder1 > catOrder2 ? 1 : -1; - // } + // // this getCategory falls back to the full emojit set. + // String cat1 = getCategory(o1); + // int catOrder1 = EmojiOrder.STD_ORDER.getGroupOrder(cat1); + // + // String cat2 = getCategory(o2); + // int catOrder2 = EmojiOrder.STD_ORDER.getGroupOrder(cat2); + // if (catOrder1 != catOrder2) { + // return catOrder1 > catOrder2 ? 
1 : -1; + // } - // if the after values are different, return them - // either either is null, then the character is outside of + // if the after values are different, return them + // either either is null, then the character is outside of - if (after1 == null) { - after1 = o1; - } - if (after2 == null) { - after2 = o2; - } - if (!after1.equals(after2)) { - return EmojiOrder.STD_ORDER.codepointCompare.compare(after1, after2); - } + if (after1 == null) { + after1 = o1; + } + if (after2 == null) { + after2 = o2; + } + if (!after1.equals(after2)) { + return EmojiOrder.STD_ORDER.codepointCompare.compare(after1, after2); + } - // The after values are identical, so get the suborders - // If one is missing (they both can't be simultaneously missing), use -1 to get before - Integer so1 = suborder.get(o1); - int so1i = so1 == null ? -1 : so1; - Integer so2 = suborder.get(o2); - int so2i = so2 == null ? -1 : so2; + // The after values are identical, so get the suborders + // If one is missing (they both can't be simultaneously missing), use -1 to get + // before + Integer so1 = suborder.get(o1); + int so1i = so1 == null ? -1 : so1; + Integer so2 = suborder.get(o2); + int so2i = so2 == null ? -1 : so2; - return so1i-so2i; - } - }; + return so1i - so2i; + } + }; private long date; /** @@ -754,6 +834,7 @@ public String getName(int source) { public String getUName(String source) { return unames.get(source); } + public String getUName(int source) { return unames.get(source); } @@ -766,6 +847,7 @@ public Set getAnnotations(String source) { Set list = annotations.get(source); return list == null ? 
Collections.emptySet() : new TreeSet<>(list); } + public Set getAnnotations(int source) { return CldrUtility.ifNull(annotations.get(source), Collections.emptySet()); } @@ -782,6 +864,7 @@ public Set getProposal(String source) { public CandidateData.Quarter getQuarter(String source) { return quarters.get(source); } + public CandidateData.Quarter getQuarter(int source) { return quarters.get(source); } @@ -789,6 +872,7 @@ public CandidateData.Quarter getQuarter(int source) { public Status getStatus(String source) { return statuses.get(source); } + public Status getStatus(int source) { return statuses.get(source); } @@ -796,6 +880,7 @@ public Status getStatus(int source) { public String getComment(String source) { return comments.get(source); } + public String getComment(int source) { return comments.get(source); } @@ -804,16 +889,18 @@ public String getCategory(int source) { String result = EmojiOrder.STD_ORDER.charactersToOrdering.get(source); return result != null ? result : categories.get(source); } + public String getCategory(String source) { String result = EmojiOrder.STD_ORDER.charactersToOrdering.get(source); if (result != null) { return result; } -// final String stripped = EmojiData.removeEmojiVariants(EmojiData.MODIFIERS.stripFrom(source, true)); -// result = EmojiOrder.STD_ORDER.charactersToOrdering.get(stripped); -// if (result != null) { -// return result; -// } + // final String stripped = + // EmojiData.removeEmojiVariants(EmojiData.MODIFIERS.stripFrom(source, true)); + // result = EmojiOrder.STD_ORDER.charactersToOrdering.get(stripped); + // if (result != null) { + // return result; + // } result = categories.get(source); if (result != null) { return result; @@ -831,12 +918,16 @@ public List getOrder() { */ public MajorGroup getMajorGroup(String s) { MajorGroup result = EmojiOrder.STD_ORDER.majorGroupings.get(s); - return result != null ? result : EmojiOrder.STD_ORDER.getMajorGroupFromCategory(getCategory(s)); + return result != null + ? 
result + : EmojiOrder.STD_ORDER.getMajorGroupFromCategory(getCategory(s)); } public MajorGroup getMajorGroup(int s) { MajorGroup result = EmojiOrder.STD_ORDER.majorGroupings.get(s); - return result != null ? result :EmojiOrder.STD_ORDER.getMajorGroupFromCategory(getCategory(s)); + return result != null + ? result + : EmojiOrder.STD_ORDER.getMajorGroupFromCategory(getCategory(s)); } public MajorGroup getMajorGroupFromCategory(String category) { @@ -852,27 +943,31 @@ public static void main(String[] args) { int count = 0; for (String arg : args) { switch (arg) { - case "proposals": - generateProposalData(candidateData); - ++count; - break; - case "order": - IndexUnicodeProperties iup = IndexUnicodeProperties.make(Emoji.VERSION_BETA); - UnicodeMap gc = iup.loadEnum(UcdProperty.General_Category, UcdPropertyValues.General_Category_Values.class); - UnicodeSet unassigned = gc.getSet(UcdPropertyValues.General_Category_Values.Unassigned); - showOrdering(candidateData, unassigned); - ++count; - break; - default: - throw new IllegalArgumentException("Bad argument: " + arg); - + case "proposals": + generateProposalData(candidateData); + ++count; + break; + case "order": + IndexUnicodeProperties iup = IndexUnicodeProperties.make(Emoji.VERSION_BETA); + UnicodeMap gc = + iup.loadEnum( + UcdProperty.General_Category, + UcdPropertyValues.General_Category_Values.class); + UnicodeSet unassigned = + gc.getSet(UcdPropertyValues.General_Category_Values.Unassigned); + showOrdering(candidateData, unassigned); + ++count; + break; + default: + throw new IllegalArgumentException("Bad argument: " + arg); } } if (count == 0) { throw new IllegalArgumentException("No arguments found"); } - // try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(System.out))) { + // try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(System.out))) + // { // new EmojiDataSourceCombined().showOrderingInterleaved(MAX_PER_LINE, out); // } catch (IOException e) { // throw new 
ICUUncheckedIOException(e); @@ -898,10 +993,12 @@ public static void main(String[] args) { // // UnicodeSet candChars = candidateData.getAllCharacters(); // System.out.println(candChars.size() + "\t" + candChars); - // System.out.println("all\t" + DebugUtilities.composeStringsWhen("•", candChars, s -> s.contains(Emoji.TRANSGENDER))); + // System.out.println("all\t" + DebugUtilities.composeStringsWhen("•", candChars, + // s -> s.contains(Emoji.TRANSGENDER))); // UnicodeSet withoutDefectives = candidateData.getAllEmojiWithoutDefectives(); // System.out.println(withoutDefectives.size() + "\t" + withoutDefectives); - // System.out.println("wd\t" + DebugUtilities.composeStringsWhen("•", withoutDefectives, s -> s.contains(Emoji.TRANSGENDER))); + // System.out.println("wd\t" + DebugUtilities.composeStringsWhen("•", + // withoutDefectives, s -> s.contains(Emoji.TRANSGENDER))); // // String added = EmojiData.EMOJI_DATA_BETA.addEmojiVariants(Emoji.TRANSGENDER); // System.out.println(Utility.hex(Emoji.TRANSFLAG)); @@ -915,14 +1012,14 @@ public static void main(String[] args) { private static void generateProposalData(CandidateData instance) { System.out.println("\nData for proposalData.txt\n"); - //1F931; L2/16-280,L2/16-282r; BREAST-FEEDING + // 1F931; L2/16-280,L2/16-282r; BREAST-FEEDING Set done = new HashSet<>(); UnicodeSet missing = new UnicodeSet(); for (String item : instance.fullDraftForProposals) { if (instance.statuses.get(item) == Status.Provisional_Candidate -// || EmojiData.MODIFIERS.containsSome(item) -// || Emoji.GENDER_MARKERS.containsSome(item) - ) { + // || EmojiData.MODIFIERS.containsSome(item) + // || Emoji.GENDER_MARKERS.containsSome(item) + ) { continue; } String skeleton = ProposalData.getSkeleton(item); @@ -931,20 +1028,21 @@ private static void generateProposalData(CandidateData instance) { } done.add(skeleton); Set proposals = instance.getProposal(item); -// if (proposals == null) { -// missing.add(item); -// } - 
System.out.println(Utility.hex(skeleton) - + "; " + CollectionUtilities.join(proposals, ", ") - + "; " + instance.getName(item)); + // if (proposals == null) { + // missing.add(item); + // } + System.out.println( + Utility.hex(skeleton) + + "; " + + CollectionUtilities.join(proposals, ", ") + + "; " + + instance.getName(item)); } if (missing.isEmpty()) { return; } for (String item : missing) { - System.out.println(Utility.hex(item) - + "; " + "XXX" - + "; " + instance.getName(item)); + System.out.println(Utility.hex(item) + "; " + "XXX" + "; " + instance.getName(item)); } } @@ -957,9 +1055,10 @@ public static void showOrdering(CandidateData instance, UnicodeSet discard) { if (DEBUG) System.out.println("\nOrdering Data\n"); CountEmoji cm = new CountEmoji(); - Set sorted = instance.getAllCharacters().addAllTo( - new TreeSet<>(instance.comparator)); - // Map> baseToList = new TreeMap<>(EmojiOrder.STD_ORDER.codepointCompare); + Set sorted = + instance.getAllCharacters().addAllTo(new TreeSet<>(instance.comparator)); + // Map> baseToList = new + // TreeMap<>(EmojiOrder.STD_ORDER.codepointCompare); // baseToList. 
// for (String item : instance.suborder) { // // if (EmojiData.MODIFIERS.containsSome(item)) { @@ -976,16 +1075,23 @@ public static void showOrdering(CandidateData instance, UnicodeSet discard) { if (discard.containsSome(subItem)) { continue; } - if (DEBUG) System.out.println( - instance.after.get(subItem) - + "\t" + subItem - + "\t" + Utility.hex(subItem) - + "\t" + instance.categories.get(subItem) - + "\t" + instance.suborder.get(subItem) - + "\t" + instance.getName(subItem) - + "\tkw:" + instance.getAnnotations(subItem) - + "\tucd:" + instance.getUName(subItem) - ); + if (DEBUG) + System.out.println( + instance.after.get(subItem) + + "\t" + + subItem + + "\t" + + Utility.hex(subItem) + + "\t" + + instance.categories.get(subItem) + + "\t" + + instance.suborder.get(subItem) + + "\t" + + instance.getName(subItem) + + "\tkw:" + + instance.getAnnotations(subItem) + + "\tucd:" + + instance.getUName(subItem)); if (instance.getAfter(subItem) == null) { throw new IllegalArgumentException(); } @@ -1000,7 +1106,8 @@ public static void showOrdering(CandidateData instance, UnicodeSet discard) { // for (String s : instance.allCharacters) { // System.out.println(s + "\t" + ordering.getCategory(s)); // } - // for (String s : new UnicodeSet("[{👨‍🦰️}{👨‍🦱️}{👨‍🦲️}{👨‍🦳️}{👨🏻‍🦰️}{👨🏻‍🦱️}{👨🏻‍🦲️}{👨🏻‍🦳️}{👨🏼‍🦰️}{👨🏼‍🦱️}{👨🏼‍🦲️}{👨🏼‍🦳️}{👨🏽‍🦰️}{👨🏽‍🦱️}{👨🏽‍🦲️}{👨🏽‍🦳️}{👨🏾‍🦰️}{👨🏾‍🦱️}{👨🏾‍🦲️}{👨🏾‍🦳️}{👨🏿‍🦰️}{👨🏿‍🦱️}{👨🏿‍🦲️}{👨🏿‍🦳️}{👩‍🦰️}{👩‍🦱️}{👩‍🦲️}{👩‍🦳️}{👩🏻‍🦰️}{👩🏻‍🦱️}{👩🏻‍🦲️}{👩🏻‍🦳️}{👩🏼‍🦰️}{👩🏼‍🦱️}{👩🏼‍🦲️}{👩🏼‍🦳️}{👩🏽‍🦰️}{👩🏽‍🦱️}{👩🏽‍🦲️}{👩🏽‍🦳️}{👩🏾‍🦰️}{👩🏾‍🦱️}{👩🏾‍🦲️}{👩🏾‍🦳️}{👩🏿‍🦰️}{👩🏿‍🦱️}{👩🏿‍🦲️}{👩🏿‍🦳️}{🦸️‍♀️}{🦸️‍♂️}{🦹️‍♀️}{🦹️‍♂️}]")) { + // for (String s : new + // 
UnicodeSet("[{👨‍🦰️}{👨‍🦱️}{👨‍🦲️}{👨‍🦳️}{👨🏻‍🦰️}{👨🏻‍🦱️}{👨🏻‍🦲️}{👨🏻‍🦳️}{👨🏼‍🦰️}{👨🏼‍🦱️}{👨🏼‍🦲️}{👨🏼‍🦳️}{👨🏽‍🦰️}{👨🏽‍🦱️}{👨🏽‍🦲️}{👨🏽‍🦳️}{👨🏾‍🦰️}{👨🏾‍🦱️}{👨🏾‍🦲️}{👨🏾‍🦳️}{👨🏿‍🦰️}{👨🏿‍🦱️}{👨🏿‍🦲️}{👨🏿‍🦳️}{👩‍🦰️}{👩‍🦱️}{👩‍🦲️}{👩‍🦳️}{👩🏻‍🦰️}{👩🏻‍🦱️}{👩🏻‍🦲️}{👩🏻‍🦳️}{👩🏼‍🦰️}{👩🏼‍🦱️}{👩🏼‍🦲️}{👩🏼‍🦳️}{👩🏽‍🦰️}{👩🏽‍🦱️}{👩🏽‍🦲️}{👩🏽‍🦳️}{👩🏾‍🦰️}{👩🏾‍🦱️}{👩🏾‍🦲️}{👩🏾‍🦳️}{👩🏿‍🦰️}{👩🏿‍🦱️}{👩🏿‍🦲️}{👩🏿‍🦳️}{🦸️‍♀️}{🦸️‍♂️}{🦹️‍♀️}{🦹️‍♂️}]")) { // if (DEBUG) System.out.println(s + "\t" + ordering.getCategory(s)); // } if (DEBUG) System.out.println("\n\nSO\tType\tCategory\tHex\tCldr Name\tUcd Name"); @@ -1013,18 +1120,24 @@ public static void showOrdering(CandidateData instance, UnicodeSet discard) { UnicodeSet uset = bucket.sets.getSet(maj); if (uset.isEmpty()) continue; Set items = uset.addAllTo(new TreeSet<>(instance.comparator)); - // if (DEBUG) System.out.println(evalue.toStringPlain() + "\t" + maj.toPlainString() + "\t" + items.size()); + // if (DEBUG) System.out.println(evalue.toStringPlain() + "\t" + maj.toPlainString() + // + "\t" + items.size()); for (String subItem : items) { String uName = instance.getUName(subItem); - if (DEBUG) System.out.println( - instance.getAfter(subItem) - + "\t" + ++sortOrder - + "\t" + evalue.toStringPlain() - + "\t" + maj.toPlainString() - + "\tU+" + Utility.hex(subItem, ", U+") - + "\t" + instance.getName(subItem) - + (uName != null ? "\t" + uName : "") - ); + if (DEBUG) + System.out.println( + instance.getAfter(subItem) + + "\t" + + ++sortOrder + + "\t" + + evalue.toStringPlain() + + "\t" + + maj.toPlainString() + + "\tU+" + + Utility.hex(subItem, ", U+") + + "\t" + + instance.getName(subItem) + + (uName != null ? 
"\t" + uName : "")); if (lastSubitem != null) { if (instance.comparator.compare(lastSubitem, subItem) >= 0) { int debug = 0; @@ -1037,8 +1150,6 @@ public static void showOrdering(CandidateData instance, UnicodeSet discard) { } } - - public Set getCandidatesAfter(String s) { return (Set) existingToDraftCandidatesAfter.get(s); } @@ -1047,27 +1158,44 @@ public String getAfter(String s) { return after.get(s); } - private static void showCandidateData(CandidateData cd, boolean sortWithCandidateComparator, boolean retainOnlyNew) { + private static void showCandidateData( + CandidateData cd, boolean sortWithCandidateComparator, boolean retainOnlyNew) { cd.comparator.compare(Utility.fromHex("1F9B5"), Utility.fromHex("1F9B6 1F3FF")); if (DEBUG) System.out.println("Code Point\tChart\tGlyph\tSample\tColored Glyph\tName"); UnicodeSet chars2 = cd.getAllEmojiWithoutDefectives(); if (retainOnlyNew) { - chars2 = new UnicodeSet(chars2).removeAll(EmojiData.EMOJI_DATA_BETA.getAllEmojiWithoutDefectives()).freeze(); + chars2 = + new UnicodeSet(chars2) + .removeAll(EmojiData.EMOJI_DATA_BETA.getAllEmojiWithoutDefectives()) + .freeze(); } - List sorted = new ArrayList<>(chars2.addAllTo(new TreeSet( - sortWithCandidateComparator - ? cd.comparator - : EmojiOrder.STD_ORDER.codepointCompare))); + List sorted = + new ArrayList<>( + chars2.addAllTo( + new TreeSet( + sortWithCandidateComparator + ? 
cd.comparator + : EmojiOrder.STD_ORDER.codepointCompare))); String lastCategory = null; MajorGroup lastMajorGroup = null; List lastCategoryList = new ArrayList(); if (DEBUG) { - System.out.println("chars2: " + chars2.size()+ "\n" + DebugUtilities.composeStringsWhen("•", chars2, s1 -> s1.contains(Emoji.TRANSGENDER))); - System.out.println("sorted: " + sorted.size()+ "\n" + DebugUtilities.composeStringsWhen("•", sorted, s2 -> s2.contains(Emoji.TRANSGENDER))); + System.out.println( + "chars2: " + + chars2.size() + + "\n" + + DebugUtilities.composeStringsWhen( + "•", chars2, s1 -> s1.contains(Emoji.TRANSGENDER))); + System.out.println( + "sorted: " + + sorted.size() + + "\n" + + DebugUtilities.composeStringsWhen( + "•", sorted, s2 -> s2.contains(Emoji.TRANSGENDER))); } - Map errors = new LinkedHashMap<>(); + Map errors = new LinkedHashMap<>(); for (String s : sorted) { if (s.contains(Emoji.TRANSGENDER)) { int debug = 0; @@ -1075,53 +1203,74 @@ private static void showCandidateData(CandidateData cd, boolean sortWithCandidat String category = cd.getCategory(s); MajorGroup majorGroup = cd.getMajorGroup(s); if (majorGroup == null) { - cd.getMajorGroup(s); + cd.getMajorGroup(s); } if (majorGroup != lastMajorGroup) { if (DEBUG) System.out.println("\n@ " + majorGroup.name()); - lastMajorGroup = majorGroup; + lastMajorGroup = majorGroup; } - if (!Objects.equal(category,lastCategory)) { + if (!Objects.equal(category, lastCategory)) { if (lastCategory != null) { - if (DEBUG) System.out.println("# lastCategory: " + lastCategory + " = \t" + CollectionUtilities.join(lastCategoryList, " ")); + if (DEBUG) + System.out.println( + "# lastCategory: " + + lastCategory + + " = \t" + + CollectionUtilities.join(lastCategoryList, " ")); } if (DEBUG) System.out.println(category); - lastCategory = category; + lastCategory = category; lastCategoryList.clear(); } lastCategoryList.add(s); if (cd.getProposal(s) == null) { errors.put(s, "No proposal value for: "); } - if (DEBUG) 
System.out.println(Utility.hex(s) - + "\t" + s - // + "\t" + cd.getQuarter(s) - + "\t" + cd.getName(s) - + "\t" + cd.getProposal(s) - + "\t" + CollectionUtilities.join(cd.getAnnotations(s), " | ") - ); + if (DEBUG) + System.out.println( + Utility.hex(s) + + "\t" + + s + // + "\t" + cd.getQuarter(s) + + "\t" + + cd.getName(s) + + "\t" + + cd.getProposal(s) + + "\t" + + CollectionUtilities.join(cd.getAnnotations(s), " | ")); // for (String annotation : cd.getAnnotations(s)) { // if (DEBUG) System.out.println("• " + annotation); // } } - if (DEBUG) System.out.println("# list: " + lastCategory + " = \t" + CollectionUtilities.join(lastCategoryList, " ")); + if (DEBUG) + System.out.println( + "# list: " + + lastCategory + + " = \t" + + CollectionUtilities.join(lastCategoryList, " ")); if (errors.isEmpty()) { - errors.forEach((String key, String value) -> System.out.println(Utility.hex(key) + "\t" + value)); + errors.forEach( + (String key, String value) -> + System.out.println(Utility.hex(key) + "\t" + value)); throw new IllegalArgumentException("Failed"); } } private static void showLast(UnicodeSet last) { if (DEBUG) System.out.println("# Total: " + last.size()); - if (DEBUG) System.out.println("# USet: " + CollectionUtilities.join( - last.addAllTo(new LinkedHashSet<>())," ") + "\n"); + if (DEBUG) + System.out.println( + "# USet: " + + CollectionUtilities.join(last.addAllTo(new LinkedHashSet<>()), " ") + + "\n"); last.clear(); } @Override public String transform(String source) { String temp = getName(source); - main: { + main: + { if ("I LOVE YOU HAND SIGN".equals(temp)) { temp = "LOVE-YOU HAND"; break main; @@ -1133,48 +1282,50 @@ public String transform(String source) { temp = EmojiData.EMOJI_DATA_BETA.getFallbackName(source); break main; } - switch(CountEmoji.Category.getBucket(source)) { - case component: - temp = UCharacter.getName(EmojiData.removeEmojiVariants(source), "+"); - break; - case character: - case flag_seq: - case keycap_seq: - case tag_seq: - break; - 
default: - String replacement = null; - int trailPos = source.lastIndexOf(Emoji.JOINER_STR); - if (trailPos > 0) { - String ending = source.substring(trailPos); - switch (ending.replace(Emoji.EMOJI_VARIANT_STRING, "")) { - case Emoji.JOINER_STR + Emoji.MALE: - replacement = "MAN"; + switch (CountEmoji.Category.getBucket(source)) { + case component: + temp = UCharacter.getName(EmojiData.removeEmojiVariants(source), "+"); break; - case Emoji.JOINER_STR + Emoji.FEMALE: - replacement = "WOMAN"; - - } - if (replacement != null) { - temp = getName(source.substring(0, source.length() - ending.length())); - } - if (temp != null) { - if (temp.contains("PERSON")) { - temp = temp.replaceAll("PERSON", replacement); - } else if (temp.contains("person")) { - temp = temp.replaceAll("person", replacement); - } else { - temp = replacement + " " + temp; + case character: + case flag_seq: + case keycap_seq: + case tag_seq: + break; + default: + String replacement = null; + int trailPos = source.lastIndexOf(Emoji.JOINER_STR); + if (trailPos > 0) { + String ending = source.substring(trailPos); + switch (ending.replace(Emoji.EMOJI_VARIANT_STRING, "")) { + case Emoji.JOINER_STR + Emoji.MALE: + replacement = "MAN"; + break; + case Emoji.JOINER_STR + Emoji.FEMALE: + replacement = "WOMAN"; + } + if (replacement != null) { + temp = getName(source.substring(0, source.length() - ending.length())); + } + if (temp != null) { + if (temp.contains("PERSON")) { + temp = temp.replaceAll("PERSON", replacement); + } else if (temp.contains("person")) { + temp = temp.replaceAll("person", replacement); + } else { + temp = replacement + " " + temp; + } } } - } - break; + break; } } return temp == null ? 
temp : temp.toLowerCase(Locale.ROOT); } - enum MatchInclusion {includeFilterMatches, excludeFilterMatches} + enum MatchInclusion { + includeFilterMatches, + excludeFilterMatches + } private UnicodeSet addWithCharFilter(UnicodeSet source, Predicate filter) { UnicodeSet result = new UnicodeSet(); @@ -1190,7 +1341,6 @@ private UnicodeSet addWithCharFilter(UnicodeSet source, UnicodeSet filter) { return new UnicodeSet(source).retainAll(filter).freeze(); } - @Override public UnicodeSet getEmojiComponents() { return addWithCharFilter(emoji_Component, draft); @@ -1203,8 +1353,9 @@ public UnicodeSet getSingletonsWithDefectives() { @Override public UnicodeSet getEmojiPresentationSet() { - return addWithCharFilter(singleCharacters, s -> draft.contains(s) - && !getTextPresentationSet().containsSome(s)); + return addWithCharFilter( + singleCharacters, + s -> draft.contains(s) && !getTextPresentationSet().containsSome(s)); } @Override @@ -1224,8 +1375,8 @@ public UnicodeSet getTagSequences() { @Override public UnicodeSet getModifierSequences() { - return addWithCharFilter(draft, s -> EmojiData.MODIFIERS.containsSome(s) - && !ZWJ_SET.containsSome(s)); + return addWithCharFilter( + draft, s -> EmojiData.MODIFIERS.containsSome(s) && !ZWJ_SET.containsSome(s)); } @Override @@ -1258,7 +1409,8 @@ public UnicodeSet getEmojiWithVariants() { @Override public UnicodeSet getAllEmojiWithoutDefectives() { - return addWithCharFilter(draft, + return addWithCharFilter( + draft, x -> { if (x.contains(Emoji.TRANSGENDER)) { int debug = 0; @@ -1284,8 +1436,8 @@ public UnicodeSet getGenderBases() { @Override public UnicodeSet getSingletonsWithoutDefectives() { - return addWithCharFilter(singleCharacters, s -> draft.contains(s) - && !getEmojiComponents().containsSome(s)); + return addWithCharFilter( + singleCharacters, s -> draft.contains(s) && !getEmojiComponents().containsSome(s)); } @Override @@ -1301,7 +1453,7 @@ public String getUnicodeName(String source) { public VersionInfo getNewest(String s) 
{ return BirthInfo.getVersionInfo(s); // Age_Values result = Emoji.getNewest(s); - // return result == Age_Values.Unassigned ? Emoji.UCD11 + // return result == Age_Values.Unassigned ? Emoji.UCD11 // : VersionInfo.getInstance(result.getShortName()); } @@ -1311,10 +1463,13 @@ public UnicodeSet getTakesSign() { } public UnicodeSet getAllCharacters(Status status) { - switch(status) { - case Provisional_Candidate: return provisional; - case Draft_Candidate : return draft; - default: throw new IllegalArgumentException(); + switch (status) { + case Provisional_Candidate: + return provisional; + case Draft_Candidate: + return draft; + default: + throw new IllegalArgumentException(); } } @@ -1339,7 +1494,8 @@ public String getVersionString() { @Override public String getPlainVersion() { - return "candidates:" + DateFormat.getInstanceForSkeleton("yyyyMMdd", ULocale.ROOT).format(date); + return "candidates:" + + DateFormat.getInstanceForSkeleton("yyyyMMdd", ULocale.ROOT).format(date); } /** We don't expect to have any more of these */ @@ -1363,5 +1519,4 @@ public UnicodeSet getModifierBasesRgi() { public UnicodeSet getAllEmojiWithoutDefectivesOrModifiers() { return addWithCharFilter(draft, s -> EmojiData.MODIFIERS.containsNone(s)); } - } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/CarrierGlyphs.java b/unicodetools/src/main/java/org/unicode/tools/emoji/CarrierGlyphs.java index c7c5716ae..bfd0b1d31 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/CarrierGlyphs.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/CarrierGlyphs.java @@ -1,68 +1,66 @@ package org.unicode.tools.emoji; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.Transform; +import com.ibm.icu.text.UnicodeSet; import java.io.IOException; import java.io.PrintWriter; import java.util.Comparator; import java.util.Locale; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.props.IndexUnicodeProperties; 
import org.unicode.props.UcdProperty; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.Transform; -import com.ibm.icu.text.UnicodeSet; - public class CarrierGlyphs { - static final IndexUnicodeProperties LATEST = IndexUnicodeProperties - .make(Emoji.VERSION_BETA_UNICODE); - static final UnicodeMap Emoji_SB = LATEST - .load(UcdProperty.Emoji_SB); - static final UnicodeMap Emoji_DCM = LATEST - .load(UcdProperty.Emoji_DCM); - static final UnicodeMap Emoji_KDDI = LATEST - .load(UcdProperty.Emoji_KDDI); - private static final Comparator SB_FIRST = new Comparator() { - public int compare(String o1, String o2) { - if (o1.equals(o2)) { - return 0; - } - String c1 = Emoji_SB.get(o1); - String c2 = Emoji_SB.get(o2); - if (c1 == null) { - return c2 == null ? o1.compareTo(o2) : 1; - } else if (c2 == null) { - return -1; - } - return c1.compareTo(c2); - } - }; + static final IndexUnicodeProperties LATEST = + IndexUnicodeProperties.make(Emoji.VERSION_BETA_UNICODE); + static final UnicodeMap Emoji_SB = LATEST.load(UcdProperty.Emoji_SB); + static final UnicodeMap Emoji_DCM = LATEST.load(UcdProperty.Emoji_DCM); + static final UnicodeMap Emoji_KDDI = LATEST.load(UcdProperty.Emoji_KDDI); + private static final Comparator SB_FIRST = + new Comparator() { + public int compare(String o1, String o2) { + if (o1.equals(o2)) { + return 0; + } + String c1 = Emoji_SB.get(o1); + String c2 = Emoji_SB.get(o2); + if (c1 == null) { + return c2 == null ? 
o1.compareTo(o2) : 1; + } else if (c2 == null) { + return -1; + } + return c1.compareTo(c2); + } + }; public static void main(String[] args) throws IOException { UnicodeSet carrier = new UnicodeSet(); // new TreeSet(SB_FIRST); carrier.addAll(Emoji_KDDI.keySet()); carrier.addAll(Emoji_DCM.keySet()); carrier.addAll(Emoji_SB.keySet()); - PrintWriter out = FileUtilities.openUTF8Writer(Settings.UnicodeTools.UNICODETOOLS_REPO_DIR - + "/reports/tr51/", "carrier-emoji.html"); - out.println("\n" + - "\n" + - "\n" + - "\n" + - "Draft Carrier Data (Full)\n" + - "\n" + - "
  • \n" + - "" + - "" + - "" + - "" + - "" + - "" + - "\n" - ); + PrintWriter out = + FileUtilities.openUTF8Writer( + Settings.UnicodeTools.UNICODETOOLS_REPO_DIR + "/reports/tr51/", + "carrier-emoji.html"); + out.println( + "\n" + + "\n" + + "\n" + + "\n" + + "Draft Carrier Data (Full)\n" + + "\n" + + "
    HexChargmail imagehex kddikddi imagehex dcmdcm imagehex sbsb imageName
    \n" + + "" + + "" + + "" + + "" + + "" + + "" + + "\n"); for (String s : carrier) { final String au = AU_URL.transform(s); final String dcm = DCM_URL.transform(s); @@ -71,18 +69,37 @@ public static void main(String[] args) throws IOException { final String dcmCode = Emoji_DCM.get(s); final String sbCode = Emoji_SB.get(s); out.println( - "" + - "" + - "\n\t" + - "\n\t" + - "\n\t" + - "\n\t" + - "\n\t" + - "\n\t" + - "\n\t" + - "\n\t" + - "" - ); + "" + + "" + + "\n\t" + + "\n\t" + + "\n\t" + + "\n\t" + + "\n\t" + + "\n\t" + + "\n\t" + + "\n\t" + + ""); } out.println("
    HexChargmail imagehex kddikddi imagehex dcmdcm imagehex sbsb imageName
    " + Utility.hex(s) + "" + s + "" + img("gmail", s, "x") + "" + replaceNull(kddiCode, "n/a") + "" + img("kddi", s, kddiCode) + "" + replaceNull(dcmCode, "n/a") + "" + img("dcm", s, dcmCode) + "" + replaceNull(sbCode, "n/a") + "" + img("sb", s, sbCode) + "" + UCharacter.getName(s, "+") + "
    " + + Utility.hex(s) + + "" + + s + + "" + + img("gmail", s, "x") + + "" + + replaceNull(kddiCode, "n/a") + + "" + + img("kddi", s, kddiCode) + + "" + + replaceNull(dcmCode, "n/a") + + "" + + img("dcm", s, dcmCode) + + "" + + replaceNull(sbCode, "n/a") + + "" + + img("sb", s, sbCode) + + "" + + UCharacter.getName(s, "+") + + "
    "); out.close(); @@ -97,66 +114,68 @@ public static String img(String type, final String unicode, String code) { return ""; } final String dir = Settings.UnicodeTools.UNICODETOOLS_REPO_DIR + "/reports/tr51/"; - final String filename = "images/" - + type - + "/" - + type + - "_" + Utility.hex(unicode, "_").toLowerCase(Locale.ROOT) - + ".gif"; - return // !new File(dir, filename).exists() ? "missing" : - ""; + final String filename = + "images/" + + type + + "/" + + type + + "_" + + Utility.hex(unicode, "_").toLowerCase(Locale.ROOT) + + ".gif"; + return // !new File(dir, filename).exists() ? "missing" : + ""; } - static final Transform AU_URL = new Transform() { - public String transform(String s) { - String transformed = Emoji_KDDI.get(s); - if (transformed == null) { - return null; - // transformed = "fffd"; - } else { - transformed = transformed.toLowerCase(Locale.ROOT); - } - return "http://trialgoods.com/images/200807au/" + transformed - + ".gif"; - } - }; - static final Transform DCM_URL = new Transform() { - public String transform(String s) { - String transformed = Emoji_DCM.get(s); - if (transformed == null) { - return null; - // transformed = "fffd"; - } else { - transformed = transformed.toLowerCase(Locale.ROOT); - } - return "http://trialgoods.com/images/200807i/" + transformed - + ".gif"; - } - }; + static final Transform AU_URL = + new Transform() { + public String transform(String s) { + String transformed = Emoji_KDDI.get(s); + if (transformed == null) { + return null; + // transformed = "fffd"; + } else { + transformed = transformed.toLowerCase(Locale.ROOT); + } + return "http://trialgoods.com/images/200807au/" + transformed + ".gif"; + } + }; + static final Transform DCM_URL = + new Transform() { + public String transform(String s) { + String transformed = Emoji_DCM.get(s); + if (transformed == null) { + return null; + // transformed = "fffd"; + } else { + transformed = transformed.toLowerCase(Locale.ROOT); + } + return 
"http://trialgoods.com/images/200807i/" + transformed + ".gif"; + } + }; // http://trialgoods.com/images/200807sb/F72.gif - static final Transform SB_URL = new Transform() { - public String transform(String s) { - //if (true) return null; + static final Transform SB_URL = + new Transform() { + public String transform(String s) { + // if (true) return null; - String transformed = Emoji_SB.get(s); - if (transformed == null) { - return null; - // transformed = "fffd"; - } else { - int sjis = Integer.parseInt(transformed, 16); - int fixed = sbFromShiftJis(sjis); - String trail = Utility.hex((fixed & 0xFF),2).toLowerCase(Locale.ROOT); - transformed = ((char)(fixed >> 8))+ trail; - } - return "http://trialgoods.com/images/200807sb/" + transformed - + ".gif"; - } - }; + String transformed = Emoji_SB.get(s); + if (transformed == null) { + return null; + // transformed = "fffd"; + } else { + int sjis = Integer.parseInt(transformed, 16); + int fixed = sbFromShiftJis(sjis); + String trail = Utility.hex((fixed & 0xFF), 2).toLowerCase(Locale.ROOT); + transformed = ((char) (fixed >> 8)) + trail; + } + return "http://trialgoods.com/images/200807sb/" + transformed + ".gif"; + } + }; static int sbFromShiftJis(int b) { int b1 = b >> 8; int b2 = b & 0xFF; - //Create a RowCell instance from a Shift-JIS byte pair. + // Create a RowCell instance from a Shift-JIS byte pair. // Returns: // A RowCell instance with the row-cell value pair. @@ -168,24 +187,24 @@ static int sbFromShiftJis(int b) { // Can't use the following, since it blows out on F7.. 
- // if (!((0x81 <= b1 && b1 <= 0x9f || 0xe0 <= b1 && b1 <= 0xef) + // if (!((0x81 <= b1 && b1 <= 0x9f || 0xe0 <= b1 && b1 <= 0xef) // && 0x40 <= b2 && b2 <= 0xfc && b2 != 0x7f)) { // throw new IllegalArgumentException("value out of range " + Utility.hex(b)); // } if (b1 <= 0x9f) { - b1 = (b1 - 0x80) << 1; + b1 = (b1 - 0x80) << 1; } else { - b1 = (b1 - 0xc0) << 1; + b1 = (b1 - 0xc0) << 1; } if (b2 <= 0x9e) { - b1 -= 1; - if (b2 <= 0x7e) { - b2 -= 0x3f; - } else { - b2 -= 0x40; - } + b1 -= 1; + if (b2 <= 0x7e) { + b2 -= 0x3f; + } else { + b2 -= 0x40; + } } else { - b2 -= 0x9e; + b2 -= 0x9e; } // According to http://trialgoods.com/emoji/?career=sb&page=all, // F741 => $E! = 4521 @@ -198,28 +217,28 @@ static int sbFromShiftJis(int b) { // but then fails for the F's // 🚶 F7A1 => F" 4622, should be 4621 if (b1 >= 0x46) { - --b2; - // 👦 F941 => G! 4721 - if (b1 >= 0x49) { - b1 -= 2; - b2 += 1; - // 📝 F9A1 => H" 4822 should be O! 4F21 - if (b1 >= 'H') { - b1 += 'O' - 'H'; - b2 -= 1; - // 😥 FB41 => R 5220 should be $P! 5021 - if (b1 >= 'R') { + --b2; + // 👦 F941 => G! 4721 + if (b1 >= 0x49) { b1 -= 2; b2 += 1; - // 🏩 FBA1 => Q" 5122 should be $Q! 5121 - if (b1 >= 'Q') { - b2 -= 1; + // 📝 F9A1 => H" 4822 should be O! 4F21 + if (b1 >= 'H') { + b1 += 'O' - 'H'; + b2 -= 1; + // 😥 FB41 => R 5220 should be $P! 5021 + if (b1 >= 'R') { + b1 -= 2; + b2 += 1; + // 🏩 FBA1 => Q" 5122 should be $Q! 5121 + if (b1 >= 'Q') { + b2 -= 1; + } + // final is ™ FBD7 => QW 5157, so checks out. + } } - // final is ™ FBD7 => QW 5157, so checks out. 
- } } - } } return (b1 << 8) & 0xFF00 | (b2 & 0xFF); - } + } } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/ChartUtilities.java b/unicodetools/src/main/java/org/unicode/tools/emoji/ChartUtilities.java index 681b134fa..0be73db67 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/ChartUtilities.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/ChartUtilities.java @@ -1,49 +1,63 @@ package org.unicode.tools.emoji; +import com.ibm.icu.util.ICUUncheckedIOException; import java.io.IOException; import java.util.Locale; - import org.unicode.cldr.draft.FileUtilities; - import org.unicode.text.utility.UtilityBase; -import com.ibm.icu.util.ICUUncheckedIOException; - public class ChartUtilities { - public static void writeHeader(String outFileName, Appendable out, String title, String indexRelLink, - boolean skipVersion, String firstLine, String dataDir, String tr51Url) { + public static void writeHeader( + String outFileName, + Appendable out, + String title, + String indexRelLink, + boolean skipVersion, + String firstLine, + String dataDir, + String tr51Url) { final String fullTitle = title + (skipVersion ? "" : ", v" + Emoji.VERSION_STRING); String headerLine = "\n" - + UtilityBase.HTML_HEAD - + "\n" - + "" + fullTitle - + (skipVersion ? "" : Emoji.BETA_TITLE_AFFIX) + "\n" - + "\n" - + "\n" - + ChartUtilities.getUnicodeHeader(indexRelLink) + ChartUtilities.getButton() + "\n" - + "

    " + fullTitle - + (skipVersion ? "" : Emoji.BETA_HEADER_AFFIX) + "

    \n" - + (skipVersion ? "" : ChartUtilities.getPointToOther(outFileName, title)) - + "

    " - + "Index & Help\n" - + " | Images & Rights\n" - + " | Spec\n" - + " | Proposing Additions" - + "

    \n" - + firstLine - + (dataDir == null ? "" : "" - + "

    While these charts use a particular version of the Unicode Emoji data files, " - + "the images and format may be updated at any time." - + " For any production usage, consult those data files. " - + " For information about the contents of each column, " - + "such as the CLDR Short Name, click on the column header." - + " For further information, see " - + "Index & Help.

    \n"); + + "\"http://www.w3.org/TR/html4/loose.dtd\">\n" + + UtilityBase.HTML_HEAD + + "\n" + + "" + + fullTitle + + (skipVersion ? "" : Emoji.BETA_TITLE_AFFIX) + + "\n" + + "\n" + + "\n" + + ChartUtilities.getUnicodeHeader(indexRelLink) + + ChartUtilities.getButton() + + "\n" + + "

    " + + fullTitle + + (skipVersion ? "" : Emoji.BETA_HEADER_AFFIX) + + "

    \n" + + (skipVersion ? "" : ChartUtilities.getPointToOther(outFileName, title)) + + "

    " + + "Index & Help\n" + + " | Images & Rights\n" + + " | Spec\n" + + " | Proposing Additions" + + "

    \n" + + firstLine + + (dataDir == null + ? "" + : "" + + "

    While these charts use a particular version of the Unicode Emoji data files, " + + "the images and format may be updated at any time." + + " For any production usage, consult those data files. " + + " For information about the contents of each column, " + + "such as the CLDR Short Name, click on the column header." + + " For further information, see " + + "Index & Help.

    \n"); try { out.append(headerLine); } catch (IOException e) { @@ -52,26 +66,39 @@ public static void writeHeader(String outFileName, Appendable out, String title, } static String getPointToOther(String outFileName, String title) { - return !Emoji.BETA_IS_OPEN && !Emoji.IS_BETA ? "" - : "
    For the " + (Emoji.IS_BETA - ? "current released version, see v" - + Emoji.VERSION_LAST_RELEASED_STRING - : "new beta version, see v" - + Emoji.VERSION_BETA_STRING_WITH_COLOR) - + ".
    \n"; + return !Emoji.BETA_IS_OPEN && !Emoji.IS_BETA + ? "" + : "
    For the " + + (Emoji.IS_BETA + ? "current released version, see v" + + Emoji.VERSION_LAST_RELEASED_STRING + : "new beta version, see v" + + Emoji.VERSION_BETA_STRING_WITH_COLOR) + + ".
    \n"; } - static final String UNICODE_HEADER = "" + "
    " - + "" - + "" - + "Emoji Charts" + "
    " - + "
     
    " + "
    "; + static final String UNICODE_HEADER = + "" + + "
    " + + "" + + "" + + "Emoji Charts" + + "
    " + + "
     
    " + + "
    "; public static String getUnicodeHeader(String indexRelLink) { return FileUtilities.replace( - UNICODE_HEADER, "%%CHARTS_LINK%%", + UNICODE_HEADER, + "%%CHARTS_LINK%%", (indexRelLink == null ? "index.html" : indexRelLink)); } @@ -84,15 +111,16 @@ public static String getButton() { public static void writeFooter(Appendable out) { try { - out.append("\n
    " - + "\n"); + out.append( + "\n
    " + + "\n"); } catch (IOException e) { throw new ICUUncheckedIOException(e); } @@ -109,8 +137,13 @@ public static String fixAnchor(String href) { public static String getLink(String href, String anchorText, String target) { href = fixAnchor(href); - return "" + anchorText + return "" + + anchorText + ""; } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/CompareEmojiFreq.java b/unicodetools/src/main/java/org/unicode/tools/emoji/CompareEmojiFreq.java index 5e0c1bca5..4399fe2a2 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/CompareEmojiFreq.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/CompareEmojiFreq.java @@ -1,33 +1,30 @@ package org.unicode.tools.emoji; +import com.google.common.collect.Multimap; +import com.google.common.collect.TreeMultimap; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.util.ULocale; import java.io.IOException; import java.io.PrintWriter; import java.util.HashSet; import java.util.Locale; import java.util.Map.Entry; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.Counter; import org.unicode.tools.emoji.EmojiFrequency.CountInfo; import org.unicode.tools.emoji.EmojiFrequency.GBoardCounts; -import com.google.common.collect.Multimap; -import com.google.common.collect.TreeMultimap; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.util.ULocale; - public class CompareEmojiFreq { private static final int MINIMUM_RAW_COUNT_FOR_LOCALE = 0; private static final int MINIMUM_RAW_COUNT_FOR_EMOJI = 5000; private static final int MAXIMUM_RANK = 30; private static final int MINIMUM_RANK_BETTER = 3; private static final double MINIMUM_PERCENT_BETTER = 1; - + static CountInfo worldCounts = GBoardCounts.localeToCountInfo.get("001"); public static void main(String[] args) throws IOException { - Multimap ascending = TreeMultimap.create(); - + Multimap ascending = TreeMultimap.create(); for (String locale : GBoardCounts.localeToCountInfo.keySet()) { CountInfo counts 
= GBoardCounts.localeToCountInfo.get(locale); @@ -36,33 +33,36 @@ public static void main(String[] args) throws IOException { } ascending.put(counts.rawTotal, locale); } - try (PrintWriter out = FileUtilities.openUTF8Writer( - "/Users/markdavis/Google Drive/workspace/Generated/emoji/frequency", - "emoji-by-locale.txt"); - PrintWriter outWorse = FileUtilities.openUTF8Writer( - "/Users/markdavis/Google Drive/workspace/Generated/emoji/frequency", - "emoji-by-locale-worse.txt")) { - for (Entry entry : ascending.entries()) { - Long totalCount = entry.getKey(); - String locale = entry.getValue(); - CountInfo counts = GBoardCounts.localeToCountInfo.get(locale); - show(out, locale, counts, false); - show(outWorse, locale, counts, true); -// -// System.out.println(totalCount -// + "\t" + locale -// + "\t" + ULocale.getDisplayName(locale, "en")); - } + try (PrintWriter out = + FileUtilities.openUTF8Writer( + "/Users/markdavis/Google Drive/workspace/Generated/emoji/frequency", + "emoji-by-locale.txt"); + PrintWriter outWorse = + FileUtilities.openUTF8Writer( + "/Users/markdavis/Google Drive/workspace/Generated/emoji/frequency", + "emoji-by-locale-worse.txt")) { + for (Entry entry : ascending.entries()) { + Long totalCount = entry.getKey(); + String locale = entry.getValue(); + CountInfo counts = GBoardCounts.localeToCountInfo.get(locale); + show(out, locale, counts, false); + show(outWorse, locale, counts, true); + // + // System.out.println(totalCount + // + "\t" + locale + // + "\t" + ULocale.getDisplayName(locale, "en")); + } } } static final NumberFormat pf = NumberFormat.getPercentInstance(Locale.ENGLISH); static final NumberFormat nf = NumberFormat.getIntegerInstance(Locale.ENGLISH); + static { pf.setMaximumFractionDigits(2); pf.setMinimumFractionDigits(2); } - + private static void show(PrintWriter out, String locale, CountInfo counts, boolean worse) { Counter other_ = new Counter<>(); for (String emoji : worldCounts.keyToCount.keySet()) { @@ -84,29 +84,37 @@ private 
static void show(PrintWriter out, String locale, CountInfo counts, boole long otherCount = other.keyToCount.get(emoji); int otherRank = other.keyToRank.get(emoji); - long amountBetter = worse ? otherCount-localeCount : localeCount-otherCount; - int rankBetter = worse ? rank - otherRank: otherRank - rank; - - if (amountBetter >= MINIMUM_PERCENT_BETTER*CountInfo.SCALE/100 && rankBetter >= 1 + long amountBetter = worse ? otherCount - localeCount : localeCount - otherCount; + int rankBetter = worse ? rank - otherRank : otherRank - rank; + + if (amountBetter >= MINIMUM_PERCENT_BETTER * CountInfo.SCALE / 100 && rankBetter >= 1 || rankBetter >= MINIMUM_RANK_BETTER || isWorld) { -// if (!haveCounts) { -// out.println(localeName + "\t" + nf.format(counts.rawTotal)); -// haveCounts = true; -// } + // if (!haveCounts) { + // out.println(localeName + "\t" + nf.format(counts.rawTotal)); + // haveCounts = true; + // } haveCounts = true; out.println( - localeName - + "\t" + fixLocale(locale) - + "\t" + fix(emoji) - + "\t" + rank - + "\t" + pf.format(localeCount/(double)CountInfo.SCALE) - + "\t" + "+" + pf.format(amountBetter/(double)CountInfo.SCALE) - + "\t" + "+" + rankBetter - + "\t" + nf.format(counts.rawTotal) - ); - } - if (rank >= MAXIMUM_RANK) break; + localeName + + "\t" + + fixLocale(locale) + + "\t" + + fix(emoji) + + "\t" + + rank + + "\t" + + pf.format(localeCount / (double) CountInfo.SCALE) + + "\t" + + "+" + + pf.format(amountBetter / (double) CountInfo.SCALE) + + "\t" + + "+" + + rankBetter + + "\t" + + nf.format(counts.rawTotal)); + } + if (rank >= MAXIMUM_RANK) break; } if (haveCounts) { out.println(); @@ -115,12 +123,13 @@ private static void show(PrintWriter out, String locale, CountInfo counts, boole private static String fixLocale(String locale) { switch (locale) { - case "001": return "mul"; - case "zz": return "und"; - default: - return locale; + case "001": + return "mul"; + case "zz": + return "und"; + default: + return locale; } - } private static String 
fix(String emoji) { @@ -129,10 +138,12 @@ private static String fix(String emoji) { private static String name(String locale) { switch (locale) { - case "001": return "World"; - case "zz": return "Unknown"; - default: - return ULocale.getDisplayName(locale, "en"); + case "001": + return "World"; + case "zz": + return "Unknown"; + default: + return ULocale.getDisplayName(locale, "en"); } } } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/CopyImagesToCldr.java b/unicodetools/src/main/java/org/unicode/tools/emoji/CopyImagesToCldr.java index 890232df2..8a74091b4 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/CopyImagesToCldr.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/CopyImagesToCldr.java @@ -1,38 +1,41 @@ package org.unicode.tools.emoji; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.io.Files; +import com.ibm.icu.util.ICUUncheckedIOException; import java.io.File; import java.io.IOException; import java.util.Map; import java.util.Set; - import org.unicode.cldr.util.CLDRPaths; import org.unicode.tools.emoji.CountEmoji.Attribute; import org.unicode.tools.emoji.CountEmoji.Category; import org.unicode.tools.emoji.Emoji.Source; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.google.common.io.Files; -import com.ibm.icu.util.ICUUncheckedIOException; - public class CopyImagesToCldr { private static final boolean VIEW_ONLY = false; - - private static final ImmutableSet DEFAULT_SOURCE_LIST = ImmutableSet.of( - Source.google, - Source.twitter, - Source.emojione, - Source.sample, - Source.proposed); - - private static final Map> OVERRIDE_SOURCES = ImmutableMap.>builder() - .put("🚙", ImmutableSet.builder().add(Source.emojione).addAll(DEFAULT_SOURCE_LIST).build()) - .build(); + + private static final ImmutableSet DEFAULT_SOURCE_LIST = + ImmutableSet.of( + Source.google, Source.twitter, 
Source.emojione, Source.sample, Source.proposed); + + private static final Map> OVERRIDE_SOURCES = + ImmutableMap.>builder() + .put( + "🚙", + ImmutableSet.builder() + .add(Source.emojione) + .addAll(DEFAULT_SOURCE_LIST) + .build()) + .build(); public static void main(String[] args) { - System.out.println("Warning: make sure that the images repo is updated to the latest images,\n" - + "and that you are using the environment variable for the version of Emoji"); - String targetDir = CLDRPaths.BASE_DIRECTORY + "tools/cldr-apps/src/main/webapp/images/emoji/"; + System.out.println( + "Warning: make sure that the images repo is updated to the latest images,\n" + + "and that you are using the environment variable for the version of Emoji"); + String targetDir = + CLDRPaths.BASE_DIRECTORY + "tools/cldr-apps/src/main/webapp/images/emoji/"; for (String emoji : EmojiData.EMOJI_DATA_BETA.getAllEmojiWithoutDefectives()) { Category bucket = Category.getBucket(emoji); if (bucket.hasAttribute(Attribute.skin) || bucket.hasAttribute(Attribute.hair)) { @@ -40,10 +43,12 @@ public static void main(String[] args) { } File file = getBestPublic(emoji); if (file == null) { - System.out.println("***No image for: " + emoji + ": " + EmojiData.EMOJI_DATA.getName(emoji)); + System.out.println( + "***No image for: " + emoji + ": " + EmojiData.EMOJI_DATA.getName(emoji)); } else { - String chars = emoji.replace(Emoji.EMOJI_VARIANT_STRING,""); - File newName = new File(targetDir, "emoji_" + Emoji.buildFileName(chars, "_") + ".png"); + String chars = emoji.replace(Emoji.EMOJI_VARIANT_STRING, ""); + File newName = + new File(targetDir, "emoji_" + Emoji.buildFileName(chars, "_") + ".png"); System.out.println(file.getName() + " ⇒ " + newName.getName()); if (VIEW_ONLY) continue; try { diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/CountEmoji.java b/unicodetools/src/main/java/org/unicode/tools/emoji/CountEmoji.java index 24e066dc7..deeef5d74 100644 --- 
a/unicodetools/src/main/java/org/unicode/tools/emoji/CountEmoji.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/CountEmoji.java @@ -1,5 +1,14 @@ package org.unicode.tools.emoji; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSortedSet; +import com.google.common.collect.Multimap; +import com.google.common.collect.Ordering; +import com.google.common.collect.TreeMultimap; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.text.UnicodeSet; import java.io.PrintWriter; import java.util.Arrays; import java.util.Collection; @@ -7,12 +16,10 @@ import java.util.EnumMap; import java.util.EnumSet; import java.util.HashMap; -import java.util.HashSet; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.tool.Option; import org.unicode.cldr.tool.Option.Options; import org.unicode.cldr.tool.Option.Params; @@ -21,19 +28,11 @@ import org.unicode.text.utility.Utility; import org.unicode.tools.emoji.EmojiOrder.MajorGroup; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSortedSet; -import com.google.common.collect.Multimap; -import com.google.common.collect.Ordering; -import com.google.common.collect.TreeMultimap; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.text.UnicodeSet; - public class CountEmoji { - public static final String EMOJI_COUNT_KEY = "Emoji Counts Key"; - public static final String STRUCTURE = "Structure"; + public static final String EMOJI_COUNT_KEY = + "Emoji Counts Key"; + public static final String STRUCTURE = + "Structure"; private static final EmojiData EMOJI_DATA_PREVIOUS = EmojiData.of(Emoji.VERSION_LAST_RELEASED); private static final EmojiData EMOJI_DATA_BETA = 
EmojiData.EMOJI_DATA_BETA; private static final EmojiOrder ORDER = EmojiOrder.BETA_ORDER; @@ -43,22 +42,26 @@ enum MyOptions { nonincrementalCount(new Params()), countVs(new Params()), invalid(new Params()), - verbose(new Params().setHelp("verbose debugging messages")), + verbose(new Params().setHelp("verbose debugging messages")), list(new Params()), major(new Params()), ; // BOILERPLATE TO COPY final Option option; + private MyOptions(Params params) { option = new Option(this, params); } + private static Options myOptions = new Options(); + static { for (MyOptions option : MyOptions.values()) { myOptions.add(option, option.option); } } + private static Set parse(String[] args, boolean showArguments) { return myOptions.parse(MyOptions.values()[0], args, true); } @@ -69,7 +72,7 @@ public static void main(String[] args) { boolean done = false; if (CountEmoji.MyOptions.countVs.option.doesOccur()) { countVs(); - done=true; + done = true; } // if (MyOptions.invalid.option.doesOccur()) { // countInvalid(); @@ -77,17 +80,18 @@ public static void main(String[] args) { // } if (CountEmoji.MyOptions.nonincrementalCount.option.doesOccur()) { countNonincremental(); - done=true; + done = true; } if (CountEmoji.MyOptions.list.option.doesOccur()) { - //Category bucket = Category.getBucket("👨‍⚖️"); + // Category bucket = Category.getBucket("👨‍⚖️"); UnicodeSet toDisplay = EmojiData.of(Emoji.VERSION_BETA).getAllEmojiWithoutDefectives(); System.out.println("\nEmoji v11"); listCategories(toDisplay); EmojiData EMOJI_DATA_PREVIOUS = EmojiData.of(Emoji.VERSION_TO_GENERATE_PREVIOUS); - UnicodeSet onlyNew = new UnicodeSet(EmojiData.EMOJI_DATA.getAllEmojiWithoutDefectives()) - .removeAll(EMOJI_DATA_PREVIOUS.getAllEmojiWithoutDefectives()); + UnicodeSet onlyNew = + new UnicodeSet(EmojiData.EMOJI_DATA.getAllEmojiWithoutDefectives()) + .removeAll(EMOJI_DATA_PREVIOUS.getAllEmojiWithoutDefectives()); System.out.println("\nEmoji v11-v5"); listCategories(onlyNew); done = true; @@ -96,7 +100,6 @@ 
public static void main(String[] args) { UnicodeSet toDisplay = EmojiData.of(Emoji.VERSION_BETA).getAllEmojiWithoutDefectives(); doMajor(toDisplay); done = true; - } if (!done) { countNew(); @@ -108,14 +111,15 @@ private static void doMajor(UnicodeSet toDisplay) { Counter subgroups = new Counter<>(); Set order = new LinkedHashSet<>(); UnicodeSet longPress = new UnicodeSet(); - TreeSet sorted = toDisplay.addAllTo(new TreeSet(EmojiOrder.STD_ORDER.codepointCompare)); + TreeSet sorted = + toDisplay.addAllTo(new TreeSet(EmojiOrder.STD_ORDER.codepointCompare)); for (String emoji : sorted) { Category cat = Category.getBucket(emoji); Set attr = cat.getAttributes(); if (attr.contains(Attribute.hair) || attr.contains(Attribute.skin)) { longPress.add(emoji); continue; - } + } String group = EmojiOrder.STD_ORDER.getCategory(emoji); order.add(group); subgroups.add(group, 1); @@ -127,12 +131,14 @@ private static void doMajor(UnicodeSet toDisplay) { } for (String subgroup : order) { MajorGroup majorGroup = EmojiOrder.STD_ORDER.getMajorGroupFromCategory(subgroup); - System.out.println(majorGroup.toPlainString() + "\t" + subgroup + "\t" + subgroups.get(subgroup)); + System.out.println( + majorGroup.toPlainString() + "\t" + subgroup + "\t" + subgroups.get(subgroup)); } } private static void listCategories(UnicodeSet toDisplay) { - Multimap items = TreeMultimap.create(Ordering.natural(), EmojiOrder.STD_ORDER.codepointCompare); + Multimap items = + TreeMultimap.create(Ordering.natural(), EmojiOrder.STD_ORDER.codepointCompare); for (Category x : Category.values()) { System.out.println(x.displayName + ":\t" + x.html); } @@ -142,10 +148,14 @@ private static void listCategories(UnicodeSet toDisplay) { } for (Category cat : Category.values()) { Collection set = items.get(cat); - System.out.println(cat.toStringPlain() - + "\t" + CollectionUtilities.join(cat.getAttributes(), " ") - + "\t" + set.size() - + "\t" + CollectionUtilities.join(set, " ")); + System.out.println( + cat.toStringPlain() 
+ + "\t" + + CollectionUtilities.join(cat.getAttributes(), " ") + + "\t" + + set.size() + + "\t" + + CollectionUtilities.join(set, " ")); } } @@ -159,13 +169,13 @@ private static void countNonincremental() { int lastZwjIndex = zwj.lastIndexOf(Emoji.JOINER, pos); if (lastZwjIndex < 0) break; String prev = zwj.substring(0, lastZwjIndex); - if (!all.contains(prev) - && !all.contains(prev.replace(Emoji.EMOJI_VARIANT_STRING, "")) + if (!all.contains(prev) + && !all.contains(prev.replace(Emoji.EMOJI_VARIANT_STRING, "")) && !missing.contains(prev)) { - System.out.println(prev + "\t" + Utility.hex(prev)); + System.out.println(prev + "\t" + Utility.hex(prev)); missing.add(prev); } - pos = lastZwjIndex-1; + pos = lastZwjIndex - 1; } } System.out.println("ZSeq Count: " + EMOJI_DATA_BETA.getZwjSequencesNormal().size()); @@ -173,15 +183,15 @@ private static void countNonincremental() { } private static void countNew() { - UnicodeSet current = new UnicodeSet(EMOJI_DATA_BETA.getAllEmojiWithoutDefectives()) - .addAll(EMOJI_DATA_BETA.getEmojiComponents()) - .freeze(); - UnicodeSet previous = new UnicodeSet(EMOJI_DATA_PREVIOUS.getAllEmojiWithoutDefectives()) - .addAll(EMOJI_DATA_BETA.getEmojiComponents()) - .freeze(); - UnicodeSet ARE_NEW = new UnicodeSet(current) - .removeAll(previous) - .freeze(); + UnicodeSet current = + new UnicodeSet(EMOJI_DATA_BETA.getAllEmojiWithoutDefectives()) + .addAll(EMOJI_DATA_BETA.getEmojiComponents()) + .freeze(); + UnicodeSet previous = + new UnicodeSet(EMOJI_DATA_PREVIOUS.getAllEmojiWithoutDefectives()) + .addAll(EMOJI_DATA_BETA.getEmojiComponents()) + .freeze(); + UnicodeSet ARE_NEW = new UnicodeSet(current).removeAll(previous).freeze(); String vPrevious = "v" + Emoji.VERSION_LAST_RELEASED.getVersionString(2, 2); String vCurrent = "v" + Emoji.VERSION_BETA.getVersionString(2, 2); @@ -204,11 +214,13 @@ public String toString() { static class Bucket { final Counter majors = new Counter<>(); - final UnicodeMap sets= new UnicodeMap<>(); + final 
UnicodeMap sets = new UnicodeMap<>(); + public void add(MajorGroup maj, String cat, String s) { majors.add(maj, 1); - sets.put(s,maj); + sets.put(s, maj); } + @Override public String toString() { return "[majors:" + majors + "; sets:" + sets + "]"; @@ -218,6 +230,7 @@ public String toString() { void add(String s) { add(s, null); } + void add(String s, CandidateData candidateData) { String cat = ORDER.getCategory(s); if (cat == null && candidateData != null) { @@ -258,7 +271,11 @@ public void showCounts(PrintWriter out, boolean showCharacters, PrintWriter outP if (bucket == null) { continue; } - if ((evalue == Category.component || evalue == Category.ungendered ) // || evalue == Category.typical_dup_group | evalue == Category.typical_dup_sign) + if ((evalue == Category.component + || evalue + == Category + .ungendered) // || evalue == Category.typical_dup_group + // | evalue == Category.typical_dup_sign) && !doneSubtotal) { showTotalLine(out, "Subtotal", row, th, groups, columnCount); doneSubtotal = true; @@ -273,14 +290,20 @@ public void showCounts(PrintWriter out, boolean showCharacters, PrintWriter outP rowTotal1 += count; columnCount.add(maj, count); UnicodeSet set = bucket.sets.getSet(maj); - String tdTitle = SHOW_SAMPLE && count != 0 ? "" : td; + String tdTitle = + SHOW_SAMPLE && count != 0 + ? "" + : td; out.print(tdTitle + (count == 0 ? 
"" : count) + ""); if (count != 0 && outPlain != null) { - outPlain.println(evalue.toStringPlain() - + "\t" + maj.toPlainString() - + "\t" + count - + "\t" + EmojiData.getWithoutMods(set).toPattern(false)); + outPlain.println( + evalue.toStringPlain() + + "\t" + + maj.toPlainString() + + "\t" + + count + + "\t" + + EmojiData.getWithoutMods(set).toPattern(false)); } } out.println(th + rowTotal1 + "" + ""); @@ -312,14 +335,16 @@ private String getBestSample(UnicodeSet set) { for (String s : set) { if (best == null) { best = s; - isEmojiPresentation = EMOJI_DATA_BETA.getEmojiPresentationSet().contains(s.codePointAt(0)); + isEmojiPresentation = + EMOJI_DATA_BETA.getEmojiPresentationSet().contains(s.codePointAt(0)); year = BirthInfo.getYear(s); continue; } - boolean sEmojiPresentation = EMOJI_DATA_BETA.getEmojiPresentationSet().contains(s.codePointAt(0)); + boolean sEmojiPresentation = + EMOJI_DATA_BETA.getEmojiPresentationSet().contains(s.codePointAt(0)); int sYear = BirthInfo.getYear(s); - if (!isEmojiPresentation + if (!isEmojiPresentation || isEmojiPresentation == sEmojiPresentation && year > sYear) { best = s; isEmojiPresentation = sEmojiPresentation; @@ -333,7 +358,12 @@ private String getBestSample(UnicodeSet set) { return EMOJI_DATA_BETA.addEmojiVariants(best); } - private void showTotalLine(PrintWriter out, String title2, String row, String th, MajorGroup[] groups, + private void showTotalLine( + PrintWriter out, + String title2, + String row, + String th, + MajorGroup[] groups, Counter columnCount) { long rowTotal = 0; out.print(row + th + title2 + ""); @@ -349,17 +379,18 @@ private void showTotalLine(PrintWriter out, String title2, String row, String th } enum Attribute { - zwj("Ⓩ"), + zwj("Ⓩ"), gender(Emoji.FEMALE), role(Emoji.WOMAN_STR), - family(Emoji.NEUTRAL_FAMILY), - hair("🦰"), - singleton("Ⓒ"), + family(Emoji.NEUTRAL_FAMILY), + hair("🦰"), + singleton("Ⓒ"), dup("🧑"), skin("🏿"), ; private final String label; + private Attribute(String label) { this.label 
= label; } @@ -368,42 +399,48 @@ private Attribute(String label) { static final char NNBSP = '\u202F'; public enum Category { - character(Attribute.singleton), - mod_seq(Attribute.skin), + character(Attribute.singleton), + mod_seq(Attribute.skin), zwj_seq_hair(Attribute.zwj, Attribute.hair), zwj_seq_mod_hair(Attribute.zwj, Attribute.skin, Attribute.hair), - zwj_seq_gender(Attribute.zwj, Attribute.gender), + zwj_seq_gender(Attribute.zwj, Attribute.gender), zwj_seq_gender_mod(Attribute.zwj, Attribute.gender, Attribute.skin), zwj_seq_role(Attribute.zwj, Attribute.role), - zwj_seq_role_mod(Attribute.zwj, Attribute.role, Attribute.skin), - zwj_seq_fam(Attribute.zwj, Attribute.family), - //zwj_seq_fam_mod("" + zwjLabel + " "+Emoji.NEUTRAL_FAMILY + "&skin"), - //zwj_seq_mod("" + zwjLabel + " other&skin", Attribute.zwj, Attribute.skin), - zwj_seq_fam_mod(Attribute.zwj, Attribute.family, Attribute.skin), + zwj_seq_role_mod(Attribute.zwj, Attribute.role, Attribute.skin), + zwj_seq_fam(Attribute.zwj, Attribute.family), + // zwj_seq_fam_mod("" + zwjLabel + " "+Emoji.NEUTRAL_FAMILY + "&skin"), + // zwj_seq_mod("" + zwjLabel + " other&skin", Attribute.zwj, Attribute.skin), + zwj_seq_fam_mod(Attribute.zwj, Attribute.family, Attribute.skin), zwj_seq_mod(Attribute.zwj, Attribute.skin), zwj_seq_other(Attribute.zwj), keycap_seq("#️⃣"), flag_seq("🏁"), - tag_seq("🏴"), - ungendered(Attribute.dup), - ungendered_skin(Attribute.skin, Attribute.dup), - component("🔗"), - // typical_dup_sign, - // typical_dup_group, - ; - - final public String displayName; - final public String html; + tag_seq("🏴"), + ungendered(Attribute.dup), + ungendered_skin(Attribute.skin, Attribute.dup), + component("🔗"), + // typical_dup_sign, + // typical_dup_group, + ; + + public final String displayName; + public final String html; final Set attributes; + Category() { - this((String)null); + this((String) null); } + Category(Attribute... 
_baseCategories) { this(null, _baseCategories); } + Category(String _name, Attribute... _baseCategories) { - attributes = _baseCategories.length == 0 ? Collections.emptySortedSet() - : ImmutableSortedSet.copyOf(EnumSet.copyOf(Arrays.asList(_baseCategories))); + attributes = + _baseCategories.length == 0 + ? Collections.emptySortedSet() + : ImmutableSortedSet.copyOf( + EnumSet.copyOf(Arrays.asList(_baseCategories))); String title = null; if (!attributes.isEmpty()) { if (_name != null) { @@ -415,20 +452,21 @@ public enum Category { sb.append(Attribute.singleton.label); sbLong.append(Attribute.singleton.toString()); } - // if (!attributes.contains(Attribute.dup) && !attributes.contains(Attribute.zwj)) { + // if (!attributes.contains(Attribute.dup) && + // !attributes.contains(Attribute.zwj)) { // sb.append(Attribute.singleton.label); // sbLong.append(Attribute.singleton.toString()); // } for (Attribute a : attributes) { if (sb.length() != 0) { - sb.append(NNBSP+"‧"+NNBSP); + sb.append(NNBSP + "‧" + NNBSP); sbLong.append(" + "); } sb.append(a.label); sbLong.append(a.toString()); } if (attributes.contains(Attribute.zwj) && attributes.size() == 1) { - sb.append(NNBSP+"‧"+NNBSP).append(Attribute.singleton.label); + sb.append(NNBSP + "‧" + NNBSP).append(Attribute.singleton.label); sbLong.append(" + ").append(Attribute.singleton.toString()); } displayName = sb.toString(); @@ -443,7 +481,9 @@ public enum Category { String _html = TransliteratorUtilities.toHTML.transform(displayName); html = title == null ? 
_html : "" + _html + ""; } + static Map, Category> attributesToCategory; + static { Map, Category> _attributesToCategory = new HashMap<>(); Map names = new HashMap<>(); // check uniqueness @@ -452,32 +492,39 @@ public enum Category { Category old = names.get(cat.displayName); if (old != null) { throw new IllegalArgumentException( - "Duplicate display name: " + cat.displayName - + " for " + old + " and " + cat); + "Duplicate display name: " + + cat.displayName + + " for " + + old + + " and " + + cat); } names.put(cat.displayName, cat); } attributesToCategory = ImmutableMap.copyOf(_attributesToCategory); } + @Override public String toString() { return html; } + public String toStringPlain() { return displayName; } /** added to make migration easier */ - static public Category getType(String s) { + public static Category getType(String s) { return getBucket(s); } - static public Category getBucket(String s) { + + public static Category getBucket(String s) { try { String noVariants = EmojiData.removeEmojiVariants(s); Category bucket = null; if (noVariants.startsWith(Emoji.MALE) || noVariants.startsWith(Emoji.FEMALE)) { int debug = 0; } - if (noVariants.isEmpty() + if (noVariants.isEmpty() || CountEmoji.EMOJI_DATA_BETA.getEmojiComponents().contains(noVariants) || Emoji.FULL_GENDER_MARKERS.contains(noVariants)) { bucket = component; @@ -511,34 +558,42 @@ static public Category getBucket(String s) { int first = noVariants.codePointAt(0); String butFirst = noVariants.substring(Character.charCount(first)); - boolean role = Emoji.MAN_OR_WOMAN_OR_ADULT.contains(first) - && Emoji.PROFESSION_OBJECT.containsSome(noVariants); + boolean role = + Emoji.MAN_OR_WOMAN_OR_ADULT.contains(first) + && Emoji.PROFESSION_OBJECT.containsSome(noVariants); if (role) { attributes.add(Attribute.role); } - boolean family = noVariants.contains(EmojiData.ZWJ_HANDSHAKE_ZWJ) - || Emoji.FAMILY_MARKERS.contains(first) && Emoji.FAMILY_MARKERS.containsSome(butFirst); + boolean family = + 
noVariants.contains(EmojiData.ZWJ_HANDSHAKE_ZWJ) + || Emoji.FAMILY_MARKERS.contains(first) + && Emoji.FAMILY_MARKERS.containsSome(butFirst); if (family) { attributes.add(Attribute.family); } bucket = getCategory(attributes); - // + // // // if (!zwj) { // if (mods) { // bucket = mod_seq; // } else { - // throw new IllegalArgumentException("should never happen"); + // throw new IllegalArgumentException("should never + // happen"); // } // } else { // zwj // if (gender) { - // bucket = getVariety(mods, hair, zwj_seq_gender, zwj_seq_gender_mod, null, null); + // bucket = getVariety(mods, hair, zwj_seq_gender, + // zwj_seq_gender_mod, null, null); // } else if (role) { - // bucket = getVariety(mods, hair, zwj_seq_role, zwj_seq_role_mod, null, null); + // bucket = getVariety(mods, hair, zwj_seq_role, + // zwj_seq_role_mod, null, null); // } else if (family) { - // bucket = getVariety(mods, hair, zwj_seq_fam, null, null, null); + // bucket = getVariety(mods, hair, zwj_seq_fam, null, + // null, null); // } else { - // bucket = getVariety(mods, hair, zwj_seq_other, zwj_seq_mod, zwj_seq_hair, zwj_seq_mod_hair); + // bucket = getVariety(mods, hair, zwj_seq_other, + // zwj_seq_mod, zwj_seq_hair, zwj_seq_mod_hair); // } // } } @@ -549,7 +604,7 @@ static public Category getBucket(String s) { } return bucket; } catch (NoCategoryException e) { - throw new IllegalArgumentException("for «" + s + "» "+ Utility.hex(s), e); + throw new IllegalArgumentException("for «" + s + "» " + Utility.hex(s), e); } } @@ -567,18 +622,28 @@ public NoCategoryException(String string) { } } - public boolean hasAttribute(Attribute baseCategory) { return attributes.contains(baseCategory); } + public Set getAttributes() { return attributes; } } - /**@deprecated Replace by the {@link CountEmoji.Category}*/ + /** + * @deprecated Replace by the {@link CountEmoji.Category} + */ public enum ZwjType { - roleWithHair, roleWithObject, roleWithSign, gestures, activity, family, other, na; + roleWithHair, + 
roleWithObject, + roleWithSign, + gestures, + activity, + family, + other, + na; + public static ZwjType getType(String s) { if (!s.contains(Emoji.JOINER_STRING)) { return na; @@ -587,11 +652,13 @@ public static ZwjType getType(String s) { ZwjType zwjType = ZwjType.other; if (Emoji.HAIR_PIECES.containsSome(s)) { zwjType = roleWithHair; - } else if (Emoji.FAMILY_MARKERS.contains(cps[cps.length - 1])) { // last character is in boy..woman + } else if (Emoji.FAMILY_MARKERS.contains( + cps[cps.length - 1])) { // last character is in boy..woman zwjType = family; } else if (Emoji.ACTIVITY_MARKER.containsSome(s)) { zwjType = activity; - } else if (Emoji.ROLE_MARKER.containsSome(s)) { // || Emoji.FAMILY_MARKERS.containsSome(s) + } else if (Emoji.ROLE_MARKER.containsSome( + s)) { // || Emoji.FAMILY_MARKERS.containsSome(s) zwjType = Emoji.GENDER_MARKERS.containsSome(s) ? roleWithSign : roleWithObject; } else if (Emoji.GENDER_MARKERS.containsSome(s)) { zwjType = gestures; @@ -607,9 +674,7 @@ private void countItems(String title, UnicodeSet uset) { } System.out.println("\n" + title); PrintWriter pw = new PrintWriter(System.out); - pw.println("

    For a key to the format of the table, see " - + EMOJI_COUNT_KEY - + ".

    "); + pw.println("

    For a key to the format of the table, see " + EMOJI_COUNT_KEY + ".

    "); showCounts(pw, false, null); pw.close(); } @@ -634,10 +699,10 @@ public static void countVs() { countPlain++; continue; } - //without=first=full - //without=first≠full - //without≠first≠full - //without≠first=full + // without=first=full + // without=first≠full + // without≠first≠full + // without≠first=full String itemFirst = EMOJI_DATA_BETA.getOnlyFirstVariant(itemFull); if (!itemFirst.equals(itemFull)) { if (!itemFirst.equals(itemWithout)) { @@ -655,16 +720,25 @@ public static void countVs() { System.out.println("without≠first=full: " + countFull); System.out.println("without=first≠full: " + countFirst); System.out.println("without≠first≠full: " + countOther); - } - private static void showLine(int countFirst, String title, String itemWithout, String itemFirst, String itemFull) { - System.out.println(title - + "\t" + countFirst - + "\t" + Utility.hex(itemWithout, " ") - + "\t" + Utility.hex(itemFirst, " ") - + "\t" + Utility.hex(itemFull, " ") - + "\t(" + itemFull + ")" - + "\t" + EMOJI_DATA_BETA.getName(itemFull)); + + private static void showLine( + int countFirst, String title, String itemWithout, String itemFirst, String itemFull) { + System.out.println( + title + + "\t" + + countFirst + + "\t" + + Utility.hex(itemWithout, " ") + + "\t" + + Utility.hex(itemFirst, " ") + + "\t" + + Utility.hex(itemFull, " ") + + "\t(" + + itemFull + + ")" + + "\t" + + EMOJI_DATA_BETA.getName(itemFull)); } public void addAll(Iterable chars) { @@ -673,7 +747,6 @@ public void addAll(Iterable chars) { int debug = 0; } add(s); - } + } } - } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/CountValidEmoji.java b/unicodetools/src/main/java/org/unicode/tools/emoji/CountValidEmoji.java index e252c577a..9b7ab45f8 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/CountValidEmoji.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/CountValidEmoji.java @@ -1,23 +1,21 @@ package org.unicode.tools.emoji; +import 
com.google.common.collect.Multimap; +import com.google.common.collect.Multimaps; +import com.google.common.collect.TreeMultimap; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.util.ICUException; +import com.ibm.icu.util.ULocale; import java.util.Arrays; import java.util.Collections; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; - import org.unicode.cldr.util.StandardCodes.LstrType; import org.unicode.cldr.util.Validity; import org.unicode.cldr.util.Validity.Status; -import com.google.common.collect.Multimap; -import com.google.common.collect.Multimaps; -import com.google.common.collect.TreeMultimap; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.util.ICUException; -import com.ibm.icu.util.ULocale; - public class CountValidEmoji { public static void main(String[] args) { countInvalid(); @@ -43,12 +41,15 @@ private static void countInvalid(Validity validity, LstrType type) { Set invalid = Collections.emptySet(); if (type == null) { - recommended = EmojiData.EMOJI_DATA.getAllEmojiWithoutDefectives().addAllTo(new LinkedHashSet()); - for (Iterator it = recommended.iterator(); it.hasNext();) { - String item = it.next(); - if (!item.contains(Emoji.JOINER_STR)) { - it.remove(); - } + recommended = + EmojiData.EMOJI_DATA + .getAllEmojiWithoutDefectives() + .addAllTo(new LinkedHashSet()); + for (Iterator it = recommended.iterator(); it.hasNext(); ) { + String item = it.next(); + if (!item.contains(Emoji.JOINER_STR)) { + it.remove(); + } } recommendedSize = recommended.size(); otherValidSize = Double.POSITIVE_INFINITY; @@ -56,36 +57,38 @@ private static void countInvalid(Validity validity, LstrType type) { title = "Emoji ZWJ Sequence"; } else { Map codeToStatus = validity.getCodeToStatus(type); - // idStatus="regular", "deprecated", or the "macroregion". However, for macroregions, only UN and EU are valid. 
- Multimap inverse = Multimaps.invertFrom(Multimaps.forMap(codeToStatus), TreeMultimap.create()); + // idStatus="regular", "deprecated", or the "macroregion". However, for macroregions, + // only UN and EU are valid. + Multimap inverse = + Multimaps.invertFrom(Multimaps.forMap(codeToStatus), TreeMultimap.create()); recommended = new LinkedHashSet<>(inverse.get(Status.regular)); other_valid = new LinkedHashSet<>(inverse.get(Status.deprecated)); switch (type) { - case region: - recommended.add("UN"); - recommended.add("EU"); - syntactic = 26*26; - title = "Emoji Flag Sequence"; - invalid = new LinkedHashSet<>(); - for (char f = 'A'; f <= 'Z'; ++f) { - for (char s = 'A'; s <= 'Z'; ++s) { - invalid.add(f + "" + s); + case region: + recommended.add("UN"); + recommended.add("EU"); + syntactic = 26 * 26; + title = "Emoji Flag Sequence"; + invalid = new LinkedHashSet<>(); + for (char f = 'A'; f <= 'Z'; ++f) { + for (char s = 'A'; s <= 'Z'; ++s) { + invalid.add(f + "" + s); + } } - } - invalid.removeAll(recommended); - invalid.removeAll(other_valid); - break; - case subdivision: - other_valid.addAll(recommended); - recommended.clear(); - recommended.addAll(Arrays.asList("gbeng", "gbsct", "gbwls")); - other_valid.removeAll(recommended); - syntactic = (26*26 + 999L) * (35*36*36); - title = "Emoji Tag Sequence"; - invalid = new LinkedHashSet<>(Arrays.asList("usa", "usb")); - break; - default: - throw new ICUException(); + invalid.removeAll(recommended); + invalid.removeAll(other_valid); + break; + case subdivision: + other_valid.addAll(recommended); + recommended.clear(); + recommended.addAll(Arrays.asList("gbeng", "gbsct", "gbwls")); + other_valid.removeAll(recommended); + syntactic = (26 * 26 + 999L) * (35 * 36 * 36); + title = "Emoji Tag Sequence"; + invalid = new LinkedHashSet<>(Arrays.asList("usa", "usb")); + break; + default: + throw new ICUException(); } recommendedSize = recommended.size(); otherValidSize = other_valid.size(); @@ -93,9 +96,14 @@ private static void 
countInvalid(Validity validity, LstrType type) { } System.out.println("\n" + title); - System.out.println("Recommended:\t" + nf.format(recommendedSize) + "\t" + clip(recommended)); + System.out.println( + "Recommended:\t" + nf.format(recommendedSize) + "\t" + clip(recommended)); System.out.println("Other Valid:\t" + nf.format(otherValidSize) + "\t" + clip(other_valid)); - System.out.println("Invalid (but WF):\t" + nf.format(syntactic_but_invalid_size) + "\t" + clip(invalid)); + System.out.println( + "Invalid (but WF):\t" + + nf.format(syntactic_but_invalid_size) + + "\t" + + clip(invalid)); } private static String clip(Set other_valid) { diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/DebugUtilities.java b/unicodetools/src/main/java/org/unicode/tools/emoji/DebugUtilities.java index 55ad3c641..4c6e93d33 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/DebugUtilities.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/DebugUtilities.java @@ -1,13 +1,13 @@ package org.unicode.tools.emoji; import java.util.function.Predicate; - import org.unicode.text.UCD.Default; import org.unicode.text.utility.Utility; public class DebugUtilities { - - public static String composeStringsWhen(String title, Iterable strings, Predicate predicate) { + + public static String composeStringsWhen( + String title, Iterable strings, Predicate predicate) { StringBuilder b = new StringBuilder(); for (String s : strings) { if (predicate.test(s)) { @@ -20,7 +20,8 @@ public static String composeStringsWhen(String title, Iterable strings, return b.toString(); } - public static void debugStringsWhen(String title, Iterable strings, Predicate predicate) { + public static void debugStringsWhen( + String title, Iterable strings, Predicate predicate) { System.out.println(composeStringsWhen(title, strings, predicate)); } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/DocRegistry.java 
b/unicodetools/src/main/java/org/unicode/tools/emoji/DocRegistry.java index 4614d7daf..3463a9c7e 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/DocRegistry.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/DocRegistry.java @@ -1,26 +1,22 @@ package org.unicode.tools.emoji; +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap; import java.time.LocalDate; import java.util.ArrayList; -import java.util.Date; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.util.Set; import java.util.TreeMap; - +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.TransliteratorUtilities; import org.unicode.tools.emoji.DocRegistry.DocRegistryEntry; -import com.google.common.base.Joiner; -import com.google.common.base.Splitter; -import com.google.common.collect.ImmutableSet; -import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap; - public class DocRegistry { private static final Pattern PROPOSAL = Pattern.compile("(L2/\\d{2}-\\d{3})(R\\d?)?"); @@ -43,13 +39,10 @@ public DocRegistryEntry(List parts) { @Override public String toString() { - return "L2=«" + L2 - + "» title=«" + title - + "» source=«" + source - + "» other=" + other - ; + return "L2=«" + L2 + "» title=«" + title + "» source=«" + source + "» other=" + other; } } + static final Map map = load(); private static Map load() { @@ -115,10 +108,20 @@ public static DocRegistryEntry get(String proposal) { public static String getProposalForHtml(String proposalItem) { DocRegistryEntry item = get(proposalItem); - String title = item == null ? "" : " title ='" + TransliteratorUtilities.toHTML.transform(item.title + " 👈 " + item.source) + "'"; + String title = + item == null + ? 
"" + : " title ='" + + TransliteratorUtilities.toHTML.transform( + item.title + " 👈 " + item.source) + + "'"; return "" - + proposalItem + ""; + + "href='https://www.unicode.org/cgi-bin/GetDocumentLink?" + + proposalItem.replace('\u2011', '-') + + "'" + + title + + ">" + + proposalItem + + ""; } } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/Emoji.java b/unicodetools/src/main/java/org/unicode/tools/emoji/Emoji.java index 3012798fc..cb2bf34e2 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/Emoji.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/Emoji.java @@ -1,5 +1,19 @@ package org.unicode.tools.emoji; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Relation; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.LocaleDisplayNames; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.Output; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.VersionInfo; import java.io.File; import java.util.Arrays; import java.util.Collection; @@ -14,7 +28,6 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.CldrUtility; import org.unicode.cldr.util.With; @@ -25,67 +38,69 @@ import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.google.common.base.Splitter; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Relation; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.LocaleDisplayNames; -import com.ibm.icu.text.Transliterator; -import 
com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.Output; -import com.ibm.icu.util.ULocale; -import com.ibm.icu.util.VersionInfo; - /** - * You need a command line variable to generate either the beta version (not yet released) or the abbreviated version (see below). + * You need a command line variable to generate either the beta version (not yet released) or the + * abbreviated version (see below). + * *

    Example: -Demoji-beta + * *

    With each new version, set - *

    • VERSION_LAST_RELEASED2 = VERSIONxx;
    • - *
    • VERSION_LAST_RELEASED = VERSIONyy;
    • - *
    • VERSION_BETA = VERSIONzz;
    + * + *
      + *
    • VERSION_LAST_RELEASED2 = VERSIONxx; + *
    • VERSION_LAST_RELEASED = VERSIONyy; + *
    • VERSION_BETA = VERSIONzz; + *
    + * * You also need to add 2 new constants, such as: - *
    • VERSION15;
    • - *
    • UCD15;
    + * + *
      + *
    • VERSION15; + *
    • UCD15; + *
    + * * And add dates to the following: - *
    • EMOJI_TO_UNICODE_VERSION
    • - *
    • EMOJI_TO_DATE
    -*/ + * + *
      + *
    • EMOJI_TO_UNICODE_VERSION + *
    • EMOJI_TO_DATE + *
    + */ public class Emoji { /** - * The following is used to generate an abbreviated version of the charts, where only a few rows are produced, - * and all images are replaced by a colored square (small data size). - * This version can be used to do link-checks. + * The following is used to generate an abbreviated version of the charts, where only a few rows + * are produced, and all images are replaced by a colored square (small data size). This version + * can be used to do link-checks. */ static final boolean ABBR = CldrUtility.getProperty("emoji-abbr", false); - //static final boolean EMOJI_BUILD_VERSION = CldrUtility.getProperty("emoji-version", false); + // static final boolean EMOJI_BUILD_VERSION = CldrUtility.getProperty("emoji-version", false); /** - * Change the following according to whether we are generating the beta version of files, or the new version. - * We support generating the last version in order to make improvements to the charts. + * Change the following according to whether we are generating the beta version of files, or the + * new version. We support generating the last version in order to make improvements to the + * charts. */ public static final boolean IS_BETA = CldrUtility.getProperty("emoji-beta", false); + public static final boolean BETA_IS_OPEN = CldrUtility.getProperty("emoji-beta-open", false); /** - * Set the following to true iff the charts for the release should still point to proposed.html for TR51. - * The main function is to add pointers between the release and beta charts. - * Also change the VERSION_LAST_RELEASED2, etc below!!! + * Set the following to true iff the charts for the release should still point to proposed.html + * for TR51. The main function is to add pointers between the release and beta charts. Also + * change the VERSION_LAST_RELEASED2, etc below!!! */ - public static final boolean USE_PROPOSED = true; // set to true between the release of Emoji 5.0 & Unicode 10.0. 
(or similar situation) + public static final boolean USE_PROPOSED = + true; // set to true between the release of Emoji 5.0 & Unicode 10.0. (or similar + // situation) - /** - * Constants for versions - */ - public static final VersionInfo VERSION15 = VersionInfo.getInstance(15,0); - public static final VersionInfo VERSION14 = VersionInfo.getInstance(14,0); - public static final VersionInfo VERSION13_1 = VersionInfo.getInstance(13,1); + /** Constants for versions */ + public static final VersionInfo VERSION15 = VersionInfo.getInstance(15, 0); + + public static final VersionInfo VERSION14 = VersionInfo.getInstance(14, 0); + public static final VersionInfo VERSION13_1 = VersionInfo.getInstance(13, 1); public static final VersionInfo VERSION13 = VersionInfo.getInstance(13); - public static final VersionInfo VERSION12_1 = VersionInfo.getInstance(12,1); + public static final VersionInfo VERSION12_1 = VersionInfo.getInstance(12, 1); public static final VersionInfo VERSION12 = VersionInfo.getInstance(12); public static final VersionInfo VERSION11 = VersionInfo.getInstance(11); public static final VersionInfo VERSION5 = VersionInfo.getInstance(5); @@ -93,9 +108,9 @@ public class Emoji { public static final VersionInfo VERSION3 = VersionInfo.getInstance(3); public static final VersionInfo VERSION2 = VersionInfo.getInstance(2); public static final VersionInfo VERSION1 = VersionInfo.getInstance(1); - public static final VersionInfo VERSION0_7 = VersionInfo.getInstance(0,7); - public static final VersionInfo VERSION0_6 = VersionInfo.getInstance(0,6); - public static final VersionInfo VERSION0_5 = VersionInfo.getInstance(0,5,2); + public static final VersionInfo VERSION0_7 = VersionInfo.getInstance(0, 7); + public static final VersionInfo VERSION0_6 = VersionInfo.getInstance(0, 6); + public static final VersionInfo VERSION0_5 = VersionInfo.getInstance(0, 5, 2); // ALSO fix VersionToAge.java! 
public static final VersionInfo UCD15 = VERSION15; @@ -111,55 +126,59 @@ public class Emoji { public static final VersionInfo UCD6 = VersionInfo.getInstance(6); /** - * Change each following once we release. That is, VERSION_LAST_RELEASED* becomes VERSION_BETA*, and both the latter increment. - * Also add to EMOJI_TO_UNICODE_VERSION + * Change each following once we release. That is, VERSION_LAST_RELEASED* becomes VERSION_BETA*, + * and both the latter increment. Also add to EMOJI_TO_UNICODE_VERSION */ public static final VersionInfo VERSION_LAST_RELEASED2 = VERSION13_1; + public static final VersionInfo VERSION_LAST_RELEASED = VERSION14; public static final VersionInfo VERSION_BETA = VERSION15; public static final VersionInfo VERSION_TO_TEST = VERSION_BETA; public static final VersionInfo VERSION_TO_TEST_PREVIOUS = VERSION_LAST_RELEASED; - public static Map EMOJI_TO_UNICODE_VERSION = ImmutableMap.builder() - .put(VERSION15, UCD15) - .put(VERSION14, UCD14) - .put(VERSION13_1, UCD13) - .put(VERSION13, UCD13) - .put(VERSION12_1, UCD12_1) - .put(VERSION12, UCD12) - .put(VERSION11, UCD11) - .put(VERSION5, UCD10) - .put(VERSION4, UCD9) - .put(VERSION3, UCD9) - .put(VERSION2, UCD8) - .put(VERSION1, UCD8) -// .put(VERSION0_7, UCD7) -// .put(VERSION0_6, UCD6) - .build(); - - public final static Map EMOJI_TO_DATE = ImmutableMap.builder() - .put(VERSION15, "2022-09-10") - .put(VERSION14, "2021-09-10") - .put(VERSION13_1, "2020-09-10") - .put(VERSION13, "2020-03-10") - .put(VERSION12_1, "2019-10-29") - .put(VERSION12, "2019-02-04") - .put(VERSION11, "2018-02-07") - .put(VERSION5, "2017-03-27") - .put(VERSION4, "2016-11-22") - .put(VERSION3, "2016-06-03") - .put(VERSION2, "2015-11-12") - .put(VERSION1, "2015-06-09") -// .put(VERSION0_7, "2014-06-16") -// .put(VERSION0_6, "2010-06-09") - .build(); - - public final static Map YEAR_TO_EMOJI_VERSION_ASCENDING; - public final static Map EMOJI_VERSION_TO_YEAR; + public static Map EMOJI_TO_UNICODE_VERSION = + ImmutableMap.builder() + 
.put(VERSION15, UCD15) + .put(VERSION14, UCD14) + .put(VERSION13_1, UCD13) + .put(VERSION13, UCD13) + .put(VERSION12_1, UCD12_1) + .put(VERSION12, UCD12) + .put(VERSION11, UCD11) + .put(VERSION5, UCD10) + .put(VERSION4, UCD9) + .put(VERSION3, UCD9) + .put(VERSION2, UCD8) + .put(VERSION1, UCD8) + // .put(VERSION0_7, UCD7) + // .put(VERSION0_6, UCD6) + .build(); + + public static final Map EMOJI_TO_DATE = + ImmutableMap.builder() + .put(VERSION15, "2022-09-10") + .put(VERSION14, "2021-09-10") + .put(VERSION13_1, "2020-09-10") + .put(VERSION13, "2020-03-10") + .put(VERSION12_1, "2019-10-29") + .put(VERSION12, "2019-02-04") + .put(VERSION11, "2018-02-07") + .put(VERSION5, "2017-03-27") + .put(VERSION4, "2016-11-22") + .put(VERSION3, "2016-06-03") + .put(VERSION2, "2015-11-12") + .put(VERSION1, "2015-06-09") + // .put(VERSION0_7, "2014-06-16") + // .put(VERSION0_6, "2010-06-09") + .build(); + + public static final Map YEAR_TO_EMOJI_VERSION_ASCENDING; + public static final Map EMOJI_VERSION_TO_YEAR; + static { - Map _map = new TreeMap<>(); - Map _mapEmojiToYear = new TreeMap<>(); + Map _map = new TreeMap<>(); + Map _mapEmojiToYear = new TreeMap<>(); for (Entry entry : EMOJI_TO_DATE.entrySet()) { int year = Integer.parseInt(entry.getValue().substring(0, 4)); _mapEmojiToYear.put(entry.getKey(), year); @@ -171,53 +190,65 @@ public class Emoji { EMOJI_VERSION_TO_YEAR = ImmutableMap.copyOf(_mapEmojiToYear); } - public static final VersionInfo VERSION_LAST_RELEASED_UNICODE = EMOJI_TO_UNICODE_VERSION.get(VERSION_LAST_RELEASED); - public static final VersionInfo VERSION_BETA_UNICODE = EMOJI_TO_UNICODE_VERSION.get(VERSION_BETA); + public static final VersionInfo VERSION_LAST_RELEASED_UNICODE = + EMOJI_TO_UNICODE_VERSION.get(VERSION_LAST_RELEASED); + public static final VersionInfo VERSION_BETA_UNICODE = + EMOJI_TO_UNICODE_VERSION.get(VERSION_BETA); private static final String BETA_PLAIN = "β"; private static final String BETA_COLORED = "" + BETA_PLAIN + ""; - //public static 
final VersionInfo VERSION_FORMAT1 = VersionInfo.getInstance(1); + // public static final VersionInfo VERSION_FORMAT1 = VersionInfo.getInstance(1); - /** - * Computed - */ + /** Computed */ + public static final String BETA_TITLE_AFFIX = Emoji.IS_BETA ? BETA_PLAIN : ""; - public static final String BETA_TITLE_AFFIX = Emoji.IS_BETA ? BETA_PLAIN : ""; - public static final String BETA_TITLE_AFFIX_SHORT = Emoji.IS_BETA ? "β" : ""; - public static final String BETA_HEADER_AFFIX = Emoji.IS_BETA ? BETA_COLORED : ""; + public static final String BETA_TITLE_AFFIX_SHORT = Emoji.IS_BETA ? "β" : ""; + public static final String BETA_HEADER_AFFIX = Emoji.IS_BETA ? BETA_COLORED : ""; - public static final String VERSION_LAST_RELEASED_STRING = VERSION_LAST_RELEASED.getVersionString(2, 4); + public static final String VERSION_LAST_RELEASED_STRING = + VERSION_LAST_RELEASED.getVersionString(2, 4); public static final String VERSION_BETA_STRING = VERSION_BETA.getVersionString(2, 4); public static final String VERSION_BETA_STRING_WITH_COLOR = VERSION_BETA_STRING + BETA_COLORED; - public static final VersionInfo VERSION_TO_GENERATE = IS_BETA ? VERSION_BETA : VERSION_LAST_RELEASED; - public static final VersionInfo VERSION_TO_GENERATE_PREVIOUS = IS_BETA ? VERSION_LAST_RELEASED : VERSION_LAST_RELEASED2; + public static final VersionInfo VERSION_TO_GENERATE = + IS_BETA ? VERSION_BETA : VERSION_LAST_RELEASED; + public static final VersionInfo VERSION_TO_GENERATE_PREVIOUS = + IS_BETA ? VERSION_LAST_RELEASED : VERSION_LAST_RELEASED2; public static final String VERSION_STRING = VERSION_TO_GENERATE.getVersionString(2, 4); - public static final VersionInfo VERSION_TO_GENERATE_UNICODE = IS_BETA ? VERSION_BETA_UNICODE : VERSION_LAST_RELEASED_UNICODE; - public static final String VERSION_UNICODE_STRING = VERSION_TO_GENERATE_UNICODE.getVersionString(2, 4); + public static final VersionInfo VERSION_TO_GENERATE_UNICODE = + IS_BETA ? 
VERSION_BETA_UNICODE : VERSION_LAST_RELEASED_UNICODE; + public static final String VERSION_UNICODE_STRING = + VERSION_TO_GENERATE_UNICODE.getVersionString(2, 4); - //public static final String TR51_SVN_DIR = Settings.UNICODE_DRAFT_DIRECTORY + "reports/tr51/"; - //public static final String TR51_PREFIX = IS_BETA ? "internal-beta/" : "internal/"; + // public static final String TR51_SVN_DIR = Settings.UNICODE_DRAFT_DIRECTORY + "reports/tr51/"; + // public static final String TR51_PREFIX = IS_BETA ? "internal-beta/" : "internal/"; - public static final String EMOJI_DIR = Settings.Output.GEN_DIR + "emoji/" + (Emoji.ABBR ? "🏴" : ""); - public static final String CHARTS_DIR = EMOJI_DIR + "charts-" + VERSION_STRING + "/"; + public static final String EMOJI_DIR = + Settings.Output.GEN_DIR + "emoji/" + (Emoji.ABBR ? "🏴" : ""); + public static final String CHARTS_DIR = EMOJI_DIR + "charts-" + VERSION_STRING + "/"; public static final String FUTURE_DIR = EMOJI_DIR + "future" + "/"; public static final String TR51_INTERNAL_DIR = CHARTS_DIR + "internal/"; - public static final String RELEASE_CHARTS_DIR = EMOJI_DIR + "charts-" + VERSION_LAST_RELEASED_STRING + "/"; + public static final String RELEASE_CHARTS_DIR = + EMOJI_DIR + "charts-" + VERSION_LAST_RELEASED_STRING + "/"; public static final String DATA_DIR_PRODUCTION_BASE = "https://unicode.org/Public/emoji/"; - public static final String DATA_DIR_PRODUCTION = DATA_DIR_PRODUCTION_BASE + VERSION_STRING + "/"; + public static final String DATA_DIR_PRODUCTION = + DATA_DIR_PRODUCTION_BASE + VERSION_STRING + "/"; public static final String IMAGES_SOURCE_DIR_SVG = Settings.UnicodeTools.DATA_DIR + "images/"; - // TODO: This should be relative to GEN_DIR, not using "../../" to maybe get out of the repo source. - public static final String IMAGES_OUTPUT_DIR = Settings.UnicodeTools.UNICODETOOLS_DIR + "../../images/emoji/"; + // TODO: This should be relative to GEN_DIR, not using "../../" to maybe get out of the repo + // source. 
+ public static final String IMAGES_OUTPUT_DIR = + Settings.UnicodeTools.UNICODETOOLS_DIR + "../../images/emoji/"; public enum ModifierStatus { - none, modifier, modifier_base; + none, + modifier, + modifier_base; } public static final char JOINER = '\u200D'; @@ -227,76 +258,94 @@ public enum ModifierStatus { public static final char TEXT_VARIANT = '\uFE0E'; // HACK - // static final UnicodeSet GENDER_BASE = new UnicodeSet("[👯💂👳👱⛹🏃🏄🏊-🏌👮👷💁💆💇🕵🙅-🙇🙋🙍🙎🚣 🚴-🚶🤹 \\U0001F926\\U0001F937\\U0001F938\\U0001F93C-\\U0001F93E]") + // static final UnicodeSet GENDER_BASE = new + // UnicodeSet("[👯💂👳👱⛹🏃🏄🏊-🏌👮👷💁💆💇🕵🙅-🙇🙋🙍🙎🚣 🚴-🚶🤹 + // \\U0001F926\\U0001F937\\U0001F938\\U0001F93C-\\U0001F93E]") // .freeze(); - static final UnicodeSet PROFESSION_OBJECT = new UnicodeSet("[⚕🌾🍳🎓🎤🏫🏭💻💼🔧🔬🎨 🚒 ✈ 🚀 ⚖ \\U0001F37C \\U0001F9AF \\U0001F9BC \\U0001F9BD]") - .freeze(); - static final UnicodeSet HAIR_STYLES = new UnicodeSet("[\\U0001F9B0-\\U0001F9B3]") - .freeze(); + static final UnicodeSet PROFESSION_OBJECT = + new UnicodeSet( + "[⚕🌾🍳🎓🎤🏫🏭💻💼🔧🔬🎨 🚒 ✈ 🚀 ⚖ \\U0001F37C \\U0001F9AF \\U0001F9BC \\U0001F9BD]") + .freeze(); + static final UnicodeSet HAIR_STYLES = new UnicodeSet("[\\U0001F9B0-\\U0001F9B3]").freeze(); static final UnicodeSet HAIR_EXPLICIT = new UnicodeSet("[🧔 👱]").freeze(); static final UnicodeSet HAIR_STYLES_WITH_JOINERS = new UnicodeSet(); + static { for (String s : HAIR_STYLES) { HAIR_STYLES_WITH_JOINERS.add(JOINER_STR + s); } HAIR_STYLES_WITH_JOINERS.freeze(); } + public static final String FEMALE = "\u2640"; public static final String MALE = "\u2642"; public static final String TRANSGENDER = "\u26A7"; public static final char TRANSGENDER_CP = '\u26A7'; - static final UnicodeMap TO_NEUTRAL = new UnicodeMap() - .put("👦", "🧒") - .put("👧", "🧒") - .put("👨", "🧑") - .put("👩", "🧑") - .put("👴", "🧓") - .put("👵", "🧓") - .put("🤴", "🧑\u200D👑") - .put("👸", "🧑\u200D👑") - .put("🎅", "🧑\u200D🎄") - .put("🤶", "🧑\u200D🎄") - .put("💃", "🧑\u200D🎶") - .put("🕺", "🧑\u200D🎶") - .put("👫", "🧑" + EmojiData.ZWJ_HANDSHAKE_ZWJ + 
"🧑") - .put("👬", "🧑" + EmojiData.ZWJ_HANDSHAKE_ZWJ + "🧑") - .put("👭", "🧑" + EmojiData.ZWJ_HANDSHAKE_ZWJ + "🧑") - .freeze(); - - static final UnicodeMap MALE_TO_OTHER = new UnicodeMap() - .put(UTF16.valueOf(0x2642), UTF16.valueOf(0x2640)) // MALE SIGN→FEMALE SIGN - .put(UTF16.valueOf(0x1F466), UTF16.valueOf(0x1F467)) // boy→girl - .put(UTF16.valueOf(0x1F468), UTF16.valueOf(0x1F469)) // man→woman - .put(UTF16.valueOf(0x1F474), UTF16.valueOf(0x1F475)) // old man→old woman - .put(UTF16.valueOf(0x1F385), UTF16.valueOf(0x1F936)) // Santa Claus→Mrs. Claus - .put(UTF16.valueOf(0x1F934), UTF16.valueOf(0x1F478)) // prince→princess - .put(UTF16.valueOf(0x1F57A), UTF16.valueOf(0x1F483)) // man dancing→woman dancing - // .put(UTF16.valueOf(0x1F46C), UTF16.valueOf(0x1F46B)) // two men holding hands→man and woman holding hands - // .put(UTF16.valueOf(0x1F46C), UTF16.valueOf(0x1F46D)) // two men holding hands→two women holding hands - // .put(UTF16.valueOf(0x1F935), "") // man in tuxedo→ - // .put(UTF16.valueOf(0x1F574), "") // man in suit levitating→ - // .put(UTF16.valueOf(0x1F472), "") // man with Chinese cap→ - // .put(UTF16.valueOf(0x1F9D4), "") // BEARDED PERSON→ - .freeze(); - static final UnicodeMap FEMALE_TO_OTHER = new UnicodeMap() - .put(UTF16.valueOf(0x2640),UTF16.valueOf(0x2642)) // FEMALE SIGN→MALE SIGN - .put(UTF16.valueOf(0x1F467), UTF16.valueOf(0x1F466)) // girl→boy - .put(UTF16.valueOf(0x1F469), UTF16.valueOf(0x1F468)) // woman→man - .put(UTF16.valueOf(0x1F475), UTF16.valueOf(0x1F474)) // old woman→old man - .put(UTF16.valueOf(0x1F936), UTF16.valueOf(0x1F385)) // Mrs. 
Claus→Santa Claus - .put(UTF16.valueOf(0x1F478), UTF16.valueOf(0x1F934)) // princess→prince - .put(UTF16.valueOf(0x1F483), UTF16.valueOf(0x1F57A)) // woman dancing→man dancing - // .put(UTF16.valueOf(0x1F46D), UTF16.valueOf(0x1F46C)) // two women holding hands→two men holding hands - // .put(UTF16.valueOf(0x1F46D), UTF16.valueOf(0x1F46B)) // two women holding hands→man and woman holding hands - // .put(UTF16.valueOf(0x1F470), "") // bride with veil→ - // .put(UTF16.valueOf(0x1F930), "") // pregnant woman→ - // .put(UTF16.valueOf(0x1F931), "") // breast-feeding→ - // .put(UTF16.valueOf(0x1F9D5), "") // woman with headscarf→ - .freeze(); - static final UnicodeSet NEUTRAL = new UnicodeSet("[⛷⛹🏂-🏄🏇🏊-🏎👤👥👪-👳👶👷👼💁💂💆💇💏💑🕴🕵🗣🙅-🙇🙋🙍🙎🚣🚴-🚶🛀🛌🤦🤰🤱🤵🤷-🤾🦸🦹🧑-🧟]"); + static final UnicodeMap TO_NEUTRAL = + new UnicodeMap() + .put("👦", "🧒") + .put("👧", "🧒") + .put("👨", "🧑") + .put("👩", "🧑") + .put("👴", "🧓") + .put("👵", "🧓") + .put("🤴", "🧑\u200D👑") + .put("👸", "🧑\u200D👑") + .put("🎅", "🧑\u200D🎄") + .put("🤶", "🧑\u200D🎄") + .put("💃", "🧑\u200D🎶") + .put("🕺", "🧑\u200D🎶") + .put("👫", "🧑" + EmojiData.ZWJ_HANDSHAKE_ZWJ + "🧑") + .put("👬", "🧑" + EmojiData.ZWJ_HANDSHAKE_ZWJ + "🧑") + .put("👭", "🧑" + EmojiData.ZWJ_HANDSHAKE_ZWJ + "🧑") + .freeze(); + + static final UnicodeMap MALE_TO_OTHER = + new UnicodeMap() + .put(UTF16.valueOf(0x2642), UTF16.valueOf(0x2640)) // MALE SIGN→FEMALE SIGN + .put(UTF16.valueOf(0x1F466), UTF16.valueOf(0x1F467)) // boy→girl + .put(UTF16.valueOf(0x1F468), UTF16.valueOf(0x1F469)) // man→woman + .put(UTF16.valueOf(0x1F474), UTF16.valueOf(0x1F475)) // old man→old woman + .put(UTF16.valueOf(0x1F385), UTF16.valueOf(0x1F936)) // Santa Claus→Mrs. 
Claus + .put(UTF16.valueOf(0x1F934), UTF16.valueOf(0x1F478)) // prince→princess + .put( + UTF16.valueOf(0x1F57A), + UTF16.valueOf(0x1F483)) // man dancing→woman dancing + // .put(UTF16.valueOf(0x1F46C), UTF16.valueOf(0x1F46B)) // two men + // holding hands→man and woman holding hands + // .put(UTF16.valueOf(0x1F46C), UTF16.valueOf(0x1F46D)) // two men + // holding hands→two women holding hands + // .put(UTF16.valueOf(0x1F935), "") // man in tuxedo→ + // .put(UTF16.valueOf(0x1F574), "") // man in suit levitating→ + // .put(UTF16.valueOf(0x1F472), "") // man with Chinese cap→ + // .put(UTF16.valueOf(0x1F9D4), "") // BEARDED PERSON→ + .freeze(); + static final UnicodeMap FEMALE_TO_OTHER = + new UnicodeMap() + .put(UTF16.valueOf(0x2640), UTF16.valueOf(0x2642)) // FEMALE SIGN→MALE SIGN + .put(UTF16.valueOf(0x1F467), UTF16.valueOf(0x1F466)) // girl→boy + .put(UTF16.valueOf(0x1F469), UTF16.valueOf(0x1F468)) // woman→man + .put(UTF16.valueOf(0x1F475), UTF16.valueOf(0x1F474)) // old woman→old man + .put(UTF16.valueOf(0x1F936), UTF16.valueOf(0x1F385)) // Mrs. 
Claus→Santa Claus + .put(UTF16.valueOf(0x1F478), UTF16.valueOf(0x1F934)) // princess→prince + .put( + UTF16.valueOf(0x1F483), + UTF16.valueOf(0x1F57A)) // woman dancing→man dancing + // .put(UTF16.valueOf(0x1F46D), UTF16.valueOf(0x1F46C)) // two women + // holding hands→two men holding hands + // .put(UTF16.valueOf(0x1F46D), UTF16.valueOf(0x1F46B)) // two women + // holding hands→man and woman holding hands + // .put(UTF16.valueOf(0x1F470), "") // bride with veil→ + // .put(UTF16.valueOf(0x1F930), "") // pregnant woman→ + // .put(UTF16.valueOf(0x1F931), "") // breast-feeding→ + // .put(UTF16.valueOf(0x1F9D5), "") // woman with headscarf→ + .freeze(); + static final UnicodeSet NEUTRAL = + new UnicodeSet( + "[⛷⛹🏂-🏄🏇🏊-🏎👤👥👪-👳👶👷👼💁💂💆💇💏💑🕴🕵🗣🙅-🙇🙋🙍🙎🚣🚴-🚶🛀🛌🤦🤰🤱🤵🤷-🤾🦸🦹🧑-🧟]"); public enum Source { // also used for accessing pngs; order is important @@ -320,29 +369,52 @@ public enum Source { sample("Samp2"), plain, // gifs; don't change order! - gmail("GMail"), sb("SB", "SoftBank"), dcm("DCM", "DoCoMo"), kddi("KDDI", "KDDI"), + gmail("GMail"), + sb("SB", "SoftBank"), + dcm("DCM", "DoCoMo"), + kddi("KDDI", "KDDI"), svg; - static final Set OLD_SOURCES = ImmutableSet.copyOf( - EnumSet.of(gmail, sb, dcm, kddi)); // do this to get same order as Source - static final Set VENDOR_SOURCES = ImmutableSet.copyOf( - EnumSet.of(apple, google, twitter, emojione, samsung, fb, windows)); // do this to get same order as Source - static final Set platformsToIncludeNormal = ImmutableSet.copyOf(EnumSet.of( - Source.apple, Source.google, Source.windows, Source.twitter, Source.emojione, Source.samsung, - Source.fb, // Source.fbm, - Source.gmail, Source.dcm, Source.kddi, Source.sb - )); + static final Set OLD_SOURCES = + ImmutableSet.copyOf( + EnumSet.of(gmail, sb, dcm, kddi)); // do this to get same order as Source + static final Set VENDOR_SOURCES = + ImmutableSet.copyOf( + EnumSet.of( + apple, google, twitter, emojione, samsung, fb, + windows)); // do this to get same order as Source + static final 
Set platformsToIncludeNormal = + ImmutableSet.copyOf( + EnumSet.of( + Source.apple, + Source.google, + Source.windows, + Source.twitter, + Source.emojione, + Source.samsung, + Source.fb, // Source.fbm, + Source.gmail, + Source.dcm, + Source.kddi, + Source.sb)); // Ordering is what will appear with … fallback - static final Set PLATFORM_FALLBACK = ImmutableSet.builder() - .addAll(EnumSet.of( - Source.apple, Source.google, Source.windows, Source.twitter, Source.emojione, Source.samsung, - Source.fb)) - .add(Source.emojipedia) - .add(Source.emojixpress) - .add(Source.emojination) - .add(Source.proposed) - .add(Source.sample) - .build(); + static final Set PLATFORM_FALLBACK = + ImmutableSet.builder() + .addAll( + EnumSet.of( + Source.apple, + Source.google, + Source.windows, + Source.twitter, + Source.emojione, + Source.samsung, + Source.fb)) + .add(Source.emojipedia) + .add(Source.emojixpress) + .add(Source.emojination) + .add(Source.proposed) + .add(Source.sample) + .build(); private final String shortName; private final String longName; @@ -350,9 +422,11 @@ public enum Source { private Source() { this(null, null); } + private Source(String shortName) { this(shortName, null); } + private Source(String shortName, String longName) { this.shortName = shortName != null ? shortName : UCharacter.toTitleCase(name(), null); this.longName = longName != null ? longName : UCharacter.toTitleCase(name(), null); @@ -388,17 +462,19 @@ public String shortName() { public String toString() { return longName; } + public String getFullPrefix() { - return this == svg ? "svg/emoji_" - : getPrefix() + "/" + getPrefix() + "_"; + return this == svg ? "svg/emoji_" : getPrefix() + "/" + getPrefix() + "_"; } public String getSuffix() { return this == Source.svg ? ".svg" : isGif() ? ".gif" : ".png"; } + public String getImageFileName(String cp) { return getFullPrefix() + buildFileName(cp, "_") + getSuffix(); } + public String getImageDirectory() { return this == svg ? 
Emoji.IMAGES_SOURCE_DIR_SVG : Emoji.IMAGES_OUTPUT_DIR; } @@ -411,8 +487,7 @@ public enum CharSource { WDings("ʷ", "w", "L2/11-052"), ARIB("ª", "a", "L2/07-259"), ZDings("ᶻ", "z", "Unicode1.0.0"), - Other("ˣ", "x", "n/a") - ; + Other("ˣ", "x", "n/a"); final String superscript; final String letter; final Set proposals; @@ -424,8 +499,11 @@ private CharSource(String shortString, String letter, String proposals) { } } - - public enum Qualified {all, first, none} + public enum Qualified { + all, + first, + none + } public static final int TAG_BASE = 0xE0000; public static final int TAG_TERM_CHAR = 0xE007F; @@ -434,43 +512,60 @@ public enum Qualified {all, first, none} public static final char KEYCAP_MARK = '\u20E3'; public static final String KEYCAP_MARK_STRING = String.valueOf(KEYCAP_MARK); - // private static final UnicodeSet Unicode8Emoji = new UnicodeSet("[\\x{1F3FB}\\x{1F3FC}\\x{1F3FD}\\x{1F3FE}\\x{1F3FF}\\x{1F4FF}\\x{1F54B}\\x{1F54C}\\x{1F54D}" - // +"\\x{1F54E}\\x{1F6D0}\\x{1F32D}\\x{1F32E}\\x{1F32F}\\x{1F37E}\\x{1F37F}\\x{1F983}\\x{1F984}\\x{1F9C0}" - // +"\\x{1F3CF}\\x{1F3D0}\\x{1F3D1}\\x{1F3D2}\\x{1F3D3}\\x{1F3F8}\\x{1F3F9}\\x{1F3FA}\\x{1F643}" - // +"\\x{1F644}\\x{1F910}\\x{1F911}\\x{1F912}\\x{1F913}\\x{1F914}\\x{1F915}\\x{1F916}\\x{1F917}" + // private static final UnicodeSet Unicode8Emoji = new + // UnicodeSet("[\\x{1F3FB}\\x{1F3FC}\\x{1F3FD}\\x{1F3FE}\\x{1F3FF}\\x{1F4FF}\\x{1F54B}\\x{1F54C}\\x{1F54D}" + // + // +"\\x{1F54E}\\x{1F6D0}\\x{1F32D}\\x{1F32E}\\x{1F32F}\\x{1F37E}\\x{1F37F}\\x{1F983}\\x{1F984}\\x{1F9C0}" + // + // +"\\x{1F3CF}\\x{1F3D0}\\x{1F3D1}\\x{1F3D2}\\x{1F3D3}\\x{1F3F8}\\x{1F3F9}\\x{1F3FA}\\x{1F643}" + // + // +"\\x{1F644}\\x{1F910}\\x{1F911}\\x{1F912}\\x{1F913}\\x{1F914}\\x{1F915}\\x{1F916}\\x{1F917}" // +"\\x{1F918}\\x{1F980}\\x{1F981}\\x{1F982}]").freeze(); // new UnicodeSet( - // "[🕉 ✡ ☸ ☯ ✝ ☦ ⛩ ☪ ⚛ 0-9©®‼⁉℗™ℹ↔-↙↩↪⌚⌛⌨⎈⏏⏩-⏺Ⓜ▪▫▶◀●◪◻-◾☀-☄☎-☒☔☕☘-☠☢-☤☦🕉☦ ☪ ☬ ☸ ✝ 🕉☪-☬☮☯☹-☾♈-♓♠-♯♲" + // "[🕉 ✡ ☸ ☯ ✝ ☦ ⛩ ☪ ⚛ 
0-9©®‼⁉℗™ℹ↔-↙↩↪⌚⌛⌨⎈⏏⏩-⏺Ⓜ▪▫▶◀●◪◻-◾☀-☄☎-☒☔☕☘-☠☢-☤☦🕉☦ ☪ ☬ ☸ ✝ + // 🕉☪-☬☮☯☹-☾♈-♓♠-♯♲" // + "♻♾♿⚐-⚜⚠⚡⚪⚫⚰⚱⚽-⚿⛄-⛈⛍-⛙⛛-⛡⛨-⛪⛰-⛵⛷-⛺⛼-✒✔-✘✝✨✳✴❄❇❌❎❓-❕❗❢-❧➕-➗" // + "➡➰➿⤴⤵⬅-⬇⬛⬜⭐⭕⸙〰〽㊗㊙🀄🃏🅰🅱🅾🅿🆎🆏🆑-🆚🈁🈂🈚🈯🈲-🈺🉐🉑🌀-🌬🌰-🍽🎀-🏎" // + "🏔-🏷🐀-📾🔀-🔿🕊🕐-🕱🕳-🕹🖁-🖣🖥-🖩🖮-🗳🗺-🙂🙅-🙏🚀-🛏🛠-🛬🛰-🛳" // + "{#⃣}{*⃣}{0⃣}{1⃣}{2⃣}{3⃣}{4⃣}{5⃣}{6⃣}{7⃣}{8⃣}{9⃣}]") // .addAll(Unicode8Emoji) // .removeAll(new UnicodeSet("[☫☬🎕⚘⸙⎈]")) - // .removeAll(new UnicodeSet("[℗⏴-⏷●◪☙☤☼-☾♩-♯♾⚐⚑⚕⚚ ⚿⛆⛍⛐⛒⛕-⛙⛛⛜⛞-⛡⛨⛼⛾-✀✆✇✑ ❢❦❧🌢🌣🎔🎘🎜🎝🏱🏲🏶📾🔾🔿🕨-🕮🕱🖁-🖆 🖈🖉🖎🖏🖒-🖔🖗-🖣🖦🖧🖩🖮-🖰🖳-🖻🖽-🗁 🗅-🗐🗔-🗛🗟🗠🗤-🗮🗰-🗲🛆-🛈🛦-🛨🛪 🛱🛲]")) - // .removeAll(new UnicodeSet("[🛉 🛊 🖑🗢☏☐☒☚-☜☞☟♲⛇✁✃✄✎✐✕✗✘ ♤ ♡ ♢ ♧❥🆏 ☻ ⛝ 0 1 2 3 4 5 6 7 8 9]")) + // .removeAll(new UnicodeSet("[℗⏴-⏷●◪☙☤☼-☾♩-♯♾⚐⚑⚕⚚ ⚿⛆⛍⛐⛒⛕-⛙⛛⛜⛞-⛡⛨⛼⛾-✀✆✇✑ + // ❢❦❧🌢🌣🎔🎘🎜🎝🏱🏲🏶📾🔾🔿🕨-🕮🕱🖁-🖆 🖈🖉🖎🖏🖒-🖔🖗-🖣🖦🖧🖩🖮-🖰🖳-🖻🖽-🗁 + // 🗅-🗐🗔-🗛🗟🗠🗤-🗮🗰-🗲🛆-🛈🛦-🛨🛪 🛱🛲]")) + // .removeAll(new UnicodeSet("[🛉 🛊 🖑🗢☏☐☒☚-☜☞☟♲⛇✁✃✄✎✐✕✗✘ ♤ ♡ ♢ ♧❥🆏 ☻ ⛝ 0 1 2 3 4 5 + // 6 7 8 9]")) // .add("🗨") // // .freeze() will freeze later // ; // static { // if (IS_BETA) { - // EMOJI_CHARS.addAll("[🕺 🖤 🛑 🛒 🛴 🛵 🛶 🤙 🤚 🤛 🤜 🤝 🤞 🤠 🤡 🤢 🤣 🤤 🤥 🤦 🤧 🤰 🤳 🤴 🤵 🤶 🤷 🤸 🤹 🤺 🤻 🤼 🤽 🤾 🥀 🥁 🥂 🥃 🥄 🥅 🥆 🥇 🥈 🥉 🥊 🥋 🥐 🥑 🥒 🥓 🥔 🥕 🥖 🥗 🥘 🥙 🥚 🥛 🥜 🥝 🥞 🦅 🦆 🦇 🦈 🦉 🦊 🦋 🦌 🦍 🦎 🦏 🦐 🦑]"); + // EMOJI_CHARS.addAll("[🕺 🖤 🛑 🛒 🛴 🛵 🛶 🤙 🤚 🤛 🤜 🤝 🤞 🤠 🤡 🤢 🤣 🤤 🤥 🤦 + // 🤧 🤰 🤳 🤴 🤵 🤶 🤷 🤸 🤹 🤺 🤻 🤼 🤽 🤾 🥀 🥁 🥂 🥃 🥄 🥅 🥆 🥇 🥈 🥉 🥊 🥋 🥐 🥑 🥒 🥓 🥔 + // 🥕 🥖 🥗 🥘 🥙 🥚 🥛 🥜 🥝 🥞 🦅 🦆 🦇 🦈 🦉 🦊 🦋 🦌 🦍 🦎 🦏 🦐 🦑]"); // } // } - public static final UnicodeSet COMMON_ADDITIONS = new UnicodeSet("[➿🌍🌎🌐🌒🌖-🌘🌚🌜-🌞🌲🌳🍋🍐🍼🏇🏉🏤🐀-🐋🐏🐐🐓🐕🐖🐪👥👬👭💭💶💷📬📭📯📵🔀-🔂🔄-🔉🔕🔬🔭🕜-🕧😀😇😈😎😐😑😕😗😙😛😟😦😧😬😮😯😴😶🚁🚂🚆🚈🚊🚋🚍🚎🚐🚔🚖🚘🚛-🚡🚣🚦🚮-🚱🚳-🚵🚷🚸🚿🛁-🛅]").freeze(); - static final UnicodeSet ASCII_LETTER_HYPHEN = new UnicodeSet('-', '-', 'A', 'Z', 'a', 'z', '’', '’').freeze(); + public static final UnicodeSet COMMON_ADDITIONS = + new UnicodeSet( + "[➿🌍🌎🌐🌒🌖-🌘🌚🌜-🌞🌲🌳🍋🍐🍼🏇🏉🏤🐀-🐋🐏🐐🐓🐕🐖🐪👥👬👭💭💶💷📬📭📯📵🔀-🔂🔄-🔉🔕🔬🔭🕜-🕧😀😇😈😎😐😑😕😗😙😛😟😦😧😬😮😯😴😶🚁🚂🚆🚈🚊🚋🚍🚎🚐🚔🚖🚘🚛-🚡🚣🚦🚮-🚱🚳-🚵🚷🚸🚿🛁-🛅]") + .freeze(); + static final UnicodeSet ASCII_LETTER_HYPHEN = + new UnicodeSet('-', '-', 'A', 'Z', 'a', 'z', '’', '’').freeze(); static final UnicodeSet LATIN1_LETTER = new 
UnicodeSet("[[:L:]&[\\x{0}-\\x{FF}}]]").freeze(); - static final UnicodeSet KEYWORD_CHARS = new UnicodeSet(Emoji.ASCII_LETTER_HYPHEN) - .add('0','9') - .addAll(" +:.&") - .addAll(LATIN1_LETTER) - .freeze(); - static final UnicodeSet KEYCAPS = new UnicodeSet("[{#⃣}{*⃣}{0⃣}{1⃣}{2⃣}{3⃣}{4⃣}{5⃣}{6⃣}{7⃣}{8⃣}{9⃣}]").freeze(); + static final UnicodeSet KEYWORD_CHARS = + new UnicodeSet(Emoji.ASCII_LETTER_HYPHEN) + .add('0', '9') + .addAll(" +:.&") + .addAll(LATIN1_LETTER) + .freeze(); + static final UnicodeSet KEYCAPS = + new UnicodeSet("[{#⃣}{*⃣}{0⃣}{1⃣}{2⃣}{3⃣}{4⃣}{5⃣}{6⃣}{7⃣}{8⃣}{9⃣}]").freeze(); static final UnicodeSet KEYCAP_BASE = new UnicodeSet("[0-9#*]").freeze(); - //public static final UnicodeSet SKIP_ANDROID = new UnicodeSet("[♨ ⚠ ▶ ◀ ✉ ✏ ✒ ✂ ⬆ ↗ ➡ ↘ ⬇ ↙ ⬅ ↖ ↕ ↔ ↩ ↪ ⤴ ⤵ ♻ ☑ ✔ ✖ 〽 ✳ ✴ ❇ ▪ ▫ ◻ ◼ ‼ ⁉ 〰 © ® 🅰 🅱 ℹ Ⓜ 🅾 🅿 ™ 🈂 🈷 ㊗ ㊙]").freeze(); + // public static final UnicodeSet SKIP_ANDROID = new UnicodeSet("[♨ ⚠ ▶ ◀ ✉ ✏ ✒ ✂ ⬆ ↗ ➡ ↘ ⬇ ↙ ⬅ + // ↖ ↕ ↔ ↩ ↪ ⤴ ⤵ ♻ ☑ ✔ ✖ 〽 ✳ ✴ ❇ ▪ ▫ ◻ ◼ ‼ ⁉ 〰 © ® 🅰 🅱 ℹ Ⓜ 🅾 🅿 ™ 🈂 🈷 ㊗ ㊙]").freeze(); - static public String buildFileName(String chars, String separator) { + public static String buildFileName(String chars, String separator) { StringBuilder result = new StringBuilder(); boolean first = true; for (int cp : With.codePointArray(chars)) { @@ -489,13 +584,13 @@ static public String buildFileName(String chars, String separator) { static Pattern DASH_OR_UNDERBAR = Pattern.compile("[-_]"); - static public String parseFileName(boolean hasPrefix, String chars) { + public static String parseFileName(boolean hasPrefix, String chars) { StringBuilder result = new StringBuilder(); int dotPos = chars.lastIndexOf('.'); if (dotPos >= 0) { - chars = chars.substring(0,dotPos); + chars = chars.substring(0, dotPos); } - String[] parts = DASH_OR_UNDERBAR.split(chars); //chars.split(separator); + String[] parts = DASH_OR_UNDERBAR.split(chars); // chars.split(separator); boolean first = true; for (String part : parts) { if (part.startsWith("x")) { @@ 
-505,16 +600,17 @@ static public String parseFileName(boolean hasPrefix, String chars) { first = false; continue; } - result.appendCodePoint(Integer.parseInt(part,16)); + result.appendCodePoint(Integer.parseInt(part, 16)); } - return result.toString(); + return result.toString(); } public static String getHexFromFlagCode(String isoCountries) { - String cc = new StringBuilder() - .appendCodePoint(isoCountries.charAt(0) + Emoji.FIRST_REGIONAL - 'A') - .appendCodePoint(isoCountries.charAt(1) + Emoji.FIRST_REGIONAL - 'A') - .toString(); + String cc = + new StringBuilder() + .appendCodePoint(isoCountries.charAt(0) + Emoji.FIRST_REGIONAL - 'A') + .appendCodePoint(isoCountries.charAt(1) + Emoji.FIRST_REGIONAL - 'A') + .toString(); return cc; } @@ -529,19 +625,21 @@ static String getRegionCodeFromEmoji(String chars) { int first = chars.codePointAt(0); return new StringBuilder() .appendCodePoint(first - FIRST_REGIONAL + 'A') - .appendCodePoint(chars.codePointAt(Character.charCount(first)) - FIRST_REGIONAL + 'A') + .appendCodePoint( + chars.codePointAt(Character.charCount(first)) - FIRST_REGIONAL + 'A') .toString(); } - public static final UnicodeSet FACES = new UnicodeSet("[☺ ☹ 🙁 🙂 😀-😆 😉-😷 😇 😈 👿 🙃 🙄 🤐-🤕 🤗]").freeze(); + public static final UnicodeSet FACES = + new UnicodeSet("[☺ ☹ 🙁 🙂 😀-😆 😉-😷 😇 😈 👿 🙃 🙄 🤐-🤕 🤗]").freeze(); - public static final UnicodeSet EMOJI_VARIANTS = new UnicodeSet().add(EMOJI_VARIANT).add(TEXT_VARIANT).freeze(); + public static final UnicodeSet EMOJI_VARIANTS = + new UnicodeSet().add(EMOJI_VARIANT).add(TEXT_VARIANT).freeze(); - public static final UnicodeSet EMOJI_VARIANTS_JOINER = new UnicodeSet(EMOJI_VARIANTS) - .add(JOINER) - .freeze(); + public static final UnicodeSet EMOJI_VARIANTS_JOINER = + new UnicodeSet(EMOJI_VARIANTS).add(JOINER).freeze(); - //public static final String PERSON = "\u263F"; + // public static final String PERSON = "\u263F"; public static final int BOY = 0x1F466; public static final int GIRL = 0x1F467; @@ -553,63 +651,72 @@ 
static String getRegionCodeFromEmoji(String chars) { public static final String WOMAN_STR = UTF16.valueOf(WOMAN); public static final String NEUTRAL_FAMILY = UTF16.valueOf(0x1F46A); - public static final UnicodeSet FAMILY_MARKERS = new UnicodeSet().add(BOY, WOMAN).freeze(); // includes girl, man - public static final UnicodeSet ACTIVITY_MARKER = new UnicodeSet("[🤱 🧖 🧗 🧘🤰 💆 💇 🚶 🏃 💃 🕺 👯 🕴 🗣 👤 👥 🏌 🏄 🚣 🏊 ⛹ 🏋 🚴 🚵 🤸 🤼-🤾 🤹]").freeze(); - public static final UnicodeSet GENDER_MARKERS = new UnicodeSet() - .add(FEMALE) - .add(MALE) - //.add(TRANSGENDER) - .freeze(); - public static final UnicodeSet FULL_GENDER_MARKERS = new UnicodeSet(GENDER_MARKERS) - .add(TRANSGENDER) - .freeze(); - public static final UnicodeSet ZWJ_GENDER_MARKERS = new UnicodeSet() - .add(JOINER + FEMALE) - .add(JOINER + MALE) - //.add(JOINER + TRANSGENDER) - .freeze(); - public static final UnicodeSet FULL_ZWJ_GENDER_MARKERS = new UnicodeSet(ZWJ_GENDER_MARKERS) - .add(JOINER + FEMALE + EMOJI_VARIANT) - .add(JOINER + MALE + EMOJI_VARIANT) - // .add(JOINER + TRANSGENDER + EMOJI_VARIANT) - .freeze(); - - public static final UnicodeSet MAN_OR_WOMAN_OR_ADULT = new UnicodeSet().add(Emoji.WOMAN).add(Emoji.MAN).add(Emoji.ADULT) - .freeze(); - - public static final String TRANSFLAG = Utility.toString(0x1F3F3,0xFE0F,0x200D,0x26A7,0xFE0F); + public static final UnicodeSet FAMILY_MARKERS = + new UnicodeSet().add(BOY, WOMAN).freeze(); // includes girl, man + public static final UnicodeSet ACTIVITY_MARKER = + new UnicodeSet( + "[🤱 🧖 🧗 🧘🤰 💆 💇 🚶 🏃 💃 🕺 👯 🕴 🗣 👤 👥 🏌 🏄 🚣 🏊 ⛹ 🏋 🚴 🚵 🤸 🤼-🤾 🤹]") + .freeze(); + public static final UnicodeSet GENDER_MARKERS = + new UnicodeSet() + .add(FEMALE) + .add(MALE) + // .add(TRANSGENDER) + .freeze(); + public static final UnicodeSet FULL_GENDER_MARKERS = + new UnicodeSet(GENDER_MARKERS).add(TRANSGENDER).freeze(); + public static final UnicodeSet ZWJ_GENDER_MARKERS = + new UnicodeSet() + .add(JOINER + FEMALE) + .add(JOINER + MALE) + // .add(JOINER + TRANSGENDER) + .freeze(); + public 
static final UnicodeSet FULL_ZWJ_GENDER_MARKERS = + new UnicodeSet(ZWJ_GENDER_MARKERS) + .add(JOINER + FEMALE + EMOJI_VARIANT) + .add(JOINER + MALE + EMOJI_VARIANT) + // .add(JOINER + TRANSGENDER + EMOJI_VARIANT) + .freeze(); + + public static final UnicodeSet MAN_OR_WOMAN_OR_ADULT = + new UnicodeSet().add(Emoji.WOMAN).add(Emoji.MAN).add(Emoji.ADULT).freeze(); + + public static final String TRANSFLAG = + Utility.toString(0x1F3F3, 0xFE0F, 0x200D, 0x26A7, 0xFE0F); public static final UnicodeSet HAIR_BASE = MAN_OR_WOMAN_OR_ADULT; public static final UnicodeSet HAIR_PIECES = HAIR_STYLES; - public static final UnicodeSet ROLE_MARKER = new UnicodeSet("[\\U0001F9D1 \\U0001F468 \\U0001F469 \\U0001F9D9-\\U0001F9DF 👱 👮 👳 👷 💂 🕵]").freeze(); + public static final UnicodeSet ROLE_MARKER = + new UnicodeSet( + "[\\U0001F9D1 \\U0001F468 \\U0001F469 \\U0001F9D9-\\U0001F9DF 👱 👮 👳 👷 💂 🕵]") + .freeze(); static final int FIRST_REGIONAL = 0x1F1E6; static final int LAST_REGIONAL = 0x1F1FF; - public static final UnicodeSet DEFECTIVE_COMPONENTS = new UnicodeSet("[\\u200d \\ufe0f \\u20e3 \\U000e0020-\\U000e007f]"); + public static final UnicodeSet DEFECTIVE_COMPONENTS = + new UnicodeSet("[\\u200d \\ufe0f \\u20e3 \\U000e0020-\\U000e007f]"); - public static final UnicodeSet REGIONAL_INDICATORS = new UnicodeSet(FIRST_REGIONAL,LAST_REGIONAL).freeze(); - public static final UnicodeSet DEFECTIVE = new UnicodeSet("[0123456789*#]") - .addAll(REGIONAL_INDICATORS) - .addAll(DEFECTIVE_COMPONENTS) - .freeze(); - public static final UnicodeSet EXCLUSIONS = new UnicodeSet() - .add("👩‍🤝‍👩") - .add("👩‍🤝‍👨") - .add("👨‍🤝‍👨") - .freeze(); + public static final UnicodeSet REGIONAL_INDICATORS = + new UnicodeSet(FIRST_REGIONAL, LAST_REGIONAL).freeze(); + public static final UnicodeSet DEFECTIVE = + new UnicodeSet("[0123456789*#]") + .addAll(REGIONAL_INDICATORS) + .addAll(DEFECTIVE_COMPONENTS) + .freeze(); + public static final UnicodeSet EXCLUSIONS = + new 
UnicodeSet().add("👩‍🤝‍👩").add("👩‍🤝‍👨").add("👨‍🤝‍👨").freeze(); public static final UnicodeSet EXCLUDED_FOR_SEGMENTATION = new UnicodeSet("[#*0-9©®™〰〽🇦-🇿]"); // static final UnicodeSet EXCLUDE = new UnicodeSet( - // "[🂠-🂮 🂱-🂿 🃁-🃎 🃑-🃵 🀀-🀃 🀅-🀫 〠🕲⍾☸🀰-🂓 🙬 🙭 🙮 🙯🗴🗵🗶🗷🗸🗹★☆⛫\uFFFC⛤-⛧ ⌤⌥⌦⌧⌫⌬⎆⎇⎋⎗⎘⎙⎚⏣⚝⛌⛚⛬⛭⛮⛯⛶⛻✓🆊\\U0001F544-\\U0001F549" + + // "[🂠-🂮 🂱-🂿 🃁-🃎 🃑-🃵 🀀-🀃 🀅-🀫 〠🕲⍾☸🀰-🂓 🙬 🙭 🙮 🙯🗴🗵🗶🗷🗸🗹★☆⛫\uFFFC⛤-⛧ + // ⌤⌥⌦⌧⌫⌬⎆⎇⎋⎗⎘⎙⎚⏣⚝⛌⛚⛬⛭⛮⛯⛶⛻✓🆊\\U0001F544-\\U0001F549" + // "☖ ☗ ⛉ ⛊ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ♛ ♜ ♝ ♞ ♟ ⛀ ⛁ ⛂ ⛃" + // "]").freeze(); // // 🖫🕾🕿🕻🕼🕽🕾🕿🖀🖪🖬🖭 - - // static final UnicodeSet EMOJI_CHARS_WITHOUT_FLAGS = new UnicodeSet(EMOJI_CHARS).freeze(); // static { // CLDRConfig config = CLDRConfig.getInstance(); @@ -617,13 +724,15 @@ static String getRegionCodeFromEmoji(String chars) { // SupplementalDataInfo sdi = config.getSupplementalDataInfo(); // Set container = new TreeSet<>(); // Set contained = new TreeSet<>(); - // for (Entry territoryToContained : sdi.getTerritoryToContained().entrySet()) { + // for (Entry territoryToContained : + // sdi.getTerritoryToContained().entrySet()) { // container.add(territoryToContained.getKey()); // contained.add(territoryToContained.getValue()); // } // contained.removeAll(container); // contained.add("EU"); // special case - // Map, String>> aliasInfo = sdi.getLocaleAliasInfo().get("territory"); + // Map, String>> aliasInfo = + // sdi.getLocaleAliasInfo().get("territory"); // contained.removeAll(aliasInfo.keySet()); // for (String s: contained) { // //System.out.println(s + "\t" + config.getEnglish().getName("territory", s)); @@ -638,13 +747,13 @@ public static boolean isRegionalIndicator(int firstCodepoint) { } public static final char ENCLOSING_KEYCAP = '\u20E3'; - static final Comparator CODEPOINT_LENGTH = new Comparator() { - @Override - public int compare(String o1, String o2) { - return o1.codePointCount(0, o1.length()) - o2.codePointCount(0, o2.length()); - } - }; - + static final Comparator CODEPOINT_LENGTH = + new Comparator() { + 
@Override + public int compare(String o1, String o2) { + return o1.codePointCount(0, o1.length()) - o2.codePointCount(0, o2.length()); + } + }; public static final UnicodeSet ASCII_LETTERS = new UnicodeSet("[A-Za-z]").freeze(); public static final String EMOJI_VARIANT_STRING = String.valueOf(EMOJI_VARIANT); @@ -652,7 +761,8 @@ public int compare(String o1, String o2) { public static final String JOINER_STRING = String.valueOf(JOINER); public static String getLabelFromLine(Output> newLabel, String original) { - String line = original.replace(EMOJI_VARIANT_STRING, "").replace(TEXT_VARIANT_STRING, "").trim(); + String line = + original.replace(EMOJI_VARIANT_STRING, "").replace(TEXT_VARIANT_STRING, "").trim(); if (line.isEmpty()) { return line; } @@ -666,7 +776,7 @@ public static String getLabelFromLine(Output> newLabel, String origi } if (tabPos >= 0) { newLabel.value.clear(); - String[] temp = line.substring(0,tabPos).trim().split(",\\s*"); + String[] temp = line.substring(0, tabPos).trim().split(",\\s*"); for (String part : temp) { if (KEYWORD_CHARS.containsAll(part)) { newLabel.value.add(part); @@ -678,7 +788,8 @@ public static String getLabelFromLine(Output> newLabel, String origi } return line; } - // private static final Transform WINDOWS_URL = new Transform() { + // private static final Transform WINDOWS_URL = new Transform() + // { // public String transform(String s) { // String base = "images /windows/windows_"; // String separator = "_"; @@ -692,30 +803,38 @@ public static String getEmojiSequence(String line, int i) { int firstCodepoint = line.codePointAt(i); int firstLen = Character.charCount(firstCodepoint); if (i + firstLen == line.length()) { - return line.substring(i, i+firstLen); + return line.substring(i, i + firstLen); } - int secondCodepoint = line.codePointAt(i+firstLen); + int secondCodepoint = line.codePointAt(i + firstLen); int secondLen = Character.charCount(secondCodepoint); if (secondCodepoint == ENCLOSING_KEYCAP || 
(isRegionalIndicator(firstCodepoint) && isRegionalIndicator(secondCodepoint))) { - return line.substring(i, i+firstLen+secondLen); + return line.substring(i, i + firstLen + secondLen); } - if (i+firstLen+secondLen == line.length()) { - return line.substring(i, i+firstLen); + if (i + firstLen + secondLen == line.length()) { + return line.substring(i, i + firstLen); } if (secondCodepoint == Emoji.JOINER) { - return line.substring(i, i+firstLen+secondLen) + getEmojiSequence(line, i+firstLen+secondLen); + return line.substring(i, i + firstLen + secondLen) + + getEmojiSequence(line, i + firstLen + secondLen); } - return line.substring(i, i+firstLen); + return line.substring(i, i + firstLen); } - static final UnicodeSet U80 = new UnicodeSet("[🌭🌮🌯🍾🍿🏏🏐🏑🏒🏓🏸🏹🏺🏻🏼🏽🏾🏿📿🕋🕌🕍🕎🙃🙄🛐🤀🤐🤑🤒🤓🤔🤕🤖🤗🤘🦀🦁🦂🦃🦄🧀]").freeze(); - static final UnicodeSet U90 = new UnicodeSet("[\\x{1F57A} \\x{1F5A4} \\x{1F6D1} \\x{1F6F4} \\x{1F6F5} \\x{1F919} \\x{1F91A} \\x{1F91B} \\x{1F91C} \\x{1F91D} \\x{1F91E} \\x{1F920} \\x{1F921} \\x{1F922} \\x{1F923} \\x{1F924} \\x{1F925} \\x{1F926} \\x{1F930} \\x{1F933} \\x{1F934} \\x{1F935} \\x{1F936} \\x{1F937} \\x{1F940} \\x{1F942} \\x{1F950} \\x{1F951} \\x{1F952} \\x{1F953} \\x{1F954} \\x{1F955} \\x{1F985} \\x{1F986} \\x{1F987} \\x{1F988} \\x{1F989} \\x{1F98A}]").freeze(); + static final UnicodeSet U80 = + new UnicodeSet( + "[🌭🌮🌯🍾🍿🏏🏐🏑🏒🏓🏸🏹🏺🏻🏼🏽🏾🏿📿🕋🕌🕍🕎🙃🙄🛐🤀🤐🤑🤒🤓🤔🤕🤖🤗🤘🦀🦁🦂🦃🦄🧀]") + .freeze(); + static final UnicodeSet U90 = + new UnicodeSet( + "[\\x{1F57A} \\x{1F5A4} \\x{1F6D1} \\x{1F6F4} \\x{1F6F5} \\x{1F919} \\x{1F91A} \\x{1F91B} \\x{1F91C} \\x{1F91D} \\x{1F91E} \\x{1F920} \\x{1F921} \\x{1F922} \\x{1F923} \\x{1F924} \\x{1F925} \\x{1F926} \\x{1F930} \\x{1F933} \\x{1F934} \\x{1F935} \\x{1F936} \\x{1F937} \\x{1F940} \\x{1F942} \\x{1F950} \\x{1F951} \\x{1F952} \\x{1F953} \\x{1F954} \\x{1F955} \\x{1F985} \\x{1F986} \\x{1F987} \\x{1F988} \\x{1F989} \\x{1F98A}]") + .freeze(); public static final Transliterator UNESCAPE = Transliterator.getInstance("hex-any/Perl"); static String 
getImageFilenameFromChars(Emoji.Source type, String chars) { - chars = chars.replace(Emoji.EMOJI_VARIANT_STRING,""); - // if (type == Emoji.Source.android && Emoji.SKIP_ANDROID.contains(chars)) { // hack to exclude certain android + chars = chars.replace(Emoji.EMOJI_VARIANT_STRING, ""); + // if (type == Emoji.Source.android && Emoji.SKIP_ANDROID.contains(chars)) { // hack + // to exclude certain android // return null; // } if (type == Source.charOverride) { @@ -746,9 +865,10 @@ static String getFlagCode(String chars) { return null; } secondCodepoint = chars.codePointAt(2); - String cc = (char) (firstCodepoint - FIRST_REGIONAL + 'A') - + "" - + (char) (secondCodepoint - FIRST_REGIONAL + 'A'); + String cc = + (char) (firstCodepoint - FIRST_REGIONAL + 'A') + + "" + + (char) (secondCodepoint - FIRST_REGIONAL + 'A'); // String remapped = REMAP_FLAGS.get(cc); // if (remapped != null) { // cc = remapped; @@ -759,8 +879,8 @@ static String getFlagCode(String chars) { return cc; } - static public File getImageFile(Source type, String chars) { - chars = chars.replace(Emoji.EMOJI_VARIANT_STRING,""); + public static File getImageFile(Source type, String chars) { + chars = chars.replace(Emoji.EMOJI_VARIANT_STRING, ""); String filename = getImageFilenameFromChars(type, chars); if (filename != null) { File file = new File(IMAGES_OUTPUT_DIR, filename); @@ -772,27 +892,28 @@ static public File getImageFile(Source type, String chars) { } static final UnicodeMap BEST_OVERRIDE = new UnicodeMap<>(); - static { + static { BEST_OVERRIDE.put(0x1F935, Emoji.Source.google); BEST_OVERRIDE.put(0x1F470, Emoji.Source.google); -// BEST_OVERRIDE.put("🛌", Emoji.Source.google); -// BEST_OVERRIDE.put("🛌🏻", Emoji.Source.google); -// BEST_OVERRIDE.put("🛌🏼", Emoji.Source.google); -// BEST_OVERRIDE.put("🛌🏽", Emoji.Source.google); -// BEST_OVERRIDE.put("🛌🏾", Emoji.Source.google); -// BEST_OVERRIDE.put("🛌🏿", Emoji.Source.google); + // BEST_OVERRIDE.put("🛌", Emoji.Source.google); + // 
BEST_OVERRIDE.put("🛌🏻", Emoji.Source.google); + // BEST_OVERRIDE.put("🛌🏼", Emoji.Source.google); + // BEST_OVERRIDE.put("🛌🏽", Emoji.Source.google); + // BEST_OVERRIDE.put("🛌🏾", Emoji.Source.google); + // BEST_OVERRIDE.put("🛌🏿", Emoji.Source.google); BEST_OVERRIDE.put(0x1F635, Emoji.Source.fb); -// BEST_OVERRIDE.put(0x1F917, Emoji.Source.emojione); - -// BEST_OVERRIDE.put(0x1FA72, Emoji.Source.proposed); -// BEST_OVERRIDE.put(0x1FA78, Emoji.Source.proposed); + // BEST_OVERRIDE.put(0x1F917, Emoji.Source.emojione); + // BEST_OVERRIDE.put(0x1FA72, Emoji.Source.proposed); + // BEST_OVERRIDE.put(0x1FA78, Emoji.Source.proposed); - // BEST_OVERRIDE.putAll(new UnicodeSet("[⛹🏃🏄🏊-🏌👨👩👮👯👱👳👷💁💂💆💇🕵🙅-🙇🙋🙍🙎🚣🚴-🚶🤦🤷-🤹🤼-🤾]"), Emoji.Source.google); + // BEST_OVERRIDE.putAll(new + // UnicodeSet("[⛹🏃🏄🏊-🏌👨👩👮👯👱👳👷💁💂💆💇🕵🙅-🙇🙋🙍🙎🚣🚴-🚶🤦🤷-🤹🤼-🤾]"), + // Emoji.Source.google); BEST_OVERRIDE.freeze(); } @@ -829,181 +950,195 @@ public static Iterable orderedEnum(Source... doFirst) { return ordered; } - public static final IndexUnicodeProperties LATEST = IndexUnicodeProperties.make(VERSION_TO_GENERATE_UNICODE); - public static final IndexUnicodeProperties BETA = IS_BETA - ? IndexUnicodeProperties.make(VERSION_BETA_UNICODE) : LATEST; + public static final IndexUnicodeProperties LATEST = + IndexUnicodeProperties.make(VERSION_TO_GENERATE_UNICODE); + public static final IndexUnicodeProperties BETA = + IS_BETA ? IndexUnicodeProperties.make(VERSION_BETA_UNICODE) : LATEST; - static final UnicodeMap VERSION_ENUM = BETA.loadEnum(UcdProperty.Age, Age_Values.class); + static final UnicodeMap VERSION_ENUM = + BETA.loadEnum(UcdProperty.Age, Age_Values.class); - // Certain resources we always load from latest. + // Certain resources we always load from latest. 
- static final UnicodeMap NAME = BETA.load(UcdProperty.Name); + static final UnicodeMap NAME = BETA.load(UcdProperty.Name); - public static final LocaleDisplayNames LOCALE_DISPLAY = LocaleDisplayNames.getInstance(ULocale.ENGLISH); + public static final LocaleDisplayNames LOCALE_DISPLAY = + LocaleDisplayNames.getInstance(ULocale.ENGLISH); - static final transient Collection output = new TreeSet(Collections.reverseOrder()); + static final transient Collection output = new TreeSet(Collections.reverseOrder()); -// static Age_Values getNewest(String s) { -// synchronized (Emoji.output) { -// Emoji.getValues(s, VERSION_ENUM, Emoji.output); -// return Emoji.output.iterator().next(); -// } -// } + // static Age_Values getNewest(String s) { + // synchronized (Emoji.output) { + // Emoji.getValues(s, VERSION_ENUM, Emoji.output); + // return Emoji.output.iterator().next(); + // } + // } - // should be method on UnicodeMap - static final > C getValues(String source, UnicodeMap data, C output) { - output.clear(); - for (int cp : CharSequences.codePoints(source)) { - T datum = data.get(cp); - if (datum != null) { - output.add(datum); - } - } - return output; + // should be method on UnicodeMap + static final > C getValues( + String source, UnicodeMap data, C output) { + output.clear(); + for (int cp : CharSequences.codePoints(source)) { + T datum = data.get(cp); + if (datum != null) { + output.add(datum); } + } + return output; + } - static final String INTERNAL_OUTPUT_DIR = Settings.Output.GEN_DIR + "emoji/" + VERSION_TO_GENERATE + "/"; - public static final String HEALTHCARE = "⚕"; - public static final String UN = "🇺🇳"; + static final String INTERNAL_OUTPUT_DIR = + Settings.Output.GEN_DIR + "emoji/" + VERSION_TO_GENERATE + "/"; + public static final String HEALTHCARE = "⚕"; + public static final String UN = "🇺🇳"; - public static String toUHex(String s) { - return "U+" + Utility.hex(s, " U+"); - } + public static String toUHex(String s) { + return "U+" + Utility.hex(s, " U+"); 
+ } - public static String getFlagRegionName(String s) { - String result = Emoji.getFlagCode(s); - if (result != null) { - result = Emoji.LOCALE_DISPLAY.regionDisplayName(result); - if (result.endsWith(" SAR China")) { - result = result.substring(0, result.length() - " SAR China".length()); - } else if (result.contains("(")) { - result = result.substring(0, result.indexOf('(')) + result.substring(result.lastIndexOf(')') + 1); - } - result = result.replaceAll("\\s\\s+", " ").trim(); - } - return result; + public static String getFlagRegionName(String s) { + String result = Emoji.getFlagCode(s); + if (result != null) { + result = Emoji.LOCALE_DISPLAY.regionDisplayName(result); + if (result.endsWith(" SAR China")) { + result = result.substring(0, result.length() - " SAR China".length()); + } else if (result.contains("(")) { + result = + result.substring(0, result.indexOf('(')) + + result.substring(result.lastIndexOf(')') + 1); } + result = result.replaceAll("\\s\\s+", " ").trim(); + } + return result; + } - // public static void main(String[] args) { - // if (!EMOJI_CHARS.containsAll(Unicode8Emoji)) { - // throw new IllegalArgumentException(); - // } - // if (!EMOJI_CHARS.contains("🗨")) { - // throw new IllegalArgumentException(); - // } - // System.out.println(Source.fbm + " " + Source.fbm.shortName()); - // System.out.println("Singletons:\n" + EMOJI_SINGLETONS.toPattern(false)); - // System.out.println("Without flags:\n" + EMOJI_CHARS_WITHOUT_FLAGS.toPattern(false)); - // System.out.println("Flags:\n" + FLAGS.toPattern(false)); - // System.out.println("With flags:\n" + EMOJI_CHARS.toPattern(false)); - // System.out.println("FLAT:\n" + EMOJI_CHARS_FLAT.toPattern(false)); - // System.out.println("FLAT:\n" + EMOJI_CHARS_FLAT.toPattern(true)); - // } - - public static String show(String key) { - StringBuilder b = new StringBuilder(); - for (int cp : CharSequences.codePoints(key)) { - if (b.length() != 0) { - b.append(' '); - } - b.append("U+" + Utility.hex(cp) + " " + 
UTF16.valueOf(cp)); - } - return b.toString(); - } + // public static void main(String[] args) { + // if (!EMOJI_CHARS.containsAll(Unicode8Emoji)) { + // throw new IllegalArgumentException(); + // } + // if (!EMOJI_CHARS.contains("🗨")) { + // throw new IllegalArgumentException(); + // } + // System.out.println(Source.fbm + " " + Source.fbm.shortName()); + // System.out.println("Singletons:\n" + EMOJI_SINGLETONS.toPattern(false)); + // System.out.println("Without flags:\n" + EMOJI_CHARS_WITHOUT_FLAGS.toPattern(false)); + // System.out.println("Flags:\n" + FLAGS.toPattern(false)); + // System.out.println("With flags:\n" + EMOJI_CHARS.toPattern(false)); + // System.out.println("FLAT:\n" + EMOJI_CHARS_FLAT.toPattern(false)); + // System.out.println("FLAT:\n" + EMOJI_CHARS_FLAT.toPattern(true)); + // } - public static final String TR51_HTML_BETA = "../../reports/tr51/proposed.html"; - public static final String TR51_HTML = IS_BETA || USE_PROPOSED ? TR51_HTML_BETA : "https://unicode.org/reports/tr51/tr51.html"; + public static String show(String key) { + StringBuilder b = new StringBuilder(); + for (int cp : CharSequences.codePoints(key)) { + if (b.length() != 0) { + b.append(' '); + } + b.append("U+" + Utility.hex(cp) + " " + UTF16.valueOf(cp)); + } + return b.toString(); + } + public static final String TR51_HTML_BETA = "../../reports/tr51/proposed.html"; + public static final String TR51_HTML = + IS_BETA || USE_PROPOSED ? 
TR51_HTML_BETA : "https://unicode.org/reports/tr51/tr51.html"; - public static String getHexFromSubdivision(String string) { - string = string.toLowerCase(Locale.ROOT).replace("-",""); - StringBuilder result = new StringBuilder().appendCodePoint(0x1F3F4); - for (int cp : CharSequences.codePoints(string)) { - result.appendCodePoint(TAG_BASE + cp); - } - return result.appendCodePoint(TAG_TERM_CHAR).toString(); - } + public static String getHexFromSubdivision(String string) { + string = string.toLowerCase(Locale.ROOT).replace("-", ""); + StringBuilder result = new StringBuilder().appendCodePoint(0x1F3F4); + for (int cp : CharSequences.codePoints(string)) { + result.appendCodePoint(TAG_BASE + cp); + } + return result.appendCodePoint(TAG_TERM_CHAR).toString(); + } - public static String getShortName(VersionInfo versionInfo) { - return versionInfo.getVersionString(2, 2); - } + public static String getShortName(VersionInfo versionInfo) { + return versionInfo.getVersionString(2, 2); + } - public static String getShortName(Age_Values versionInfo) { - return versionInfo.getShortName(); - } + public static String getShortName(Age_Values versionInfo) { + return versionInfo.getShortName(); + } - public static boolean isSingleCodePoint(String nvs) { - int cp = nvs.codePointAt(0); - return Character.charCount(cp) == nvs.length(); - } + public static boolean isSingleCodePoint(String nvs) { + int cp = nvs.codePointAt(0); + return Character.charCount(cp) == nvs.length(); + } - public static final UnicodeSet ARIB = new UnicodeSet( - "[²³¼-¾࿖‼⁉ℓ№℡℻⅐-⅛Ⅰ-Ⅻ↉ ①-⑿⒈-⒓ⒹⓈ⓫⓬▶◀☀-☃☎☓☔☖☗♠ ♣♥♦♨♬⚓⚞⚟⚡⚾⚿⛄-⛿✈❶-❿➡⟐⨀ ⬅-⬇⬛⬤⬮⬯〒〖〗〶㈪-㈳㈶㈷㈹㉄-㉏㉑-㉛ ㊋㊙�㍱㍻-㍾㎏㎐㎝㎞㎠-㎢㎤㎥㏊円年日月 🄀-🄊🄐-🄭🄱🄽🄿🅂🅆🅊-🅏🅗🅟🅹🅻🅼🅿🆊-🆍 🈀🈐-🈰🉀-🉈]") + public static final UnicodeSet ARIB = + new UnicodeSet( + "[²³¼-¾࿖‼⁉ℓ№℡℻⅐-⅛Ⅰ-Ⅻ↉ ①-⑿⒈-⒓ⒹⓈ⓫⓬▶◀☀-☃☎☓☔☖☗♠ ♣♥♦♨♬⚓⚞⚟⚡⚾⚿⛄-⛿✈❶-❿➡⟐⨀ ⬅-⬇⬛⬤⬮⬯〒〖〗〶㈪-㈳㈶㈷㈹㉄-㉏㉑-㉛ ㊋㊙�㍱㍻-㍾㎏㎐㎝㎞㎠-㎢㎤㎥㏊円年日月 🄀-🄊🄐-🄭🄱🄽🄿🅂🅆🅊-🅏🅗🅟🅹🅻🅼🅿🆊-🆍 🈀🈐-🈰🉀-🉈]") .freeze(); - public static final UnicodeSet DINGBATS = new UnicodeSet( - 
"[\u2194\u2195\u260E\u261B\u261E\u2660\u2663\u2665\u2666\u2701-\u2704\u2706-\u2709\u270C-\u2712\u2714-\u2718\u2733\u2734\u2744\u2747\u2762-\u2767\u27A1]") + public static final UnicodeSet DINGBATS = + new UnicodeSet( + "[\u2194\u2195\u260E\u261B\u261E\u2660\u2663\u2665\u2666\u2701-\u2704\u2706-\u2709\u270C-\u2712\u2714-\u2718\u2733\u2734\u2744\u2747\u2762-\u2767\u27A1]") .freeze(); - public static final UnicodeMap DING_MAP = new UnicodeMap<>(); - static { - for (String line : FileUtilities.in(GenerateEmoji.class, "dings.txt")) { - line = line.trim(); - if (line.isEmpty() || line.startsWith("#")) { - continue; - } - String[] parts = line.split("\\s*;\\s*"); - DING_MAP.put(Integer.parseInt(parts[0], 16), Integer.parseInt(parts[1], 16)); - } - DING_MAP.freeze(); + public static final UnicodeMap DING_MAP = new UnicodeMap<>(); + + static { + for (String line : FileUtilities.in(GenerateEmoji.class, "dings.txt")) { + line = line.trim(); + if (line.isEmpty() || line.startsWith("#")) { + continue; } + String[] parts = line.split("\\s*;\\s*"); + DING_MAP.put(Integer.parseInt(parts[0], 16), Integer.parseInt(parts[1], 16)); + } + DING_MAP.freeze(); + } - static final UnicodeMap WHITESPACE = Emoji.LATEST.load(UcdProperty.White_Space); + static final UnicodeMap WHITESPACE = Emoji.LATEST.load(UcdProperty.White_Space); - public static final UnicodeSet JSOURCES = new UnicodeSet(); - private static final boolean DEBUG = false; - static { - UnicodeMap dcmProp = Emoji.LATEST.load(UcdProperty.Emoji_DCM); - UnicodeMap kddiProp = Emoji.LATEST.load(UcdProperty.Emoji_KDDI); - UnicodeMap sbProp = Emoji.LATEST.load(UcdProperty.Emoji_SB); - checkDuplicates(dcmProp, kddiProp, sbProp); - JSOURCES.addAll(dcmProp.keySet()) + public static final UnicodeSet JSOURCES = new UnicodeSet(); + private static final boolean DEBUG = false; + + static { + UnicodeMap dcmProp = Emoji.LATEST.load(UcdProperty.Emoji_DCM); + UnicodeMap kddiProp = Emoji.LATEST.load(UcdProperty.Emoji_KDDI); + UnicodeMap sbProp = 
Emoji.LATEST.load(UcdProperty.Emoji_SB); + checkDuplicates(dcmProp, kddiProp, sbProp); + JSOURCES.addAll(dcmProp.keySet()) .addAll(kddiProp.keySet()) .addAll(sbProp.keySet()) .removeAll(WHITESPACE.getSet(UcdPropertyValues.Binary.Yes.toString())) // HACK - .addAll(new UnicodeSet( - "[{0️⃣} {1️⃣} {2️⃣} {3️⃣} {4️⃣} {5️⃣} {6️⃣} {7️⃣} {8️⃣} {9️⃣} {#️⃣} {🇨🇳} {🇩🇪} {🇪🇸} {🇫🇷} {🇬🇧} {🇮🇹} {🇯🇵} {🇰🇷} {🇷🇺} {🇺🇸}]")) + .addAll( + new UnicodeSet( + "[{0️⃣} {1️⃣} {2️⃣} {3️⃣} {4️⃣} {5️⃣} {6️⃣} {7️⃣} {8️⃣} {9️⃣} {#️⃣} {🇨🇳} {🇩🇪} {🇪🇸} {🇫🇷} {🇬🇧} {🇮🇹} {🇯🇵} {🇰🇷} {🇷🇺} {🇺🇸}]")) .freeze(); - // if (true) - // System.out.println("Core:\t" + JSOURCES.size() + "\t" + JSOURCES); - } + // if (true) + // System.out.println("Core:\t" + JSOURCES.size() + "\t" + JSOURCES); + } - private static void checkDuplicates(UnicodeMap dcmProp, UnicodeMap kddiProp, - UnicodeMap sbProp) { - Relation carrierToUnicode = Relation.of(new TreeMap(), TreeSet.class); - for (Entry unicodeToCarrier : dcmProp.entrySet()) { - carrierToUnicode.put(unicodeToCarrier.getValue(), unicodeToCarrier.getKey()); - } - for (Entry unicodeToCarrier : kddiProp.entrySet()) { - carrierToUnicode.put(unicodeToCarrier.getValue(), unicodeToCarrier.getKey()); - } - for (Entry unicodeToCarrier : sbProp.entrySet()) { - carrierToUnicode.put(unicodeToCarrier.getValue(), unicodeToCarrier.getKey()); - } - int count = 0; - for (Entry> carrierAndUnicodes : carrierToUnicode.keyValuesSet()) { - Set unicodes = carrierAndUnicodes.getValue(); - if (unicodes.size() > 1) { - if (DEBUG) - System.out.println(++count); - for (String s : unicodes) { - if (DEBUG) - System.out.println(carrierAndUnicodes.getKey() + "\tU+" + Utility.hex(s, " U+") + "\t" + private static void checkDuplicates( + UnicodeMap dcmProp, UnicodeMap kddiProp, UnicodeMap sbProp) { + Relation carrierToUnicode = Relation.of(new TreeMap(), TreeSet.class); + for (Entry unicodeToCarrier : dcmProp.entrySet()) { + carrierToUnicode.put(unicodeToCarrier.getValue(), unicodeToCarrier.getKey()); + } + for 
(Entry unicodeToCarrier : kddiProp.entrySet()) { + carrierToUnicode.put(unicodeToCarrier.getValue(), unicodeToCarrier.getKey()); + } + for (Entry unicodeToCarrier : sbProp.entrySet()) { + carrierToUnicode.put(unicodeToCarrier.getValue(), unicodeToCarrier.getKey()); + } + int count = 0; + for (Entry> carrierAndUnicodes : carrierToUnicode.keyValuesSet()) { + Set unicodes = carrierAndUnicodes.getValue(); + if (unicodes.size() > 1) { + if (DEBUG) System.out.println(++count); + for (String s : unicodes) { + if (DEBUG) + System.out.println( + carrierAndUnicodes.getKey() + + "\tU+" + + Utility.hex(s, " U+") + + "\t" + UCharacter.getName(s, " + ")); - } - } } } - + } + } } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiAnnotations.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiAnnotations.java index 4b2bb0264..7a680de72 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiAnnotations.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiAnnotations.java @@ -1,5 +1,12 @@ package org.unicode.tools.emoji; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.text.SimpleFormatter; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.Output; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; @@ -9,44 +16,42 @@ import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; - import org.unicode.cldr.util.Annotations; import org.unicode.cldr.util.Annotations.AnnotationSet; import org.unicode.cldr.util.CLDRFile; import org.unicode.cldr.util.CldrUtility; import org.unicode.text.utility.Birelation; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.text.SimpleFormatter; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; 
-import com.ibm.icu.util.Output; - -public class EmojiAnnotations extends Birelation { - public enum Status {missing, gender, constructed, found} +public class EmojiAnnotations extends Birelation { + public enum Status { + missing, + gender, + constructed, + found + } - final Map TO_UNICODE_SET; - final private UnicodeMap shortNames = new UnicodeMap<>(); - final private UnicodeMap statusValues = new UnicodeMap<>(); + final Map TO_UNICODE_SET; + private final UnicodeMap shortNames = new UnicodeMap<>(); + private final UnicodeMap statusValues = new UnicodeMap<>(); // Add to CLDR private static final UnicodeSet MALE_SET = new UnicodeSet("[👦 👨 👴 🎅 🤴 🤵 👲🕴 🕺]"); private static final UnicodeSet FEMALE_SET = new UnicodeSet("[ 👧 👩 👵 🤶👸👰🤰💃]"); - // private static final Splitter TAB = Splitter.on("\t").trimResults(); // // private static final boolean SHOW = false; - // private static final Set STOP_WORDS = new HashSet<>(Arrays.asList("the", "of", "for", "a", "and", "state", - // "c�te", "verde▪cape", "dhekelia", "akrotiri", "comros", "pdr", "jamahiriya", "part", + // private static final Set STOP_WORDS = new HashSet<>(Arrays.asList("the", "of", + // "for", "a", "and", "state", + // "c�te", "verde▪cape", "dhekelia", "akrotiri", "comros", "pdr", "jamahiriya", + // "part", // "yugoslav", "tfyr", "autonomous", "rawanda", "da", "rb", "yugoslavia", // "states", "sar", "people's", "minor", // "sts.")); - public static final EmojiAnnotations ANNOTATIONS_TO_CHARS = new EmojiAnnotations("en", EmojiOrder.STD_ORDER.codepointCompare); + public static final EmojiAnnotations ANNOTATIONS_TO_CHARS = + new EmojiAnnotations("en", EmojiOrder.STD_ORDER.codepointCompare); /** * @deprecated Use {@link #EmojiAnnotations(String,Comparator,String...)} instead */ @@ -54,12 +59,14 @@ public EmojiAnnotations(Comparator codepointCompare, String... filenames this("en", codepointCompare, filenames); } - public EmojiAnnotations(String localeString, Comparator codepointCompare, String... 
filenames) { - super(new TreeMap(EmojiOrder.UCA_PLUS_CODEPOINT), - new HashMap(), - TreeSet.class, - TreeSet.class, - EmojiOrder.UCA_COLLATOR, + public EmojiAnnotations( + String localeString, Comparator codepointCompare, String... filenames) { + super( + new TreeMap(EmojiOrder.UCA_PLUS_CODEPOINT), + new HashMap(), + TreeSet.class, + TreeSet.class, + EmojiOrder.UCA_COLLATOR, EmojiOrder.UCA_PLUS_CODEPOINT); // for (String s : sorted) { @@ -67,14 +74,15 @@ public EmojiAnnotations(String localeString, Comparator codepointCompare // if (plainAnnotations.isEmpty()) { // plainAnnotations = EmojiAnnotations.ANNOTATIONS_TO_CHARS.getKeys(s); // } - // System.out.println(s + "\t" + CollectionUtilities.join(plainAnnotations, " | ")); + // System.out.println(s + "\t" + CollectionUtilities.join(plainAnnotations, " | + // ")); // } final AnnotationSet annotationData = Annotations.getDataSet(localeString); if (annotationData == null) { throw new IllegalArgumentException("No annotation data for " + localeString); } - //Loader loader = new Loader(CLDRConfig.getInstance().getCLDRFile(localeString, true), annotationData); - + // Loader loader = new Loader(CLDRConfig.getInstance().getCLDRFile(localeString, true), + // annotationData); final Set keywords = new LinkedHashSet<>(); Output outShortName = new Output<>(); @@ -83,7 +91,8 @@ public EmojiAnnotations(String localeString, Comparator codepointCompare final String sNoVariants = s.replace(Emoji.EMOJI_VARIANT_STRING, ""); outShortName.value = annotationData.getShortName(s); Status status = Status.found; - if (outShortName.value == null || outShortName.value.contains(Annotations.ENGLISH_MARKER)) { + if (outShortName.value == null + || outShortName.value.contains(Annotations.ENGLISH_MARKER)) { status = Status.missing; } else if (outShortName.value.contains(Annotations.BAD_MARKER)) { status = Status.gender; @@ -100,7 +109,7 @@ public EmojiAnnotations(String localeString, Comparator codepointCompare add(annotation, sNoVariants); } } - 
statusValues.put(s,status); + statusValues.put(s, status); shortNames.put(s, outShortName.value); if (!s.equals(sNoVariants)) { shortNames.put(sNoVariants, outShortName.value); @@ -115,11 +124,13 @@ public EmojiAnnotations(String localeString, Comparator codepointCompare // add(annotation, s); // } // } - //Output> lastLabel = new Output>(new TreeSet(codepointCompare)); + // Output> lastLabel = new Output>(new + // TreeSet(codepointCompare)); // for (String filename : filenames) { // int lineCount = 0; // int lineNumber = 0; - // EmojiIterator ei = new EmojiIterator(EmojiData.of(Emoji.VERSION_LAST_RELEASED), true); + // EmojiIterator ei = new + // EmojiIterator(EmojiData.of(Emoji.VERSION_LAST_RELEASED), true); // // for (String line : FileUtilities.in(EmojiAnnotations.class, filename)) { // line = line.trim(); @@ -133,9 +144,11 @@ public EmojiAnnotations(String localeString, Comparator codepointCompare // lineCount++; // for (String string : ei.set(line)) { // if (Emoji.ASCII_LETTERS.containsSome(string)) { - // UnicodeSet overlap = new UnicodeSet().addAll(string).retainAll(Emoji.ASCII_LETTERS); + // UnicodeSet overlap = new + // UnicodeSet().addAll(string).retainAll(Emoji.ASCII_LETTERS); // String withPosition = line.replaceAll("("+overlap+")", "###$1"); - // throw new IllegalArgumentException(lineNumber + "\tStrange line with ASCII emoji: " + overlap + "; "+ withPosition); + // throw new IllegalArgumentException(lineNumber + "\tStrange line + // with ASCII emoji: " + overlap + "; "+ withPosition); // } // if (EmojiData.EMOJI_DATA.skipEmojiSequence(string)) { // continue; @@ -146,7 +159,8 @@ public EmojiAnnotations(String localeString, Comparator codepointCompare // } // } // } - // if (SHOW) System.out.println(lineCount + "\tannotation lines from " + filename); + // if (SHOW) System.out.println(lineCount + "\tannotation lines from " + + // filename); // } // addOther("-apple", EmojiData.EMOJI_DATA.getChars()); // addOther("-android", 
EmojiData.EMOJI_DATA.getChars()); @@ -178,7 +192,6 @@ public EmojiAnnotations(String localeString, Comparator codepointCompare // add("other", s); // } - // for (String s : FITZ_MINIMAL) { // ANNOTATIONS_TO_CHARS.add("fitz-minimal", s); // } @@ -194,7 +207,8 @@ public EmojiAnnotations(String localeString, Comparator codepointCompare // } // for (int cp1 = Emoji.FIRST_REGIONAL; cp1 <= Emoji.LAST_REGIONAL; ++cp1) { // for (int cp2 = Emoji.FIRST_REGIONAL; cp2 <= Emoji.LAST_REGIONAL; ++cp2) { - // String emoji = new StringBuilder().appendCodePoint(cp1).appendCodePoint(cp2).toString(); + // String emoji = new + // StringBuilder().appendCodePoint(cp1).appendCodePoint(cp2).toString(); // if (EmojiData.EMOJI_DATA.getChars().contains(emoji)) { // add("flag", emoji); // } @@ -207,7 +221,7 @@ public EmojiAnnotations(String localeString, Comparator codepointCompare // if (regionCode == null || regionCode.length() != 2) { // continue; // } - // if (regionCode.equals("RS") + // if (regionCode.equals("RS") // && name.contains("montenegro")) { // continue; // } @@ -216,15 +230,17 @@ public EmojiAnnotations(String localeString, Comparator codepointCompare // addParts(emoji, name); // } freeze(); - Map _TO_UNICODE_SET = new HashMap<>(); + Map _TO_UNICODE_SET = new HashMap<>(); for (Entry> entry : this.keyValuesSet()) { _TO_UNICODE_SET.put(entry.getKey(), new UnicodeSet().addAll(entry.getValue()).freeze()); } TO_UNICODE_SET = Collections.unmodifiableMap(_TO_UNICODE_SET); // UnicodeSet annotationCharacters = new UnicodeSet().addAll(valuesSet()); // if (!annotationCharacters.containsAll(EmojiData.EMOJI_DATA.getChars())) { - // UnicodeSet missing = new UnicodeSet().addAll(EmojiData.EMOJI_DATA.getChars()).removeAll(annotationCharacters); - // throw new IllegalArgumentException("Missing annotations: " + missing.toPattern(false)); + // UnicodeSet missing = new + // UnicodeSet().addAll(EmojiData.EMOJI_DATA.getChars()).removeAll(annotationCharacters); + // throw new 
IllegalArgumentException("Missing annotations: " + + // missing.toPattern(false)); // } } @@ -291,11 +307,12 @@ static class Loader { final boolean isEnglish; private final SimpleFormatter KEYCAP_PATTERN; private final SimpleFormatter COMBINE_PATTERN; - - static final UnicodeSet FAMILY_PLUS = new UnicodeSet(Emoji.FAMILY_MARKERS) - .add(Emoji.JOINER) - .add(Emoji.EMOJI_VARIANT) - .freeze(); + + static final UnicodeSet FAMILY_PLUS = + new UnicodeSet(Emoji.FAMILY_MARKERS) + .add(Emoji.JOINER) + .add(Emoji.EMOJI_VARIANT) + .freeze(); static final String KISS = "\u2764\uFE0F\u200D\uD83D\uDC8B\u200D"; static final String HEART = "\u2764\uFE0F\u200D"; @@ -304,29 +321,39 @@ public Loader(CLDRFile cldrFile, UnicodeMap data) { this.isEnglish = cldrFile.getLocaleID().equals("en"); this.data = data; // {0}, {1} type="unit-short" - this.KEYCAP_PATTERN = SimpleFormatter.compile(cldrFile.getStringValue("//ldml/characterLabels/characterLabelPattern[@type=\"keycap\"]")); - this.COMBINE_PATTERN = SimpleFormatter.compile(cldrFile.getStringValue("//ldml/characterLabels/characterLabelPattern[@type=\"category-list\"]")); - sep = SimpleFormatter.compile(cldrFile.getStringValue("//ldml/listPatterns/listPattern[@type=\"unit-short\"]/listPatternPart[@type=\"2\"]")); + this.KEYCAP_PATTERN = + SimpleFormatter.compile( + cldrFile.getStringValue( + "//ldml/characterLabels/characterLabelPattern[@type=\"keycap\"]")); + this.COMBINE_PATTERN = + SimpleFormatter.compile( + cldrFile.getStringValue( + "//ldml/characterLabels/characterLabelPattern[@type=\"category-list\"]")); + sep = + SimpleFormatter.compile( + cldrFile.getStringValue( + "//ldml/listPatterns/listPattern[@type=\"unit-short\"]/listPatternPart[@type=\"2\"]")); } - private Status getNameAndAnnotations(String s, final Set keywordsToAppendTo, Output outShortName) { + private Status getNameAndAnnotations( + String s, final Set keywordsToAppendTo, Output outShortName) { if (s.equals("💃")) { int debug = 0; } outShortName.value = null; // TODO 
put into CLDR if (isEnglish) { - switch(s) { - case Emoji.UN: - outShortName.value = "United Nations"; - keywordsToAppendTo.add("UN"); - return Status.found; + switch (s) { + case Emoji.UN: + outShortName.value = "United Nations"; + keywordsToAppendTo.add("UN"); + return Status.found; } } Annotations datum = data.get(s); // try without variant if (datum == null) { - String s1 = s.replace(Emoji.EMOJI_VARIANT_STRING,""); + String s1 = s.replace(Emoji.EMOJI_VARIANT_STRING, ""); if (!s.equals(s1)) { datum = data.get(s1); } @@ -342,7 +369,12 @@ private Status getNameAndAnnotations(String s, final Set keywordsToAppen return outShortName.value == null ? Status.missing : Status.found; } else if (s.contains(Emoji.KEYCAP_MARK_STRING)) { Annotations keycapDatum = data.get("🔟"); - outShortName.value = fix(s, sep, outShortName.value, KEYCAP_PATTERN.format(UTF16.valueOf(s.charAt(0)))); + outShortName.value = + fix( + s, + sep, + outShortName.value, + KEYCAP_PATTERN.format(UTF16.valueOf(s.charAt(0)))); if (keycapDatum != null && keycapDatum.getShortName().contains("#")) { keywordsToAppendTo.addAll(keycapDatum.getKeywords()); } @@ -364,9 +396,10 @@ private Status getNameAndAnnotations(String s, final Set keywordsToAppen } else { status = Status.missing; } - //String type = EmojiData.shortModNameZ(rem.codePointAt(0)); + // String type = EmojiData.shortModNameZ(rem.codePointAt(0)); // if (status != Status.missing) { - // outShortName.value = sep.format(outShortName.value.toLowerCase(Locale.ENGLISH), type); + // outShortName.value = + // sep.format(outShortName.value.toLowerCase(Locale.ENGLISH), type); // keywordsToAppendTo.add(type); // } return status; @@ -374,12 +407,12 @@ private Status getNameAndAnnotations(String s, final Set keywordsToAppen // shortName = EmojiData.EMOJI_DATA.getName(s, true); Status status = Status.constructed; String base = null; - //s = s.replace(Emoji.JOINER_STRING,""); + // s = s.replace(Emoji.JOINER_STRING,""); if (s.contains(KISS)) { Annotations 
familyDatum = data.get("💏"); s = s.replace(KISS, ""); if (familyDatum != null) { - base = familyDatum.getShortName(); + base = familyDatum.getShortName(); keywordsToAppendTo.addAll(familyDatum.getKeywords()); status = Status.found; } else { @@ -389,16 +422,17 @@ private Status getNameAndAnnotations(String s, final Set keywordsToAppen Annotations familyDatum = data.get("💑"); s = s.replace(HEART, ""); if (familyDatum != null) { - base = familyDatum.getShortName(); + base = familyDatum.getShortName(); keywordsToAppendTo.addAll(familyDatum.getKeywords()); status = Status.found; } else { status = Status.missing; } } else if (FAMILY_PLUS.containsAll(s)) { - Annotations familyDatum = data.get("👪");// Familie + Annotations familyDatum = data.get("👪"); // Familie if (familyDatum != null) { - base = familyDatum.getShortName(); + base = familyDatum.getShortName(); keywordsToAppendTo.addAll(familyDatum.getKeywords()); status = Status.found; } else { @@ -407,9 +441,12 @@ private Status getNameAndAnnotations(String s, final Set keywordsToAppen } else if (Emoji.GENDER_MARKERS.containsSome(s)) { String rem = Emoji.GENDER_MARKERS.stripFrom(s, false); s = Emoji.GENDER_MARKERS.stripFrom(s, true); - Annotations familyDatum = data.get(rem.contains("♂") ? "👨" : "👩");// Familie + Annotations familyDatum = + data.get(rem.contains("♂") ? 
"👨" : "👩"); // Familie if (familyDatum != null) { - outShortName.value = fix(s, sep, outShortName.value, familyDatum.getShortName()); + outShortName.value = + fix(s, sep, outShortName.value, familyDatum.getShortName()); keywordsToAppendTo.addAll(familyDatum.getKeywords()); status = Status.gender; } else { @@ -422,7 +459,8 @@ private Status getNameAndAnnotations(String s, final Set keywordsToAppen } Annotations partDatum = data.get(cp); if (partDatum != null) { - outShortName.value = fix(s, sep, outShortName.value, partDatum.getShortName()); + outShortName.value = + fix(s, sep, outShortName.value, partDatum.getShortName()); keywordsToAppendTo.addAll(partDatum.getKeywords()); } else { outShortName.value = fix(s, sep, outShortName.value, "???"); @@ -464,7 +502,7 @@ public String getShortName(String s) { public Status getStatus(String s) { return CldrUtility.ifNull(statusValues.get(s), Status.missing); } - + public UnicodeSet getStatusKeys() { return statusValues.keySet(); } @@ -477,7 +515,8 @@ public static void main(String[] args) { // if (plainAnnotations.isEmpty()) { // plainAnnotations = EmojiAnnotations.ANNOTATIONS_TO_CHARS.getKeys(s); // } - // System.out.println(s + "\t" + CollectionUtilities.join(plainAnnotations, " | ")); + // System.out.println(s + "\t" + CollectionUtilities.join(plainAnnotations, " | + // ")); // } // UnicodeMap data = Annotations.getData("en"); // CLDRFile english = CLDRConfig.getInstance().getEnglish(); @@ -510,7 +549,7 @@ private static void show(String s, LinkedHashSet missing) { // StringBuilder newMods = new StringBuilder(); // for (int cp : CharSequences.codePoints(source)) { // if (EmojiData.MODIFIERS.contains(cp)) { - // + // // } // } // } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiData.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiData.java index aa999177d..78fc5de83 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiData.java +++ 
b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiData.java @@ -1,5 +1,23 @@ package org.unicode.tools.emoji; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultimap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Multimap; +import com.google.common.collect.TreeMultimap; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.Transform; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetSpanner; +import com.ibm.icu.util.ICUException; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.VersionInfo; import java.util.Arrays; import java.util.Collection; import java.util.Collections; @@ -14,7 +32,6 @@ import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.Annotations; import org.unicode.cldr.util.Annotations.AnnotationSet; @@ -35,28 +52,10 @@ import org.unicode.tools.emoji.Emoji.Qualified; import org.unicode.tools.emoji.EmojiOrder.MajorGroup; -import com.google.common.base.Splitter; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableMultimap; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Multimap; -import com.google.common.collect.TreeMultimap; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.BreakIterator; -import com.ibm.icu.text.Transform; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetSpanner; -import 
com.ibm.icu.util.ICUException; -import com.ibm.icu.util.ULocale; -import com.ibm.icu.util.VersionInfo; - public class EmojiData implements EmojiDataSource { // should be properties - private static final UnicodeSet MULTIPERSON = new UnicodeSet("[👯 🤼 👫-👭 💏 💑 👪 🤝]").freeze(); + private static final UnicodeSet MULTIPERSON = + new UnicodeSet("[👯 🤼 👫-👭 💏 💑 👪 🤝]").freeze(); private static final UnicodeSet EXPLICIT_HAIR = new UnicodeSet("[👱]").freeze(); static final int HANDSHAKE = 0x1f91d; @@ -64,8 +63,8 @@ public class EmojiData implements EmojiDataSource { public static final String NEUTRAL_HOLDING = "🧑‍🤝‍🧑"; public static final String SAMPLE_NEUTRAL_HOLDING_WITH_SKIN = "🧑🏼‍🤝‍🧑🏻"; public static final String MAN_WITH_RED_HAIR = "👨‍🦰"; - public static final UnicodeSet HOLDING_HANDS_COMPOSITES = new UnicodeSet().add(0x1F46B).add(0x1F46C).add(0x1F46D) - .freeze(); + public static final UnicodeSet HOLDING_HANDS_COMPOSITES = + new UnicodeSet().add(0x1F46B).add(0x1F46C).add(0x1F46D).freeze(); public static final UnicodeSet OTHER_GROUP = new UnicodeSet("[💏 💑]").freeze(); static final String ZWJ_HANDSHAKE_ZWJ = Emoji.JOINER_STR + "🤝" + Emoji.JOINER_STR; @@ -75,9 +74,14 @@ public class EmojiData implements EmojiDataSource { static final String SHAKING_HANDS = RIGHTWARDS_HAND + Emoji.JOINER_STR + LEFTWARDS_HAND; public static boolean ALLOW_UNICODE_NAME = System.getProperty("ALLOW_UNICODE_NAME") != null; - public static final UnicodeSet TAKES_NO_VARIANT = new UnicodeSet(Emoji.EMOJI_VARIANTS_JOINER) - .addAll(new UnicodeSet("[[:M:][:Variation_Selector:][:Block=Tags:]]")) // TODO fix to use indexed props - .freeze(); + public static final UnicodeSet TAKES_NO_VARIANT = + new UnicodeSet(Emoji.EMOJI_VARIANTS_JOINER) + .addAll( + new UnicodeSet( + "[[:M:][:Variation_Selector:][:Block=Tags:]]")) // TODO fix to + // use indexed + // props + .freeze(); public static final String SAMPLE_WITHOUT_TRAILING_EVS = "👮🏻‍♀"; public static final AnnotationSet ANNOTATION_SET = 
Annotations.getDataSet("en"); @@ -85,7 +89,8 @@ public class EmojiData implements EmojiDataSource { public static final UnicodeSet MODIFIERS = new UnicodeSet(0x1F3FB, 0x1F3FF).freeze(); public enum DefaultPresentation { - text, emoji + text, + emoji } private final UnicodeSet singletonsWithDefectives = new UnicodeSet(); @@ -129,9 +134,14 @@ public enum DefaultPresentation { private UnicodeSet otherHuman; private UnicodeSet genderBase; private UnicodeMap toNeutral; - static final UnicodeSetSpanner MODS_SPANNER = new UnicodeSetSpanner( - new UnicodeSet(MODIFIERS).addAll(Emoji.ZWJ_GENDER_MARKERS).addAll(Emoji.FULL_ZWJ_GENDER_MARKERS).freeze()); - public static final UnicodeSetSpanner SKIN_SPANNER = new UnicodeSetSpanner(new UnicodeSet(MODIFIERS).freeze()); + static final UnicodeSetSpanner MODS_SPANNER = + new UnicodeSetSpanner( + new UnicodeSet(MODIFIERS) + .addAll(Emoji.ZWJ_GENDER_MARKERS) + .addAll(Emoji.FULL_ZWJ_GENDER_MARKERS) + .freeze()); + public static final UnicodeSetSpanner SKIN_SPANNER = + new UnicodeSetSpanner(new UnicodeSet(MODIFIERS).freeze()); public static final Splitter semi = Splitter.onPattern("[;#]").trimResults(); public static final Splitter semiOnly = Splitter.onPattern(";").trimResults(); @@ -151,7 +161,12 @@ public static EmojiData of(VersionInfo version) { } private enum EmojiProp { - Emoji, Emoji_Presentation, Emoji_Modifier, Emoji_Modifier_Base, Emoji_Component, Extended_Pictographic + Emoji, + Emoji_Presentation, + Emoji_Modifier, + Emoji_Modifier_Base, + Emoji_Component, + Extended_Pictographic } // 0023 ; Emoji # [1] (#️) NUMBER SIGN // 231A..231B ; Emoji_Presentation # [2] (⌚️..⌛️) WATCH..HOURGLASS @@ -159,15 +174,20 @@ private enum EmojiProp { // 261D ; Emoji_Modifier_Base # [1] (☝️) WHITE UP POINTING INDEX private EmojiData(VersionInfo version) { - final UnicodeMap gc = Emoji.BETA.loadEnum(UcdProperty.General_Category, - UcdPropertyValues.General_Category_Values.class); + final UnicodeMap gc = + Emoji.BETA.loadEnum( + 
UcdProperty.General_Category, + UcdPropertyValues.General_Category_Values.class); UnicodeSet NSM = gc.getSet(UcdPropertyValues.General_Category_Values.Nonspacing_Mark); UnicodeSet EM = gc.getSet(UcdPropertyValues.General_Category_Values.Enclosing_Mark); - EnumMap _modifierClassMap = new EnumMap<>(Emoji.ModifierStatus.class); + EnumMap _modifierClassMap = + new EnumMap<>(Emoji.ModifierStatus.class); - String[] ADD_VARIANT_KEYCAPS = { Emoji.KEYCAP_MARK_STRING, - // Emoji.TEXT_VARIANT_STRING + Emoji.KEYCAP_MARK_STRING, - Emoji.EMOJI_VARIANT_STRING + Emoji.KEYCAP_MARK_STRING, }; + String[] ADD_VARIANT_KEYCAPS = { + Emoji.KEYCAP_MARK_STRING, + // Emoji.TEXT_VARIANT_STRING + Emoji.KEYCAP_MARK_STRING, + Emoji.EMOJI_VARIANT_STRING + Emoji.KEYCAP_MARK_STRING, + }; this.version = version; final String directory = @@ -183,8 +203,7 @@ private EmojiData(VersionInfo version) { // jw # V1.1 (☺) white smiling face ++lineCount; line = line.trim(); - if (line.startsWith("#") || line.isEmpty()) - continue; + if (line.startsWith("#") || line.isEmpty()) continue; if (line.startsWith("2388")) { int debug = 0; } @@ -222,13 +241,13 @@ private EmojiData(VersionInfo version) { // # emoji: default emoji presentation boolean emojiPresentation = false; switch (f1) { - case "text": - break; - case "emoji": - emojiPresentation = true; - break; - default: - throw new IllegalArgumentException(line); + case "text": + break; + case "emoji": + emojiPresentation = true; + break; + default: + throw new IllegalArgumentException(line); } // # Field 3 — Emoji_Modifier_Status: // # modifier: an emoji modifier @@ -240,17 +259,17 @@ private EmojiData(VersionInfo version) { boolean emojiModifierBase = false; switch (f3) { - case "modifier": - emojiModifier = true; - break; - case "primary": - case "secondary": - emojiModifierBase = true; - break; - case "none": - break; - default: - throw new IllegalArgumentException(line); + case "modifier": + emojiModifier = true; + break; + case "primary": + case 
"secondary": + emojiModifierBase = true; + break; + case "none": + break; + default: + throw new IllegalArgumentException(line); } for (int cp = codePoint; cp <= codePointEnd; ++cp) { emojiData.add(cp, EmojiProp.Emoji); @@ -311,8 +330,9 @@ private EmojiData(VersionInfo version) { singletonsWithoutDefectives.add(cp); } - EmojiData.DefaultPresentation styleIn = set.contains(EmojiProp.Emoji_Presentation) - ? EmojiData.DefaultPresentation.emoji + EmojiData.DefaultPresentation styleIn = + set.contains(EmojiProp.Emoji_Presentation) + ? EmojiData.DefaultPresentation.emoji : EmojiData.DefaultPresentation.text; if (styleIn == EmojiData.DefaultPresentation.emoji) { emojiPresentationSet.add(cp); @@ -320,20 +340,24 @@ private EmojiData(VersionInfo version) { textPresentationSet.add(cp); } - Emoji.ModifierStatus modClass = set.contains(EmojiProp.Emoji_Modifier) ? Emoji.ModifierStatus.modifier - : set.contains(EmojiProp.Emoji_Modifier_Base) ? Emoji.ModifierStatus.modifier_base - : Emoji.ModifierStatus.none; + Emoji.ModifierStatus modClass = + set.contains(EmojiProp.Emoji_Modifier) + ? Emoji.ModifierStatus.modifier + : set.contains(EmojiProp.Emoji_Modifier_Base) + ? 
Emoji.ModifierStatus.modifier_base + : Emoji.ModifierStatus.none; putUnicodeSetValue(_modifierClassMap, cp, modClass); - } singletonsWithDefectives.freeze(); singletonsWithoutDefectives.freeze(); emojiPresentationSet.freeze(); textPresentationSet.freeze(); - modifierBases = new UnicodeSet().addAll(_modifierClassMap.get(Emoji.ModifierStatus.modifier_base)) - // .addAll(modifierClassMap.get(ModifierStatus.secondary)) - .freeze(); + modifierBases = + new UnicodeSet() + .addAll(_modifierClassMap.get(Emoji.ModifierStatus.modifier_base)) + // .addAll(modifierClassMap.get(ModifierStatus.secondary)) + .freeze(); if (!modifierBases.contains(0x1F90C)) { if (version.compareTo(Emoji.VERSION13) >= 0) { int debug = 0; @@ -371,8 +395,14 @@ private EmojiData(VersionInfo version) { modifierBasesRgi = new UnicodeSet(); modifierSequences = new UnicodeSet(); // HACK 1F441 200D 1F5E8 - zwjSequencesAll.add(new StringBuilder().appendCodePoint(0x1F441).appendCodePoint(0xFE0F) - .appendCodePoint(0x200D).appendCodePoint(0x1F5E8).appendCodePoint(0xFE0F).toString()); + zwjSequencesAll.add( + new StringBuilder() + .appendCodePoint(0x1F441) + .appendCodePoint(0xFE0F) + .appendCodePoint(0x200D) + .appendCodePoint(0x1F5E8) + .appendCodePoint(0xFE0F) + .toString()); // VariantFactory vf = new VariantFactory(); UnicodeSet debugSet = new UnicodeSet("[\\x{1F48F}\\x{1F491}]").freeze(); @@ -404,7 +434,14 @@ private EmojiData(VersionInfo version) { } if (EmojiData.MODIFIERS.contains(cp1)) { if (last < 0) { - throw new IllegalArgumentException("In " + file + ", modifier " + Utility.hex(last) + " " + UTF16.valueOf(cp1) + "not following base "); + throw new IllegalArgumentException( + "In " + + file + + ", modifier " + + Utility.hex(last) + + " " + + UTF16.valueOf(cp1) + + "not following base "); } modifierBasesRgi.add(last); modifierSequences.add(With.fromCodePoint(last, cp1)); @@ -443,7 +480,8 @@ private EmojiData(VersionInfo version) { int debug = 0; } addToZwjSequencesAll(noVariant); // get non-variant 
- final Set noVariantPlusModifiers = addModifiers(noVariant, false); + final Set noVariantPlusModifiers = + addModifiers(noVariant, false); for (String modSeq : noVariantPlusModifiers) { addToZwjSequencesAll(modSeq); addName(modSeq, list); @@ -470,16 +508,20 @@ private EmojiData(VersionInfo version) { } } else if (EM.containsSome(noVariant) || NSM.containsSome(noVariant)) { final String firstString = source.substring(0, 1); - keycapSequences - .add(firstString + Emoji.EMOJI_VARIANT_STRING + Emoji.KEYCAP_MARK_STRING); + keycapSequences.add( + firstString + + Emoji.EMOJI_VARIANT_STRING + + Emoji.KEYCAP_MARK_STRING); for (String s : ADD_VARIANT_KEYCAPS) { keycapSequenceAll.add(firstString + s); } keycapBases.add(firstString); - } else if (Emoji.DEFECTIVE.contains(first) // if it starts with a defective - && !Emoji.KEYCAP_BASE.contains(noVariant) // and is not just one of the keycap - // starts in Basic_Emoji - ) { + } else if (Emoji.DEFECTIVE.contains( + first) // if it starts with a defective + && !Emoji.KEYCAP_BASE.contains( + noVariant) // and is not just one of the keycap + // starts in Basic_Emoji + ) { throw new IllegalArgumentException("Unexpected Defective"); } } @@ -501,8 +543,9 @@ private EmojiData(VersionInfo version) { } if (version.compareTo(Emoji.VERSION4) <= 0) { - UnicodeMap sv = IndexUnicodeProperties.make(Emoji.VERSION_TO_GENERATE_UNICODE) - .load(UcdProperty.Standardized_Variant); + UnicodeMap sv = + IndexUnicodeProperties.make(Emoji.VERSION_TO_GENERATE_UNICODE) + .load(UcdProperty.Standardized_Variant); for (String s : sv.keySet()) { if (s.contains(Emoji.EMOJI_VARIANT_STRING)) { emojiWithVariants.add(s.codePointAt(0)); @@ -517,8 +560,7 @@ private EmojiData(VersionInfo version) { if (hashPos >= 0) { line = line.substring(0, hashPos); } - if (line.isEmpty()) - continue; + if (line.isEmpty()) continue; List list = semi.splitToList(line); String source = Utility.fromHex(list.get(0)); @@ -535,8 +577,7 @@ private EmojiData(VersionInfo version) { // # 
Code ; Default Style ; Ordering ; Annotations ; Sources #Version Char Name // U+263A ; text ; 0 ; face, human, outlined, relaxed, smile, smiley, smiling ; // jw # V1.1 (☺) white smiling face - if (line.startsWith("#") || line.isEmpty()) - continue; + if (line.startsWith("#") || line.isEmpty()) continue; List coreList = hashOnly.splitToList(line); List list = semi.splitToList(coreList.get(0)); final String f0 = list.get(0); @@ -553,7 +594,8 @@ private EmojiData(VersionInfo version) { if (!"ExtendedPictographic".equals(prop.replace("_", ""))) { throw new IllegalArgumentException(); } - boolean negative = list.size() > 2 && "NO".equals(list.get(2).toUpperCase(Locale.ENGLISH)); + boolean negative = + list.size() > 2 && "NO".equals(list.get(2).toUpperCase(Locale.ENGLISH)); if (negative) { extendedPictographic.remove(codePoint, codePointEnd); } else { @@ -583,8 +625,11 @@ private EmojiData(VersionInfo version) { // emojiRegionalIndicators.addAll(emojiData.getKeys(EmojiProp.Emoji_Regional_Indicator)).freeze(); emojiComponents.addAll(emojiData.getKeys(EmojiProp.Emoji_Component)).freeze(); - if (version.compareTo(Emoji.VERSION11) >= 0 && !new UnicodeSet(emojiComponents).removeAll(MODIFIERS) - .removeAll(Emoji.HAIR_STYLES).equals(Emoji.DEFECTIVE)) { + if (version.compareTo(Emoji.VERSION11) >= 0 + && !new UnicodeSet(emojiComponents) + .removeAll(MODIFIERS) + .removeAll(Emoji.HAIR_STYLES) + .equals(Emoji.DEFECTIVE)) { throw new IllegalArgumentException( "Bad components or defectives\n" + emojiComponents + "\n" + Emoji.DEFECTIVE); } @@ -599,16 +644,18 @@ private EmojiData(VersionInfo version) { keycapBases.freeze(); toNormalizedVariant.freeze(); fromNormalizedVariant.freeze(); - UnicodeSet rawHairBases = new UnicodeSet().addAll( - "🧒 👦 👧 🧑 👨 👩 🧓 👴 👵 👮 🕵 💂 👷 🤴 👸 👳 👲 🧔 🤵 👰 🤰 🤱 🎅 🤶 🧙-🧝 🙍 🙎 🙅 🙆 💁 🙋 🙇 🤦 🤷 💆 💇 🚶 🏃 💃 🕺 👯 🧖-🧘 🛀 🛌 🤺 🏇 ⛷ 🏂 🏌 🏄 🚣 🏊 ⛹ 🏋 🚴 🚵 🏎 🏍 🤸 🤼-🤾 🤹 🎓 🌾 🍳 🏫 🏭 🎨 🚒 ✈ 🚀 🎤 💻 🔬 💼 🔧 ⚖ ♀ ♂ ⚕ ") - .add(0x1F9B8).add(0x1F9B9).freeze(); + UnicodeSet rawHairBases = + new 
UnicodeSet() + .addAll( + "🧒 👦 👧 🧑 👨 👩 🧓 👴 👵 👮 🕵 💂 👷 🤴 👸 👳 👲 🧔 🤵 👰 🤰 🤱 🎅 🤶 🧙-🧝 🙍 🙎 🙅 🙆 💁 🙋 🙇 🤦 🤷 💆 💇 🚶 🏃 💃 🕺 👯 🧖-🧘 🛀 🛌 🤺 🏇 ⛷ 🏂 🏌 🏄 🚣 🏊 ⛹ 🏋 🚴 🚵 🏎 🏍 🤸 🤼-🤾 🤹 🎓 🌾 🍳 🏫 🏭 🎨 🚒 ✈ 🚀 🎤 💻 🔬 💼 🔧 ⚖ ♀ ♂ ⚕ ") + .add(0x1F9B8) + .add(0x1F9B9) + .freeze(); - if (DEBUG) - System.out.println("rawHairBases: " + rawHairBases.toPattern(false)); + if (DEBUG) System.out.println("rawHairBases: " + rawHairBases.toPattern(false)); hairBases.addAll(rawHairBases).retainAll(modifierBases).freeze(); - if (DEBUG) - System.out.println(version + "Hairbases: " + hairBases.toPattern(false)); + if (DEBUG) System.out.println(version + "Hairbases: " + hairBases.toPattern(false)); for (String s : zwjSequencesNormal) { if (s.contains("♀️") && !MODIFIERS.containsSome(s)) { @@ -642,9 +689,16 @@ private EmojiData(VersionInfo version) { // //.removeAll(new UnicodeSet("[[:L:][:M:][:^nt=none:]+_-]")) // .freeze(); - allEmojiWithoutDefectives = new UnicodeSet(singletonsWithDefectives).addAll(flagSequences) - .addAll(emojiTagSequences).addAll(modifierSequences).addAll(keycapSequences).addAll(zwjSequencesNormal) - .removeAll(Emoji.DEFECTIVE).addAll(MODIFIERS).freeze(); + allEmojiWithoutDefectives = + new UnicodeSet(singletonsWithDefectives) + .addAll(flagSequences) + .addAll(emojiTagSequences) + .addAll(modifierSequences) + .addAll(keycapSequences) + .addAll(zwjSequencesNormal) + .removeAll(Emoji.DEFECTIVE) + .addAll(MODIFIERS) + .freeze(); // if (allEmojiWithoutDefectives.contains("👨🏻‍🤝‍👨🏼")) { // throw new ICUException("??? 
👨🏻‍🤝‍👨🏼"); // } @@ -665,11 +719,17 @@ private EmojiData(VersionInfo version) { int debug = 0; } - allEmojiWithDefectives = new UnicodeSet(allEmojiWithoutDefectives).addAll(zwjSequencesAll) - .addAll(keycapSequenceAll).freeze(); + allEmojiWithDefectives = + new UnicodeSet(allEmojiWithoutDefectives) + .addAll(zwjSequencesAll) + .addAll(keycapSequenceAll) + .freeze(); // make sure we are a superset (except for modifiers) - extendedPictographic.addAll(singletonsWithoutDefectives).removeAll(EmojiData.MODIFIERS).freeze(); + extendedPictographic + .addAll(singletonsWithoutDefectives) + .removeAll(EmojiData.MODIFIERS) + .freeze(); } private UnicodeSet addToSequencesNormal(String modSeq) { @@ -735,9 +795,11 @@ private void fixMaleFemale() { genderBase.add(first); } } - String neutered = Emoji.TO_NEUTRAL.transform(emoji) - .replace(Emoji.JOINER + Emoji.FEMALE + Emoji.EMOJI_VARIANT, "") - .replace(Emoji.JOINER + Emoji.MALE + Emoji.EMOJI_VARIANT, ""); + String neutered = + Emoji.TO_NEUTRAL + .transform(emoji) + .replace(Emoji.JOINER + Emoji.FEMALE + Emoji.EMOJI_VARIANT, "") + .replace(Emoji.JOINER + Emoji.MALE + Emoji.EMOJI_VARIANT, ""); if (!neutered.equals(emoji)) { toNeutral.put(emoji, neutered); @@ -777,7 +839,9 @@ private void addName(final String source, List lineParts) { String name; if (lineParts != null && lineParts.size() > 2) { - name = UCharacter.toTitleCase(lineParts.get(2), BreakIterator.getSentenceInstance(ULocale.ENGLISH)); + name = + UCharacter.toTitleCase( + lineParts.get(2), BreakIterator.getSentenceInstance(ULocale.ENGLISH)); } else { name = getFallbackName(filteredSource); } @@ -800,8 +864,11 @@ public UnicodeSet getAllEmojiWithDefectives() { } public UnicodeSet getEmojiForSortRules() { - return new UnicodeSet().addAll(getAllEmojiWithoutDefectives()).removeAll(Emoji.DEFECTIVE) - .addAll(getZwjSequencesNormal()).addAll(getKeycapSequences()); + return new UnicodeSet() + .addAll(getAllEmojiWithoutDefectives()) + .removeAll(Emoji.DEFECTIVE) + 
.addAll(getZwjSequencesNormal()) + .addAll(getKeycapSequences()); } public UnicodeSet getAllEmojiWithoutDefectives() { @@ -827,19 +894,23 @@ public static void freezeUnicodeSets(Collection collection) { } public DefaultPresentation getStyle(String ch) { - return textPresentationSet.contains(ch) ? DefaultPresentation.text + return textPresentationSet.contains(ch) + ? DefaultPresentation.text : emojiPresentationSet.contains(ch) ? DefaultPresentation.emoji : null; } public DefaultPresentation getStyle(int ch) { - return textPresentationSet.contains(ch) ? DefaultPresentation.text + return textPresentationSet.contains(ch) + ? DefaultPresentation.text : emojiPresentationSet.contains(ch) ? DefaultPresentation.emoji : null; } @Deprecated public UnicodeSet getModifierStatusSet(Emoji.ModifierStatus source) { - return source == Emoji.ModifierStatus.modifier ? getModifiers() - : source == Emoji.ModifierStatus.modifier_base ? modifierBases + return source == Emoji.ModifierStatus.modifier + ? getModifiers() + : source == Emoji.ModifierStatus.modifier_base + ? modifierBases : throwBad(new IllegalArgumentException()); } @@ -891,13 +962,14 @@ public T throwBad(RuntimeException e) { throw e; } - // public Iterable> entrySet() { // return data.entrySet(); // } - private static Set getSet(EnumMap _defaultPresentationMap, - String source, String string) { + private static Set getSet( + EnumMap _defaultPresentationMap, + String source, + String string) { if (string.isEmpty()) { return Collections.emptySet(); } @@ -963,15 +1035,35 @@ public static void putUnicodeSetValue(Map map, String key, T // private static void show(int cp, final UnicodeMap names, EmojiData emojiData) { - System.out.println(emojiData.version + "\t" + Utility.hex(cp) + ", " + emojiData.getStyle(cp) - + (emojiData.modifierBases.contains(cp) ? 
", modifierBase" : "") + "\t" + names.get(cp)); - } - - private static void show(String cp, UnicodeMap ages, final UnicodeMap names, + System.out.println( + emojiData.version + + "\t" + + Utility.hex(cp) + + ", " + + emojiData.getStyle(cp) + + (emojiData.modifierBases.contains(cp) ? ", modifierBase" : "") + + "\t" + + names.get(cp)); + } + + private static void show( + String cp, + UnicodeMap ages, + final UnicodeMap names, EmojiData emojiData) { - System.out.println(BirthInfo.getBirthInfoMap().get(cp) + ";\temojiVersion=" - + Emoji.getShortName(emojiData.version) + ";\t" + Utility.hex(cp) + ";\t" + cp + ";\t" + names.get(cp) - + ";\t" + emojiData.getStyle(cp) + (emojiData.modifierBases.contains(cp) ? ", modifierBase" : "")); + System.out.println( + BirthInfo.getBirthInfoMap().get(cp) + + ";\temojiVersion=" + + Emoji.getShortName(emojiData.version) + + ";\t" + + Utility.hex(cp) + + ";\t" + + cp + + ";\t" + + names.get(cp) + + ";\t" + + emojiData.getStyle(cp) + + (emojiData.modifierBases.contains(cp) ? ", modifierBase" : "")); } public UnicodeSet getSortingChars() { @@ -981,7 +1073,8 @@ public UnicodeSet getSortingChars() { public static final EmojiData EMOJI_DATA = of(Emoji.VERSION_TO_GENERATE); public static final EmojiData EMOJI_DATA_PREVIOUS = of(Emoji.VERSION_TO_GENERATE_PREVIOUS); public static final EmojiData EMOJI_DATA_RELEASED = of(Emoji.VERSION_LAST_RELEASED); - public static final EmojiData EMOJI_DATA_BETA = Emoji.IS_BETA ? of(Emoji.VERSION_BETA) : EMOJI_DATA; + public static final EmojiData EMOJI_DATA_BETA = + Emoji.IS_BETA ? 
of(Emoji.VERSION_BETA) : EMOJI_DATA; public UnicodeSet getFlagSequences() { return flagSequences; @@ -991,7 +1084,7 @@ public UnicodeSet getKeycapSequences() { return keycapSequences; } - /** Include variant VS sequences **/ + /** Include variant VS sequences * */ public UnicodeSet getKeycapSequencesAll() { return keycapSequenceAll; } @@ -1002,22 +1095,30 @@ public UnicodeSet getKeycapBases() { public boolean skipEmojiSequence(String string) { EmojiData emojiData = this; - if (string.equals(" ") || string.equals("\t") || string.equals(Emoji.EMOJI_VARIANT_STRING) - || string.equals(Emoji.TEXT_VARIANT_STRING) || string.equals(Emoji.JOINER_STRING)) { + if (string.equals(" ") + || string.equals("\t") + || string.equals(Emoji.EMOJI_VARIANT_STRING) + || string.equals(Emoji.TEXT_VARIANT_STRING) + || string.equals(Emoji.JOINER_STRING)) { return true; } - if (!emojiData.getSortingChars().contains(string) && !emojiData.getZwjSequencesNormal().contains(string)) { + if (!emojiData.getSortingChars().contains(string) + && !emojiData.getZwjSequencesNormal().contains(string)) { return true; } return false; } - static final UnicodeSet JCARRIERS = new UnicodeSet().addAll(Emoji.BETA.load(UcdProperty.Emoji_DCM).keySet()) - .addAll(Emoji.BETA.load(UcdProperty.Emoji_KDDI).keySet()) - .addAll(Emoji.BETA.load(UcdProperty.Emoji_SB).keySet()).removeAll(new UnicodeSet("[:whitespace:]")) - .freeze(); + static final UnicodeSet JCARRIERS = + new UnicodeSet() + .addAll(Emoji.BETA.load(UcdProperty.Emoji_DCM).keySet()) + .addAll(Emoji.BETA.load(UcdProperty.Emoji_KDDI).keySet()) + .addAll(Emoji.BETA.load(UcdProperty.Emoji_SB).keySet()) + .removeAll(new UnicodeSet("[:whitespace:]")) + .freeze(); - private static Pattern EMOJI_VARIANTs = Pattern.compile("[" + Emoji.EMOJI_VARIANT + Emoji.TEXT_VARIANT + "]"); + private static Pattern EMOJI_VARIANTs = + Pattern.compile("[" + Emoji.EMOJI_VARIANT + Emoji.TEXT_VARIANT + "]"); public enum VariantStatus { /** All characters that need them have emoji-variants 
*/ @@ -1077,11 +1178,13 @@ public VariantFactory set(String source) { } ImmutableList.Builder _parts = ImmutableList.builder(); StringBuilder result = new StringBuilder(); - int[] sequences = CharSequences.codePoints(EMOJI_VARIANTs.matcher(source).replaceAll("")); + int[] sequences = + CharSequences.codePoints(EMOJI_VARIANTs.matcher(source).replaceAll("")); for (int i = 0; i < sequences.length; ++i) { int cp = sequences[i]; result.appendCodePoint(cp); - if (!TAKES_NO_VARIANT.contains(cp) && !emojiPresentationSet.contains(cp) + if (!TAKES_NO_VARIANT.contains(cp) + && !emojiPresentationSet.contains(cp) && (i == sequences.length - 1 || !MODIFIERS.contains(sequences[i + 1]))) { _parts.add(result.toString()); result.setLength(0); @@ -1114,7 +1217,8 @@ public Set getCombinations() { // TODO put in code point order?? } result.append(parts.get(item)); String itemString = result.toString(); - if (full == null) { // the first one has all 1's, ie, all possible cases with emoji variants + if (full == null) { // the first one has all 1's, ie, all possible cases with emoji + // variants full = itemString; } combo.add(itemString); @@ -1133,7 +1237,7 @@ public String getFull() { /** * Add EVS to sequences where needed (and remove where not) - * + * * @param source * @return */ @@ -1147,7 +1251,7 @@ public String addEmojiVariants(String source, Emoji.Qualified qualified) { /** * Add EVS or TVS to sequences where needed (and remove where not) - * + * * @param source * @param qualified TODO * @return @@ -1183,7 +1287,8 @@ public String getVariant(String source, Emoji.Qualified qualified, char variant) private boolean needsVariant(int cp) { return cp == Emoji.TRANSGENDER_CP // hack - || (getSingletonsWithDefectives().contains(cp) && !getEmojiPresentationSet().contains(cp) + || (getSingletonsWithDefectives().contains(cp) + && !getEmojiPresentationSet().contains(cp) && !TAKES_NO_VARIANT.contains(cp)); } @@ -1197,7 +1302,8 @@ public String getName(String source) { static final String 
DEBUG_STRING = UTF16.valueOf(0x1F3F4); - private String _getName(String source, boolean toLower, Transform otherNameSource) { + private String _getName( + String source, boolean toLower, Transform otherNameSource) { if (source.contains(DEBUG_STRING)) { int debug = 0; } @@ -1228,12 +1334,18 @@ private String _getName(String source, boolean toLower, Transform firstCount) { + main: + if (s.length() > firstCount) { int cp2 = s.codePointAt(firstCount); // final EmojiDatum edata = getData(cp2); if (MODIFIERS.contains(cp2)) { @@ -1276,10 +1389,18 @@ public String getFallbackName(String s) { } else if (s.indexOf(EmojiData.HANDSHAKE) >= 0) { // HEART title = "Couple holding hands: "; } else if (s.indexOf(0x2640) >= 0) { - name = nameBuffer.append("FEMALE: ").append(Emoji.NAME.get(s.codePointAt(0))).toString(); + name = + nameBuffer + .append("FEMALE: ") + .append(Emoji.NAME.get(s.codePointAt(0))) + .toString(); break main; } else if (s.indexOf(0x2642) >= 0) { - name = nameBuffer.append("MALE: ").append(Emoji.NAME.get(s.codePointAt(0))).toString(); + name = + nameBuffer + .append("MALE: ") + .append(Emoji.NAME.get(s.codePointAt(0))) + .toString(); break main; } else if (Emoji.PROFESSION_OBJECT.containsSome(s)) { title = "Role: "; @@ -1289,7 +1410,10 @@ public String getFallbackName(String s) { nameBuffer.append(title); } for (int cp : CharSequences.codePoints(s)) { - if (cp == Emoji.JOINER || cp == Emoji.EMOJI_VARIANT || cp == 0x2764 || cp == 0x1F48B + if (cp == Emoji.JOINER + || cp == Emoji.EMOJI_VARIANT + || cp == 0x2764 + || cp == 0x1F48B || cp == HANDSHAKE) { // heart, kiss continue; } @@ -1320,15 +1444,15 @@ static String shortModNameX(int cp2) { static String shortModNameZ(int cp2) { switch (cp2) { - case 0x1F3FB: - return "t1/2"; - case 0x1F3FC: - case 0x1F3FD: - case 0x1F3FE: - case 0x1F3FF: - return "t" + (cp2 - 0x1F3F9); - default: - throw new IllegalArgumentException("Illegal Modifier Name"); + case 0x1F3FB: + return "t1/2"; + case 0x1F3FC: + case 0x1F3FD: + case 
0x1F3FE: + case 0x1F3FF: + return "t" + (cp2 - 0x1F3F9); + default: + throw new IllegalArgumentException("Illegal Modifier Name"); } } @@ -1375,10 +1499,12 @@ public Set addModifiers(String singletonOrSequence, boolean addMultiples if (NEUTRAL_HOLDING.equals(singletonOrSequence) && addMultiples) { int debug = 0; } - if (singletonOrSequence == null || getModifiers().containsSome(singletonOrSequence) // skip if has modifiers + if (singletonOrSequence == null + || getModifiers().containsSome(singletonOrSequence) // skip if has modifiers // already - || !getModifierBasesRgi().containsSome(singletonOrSequence) // skip if has no modifier bases - ) { + || !getModifierBasesRgi() + .containsSome(singletonOrSequence) // skip if has no modifier bases + ) { return Collections.emptySet(); } LinkedHashSet output = new LinkedHashSet<>(); @@ -1391,13 +1517,20 @@ public Set addModifiers(String singletonOrSequence, boolean addMultiples didMultiples = true; // TODO HACK for now. If we add other groupings with skintones, generalize String prefix = singletonOrSequence.substring(0, handshakePos); - String postfix = singletonOrSequence.substring(handshakePos + ZWJ_HANDSHAKE_ZWJ.length()); + String postfix = + singletonOrSequence.substring( + handshakePos + ZWJ_HANDSHAKE_ZWJ.length()); boolean sameAffix = prefix.equals(postfix); final boolean skipIfFirstLighterThanSecond = !afterVersion12 && sameAffix; - addMultiples(prefix, ZWJ_HANDSHAKE_ZWJ, postfix, skipIfFirstLighterThanSecond, output); + addMultiples( + prefix, + ZWJ_HANDSHAKE_ZWJ, + postfix, + skipIfFirstLighterThanSecond, + output); } } else if (isHandshake(singletonOrSequence)) { - addMultiples(RIGHTWARDS_HAND, Emoji.JOINER_STR, LEFTWARDS_HAND, false, output); + addMultiples(RIGHTWARDS_HAND, Emoji.JOINER_STR, LEFTWARDS_HAND, false, output); } else if (false && MULTIPLE_SKINS.contains(singletonOrSequence)) { if (version.compareTo(Emoji.VERSION13) > 0) { String infix = "\u200D\u2764\u200D"; @@ -1407,7 +1540,7 @@ public Set 
addModifiers(String singletonOrSequence, boolean addMultiples pos = singletonOrSequence.indexOf(infix); } if (pos > 0) { - String prefix = singletonOrSequence.substring(0,pos); + String prefix = singletonOrSequence.substring(0, pos); String postfix = singletonOrSequence.substring(pos + infix.length()); addMultiples(prefix, ZWJ_HANDSHAKE_ZWJ, postfix, false, output); } @@ -1428,15 +1561,20 @@ public Set addModifiers(String singletonOrSequence, boolean addMultiples return output.isEmpty() ? Collections.emptySet() : ImmutableSet.copyOf(output); } - public void addMultiples(String prefix, String infix, String postfix, - final boolean skipIfFirstLighterThanSecond, LinkedHashSet output) { + public void addMultiples( + String prefix, + String infix, + String postfix, + final boolean skipIfFirstLighterThanSecond, + LinkedHashSet output) { for (String mod : EmojiData.MODIFIERS) { String prefixMod = addModifierPart(prefix, mod); if (prefixMod == null) { throw new IllegalArgumentException("internal error"); } for (String mod2 : EmojiData.MODIFIERS) { - if (skipIfFirstLighterThanSecond && mod.compareTo(mod2) < 0) { // skip if first mod is lighter than second + if (skipIfFirstLighterThanSecond + && mod.compareTo(mod2) < 0) { // skip if first mod is lighter than second continue; } String postfixMod = addModifierPart(postfix, mod2); @@ -1481,14 +1619,18 @@ private String addModifierPart(String singletonOrSequence, String modifier) { return b.toString(); } - static final IndexUnicodeProperties latest = IndexUnicodeProperties.make(GenerateEnums.ENUM_VERSION); + static final IndexUnicodeProperties latest = + IndexUnicodeProperties.make(GenerateEnums.ENUM_VERSION); static final IndexUnicodeProperties beta = IndexUnicodeProperties.make(); static boolean SKIP = true; private static String getSpecialAge(String s) { - return CandidateData.getInstance().getCharacters().containsAll(s) ? "Candidate" - : EmojiData.of(Emoji.VERSION3).getAllEmojiWithDefectives().contains(s) ? 
"Emoji v3.0" - : EmojiData.of(Emoji.VERSION4).getAllEmojiWithDefectives().contains(s) ? "Emoji v4.0" + return CandidateData.getInstance().getCharacters().containsAll(s) + ? "Candidate" + : EmojiData.of(Emoji.VERSION3).getAllEmojiWithDefectives().contains(s) + ? "Emoji v3.0" + : EmojiData.of(Emoji.VERSION4).getAllEmojiWithDefectives().contains(s) + ? "Emoji v4.0" : "Emoji v5.0"; // // return version.compareTo(VersionInfo.UNICODE_10_0) == 0 ? "Candidate" @@ -1499,9 +1641,7 @@ public static void main(String[] args) { final VersionInfo v1_0 = VersionInfo.getInstance(1); EmojiData emoji10 = EmojiData.of(v1_0); UnicodeSet all = emoji10.getAllEmojiWithoutDefectives(); - System.out - .println(emoji10.getAllEmojiWithoutDefectives().size() - + "\t" + all); + System.out.println(emoji10.getAllEmojiWithoutDefectives().size() + "\t" + all); for (String s : all) { BirthInfo bi = BirthInfo.getBirthInfo(s); if (bi.emojiVersionInfo != v1_0) { @@ -1517,8 +1657,7 @@ public static void main(String[] args) { } System.out.println(Utility.hex(s) + "; \t" + s + "; \t" + emojiReleased.getName(s)); } - if (true) - return; + if (true) return; EmojiData e11a = null; for (Entry> entry : e11a.maleToOther.asMap().entrySet()) { @@ -1528,40 +1667,68 @@ public static void main(String[] args) { System.out.println("F2M\t" + entry.getKey() + "\t" + entry.getValue()); } System.out.println("otherHuman:\t" + e11a.otherHuman.toPattern(false)); - System.out.println("otherHuman-mods:\t" + new UnicodeSet(e11a.allEmojiWithoutDefectivesOrModifiers) - .retainAll(e11a.otherHuman).toPattern(false)); - - UnicodeSet explicitGendered = new UnicodeSet().addAll(e11a.maleToOther.keySet()) - .addAll(e11a.femaleToOther.keySet()).add(new UnicodeSet("[🧔]")).freeze(); - - UnicodeSet gendered = new UnicodeSet().addAll(e11a.maleToOther.keySet()).addAll(e11a.femaleToOther.keySet()) - .addAll(e11a.otherHuman).freeze(); - - UnicodeSet people = new UnicodeSet().addAll(EmojiOrder.BETA_ORDER.majorGroupings.getSet(MajorGroup.People)) 
- .removeAll(EmojiOrder.BETA_ORDER.charactersToOrdering.getSet("body")) - .removeAll(EmojiOrder.BETA_ORDER.charactersToOrdering.getSet("emotion")) - .removeAll(EmojiOrder.BETA_ORDER.charactersToOrdering.getSet("clothing")) - .retainAll(e11a.allEmojiWithoutDefectives).freeze(); + System.out.println( + "otherHuman-mods:\t" + + new UnicodeSet(e11a.allEmojiWithoutDefectivesOrModifiers) + .retainAll(e11a.otherHuman) + .toPattern(false)); + + UnicodeSet explicitGendered = + new UnicodeSet() + .addAll(e11a.maleToOther.keySet()) + .addAll(e11a.femaleToOther.keySet()) + .add(new UnicodeSet("[🧔]")) + .freeze(); + + UnicodeSet gendered = + new UnicodeSet() + .addAll(e11a.maleToOther.keySet()) + .addAll(e11a.femaleToOther.keySet()) + .addAll(e11a.otherHuman) + .freeze(); + + UnicodeSet people = + new UnicodeSet() + .addAll(EmojiOrder.BETA_ORDER.majorGroupings.getSet(MajorGroup.People)) + .removeAll(EmojiOrder.BETA_ORDER.charactersToOrdering.getSet("body")) + .removeAll(EmojiOrder.BETA_ORDER.charactersToOrdering.getSet("emotion")) + .removeAll(EmojiOrder.BETA_ORDER.charactersToOrdering.getSet("clothing")) + .retainAll(e11a.allEmojiWithoutDefectives) + .freeze(); diff2("gendered", gendered, "people", people); - System.out - .println("genderBase:\t" + e11a.getGenderBase().size() + "\t" + e11a.getGenderBase().toPattern(false)); - - diff2("otherHuman", new UnicodeSet(e11a.otherHuman).removeAll(e11a.otherHuman.strings()), "genderBase", + System.out.println( + "genderBase:\t" + + e11a.getGenderBase().size() + + "\t" + + e11a.getGenderBase().toPattern(false)); + + diff2( + "otherHuman", + new UnicodeSet(e11a.otherHuman).removeAll(e11a.otherHuman.strings()), + "genderBase", e11a.getGenderBase()); UnicodeSet explicitNeutral = new UnicodeSet("[👶🧒🧑🧓👼🧔🗣👤👥]").freeze(); UnicodeSet group = new UnicodeSet("[👫👬👭💏💑👪]").freeze(); - UnicodeSet explicitGender = new UnicodeSet(explicitGendered).removeAll(explicitGendered.strings()) - .removeAll(group); + UnicodeSet explicitGender = + new 
UnicodeSet(explicitGendered) + .removeAll(explicitGendered.strings()) + .removeAll(group); show("genderBase", e11a.getGenderBase()); show("explicitGender", explicitGender); show("explicitNeutral", explicitNeutral); show("group", group); - show("otherHuman", new UnicodeSet(e11a.otherHuman).removeAll(e11a.otherHuman.strings()) - .removeAll(e11a.getGenderBase()).removeAll(group).removeAll(explicitNeutral).removeAll(explicitGender)); + show( + "otherHuman", + new UnicodeSet(e11a.otherHuman) + .removeAll(e11a.otherHuman.strings()) + .removeAll(e11a.getGenderBase()) + .removeAll(group) + .removeAll(explicitNeutral) + .removeAll(explicitGender)); // diff2("genderBase", e11a.getGenderBase(), "Emoji.GENDER_BASE", // Emoji.GENDER_BASE); @@ -1573,9 +1740,11 @@ public static void main(String[] args) { { EmojiData e11 = EmojiData.of(Emoji.VERSION11); EmojiData e5 = EmojiData.of(Emoji.VERSION5); - UnicodeSet us11 = new UnicodeSet(e11.getAllEmojiWithoutDefectives()) - .removeAll(e5.getAllEmojiWithoutDefectives()); - Set sorted = us11.addAllTo(new TreeSet<>(EmojiOrder.of(Emoji.VERSION11).codepointCompare)); + UnicodeSet us11 = + new UnicodeSet(e11.getAllEmojiWithoutDefectives()) + .removeAll(e5.getAllEmojiWithoutDefectives()); + Set sorted = + us11.addAllTo(new TreeSet<>(EmojiOrder.of(Emoji.VERSION11).codepointCompare)); int count = 0; for (String s : sorted) { String v = e11.addEmojiVariants(s); @@ -1589,8 +1758,7 @@ public static void main(String[] args) { // } } } - if (true) - return; + if (true) return; EmojiData one = EmojiData.of(Emoji.VERSION1); UnicodeSet e1 = one.getAllEmojiWithDefectives(); @@ -1606,17 +1774,18 @@ public static void main(String[] args) { items.put(cat, item); } for (Entry> entry : items.asMap().entrySet()) { - System.out.println(entry.getKey() + "\t" + entry.getKey().getAttributes() + entry.getValue()); + System.out.println( + entry.getKey() + "\t" + entry.getKey().getAttributes() + entry.getValue()); } - if (SKIP) - return; + if (SKIP) return; 
BirthInfo.checkYears(); EmojiData v6 = EmojiData.of(Emoji.VERSION11); EmojiOrder order6 = EmojiOrder.of(Emoji.VERSION11); UnicodeSet Uv7 = new UnicodeSet("[:age=7.0:]"); - UnicodeSet newItems6 = new UnicodeSet(v6.allEmojiWithoutDefectivesOrModifiers) - .addAll(CandidateData.getInstance().getCharacters()); + UnicodeSet newItems6 = + new UnicodeSet(v6.allEmojiWithoutDefectivesOrModifiers) + .addAll(CandidateData.getInstance().getCharacters()); for (String s : newItems6) { if (Uv7.containsAll(s)) { continue; @@ -1625,13 +1794,18 @@ public static void main(String[] args) { if (category == null) { category = CandidateData.getInstance().getCategory(s); } - System.out.println(UCharacter.toTitleCase(v6.getName(s), null) + "\t" - + UCharacter.toTitleCase(category, null) + "\t" + getSpecialAge(s)); + System.out.println( + UCharacter.toTitleCase(v6.getName(s), null) + + "\t" + + UCharacter.toTitleCase(category, null) + + "\t" + + getSpecialAge(s)); } EmojiData v4 = EmojiData.of(Emoji.VERSION4); - UnicodeSet newItems = new UnicodeSet(last.getSingletonsWithoutDefectives()) - .removeAll(v4.getSingletonsWithoutDefectives()); + UnicodeSet newItems = + new UnicodeSet(last.getSingletonsWithoutDefectives()) + .removeAll(v4.getSingletonsWithoutDefectives()); Set sorted2 = new TreeSet<>(EmojiOrder.STD_ORDER.codepointCompare); for (String s : newItems.addAllTo(sorted2)) { System.out.println("U+" + Utility.hex(s)); @@ -1668,32 +1842,50 @@ public static void main(String[] args) { for (String combo : combos.getValue()) { Integer count = combos.getKey(); System.out.println( - count + "\t" + Utility.hex(combo, " ") + "\t(" + combo + ")" + "\t" + betaData.getName(combo)); - if (--max < 0) - break; + count + + "\t" + + Utility.hex(combo, " ") + + "\t(" + + combo + + ")" + + "\t" + + betaData.getName(combo)); + if (--max < 0) break; } System.out.println(); } - if (true) - return; + if (true) return; EmojiData lastReleasedData = EmojiData.of(Emoji.VERSION_LAST_RELEASED); - showDiff("Emoji", 
Emoji.VERSION_LAST_RELEASED_STRING, lastReleasedData.getSingletonsWithoutDefectives(), - Emoji.VERSION_BETA_STRING_WITH_COLOR, betaData.getSingletonsWithoutDefectives()); - showDiff("Emoji_Presentation", Emoji.VERSION_LAST_RELEASED_STRING, lastReleasedData.getEmojiPresentationSet(), - Emoji.VERSION_BETA_STRING_WITH_COLOR, betaData.getEmojiPresentationSet()); - showDiff("Emoji_Modifier_Base", Emoji.VERSION_LAST_RELEASED_STRING, lastReleasedData.getModifierBases(), - Emoji.VERSION_BETA_STRING_WITH_COLOR, betaData.getModifierBases()); + showDiff( + "Emoji", + Emoji.VERSION_LAST_RELEASED_STRING, + lastReleasedData.getSingletonsWithoutDefectives(), + Emoji.VERSION_BETA_STRING_WITH_COLOR, + betaData.getSingletonsWithoutDefectives()); + showDiff( + "Emoji_Presentation", + Emoji.VERSION_LAST_RELEASED_STRING, + lastReleasedData.getEmojiPresentationSet(), + Emoji.VERSION_BETA_STRING_WITH_COLOR, + betaData.getEmojiPresentationSet()); + showDiff( + "Emoji_Modifier_Base", + Emoji.VERSION_LAST_RELEASED_STRING, + lastReleasedData.getModifierBases(), + Emoji.VERSION_BETA_STRING_WITH_COLOR, + betaData.getModifierBases()); String name = betaData.getName("🏂🏻"); for (String s : betaData.getModifierBases()) { - String comp = betaData.getVariant(s, Emoji.Qualified.all, Emoji.EMOJI_VARIANT) + "\u200D\u2642\uFE0F"; + String comp = + betaData.getVariant(s, Emoji.Qualified.all, Emoji.EMOJI_VARIANT) + + "\u200D\u2642\uFE0F"; System.out.println(Utility.hex(comp, " ") + "\t" + s + "\t" + betaData.getName(s)); } - if (true) - return; + if (true) return; for (String s : betaData.allEmojiWithDefectives) { System.out.println(Emoji.show(s)); } @@ -1716,45 +1908,75 @@ public static void main(String[] args) { System.out.println("Version " + GenerateEnums.ENUM_VERSION); final UnicodeMap betaNames = beta.load(UcdProperty.Name); final UnicodeMap names = latest.load(UcdProperty.Name); - final UnicodeMap ages = beta.loadEnum(UcdProperty.Age, UcdPropertyValues.Age_Values.class); + final UnicodeMap ages = + 
beta.loadEnum(UcdProperty.Age, UcdPropertyValues.Age_Values.class); show(0x1f946, names, betaData); show(0x1f93b, names, betaData); - UnicodeSet overlap = new UnicodeSet(betaData.getModifierBases()) - .retainAll(EmojiData.DefaultPresentation.text == EmojiData.DefaultPresentation.emoji - ? betaData.getEmojiPresentationSet() - : betaData.getTextPresentationSet()); - System.out.println("ModifierBase + TextPresentation: " + overlap.size() + "\t" + overlap.toPattern(false)); + UnicodeSet overlap = + new UnicodeSet(betaData.getModifierBases()) + .retainAll( + EmojiData.DefaultPresentation.text + == EmojiData.DefaultPresentation.emoji + ? betaData.getEmojiPresentationSet() + : betaData.getTextPresentationSet()); + System.out.println( + "ModifierBase + TextPresentation: " + + overlap.size() + + "\t" + + overlap.toPattern(false)); for (String s : overlap) { - System.out.println(Utility.hex(s) + "\t" + s + "\t" + ages.get(s) + "\t" + names.get(s)); + System.out.println( + Utility.hex(s) + "\t" + s + "\t" + ages.get(s) + "\t" + names.get(s)); } - System.out.println("v2 SingletonsWithDefectives " + lastReleasedData.getSingletonsWithDefectives().size() + "\t" - + lastReleasedData.getSingletonsWithDefectives()); + System.out.println( + "v2 SingletonsWithDefectives " + + lastReleasedData.getSingletonsWithDefectives().size() + + "\t" + + lastReleasedData.getSingletonsWithDefectives()); - System.out.println("SingletonsWithDefectives " + betaData.getSingletonsWithDefectives().size() + "\t" - + betaData.getSingletonsWithDefectives()); - System.out.println("Defectives " - + -(betaData.getSingletonsWithDefectives().size() - betaData.getSingletonsWithoutDefectives().size())); + System.out.println( + "SingletonsWithDefectives " + + betaData.getSingletonsWithDefectives().size() + + "\t" + + betaData.getSingletonsWithDefectives()); + System.out.println( + "Defectives " + + -(betaData.getSingletonsWithDefectives().size() + - betaData.getSingletonsWithoutDefectives().size())); 
System.out.println("Flag Sequences " + betaData.getFlagSequences().size()); System.out.println("ModiferSequences " + betaData.getModifierSequences().size()); - System.out.println("Keycap Sequences " + betaData.getKeycapSequences().size() + "\t" - + betaData.getKeycapSequences().toPattern(false)); - System.out.println("Zwj Sequences " + betaData.getZwjSequencesNormal().size() + "\t" - + betaData.getZwjSequencesNormal().toPattern(false)); + System.out.println( + "Keycap Sequences " + + betaData.getKeycapSequences().size() + + "\t" + + betaData.getKeycapSequences().toPattern(false)); + System.out.println( + "Zwj Sequences " + + betaData.getZwjSequencesNormal().size() + + "\t" + + betaData.getZwjSequencesNormal().toPattern(false)); System.out.println("modifier" + ", " + betaData.MODIFIERS.toPattern(false)); System.out.println( - Emoji.CharSource.WDings + ", " + betaData.getCharSourceSet(Emoji.CharSource.WDings).toPattern(false)); - System.out.println(EmojiData.DefaultPresentation.emoji + ", " - + (EmojiData.DefaultPresentation.emoji == EmojiData.DefaultPresentation.emoji - ? betaData.getEmojiPresentationSet() - : betaData.getTextPresentationSet()).toPattern(false)); + Emoji.CharSource.WDings + + ", " + + betaData.getCharSourceSet(Emoji.CharSource.WDings).toPattern(false)); + System.out.println( + EmojiData.DefaultPresentation.emoji + + ", " + + (EmojiData.DefaultPresentation.emoji + == EmojiData.DefaultPresentation.emoji + ? 
betaData.getEmojiPresentationSet() + : betaData.getTextPresentationSet()) + .toPattern(false)); show(0x1F3CB, names, betaData); show(0x1F3CB, names, lastReleasedData); - UnicodeSet keys = new UnicodeSet(betaData.getSingletonsWithDefectives()) - .addAll(lastReleasedData.getSingletonsWithDefectives()); + UnicodeSet keys = + new UnicodeSet(betaData.getSingletonsWithDefectives()) + .addAll(lastReleasedData.getSingletonsWithDefectives()); System.out.println("Diffs"); for (String key : keys) { // EmojiDatum datum = lastReleasedData.data.get(key); @@ -1766,11 +1988,22 @@ public static void main(String[] args) { // show(key, ages, betaNames, emojiData2); } } - System.out.println("Keycap0 " + betaData.getSortingChars().contains("0" + Emoji.KEYCAP_MARK_STRING)); - System.out.println("KeycapE " - + betaData.getSortingChars().contains("0" + Emoji.EMOJI_VARIANT_STRING + Emoji.KEYCAP_MARK_STRING)); - System.out.println("KeycapT " - + betaData.getSortingChars().contains("0" + Emoji.TEXT_VARIANT_STRING + Emoji.KEYCAP_MARK_STRING)); + System.out.println( + "Keycap0 " + betaData.getSortingChars().contains("0" + Emoji.KEYCAP_MARK_STRING)); + System.out.println( + "KeycapE " + + betaData.getSortingChars() + .contains( + "0" + + Emoji.EMOJI_VARIANT_STRING + + Emoji.KEYCAP_MARK_STRING)); + System.out.println( + "KeycapT " + + betaData.getSortingChars() + .contains( + "0" + + Emoji.TEXT_VARIANT_STRING + + Emoji.KEYCAP_MARK_STRING)); } private static void show(String title, UnicodeSet uset) { @@ -1781,7 +2014,8 @@ private static void show(String title, UnicodeSet uset) { } catch (Exception e) { StringBuilder nameBuffer = new StringBuilder(); for (int cp : With.codePointArray(emoji)) { - if (!MODIFIERS.contains(cp) && EmojiData.EMOJI_DATA.emojiComponents.contains(cp)) { + if (!MODIFIERS.contains(cp) + && EmojiData.EMOJI_DATA.emojiComponents.contains(cp)) { continue; } if (nameBuffer.length() != 0) { @@ -1791,13 +2025,15 @@ private static void show(String title, UnicodeSet uset) { } name = 
nameBuffer.toString(); } - System.out.println(title + "\t\\x{" + Utility.hex(emoji, 1, " ") + "}\t" + emoji + "\t" + name); + System.out.println( + title + "\t\\x{" + Utility.hex(emoji, 1, " ") + "}\t" + emoji + "\t" + name); } } private static void diff(String title1, UnicodeSet set1, String title2, UnicodeSet set2) { UnicodeSet uset = new UnicodeSet(set1).removeAll(set2); - System.out.println(title1 + " - " + title2 + "\t" + uset.size() + "\t" + uset.toPattern(false)); + System.out.println( + title1 + " - " + title2 + "\t" + uset.size() + "\t" + uset.toPattern(false)); } private static void diff2(String title1, UnicodeSet set1, String title2, UnicodeSet set2) { @@ -1833,7 +2069,8 @@ public String getOnlyFirstVariant(String item) { return UTF16.valueOf(first) + Emoji.EMOJI_VARIANT_STRING + item.substring(firstLen); } - private static void showDiff(String title, String string1, UnicodeSet set1, String string2, UnicodeSet set2) { + private static void showDiff( + String title, String string1, UnicodeSet set1, String string2, UnicodeSet set2) { int count = showAminusB(title, string1, set1, string2, set2); count += showAminusB(title, string2, set2, string1, set1); if (count == 0) { @@ -1841,29 +2078,32 @@ private static void showDiff(String title, String string1, UnicodeSet set1, Stri } } - private static int showAminusB(String title, String string1, UnicodeSet set1, String string2, UnicodeSet set2) { + private static int showAminusB( + String title, String string1, UnicodeSet set1, String string2, UnicodeSet set2) { UnicodeSet firstMinusSecond = new UnicodeSet(set1).removeAll(set2); if (!firstMinusSecond.isEmpty()) { - System.out.println("Diff " + title + ": " + string1 + " - " + string2 + ": " + firstMinusSecond); + System.out.println( + "Diff " + title + ": " + string1 + " - " + string2 + ": " + firstMinusSecond); } return firstMinusSecond.size(); } /** - * private final EmojiData.DefaultPresentation style; private final - * Emoji.ModifierStatus modifierStatus; 
private final Set - * sources; - * + * private final EmojiData.DefaultPresentation style; private final Emoji.ModifierStatus + * modifierStatus; private final Set sources; + * * @param lastReleasedData * @param betaData * @param key * @return */ private static boolean dataEquals(EmojiData lastReleasedData, EmojiData betaData, String key) { - return lastReleasedData.singletonsWithDefectives.contains(key) == betaData.singletonsWithDefectives - .contains(key) - && lastReleasedData.emojiPresentationSet.contains(key) == betaData.emojiPresentationSet.contains(key) - && lastReleasedData.modifierBases.contains(key) == betaData.modifierBases.contains(key); + return lastReleasedData.singletonsWithDefectives.contains(key) + == betaData.singletonsWithDefectives.contains(key) + && lastReleasedData.emojiPresentationSet.contains(key) + == betaData.emojiPresentationSet.contains(key) + && lastReleasedData.modifierBases.contains(key) + == betaData.modifierBases.contains(key); } @Override @@ -1905,7 +2145,8 @@ public UnicodeSet getTagSequences() { return emojiTagSequences; } - static final UnicodeSet TYPICAL_DUP_GROUP = new UnicodeSet("[{👨‍👩‍👧‍👦}]").freeze(); // "[{👩‍❤️‍💋‍👨} + static final UnicodeSet TYPICAL_DUP_GROUP = + new UnicodeSet("[{👨‍👩‍👧‍👦}]").freeze(); // "[{👩‍❤️‍💋‍👨} // {👨‍👩‍👧‍👦} // {👩‍❤️‍👨}]" static final UnicodeSet TYPICAL_DUP_SIGN = new UnicodeSet("[{\u200D\u2642}]").freeze(); @@ -1931,22 +2172,21 @@ public UnicodeSet getExtendedPictographic() { public UnicodeSet getHairBases() { return hairBases; } - - private static final UnicodeSet EXPLICIT_GENDER_13 = new UnicodeSet( - "[[👦-👩 👴 👵 🤴 👸 👲 🧕 🤵 👰 🤰 🤱 🎅 🤶 💃 🕺 🧔 🕴 👫-👭]]") - .freeze(); - private static final UnicodeSet EXPLICIT_GENDER_13_1 = new UnicodeSet(EXPLICIT_GENDER_13) - .remove("🧔") - .freeze(); + private static final UnicodeSet EXPLICIT_GENDER_13 = + new UnicodeSet("[[👦-👩 👴 👵 🤴 👸 👲 🧕 🤵 👰 🤰 🤱 🎅 🤶 💃 🕺 🧔 🕴 👫-👭]]") + .freeze(); + + private static final UnicodeSet EXPLICIT_GENDER_13_1 = + new 
UnicodeSet(EXPLICIT_GENDER_13).remove("🧔").freeze(); -// private static final UnicodeSet EXPLICIT_GENDER_13 = new UnicodeSet( -// "[[👦-👩 👴 👵 🤴 👸 👲 🧕 🤵 👰 🤰 🤱 🎅 🤶 💃 🕺 🧔 🕴 👫-👭]]") -// .freeze(); -// -// private static final UnicodeSet EXPLICIT_GENDER_13_1 = new UnicodeSet(EXPLICIT_GENDER_13) -// .remove("🧔") -// .freeze(); + // private static final UnicodeSet EXPLICIT_GENDER_13 = new UnicodeSet( + // "[[👦-👩 👴 👵 🤴 👸 👲 🧕 🤵 👰 🤰 🤱 🎅 🤶 💃 🕺 🧔 🕴 👫-👭]]") + // .freeze(); + // + // private static final UnicodeSet EXPLICIT_GENDER_13_1 = new UnicodeSet(EXPLICIT_GENDER_13) + // .remove("🧔") + // .freeze(); public UnicodeSet getExplicitGender() { return version.compareTo(Emoji.VERSION13) <= 0 ? EXPLICIT_GENDER_13 : EXPLICIT_GENDER_13_1; @@ -1976,28 +2216,29 @@ public UnicodeSet getMultiPersonGroupings() { return MULTIPERSON; } - /** - * This contains the mapping to the "shortest form" form for certain combinations. - */ - public static final Map MAP_TO_COUPLES = ImmutableMap.builder() - .put("👨‍🤝‍👨", "👬") - .put("👩‍🤝‍👨", "👫") - .put("👩‍🤝‍👩", "👭") - .put("🧑‍❤️‍💋‍🧑", "💏") - .put("🧑‍❤️‍🧑", "💑") - .build(); - - public static final Map COUPLES_TO_HANDSHAKE_VERSION = ImmutableMap.builder() - .put("👬", "👨‍🤝‍👨") - .put("👫", "👩‍🤝‍👨") - .put("👭", "👩‍🤝‍👩") - .build(); - - public static final UnicodeSet COUPLES = new UnicodeSet().addAll(COUPLES_TO_HANDSHAKE_VERSION.keySet()).freeze(); + /** This contains the mapping to the "shortest form" form for certain combinations. 
*/ + public static final Map MAP_TO_COUPLES = + ImmutableMap.builder() + .put("👨‍🤝‍👨", "👬") + .put("👩‍🤝‍👨", "👫") + .put("👩‍🤝‍👩", "👭") + .put("🧑‍❤️‍💋‍🧑", "💏") + .put("🧑‍❤️‍🧑", "💑") + .build(); + + public static final Map COUPLES_TO_HANDSHAKE_VERSION = + ImmutableMap.builder() + .put("👬", "👨‍🤝‍👨") + .put("👫", "👩‍🤝‍👨") + .put("👭", "👩‍🤝‍👩") + .build(); + + public static final UnicodeSet COUPLES = + new UnicodeSet().addAll(COUPLES_TO_HANDSHAKE_VERSION.keySet()).freeze(); /** * Remove the skin tone modifiers and the gender signs, and remap the couples. - * + * * @param s * @return */ @@ -2006,26 +2247,27 @@ public String getBaseRemovingModsGender(String s) { String temp = MAP_TO_COUPLES.get(result); if (temp != null) { result = temp; - if (DEBUG) - System.out.println("couple: " + s + " => " + result); + if (DEBUG) System.out.println("couple: " + s + " => " + result); } return result; } public boolean isHandshake(String s) { - if (version.compareTo(Emoji.VERSION14) >= 0) { - return getBaseRemovingModsGender(s).equals(EmojiData.SHAKING_HANDS); - } - return false; + if (version.compareTo(Emoji.VERSION14) >= 0) { + return getBaseRemovingModsGender(s).equals(EmojiData.SHAKING_HANDS); + } + return false; } private static final Map charSourcesToUnicodeSet; - private static final UnicodeMap> codepointToCharSource = new UnicodeMap<>(); + private static final UnicodeMap> codepointToCharSource = + new UnicodeMap<>(); + static { - EnumMap _charSourceMap = new EnumMap<>(Emoji.CharSource.class); + EnumMap _charSourceMap = + new EnumMap<>(Emoji.CharSource.class); for (String line : FileUtilities.in(EmojiData.class, "emojiSources.txt")) { - if (line.startsWith("#") || line.isEmpty()) - continue; + if (line.startsWith("#") || line.isEmpty()) continue; List list = semi.splitToList(line); String source = Utility.fromHex(list.get(0)); Set sourcesIn = getSet(list.get(1)); @@ -2042,13 +2284,16 @@ public boolean isHandshake(String s) { charSourcesToUnicodeSet = 
Collections.unmodifiableMap(_charSourceMap); freezeUnicodeSets(_charSourceMap.values()); } + public static UnicodeSet getCharSourceSet(Emoji.CharSource charSource) { return CldrUtility.ifNull(charSourcesToUnicodeSet.get(charSource), UnicodeSet.EMPTY); } + public static Set getCharSources(int codepoint) { return codepointToCharSource.get(codepoint); } + public static Set getCharSources(String codepoints) { - return CldrUtility.ifNull(codepointToCharSource.get(codepoints),Collections.emptySet()); + return CldrUtility.ifNull(codepointToCharSource.get(codepoints), Collections.emptySet()); } } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiDataSource.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiDataSource.java index 81afec45c..5471591f4 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiDataSource.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiDataSource.java @@ -1,37 +1,54 @@ package org.unicode.tools.emoji; -import org.unicode.tools.emoji.Emoji.Qualified; - import com.ibm.icu.dev.util.UnicodeMap; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; +import org.unicode.tools.emoji.Emoji.Qualified; public interface EmojiDataSource { public UnicodeSet getEmojiComponents(); + public UnicodeSet getSingletonsWithDefectives(); + public UnicodeSet getEmojiPresentationSet(); + public UnicodeSet getModifierBases(); + public UnicodeSet getExtendedPictographic(); + public UnicodeSet getTagSequences(); + public UnicodeSet getModifierSequences(); + public UnicodeSet getKeycapSequences(); + public UnicodeSet getFlagSequences(); + public UnicodeSet getZwjSequencesNormal(); + public UnicodeSet getEmojiWithVariants(); + public UnicodeSet getAllEmojiWithoutDefectives(); + public UnicodeSet getTextPresentationSet(); + public UnicodeSet getAllEmojiWithDefectives(); + public UnicodeSet getGenderBases(); + public UnicodeSet getTakesSign(); + public UnicodeSet getSingletonsWithoutDefectives(); - + 
public String getName(String s); + public default String getName(int codepoint) { return getName(UTF16.valueOf(codepoint)); } + public UnicodeMap getRawNames(); - + public default UnicodeSet getBasicSequences() { UnicodeSet result = new UnicodeSet(); for (String s : getSingletonsWithDefectives()) { @@ -41,27 +58,33 @@ public default UnicodeSet getBasicSequences() { if (getEmojiPresentationSet().contains(s)) { result.add(s); } else { - result.add(s+Emoji.EMOJI_VARIANT); + result.add(s + Emoji.EMOJI_VARIANT); } } return result.freeze(); } - + public default UnicodeSet getEmojiForSortRules() { return new UnicodeSet() .addAll(getAllEmojiWithoutDefectives()) .removeAll(Emoji.DEFECTIVE) - .addAll(getZwjSequencesNormal()) + .addAll(getZwjSequencesNormal()) .addAll(getKeycapSequences()); } - + public String addEmojiVariants(String s1); + public String getVersionString(); + public String getPlainVersion(); + public UnicodeSet getExplicitGender(); + public UnicodeSet getMultiPersonGroupings(); + public UnicodeSet getModifierBasesRgi(); + public UnicodeSet getAllEmojiWithoutDefectivesOrModifiers(); + public String addEmojiVariants(String s1, Qualified qualified); } - diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiDataSourceCombined.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiDataSourceCombined.java index 5c77ffe0a..f0d05ac1b 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiDataSourceCombined.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiDataSourceCombined.java @@ -1,29 +1,28 @@ package org.unicode.tools.emoji; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ICUException; +import com.ibm.icu.util.ICUUncheckedIOException; import java.io.IOException; import java.util.Collection; import java.util.HashSet; import java.util.Map.Entry; import java.util.Set; - import org.unicode.text.utility.Utility; import 
org.unicode.tools.emoji.EmojiOrder.MajorGroup; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ICUException; -import com.ibm.icu.util.ICUUncheckedIOException; - public class EmojiDataSourceCombined implements EmojiDataSource { public static final EmojiDataSource EMOJI_DATA = new EmojiDataSourceCombined(); - public static final EmojiDataSource EMOJI_DATA_BETA = new EmojiDataSourceCombined(EmojiData.EMOJI_DATA_BETA); + public static final EmojiDataSource EMOJI_DATA_BETA = + new EmojiDataSourceCombined(EmojiData.EMOJI_DATA_BETA); private static final boolean DEBUG = false; private final EmojiData emojiData; private final CandidateData candidates; - + public EmojiDataSourceCombined(EmojiData emojiData, CandidateData candidates) { this.emojiData = emojiData; this.candidates = candidates; @@ -38,116 +37,106 @@ public EmojiDataSourceCombined() { } static final UnicodeSet add(UnicodeSet base, UnicodeSet candidates) { - return candidates.isEmpty() ? base - : base.isEmpty() ? candidates - : new UnicodeSet(candidates).addAll(base).freeze(); + return candidates.isEmpty() + ? base + : base.isEmpty() ? candidates : new UnicodeSet(candidates).addAll(base).freeze(); } static final UnicodeMap add(UnicodeMap base, UnicodeMap candidates) { - return !Emoji.IS_BETA || candidates.isEmpty() ? base + return !Emoji.IS_BETA || candidates.isEmpty() + ? 
base : new UnicodeMap(candidates).putAll(base).freeze(); } @Override public UnicodeSet getEmojiComponents() { - return add(emojiData.getEmojiComponents(), - candidates.getEmojiComponents()); + return add(emojiData.getEmojiComponents(), candidates.getEmojiComponents()); } @Override public UnicodeSet getSingletonsWithDefectives() { - return add(emojiData.getSingletonsWithDefectives(), - candidates.getSingletonsWithDefectives()); + return add( + emojiData.getSingletonsWithDefectives(), candidates.getSingletonsWithDefectives()); } @Override public UnicodeSet getEmojiPresentationSet() { - return add(emojiData.getEmojiPresentationSet(), - candidates.getEmojiPresentationSet()); + return add(emojiData.getEmojiPresentationSet(), candidates.getEmojiPresentationSet()); } @Override public UnicodeSet getModifierBases() { - return add(emojiData.getModifierBases(), - candidates.getModifierBases()); + return add(emojiData.getModifierBases(), candidates.getModifierBases()); } @Override public UnicodeSet getModifierBasesRgi() { - return add(emojiData.getModifierBasesRgi(), - candidates.getModifierBasesRgi()); + return add(emojiData.getModifierBasesRgi(), candidates.getModifierBasesRgi()); } @Override public UnicodeSet getExtendedPictographic() { - return add(emojiData.getExtendedPictographic(), - candidates.getExtendedPictographic()); + return add(emojiData.getExtendedPictographic(), candidates.getExtendedPictographic()); } @Override public UnicodeSet getTagSequences() { - return add(emojiData.getTagSequences(), - candidates.getTagSequences()); + return add(emojiData.getTagSequences(), candidates.getTagSequences()); } @Override public UnicodeSet getModifierSequences() { - return add(emojiData.getModifierSequences(), - candidates.getModifierSequences()); + return add(emojiData.getModifierSequences(), candidates.getModifierSequences()); } @Override public UnicodeSet getFlagSequences() { - return add(emojiData.getFlagSequences(), - candidates.getFlagSequences()); + return 
add(emojiData.getFlagSequences(), candidates.getFlagSequences()); } @Override public UnicodeSet getZwjSequencesNormal() { - return add(emojiData.getZwjSequencesNormal(), - candidates.getZwjSequencesNormal()); + return add(emojiData.getZwjSequencesNormal(), candidates.getZwjSequencesNormal()); } @Override public UnicodeSet getEmojiWithVariants() { - return add(emojiData.getEmojiWithVariants(), - candidates.getEmojiWithVariants()); + return add(emojiData.getEmojiWithVariants(), candidates.getEmojiWithVariants()); } @Override public UnicodeSet getAllEmojiWithoutDefectives() { - return add(emojiData.getAllEmojiWithoutDefectives(), + return add( + emojiData.getAllEmojiWithoutDefectives(), candidates.getAllEmojiWithoutDefectives()); } @Override public UnicodeSet getAllEmojiWithoutDefectivesOrModifiers() { - return add(emojiData.getAllEmojiWithoutDefectivesOrModifiers(), + return add( + emojiData.getAllEmojiWithoutDefectivesOrModifiers(), candidates.getAllEmojiWithoutDefectivesOrModifiers()); } - @Override public UnicodeSet getTextPresentationSet() { - return add(emojiData.getTextPresentationSet(), - candidates.getTextPresentationSet()); + return add(emojiData.getTextPresentationSet(), candidates.getTextPresentationSet()); } @Override public UnicodeSet getAllEmojiWithDefectives() { - return add(emojiData.getAllEmojiWithDefectives(), - candidates.getAllEmojiWithDefectives()); + return add(emojiData.getAllEmojiWithDefectives(), candidates.getAllEmojiWithDefectives()); } @Override public UnicodeSet getGenderBases() { - return add(emojiData.getGenderBases(), - candidates.getGenderBases()); + return add(emojiData.getGenderBases(), candidates.getGenderBases()); } @Override public UnicodeSet getSingletonsWithoutDefectives() { - return add(emojiData.getSingletonsWithoutDefectives(), + return add( + emojiData.getSingletonsWithoutDefectives(), candidates.getSingletonsWithoutDefectives()); } @@ -164,11 +153,10 @@ public String getName(String s) { public UnicodeMap getRawNames() { 
return add(emojiData.getRawNames(), candidates.getRawNames()); } - + @Override public UnicodeSet getTakesSign() { - return add(emojiData.getTakesSign(), - candidates.getTakesSign()); + return add(emojiData.getTakesSign(), candidates.getTakesSign()); } @Override @@ -190,7 +178,7 @@ public String addEmojiVariants(String s1, Emoji.Qualified qualified) { public String getVersionString() { return getPlainVersion() + " + " + candidates.getVersionString(); } - + @Override public String getPlainVersion() { return emojiData.getVersionString(); @@ -198,22 +186,21 @@ public String getPlainVersion() { @Override public UnicodeSet getExplicitGender() { - return add(emojiData.getExplicitGender(), - candidates.getExplicitGender()); + return add(emojiData.getExplicitGender(), candidates.getExplicitGender()); } @Override public UnicodeSet getMultiPersonGroupings() { - return add(emojiData.getMultiPersonGroupings(), - candidates.getMultiPersonGroupings()); - } - -// public static void main(String[] args) { -// UnicodeSet allChars = EMOJI_DATA.getAllEmojiWithDefectives(); -// -// } + return add(emojiData.getMultiPersonGroupings(), candidates.getMultiPersonGroupings()); + } + + // public static void main(String[] args) { + // UnicodeSet allChars = EMOJI_DATA.getAllEmojiWithDefectives(); + // + // } /** * Created a copy of the input emojiOrdering.txt file but merging in the candidate data + * * @param reformatted */ public void showOrderingInterleaved(TempPrintWriter reformatted) { @@ -227,7 +214,8 @@ public void showOrderingInterleaved(int MAX_PER_LINE, Appendable out) { String lastSubgroup = null; MajorGroup lastMajor = null; int countOnLine = 0; - final Set>> keyValuesSet = EmojiOrder.STD_ORDER.orderingToCharacters.asMap().entrySet(); + final Set>> keyValuesSet = + EmojiOrder.STD_ORDER.orderingToCharacters.asMap().entrySet(); Set seen = new HashSet<>(); @@ -241,11 +229,20 @@ public void showOrderingInterleaved(int MAX_PER_LINE, Appendable out) { if (subgroup == null) { subgroup = 
candidates.getCategory(s); if (subgroup == null) { - throw new ICUException("Can't get subgroup for «" + orig + "» " + Utility.hex(orig)); + throw new ICUException( + "Can't get subgroup for «" + orig + "» " + Utility.hex(orig)); } } if (!subgroup.equals(lastSubgroup)) { - if (DEBUG) System.out.println(lastSubgroup + ";\t" + Utility.hex(lastSubgroup) + ";\t«" + orig + "»\t" + Utility.hex(orig)); + if (DEBUG) + System.out.println( + lastSubgroup + + ";\t" + + Utility.hex(lastSubgroup) + + ";\t«" + + orig + + "»\t" + + Utility.hex(orig)); if (countOnLine != 0) { out.append('\n'); countOnLine = 0; @@ -291,7 +288,7 @@ public void showOrderingInterleaved(int MAX_PER_LINE, Appendable out) { out.append(s2); seen.add(s2); ++countOnLine; - } + } } } } @@ -310,5 +307,4 @@ private String trimMods(String s) { String result = EmojiData.EMOJI_DATA_BETA.getBaseRemovingModsGender(s); return result; } - -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiFlagOrder.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiFlagOrder.java index 7f6608ef5..43b1568f5 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiFlagOrder.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiFlagOrder.java @@ -1,5 +1,10 @@ package org.unicode.tools.emoji; +import com.google.common.collect.ComparisonChain; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.text.LocaleDisplayNames; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.util.ULocale; import java.awt.image.BufferedImage; import java.awt.image.ColorModel; import java.awt.image.WritableRaster; @@ -10,21 +15,13 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; - import javax.imageio.ImageIO; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.Counter; import org.unicode.cldr.util.Pair; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import 
com.google.common.collect.ComparisonChain; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.text.LocaleDisplayNames; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.util.ULocale; - public class EmojiFlagOrder { static class HSB implements Comparable { @@ -33,14 +30,16 @@ static class HSB implements Comparable { short blue; private HSB(int red, int green, int blue) { - this.red = (short)red; - this.green = (short)green; - this.blue = (short)blue; + this.red = (short) red; + this.green = (short) green; + this.blue = (short) blue; } + @Override public String toString() { - return Utility.hex(red,2) + Utility.hex(green,2) + Utility.hex(blue,2); + return Utility.hex(red, 2) + Utility.hex(green, 2) + Utility.hex(blue, 2); } + @Override public int compareTo(HSB o) { int diff; @@ -49,13 +48,13 @@ public int compareTo(HSB o) { return blue - o.blue; } - static Map cache = new HashMap<>(); + static Map cache = new HashMap<>(); public static HSB make(int red, int green, int blue) { red = fix(red); green = fix(green); - blue =fix(blue); - final int key = (red<<16)|(green<<8)|blue; + blue = fix(blue); + final int key = (red << 16) | (green << 8) | blue; HSB result = cache.get(key); if (result == null) { result = new HSB(red, green, blue); @@ -79,7 +78,7 @@ public int distanceTo(HSB other) { final int s0 = RED_FACTOR * (red - other.red); final int s1 = GREEN_FACTOR * (green - other.green); final int s2 = BLUE_FACTOR * (blue - other.blue); - int result = s0*s0 + s1*s1 + s2*s2; + int result = s0 * s0 + s1 * s1 + s2 * s2; return result; } @@ -108,10 +107,11 @@ static class ImageInfo implements Comparable { private int currentOrder = ++order; /** * returns hue, saturation, brightness + * * @param image * @return */ - ImageInfo (String name, BufferedImage image) { + ImageInfo(String name, BufferedImage image) { this.name = name; if (image == null) { return; @@ -137,7 +137,7 @@ static class ImageInfo implements Comparable { for (int x = 0; x < xCount; ++x) { colors[x] = 
new HSB[yCount]; for (int y = 0; y < yCount; ++y) { - //int rgb = image.getRGB(x, y); + // int rgb = image.getRGB(x, y); HSB hsbvals = getHSB(x, y, raster, colorModel); if (hsbvals == null) { continue; @@ -160,14 +160,21 @@ static class ImageInfo implements Comparable { // if (base.distanceTo(colors[x-1][y-1]) > MIN // || base.distanceTo(colors[x-1][y]) > MIN // || base.distanceTo(colors[x][y-1]) > MIN - // || x < xCount - 1 && base.distanceTo(colors[x+1][y]) > MIN + // || x < xCount - 1 && base.distanceTo(colors[x+1][y]) > + // MIN // ) { // distance++; // } - int core = Math.max(base.distanceTo(colors[x-1][y-1]), - Math.max(base.distanceTo(colors[x-1][y]), - Math.max(base.distanceTo(colors[x][y-1]), - + (x < xCount - 1 ? base.distanceTo(colors[x+1][y]) : 0)))); + int core = + Math.max( + base.distanceTo(colors[x - 1][y - 1]), + Math.max( + base.distanceTo(colors[x - 1][y]), + Math.max( + base.distanceTo(colors[x][y - 1]), + +(x < xCount - 1 + ? base.distanceTo(colors[x + 1][y]) + : 0)))); distance += core; // if (core > MIN) { // distance += 1; @@ -175,25 +182,27 @@ static class ImageInfo implements Comparable { } } // overall = Math.round(distance/(double)count); - overall = -lumTotal*1000/count; - + overall = -lumTotal * 1000 / count; } + @Override public int compareTo(ImageInfo o) { return ComparisonChain.start() - .compare(name,o.name, EmojiOrder.UCA_COLLATOR) + .compare(name, o.name, EmojiOrder.UCA_COLLATOR) .compare(overall, o.overall) .compare(currentOrder, o.currentOrder) .result(); } } - public static HSB getHSB(int x, int y, final WritableRaster raster, final ColorModel colorModel) { + + public static HSB getHSB( + int x, int y, final WritableRaster raster, final ColorModel colorModel) { Object inData = raster.getDataElements(x, y, null); int alpha = colorModel.getAlpha(inData); if (alpha < 32) { return null; // skip } else if (alpha != 255) { - //throw new IllegalArgumentException(); + // throw new IllegalArgumentException(); } int blue = 
colorModel.getBlue(inData); int green = colorModel.getGreen(inData); @@ -202,9 +211,8 @@ public static HSB getHSB(int x, int y, final WritableRaster raster, final ColorM return hsbvals; } - static void getFlagOrder() throws IOException { - Set> sorted = new TreeSet<>(); + Set> sorted = new TreeSet<>(); LocaleDisplayNames localeDisplayNames = LocaleDisplayNames.getInstance(ULocale.ENGLISH); for (String s : EmojiData.EMOJI_DATA.getChars()) { if (!Emoji.isRegionalIndicator(s.codePointAt(0))) { @@ -221,9 +229,10 @@ static void getFlagOrder() throws IOException { System.out.println("Can't read: " + file); sourceImage = null; } - String sortString = localeDisplayNames.regionDisplayName(Emoji.getRegionCodeFromEmoji(s)); + String sortString = + localeDisplayNames.regionDisplayName(Emoji.getRegionCodeFromEmoji(s)); ImageInfo info = new ImageInfo(sortString, sourceImage); - sorted.add(new Pair<>(info,s)); + sorted.add(new Pair<>(info, s)); } NumberFormat percent = NumberFormat.getPercentInstance(); percent.setMaximumFractionDigits(2); @@ -233,7 +242,15 @@ static void getFlagOrder() throws IOException { StringBuilder emojiList = new StringBuilder(); final String outFileName = "flag-emoji-list.html"; try (PrintWriter out = FileUtilities.openUTF8Writer(Emoji.CHARTS_DIR, outFileName)) { - ChartUtilities.writeHeader(outFileName, out, "Emoji Flags", null, false, "

    " + "Flag list. " + "

    \n", Emoji.DATA_DIR_PRODUCTION, Emoji.TR51_HTML); + ChartUtilities.writeHeader( + outFileName, + out, + "Emoji Flags", + null, + false, + "

    " + "Flag list. " + "

    \n", + Emoji.DATA_DIR_PRODUCTION, + Emoji.TR51_HTML); out.println(""); out.println("
    "); for (Pair colorChar : sorted) { @@ -244,19 +261,31 @@ static void getFlagOrder() throws IOException { base = info.overall; } // File file = getFile(codePoints); - out.print(""); + out.print( + ""); int limit = 15; if (info.colorDistribution != null) { for (R2 item : info.colorDistribution) { HSB color = item.get1(); HSB inverse = color.getContrast(); - out.println(""); + out.println( + ""); if (--limit < 0) break; } } @@ -271,19 +300,13 @@ static void getFlagOrder() throws IOException { } static String getFlag(String chars) { - //String core = Emoji.buildFileName(chars,"_"); - return ""; + // String core = Emoji.buildFileName(chars,"_"); + return ""; } - public static String getFile(String s) { - String core = Emoji.buildFileName(s,"_"); - return Settings.Output.GEN_DIR + "emoji_images/country-flags/ref_" + - core + - ".png"; + String core = Emoji.buildFileName(s, "_"); + return Settings.Output.GEN_DIR + "emoji_images/country-flags/ref_" + core + ".png"; } public static void main(String[] args) throws IOException { diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiFrequency.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiFrequency.java index d3ad2e3cb..1f8111dc4 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiFrequency.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiFrequency.java @@ -1,12 +1,22 @@ package org.unicode.tools.emoji; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableList.Builder; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.SpanCondition; +import com.ibm.icu.util.ICUUncheckedIOException; +import com.ibm.icu.util.ULocale; import java.io.BufferedReader; import java.io.File; import 
java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Comparator; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; @@ -18,38 +28,25 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.CLDRPaths; import org.unicode.cldr.util.Counter; -import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; import org.unicode.tools.MultiComparator; import org.unicode.tools.emoji.CountEmoji.Category; import org.unicode.tools.emoji.EmojiData.VariantFactory; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableList.Builder; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.SpanCondition; -import com.ibm.icu.util.ICUUncheckedIOException; -import com.ibm.icu.util.ULocale; - /** * To generate emoji frequency data: + * *
      - *
    • Add new data to DATA directory
    • - *
    • If the format changes, modify the constants in this file. Typically column rearrangments
    • - *
    • Run this program, and paste files into spreadsheet.
    • - *
    • + *
    • Add new data to DATA directory + *
    • If the format changes, modify the constants in this file. Typically column rearrangments + *
    • Run this program, and paste files into spreadsheet. + *
    • *
    - * @author markdavis * + * @author markdavis */ public class EmojiFrequency { static final String DATA_DIR = "/Volumes/GoogleDrive/My Drive/workspace/DATA"; @@ -63,7 +60,6 @@ public class EmojiFrequency { static final EmojiOrder order = EmojiOrder.of(Emoji.VERSION_LAST_RELEASED); static final UnicodeSet SKIP = new UnicodeSet("[© ® ™]").freeze(); - public static void main(String[] args) { System.out.println("\n\n***Twitter***\n"); showCounts("twitter.tsv", Twitter.countInfo.keyToCount, null); @@ -72,17 +68,19 @@ public static void main(String[] args) { // showCounts("gboardMainRaw.tsv", GBoardCounts.countsRaw, null); // // System.out.println("\n\n***W/O FE0F***\n"); - // showCounts("gboardNoFE0F.tsv", GBoardCounts.countsWithoutFe0f, GBoardCounts.countsRaw); + // showCounts("gboardNoFE0F.tsv", GBoardCounts.countsWithoutFe0f, + // GBoardCounts.countsRaw); // System.out.println("\n\n***CHARS***\n"); - showCountsSimple("gboardAllChars.tsv", CharFrequency.localeToCountInfo.get("001").keyToCount, null); + showCountsSimple( + "gboardAllChars.tsv", CharFrequency.localeToCountInfo.get("001").keyToCount, null); System.out.println("\n\n***RawSequencesToCount***\n"); showSequencesToCount("RawSequencesToCount.tsv"); System.out.println("\n\n***MAIN***\n"); showCounts("gboardMain.tsv", GBoardCounts.localeToCountInfo.get("001").keyToCount, null); - //showCounts("gboardDE.tsv", GBoardCounts.localeToCountInfo.get("de").keyToCount, null); + // showCounts("gboardDE.tsv", GBoardCounts.localeToCountInfo.get("de").keyToCount, null); // System.out.println("\n\n***EmojiTracker***\n"); // showCounts("emojiTracker.tsv", EmojiTracker.countInfo.keyToCount, null); @@ -105,11 +103,14 @@ private static void showCountsSimple(String filename, Map x, Objec for (Entry entry : x.entrySet()) { String term = entry.getKey(); Long count = entry.getValue(); - out.println(hex(term) - + "\t" + (count == 0 ? 
"" : count+"") - + "\t" + (++rank) - + "\t" + term - ); + out.println( + hex(term) + + "\t" + + (count == 0 ? "" : count + "") + + "\t" + + (++rank) + + "\t" + + term); } } catch (IOException e) { throw new ICUUncheckedIOException(e); @@ -118,22 +119,38 @@ private static void showCountsSimple(String filename, Map x, Objec private static void showTextEmoji(String filename) { try (PrintWriter out = FileUtilities.openUTF8Writer(OUTDIR, filename)) { - UnicodeSet Android_Chrome_TP = new UnicodeSet("[☹ ☠ ❣ ⛑ ☘ ⛰ ⛩ ♨ ⛴ ✈ ⏱ ⏲ ⛈ ☂ ⛱ ☃ ☄ ⛸ ⌨ ✉ ✏ ⛏ ⚒ ⚔ ⚙ ⚗ ⚖ ⛓ ⚰ ⚱ ⚠ ☢ ☣ ⬆ ↗ ➡ ↘ ⬇ ↙ ⬅ ↖ ↕ ↔ ↩ ↪ ⤴ ⤵ ⚛ ✡ ☸ ☯ ✝ ☦ ☪ ☮ ▶ ⏭ ⏯ ◀ ⏮ ⏏ ♀ ♂ ⚕ ♻ ⚜ ☑ ✔ ✖ 〽 ✳ ✴ ❇ ‼ ⁉ 〰 © ® ™ 🅰 🅱 ℹ Ⓜ 🅾 🅿 🈂 🈷 ㊗ ㊙ ▪ ▫ ◻ ◼]"); - UnicodeSet Mac_Chrome_TP = new UnicodeSet("[☺ ❤ ❣ 🗨 ♨ ✈ ☀ ☁ ☂ ❄ ☃ ♠ ♥ ♦ ♣ ☎ ✉ ✏ ✒ ✂ ⚠ ⬆ ↗ ➡ ↘ ⬇ ↙ ⬅ ↖ ↕ ↔ ↩ ↪ ⤴ ⤵ ✡ ☯ ✝ ▶ ◀ ⏏ ♀ ♂ ⚕ ♻ ☑ ✔ ✖ 〽 ✳ ✴ ❇ ‼ ⁉ 〰 © ® ™ 🅰 🅱 ℹ Ⓜ 🅾 🅿 🈂 🈷 ㊗ ㊙ ▪ ▫ ◻ ◼]"); - UnicodeSet Mac_TextMate_TP = new UnicodeSet("[☺☝ ✌✍ ❤ ❣ ♨ ✈ ☀ ☁ ☂ ❄ ☃ ♠ ♥ ♦ ♣ ✉ ✏ ✒ ✂ ⬆ ↗ ➡ ↘ ⬇ ↙ ⬅ ↖ ↕ ↔ ↩ ↪ ⤴ ⤵ ✡ ☯ ✝ ▶ ◀ ⏏ ♀ ♂ ⚕ ♻ ☑ ✔ ✖ 〽 ✳ ✴ ❇ ‼ ⁉ 〰 © ® ™ #⃣ *⃣ 0⃣ 1⃣ 2⃣ 3⃣ 4⃣ 5⃣ 6⃣ 7⃣ 8⃣ 9⃣ 🅰 🅱 ℹ Ⓜ 🅾 🅿 🈂 🈷 ㊗ ㊙ ▪ ▫ ◻ ◼]"); - UnicodeSet Mac_Notes_TP = new UnicodeSet("[☝ ✌ ✍ ❤ ❣ ♨ ✈ ☀ ☁ ☂ ❄ ☃ ♠ ♥ ♦ ♣ ✉ ✏ ✒ ✂ ⚠ ⬆ ↗ ➡ ↘ ⬇ ↙ ⬅ ↖ ↕ ↔ ↩ ↪ ⤴ ⤵ ✡ ☯ ✝ ▶ ◀ ⏏ ♀ ♂ ⚕ ♻ ⚜ ☑ ✔ ✖ 〽 ✳ ✴ ❇ ‼ ⁉ 〰 © ® ™ #⃣ *⃣ 0⃣ 1⃣ 2⃣ 3⃣ 4⃣ 5⃣ 6⃣ 7⃣ 8⃣ 9⃣ 🅰 🅱 ℹ Ⓜ 🅾 🅿 🈂 🈷 ㊗ ㊙ ▪ ▫ ◻ ◼]"); - UnicodeSet Mac_Safari_TP = new UnicodeSet("[☺☝ ✌ ✍ ❤ ❣♨✈☀ ☁☂❄♠ ♥ ♦ ♣☎✉✏ ✒✂⬆ ↗ ➡ ↘ ⬇ ↙ ⬅ ↖↕ ↔ ↩ ↪ ⤴ ⤵✡☯ ✝▶◀⏏ ♀ ♂ ⚕ ♻☑ ✔ ✖ 〽 ✳ ✴ ❇ ‼ ⁉ 〰 © ® ™#⃣ *⃣ 0⃣ 1⃣ 2⃣ 3⃣ 4⃣ 5⃣ 6⃣ 7⃣ 8⃣ 9⃣ 🅰 🅱 ℹ Ⓜ 🅾 🅿 🈂 🈷 ㊗㊙ ▪ ▫ ◻ ◼]"); - out.println("Hex\tEmoji\tAndroid Chrome\tMac Chrome\tMac Safari\tMac TextMate\tMac Notes"); + UnicodeSet Android_Chrome_TP = + new UnicodeSet( + "[☹ ☠ ❣ ⛑ ☘ ⛰ ⛩ ♨ ⛴ ✈ ⏱ ⏲ ⛈ ☂ ⛱ ☃ ☄ ⛸ ⌨ ✉ ✏ ⛏ ⚒ ⚔ ⚙ ⚗ ⚖ ⛓ ⚰ ⚱ ⚠ ☢ ☣ ⬆ ↗ ➡ ↘ ⬇ ↙ ⬅ ↖ ↕ ↔ ↩ ↪ ⤴ ⤵ ⚛ ✡ ☸ ☯ ✝ ☦ ☪ ☮ ▶ ⏭ ⏯ ◀ ⏮ ⏏ ♀ ♂ ⚕ ♻ ⚜ ☑ ✔ ✖ 〽 ✳ ✴ ❇ ‼ ⁉ 〰 © ® ™ 🅰 🅱 ℹ Ⓜ 🅾 🅿 🈂 🈷 ㊗ ㊙ ▪ ▫ ◻ ◼]"); + UnicodeSet Mac_Chrome_TP = + new UnicodeSet( + "[☺ ❤ ❣ 🗨 ♨ ✈ ☀ ☁ ☂ ❄ ☃ ♠ ♥ ♦ ♣ ☎ ✉ ✏ ✒ ✂ ⚠ ⬆ ↗ ➡ ↘ ⬇ ↙ ⬅ 
↖ ↕ ↔ ↩ ↪ ⤴ ⤵ ✡ ☯ ✝ ▶ ◀ ⏏ ♀ ♂ ⚕ ♻ ☑ ✔ ✖ 〽 ✳ ✴ ❇ ‼ ⁉ 〰 © ® ™ 🅰 🅱 ℹ Ⓜ 🅾 🅿 🈂 🈷 ㊗ ㊙ ▪ ▫ ◻ ◼]"); + UnicodeSet Mac_TextMate_TP = + new UnicodeSet( + "[☺☝ ✌✍ ❤ ❣ ♨ ✈ ☀ ☁ ☂ ❄ ☃ ♠ ♥ ♦ ♣ ✉ ✏ ✒ ✂ ⬆ ↗ ➡ ↘ ⬇ ↙ ⬅ ↖ ↕ ↔ ↩ ↪ ⤴ ⤵ ✡ ☯ ✝ ▶ ◀ ⏏ ♀ ♂ ⚕ ♻ ☑ ✔ ✖ 〽 ✳ ✴ ❇ ‼ ⁉ 〰 © ® ™ #⃣ *⃣ 0⃣ 1⃣ 2⃣ 3⃣ 4⃣ 5⃣ 6⃣ 7⃣ 8⃣ 9⃣ 🅰 🅱 ℹ Ⓜ 🅾 🅿 🈂 🈷 ㊗ ㊙ ▪ ▫ ◻ ◼]"); + UnicodeSet Mac_Notes_TP = + new UnicodeSet( + "[☝ ✌ ✍ ❤ ❣ ♨ ✈ ☀ ☁ ☂ ❄ ☃ ♠ ♥ ♦ ♣ ✉ ✏ ✒ ✂ ⚠ ⬆ ↗ ➡ ↘ ⬇ ↙ ⬅ ↖ ↕ ↔ ↩ ↪ ⤴ ⤵ ✡ ☯ ✝ ▶ ◀ ⏏ ♀ ♂ ⚕ ♻ ⚜ ☑ ✔ ✖ 〽 ✳ ✴ ❇ ‼ ⁉ 〰 © ® ™ #⃣ *⃣ 0⃣ 1⃣ 2⃣ 3⃣ 4⃣ 5⃣ 6⃣ 7⃣ 8⃣ 9⃣ 🅰 🅱 ℹ Ⓜ 🅾 🅿 🈂 🈷 ㊗ ㊙ ▪ ▫ ◻ ◼]"); + UnicodeSet Mac_Safari_TP = + new UnicodeSet( + "[☺☝ ✌ ✍ ❤ ❣♨✈☀ ☁☂❄♠ ♥ ♦ ♣☎✉✏ ✒✂⬆ ↗ ➡ ↘ ⬇ ↙ ⬅ ↖↕ ↔ ↩ ↪ ⤴ ⤵✡☯ ✝▶◀⏏ ♀ ♂ ⚕ ♻☑ ✔ ✖ 〽 ✳ ✴ ❇ ‼ ⁉ 〰 © ® ™#⃣ *⃣ 0⃣ 1⃣ 2⃣ 3⃣ 4⃣ 5⃣ 6⃣ 7⃣ 8⃣ 9⃣ 🅰 🅱 ℹ Ⓜ 🅾 🅿 🈂 🈷 ㊗㊙ ▪ ▫ ◻ ◼]"); + out.println( + "Hex\tEmoji\tAndroid Chrome\tMac Chrome\tMac Safari\tMac TextMate\tMac Notes"); for (String s : EmojiMatcher.nopres) { out.println( hex(s) - + "\t" + s - + "\t" + (Android_Chrome_TP.contains(s) ? "text" : "emoji") - + "\t" + (Mac_Chrome_TP.contains(s) ? "text" : "emoji") - + "\t" + (Mac_Safari_TP.contains(s) ? "text" : "emoji") - + "\t" + (Mac_TextMate_TP.contains(s) ? "text" : "emoji") - + "\t" + (Mac_Notes_TP.contains(s) ? "text" : "emoji") - ); + + "\t" + + s + + "\t" + + (Android_Chrome_TP.contains(s) ? "text" : "emoji") + + "\t" + + (Mac_Chrome_TP.contains(s) ? "text" : "emoji") + + "\t" + + (Mac_Safari_TP.contains(s) ? "text" : "emoji") + + "\t" + + (Mac_TextMate_TP.contains(s) ? "text" : "emoji") + + "\t" + + (Mac_Notes_TP.contains(s) ? 
"text" : "emoji")); } } catch (IOException e) { throw new ICUUncheckedIOException(e); @@ -141,7 +158,10 @@ private static void showTextEmoji(String filename) { } private static void showSequencesToCount(String outputFileName) { - Set sorted = EmojiData.of(Emoji.VERSION_LAST_RELEASED).getAllEmojiWithDefectives().addAllTo(new TreeSet<>(order.codepointCompare)); + Set sorted = + EmojiData.of(Emoji.VERSION_LAST_RELEASED) + .getAllEmojiWithDefectives() + .addAllTo(new TreeSet<>(order.codepointCompare)); VariantFactory vf = order.emojiData.new VariantFactory(); for (String s : EmojiData.EMOJI_DATA.getAllEmojiWithDefectives()) { if (s.equals("\u263A")) { @@ -155,10 +175,7 @@ private static void showSequencesToCount(String outputFileName) { try (PrintWriter out = FileUtilities.openUTF8Writer(OUTDIR, outputFileName)) { out.println("Hex\tEmoji\tCLDR Name"); for (String s : sorted) { - out.println(hex(s,4) - + "\t" + s - + "\t" + getName(s) - ); + out.println(hex(s, 4) + "\t" + s + "\t" + getName(s)); } } catch (IOException e) { throw new ICUUncheckedIOException(e); @@ -166,9 +183,11 @@ private static void showSequencesToCount(String outputFileName) { } static final Set SORTED; + static { - Comparator tweaked = new MultiComparator( - order.codepointCompare, new UTF16.StringComparator(true, false, 0)); + Comparator tweaked = + new MultiComparator( + order.codepointCompare, new UTF16.StringComparator(true, false, 0)); Set SORTED2 = new TreeSet<>(tweaked); System.out.println(order.codepointCompare.compare("😀", "#️⃣")); for (String s : EmojiData.EMOJI_DATA.getAllEmojiWithDefectives()) { @@ -195,7 +214,7 @@ private static void showSequencesToCount(String outputFileName) { private static void showInfo(String filename) { int sortOrder = 0; - //try (PrintWriter out = FileUtilities.openUTF8Writer(OUTDIR, filename)) { + // try (PrintWriter out = FileUtilities.openUTF8Writer(OUTDIR, filename)) { try (PrintWriter out = FileUtilities.openUTF8Writer(OUTDIR, filename)) { 
out.println("Hex\tEmoji\tGroup\tSubgroup\tName (cldr)\tNorm?\tSort Order\tType\tYear"); @@ -216,15 +235,22 @@ private static void showInfo(String filename) { String ep = EmojiData.EMOJI_DATA.addEmojiVariants(s).equals(s) ? "" : "Defect"; out.println( hex(s) - + "\t" + s - + "\t" + order.getMajorGroupFromCategory(subcategory).toPlainString() - + "\t" + subcategory.toString() - + "\t" + getName(s) - + "\t" + ep - + "\t" + sortOrder++ - + "\t" + Category.getBucket(dataS).toStringPlain() - + "\t" + BirthInfo.getYear(dataS) - ); + + "\t" + + s + + "\t" + + order.getMajorGroupFromCategory(subcategory).toPlainString() + + "\t" + + subcategory.toString() + + "\t" + + getName(s) + + "\t" + + ep + + "\t" + + sortOrder++ + + "\t" + + Category.getBucket(dataS).toStringPlain() + + "\t" + + BirthInfo.getYear(dataS)); } } catch (IOException e) { throw new ICUUncheckedIOException(e); @@ -232,20 +258,22 @@ private static void showInfo(String filename) { } private static String getName(String s) { - return s.equals(UNSPECIFIED_GENDER) ? "unspecified-gender" - : s.equals(UNSPECIFIED_SKIN) ? "unspecified-skin" - : EmojiData.EMOJI_DATA.getName(s); + return s.equals(UNSPECIFIED_GENDER) + ? "unspecified-gender" + : s.equals(UNSPECIFIED_SKIN) ? "unspecified-skin" : EmojiData.EMOJI_DATA.getName(s); } static final UnicodeSet HACK_FE0F = new UnicodeSet("[©®™✔]").freeze(); - private static void showCounts(String filename, Map x, Map withFe0f) { + private static void showCounts( + String filename, Map x, Map withFe0f) { try (PrintWriter out = FileUtilities.openUTF8Writer(OUTDIR, filename)) { boolean normal = withFe0f == null; - out.println("Hex\tCount" - + (normal ? "\tRank" : "\tGB-Data\tto add to GB-Data") - + "\tEmoji"); + out.println( + "Hex\tCount" + + (normal ? 
"\tRank" : "\tGB-Data\tto add to GB-Data") + + "\tEmoji"); int rank = 0; Set missing = new LinkedHashSet<>(SORTED); for (Entry entry : x.entrySet()) { @@ -256,34 +284,31 @@ private static void showCounts(String filename, Map x, Map items = (SortedSet) unicodeSet.strings(); int cp = input.codePointAt(offset); - SortedSet subset = items.subSet(UTF16.valueOf(cp), UTF16.valueOf(cp+1)); + SortedSet subset = items.subSet(UTF16.valueOf(cp), UTF16.valueOf(cp + 1)); int bestLength = -1; int inputLength = input.length(); int allowedLength = inputLength - offset; @@ -314,20 +339,24 @@ static int matches(UnicodeSet unicodeSet, String input, int offset) { static class CountInfo { public static final double SCALE = 1000000000.0; final long rawTotal; - final Map keyToCount; - final Map keyToRank; + final Map keyToCount; + final Map keyToRank; public long getRaw(String key) { Long raw = keyToCount.get(key); - return raw == null ? 0 : (long)(raw * rawTotal / SCALE); + return raw == null ? 0 : (long) (raw * rawTotal / SCALE); } - public CountInfo(Counter inputCounter, Set keepStrings, Map yeartoweight) { + + public CountInfo( + Counter inputCounter, + Set keepStrings, + Map yeartoweight) { inputCounter.remove(""); rawTotal = inputCounter.getTotal(); - Map _keyToCount = new LinkedHashMap<>(); - Map _keyToRank = new LinkedHashMap<>(); + Map _keyToCount = new LinkedHashMap<>(); + Map _keyToRank = new LinkedHashMap<>(); - double factor = SCALE/rawTotal; + double factor = SCALE / rawTotal; int rank = 0; UnicodeSet failed = new UnicodeSet(); for (R2 entry : inputCounter.getEntrySetSortedByCount(false, null)) { @@ -340,7 +369,7 @@ public CountInfo(Counter inputCounter, Set keepStrings, Map counts = new Counter<>(); + private static final String FREQ_SOURCE = DATA_DIR + "/frequency/emoji/"; + // static Counter counts = new Counter<>(); static Map localeToCountInfo = new LinkedHashMap<>(); // static Counter countsRaw = new Counter<>(); // static Counter countsWithoutFe0f = new Counter<>(); 
@@ -412,26 +455,27 @@ private static long toAddAdjusted(String term, Long countWithFe0f, Long countWit } // Android API Distribution // from sheet: - final static Map yearToWeight = ImmutableMap.builder() - .put(2010, 0.021255544174738) - .put(2011, 0.0212968973735215) - .put(2012, 0.0213592297560782) - .put(2013, 0.0220032266331459) - .put(2014, 0.0238228623503592) - .put(2015, 0.0295854195945678) - .put(2016, 0.0415430939278561) - .put(2017, 0.0893600428570618) - .put(2018, 0.729773683332672) - .build(); + static final Map yearToWeight = + ImmutableMap.builder() + .put(2010, 0.021255544174738) + .put(2011, 0.0212968973735215) + .put(2012, 0.0213592297560782) + .put(2013, 0.0220032266331459) + .put(2014, 0.0238228623503592) + .put(2015, 0.0295854195945678) + .put(2016, 0.0415430939278561) + .put(2017, 0.0893600428570618) + .put(2018, 0.729773683332672) + .build(); static { Map> _counts = new LinkedHashMap<>(); - //Counter _counts = new Counter<>(); + // Counter _counts = new Counter<>(); List emojiSet = new ArrayList<>(); List nonPresSet = new ArrayList<>(); List nonEmojiSet = new ArrayList<>(); - //,text,decimal_code_points,count,hex_code_points + // ,text,decimal_code_points,count,hex_code_points // 8,❤️,"[10084, 65039]",705086,"['0x2764', '0xFE0F']" CSVParser csvParser = new CSVParser(); File folder = new File(FREQ_SOURCE + "/gboardRaw"); @@ -444,7 +488,8 @@ private static long toAddAdjusted(String term, Long countWithFe0f, Long countWit // for (Type type : Type.values()) { // for (String id : Arrays.asList( // "20171031_20171113", "20171115_20171128", - // "20180608_20180621", "20180624_20180707")) { // "20171031_20171113", "20171115_20171128" + // "20180608_20180621", "20180624_20180707")) { // + // "20171031_20171113", "20171115_20171128" // String filename = type.getFile() + id + ".csv"; int offset = 0; String folderName; @@ -470,7 +515,11 @@ private static long toAddAdjusted(String term, Long countWithFe0f, Long countWit String rankString = 
csvParser.get(type.getRankIndex() + offset); long rank = type == Type.global ? Long.parseLong(rankString) : -1; - String locale = type == Type.global ? "001" : normalizeLocale(csvParser.get(type.getLocaleIndex() + offset)); + String locale = + type == Type.global + ? "001" + : normalizeLocale( + csvParser.get(type.getLocaleIndex() + offset)); if (locale == null) { continue; } @@ -481,14 +530,21 @@ private static long toAddAdjusted(String term, Long countWithFe0f, Long countWit nonEmojiSet.clear(); nonPresSet.clear(); EmojiMatcher.parse(emojiString, emojiSet, nonPresSet, nonEmojiSet); - if (DEBUG) System.out.println(rank - + "\t" + count - + "\t" + emojiString - + "\t" + hex(emojiString) - + "\t" + emojiSet - + "\t" + nonPresSet - + "\t" + nonEmojiSet - ); + if (DEBUG) + System.out.println( + rank + + "\t" + + count + + "\t" + + emojiString + + "\t" + + hex(emojiString) + + "\t" + + emojiSet + + "\t" + + nonPresSet + + "\t" + + nonEmojiSet); Counter c = _counts.get(locale); if (c == null) { _counts.put(locale, c = new Counter<>()); @@ -505,11 +561,13 @@ private static long toAddAdjusted(String term, Long countWithFe0f, Long countWit localeToCountInfo = normalizeLocaleCounts(_counts, SORTED, yearToWeight); // counts.addAll(countsRaw); - // for (R2 entry : countsWithoutFe0f.getEntrySetSortedByCount(false, null)) { + // for (R2 entry : + // countsWithoutFe0f.getEntrySetSortedByCount(false, null)) { // long countWithoutFe0f = entry.get0(); // String term = entry.get1(); // long countWithFe0f = counts.get(term); - // counts.add(term + Emoji.EMOJI_VARIANT, toAddAdjusted(term, countWithFe0f, countWithoutFe0f)); + // counts.add(term + Emoji.EMOJI_VARIANT, toAddAdjusted(term, + // countWithFe0f, countWithoutFe0f)); // } } } @@ -524,12 +582,18 @@ private static String normalizeLocale(String string) { int debug = 0; } ULocale max = ULocale.addLikelySubtags(ulocale); - ULocale noCountry = new ULocale.Builder().setLanguage(max.getLanguage()).setScript(max.getScript()).build(); + 
ULocale noCountry = + new ULocale.Builder() + .setLanguage(max.getLanguage()) + .setScript(max.getScript()) + .build(); return ULocale.minimizeSubtags(noCountry).toLanguageTag(); } - private static Map normalizeLocaleCounts(Map> _counts, - Set keepString, Map yeartoweight) { + private static Map normalizeLocaleCounts( + Map> _counts, + Set keepString, + Map yeartoweight) { Map counts2 = new LinkedHashMap<>(); for (String locale : _counts.keySet()) { Counter c = _counts.get(locale); @@ -540,14 +604,14 @@ private static Map normalizeLocaleCounts(Map localeToCountInfo; static { Map> _counts = new LinkedHashMap<>(); - //Counter _counts = new Counter<>(); + // Counter _counts = new Counter<>(); - //,text,decimal_code_points,count,hex_code_points + // ,text,decimal_code_points,count,hex_code_points // 8,❤️,"[10084, 65039]",705086,"['0x2764', '0xFE0F']" CSVParser csvParser = new CSVParser(); File folder = new File(FREQ_SOURCE); @@ -560,7 +624,8 @@ static class CharFrequency { // for (Type type : Type.values()) { // for (String id : Arrays.asList( // "20171031_20171113", "20171115_20171128", - // "20180608_20180621", "20180624_20180707")) { // "20171031_20171113", "20171115_20171128" + // "20180608_20180621", "20180624_20180707")) { // + // "20171031_20171113", "20171115_20171128" // String filename = type.getFile() + id + ".csv"; int offset = 0; String folderName; @@ -593,18 +658,20 @@ static class CharFrequency { String rankString = csvParser.get(type.getRankIndex() + offset); long rank = type == Type.global ? Long.parseLong(rankString) : -1; - String locale = type == Type.global ? "001" : normalizeLocale(csvParser.get(type.getLocaleIndex() + offset)); + String locale = + type == Type.global + ? 
"001" + : normalizeLocale( + csvParser.get(type.getLocaleIndex() + offset)); if (locale == null) { continue; } String countString = csvParser.get(type.getCountIndex() + offset); long count = Long.parseLong(countString); - if (DEBUG) System.out.println(rank - + "\t" + count - + "\t" + emojiString - + "\t" + hex(emojiString) - ); + if (DEBUG) + System.out.println( + rank + "\t" + count + "\t" + emojiString + "\t" + hex(emojiString)); Counter c = _counts.get(locale); if (c == null) { _counts.put(locale, c = new Counter<>()); @@ -617,7 +684,6 @@ static class CharFrequency { } } - public static String hex(String string) { return hex(string, 1); } @@ -627,13 +693,16 @@ private static String hex(String string, int minLen) { } public static class CSVParser { - enum State {start, quote} + enum State { + start, + quote + } // ab,cd => -1,2,5 that is, point before each comma private String line; private List commaPoints = new ArrayList<>(); public String get(int item) { - return line.substring(commaPoints.get(item)+1, commaPoints.get(item+1)); + return line.substring(commaPoints.get(item) + 1, commaPoints.get(item + 1)); } public int size() { @@ -648,25 +717,34 @@ public CSVParser set(String line) { int i = 0; for (; i < line.length(); ++i) { int ch = line.charAt(i); - switch(state) { - case start: { - switch(ch) { - case ',': commaPoints.add(i); break; - case '"': state = State.quote; break; - } - break; - } - case quote: { - switch(ch) { - case '"': state = State.start; break; - } - break; - } + switch (state) { + case start: + { + switch (ch) { + case ',': + commaPoints.add(i); + break; + case '"': + state = State.quote; + break; + } + break; + } + case quote: + { + switch (ch) { + case '"': + state = State.start; + break; + } + break; + } } } commaPoints.add(i); return this; } + public List toList() { Builder builder = ImmutableList.builder(); for (int i = 0; i < size(); ++i) { @@ -674,6 +752,7 @@ public List toList() { } return builder.build(); } + @Override public 
String toString() { return toList().toString(); @@ -682,30 +761,33 @@ public String toString() { static class EmojiTracker { static CountInfo countInfo; + static { Counter _counts = new Counter<>(); - Matcher m = Pattern.compile("id=\"score-([A-F0-9]+)\">\\s*(\\d+)\\s*").matcher(""); + Matcher m = + Pattern.compile("id=\"score-([A-F0-9]+)\">\\s*(\\d+)\\s*").matcher(""); // 1872748264 - try (BufferedReader in = FileUtilities.openFile(GenerateEmojiFrequency.class, "emojitracker.txt")) { + try (BufferedReader in = + FileUtilities.openFile(GenerateEmojiFrequency.class, "emojitracker.txt")) { String lastBuffer = ""; double factor = 0; while (true) { String line = in.readLine(); if (line == null) break; - line = lastBuffer+line; + line = lastBuffer + line; m.reset(line); int pos = 0; while (true) { boolean found = m.find(pos); if (!found) break; - int cp = Integer.parseInt(m.group(1),16); + int cp = Integer.parseInt(m.group(1), 16); String str = UTF16.valueOf(cp); long count = Long.parseLong(m.group(2)); if (factor == 0) { - factor = 1_000_000_000.0/count; + factor = 1_000_000_000.0 / count; } addCount(_counts, normalizeEmoji(str, _counts, count), count); pos = m.end(); @@ -720,10 +802,13 @@ static class EmojiTracker { } static class Twitter { - //code emoji Twemoji description iPhone Android Web Lite TweetDeck Total TOO client - //1f602 😂 Face with tears of joy 1,808,011,468 1,651,744,252 79,888,884 159,574,416 2,263,084 3,699,219,020 + // code emoji Twemoji description iPhone Android + // Web Lite TweetDeck Total TOO client + // 1f602 😂 Face with tears of joy 1,808,011,468 + // 1,651,744,252 79,888,884 159,574,416 2,263,084 3,699,219,020 static CountInfo countInfo; + static { int charField = 1; int hexField = 0; @@ -731,7 +816,8 @@ static class Twitter { int totalFields = 10; Counter _counts = new Counter<>(); - try (BufferedReader in = FileUtilities.openFile(DATA_DIR+"/frequency/emoji/", "twitterRaw.tsv")) { + try (BufferedReader in = + FileUtilities.openFile(DATA_DIR 
+ "/frequency/emoji/", "twitterRaw.tsv")) { int lineCount = 0; while (true) { String line = in.readLine(); @@ -745,13 +831,18 @@ static class Twitter { throw new IllegalArgumentException("Bad data: " + line); } String rawCodes = parts[charField]; - String hexCodes = parts[hexField].replace("-", " ").toUpperCase(Locale.ROOT).replace(" FE0F", ""); + String hexCodes = + parts[hexField] + .replace("-", " ") + .toUpperCase(Locale.ROOT) + .replace(" FE0F", ""); hexCodes = Utility.hex(Utility.fromHex(hexCodes, false, 2)); String hexOfRawCodes = Utility.hex(rawCodes).replace(" FE0F", ""); if (!hexCodes.equals(hexOfRawCodes)) { - throw new IllegalArgumentException("mismatched emoji (seq) and hex: " + line); + throw new IllegalArgumentException( + "mismatched emoji (seq) and hex: " + line); } - long count = Long.parseLong(parts[countField].replace(",","")); + long count = Long.parseLong(parts[countField].replace(",", "")); String codes = normalizeEmoji(rawCodes, _counts, count); addCount(_counts, codes, count); } @@ -763,19 +854,24 @@ static class Twitter { } static class Facebook { - // File name Codepoints UTC Name Emoji Emoji Index Hit Index Relative Frequency Group Subgroup - // emoji_FACE-WITH-TEARS-OF-JOY_1f602 1F602 face with tears of joy 😂 3 1 1000000000 Smileys & People face-positive + // File name Codepoints UTC Name Emoji Emoji Index Hit + // Index Relative Frequency Group Subgroup + // emoji_FACE-WITH-TEARS-OF-JOY_1f602 1F602 face with tears of joy 😂 + // 3 1 1000000000 Smileys & People face-positive // old - // 😀 emoji_GRINNING-FACE_1f600 1F600 grinning face 1 28 98597505 Smileys & People face-positive + // 😀 emoji_GRINNING-FACE_1f600 1F600 grinning face 1 28 98597505 Smileys & + // People face-positive static int emojiField = 3, hexField = 1, freqField = 6, fieldLen = 9; static CountInfo countInfo; + static { Counter _counts = new Counter<>(); int lineCount = 0; String line = null; - try (BufferedReader in = FileUtilities.openFile(DATA_DIR+"/frequency/emoji/", 
"facebookRaw.tsv")) { + try (BufferedReader in = + FileUtilities.openFile(DATA_DIR + "/frequency/emoji/", "facebookRaw.tsv")) { while (true) { line = in.readLine(); if (line == null) break; @@ -788,22 +884,25 @@ static class Facebook { } String[] parts = line.split("\t"); if (parts.length != fieldLen) { - throw new IllegalArgumentException("Wrong number of fields: «" + line + "»"); + throw new IllegalArgumentException( + "Wrong number of fields: «" + line + "»"); } // String hexCodes = parts[1]; String hexCodes = parts[hexField]; - //long count = Math.round(Double.parseDouble(parts[2].replace(",",""))); + // long count = Math.round(Double.parseDouble(parts[2].replace(",",""))); long count = Math.round(Double.parseDouble(parts[freqField])); String codes = normalizeHexEmoji(hexCodes, _counts, count); - //String codes = parts[emojiField]; + // String codes = parts[emojiField]; addCount(_counts, codes, count); } } catch (Exception e) { - throw new ICUUncheckedIOException("Bad hex at " + lineCount + "\t«" + line + "»", e); + throw new ICUUncheckedIOException( + "Bad hex at " + lineCount + "\t«" + line + "»", e); } countInfo = new CountInfo(_counts, SORTED, null); } } + static UnicodeSet DUPS = new UnicodeSet(); private static String normalizeEmoji(String rawCodes, Counter stripped, long counts) { @@ -818,13 +917,14 @@ private static String normalizeEmoji(String rawCodes, Counter stripped, if (stripped != null) { Category cat = Category.getBucket(result); - switch(cat) { - case ungendered: - case ungendered_skin: - addCount(stripped, UNSPECIFIED_GENDER, counts); - DUPS.add(result); + switch (cat) { + case ungendered: + case ungendered_skin: + addCount(stripped, UNSPECIFIED_GENDER, counts); + DUPS.add(result); } - if (EmojiData.EMOJI_DATA.getModifierBases().containsSome(result) && !EmojiData.MODIFIERS.containsSome(result)) { + if (EmojiData.EMOJI_DATA.getModifierBases().containsSome(result) + && !EmojiData.MODIFIERS.containsSome(result)) { addCount(stripped, 
UNSPECIFIED_SKIN, counts); } } @@ -846,16 +946,24 @@ private static String normalizeEmoji(String rawCodes, Counter stripped, return EmojiData.EMOJI_DATA.addEmojiVariants(result); } - public static String stripFrom(UnicodeSet uset, CharSequence source, boolean matches, Counter filtered, long counts) { + public static String stripFrom( + UnicodeSet uset, + CharSequence source, + boolean matches, + Counter filtered, + long counts) { StringBuilder result = new StringBuilder(); // could optimize to only allocate when needed SpanCondition toKeep = matches ? SpanCondition.NOT_CONTAINED : SpanCondition.CONTAINED; SpanCondition toSkip = matches ? SpanCondition.CONTAINED : SpanCondition.NOT_CONTAINED; - for (int pos = 0; pos < source.length();) { + for (int pos = 0; pos < source.length(); ) { int inside = uset.span(source, pos, toKeep); result.append(source.subSequence(pos, inside)); pos = uset.span(source, inside, toSkip); // get next start if (pos > inside && filtered != null) { - addCount(filtered, source.subSequence(inside, pos).toString().replace("\u200D", ""), counts); + addCount( + filtered, + source.subSequence(inside, pos).toString().replace("\u200D", ""), + counts); } } String resultString = result.toString(); @@ -865,7 +973,7 @@ public static String stripFrom(UnicodeSet uset, CharSequence source, boolean mat private static String normalizeHexEmoji(String rawCodes, Counter _counts, long count) { if (rawCodes.startsWith("\\x{") && rawCodes.endsWith("}")) { - rawCodes = rawCodes.substring(3, rawCodes.length()-1); + rawCodes = rawCodes.substring(3, rawCodes.length() - 1); } // hack String[] parts = rawCodes.split("\\s+"); diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiFrequencyOld.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiFrequencyOld.java index 8c6335e28..b565b935f 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiFrequencyOld.java +++ 
b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiFrequencyOld.java @@ -1,33 +1,36 @@ package org.unicode.tools.emoji; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.Row.R2; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.util.Locale; import java.util.Map.Entry; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.Counter; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.Row.R2; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - public class EmojiFrequencyOld { - /* -
  • - + /* +
  • + 375235298
  • */ static UnicodeMap data = new UnicodeMap<>(); static long total; + static { - Matcher m = Pattern.compile("id=\"score-(\\p{XDigit}+)(-\\p{XDigit}+)?\">(\\d+)").matcher(""); - for (String line : FileUtilities.in(Settings.Output.GEN_DIR + "frequency/emoji/", "emoji-tracker.txt")) { + Matcher m = + Pattern.compile("id=\"score-(\\p{XDigit}+)(-\\p{XDigit}+)?\">(\\d+)") + .matcher(""); + for (String line : + FileUtilities.in( + Settings.Output.GEN_DIR + "frequency/emoji/", "emoji-tracker.txt")) { if (line.startsWith(" entry : data.entrySet()) { final String cp = entry.getKey(); - System.out.println(cp - + "\t" + nf.format(entry.getValue()) - + "\t" + UCharacter.getName(cp, ", ") - + "\t" + EmojiAnnotations.ANNOTATIONS_TO_CHARS.getKeys(cp) - ); + System.out.println( + cp + + "\t" + + nf.format(entry.getValue()) + + "\t" + + UCharacter.getName(cp, ", ") + + "\t" + + EmojiAnnotations.ANNOTATIONS_TO_CHARS.getKeys(cp)); } - + System.out.println("\n\n\n"); System.out.println("Annotation\tTw. Count\tTw. 
Count Ave.\tSet"); Counter annotationToFrequency = new Counter(); @@ -94,10 +102,12 @@ public static void main(String[] args) { final Long count = entry.get0(); System.out.println( annotation - + "\t" + nf.format(count) - + "\t" + nf.format(count/annotationToCount.get(annotation)) - + "\t" + us.toPattern(false) - ); + + "\t" + + nf.format(count) + + "\t" + + nf.format(count / annotationToCount.get(annotation)) + + "\t" + + us.toPattern(false)); } } } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiImageData.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiImageData.java index 95279eaa0..c390fbf9c 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiImageData.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiImageData.java @@ -1,5 +1,10 @@ package org.unicode.tools.emoji; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.Output; +import com.ibm.icu.util.VersionInfo; import java.io.File; import java.io.IOException; import java.io.PrintWriter; @@ -12,19 +17,12 @@ import java.util.Set; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.tools.emoji.Emoji.Source; import org.unicode.tools.emoji.GenerateEmoji.Style; import org.unicode.tools.emoji.GenerateEmoji.Visibility; import org.unicode.utilities.UnicodeSetFormatter; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.Output; -import com.ibm.icu.util.VersionInfo; - public class EmojiImageData { static final Map> DATA = new ConcurrentHashMap<>(); static final Map IMAGE_CACHE = new ConcurrentHashMap<>(); @@ -98,13 +96,19 @@ static String getDataUrlFromFilename(Source source, String filename) { if (!file.exists()) { result = ""; } else if (!GenerateEmoji.DATAURL) { - 
result = ""; + result = + ""; // "../images/" + filename; } else if (source == Source.svg) { result = CollectionUtilities.join(Files.readAllLines(file.toPath()), "\n"); } else { - byte[] bytes = GenerateEmoji.RESIZE_IMAGE <= 0 ? Files.readAllBytes(file.toPath()) - : LoadImage.resizeImage(file, GenerateEmoji.RESIZE_IMAGE, GenerateEmoji.RESIZE_IMAGE); + byte[] bytes = + GenerateEmoji.RESIZE_IMAGE <= 0 + ? Files.readAllBytes(file.toPath()) + : LoadImage.resizeImage( + file, + GenerateEmoji.RESIZE_IMAGE, + GenerateEmoji.RESIZE_IMAGE); result = "data:image/png;base64," + Base64.getEncoder().encodeToString(bytes); } EmojiImageData.IMAGE_CACHE.put(filename, result); @@ -116,17 +120,31 @@ static String getDataUrlFromFilename(Source source, String filename) { } public static void write(Set platforms2) throws IOException { - try (PrintWriter outText = FileUtilities.openUTF8Writer(Emoji.TR51_INTERNAL_DIR, "missing-emoji-list.tsv")) { + try (PrintWriter outText = + FileUtilities.openUTF8Writer(Emoji.TR51_INTERNAL_DIR, "missing-emoji-list.tsv")) { showText(outText, 100); } final String outFileName = "missing-emoji-list.html"; try (PrintWriter out = FileUtilities.openUTF8Writer(Emoji.TR51_INTERNAL_DIR, outFileName)) { - ChartUtilities.writeHeader(outFileName, out, "Missing", null, false, "

    Missing list of emoji characters.

    \n", Emoji.DATA_DIR_PRODUCTION, Emoji.TR51_HTML); + ChartUtilities.writeHeader( + outFileName, + out, + "Missing", + null, + false, + "

    Missing list of emoji characters.

    \n", + Emoji.DATA_DIR_PRODUCTION, + Emoji.TR51_HTML); out.println("
    " + codePoints - + "" + Emoji.getFlagRegionName(codePoints) - + "" + getFlag(codePoints) - + "" + percent2.format(info.overall/base) - + "
    " + + codePoints + + "" + + Emoji.getFlagRegionName(codePoints) + + "" + + getFlag(codePoints) + + "" + + percent2.format(info.overall / base) + + "" + percent.format(item.get0()/(double)info.count) + " " + item.get1() + "" + + percent.format(item.get0() / (double) info.count) + + " " + + item.get1() + + "
    "); String headerRow = ""; for (Emoji.Source type : platforms2) { - headerRow += ""; + headerRow += + ""; } headerRow += ""; @@ -141,7 +159,8 @@ public static void write(Set platforms2) throws IOException { } } - private static void showDiff(PrintWriter out, String headerRow, Set platforms2, Breakdown breakdown) { + private static void showDiff( + PrintWriter out, String headerRow, Set platforms2, Breakdown breakdown) { // find common UnicodeSet common = null; boolean skipSeparate = true; @@ -156,7 +175,8 @@ private static void showDiff(PrintWriter out, String headerRow, Set plat } // per source String sectionLink = ChartUtilities.getDoubleLink(breakdown.title); - final GenerateEmojiData.PropPrinter propPrinter = new GenerateEmojiData.PropPrinter().set(EmojiDataSourceCombined.EMOJI_DATA); + final GenerateEmojiData.PropPrinter propPrinter = + new GenerateEmojiData.PropPrinter().set(EmojiDataSourceCombined.EMOJI_DATA); String title = breakdown.title; if (!skipSeparate) { @@ -171,20 +191,49 @@ private static void showDiff(PrintWriter out, String headerRow, Set plat for (Emoji.Source source : platforms2) { final UnicodeSet us = breakdown.getMissing(source); final UnicodeSet missing = new UnicodeSet(us).removeAll(common); - GenerateEmoji.displayUnicodeSet(out, missing.addAllTo(new TreeSet(GenerateEmoji.EMOJI_COMPARATOR)), Style.bestImage, 0, 1, 1, "../../emoji/charts/full-emoji-list.html", "", "lchars", Visibility.external); + GenerateEmoji.displayUnicodeSet( + out, + missing.addAllTo(new TreeSet(GenerateEmoji.EMOJI_COMPARATOR)), + Style.bestImage, + 0, + 1, + 1, + "../../emoji/charts/full-emoji-list.html", + "", + "lchars", + Visibility.external); } out.print(""); } // common if (common.size() != 0) { - out.println("" - + ""); - out.println("" - + ""); + out.println( + "" + + ""); + out.println( + "" + + ""); out.println(""); - GenerateEmoji.displayUnicodeSet(out, common.addAllTo(new TreeSet(GenerateEmoji.EMOJI_COMPARATOR)), Style.bestImage, 0, platforms2.size(), 
1, null, "", "lchars", Visibility.external); + GenerateEmoji.displayUnicodeSet( + out, + common.addAllTo(new TreeSet(GenerateEmoji.EMOJI_COMPARATOR)), + Style.bestImage, + 0, + platforms2.size(), + 1, + null, + "", + "lchars", + Visibility.external); out.println(""); } } @@ -198,7 +247,12 @@ public Breakdown(String title, UnicodeSet uset) { this.uset = uset; } - static void add(List result, String title, VersionInfo version, UnicodeSet v3, UnicodeSet v4) { + static void add( + List result, + String title, + VersionInfo version, + UnicodeSet v3, + UnicodeSet v4) { final UnicodeSet old = new UnicodeSet(v3).retainAll(v4).freeze(); title += "\tv" + version.getVersionString(2, 2); if (version.equals(Emoji.VERSION_TO_GENERATE_PREVIOUS)) { @@ -211,6 +265,7 @@ static void add(List result, String title, VersionInfo version, Unico public UnicodeSet getSupported(Source source) { return new UnicodeSet(uset).retainAll(EmojiImageData.getSupported(source)).freeze(); } + public UnicodeSet getMissing(Source source) { return new UnicodeSet(uset).removeAll(EmojiImageData.getSupported(source)).freeze(); } @@ -222,12 +277,34 @@ private static List getBreakdown() { List result = new ArrayList<>(); for (VersionInfo version : Arrays.asList(last.getVersion(), current.getVersion())) { - Breakdown.add(result, "singletons", version, last.getSingletonsWithoutDefectives(), current.getSingletonsWithoutDefectives()); - Breakdown.add(result, "keycaps", version, last.getKeycapSequences(), current.getKeycapSequences()); - Breakdown.add(result, "flags", version, last.getFlagSequences(), current.getFlagSequences()); - Breakdown.add(result, "tags", version, last.getTagSequences(), current.getTagSequences()); - Breakdown.add(result, "modifiers", version, last.getModifierSequences(), current.getModifierSequences()); - Breakdown.add(result, "zwj", version, last.getZwjSequencesNormal(), current.getZwjSequencesNormal()); + Breakdown.add( + result, + "singletons", + version, + 
last.getSingletonsWithoutDefectives(), + current.getSingletonsWithoutDefectives()); + Breakdown.add( + result, + "keycaps", + version, + last.getKeycapSequences(), + current.getKeycapSequences()); + Breakdown.add( + result, "flags", version, last.getFlagSequences(), current.getFlagSequences()); + Breakdown.add( + result, "tags", version, last.getTagSequences(), current.getTagSequences()); + Breakdown.add( + result, + "modifiers", + version, + last.getModifierSequences(), + current.getModifierSequences()); + Breakdown.add( + result, + "zwj", + version, + last.getZwjSequencesNormal(), + current.getZwjSequencesNormal()); } return result; } @@ -242,15 +319,18 @@ public static void main(String[] args) throws IOException { private static void showText(PrintWriter out, int MAX) { EmojiData current = EmojiData.of(Emoji.VERSION_TO_GENERATE); - // The EMPTY_COLUMNS is so that we have the same number of tab columns on each line, displaying better in github + // The EMPTY_COLUMNS is so that we have the same number of tab columns on each line, + // displaying better in github out.println("TOTALS" + EMPTY_COLUMNS); out.println(EMPTY_COLUMNS); for (Source source : Source.VENDOR_SOURCES) { final UnicodeSet supported = getSupported(source); - UnicodeSet missing = new UnicodeSet(current.getAllEmojiWithoutDefectives()).removeAll(supported); + UnicodeSet missing = + new UnicodeSet(current.getAllEmojiWithoutDefectives()).removeAll(supported); getCounts(out, source, "missing\tv" + current.getVersionString(), missing, MAX); - // getCounts(PrintWriter out, Source source, String title, UnicodeSet missing, int MAX) { + // getCounts(PrintWriter out, Source source, String title, UnicodeSet missing, int MAX) + // { } out.println(EMPTY_COLUMNS); Output printed = new Output<>(false); @@ -258,7 +338,8 @@ private static void showText(PrintWriter out, int MAX) { out.println(EMPTY_COLUMNS); for (Source source : Source.VENDOR_SOURCES) { final UnicodeSet supported = getSupported(source); - 
//System.out.println(source + "\t" + supported.size() + "\t" + max(supported.toPattern(false), MAX)); + // System.out.println(source + "\t" + supported.size() + "\t" + + // max(supported.toPattern(false), MAX)); if (supported.isEmpty()) { continue; } @@ -305,36 +386,46 @@ private static void showText(PrintWriter out, int MAX) { } } - static final UnicodeSetFormatter PRETTY_HEX = new UnicodeSetFormatter() - .setRawToQuote(UnicodeSet.ALL_CODE_POINTS) - .setQuoter(new Hexer('x').setDoEscape(x -> UnicodeSet.ALL_CODE_POINTS.contains(x))); + static final UnicodeSetFormatter PRETTY_HEX = + new UnicodeSetFormatter() + .setRawToQuote(UnicodeSet.ALL_CODE_POINTS) + .setQuoter( + new Hexer('x') + .setDoEscape(x -> UnicodeSet.ALL_CODE_POINTS.contains(x))); - static final UnicodeSetFormatter PRETTY_PLAIN = new UnicodeSetFormatter() - .setRawToQuote(UnicodeSet.EMPTY) - .setQuoter(new Hexer('x')); + static final UnicodeSetFormatter PRETTY_PLAIN = + new UnicodeSetFormatter().setRawToQuote(UnicodeSet.EMPTY).setQuoter(new Hexer('x')); - private static UnicodeSet getCounts(PrintWriter out, Source source, Breakdown breakdown, int MAX, Output printed) { + private static UnicodeSet getCounts( + PrintWriter out, Source source, Breakdown breakdown, int MAX, Output printed) { String title = breakdown.title; - UnicodeSet lastMissingSingletons = breakdown.getMissing(source); // new UnicodeSet(breakdown.uset).removeAll(getSupported(source)); + UnicodeSet lastMissingSingletons = + breakdown.getMissing( + source); // new UnicodeSet(breakdown.uset).removeAll(getSupported(source)); if (!lastMissingSingletons.isEmpty()) { getCounts(out, source, title, lastMissingSingletons, MAX); printed.value = true; } - return breakdown.getSupported(source); // new UnicodeSet(breakdown.uset).retainAll(getSupported(source)).freeze(); + return breakdown.getSupported( + source); // new UnicodeSet(breakdown.uset).retainAll(getSupported(source)).freeze(); } static final String EMPTY_COLUMNS = "\t\t\t\t\t"; - private 
static void getCounts(PrintWriter out, Source source, String title, UnicodeSet missing, int MAX) { - out.println(source - + "\t" + title - + "\t" + missing.size() - + "\t" + (MAX == -1 ? "hex\t" + PRETTY_HEX.format(missing) - : MAX == -2 ? "file\t" + formatFiles(source, missing) - : "plain\t" + max(PRETTY_PLAIN.format(missing), MAX) - ) - ); - + private static void getCounts( + PrintWriter out, Source source, String title, UnicodeSet missing, int MAX) { + out.println( + source + + "\t" + + title + + "\t" + + missing.size() + + "\t" + + (MAX == -1 + ? "hex\t" + PRETTY_HEX.format(missing) + : MAX == -2 + ? "file\t" + formatFiles(source, missing) + : "plain\t" + max(PRETTY_PLAIN.format(missing), MAX))); } private static String formatFiles(Source type, UnicodeSet lastMissingSingletons) { @@ -343,7 +434,7 @@ private static String formatFiles(Source type, UnicodeSet lastMissingSingletons) if (result.length() != 0) { result.append(", "); } - String fixed = s.replace(Emoji.EMOJI_VARIANT_STRING,""); + String fixed = s.replace(Emoji.EMOJI_VARIANT_STRING, ""); String core = Emoji.buildFileName(fixed, "_"); String suffix = type.getSuffix(); String filename = type.getPrefix() + "_" + core + suffix; @@ -357,6 +448,6 @@ private static String max(String pattern, int maxLen) { return pattern; } int maxOffset = pattern.offsetByCodePoints(0, maxLen); - return pattern.substring(0,maxOffset) + "…"; + return pattern.substring(0, maxOffset) + "…"; } } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiIterator.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiIterator.java index b3aae14fd..db507cf59 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiIterator.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiIterator.java @@ -1,15 +1,14 @@ package org.unicode.tools.emoji; +import com.ibm.icu.lang.CharSequences; +import com.ibm.icu.text.UnicodeSet; import java.util.Iterator; import java.util.LinkedHashSet; import 
java.util.Set; -import com.ibm.icu.lang.CharSequences; -import com.ibm.icu.text.UnicodeSet; - public final class EmojiIterator implements Iterable, Iterator { private static UnicodeSet COMBINING = new UnicodeSet("[:M:]").freeze(); - private static UnicodeSet TAGS = new UnicodeSet(0xE0020,0xE007F).freeze(); + private static UnicodeSet TAGS = new UnicodeSet(0xE0020, 0xE007F).freeze(); private int[] line; private int pos; @@ -26,8 +25,9 @@ public EmojiIterator(EmojiData data, boolean stripTrailingStyleVariants) { } /** * Resets newLabel if there is a label. + * * @param line - * @return + * @return */ public EmojiIterator set(String line) { line = Emoji.UNESCAPE.transform(line); @@ -35,12 +35,16 @@ public EmojiIterator set(String line) { int tabPos = line.indexOf('\t'); if (tabPos >= 0) { newLabel.clear(); - String[] temp = line.substring(0,tabPos).trim().split(",\\s*"); + String[] temp = line.substring(0, tabPos).trim().split(",\\s*"); for (String part : temp) { if (Emoji.KEYWORD_CHARS.containsAll(part)) { newLabel.add(part); } else { - throw new IllegalArgumentException("Bad label format before tab: " + line + " — " + new UnicodeSet().addAll(part).removeAll(Emoji.KEYWORD_CHARS)); + throw new IllegalArgumentException( + "Bad label format before tab: " + + line + + " — " + + new UnicodeSet().addAll(part).removeAll(Emoji.KEYWORD_CHARS)); } } line = line.substring(tabPos + 1); @@ -52,6 +56,7 @@ public EmojiIterator set(String line) { /** * Gets the next sequence, either single code points or emoji sequences. 
+ * * @return */ public String next() { @@ -81,7 +86,7 @@ public String next() { } current = line[pos++]; } - + int lastTag = -1; while (TAGS.contains(current)) { lastTag = current; @@ -117,9 +122,9 @@ public String next() { } // remove trailing emoji variant if (stripTrailingStyleVariants) { - final char finalChar = result.charAt(result.length()-1); + final char finalChar = result.charAt(result.length() - 1); if (finalChar == Emoji.EMOJI_VARIANT || finalChar == Emoji.TEXT_VARIANT) { - result.setLength(result.length()-1); + result.setLength(result.length() - 1); } } return result.toString(); @@ -138,13 +143,13 @@ public Iterator iterator() { // quick test. public static void main(String[] args) { String[] tests = { - " couple\t👩‍❤️‍💋‍👩", - " flag, junk\t🇰🇷🦄👦🏻 ㊗\uFE0F㊗\uFE0F", - " face \t 😀 😁 😂 😃 😄 😅 😆 😉 ", - " 😊 😋 😎 😍 😘 😗 😙 😚", - " unicorn \t 🦄", - " cheese \t🧀🍕", - " no bullying, witness \t 👁‍🗨", + " couple\t👩‍❤️‍💋‍👩", + " flag, junk\t🇰🇷🦄👦🏻 ㊗\uFE0F㊗\uFE0F", + " face \t 😀 😁 😂 😃 😄 😅 😆 😉 ", + " 😊 😋 😎 😍 😘 😗 😙 😚", + " unicorn \t 🦄", + " cheese \t🧀🍕", + " no bullying, witness \t 👁‍🗨", }; EmojiIterator ei = new EmojiIterator(EmojiData.of(Emoji.VERSION_LAST_RELEASED), true); for (String line : tests) { @@ -154,4 +159,4 @@ public static void main(String[] args) { } } } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiLinkAdder.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiLinkAdder.java index 4c810c844..5ea06734f 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiLinkAdder.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiLinkAdder.java @@ -3,7 +3,6 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.text.utility.Settings; @@ -12,7 +11,11 @@ public class EmojiLinkAdder { public static void main(String[] args) { if (args.length == 0) { - args = new String[] 
{Settings.UnicodeTools.UNICODETOOLS_REPO_DIR + "/reports/tr51/pri294-emoji-image-backgroundA.html"}; + args = + new String[] { + Settings.UnicodeTools.UNICODETOOLS_REPO_DIR + + "/reports/tr51/pri294-emoji-image-backgroundA.html" + }; } Matcher m = TO_FIX.matcher(""); @@ -20,11 +23,12 @@ public static void main(String[] args) { for (String line : FileUtilities.in("", file)) { if (m.reset(line).matches()) { String hex = m.group(2); - String mid = "" - + hex - + ""; + String mid = + "" + + hex + + ""; line = m.group(1) + mid + m.group(3); } System.out.println(line); diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiLocaleData.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiLocaleData.java index 90493b043..1ded3240d 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiLocaleData.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiLocaleData.java @@ -2,20 +2,18 @@ import com.ibm.icu.util.ULocale; -/** - * Provide access to the CLDR short name and annotations for emoji characters. - */ +/** Provide access to the CLDR short name and annotations for emoji characters. 
*/ public class EmojiLocaleData { private final ULocale locale; public EmojiLocaleData(ULocale locale) { this.locale = locale; } - + public ULocale getLocale() { return locale; } - + public static final ULocale[] getAvailableULocales() { return null; } @@ -23,7 +21,7 @@ public static final ULocale[] getAvailableULocales() { public String getName(String emojiOrSequence) { return null; } - + public String getKeywords(String emojiOrSequence) { return null; } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiMatcher.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiMatcher.java index fd7929e83..f5ff3d850 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiMatcher.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiMatcher.java @@ -1,12 +1,11 @@ package org.unicode.tools.emoji; -import java.util.List; - import com.ibm.icu.lang.CharSequences; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; +import java.util.List; -public class EmojiMatcher { +public class EmojiMatcher { private int start; private int end; @@ -23,16 +22,19 @@ public int getEnd() { static final UnicodeSet BASE2 = new UnicodeSet("[\\p{Emoji_Modifier}\uFE0F]"); static final UnicodeSet TAG_SPEC = new UnicodeSet("[\\x{E0020}-\\x{E007E}]"); /** - * Returns true if a possible emoji occurs at or after offset. If true, use getStart() to find the start of the emoji, and getEnd() to get the end. + * Returns true if a possible emoji occurs at or after offset. If true, use getStart() to find + * the start of the emoji, and getEnd() to get the end. + * *
    -emoji_sequence := 
    -| [0-9#*] \x{FE0F 20E3}
    -| ( \p{Regional_Indicator} \p{Regional_Indicator} )
    -| emoji_zwj_element ( \x{200d} emoji_zwj_element )+
    -
    -emoji_zwj_element := 
    -  [\p{Emoji}-\p{Emoji_Component}] ( \p{Emoji_Modifier} | \x{FE0F} )? ( [\x{E0020}-\x{E007E}]+ \x{E007F} )? 
    -  
    + * emoji_sequence := + * | [0-9#*] \x{FE0F 20E3} + * | ( \p{Regional_Indicator} \p{Regional_Indicator} ) + * | emoji_zwj_element ( \x{200d} emoji_zwj_element )+ + * + * emoji_zwj_element := + * [\p{Emoji}-\p{Emoji_Component}] ( \p{Emoji_Modifier} | \x{FE0F} )? ( [\x{E0020}-\x{E007E}]+ \x{E007F} )? + * + * * @param input * @param offset * @return @@ -40,134 +42,164 @@ public int getEnd() { public FindStatus findPossible(String input, int offset) { int cp = 0; State state = State.start; - for ( ; offset < input.length(); offset += Character.charCount(cp)) { + for (; offset < input.length(); offset += Character.charCount(cp)) { cp = input.codePointAt(offset); switch (state) { - case start: { - start = offset; - if (KEYCAP_START.contains(cp)) { - state = State.haveKeycap1; - } else if (Emoji.REGIONAL_INDICATORS.contains(cp)) { - start = offset; - state = State.haveRegionalIndicator; - } else if (BASE.contains(cp)) { - start = offset; - state = State.haveBase; - } - break; + case start: + { + start = offset; + if (KEYCAP_START.contains(cp)) { + state = State.haveKeycap1; + } else if (Emoji.REGIONAL_INDICATORS.contains(cp)) { + start = offset; + state = State.haveRegionalIndicator; + } else if (BASE.contains(cp)) { + start = offset; + state = State.haveBase; + } + break; + } + case haveKeycap1: + { + if (cp == Emoji.EMOJI_VARIANT) { + state = State.haveKeycap2; + break; + } + // optional, fall through + } + case haveKeycap2: + { + if (cp == Emoji.KEYCAP_MARK) { + this.end = offset + 1; + return FindStatus.full; + } + // if we get to this point, we have a keycap without base. 
So go back to + // start + state = State.start; + break; + } + case haveRegionalIndicator: + { + if (Emoji.REGIONAL_INDICATORS.contains(cp)) { + this.end = offset + Character.charCount(cp); + return FindStatus.full; + } + this.end = offset; + return FindStatus.partial; + } + case haveBase: + { + if (BASE2.contains(cp)) { + state = State.haveBase2; + break; + } + // optional, fallthrough + } + case haveBase2: + { + if (TAG_SPEC.contains(cp)) { + state = State.haveTag; + break; + } else if (cp == Emoji.JOINER) { + state = State.haveZwj; + break; + } + this.end = offset; + return FindStatus.full; + } + case haveTag: + { + if (TAG_SPEC.contains(cp)) { + state = State.haveTag; + break; + } else if (cp == 0xE007F) { + state = State.tagDone; + break; + } + this.end = offset; + return FindStatus.full; + } + case tagDone: + { + if (cp == Emoji.JOINER) { + state = State.haveZwj; + continue; + } + this.end = offset; + return FindStatus.full; + } + case haveZwj: + { + if (BASE.contains(cp)) { + start = offset; + state = State.haveBase; + continue; + } + this.end = offset - 1; // backup to before zwj + return FindStatus.full; + } + default: + throw new IllegalArgumentException(); } - case haveKeycap1: { - if (cp == Emoji.EMOJI_VARIANT) { - state = State.haveKeycap2; - break; + } + switch (state) { + case start: + case haveKeycap1: + case haveKeycap2: + { + start = offset; + this.end = offset; + return FindStatus.none; } - // optional, fall through - } - case haveKeycap2: { - if (cp == Emoji.KEYCAP_MARK) { - this.end = offset + 1; + case haveBase: + case haveBase2: + case tagDone: + { + this.end = offset; return FindStatus.full; } - // if we get to this point, we have a keycap without base. 
So go back to start - state = State.start; - break; - } - case haveRegionalIndicator: { - if (Emoji.REGIONAL_INDICATORS.contains(cp)) { - this.end = offset + Character.charCount(cp); + case haveZwj: + { + this.end = offset - 1; // backup to before zwj return FindStatus.full; } - this.end = offset; - return FindStatus.partial; - } - case haveBase: { - if (BASE2.contains(cp)) { - state = State.haveBase2; - break; - } - // optional, fallthrough - } - case haveBase2: { - if (TAG_SPEC.contains(cp)) { - state = State.haveTag; - break; - } else if (cp == Emoji.JOINER) { - state = State.haveZwj; - break; - } - this.end = offset; - return FindStatus.full; - } - case haveTag: { - if (TAG_SPEC.contains(cp)) { - state = State.haveTag; - break; - } else if (cp == 0xE007F) { - state = State.tagDone; - break; - } - this.end = offset; - return FindStatus.full; - } - case tagDone: { - if (cp == Emoji.JOINER) { - state = State.haveZwj; - continue; - } - this.end = offset; - return FindStatus.full; - } - case haveZwj: { - if (BASE.contains(cp)) { - start = offset; - state = State.haveBase; - continue; - } - this.end = offset-1; // backup to before zwj - return FindStatus.full; - } default: - throw new IllegalArgumentException(); - } - } - switch (state) { - case start: - case haveKeycap1: - case haveKeycap2: - { - start = offset; - this.end = offset; - return FindStatus.none; - } - case haveBase: case haveBase2: case tagDone: { - this.end = offset; - return FindStatus.full; - } - case haveZwj: { - this.end = offset-1; // backup to before zwj - return FindStatus.full; - } - default: { - this.end = offset; - return FindStatus.partial; - } + { + this.end = offset; + return FindStatus.partial; + } } } - public enum FindStatus {none, partial, full} - - private enum State {start, haveKeycap1, haveKeycap2, haveRegionalIndicator, haveBase, haveBase2, haveTag, tagDone, haveZwj} + public enum FindStatus { + none, + partial, + full + } + private enum State { + start, + haveKeycap1, + 
haveKeycap2, + haveRegionalIndicator, + haveBase, + haveBase2, + haveTag, + tagDone, + haveZwj + } static final UnicodeSet fixed; - static final UnicodeSet nopres = new UnicodeSet(EmojiData.EMOJI_DATA.getSingletonsWithDefectives()) - .removeAll(EmojiData.EMOJI_DATA.getEmojiPresentationSet()); + static final UnicodeSet nopres = + new UnicodeSet(EmojiData.EMOJI_DATA.getSingletonsWithDefectives()) + .removeAll(EmojiData.EMOJI_DATA.getEmojiPresentationSet()); static final UnicodeSet components = EmojiData.of(Emoji.VERSION_BETA).getEmojiComponents(); static { - fixed = new UnicodeSet(EmojiData.EMOJI_DATA.getAllEmojiWithoutDefectives()) - .removeAll(components) - .removeAll(nopres); + fixed = + new UnicodeSet(EmojiData.EMOJI_DATA.getAllEmojiWithoutDefectives()) + .removeAll(components) + .removeAll(nopres); for (String s : nopres) { fixed.add(s + Emoji.EMOJI_VARIANT); } @@ -181,11 +213,13 @@ private enum State {start, haveKeycap1, haveKeycap2, haveRegionalIndicator, have fixed.freeze(); } + static UnicodeSet singletonFailures = new UnicodeSet(); - static void parse(String input, List emoji, List noPres, List nonEmoji) { + static void parse( + String input, List emoji, List noPres, List nonEmoji) { int emojiEnd = 0; - for (int offset = 0; offset < input.length();) { + for (int offset = 0; offset < input.length(); ) { int match = EmojiFrequency.matches(fixed, input, offset); if (match > offset) { if (emojiEnd < offset) { @@ -207,7 +241,7 @@ static void parse(String input, List emoji, List noPres, List nonEmoji, List noPres2) { StringBuilder nonEmojiBuffer = new StringBuilder(); for (int cp : CharSequences.codePoints(str)) { - if (nopres.contains(cp) && cp >= 0x7F) { // hack to exclude keycap bases + if (nopres.contains(cp) && cp >= 0x7F) { // hack to exclude keycap bases noPres2.add(UTF16.valueOf(cp)); } else { nonEmojiBuffer.appendCodePoint(cp); @@ -220,35 +254,56 @@ public static void main(String[] args) { EmojiMatcher m = new EmojiMatcher(); boolean verbose = true; 
Object[][] tests = { - {"a", FindStatus.none, 1, 1}, - {"1", FindStatus.none, 1, 1}, - {"1a", FindStatus.none, 2, 2}, - {"1\ufe0f", FindStatus.none, 2, 2}, - {"1\ufe0fa", FindStatus.none, 3, 3}, - {"1" + Emoji.KEYCAP_MARK, FindStatus.full, 2, 2}, - {"1\ufe0f" + Emoji.KEYCAP_MARK, FindStatus.full, 3, 3}, - {"a👶🏿b", FindStatus.full, 1, 5, FindStatus.none, 6, 6}, - {"a👶👶🏿b", FindStatus.full, 1, 3, FindStatus.full, 3, 7, FindStatus.none, 8, 8}, + {"a", FindStatus.none, 1, 1}, + {"1", FindStatus.none, 1, 1}, + {"1a", FindStatus.none, 2, 2}, + {"1\ufe0f", FindStatus.none, 2, 2}, + {"1\ufe0fa", FindStatus.none, 3, 3}, + {"1" + Emoji.KEYCAP_MARK, FindStatus.full, 2, 2}, + {"1\ufe0f" + Emoji.KEYCAP_MARK, FindStatus.full, 3, 3}, + {"a👶🏿b", FindStatus.full, 1, 5, FindStatus.none, 6, 6}, + {"a👶👶🏿b", FindStatus.full, 1, 3, FindStatus.full, 3, 7, FindStatus.none, 8, 8}, }; for (Object[] row : tests) { - final String input = (String)(row[0]); + final String input = (String) (row[0]); int cursor = 0; for (int i = 1; i < row.length; i += 3) { FindStatus expectedStatus = (FindStatus) row[i]; - int expectedStart = (int) row[i+1]; - int expectedEnd = (int) row[i+2]; + int expectedStart = (int) row[i + 1]; + int expectedEnd = (int) row[i + 2]; FindStatus status = m.findPossible(input, cursor); System.out.println(cursor); if (verbose || status != expectedStatus) { - System.out.println((status != expectedStatus ? "Failed " : "OK") + " Status:\t" + input + "\texpected: " + expectedStatus + "\tactual: " + status); + System.out.println( + (status != expectedStatus ? "Failed " : "OK") + + " Status:\t" + + input + + "\texpected: " + + expectedStatus + + "\tactual: " + + status); m.findPossible(input, cursor); } if (verbose || m.getStart() != expectedStart) { - System.out.println(( m.getStart() != expectedStart ? "Failed " : "OK") + " Start:\t" + input + "\texpected: " + expectedStart + "\tactual: " + m.getStart()); + System.out.println( + (m.getStart() != expectedStart ? 
"Failed " : "OK") + + " Start:\t" + + input + + "\texpected: " + + expectedStart + + "\tactual: " + + m.getStart()); m.findPossible(input, cursor); } if (verbose || m.getEnd() != expectedEnd) { - System.out.println((m.getEnd() != expectedEnd ? "Failed " : "OK") + " end:\t" + input + "\texpected: " + expectedEnd + "\tactual: " + m.getEnd()); + System.out.println( + (m.getEnd() != expectedEnd ? "Failed " : "OK") + + " end:\t" + + input + + "\texpected: " + + expectedEnd + + "\tactual: " + + m.getEnd()); m.findPossible(input, cursor); } cursor = expectedEnd; @@ -256,7 +311,6 @@ public static void main(String[] args) { System.out.println(); } - for (String s : EmojiData.EMOJI_DATA.getAllEmojiWithDefectives()) { if (EmojiData.MODIFIERS.contains(s)) { continue; @@ -276,4 +330,4 @@ private static FindStatus check(EmojiMatcher m, String s) { } return status; } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiOrder.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiOrder.java index 071bfe9df..b5329a0ca 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiOrder.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiOrder.java @@ -1,5 +1,24 @@ package org.unicode.tools.emoji; +import com.google.common.base.Objects; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMultimap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.LinkedHashMultimap; +import com.google.common.collect.Multimap; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UTF16.StringComparator; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSet.SpanCondition; +import com.ibm.icu.text.UnicodeSetSpanner; +import com.ibm.icu.util.ICUException; +import com.ibm.icu.util.ICUUncheckedIOException; 
+import com.ibm.icu.util.Output; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.VersionInfo; import java.io.IOException; import java.io.PrintWriter; import java.util.Arrays; @@ -7,7 +26,6 @@ import java.util.Collections; import java.util.Comparator; import java.util.EnumMap; -import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; @@ -17,38 +35,18 @@ import java.util.Set; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.MapComparator; import org.unicode.cldr.util.MultiComparator; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.google.common.base.Objects; -import com.google.common.collect.HashMultimap; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMultimap; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.LinkedHashMultimap; -import com.google.common.collect.Multimap; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.text.Collator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UTF16.StringComparator; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSet.SpanCondition; -import com.ibm.icu.text.UnicodeSetSpanner; -import com.ibm.icu.util.ICUException; -import com.ibm.icu.util.ICUUncheckedIOException; -import com.ibm.icu.util.Output; -import com.ibm.icu.util.ULocale; -import com.ibm.icu.util.VersionInfo; - public class EmojiOrder { - private static final ImmutableList HAIR_ORDER = ImmutableList.of("🦰", "🦱", "🦳", "🦲"); - //private static final UnicodeSet MODIFIER_BASES = EmojiData.EMOJI_DATA.getModifierBases(); - private static final ConcurrentHashMap VERSION_TO_DATA = new ConcurrentHashMap<>(); + private static final ImmutableList HAIR_ORDER = + ImmutableList.of("🦰", "🦱", "🦳", "🦲"); + // 
private static final UnicodeSet MODIFIER_BASES = EmojiData.EMOJI_DATA.getModifierBases(); + private static final ConcurrentHashMap VERSION_TO_DATA = + new ConcurrentHashMap<>(); private static final boolean DEBUG = false; static final UnicodeSet DEBUG_SET = new UnicodeSet("[\u200D \\U0001F9D1 \\U0001F9AF]").freeze(); @@ -66,28 +64,37 @@ public enum MajorGroup { Flags, Other; final Set alternateInput; + MajorGroup(String... alternateInput) { this.alternateInput = ImmutableSet.copyOf(alternateInput); } - static final MajorGroup - Smileys_and_People = Smileys_and_Emotion, - Smileys = Smileys_and_Emotion, - People = People_and_Body - ; + + static final MajorGroup Smileys_and_People = Smileys_and_Emotion, + Smileys = Smileys_and_Emotion, + People = People_and_Body; + public String toString() { return name(); - // throw new ICUException("Disabling this because it is too easy to use the wrong choice."); - //return name().replace("_and_", " & ").replace('_', ' '); - }; + // throw new ICUException("Disabling this because it is too easy to use the + // wrong choice."); + // return name().replace("_and_", " & ").replace('_', ' '); + } + ; + public String toPlainString() { return name().replace("_and_", " & ").replace('_', ' '); - }; + } + ; + public String toSourceString() { return name(); - }; + } + ; + public String toHTMLString() { return toPlainString().replace("&", "&"); } + public static MajorGroup fromString(String source) { source = source.trim(); for (MajorGroup trial : values()) { @@ -96,18 +103,20 @@ public static MajorGroup fromString(String source) { } } return valueOf(source); - }; + } + ; } - //static final EmojiData emojiDataLast = EmojiData.of(Emoji.VERSION_LAST_RELEASED); - public static final StringComparator CODE_POINT_COMPARATOR = new UTF16.StringComparator(true, false, 0); + // static final EmojiData emojiDataLast = EmojiData.of(Emoji.VERSION_LAST_RELEASED); + public static final StringComparator CODE_POINT_COMPARATOR = + new UTF16.StringComparator(true, 
false, 0); static final boolean USE_ORDER = true; // static final ImmutableMap> hack = ImmutableMap.of( // "👁", ImmutableList.of("👁️‍🗨️ "), // "💏", ImmutableList.of("👩‍❤️‍💋‍👨", "👨‍❤️‍💋‍👨", "👩‍❤️‍💋‍👩"), // "💑", ImmutableList.of("👩‍❤️‍👨", "👨‍❤️‍👨", "👩‍❤️‍👩"), // "👪", ImmutableList.of( - // "👨‍👩‍👦", "👨‍👩‍👧", "👨‍👩‍👧‍👦", "👨‍👩‍👦‍👦", "👨‍👩‍👧‍👧", + // "👨‍👩‍👦", "👨‍👩‍👧", "👨‍👩‍👧‍👦", "👨‍👩‍👦‍👦", "👨‍👩‍👧‍👧", // "👨‍👨‍👦", "👨‍👨‍👧", "👨‍👨‍👧‍👦", "👨‍👨‍👦‍👦", "👨‍👨‍👧‍👧", // "👩‍👩‍👦", "👩‍👩‍👧", "👩‍👩‍👧‍👦", "👩‍👩‍👦‍👦", "👩‍👩‍👧‍👧") // ); @@ -121,25 +130,25 @@ public static MajorGroup fromString(String source) { // } // } - public static final Comparator UCA_COLLATOR = (Comparator)(Comparator)Collator.getInstance(ULocale.ROOT); + public static final Comparator UCA_COLLATOR = + (Comparator) (Comparator) Collator.getInstance(ULocale.ROOT); public static final Comparator UCA_PLUS_CODEPOINT = - new MultiComparator( - EmojiOrder.UCA_COLLATOR, - CODE_POINT_COMPARATOR); + new MultiComparator(EmojiOrder.UCA_COLLATOR, CODE_POINT_COMPARATOR); public static final EmojiOrder STD_ORDER = EmojiOrder.of(Emoji.VERSION_TO_GENERATE); public static final EmojiOrder ORDER_RELEASED = EmojiOrder.of(Emoji.VERSION_LAST_RELEASED); - public static final EmojiOrder BETA_ORDER = Emoji.IS_BETA ? EmojiOrder.of(Emoji.VERSION_BETA) : STD_ORDER; + public static final EmojiOrder BETA_ORDER = + Emoji.IS_BETA ? 
EmojiOrder.of(Emoji.VERSION_BETA) : STD_ORDER; public final EmojiData emojiData; - public final MapComparator mapCollator; - public final Comparator codepointCompare; + public final MapComparator mapCollator; + public final Comparator codepointCompare; - public final Multimap orderingToCharacters; - public final UnicodeMap charactersToOrdering = new UnicodeMap<>(); - public final UnicodeMap majorGroupings = new UnicodeMap<>(); - public final Map categoryToOrder; + public final Multimap orderingToCharacters; + public final UnicodeMap charactersToOrdering = new UnicodeMap<>(); + public final UnicodeMap majorGroupings = new UnicodeMap<>(); + public final Map categoryToOrder; private final Map categoryToMajor; private final UnicodeSet firstInLine; @@ -154,7 +163,6 @@ public MajorGroup getMajorGroupFromCategory(String group) { return categoryToMajor.get(group); } - public static EmojiOrder of(VersionInfo version) { EmojiOrder result = VERSION_TO_DATA.get(version); if (result == null) { @@ -165,17 +173,25 @@ public static EmojiOrder of(VersionInfo version) { private EmojiOrder(VersionInfo version, String file) { emojiData = EmojiData.of(version); - mapCollator = new MapComparator() - .setErrorOnMissing(false) - .setSortBeforeOthers(false) - .setDoFallback(false) - ; - Map _groupOrder = new LinkedHashMap(); + mapCollator = + new MapComparator() + .setErrorOnMissing(false) + .setSortBeforeOthers(false) + .setDoFallback(false); + Map _groupOrder = new LinkedHashMap(); Map _categoryToMajor = new LinkedHashMap<>(); StringBuilder _reformatted = new StringBuilder(); firstInLine = new UnicodeSet(); LinkedHashSet _sorted = new LinkedHashSet(); - orderingToCharacters = loadOrdering(version, file, mapCollator, _groupOrder, _categoryToMajor, _reformatted, _sorted); + orderingToCharacters = + loadOrdering( + version, + file, + mapCollator, + _groupOrder, + _categoryToMajor, + _reformatted, + _sorted); reformatted = _reformatted.toString(); sorted = ImmutableSet.copyOf(_sorted); 
firstInLine.freeze(); @@ -183,25 +199,29 @@ private EmojiOrder(VersionInfo version, String file) { majorGroupings.freeze(); categoryToOrder = Collections.unmodifiableMap(_groupOrder); categoryToMajor = Collections.unmodifiableMap(_categoryToMajor); - codepointCompare = + codepointCompare = new MultiComparator( - mapCollator, - EmojiOrder.UCA_COLLATOR, - CODE_POINT_COMPARATOR); + mapCollator, EmojiOrder.UCA_COLLATOR, CODE_POINT_COMPARATOR); // new MultiComparator( // mp, // EmojiOrder.UCA_COLLATOR, // CODE_POINT_COMPARATOR, // new UTF16.StringComparator(true,false,0)); - if(DEBUG) { + if (DEBUG) { String last = ""; for (String s : Arrays.asList("\u2017", "\u002D", "\uFF0D")) { System.out.println( - Utility.hex(last) + "/" + last - + " vs " - + Utility.hex(s) + "/" + s - + ": " + codepointCompare.compare(last, s) - + " map: " + mapCollator.getNumericOrder(s)); + Utility.hex(last) + + "/" + + last + + " vs " + + Utility.hex(s) + + "/" + + s + + ": " + + codepointCompare.compare(last, s) + + " map: " + + mapCollator.getNumericOrder(s)); last = s; } } @@ -210,30 +230,54 @@ private EmojiOrder(VersionInfo version, String file) { Set orderedStrings = new LinkedHashSet<>(mapCollator.getOrder()); check("orderedStrings, sorted", orderedStrings, sorted, true); - Set sourceData = emojiData.getAllEmojiWithDefectives().addAllTo(new LinkedHashSet<>(mapCollator.getOrder())); + Set sourceData = + emojiData + .getAllEmojiWithDefectives() + .addAllTo(new LinkedHashSet<>(mapCollator.getOrder())); check("orderedStrings vs sourceData", orderedStrings, sourceData, true); Set orderingToCharactersValues = new LinkedHashSet<>(orderingToCharacters.values()); - check("edStrings vs orderingToCharactersValues", orderedStrings, orderingToCharactersValues, true); - - Set charactersToOrderingKeys = charactersToOrdering.keySet().addAllTo(new LinkedHashSet<>()); - check("orderedStrings vs charactersToOrderingKeys", orderedStrings, charactersToOrderingKeys, false); - - check("orderedStrings vs 
majorGroupings.keySet", orderedStrings, majorGroupings.keySet().addAllTo(new LinkedHashSet<>()), false); - - check("categoryToOrder.keySet vs categoryToMajor.keySet", categoryToOrder.keySet(), categoryToMajor.keySet(), false); + check( + "edStrings vs orderingToCharactersValues", + orderedStrings, + orderingToCharactersValues, + true); + + Set charactersToOrderingKeys = + charactersToOrdering.keySet().addAllTo(new LinkedHashSet<>()); + check( + "orderedStrings vs charactersToOrderingKeys", + orderedStrings, + charactersToOrderingKeys, + false); + + check( + "orderedStrings vs majorGroupings.keySet", + orderedStrings, + majorGroupings.keySet().addAllTo(new LinkedHashSet<>()), + false); + + check( + "categoryToOrder.keySet vs categoryToMajor.keySet", + categoryToOrder.keySet(), + categoryToMajor.keySet(), + false); } - public static > void check(String title, U a, U b, boolean checkOrder) { + public static > void check( + String title, U a, U b, boolean checkOrder) { if (!Objects.equal(a, b)) { LinkedHashSet aMinusB = new LinkedHashSet<>(a); aMinusB.removeAll(b); LinkedHashSet bMinusA = new LinkedHashSet<>(b); bMinusA.removeAll(a); - if (!aMinusB.isEmpty() - || !bMinusA.isEmpty()) { - throw new ICUException((title.isEmpty() ? "" : title + ": ") - +"missmatch:\n\t(a-b)=" + aMinusB + ";\n\t(b-a)=" + bMinusA); + if (!aMinusB.isEmpty() || !bMinusA.isEmpty()) { + throw new ICUException( + (title.isEmpty() ? 
"" : title + ": ") + + "missmatch:\n\t(a-b)=" + + aMinusB + + ";\n\t(b-a)=" + + bMinusA); } } if (!checkOrder) { @@ -246,39 +290,43 @@ public static > void check(String title, U a, U b, bo T aItem = ita.next(); T bItem = itb.next(); if (!aItem.equals(bItem)) { - throw new ICUException(counter + ") ordering missmatch: a=" + aItem + "; b-a=" + bItem); + throw new ICUException( + counter + ") ordering missmatch: a=" + aItem + "; b-a=" + bItem); } ++counter; } } - private Multimap loadOrdering(VersionInfo version, String sourceFile, - MapComparator mapComparator, - Map _groupOrder, + private Multimap loadOrdering( + VersionInfo version, + String sourceFile, + MapComparator mapComparator, + Map _groupOrder, Map _categoryToMajor, - StringBuilder reformatted, + StringBuilder reformatted, LinkedHashSet sorted) { - //System.out.println(sourceFile); + // System.out.println(sourceFile); Multimap _orderingToCharacters = LinkedHashMultimap.create(); Output> lastLabel = new Output>(new TreeSet()); MajorGroup majorGroup = null; EmojiIterator ei = new EmojiIterator(emojiData, false); - final String directory = - Settings.UnicodeTools.getDataPathString( - "emoji", version.getVersionString(2, 2)) + - "/source"; + final String directory = + Settings.UnicodeTools.getDataPathString("emoji", version.getVersionString(2, 2)) + + "/source"; int lineCounter = 0; - for (String line : FileUtilities.in(EmojiOrder.class, - sourceFile)) { + for (String line : FileUtilities.in(EmojiOrder.class, sourceFile)) { ++lineCounter; - if (line.isEmpty() || line.startsWith("#") && !line.startsWith("#⃣") && !line.startsWith("#️⃣")) { + if (line.isEmpty() + || line.startsWith("#") && !line.startsWith("#⃣") && !line.startsWith("#️⃣")) { continue; } if (DEBUG) System.out.println(line); line = Emoji.UNESCAPE.transform(line); - line = line.replace(Emoji.TEXT_VARIANT_STRING, "").replace(Emoji.EMOJI_VARIANT_STRING, ""); + line = + line.replace(Emoji.TEXT_VARIANT_STRING, "") + .replace(Emoji.EMOJI_VARIANT_STRING, 
""); if (line.contains("keycap")) { int debug = 0; @@ -315,7 +363,8 @@ private Multimap loadOrdering(VersionInfo version, String source // if (major == null) { // _categoryToMajor.put(item, majorGroup); // } else if (major != majorGroup) { - // throw new IllegalArgumentException("Conflicting major categories"); + // throw new IllegalArgumentException("Conflicting major + // categories"); // } // // hack for now // if (oldLine.contains("\t")) { @@ -344,21 +393,31 @@ private Multimap loadOrdering(VersionInfo version, String source } reformatted.append(cleanString); } - //System.out.println("Adding: " + Utility.hex(string) + "\t" + string); + // System.out.println("Adding: " + Utility.hex(string) + "\t" + string); add(_orderingToCharacters, sorted, majorGroup, lastLabel, string); if (!sorted.contains(string)) { throw new ICUException(); } String handshakeVersion = EmojiData.COUPLES_TO_HANDSHAKE_VERSION.get(string); if (handshakeVersion == null) { - //Now add the modifier sequences + // Now add the modifier sequences if (string.contains("👨‍❤‍👨")) { int debug = 0; } - addAllModifiers(_orderingToCharacters, sorted, lastLabel, majorGroup, string); // , withEmojiVariant, withoutFinal, noVariant); + addAllModifiers( + _orderingToCharacters, + sorted, + lastLabel, + majorGroup, + string); // , withEmojiVariant, withoutFinal, noVariant); } else { - //Now add the modifier sequences - addAllModifiers(_orderingToCharacters, sorted, lastLabel, majorGroup, handshakeVersion); // , withEmojiVariant, withoutFinal, noVariant); + // Now add the modifier sequences + addAllModifiers( + _orderingToCharacters, + sorted, + lastLabel, + majorGroup, + handshakeVersion); // , withEmojiVariant, withoutFinal, noVariant); } // ImmutableList list = hack.get(string); @@ -366,8 +425,9 @@ private Multimap loadOrdering(VersionInfo version, String source // addVariants(result, sorted, majorGroup, lastLabel, string); // for (String string2 : list) { // //System.err.println("Adding " + show(string2)); - 
// add(result, sorted, majorGroup, lastLabel, string2); - // addVariants(result, sorted, majorGroup, lastLabel, string2); + // add(result, sorted, majorGroup, lastLabel, string2); + // addVariants(result, sorted, majorGroup, lastLabel, + // string2); // } // } @@ -375,21 +435,34 @@ private Multimap loadOrdering(VersionInfo version, String source if (emojiData.getGenderBase().contains(string) && !string.equals("👱")) { String stringWithMale = string + "\u200d\u2642"; add(_orderingToCharacters, sorted, majorGroup, lastLabel, stringWithMale); - //Now add the modifier sequences - addAllModifiers(_orderingToCharacters, sorted, lastLabel, majorGroup, stringWithMale); // , withEmojiVariant, withoutFinal, noVariant); // + Emoji.EMOJI_VARIANT_STRING + // Now add the modifier sequences + addAllModifiers( + _orderingToCharacters, + sorted, + lastLabel, + majorGroup, + stringWithMale); // , withEmojiVariant, withoutFinal, noVariant); // + + // Emoji.EMOJI_VARIANT_STRING if (!sorted.contains(stringWithMale)) { throw new ICUException(); } String stringWithFemale = string + "\u200d\u2640"; add(_orderingToCharacters, sorted, majorGroup, lastLabel, stringWithFemale); - //Now add the modifier sequences - addAllModifiers(_orderingToCharacters, sorted, lastLabel, majorGroup, stringWithFemale); // , withEmojiVariant, withoutFinal, noVariant); // + Emoji.EMOJI_VARIANT_STRING + // Now add the modifier sequences + addAllModifiers( + _orderingToCharacters, + sorted, + lastLabel, + majorGroup, + stringWithFemale); // , withEmojiVariant, withoutFinal, noVariant); // + // + Emoji.EMOJI_VARIANT_STRING if (!sorted.contains(stringWithFemale)) { throw new ICUException(); } } // // add/remove all variant strings - // if (string.contains(Emoji.JOINER_STRING) || emojiData.getKeycapBases().contains(string.charAt(0))) { + // if (string.contains(Emoji.JOINER_STRING) || + // emojiData.getKeycapBases().contains(string.charAt(0))) { // addVariants(result, sorted, majorGroup, lastLabel, string); // } } @@ 
-398,11 +471,12 @@ private Multimap loadOrdering(VersionInfo version, String source } } - if (DEBUG) for (String s : sorted) { - if (DEBUG_SET.containsAll(s)) { - System.out.println("Debug: " + Utility.hex(s) + "\t" + s); + if (DEBUG) + for (String s : sorted) { + if (DEBUG_SET.containsAll(s)) { + System.out.println("Debug: " + Utility.hex(s) + "\t" + s); + } } - } mapComparator.add(sorted); // mapComparator.setErrorOnMissing(true); mapComparator.freeze(); @@ -412,8 +486,12 @@ private Multimap loadOrdering(VersionInfo version, String source return ImmutableMultimap.copyOf(_orderingToCharacters); } - private void addAllModifiers(Multimap result, Set sorted, Output> lastLabel, MajorGroup majorGroup, - String... strings) { + private void addAllModifiers( + Multimap result, + Set sorted, + Output> lastLabel, + MajorGroup majorGroup, + String... strings) { HashSet seen = new HashSet<>(); for (String string : strings) { if (string == null || seen.contains(string)) { @@ -427,7 +505,9 @@ private void addAllModifiers(Multimap result, Set sorted if (DEBUG) { System.out.println(string + "=>" + results); } - boolean isHoldingHands = string.contains(EmojiData.ZWJ_HANDSHAKE_ZWJ) && !string.equals(EmojiData.NEUTRAL_HOLDING); + boolean isHoldingHands = + string.contains(EmojiData.ZWJ_HANDSHAKE_ZWJ) + && !string.equals(EmojiData.NEUTRAL_HOLDING); boolean isHandshake = EmojiData.EMOJI_DATA_BETA.isHandshake(string); UnicodeSet temp = isHoldingHands || isHandshake ? 
new UnicodeSet() : null; @@ -442,8 +522,9 @@ private void addAllModifiers(Multimap result, Set sorted if (DEBUG) System.out.println(oldItem + " ==> " + item); } } - if (DEBUG && (EmojiData.COUPLES.containsSome(item) - || item.contains(EmojiData.ZWJ_HANDSHAKE_ZWJ))) { + if (DEBUG + && (EmojiData.COUPLES.containsSome(item) + || item.contains(EmojiData.ZWJ_HANDSHAKE_ZWJ))) { System.out.println("**Adding: " + item); } add(result, sorted, majorGroup, lastLabel, item); @@ -451,18 +532,25 @@ private void addAllModifiers(Multimap result, Set sorted } } - private void add(Multimap _orderingToCharacters, Set sorted, MajorGroup majorGroup, Output> lastLabel, String string) { + private void add( + Multimap _orderingToCharacters, + Set sorted, + MajorGroup majorGroup, + Output> lastLabel, + String string) { if (string.contains(Emoji.EMOJI_VARIANT_STRING)) { - throw new IllegalArgumentException("String shouldn't contain variants at this point: " + string); + throw new IllegalArgumentException( + "String shouldn't contain variants at this point: " + string); } if (sorted.contains(string)) { throw new IllegalArgumentException("Duplicate entry for: " + string); - //return; // already done. - } - if (DEBUG) if (DEBUG_SET.containsAll(string)) { - System.out.println("Debug: " + Utility.hex(string) + "\t" + string); - int debug = 0; + // return; // already done. 
} + if (DEBUG) + if (DEBUG_SET.containsAll(string)) { + System.out.println("Debug: " + Utility.hex(string) + "\t" + string); + int debug = 0; + } // if (string.contains("⚕")) { // System.out.println("\t" + string); // } @@ -487,7 +575,8 @@ private void add(Multimap _orderingToCharacters, Set sor } // String full = emojiData.addEmojiVariants(string); // if (full.endsWith(Emoji.EMOJI_VARIANT_STRING)) { - // String noVariantAtEnd = full.substring(0, full.length() - Emoji.EMOJI_VARIANT_STRING.length()); + // String noVariantAtEnd = full.substring(0, full.length() - + // Emoji.EMOJI_VARIANT_STRING.length()); // sorted.add(noVariantAtEnd); // } // sorted.add(full); @@ -501,9 +590,6 @@ private void add(Multimap _orderingToCharacters, Set sor // } } - - - // private void show() { // for (Entry> labelToSet : orderingToCharacters.keyValuesSet()) { // String label = labelToSet.getKey(); @@ -535,7 +621,10 @@ public T appendCollationRules(T outText, UnicodeSet... ch for (String m : EmojiData.MODIFIERS) { outText.append(m); } - UnicodeSet hairSkin = new UnicodeSet(EmojiData.MODIFIERS).addAll(Emoji.HAIR_STYLES_WITH_JOINERS).freeze(); + UnicodeSet hairSkin = + new UnicodeSet(EmojiData.MODIFIERS) + .addAll(Emoji.HAIR_STYLES_WITH_JOINERS) + .freeze(); UnicodeSetSpanner HairSkinSpanner = new UnicodeSetSpanner(hairSkin); String lastGroup = null; @@ -551,12 +640,13 @@ public T appendCollationRules(T outText, UnicodeSet... ch int debug = 0; } String group = getCategory(s); - if (!Objects.equal(group,lastGroup)) { + if (!Objects.equal(group, lastGroup)) { needRelation = true; lastGroup = group; } - // we can skip anything that ends with skin or zwj-hair, since we make those ignorable + // we can skip anything that ends with skin or zwj-hair, since we make those + // ignorable int trimmed = hairSkin.spanBack(s, SpanCondition.SIMPLE); if (trimmed < s.length()) { continue; @@ -569,12 +659,14 @@ public T appendCollationRules(T outText, UnicodeSet... 
ch if (s.contains(explicitHair)) { forLater.add(stripFinalVariant(s)); continue; - } + } } - if (s.contains(EmojiData.ZWJ_HANDSHAKE_ZWJ) || EmojiData.HOLDING_HANDS_COMPOSITES.contains(s)) { + if (s.contains(EmojiData.ZWJ_HANDSHAKE_ZWJ) + || EmojiData.HOLDING_HANDS_COMPOSITES.contains(s)) { if (s.equals(EmojiData.NEUTRAL_HOLDING)) { addMultipersonSkinTones(outText, haveSeen, "🧑‍🤝‍🧑", "👭", "👫", "👬"); - needRelation = true; // after this, we will need the next item to have the relation + needRelation = + true; // after this, we will need the next item to have the relation } else { checkHolding.add(s); } @@ -590,7 +682,7 @@ public T appendCollationRules(T outText, UnicodeSet... ch int last = Character.codePointBefore(s, s.length()); if (EmojiData.MODIFIERS.contains(last)) { String oldS = s; - s = s.substring(0, s.length()-Character.charCount(last)); // remove last + s = s.substring(0, s.length() - Character.charCount(last)); // remove last if (temp.contains(s)) { continue; // skip if present } @@ -600,7 +692,8 @@ public T appendCollationRules(T outText, UnicodeSet... ch // at this point, the only Modifiers can be medial. if (isFirst) { if (multiCodePoint) { - throw new IllegalArgumentException("Cannot have first item with > 1 codepoint: " + s); + throw new IllegalArgumentException( + "Cannot have first item with > 1 codepoint: " + s); } outText.append("&").append(s); isFirst = false; @@ -651,7 +744,8 @@ public T appendCollationRules(T outText, UnicodeSet... ch haveSeen.add(s); // needRelation = true; // // break arbitrarily (but predictably) - // int bottomBits = s.codePointAt(0) & 0x7; + // int bottomBits = s.codePointAt(0) & + // 0x7; // needRelation = bottomBits == 0; } } @@ -671,34 +765,35 @@ public T appendCollationRules(T outText, UnicodeSet... 
ch } } // OLD Hack to sort handshake items - //addHandshakes(outText); + // addHandshakes(outText); } catch (IOException e) { - throw new ICUUncheckedIOException("Internal Error",e); + throw new ICUUncheckedIOException("Internal Error", e); } return outText; } /** - * Add all the characters, based on the bases and their order: NN, WW, WM, MM (🧑‍🤝‍🧑, 👭, 👫, 👬) - * Start with each base, and add the ZWJ (equivalents) skin on the first item. - * (Need not add for last, since the skin components have the right order.) - * Exceptions! - * When the skin tone is identical, it is applied to the combined base for WW, WM, MM. - * The easiest way to do that is to store these for later and do resets. - * TODO Generalize for other skin tones, generalizing EmojiData.COUPLES_TO_HANDSHAKE_VERSION + * Add all the characters, based on the bases and their order: NN, WW, WM, MM (🧑‍🤝‍🧑, 👭, 👫, + * 👬) Start with each base, and add the ZWJ (equivalents) skin on the first item. (Need not add + * for last, since the skin components have the right order.) Exceptions! When the skin tone is + * identical, it is applied to the combined base for WW, WM, MM. The easiest way to do that is + * to store these for later and do resets. TODO Generalize for other skin tones, generalizing + * EmojiData.COUPLES_TO_HANDSHAKE_VERSION + * * @param outText output * @param haveSeen for tracking the items added * @param bases the base characters * @throws IOException */ - private void addMultipersonSkinTones(T outText, Set haveSeen, String... bases) - throws IOException { - Map expansion = new LinkedHashMap<>(); + private void addMultipersonSkinTones( + T outText, Set haveSeen, String... bases) throws IOException { + Map expansion = new LinkedHashMap<>(); for (String base : bases) { outText.append("\n< ").append(quoteSyntax(base)); haveSeen.remove(base); boolean compositeBase = base.length() == 2; - String handshake = compositeBase ? 
EmojiData.COUPLES_TO_HANDSHAKE_VERSION.get(base) : base; + String handshake = + compositeBase ? EmojiData.COUPLES_TO_HANDSHAKE_VERSION.get(base) : base; int firstCp = handshake.codePointAt(0); String lead = UTF16.valueOf(firstCp); String trail = handshake.substring(Character.charCount(firstCp)); @@ -713,22 +808,20 @@ private void addMultipersonSkinTones(T outText, Set entry : expansion.entrySet()) { outText.append("\n& ") - .append(quoteSyntax(entry.getKey())) - .append('=') - .append(quoteSyntax(entry.getValue())) - ; + .append(quoteSyntax(entry.getKey())) + .append('=') + .append(quoteSyntax(entry.getValue())); } } // HACK the handshakes to add at end - private void addHandshakes(T outText) - throws IOException { + private void addHandshakes(T outText) throws IOException { String handshake = null; Set sorted = sort(codepointCompare, emojiData.getAllEmojiWithoutDefectives()); for (String emoji : sorted) { if (EmojiData.COUPLES.containsSome(emoji)) { handshake = emoji; // save in case we have one after - } else if (emoji.contains(EmojiData.ZWJ_HANDSHAKE_ZWJ)) { + } else if (emoji.contains(EmojiData.ZWJ_HANDSHAKE_ZWJ)) { if (handshake != null) { outText.append("\n& " + handshake); handshake = null; @@ -740,7 +833,7 @@ private void addHandshakes(T outText) private String stripFinalVariant(String s) { if (s.endsWith(Emoji.EMOJI_VARIANT_STRING)) { - s = s.substring(0,s.length()-Emoji.EMOJI_VARIANT_STRING.length()); + s = s.substring(0, s.length() - Emoji.EMOJI_VARIANT_STRING.length()); } return s; } @@ -754,14 +847,14 @@ public UnicodeSet getCharsWithCategory(String category) { } private String withoutTrailingVariant(String s) { - return s.endsWith(Emoji.EMOJI_VARIANT_STRING) ? s.substring(0, s.length()-1) : s; + return s.endsWith(Emoji.EMOJI_VARIANT_STRING) ? 
s.substring(0, s.length() - 1) : s; } - static final UnicodeSet NEEDS_QUOTE = new UnicodeSet("[[:Pattern_White_Space:][\\&\\[\\]#@!<;,=*]]").freeze(); + static final UnicodeSet NEEDS_QUOTE = + new UnicodeSet("[[:Pattern_White_Space:][\\&\\[\\]#@!<;,=*]]").freeze(); private String quoteSyntax(String source) { - return NEEDS_QUOTE.containsNone(source) ? source : - "'" + source.replace("'", "''") + "'"; + return NEEDS_QUOTE.containsNone(source) ? source : "'" + source.replace("'", "''") + "'"; } public static Set sort(Comparator comparator, UnicodeSet... characters) { @@ -788,6 +881,7 @@ public int getGroupOrder(String cat1) { public static final UnicodeSet GENDER_OBJECTS = new UnicodeSet(); public static final UnicodeSet GENDER_NEUTRALS = new UnicodeSet(); + static { for (String s : EmojiData.EMOJI_DATA.getEmojiForSortRules()) { CountEmoji.ZwjType type = CountEmoji.ZwjType.getType(s); @@ -813,7 +907,7 @@ public static void main(String[] args) { // // for (String s : EMOJI_DATA_RELEASE.getAllEmojiWithDefectives()) { // switch (CountEmoji.Category.getBucket(s)) { - // case zwj_seq_role: + // case zwj_seq_role: // int first = s.codePointAt(0); // roleBase.add(first); // temp.clear().addAll(s).remove(first) @@ -841,7 +935,8 @@ public static void main(String[] args) { // categories.add(order.getCategory(s)); // } // for (String category : categories) { - // missing.addAll(new UnicodeSet(order.getCharsWithCategory(category)).removeAll(all)); + // missing.addAll(new + // UnicodeSet(order.getCharsWithCategory(category)).removeAll(all)); // } // Set skip = new HashSet<>(missing.strings()); // missing.removeAll(skip).removeAll(EmojiData.EMOJI_DATA.getEmojiComponents()); @@ -852,9 +947,12 @@ public static void main(String[] args) { // // UnicodeMap data = new UnicodeMap<>(); // for (String s : all) { - // data.put(s, (EMOJI_DATA_RELEASE.getModifierBases().contains(s) ? "skin" : multiple.contains(s) ? 
"multiple" : "") - // + "\t" + (EMOJI_DATA_RELEASE.getExplicitGender().contains(s) ? "xgender" : EMOJI_DATA_RELEASE.getGenderBases().contains(s) ? "gender" : "") - // + "\t" + (EMOJI_DATA_RELEASE.getExplicitHair().contains(s) ? "xhair" : EMOJI_DATA_RELEASE.getHairBases().contains(s) ? "hair" : "") + // data.put(s, (EMOJI_DATA_RELEASE.getModifierBases().contains(s) ? "skin" : + // multiple.contains(s) ? "multiple" : "") + // + "\t" + (EMOJI_DATA_RELEASE.getExplicitGender().contains(s) ? + // "xgender" : EMOJI_DATA_RELEASE.getGenderBases().contains(s) ? "gender" : "") + // + "\t" + (EMOJI_DATA_RELEASE.getExplicitHair().contains(s) ? "xhair" : + // EMOJI_DATA_RELEASE.getHairBases().contains(s) ? "hair" : "") // + "\t" + (roleBase.contains(s) ? "role" : "") // ); // @@ -862,30 +960,55 @@ public static void main(String[] args) { // show(data); // System.out.println(); - // System.out.println("# Generating for repertoire of E" + Emoji.VERSION_LAST_RELEASED_STRING + // System.out.println("# Generating for repertoire of E" + + // Emoji.VERSION_LAST_RELEASED_STRING // + ", based on ordering for E" + Emoji.VERSION_BETA_STRING); - check("BETA_ORDER.sorted, BETA_ORDER.mp.getOrder", BETA_ORDER.sorted, BETA_ORDER.mapCollator.getOrder(), false); + check( + "BETA_ORDER.sorted, BETA_ORDER.mp.getOrder", + BETA_ORDER.sorted, + BETA_ORDER.mapCollator.getOrder(), + false); - Set temp = sort(BETA_ORDER.codepointCompare, BETA_ORDER.emojiData.getEmojiForSortRules(), GENDER_NEUTRALS); + Set temp = + sort( + BETA_ORDER.codepointCompare, + BETA_ORDER.emojiData.getEmojiForSortRules(), + GENDER_NEUTRALS); - //check(BETA_ORDER.sorted, temp, false); + // check(BETA_ORDER.sorted, temp, false); int counter = 0; for (String s : temp) { - System.out.println(++counter + ")\t" + BETA_ORDER.mapCollator.getNumericOrder(s) + "\t" + s + "\t" + getName(s)); + System.out.println( + ++counter + + ")\t" + + BETA_ORDER.mapCollator.getNumericOrder(s) + + "\t" + + s + + "\t" + + getName(s)); if (counter > 50) 
break; } counter = 0; for (String s : BETA_ORDER.sorted) { - System.out.println(++counter + ")\t" + BETA_ORDER.mapCollator.getNumericOrder(s) + "\t" + s + "\t" + getName(s)); + System.out.println( + ++counter + + ")\t" + + BETA_ORDER.mapCollator.getNumericOrder(s) + + "\t" + + s + + "\t" + + getName(s)); if (counter > 50) break; } - System.out.println("# START AUTOGENERATED EMOJI ORDER — " + BETA_ORDER.emojiData.getVersionString()); + System.out.println( + "# START AUTOGENERATED EMOJI ORDER — " + BETA_ORDER.emojiData.getVersionString()); StringBuilder rules = new StringBuilder(); - BETA_ORDER.appendCollationRules(rules, BETA_ORDER.emojiData.getEmojiForSortRules(), GENDER_NEUTRALS); + BETA_ORDER.appendCollationRules( + rules, BETA_ORDER.emojiData.getEmojiForSortRules(), GENDER_NEUTRALS); System.out.println(rules); System.out.println("# END AUTOGENERATED EMOJI ORDER"); @@ -895,7 +1018,6 @@ public static void main(String[] args) { out.close(); } - private static String getName(String s) { try { return EmojiData.EMOJI_DATA_BETA.getName(s); @@ -904,16 +1026,17 @@ private static String getName(String s) { } } - private static void show(UnicodeMap data) { - Set temp = sort(EmojiOrder.of(Emoji.VERSION_LAST_RELEASED).codepointCompare, data.keySet()); + Set temp = + sort(EmojiOrder.of(Emoji.VERSION_LAST_RELEASED).codepointCompare, data.keySet()); for (String s : temp) { showEmoji(data.get(s), s); } } private static void show(String title, UnicodeSet unicodeSet) { - Set temp = sort(EmojiOrder.of(Emoji.VERSION_LAST_RELEASED).codepointCompare, unicodeSet); + Set temp = + sort(EmojiOrder.of(Emoji.VERSION_LAST_RELEASED).codepointCompare, unicodeSet); for (String s : temp) { showEmoji(title, s); } @@ -926,17 +1049,16 @@ private static void showEmoji(String info, String s) { } catch (Exception e) { return; } - System.out.println("\\x{" + Utility.hex(s,2," ") + "}" - + "\t" + s - + "\t" + name - + "\t" + info - ); + System.out.println( + "\\x{" + Utility.hex(s, 2, " ") + "}" + "\t" 
+ s + "\t" + name + "\t" + info); } public void showCategories(Appendable out) { try { - Map> majorToMinorToEmoji = new EnumMap<>(MajorGroup.class); - Set sorted = sort(codepointCompare,emojiData.getAllEmojiWithoutDefectivesOrModifiers()); + Map> majorToMinorToEmoji = + new EnumMap<>(MajorGroup.class); + Set sorted = + sort(codepointCompare, emojiData.getAllEmojiWithoutDefectivesOrModifiers()); for (String emoji : sorted) { String cat = getCategory(emoji); MajorGroup major = getMajorGroupFromCategory(cat); @@ -947,17 +1069,23 @@ public void showCategories(Appendable out) { sub.put(cat, emoji); } int line = 0; - for (Entry> entry : majorToMinorToEmoji.entrySet()) { - MajorGroup major = entry.getKey(); - for (Entry> entry2 : entry.getValue().asMap().entrySet()) { + for (Entry> entry : + majorToMinorToEmoji.entrySet()) { + MajorGroup major = entry.getKey(); + for (Entry> entry2 : + entry.getValue().asMap().entrySet()) { Collection items = entry2.getValue(); - out.append(++line - + "\t" + major.toPlainString() - + "\t" + entry2.getKey() - + "\t" + items.size() - + "\t" + CollectionUtilities.join(items, " ") - + "\n" - ); + out.append( + ++line + + "\t" + + major.toPlainString() + + "\t" + + entry2.getKey() + + "\t" + + items.size() + + "\t" + + CollectionUtilities.join(items, " ") + + "\n"); } } } catch (IOException e) { @@ -969,7 +1097,6 @@ public String getReformatted() { return reformatted; } - public boolean isFirstInLine(String s) { return firstInLine.contains(s); } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiRename.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiRename.java index a9a69b9d3..b4f931835 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiRename.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiRename.java @@ -1,16 +1,14 @@ package org.unicode.tools.emoji; +import com.google.common.base.Splitter; +import com.ibm.icu.text.Transform; import java.io.File; import java.util.List; 
import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.google.common.base.Splitter; -import com.ibm.icu.text.Transform; - public class EmojiRename { static final boolean NO_ACTION = false; static final File DIR = new File(Settings.Output.GEN_DIR + "emoji_images_source"); @@ -19,15 +17,15 @@ public class EmojiRename { static final Splitter DASH = Splitter.on('-'); public static void main(String[] args) { -// renameCountryFlags(ANDROID_TRANSFORM, "android", "android_large"); -// renameCountryFlags(TWITTER_TRANSFORM, "twitter"); -// renameCountryFlags(TWITTER_TRANSFORM, "apple", "apple_large"); -// renameCountryFlags(WINDOWS_TRANSFORM, "windows", "windows_large"); + // renameCountryFlags(ANDROID_TRANSFORM, "android", "android_large"); + // renameCountryFlags(TWITTER_TRANSFORM, "twitter"); + // renameCountryFlags(TWITTER_TRANSFORM, "apple", "apple_large"); + // renameCountryFlags(WINDOWS_TRANSFORM, "windows", "windows_large"); rename("windows10", "glyph-(.*).png", "windows_$1.png"); } private static void rename(String subdir, String sourcePattern, String targetPattern) { - File fileSubdir = new File(DIR,subdir); + File fileSubdir = new File(DIR, subdir); if (!fileSubdir.exists()) { System.out.println("Skipping missing subdirectory: " + fileSubdir); return; @@ -39,21 +37,22 @@ private static void rename(String subdir, String sourcePattern, String targetPat System.out.println("Mismatch: " + fileSubdir); return; } - int cp = Integer.parseInt(m.group(1),16); + int cp = Integer.parseInt(m.group(1), 16); if (!EmojiData.EMOJI_DATA.getChars().contains(cp)) { continue; } - String newName = targetPattern.replace("$1", Utility.hex(cp,4).toLowerCase()); + String newName = targetPattern.replace("$1", Utility.hex(cp, 4).toLowerCase()); File target = new File(fileSubdir, newName); System.out.println(file.getName() + "\t=>\t" + newName); file.renameTo(target); - } + } } - private 
static void renameCountryFlags(Transform transform, String... subdirectories) { + private static void renameCountryFlags( + Transform transform, String... subdirectories) { String base = subdirectories[0]; for (String subdir : subdirectories) { - File fileSubdir = new File(DIR,subdir); + File fileSubdir = new File(DIR, subdir); if (!fileSubdir.exists()) { System.out.println("Skipping missing subdirectory: " + fileSubdir); continue; @@ -73,79 +72,81 @@ private static void renameCountryFlags(Transform transform, Strin if (emoji == null) { continue; } - String newName = base + "_" + Emoji.buildFileName(emoji,"_") + "." + suffix; - File target = new File(fileSubdir,newName); + String newName = base + "_" + Emoji.buildFileName(emoji, "_") + "." + suffix; + File target = new File(fileSubdir, newName); System.out.println(file.getName() + "\t=>\t" + target); if (!NO_ACTION) { file.renameTo(target); } - } } } + private static final Transform TWITTER_TRANSFORM = + new Transform() { + @Override + public String transform(String prefix) { + String emoji = null; + // 1f1e8-1f1f3.png + StringBuilder b = new StringBuilder(); + for (String hexes : DASH.split(prefix)) { + b.appendCodePoint(Integer.parseInt(hexes, 16)); + } + emoji = b.toString(); + if (emoji == null) { + throw new IllegalArgumentException(prefix); + } + return emoji; + } + }; - private static final Transform TWITTER_TRANSFORM = new Transform() { - @Override - public String transform(String prefix) { - String emoji = null; - // 1f1e8-1f1f3.png - StringBuilder b = new StringBuilder(); - for (String hexes : DASH.split(prefix)) { - b.appendCodePoint(Integer.parseInt(hexes,16)); - } - emoji = b.toString(); - - if (emoji == null) { - throw new IllegalArgumentException(prefix); - } - return emoji; - } - }; - - private static final Transform WINDOWS_TRANSFORM = new Transform() { - static final String WINDOWS_PREFIX = "glyph_0x"; - @Override - public String transform(String prefix) { - String emoji = null; - // 
glyph_0x1f6a0.png - if (!prefix.startsWith(WINDOWS_PREFIX)) { - throw new IllegalArgumentException("«" + prefix + "»"); - } - StringBuilder b = new StringBuilder(); - String hexes = prefix.substring(WINDOWS_PREFIX.length()); - int cp = Integer.parseInt(hexes,16); - if (!EmojiData.EMOJI_DATA.getChars().contains(cp)) { - return null; // don't change - } - b.appendCodePoint(cp); - emoji = b.toString(); - return emoji; - } - }; + private static final Transform WINDOWS_TRANSFORM = + new Transform() { + static final String WINDOWS_PREFIX = "glyph_0x"; - private static final Transform ANDROID_TRANSFORM = new Transform() { - @Override - public String transform(String prefix) { - String emoji = null; - if (prefix.length() == 2) { - emoji = Emoji.getEmojiFromRegionCode(prefix); - } else { - // emoji_u2b55 - // emoji_u1f1e8_1f1f3 - if (prefix.startsWith("emoji_u")) { + @Override + public String transform(String prefix) { + String emoji = null; + // glyph_0x1f6a0.png + if (!prefix.startsWith(WINDOWS_PREFIX)) { + throw new IllegalArgumentException("«" + prefix + "»"); + } StringBuilder b = new StringBuilder(); - for (String hexes : UNDERBAR.split(prefix.substring("emoji_u".length()))) { - b.appendCodePoint(Integer.parseInt(hexes,16)); + String hexes = prefix.substring(WINDOWS_PREFIX.length()); + int cp = Integer.parseInt(hexes, 16); + if (!EmojiData.EMOJI_DATA.getChars().contains(cp)) { + return null; // don't change } + b.appendCodePoint(cp); emoji = b.toString(); + return emoji; } - } - if (emoji == null) { - throw new IllegalArgumentException(prefix); - } - return emoji; - } - }; + }; + + private static final Transform ANDROID_TRANSFORM = + new Transform() { + @Override + public String transform(String prefix) { + String emoji = null; + if (prefix.length() == 2) { + emoji = Emoji.getEmojiFromRegionCode(prefix); + } else { + // emoji_u2b55 + // emoji_u1f1e8_1f1f3 + if (prefix.startsWith("emoji_u")) { + StringBuilder b = new StringBuilder(); + for (String hexes : + 
UNDERBAR.split(prefix.substring("emoji_u".length()))) { + b.appendCodePoint(Integer.parseInt(hexes, 16)); + } + emoji = b.toString(); + } + } + if (emoji == null) { + throw new IllegalArgumentException(prefix); + } + return emoji; + } + }; } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiStats.java b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiStats.java index 96cd897e2..b3b408dd1 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiStats.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/EmojiStats.java @@ -1,5 +1,7 @@ package org.unicode.tools.emoji; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.UnicodeSet; import java.io.File; import java.io.IOException; import java.io.PrintWriter; @@ -11,15 +13,11 @@ import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.tools.emoji.Emoji.Source; import org.unicode.tools.emoji.GenerateEmoji.Style; import org.unicode.tools.emoji.GenerateEmoji.Visibility; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.UnicodeSet; - class EmojiStats { enum Type { carriers(EmojiData.JCARRIERS), @@ -28,8 +26,8 @@ enum Type { other(UnicodeSet.EMPTY), modifierSequences(EmojiData.EMOJI_DATA.getModifierSequences()), zwjSequences(EmojiData.EMOJI_DATA.getZwjSequencesNormal()), - // standardAdditions8(nc8) - ; + // standardAdditions8(nc8) + ; final UnicodeSet items; Type(UnicodeSet _items) { @@ -58,7 +56,8 @@ static EmojiStats.Type getType(String chars) { Map _totalMissingData = new EnumMap<>(Emoji.Source.class); Map _extraData = new EnumMap<>(Emoji.Source.class); Map _allTypes = new EnumMap<>(Type.class); - Map> _data = new EnumMap<>(EmojiStats.Type.class); + Map> _data = + new EnumMap<>(EmojiStats.Type.class); for (Emoji.Source s : Emoji.Source.values()) { _totalMissingData.put(s, new UnicodeSet()); _extraData.put(s, new UnicodeSet()); @@ -82,7 +81,9 @@ static 
EmojiStats.Type getType(String chars) { } Source source; try { - source = Source.valueOf(platformString.equals("android") ? "google" : platformString); + source = + Source.valueOf( + platformString.equals("android") ? "google" : platformString); } catch (Exception e) { if (SHOW) System.out.println("Skipping directory file " + platformString); continue; @@ -141,8 +142,14 @@ static EmojiStats.Type getType(String chars) { public void write(Set platforms2) throws IOException { final boolean extraPlatforms = false; final String outFileName = "missing-emoji-list.html"; - PrintWriter out = FileUtilities.openUTF8Writer(extraPlatforms ? Emoji.INTERNAL_OUTPUT_DIR : Emoji.TR51_INTERNAL_DIR, outFileName); - PrintWriter outText = FileUtilities.openUTF8Writer(extraPlatforms ? Emoji.INTERNAL_OUTPUT_DIR : Emoji.TR51_INTERNAL_DIR, "missing-emoji-list.txt"); + PrintWriter out = + FileUtilities.openUTF8Writer( + extraPlatforms ? Emoji.INTERNAL_OUTPUT_DIR : Emoji.TR51_INTERNAL_DIR, + outFileName); + PrintWriter outText = + FileUtilities.openUTF8Writer( + extraPlatforms ? 
Emoji.INTERNAL_OUTPUT_DIR : Emoji.TR51_INTERNAL_DIR, + "missing-emoji-list.txt"); UnicodeSet jc = EmojiData.JCARRIERS; // new UnicodeSet() // .addAll(totalData.get(Source.sb)) @@ -153,34 +160,63 @@ public void write(Set platforms2) throws IOException { UnicodeSet needsVS = new UnicodeSet(); for (String s : jc) { int first = s.codePointAt(0); - if (!EmojiData.EMOJI_DATA.getEmojiWithVariants().contains(first) && textStyle.contains(first)) { + if (!EmojiData.EMOJI_DATA.getEmojiWithVariants().contains(first) + && textStyle.contains(first)) { needsVS.add(first); } } - if (SHOW) System.out.println("All Emoji\t" + EmojiData.EMOJI_DATA.getChars().toPattern(false)); + if (SHOW) + System.out.println("All Emoji\t" + EmojiData.EMOJI_DATA.getChars().toPattern(false)); if (SHOW) System.out.println("needs VS\t" + needsVS.toPattern(false)); - if (SHOW) System.out.println("gmail-jc\t" - + new UnicodeSet(totalMissingData.get(Emoji.Source.gmail)).removeAll(jc).toPattern(false)); - if (SHOW) System.out.println("jc-gmail\t" - + new UnicodeSet(jc).removeAll(totalMissingData.get(Emoji.Source.gmail)).toPattern(false)); + if (SHOW) + System.out.println( + "gmail-jc\t" + + new UnicodeSet(totalMissingData.get(Emoji.Source.gmail)) + .removeAll(jc) + .toPattern(false)); + if (SHOW) + System.out.println( + "jc-gmail\t" + + new UnicodeSet(jc) + .removeAll(totalMissingData.get(Emoji.Source.gmail)) + .toPattern(false)); for (Entry entry : totalMissingData.entrySet()) { if (SHOW) System.out.println(entry.getKey() + "\t" + entry.getValue().toPattern(false)); } - ChartUtilities.writeHeader(outFileName, out, "Missing", null, false, "

    Missing list of emoji characters.

    \n", Emoji.DATA_DIR_PRODUCTION, Emoji.TR51_HTML); + ChartUtilities.writeHeader( + outFileName, + out, + "Missing", + null, + false, + "

    Missing list of emoji characters.

    \n", + Emoji.DATA_DIR_PRODUCTION, + Emoji.TR51_HTML); out.println("
    Type" + type + " missing" + + type + + " missing
    Common" - + "common missing
    " + sectionLink + " count" - + common.size() + "
    Common" + + "common missing
    " + + sectionLink + + " count" + + common.size() + + "
    " + title + "
    "); String headerRow = ""; for (Emoji.Source type : platforms2) { - headerRow += ""; + headerRow += + ""; } headerRow += ""; for (Entry> entry : data.entrySet()) { - showDiff(out, outText, headerRow, entry.getKey().toString(), entry.getValue(), platforms2); + showDiff( + out, + outText, + headerRow, + entry.getKey().toString(), + entry.getValue(), + platforms2); } out.println("
    Type" + type + " missing" + + type + + " missing
    "); @@ -189,8 +225,13 @@ public void write(Set platforms2) throws IOException { outText.close(); } - private void showDiff(PrintWriter out, PrintWriter outText, String headerRow, final String title, - final Map values, Set platforms2) { + private void showDiff( + PrintWriter out, + PrintWriter outText, + String headerRow, + final String title, + final Map values, + Set platforms2) { // find common UnicodeSet common = null; boolean skipSeparate = true; @@ -207,7 +248,8 @@ private void showDiff(PrintWriter out, PrintWriter outText, String headerRow, fi // per source String sectionLink = ChartUtilities.getDoubleLink(title); - final GenerateEmojiData.PropPrinter propPrinter = new GenerateEmojiData.PropPrinter().set(EmojiDataSourceCombined.EMOJI_DATA); + final GenerateEmojiData.PropPrinter propPrinter = + new GenerateEmojiData.PropPrinter().set(EmojiDataSourceCombined.EMOJI_DATA); if (!skipSeparate) { out.println(headerRow); @@ -215,51 +257,96 @@ private void showDiff(PrintWriter out, PrintWriter outText, String headerRow, fi out.print("" + sectionLink + " count"); sectionLink = title; for (Emoji.Source source : platforms2) { - final UnicodeSet us = org.unicode.text.utility.Utility.ifNull(values.get(source), UnicodeSet.EMPTY); + final UnicodeSet us = + org.unicode.text.utility.Utility.ifNull( + values.get(source), UnicodeSet.EMPTY); out.print("" + (us.size() - common.size()) + ""); } out.print(""); out.print("" + title + " chars"); for (Emoji.Source source : platforms2) { - final UnicodeSet us = org.unicode.text.utility.Utility.ifNull(values.get(source), UnicodeSet.EMPTY); + final UnicodeSet us = + org.unicode.text.utility.Utility.ifNull( + values.get(source), UnicodeSet.EMPTY); final UnicodeSet missing = new UnicodeSet(us).removeAll(common); - GenerateEmoji.displayUnicodeSet(out, missing.addAllTo(new TreeSet(GenerateEmoji.EMOJI_COMPARATOR)), Style.bestImage, 0, 1, 1, "../../emoji/charts/full-emoji-list.html", "", "lchars", Visibility.external); + 
GenerateEmoji.displayUnicodeSet( + out, + missing.addAllTo(new TreeSet(GenerateEmoji.EMOJI_COMPARATOR)), + Style.bestImage, + 0, + 1, + 1, + "../../emoji/charts/full-emoji-list.html", + "", + "lchars", + Visibility.external); outText.println(source + "\t" + missing.size()); - propPrinter.show(outText, source+"", null, 14, 14, us, true, false, false); + propPrinter.show(outText, source + "", null, 14, 14, us, true, false, false); } out.print(""); } // common if (common.size() != 0) { - out.println("Common" - + "" - + "common missing"); - out.println("" + sectionLink + " count" - + "" - + common.size() + ""); + out.println( + "Common" + + "" + + "common missing"); + out.println( + "" + + sectionLink + + " count" + + "" + + common.size() + + ""); out.println("" + title + ""); - GenerateEmoji.displayUnicodeSet(out, common.addAllTo(new TreeSet(GenerateEmoji.EMOJI_COMPARATOR)), Style.bestImage, 0, platforms2.size(), 1, null, "", "lchars", Visibility.external); + GenerateEmoji.displayUnicodeSet( + out, + common.addAllTo(new TreeSet(GenerateEmoji.EMOJI_COMPARATOR)), + Style.bestImage, + 0, + platforms2.size(), + 1, + null, + "", + "lchars", + Visibility.external); out.println(""); outText.println("common \t" + common.size()); propPrinter.show(outText, "common", null, 14, 14, common, true, false, false); } } + public static void main(String[] args) { - - final UnicodeSet missingSamsung = new UnicodeSet(EmojiData.EMOJI_DATA.getAllEmojiWithoutDefectives()) - .removeAll(EmojiData.EMOJI_DATA.getModifierSequences()) - .removeAll(totalMissingData.get(Source.samsung)); - System.out.println("\nSamsung missing: " + missingSamsung.size() + "\t" + missingSamsung.toPattern(false) + "\n"); + + final UnicodeSet missingSamsung = + new UnicodeSet(EmojiData.EMOJI_DATA.getAllEmojiWithoutDefectives()) + .removeAll(EmojiData.EMOJI_DATA.getModifierSequences()) + .removeAll(totalMissingData.get(Source.samsung)); + System.out.println( + "\nSamsung missing: " + + missingSamsung.size() + + "\t" + + 
missingSamsung.toPattern(false) + + "\n"); for (String cp : missingSamsung) { show(Source.samsung, cp); } UnicodeSet samsungExtras = extraData.get(Source.samsung); - System.out.println("\nSamsung extras: " + samsungExtras.size() + "\t" + samsungExtras.toPattern(false) + "\n"); + System.out.println( + "\nSamsung extras: " + + samsungExtras.size() + + "\t" + + samsungExtras.toPattern(false) + + "\n"); for (String cp : samsungExtras) { show(Source.samsung, cp); } - + for (String cp : totalMissingData.get(Source.google)) { show(Source.google, cp); } @@ -267,9 +354,15 @@ public static void main(String[] args) { private static void show(Source source, String cp) { System.out.println( - source.getPrefix() + "_" + Emoji.buildFileName(cp, "_") + ".png" - + " ;\t" + cp - + " ;\tv" + BirthInfo.getYear(cp) - + " ;\t" + EmojiData.EMOJI_DATA.getName(cp)); + source.getPrefix() + + "_" + + Emoji.buildFileName(cp, "_") + + ".png" + + " ;\t" + + cp + + " ;\tv" + + BirthInfo.getYear(cp) + + " ;\t" + + EmojiData.EMOJI_DATA.getName(cp)); } -} \ No newline at end of file +} diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/FindExtraImages.java b/unicodetools/src/main/java/org/unicode/tools/emoji/FindExtraImages.java index 749e42681..1ab808e2f 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/FindExtraImages.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/FindExtraImages.java @@ -1,21 +1,19 @@ package org.unicode.tools.emoji; +import com.ibm.icu.lang.UCharacter; import java.io.File; import java.util.LinkedHashSet; import java.util.Set; - import org.unicode.text.utility.Utility; import org.unicode.tools.emoji.Emoji.Source; -import com.ibm.icu.lang.UCharacter; - public class FindExtraImages { public static void main(String[] args) { Set emojiFileSuffixes = new LinkedHashSet<>(); for (String emoji : EmojiData.EMOJI_DATA.getAllEmojiWithoutDefectives()) { emojiFileSuffixes.add(Emoji.buildFileName(emoji, "_")); } - //Map extraChars = + // Map 
extraChars = for (Emoji.Source source : Source.values()) { if (source == Source.charOverride || source.compareTo(Source.gmail) >= 0) { continue; @@ -31,7 +29,8 @@ public static void main(String[] args) { String otherChars = ""; String otherName = ""; if (name.startsWith(source.getPrefix() + "_") && name.endsWith(".png")) { - String remainder = name.substring(source.getPrefix().length()+1, name.length()-4); + String remainder = + name.substring(source.getPrefix().length() + 1, name.length() - 4); if (emojiFileSuffixes.contains(remainder)) { continue; } else { @@ -48,7 +47,7 @@ public static void main(String[] args) { } } missing.add(name); - System.out.println(source + "\t" +name + otherChars); + System.out.println(source + "\t" + name + otherChars); } } } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/FixEmojiText.java b/unicodetools/src/main/java/org/unicode/tools/emoji/FixEmojiText.java index 8a48535ea..3e37199b6 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/FixEmojiText.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/FixEmojiText.java @@ -1,21 +1,19 @@ package org.unicode.tools.emoji; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.util.Locale; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.draft.FileUtilities; import org.unicode.cldr.util.With; import org.unicode.text.utility.Settings; import org.unicode.text.utility.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; - public class FixEmojiText { static boolean SHOW_NAME = false; @@ -30,9 +28,8 @@ public static void main(String[] args) { } private static void process(String arg) { - try ( - PrintWriter out = FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR, "images/listing.html") - ) { + try (PrintWriter out = + 
FileUtilities.openUTF8Writer(Settings.Output.GEN_DIR, "images/listing.html")) { out.println("

    "); StringBuilder result = new StringBuilder(); System.out.println(arg); @@ -59,7 +56,7 @@ private static void process(String arg) { private static void process2(String cp, StringBuilder result) { if (EmojiData.EMOJI_DATA.getChars().contains(cp)) { String hex = Utility.hex(cp); - String ID = Utility.hex(cp,"_").toLowerCase(Locale.ENGLISH) + ".png"; + String ID = Utility.hex(cp, "_").toLowerCase(Locale.ENGLISH) + ".png"; String fileName = DATA_SOURCE + "AppleEmoji/apple_" + ID; if (!new File(fileName).exists()) { fileName = DATA_SOURCE + "country-flags/ref_" + ID; @@ -74,19 +71,15 @@ private static void process2(String cp, StringBuilder result) { result.append("\t\t\t\t\t"); } String name = UCharacter.getName(cp, " + ").toLowerCase(Locale.ENGLISH); - result - .append("")
-            .append(cp)
-            .append(""); if (SHOW_NAME) { - result.append("\t" +name + "
    "); + result.append("\t" + name + "
    "); } } else { result.append(cp); diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/GenerateAnnotations.java b/unicodetools/src/main/java/org/unicode/tools/emoji/GenerateAnnotations.java index 335cf3451..720d72f3f 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/GenerateAnnotations.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/GenerateAnnotations.java @@ -1,24 +1,21 @@ package org.unicode.tools.emoji; -import java.awt.ItemSelectable; +import com.google.common.collect.Multimap; +import com.google.common.collect.TreeMultimap; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.VersionInfo; import java.util.Collection; import java.util.Collections; import java.util.LinkedHashSet; import java.util.Map.Entry; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.util.Annotations; import org.unicode.cldr.util.Annotations.AnnotationSet; import org.unicode.text.utility.Utility; -import com.google.common.collect.Multimap; -import com.google.common.collect.TreeMultimap; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.VersionInfo; - public class GenerateAnnotations { static AnnotationSet english = Annotations.getDataSet("en"); @@ -27,25 +24,27 @@ public class GenerateAnnotations { public static void main(String[] args) { System.err.println("OLD, use GenerateCldrData"); System.exit(-1); - + EmojiDataSourceCombined betaData = new EmojiDataSourceCombined(); - //showGenderVariants(betaData); - //if (true) return; + // showGenderVariants(betaData); + // if (true) return; - //showStats(Emoji.VERSION3, Emoji.VERSION4, Emoji.VERSION5); + // showStats(Emoji.VERSION3, Emoji.VERSION4, Emoji.VERSION5); EmojiData lastData = EmojiData.of(Emoji.VERSION_LAST_RELEASED); - UnicodeSet set = new UnicodeSet() - 
.addAll(betaData.getAllEmojiWithoutDefectivesOrModifiers()) - .removeAll(lastData.getAllEmojiWithoutDefectivesOrModifiers()) - .removeAll(betaData.getTagSequences()) - .freeze(); - - UnicodeSet full = new UnicodeSet() - .addAll(betaData.getAllEmojiWithoutDefectives()) - .removeAll(lastData.getAllEmojiWithoutDefectives()) - .freeze(); + UnicodeSet set = + new UnicodeSet() + .addAll(betaData.getAllEmojiWithoutDefectivesOrModifiers()) + .removeAll(lastData.getAllEmojiWithoutDefectivesOrModifiers()) + .removeAll(betaData.getTagSequences()) + .freeze(); + + UnicodeSet full = + new UnicodeSet() + .addAll(betaData.getAllEmojiWithoutDefectives()) + .removeAll(lastData.getAllEmojiWithoutDefectives()) + .freeze(); TreeSet sorted = set.addAllTo(new TreeSet<>(EmojiOrder.STD_ORDER.codepointCompare)); UnicodeSet found = new UnicodeSet(); @@ -79,26 +78,31 @@ public static void main(String[] args) { // name = candidateData.getShorterName(source); // annotations = candidateData.getAnnotations(source); // } - System.out.println("" + CollectionUtilities.join(annotations, " | ") - + ""); - System.out.println("" + (name == null ? "???" : name) - + ""); + System.out.println( + "" + + CollectionUtilities.join(annotations, " | ") + + ""); + System.out.println( + "" + + (name == null ? "???" 
: name) + + ""); System.out.println(); found.add(source); } - System.out.println("Add to emoji list (" - + found.size() - + "): " + found.toPattern(false)); + System.out.println("Add to emoji list (" + found.size() + "): " + found.toPattern(false)); for (String s : missed) { - System.out.println("**** Fetching from candidateData, not CLDR: " + s + "\t" + Utility.hex(s)); - + System.out.println( + "**** Fetching from candidateData, not CLDR: " + s + "\t" + Utility.hex(s)); } - } - static private String MAN = UTF16.valueOf(0x1F468); - static private String ADULT = UTF16.valueOf(0x1F9D1); + private static String MAN = UTF16.valueOf(0x1F468); + private static String ADULT = UTF16.valueOf(0x1F9D1); private static void showGenderVariants(EmojiData betaData) { Multimap data = TreeMultimap.create(); @@ -111,14 +115,14 @@ private static void showGenderVariants(EmojiData betaData) { int first = s.codePointAt(0); data.put(type, UTF16.valueOf(first)); } else { - switch(type) { - case family: break; - default: - if (s.startsWith(MAN)) { - String sequence = ADULT + s.substring(MAN.length()); - data.put(type, sequence); - - } + switch (type) { + case family: + break; + default: + if (s.startsWith(MAN)) { + String sequence = ADULT + s.substring(MAN.length()); + data.put(type, sequence); + } } } } @@ -136,10 +140,9 @@ private static void showGenderVariants(EmojiData betaData) { UnicodeSet sequence = new UnicodeSet().addAll(entry.getValue()); System.out.println(type + "\t" + sequence.toPattern(false)); } - } - static private int ADULT_CP = ADULT.codePointAt(0); + private static int ADULT_CP = ADULT.codePointAt(0); private static String getName(String source) { String name = getShortName(source); @@ -149,7 +152,7 @@ private static String getName(String source) { String seq = MAN + source.substring(ADULT.length()); name = getShortName(seq); if (name != null) { - name = name.startsWith("man ") ? name.substring(4) : "??"+name; + name = name.startsWith("man ") ? name.substring(4) : "??" 
+ name; } } } diff --git a/unicodetools/src/main/java/org/unicode/tools/emoji/GenerateCldrData.java b/unicodetools/src/main/java/org/unicode/tools/emoji/GenerateCldrData.java index 837459f14..74f99bc9b 100644 --- a/unicodetools/src/main/java/org/unicode/tools/emoji/GenerateCldrData.java +++ b/unicodetools/src/main/java/org/unicode/tools/emoji/GenerateCldrData.java @@ -1,38 +1,30 @@ package org.unicode.tools.emoji; +import com.google.common.collect.BiMap; +import com.google.common.collect.HashBiMap; +import com.ibm.icu.dev.util.CollectionUtilities; +import com.ibm.icu.dev.util.UnicodeMap; +import com.ibm.icu.impl.locale.XCldrStub.ImmutableSet; +import com.ibm.icu.text.DecimalFormat; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ICUException; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; -import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; -import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.TreeSet; - import org.unicode.cldr.util.Annotations; -import org.unicode.cldr.util.CLDRPaths; -import org.unicode.cldr.util.Tabber; import org.unicode.cldr.util.Annotations.AnnotationSet; -import org.unicode.text.utility.Settings; +import org.unicode.cldr.util.Tabber; import org.unicode.text.utility.Utility; import org.unicode.tools.emoji.CountEmoji.Attribute; import org.unicode.tools.emoji.CountEmoji.Category; import org.unicode.tools.emoji.EmojiOrder.MajorGroup; -import org.unicode.tools.emoji.GenerateEmojiTestFile.Target; - -import com.google.common.collect.BiMap; -import com.google.common.collect.HashBiMap; -import com.ibm.icu.dev.util.CollectionUtilities; -import com.ibm.icu.dev.util.UnicodeMap; -import com.ibm.icu.impl.locale.XCldrStub.ImmutableSet; -import com.ibm.icu.text.DecimalFormat; -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.UTF16; -import 
com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.ICUException; public class GenerateCldrData { static final CandidateData candidateData = CandidateData.getInstance(); @@ -43,24 +35,25 @@ public static void main(String[] args) throws IOException { EmojiData EMOJI_DATA_PREVIOUS = EmojiData.EMOJI_DATA; UnicodeMap cldrEnglishData = org.unicode.cldr.util.Annotations.getData("en"); - UnicodeSet onlyNew = new UnicodeSet(betaPlusCandidateData.getAllEmojiWithoutDefectives()) - .removeAll(EMOJI_DATA_PREVIOUS.getAllEmojiWithoutDefectives()) + UnicodeSet onlyNew = + new UnicodeSet(betaPlusCandidateData.getAllEmojiWithoutDefectives()) + .removeAll(EMOJI_DATA_PREVIOUS.getAllEmojiWithoutDefectives()) // .removeAll(previousEmoji.getZwjSequencesAll()) // catch the eye // witness ; Set found = getSortedFiltered(onlyNew, candidateData.comparator, true); - - System.out.println("\n*** CLDR Instructions" - + "\n• Add to the files root.xml and en.xml, as below" - + "\n• Replace the contents of org.unicode.cldr.util.data.emoji.emoji-test.txt as below" - + "\n• Also add new images to /cldr-apps/WebContent/images/emoji/." - + " They need to have the prefix 'emoji-', like emoji_0023_20e3.png." - + " If you don't do this, you'll get failures in TestAnnotations.testEmojiImages." - + "\n• May need to change org.unicode.cldr.util.Emoji.SPECIALS to have TestAnnotations.testAnnotationPaths pass." - + " See notes below SPECIALS for instructions." - + "\n• May need to change the name composition algorithm (for new components like hair styles) and " - + "\n\tmodify the documentation of composition of names in LDML." - ); + + System.out.println( + "\n*** CLDR Instructions" + + "\n• Add to the files root.xml and en.xml, as below" + + "\n• Replace the contents of org.unicode.cldr.util.data.emoji.emoji-test.txt as below" + + "\n• Also add new images to /cldr-apps/WebContent/images/emoji/." + + " They need to have the prefix 'emoji-', like emoji_0023_20e3.png." 
+ + " If you don't do this, you'll get failures in TestAnnotations.testEmojiImages." + + "\n• May need to change org.unicode.cldr.util.Emoji.SPECIALS to have TestAnnotations.testAnnotationPaths pass." + + " See notes below SPECIALS for instructions." + + "\n• May need to change the name composition algorithm (for new components like hair styles) and " + + "\n\tmodify the documentation of composition of names in LDML."); System.out.println("\n*** Add to root.xml"); @@ -70,26 +63,37 @@ public static void main(String[] args) throws IOException { getLines(found, false, cldrEnglishData); - System.out.println("\n*** Replace contents of org.unicode.cldr.util.data.emoji.emoji-test.txt by"); + System.out.println( + "\n*** Replace contents of org.unicode.cldr.util.data.emoji.emoji-test.txt by"); // # subgroup: transport-ground // 1F682 ; fully-qualified # 🚂 locomotive - // final String OUTPUT_DIR = CLDRPaths.GEN_DIRECTORY + "cldr/emoji/" + Emoji.VERSION_BETA_STRING; + // final String OUTPUT_DIR = CLDRPaths.GEN_DIRECTORY + "cldr/emoji/" + + // Emoji.VERSION_BETA_STRING; // UnicodeSet temp = new UnicodeSet(onlyNew).addAll(found); - //GenerateEmojiTestFile.showLines(EmojiOrder.BETA_ORDER, temp, Target.propFile, OUTPUT_DIR); + // GenerateEmojiTestFile.showLines(EmojiOrder.BETA_ORDER, temp, Target.propFile, + // OUTPUT_DIR); // System.out.println(OUTPUT_DIR); getEmojiDataLines(); - - //printXML(onlyNew); + // printXML(onlyNew); } + private static void getEmojiDataLines() { - Set found = getSortedFiltered(betaPlusCandidateData.getAllEmojiWithDefectives(), candidateData.comparator, false); // releaseData, betaPlusCandidateData + Set found = + getSortedFiltered( + betaPlusCandidateData.getAllEmojiWithDefectives(), + candidateData.comparator, + false); // releaseData, betaPlusCandidateData if (false) { - Set found2 = getSortedFiltered(betaPlusCandidateData.getAllEmojiWithDefectives(), EmojiOrder.STD_ORDER.codepointCompare, false); // releaseData, betaPlusCandidateData + Set found2 = 
+ getSortedFiltered( + betaPlusCandidateData.getAllEmojiWithDefectives(), + EmojiOrder.STD_ORDER.codepointCompare, + false); // releaseData, betaPlusCandidateData ArrayList foundArray = new ArrayList<>(found); ArrayList found2Array = new ArrayList<>(found2); if (!foundArray.equals(found2Array)) { @@ -106,19 +110,24 @@ private static void getEmojiDataLines() { MajorGroup oldMajorCat = null; String oldMinorCat = null; String versionString = Emoji.VERSION_BETA.getVersionString(1, 2); - System.out.println("# Generated lines from Emoji Data v" + versionString + ", using GenerateCldrData.java"); + System.out.println( + "# Generated lines from Emoji Data v" + + versionString + + ", using GenerateCldrData.java"); Set doneAlready = new HashSet<>(); // get sizes - // 1F1F9 1F1F2 ; fully-qualified # 🇹🇲 flag: Turkmenistan + // 1F1F9 1F1F2 ; fully-qualified # 🇹🇲 flag: + // Turkmenistan int maxField1 = 0; int maxField2 = "minimally-qualified".length(); for (String s : found) { maxField1 = Math.max(maxField1, Utility.hex(s, " ").length()); } - Tabber tabber = new Tabber.MonoTabber() - .add(maxField1+1, Tabber.LEFT) - .add(maxField2+3, Tabber.LEFT); + Tabber tabber = + new Tabber.MonoTabber() + .add(maxField1 + 1, Tabber.LEFT) + .add(maxField2 + 3, Tabber.LEFT); ; Set minorSeen = new HashSet<>(); @@ -127,8 +136,12 @@ private static void getEmojiDataLines() { String minorCat = candidateData.getCategory(s); if (!minorCat.equals(oldMinorCat)) { if (minorSeen.contains(minorCat)) { - throw new ICUException("Bad ordering for " + minorCat + "\t" + s - + "\nMaybe a mismatch between candidateData.txt and emojiOrdering.txt"); + throw new ICUException( + "Bad ordering for " + + minorCat + + "\t" + + s + + "\nMaybe a mismatch between candidateData.txt and emojiOrdering.txt"); } minorSeen.add(minorCat); oldMinorCat = minorCat; @@ -142,25 +155,37 @@ private static void getEmojiDataLines() { String name = betaPlusCandidateData.getName(s); String withVS = 
betaPlusCandidateData.addEmojiVariants(s); String withVSFirst = betaPlusCandidateData.addEmojiVariants(s, Emoji.Qualified.first); - String withoutVS = withVS.replace(Emoji.EMOJI_VARIANT_STRING,""); + String withoutVS = withVS.replace(Emoji.EMOJI_VARIANT_STRING, ""); addDataLine(withVS, name, doneAlready, "fully-qualified", tabber); addDataLine(withVSFirst, name, doneAlready, "minimally-qualified", tabber); - addDataLine(withoutVS.replace(Emoji.EMOJI_VARIANT_STRING,""), name, doneAlready, "unqualified", tabber); + addDataLine( + withoutVS.replace(Emoji.EMOJI_VARIANT_STRING, ""), + name, + doneAlready, + "unqualified", + tabber); } } - private static void addDataLine(String s, String name, Set doneAlready, String classification, Tabber tabber) { + private static void addDataLine( + String s, String name, Set doneAlready, String classification, Tabber tabber) { if (!doneAlready.contains(s)) { - System.out.println(tabber.process( - Utility.hex(s, " ") + "\t; " - + classification - + "\t# " + s + " " + name) - ); + System.out.println( + tabber.process( + Utility.hex(s, " ") + + "\t; " + + classification + + "\t# " + + s + + " " + + name)); doneAlready.add(s); } } - private static Set getSortedFiltered(UnicodeSet onlyNew, Comparator comparator, boolean filtered) { + + private static Set getSortedFiltered( + UnicodeSet onlyNew, Comparator comparator, boolean filtered) { Set found = new LinkedHashSet<>(); // TODO, handle beta data also for (String s : onlyNew.addAllTo(new TreeSet<>(comparator))) { @@ -178,6 +203,7 @@ private static Set getSortedFiltered(UnicodeSet onlyNew, Comparator emojiToCode = HashBiMap.create(); + static { AnnotationSet rootAnnotations = Annotations.getDataSet("root"); for (String emoji : rootAnnotations.keySet()) { @@ -185,9 +211,10 @@ private static Set getSortedFiltered(UnicodeSet onlyNew, Comparator found, boolean isRoot, UnicodeMap cldrData) { + private static void getLines( + Set found, boolean isRoot, UnicodeMap cldrData) { int counter = 0; 
AnnotationSet english = Annotations.getDataSet("en"); - System.out.println(""); + System.out.println( + ""); Set warnings = new TreeSet<>(); for (String s : found) { @@ -217,18 +249,24 @@ private static void getLines(Set found, boolean isRoot, UnicodeMap candidateKeywords = candidateData.getAnnotations(s); Set cldrKeywords = annotations.getKeywords(); if (!Objects.equals(candidateKeywords, cldrKeywords)) { - warnings.add(rootCode + " Keywords Differ:" - + "\tcldr:\t" + cldrKeywords - + "\tcandidate:\t" + candidateKeywords - ); + warnings.add( + rootCode + + " Keywords Differ:" + + "\tcldr:\t" + + cldrKeywords + + "\tcandidate:\t" + + candidateKeywords); } } continue; @@ -257,44 +295,56 @@ private static void getLines(Set found, boolean isRoot, UnicodeMap" + keywords + "" - + " \t"); + System.out.println( + prefix + + ">" + + keywords + + "" + + " \t"); System.out.println(prefix + " type=\"tts\"" + ">" + name + ""); } if (!isRoot) { if (!warnings.isEmpty()) { - System.out.println("Differences from candidate to cldr" - + "\nJust double check these against CLDR, which may be newer."); + System.out.println( + "Differences from candidate to cldr" + + "\nJust double check these against CLDR, which may be newer."); for (String warning : warnings) { System.out.println(warning); } } } else { - UnicodeSet extraInRoot = new UnicodeSet().addAll(found) - .removeAll(betaPlusCandidateData.getAllEmojiWithoutDefectives()); + UnicodeSet extraInRoot = + new UnicodeSet() + .addAll(found) + .removeAll(betaPlusCandidateData.getAllEmojiWithoutDefectives()); System.out.println("Extra annotations in root: " + extraInRoot.toPattern(false)); } } - // private static LocaleData printXml(OldAnnotationData data, UnicodeSet missing) throws IOException { + // private static LocaleData printXml(OldAnnotationData data, UnicodeSet missing) throws + // IOException { // final boolean isEnglish = data.locale.equals(ULocale.ENGLISH); // LocaleData ld = LocaleData.getInstance(data.locale); // String 
language = data.locale.getLanguage(); // String script = data.locale.getScript(); // String territory = data.locale.getCountry(); - // try (PrintWriter outText = FileUtilities.openUTF8Writer(CLDRPaths.COMMON_DIRECTORY + "/annotations/", data.locale + ".xml")) { + // try (PrintWriter outText = FileUtilities.openUTF8Writer(CLDRPaths.COMMON_DIRECTORY + + // "/annotations/", data.locale + ".xml")) { // outText.append(DtdType.ldml.header(MethodHandles.lookup().lookupClass()) // + "\t\n" - // + "\t\t\n" + // + "\t\t\n" // + "\t\t\n" // + (script.isEmpty() ? "" : "\t\t