Skip to content

Commit

Permalink
Remove duplicates and fix offsets after every ICU transliteration invocation
Browse files Browse the repository at this point in the history
  • Loading branch information
billdueber committed Mar 20, 2024
1 parent 80b0c86 commit 1c86152
Showing 1 changed file with 17 additions and 3 deletions.
20 changes: 17 additions & 3 deletions biblio/conf/managed-schema
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@
'>
<!-- Normalization and case folding -->

<!-- pre/post_tokenization_cjk take whole cloth from Stanford
<!-- pre/post_tokenization_cjk taken whole cloth from Stanford
via https://github.com/sul-dlss/CJKFilterUtils
The pre-tokenization step turns out to be horrendously expensive
Expand Down Expand Up @@ -234,15 +234,28 @@
-->

<!-- post_tokenization_cjk: filter chain that, going by its name, is meant
     to follow the tokenizer when handling CJK text. Filters appear in this
     order: CJKWidthFilterFactory, ICU transform Traditional-Simplified,
     FixBrokenOffsetsFilterFactory, RemoveDuplicatesTokenFilterFactory,
     ICU transform Katakana-Hiragana, CJKBigramFilterFactory (han, hiragana,
     katakana and hangul enabled, outputUnigrams on),
     FixBrokenOffsetsFilterFactory, RemoveDuplicatesTokenFilterFactory.
     NOTE(review): as displayed, a comment opener precedes the first
     CJKWidthFilterFactory line and a comment closer precedes the final
     RemoveDuplicatesTokenFilterFactory line. Read literally, that disables
     every filter in between. The first filter line appears twice (with and
     without the opener), which suggests the opener and closer are removed
     lines of a diff. Confirm against the committed file. -->
<!ENTITY post_tokenization_cjk '
<!-- <filter class="solr.CJKWidthFilterFactory"/>
<filter class="solr.CJKWidthFilterFactory"/>
<filter class="solr.ICUTransformFilterFactory" id="Traditional-Simplified"/>
<filter class="solr.FixBrokenOffsetsFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
<filter class="solr.ICUTransformFilterFactory" id="Katakana-Hiragana"/>
<filter class="solr.CJKBigramFilterFactory" han="true" hiragana="true"
katakana="true" hangul="true" outputUnigrams="true" />
<filter class="solr.FixBrokenOffsetsFilterFactory"/>
-->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>


'>

<!-- post_tokenization_cjk_no_bigrams: variant of the CJK post-tokenization
     chain that lists no CJKBigramFilterFactory. The entire value is wrapped
     in a single XML comment, so the entity currently expands to no active
     filters. The remove_duplicates_at_same_position entity reference sits
     inside that comment and is therefore not expanded either.
     NOTE(review): remove_duplicates_at_same_position is not declared in the
     visible part of this file. Confirm it exists before enabling this
     chain. -->
<!ENTITY post_tokenization_cjk_no_bigrams '
<!-- <filter class="solr.CJKWidthFilterFactory"/>
<filter class="solr.ICUTransformFilterFactory" id="Traditional-Simplified"/>
<filter class="solr.ICUTransformFilterFactory" id="Katakana-Hiragana"/>
<filter class="solr.FixBrokenOffsetsFilterFactory"/>
&remove_duplicates_at_same_position;
-->
'>

<!ENTITY pre_tokenization_case_folding '
<charFilter class="solr.ICUNormalizer2CharFilterFactory"/>
Expand Down Expand Up @@ -554,6 +567,7 @@
<analyzer>
&tokenize_into_one_big_token;
&icu_case_folding_and_normalization;
&post_tokenization_cjk;
<filter class="solr.PatternReplaceFilterFactory"
pattern="^\p{Z}*the\p{Z}+" replacement=""
replace="all"
Expand Down

0 comments on commit 1c86152

Please sign in to comment.