@@ -22,78 +22,59 @@ public static IEnumerable<object?[]> NormalizerData
2222 new LowerCaseNormalizer ( ) ,
2323 "How Are You Doing?" ,
2424 "how are you doing?" ,
25- true , // IsOneToOneMapping
26- true , // CanMapToOriginal
27- null , // NormalizedToOriginalMapping
2825 } ;
2926
3027 yield return new object ? [ ]
3128 {
3229 new UpperCaseNormalizer ( ) ,
3330 "How Are You Doing?" ,
3431 "HOW ARE YOU DOING?" ,
35- true , // IsOneToOneMapping
36- true , // CanMapToOriginal
37- null , // NormalizedToOriginalMapping
3832 } ;
3933
4034 yield return new object ? [ ]
4135 {
4236 new RemoveQuotesNormalizer ( ) ,
4337 "This is already normalized string" ,
4438 "This is already normalized string" ,
45- true , // IsOneToOneMapping
46- true , // CanMapToOriginal
47- null , // NormalizedToOriginalMapping
4839 } ;
4940
5041 yield return new object ? [ ]
5142 {
5243 new RemoveQuotesNormalizer ( ) ,
5344 "String \" to\" normalize" ,
5445 "String to normalize" ,
55- false , // IsOneToOneMapping
56- true , // CanMapToOriginal
57- new int [ ] { 0 , 1 , 2 , 3 , 4 , 5 , 6 , 8 , 9 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 } , // NormalizedToOriginalMapping
5846 } ;
5947
6048 yield return new object ? [ ]
6149 {
6250 new UnicodeNormalizer ( NormalizationForm . FormKD ) ,
6351 "\uFB01 " , // Composed form of the character 'fi' one character
6452 "fi" , // normalized in 2 characters 'f' and 'i'
65- false , // IsOneToOneMapping
66- false , // CanMapToOriginal
67- null , // NormalizedToOriginalMapping
6853 } ;
6954 }
7055 }
7156
// Data-driven theory test: executed once for each row yielded by NormalizerData
// (a normalizer instance, the input text, and the expected normalized text).
// Diff legend: lines marked '-' are the previous NormalizedString-based version being removed;
// lines marked '+' are the replacement, where Normalize returns a plain string.
7257 [ Theory ]
7358 [ MemberData ( nameof ( NormalizerData ) ) ]
74- public void TestNormalizer ( Normalizer normalizer , string sentence , string normalized , bool isOneToOneMapping , bool canMapToOriginal , int [ ] normalizedToOriginalMapping )
59+ public void TestNormalizer ( Normalizer normalizer , string text , string normalized )
7560 {
// Removed: the old version also asserted the mapping metadata carried by NormalizedString
// (IsOneToOneMapping, CanMapToOriginal, NormalizedToOriginalMapping).
76- NormalizedString ns = normalizer . Normalize ( sentence ) ;
77- Assert . Equal ( normalized , ns . Normalized ) ;
78- Assert . Equal ( isOneToOneMapping , ns . IsOneToOneMapping ) ;
79- Assert . Equal ( canMapToOriginal , ns . CanMapToOriginal ) ;
80- Assert . Equal ( normalizedToOriginalMapping , ns . NormalizedToOriginalMapping ) ;
// Direct check: calling the normalizer on its own must produce the expected string.
61+ string normalizedText = normalizer . Normalize ( text ) ;
62+ Assert . Equal ( normalized , normalizedText ) ;
8163
// End-to-end check: the same normalizer is passed to a Tokenizer built from
// BpeTests.CreateEmptyBpe() and WhiteSpace.Instance; the encoding result must expose
// both the untouched input (OriginalString) and the normalized text (NormalizedString).
8264 Tokenizer tokenizer = new Tokenizer ( BpeTests . CreateEmptyBpe ( ) , WhiteSpace . Instance , normalizer ) ;
83- EncodingResult encoding = tokenizer . Encode ( sentence ) ;
84- Assert . Equal ( canMapToOriginal , encoding . OffsetsMappedToOriginalString ) ;
85- Assert . Equal ( sentence , encoding . OriginalString ) ;
65+ EncodingResult encoding = tokenizer . Encode ( text ) ;
66+ Assert . Equal ( text , encoding . OriginalString ) ;
8667 Assert . Equal ( normalized , encoding . NormalizedString ) ;
8768 }
8869
8970 public class RemoveQuotesNormalizer : Normalizer
9071 {
91- public override NormalizedString Normalize ( string original )
72+ public override string Normalize ( string original )
9273 {
9374 int index = original . IndexOf ( '"' ) ;
9475 if ( index <= 0 )
9576 {
96- return new NormalizedString ( original , original , null , true ) ;
77+ return original ;
9778 }
9879
9980 StringBuilder sb = new StringBuilder ( original . Length ) ;
@@ -128,7 +109,7 @@ public override NormalizedString Normalize(string original)
128109 }
129110 } while ( true ) ;
130111
131- return new NormalizedString ( original , sb . ToString ( ) , mapping . ToArray ( ) , false ) ;
112+ return sb . ToString ( ) ;
132113 }
133114 }
134115
@@ -140,14 +121,14 @@ public UnicodeNormalizer(NormalizationForm form)
140121 _normalizationForm = form ;
141122 }
142123
// Normalizes the input using the Unicode normalization form stored in _normalizationForm.
// Diff legend: '-' lines are the removed NormalizedString-returning version; '+' lines are
// the replacement that returns the normalized text directly as a string.
// Returns string.Empty when the input is null or empty (so a null input does not come back
// as null); otherwise returns original.Normalize(_normalizationForm).
143- public override NormalizedString Normalize ( string original )
124+ public override string Normalize ( string original )
144125 {
// Guard: null/empty input short-circuits before string.Normalize is called.
145126 if ( string . IsNullOrEmpty ( original ) )
146127 {
147- return new NormalizedString ( original , "" , null , true ) ;
128+ return string . Empty ;
148129 }
149130
150- return new NormalizedString ( original , original . Normalize ( _normalizationForm ) , null , false ) ;
131+ return original . Normalize ( _normalizationForm ) ;
151132 }
152133 }
153134 }
0 commit comments