Commit d48b32d

Cleanup normalization support
1 parent 62334c6 commit d48b32d

File tree

src/Microsoft.ML.Tokenizers/Normalizer/LowerCaseNormalizer.cs
src/Microsoft.ML.Tokenizers/Normalizer/NormalizedString.cs
src/Microsoft.ML.Tokenizers/Normalizer/Normalizer.cs
src/Microsoft.ML.Tokenizers/Normalizer/UpperCaseNormalizer.cs
src/Microsoft.ML.Tokenizers/Tokenizer.cs
test/Microsoft.ML.Tokenizers.Tests/NormalizerTests.cs

6 files changed: 23 additions and 142 deletions

src/Microsoft.ML.Tokenizers/Normalizer/LowerCaseNormalizer.cs

Lines changed: 1 addition & 1 deletion

@@ -21,6 +21,6 @@ public LowerCaseNormalizer() { }
         /// </summary>
         /// <param name="original">The original string to normalize to lowercase form.</param>
         /// <returns>The lower-cased normalized string.</returns>
-        public override NormalizedString Normalize(string original) => new NormalizedString(original, original.ToLowerInvariant(), normalizedToOriginalMapping: null, isOneToOneMapping: true);
+        public override string Normalize(string original) => original.ToLowerInvariant();
     }
 }
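
With this change, a built-in normalizer returns the normalized string directly instead of a NormalizedString wrapper. A minimal usage sketch of the updated signature; the console program around the call is illustrative, only the normalizer API comes from this diff:

    using Microsoft.ML.Tokenizers;

    class LowerCaseSample
    {
        static void Main()
        {
            // Normalize now returns the lower-cased string itself; there is no
            // NormalizedString instance or offset mapping to unpack.
            Normalizer normalizer = new LowerCaseNormalizer();
            string normalized = normalizer.Normalize("How Are You Doing?");
            System.Console.WriteLine(normalized); // prints: how are you doing?
        }
    }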

src/Microsoft.ML.Tokenizers/Normalizer/NormalizedString.cs

Lines changed: 0 additions & 64 deletions
This file was deleted.

src/Microsoft.ML.Tokenizers/Normalizer/Normalizer.cs

Lines changed: 2 additions & 2 deletions

@@ -15,7 +15,7 @@ public abstract class Normalizer
         /// Process the original string to modify it and obtain a normalized string.
         /// </summary>
         /// <param name="original">The original string to normalize.</param>
-        /// <returns>The normalized string along with the mapping to the original string.</returns>
-        public abstract NormalizedString Normalize(string original);
+        /// <returns>The normalized string.</returns>
+        public abstract string Normalize(string original);
     }
 }
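
Under the simplified abstract contract, a custom normalizer only has to produce the transformed string. A sketch of a hypothetical whitespace-collapsing normalizer written against the new base class; the class name and regex choice are illustrative, not part of this commit:

    using System.Text.RegularExpressions;
    using Microsoft.ML.Tokenizers;

    // Hypothetical example: collapse runs of whitespace into a single space.
    public sealed class CollapseWhitespaceNormalizer : Normalizer
    {
        // The override now returns a plain string; no normalized-to-original
        // mapping or one-to-one flag needs to be reported.
        public override string Normalize(string original) =>
            Regex.Replace(original, @"\s+", " ");
    }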

src/Microsoft.ML.Tokenizers/Normalizer/UpperCaseNormalizer.cs

Lines changed: 1 addition & 1 deletion

@@ -21,6 +21,6 @@ public UpperCaseNormalizer() { }
         /// </summary>
         /// <param name="original">The original string to normalize to uppercase form.</param>
         /// <returns>The upper-cased normalized string.</returns>
-        public override NormalizedString Normalize(string original) => new NormalizedString(original, original.ToUpperInvariant(), normalizedToOriginalMapping: null, isOneToOneMapping: true);
+        public override string Normalize(string original) => original.ToUpperInvariant();
     }
 }
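
The same simplification applies on the consuming side. A before/after sketch based only on the signatures visible in this diff; the sample program itself is illustrative:

    using Microsoft.ML.Tokenizers;

    class UpperCaseSample
    {
        static void Main()
        {
            UpperCaseNormalizer normalizer = new UpperCaseNormalizer();

            // Before this commit (removed API):
            //   NormalizedString ns = normalizer.Normalize("How Are You Doing?");
            //   string normalized = ns.Normalized;

            // After this commit, the string is returned directly.
            string normalized = normalizer.Normalize("How Are You Doing?");
            System.Console.WriteLine(normalized); // prints: HOW ARE YOU DOING?
        }
    }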

src/Microsoft.ML.Tokenizers/Tokenizer.cs

Lines changed: 8 additions & 44 deletions

@@ -67,56 +67,20 @@ public EncodingResult Encode(string text, bool considerSpecialTokens = true)
                 throw new ArgumentNullException(nameof(text));
             }
 
-            string normalized;
-            NormalizedString normalizedString = default;
-
+            string normalized = Normalizer is null ? text : Normalizer.Normalize(text);
             bool offsetsMappedToOriginal = true;
-            if (Normalizer is not null)
-            {
-                normalizedString = Normalizer.Normalize(text);
-                normalized = normalizedString.Normalized;
-
-                offsetsMappedToOriginal = normalizedString.CanMapToOriginal;
-            }
-            else
-            {
-                normalized = text;
-            }
 
             EncodingResult encoding = new(text, normalized, PreTokenizer.PreTokenize(normalized, considerSpecialTokens), offsetsMappedToOriginal);
 
-            if (Normalizer is null || !normalizedString.CanMapToOriginal || normalizedString.IsOneToOneMapping)
+            foreach (Split split in encoding.Splits)
             {
-                // Optimize the case we don't have to map the offsets.
-                foreach (Split split in encoding.Splits)
+                IReadOnlyList<Token> tokens = Model.Encode(split.TokenString, split.IsSpecialToken);
+                foreach (Token token in tokens)
                 {
-                    IReadOnlyList<Token> tokens = Model.Encode(split.TokenString, split.IsSpecialToken);
-                    foreach (Token token in tokens)
-                    {
-                        token.Offset = (token.Offset.Index + split.Offset.Index, token.Offset.Length);
-                    }
-
-                    encoding.AddTokens(tokens);
+                    token.Offset = (token.Offset.Index + split.Offset.Index, token.Offset.Length);
                 }
-            }
-            else
-            {
-                Debug.Assert(normalizedString.NormalizedToOriginalMapping is not null);
-
-                foreach (Split split in encoding.Splits)
-                {
-                    IReadOnlyList<Token> tokens = Model.Encode(split.TokenString, split.IsSpecialToken);
-                    foreach (Token token in tokens)
-                    {
-                        int index = normalizedString.NormalizedToOriginalMapping![token.Offset.Index + split.Offset.Index];
 
-                        Debug.Assert(index >= 0);
-
-                        token.Offset = (index, token.Offset.Length);
-                    }
-
-                    encoding.AddTokens(tokens);
-                }
+                encoding.AddTokens(tokens);
             }
 
             return encoding;

@@ -135,7 +99,7 @@ public IReadOnlyList<int> EncodeToIds(string text, bool considerSpecialTokens =
                 throw new ArgumentNullException(nameof(text));
             }
 
-            string normalized = Normalizer is not null ? Normalizer.Normalize(text).Normalized : text;
+            string normalized = Normalizer is not null ? Normalizer.Normalize(text) : text;
             List<int> idsList = new();
 
             foreach (Split split in PreTokenizer.PreTokenize(normalized, considerSpecialTokens))

@@ -161,7 +125,7 @@ public int CountTokens(string text, bool considerSpecialTokens = true)
                 throw new ArgumentNullException(nameof(text));
            }
 
-            string normalized = Normalizer is not null ? Normalizer.Normalize(text).Normalized : text;
+            string normalized = Normalizer is not null ? Normalizer.Normalize(text) : text;
 
             int idsCount = 0;
             foreach (Split split in PreTokenizer.PreTokenize(normalized, considerSpecialTokens))
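
Since normalization no longer carries an offset mapping, Encode, EncodeToIds, and CountTokens all normalize the input once up front and then operate on the normalized text. A hedged usage sketch: only the Tokenizer constructor shape used in the tests below and the method names in this diff are taken from the source, and the Model instance is left abstract because its construction is not shown in this commit:

    using System.Collections.Generic;
    using Microsoft.ML.Tokenizers;

    static class EncodeSample
    {
        // 'model' stands in for any Model implementation (e.g. a Bpe instance);
        // how it is created is outside the scope of this commit.
        static void Run(Model model)
        {
            Tokenizer tokenizer = new Tokenizer(model, WhiteSpace.Instance, new LowerCaseNormalizer());

            // Encode normalizes the text first ("Hello World" -> "hello world"),
            // pre-tokenizes the normalized text, then encodes each split with the model.
            EncodingResult encoding = tokenizer.Encode("Hello World");
            System.Console.WriteLine(encoding.NormalizedString);

            // EncodeToIds and CountTokens follow the same normalize-then-split path.
            IReadOnlyList<int> ids = tokenizer.EncodeToIds("Hello World");
            int count = tokenizer.CountTokens("Hello World");
            System.Console.WriteLine($"{ids.Count} ids, {count} tokens");
        }
    }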

test/Microsoft.ML.Tokenizers.Tests/NormalizerTests.cs

Lines changed: 11 additions & 30 deletions

@@ -22,78 +22,59 @@ public static IEnumerable<object?[]> NormalizerData
                     new LowerCaseNormalizer(),
                     "How Are You Doing?",
                     "how are you doing?",
-                    true, // IsOneToOneMapping
-                    true, // CanMapToOriginal
-                    null, // NormalizedToOriginalMapping
                 };
 
                 yield return new object?[]
                 {
                     new UpperCaseNormalizer(),
                     "How Are You Doing?",
                     "HOW ARE YOU DOING?",
-                    true, // IsOneToOneMapping
-                    true, // CanMapToOriginal
-                    null, // NormalizedToOriginalMapping
                 };
 
                 yield return new object?[]
                 {
                     new RemoveQuotesNormalizer(),
                     "This is already normalized string",
                     "This is already normalized string",
-                    true, // IsOneToOneMapping
-                    true, // CanMapToOriginal
-                    null, // NormalizedToOriginalMapping
                 };
 
                 yield return new object?[]
                 {
                     new RemoveQuotesNormalizer(),
                     "String \"to\" normalize",
                     "String to normalize",
-                    false, // IsOneToOneMapping
-                    true, // CanMapToOriginal
-                    new int[] { 0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 }, // NormalizedToOriginalMapping
                 };
 
                 yield return new object?[]
                 {
                     new UnicodeNormalizer(NormalizationForm.FormKD),
                     "\uFB01", // Composed form of the character 'fi' one character
                     "fi", // normalized in 2 characters 'f' and 'i'
-                    false, // IsOneToOneMapping
-                    false, // CanMapToOriginal
-                    null, // NormalizedToOriginalMapping
                 };
             }
         }
 
         [Theory]
         [MemberData(nameof(NormalizerData))]
-        public void TestNormalizer(Normalizer normalizer, string sentence, string normalized, bool isOneToOneMapping, bool canMapToOriginal, int[] normalizedToOriginalMapping)
+        public void TestNormalizer(Normalizer normalizer, string text, string normalized)
         {
-            NormalizedString ns = normalizer.Normalize(sentence);
-            Assert.Equal(normalized, ns.Normalized);
-            Assert.Equal(isOneToOneMapping, ns.IsOneToOneMapping);
-            Assert.Equal(canMapToOriginal, ns.CanMapToOriginal);
-            Assert.Equal(normalizedToOriginalMapping, ns.NormalizedToOriginalMapping);
+            string normalizedText = normalizer.Normalize(text);
+            Assert.Equal(normalized, normalizedText);
 
             Tokenizer tokenizer = new Tokenizer(BpeTests.CreateEmptyBpe(), WhiteSpace.Instance, normalizer);
-            EncodingResult encoding = tokenizer.Encode(sentence);
-            Assert.Equal(canMapToOriginal, encoding.OffsetsMappedToOriginalString);
-            Assert.Equal(sentence, encoding.OriginalString);
+            EncodingResult encoding = tokenizer.Encode(text);
+            Assert.Equal(text, encoding.OriginalString);
             Assert.Equal(normalized, encoding.NormalizedString);
         }
 
         public class RemoveQuotesNormalizer : Normalizer
         {
-            public override NormalizedString Normalize(string original)
+            public override string Normalize(string original)
             {
                 int index = original.IndexOf('"');
                 if (index <= 0)
                 {
-                    return new NormalizedString(original, original, null, true);
+                    return original;
                 }
 
                 StringBuilder sb = new StringBuilder(original.Length);

@@ -128,7 +109,7 @@ public override NormalizedString Normalize(string original)
                     }
                 } while (true);
 
-                return new NormalizedString(original, sb.ToString(), mapping.ToArray(), false);
+                return sb.ToString();
             }
         }
 
@@ -140,14 +121,14 @@ public UnicodeNormalizer(NormalizationForm form)
                 _normalizationForm = form;
             }
 
-            public override NormalizedString Normalize(string original)
+            public override string Normalize(string original)
             {
                 if (string.IsNullOrEmpty(original))
                 {
-                    return new NormalizedString(original, "", null, true);
+                    return string.Empty;
                 }
 
-                return new NormalizedString(original, original.Normalize(_normalizationForm), null, false);
+                return original.Normalize(_normalizationForm);
             }
         }
     }
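
With the mapping columns removed, each NormalizerData entry now pairs a normalizer with an input string and the expected normalized output. A sketch of how an additional case could be appended to the data, following the pattern above; the specific strings are illustrative:

    // Hypothetical extra entry: { normalizer, original text, expected normalized text }.
    yield return new object?[]
    {
        new UpperCaseNormalizer(),
        "mixed CASE text",
        "MIXED CASE TEXT",
    };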
