From 68656e18c049d77a383f474f1a9dc57739afffc7 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Fri, 27 Sep 2024 16:16:08 -0700 Subject: [PATCH 1/6] Move the Tokenizer's data into separate packages. --- Microsoft.ML.sln | 66 ++++++++++++ eng/TokenizerData.targets | 80 ++++++++++++++ .../Cl100kBaseTokenizerData.cs | 16 +++ .../Data/cl100k_base.tiktoken | 0 ...osoft.ML.Tokenizers.Data.Cl100kBase.csproj | 27 +++++ .../PACKAGE.md | 38 +++++++ .../Data/gpt2.tiktoken | 0 .../Gpt2TokenizerData.cs | 16 +++ .../Microsoft.ML.Tokenizers.Data.Gpt2.csproj | 27 +++++ .../PACKAGE.md | 29 +++++ .../Data/o200k_base.tiktoken | 0 ...rosoft.ML.Tokenizers.Data.O200kBase.csproj | 27 +++++ .../O200kBaseTokenizerData.cs | 16 +++ .../PACKAGE.md | 29 +++++ .../Data/p50k_base.tiktoken | 0 ...crosoft.ML.Tokenizers.Data.P50kBase.csproj | 27 +++++ .../P50kBaseTokenizerData.cs | 16 +++ .../PACKAGE.md | 37 +++++++ .../Data/r50k_base.tiktoken | 0 ...crosoft.ML.Tokenizers.Data.R50kBase.csproj | 27 +++++ .../PACKAGE.md | 47 ++++++++ .../R50kBaseTokenizerData.cs | 16 +++ .../Microsoft.ML.Tokenizers.csproj | 102 ------------------ .../Model/TiktokenTokenizer.cs | 44 +++++--- .../Microsoft.ML.Tokenizers.Data.Tests.csproj | 22 ++++ .../TokenizerDataTests.cs | 63 +++++++++++ .../Microsoft.ML.Tokenizers.Tests.csproj | 7 ++ .../{TitokenTests.cs => TiktokenTests.cs} | 2 +- test/Microsoft.ML.Tokenizers.Tests/Utils.cs | 2 +- 29 files changed, 667 insertions(+), 116 deletions(-) create mode 100644 eng/TokenizerData.targets create mode 100644 src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.Cl100kBase}/Data/cl100k_base.tiktoken (100%) create mode 100644 src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj create mode 100644 src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.Gpt2}/Data/gpt2.tiktoken (100%) create mode 100644 src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs create mode 100644 src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj create mode 100644 src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.O200kBase}/Data/o200k_base.tiktoken (100%) create mode 100644 src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj create mode 100644 src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs create mode 100644 src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.P50kBase}/Data/p50k_base.tiktoken (100%) create mode 100644 src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj create mode 100644 src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs create mode 100644 src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.R50kBase}/Data/r50k_base.tiktoken (100%) create mode 100644 src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj create mode 100644 src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md create mode 100644 src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs create mode 100644 test/Microsoft.ML.Tokenizers.Data.Tests/Microsoft.ML.Tokenizers.Data.Tests.csproj create mode 100644 test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs 
rename test/Microsoft.ML.Tokenizers.Tests/{TitokenTests.cs => TiktokenTests.cs} (99%) diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln index 00635886a1..d57cc442bd 100644 --- a/Microsoft.ML.sln +++ b/Microsoft.ML.sln @@ -194,6 +194,18 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral.Tests", "test\Microsoft.ML.GenAI.Mistral.Tests\Microsoft.ML.GenAI.Mistral.Tests.csproj", "{49264202-C90A-43F6-8C30-BDAEF2F1465A}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Cl100kBase", "src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj", "{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Gpt2", "src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj", "{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.O200kBase", "src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj", "{D02DB243-5B96-4652-B172-35F18230434D}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.P50kBase", "src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj", "{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.R50kBase", "src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj", "{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Tests", "test\Microsoft.ML.Tokenizers.Data.Tests\Microsoft.ML.Tokenizers.Data.Tests.csproj", "{2E6055A1-3FC1-418E-9B3E-9C6255649F42}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -918,6 +930,54 @@ Global {49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|Any CPU.Build.0 = Release|Any CPU {49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.ActiveCfg = Release|Any CPU {49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.Build.0 = Release|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.Build.0 = Debug|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.ActiveCfg = Debug|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.Build.0 = Debug|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.ActiveCfg = Release|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.Build.0 = Release|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.ActiveCfg = Release|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.Build.0 = Release|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.Build.0 = Debug|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.ActiveCfg = Debug|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.Build.0 = Debug|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.ActiveCfg = Release|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.Build.0 = Release|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.ActiveCfg = Release|Any CPU + 
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.Build.0 = Release|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.ActiveCfg = Debug|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.Build.0 = Debug|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.Build.0 = Release|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.ActiveCfg = Release|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.Build.0 = Release|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.Build.0 = Debug|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.ActiveCfg = Debug|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.Build.0 = Debug|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.ActiveCfg = Release|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.Build.0 = Release|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.ActiveCfg = Release|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.Build.0 = Release|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.Build.0 = Debug|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.ActiveCfg = Debug|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.Build.0 = Debug|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.ActiveCfg = Release|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.Build.0 = Release|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.ActiveCfg = Release|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.Build.0 = Release|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.Build.0 = Debug|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.ActiveCfg = Debug|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.Build.0 = Debug|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.ActiveCfg = Release|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.Build.0 = Release|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.ActiveCfg = Release|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -1013,6 +1073,12 @@ Global {D202353D-6FAF-4263-9A01-BDCFBC92391F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} {2729CC66-7743-442B-B3A5-1F4F27F044A5} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {49264202-C90A-43F6-8C30-BDAEF2F1465A} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {D02DB243-5B96-4652-B172-35F18230434D} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {2E6055A1-3FC1-418E-9B3E-9C6255649F42} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} EndGlobalSection GlobalSection(ExtensibilityGlobals) = 
postSolution SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D} diff --git a/eng/TokenizerData.targets b/eng/TokenizerData.targets new file mode 100644 index 0000000000..e80b6bf5de --- /dev/null +++ b/eng/TokenizerData.targets @@ -0,0 +1,80 @@ + + + + + + + + + + + = 0) + { + eolIndex++; + capacity++; + } + else + { + break; + } + } while (eolIndex < fileContent.Length); + + using var sourceStream = File.OpenRead(fileName); + using var reader = new StreamReader(sourceStream); + using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal); + using var streamWriter = new StreamWriter(destStream); + + streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}"); + + string line; + int destLineNumber = 0; + + while ((line = reader.ReadLine()) != null) + { + if (line.Length == 0) { continue; } + int index = line.IndexOf(' '); + + if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber) + { + Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}"); + break; + } + + while (destLineNumber < id) + { + // ensure id always aligns with the line number + streamWriter.WriteLine(); + destLineNumber++; + } + + streamWriter.WriteLine(line.Substring(0, index)); + destLineNumber++; + } + } + ]]> + + + + + + + + + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs new file mode 100644 index 0000000000..4995edafa6 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.ComponentModel; + +namespace Microsoft.ML.Tokenizers +{ + /// + /// Cl100kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the cl100k_base.tiktoken data file. + /// + [EditorBrowsable(EditorBrowsableState.Never)] + public sealed class Cl100kBaseTokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers/Data/cl100k_base.tiktoken b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Data/cl100k_base.tiktoken similarity index 100% rename from src/Microsoft.ML.Tokenizers/Data/cl100k_base.tiktoken rename to src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Data/cl100k_base.tiktoken diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj new file mode 100644 index 0000000000..f1b5183495 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj @@ -0,0 +1,27 @@ + + + + netstandard2.0 + enable + true + The Microsoft.ML.Tokenizers.Data.Cl100kBase class includes the Tiktoken tokenizer data file cl100k_base.tiktoken, which is utilized by models such as GPT-4. 
+ + + + + + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md new file mode 100644 index 0000000000..bf390cd1de --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md @@ -0,0 +1,38 @@ +## About + +The `Microsoft.ML.Tokenizers.Data.Cl100kBase` includes the Tiktoken tokenizer data file `cl100k_base.tiktoken`, which is utilized by models such as GPT-4. + +## Key Features + +* This package mainly contains the cl100k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the following models: + 1. gpt-4 + 2. gpt-3.5-turbo + 3. gpt-3.5-turbo-16k + 4. gpt-35 + 5. gpt-35-turbo + 6. gpt-35-turbo-16k + 7. text-embedding-ada-002 + 8. text-embedding-3-small + 9. text-embedding-3-large + +## How to Use + +Reference this package in your project to use the Tiktoken tokenizer with the specified models. + +## Main Types + +Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. + +## Additional Documentation + +* [Conceptual documentation](TODO) +* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) + +## Related Packages + + +Microsoft.ML.Tokenizers + +## Feedback & Contributing + +Microsoft.ML.Tokenizers.Data.Cl100kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning). diff --git a/src/Microsoft.ML.Tokenizers/Data/gpt2.tiktoken b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Data/gpt2.tiktoken similarity index 100% rename from src/Microsoft.ML.Tokenizers/Data/gpt2.tiktoken rename to src/Microsoft.ML.Tokenizers.Data.Gpt2/Data/gpt2.tiktoken diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs new file mode 100644 index 0000000000..115530204f --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.ComponentModel; + +namespace Microsoft.ML.Tokenizers +{ + /// + /// Gpt2TokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the gpt2.tiktoken data file. + /// + [EditorBrowsable(EditorBrowsableState.Never)] + public sealed class Gpt2TokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj new file mode 100644 index 0000000000..22d5facac5 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj @@ -0,0 +1,27 @@ + + + + netstandard2.0 + enable + true + The Microsoft.ML.Tokenizers.Data.Gpt2 includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as Gpt-2. + + + + + + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md new file mode 100644 index 0000000000..6c477b6781 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md @@ -0,0 +1,29 @@ +## About + +The `Microsoft.ML.Tokenizers.Data.Gpt2` includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as `Gpt-2`. 
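For context, a minimal usage sketch of the consumption model these data packages enable, assuming a project that references Microsoft.ML.Tokenizers together with Microsoft.ML.Tokenizers.Data.Gpt2; the model name matches the one used in the tests of this series, and the sample text is illustrative:

```csharp
// Minimal usage sketch: assumes a project referencing Microsoft.ML.Tokenizers
// together with Microsoft.ML.Tokenizers.Data.Gpt2 (the package carrying gpt2.tiktoken).
using System;
using Microsoft.ML.Tokenizers;

internal class Gpt2TokenizerSample
{
    private static void Main()
    {
        // The gpt2 vocabulary is resolved from the referenced data assembly.
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt2");

        string text = "Hello, world!";
        Console.WriteLine($"Token count: {tokenizer.CountTokens(text)}");
        Console.WriteLine($"Ids: {string.Join(", ", tokenizer.EncodeToIds(text))}");
    }
}
```

The same pattern applies to the other encodings; only the referenced data package and the model name change.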
+ +## Key Features + +* This package mainly contains the gpt2.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-2 model. + +## How to Use + +Reference this package in your project to use the Tiktoken tokenizer with the specified model. + +## Main Types + +Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. + +## Additional Documentation + +* [Conceptual documentation](TODO) +* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) + +## Related Packages + + +Microsoft.ML.Tokenizers + +## Feedback & Contributing + +Microsoft.ML.Tokenizers.Data.Gpt2 is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning). diff --git a/src/Microsoft.ML.Tokenizers/Data/o200k_base.tiktoken b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Data/o200k_base.tiktoken similarity index 100% rename from src/Microsoft.ML.Tokenizers/Data/o200k_base.tiktoken rename to src/Microsoft.ML.Tokenizers.Data.O200kBase/Data/o200k_base.tiktoken diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj new file mode 100644 index 0000000000..d188141bd0 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj @@ -0,0 +1,27 @@ + + + + netstandard2.0 + enable + true + The Microsoft.ML.Tokenizers.Data.O200kBase includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as gpt-4o. + + + + + + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs new file mode 100644 index 0000000000..4711e92dc6 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.ComponentModel; + +namespace Microsoft.ML.Tokenizers +{ + /// + /// O200kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the o200k_base.tiktoken data file. + /// + [EditorBrowsable(EditorBrowsableState.Never)] + public sealed class O200kBaseTokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md new file mode 100644 index 0000000000..0aff2e4c10 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md @@ -0,0 +1,29 @@ +## About + +The `Microsoft.ML.Tokenizers.Data.O200kBase` includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as `Gpt-4o`. + +## Key Features + +* This package mainly contains the o200k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-4o model. + +## How to Use + +Reference this package in your project to use the Tiktoken tokenizer with the specified model. + +## Main Types + +Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. 
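The companion failure mode, exercised by the new TokenizerDataTests.TestMissingDataPackages, is asking for a model whose data package is not referenced: as implemented in this series, CreateForModel then throws an exception whose message names the package to add. A sketch of how a caller might surface that (the handler is illustrative):

```csharp
// Minimal sketch of the failure mode: assumes only Microsoft.ML.Tokenizers is
// referenced, without Microsoft.ML.Tokenizers.Data.O200kBase, so the gpt-4o
// vocabulary cannot be located.
using System;
using Microsoft.ML.Tokenizers;

internal class MissingDataPackageSample
{
    private static void Main()
    {
        try
        {
            _ = TiktokenTokenizer.CreateForModel("gpt-4o");
        }
        catch (InvalidOperationException ex)
        {
            // The message names the Microsoft.ML.Tokenizers.Data.* package to reference.
            Console.Error.WriteLine(ex.Message);
        }
    }
}
```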
+ +## Additional Documentation + +* [Conceptual documentation](TODO) +* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) + +## Related Packages + + +Microsoft.ML.Tokenizers + +## Feedback & Contributing + +Microsoft.ML.Tokenizers.Data.O200kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning). diff --git a/src/Microsoft.ML.Tokenizers/Data/p50k_base.tiktoken b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Data/p50k_base.tiktoken similarity index 100% rename from src/Microsoft.ML.Tokenizers/Data/p50k_base.tiktoken rename to src/Microsoft.ML.Tokenizers.Data.P50kBase/Data/p50k_base.tiktoken diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj new file mode 100644 index 0000000000..88435ab39a --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj @@ -0,0 +1,27 @@ + + + + netstandard2.0 + enable + true + The Microsoft.ML.Tokenizers.Data.P50kBase includes the Tiktoken tokenizer data file p50k_base.tiktoken, which is utilized by models such as text-davinci-002 + + + + + + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs new file mode 100644 index 0000000000..fe3a56dc53 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.ComponentModel; + +namespace Microsoft.ML.Tokenizers +{ + /// + /// P50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file. + /// + [EditorBrowsable(EditorBrowsableState.Never)] + public sealed class P50kBaseTokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md new file mode 100644 index 0000000000..db6d6ec267 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md @@ -0,0 +1,37 @@ +## About + +The `Microsoft.ML.Tokenizers.Data.P50kBase` includes the Tiktoken tokenizer data file `p50k_base.tiktoken`, which is utilized by models such as `text-davinci-002`. + +## Key Features + +* This package mainly contains the `p50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models: + 1. text-davinci-002 + 2. text-davinci-003 + 3. code-davinci-001 + 4. code-davinci-002 + 5. code-cushman-001 + 6. code-cushman-002 + 7. davinci-codex + 8. cushman-codex + +## How to Use + +Reference this package in your project to use the Tiktoken tokenizer with the specified models. + +## Main Types + +Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. 
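The reason callers never touch these placeholder types is visible in the TiktokenTokenizer changes later in this patch: the library binds to the data assembly by a well-known type name and reads the deflate-compressed vocabulary as an embedded resource. A condensed sketch of that pattern, with the P50kBase names filled in for illustration (the wrapper method itself is not part of the patch, and the third commit in the series later removes the placeholder classes and reworks this lookup):

```csharp
// Sketch of the binding pattern from the first two commits of this series.
// The data package embeds a deflate-compressed vocabulary, and the main library
// locates that assembly through a well-known type name instead of a hard reference.
using System;
using System.IO;
using System.IO.Compression;

internal static class TokenizerDataBinding
{
    public static Stream OpenVocabStream(string typeName, string packageName, string vocabResource)
    {
        // e.g. typeName      = "Microsoft.ML.Tokenizers.P50kBaseTokenizerData"
        //      packageName   = "Microsoft.ML.Tokenizers.Data.P50kBase"
        //      vocabResource = "p50k_base.tiktoken.deflate"
        Type? type = Type.GetType($"{typeName}, {packageName}");
        if (type is null)
        {
            throw new InvalidOperationException(
                $"The tokenizer data file is missing. Try to reference the package {packageName} in your project.");
        }

        Stream compressed = type.Assembly.GetManifestResourceStream(vocabResource)!;
        return new DeflateStream(compressed, CompressionMode.Decompress);
    }
}
```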
+ +## Additional Documentation + +* [Conceptual documentation](TODO) +* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) + +## Related Packages + + +Microsoft.ML.Tokenizers + +## Feedback & Contributing + +Microsoft.ML.Tokenizers.Data.P50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning). diff --git a/src/Microsoft.ML.Tokenizers/Data/r50k_base.tiktoken b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Data/r50k_base.tiktoken similarity index 100% rename from src/Microsoft.ML.Tokenizers/Data/r50k_base.tiktoken rename to src/Microsoft.ML.Tokenizers.Data.R50kBase/Data/r50k_base.tiktoken diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj new file mode 100644 index 0000000000..2ed25afc1d --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj @@ -0,0 +1,27 @@ + + + + netstandard2.0 + enable + true + The Microsoft.ML.Tokenizers.Data.R50kBase includes the Tiktoken tokenizer data file r50k_base.tiktoken, which is utilized by models such as text-davinci-001 + + + + + + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md new file mode 100644 index 0000000000..69542385ff --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md @@ -0,0 +1,47 @@ +## About + +The `Microsoft.ML.Tokenizers.Data.R50kBase` includes the Tiktoken tokenizer data file `r50k_base.tiktoken`, which is utilized by models such as `text-davinci-001`. + +## Key Features + +* This package mainly contains the `r50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models: + 1. text-davinci-001 + 2. text-curie-001 + 3. text-babbage-001 + 4. text-ada-001 + 5. davinci + 6. curie + 7. babbage + 8. ada + 9. text-similarity-davinci-001 + 10. text-similarity-curie-001 + 11. text-similarity-babbage-001 + 12. text-similarity-ada-001 + 13. text-search-davinci-doc-001 + 14. text-search-curie-doc-001 + 15. text-search-babbage-doc-001 + 16. text-search-ada-doc-001 + 17. code-search-babbage-code-001 + 18. code-search-ada-code-001 + +## How to Use + +Reference this package in your project to use the Tiktoken tokenizer with the specified models. + +## Main Types + +Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. + +## Additional Documentation + +* [Conceptual documentation](TODO) +* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) + +## Related Packages + + +Microsoft.ML.Tokenizers + +## Feedback & Contributing + +Microsoft.ML.Tokenizers.Data.R50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning). diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs new file mode 100644 index 0000000000..8098af414e --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. 
+// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.ComponentModel; + +namespace Microsoft.ML.Tokenizers +{ + /// + /// R50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file. + /// + [EditorBrowsable(EditorBrowsableState.Never)] + public sealed class R50kBaseTokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj b/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj index 8294d99545..93a6cbb644 100644 --- a/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj +++ b/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj @@ -25,106 +25,4 @@ - - - - - - - - - - = 0) - { - eolIndex++; - capacity++; - } - else - { - break; - } - } while (eolIndex < fileContent.Length); - - using var sourceStream = File.OpenRead(fileName); - using var reader = new StreamReader(sourceStream); - using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal); - using var streamWriter = new StreamWriter(destStream); - - streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}"); - - string line; - int destLineNumber = 0; - - while ((line = reader.ReadLine()) != null) - { - if (line.Length == 0) { continue; } - int index = line.IndexOf(' '); - - if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber) - { - Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}"); - break; - } - - while (destLineNumber < id) - { - // ensure id always aligns with the line number - streamWriter.WriteLine(); - destLineNumber++; - } - - streamWriter.WriteLine(line.Substring(0, index)); - destLineNumber++; - } - } - ]]> - - - - - - - - - - - - - - - - - - - - diff --git a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs index 08bbf57634..a6d1f696b5 100644 --- a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs @@ -11,6 +11,7 @@ using System.IO.Compression; using System.Linq; using System.Net.Http; +using System.Reflection; using System.Text; using System.Text.RegularExpressions; using System.Threading; @@ -1114,31 +1115,31 @@ private static ModelEncoding GetModelEncoding(string modelName) return encoder; } - private static (Dictionary SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName); + private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, string? TypeName, string? PackageName) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName); - private static (Dictionary SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null) + private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, string? TypeName, string? PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? 
modelName = null) { switch (modelEncoding) { case ModelEncoding.Cl100kBase: return (new Dictionary - { { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile); + { { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Cl100kBaseTypeName, Cl100kBasePackageName); case ModelEncoding.P50kBase: - return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile); + return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile, P50kBaseTypeName, P50kBasePackageName); case ModelEncoding.P50kEdit: return (new Dictionary - { { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile); + { { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile, P50kBaseTypeName, P50kBasePackageName); case ModelEncoding.R50kBase: - return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile); + return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, R50kBaseTypeName, R50kBasePackageName); case ModelEncoding.GPT2: - return (new Dictionary { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File); + return (new Dictionary { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Gpt2TypeName, Gpt2PackageName); case ModelEncoding.O200kBase: - return (new Dictionary { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile); + return (new Dictionary { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile, O200kBaseTypeName, O200kBasePackageName); default: throw new NotSupportedException($"The model '{modelName ?? modelEncoding.ToString()}' is not supported."); @@ -1163,6 +1164,17 @@ private static (Dictionary SpecialTokens, Regex Regex, string Vocab internal const string R50kBaseEncodingName = "r50k_base"; internal const string O200kBaseEncodingName = "o200k_base"; + internal const string Cl100kBaseTypeName = "Microsoft.ML.Tokenizers.Cl100kBaseTokenizerData"; + internal const string Cl100kBasePackageName = "Microsoft.ML.Tokenizers.Data.Cl100kBase"; + internal const string Gpt2TypeName = "Microsoft.ML.Tokenizers.Gpt2TokenizerData"; + internal const string Gpt2PackageName = "Microsoft.ML.Tokenizers.Data.Gpt2"; + internal const string P50kBaseTypeName = "Microsoft.ML.Tokenizers.P50kBaseTokenizerData"; + internal const string P50kBasePackageName = "Microsoft.ML.Tokenizers.Data.P50kBase"; + internal const string R50kBaseTypeName = "Microsoft.ML.Tokenizers.R50kBaseTokenizerData"; + internal const string R50kBasePackageName = "Microsoft.ML.Tokenizers.Data.R50kBase"; + internal const string O200kBaseTypeName = "Microsoft.ML.Tokenizers.O200kBaseTokenizerData"; + internal const string O200kBasePackageName = "Microsoft.ML.Tokenizers.Data.O200kBase"; + #if NET7_0_OR_GREATER [GeneratedRegex(Cl100kBaseRegexPattern)] private static partial Regex Cl100kBaseRegex(); @@ -1195,7 +1207,7 @@ private static TiktokenTokenizer CreateForModel( IReadOnlyDictionary? extraSpecialTokens = null, Normalizer? normalizer = null) { - (Dictionary SpecialTokens, Regex Regex, string VocabFile) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName); + (Dictionary SpecialTokens, Regex Regex, string VocabFile, string? TypeName, string? 
PackageName) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName); if (extraSpecialTokens is not null) { @@ -1209,7 +1221,15 @@ private static TiktokenTokenizer CreateForModel( tiktokenConfiguration.VocabFile, out (Dictionary, int> encoder, Dictionary vocab, Dictionary> decoder) cache)) { - using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!; + Debug.Assert(tiktokenConfiguration.TypeName is not null && tiktokenConfiguration.PackageName is not null); + + Type? type = Type.GetType($"{tiktokenConfiguration.TypeName}, {tiktokenConfiguration.PackageName}"); + if (type is null) + { + throw new InvalidOperationException($"The tokenizer data file is missing. Try to reference the package {tiktokenConfiguration.PackageName} in your project."); + } + + using Stream compressedStream = type.Assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!; using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress); cache = LoadTiktokenBpeAsync(deflateStream, useAsync: false).GetAwaiter().GetResult(); @@ -1338,7 +1358,7 @@ public static TiktokenTokenizer CreateForModel( throw new ArgumentNullException(nameof(modelName)); } - (Dictionary SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName); + (Dictionary SpecialTokens, Regex Regex, string _, string? __, string? ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName); if (extraSpecialTokens is not null) { @@ -1378,7 +1398,7 @@ public static async Task CreateForModelAsync( throw new ArgumentNullException(nameof(modelName)); } - (Dictionary SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName); + (Dictionary SpecialTokens, Regex Regex, string _, string? __, string? ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName); if (extraSpecialTokens is not null) { diff --git a/test/Microsoft.ML.Tokenizers.Data.Tests/Microsoft.ML.Tokenizers.Data.Tests.csproj b/test/Microsoft.ML.Tokenizers.Data.Tests/Microsoft.ML.Tokenizers.Data.Tests.csproj new file mode 100644 index 0000000000..fe4dce9c2e --- /dev/null +++ b/test/Microsoft.ML.Tokenizers.Data.Tests/Microsoft.ML.Tokenizers.Data.Tests.csproj @@ -0,0 +1,22 @@ + + + + Microsoft.ML.Tokenizers.Data.Tests + Test + $(NoWarn);MSML_ExtendBaseTestClass + enable + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs b/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs new file mode 100644 index 0000000000..19782a669a --- /dev/null +++ b/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs @@ -0,0 +1,63 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Buffers; +using System.Buffers.Binary; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Net; +using System.Text; +using System.Text.Json; +using System.Threading.Tasks; +using Xunit; + +namespace Microsoft.ML.Tokenizers.Tests +{ + public class TokenizerDataTests + { + [Theory] + [InlineData("gpt-4o", "Microsoft.ML.Tokenizers.Data.O200kBase")] // O200kBase + [InlineData("gpt-4", "Microsoft.ML.Tokenizers.Data.Cl100kBase")] // Cl100kBase + [InlineData("text-davinci-003", "Microsoft.ML.Tokenizers.Data.P50kBase")] // P50kBase + [InlineData("text-davinci-001", "Microsoft.ML.Tokenizers.Data.R50kBase")] // R50kBase + [InlineData("gpt2", "Microsoft.ML.Tokenizers.Data.Gpt2")] // Gpt2 + public void TestMissingDataPackages(string modelName, string packageName) + { + var exception = Record.Exception(() => TiktokenTokenizer.CreateForModel(modelName)); + Assert.NotNull(exception); + Assert.Contains(packageName, exception.Message); + } + + public static IEnumerable ModelUrlData() + { + yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" }; + yield return new object[] { @"https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" }; + yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" }; + yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" }; + yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" }; + } + + [Theory] + [MemberData(nameof(ModelUrlData))] + public async Task TestTokenizerCreationWithProvidedData(string url) + { + string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken"); + await Utils.DownloadFile(url, tokenizerDataFileName); + + try + { + TiktokenTokenizer externalTokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, preTokenizer: null, normalizer: null); + Assert.NotNull(externalTokenizer); + } + finally + { + Utils.DeleteFile(tokenizerDataFileName); + } + } + } +} + diff --git a/test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj b/test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj index 802cae464a..612519b553 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj +++ b/test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj @@ -12,6 +12,13 @@ + + + + + + + diff --git a/test/Microsoft.ML.Tokenizers.Tests/TitokenTests.cs b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs similarity index 99% rename from test/Microsoft.ML.Tokenizers.Tests/TitokenTests.cs rename to test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs index 791e24527c..0d25032696 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/TitokenTests.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs @@ -45,7 +45,7 @@ public async Task TestTokenizerCreation() string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken"); - using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream("cl100k_base.tiktoken.deflate")!; + using Stream compressedStream = Type.GetType("Microsoft.ML.Tokenizers.Cl100kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.Cl100kBase")!.Assembly.GetManifestResourceStream("cl100k_base.tiktoken.deflate")!; using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress); using (Stream fileStream = 
File.OpenWrite(tokenizerDataFileName)) diff --git a/test/Microsoft.ML.Tokenizers.Tests/Utils.cs b/test/Microsoft.ML.Tokenizers.Tests/Utils.cs index d373475566..8cbc17620e 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/Utils.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/Utils.cs @@ -56,7 +56,7 @@ public static string SaveEmbeddedResourceFile(string resourceName) { string fileName = CreateTemporaryFile("txt"); using Stream fileStream = File.Create(fileName); - typeof(BpeTests).Assembly.GetManifestResourceStream(resourceName)!.CopyTo(fileStream); + typeof(Utils).Assembly.GetManifestResourceStream(resourceName)!.CopyTo(fileStream); return fileName; } } From 174a0c369424081ebf0e2709c433092879fab82a Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Mon, 30 Sep 2024 16:00:08 -0700 Subject: [PATCH 2/6] Address the feedback --- eng/TokenizerData.targets | 16 ++++++++++++---- ...icrosoft.ML.Tokenizers.Data.Cl100kBase.csproj | 4 ++-- .../Microsoft.ML.Tokenizers.Data.Gpt2.csproj | 4 ++-- ...Microsoft.ML.Tokenizers.Data.O200kBase.csproj | 4 ++-- .../Microsoft.ML.Tokenizers.Data.P50kBase.csproj | 4 ++-- .../Microsoft.ML.Tokenizers.Data.R50kBase.csproj | 4 ++-- .../Model/TiktokenTokenizer.cs | 8 +++++--- .../TokenizerDataTests.cs | 2 +- .../TiktokenTests.cs | 2 +- 9 files changed, 29 insertions(+), 19 deletions(-) diff --git a/eng/TokenizerData.targets b/eng/TokenizerData.targets index e80b6bf5de..9936c726aa 100644 --- a/eng/TokenizerData.targets +++ b/eng/TokenizerData.targets @@ -69,12 +69,20 @@ + DependsOnTargets="_EnsureTokenizerDataEmbeddedResourceDestination" + Inputs="@(TokenizerDataEmbeddedResource)" + Outputs="@(TokenizerDataEmbeddedResource->'%(Destination)')"> - + + + + + + + + - + diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj index f1b5183495..d9e0940472 100644 --- a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj @@ -20,8 +20,8 @@ we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number. After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. --> - + - + diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj index 22d5facac5..d3e41421ce 100644 --- a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj @@ -20,8 +20,8 @@ we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number. After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. 
--> - + - + diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj index d188141bd0..0671ba4ebc 100644 --- a/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj @@ -20,8 +20,8 @@ we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number. After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. --> - + - + diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj index 88435ab39a..baf8e83c88 100644 --- a/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj @@ -20,8 +20,8 @@ we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number. After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. --> - + - + diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj index 2ed25afc1d..e82cd4b49f 100644 --- a/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj @@ -20,8 +20,8 @@ we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number. After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly. 
--> - + - + diff --git a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs index a6d1f696b5..7a25b5c653 100644 --- a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs @@ -1155,7 +1155,7 @@ private static (Dictionary SpecialTokens, Regex Regex, string Vocab private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" - private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" + private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" internal const string Cl100kBaseEncodingName = "cl100k_base"; @@ -1223,10 +1223,12 @@ private static TiktokenTokenizer CreateForModel( { Debug.Assert(tiktokenConfiguration.TypeName is not null && tiktokenConfiguration.PackageName is not null); - Type? type = Type.GetType($"{tiktokenConfiguration.TypeName}, {tiktokenConfiguration.PackageName}"); + string fullAssemblyName = typeof(TiktokenTokenizer).Assembly.FullName!; + int commaIndex = fullAssemblyName.IndexOf(','); + Type? type = commaIndex > 0 ? Type.GetType($"{tiktokenConfiguration.TypeName}, {tiktokenConfiguration.PackageName}{fullAssemblyName.Substring(commaIndex)}") : null; if (type is null) { - throw new InvalidOperationException($"The tokenizer data file is missing. Try to reference the package {tiktokenConfiguration.PackageName} in your project."); + throw new InvalidOperationException($"The tokenizer data file {tiktokenConfiguration.PackageName}.dll could not be loaded. 
Please reference the package {tiktokenConfiguration.PackageName} in your project."); } using Stream compressedStream = type.Assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!; diff --git a/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs b/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs index 19782a669a..e165e931c9 100644 --- a/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs +++ b/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs @@ -35,7 +35,7 @@ public void TestMissingDataPackages(string modelName, string packageName) public static IEnumerable ModelUrlData() { yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" }; - yield return new object[] { @"https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" }; + yield return new object[] { @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" }; yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" }; yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" }; yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" }; diff --git a/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs index 0d25032696..900240c094 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs @@ -97,7 +97,7 @@ public async Task TestTokenizerCreation() public static IEnumerable ModelUrlData() { yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" }; - yield return new object[] { GPT2, @"https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" }; + yield return new object[] { GPT2, @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" }; yield return new object[] { P50kBase, @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" }; yield return new object[] { R50kBase, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" }; yield return new object[] { GPT4o, @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" }; From 2f11a3c9cc21e5381ea9f5884637c34ec8328b92 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Mon, 30 Sep 2024 17:50:36 -0700 Subject: [PATCH 3/6] More feedback addressing --- .../Cl100kBaseTokenizerData.cs | 16 ------- ...osoft.ML.Tokenizers.Data.Cl100kBase.csproj | 4 ++ .../PACKAGE.md | 11 ++++- .../Gpt2TokenizerData.cs | 16 ------- .../Microsoft.ML.Tokenizers.Data.Gpt2.csproj | 4 ++ .../PACKAGE.md | 8 +++- ...rosoft.ML.Tokenizers.Data.O200kBase.csproj | 4 ++ .../O200kBaseTokenizerData.cs | 16 ------- .../PACKAGE.md | 11 ++++- ...crosoft.ML.Tokenizers.Data.P50kBase.csproj | 4 ++ .../P50kBaseTokenizerData.cs | 16 ------- .../PACKAGE.md | 11 ++++- ...crosoft.ML.Tokenizers.Data.R50kBase.csproj | 4 ++ .../PACKAGE.md | 11 ++++- .../R50kBaseTokenizerData.cs | 16 ------- .../Model/TiktokenTokenizer.cs | 48 +++++++++++-------- .../Microsoft.ML.Tokenizers.Tests.csproj | 2 - .../TiktokenTests.cs | 3 +- 18 files changed, 97 insertions(+), 108 deletions(-) delete mode 100644 src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs 
delete mode 100644 src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs delete mode 100644 src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs delete mode 100644 src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs delete mode 100644 src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs deleted file mode 100644 index 4995edafa6..0000000000 --- a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs +++ /dev/null @@ -1,16 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.ComponentModel; - -namespace Microsoft.ML.Tokenizers -{ - /// - /// Cl100kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the cl100k_base.tiktoken data file. - /// - [EditorBrowsable(EditorBrowsableState.Never)] - public sealed class Cl100kBaseTokenizerData - { - } -} diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj index d9e0940472..3a7c2a3504 100644 --- a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj @@ -23,5 +23,9 @@ + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md index bf390cd1de..20c7c2df42 100644 --- a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md +++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md @@ -19,13 +19,22 @@ The `Microsoft.ML.Tokenizers.Data.Cl100kBase` includes the Tiktoken tokenizer da Reference this package in your project to use the Tiktoken tokenizer with the specified models. +```csharp + +// Create a tokenizer for the specified model or any other listed model name +Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4"); + +// Create a tokenizer for the specified encoding +Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("cl100k_base"); + +``` + ## Main Types Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. ## Additional Documentation -* [Conceptual documentation](TODO) * [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) ## Related Packages diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs deleted file mode 100644 index 115530204f..0000000000 --- a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs +++ /dev/null @@ -1,16 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.ComponentModel; - -namespace Microsoft.ML.Tokenizers -{ - /// - /// Gpt2TokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the gpt2.tiktoken data file. 
- /// - [EditorBrowsable(EditorBrowsableState.Never)] - public sealed class Gpt2TokenizerData - { - } -} diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj index d3e41421ce..15799111ee 100644 --- a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj @@ -23,5 +23,9 @@ + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md index 6c477b6781..945e24e4fb 100644 --- a/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md @@ -10,13 +10,19 @@ The `Microsoft.ML.Tokenizers.Data.Gpt2` includes the Tiktoken tokenizer data fil Reference this package in your project to use the Tiktoken tokenizer with the specified model. +```csharp + +// Create a tokenizer for the specified model +Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-2"); + +``` + ## Main Types Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. ## Additional Documentation -* [Conceptual documentation](TODO) * [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) ## Related Packages diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj index 0671ba4ebc..b9ce1bb964 100644 --- a/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj @@ -23,5 +23,9 @@ + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs deleted file mode 100644 index 4711e92dc6..0000000000 --- a/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs +++ /dev/null @@ -1,16 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.ComponentModel; - -namespace Microsoft.ML.Tokenizers -{ - /// - /// O200kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the o200k_base.tiktoken data file. - /// - [EditorBrowsable(EditorBrowsableState.Never)] - public sealed class O200kBaseTokenizerData - { - } -} diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md index 0aff2e4c10..02b68e3291 100644 --- a/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md +++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md @@ -10,13 +10,22 @@ The `Microsoft.ML.Tokenizers.Data.O200kBase` includes the Tiktoken tokenizer dat Reference this package in your project to use the Tiktoken tokenizer with the specified model. +```csharp + +// Create a tokenizer for the specified model +Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-4o"); + +// Create a tokenizer for the specified encoding +Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("o200k_base"); + +``` + ## Main Types Users shouldn't use any types exposed by this package directly. 
This package is intended to provide tokenizer data files. ## Additional Documentation -* [Conceptual documentation](TODO) * [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) ## Related Packages diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj index baf8e83c88..2d60f2ee5c 100644 --- a/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj @@ -23,5 +23,9 @@ + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs deleted file mode 100644 index fe3a56dc53..0000000000 --- a/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs +++ /dev/null @@ -1,16 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.ComponentModel; - -namespace Microsoft.ML.Tokenizers -{ - /// - /// P50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file. - /// - [EditorBrowsable(EditorBrowsableState.Never)] - public sealed class P50kBaseTokenizerData - { - } -} diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md index db6d6ec267..fecc3855b6 100644 --- a/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md +++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md @@ -18,13 +18,22 @@ The `Microsoft.ML.Tokenizers.Data.P50kBase` includes the Tiktoken tokenizer data Reference this package in your project to use the Tiktoken tokenizer with the specified models. +```csharp + +// Create a tokenizer for the specified model or any other listed model name +Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-002"); + +// Create a tokenizer for the specified encoding +Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("p50k_base"); + +``` + ## Main Types Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. ## Additional Documentation -* [Conceptual documentation](TODO) * [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) ## Related Packages diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj index e82cd4b49f..b61f83a489 100644 --- a/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj +++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj @@ -23,5 +23,9 @@ + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md index 69542385ff..84df79a9bd 100644 --- a/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md +++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md @@ -28,13 +28,22 @@ The `Microsoft.ML.Tokenizers.Data.R50kBase` includes the Tiktoken tokenizer data Reference this package in your project to use the Tiktoken tokenizer with the specified models. 
+```csharp + +// Create a tokenizer for the specified model or any other listed model name +Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-001"); + +// Create a tokenizer for the specified encoding +Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("r50k_base"); + +``` + ## Main Types Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. ## Additional Documentation -* [Conceptual documentation](TODO) * [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) ## Related Packages diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs deleted file mode 100644 index 8098af414e..0000000000 --- a/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs +++ /dev/null @@ -1,16 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.ComponentModel; - -namespace Microsoft.ML.Tokenizers -{ - /// - /// R50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file. - /// - [EditorBrowsable(EditorBrowsableState.Never)] - public sealed class R50kBaseTokenizerData - { - } -} diff --git a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs index 7a25b5c653..21f1e55ce1 100644 --- a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs @@ -1115,31 +1115,31 @@ private static ModelEncoding GetModelEncoding(string modelName) return encoder; } - private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, string? TypeName, string? PackageName) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName); + private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, string? PackageName) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName); - private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, string? TypeName, string? PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null) + private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, string? PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? 
modelName = null) { switch (modelEncoding) { case ModelEncoding.Cl100kBase: return (new Dictionary - { { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Cl100kBaseTypeName, Cl100kBasePackageName); + { { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Cl100kBasePackageName); case ModelEncoding.P50kBase: - return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile, P50kBaseTypeName, P50kBasePackageName); + return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile, P50kBasePackageName); case ModelEncoding.P50kEdit: return (new Dictionary - { { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile, P50kBaseTypeName, P50kBasePackageName); + { { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile, P50kBasePackageName); case ModelEncoding.R50kBase: - return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, R50kBaseTypeName, R50kBasePackageName); + return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, R50kBasePackageName); case ModelEncoding.GPT2: - return (new Dictionary { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Gpt2TypeName, Gpt2PackageName); + return (new Dictionary { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Gpt2PackageName); case ModelEncoding.O200kBase: - return (new Dictionary { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile, O200kBaseTypeName, O200kBasePackageName); + return (new Dictionary { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile, O200kBasePackageName); default: throw new NotSupportedException($"The model '{modelName ?? modelEncoding.ToString()}' is not supported."); @@ -1164,15 +1164,10 @@ private static (Dictionary SpecialTokens, Regex Regex, string Vocab internal const string R50kBaseEncodingName = "r50k_base"; internal const string O200kBaseEncodingName = "o200k_base"; - internal const string Cl100kBaseTypeName = "Microsoft.ML.Tokenizers.Cl100kBaseTokenizerData"; internal const string Cl100kBasePackageName = "Microsoft.ML.Tokenizers.Data.Cl100kBase"; - internal const string Gpt2TypeName = "Microsoft.ML.Tokenizers.Gpt2TokenizerData"; internal const string Gpt2PackageName = "Microsoft.ML.Tokenizers.Data.Gpt2"; - internal const string P50kBaseTypeName = "Microsoft.ML.Tokenizers.P50kBaseTokenizerData"; internal const string P50kBasePackageName = "Microsoft.ML.Tokenizers.Data.P50kBase"; - internal const string R50kBaseTypeName = "Microsoft.ML.Tokenizers.R50kBaseTokenizerData"; internal const string R50kBasePackageName = "Microsoft.ML.Tokenizers.Data.R50kBase"; - internal const string O200kBaseTypeName = "Microsoft.ML.Tokenizers.O200kBaseTokenizerData"; internal const string O200kBasePackageName = "Microsoft.ML.Tokenizers.Data.O200kBase"; #if NET7_0_OR_GREATER @@ -1207,7 +1202,7 @@ private static TiktokenTokenizer CreateForModel( IReadOnlyDictionary? extraSpecialTokens = null, Normalizer? normalizer = null) { - (Dictionary SpecialTokens, Regex Regex, string VocabFile, string? TypeName, string? PackageName) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName); + (Dictionary SpecialTokens, Regex Regex, string VocabFile, string? 
PackageName) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName); if (extraSpecialTokens is not null) { @@ -1221,17 +1216,30 @@ private static TiktokenTokenizer CreateForModel( tiktokenConfiguration.VocabFile, out (Dictionary, int> encoder, Dictionary vocab, Dictionary> decoder) cache)) { - Debug.Assert(tiktokenConfiguration.TypeName is not null && tiktokenConfiguration.PackageName is not null); + Debug.Assert(tiktokenConfiguration.PackageName is not null); string fullAssemblyName = typeof(TiktokenTokenizer).Assembly.FullName!; int commaIndex = fullAssemblyName.IndexOf(','); - Type? type = commaIndex > 0 ? Type.GetType($"{tiktokenConfiguration.TypeName}, {tiktokenConfiguration.PackageName}{fullAssemblyName.Substring(commaIndex)}") : null; - if (type is null) + + Assembly? assembly = null; + if (commaIndex > 0) + { + try + { + assembly = Assembly.Load($"{tiktokenConfiguration.PackageName}{fullAssemblyName.Substring(commaIndex)}"); + } + catch + { + // Ignore the exception, we'll throw another informative exception below. + } + } + + if (assembly is null) { throw new InvalidOperationException($"The tokenizer data file {tiktokenConfiguration.PackageName}.dll could not be loaded. Please reference the package {tiktokenConfiguration.PackageName} in your project."); } - using Stream compressedStream = type.Assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!; + using Stream compressedStream = assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!; using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress); cache = LoadTiktokenBpeAsync(deflateStream, useAsync: false).GetAwaiter().GetResult(); @@ -1360,7 +1368,7 @@ public static TiktokenTokenizer CreateForModel( throw new ArgumentNullException(nameof(modelName)); } - (Dictionary SpecialTokens, Regex Regex, string _, string? __, string? ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName); + (Dictionary SpecialTokens, Regex Regex, string _, string? ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName); if (extraSpecialTokens is not null) { @@ -1400,7 +1408,7 @@ public static async Task CreateForModelAsync( throw new ArgumentNullException(nameof(modelName)); } - (Dictionary SpecialTokens, Regex Regex, string _, string? __, string? ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName); + (Dictionary SpecialTokens, Regex Regex, string _, string? 
__) tiktokenConfiguration = GetTiktokenConfigurations(modelName); if (extraSpecialTokens is not null) { diff --git a/test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj b/test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj index 612519b553..b4a386bc40 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj +++ b/test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj @@ -11,8 +11,6 @@ - - diff --git a/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs index 900240c094..bf75e51ec0 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs @@ -45,7 +45,8 @@ public async Task TestTokenizerCreation() string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken"); - using Stream compressedStream = Type.GetType("Microsoft.ML.Tokenizers.Cl100kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.Cl100kBase")!.Assembly.GetManifestResourceStream("cl100k_base.tiktoken.deflate")!; + string assemblyName = typeof(TiktokenTokenizer).Assembly.FullName!; + using Stream compressedStream = Assembly.Load($"Microsoft.ML.Tokenizers.Data.Cl100kBase{assemblyName.Substring(assemblyName.IndexOf(','))}").GetManifestResourceStream("cl100k_base.tiktoken.deflate")!; using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress); using (Stream fileStream = File.OpenWrite(tokenizerDataFileName)) From e9c07d77ca624f5fe26e9555fef77a5020d2e8d4 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Tue, 1 Oct 2024 11:18:41 -0700 Subject: [PATCH 4/6] More feedback addressing --- .../Model/TiktokenTokenizer.cs | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs index 21f1e55ce1..7242138c99 100644 --- a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs @@ -1221,25 +1221,21 @@ private static TiktokenTokenizer CreateForModel( string fullAssemblyName = typeof(TiktokenTokenizer).Assembly.FullName!; int commaIndex = fullAssemblyName.IndexOf(','); + Debug.Assert(commaIndex > 0); + Assembly? assembly = null; - if (commaIndex > 0) + try { - try - { - assembly = Assembly.Load($"{tiktokenConfiguration.PackageName}{fullAssemblyName.Substring(commaIndex)}"); - } - catch - { - // Ignore the exception, we'll throw another informative exception below. - } + assembly = Assembly.Load($"{tiktokenConfiguration.PackageName}{fullAssemblyName.Substring(commaIndex)}"); } - - if (assembly is null) + catch (Exception e) { - throw new InvalidOperationException($"The tokenizer data file {tiktokenConfiguration.PackageName}.dll could not be loaded. Please reference the package {tiktokenConfiguration.PackageName} in your project."); + throw new InvalidOperationException($"The tokenizer data file {tiktokenConfiguration.PackageName}.dll could not be loaded. 
Please reference the package {tiktokenConfiguration.PackageName} in your project.", e); } - using Stream compressedStream = assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!; + Debug.Assert(assembly is not null); + + using Stream compressedStream = assembly!.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!; using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress); cache = LoadTiktokenBpeAsync(deflateStream, useAsync: false).GetAwaiter().GetResult(); From 6dfb2cfa3800617a4edb0eb22044ea0fe2e57031 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Thu, 3 Oct 2024 14:44:02 -0700 Subject: [PATCH 5/6] Trimming/AoT support --- .../Cl100kBaseTokenizerData.cs | 16 ++++++ .../Gpt2TokenizerData.cs | 16 ++++++ .../O200kBaseTokenizerData.cs | 16 ++++++ .../P50kBaseTokenizerData.cs | 16 ++++++ .../R50kBaseTokenizerData.cs | 16 ++++++ .../Model/TiktokenTokenizer.cs | 56 ++++++++----------- 6 files changed, 104 insertions(+), 32 deletions(-) create mode 100644 src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs create mode 100644 src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs create mode 100644 src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs create mode 100644 src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs create mode 100644 src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs new file mode 100644 index 0000000000..7e99aaf67b --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.ComponentModel; + +namespace Microsoft.ML.Tokenizers +{ + /// + /// Cl100kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the cl100k_base.tiktoken data file. + /// + [EditorBrowsable(EditorBrowsableState.Never)] + public sealed class Cl100kBaseTokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs new file mode 100644 index 0000000000..c6467c7423 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.ComponentModel; + +namespace Microsoft.ML.Tokenizers +{ + /// + /// Gpt2TokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the gpt2.tiktoken data file. + /// + [EditorBrowsable(EditorBrowsableState.Never)] + public sealed class Gpt2TokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs new file mode 100644 index 0000000000..815678c5b3 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+// See the LICENSE file in the project root for more information. + +using System.ComponentModel; + +namespace Microsoft.ML.Tokenizers +{ + /// + /// O200kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the o200k_base.tiktoken data file. + /// + [EditorBrowsable(EditorBrowsableState.Never)] + public sealed class O200kBaseTokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs new file mode 100644 index 0000000000..fa4bc2bc77 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.ComponentModel; + +namespace Microsoft.ML.Tokenizers +{ + /// + /// P50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file. + /// + [EditorBrowsable(EditorBrowsableState.Never)] + public sealed class P50kBaseTokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs new file mode 100644 index 0000000000..26a9ad3fb0 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.ComponentModel; + +namespace Microsoft.ML.Tokenizers +{ + /// + /// R50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file. + /// + [EditorBrowsable(EditorBrowsableState.Never)] + public sealed class R50kBaseTokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs index 7242138c99..42658eb93c 100644 --- a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs @@ -1115,31 +1115,31 @@ private static ModelEncoding GetModelEncoding(string modelName) return encoder; } - private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, string? PackageName) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName); + private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName); - private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, string? PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null) + private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? 
modelName = null) { switch (modelEncoding) { case ModelEncoding.Cl100kBase: return (new Dictionary - { { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Cl100kBasePackageName); + { { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName); + + case ModelEncoding.GPT2: + return (new Dictionary { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Type.GetType(Gpt2TypeName), Gpt2PackageName); + + case ModelEncoding.O200kBase: + return (new Dictionary { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile, Type.GetType(O200kBaseTypeName), O200kBasePackageName); case ModelEncoding.P50kBase: - return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile, P50kBasePackageName); + return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName); case ModelEncoding.P50kEdit: return (new Dictionary - { { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile, P50kBasePackageName); + { { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName); case ModelEncoding.R50kBase: - return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, R50kBasePackageName); - - case ModelEncoding.GPT2: - return (new Dictionary { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Gpt2PackageName); - - case ModelEncoding.O200kBase: - return (new Dictionary { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile, O200kBasePackageName); + return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, Type.GetType(R50kBaseTypeName), R50kBasePackageName); default: throw new NotSupportedException($"The model '{modelName ?? 
modelEncoding.ToString()}' is not supported."); @@ -1170,6 +1170,12 @@ private static (Dictionary SpecialTokens, Regex Regex, string Vocab internal const string R50kBasePackageName = "Microsoft.ML.Tokenizers.Data.R50kBase"; internal const string O200kBasePackageName = "Microsoft.ML.Tokenizers.Data.O200kBase"; + internal const string Cl100kBaseTypeName = "Microsoft.ML.Tokenizers.Cl100kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.Cl100kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51"; + internal const string Gpt2TypeName = "Microsoft.ML.Tokenizers.Gpt2TokenizerData, Microsoft.ML.Tokenizers.Data.Gpt2, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51"; + internal const string O200kBaseTypeName = "Microsoft.ML.Tokenizers.O200kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.O200kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51"; + internal const string P50kBaseTypeName = "Microsoft.ML.Tokenizers.P50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.P50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51"; + internal const string R50kBaseTypeName = "Microsoft.ML.Tokenizers.R50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.R50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51"; + #if NET7_0_OR_GREATER [GeneratedRegex(Cl100kBaseRegexPattern)] private static partial Regex Cl100kBaseRegex(); @@ -1202,7 +1208,7 @@ private static TiktokenTokenizer CreateForModel( IReadOnlyDictionary? extraSpecialTokens = null, Normalizer? normalizer = null) { - (Dictionary SpecialTokens, Regex Regex, string VocabFile, string? PackageName) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName); + (Dictionary SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName); if (extraSpecialTokens is not null) { @@ -1216,26 +1222,12 @@ private static TiktokenTokenizer CreateForModel( tiktokenConfiguration.VocabFile, out (Dictionary, int> encoder, Dictionary vocab, Dictionary> decoder) cache)) { - Debug.Assert(tiktokenConfiguration.PackageName is not null); - - string fullAssemblyName = typeof(TiktokenTokenizer).Assembly.FullName!; - int commaIndex = fullAssemblyName.IndexOf(','); - - Debug.Assert(commaIndex > 0); - - Assembly? assembly = null; - try + if (tiktokenConfiguration.DataType is null) { - assembly = Assembly.Load($"{tiktokenConfiguration.PackageName}{fullAssemblyName.Substring(commaIndex)}"); + throw new InvalidOperationException($"The tokenizer data file {tiktokenConfiguration.PackageName}.dll could not be loaded. Please reference the package {tiktokenConfiguration.PackageName} in your project."); } - catch (Exception e) - { - throw new InvalidOperationException($"The tokenizer data file {tiktokenConfiguration.PackageName}.dll could not be loaded. 
Please reference the package {tiktokenConfiguration.PackageName} in your project.", e); - } - - Debug.Assert(assembly is not null); - using Stream compressedStream = assembly!.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!; + using Stream compressedStream = tiktokenConfiguration.DataType.Assembly!.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!; using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress); cache = LoadTiktokenBpeAsync(deflateStream, useAsync: false).GetAwaiter().GetResult(); @@ -1364,7 +1356,7 @@ public static TiktokenTokenizer CreateForModel( throw new ArgumentNullException(nameof(modelName)); } - (Dictionary SpecialTokens, Regex Regex, string _, string? ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName); + (Dictionary SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName); if (extraSpecialTokens is not null) { @@ -1404,7 +1396,7 @@ public static async Task CreateForModelAsync( throw new ArgumentNullException(nameof(modelName)); } - (Dictionary SpecialTokens, Regex Regex, string _, string? __) tiktokenConfiguration = GetTiktokenConfigurations(modelName); + (Dictionary SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName); if (extraSpecialTokens is not null) { From 9df82d6b7ad7e704743136ba11ee2da894335a57 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Thu, 3 Oct 2024 15:47:04 -0700 Subject: [PATCH 6/6] Make data types internal --- .../Cl100kBaseTokenizerData.cs | 5 +---- src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs | 5 +---- .../O200kBaseTokenizerData.cs | 5 +---- .../P50kBaseTokenizerData.cs | 5 +---- .../R50kBaseTokenizerData.cs | 2 +- 5 files changed, 5 insertions(+), 17 deletions(-) diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs index 7e99aaf67b..c13c37a9cb 100644 --- a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs +++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs @@ -2,15 +2,12 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System.ComponentModel; - namespace Microsoft.ML.Tokenizers { /// /// Cl100kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the cl100k_base.tiktoken data file. /// - [EditorBrowsable(EditorBrowsableState.Never)] - public sealed class Cl100kBaseTokenizerData + internal sealed class Cl100kBaseTokenizerData { } } diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs index c6467c7423..00d6fe3061 100644 --- a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs @@ -2,15 +2,12 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System.ComponentModel; - namespace Microsoft.ML.Tokenizers { /// /// Gpt2TokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the gpt2.tiktoken data file. 
/// - [EditorBrowsable(EditorBrowsableState.Never)] - public sealed class Gpt2TokenizerData + internal sealed class Gpt2TokenizerData { } } diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs index 815678c5b3..ca57df617e 100644 --- a/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs +++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs @@ -2,15 +2,12 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System.ComponentModel; - namespace Microsoft.ML.Tokenizers { /// /// O200kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the o200k_base.tiktoken data file. /// - [EditorBrowsable(EditorBrowsableState.Never)] - public sealed class O200kBaseTokenizerData + internal sealed class O200kBaseTokenizerData { } } diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs index fa4bc2bc77..6a421bb9d4 100644 --- a/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs +++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs @@ -2,15 +2,12 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System.ComponentModel; - namespace Microsoft.ML.Tokenizers { /// /// P50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file. /// - [EditorBrowsable(EditorBrowsableState.Never)] - public sealed class P50kBaseTokenizerData + internal sealed class P50kBaseTokenizerData { } } diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs index 26a9ad3fb0..5e5278dd23 100644 --- a/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs +++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs @@ -10,7 +10,7 @@ namespace Microsoft.ML.Tokenizers /// R50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file. /// [EditorBrowsable(EditorBrowsableState.Never)] - public sealed class R50kBaseTokenizerData + internal sealed class R50kBaseTokenizerData { } }
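
Taken together, the patches above move the embedded `*.tiktoken` vocabularies out of `Microsoft.ML.Tokenizers` into per-encoding `Microsoft.ML.Tokenizers.Data.*` packages, so creating a Tiktoken tokenizer only succeeds when the matching data package is referenced. The sketch below shows the consumer-side effect under that assumption; it mirrors the `CreateForModel` snippets in the PACKAGE.md files, assumes `CountTokens` is the usual counting API on the `Tokenizer` base type, and uses the gpt-4 → `Microsoft.ML.Tokenizers.Data.Cl100kBase` pairing documented above. The sample string is arbitrary.

```csharp
using System;
using Microsoft.ML.Tokenizers;

try
{
    // Requires Microsoft.ML.Tokenizers plus the Microsoft.ML.Tokenizers.Data.Cl100kBase
    // data package; "gpt-4" maps to the cl100k_base encoding.
    Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
    Console.WriteLine(tokenizer.CountTokens("Hello, World!"));
}
catch (InvalidOperationException ex)
{
    // Without the data package, creation fails with the InvalidOperationException added
    // in the patch; its message names the exact Microsoft.ML.Tokenizers.Data.* package
    // that must be referenced in the project.
    Console.WriteLine(ex.Message);
}
```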
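
The final revision of `TiktokenTokenizer.CreateForModel` binds to the data package through an assembly-qualified type name resolved with `Type.GetType`, rather than calling `Assembly.Load`, and then reads the deflate-compressed vocabulary from that assembly's embedded resources. The following is a simplified, hypothetical rendering of that lookup, not the library's actual code: `TokenizerDataLoader` and `OpenVocabStream` are illustrative names, the error text is paraphrased, and callers would pass one of the `*TypeName` constants from the patch (which carry the full version and public key token) together with a resource name such as `cl100k_base.tiktoken.deflate` from the updated test.

```csharp
using System;
using System.IO;
using System.IO.Compression;

internal static class TokenizerDataLoader
{
    // Resolves the data-package marker type; Type.GetType returns null (rather than
    // throwing) when the data assembly is not referenced, which is what drives the
    // package-missing error in the patch.
    public static Stream OpenVocabStream(string assemblyQualifiedTypeName, string resourceName)
    {
        Type? dataType = Type.GetType(assemblyQualifiedTypeName, throwOnError: false);
        if (dataType is null)
        {
            throw new InvalidOperationException(
                "The tokenizer data assembly could not be loaded. " +
                "Reference the matching Microsoft.ML.Tokenizers.Data.* package in your project.");
        }

        // The data packages embed the *.tiktoken vocabularies deflate-compressed,
        // so return a decompressing stream over the embedded resource.
        Stream? compressed = dataType.Assembly.GetManifestResourceStream(resourceName);
        if (compressed is null)
        {
            throw new InvalidOperationException($"Embedded resource '{resourceName}' was not found.");
        }

        return new DeflateStream(compressed, CompressionMode.Decompress);
    }
}
```

Keeping the type-name string constant inside the core assembly means no compile-time reference to the data packages is needed, while the marker types in each data package give `Type.GetType` something concrete to resolve once the package is present.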