Skip to content

Commit

Permalink
Move the Tokenizer's data into separate packages. (#7248)
Browse files Browse the repository at this point in the history
* Move the Tokenizer's data into separate packages.

* Address the feedback

* More feedback addressing

* More feedback addressing

* Trimming/AoT support

* Make data types internal
  • Loading branch information
tarekgh authored Oct 4, 2024
1 parent 189ba24 commit 1e91427
Show file tree
Hide file tree
Showing 29 changed files with 729 additions and 123 deletions.
66 changes: 66 additions & 0 deletions Microsoft.ML.sln
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,18 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral.Tests", "test\Microsoft.ML.GenAI.Mistral.Tests\Microsoft.ML.GenAI.Mistral.Tests.csproj", "{49264202-C90A-43F6-8C30-BDAEF2F1465A}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Cl100kBase", "src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj", "{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Gpt2", "src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj", "{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.O200kBase", "src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj", "{D02DB243-5B96-4652-B172-35F18230434D}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.P50kBase", "src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj", "{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.R50kBase", "src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj", "{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Tests", "test\Microsoft.ML.Tokenizers.Data.Tests\Microsoft.ML.Tokenizers.Data.Tests.csproj", "{2E6055A1-3FC1-418E-9B3E-9C6255649F42}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -918,6 +930,54 @@ Global
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|Any CPU.Build.0 = Release|Any CPU
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.ActiveCfg = Release|Any CPU
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.Build.0 = Release|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.Build.0 = Debug|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.ActiveCfg = Debug|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.Build.0 = Debug|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.ActiveCfg = Release|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.Build.0 = Release|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.ActiveCfg = Release|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.Build.0 = Release|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.Build.0 = Debug|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.ActiveCfg = Debug|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.Build.0 = Debug|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.ActiveCfg = Release|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.Build.0 = Release|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.ActiveCfg = Release|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.Build.0 = Release|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.ActiveCfg = Debug|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.Build.0 = Debug|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.Build.0 = Release|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.ActiveCfg = Release|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.Build.0 = Release|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.ActiveCfg = Debug|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.Build.0 = Debug|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.ActiveCfg = Release|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.Build.0 = Release|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.ActiveCfg = Release|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.Build.0 = Release|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.ActiveCfg = Debug|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.Build.0 = Debug|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.Build.0 = Release|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.ActiveCfg = Release|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.Build.0 = Release|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.Build.0 = Debug|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.ActiveCfg = Debug|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.Build.0 = Debug|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.ActiveCfg = Release|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.Build.0 = Release|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.ActiveCfg = Release|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -1013,6 +1073,12 @@ Global
{D202353D-6FAF-4263-9A01-BDCFBC92391F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{2729CC66-7743-442B-B3A5-1F4F27F044A5} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{49264202-C90A-43F6-8C30-BDAEF2F1465A} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{D02DB243-5B96-4652-B172-35F18230434D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{2E6055A1-3FC1-418E-9B3E-9C6255649F42} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
Expand Down
88 changes: 88 additions & 0 deletions eng/TokenizerData.targets
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
<Project>
<UsingTask TaskName="CompressFile"
TaskFactory="RoslynCodeTaskFactory"
AssemblyFile="$(MSBuildToolsPath)\Microsoft.Build.Tasks.Core.dll" >
<ParameterGroup>
<Files ParameterType="Microsoft.Build.Framework.ITaskItem[]" Required="true" />
</ParameterGroup>
<Task>
<Using Namespace="System.Globalization" />
<Using Namespace="System.IO" />
<Using Namespace="System.IO.Compression" />
<Code Type="Fragment" Language="cs">
<![CDATA[
foreach (var file in Files)
{
string fileName = file.GetMetadata("FullPath");
string fileContent = File.ReadAllText(fileName);
int capacity = 1;
int eolIndex = 0;
do
{
if ((eolIndex = fileContent.IndexOf('\n', eolIndex)) >= 0)
{
eolIndex++;
capacity++;
}
else
{
break;
}
} while (eolIndex < fileContent.Length);
using var sourceStream = File.OpenRead(fileName);
using var reader = new StreamReader(sourceStream);
using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal);
using var streamWriter = new StreamWriter(destStream);
streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");
string line;
int destLineNumber = 0;
while ((line = reader.ReadLine()) != null)
{
if (line.Length == 0) { continue; }
int index = line.IndexOf(' ');
if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber)
{
Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}");
break;
}
while (destLineNumber < id)
{
// ensure id always aligns with the line number
streamWriter.WriteLine();
destLineNumber++;
}
streamWriter.WriteLine(line.Substring(0, index));
destLineNumber++;
}
}
]]>
</Code>
</Task>
</UsingTask>

<Target Name="CompressTiktokenData"
BeforeTargets="AssignTargetPaths"
DependsOnTargets="_EnsureTokenizerDataEmbeddedResourceDestination"
Inputs="@(TokenizerDataEmbeddedResource)"
Outputs="@(TokenizerDataEmbeddedResource->'%(Destination)')">

<CompressFile Files="@(TokenizerDataEmbeddedResource)" />

<ItemGroup>
<EmbeddedResource Include="@(TokenizerDataEmbeddedResource->'%(Destination)')" LogicalName="%(FileName)%(Extension).deflate" />
</ItemGroup>
</Target>

<Target Name="_EnsureTokenizerDataEmbeddedResourceDestination" >
<ItemGroup>
<TokenizerDataEmbeddedResource Condition="'%(TokenizerDataEmbeddedResource.Destination)' == ''" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
</ItemGroup>
</Target>
</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.ML.Tokenizers
{
/// <summary>
/// Cl100kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the cl100k_base.tiktoken data file.
/// </summary>
internal sealed class Cl100kBaseTokenizerData
{
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
<Nullable>enable</Nullable>
<IsPackable>true</IsPackable>
<PackageDescription>The Microsoft.ML.Tokenizers.Data.Cl100kBase class includes the Tiktoken tokenizer data file cl100k_base.tiktoken, which is utilized by models such as GPT-4.</PackageDescription>
</PropertyGroup>

<ItemGroup>
<!--
The following file are compressed using the DeflateStream and embedded as resources in the assembly.
The files are downloaded from the following sources and compressed to the Destination.
- cl100k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
-->
<TokenizerDataEmbeddedResource Include="Data\cl100k_base.tiktoken" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
</ItemGroup>

<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
</Project>
47 changes: 47 additions & 0 deletions src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
## About

The `Microsoft.ML.Tokenizers.Data.Cl100kBase` includes the Tiktoken tokenizer data file `cl100k_base.tiktoken`, which is utilized by models such as GPT-4.

## Key Features

* This package mainly contains the cl100k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
1. gpt-4
2. gpt-3.5-turbo
3. gpt-3.5-turbo-16k
4. gpt-35
5. gpt-35-turbo
6. gpt-35-turbo-16k
7. text-embedding-ada-002
8. text-embedding-3-small
9. text-embedding-3-large

## How to Use

Reference this package in your project to use the Tiktoken tokenizer with the specified models.

```csharp

// Create a tokenizer for the specified model or any other listed model name
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");

// Create a tokenizer for the specified encoding
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("cl100k_base");

```

## Main Types

Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.

## Additional Documentation

* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)

## Related Packages

<!-- The related packages associated with this package -->
Microsoft.ML.Tokenizers

## Feedback & Contributing

Microsoft.ML.Tokenizers.Data.Cl100kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
13 changes: 13 additions & 0 deletions src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.ML.Tokenizers
{
/// <summary>
/// Gpt2TokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the gpt2.tiktoken data file.
/// </summary>
internal sealed class Gpt2TokenizerData
{
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
<Nullable>enable</Nullable>
<IsPackable>true</IsPackable>
<PackageDescription>The Microsoft.ML.Tokenizers.Data.Gpt2 includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as Gpt-2.</PackageDescription>
</PropertyGroup>

<ItemGroup>
<!--
The following file are compressed using the DeflateStream and embedded as resources in the assembly.
The files are downloaded from the following sources and compressed to the Destination.
- gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
-->
<TokenizerDataEmbeddedResource Include="Data\gpt2.tiktoken" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
</ItemGroup>

<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
</Project>
35 changes: 35 additions & 0 deletions src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
## About

The `Microsoft.ML.Tokenizers.Data.Gpt2` includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as `Gpt-2`.

## Key Features

* This package mainly contains the gpt2.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-2 model.

## How to Use

Reference this package in your project to use the Tiktoken tokenizer with the specified model.

```csharp

// Create a tokenizer for the specified model
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-2");

```

## Main Types

Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.

## Additional Documentation

* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)

## Related Packages

<!-- The related packages associated with this package -->
Microsoft.ML.Tokenizers

## Feedback & Contributing

Microsoft.ML.Tokenizers.Data.Gpt2 is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
<Nullable>enable</Nullable>
<IsPackable>true</IsPackable>
<PackageDescription>The Microsoft.ML.Tokenizers.Data.O200kBase includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as gpt-4o.</PackageDescription>
</PropertyGroup>

<ItemGroup>
<!--
The following file are compressed using the DeflateStream and embedded as resources in the assembly.
The files are downloaded from the following sources and compressed to the Destination.
- o200k_base.tiktoken https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
-->
<TokenizerDataEmbeddedResource Include="Data\o200k_base.tiktoken" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
</ItemGroup>

<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.ML.Tokenizers
{
/// <summary>
/// O200kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the o200k_base.tiktoken data file.
/// </summary>
internal sealed class O200kBaseTokenizerData
{
}
}
Loading

0 comments on commit 1e91427

Please sign in to comment.