Skip to content

Commit 79c71f7

Browse files
authored
Merge pull request #29 from cnblogs/use-tokenizer-from-sdk
feat: use tokenizer from sdk
2 parents 7a86f4e + 9434c51 commit 79c71f7

10 files changed

+37
-151806
lines changed

src/KernelMemory.DashScope/DashScopeEmbeddedResource.cs

-29
This file was deleted.

src/KernelMemory.DashScope/DashScopeTextGenerator.cs

+7-4
Original file line number | Diff line number | Diff line change
@@ -22,18 +22,19 @@ public class DashScopeTextGenerator(
2222
int maxToken = 6000) : ITextGenerator
2323
{
2424
private readonly ILogger<DashScopeTextGenerator> _logger = loggerFactory?.CreateLogger<DashScopeTextGenerator>()
25-
?? DefaultLogger<DashScopeTextGenerator>.Instance;
25+
?? DefaultLogger<DashScopeTextGenerator>.Instance;
2626

2727
/// <inheritdoc />
2828
public int CountTokens(string text)
2929
{
30-
return tokenizer?.CountTokens(text) ?? QWenTokenizer.CountTokensStatic(text);
30+
return tokenizer?.CountTokens(text) ?? QWenTokenizer.CountTokens(text);
3131
}
3232

3333
/// <inheritdoc />
3434
public IReadOnlyList<string> GetTokens(string text)
3535
{
36-
return tokenizer?.GetTokens(text) ?? QWenTokenizer.GetTokensStatic(text);
36+
return tokenizer?.GetTokens(text)
37+
?? QWenTokenizer.Tokenizer.EncodeToTokens(text, out _).Select(x => x.Value).ToList();
3738
}
3839

3940
/// <inheritdoc />
@@ -47,7 +48,9 @@ public async IAsyncEnumerable<string> GenerateTextAsync(
4748
TopP = options.NucleusSampling == 0 ? null : (float)options.NucleusSampling,
4849
Temperature = options.Temperature == 0 ? null : (float)options.Temperature,
4950
RepetitionPenalty =
50-
options.FrequencyPenalty == 0 ? null : ((float)options.FrequencyPenalty + 1), // dashScope's default value is 1.0, kernel memory is 0.0
51+
options.FrequencyPenalty == 0
52+
? null
53+
: ((float)options.FrequencyPenalty + 1), // dashScope's default value is 1.0, kernel memory is 0.0
5154
MaxTokens = options.MaxTokens == 0 ? null : options.MaxTokens,
5255
Stop = options.StopSequences.ToArray(),
5356
IncrementalOutput = true,

src/KernelMemory.DashScope/DependencyInjector.cs

+5-5
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ public static IKernelMemoryBuilder WithDashScopeDefaults(
3636
ITextTokenizer? textEmbeddingTokenizer = null,
3737
bool onlyForRetrieval = false)
3838
{
39-
textGenerationTokenizer ??= new QWenTokenizer();
39+
textGenerationTokenizer ??= new QWenTextTokenizer();
4040
textEmbeddingTokenizer ??= new LengthTokenizer();
4141

4242
var config = new DashScopeConfig
@@ -97,7 +97,7 @@ public static IKernelMemoryBuilder WithDashScope(
9797
{
9898
config.EnsureValid();
9999
embeddingTokenizer ??= new LengthTokenizer();
100-
textTokenizer ??= new QWenTokenizer();
100+
textTokenizer ??= new QWenTextTokenizer();
101101
dashScopeClient ??= new DashScopeClient(config.ApiKey);
102102
builder.WithDashScopeTextGeneration(config, textTokenizer, dashScopeClient);
103103
builder.WithDashScopeTextEmbeddingGeneration(config, embeddingTokenizer, onlyForRetrieval, dashScopeClient);
@@ -119,7 +119,7 @@ public static IKernelMemoryBuilder WithDashScopeTextGeneration(
119119
IDashScopeClient? dashScopeClient = null)
120120
{
121121
config.EnsureValid();
122-
tokenizer ??= new QWenTokenizer();
122+
tokenizer ??= new QWenTextTokenizer();
123123
dashScopeClient ??= new DashScopeClient(config.ApiKey);
124124
builder.Services.AddDashScopeTextGeneration(config, tokenizer, dashScopeClient);
125125
return builder;
@@ -188,7 +188,7 @@ public static IServiceCollection AddDashScopeTextEmbeddingGeneration(
188188
/// </summary>
189189
/// <param name="services">The <see cref="IServiceCollection"/>.</param>
190190
/// <param name="config">Settings for DashScope.</param>
191-
/// <param name="tokenizer">The tokenizer to use, defaults to <see cref="QWenTokenizer"/>.</param>
191+
/// <param name="tokenizer">The tokenizer to use, defaults to <see cref="QWenTextTokenizer"/>.</param>
192192
/// <param name="dashScopeClient">The underlying <see cref="IDashScopeClient"/>.</param>
193193
/// <returns></returns>
194194
public static IServiceCollection AddDashScopeTextGeneration(
@@ -198,7 +198,7 @@ public static IServiceCollection AddDashScopeTextGeneration(
198198
IDashScopeClient? dashScopeClient = null)
199199
{
200200
config.EnsureValid();
201-
tokenizer ??= new QWenTokenizer();
201+
tokenizer ??= new QWenTextTokenizer();
202202

203203
return services.AddSingleton<ITextGenerator>(
204204
sp => new DashScopeTextGenerator(

src/KernelMemory.DashScope/KernelMemory.DashScope.csproj

+1-6
Original file line number | Diff line number | Diff line change
@@ -18,9 +18,8 @@
1818
</PropertyGroup>
1919

2020
<ItemGroup>
21-
<PackageReference Include="Microsoft.DeepDev.TokenizerLib" Version="1.3.3" />
2221
<PackageReference Include="Microsoft.KernelMemory.Abstractions" Version="0.95.241216.2" />
23-
<PackageReference Include="Cnblogs.DashScope.Core" Version="0.5.1" />
22+
<PackageReference Include="Cnblogs.DashScope.Core" Version="0.5.2" />
2423
</ItemGroup>
2524

2625
<ItemGroup>
@@ -31,8 +30,4 @@
3130
<None Include="../../README.md" Pack="true" PackagePath="\" />
3231
</ItemGroup>
3332

34-
<ItemGroup>
35-
<EmbeddedResource Include="qwen.tiktoken" />
36-
</ItemGroup>
37-
3833
</Project>
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
using Cnblogs.DashScope.Core;
2+
using Microsoft.KernelMemory.AI;
3+
4+
namespace Cnblogs.KernelMemory.AI.DashScope;
5+
6+
/// <summary>
7+
/// Tokenizer using QWen
8+
/// </summary>
9+
public class QWenTextTokenizer : ITextTokenizer
10+
{
11+
/// <inheritdoc />
12+
public int CountTokens(string text)
13+
{
14+
return QWenTokenizer.CountTokens(text);
15+
}
16+
17+
/// <inheritdoc />
18+
public IReadOnlyList<string> GetTokens(string text)
19+
{
20+
return QWenTokenizer.Tokenizer.EncodeToTokens(text, out _).Select(x => x.Value).ToList();
21+
}
22+
}

src/KernelMemory.DashScope/QWenTokenizer.cs

-72
This file was deleted.

0 commit comments

Comments
 (0)