Skip to content

Commit

Permalink
fix for unicode
Browse files Browse the repository at this point in the history
  • Loading branch information
danny50610 committed Aug 18, 2023
1 parent 7825519 commit 1ac9f79
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 4 deletions.
14 changes: 14 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
// Debug configuration that waits for an incoming Xdebug connection.
"name": "Listen for Xdebug",
"type": "php",
"request": "launch",
// NOTE(review): presumably has to match xdebug.client_port in php.ini
// (9003 is the Xdebug 3 default) — confirm against the local setup.
"port": 9003
},
]
}
2 changes: 1 addition & 1 deletion src/Encoding.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class Encoding
/**
 * Stores the mergeable ranks and the pattern regex on the instance.
 *
 * @param mixed $mergeableRanks Received by reference and assigned to $this->mergeableRanks.
 * @param mixed $pattenRegex    Pattern to store; the `u` modifier is appended so it is
 *                              treated as UTF-8. NOTE(review): assumes the value already
 *                              ends with its closing delimiter (and optional modifiers),
 *                              and that it is later passed to preg_* — confirm against callers.
 */
public function __construct(&$mergeableRanks, $pattenRegex)
{
$this->mergeableRanks = $mergeableRanks;
// NOTE(review): this assignment is immediately overwritten by the next line;
// it appears to be the pre-change line of the diff.
$this->pattenRegex = $pattenRegex;
$this->pattenRegex = $pattenRegex . 'u'; // u for unicode
}

public function encodeOrdinary(string $text): array
Expand Down
20 changes: 17 additions & 3 deletions tests/EncodingCl100kBaseTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,24 @@

/**
 * Checks encodeOrdinary() output of the 'cl100k_base' encoding against fixed token lists.
 */
class EncodingCl100kBaseTest extends TestCase
{
public function test() {
/**
* Encodes the given text and asserts the result is identical to the expected tokens.
*
* @dataProvider textDataProvider
*/
public function testEncodeOrdinary($text, $expectedTokens) {
// The encoding is obtained by name from the factory for every data set.
$enc = EncodingFactory::createByEncodingName('cl100k_base');
$tokens = $enc->encodeOrdinary('tiktoken is great!');
$tokens = $enc->encodeOrdinary($text);

$this->assertSame([83, 1609, 5963, 374, 2294, 0], $tokens);
$this->assertSame($expectedTokens, $tokens);
}

/**
 * Data sets for testEncodeOrdinary(): each row is [input text, expected token list].
 */
public static function textDataProvider()
{
return [
// Plain ASCII input.
['tiktoken is great!', [83, 1609, 5963, 374, 2294, 0]],
// Mixed CJK, digits and spaces (multi-byte UTF-8 input).
['台北 101 高度 508 公尺', [55038, 49409, 220, 4645, 18630, 41519, 27479, 220, 19869, 35469, 105, 16175, 118]],
// TODO: emoji
];
}

// TODO: Encode -> Decode chain
}

0 comments on commit 1ac9f79

Please sign in to comment.