Summary of this patch (text before the first "diff --git" header is ignored by patch tools):
  * .vscode/launch.json: new VS Code debug configuration "Listen for Xdebug" on port 9003.
  * src/Encoding.php: the constructor now appends the PCRE "u" modifier to the supplied pattern.
  * tests/EncodingCl100kBaseTest.php: the single test becomes a data-provider-driven
    testEncodeOrdinary() and gains a non-ASCII (Chinese) input case.
NOTE(review): indentation inside the hunks was reconstructed (4 spaces assumed) - confirm against the repository.

diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..7e3328d
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,14 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Listen for Xdebug",
+            "type": "php",
+            "request": "launch",
+            "port": 9003
+        },
+    ]
+}
\ No newline at end of file
diff --git a/src/Encoding.php b/src/Encoding.php
index ac9db6e..58fb804 100644
--- a/src/Encoding.php
+++ b/src/Encoding.php
@@ -11,7 +11,7 @@ class Encoding
     public function __construct(&$mergeableRanks, $pattenRegex)
     {
         $this->mergeableRanks = $mergeableRanks;
-        $this->pattenRegex = $pattenRegex;
+        $this->pattenRegex = $pattenRegex . 'u'; // u for unicode
     }
 
     public function encodeOrdinary(string $text): array
diff --git a/tests/EncodingCl100kBaseTest.php b/tests/EncodingCl100kBaseTest.php
index 76988ca..5e9ec8a 100644
--- a/tests/EncodingCl100kBaseTest.php
+++ b/tests/EncodingCl100kBaseTest.php
@@ -7,10 +7,24 @@
 
 class EncodingCl100kBaseTest extends TestCase
 {
-    public function test() {
+    /**
+     * @dataProvider textDataProvider
+     */
+    public function testEncodeOrdinary($text, $expectedTokens) {
         $enc = EncodingFactory::createByEncodingName('cl100k_base');
-        $tokens = $enc->encodeOrdinary('tiktoken is great!');
+        $tokens = $enc->encodeOrdinary($text);
 
-        $this->assertSame([83, 1609, 5963, 374, 2294, 0], $tokens);
+        $this->assertSame($expectedTokens, $tokens);
     }
+
+    public static function textDataProvider()
+    {
+        return [
+            ['tiktoken is great!', [83, 1609, 5963, 374, 2294, 0]],
+            ['台北 101 高度 508 公尺', [55038, 49409, 220, 4645, 18630, 41519, 27479, 220, 19869, 35469, 105, 16175, 118]],
+            // TODO: 表情符號
+        ];
+    }
+
+    // TODO: Encode -> Decode chain
 }