Skip to content

Commit

Permalink
Modified Regex Pattern To Encode Emoji Unicodes Better
Browse files Browse the repository at this point in the history
  • Loading branch information
Sajjad Hossain Sagor committed Feb 27, 2023
1 parent fbec019 commit d61a343
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion src/bpe.php
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ public function __construct()
// - we special-case a few common apostrophe constructs ('s, 't, 're, ...) and make each of them a separate token
// - we then split strings into consecutive chunks of 1) letters, 2) numbers, 3) characters that are neither letters nor numbers, 4) whitespace

- $this->regex_pattern = "/'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/";
+ $this->regex_pattern = "/(?:\\\\u[a-f0-9]+)+|\'[stdm]|\'[rv]e|\'ll| ?\p{L}+| ?\p{N}+| ?(?!\\\\u[a-f0-9]+\b)[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/m";

$this->cache = [];
}
Expand Down

0 comments on commit d61a343

Please sign in to comment.