add a benchmark against emojibase

cometkim · Jun 12, 2024 · d20deb2 · d20deb2
1 parent 2e84f7f
commit d20deb2
Show file tree

Hide file tree

Showing 7 changed files with 117 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -198,16 +198,20 @@ Look [benchmark](benchmark) to see how it works.
 
 - built-in Unicode `RegExp`
 - [emoji-regex]@10.3.0 (101M+ weekly downloads on NPM)
+- [emojibase-regex]@15.3.2 (192K+ weekly downloads on NPM)
 
 #### Package stats
 
 | Name                      | Unicode®      | ESM? | Size    | Size (min) | Size (min+gzip) | Size (min+br) |
 |---------------------------|---------------|------|--------:|-----------:|----------------:|--------------:|
 | `unicode-segmenter/emoji` | 15.1.0        |    ✔️ |   3,058 |      2,611 |           1,041 |           751 |
-| `emoji-regex`*            | 15.1.0 (vary) |    ✔️ |  12,946 |     12,859 |           2,180 |         1,746 |
+| `emoji-regex`             | 15.1.0 (vary) |    ✔️ |  12,946 |     12,859 |           2,180 |         1,746 |
+| `emojibase-regex`*        | 15.1.0        |    ✖️ |  17,711 |     16,595 |           2,870 |         2,317 |
+| `emojibase-regex/emoji`*  | 15.1.0        |    ✖️ |  13,550 |     12,458 |           2,835 |         2,210 |
 | `RegExp` w/ `u`*          |             - |    - |       0 |          0 |               0 |             0 |
 
-* `emoji-regex` only supports `Emoji_Presentation` property, not `Extended_Pictographic`.
+* `emojibase-regex` matches `Extended_Pictographic` property.
+* `emojibase-regex/emoji` matches only `Emoji_Presentation` property.
 * You can build your own `emoji-regex` using [emoji-test-regex-pattern](https://github.com/mathiasbynens/emoji-test-regex-pattern).
 * `RegExp` Unicode data is always kept up to date as the runtime support.
 * `RegExp` Unicode may not be available in [some old browsers](https://caniuse.com/mdn-javascript_builtins_regexp_unicode), edge runtimes, or embedded environments.
@@ -218,36 +222,48 @@ The runtime performance of `unicode-segmenter/emoji` is enough to test the prese
 
 It's \~2.5x slower than `RegExp` w/ `u` for match-all performance, but that comparison is not very meaningful in the real world because the other matchers don't care about grapheme clusters.
 
+You can also detect emojis while processing graphemes with `unicode-segmenter/grapheme`. It might be a bit less performant than the dedicated emoji matchers, but it may still be reasonable.
+
 <details>
   <summary>Details</summary>
 
   ```
   cpu: Apple M1 Pro
   runtime: node v20.13.1 (arm64-darwin)
   
-  benchmark                    time (avg)             (min … max)       p75       p99      p999
-  --------------------------------------------------------------- -----------------------------
+  benchmark                       time (avg)             (min … max)       p75       p99      p999
+  ------------------------------------------------------------------ -----------------------------
   • checking if any emoji
-  --------------------------------------------------------------- -----------------------------
-  unicode-segmenter/emoji   16.11 ns/iter     (15.28 ns … 339 ns)  16.32 ns  18.66 ns  43.42 ns
-  RegExp w/ unicode         19.03 ns/iter     (16.52 ns … 185 ns)   17.9 ns  46.28 ns  74.85 ns
-  emoji-regex               43.15 ns/iter   (41.54 ns … 73.51 ns)  43.58 ns  47.93 ns  65.73 ns
+  ------------------------------------------------------------------ -----------------------------
+  unicode-segmenter/emoji       16.1 ns/iter     (15.54 ns … 257 ns)  16.11 ns  19.88 ns  45.92 ns
+  unicode-segmenter/grapheme   89.67 ns/iter     (75.44 ns … 938 ns)  94.65 ns    136 ns    473 ns
+  RegExp w/ unicode            17.73 ns/iter    (16.5 ns … 90.05 ns)  16.93 ns  36.82 ns  56.01 ns
+  emoji-regex                  42.83 ns/iter     (41.32 ns … 409 ns)  43.23 ns  51.88 ns    169 ns
+  emojibase-regex                145 ns/iter     (109 ns … 2'300 ns)    111 ns    942 ns  1'952 ns
+  emojibase-regex/emoji        77.47 ns/iter     (69.74 ns … 916 ns)  73.97 ns    277 ns    667 ns
   
   summary for checking if any emoji
     unicode-segmenter/emoji
-     1.18x faster than RegExp w/ unicode
-     2.68x faster than emoji-regex
+     1.1x faster than RegExp w/ unicode
+     2.66x faster than emoji-regex
+     4.81x faster than emojibase-regex/emoji
+     5.57x faster than unicode-segmenter/grapheme
+     9x faster than emojibase-regex
   
   • match all emoji
-  --------------------------------------------------------------- -----------------------------
-  unicode-segmenter/emoji   3'215 ns/iter     (2'958 ns … 189 µs)  3'208 ns  3'708 ns 11'833 ns
-  RegExp w/ unicode         1'285 ns/iter   (1'221 ns … 1'509 ns)  1'299 ns  1'449 ns  1'509 ns
-  emoji-regex              11'696 ns/iter    (11'125 ns … 239 µs) 11'667 ns 16'125 ns 20'375 ns
+  ------------------------------------------------------------------ -----------------------------
+  unicode-segmenter/emoji      3'223 ns/iter     (3'000 ns … 237 µs)  3'208 ns  3'667 ns 13'417 ns
+  unicode-segmenter/grapheme   7'904 ns/iter     (7'333 ns … 290 µs)  7'791 ns  9'625 ns 72'333 ns
+  RegExp w/ unicode            1'266 ns/iter   (1'215 ns … 1'374 ns)  1'285 ns  1'362 ns  1'374 ns
+  emoji-regex                 11'567 ns/iter    (11'083 ns … 193 µs) 11'666 ns 12'500 ns 28'834 ns
+  emojibase-regex             16'934 ns/iter    (16'291 ns … 187 µs) 17'083 ns 18'500 ns 29'708 ns
   
   summary for match all emoji
     unicode-segmenter/emoji
-     2.5x slower than RegExp w/ unicode
-     3.64x faster than emoji-regex
+     2.55x slower than RegExp w/ unicode
+     2.45x faster than unicode-segmenter/grapheme
+     3.59x faster than emoji-regex
+     5.25x faster than emojibase-regex
   ```
 
 </details>

diff --git a/benchmark/bundle-entry-emojibase-regex-emoji.js b/benchmark/bundle-entry-emojibase-regex-emoji.js
@@ -0,0 +1 @@
+export { default as EMOJI_REGEX } from 'emojibase-regex/emoji';
diff --git a/benchmark/bundle-entry-emojibase-regex.js b/benchmark/bundle-entry-emojibase-regex.js
@@ -0,0 +1 @@
+export { default as EMOJI_REGEX } from 'emojibase-regex';
diff --git a/benchmark/bundle-stats-emoji.js b/benchmark/bundle-stats-emoji.js
@@ -25,23 +25,27 @@ let myEntry = await reportBundleStats(
 
 let competitors = [
   'emoji-regex',
+  'emojibase-regex',
+  ['emojibase-regex-emoji', 'emojibase-regex/emoji'],
 ];
 
 let otherEntries = await Promise.all(
   competitors.map(async (lib) => {
+    let libEntry = Array.isArray(lib) ? lib[0] : lib;
+    let libName = Array.isArray(lib) ? lib[1] : lib;
     let result = await build({
       write: false,
       bundle: true,
-      entryPoints: [path.join(baseDir, `bundle-entry-${lib}.js`)],
+      entryPoints: [path.join(baseDir, `bundle-entry-${libEntry}.js`)],
     });
     let minResult = await build({
       write: false,
       bundle: true,
       minify: true,
-      entryPoints: [path.join(baseDir, `bundle-entry-${lib}.js`)],
+      entryPoints: [path.join(baseDir, `bundle-entry-${libEntry}.js`)],
     });
     return await reportBundleStats(
-      lib,
+      libName,
       result.outputFiles[0].contents,
       minResult.outputFiles[0].contents,
     );

diff --git a/benchmark/performance-emoji.js b/benchmark/performance-emoji.js
@@ -1,10 +1,12 @@
 import * as assert from 'node:assert/strict';
 import { group, baseline, bench, run } from 'mitata';
 import emojiRegex from 'emoji-regex';
-import XRegExp from 'xregexp';
+import EMOJIBASE_REGEX_EXT from 'emojibase-regex';
+import EMOJIBASE_REGEX from 'emojibase-regex/emoji.js';
 
 import { takeCodePoint } from '../src/utils.js';
 import { isEmoji } from '../src/emoji.js';
+import { graphemeSegments, GraphemeCategory } from '../src/grapheme.js';
 
 let input = '🚀 새로운 유니코드 분할기 라이브러리 \'unicode-segmenter\'를 소개합니다! 🔍 각종 언어의 문자를 정확하게 구분해주는 강력한 도구입니다. Check it out! 👉 [https://github.com/cometkim/unicode-segmenter] #Unicode #Programming 🌐';
 
@@ -29,23 +31,45 @@ group('checking if any emoji', () => {
     }
     return false;
   }
+
+  function anyEmojiByGrapheme(input) {
+    for (const { segment, _cat } of graphemeSegments(input)) {
+      if (_cat === GraphemeCategory.Extended_Pictographic) {
+        return true;
+      }
+    }
+    return false;
+  }
+
   baseline('unicode-segmenter/emoji', () => {
     assert.equal(anyEmoji(input), true);
   });
 
+  bench('unicode-segmenter/grapheme', () => {
+    assert.equal(anyEmojiByGrapheme(input), true);
+  });
+
   bench('RegExp w/ unicode', () => {
     assert.equal(/\p{Extended_Pictographic}/u.test(input), true);
   });
 
   // Should remove the `g` flag enabled by default.
-  let e = new RegExp(emojiRegex(), '');
+  let EMOJI_REGEX = new RegExp(emojiRegex(), '');
   bench('emoji-regex', () => {
-    assert.equal(e.test(input), true);
+    assert.equal(EMOJI_REGEX.test(input), true);
+  });
+
+  bench('emojibase-regex', () => {
+    assert.equal(EMOJIBASE_REGEX_EXT.test(input), true);
+  });
+
+  bench('emojibase-regex/emoji', () => {
+    assert.equal(EMOJIBASE_REGEX.test(input), true);
   });
 });
 
 group('match all emoji', () => {
-  function* allEmoji(input) {
+  function* allEmojis(input) {
     let cursor = 0;
     let len = input.length;
     while (cursor < len) {
@@ -58,11 +82,27 @@ group('match all emoji', () => {
     }
   }
 
+  function* allEmojisByGrapheme(input) {
+    for (const { segment, _cat } of graphemeSegments(input)) {
+      if (_cat === GraphemeCategory.Extended_Pictographic) {
+        yield segment;
+      }
+    }
+  }
+
   let expected = ['🚀', '🔍', '👉', '🌐'];
 
   baseline('unicode-segmenter/emoji', () => {
     assert.deepEqual(
-      [...allEmoji(input)]
+      [...allEmojis(input)]
+        .map(match => match), // iter for fair competition
+      expected,
+    );
+  });
+
+  bench('unicode-segmenter/grapheme', () => {
+    assert.deepEqual(
+      [...allEmojisByGrapheme(input)]
         .map(match => match), // iter for fair competition
       expected,
     );
@@ -76,14 +116,34 @@ group('match all emoji', () => {
     );
   });
 
-  let e = emojiRegex();
+  let EMOJI_REGEX = emojiRegex();
   bench('emoji-regex', () => {
     assert.deepEqual(
-      [...input.matchAll(e)]
+      [...input.matchAll(EMOJI_REGEX)]
         .map(match => match[0]),
       expected,
     );
   });
+
+  let EMOJIBASE_REGEX_EXT_G = new RegExp(EMOJIBASE_REGEX_EXT, 'g');
+  bench('emojibase-regex', () => {
+    assert.deepEqual(
+      [...input.matchAll(EMOJIBASE_REGEX_EXT_G)]
+        .map(match => match[0]),
+      expected,
+    );
+  });
+
+  // Note: It doesn't match Extended_Pictographic
+  //
+  // let EMOJIBASE_REGEX_G = new RegExp(EMOJIBASE_REGEX, 'g');
+  // bench('emojibase-regex/emoji', () => {
+  //   assert.deepEqual(
+  //     [...input.matchAll(EMOJIBASE_REGEX_G)]
+  //       .map(match => match[0]),
+  //     expected,
+  //   );
+  // });
 });
 
 run();
diff --git a/package.json b/package.json
@@ -112,6 +112,7 @@
     "@types/node": "^20.12.7",
     "@types/xregexp": "^4.4.0",
     "emoji-regex": "10.3.0",
+    "emojibase-regex": "15.3.2",
     "esbuild": "^0.20.2",
     "fast-check": "^3.17.1",
     "grapheme-splitter": "1.0.4",

diff --git a/yarn.lock b/yarn.lock
@@ -1802,6 +1802,13 @@ __metadata:
   languageName: node
   linkType: hard
 
+"emojibase-regex@npm:15.3.2":
+  version: 15.3.2
+  resolution: "emojibase-regex@npm:15.3.2"
+  checksum: 10c0/9500690ef4df9b6bee6039579d2bd324cca347ba55d34ffd9d1a3fc55a1dc78fe261f2282d803a0c945fd90943e32f05d6a7822e5bdeebb48b8432c370947daa
+  languageName: node
+  linkType: hard
+
 "encoding@npm:^0.1.13":
   version: 0.1.13
   resolution: "encoding@npm:0.1.13"
@@ -4792,6 +4799,7 @@ __metadata:
     "@types/node": "npm:^20.12.7"
     "@types/xregexp": "npm:^4.4.0"
     emoji-regex: "npm:10.3.0"
+    emojibase-regex: "npm:15.3.2"
     esbuild: "npm:^0.20.2"
     fast-check: "npm:^3.17.1"
     grapheme-splitter: "npm:1.0.4"