diff --git a/README.md b/README.md index ec9bd9f..e39ae35 100644 --- a/README.md +++ b/README.md @@ -198,16 +198,20 @@ Look [benchmark](benchmark) to see how it works. - built-in Unicode `RegExp` - [emoji-regex]@10.3.0 (101M+ weekly downloads on NPM) +- [emojibase-regex]@15.3.2 (192K+ weekly downloads on NPM) #### Package stats | Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) | |---------------------------|---------------|------|--------:|-----------:|----------------:|--------------:| | `unicode-segmenter/emoji` | 15.1.0 | ✔️ | 3,058 | 2,611 | 1,041 | 751 | -| `emoji-regex`* | 15.1.0 (vary) | ✔️ | 12,946 | 12,859 | 2,180 | 1,746 | +| `emoji-regex` | 15.1.0 (vary) | ✔️ | 12,946 | 12,859 | 2,180 | 1,746 | +| `emojibase-regex`* | 15.1.0 | ✖️ | 17,711 | 16,595 | 2,870 | 2,317 | +| `emojibase-regex/emoji`* | 15.1.0 | ✖️ | 13,550 | 12,458 | 2,835 | 2,210 | | `RegExp` w/ `u`* | - | - | 0 | 0 | 0 | 0 | -* `emoji-regex` only supports `Emoji_Presentation` property, not `Extended_Pictographic`. +* `emojibase-regex` matches the `Extended_Pictographic` property. +* `emojibase-regex/emoji` matches only the `Emoji_Presentation` property. * You can build your own `emoji-regex` using [emoji-test-regex-pattern](https://github.com/mathiasbynens/emoji-test-regex-pattern). * `RegExp` Unicode data is always kept up to date as the runtime support. * `RegExp` Unicode may not be available in [some old browsers](https://caniuse.com/mdn-javascript_builtins_regexp_unicode), edge runtimes, or embedded environments. @@ -218,6 +222,8 @@ The runtime performance of `unicode-segmenter/emoji` is enough to test the prese It's \~2.5x worse than `RegExp` w/ `u` for match-all performance, but that's useless examples in the real world because others don't care about grapheme clusters. +You can also handle emojis as part of grapheme processing with `unicode-segmenter/grapheme`. It may be a bit less performant than the dedicated emoji matchers, but it can still be reasonable. +
Details @@ -225,29 +231,39 @@ It's \~2.5x worse than `RegExp` w/ `u` for match-all performance, but that's use cpu: Apple M1 Pro runtime: node v20.13.1 (arm64-darwin) - benchmark time (avg) (min … max) p75 p99 p999 - --------------------------------------------------------------- ----------------------------- + benchmark time (avg) (min … max) p75 p99 p999 + ------------------------------------------------------------------ ----------------------------- • checking if any emoji - --------------------------------------------------------------- ----------------------------- - unicode-segmenter/emoji 16.11 ns/iter (15.28 ns … 339 ns) 16.32 ns 18.66 ns 43.42 ns - RegExp w/ unicode 19.03 ns/iter (16.52 ns … 185 ns) 17.9 ns 46.28 ns 74.85 ns - emoji-regex 43.15 ns/iter (41.54 ns … 73.51 ns) 43.58 ns 47.93 ns 65.73 ns + ------------------------------------------------------------------ ----------------------------- + unicode-segmenter/emoji 16.1 ns/iter (15.54 ns … 257 ns) 16.11 ns 19.88 ns 45.92 ns + unicode-segmenter/grapheme 89.67 ns/iter (75.44 ns … 938 ns) 94.65 ns 136 ns 473 ns + RegExp w/ unicode 17.73 ns/iter (16.5 ns … 90.05 ns) 16.93 ns 36.82 ns 56.01 ns + emoji-regex 42.83 ns/iter (41.32 ns … 409 ns) 43.23 ns 51.88 ns 169 ns + emojibase-regex 145 ns/iter (109 ns … 2'300 ns) 111 ns 942 ns 1'952 ns + emojibase-regex/emoji 77.47 ns/iter (69.74 ns … 916 ns) 73.97 ns 277 ns 667 ns summary for checking if any emoji unicode-segmenter/emoji - 1.18x faster than RegExp w/ unicode - 2.68x faster than emoji-regex + 1.1x faster than RegExp w/ unicode + 2.66x faster than emoji-regex + 4.81x faster than emojibase-regex/emoji + 5.57x faster than unicode-segmenter/grapheme + 9x faster than emojibase-regex • match all emoji - --------------------------------------------------------------- ----------------------------- - unicode-segmenter/emoji 3'215 ns/iter (2'958 ns … 189 µs) 3'208 ns 3'708 ns 11'833 ns - RegExp w/ unicode 1'285 ns/iter (1'221 ns … 1'509 ns) 1'299 ns 1'449 ns 
1'509 ns - emoji-regex 11'696 ns/iter (11'125 ns … 239 µs) 11'667 ns 16'125 ns 20'375 ns + ------------------------------------------------------------------ ----------------------------- + unicode-segmenter/emoji 3'223 ns/iter (3'000 ns … 237 µs) 3'208 ns 3'667 ns 13'417 ns + unicode-segmenter/grapheme 7'904 ns/iter (7'333 ns … 290 µs) 7'791 ns 9'625 ns 72'333 ns + RegExp w/ unicode 1'266 ns/iter (1'215 ns … 1'374 ns) 1'285 ns 1'362 ns 1'374 ns + emoji-regex 11'567 ns/iter (11'083 ns … 193 µs) 11'666 ns 12'500 ns 28'834 ns + emojibase-regex 16'934 ns/iter (16'291 ns … 187 µs) 17'083 ns 18'500 ns 29'708 ns summary for match all emoji unicode-segmenter/emoji - 2.5x slower than RegExp w/ unicode - 3.64x faster than emoji-regex + 2.55x slower than RegExp w/ unicode + 2.45x faster than unicode-segmenter/grapheme + 3.59x faster than emoji-regex + 5.25x faster than emojibase-regex ```
diff --git a/benchmark/bundle-entry-emojibase-regex-emoji.js b/benchmark/bundle-entry-emojibase-regex-emoji.js new file mode 100644 index 0000000..90e3d18 --- /dev/null +++ b/benchmark/bundle-entry-emojibase-regex-emoji.js @@ -0,0 +1 @@ +export { default as EMOJI_REGEX } from 'emojibase-regex/emoji'; diff --git a/benchmark/bundle-entry-emojibase-regex.js b/benchmark/bundle-entry-emojibase-regex.js new file mode 100644 index 0000000..dce75ca --- /dev/null +++ b/benchmark/bundle-entry-emojibase-regex.js @@ -0,0 +1 @@ +export { default as EMOJI_REGEX } from 'emojibase-regex'; diff --git a/benchmark/bundle-stats-emoji.js b/benchmark/bundle-stats-emoji.js index fcecea3..ea8e940 100644 --- a/benchmark/bundle-stats-emoji.js +++ b/benchmark/bundle-stats-emoji.js @@ -25,23 +25,27 @@ let myEntry = await reportBundleStats( let competitors = [ 'emoji-regex', + 'emojibase-regex', + ['emojibase-regex-emoji', 'emojibase-regex/emoji'], ]; let otherEntries = await Promise.all( competitors.map(async (lib) => { + let libEntry = Array.isArray(lib) ? lib[0] : lib; + let libName = Array.isArray(lib) ? 
lib[1] : lib; let result = await build({ write: false, bundle: true, - entryPoints: [path.join(baseDir, `bundle-entry-${lib}.js`)], + entryPoints: [path.join(baseDir, `bundle-entry-${libEntry}.js`)], }); let minResult = await build({ write: false, bundle: true, minify: true, - entryPoints: [path.join(baseDir, `bundle-entry-${lib}.js`)], + entryPoints: [path.join(baseDir, `bundle-entry-${libEntry}.js`)], }); return await reportBundleStats( - lib, + libName, result.outputFiles[0].contents, minResult.outputFiles[0].contents, ); diff --git a/benchmark/performance-emoji.js b/benchmark/performance-emoji.js index 7c58f31..fc84f76 100644 --- a/benchmark/performance-emoji.js +++ b/benchmark/performance-emoji.js @@ -1,10 +1,12 @@ import * as assert from 'node:assert/strict'; import { group, baseline, bench, run } from 'mitata'; import emojiRegex from 'emoji-regex'; -import XRegExp from 'xregexp'; +import EMOJIBASE_REGEX_EXT from 'emojibase-regex'; +import EMOJIBASE_REGEX from 'emojibase-regex/emoji.js'; import { takeCodePoint } from '../src/utils.js'; import { isEmoji } from '../src/emoji.js'; +import { graphemeSegments, GraphemeCategory } from '../src/grapheme.js'; let input = '🚀 새로운 유니코드 분할기 라이브러리 \'unicode-segmenter\'를 소개합니다! 🔍 각종 언어의 문자를 정확하게 구분해주는 강력한 도구입니다. Check it out! 👉 [https://github.com/cometkim/unicode-segmenter] #Unicode #Programming 🌐'; @@ -29,23 +31,45 @@ group('checking if any emoji', () => { } return false; } + + function anyEmojiByGrapheme(input) { + for (const { segment, _cat } of graphemeSegments(input)) { + if (_cat === GraphemeCategory.Extended_Pictographic) { + return true; + } + } + return false; + } + baseline('unicode-segmenter/emoji', () => { assert.equal(anyEmoji(input), true); }); + bench('unicode-segmenter/grapheme', () => { + assert.equal(anyEmojiByGrapheme(input), true); + }); + bench('RegExp w/ unicode', () => { assert.equal(/\p{Extended_Pictographic}/u.test(input), true); }); // Should remove the `g` flag enabled by default. 
- let e = new RegExp(emojiRegex(), ''); + let EMOJI_REGEX = new RegExp(emojiRegex(), ''); bench('emoji-regex', () => { - assert.equal(e.test(input), true); + assert.equal(EMOJI_REGEX.test(input), true); + }); + + bench('emojibase-regex', () => { + assert.equal(EMOJIBASE_REGEX_EXT.test(input), true); + }); + + bench('emojibase-regex/emoji', () => { + assert.equal(EMOJIBASE_REGEX.test(input), true); }); }); group('match all emoji', () => { - function* allEmoji(input) { + function* allEmojis(input) { let cursor = 0; let len = input.length; while (cursor < len) { @@ -58,11 +82,27 @@ group('match all emoji', () => { } } + function* allEmojisByGrapheme(input) { + for (const { segment, _cat } of graphemeSegments(input)) { + if (_cat === GraphemeCategory.Extended_Pictographic) { + yield segment; + } + } + } + let expected = ['🚀', '🔍', '👉', '🌐']; baseline('unicode-segmenter/emoji', () => { assert.deepEqual( - [...allEmoji(input)] + [...allEmojis(input)] + .map(match => match), // iter for fair competition + expected, + ); + }); + + bench('unicode-segmenter/grapheme', () => { + assert.deepEqual( + [...allEmojisByGrapheme(input)] .map(match => match), // iter for fair competition expected, ); @@ -76,14 +116,34 @@ group('match all emoji', () => { ); }); - let e = emojiRegex(); + let EMOJI_REGEX = emojiRegex(); bench('emoji-regex', () => { assert.deepEqual( - [...input.matchAll(e)] + [...input.matchAll(EMOJI_REGEX)] .map(match => match[0]), expected, ); }); + + let EMOJIBASE_REGEX_EXT_G = new RegExp(EMOJIBASE_REGEX_EXT, 'g'); + bench('emojibase-regex', () => { + assert.deepEqual( + [...input.matchAll(EMOJIBASE_REGEX_EXT_G)] + .map(match => match[0]), + expected, + ); + }); + + // Note: It doesn't match Extended_Pictographic + // + // let EMOJIBASE_REGEX_G = new RegExp(EMOJIBASE_REGEX, 'g'); + // bench('emojibase-regex/emoji', () => { + // assert.deepEqual( + // [...input.matchAll(EMOJIBASE_REGEX_G)] + // .map(match => match[0]), + // expected, + // ); + // }); }); run(); diff 
--git a/package.json b/package.json index 16a57bb..53e64cc 100644 --- a/package.json +++ b/package.json @@ -112,6 +112,7 @@ "@types/node": "^20.12.7", "@types/xregexp": "^4.4.0", "emoji-regex": "10.3.0", + "emojibase-regex": "15.3.2", "esbuild": "^0.20.2", "fast-check": "^3.17.1", "grapheme-splitter": "1.0.4", diff --git a/yarn.lock b/yarn.lock index 8dd5352..220bfa0 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1802,6 +1802,13 @@ __metadata: languageName: node linkType: hard +"emojibase-regex@npm:15.3.2": + version: 15.3.2 + resolution: "emojibase-regex@npm:15.3.2" + checksum: 10c0/9500690ef4df9b6bee6039579d2bd324cca347ba55d34ffd9d1a3fc55a1dc78fe261f2282d803a0c945fd90943e32f05d6a7822e5bdeebb48b8432c370947daa + languageName: node + linkType: hard + "encoding@npm:^0.1.13": version: 0.1.13 resolution: "encoding@npm:0.1.13" @@ -4792,6 +4799,7 @@ __metadata: "@types/node": "npm:^20.12.7" "@types/xregexp": "npm:^4.4.0" emoji-regex: "npm:10.3.0" + emojibase-regex: "npm:15.3.2" esbuild: "npm:^0.20.2" fast-check: "npm:^3.17.1" grapheme-splitter: "npm:1.0.4"