feat: add a decode function and more tests

HarikrishnanBalagopal · HarikrishnanBalagopal · commit 4d507c182b32 · 2023-05-04T20:19:22.000+05:30
Signed-off-by: Harikrishnan Balagopal &lt;harikrishmenon@gmail.com&gt;
diff --git a/README.md b/README.md
@@ -1,11 +1,12 @@
-# UTF-8 Text Encoder
+# UTF-8 Text Encoder/Decoder
 
 Encodes a string to UTF-8 encoded bytes. https://en.wikipedia.org/wiki/UTF-8
+Decodes UTF-8 bytes to a string.
 
 - No dependencies.
 - Small code size and simple implementation that you can copy paste.
 
-This can act as a polyfill for `TextEncoder` if you only need UTF-8 encoding. Useful when the `TextEncoder` is undefined in places like the `AudioWorklet` API in browsers. See https://github.com/rustwasm/wasm-bindgen/issues/2367 for more info.
+This can act as a polyfill for `TextEncoder` and `TextDecoder` if you only need UTF-8 encoding/decoding. Useful when `TextEncoder` and `TextDecoder` are undefined in places like the `AudioWorklet` API in browsers. See https://developer.mozilla.org/en-US/docs/Web/API/Web_Audio_API/Using_AudioWorklet and https://github.com/rustwasm/wasm-bindgen/issues/2367 for more info.
 
 ## Usage
 
@@ -18,9 +19,12 @@ or
 ```
 pnpm add @haribala/text-encoder-utf-8
 ```
-Then import `encode` which takes a `string` and returns a `Uint8array`
+
 ```js
-import { encode } from '@haribala/text-encoder-utf-8';
+import { encode, decode } from '@haribala/text-encoder-utf-8';
 
-const bytes = encode(s);
-```
+const a = 'hi there! 😃 🖐🏻';
+const bytes = encode(a);
+const b = decode(bytes);
+console.log(a === b);
+```
diff --git a/index.d.ts b/index.d.ts
@@ -1 +1,17 @@
+/**
+ * Encode a string to UTF-8 bytes.
+ * @param {string} _ - The string to encode.
+ * @returns {Uint8Array} Bytes containing the UTF-8 encoding of the string.
+ */
 export function encode(_: string): Uint8Array;
+
+/**
+ * Decode UTF-8 bytes into a string.
+ * Throws an error if the input is not valid UTF-8.
+ * Two cases at the edges of the input are tolerated instead of throwing:
+ * up to 3 continuation bytes at the beginning of the input are skipped, and
+ * a 2-, 3- or 4-byte character that runs past the end of the input is dropped.
+ * @param {Uint8Array} _ - Bytes containing the UTF-8 encoding of the string.
+ * @returns {string} The decoded string.
+ */
+export function decode(_: Uint8Array): string;
diff --git a/index.js b/index.js
@@ -24,3 +24,96 @@ export const encode = (s) => new Uint8Array([...s].map(c => c.codePointAt(0)).fl
         (x & 0x3F) | 0x80,
     ];
 }));
+
+/**
+ * Decode UTF-8 bytes into a string.
+ *
+ * Decoding is strict, with two deliberate exceptions at the edges of the input:
+ * - up to 3 continuation bytes at the very beginning are skipped, and
+ * - a 2-, 3- or 4-byte character that runs past the end of the input is dropped.
+ *
+ * Anywhere else a malformed sequence throws: a byte that cannot start a
+ * character, a missing continuation byte, an overlong encoding, a UTF-16
+ * surrogate (0xD800-0xDFFF) or a code point above 0x10FFFF.
+ * @param {Uint8Array} arr - Bytes containing the UTF-8 encoding of the string.
+ * @returns {string} The decoded string.
+ * @throws {Error} If the input is not valid UTF-8.
+ */
+export const decode = (arr) => {
+    const hex = x => x.toString(16).padStart(2, '0');
+    const len = arr.length;
+    // Validates that the byte at `index` is a continuation byte (10xxxxxx)
+    // and returns its 6 payload bits.
+    const continuation = (index) => {
+        const b = arr[index];
+        if ((b & 0xC0) !== 0x80) {
+            throw new Error(`invalid utf-8. Expected a continuation byte at index ${index} actual ${hex(b)}`);
+        }
+        return b & 0x3F;
+    };
+    const res = [];
+    let i = 0;
+    // Skip at most 3 leading continuation bytes: the tail of a character whose
+    // leading byte is not part of the input. A 4th one is reported by the loop
+    // below as an invalid leading byte.
+    while (i < len && i < 3 && (arr[i] & 0xC0) === 0x80) i++;
+    for (; i < len; i++) {
+        const x = arr[i];
+        if ((x & 0x80) === 0) {
+            // 1 byte: 0xxxxxxx
+            res.push(x);
+            continue;
+        }
+        if ((x & 0xE0) === 0xC0) {
+            // 2 byte: 110xxxxx 10xxxxxx
+            if (i + 1 >= len) break; // character cut off by the end of the input
+            const b1 = continuation(i + 1);
+            const c = ((x & 0x1F) << 6) | b1;
+            // Values below 0x80 are overlong encodings of 1 byte characters.
+            if (c < 0x80 || c >= 0x800) {
+                throw new Error(`invalid utf-8. Expected an integer between 0x80 and 0x800 at index ${i} actual ${c}`);
+            }
+            res.push(c);
+            i++;
+            continue;
+        }
+        if ((x & 0xF0) === 0xE0) {
+            // 3 byte: 1110xxxx 10xxxxxx 10xxxxxx
+            if (i + 2 >= len) break; // character cut off by the end of the input
+            const b1 = continuation(i + 1);
+            const b2 = continuation(i + 2);
+            const c = ((x & 0x0F) << 12) | (b1 << 6) | b2;
+            if (c < 0x800 || c >= 0x10000) {
+                throw new Error(`invalid utf-8. Expected an integer between 0x800 and 0x10000 at index ${i} actual ${c}`);
+            }
+            // UTF-16 surrogates are not valid scalar values and must not appear in UTF-8.
+            if (c >= 0xD800 && c <= 0xDFFF) {
+                throw new Error(`invalid utf-8. Expected a non-surrogate code point at index ${i} actual ${c}`);
+            }
+            res.push(c);
+            i += 2;
+            continue;
+        }
+        if ((x & 0xF8) === 0xF0) {
+            // 4 byte: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            if (i + 3 >= len) break; // character cut off by the end of the input
+            const b1 = continuation(i + 1);
+            const b2 = continuation(i + 2);
+            const b3 = continuation(i + 3);
+            const c = ((x & 0x07) << 18) | (b1 << 12) | (b2 << 6) | b3;
+            if (c < 0x10000) {
+                throw new Error(`invalid utf-8. Expected an integer above 0x10000 at index ${i} actual ${c}`);
+            }
+            // 21 payload bits can hold values past the last Unicode code point.
+            if (c > 0x10FFFF) {
+                throw new Error(`invalid utf-8. Expected an integer up to 0x10ffff at index ${i} actual ${c}`);
+            }
+            res.push(c);
+            i += 3;
+            continue;
+        }
+        throw new Error(`invalid utf-8. Expected a leading byte at index ${i} actual ${hex(x)}`);
+    }
+    // Convert in bounded chunks: spreading every code point into a single call
+    // exceeds the engine's argument limit on large inputs.
+    const CHUNK = 0x2000;
+    let out = '';
+    for (let k = 0; k < res.length; k += CHUNK) {
+        out += String.fromCodePoint(...res.slice(k, k + CHUNK));
+    }
+    return out;
+};
diff --git a/index.test.js b/index.test.js
@@ -1,4 +1,4 @@
-import { encode } from './index.js';
+import { encode, decode } from './index.js';
 
 test('encode-and-decode', () => {
     const testCases = [
@@ -7,6 +7,7 @@ test('encode-and-decode', () => {
     ];
     testCases.forEach(s => {
         const arr = encode(s);
+        expect(decode(arr)).toBe(s);
         {
             // test against TextEncoder
             const expected = Array.from(new TextEncoder().encode(s));
@@ -21,3 +22,108 @@ test('encode-and-decode', () => {
         }
     });
 });
+
+test('utf-8-broken-beginning', () => { // up to 3 stray leading continuation bytes are dropped by decode()
+    const textEnc = new TextEncoder();
+    const textDec = new TextDecoder();
+    const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
+    const arr = new Uint8Array([
+        0x80, 0x80, 0x80, // continuation byte x 3
+        ...Array.from(textEnc.encode(s)),
+    ]);
+    expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD' + s); // TextDecoder substitutes U+FFFD for each stray byte
+    expect(decode(arr)).toBe(s); // decode() skips them without emitting anything
+});
+
+test('utf-8-broken-beginning-invalid-continuation-byte', () => { // skipping stops at the first byte that is not a continuation byte
+    const textEnc = new TextEncoder();
+    const textDec = new TextDecoder();
+    const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
+    const arr = new Uint8Array([
+        0x80, 0xFF, 0x80, // continuation byte, invalid byte, continuation byte
+        ...Array.from(textEnc.encode(s)),
+    ]);
+    expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD' + s);
+    expect(() => decode(arr)).toThrow('invalid utf-8. Expected a leading byte at index 1 actual ff'); // 0xFF matches none of the leading byte patterns
+});
+
+test('utf-8-broken-beginning-too-many-continuation-bytes', () => { // only the first 3 continuation bytes are skipped
+    const textEnc = new TextEncoder();
+    const textDec = new TextDecoder();
+    const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
+    const arr = new Uint8Array([
+        0x80, 0x80, 0x80, 0x80, // continuation byte x 4
+        ...Array.from(textEnc.encode(s)),
+    ]);
+    expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD\uFFFD' + s);
+    expect(() => decode(arr)).toThrow('invalid utf-8. Expected a leading byte at index 3 actual 80'); // the 4th continuation byte is rejected
+});
+
+test('utf-8-broken-beginning-invalid-leading-byte', () => { // the byte after the skipped prefix must be a valid leading byte
+    const textEnc = new TextEncoder();
+    const textDec = new TextDecoder();
+    const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
+    const arr = new Uint8Array([
+        0x80, 0x80, 0x80, 0xFF, // continuation byte x 3, then an invalid byte
+        ...Array.from(textEnc.encode(s)),
+    ]);
+    expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD\uFFFD' + s);
+    expect(() => decode(arr)).toThrow('invalid utf-8. Expected a leading byte at index 3 actual ff');
+});
+
+test('utf-8-broken-ending-partial-multi-byte-character', () => { // a character cut off by the end of the input is dropped
+    const textEnc = new TextEncoder();
+    const textDec = new TextDecoder();
+    const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
+    const arr = new Uint8Array([
+        0x80, 0x80, 0x80, // continuation byte x 3
+        ...Array.from(textEnc.encode(s)),
+        0xF0, // leading byte of a 4-byte character with no continuation bytes after it
+    ]);
+    expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD' + s + '\uFFFD');
+    expect(decode(arr)).toBe(s);
+});
+
+test('utf-8-broken-middle-invalid-character', () => { // an invalid byte in the middle of the input is an error
+    const textEnc = new TextEncoder();
+    const textDec = new TextDecoder();
+    const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
+    const arr = new Uint8Array([
+        0x80, 0x80, 0x80, // continuation byte x 3
+        ...Array.from(textEnc.encode(s)),
+        0xFF, // invalid byte between two valid copies of s
+        ...Array.from(textEnc.encode(s)),
+    ]);
+    expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD' + s + '\uFFFD' + s);
+    expect(() => decode(arr)).toThrow('invalid utf-8. Expected a leading byte at index 56 actual ff'); // 56 = 3 skipped bytes + 53 bytes of s
+});
+
+test('utf-8-broken-middle-invalid-4-byte-character-not-continuation-byte', () => { // a 4-byte character missing continuation bytes mid-input is an error
+    const textEnc = new TextEncoder();
+    const textDec = new TextDecoder();
+    const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
+    const arr = new Uint8Array([
+        0x80, 0x80, 0x80, // continuation byte x 3
+        ...Array.from(textEnc.encode(s)),
+        0xF0, // leading byte of a 4-byte character
+        0x80, // only 1 of its 3 continuation bytes
+        ...Array.from(textEnc.encode(s)),
+    ]);
+    expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD' + s + '\uFFFD\uFFFD' + s);
+    expect(() => decode(arr)).toThrow('invalid utf-8. Expected a continuation byte at index 58 actual 61'); // 0x61 is the 'a' that starts the second copy of s
+});
+
+test('utf-8-broken-middle-invalid-2-byte-character-out-of-range', () => { // a 2-byte sequence that decodes below 0x80 is an error
+    const textEnc = new TextEncoder();
+    const textDec = new TextDecoder();
+    const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
+    const arr = new Uint8Array([
+        0x80, 0x80, 0x80, // continuation byte x 3
+        ...Array.from(textEnc.encode(s)),
+        0xC0, // 2-byte leading byte with all payload bits zero
+        0x80, // continuation byte with all payload bits zero, so the pair decodes to 0
+        ...Array.from(textEnc.encode(s)),
+    ]);
+    expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD' + s + '\uFFFD\uFFFD' + s);
+    expect(() => decode(arr)).toThrow('invalid utf-8. Expected an integer between 0x80 and 0x800 at index 56 actual 0');
+});
diff --git a/package.json b/package.json
@@ -2,8 +2,8 @@
   "name": "@haribala/text-encoder-utf-8",
   "packageManager": "pnpm@8.4.0",
   "type": "module",
-  "version": "1.0.3",
-  "description": "A simple text encoder for utf-8",
+  "version": "1.1.0",
+  "description": "A simple text encoder/decoder for utf-8",
   "main": "index.js",
   "scripts": {
     "test": "NODE_OPTIONS=--experimental-vm-modules jest"