Skip to content

Commit 4d507c1

Browse files
committed May 4, 2023
feat: add a decode function and more tests
Signed-off-by: Harikrishnan Balagopal <harikrishmenon@gmail.com>
1 parent b2e96b6 commit 4d507c1

File tree

5 files changed

+228
-9
lines changed

5 files changed

+228
-9
lines changed
 

‎README.md

+10-6
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
# UTF-8 Text Encoder
1+
# UTF-8 Text Encoder/Decoder
22

33
Encodes a string to UTF-8 encoded bytes. https://en.wikipedia.org/wiki/UTF-8
4+
Decodes UTF-8 bytes to a string.
45

56
- No dependencies.
67
- Small code size and simple implementation that you can copy paste.
78

8-
This can act as a polyfill for `TextEncoder` if you only need UTF-8 encoding. Useful when the `TextEncoder` is undefined in places like the `AudioWorklet` API in browsers. See https://github.com/rustwasm/wasm-bindgen/issues/2367 for more info.
9+
This can act as a polyfill for `TextEncoder` and `TextDecoder` if you only need UTF-8 encoding/decoding. Useful when `TextEncoder` and `TextDecoder` are undefined in places like the `AudioWorklet` API in browsers. See https://developer.mozilla.org/en-US/docs/Web/API/Web_Audio_API/Using_AudioWorklet and https://github.com/rustwasm/wasm-bindgen/issues/2367 for more info.
910

1011
## Usage
1112

@@ -18,9 +19,12 @@ or
1819
```
1920
pnpm add @haribala/text-encoder-utf-8
2021
```
21-
Then import `encode` which takes a `string` and returns a `Uint8array`
22+
2223
```js
23-
import { encode } from '@haribala/text-encoder-utf-8';
24+
import { encode, decode } from '@haribala/text-encoder-utf-8';
2425

25-
const bytes = encode(s);
26-
```
26+
const a = 'hi there! 😃 🖐🏻';
27+
const bytes = encode(a);
28+
const b = decode(bytes);
29+
console.log(a === b);
30+
```

‎index.d.ts

+16
Original file line numberDiff line numberDiff line change
@@ -1 +1,17 @@
1+
/**
2+
* Encode a string to UTF-8 bytes.
3+
* @param {string} s - The string to encode.
4+
* @returns {Uint8Array} Bytes containing the UTF-8 encoding of the string.
5+
*/
16
export function encode(_: string): Uint8Array;
7+
8+
/**
9+
* Decode UTF-8 bytes into a string.
10+
* Will throw an error if the input is not valid UTF-8 bytes.
11+
* If there are 3 or fewer continuation bytes at the beginning of the string
12+
* they will be ignored. Similarly if a 2,3,4 byte character overflows the end
13+
* of the string it will also be ignored.
14+
* @param {Uint8Array} arr - Bytes containing the UTF-8 encoding of the string.
15+
* @returns {string} The decoded string.
16+
*/
17+
export function decode(_: Uint8Array): string;

‎index.js

+93
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,96 @@ export const encode = (s) => new Uint8Array([...s].map(c => c.codePointAt(0)).fl
2424
(x & 0x3F) | 0x80,
2525
];
2626
}));
27+
28+
/**
29+
* Decode UTF-8 bytes into a string.
30+
* Will throw an error if the input is not valid UTF-8 bytes.
31+
* If there are 3 or less continuation bytes at the beginning of the string
32+
* they will be ignored. Similarly if a 2,3,4 byte character overflows the end
33+
* of the string it will also be ignored.
34+
* @param {Uint8Array} arr - Bytes containing the UTF-8 encoding of the string.
35+
* @returns {string} The decoded string.
36+
*/
37+
export const decode = (arr) => {
38+
const hex = x => x.toString(16).padStart(2, '0');
39+
const xs = Array.from(arr);
40+
const res = [];
41+
let i = 0;
42+
while (i < xs.length && i < 3 && xs[i] && (xs[i] & 0xC0) === 0x80) i++;
43+
if (i >= xs.length) return '';
44+
if (!(
45+
((xs[i] & 0x80) === 0) || // 1 byte
46+
((xs[i] & 0xE0) === 0xC0) || // 2 byte
47+
((xs[i] & 0xF0) === 0xE0) || // 3 byte
48+
((xs[i] & 0xF8) === 0xF0) // 4 byte
49+
)) {
50+
throw new Error(`invalid utf-8. Expected a leading byte at index ${i} actual ${hex(xs[i])}`);
51+
}
52+
for (; i < xs.length; i++) {
53+
const x = xs[i];
54+
if ((x & 0x80) === 0) {
55+
// 1 byte
56+
res.push(x);
57+
continue;
58+
}
59+
if ((x & 0xE0) === 0xC0) {
60+
// 2 byte
61+
if (i + 1 >= xs.length) break;
62+
const x1 = xs[i + 1];
63+
if ((x1 & 0xC0) !== 0x80) {
64+
throw new Error(`invalid utf-8. Expected a continuation byte at index ${i + 1} actual ${hex(x1)}`);
65+
}
66+
const c = ((x & 0x1F) << 6) | (x1 & 0x3F);
67+
if (c < 0x80 || c >= 0x800) {
68+
throw new Error(`invalid utf-8. Expected an integer between 0x80 and 0x800 at index ${i} actual ${c}`);
69+
}
70+
res.push(c);
71+
i++;
72+
continue;
73+
}
74+
if ((x & 0xF0) === 0xE0) {
75+
// 3 byte
76+
if (i + 2 >= xs.length) break;
77+
const x1 = xs[i + 1];
78+
if ((x1 & 0xC0) !== 0x80) {
79+
throw new Error(`invalid utf-8. Expected a continuation byte at index ${i + 1} actual ${hex(x1)}`);
80+
}
81+
const x2 = xs[i + 2];
82+
if ((x2 & 0xC0) !== 0x80) {
83+
throw new Error(`invalid utf-8. Expected a continuation byte at index ${i + 2} actual ${hex(x2)}`);
84+
}
85+
const c = ((x & 0x0F) << 12) | ((x1 & 0x3F) << 6) | (x2 & 0x3F);
86+
if (c < 0x800 || c >= 0x10000) {
87+
throw new Error(`invalid utf-8. Expected an integer between 0x800 and 0x10000 at index ${i} actual ${c}`);
88+
}
89+
res.push(c);
90+
i += 2;
91+
continue;
92+
}
93+
if ((x & 0xF8) === 0xF0) {
94+
// 4 byte
95+
if (i + 3 >= xs.length) break;
96+
const x1 = xs[i + 1];
97+
if ((x1 & 0xC0) !== 0x80) {
98+
throw new Error(`invalid utf-8. Expected a continuation byte at index ${i + 1} actual ${hex(x1)}`);
99+
}
100+
const x2 = xs[i + 2];
101+
if ((x2 & 0xC0) !== 0x80) {
102+
throw new Error(`invalid utf-8. Expected a continuation byte at index ${i + 2} actual ${hex(x2)}`);
103+
}
104+
const x3 = xs[i + 3];
105+
if ((x3 & 0xC0) !== 0x80) {
106+
throw new Error(`invalid utf-8. Expected a continuation byte at index ${i + 3} actual ${hex(x3)}`);
107+
}
108+
const c = ((x & 0x07) << 18) | ((x1 & 0x3F) << 12) | ((x2 & 0x3F) << 6) | (x3 & 0x3F);
109+
if (c < 0x10000) {
110+
throw new Error(`invalid utf-8. Expected an integer above 0x10000 at index ${i} actual ${c}`);
111+
}
112+
res.push(c);
113+
i += 3;
114+
continue;
115+
}
116+
throw new Error(`invalid utf-8. Expected a leading byte at index ${i} actual ${hex(x)}`);
117+
}
118+
return String.fromCodePoint(...res);
119+
};

‎index.test.js

+107-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { encode } from './index.js';
1+
import { encode, decode } from './index.js';
22

33
test('encode-and-decode', () => {
44
const testCases = [
@@ -7,6 +7,7 @@ test('encode-and-decode', () => {
77
];
88
testCases.forEach(s => {
99
const arr = encode(s);
10+
expect(decode(arr)).toBe(s);
1011
{
1112
// test against TextEncoder
1213
const expected = Array.from(new TextEncoder().encode(s));
@@ -21,3 +22,108 @@ test('encode-and-decode', () => {
2122
}
2223
});
2324
});
25+
26+
test('utf-8-broken-beginning', () => {
27+
const textEnc = new TextEncoder();
28+
const textDec = new TextDecoder();
29+
const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
30+
const arr = new Uint8Array([
31+
0x80, 0x80, 0x80, // continuation byte x 3
32+
...Array.from(textEnc.encode(s)),
33+
]);
34+
expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD' + s);
35+
expect(decode(arr)).toBe(s);
36+
});
37+
38+
test('utf-8-broken-beginning-invalid-continuation-byte', () => {
39+
const textEnc = new TextEncoder();
40+
const textDec = new TextDecoder();
41+
const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
42+
const arr = new Uint8Array([
43+
0x80, 0xFF, 0x80, // continuation byte, invalid byte, continuation byte
44+
...Array.from(textEnc.encode(s)),
45+
]);
46+
expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD' + s);
47+
expect(() => decode(arr)).toThrow('invalid utf-8. Expected a leading byte at index 1 actual ff');
48+
});
49+
50+
test('utf-8-broken-beginning-too-many-continuation-bytes', () => {
51+
const textEnc = new TextEncoder();
52+
const textDec = new TextDecoder();
53+
const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
54+
const arr = new Uint8Array([
55+
0x80, 0x80, 0x80, 0x80, // continuation byte x 4
56+
...Array.from(textEnc.encode(s)),
57+
]);
58+
expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD\uFFFD' + s);
59+
expect(() => decode(arr)).toThrow('invalid utf-8. Expected a leading byte at index 3 actual 80');
60+
});
61+
62+
test('utf-8-broken-beginning-invalid-leading-byte', () => {
63+
const textEnc = new TextEncoder();
64+
const textDec = new TextDecoder();
65+
const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
66+
const arr = new Uint8Array([
67+
0x80, 0x80, 0x80, 0xFF, // continuation byte x 3, then an invalid byte
68+
...Array.from(textEnc.encode(s)),
69+
]);
70+
expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD\uFFFD' + s);
71+
expect(() => decode(arr)).toThrow('invalid utf-8. Expected a leading byte at index 3 actual ff');
72+
});
73+
74+
test('utf-8-broken-ending-partial-multi-byte-character', () => {
75+
const textEnc = new TextEncoder();
76+
const textDec = new TextDecoder();
77+
const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
78+
const arr = new Uint8Array([
79+
0x80, 0x80, 0x80, // continuation byte x 3
80+
...Array.from(textEnc.encode(s)),
81+
0xF0,
82+
]);
83+
expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD' + s + '\uFFFD');
84+
expect(decode(arr)).toBe(s);
85+
});
86+
87+
test('utf-8-broken-middle-invalid-character', () => {
88+
const textEnc = new TextEncoder();
89+
const textDec = new TextDecoder();
90+
const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
91+
const arr = new Uint8Array([
92+
0x80, 0x80, 0x80, // continuation byte x 3
93+
...Array.from(textEnc.encode(s)),
94+
0xFF,
95+
...Array.from(textEnc.encode(s)),
96+
]);
97+
expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD' + s + '\uFFFD' + s);
98+
expect(() => decode(arr)).toThrow('invalid utf-8. Expected a leading byte at index 56 actual ff');
99+
});
100+
101+
test('utf-8-broken-middle-invalid-4-byte-character-not-continuation-byte', () => {
102+
const textEnc = new TextEncoder();
103+
const textDec = new TextDecoder();
104+
const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
105+
const arr = new Uint8Array([
106+
0x80, 0x80, 0x80, // continuation byte x 3
107+
...Array.from(textEnc.encode(s)),
108+
0xF0,
109+
0x80,
110+
...Array.from(textEnc.encode(s)),
111+
]);
112+
expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD' + s + '\uFFFD\uFFFD' + s);
113+
expect(() => decode(arr)).toThrow('invalid utf-8. Expected a continuation byte at index 58 actual 61');
114+
});
115+
116+
test('utf-8-broken-middle-invalid-2-byte-character-out-of-range', () => {
117+
const textEnc = new TextEncoder();
118+
const textDec = new TextDecoder();
119+
const s = 'abcd😊efgh\n012345689\t€\r🧑🏽‍🍳helloworld!';
120+
const arr = new Uint8Array([
121+
0x80, 0x80, 0x80, // continuation byte x 3
122+
...Array.from(textEnc.encode(s)),
123+
0xC0,
124+
0x80,
125+
...Array.from(textEnc.encode(s)),
126+
]);
127+
expect(textDec.decode(arr)).toBe('\uFFFD\uFFFD\uFFFD' + s + '\uFFFD\uFFFD' + s);
128+
expect(() => decode(arr)).toThrow('invalid utf-8. Expected an integer between 0x80 and 0x800 at index 56 actual 0');
129+
});

‎package.json

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
"name": "@haribala/text-encoder-utf-8",
33
"packageManager": "pnpm@8.4.0",
44
"type": "module",
5-
"version": "1.0.3",
6-
"description": "A simple text encoder for utf-8",
5+
"version": "1.1.0",
6+
"description": "A simple text encoder/decoder for utf-8",
77
"main": "index.js",
88
"scripts": {
99
"test": "NODE_OPTIONS=--experimental-vm-modules jest"

0 commit comments

Comments
 (0)
Please sign in to comment.