From 84ee65055124659c9f26a906eadd575e71f01b8a Mon Sep 17 00:00:00 2001 From: Alexander Shtuchkin Date: Wed, 15 Jul 2020 18:08:49 -0400 Subject: [PATCH] Implement UTF-16LE encoding, update tests, adjust codec interface Three major reasons for reimplementing UTF-16 and not use native codec: 1. We want to remove StringDecoder & Buffer references due to #235. 2. StringDecoder is inconsistent with handling surrogates on Node v6-9 3. NPM module string_decoder gives strange results when processing chunks - it sometimes prepends '\u0000', likely due to a bug. Performance was and is a major concern here. Decoder shouldn't be affected because it uses backend methods directly. Encoder is affected due to introducing character-level loop. It's still very fast (~450Mb/s), so I'm not too worried. If needed, we can make it about 4x faster in Node.js by introducing a dedicated backend method. Browser speeds will be the same. --- encodings/internal.js | 3 +- encodings/utf16.js | 177 +++++++++++++++++++++++++++++------ lib/index.js | 14 ++- test/streams-test.js | 16 +--- test/utf16-test.js | 211 ++++++++++++++++++++++++++++++++++-------- test/utils.js | 26 ++++++ 6 files changed, 360 insertions(+), 87 deletions(-) diff --git a/encodings/internal.js b/encodings/internal.js index dc1074f..d04ed2f 100644 --- a/encodings/internal.js +++ b/encodings/internal.js @@ -9,8 +9,7 @@ module.exports = { cesu8: { type: "_internal", bomAware: true}, unicode11utf8: "utf8", - ucs2: { type: "_internal", bomAware: true}, - utf16le: "ucs2", + // NOTE: utf-16le/ucs2 are in utf16.js. binary: { type: "_internal" }, base64: { type: "_internal" }, diff --git a/encodings/utf16.js b/encodings/utf16.js index ba23f9a..ed9bb09 100644 --- a/encodings/utf16.js +++ b/encodings/utf16.js @@ -1,17 +1,123 @@ "use strict"; -// Note: UTF16-LE (or UCS2) codec is Node.js native. See encodings/internal.js +// == UTF16-LE codec. 
==========================================================
+// Note: We're not using Node.js native codec because StringDecoder implementation is buggy
+// (adds \0 in some chunks; doesn't flag non-even number of bytes). We do use raw encoding/decoding
+// routines for performance, though.
+
+exports.utf16le = class Utf16LECodec {
+    createEncoder(options, iconv) {
+        return new Utf16LEEncoder(iconv.backend);
+    }
+    createDecoder(options, iconv) {
+        return new Utf16LEDecoder(iconv.backend, iconv.defaultCharUnicode);
+    }
+    get bomAware() { return true; }
+}
+
+class Utf16LEEncoder {
+    constructor(backend) {
+        this.backend = backend;
+    }
+
+    write(str) {
+        const bytes = this.backend.allocBytes(str.length * 2);
+        const chars = new Uint16Array(bytes.buffer, bytes.byteOffset, str.length);
+        for (let i = 0; i < str.length; i++) {
+            chars[i] = str.charCodeAt(i);
+        }
+        return this.backend.bytesToResult(bytes, bytes.length);
+    }
+
+    end() {}
+}
+
+class Utf16LEDecoder {
+    constructor(backend, defaultChar) {
+        this.backend = backend;
+        this.defaultChar = defaultChar;
+        this.overflowByte = -1;
+        this.prefixSurrogate = undefined;
+    }
+
+    write(buf) {
+        if (buf.length == 0) {
+            return '';
+        }
+        let byteOffset = buf.byteOffset;
+        let byteLen = buf.length;
+
+        // Process previous overflowByte
+        let prefix = '';
+        if (this.overflowByte !== -1) {
+            byteOffset++; byteLen--;
+            prefix = String.fromCharCode(this.overflowByte + (buf[0] << 8));
+        }
+
+        // Set new overflowByte
+        if (byteLen & 1) {
+            this.overflowByte = buf[buf.length-1];
+            byteLen--;
+        } else {
+            this.overflowByte = -1;
+        }
+
+        let chars;
+        if ((byteOffset & 1) === 0) {
+            // If byteOffset is aligned, just use the ArrayBuffer from input buf.
+            chars = new Uint16Array(buf.buffer, byteOffset, byteLen >> 1);
+        } else {
+            // If byteOffset is NOT aligned, create a new aligned buffer and copy the data.
+ chars = this.backend.allocRawChars(byteLen >> 1); + const srcByteView = new Uint8Array(buf.buffer, byteOffset, byteLen); + const destByteView = new Uint8Array(chars.buffer, chars.byteOffset, byteLen); + destByteView.set(srcByteView); + } + + let res = prefix + this.backend.rawCharsToResult(chars, chars.length); + if (res) { + // Add high surrogate from previous chunk. + if (this.prefixSurrogate) { + res = this.prefixSurrogate + res; + this.prefixSurrogate = undefined; + } + + // Slice off a new high surrogate at the end of the current chunk. + const lastChar = res.charCodeAt(res.length-1); + if (0xD800 <= lastChar && lastChar < 0xDC00) { + this.prefixSurrogate = res[res.length-1]; + res = res.slice(0, -1); + } + } + return res; + } + + end() { + if (this.prefixSurrogate || this.overflowByte !== -1) { + const res = (this.prefixSurrogate ? this.prefixSurrogate : '') + (this.overflowByte !== -1 ? this.defaultChar : ''); + this.prefixSurrogate = undefined; + this.overflowByte = -1; + return res; + } + } +} +exports.ucs2 = "utf16le"; // Alias + // == UTF16-BE codec. 
========================================================== exports.utf16be = class Utf16BECodec { - get encoder() { return Utf16BEEncoder; } - get decoder() { return Utf16BEDecoder; } + createEncoder(options, iconv) { + return new Utf16BEEncoder(iconv.backend); + } + createDecoder(options, iconv) { + return new Utf16BEDecoder(iconv.backend, iconv.defaultCharUnicode); + } get bomAware() { return true; } } class Utf16BEEncoder { - constructor(opts, codec, backend) { + constructor(backend) { this.backend = backend; } @@ -30,30 +136,59 @@ class Utf16BEEncoder { } class Utf16BEDecoder { - constructor(opts, codec, backend) { + constructor(backend, defaultChar) { this.backend = backend; + this.defaultChar = defaultChar; this.overflowByte = -1; + this.prefixSurrogate = undefined; } write(buf) { + if (buf.length === 0) { + return ''; + } + const chars = this.backend.allocRawChars((buf.length+1) >> 1); let charsPos = 0, i = 0; - if (this.overflowByte !== -1 && i < buf.length) { + if (this.overflowByte !== -1) { chars[charsPos++] = (this.overflowByte << 8) + buf[i++]; } + // NOTE: we can win another 10% perf by using chars[i >> 1]. + // NOTE: the double-reverse method takes almost the same time. for (; i < buf.length-1; i += 2) { chars[charsPos++] = (buf[i] << 8) + buf[i+1]; } this.overflowByte = (i == buf.length-1) ? buf[i] : -1; - return this.backend.rawCharsToResult(chars, charsPos); + let res = this.backend.rawCharsToResult(chars, charsPos); + if (res) { + // Add high surrogate from previous chunk. + if (this.prefixSurrogate) { + res = this.prefixSurrogate + res; + this.prefixSurrogate = undefined; + } + + // Slice off a new high surrogate at the end of the current chunk. 
+ const lastChar = res.charCodeAt(res.length-1); + if (0xD800 <= lastChar && lastChar < 0xDC00) { + this.prefixSurrogate = res[res.length-1]; + res = res.slice(0, -1); + } + } + return res; + } end() { - this.overflowByte = -1; + if (this.prefixSurrogate || this.overflowByte !== -1) { + const res = (this.prefixSurrogate ? this.prefixSurrogate : '') + (this.overflowByte !== -1 ? this.defaultChar : ''); + this.prefixSurrogate = undefined; + this.overflowByte = -1; + return res; + } } } @@ -67,39 +202,25 @@ class Utf16BEDecoder { // Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false). exports.utf16 = class Utf16Codec { - constructor(opts, iconv) { - this.iconv = iconv; - } - get encoder() { return Utf16Encoder; } - get decoder() { return Utf16Decoder; } -} - -class Utf16Encoder { - constructor(options, codec) { + createEncoder(options, iconv) { options = options || {}; if (options.addBOM === undefined) options.addBOM = true; - this.encoder = codec.iconv.getEncoder(options.use || 'utf-16le', options); + return iconv.getEncoder('utf-16le', options); } - - // Pass-through to this.encoder - write(str) { - return this.encoder.write(str); - } - - end() { - return this.encoder.end(); + createDecoder(options, iconv) { + return new Utf16Decoder(options, iconv); } } class Utf16Decoder { - constructor(options, codec) { + constructor(options, iconv) { this.decoder = null; this.initialBufs = []; this.initialBufsLen = 0; this.options = options || {}; - this.iconv = codec.iconv; + this.iconv = iconv; } write(buf) { diff --git a/lib/index.js b/lib/index.js index f85d007..89b5890 100644 --- a/lib/index.js +++ b/lib/index.js @@ -105,8 +105,11 @@ iconv._canonicalizeEncoding = function(encoding) { } iconv.getEncoder = function getEncoder(encoding, options) { - var codec = iconv.getCodec(encoding), - encoder = new codec.encoder(options, codec, iconv.backend); + const codec = iconv.getCodec(encoding); + + let encoder = codec.createEncoder + ? 
codec.createEncoder(options, iconv) + : new codec.encoder(options, codec, iconv.backend); if (codec.bomAware && options && options.addBOM) encoder = new bomHandling.PrependBOM(encoder, options); @@ -115,8 +118,11 @@ iconv.getEncoder = function getEncoder(encoding, options) { } iconv.getDecoder = function getDecoder(encoding, options) { - var codec = iconv.getCodec(encoding), - decoder = new codec.decoder(options, codec, iconv.backend); + const codec = iconv.getCodec(encoding); + + let decoder = codec.createDecoder + ? codec.createDecoder(options, iconv) + : new codec.decoder(options, codec, iconv.backend); if (codec.bomAware && !(options && options.stripBOM === false)) decoder = new bomHandling.StripBOM(decoder, options); diff --git a/test/streams-test.js b/test/streams-test.js index 202781f..1de95bd 100644 --- a/test/streams-test.js +++ b/test/streams-test.js @@ -213,17 +213,7 @@ describe("Streaming mode", function() { encoding: "ucs2", input: [[0x3D], [0xD8, 0x3B], [0xDE]], // U+1F63B, 😻, SMILING CAT FACE WITH HEART-SHAPED EYES outputType: false, // Don't concat - checkOutput: function(res) { - if (semver.satisfies(process.version, '>= 6.2.1 < 10.0.0')) { - // After a string_decoder rewrite in https://github.com/nodejs/node/pull/6777, which - // was merged in Node v6.2.1, we don't merge chunks anymore. - // Not really correct, but it seems we cannot do anything with it. - // Though it has been fixed again in Node v10.0.0 - assert.deepEqual(res, ["\uD83D", "\uDE3B"]); - } else { - assert.deepEqual(res, ["\uD83D\uDE3B"]); // We should have only 1 chunk. - } - }, + checkOutput: function(res) { assert.deepEqual(res, ["\uD83D\uDE3B"]); }, // We should have only 1 chunk. 
})); it("Encoding using internal modules: utf8", checkEncodeStream({ @@ -264,13 +254,13 @@ describe("Streaming mode", function() { it("Decoding of uneven length buffers from UTF-16BE - 2", checkDecodeStream({ encoding: "UTF-16BE", - input: [[0x00, 0x61, 0x00], [0x62, 0x00, 0x63]], + input: [[0x00, 0x61, 0x00], [0x62, 0x00], [0x63]], output: "abc" })); it("Decoding of uneven length buffers from UTF-16", checkDecodeStream({ encoding: "UTF-16", - input: [[0x61], [0x0], [0x20], [0x0]], + input: [[0x61], [0x0, 0x20], [0x0]], output: "a " })); diff --git a/test/utf16-test.js b/test/utf16-test.js index 7a18188..013bb84 100644 --- a/test/utf16-test.js +++ b/test/utf16-test.js @@ -1,80 +1,211 @@ -var assert = require('assert'), - utils = require('./utils'), - iconv = utils.requireIconv(), - hex = utils.hex; - -var testStr = "1aя中文☃💩"; - utf16beBuf = utils.bytesFrom([0, 0x31, 0, 0x61, 0x04, 0x4f, 0x4e, 0x2d, 0x65, 0x87, 0x26, 0x03, 0xd8, 0x3d, 0xdc, 0xa9]), - utf16leBuf = utils.bytesFrom([0x31, 0, 0x61, 0, 0x4f, 0x04, 0x2d, 0x4e, 0x87, 0x65, 0x03, 0x26, 0x3d, 0xd8, 0xa9, 0xdc]), - utf16beBOM = utils.bytesFrom([0xFE, 0xFF]), - utf16leBOM = utils.bytesFrom([0xFF, 0xFE]), - sampleStr = '\n<俄语>данные'; - -describe("UTF-16LE codec #node-web", function() { +"use strict"; + +const assert = require('assert'), + utils = require('./utils'), + iconv = utils.requireIconv(), + hex = utils.hex; + +const testStr = "1aя中文☃💩", + utf16beBuf = utils.bytesFrom([0, 0x31, 0, 0x61, 0x04, 0x4f, 0x4e, 0x2d, 0x65, 0x87, 0x26, 0x03, 0xd8, 0x3d, 0xdc, 0xa9]), + utf16leBuf = utils.bytesFrom([0x31, 0, 0x61, 0, 0x4f, 0x04, 0x2d, 0x4e, 0x87, 0x65, 0x03, 0x26, 0x3d, 0xd8, 0xa9, 0xdc]), + utf16beBOM = utils.bytesFrom([0xFE, 0xFF]), + utf16leBOM = utils.bytesFrom([0xFF, 0xFE]), + sampleStr = '\n<数据>נְתוּנִים', + weirdBuf = utils.bytesFrom([0x15, 0x16, 0x17, 0x18]); // Can't automatically detect whether it's LE or BE. 
+ + +describe("UTF-16LE encoder #node-web", function() { + const enc = 'utf16-le'; + it("encodes basic strings correctly", function() { + assert.equal(hex(iconv.encode('', enc)), ''); + assert.equal(hex(iconv.encode(testStr, enc)), hex(utf16leBuf)); + }); + + it("adds BOM if asked", function() { + assert.equal(hex(iconv.encode(testStr, enc, {addBOM: true})), hex(utf16leBOM) + hex(utf16leBuf)); + }); + + // NOTE: I'm not sure what the right behavior is here. Node.js keeps all invalid surrogates as-is for + // both utf-16le and ucs2 encodings. TextEncoder can't encode utf-16, but when using utf-8, replaces + // these with '�'. Leaning towards Node side for now. + it("keeps single and invalid surrogates as-is", function() { + assert.equal(hex(iconv.encode(' \uD800 \uDE00 \uDE00\uD800 \uD800', enc)), + "2000 00d8 2000 00de 2000 00de 00d8 2000 00d8".replace(/ /g, "")); + }); + + it("keeps valid surrogate pairs split on a chunk boundary unchanged", function() { + const encoder = iconv.getEncoder(enc); + assert.equal(hex(encoder.write('\uD83D')), '3dd8'); + assert.equal(hex(encoder.write('\uDCA9')), 'a9dc'); + assert.strictEqual(encoder.end(), undefined); + }); +}); + +describe("UTF-16LE decoder #node-web", function() { + const enc = 'utf16-le'; + it("decodes basic buffers correctly", function() { + assert.equal(iconv.decode(utf16leBuf, enc), testStr); + }); + + it("decodes uneven length buffers showing an error", function() { + assert.equal(iconv.decode(utils.bytesFrom([0x61, 0, 0]), enc), "a�"); + }); + it("decodes very short buffers correctly", function() { - assert.equal(iconv.decode(utils.bytesFrom([]), 'utf-16le'), ''); - - // Looks like StringDecoder doesn't do the right thing here, returning '\u0000'. TODO: fix. 
- //assert.equal(iconv.decode(utils.bytesFrom([0x61]), 'utf-16le'), ''); + assert.equal(iconv.decode(utils.bytesFrom([]), enc), ''); + assert.equal(iconv.decode(utils.bytesFrom([0x61]), enc), '�'); }); + + it("handles chunks with uneven lengths correctly", utils.checkDecoderChunks(enc, { + inputs: [[], [0x61], [], [0x00], [0x61], [0x00, 0x61], [0x00, 0x00]], + outputs: ['', '', '', 'a', '', 'a', 'a', '�'], + })); + + it("doesn't split valid surrogate pairs between chunks", utils.checkDecoderChunks(enc, [{ + inputs: [[0x3D, 0xD8, 0x3B], [0xDE]], + outputs: [ '', "\uD83D\uDE3B"], + }, { + inputs: [[0x3D, 0xD8], [0x3B], [0xDE]], + outputs: [ '', '', "\uD83D\uDE3B"], + }, { + inputs: [[0x3D], [0xD8, 0x3B], [0xDE]], + outputs: [ '', '', "\uD83D\uDE3B"], + }, { + inputs: [[0x3D], [0xD8], [0x3B], [0xDE]], + outputs: [ '', '', '', "\uD83D\uDE3B"], + }])); + + it("handles complex surrogate pairs cases", utils.checkDecoderChunks(enc, [{ + inputs: [[0x3E], [0xD9], [0x3D], [0xD8], [0x3B], [0xDE]], + outputs: [ '', '', '', '\uD93E', '', "\uD83D\uDE3B"] + }, { + inputs: [[0x3E, 0xD9, 0x3D], [0xD8], [0x3B, 0xDE]], + outputs: [ '', '\uD93E', "\uD83D\uDE3B"], + }, { + inputs: [[0x3E, 0xD9, 0x3D]], + outputs: [ '', '\uD93E�'], + }, { + inputs: [[0x3E, 0xD9], [0x3D]], + outputs: [ '', '', '\uD93E�'], + }, { + inputs: [[0x3E, 0xD9]], + outputs: [ '', '\uD93E'], + }])); }); -describe("UTF-16BE codec #node-web", function() { +describe("UTF-16BE encoder #node-web", function() { + const enc = 'utf16-be'; it("encodes basic strings correctly", function() { - assert.equal(hex(iconv.encode(testStr, 'utf16-be')), hex(utf16beBuf)); + assert.equal(hex(iconv.encode('', enc)), ''); + assert.equal(hex(iconv.encode(testStr, enc)), hex(utf16beBuf)); }); + it("adds BOM if asked", function() { + assert.equal(hex(iconv.encode(testStr, enc, {addBOM: true})), hex(utf16beBOM) + hex(utf16beBuf)); + }); + + // See note in UTF-16LE encoder above; we need to keep them consistent. 
+ it("keeps single and invalid surrogates as-is", function() { + assert.equal(hex(iconv.encode(' \uD800 \uDE00 \uDE00\uD800 \uD800', enc)), + "0020 d800 0020 de00 0020 de00 d800 0020 d800".replace(/ /g, "")); + }); + + it("handles valid surrogate pairs on chunk boundary correctly", function() { + const encoder = iconv.getEncoder(enc); + assert.equal(hex(encoder.write('\uD83D')), 'd83d'); + assert.equal(hex(encoder.write('\uDCA9')), 'dca9'); + assert.strictEqual(encoder.end(), undefined); + }); +}); + +describe("UTF-16BE decoder #node-web", function() { + const enc = 'utf16-be'; it("decodes basic buffers correctly", function() { - assert.equal(iconv.decode(utf16beBuf, 'utf16-be'), testStr); + assert.equal(iconv.decode(utf16beBuf, enc), testStr); }); - it("decodes uneven length buffers with no error", function() { - assert.equal(iconv.decode(utils.bytesFrom([0, 0x61, 0]), 'utf16-be'), "a"); + it("decodes uneven length buffers showing an error", function() { + assert.equal(iconv.decode(utils.bytesFrom([0, 0x61, 0]), enc), "a�"); }); it("decodes very short buffers correctly", function() { - assert.equal(iconv.decode(utils.bytesFrom([]), 'utf-16be'), ''); - assert.equal(iconv.decode(utils.bytesFrom([0x61]), 'utf-16be'), ''); + assert.equal(iconv.decode(utils.bytesFrom([]), enc), ''); + assert.equal(iconv.decode(utils.bytesFrom([0x61]), enc), '�'); }); + + it("handles chunks with uneven lengths correctly", utils.checkDecoderChunks(enc, { + inputs: [[], [0x00], [], [0x61], [0x00], [0x61, 0x00], [0x61, 0x00]], + outputs: ['', '', '', 'a', '', 'a', 'a', '�'], + })); + + it("doesn't split valid surrogate pairs between chunks", utils.checkDecoderChunks(enc, [{ + inputs: [[0xD8, 0x3D, 0xDE], [0x3B]], + outputs: [ '', "\uD83D\uDE3B"], + }, { + inputs: [[0xD8, 0x3D], [0xDE], [0x3B]], + outputs: [ '', '', "\uD83D\uDE3B"], + }, { + inputs: [[0xD8], [0x3D, 0xDE], [0x3B]], + outputs: [ '', '', "\uD83D\uDE3B"], + }, { + inputs: [[0xD8], [0x3D], [0xDE], [0x3B]], + outputs: [ '', '', 
'', "\uD83D\uDE3B"], + }])); + + it("handles complex surrogate pairs cases", utils.checkDecoderChunks(enc, [{ + inputs: [[0xD9], [0x3E], [0xD8], [0x3D], [0xDE], [0x3B]], + outputs: [ '', '', '', '\uD93E', '', "\uD83D\uDE3B"] + }, { + inputs: [[0xD9, 0x3E, 0xD8], [0x3D], [0xDE, 0x3B]], + outputs: [ '', '\uD93E', "\uD83D\uDE3B"], + }, { + inputs: [[0xD9, 0x3E, 0xD8]], + outputs: [ '', '\uD93E�'], + }, { + inputs: [[0xD9, 0x3E], [0xD8]], + outputs: [ '', '', '\uD93E�'], + }, { + inputs: [[0xD9, 0x3E]], + outputs: [ '', '\uD93E'], + }])); }); describe("UTF-16 encoder #node-web", function() { + const enc = 'utf-16'; it("uses UTF-16LE and adds BOM when encoding", function() { - assert.equal(hex(iconv.encode(testStr, "utf-16")), hex(utf16leBOM) + hex(utf16leBuf)); + assert.equal(hex(iconv.encode(testStr, enc)), hex(utf16leBOM) + hex(utf16leBuf)); }); it("can skip BOM", function() { - assert.equal(hex(iconv.encode(testStr, "utf-16", {addBOM: false})), hex(utf16leBuf)); - }); - - it("can use other encodings, for example UTF-16BE, with BOM", function() { - assert.equal(hex(iconv.encode(testStr, "utf-16", {use: 'UTF-16BE'})), hex(utf16beBOM) + hex(utf16beBuf)); + assert.equal(hex(iconv.encode(testStr, enc, {addBOM: false})), hex(utf16leBuf)); }); }); describe("UTF-16 decoder #node-web", function() { + const enc = 'utf-16', + encLE = 'utf-16le', + encBE = 'utf-16be'; + it("uses BOM to determine encoding", function() { - assert.equal(iconv.decode(utils.concatBufs([utf16leBOM, utf16leBuf]), "utf-16"), testStr); - assert.equal(iconv.decode(utils.concatBufs([utf16beBOM, utf16beBuf]), "utf-16"), testStr); + assert.equal(iconv.decode(utils.concatBufs([utf16leBOM, utf16leBuf]), enc), testStr); + assert.equal(iconv.decode(utils.concatBufs([utf16beBOM, utf16beBuf]), enc), testStr); }); it("handles very short buffers", function() { - assert.equal(iconv.decode(utils.bytesFrom([]), 'utf-16'), ''); - - // Looks like StringDecoder doesn't do the right thing here. TODO: fix. 
- //assert.equal(iconv.decode(utils.bytesFrom([0x61]), 'utf-16'), ''); + assert.equal(iconv.decode(utils.bytesFrom([]), enc), ''); + assert.equal(iconv.decode(utils.bytesFrom([0x61]), enc), '�'); }); it("uses spaces when there is no BOM to determine encoding", function() { - assert.equal(iconv.decode(iconv.encode(sampleStr, 'utf-16le'), 'utf-16'), sampleStr); - assert.equal(iconv.decode(iconv.encode(sampleStr, 'utf-16be'), 'utf-16'), sampleStr); + assert.equal(iconv.decode(iconv.encode(sampleStr, encLE), enc), sampleStr); + assert.equal(iconv.decode(iconv.encode(sampleStr, encBE), enc), sampleStr); }); it("uses UTF-16LE if no BOM and heuristics failed", function() { - assert.equal(iconv.decode(utf16leBuf, 'utf-16'), testStr); + assert.equal(iconv.decode(weirdBuf, enc), iconv.decode(weirdBuf, encLE)); }); it("can be given a different default encoding", function() { - assert.equal(iconv.decode(utf16leBuf, 'utf-16', {default: 'utf-16le'}), testStr); + assert.equal(iconv.decode(weirdBuf, enc, {defaultEncoding: encBE}), iconv.decode(weirdBuf, encBE)); }); }); diff --git a/test/utils.js b/test/utils.js index dc9e3c8..f0a430b 100644 --- a/test/utils.js +++ b/test/utils.js @@ -1,3 +1,5 @@ +"use strict"; + const assert = require("assert"); const utils = module.exports = { @@ -34,5 +36,29 @@ const utils = module.exports = { assert(nonStrict || (bytes instanceof utils.BytesType)); return bytes.reduce((output, byte) => (output + ('0' + (byte & 0xFF).toString(16)).slice(-2)), ''); }, + + checkDecoderChunks(encoding, cases) { + return () => { + const decoder = utils.iconv.getDecoder(encoding); + if (!Array.isArray(cases)) { + cases = [cases]; + } + + for (let idx = 0; idx < cases.length; idx++) { + const inputs = cases[idx].inputs, + outputs = cases[idx].outputs; + for (let i = 0; i < inputs.length; i++) + assert.strictEqual(decoder.write(utils.bytesFrom(inputs[i])), outputs[i], `position ${i} in case ${idx}`); + + if (outputs.length === inputs.length) { + assert(!decoder.end(), 
`end is not empty in case ${idx}`); + } else if (outputs.length === inputs.length + 1) { + assert.strictEqual(decoder.end(), outputs[outputs.length-1], `end result unexpected in case ${idx}`); + } else { + assert(false, `invalid outputs array size in case ${idx}`); + } + } + } + }, };