Skip to content

Commit aa084e7

Browse files
committed
add two new types, unicode strings, and other encoded strings
This commit adds two new types, one for Unicode strings and one for other encoded strings. Unicode strings have no extra wire protocol overhead, whereas "other" strings send the encoding name along with the string.
1 parent 4e78dc4 commit aa084e7

File tree

6 files changed

+77
-34
lines changed

6 files changed

+77
-34
lines changed

ext/bert/c/decode.c

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,12 @@
1515
#define ERL_BIN 109
1616
#define ERL_SMALL_BIGNUM 110
1717
#define ERL_LARGE_BIGNUM 111
18+
#define ERL_ENC_STRING 112
19+
#define ERL_UNICODE_STRING 113
1820
#define ERL_VERSION 131
1921
#define ERL_VERSION2 132
2022

21-
#define BERT_VALID_TYPE(t) ((t) >= ERL_SMALL_INT && (t) <= ERL_LARGE_BIGNUM)
23+
#define BERT_VALID_TYPE(t) ((t) >= ERL_SMALL_INT && (t) <= ERL_UNICODE_STRING)
2224
#define BERT_TYPE_OFFSET (ERL_SMALL_INT)
2325

2426
static VALUE rb_mBERT;
@@ -47,7 +49,9 @@ static VALUE bert_read_nil(struct bert_buf *buf);
4749
static VALUE bert_read_string(struct bert_buf *buf);
4850
static VALUE bert_read_list(struct bert_buf *buf);
4951
static VALUE bert_read_bin(struct bert_buf *buf);
50-
static VALUE bert_read_bin_v2(struct bert_buf *buf);
52+
static VALUE bert_read_enc_string(struct bert_buf *buf);
53+
static VALUE bert_read_unicode_string(struct bert_buf *buf);
54+
static VALUE bert_read_unicode_string(struct bert_buf *buf);
5155
static VALUE bert_read_sbignum(struct bert_buf *buf);
5256
static VALUE bert_read_lbignum(struct bert_buf *buf);
5357

@@ -66,25 +70,9 @@ static bert_ptr bert_callbacks[] = {
6670
&bert_read_list,
6771
&bert_read_bin,
6872
&bert_read_sbignum,
69-
&bert_read_lbignum
70-
};
71-
72-
static bert_ptr bert_callbacks_v2[] = {
73-
&bert_read_sint,
74-
&bert_read_int,
75-
&bert_read_float,
76-
&bert_read_atom,
77-
&bert_read_invalid,
78-
&bert_read_invalid,
79-
&bert_read_invalid,
80-
&bert_read_stuple,
81-
&bert_read_ltuple,
82-
&bert_read_nil,
83-
&bert_read_string,
84-
&bert_read_list,
85-
&bert_read_bin_v2,
86-
&bert_read_sbignum,
87-
&bert_read_lbignum
73+
&bert_read_lbignum,
74+
&bert_read_enc_string,
75+
&bert_read_unicode_string
8876
};
8977

9078
static inline uint8_t bert_buf_read8(struct bert_buf *buf)
@@ -318,7 +306,17 @@ static VALUE bert_read_bin(struct bert_buf *buf)
318306
return rb_bin;
319307
}
320308

321-
static VALUE bert_read_bin_v2(struct bert_buf *buf)
309+
static VALUE bert_read_unicode_string(struct bert_buf *buf)
310+
{
311+
VALUE rb_str;
312+
313+
rb_str = bert_read_bin(buf);
314+
rb_enc_associate(rb_str, rb_utf8_encoding());
315+
316+
return rb_str;
317+
}
318+
319+
static VALUE bert_read_enc_string(struct bert_buf *buf)
322320
{
323321
uint8_t type;
324322
VALUE rb_bin, enc;
@@ -524,7 +522,7 @@ static VALUE rb_bert_decode(VALUE klass, VALUE rb_string)
524522
buf.callbacks = bert_callbacks;
525523
break;
526524
case ERL_VERSION2:
527-
buf.callbacks = bert_callbacks_v2;
525+
buf.callbacks = bert_callbacks;
528526
break;
529527
default:
530528
rb_raise(rb_eTypeError, "Invalid magic value for BERT string");

lib/bert/decode.rb

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ def read_bin
99
length = read_4
1010
read_string(length)
1111
end
12+
13+
def read_erl_string
14+
fail("Invalid Type, not an erlang string") unless read_1 == STRING
15+
length = read_2
16+
read_string(length).unpack('C' * length)
17+
end
1218
end
1319

1420
def self.impl
@@ -52,6 +58,8 @@ def read_any_raw
5258
when STRING then read_erl_string
5359
when LIST then read_list
5460
when BIN then read_bin
61+
when ENC_STRING then read_enc_string
62+
when UNICODE_STRING then read_unicode_string
5563
else
5664
fail("Unknown term tag: #{peek_1}")
5765
end
@@ -238,6 +246,14 @@ def read_nil
238246
[]
239247
end
240248

249+
def read_unicode_string
250+
fail("Invalid Type, not a unicode string") unless read_1 == UNICODE_STRING
251+
length = read_4
252+
str = read_string(length)
253+
str.force_encoding "UTF-8"
254+
str
255+
end
256+
241257
def read_erl_string
242258
fail("Invalid Type, not an erlang string") unless read_1 == STRING
243259
length = read_2
@@ -255,16 +271,24 @@ def read_list
255271
def read_bin
256272
fail("Invalid Type, not an erlang binary") unless read_1 == BIN
257273
length = read_4
274+
read_string(length)
275+
end
276+
277+
def fail(str)
278+
raise str
279+
end
280+
281+
private
282+
283+
def read_enc_string
284+
fail("Invalid Type, not an erlang binary") unless read_1 == ENC_STRING
285+
length = read_4
258286
x = read_string(length)
259287

260288
fail("Invalid Type, not an erlang binary") unless read_1 == BIN
261289
length = read_4
262290
x.force_encoding read_string(length)
263291
x
264292
end
265-
266-
def fail(str)
267-
raise str
268-
end
269293
end
270294
end

lib/bert/encode.rb

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,35 @@ class Encode
44

55
class V2 < Encode
66
def write_binary(data)
7-
super
7+
enc = data.encoding
8+
case enc
9+
when ::Encoding::UTF_8, ::Encoding::US_ASCII
10+
write_unicode_string data
11+
when ::Encoding::ASCII_8BIT
12+
super
13+
else
14+
write_enc_string data
15+
end
16+
end
17+
18+
private
19+
20+
def write_unicode_string(data)
21+
write_1 UNICODE_STRING
22+
write_4 data.bytesize
23+
write_string data
24+
end
25+
26+
def write_enc_string(data)
27+
write_1 ENC_STRING
28+
write_4 data.bytesize
29+
write_string data
830
enc = data.encoding.name
931
write_1 BIN
1032
write_4 enc.bytesize
1133
write_string enc
1234
end
1335

14-
private
15-
1636
def version_header
1737
VERSION_2
1838
end

lib/bert/types.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@ module Types
1212
STRING = 107
1313
LIST = 108
1414
BIN = 109
15+
ENC_STRING = 112
16+
UNICODE_STRING = 113
1517
FUN = 117
16-
NEW_FUN = 112
1718
MAGIC = 131
1819
VERSION_2 = 132
1920
MAX_INT = (1 << 27) -1

test/bert_test.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ class BertTest < Test::Unit::TestCase
1313
setup do
1414
@old_version = BERT::Encode.version
1515
BERT::Encode.version = :v2
16-
@bert = "\x84h\td\x00\x04userh\x03d\x00\x04bertd\x00\x04dictl\x00\x00\x00\x01h\x02d\x00\x04namem\x00\x00\x00\x03TPWm\x00\x00\x00\x05UTF-8jl\x00\x00\x00\x02h\x04d\x00\x04bertd\x00\x05regexm\x00\x00\x00\x03catm\x00\x00\x00\bUS-ASCIIl\x00\x00\x00\x01d\x00\bcaselessjc9.900000000000000e+00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00jh\x05d\x00\x04bertd\x00\x04timeb\x00\x00\x04\xE6b\x00\x0E\xE4\xC3a\x00h\x02d\x00\x04bertd\x00\x03nilh\x02d\x00\x04bertd\x00\x04trueh\x02d\x00\x04bertd\x00\x05falsed\x00\x04trued\x00\x05false".b
17-
@ebin = "<<132,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,109,0,0,0,3,84,80,87,109,0,0,0,5,85,84,70,45,56,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,109,0,0,0,3,99,97,116,109,0,0,0,8,85,83,45,65,83,67,73,73,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
16+
@bert = "\x84h\td\x00\x04userh\x03d\x00\x04bertd\x00\x04dictl\x00\x00\x00\x01h\x02d\x00\x04nameq\x00\x00\x00\x03TPWjl\x00\x00\x00\x02h\x04d\x00\x04bertd\x00\x05regexq\x00\x00\x00\x03catl\x00\x00\x00\x01d\x00\bcaselessjc9.900000000000000e+00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00jh\x05d\x00\x04bertd\x00\x04timeb\x00\x00\x04\xE6b\x00\x0E\xE4\xC3a\x00h\x02d\x00\x04bertd\x00\x03nilh\x02d\x00\x04bertd\x00\x04trueh\x02d\x00\x04bertd\x00\x05falsed\x00\x04trued\x00\x05false".b
17+
@ebin = "<<132,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,113,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,113,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
1818
end
1919

2020
teardown do

test/encoder_test.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ class EncoderTest < Test::Unit::TestCase
112112

113113
should 'handle utf8 strings' do
114114
str = "été".encode 'UTF-8'
115-
bert = [132, 109, 0, 0, 0, 5, 195, 169, 116, 195, 169, 109, 0, 0, 0, 5, 85, 84, 70, 45, 56].pack('C*')
115+
bert = [132, 113, 0, 0, 0, 5, 195, 169, 116, 195, 169].pack('C*')
116116
assert_equal bert, BERT::Encoder.encode("été")
117117
end
118118

0 commit comments

Comments
 (0)