Skip to content

Commit fd47af6

Browse files
committed
Merge pull request #1 from github/encoding
Make BERT encoding aware
2 parents c2abcc4 + 3113c6f commit fd47af6

File tree

7 files changed

+211
-20
lines changed

7 files changed

+211
-20
lines changed

ext/bert/c/decode.c

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "ruby.h"
2+
#include "ruby/encoding.h"
23
#include <stdint.h>
34
#include <netinet/in.h>
45

@@ -14,9 +15,17 @@
1415
#define ERL_BIN 109
1516
#define ERL_SMALL_BIGNUM 110
1617
#define ERL_LARGE_BIGNUM 111
18+
19+
/* These two types are specific to version 2 of the protocol. They diverge
20+
* from Erlang, but allow us to pass string encodings across the wire. */
21+
#define ERLEXT_ENC_STRING 112
22+
#define ERLEXT_UNICODE_STRING 113
23+
24+
/* Protocol version constants. */
1725
#define ERL_VERSION 131
26+
#define ERL_VERSION2 132
1827

19-
#define BERT_VALID_TYPE(t) ((t) >= ERL_SMALL_INT && (t) <= ERL_LARGE_BIGNUM)
28+
#define BERT_VALID_TYPE(t) ((t) >= ERL_SMALL_INT && (t) <= ERLEXT_UNICODE_STRING)
2029
#define BERT_TYPE_OFFSET (ERL_SMALL_INT)
2130

2231
static VALUE rb_mBERT;
@@ -40,6 +49,8 @@ static VALUE bert_read_nil(struct bert_buf *buf);
4049
static VALUE bert_read_string(struct bert_buf *buf);
4150
static VALUE bert_read_list(struct bert_buf *buf);
4251
static VALUE bert_read_bin(struct bert_buf *buf);
52+
static VALUE bert_read_enc_string(struct bert_buf *buf);
53+
static VALUE bert_read_unicode_string(struct bert_buf *buf);
4354
static VALUE bert_read_sbignum(struct bert_buf *buf);
4455
static VALUE bert_read_lbignum(struct bert_buf *buf);
4556

@@ -59,7 +70,9 @@ static bert_ptr bert_callbacks[] = {
5970
&bert_read_list,
6071
&bert_read_bin,
6172
&bert_read_sbignum,
62-
&bert_read_lbignum
73+
&bert_read_lbignum,
74+
&bert_read_enc_string,
75+
&bert_read_unicode_string
6376
};
6477

6578
static inline uint8_t bert_buf_read8(struct bert_buf *buf)
@@ -293,6 +306,34 @@ static VALUE bert_read_bin(struct bert_buf *buf)
293306
return rb_bin;
294307
}
295308

309+
/*
 * Reader for ERLEXT_UNICODE_STRING terms (protocol version 2).
 *
 * The read itself is delegated to bert_read_bin(); the Ruby string it
 * returns is then associated with the UTF-8 encoding and handed back.
 */
static VALUE bert_read_unicode_string(struct bert_buf *buf)
{
	VALUE rb_utf8_str = bert_read_bin(buf);

	rb_enc_associate(rb_utf8_str, rb_utf8_encoding());
	return rb_utf8_str;
}
318+
319+
static VALUE bert_read_enc_string(struct bert_buf *buf)
320+
{
321+
uint8_t type;
322+
VALUE rb_bin, enc;
323+
324+
rb_bin = bert_read_bin(buf);
325+
326+
bert_buf_ensure(buf, 1);
327+
type = bert_buf_read8(buf);
328+
if (ERL_BIN != type)
329+
rb_raise(rb_eRuntimeError, "Invalid tag '%d' for term", type);
330+
331+
enc = bert_read_bin(buf);
332+
rb_enc_associate(rb_bin, rb_find_encoding(enc));
333+
334+
return rb_bin;
335+
}
336+
296337
static VALUE bert_read_string(struct bert_buf *buf)
297338
{
298339
uint16_t i, length;
@@ -467,17 +508,20 @@ static VALUE bert_read_invalid(struct bert_buf *buf)
467508
static VALUE rb_bert_decode(VALUE klass, VALUE rb_string)
468509
{
469510
struct bert_buf buf;
511+
uint8_t proto_version;
470512

471513
Check_Type(rb_string, T_STRING);
472514
buf.data = (uint8_t *)RSTRING_PTR(rb_string);
473515
buf.end = buf.data + RSTRING_LEN(rb_string);
474516

475517
bert_buf_ensure(&buf, 1);
476518

477-
if (bert_buf_read8(&buf) != ERL_VERSION)
478-
rb_raise(rb_eTypeError, "Invalid magic value for BERT string");
479-
480-
return bert_read(&buf);
519+
proto_version = bert_buf_read8(&buf);
520+
if (proto_version == ERL_VERSION || proto_version == ERL_VERSION2) {
521+
return bert_read(&buf);
522+
} else {
523+
rb_raise(rb_eTypeError, "Invalid magic value for BERT string");
524+
}
481525
}
482526

483527
static VALUE rb_bert_impl(VALUE klass)

lib/bert.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@
2222
# Global method for specifying that an array should be encoded as a tuple.
2323
def t
2424
BERT::Tuple
25-
end
25+
end

lib/bert/decode.rb

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,13 @@ def self.impl
1010
def self.decode(string)
1111
io = StringIO.new(string)
1212
io.set_encoding('binary') if io.respond_to?(:set_encoding)
13-
new(io).read_any
13+
header = io.getbyte
14+
case header
15+
when MAGIC, VERSION_2
16+
new(io).read_any
17+
else
18+
fail("Bad Magic")
19+
end
1420
end
1521

1622
def initialize(ins)
@@ -19,7 +25,6 @@ def initialize(ins)
1925
end
2026

2127
def read_any
22-
fail("Bad Magic") unless read_1 == MAGIC
2328
read_any_raw
2429
end
2530

@@ -37,6 +42,8 @@ def read_any_raw
3742
when STRING then read_erl_string
3843
when LIST then read_list
3944
when BIN then read_bin
45+
when ENC_STRING then read_enc_string
46+
when UNICODE_STRING then read_unicode_string
4047
else
4148
fail("Unknown term tag: #{peek_1}")
4249
end
@@ -223,6 +230,14 @@ def read_nil
223230
[]
224231
end
225232

233+
# Reads a UNICODE_STRING term: the tag byte, a length read with read_4, then
# that many bytes of payload. Returns the payload with its encoding forced
# to UTF-8. Fails if the tag byte is not UNICODE_STRING.
def read_unicode_string
  tag = read_1
  fail("Invalid Type, not a unicode string") unless tag == UNICODE_STRING
  byte_count = read_4
  text = read_string(byte_count)
  text.force_encoding("UTF-8")
  text
end
240+
226241
def read_erl_string
227242
fail("Invalid Type, not an erlang string") unless read_1 == STRING
228243
length = read_2
@@ -246,5 +261,18 @@ def read_bin
246261
def fail(str)
247262
raise str
248263
end
264+
265+
private
266+
267+
# Reads an ENC_STRING term: a length-prefixed payload followed by a BIN term
# whose content is the name of the payload's encoding.
#
# Returns the payload with its encoding forced to the named one.
# Fails if the leading tag is not ENC_STRING or if the encoding name is not
# carried in a BIN term.
def read_enc_string
  # Distinct message from the BIN check below so the two failures can be
  # told apart.
  fail("Invalid Type, not an encoded string") unless read_1 == ENC_STRING
  payload_length = read_4
  payload = read_string(payload_length)

  fail("Invalid Type, not an erlang binary") unless read_1 == BIN
  name_length = read_4
  payload.force_encoding read_string(name_length)
  payload
end
249277
end
250278
end

lib/bert/encode.rb

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,47 @@ module BERT
22
class Encode
33
include Types
44

5+
# Encoder for version 2 of the protocol. Strings are written with a term
# type chosen from their Ruby encoding, and the header byte is VERSION_2.
class V2 < Encode
  # Writes +data+ with the term type matching its encoding: UTF-8 and
  # US-ASCII as UNICODE_STRING, ASCII-8BIT through the inherited
  # implementation, anything else as ENC_STRING.
  def write_binary(data)
    encoding = data.encoding
    if [::Encoding::UTF_8, ::Encoding::US_ASCII].include?(encoding)
      write_unicode_string data
    elsif encoding == ::Encoding::ASCII_8BIT
      super
    else
      write_enc_string data
    end
  end

  private

  # UNICODE_STRING tag, byte length, then the bytes themselves.
  def write_unicode_string(data)
    write_length_prefixed_term UNICODE_STRING, data
  end

  # ENC_STRING payload followed by a BIN term naming the payload's encoding.
  def write_enc_string(data)
    write_length_prefixed_term ENC_STRING, data
    write_length_prefixed_term BIN, data.encoding.name
  end

  # Header byte announcing the version 2 format.
  def version_header
    VERSION_2
  end

  # Shared layout of the string terms above: tag, bytesize, raw bytes.
  def write_length_prefixed_term(tag, bytes)
    write_1 tag
    write_4 bytes.bytesize
    write_string bytes
  end
end
40+
41+
class << self
42+
attr_accessor :version
43+
end
44+
self.version = :v1
45+
546
attr_accessor :out
647

748
def initialize(out)
@@ -11,12 +52,18 @@ def initialize(out)
1152
def self.encode(data)
1253
io = StringIO.new
1354
io.set_encoding('binary') if io.respond_to?(:set_encoding)
14-
self.new(io).write_any(data)
55+
56+
if version == :v2
57+
Encode::V2.new(io).write_any(data)
58+
else
59+
new(io).write_any(data)
60+
end
61+
1562
io.string
1663
end
1764

1865
def write_any obj
19-
write_1 MAGIC
66+
write_1 version_header
2067
write_any_raw obj
2168
end
2269

@@ -132,6 +179,10 @@ def write_binary(data)
132179

133180
private
134181

182+
def version_header
183+
MAGIC
184+
end
185+
135186
def fail(obj)
136187
raise "Cannot encode to erlang external format: #{obj.inspect}"
137188
end

lib/bert/types.rb

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@ module Types
1212
STRING = 107
1313
LIST = 108
1414
BIN = 109
15+
ENC_STRING = 112
16+
UNICODE_STRING = 113
1517
FUN = 117
16-
NEW_FUN = 112
1718
MAGIC = 131
19+
VERSION_2 = 132
1820
MAX_INT = (1 << 27) -1
1921
MIN_INT = -(1 << 27)
2022
end
21-
end
23+
end

test/bert_test.rb

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,55 @@ class BertTest < Test::Unit::TestCase
55
setup do
66
time = Time.at(1254976067)
77
@ruby = t[:user, {:name => 'TPW'}, [/cat/i, 9.9], time, nil, true, false, :true, :false]
8-
@bert = "\203h\td\000\004userh\003d\000\004bertd\000\004dictl\000\000\000\001h\002d\000\004namem\000\000\000\003TPWjl\000\000\000\002h\004d\000\004bertd\000\005regexm\000\000\000\003catl\000\000\000\001d\000\bcaselessjc9.900000000000000e+00\000\000\000\000\000\000\000\000\000\000jh\005d\000\004bertd\000\004timeb\000\000\004\346b\000\016\344\303a\000h\002d\000\004bertd\000\003nilh\002d\000\004bertd\000\004trueh\002d\000\004bertd\000\005falsed\000\004trued\000\005false"
9-
@ebin = "<<131,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,109,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,109,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
8+
@bert_old = "\203h\td\000\004userh\003d\000\004bertd\000\004dictl\000\000\000\001h\002d\000\004namem\000\000\000\003TPWjl\000\000\000\002h\004d\000\004bertd\000\005regexm\000\000\000\003catl\000\000\000\001d\000\bcaselessjc9.900000000000000e+00\000\000\000\000\000\000\000\000\000\000jh\005d\000\004bertd\000\004timeb\000\000\004\346b\000\016\344\303a\000h\002d\000\004bertd\000\003nilh\002d\000\004bertd\000\004trueh\002d\000\004bertd\000\005falsed\000\004trued\000\005false".b
9+
@ebin_old = "<<131,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,109,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,109,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
1010
end
1111

12-
should "encode" do
13-
assert_equal @bert, BERT.encode(@ruby)
12+
context "v2 encoder" do
13+
setup do
14+
@old_version = BERT::Encode.version
15+
BERT::Encode.version = :v2
16+
@bert = "\x84h\td\x00\x04userh\x03d\x00\x04bertd\x00\x04dictl\x00\x00\x00\x01h\x02d\x00\x04nameq\x00\x00\x00\x03TPWjl\x00\x00\x00\x02h\x04d\x00\x04bertd\x00\x05regexq\x00\x00\x00\x03catl\x00\x00\x00\x01d\x00\bcaselessjc9.900000000000000e+00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00jh\x05d\x00\x04bertd\x00\x04timeb\x00\x00\x04\xE6b\x00\x0E\xE4\xC3a\x00h\x02d\x00\x04bertd\x00\x03nilh\x02d\x00\x04bertd\x00\x04trueh\x02d\x00\x04bertd\x00\x05falsed\x00\x04trued\x00\x05false".b
17+
@ebin = "<<132,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,113,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,113,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
18+
end
19+
20+
teardown do
21+
BERT::Encode.version = @old_version
22+
end
23+
24+
should "decode new format" do
25+
assert_equal @ruby, BERT.decode(@bert)
26+
end
27+
28+
should "roundtrip string and maintain encoding" do
29+
str = "日本語".encode 'EUC-JP'
30+
round = BERT.decode(BERT.encode(str))
31+
assert_equal str, round
32+
assert_equal str.encoding, round.encoding
33+
end
34+
35+
should "roundtrip binary string" do
36+
str = "日本語".b
37+
round = BERT.decode(BERT.encode(str))
38+
assert_equal str, round
39+
assert_equal str.encoding, round.encoding
40+
end
41+
42+
should "encode" do
43+
assert_equal @bert, BERT.encode(@ruby)
44+
end
45+
46+
should "ebin" do
47+
assert_equal @ebin, BERT.ebin(@bert)
48+
end
1449
end
1550

16-
should "decode" do
17-
assert_equal @ruby, BERT.decode(@bert)
51+
should "decode the old format" do
52+
assert_equal @ruby, BERT.decode(@bert_old)
1853
end
1954

2055
should "ebin" do
21-
assert_equal @ebin, BERT.ebin(@bert)
56+
assert_equal @ebin_old, BERT.ebin(@bert_old)
2257
end
2358

2459
should "do roundtrips" do

test/encoder_test.rb

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ class EncoderTest < Test::Unit::TestCase
8282
end
8383

8484
# A UTF-8 string is expected to encode as a BIN term (tag 109) under the
# default header byte 131.
should 'handle utf8 strings' do
  str = "été".encode 'UTF-8'
  bert = [131, 109, 0, 0, 0, 5, 195, 169, 116, 195, 169].pack('C*')
  # Encode the explicitly UTF-8 string built above rather than a second,
  # unrelated literal, so the local is actually what gets tested.
  assert_equal bert, BERT::Encoder.encode(str)
end
@@ -99,6 +100,36 @@ class EncoderTest < Test::Unit::TestCase
99100
assert_equal bert, BERT::Encoder.encode(-10_000_000_000_000_000_000)
100101
end
101102

103+
# Encoder expectations with BERT::Encode.version switched to :v2. The setup
# remembers the previous version and the teardown puts it back.
context "v2" do
  setup do
    @old_version = BERT::Encode.version
    BERT::Encode.version = :v2
  end

  teardown do
    BERT::Encode.version = @old_version
  end

  # Header byte 132 and tag 113 (UNICODE_STRING) instead of 131 / 109.
  should 'handle utf8 strings' do
    str = "été".encode 'UTF-8'
    bert = [132, 113, 0, 0, 0, 5, 195, 169, 116, 195, 169].pack('C*')
    # Encode the explicitly UTF-8 string built above rather than a second,
    # unrelated literal, so the local is actually what gets tested.
    assert_equal bert, BERT::Encoder.encode(str)
  end

  should 'handle utf8 symbols' do
    bert = [132, 100, 0, 5, 195, 169, 116, 195, 169].pack('C*')
    assert_equal bert, BERT::Encoder.encode(:'été')
  end

  should "handle bignums" do
    bert = [132,110,8,0,0,0,232,137,4,35,199,138].pack('c*')
    assert_equal bert, BERT::Encoder.encode(10_000_000_000_000_000_000)

    bert = [132,110,8,1,0,0,232,137,4,35,199,138].pack('c*')
    assert_equal bert, BERT::Encoder.encode(-10_000_000_000_000_000_000)
  end
end
132+
102133
should "leave other stuff alone" do
103134
before = [1, 2.0, [:foo, 'bar']]
104135
assert_equal before, BERT::Encoder.convert(before)

0 commit comments

Comments
 (0)