Skip to content

Commit 44cb4fd

Browse files
authored
Introduce ios.LineEditingReader, an io.Reader wrapper with line editing mechanism (#22)
`LineEditingReader` implements the `io.Reader` interface with a line editing mechanism. `LineEditingReader` reads data from the underlying `io.Reader` and invokes the caller-supplied edit function for each line (defined as `[]byte` ending with `'\n'`, therefore it works on both Mac/Linux and Windows, where `'\r\n'` is used). Note the last line before `EOF` will be edited as well even if it doesn't end with `'\n'`. Usage is highly flexible: the editing function can do in-place editing such as character replacement, prefix/suffix stripping, or word replacement, etc., as long as the line length isn't increased; or it can replace a line with a completely newly allocated and written line with no length restriction (although performance might be slower compared to in-place editing). `ios.LineEditingReader` is at least as performant as `ios.BytesReplacingReader`:
```
BenchmarkLineEditingReader_RawIORead-8                           23300     51319 ns/op  1103392 B/op  23 allocs/op
BenchmarkLineEditingReader_UseLineEditingReader-8                 3343    351305 ns/op  1104512 B/op  25 allocs/op
BenchmarkLineEditingReader_CompareWithBytesReplacingReader-8       978   1226656 ns/op  1107648 B/op  26 allocs/op
```
This PR is motivated by a real use case discussed in jf-tech/omniparser#154
1 parent 99ea835 commit 44cb4fd

File tree

2 files changed

+268
-0
lines changed

2 files changed

+268
-0
lines changed

ios/lineEditingReader.go

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
package ios
2+
3+
import (
4+
"bytes"
5+
"io"
6+
7+
"github.com/jf-tech/go-corelib/maths"
8+
)
9+
10+
// LineEditFunc edits a line and returns a resulting line. Note in-place editing is highly encouraged,
11+
// for performance reasons, when the resulting line is no longer than the original. If your edited line
12+
// is longer than the original `line`, however, you MUST allocate and return a new []byte. Directly
13+
// appending at the end of the original `line` will result in undefined behavior.
// Returning a non-nil error makes LineEditingReader.Read stop editing further lines; Read reports
// that error once the lines edited before it have been handed to the caller.
14+
type LineEditFunc func(line []byte) ([]byte, error)
15+
16+
// LineEditingReader implements io.Reader interface with a line editing mechanism. LineEditingReader reads data from
17+
// underlying io.Reader and invokes the caller supplied edit function for each line (defined as
18+
// []byte ending with '\n', therefore it works on both Mac/Linux and Windows, where '\r\n' is used).
19+
// Note the last line before EOF will be edited as well even if it doesn't end with '\n'. Usage is highly
20+
// flexible: the editing function can do in-place editing such as character replacement, prefix/suffix
21+
// stripping, or word replacement, etc., as long as the line length isn't increased; or it can replace a line
22+
// with a completely newly allocated and written line with no length restriction (although performance
23+
// would be slower compared to in-place editing).
24+
type LineEditingReader struct {
25+
r io.Reader // underlying reader supplying the raw, unedited bytes.
26+
edit LineEditFunc // caller supplied function invoked once per line.
27+
bufSize int // initial buf size and future buf growth increment.
28+
buf []byte // note len(buf) == cap(buf), we always use the full capacity of the buf.
29+
buf0 int // buf[:buf0] edited line(s) ready to be returned to caller.
30+
buf1 int // buf[buf0:buf1] bytes read but not yet edited (complete lines and/or a trailing partial line).
31+
err error // sticky error from the underlying reader or from edit; Read returns it once buf[:buf0] is drained.
32+
}
33+
34+
func (r *LineEditingReader) scanEndOfLine(buf []byte) int {
35+
if lf := bytes.IndexByte(buf, '\n'); lf >= 0 {
36+
return lf
37+
}
38+
if r.err == io.EOF {
39+
return len(buf) - 1
40+
}
41+
return -1
42+
}
43+
44+
// Read implements io.Reader interface for LineEditingReader.
45+
func (r *LineEditingReader) Read(p []byte) (int, error) {
46+
n := 0
47+
for {
48+
if r.buf0 > 0 {
49+
n = copy(p, r.buf[:r.buf0])
50+
r.buf0 -= n
51+
r.buf1 -= n
52+
copy(r.buf, r.buf[n:r.buf1+n])
53+
return n, nil
54+
} else if r.err != nil {
55+
return 0, r.err
56+
}
57+
58+
if r.buf1 >= len(r.buf) {
59+
newBuf := make([]byte, len(r.buf)+r.bufSize)
60+
copy(newBuf, r.buf)
61+
r.buf = newBuf
62+
}
63+
64+
n, r.err = r.r.Read(r.buf[r.buf1:])
65+
r.buf1 += n
66+
lf := r.scanEndOfLine(r.buf[r.buf0:r.buf1])
67+
for ; lf >= 0; lf = r.scanEndOfLine(r.buf[r.buf0:r.buf1]) {
68+
lineLen := lf + 1
69+
edited, err := r.edit(r.buf[r.buf0 : r.buf0+lineLen])
70+
if err != nil {
71+
r.err = err
72+
break
73+
}
74+
editedLen := len(edited)
75+
delta := lineLen - editedLen
76+
if len(r.buf)-r.buf1+delta < 0 {
77+
// only expand the buf if there is no room left for the edited line growth.
78+
newBuf := make([]byte, len(r.buf)+maths.MaxInt(r.bufSize, -delta))
79+
copy(newBuf, r.buf[:r.buf1])
80+
r.buf = newBuf
81+
}
82+
if delta > 0 {
83+
// This is the case where the edited line is shorter than the original line.
84+
// Image we have:
85+
// xyz\nabc
86+
// where "xyz\n" is in-placed edited to drop the first letter to "yz\n".
87+
// If we shift "abc" up by delta (1) first, then we would've overwritten the "\n" in "yz\n"
88+
// and the edited would now be "yza".
89+
// Therefore, if edited is shorter, we need to move/copy edited to be at buf0 first
90+
// before we shift the rest of the buffer (up to buf1) up.
91+
copy(r.buf[r.buf0:r.buf0+editedLen], edited)
92+
copy(r.buf[r.buf0+editedLen:r.buf1-delta], r.buf[r.buf0+lineLen:r.buf1])
93+
} else {
94+
// Now if edited is longer, we need to move the rest buffer out first, before we can copy
95+
// the edited into the buffer.
96+
copy(r.buf[r.buf0+editedLen:r.buf1-delta], r.buf[r.buf0+lineLen:r.buf1])
97+
copy(r.buf[r.buf0:r.buf0+editedLen], edited)
98+
}
99+
r.buf0 += editedLen
100+
r.buf1 -= delta
101+
}
102+
}
103+
}
104+
105+
// NewLineEditingReader2 creates a new LineEditingReader with custom buffer size.
106+
func NewLineEditingReader2(r io.Reader, edit LineEditFunc, bufSize int) *LineEditingReader {
107+
buf := make([]byte, bufSize)
108+
return &LineEditingReader{
109+
r: r,
110+
edit: edit,
111+
bufSize: bufSize,
112+
buf: buf,
113+
}
114+
}
115+
116+
// defaultLineEditingReaderBufSize is the buffer size, in bytes, that NewLineEditingReader
// passes to NewLineEditingReader2.
const defaultLineEditingReaderBufSize = 1024
119+
120+
// NewLineEditingReader creates a new LineEditingReader with the default buffer size.
// It is a convenience wrapper that forwards r and edit to NewLineEditingReader2 together
// with defaultLineEditingReaderBufSize.
121+
func NewLineEditingReader(r io.Reader, edit LineEditFunc) *LineEditingReader {
122+
return NewLineEditingReader2(r, edit, defaultLineEditingReaderBufSize)
123+
}

ios/lineEditingReader_test.go

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
package ios
2+
3+
import (
4+
"errors"
5+
"io/ioutil"
6+
"strings"
7+
"testing"
8+
9+
"github.com/stretchr/testify/assert"
10+
)
11+
12+
func TestLineEditingReader_CustomBufSize(t *testing.T) {
13+
for _, test := range []struct {
14+
name string
15+
editFunc LineEditFunc
16+
bufSize int
17+
input string
18+
expected string
19+
err string
20+
}{
21+
{
22+
name: "various successful editings",
23+
editFunc: func(line []byte) ([]byte, error) {
24+
if string(line) == "abc\n" {
25+
// testing returning a newly allocated line with same length
26+
return []byte("xyz\n"), nil
27+
}
28+
if string(line) == "one\r\n" {
29+
line[0] = '1'
30+
line[1] = '\r'
31+
line[2] = '\n'
32+
// testing an in-place edited line with shrunk length
33+
return line[:3], nil
34+
}
35+
if string(line) == "1" { // note there is no ending '\n' since line "1" is the last line before EOF.
36+
// testing returning a newly allocated line with much longer length plus some '\n' added.
37+
return []byte("first\nzuerst\nprimo\n第一の"), nil
38+
}
39+
return line, nil
40+
},
41+
bufSize: 2,
42+
input: "not changed\nabc\n\n\n\none\r\n1",
43+
expected: "not changed\nxyz\n\n\n\n1\r\nfirst\nzuerst\nprimo\n第一の",
44+
err: "",
45+
},
46+
{
47+
name: "successful editing followed by failed editing",
48+
editFunc: func(line []byte) ([]byte, error) {
49+
if string(line) == "abc\n" {
50+
return []byte("xyz\n"), nil
51+
}
52+
if string(line) == "boom\r\n" {
53+
return []byte("ignored\r\n"), errors.New("mock error")
54+
}
55+
return line, nil
56+
},
57+
bufSize: 100,
58+
input: "not changed\nabc\nboom\r\nend\n",
59+
expected: "",
60+
err: "mock error",
61+
},
62+
} {
63+
t.Run(test.name, func(t *testing.T) {
64+
ret, err := ioutil.ReadAll(NewLineEditingReader2(strings.NewReader(test.input), test.editFunc, test.bufSize))
65+
if test.err != "" {
66+
assert.Error(t, err)
67+
assert.Equal(t, test.err, err.Error())
68+
} else {
69+
assert.NoError(t, err)
70+
assert.Equal(t, test.expected, string(ret))
71+
}
72+
})
73+
}
74+
}
75+
76+
func TestNewLineEdtingReader(t *testing.T) {
77+
// Test against a real scenario where we need to strip each line's leading '|' pipe character.
78+
// See details at: https://github.com/jf-tech/omniparser/pull/154
79+
input := "|HDR|1|2|3|\n\n|DAT|X|\n|EOF|"
80+
expected := "HDR|1|2|3|\n\nDAT|X|\nEOF|"
81+
ret, err := ioutil.ReadAll(
82+
NewLineEditingReader(
83+
strings.NewReader(input),
84+
func(line []byte) ([]byte, error) {
85+
if len(line) < 2 || line[0] != '|' {
86+
return line, nil
87+
}
88+
return line[1:], nil
89+
}))
90+
assert.NoError(t, err)
91+
assert.Equal(t, expected, string(ret))
92+
}
93+
94+
// lineEditingReaderBenchInputLine is the single line the benchmark input is built from.
var lineEditingReaderBenchInputLine = "|HDR|1|2|3|4|5|6|7|8|9|\n"

// lineEditingReaderBenchInput is the benchmark input: the line above repeated 10000 times.
var lineEditingReaderBenchInput = strings.Repeat(lineEditingReaderBenchInputLine, 10000)

// lineEditingReaderBenchOutput is the expected result of stripping the leading '|' from
// every line of lineEditingReaderBenchInput.
var lineEditingReaderBenchOutput = strings.Repeat(
	strings.TrimLeft(lineEditingReaderBenchInputLine, "|"),
	10000,
)
100+
101+
func TestLineEditingReaderBenchCorrectness(t *testing.T) {
102+
ret, err := ioutil.ReadAll(
103+
NewLineEditingReader(
104+
strings.NewReader(lineEditingReaderBenchInput),
105+
func(line []byte) ([]byte, error) {
106+
if len(line) < 2 || line[0] != '|' {
107+
return line, nil
108+
}
109+
return line[1:], nil
110+
}))
111+
assert.NoError(t, err)
112+
assert.Equal(t, lineEditingReaderBenchOutput, string(ret))
113+
}
114+
115+
func BenchmarkLineEditingReader_RawIORead(b *testing.B) {
116+
for i := 0; i < b.N; i++ {
117+
_, _ = ioutil.ReadAll(strings.NewReader(lineEditingReaderBenchInput))
118+
}
119+
}
120+
121+
func BenchmarkLineEditingReader_UseLineEditingReader(b *testing.B) {
122+
for i := 0; i < b.N; i++ {
123+
_, _ = ioutil.ReadAll(
124+
NewLineEditingReader(
125+
strings.NewReader(lineEditingReaderBenchInput),
126+
func(line []byte) ([]byte, error) {
127+
if len(line) < 2 || line[0] != '|' {
128+
return line, nil
129+
}
130+
return line[1:], nil
131+
}))
132+
}
133+
}
134+
135+
func BenchmarkLineEditingReader_CompareWithBytesReplacingReader(b *testing.B) {
136+
search := []byte("|H")
137+
replace := []byte("H")
138+
for i := 0; i < b.N; i++ {
139+
_, _ = ioutil.ReadAll(
140+
NewBytesReplacingReader(
141+
strings.NewReader(lineEditingReaderBenchInput),
142+
search,
143+
replace))
144+
}
145+
}

0 commit comments

Comments
 (0)