Skip to content

Commit

Permalink
fix TestMassiveRuneDiffConversion by skipping invalid Unicode code points
Browse files Browse the repository at this point in the history
  • Loading branch information
iambus committed Sep 26, 2022
1 parent 3403a16 commit f91e37c
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 19 deletions.
27 changes: 16 additions & 11 deletions diffmatchpatch/diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -388,24 +388,29 @@ func (dmp *DiffMatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int,
}

// DiffLinesToChars splits two texts into a list of strings, and reduces the texts to a string of hashes where each Unicode character represents one line.
// It's slightly faster to call DiffLinesToRunes first, followed by DiffMainRunes.
// It returns the encoded form of text1, the encoded form of text2, and the line array.
// NOTE(review): this listing comes from a commit view without +/- markers; the body appears to contain both the earlier statement pair (diffLinesToStrings) and the later one (diffLinesToIndexes) — confirm which pair is live in the actual file.
func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, []string) {
chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
return chars1, chars2, lineArray
chars1, chars2, lineArray := dmp.diffLinesToIndexes(text1, text2)
// indexesToString turns every line index into one rune of the returned string.
return indexesToString(chars1), indexesToString(chars2), lineArray
}

// DiffLinesToRunes reduces two texts to rune slices in which every rune stands for one line.
// It returns the runes for text1, the runes for text2, and the array of lines.
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) {
	idx1, idx2, lines := dmp.diffLinesToIndexes(text1, text2)
	// Encode the indexes as a string first, then decode that string into runes.
	encoded1 := indexesToString(idx1)
	encoded2 := indexesToString(idx2)
	return []rune(encoded1), []rune(encoded2), lines
}

// diffLinesToIndexes splits two texts into lines and returns, for each text, its slice of line indexes together with the line array.
// NOTE(review): this listing comes from a commit view without +/- markers; the two consecutive return statements below look like the earlier ([]rune conversion) and later (pass-through) versions of the same line — confirm which one is live in the actual file.
func (dmp *DiffMatchPatch) diffLinesToIndexes(text1, text2 string) ([]index, []index, []string) {
chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
return []rune(chars1), []rune(chars2), lineArray
return chars1, chars2, lineArray
}

// DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text.
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff {
hydrated := make([]Diff, 0, len(diffs))
for _, aDiff := range diffs {
var sb strings.Builder
for _, i := range []rune(aDiff.Text) {
for _, i := range stringToIndex(aDiff.Text) {
sb.WriteString(lineArray[i])
}
aDiff.Text = sb.String()
Expand Down Expand Up @@ -1301,24 +1306,24 @@ func (dmp *DiffMatchPatch) DiffFromDelta(text1 string, delta string) (diffs []Di
}

// diffLinesToStrings splits two texts into a list of strings. Each string represents one line.
// Both texts are munged against the same lineArray, so the values produced for text1 and text2 index into one shared array.
// NOTE(review): this listing comes from a commit view without +/- markers; the two signatures and the two return statements below look like the earlier (string-returning) and later ([]index-returning) versions — confirm which is live in the actual file.
func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) {
func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) ([]index, []index, []string) {
// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character.
lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n'

// Each entry holds the index of the lineArray element it points to.
strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray)
strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray)

return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray
return strIndexArray1, strIndexArray2, lineArray
}

// diffLinesToStringsMunge splits a text into lines and reduces it to a slice of indexes into lineArray, appending lines to lineArray as needed.
func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string) []uint32 {
func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string) []index {
// Walk the text, pulling out a substring for each line. text.split('\n') would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect.
lineHash := map[string]int{} // e.g. lineHash['Hello\n'] == 4
lineStart := 0
lineEnd := -1
strs := []uint32{}
strs := []index{}

for lineEnd < len(text)-1 {
lineEnd = indexOf(text, "\n", lineStart)
Expand All @@ -1332,11 +1337,11 @@ func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]str
lineValue, ok := lineHash[line]

if ok {
strs = append(strs, uint32(lineValue))
strs = append(strs, index(lineValue))
} else {
*lineArray = append(*lineArray, line)
lineHash[line] = len(*lineArray) - 1
strs = append(strs, uint32(len(*lineArray)-1))
strs = append(strs, index(len(*lineArray)-1))
}
}

Expand Down
32 changes: 32 additions & 0 deletions diffmatchpatch/index.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package diffmatchpatch

// index is the position of a line inside the line array. Indexes are encoded
// as runes so that a whole text can be represented as a string with one rune
// per line.
type index uint32

// The range [runeSkipStart, runeSkipEnd) is never produced when encoding an
// index as a rune: indexes at or above runeSkipStart are shifted past it.
const (
	runeSkipStart = 0xd800
	runeSkipEnd   = 0xdfff + 1
	runeMax       = 0x110000 // next invalid code point
)

// stringToIndex decodes text into one index per rune, undoing the shift
// applied by indexesToString: runes at or above runeSkipEnd are moved back
// down by the width of the skipped range.
func stringToIndex(text string) []index {
	decoded := []rune(text)
	result := make([]index, 0, len(decoded))
	for _, r := range decoded {
		idx := index(r)
		if r >= runeSkipEnd {
			idx -= runeSkipEnd - runeSkipStart
		}
		result = append(result, idx)
	}
	return result
}

// indexesToString encodes every index as one rune and returns the resulting
// string. Indexes below runeSkipStart map to the rune of the same value; all
// others are shifted up by the width of the skipped range.
func indexesToString(indexes []index) string {
	encoded := make([]rune, len(indexes))
	for pos := range indexes {
		v := indexes[pos]
		if v >= runeSkipStart {
			v += runeSkipEnd - runeSkipStart
		}
		encoded[pos] = rune(v)
	}
	return string(encoded)
}
16 changes: 16 additions & 0 deletions diffmatchpatch/index_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package diffmatchpatch

import (
"github.com/stretchr/testify/assert"
"testing"
)

// TestIndexConversion checks that every index below runeMax minus the width of
// the skipped rune range survives an indexesToString / stringToIndex round trip.
func TestIndexConversion(t *testing.T) {
	total := runeMax - (runeSkipEnd - runeSkipStart)
	want := make([]index, total)
	for i := range want {
		want[i] = index(i)
	}
	got := stringToIndex(indexesToString(want))
	assert.EqualValues(t, want, got)
}
8 changes: 0 additions & 8 deletions diffmatchpatch/stringutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,3 @@ func runesIndex(r1, r2 []rune) int {
}
return -1
}

// intArrayToString converts each number in ns to a rune of the same value and
// returns the string built from those runes.
func intArrayToString(ns []uint32) string {
	runes := make([]rune, 0, len(ns))
	for _, n := range ns {
		runes = append(runes, rune(n))
	}
	return string(runes)
}

0 comments on commit f91e37c

Please sign in to comment.