Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 140 additions & 27 deletions decode.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@ import (
"strings"
"unicode"
"unicode/utf8"

"github.com/bits-and-blooms/bitset"
)

func validateUnreservedWithExtra(s string, acceptedRunes []rune) error {
func validateUnreservedWithExtra(s string, runeSet charSet) error {
for i := 0; i < len(s); {
r, size := utf8.DecodeRuneInString(s[i:])
if r == utf8.RuneError {
Expand Down Expand Up @@ -35,29 +37,8 @@ func validateUnreservedWithExtra(s string, acceptedRunes []rune) error {
continue
}

// RFC grammar definitions:
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
// / "*" / "+" / "," / ";" / "="
// gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
// pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
if !unicode.IsLetter(r) && !unicode.IsDigit(r) &&
// unreserved
r != '-' && r != '.' && r != '_' && r != '~' &&
// sub-delims
r != '!' && r != '$' && r != '&' && r != '\'' && r != '(' && r != ')' &&
r != '*' && r != '+' && r != ',' && r != ';' && r != '=' {
runeFound := false
for _, acceptedRune := range acceptedRunes {
if r == acceptedRune {
runeFound = true
break
}
}

if !runeFound {
return fmt.Errorf("contains an invalid character: '%U' (%q) near %q", r, r, s[i:])
}
if !runeSet.IsInSet(r) {
return fmt.Errorf("contains an invalid character: '%U' (%q) near %q", r, r, s[i:])
}
}

Expand Down Expand Up @@ -86,7 +67,7 @@ func unescapePercentEncoding(s string) (rune, int, error) {
return 0, 0, fmt.Errorf("expected a '%%' escape character, near: %q", s)
}

if s[offset] != '%' {
if s[offset] != percentMark {
return 0, 0, fmt.Errorf("expected a '%%' escape character, near: %q", s[offset:])
}
offset++
Expand All @@ -104,7 +85,7 @@ func unescapePercentEncoding(s string) (rune, int, error) {
return 0, 0, fmt.Errorf("expected a '%%' escape character, near: %q", s)
}

if s[offset] != '%' {
if s[offset] != percentMark {
return 0, 0, fmt.Errorf("expected a '%%' escape character, near: %q", s[offset:])
}
offset++
Expand All @@ -121,7 +102,7 @@ func unescapePercentEncoding(s string) (rune, int, error) {
return 0, 0, fmt.Errorf("expected a '%%' escape character, near: %q", s)
}

if s[offset] != '%' {
if s[offset] != percentMark {
return 0, 0, fmt.Errorf("expected a '%%' escape character, near: %q", s[offset:])
}
offset++
Expand Down Expand Up @@ -179,6 +160,73 @@ func isNumerical(input string) bool {
return strings.IndexFunc(input, isNotDigit[rune]) == -1
}

// accepted lists the ASCII punctuation that is allowed everywhere:
// the "unreserved" marks (- . _ ~) followed by the "sub-delims"
// (! $ & ' ( ) * + , ; =).
//
// Letters and digits are not listed here: init adds them as ranges.
var accepted = []byte("-._~" + "!$&'()*+,;=")

// charSet describes a set of accepted runes.
//
// Runes below utf8.RuneSelf are looked up in the embedded bit set, using the
// rune value as the bit index. All other runes are delegated to runeFunc
// (see IsInSet).
type charSet struct {
// membership bits, indexed by rune value
*bitset.BitSet
// predicate consulted by IsInSet for runes >= utf8.RuneSelf
runeFunc func(rune) bool
}

// IsInSet reports whether r belongs to the set.
//
// Runes at or above utf8.RuneSelf are decided by the runeFunc predicate;
// anything below is a single bit test, with the rune value as the bit index.
func (c charSet) IsInSet(r rune) bool {
	if r >= utf8.RuneSelf {
		return c.runeFunc(r)
	}

	return c.Test(uint(r))
}

// Clone returns a copy of the set whose bits may be altered without
// affecting c.
//
// The bit set is copied with BitSet.Clone. The rune predicate is carried
// over too, so the clone can be passed to IsInSet right away: leaving it nil
// would make IsInSet panic on any rune >= utf8.RuneSelf. Callers remain free
// to override runeFunc afterwards.
func (c charSet) Clone() charSet {
	return charSet{
		BitSet:   c.BitSet.Clone(),
		runeFunc: c.runeFunc,
	}
}

// Character sets passed to the validation helpers.
//
// They are only declared here: init populates them.
var (
// accepted punctuation plus ASCII digits and letters
unreservedAndSubDelimsCharSet charSet
// the base set, extended with ':' and '@'
pcharCharSet charSet
// the base set, extended with ':'
userInfoCharSet charSet
// the base set, extended with colonMark, atHost, slashMark and questionMark
queryOrFragmentCharSet charSet
)

// init builds the package-level character sets.
//
// Every set stores its ASCII members as bits indexed by rune value, and a
// predicate (runeFunc) that IsInSet consults for runes >= utf8.RuneSelf.
// The derived sets start as a clone of the base set, then add their extra
// delimiters and install their own predicate.
func init() {
	unreservedAndSubDelimsCharSet = charSet{
		// Bits are indexed by rune value, not by position in accepted:
		// size the set for the whole single-byte range so that it never
		// has to grow while being filled.
		BitSet:   bitset.New(utf8.RuneSelf),
		runeFunc: isUnreservedOrSubDelimsRune,
	}

	// accepted punctuation (unreserved marks and sub-delims)
	for _, r := range accepted {
		unreservedAndSubDelimsCharSet.Set(uint(r))
	}
	// DIGIT
	for r := '0'; r <= '9'; r++ {
		unreservedAndSubDelimsCharSet.Set(uint(r))
	}
	// ALPHA
	for r := 'A'; r <= 'Z'; r++ {
		unreservedAndSubDelimsCharSet.Set(uint(r))
	}
	for r := 'a'; r <= 'z'; r++ {
		unreservedAndSubDelimsCharSet.Set(uint(r))
	}

	pcharCharSet = unreservedAndSubDelimsCharSet.Clone()
	pcharCharSet.Set(uint(':'))
	pcharCharSet.Set(uint('@'))
	pcharCharSet.runeFunc = isPcharRune

	userInfoCharSet = unreservedAndSubDelimsCharSet.Clone()
	userInfoCharSet.Set(uint(':'))
	userInfoCharSet.runeFunc = isUserInfoRune

	queryOrFragmentCharSet = unreservedAndSubDelimsCharSet.Clone()
	queryOrFragmentCharSet.Set(uint(colonMark))
	queryOrFragmentCharSet.Set(uint(atHost))
	queryOrFragmentCharSet.Set(uint(slashMark))
	queryOrFragmentCharSet.Set(uint(questionMark))
	queryOrFragmentCharSet.runeFunc = isQueryOrFragmentRune
}

func unhex(c byte) byte {
switch {
case '0' <= c && c <= '9':
Expand All @@ -190,3 +238,68 @@ func unhex(c byte) byte {
}
return 0
}

// isUnreservedOrSubDelimsRune reports whether r is a unicode letter, a
// unicode digit, an unreserved mark or a sub-delimiter.
func isUnreservedOrSubDelimsRune(r rune) bool {
	switch {
	case unicode.IsLetter(r), unicode.IsDigit(r):
		return true
	case isUnreserved(r), isSubDelims(r):
		return true
	default:
		return false
	}
}

// isUnreserved reports whether r is one of the punctuation marks of the
// "unreserved" set: '-', '.', '_' or '~'.
//
// Letters and digits are not checked here.
func isUnreserved(r rune) bool {
	return r == '-' || r == '.' || r == '_' || r == '~'
}

// isSubDelims reports whether r is one of the "sub-delims" characters:
// ! $ & ' ( ) * + , ; =
func isSubDelims(r rune) bool {
	const subDelims = "!$&'()*+,;="

	for _, delim := range subDelims {
		if r == delim {
			return true
		}
	}

	return false
}

/*
func isGenDelims(r rune) bool {
// gen-delims
switch r{
case ':', '/', '?', '#', '[', ']', '@':
return true
default:
return false
}
}
*/

// isPcharRune reports whether r is accepted as a "pchar": colonMark, atHost,
// or any rune accepted by isUnreservedOrSubDelimsRune.
func isPcharRune(r rune) bool {
	if r == colonMark || r == atHost {
		return true
	}

	return isUnreservedOrSubDelimsRune(r)
}

// isQueryOrFragmentRune reports whether r is accepted in a query or a
// fragment: colonMark, atHost, slashMark, questionMark, or any rune accepted
// by isUnreservedOrSubDelimsRune.
func isQueryOrFragmentRune(r rune) bool {
	if r == colonMark || r == atHost || r == slashMark || r == questionMark {
		return true
	}

	return isUnreservedOrSubDelimsRune(r)
}

// isUserInfoRune reports whether r is accepted in the user info part:
// colonMark, or any rune accepted by isUnreservedOrSubDelimsRune.
func isUserInfoRune(r rune) bool {
	return r == colonMark || isUnreservedOrSubDelimsRune(r)
}
2 changes: 1 addition & 1 deletion decode_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,6 @@ func TestUnhex(t *testing.T) {
func TestValidateUnreservedWithExtra(t *testing.T) {
// edge case: invalid rune in string
require.Error(t,
validateUnreservedWithExtra(string([]rune{utf8.RuneError}), nil),
validateUnreservedWithExtra(string([]rune{utf8.RuneError}), unreservedAndSubDelimsCharSet),
)
}
39 changes: 35 additions & 4 deletions docs/BENCHMARKS.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ Benchmark_Parse/with_URL_payload_with_IPs
Benchmark_Parse/with_URL_payload_with_IPs-16 96977450 376.3 ns/op 176 B/op 1 allocs/op

## After stricter IP parsing (naive)

Naive implementation, with too many GC allocations.

go test -v -bench . -benchtime 30s -run Bench
goos: linux
goarch: amd64
Expand Down Expand Up @@ -191,7 +194,7 @@ Benchmark_Parse/with_URL_payload_with_IPs-16 93061443 374.6 ns/op
Benchmark_String-16 180403320 199.9 ns/op 142 B/op 5 allocs/op


# After strict percent-encoding check on host
## After strict percent-encoding check on host

goos: linux
goarch: amd64
Expand All @@ -214,7 +217,9 @@ Benchmark_String
Benchmark_String-16 178247580 203.6 ns/op 142 B/op 5 allocs/op
PASS

# After rewrite with uriReader
## After rewrite with uriReader

Abstraction comes at a cost. NO GO

go test -bench . -benchtime 30s -run Bench
goos: linux
Expand All @@ -230,7 +235,7 @@ Benchmark_Parse/with_URL_payload_with_IPs-16 96785080 369.1 ns/op
Benchmark_String-16 180658692 197.4 ns/op 142 B/op 5 allocs/op
PASS

# After rewrite with RuneInString, no Reader
## After rewrite with RuneInString, no Reader

go test -v -run Bench -benchtime 30s -bench Bench
goos: linux
Expand All @@ -254,7 +259,7 @@ Benchmark_String
Benchmark_String-16 176733871 202.6 ns/op 142 B/op 5 allocs/op
PASS

# After optim allocs String()
## After optim allocs String()

go test -v -run Bench -benchtime 30s -bench String
goos: linux
Expand All @@ -265,3 +270,29 @@ Benchmark_String
Benchmark_String-16 457095075 79.87 ns/op 48 B/op 1 allocs/op
PASS

## After replacing rune slice iteration with a switch statement

Actually a slight degradation. NO GO

go test -v -pgo=auto -run Bench -benchtime 30s -bench Bench
goos: linux
goarch: amd64
pkg: github.com/fredbi/uri
cpu: AMD Ryzen 7 5800X 8-Core Processor
Benchmark_Parse
Benchmark_Parse/with_URI_simple_payload
Benchmark_Parse/with_URI_simple_payload-16 92742778 391.3 ns/op 160 B/op 1 allocs/op
Benchmark_Parse/with_URL_simple_payload
Benchmark_Parse/with_URL_simple_payload-16 100000000 321.1 ns/op 168 B/op 1 allocs/op
Benchmark_Parse/with_URI_mixed_payload
Benchmark_Parse/with_URI_mixed_payload-16 93061579 393.8 ns/op 160 B/op 1 allocs/op
Benchmark_Parse/with_URL_mixed_payload
Benchmark_Parse/with_URL_mixed_payload-16 100000000 301.8 ns/op 163 B/op 1 allocs/op
Benchmark_Parse/with_URI_payload_with_IPs
Benchmark_Parse/with_URI_payload_with_IPs-16 81460168 424.6 ns/op 160 B/op 1 allocs/op
Benchmark_Parse/with_URL_payload_with_IPs
Benchmark_Parse/with_URL_payload_with_IPs-16 94139295 365.8 ns/op 176 B/op 1 allocs/op
Benchmark_String
Benchmark_String-16 178303498 201.8 ns/op 142 B/op 5 allocs/op
PASS

1 change: 1 addition & 0 deletions docs/TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* [x] more nitpicks - check if the length checks on DNS host name are in bytes or in runes => bytes
* [x] DefaultPort(), IsDefaultPort()
* [] IRI ucs charset compliance (att: perf challenge)
* [] FilePath()
* [] normalizer
* [] V2 zero alloc, no interface, fluent builder with inner error checking
* [] doc: complete the librarian/archivist work on specs, etc + FAQ to clarify the somewhat arcane world of this set of RFCs.
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/fredbi/uri
go 1.19

require (
github.com/bits-and-blooms/bitset v1.10.0
github.com/pkg/profile v1.7.0
github.com/stretchr/testify v1.8.4
)
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
github.com/bits-and-blooms/bitset v1.10.0 h1:ePXTeiPEazB5+opbv5fr8umg2R/1NlzgDsyepwsSr88=
github.com/bits-and-blooms/bitset v1.10.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
Expand Down
7 changes: 4 additions & 3 deletions ip.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ func validateIPv6(host string) error {
)
}

if err := validateUnreservedWithExtra(zoneID, nil); err != nil {
if err := validateUnreservedWithExtra(zoneID, unreservedAndSubDelimsCharSet); err != nil {
return errorsJoin(
ErrInvalidHostAddress,
fmt.Errorf("invalid IPv6 zoneID %q: %w", zoneID, err),
Expand Down Expand Up @@ -221,6 +221,7 @@ func validateIPvFuture(address string) error {
return errors.New("invalid IP vFuture format: expect a non-empty address after the version tag")
}

// TODO: wrong because IpvFuture is not escaped
return validateUnreservedWithExtra(address[offset:], userInfoExtraRunes)
// RFC 3986 states that IPvFuture is not escaped, but IPv6 has already evolved to add an escaped zoneID.
// We assume that IPvFuture supports escaping as well.
return validateUnreservedWithExtra(address[offset:], userInfoCharSet)
}
8 changes: 4 additions & 4 deletions profile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,12 @@ import (
"testing"

"github.com/pkg/profile"
"github.com/stretchr/testify/require"
)

func TestParseWithProfile(t *testing.T) {
const (
profDir = "prof"
n = 1000
n = 100000
)

t.Run("collect CPU profile", func(t *testing.T) {
Expand Down Expand Up @@ -52,8 +51,9 @@ func runProfile(t *testing.T, n int) {
}

u, err := Parse(testCase.uriRaw)
require.NoErrorf(t, err, "unexpected error for %q", testCase.uriRaw)
require.NotEmpty(t, u)
if u == nil || err != nil {
t.Fatalf("unexpected error for %q", testCase.uriRaw)
}
}
}
}
Expand Down
Loading