Skip to content

Commit

Permalink
add punycode test cases. preserve subdomain & domain if suffix is missing
Browse files Browse the repository at this point in the history
  • Loading branch information
elliotwutingfeng committed May 2, 2022
1 parent 1d8bef9 commit bf93c26
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 11 deletions.
26 changes: 20 additions & 6 deletions fasttld.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ type SuffixListParams struct {

// URLParams specifies URL to extract components from.
//
// If IgnoreSubDomains = true, do not extract subdomains.
// If IgnoreSubDomains = true, do not extract SubDomain.
//
// If ConvertURLToPunyCode = true, convert non-ASCII characters like 世界 to punycode.
type URLParams struct {
Expand Down Expand Up @@ -203,7 +203,7 @@ func (f *FastTLD) Extract(e URLParams) *ExtractResult {
host = netloc
}

// extract port and "Path" if any
// extract Port and "Path" if any
if len(afterHost) != 0 {
pathStartIndex := strings.IndexRune(afterHost, '/')
var (
Expand Down Expand Up @@ -282,26 +282,40 @@ func (f *FastTLD) Extract(e URLParams) *ExtractResult {

if hasSuffix {
if sepIdx != -1 {
// if there is a domain
// if there is a Domain
urlParts.Suffix = host[sepIdx+sepSize(host[sepIdx]):]
domainStartSepIdx := strings.LastIndexAny(host[0:sepIdx], periodDelimiters)
if domainStartSepIdx != -1 {
// if subdomains exist
// if SubDomain exists
domainStartIdx := domainStartSepIdx + sepSize(host[domainStartSepIdx])
urlParts.Domain = host[domainStartIdx:sepIdx]
urlParts.RegisteredDomain = host[domainStartIdx:]
if !e.IgnoreSubDomains {
// if subdomains are to be included
// if SubDomain is to be included
urlParts.SubDomain = host[0:domainStartSepIdx]
}
} else {
urlParts.Domain = host[domainStartSepIdx+1 : sepIdx]
urlParts.RegisteredDomain = host[domainStartSepIdx+1:]
}
} else {
// if only suffix exists
// if only Suffix exists
urlParts.Suffix = host
}
} else {
// No Suffix ; check for SubDomain and Domain
if sepIdx != -1 {
// if there is a SubDomain
domainStartSepIdx := strings.LastIndexAny(host, periodDelimiters)
domainStartIdx := domainStartSepIdx + sepSize(host[domainStartSepIdx])
urlParts.Domain = host[domainStartIdx:]
if !e.IgnoreSubDomains {
// if SubDomain is to be included
urlParts.SubDomain = host[0:domainStartSepIdx]
}
} else {
urlParts.Domain = host
}
}

return &urlParts
Expand Down
17 changes: 12 additions & 5 deletions fasttld_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,13 @@ var schemeTests = []extractTest{
{urlParams: URLParams{URL: "ssh://server.example.com/"},
expected: &ExtractResult{Scheme: "ssh://", SubDomain: "server",
Domain: "example", Suffix: "com", RegisteredDomain: "example.com"}, description: "Full ssh URL with SubDomain"},
{urlParams: URLParams{URL: "http://www.www.net"},
expected: &ExtractResult{Scheme: "http://", SubDomain: "www",
Domain: "www", Suffix: "net", RegisteredDomain: "www.net"}, description: "Multiple www"},
}
var noSchemeTests = []extractTest{
{urlParams: URLParams{URL: "org"}, expected: &ExtractResult{Suffix: "org"}, description: "Single TLD | Suffix Only"},
{urlParams: URLParams{URL: "co.th"}, expected: &ExtractResult{Suffix: "co.th"}, description: "Double TLD | Suffix Only"},
{urlParams: URLParams{URL: "[email protected]"}, expected: &ExtractResult{UserInfo: "users", Domain: "example", Suffix: "com", RegisteredDomain: "example.com"}, description: "UserInfo + Domain | No Scheme"},
{urlParams: URLParams{URL: "mailto:[email protected]"}, expected: &ExtractResult{UserInfo: "mailto:users", Domain: "example", Suffix: "com", RegisteredDomain: "example.com"}, description: "Mailto | No Scheme"},
{urlParams: URLParams{URL: "example.com:999"}, expected: &ExtractResult{Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Port: "999"}, description: "Domain + Port | No Scheme"},
Expand Down Expand Up @@ -309,9 +314,9 @@ var invalidTests = []extractTest{
}, description: "Invalid Port number"},
{urlParams: URLParams{URL: "//server.example.com/path"}, expected: &ExtractResult{Scheme: "//", SubDomain: "server", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "path"}, description: "Missing protocol URL with subdomain"},
{urlParams: URLParams{URL: "http://temasek"}, expected: &ExtractResult{Scheme: "http://", Suffix: "temasek"}, description: "Basic URL with TLD only"},
{urlParams: URLParams{URL: "http://temasek.this-tld-cannot-be-real"}, expected: &ExtractResult{Scheme: "http://"}, description: "Basic URL with bad TLD"},
{urlParams: URLParams{URL: "http://temasek.temasek.this-tld-cannot-be-real"}, expected: &ExtractResult{Scheme: "http://"}, description: "Basic URL with subdomain and bad TLD"},
{urlParams: URLParams{URL: "http://127.0.0.256"}, expected: &ExtractResult{Scheme: "http://"}, description: "Basic IPv4 Address URL with bad IP"},
{urlParams: URLParams{URL: "http://temasek.this-tld-cannot-be-real"}, expected: &ExtractResult{Scheme: "http://", SubDomain: "temasek", Domain: "this-tld-cannot-be-real"}, description: "Basic URL with bad TLD"},
{urlParams: URLParams{URL: "http://temasek.temasek.this-tld-cannot-be-real"}, expected: &ExtractResult{Scheme: "http://", SubDomain: "temasek.temasek", Domain: "this-tld-cannot-be-real"}, description: "Basic URL with subdomain and bad TLD"},
{urlParams: URLParams{URL: "http://127.0.0.256"}, expected: &ExtractResult{Scheme: "http://", SubDomain: "127.0.0", Domain: "256"}, description: "Basic IPv4 Address URL with bad IP"},
{urlParams: URLParams{URL: "http://a:[email protected]"},
expected: &ExtractResult{Scheme: "http://", UserInfo: "a:b"}, description: "Invalid punycode Domain"},
// {urlParams: URLParams{URL: "git+ssh://www.!example.com/"}, expected: &ExtractResult{}, description: "Full git+ssh URL with bad Domain"},
Expand All @@ -323,8 +328,10 @@ var internationalTLDTests = []extractTest{
{urlParams: URLParams{URL: "http://example.обр.срб/地图/A/b/C?编号=42"}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "обр.срб", RegisteredDomain: "example.обр.срб", Path: "地图/A/b/C?编号=42"}, description: "Basic URL with full international TLD (result in unicode)"},
{urlParams: URLParams{URL: "http://example.xn--ciqpn.hk/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk", Path: "地图/A/b/C?编号=42"}, description: "Basic URL with mixed punycode international TLD (result in punycode)"},
{urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", Path: "地图/A/b/C?编号=42"}, description: "Basic URL with full punycode international TLD (result in punycode)"},
{urlParams: URLParams{URL: "http://example.xn--ciqpn.hk"}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk"}, description: "Basic URL with mixed punycode international TLD (result in unicode)"},
{urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac"}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac"}, description: "Basic URL with full punycode international TLD (result in unicode)"},
{urlParams: URLParams{URL: "http://example.xn--ciqpn.hk"}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk"}, description: "Basic URL with mixed punycode international TLD (no further conversion to punycode)"},
{urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac"}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac"}, description: "Basic URL with full punycode international TLD (no further conversion to punycode)"},
{urlParams: URLParams{URL: "http://xN--h1alffa9f.xn--90azh.xn--90a3ac"}, expected: &ExtractResult{Scheme: "http://", Domain: "xN--h1alffa9f", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "xN--h1alffa9f.xn--90azh.xn--90a3ac"}, description: "Mixed case Punycode Domain with full punycode international TLD (no further conversion to punycode)"},
{urlParams: URLParams{URL: "http://xN--h1alffa9f.xn--90azh.xn--90a3ac", ConvertURLToPunyCode: true}, expected: &ExtractResult{Scheme: "http://", Domain: "xN--h1alffa9f", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "xN--h1alffa9f.xn--90azh.xn--90a3ac"}, description: "Mixed case Punycode Domain with full punycode international TLD (with further conversion to punycode)"},
}
var domainOnlySingleTLDTests = []extractTest{
{urlParams: URLParams{URL: "https://example.ai/en"}, expected: &ExtractResult{Scheme: "https://", Domain: "example", Suffix: "ai", RegisteredDomain: "example.ai", Path: "en"}, description: "Domain only + ai"},
Expand Down

0 comments on commit bf93c26

Please sign in to comment.