From bf93c26fb073567a3b33ed68cc97526d997e4710 Mon Sep 17 00:00:00 2001 From: Tingfeng Date: Mon, 2 May 2022 10:14:12 +0800 Subject: [PATCH] add punycode test cases. preserve subdomain & domain if suffix is missing --- fasttld.go | 26 ++++++++++++++++++++------ fasttld_test.go | 17 ++++++++++++----- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/fasttld.go b/fasttld.go index 2929130..05f6373 100644 --- a/fasttld.go +++ b/fasttld.go @@ -53,7 +53,7 @@ type SuffixListParams struct { // URLParams specifies URL to extract components from. // -// If IgnoreSubDomains = true, do not extract subdomains. +// If IgnoreSubDomains = true, do not extract SubDomain. // // If ConvertURLToPunyCode = true, convert non-ASCII characters like 世界 to punycode. type URLParams struct { @@ -203,7 +203,7 @@ func (f *FastTLD) Extract(e URLParams) *ExtractResult { host = netloc } - // extract port and "Path" if any + // extract Port and "Path" if any if len(afterHost) != 0 { pathStartIndex := strings.IndexRune(afterHost, '/') var ( @@ -282,16 +282,16 @@ func (f *FastTLD) Extract(e URLParams) *ExtractResult { if hasSuffix { if sepIdx != -1 { - // if there is a domain + // if there is a Domain urlParts.Suffix = host[sepIdx+sepSize(host[sepIdx]):] domainStartSepIdx := strings.LastIndexAny(host[0:sepIdx], periodDelimiters) if domainStartSepIdx != -1 { - // if subdomains exist + // if SubDomain exists domainStartIdx := domainStartSepIdx + sepSize(host[domainStartSepIdx]) urlParts.Domain = host[domainStartIdx:sepIdx] urlParts.RegisteredDomain = host[domainStartIdx:] if !e.IgnoreSubDomains { - // if subdomains are to be included + // if SubDomain is to be included urlParts.SubDomain = host[0:domainStartSepIdx] } } else { @@ -299,9 +299,23 @@ func (f *FastTLD) Extract(e URLParams) *ExtractResult { urlParts.RegisteredDomain = host[domainStartSepIdx+1:] } } else { - // if only suffix exists + // if only Suffix exists urlParts.Suffix = host } + } else { + // No Suffix ; check for SubDomain and Domain + if sepIdx != -1 { + // if there is a SubDomain + domainStartSepIdx := strings.LastIndexAny(host, periodDelimiters) + domainStartIdx := domainStartSepIdx + sepSize(host[domainStartSepIdx]) + urlParts.Domain = host[domainStartIdx:] + if !e.IgnoreSubDomains { + // if SubDomain is to be included + urlParts.SubDomain = host[0:domainStartSepIdx] + } + } else { + urlParts.Domain = host + } } return &urlParts diff --git a/fasttld_test.go b/fasttld_test.go index e5b58de..c9e2b9d 100644 --- a/fasttld_test.go +++ b/fasttld_test.go @@ -237,8 +237,13 @@ var schemeTests = []extractTest{ {urlParams: URLParams{URL: "ssh://server.example.com/"}, expected: &ExtractResult{Scheme: "ssh://", SubDomain: "server", Domain: "example", Suffix: "com", RegisteredDomain: "example.com"}, description: "Full ssh URL with SubDomain"}, + {urlParams: URLParams{URL: "http://www.www.net"}, + expected: &ExtractResult{Scheme: "http://", SubDomain: "www", + Domain: "www", Suffix: "net", RegisteredDomain: "www.net"}, description: "Multiple www"}, } var noSchemeTests = []extractTest{ + {urlParams: URLParams{URL: "org"}, expected: &ExtractResult{Suffix: "org"}, description: "Single TLD | Suffix Only"}, + {urlParams: URLParams{URL: "co.th"}, expected: &ExtractResult{Suffix: "co.th"}, description: "Double TLD | Suffix Only"}, {urlParams: URLParams{URL: "users@example.com"}, expected: &ExtractResult{UserInfo: "users", Domain: "example", Suffix: "com", RegisteredDomain: "example.com"}, description: "UserInfo + Domain | No Scheme"}, {urlParams: URLParams{URL: "mailto:users@example.com"}, expected: &ExtractResult{UserInfo: "mailto:users", Domain: "example", Suffix: "com", RegisteredDomain: "example.com"}, description: "Mailto | No Scheme"}, {urlParams: URLParams{URL: "example.com:999"}, expected: &ExtractResult{Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Port: "999"}, description: "Domain + Port | No Scheme"}, @@ -309,9 +314,9 @@ var invalidTests = []extractTest{ }, description: "Invalid Port number"}, {urlParams: URLParams{URL: "//server.example.com/path"}, expected: &ExtractResult{Scheme: "//", SubDomain: "server", Domain: "example", Suffix: "com", RegisteredDomain: "example.com", Path: "path"}, description: "Missing protocol URL with subdomain"}, {urlParams: URLParams{URL: "http://temasek"}, expected: &ExtractResult{Scheme: "http://", Suffix: "temasek"}, description: "Basic URL with TLD only"}, - {urlParams: URLParams{URL: "http://temasek.this-tld-cannot-be-real"}, expected: &ExtractResult{Scheme: "http://"}, description: "Basic URL with bad TLD"}, - {urlParams: URLParams{URL: "http://temasek.temasek.this-tld-cannot-be-real"}, expected: &ExtractResult{Scheme: "http://"}, description: "Basic URL with subdomain and bad TLD"}, - {urlParams: URLParams{URL: "http://127.0.0.256"}, expected: &ExtractResult{Scheme: "http://"}, description: "Basic IPv4 Address URL with bad IP"}, + {urlParams: URLParams{URL: "http://temasek.this-tld-cannot-be-real"}, expected: &ExtractResult{Scheme: "http://", SubDomain: "temasek", Domain: "this-tld-cannot-be-real"}, description: "Basic URL with bad TLD"}, + {urlParams: URLParams{URL: "http://temasek.temasek.this-tld-cannot-be-real"}, expected: &ExtractResult{Scheme: "http://", SubDomain: "temasek.temasek", Domain: "this-tld-cannot-be-real"}, description: "Basic URL with subdomain and bad TLD"}, + {urlParams: URLParams{URL: "http://127.0.0.256"}, expected: &ExtractResult{Scheme: "http://", SubDomain: "127.0.0", Domain: "256"}, description: "Basic IPv4 Address URL with bad IP"}, {urlParams: URLParams{URL: "http://a:b@xn--tub-1m9d15sfkkhsifsbqygyujjrw60.com"}, expected: &ExtractResult{Scheme: "http://", UserInfo: "a:b"}, description: "Invalid punycode Domain"}, // {urlParams: URLParams{URL: "git+ssh://www.!example.com/"}, expected: &ExtractResult{}, description: "Full git+ssh URL with bad Domain"}, @@ -323,8 +328,10 @@ var internationalTLDTests = []extractTest{ {urlParams: URLParams{URL: "http://example.обр.срб/地图/A/b/C?编号=42"}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "обр.срб", RegisteredDomain: "example.обр.срб", Path: "地图/A/b/C?编号=42"}, description: "Basic URL with full international TLD (result in unicode)"}, {urlParams: URLParams{URL: "http://example.xn--ciqpn.hk/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk", Path: "地图/A/b/C?编号=42"}, description: "Basic URL with mixed punycode international TLD (result in punycode)"}, {urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac/地图/A/b/C?编号=42", ConvertURLToPunyCode: true}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac", Path: "地图/A/b/C?编号=42"}, description: "Basic URL with full punycode international TLD (result in punycode)"}, - {urlParams: URLParams{URL: "http://example.xn--ciqpn.hk"}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk"}, description: "Basic URL with mixed punycode international TLD (result in unicode)"}, - {urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac"}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac"}, description: "Basic URL with full punycode international TLD (result in unicode)"}, + {urlParams: URLParams{URL: "http://example.xn--ciqpn.hk"}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--ciqpn.hk", RegisteredDomain: "example.xn--ciqpn.hk"}, description: "Basic URL with mixed punycode international TLD (no further conversion to punycode)"}, + {urlParams: URLParams{URL: "http://example.xn--90azh.xn--90a3ac"}, expected: &ExtractResult{Scheme: "http://", Domain: "example", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "example.xn--90azh.xn--90a3ac"}, description: "Basic URL with full punycode international TLD (no further conversion to punycode)"}, + {urlParams: URLParams{URL: "http://xN--h1alffa9f.xn--90azh.xn--90a3ac"}, expected: &ExtractResult{Scheme: "http://", Domain: "xN--h1alffa9f", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "xN--h1alffa9f.xn--90azh.xn--90a3ac"}, description: "Mixed case Punycode Domain with full punycode international TLD (no further conversion to punycode)"}, + {urlParams: URLParams{URL: "http://xN--h1alffa9f.xn--90azh.xn--90a3ac", ConvertURLToPunyCode: true}, expected: &ExtractResult{Scheme: "http://", Domain: "xN--h1alffa9f", Suffix: "xn--90azh.xn--90a3ac", RegisteredDomain: "xN--h1alffa9f.xn--90azh.xn--90a3ac"}, description: "Mixed case Punycode Domain with full punycode international TLD (with further conversion to punycode)"}, } var domainOnlySingleTLDTests = []extractTest{ {urlParams: URLParams{URL: "https://example.ai/en"}, expected: &ExtractResult{Scheme: "https://", Domain: "example", Suffix: "ai", RegisteredDomain: "example.ai", Path: "en"}, description: "Domain only + ai"},