Skip to content

Commit

Permalink
Port over URL pattern changes from explosion#1411
Browse files (browse the repository at this point in the history)
  • Loading branch information
ines committed Oct 14, 2017
1 parent 09aed58 commit a4d974d
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 12 deletions.
6 changes: 3 additions & 3 deletions spacy/lang/tokenizer_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
r"|"
# host name
r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)"
r"(?:(?:[a-z0-9\-]*)?[a-z0-9]+)"
# domain name
r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*"
r"(?:\.(?:[a-z0-9\-])*[a-z0-9]+)*"
# TLD identifier
r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
r"(?:\.(?:[a-z]{2,}))"
r")"
# port number
r"(?::\d{2,5})?"
Expand Down
18 changes: 9 additions & 9 deletions spacy/tests/tokenizer/test_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,10 @@
"http://userid:password@example.com/",
"http://142.42.1.1/",
"http://142.42.1.1:8080/",
"http://⌘.ws",
"http://⌘.ws/",
"http://foo.com/blah_(wikipedia)#cite-1",
"http://foo.com/blah_(wikipedia)_blah#cite-1",
"http://foo.com/unicode_(✪)_in_parens",
"http://foo.com/(something)?after=parens",
"http://☺.damowmow.com/",
"http://code.google.com/events/#&product=browser",
"http://j.mp",
"ftp://foo.bar/baz",
Expand All @@ -49,14 +46,17 @@
"http://a.b-c.de",
"http://223.255.255.254",
"http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
"http://✪df.ws/123",
"http://➡.ws/䨹",
"http://مثال.إختبار",
"http://例子.测试",
"http://उदाहरण.परीक्षा",

pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"),
pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"),
pytest.mark.xfail("http://⌘.ws"),
pytest.mark.xfail("http://⌘.ws/"),
pytest.mark.xfail("http://☺.damowmow.com/"),
pytest.mark.xfail("http://✪df.ws/123"),
pytest.mark.xfail("http://➡.ws/䨹"),
pytest.mark.xfail("http://مثال.إختبار"),
pytest.mark.xfail("http://例子.测试"),
pytest.mark.xfail("http://उदाहरण.परीक्षा"),
]

URLS_SHOULD_NOT_MATCH = [
Expand All @@ -83,7 +83,6 @@
"http://foo.bar/foo(bar)baz quux",
"ftps://foo.bar/",
"http://-error-.invalid/",
"http://-a.b.co",
"http://a.b-.co",
"http://0.0.0.0",
"http://10.1.1.0",
Expand All @@ -99,6 +98,7 @@
pytest.mark.xfail("foo.com"),
pytest.mark.xfail("http://1.1.1.1.1"),
pytest.mark.xfail("http://www.foo.bar./"),
pytest.mark.xfail("http://-a.b.co"),
]


Expand Down

0 comments on commit a4d974d

Please sign in to comment.