Merge branch 'develop' of https://github.com/explosion/spaCy into develop
youndoldman · May 31, 2017 · fe28602 · fe28602
2 parents 66af019 + 981196c
commit fe28602
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 12 deletions.
diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
@@ -1,35 +1,32 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ..punctuation import TOKENIZER_INFIXES
-from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
+from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 
+LIST_ICONS = [r'[\p{So}--[°]]']
 
 _currency = r'\$|¢|£|€|¥|฿'
 _quotes = QUOTES.replace("'", '')
 
+_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS)
 
-_prefixes = ([r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES)
-
-_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
+_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS +
              [r'(?<=[0-9])\+',
               r'(?<=°[FfCcKk])\.',
               r'(?<=[0-9])(?:{})'.format(_currency),
               r'(?<=[0-9])(?:{})'.format(UNITS),
               r'(?<=[{}{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES, _currency),
               r'(?<=[{})])-e'.format(ALPHA_LOWER)])
 
-
-_infixes = (LIST_ELLIPSES +
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
             [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
              r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=_quotes)])
 
-
 TOKENIZER_PREFIXES = _prefixes
 TOKENIZER_SUFFIXES = _suffixes
 TOKENIZER_INFIXES = _infixes
diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py
@@ -41,7 +41,5 @@ def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
 @pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
                                          ('i💙you', 3), ('🤘🤘yay!', 4)])
 def test_tokenizer_handles_emoji(tokenizer, text, length):
-    exceptions = ["hu"]
     tokens = tokenizer(text)
-    if tokens[0].lang_ not in exceptions:
-        assert len(tokens) == length
+    assert len(tokens) == length
diff --git a/website/docs/usage/rule-based-matching.jade b/website/docs/usage/rule-based-matching.jade
@@ -408,7 +408,7 @@ p
     |  To label the hashtags, we first need to add a new custom flag.
     |  #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
     |  to the hashtag's span, and check its value via a token's
-    |  #[+api("token#check_flag") #[code code check_flag()]] method. On each
+    |  #[+api("token#check_flag") #[code check_flag()]] method. On each
     |  match, we merge the hashtag and assign the flag.
 
 +code.