Support HTML visualization in Jupyter notebooks

snuc · May 26, 2021 · e7a0a27 · e7a0a27
1 parent 309cf4a
commit e7a0a27
Show file tree

Hide file tree

Showing 7 changed files with 64 additions and 15 deletions.
diff --git a/hanlp/utils/log_util.py b/hanlp/utils/log_util.py
@@ -10,6 +10,8 @@
 
 import termcolor
 
+from hanlp_common.constant import IPYTHON
+
 
 class ColoredFormatter(logging.Formatter):
     def __init__(self, fmt=None, datefmt=None, style='%', enable=True):
@@ -70,16 +72,24 @@ def __init__(self, out=sys.stderr):
 
     def erase(self):
         if self._last_print_width:
-            self.out.write("\b" * self._last_print_width)
-            self.out.write(" " * self._last_print_width)
-            self.out.write("\b" * self._last_print_width)
+            if IPYTHON:
+                self.out.write("\r")
+                self.out.write(" " * self._last_print_width)
+            else:
+                self.out.write("\b" * self._last_print_width)
+                self.out.write(" " * self._last_print_width)
+                self.out.write("\b" * self._last_print_width)
             self.out.write("\r")  # \r is essential when multi-lines were printed
             self._last_print_width = 0
 
     def print(self, msg: str, color=True):
         self.erase()
         if color:
-            msg, _len = color_format_len(msg)
+            if IPYTHON:
+                msg, _len = color_format_len(msg)
+                _len = len(msg)
+            else:
+                msg, _len = color_format_len(msg)
             self._last_print_width = _len
         else:
             self._last_print_width = len(msg)

diff --git a/hanlp/version.py b/hanlp/version.py
@@ -2,5 +2,5 @@
 # Author: hankcs
 # Date: 2019-12-28 19:26
 
-__version__ = '2.1.0-alpha.46'
+__version__ = '2.1.0-alpha.47'
 """HanLP version"""
diff --git a/plugins/hanlp_common/hanlp_common/constant.py b/plugins/hanlp_common/hanlp_common/constant.py
@@ -19,3 +19,9 @@
 '''Enable verbose or not.'''
 NULL = '<null>'
 PRED = 'PRED'
+try:
+    # noinspection PyUnresolvedReferences
+    _ipython_shell = get_ipython()
+except NameError:
+    # ``get_ipython`` is not defined, i.e. this is not an IPython session at all.
+    IPYTHON = False
+else:
+    # Terminal IPython defines ``get_ipython`` too, but it shows rich HTML output only as an object repr.
+    # Require a kernel-backed front end (e.g. a Jupyter notebook), which has ``IPKernelApp`` in its config.
+    IPYTHON = 'IPKernelApp' in getattr(_ipython_shell, 'config', {})
diff --git a/plugins/hanlp_common/hanlp_common/document.py b/plugins/hanlp_common/hanlp_common/document.py
@@ -9,7 +9,7 @@
 from phrasetree.tree import Tree
 
 from hanlp_common.conll import CoNLLUWord, CoNLLSentence, CoNLLSentenceList
-from hanlp_common.constant import PRED
+from hanlp_common.constant import PRED, IPYTHON
 from hanlp_common.util import collapse_json, prefix_match
 from hanlp_common.visualization import tree_to_list, list_to_tree, render_labeled_span, make_table
 
@@ -148,7 +148,7 @@ def get(_k, _i):
         return results
 
     def to_pretty(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con',
-                  show_header=True) -> Union[str, List[str]]:
+                  show_header=True, html=False) -> Union[str, List[str]]:
         """
         Convert to a pretty text representation which can be printed to visualize linguistic structures.
 
@@ -161,7 +161,8 @@ def to_pretty(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='
             ner: Named entity key.
             srl: Semantic role labeling key.
             con: Constituency parsing key.
-            show_header: ``True`` to print a header which indicates each field with its name.
+            show_header: ``True`` to include a header which indicates each field with its name.
+            html: ``True`` to output HTML format so that non-ASCII characters can align correctly.
 
         Returns:
             A pretty string.
@@ -331,12 +332,39 @@ def condense(block_, extras_=None):
                 results.append(make_table(extras, insert_header=True))
             else:
                 results.append(' '.join(['/'.join(str(f) for f in x.nonempty_fields) for x in conll]))
+        if html:
+            def to_html(pretty_text: str) -> str:
+                """Render one tab-separated pretty-text table as an HTML snippet.
+
+                Each tab-separated column becomes a ``<pre>`` element displayed as a table cell.
+
+                Args:
+                    pretty_text: Newline-separated rows of tab-separated cells; empty rows are ignored.
+
+                Returns:
+                    A ``<div>`` holding one ``<pre>`` per column, or an empty ``<div>`` when there are no rows.
+                """
+                # Imported by name because the enclosing ``html`` flag hides the stdlib module of that name.
+                from html import escape
+                from itertools import zip_longest
+
+                rows = [line.split('\t') for line in pretty_text.split('\n') if line]
+                # Transpose rows into columns; short rows are padded so ragged input cannot raise IndexError.
+                columns = [list(column) for column in zip_longest(*rows, fillvalue='')]
+
+                parts = ['<div style="display: table; line-height: 128%;">']
+                last = len(columns) - 1
+                for i, column in enumerate(columns):
+                    parts.append('<pre style="display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,'
+                                 'Liberation Mono,Courier New,monospace; white-space: nowrap;">')
+                    if i != last:
+                        # A trailing space separates this column from the next one.
+                        column = [cell + ' ' for cell in column]
+                    # Escape markup characters first, then protect spaces, so the ``&`` of ``&nbsp;`` survives.
+                    parts.append('<br>'.join(escape(cell, quote=False).replace(' ', '&nbsp;') for cell in column))
+                    parts.append('</pre>')
+                parts.append('</div>')
+                return ''.join(parts)
+
+            results = [to_html(x) for x in results]
         if flat:
             return results[0]
         return results
 
     def pretty_print(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con',
-                     show_header=True):
+                     show_header=True, html=IPYTHON):
         """
         Print a pretty text representation which visualizes linguistic structures.
 
@@ -350,13 +378,18 @@ def pretty_print(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ne
             srl: Semantic role labeling key.
             con: Constituency parsing key.
             show_header: ``True`` to print a header which indicates each field with its name.
+            html: ``True`` to output HTML format so that non-ASCII characters can align correctly.
 
         """
-        results = self.to_pretty(tok, lem, pos, dep, sdp, ner, srl, con, show_header)
+        results = self.to_pretty(tok, lem, pos, dep, sdp, ner, srl, con, show_header, html=html)
         if isinstance(results, str):
             results = [results]
-        sent_new_line = '\n\n' if any('\n' in x for x in results) else '\n'
-        print(sent_new_line.join(results))
+        # Render through IPython only when HTML was requested AND IPython is present; otherwise plain text
+        # would be injected as markup (html=False) or the IPython import could fail (no IPython installed).
+        if html and IPYTHON:
+            from IPython.display import display, HTML
+            display(HTML('<br>'.join(results)))
+        else:
+            sent_new_line = '\n\n' if any('\n' in x for x in results) else '\n'
+            print(sent_new_line.join(results))
 
     def translate(self, lang, tok='tok', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl'):
         """

diff --git a/plugins/hanlp_common/setup.py b/plugins/hanlp_common/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='hanlp_common',
-    version='0.0.8',
+    version='0.0.9',
     description='HanLP: Han Language Processing',
     long_description=long_description,
     long_description_content_type="text/markdown",

diff --git a/plugins/hanlp_demo/hanlp_demo/zh/demo_mtl.py b/plugins/hanlp_demo/hanlp_demo/zh/demo_mtl.py
@@ -5,7 +5,7 @@
 from hanlp_common.document import Document
 
 HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)
-doc: Document = HanLP(['阿 婆主来到北京立方庭参观自然语义科技公司。'], tasks='tok/fine')
+doc: Document = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'])
 print(doc)
 doc.pretty_print()
 # Specify which annotation to use

diff --git a/setup.py b/setup.py
@@ -42,7 +42,7 @@
         'transformers>=4.1.1',
-        'sentencepiece>=0.1.91'
+        'sentencepiece>=0.1.91',
         'torch>=1.6.0',
-        'hanlp-common>=0.0.6',
+        'hanlp-common>=0.0.9',
         'hanlp-trie>=0.0.2',
         'hanlp-downloader',
     ],
     ],