Skip to content

Commit

Permalink
Support HTML visualization in Jupyter notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
hankcs committed May 26, 2021
1 parent 309cf4a commit e7a0a27
Show file tree
Hide file tree
Showing 7 changed files with 64 additions and 15 deletions.
18 changes: 14 additions & 4 deletions hanlp/utils/log_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

import termcolor

from hanlp_common.constant import IPYTHON


class ColoredFormatter(logging.Formatter):
def __init__(self, fmt=None, datefmt=None, style='%', enable=True):
Expand Down Expand Up @@ -70,16 +72,24 @@ def __init__(self, out=sys.stderr):

def erase(self):
if self._last_print_width:
self.out.write("\b" * self._last_print_width)
self.out.write(" " * self._last_print_width)
self.out.write("\b" * self._last_print_width)
if IPYTHON:
self.out.write("\r")
self.out.write(" " * self._last_print_width)
else:
self.out.write("\b" * self._last_print_width)
self.out.write(" " * self._last_print_width)
self.out.write("\b" * self._last_print_width)
self.out.write("\r") # \r is essential when multi-lines were printed
self._last_print_width = 0

def print(self, msg: str, color=True):
self.erase()
if color:
msg, _len = color_format_len(msg)
if IPYTHON:
msg, _len = color_format_len(msg)
_len = len(msg)
else:
msg, _len = color_format_len(msg)
self._last_print_width = _len
else:
self._last_print_width = len(msg)
Expand Down
2 changes: 1 addition & 1 deletion hanlp/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@
# Author: hankcs
# Date: 2019-12-28 19:26

__version__ = '2.1.0-alpha.46'
__version__ = '2.1.0-alpha.47'
"""HanLP version"""
6 changes: 6 additions & 0 deletions plugins/hanlp_common/hanlp_common/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,9 @@
'''Enable verbose or not.'''
NULL = '<null>'
PRED = 'PRED'
# Detect an IPython session by probing for the ``get_ipython`` name: the bare
# lookup raises NameError when nothing by that name is defined.
# NOTE(review): presumably ``get_ipython`` is provided by every IPython shell
# (terminal as well as notebook) -- confirm against the IPython documentation.
try:
    # noinspection PyUnresolvedReferences,PyStatementEffect
    get_ipython
except NameError:
    IPYTHON = False
else:
    IPYTHON = True
47 changes: 40 additions & 7 deletions plugins/hanlp_common/hanlp_common/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from phrasetree.tree import Tree

from hanlp_common.conll import CoNLLUWord, CoNLLSentence, CoNLLSentenceList
from hanlp_common.constant import PRED
from hanlp_common.constant import PRED, IPYTHON
from hanlp_common.util import collapse_json, prefix_match
from hanlp_common.visualization import tree_to_list, list_to_tree, render_labeled_span, make_table

Expand Down Expand Up @@ -148,7 +148,7 @@ def get(_k, _i):
return results

def to_pretty(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con',
show_header=True) -> Union[str, List[str]]:
show_header=True, html=False) -> Union[str, List[str]]:
"""
Convert to a pretty text representation which can be printed to visualize linguistic structures.
Expand All @@ -161,7 +161,8 @@ def to_pretty(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='
ner: Named entity key.
srl: Semantic role labeling key.
con: Constituency parsing key.
show_header: ``True`` to print a header which indicates each field with its name.
show_header: ``True`` to include a header which indicates each field with its name.
html: ``True`` to output HTML format so that non-ASCII characters can align correctly.
Returns:
A pretty string.
Expand Down Expand Up @@ -331,12 +332,39 @@ def condense(block_, extras_=None):
results.append(make_table(extras, insert_header=True))
else:
results.append(' '.join(['/'.join(str(f) for f in x.nonempty_fields) for x in conll]))
if html:
def to_html(pretty_text: str) -> str:
    """Render tab-separated pretty text as side-by-side HTML columns.

    Each non-empty line of ``pretty_text`` is a row and each tab-separated
    field is a cell. Cells are regrouped by column, and every column is
    emitted as one ``<pre style="display: table-cell">`` element whose cells
    are separated by ``<br>``.

    Args:
        pretty_text: Text made of ``\\n``-separated rows of ``\\t``-separated cells.

    Returns:
        An HTML snippet wrapped in a ``<div style="display: table">``. Empty
        input yields an empty ``<div>``.
    """
    # Imported by name and locally so that no module-level name ``html`` is introduced.
    from html import escape
    from itertools import zip_longest

    rows = [line.split('\t') for line in pretty_text.split('\n') if line]
    # Transpose rows into columns. Short rows are padded with empty cells so
    # ragged input cannot raise IndexError; no rows gives no columns.
    cols = list(zip_longest(*rows, fillvalue=''))

    pre_open = ('<pre style="display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,'
                'Liberation Mono,Courier New,monospace; white-space: nowrap;">')
    parts = ['<div style="display: table; line-height: 128%;">']
    last = len(cols) - 1
    for i, col in enumerate(cols):
        # Every column but the last gets one trailing space as a gutter.
        pad = ' ' if i != last else ''
        parts.append(pre_open)
        # Escape first so text such as '<null>' is displayed literally instead of
        # being parsed as a tag; then make spaces non-breaking to keep alignment.
        parts.append('<br>'.join((escape(cell, quote=False) + pad).replace(' ', '&nbsp;') for cell in col))
        parts.append('</pre>')
    parts.append('</div>')
    return ''.join(parts)

results = [to_html(x) for x in results]
if flat:
return results[0]
return results

def pretty_print(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con',
show_header=True):
show_header=True, html=IPYTHON):
"""
Print a pretty text representation which visualizes linguistic structures.
Expand All @@ -350,13 +378,18 @@ def pretty_print(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ne
srl: Semantic role labeling key.
con: Constituency parsing key.
show_header: ``True`` to print a header which indicates each field with its name.
html: ``True`` to output HTML format so that non-ASCII characters can align correctly.
"""
results = self.to_pretty(tok, lem, pos, dep, sdp, ner, srl, con, show_header)
results = self.to_pretty(tok, lem, pos, dep, sdp, ner, srl, con, show_header, html=html)
if isinstance(results, str):
results = [results]
sent_new_line = '\n\n' if any('\n' in x for x in results) else '\n'
print(sent_new_line.join(results))
if IPYTHON:
from IPython.core.display import display, HTML
display(HTML('<br>'.join(results)))
else:
sent_new_line = '\n\n' if any('\n' in x for x in results) else '\n'
print(sent_new_line.join(results))

def translate(self, lang, tok='tok', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl'):
"""
Expand Down
2 changes: 1 addition & 1 deletion plugins/hanlp_common/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name='hanlp_common',
version='0.0.8',
version='0.0.9',
description='HanLP: Han Language Processing',
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
2 changes: 1 addition & 1 deletion plugins/hanlp_demo/hanlp_demo/zh/demo_mtl.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from hanlp_common.document import Document

HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)
doc: Document = HanLP(['阿 婆主来到北京立方庭参观自然语义科技公司。'], tasks='tok/fine')
doc: Document = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'])
print(doc)
doc.pretty_print()
# Specify which annotation to use
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
'transformers>=4.1.1',
'sentencepiece>=0.1.91'
'torch>=1.6.0',
'hanlp-common>=0.0.6',
'hanlp-common>=0.0.9',
'hanlp-trie>=0.0.2',
'hanlp-downloader',
],
Expand Down

0 comments on commit e7a0a27

Please sign in to comment.