Merge pull request #113 from JohnSnowLabs/improved-nlu-component-access
Improved nlu component access
C-K-Loan committed Apr 17, 2022
2 parents 124f1db + 9155ae3 commit 186be00
Showing 14 changed files with 2,357 additions and 2,102 deletions.
nlu/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -203,7 +203,7 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
    if offline_utils.is_pipe(pipe_path):
        # language, nlp_ref, nlu_ref,path=None, is_licensed=False
        # todo deduct lang and if Licensed or not
-       pipe_components = construct_component_from_pipe_identifier('en', nlu_ref, nlu_ref, pipe_path, False)
+       pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
    # Resource in path is a single model
    elif offline_utils.is_model(pipe_path):
        c = offline_utils.verify_and_create_model(pipe_path)
nlu/pipe/component_resolution.py (212 changes: 146 additions & 66 deletions)

Large diffs are not rendered by default.

nlu/pipe/nlu_component.py (7 changes: 4 additions & 3 deletions)
@@ -124,15 +124,14 @@ def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel]
                     nlp_ref: str,
                     language: LanguageIso,
                     loaded_from_pretrained_pipe: bool,
-                    license_type: LicenseType,
+                    license_type: Optional[LicenseType],
                     storage_ref: Optional[str] = None):
        """Write metadata to nlu component_to_resolve after constructing it """
        self.model = jsl_anno_object
        self.nlu_ref = nlu_ref
        self.nlp_ref = nlp_ref
        self.language = language
        self.loaded_from_pretrained_pipe = loaded_from_pretrained_pipe
-       self.license = license_type
        self.in_types = self.node.ins.copy()
        self.out_types = self.node.outs.copy()
        self.in_types_default = self.node.ins.copy()
@@ -141,6 +140,8 @@ def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel]
        self.spark_output_column_names = self.out_types.copy()
        if storage_ref:
            self.storage_ref = storage_ref
+       if license_type:
+           self.license = license_type
        if nlp_ref == 'glove_840B_300' or nlp_ref == 'glove_6B_300':
            self.lang = 'xx'
        if hasattr(self.model, 'setIncludeConfidence'):
@@ -154,7 +155,7 @@ def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel]
            self.is_trained = False
        from copy import copy

-       return copy(self)
+       return self

    def __str__(self):
        return f'Component(ID={self.name}, NLU_REF={self.nlu_ref} NLP_REF={self.nlp_ref})'
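Two behavioral changes hide in these hunks: `set_metadata` now only overwrites the component's license when a `license_type` is actually supplied, and it returns the component itself instead of a copy. A minimal sketch of the resulting contract, using a hypothetical stand-in class rather than the real `NluComponent`:

```python
from typing import Any, Optional


class ComponentSketch:
    """Hypothetical stand-in illustrating the new set_metadata contract."""

    def __init__(self) -> None:
        self.model: Any = None
        self.nlu_ref = ''
        self.nlp_ref = ''
        self.license: Optional[str] = None

    def set_metadata(self, model: Any, nlu_ref: str, nlp_ref: str,
                     license_type: Optional[str] = None) -> 'ComponentSketch':
        self.model = model
        self.nlu_ref = nlu_ref
        self.nlp_ref = nlp_ref
        if license_type:  # a None license no longer clobbers an earlier value
            self.license = license_type
        return self  # same object, so call sites can chain on the result
```

Returning `self` is what lets the pipe_utils.py hunks below build a configured component in a single expression, `jsl_id_to_empty_component(anno_id).set_metadata(...)`, with the object that ends up in the pipe being the one that was configured.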
nlu/pipe/pipe_logic.py (6 changes: 3 additions & 3 deletions)
@@ -6,7 +6,7 @@
from nlu.universe.logic_universes import AnnoTypes
from nlu import Licenses
from nlu.universe.feature_universes import NLP_FEATURES
-from nlu.universe.component_universes import ComponentUniverse
+from nlu.universe.component_universes import ComponentUniverse, jsl_id_to_empty_component

logger = logging.getLogger('nlu')
from nlu.pipe.utils.pipe_utils import PipeUtils
@@ -206,7 +206,7 @@ def add_sentence_embedding_converter(resolution_data: StorageRefConversionResolu
    """
    logger.info(f'Adding Sentence embedding conversion for Embedding Provider={resolution_data}')
    word_embedding_provider = resolution_data.component_candidate
-   c = ComponentUniverse.os_components[NLP_NODE_IDS.SENTENCE_EMBEDDINGS_CONVERTER]
+   c = jsl_id_to_empty_component(NLP_NODE_IDS.SENTENCE_EMBEDDINGS_CONVERTER)
    storage_ref = StorageRefUtils.extract_storage_ref(word_embedding_provider)
    c.set_metadata(c.get_default_model(), 'sentence_embedding_converter',
                   NLP_NODE_IDS.SENTENCE_EMBEDDINGS_CONVERTER, 'xx', False, Licenses.open_source, storage_ref)
@@ -232,7 +232,7 @@ def add_chunk_embedding_converter(
    entities_col = 'entities'
    embed_provider_col = word_embedding_provider.info.spark_output_column_names[0]

-   c = ComponentUniverse.os_components[NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER]
+   c = jsl_id_to_empty_component(NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER)
    c.set_metadata(c.get_default_model(),
                   NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER, NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER,
                   'xx',
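The substitution repeated across this file (and most of the PR) replaces a lookup in `ComponentUniverse.os_components` / `hc_components`, which handed every caller the same component object, with `jsl_id_to_empty_component`, which builds a fresh one per call. The helper's body is not shown in this diff; a plausible sketch, assuming `ComponentUniverse.components` maps each annotator id to a zero-argument constructor as the pipe_utils.py hunks below suggest:

```python
from nlu.universe.component_universes import ComponentUniverse


def jsl_id_to_empty_component(jsl_id: str):
    """Sketch only: construct a fresh, un-configured component for a JSL id.

    Assumes ComponentUniverse.components[jsl_id] is a callable (a class or
    partial) returning a new component; the real implementation may differ.
    """
    return ComponentUniverse.components[jsl_id]()
```

Constructing per call matters because `set_metadata` mutates the component in place; the toy demo after the pipe_utils.py section below shows the state leak that shared universe entries invited.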
nlu/pipe/pipeline.py (62 changes: 19 additions & 43 deletions)
@@ -18,7 +18,7 @@
from nlu.pipe.utils.output_level_resolution_utils import OutputLevelUtils
from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils
from nlu.universe.universes import Licenses
-from nlu.utils.environment.env_utils import is_running_in_databricks
+from nlu.utils.environment.env_utils import is_running_in_databricks, try_import_streamlit

logger = logging.getLogger('nlu')

@@ -617,11 +617,8 @@ def viz_streamlit(self,

                  ) -> None:
        """Display Viz in streamlit"""
-       # try: from nlu.component_list.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.viz_streamlit_dashboard(self,
                                                         text,
                                                         model_selection,
@@ -663,10 +660,8 @@ def viz_streamlit_token(
            show_text_input: bool = True,

    ):
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.visualize_tokens_information(self, text, title, sub_title, show_feature_select,
                                                              features, metadata, output_level, positions,
                                                              set_wide_layout_CSS, generate_code_sample, key,
@@ -691,10 +686,8 @@ def viz_streamlit_classes(
            show_infos: bool = True,
            show_logo: bool = True,
    ) -> None:
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.visualize_classes(self, text, output_level, title, sub_title, metadata, positions,
                                                   set_wide_layout_CSS, generate_code_sample, key, show_model_selector,
                                                   model_select_position, show_infos, show_logo)
@@ -712,10 +705,8 @@ def viz_streamlit_dep_tree(
            show_logo: bool = True,
            show_text_input: bool = True,
    ) -> None:
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.visualize_dep_tree(self, text, title, sub_title, set_wide_layout_CSS,
                                                    generate_code_sample, key, show_infos, show_logo, show_text_input, )

@@ -740,10 +731,8 @@ def viz_streamlit_ner(
            show_text_input: bool = True,

    ):
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.visualize_ner(self, text, ner_tags, show_label_select, show_table, title, sub_title,
                                               colors, show_color_selector, set_wide_layout_CSS, generate_code_sample,
                                               key, model_select_position, show_model_select, show_infos, show_logo,
@@ -773,10 +762,8 @@ def viz_streamlit_word_similarity(
            show_logo: bool = True,

    ):
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.display_word_similarity(self, texts, threshold, title, sub_tile, write_raw_pandas,
                                                         display_embed_information, similarity_matrix, show_algo_select,
                                                         dist_metrics, set_wide_layout_CSS, generate_code_sample, key,
@@ -810,10 +797,8 @@ def viz_streamlit_word_embed_manifold(self,
            show_logo: bool = True,
            n_jobs: Optional[int] = 3, # False
    ):
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.viz_streamlit_word_embed_manifold(self,
                                                                   default_texts,
                                                                   title,
@@ -862,10 +847,8 @@ def viz_streamlit_sentence_embed_manifold(self,
            show_logo: bool = True,
            n_jobs: Optional[int] = 3, # False
    ):
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.viz_streamlit_sentence_embed_manifold(self,
                                                                       default_texts,
                                                                       title,
@@ -907,10 +890,8 @@ def viz_streamlit_entity_embed_manifold(self,
            show_logo: bool = True,
            n_jobs: Optional[int] = 3, # False
    ):
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.viz_streamlit_entity_embed_manifold(self,
                                                                     default_texts,
                                                                     title,
@@ -925,8 +906,3 @@ def viz_streamlit_entity_embed_manifold(self,
                                                                     show_infos,
                                                                     show_logo,
                                                                     n_jobs)
-
-   def check_pyspark_pyarrow_optimization_compatibility(self):
-       # Only works for pyspark "3.1.2"
-       v = pyspark.version.__version__.split('.')
-       if int(v[0]) == 3 and int(v[1]) >= 1: return True
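All nine `viz_streamlit_*` methods previously wrapped the Streamlit-backed import in their own four-line try/except; each now delegates to `try_import_streamlit()` from `nlu.utils.environment.env_utils`. The helper's body is not part of this diff; a minimal sketch of the assumed behavior, reusing the old error message:

```python
def try_import_streamlit():
    """Sketch of the assumed guard: fail loudly if Streamlit is missing."""
    try:
        import streamlit  # noqa: F401 -- imported only to probe availability
    except ImportError:
        print('You need to install Streamlit to run this functionality.')
        raise
```

Centralizing the check keeps the message in one place and removes the old failure mode, where the `print` swallowed the ImportError and execution fell through to a `StreamlitVizBlockHandler` name that was never bound, raising a confusing NameError.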
nlu/pipe/utils/pipe_utils.py (14 changes: 7 additions & 7 deletions)
@@ -6,7 +6,7 @@
from nlu import Licenses
from nlu.pipe.nlu_component import NluComponent
from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils
-from nlu.universe.component_universes import ComponentUniverse
+from nlu.universe.component_universes import ComponentUniverse, jsl_id_to_empty_component
from nlu.universe.feature_node_ids import NLP_NODE_IDS, NLP_HC_NODE_IDS, OCR_NODE_IDS
from nlu.universe.feature_universes import NLP_FEATURES
from nlu.universe.logic_universes import NLP_LEVELS, AnnoTypes
@@ -223,14 +223,14 @@ def enforce_AT_schema_on_NER_processors_and_add_missing_NER_converters(pipe):
            if converter_to_update is None:
                if c.license == Licenses.hc:
                    # TODO SET METADATA FIELDS HERE ON ANNO!!
-                   converter_to_update = ComponentUniverse.hc_components[NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL]
+                   converter_to_update = jsl_id_to_empty_component(NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL)
                    converter_to_update.set_metadata(converter_to_update.get_default_model(),
                                                     NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL,
                                                     NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL,
                                                     'xx', False, Licenses.hc)
                else:
                    # TODO SET METADATA FIELDS HERE ON ANNO!!
-                   converter_to_update = ComponentUniverse.os_components[NLP_NODE_IDS.NER_CONVERTER]
+                   converter_to_update = jsl_id_to_empty_component(NLP_NODE_IDS.NER_CONVERTER)
                    converter_to_update.set_metadata(converter_to_update.get_default_model(),
                                                     NLP_NODE_IDS.NER_CONVERTER, NLP_NODE_IDS.NER_CONVERTER,
                                                     'xx', False, Licenses.open_source)
@@ -462,7 +462,7 @@ def configure_component_output_levels(pipe, new_output_level=''):
    if not PipeUtils.has_document_assembler(pipe):
        # When loaded from OCR, we might not have a documentAssembler in pipe
        pipe.is_fitted = False
-       document_assembler = ComponentUniverse.os_components[NLP_NODE_IDS.DOCUMENT_ASSEMBLER]
+       document_assembler = ComponentUniverse.components[NLP_NODE_IDS.DOCUMENT_ASSEMBLER]()
        document_assembler.set_metadata(document_assembler.get_default_model(), 'document_assembler',
                                        'document_assembler', 'xx', False, Licenses.open_source)
        pipe.components.insert(0, document_assembler)
@@ -471,7 +471,7 @@ def configure_component_output_levels(pipe, new_output_level=''):
    if not PipeUtils.has_sentence_detector(pipe):
        logger.info("Adding missing Sentence Detector")
        pipe.is_fitted = False
-       sentence_detector = ComponentUniverse.os_components[NLP_NODE_IDS.SENTENCE_DETECTOR_DL]
+       sentence_detector = ComponentUniverse.components[NLP_NODE_IDS.SENTENCE_DETECTOR_DL]()
        sentence_detector.set_metadata(sentence_detector.get_default_model(), 'detect_sentence',
                                       'sentence_detector_dl', 'en', False, Licenses.open_source)
        insert_idx = PipeUtils.find_doc_assembler_idx_in_pipe(pipe)
@@ -657,7 +657,7 @@ def replace_untrained_component_with_trained(nlu_pipe, spark_transformer_pipe):
                untrained_class_name = AnnoClassRef.JSL_anno2_py_class[trainable_c.jsl_anno_class_id]
                trained_model = PipeUtils.get_model_of_class_from_spark_pipe(spark_transformer_pipe,
                                                                             trained_class_name)
-               trained_component = ComponentUniverse.os_components[trainable_c.trained_mirror_anno].set_metadata(
+               trained_component = jsl_id_to_empty_component(trainable_c.trained_mirror_anno).set_metadata(
                    trained_model, trainable_c.trained_mirror_anno, trainable_c.trained_mirror_anno, nlu_pipe.lang,
                    False,
                    Licenses.open_source)
@@ -666,7 +666,7 @@ def replace_untrained_component_with_trained(nlu_pipe, spark_transformer_pipe):
                untrained_class_name = AnnoClassRef.JSL_anno_HC_ref_2_py_class[trainable_c.jsl_anno_class_id]
                trained_model = PipeUtils.get_model_of_class_from_spark_pipe(spark_transformer_pipe,
                                                                             trained_class_name)
-               trained_component = ComponentUniverse.hc_components[trainable_c.trained_mirror_anno].set_metadata(
+               trained_component = jsl_id_to_empty_component(trainable_c.trained_mirror_anno).set_metadata(
                    trained_model, trainable_c.trained_mirror_anno, trainable_c.trained_mirror_anno, nlu_pipe.lang,
                    False, Licenses.hc)

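The `replace_untrained_component_with_trained` hunks show why `set_metadata` returning `self` matters: a trained component is now built by chaining directly on the factory result. A self-contained toy demo of the state leak that shared universe entries invited, and how per-call construction avoids it (hypothetical classes, not NLU's real objects):

```python
class Toy:
    """Hypothetical component with an in-place, chainable set_metadata."""

    def __init__(self):
        self.nlu_ref = None

    def set_metadata(self, nlu_ref):
        self.nlu_ref = nlu_ref
        return self


SHARED = {'toy': Toy()}  # old style: one instance shared by every caller
FACTORY = {'toy': Toy}   # new style: a constructor invoked per caller

a = SHARED['toy'].set_metadata('pipe_a')
b = SHARED['toy'].set_metadata('pipe_b')
print(a.nlu_ref, b.nlu_ref)  # pipe_b pipe_b -- second pipe leaked into the first

c = FACTORY['toy']().set_metadata('pipe_a')
d = FACTORY['toy']().set_metadata('pipe_b')
print(c.nlu_ref, d.nlu_ref)  # pipe_a pipe_b -- each pipe gets its own component
```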
nlu/pipe/utils/resolution/nlu_ref_utils.py (8 changes: 6 additions & 2 deletions)
@@ -36,7 +36,7 @@ def parse_language_from_nlu_ref(nlu_ref):
    if nlu_ref[0:3] != 'xx.':
        nlu_reference = 'xx.' + nlu_ref
        logger.info(f'Setting lang as xx for nlu_ref={nlu_reference}')
-   if not lang :
+   if not lang:
        lang = 'en'
    logger.info(f'Parsed Nlu_ref={nlu_ref} as lang={lang}')

@@ -59,6 +59,10 @@ def nlu_ref_to_nlp_metadata(nlu_ref, is_recursive_call=False):
    nlp_ref = None
    license_type = Licenses.open_source
    is_pipe = False
+   if 'translate_to' in nlu_ref:
+       # We append here xx and set lang as xx so users don't have to specify it
+       nlu_ref = 'xx.' + nlu_ref
+       lang = 'xx'
    # 1. check if open source pipeline
    if lang in Spellbook.pretrained_pipe_references.keys():
        if nlu_ref in Spellbook.pretrained_pipe_references[lang].keys():
@@ -75,7 +79,7 @@
        sparknlp_data = Spellbook.component_alias_references[nlu_ref]
        nlp_ref = sparknlp_data[0]
        is_pipe = 'component_list' in sparknlp_data[1]
-       if len(sparknlp_data) == 3 :
+       if len(sparknlp_data) == 3:
            model_params = sparknlp_data[2]
    # 4. check if healthcare pipe
    if lang in Spellbook.pretrained_healthcare_pipe_references.keys():
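The new `translate_to` branch spares users the explicit `xx` (multilingual) prefix for translation references: the parser rewrites the ref before any Spellbook lookup. Assuming the usual NLU entry point, a usage sketch:

```python
import nlu

# 'translate_to' in the reference triggers the internal rewrite to
# 'xx.translate_to.de', so the multilingual prefix can be omitted.
pipe = nlu.load('translate_to.de')
print(pipe.predict('Hello world!'))
```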
@@ -3,7 +3,7 @@
from nlu.components import embeddings_chunker
from nlu.universe.feature_node_ids import NLP_NODE_IDS, NLP_HC_NODE_IDS
from nlu.universe.logic_universes import AnnoTypes
-from nlu.universe.component_universes import ComponentUniverse
+from nlu.universe.component_universes import ComponentUniverse, jsl_id_to_empty_component
from nlu.universe.universes import Licenses


@@ -42,7 +42,7 @@ def insert_chunk_embedder_to_pipe_if_missing(pipe):
            if c.name == NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL:
                ner_conveter_c = c

-   chunker = ComponentUniverse.os_components[NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER]
+   chunker = jsl_id_to_empty_component(NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER)
    chunker.set_metadata(
        chunker.get_default_model(),
        'chunker', 'chunker', 'xx', False, Licenses.open_source)