Merge pull request #113 from JohnSnowLabs/improved-nlu-component-access
Improved nlu component access
C-K-Loan committed Apr 17, 2022
2 parents 124f1db + 9155ae3 commit 186be00
Showing 14 changed files with 2,357 additions and 2,102 deletions.
nlu/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -203,7 +203,7 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
    if offline_utils.is_pipe(pipe_path):
        # language, nlp_ref, nlu_ref,path=None, is_licensed=False
        # todo deduct lang and if Licensed or not
-       pipe_components = construct_component_from_pipe_identifier('en', nlu_ref, nlu_ref, pipe_path, False)
+       pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
    # Resource in path is a single model
    elif offline_utils.is_model(pipe_path):
        c = offline_utils.verify_and_create_model(pipe_path)
nlu/pipe/component_resolution.py (212 changes: 146 additions & 66 deletions)

Large diffs are not rendered by default.

nlu/pipe/nlu_component.py (7 changes: 4 additions & 3 deletions)
@@ -124,15 +124,14 @@ def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel]
                     nlp_ref: str,
                     language: LanguageIso,
                     loaded_from_pretrained_pipe: bool,
-                    license_type: LicenseType,
+                    license_type: Optional[LicenseType],
                     storage_ref: Optional[str] = None):
        """Write metadata to nlu component_to_resolve after constructing it """
        self.model = jsl_anno_object
        self.nlu_ref = nlu_ref
        self.nlp_ref = nlp_ref
        self.language = language
        self.loaded_from_pretrained_pipe = loaded_from_pretrained_pipe
-       self.license = license_type
        self.in_types = self.node.ins.copy()
        self.out_types = self.node.outs.copy()
        self.in_types_default = self.node.ins.copy()
@@ -141,6 +140,8 @@ def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel]
        self.spark_output_column_names = self.out_types.copy()
        if storage_ref:
            self.storage_ref = storage_ref
+       if license_type:
+           self.license = license_type
        if nlp_ref == 'glove_840B_300' or nlp_ref == 'glove_6B_300':
            self.lang = 'xx'
        if hasattr(self.model, 'setIncludeConfidence'):
@@ -154,7 +155,7 @@ def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel]
            self.is_trained = False
        from copy import copy

-       return copy(self)
+       return self

    def __str__(self):
        return f'Component(ID={self.name}, NLU_REF={self.nlu_ref} NLP_REF={self.nlp_ref})'
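Two behavioral changes hide in these hunks: `set_metadata` now only overwrites the component's license when a `license_type` is actually supplied, and it returns the component itself instead of a copy. A minimal sketch of the resulting contract, using a hypothetical stand-in class rather than the real `NluComponent`:

```python
from typing import Any, Optional


class ComponentSketch:
    """Hypothetical stand-in illustrating the new set_metadata contract."""

    def __init__(self) -> None:
        self.model: Any = None
        self.nlu_ref = ''
        self.nlp_ref = ''
        self.license: Optional[str] = None

    def set_metadata(self, model: Any, nlu_ref: str, nlp_ref: str,
                     license_type: Optional[str] = None) -> 'ComponentSketch':
        self.model = model
        self.nlu_ref = nlu_ref
        self.nlp_ref = nlp_ref
        if license_type:  # a None license no longer clobbers an earlier value
            self.license = license_type
        return self  # same object, so call sites can chain on the result
```

Returning `self` is what lets the pipe_utils.py hunks below build a configured component in a single expression, `jsl_id_to_empty_component(anno_id).set_metadata(...)`, with the object that ends up in the pipe being the one that was configured.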
nlu/pipe/pipe_logic.py (6 changes: 3 additions & 3 deletions)
@@ -6,7 +6,7 @@
from nlu.universe.logic_universes import AnnoTypes
from nlu import Licenses
from nlu.universe.feature_universes import NLP_FEATURES
-from nlu.universe.component_universes import ComponentUniverse
+from nlu.universe.component_universes import ComponentUniverse, jsl_id_to_empty_component

logger = logging.getLogger('nlu')
from nlu.pipe.utils.pipe_utils import PipeUtils
@@ -206,7 +206,7 @@ def add_sentence_embedding_converter(resolution_data: StorageRefConversionResolu
    """
    logger.info(f'Adding Sentence embedding conversion for Embedding Provider={resolution_data}')
    word_embedding_provider = resolution_data.component_candidate
-   c = ComponentUniverse.os_components[NLP_NODE_IDS.SENTENCE_EMBEDDINGS_CONVERTER]
+   c = jsl_id_to_empty_component(NLP_NODE_IDS.SENTENCE_EMBEDDINGS_CONVERTER)
    storage_ref = StorageRefUtils.extract_storage_ref(word_embedding_provider)
    c.set_metadata(c.get_default_model(), 'sentence_embedding_converter',
                   NLP_NODE_IDS.SENTENCE_EMBEDDINGS_CONVERTER, 'xx', False, Licenses.open_source, storage_ref)
@@ -232,7 +232,7 @@ def add_chunk_embedding_converter(
    entities_col = 'entities'
    embed_provider_col = word_embedding_provider.info.spark_output_column_names[0]

-   c = ComponentUniverse.os_components[NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER]
+   c = jsl_id_to_empty_component(NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER)
    c.set_metadata(c.get_default_model(),
                   NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER, NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER,
                   'xx',
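The substitution repeated across this file (and most of the PR) replaces a lookup in `ComponentUniverse.os_components` / `hc_components`, which handed every caller the same component object, with `jsl_id_to_empty_component`, which builds a fresh one per call. The helper's body is not shown in this diff; a plausible sketch, assuming `ComponentUniverse.components` maps each annotator id to a zero-argument constructor as the pipe_utils.py hunks below suggest:

```python
from nlu.universe.component_universes import ComponentUniverse


def jsl_id_to_empty_component(jsl_id: str):
    """Sketch only: construct a fresh, un-configured component for a JSL id.

    Assumes ComponentUniverse.components[jsl_id] is a callable (a class or
    partial) returning a new component; the real implementation may differ.
    """
    return ComponentUniverse.components[jsl_id]()
```

Constructing per call matters because `set_metadata` mutates the component in place; the toy demo after the pipe_utils.py section below shows the state leak that shared universe entries invited.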
nlu/pipe/pipeline.py (62 changes: 19 additions & 43 deletions)
@@ -18,7 +18,7 @@
from nlu.pipe.utils.output_level_resolution_utils import OutputLevelUtils
from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils
from nlu.universe.universes import Licenses
-from nlu.utils.environment.env_utils import is_running_in_databricks
+from nlu.utils.environment.env_utils import is_running_in_databricks, try_import_streamlit

logger = logging.getLogger('nlu')

@@ -617,11 +617,8 @@ def viz_streamlit(self,

                  ) -> None:
        """Display Viz in streamlit"""
-       # try: from nlu.component_list.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.viz_streamlit_dashboard(self,
                                                         text,
                                                         model_selection,
@@ -663,10 +660,8 @@ def viz_streamlit_token(
            show_text_input: bool = True,

    ):
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.visualize_tokens_information(self, text, title, sub_title, show_feature_select,
                                                              features, metadata, output_level, positions,
                                                              set_wide_layout_CSS, generate_code_sample, key,
@@ -691,10 +686,8 @@ def viz_streamlit_classes(
            show_infos: bool = True,
            show_logo: bool = True,
    ) -> None:
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.visualize_classes(self, text, output_level, title, sub_title, metadata, positions,
                                                   set_wide_layout_CSS, generate_code_sample, key, show_model_selector,
                                                   model_select_position, show_infos, show_logo)
@@ -712,10 +705,8 @@ def viz_streamlit_dep_tree(
            show_logo: bool = True,
            show_text_input: bool = True,
    ) -> None:
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.visualize_dep_tree(self, text, title, sub_title, set_wide_layout_CSS,
                                                    generate_code_sample, key, show_infos, show_logo, show_text_input, )

@@ -740,10 +731,8 @@ def viz_streamlit_ner(
            show_text_input: bool = True,

    ):
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.visualize_ner(self, text, ner_tags, show_label_select, show_table, title, sub_title,
                                               colors, show_color_selector, set_wide_layout_CSS, generate_code_sample,
                                               key, model_select_position, show_model_select, show_infos, show_logo,
@@ -773,10 +762,8 @@ def viz_streamlit_word_similarity(
            show_logo: bool = True,

    ):
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.display_word_similarity(self, texts, threshold, title, sub_tile, write_raw_pandas,
                                                         display_embed_information, similarity_matrix, show_algo_select,
                                                         dist_metrics, set_wide_layout_CSS, generate_code_sample, key,
@@ -810,10 +797,8 @@ def viz_streamlit_word_embed_manifold(self,
            show_logo: bool = True,
            n_jobs: Optional[int] = 3, # False
    ):
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.viz_streamlit_word_embed_manifold(self,
                                                                   default_texts,
                                                                   title,
@@ -862,10 +847,8 @@ def viz_streamlit_sentence_embed_manifold(self,
            show_logo: bool = True,
            n_jobs: Optional[int] = 3, # False
    ):
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.viz_streamlit_sentence_embed_manifold(self,
                                                                       default_texts,
                                                                       title,
@@ -907,10 +890,8 @@ def viz_streamlit_entity_embed_manifold(self,
            show_logo: bool = True,
            n_jobs: Optional[int] = 3, # False
    ):
-       try:
-           from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
-       except ImportError:
-           print("You need to install Streamlit to run this functionality.")
+       try_import_streamlit()
+       from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler
        StreamlitVizBlockHandler.viz_streamlit_entity_embed_manifold(self,
                                                                     default_texts,
                                                                     title,
@@ -925,8 +906,3 @@ def viz_streamlit_entity_embed_manifold(self,
                                                                     show_infos,
                                                                     show_logo,
                                                                     n_jobs)
-
-   def check_pyspark_pyarrow_optimization_compatibility(self):
-       # Only works for pyspark "3.1.2"
-       v = pyspark.version.__version__.split('.')
-       if int(v[0]) == 3 and int(v[1]) >= 1: return True
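All nine `viz_streamlit_*` methods previously wrapped the Streamlit-backed import in their own four-line try/except; each now delegates to `try_import_streamlit()` from `nlu.utils.environment.env_utils`. The helper's body is not part of this diff; a minimal sketch of the assumed behavior, reusing the old error message:

```python
def try_import_streamlit():
    """Sketch of the assumed guard: fail loudly if Streamlit is missing."""
    try:
        import streamlit  # noqa: F401 -- imported only to probe availability
    except ImportError:
        print('You need to install Streamlit to run this functionality.')
        raise
```

Centralizing the check keeps the message in one place and removes the old failure mode, where the `print` swallowed the ImportError and execution fell through to a `StreamlitVizBlockHandler` name that was never bound, raising a confusing NameError.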
nlu/pipe/utils/pipe_utils.py (14 changes: 7 additions & 7 deletions)
@@ -6,7 +6,7 @@
from nlu import Licenses
from nlu.pipe.nlu_component import NluComponent
from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils
-from nlu.universe.component_universes import ComponentUniverse
+from nlu.universe.component_universes import ComponentUniverse, jsl_id_to_empty_component
from nlu.universe.feature_node_ids import NLP_NODE_IDS, NLP_HC_NODE_IDS, OCR_NODE_IDS
from nlu.universe.feature_universes import NLP_FEATURES
from nlu.universe.logic_universes import NLP_LEVELS, AnnoTypes
@@ -223,14 +223,14 @@ def enforce_AT_schema_on_NER_processors_and_add_missing_NER_converters(pipe):
            if converter_to_update is None:
                if c.license == Licenses.hc:
                    # TODO SET METADATA FIELDS HERE ON ANNO!!
-                   converter_to_update = ComponentUniverse.hc_components[NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL]
+                   converter_to_update = jsl_id_to_empty_component(NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL)
                    converter_to_update.set_metadata(converter_to_update.get_default_model(),
                                                     NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL,
                                                     NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL,
                                                     'xx', False, Licenses.hc)
                else:
                    # TODO SET METADATA FIELDS HERE ON ANNO!!
-                   converter_to_update = ComponentUniverse.os_components[NLP_NODE_IDS.NER_CONVERTER]
+                   converter_to_update = jsl_id_to_empty_component(NLP_NODE_IDS.NER_CONVERTER)
                    converter_to_update.set_metadata(converter_to_update.get_default_model(),
                                                     NLP_NODE_IDS.NER_CONVERTER, NLP_NODE_IDS.NER_CONVERTER,
                                                     'xx', False, Licenses.open_source)
@@ -462,7 +462,7 @@ def configure_component_output_levels(pipe, new_output_level=''):
    if not PipeUtils.has_document_assembler(pipe):
        # When loaded from OCR, we might not have a documentAssembler in pipe
        pipe.is_fitted = False
-       document_assembler = ComponentUniverse.os_components[NLP_NODE_IDS.DOCUMENT_ASSEMBLER]
+       document_assembler = ComponentUniverse.components[NLP_NODE_IDS.DOCUMENT_ASSEMBLER]()
        document_assembler.set_metadata(document_assembler.get_default_model(), 'document_assembler',
                                        'document_assembler', 'xx', False, Licenses.open_source)
        pipe.components.insert(0, document_assembler)
@@ -471,7 +471,7 @@ def configure_component_output_levels(pipe, new_output_level=''):
    if not PipeUtils.has_sentence_detector(pipe):
        logger.info("Adding missing Sentence Detector")
        pipe.is_fitted = False
-       sentence_detector = ComponentUniverse.os_components[NLP_NODE_IDS.SENTENCE_DETECTOR_DL]
+       sentence_detector = ComponentUniverse.components[NLP_NODE_IDS.SENTENCE_DETECTOR_DL]()
        sentence_detector.set_metadata(sentence_detector.get_default_model(), 'detect_sentence',
                                       'sentence_detector_dl', 'en', False, Licenses.open_source)
        insert_idx = PipeUtils.find_doc_assembler_idx_in_pipe(pipe)
@@ -657,7 +657,7 @@ def replace_untrained_component_with_trained(nlu_pipe, spark_transformer_pipe):
                untrained_class_name = AnnoClassRef.JSL_anno2_py_class[trainable_c.jsl_anno_class_id]
                trained_model = PipeUtils.get_model_of_class_from_spark_pipe(spark_transformer_pipe,
                                                                             trained_class_name)
-               trained_component = ComponentUniverse.os_components[trainable_c.trained_mirror_anno].set_metadata(
+               trained_component = jsl_id_to_empty_component(trainable_c.trained_mirror_anno).set_metadata(
                    trained_model, trainable_c.trained_mirror_anno, trainable_c.trained_mirror_anno, nlu_pipe.lang,
                    False,
                    Licenses.open_source)
@@ -666,7 +666,7 @@ def replace_untrained_component_with_trained(nlu_pipe, spark_transformer_pipe):
                untrained_class_name = AnnoClassRef.JSL_anno_HC_ref_2_py_class[trainable_c.jsl_anno_class_id]
                trained_model = PipeUtils.get_model_of_class_from_spark_pipe(spark_transformer_pipe,
                                                                             trained_class_name)
-               trained_component = ComponentUniverse.hc_components[trainable_c.trained_mirror_anno].set_metadata(
+               trained_component = jsl_id_to_empty_component(trainable_c.trained_mirror_anno).set_metadata(
                    trained_model, trainable_c.trained_mirror_anno, trainable_c.trained_mirror_anno, nlu_pipe.lang,
                    False, Licenses.hc)

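The `replace_untrained_component_with_trained` hunks show why `set_metadata` returning `self` matters: a trained component is now built by chaining directly on the factory result. A self-contained toy demo of the state leak that shared universe entries invited, and how per-call construction avoids it (hypothetical classes, not NLU's real objects):

```python
class Toy:
    """Hypothetical component with an in-place, chainable set_metadata."""

    def __init__(self):
        self.nlu_ref = None

    def set_metadata(self, nlu_ref):
        self.nlu_ref = nlu_ref
        return self


SHARED = {'toy': Toy()}  # old style: one instance shared by every caller
FACTORY = {'toy': Toy}   # new style: a constructor invoked per caller

a = SHARED['toy'].set_metadata('pipe_a')
b = SHARED['toy'].set_metadata('pipe_b')
print(a.nlu_ref, b.nlu_ref)  # pipe_b pipe_b -- second pipe leaked into the first

c = FACTORY['toy']().set_metadata('pipe_a')
d = FACTORY['toy']().set_metadata('pipe_b')
print(c.nlu_ref, d.nlu_ref)  # pipe_a pipe_b -- each pipe gets its own component
```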
nlu/pipe/utils/resolution/nlu_ref_utils.py (8 changes: 6 additions & 2 deletions)
@@ -36,7 +36,7 @@ def parse_language_from_nlu_ref(nlu_ref):
    if nlu_ref[0:3] != 'xx.':
        nlu_reference = 'xx.' + nlu_ref
        logger.info(f'Setting lang as xx for nlu_ref={nlu_reference}')
-   if not lang :
+   if not lang:
        lang = 'en'
    logger.info(f'Parsed Nlu_ref={nlu_ref} as lang={lang}')

@@ -59,6 +59,10 @@ def nlu_ref_to_nlp_metadata(nlu_ref, is_recursive_call=False):
    nlp_ref = None
    license_type = Licenses.open_source
    is_pipe = False
+   if 'translate_to' in nlu_ref:
+       # We append here xx and set lang as xx so users don't have to specify it
+       nlu_ref = 'xx.' + nlu_ref
+       lang = 'xx'
    # 1. check if open source pipeline
    if lang in Spellbook.pretrained_pipe_references.keys():
        if nlu_ref in Spellbook.pretrained_pipe_references[lang].keys():
@@ -75,7 +79,7 @@
        sparknlp_data = Spellbook.component_alias_references[nlu_ref]
        nlp_ref = sparknlp_data[0]
        is_pipe = 'component_list' in sparknlp_data[1]
-       if len(sparknlp_data) == 3 :
+       if len(sparknlp_data) == 3:
            model_params = sparknlp_data[2]
    # 4. check if healthcare pipe
    if lang in Spellbook.pretrained_healthcare_pipe_references.keys():
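The new `translate_to` branch spares users the explicit `xx` (multilingual) prefix for translation references: the parser rewrites the ref before any Spellbook lookup. Assuming the usual NLU entry point, a usage sketch:

```python
import nlu

# 'translate_to' in the reference triggers the internal rewrite to
# 'xx.translate_to.de', so the multilingual prefix can be omitted.
pipe = nlu.load('translate_to.de')
print(pipe.predict('Hello world!'))
```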
@@ -3,7 +3,7 @@
from nlu.components import embeddings_chunker
from nlu.universe.feature_node_ids import NLP_NODE_IDS, NLP_HC_NODE_IDS
from nlu.universe.logic_universes import AnnoTypes
-from nlu.universe.component_universes import ComponentUniverse
+from nlu.universe.component_universes import ComponentUniverse, jsl_id_to_empty_component
from nlu.universe.universes import Licenses


@@ -42,7 +42,7 @@ def insert_chunk_embedder_to_pipe_if_missing(pipe):
            if c.name == NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL:
                ner_conveter_c = c

-   chunker = ComponentUniverse.os_components[NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER]
+   chunker = jsl_id_to_empty_component(NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER)
    chunker.set_metadata(
        chunker.get_default_model(),
        'chunker', 'chunker', 'xx', False, Licenses.open_source)