From 668e04dec56630d28e4850627cd9dfaeae6b294e Mon Sep 17 00:00:00 2001 From: C-K-Loan Date: Mon, 11 Apr 2022 03:12:12 +0200 Subject: [PATCH 1/4] Fixed bad spell mapping for translator spells --- nlu/pipe/utils/resolution/nlu_ref_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/nlu/pipe/utils/resolution/nlu_ref_utils.py b/nlu/pipe/utils/resolution/nlu_ref_utils.py index 9db193bf..172eb3cf 100644 --- a/nlu/pipe/utils/resolution/nlu_ref_utils.py +++ b/nlu/pipe/utils/resolution/nlu_ref_utils.py @@ -36,7 +36,7 @@ def parse_language_from_nlu_ref(nlu_ref): if nlu_ref[0:3] != 'xx.': nlu_reference = 'xx.' + nlu_ref logger.info(f'Setting lang as xx for nlu_ref={nlu_reference}') - if not lang : + if not lang: lang = 'en' logger.info(f'Parsed Nlu_ref={nlu_ref} as lang={lang}') @@ -59,6 +59,10 @@ def nlu_ref_to_nlp_metadata(nlu_ref, is_recursive_call=False): nlp_ref = None license_type = Licenses.open_source is_pipe = False + if 'translate_to' in nlu_ref: + # We append here xx and set lang as xx so users don't have to specify it + nlu_ref = 'xx.' + nlu_ref + lang = 'xx' # 1. check if open source pipeline if lang in Spellbook.pretrained_pipe_references.keys(): if nlu_ref in Spellbook.pretrained_pipe_references[lang].keys(): @@ -75,7 +79,7 @@ def nlu_ref_to_nlp_metadata(nlu_ref, is_recursive_call=False): sparknlp_data = Spellbook.component_alias_references[nlu_ref] nlp_ref = sparknlp_data[0] is_pipe = 'component_list' in sparknlp_data[1] - if len(sparknlp_data) == 3 : + if len(sparknlp_data) == 3: model_params = sparknlp_data[2] # 4. check if healthcare pipe if lang in Spellbook.pretrained_healthcare_pipe_references.keys(): From 6d39b17c101cf3db4a91f2c0f7d87cfc9a574e2d Mon Sep 17 00:00:00 2001 From: C-K-Loan Date: Mon, 11 Apr 2022 05:52:15 +0200 Subject: [PATCH 2/4] merged component universes and introduced partial accesor methods --- nlu/__init__.py | 2 +- nlu/pipe/component_resolution.py | 212 +- nlu/pipe/nlu_component.py | 7 +- nlu/pipe/pipe_logic.py | 6 +- nlu/pipe/utils/pipe_utils.py | 14 +- .../block_utils/entity_manifold_utils.py | 4 +- nlu/universe/annotator_class_universe.py | 16 +- nlu/universe/component_universes.py | 4032 +++++++++-------- nlu/universe/feature_resolutions.py | 34 +- nlu/universe/feature_universes.py | 1 - nlu/utils/environment/offline_load_utils.py | 6 +- 11 files changed, 2309 insertions(+), 2025 deletions(-) diff --git a/nlu/__init__.py b/nlu/__init__.py index 596392bc..ca42453a 100644 --- a/nlu/__init__.py +++ b/nlu/__init__.py @@ -203,7 +203,7 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline: if offline_utils.is_pipe(pipe_path): # language, nlp_ref, nlu_ref,path=None, is_licensed=False # todo deduct lang and if Licensed or not - pipe_components = construct_component_from_pipe_identifier('en', nlu_ref, nlu_ref, pipe_path, False) + pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False) # Resource in path is a single model elif offline_utils.is_model(pipe_path): c = offline_utils.verify_and_create_model(pipe_path) diff --git a/nlu/pipe/component_resolution.py b/nlu/pipe/component_resolution.py index 2ab3f80b..559d4131 100644 --- a/nlu/pipe/component_resolution.py +++ b/nlu/pipe/component_resolution.py @@ -2,7 +2,7 @@ Contains methods used to resolve a NLU reference to a NLU component_to_resolve. Handler for getting default components, etc. 
''' -from typing import Dict +from typing import Dict, List, Union, Optional from pyspark.ml import PipelineModel from sparknlp.pretrained import PretrainedPipeline, LightPipeline @@ -14,14 +14,16 @@ from nlu.spellbook import Spellbook from nlu.universe.annotator_class_universe import AnnoClassRef from nlu.universe.atoms import LicenseType -from nlu.universe.component_universes import ComponentUniverse +from nlu.universe.component_universes import ComponentUniverse, anno_class_to_empty_component from nlu.universe.feature_resolutions import FeatureResolutions +from nlu.universe.feature_universes import NLP_HC_FEATURES, OCR_FEATURES from nlu.universe.universes import Licenses logger = logging.getLogger('nlu') -def resolve_feature(missing_feature_type, language='en', is_licensed=False, +def resolve_feature(missing_feature_type: Union[NLP_HC_FEATURES, OCR_FEATURES, NLP_HC_FEATURES], language='en', + is_licensed=False, is_trainable_pipe=False) -> NluComponent: ''' This function returns a default component_to_resolve for a missing component_to_resolve type @@ -55,9 +57,9 @@ def resolve_feature(missing_feature_type, language='en', is_licensed=False, model_bucket = None else: raise ValueError(f"Could not resolve feature={missing_feature_type}") - nlu_component = feature_resolution.nlu_component # Substitution to keep lines short - # Either call get_pretrained(nlp_ref, lang,bucket) or get_default_model() to instantiate Annotator object + nlu_component = feature_resolution.nlu_component() # Call the partial and init the nlu component + # Either call get_pretrained(nlp_ref, lang,bucket) or get_default_model() to instantiate Annotator object if feature_resolution.get_pretrained: return nlu_component.set_metadata( nlu_component.get_pretrained_model(feature_resolution.nlp_ref, feature_resolution.language, @@ -86,27 +88,31 @@ def resolve_feature(missing_feature_type, language='en', is_licensed=False, else: raise ValueError( f"Could not resolve empty storage ref with default feature for missing feature = {missing_feature_type}") - nlu_component = feature_resolution.nlu_component # Substitution to keep lines short + nlu_component = feature_resolution.nlu_component() # Call the partial and init the nlu component return nlu_component.set_metadata( nlu_component.get_pretrained_model(feature_resolution.nlp_ref, feature_resolution.language, - model_bucket), feature_resolution.nlu_ref, - feature_resolution.nlp_ref, language, False, license_type) + model_bucket), + feature_resolution.nlu_ref, feature_resolution.nlp_ref, language, False, license_type) + # Actually resolve storage ref nlu_ref, nlp_ref, is_licensed, language = resolve_storage_ref(language, storage_ref, missing_feature_type) - anno_class_name = Spellbook.nlp_ref_to_anno_class[nlp_ref] - # All storage ref providers are defined in open source - os_annos = AnnoClassRef.get_os_pyclass_2_anno_id_dict() license_type = Licenses.hc if is_licensed else Licenses.open_source - model_bucket = 'clinical/models' if is_licensed else None - jsl_anno_id = os_annos[anno_class_name] - import copy - nlu_component = copy.copy(ComponentUniverse.os_components[jsl_anno_id]) - # We write storage ref to nlu_component, for the case of accumulated chunk and sentence embeddings. 
- # Anno Class has no storage ref in these cases, but it is still an embedding provider - return nlu_component.set_metadata(nlu_component.get_pretrained_model(nlp_ref, language, model_bucket), - nlu_ref, - nlp_ref, language, - False, license_type, storage_ref) + nlu_component = get_trained_component_for_nlp_model_ref(language, nlu_ref, nlp_ref, license_type) + return nlu_component + # anno_class_name = Spellbook.nlp_ref_to_anno_class[nlp_ref] + # # All storage ref providers are defined in open source + # os_annos = AnnoClassRef.get_os_pyclass_2_anno_id_dict() + # license_type = Licenses.hc if is_licensed else Licenses.open_source + # model_bucket = 'clinical/models' if is_licensed else None + # jsl_anno_id = os_annos[anno_class_name] + # import copy + # nlu_component = copy.copy(ComponentUniverse.os_components[jsl_anno_id]) + # # We write storage ref to nlu_component, for the case of accumulated chunk and sentence embeddings. + # # Anno Class has no storage ref in these cases, but it is still an embedding provider + # return nlu_component.set_metadata(nlu_component.get_pretrained_model(nlp_ref, language, model_bucket), + # nlu_ref, + # nlp_ref, language, + # False, license_type, storage_ref) def nlu_ref_to_component(nlu_ref, detect_lang=False, authenticated=False) -> NluComponent: @@ -140,53 +146,100 @@ def nlu_ref_to_component(nlu_ref, detect_lang=False, authenticated=False) -> Nlu raise ValueError(f'Could not find trainable model for nlu_ref={nlu_ref}.' f'Supported values = {s.join(nlu.Spellbook.trainable_models.keys())}') # TODO ,nlp ref for traianble? - return construct_trainable_component_from_identifier(nlu_ref) - lang, nlu_ref, nlp_ref, license_type, is_pipe,model_params = nlu_ref_to_nlp_metadata(nlu_ref) + return get_trainable_component_for_nlu_ref(nlu_ref) + lang, nlu_ref, nlp_ref, license_type, is_pipe, model_params = nlu_ref_to_nlp_metadata(nlu_ref) if is_pipe: - resolved_component = construct_component_from_pipe_identifier(lang, nlp_ref, nlu_ref, license_type=license_type) + resolved_component = get_trained_component_list_for_nlp_pipe_ref(lang, nlp_ref, nlu_ref, + license_type=license_type) else: - resolved_component = construct_component_from_identifier(lang, nlu_ref, nlp_ref, license_type,model_params) + resolved_component = get_trained_component_for_nlp_model_ref(lang, nlu_ref, nlp_ref, license_type, model_params) if resolved_component is None: raise ValueError(f"EXCEPTION: Could not create a component_to_resolve for nlu reference={nlu_ref}", ) return resolved_component -def construct_trainable_component_from_identifier(nlu_ref, nlp_ref='') -> NluComponent: - ''' - This method returns a Spark NLP annotator Approach class embelished by a NLU component_to_resolve - :param nlu_ref: nlu ref to the trainable model - :param nlp_ref: nlp ref to the trainable model - :return: trainable model as a NLU component_to_resolve - ''' - logger.info(f'Creating trainable NLU component_to_resolve for nlu_ref = {nlu_ref} ') - - if nlu_ref in Spellbook.traianble_nlu_ref_to_jsl_anno_id.keys(): +def get_trainable_component_for_nlu_ref(nlu_ref) -> NluComponent: + if nlu_ref in Spellbook.traianble_nlu_ref_to_jsl_anno_id: anno_id = Spellbook.traianble_nlu_ref_to_jsl_anno_id[nlu_ref] else: raise ValueError(f'Could not find trainable Model for nlu_spell ={nlu_ref}') + if anno_id in ComponentUniverse.components: + component = ComponentUniverse.components[anno_id]() + return component.set_metadata(component.get_trainable_model(), nlu_ref, '', 'xx', False, ) + else: + raise ValueError(f'Could not find 
trainable Model for anno_id ={anno_id}') - try: - if anno_id in ComponentUniverse.os_components.keys(): - nlu_component = ComponentUniverse.os_components[anno_id] - return nlu_component.set_metadata(nlu_component.get_trainable_model(), nlu_ref, nlp_ref, 'xx', False, - Licenses.open_source) - elif anno_id in ComponentUniverse.hc_components.keys(): - nlu_component = ComponentUniverse.hc_components[anno_id] - return nlu_component.set_metadata(nlu_component.get_trainable_model(), nlu_ref, nlp_ref, 'xx', False, - Licenses.hc) +def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path=None, + license_type: LicenseType = Licenses.open_source) -> List[NluComponent]: + """ + creates a list of components from a Spark NLP Pipeline reference + 1. download pipeline + 2. unpack pipeline to annotators and create list of nlu components + 3. return list of nlu components + :param license_type: Type of license for the component + :param nlu_ref: Nlu ref that points to this pipe + :param language: language of the pipeline + :param nlp_ref: Reference to a spark nlp pretrained pipeline + :param path: Load component_list from HDD + :return: Each element of the Spark NLP pipeline wrapped as a NLU component_to_resolve inside a list + """ + logger.info(f'Building pretrained pipe for nlu_ref={nlu_ref} nlp_ref={nlp_ref}') + if 'language' in nlp_ref: + # special edge case for lang detectors + language = 'xx' + if path is None: + if license_type != Licenses.open_source: + pipe = PretrainedPipeline(nlp_ref, lang=language, remote_loc='clinical/models') else: - raise ValueError(f'Could not find trainable Model for nlu_spell ={nlu_ref}') + pipe = PretrainedPipeline(nlp_ref, lang=language) + iterable_stages = pipe.light_model.pipeline_model.stages + else: + pipe = LightPipeline(PipelineModel.load(path=path)) + iterable_stages = pipe.pipeline_model.stages + constructed_components = [] + for jsl_anno_object in iterable_stages: + anno_class_name = type(jsl_anno_object).__name__ + logger.info(f"Building NLU component for class_name = {anno_class_name} ") + component = anno_class_to_empty_component(anno_class_name) + component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref, language, True, license_type) + constructed_components.append(component) + if None in constructed_components or len(constructed_components) == 0: + raise Exception(f"Failure inferring type anno_class={anno_class_name} ") + return ComponentUtils.set_storage_ref_attribute_of_embedding_converters( + PipeUtils.set_column_values_on_components_from_pretrained_pipe(constructed_components, nlp_ref, language, path)) - except Exception: # if reference is not in namespace and not a component_to_resolve it will cause a unrecoverable crash - ValueError( - f'EXCEPTION: Could not create trainable NLU component_to_resolve for nlu_ref = {nlu_ref} and nlp_ref = {nlp_ref}') +def get_trained_component_for_nlp_model_ref(lang: str, nlu_ref: Optional[str] = '', nlp_ref: str = '', + license_type: LicenseType = Licenses.open_source, + model_configs: Optional[Dict[str, any]] = None) -> NluComponent: + anno_class = Spellbook.nlp_ref_to_anno_class[nlp_ref] + component = anno_class_to_empty_component(anno_class) + model_bucket = 'clinical/models' if license_type != Licenses.open_source else None + try: + if component.get_pretrained_model: + component = component.set_metadata( + component.get_pretrained_model(nlp_ref, lang, model_bucket), + nlu_ref, nlp_ref, lang, False, license_type) + else: + component = component.set_metadata(component.get_default_model(), + 
nlu_ref, nlp_ref, lang, False, license_type) + if model_configs: + for method_name, parameter in model_configs.items(): + # Dynamically call method from provided name and value, to set parameters like T5 task + code = f'component.model.{method_name}({parameter})' + eval(code) + except Exception as e: + raise ValueError(f'Failure making component, nlp_ref={nlp_ref}, nlu_ref={nlu_ref}, lang={lang}, \n err={e}') + + return component -def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=None, - license_type: LicenseType = Licenses.open_source): # -> NLUPipeline + +########## OLD +def construct_component_from_pipe_identifier_OLD(language, nlp_ref, nlu_ref, path=None, + license_type: LicenseType = Licenses.open_source): # -> NLUPipeline """ creates a list of components from a Spark NLP Pipeline reference 1. download pipeline @@ -246,9 +299,40 @@ def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref, path=No PipeUtils.set_column_values_on_components_from_pretrained_pipe(constructed_components, nlp_ref, language, path)) -def construct_component_from_identifier(language: str, nlu_ref: str = '', nlp_ref: str = '', - license_type: LicenseType = Licenses.open_source, - model_configs: Dict[str, any] = {}) -> NluComponent: +def get_trainable_component_for_nlu_ref_OLD(nlu_ref) -> NluComponent: + ''' + This method returns a Spark NLP annotator Approach class embellished by a NLU component_to_resolve + :param nlu_ref: nlu ref to the trainable model + :return: trainable model as a NLU component_to_resolve + ''' + logger.info(f'Creating trainable NLU component_to_resolve for nlu_ref = {nlu_ref} ') + + if nlu_ref in Spellbook.traianble_nlu_ref_to_jsl_anno_id.keys(): + anno_id = Spellbook.traianble_nlu_ref_to_jsl_anno_id[nlu_ref] + else: + raise ValueError(f'Could not find trainable Model for nlu_spell ={nlu_ref}') + + try: + if anno_id in ComponentUniverse.os_components.keys(): + nlu_component = ComponentUniverse.os_components[anno_id] + return nlu_component.set_metadata(nlu_component.get_trainable_model(), nlu_ref, '', 'xx', False, + Licenses.open_source) + elif anno_id in ComponentUniverse.hc_components.keys(): + nlu_component = ComponentUniverse.hc_components[anno_id] + return nlu_component.set_metadata(nlu_component.get_trainable_model(), nlu_ref, '', 'xx', False, + Licenses.hc) + + else: + raise ValueError(f'Could not find trainable Model for nlu_spell ={nlu_ref}') + + except Exception: # if reference is not in namespace and not a component_to_resolve it will cause a unrecoverable crash + ValueError( + f'EXCEPTION: Could not create trainable NLU component_to_resolve for nlu_ref = {nlu_ref} ') + + +def get_trained_component_for_nlp_ref_OLD(language: str, nlu_ref: str = '', nlp_ref: str = '', + license_type: LicenseType = Licenses.open_source, + model_configs: Dict[str, any] = {}) -> NluComponent: ''' Creates a NLU component_to_resolve from a pretrained SparkNLP model reference or Class reference. 
First step to get the Root of the NLP DAG Class references will return default pretrained models @@ -258,17 +342,15 @@ def construct_component_from_identifier(language: str, nlu_ref: str = '', nlp_re :param license_type: Type of license for the component :return: Returns a new NLU component ''' + logger.info(f'Building sparknlp_ref={nlp_ref}, nlu_ref={nlu_ref},language={language} ') + if nlp_ref not in Spellbook.nlp_ref_to_anno_class: + raise ValueError('Invalid NLP ref for sparknlp_ref={nlp_ref}, nlu_ref={nlu_ref},language={language}') anno_class_name = Spellbook.nlp_ref_to_anno_class[nlp_ref] - os_annos = AnnoClassRef.get_os_pyclass_2_anno_id_dict() - hc_annos = AnnoClassRef.get_hc_pyclass_2_anno_id_dict() - ocr_annos = AnnoClassRef.get_ocr_pyclass_2_anno_id_dict() - logger.info( - f'Creating component_to_resolve, sparknlp_ref={nlp_ref}, nlu_ref={nlu_ref},language={language} ') model_bucket = 'clinical/models' if license_type != Licenses.open_source else None try: - if anno_class_name in os_annos.keys(): + if anno_class_name in AnnoClassRef.JSL_OS_py_class_2_anno_id: # Open Source - jsl_anno_id = os_annos[anno_class_name] + jsl_anno_id = AnnoClassRef.JSL_OS_py_class_2_anno_id[anno_class_name] nlu_component = ComponentUniverse.os_components[jsl_anno_id] if nlu_component.get_pretrained_model: component = nlu_component.set_metadata( @@ -282,9 +364,9 @@ def construct_component_from_identifier(language: str, nlu_ref: str = '', nlp_re language, False, Licenses.open_source) - elif anno_class_name in hc_annos.keys(): + elif anno_class_name in AnnoClassRef.JSL_HC_py_class_2_anno_id: # Licensed HC - jsl_anno_id = hc_annos[anno_class_name] + jsl_anno_id = AnnoClassRef.JSL_HC_py_class_2_anno_id[anno_class_name] nlu_component = ComponentUniverse.hc_components[jsl_anno_id] if nlu_component.get_pretrained_model: component = nlu_component.set_metadata( @@ -298,12 +380,10 @@ def construct_component_from_identifier(language: str, nlu_ref: str = '', nlp_re nlp_ref, language, False, Licenses.hc) - elif anno_class_name in ocr_annos.keys(): - # Licensed OCR (WIP) - jsl_anno_id = ocr_annos[anno_class_name] + elif anno_class_name in AnnoClassRef.JSL_OCR_py_class_2_anno_id: + jsl_anno_id = AnnoClassRef.JSL_OCR_py_class_2_anno_id[anno_class_name] nlu_component = ComponentUniverse.ocr_components[jsl_anno_id] if nlu_component.get_pretrained_model: - component = nlu_component.set_metadata(nlu_component.get_pretrained_model(nlp_ref, language, ), nlu_ref, nlp_ref, language, False, Licenses.ocr) diff --git a/nlu/pipe/nlu_component.py b/nlu/pipe/nlu_component.py index 026afcf0..3f0976cf 100644 --- a/nlu/pipe/nlu_component.py +++ b/nlu/pipe/nlu_component.py @@ -124,7 +124,7 @@ def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel] nlp_ref: str, language: LanguageIso, loaded_from_pretrained_pipe: bool, - license_type: LicenseType, + license_type: Optional[LicenseType], storage_ref: Optional[str] = None): """Write metadata to nlu component_to_resolve after constructing it """ self.model = jsl_anno_object @@ -132,7 +132,6 @@ def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel] self.nlp_ref = nlp_ref self.language = language self.loaded_from_pretrained_pipe = loaded_from_pretrained_pipe - self.license = license_type self.in_types = self.node.ins.copy() self.out_types = self.node.outs.copy() self.in_types_default = self.node.ins.copy() @@ -141,6 +140,8 @@ def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel] self.spark_output_column_names = 
self.out_types.copy() if storage_ref: self.storage_ref = storage_ref + if license_type: + self.license = license_type if nlp_ref == 'glove_840B_300' or nlp_ref == 'glove_6B_300': self.lang = 'xx' if hasattr(self.model, 'setIncludeConfidence'): @@ -154,7 +155,7 @@ def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel] self.is_trained = False from copy import copy - return copy(self) + return self def __str__(self): return f'Component(ID={self.name}, NLU_REF={self.nlu_ref} NLP_REF={self.nlp_ref})' diff --git a/nlu/pipe/pipe_logic.py b/nlu/pipe/pipe_logic.py index c6c0c347..f2f612b0 100644 --- a/nlu/pipe/pipe_logic.py +++ b/nlu/pipe/pipe_logic.py @@ -6,7 +6,7 @@ from nlu.universe.logic_universes import AnnoTypes from nlu import Licenses from nlu.universe.feature_universes import NLP_FEATURES -from nlu.universe.component_universes import ComponentUniverse +from nlu.universe.component_universes import ComponentUniverse, jsl_id_to_empty_component logger = logging.getLogger('nlu') from nlu.pipe.utils.pipe_utils import PipeUtils @@ -206,7 +206,7 @@ def add_sentence_embedding_converter(resolution_data: StorageRefConversionResolu """ logger.info(f'Adding Sentence embedding conversion for Embedding Provider={resolution_data}') word_embedding_provider = resolution_data.component_candidate - c = ComponentUniverse.os_components[NLP_NODE_IDS.SENTENCE_EMBEDDINGS_CONVERTER] + c = jsl_id_to_empty_component(NLP_NODE_IDS.SENTENCE_EMBEDDINGS_CONVERTER) storage_ref = StorageRefUtils.extract_storage_ref(word_embedding_provider) c.set_metadata(c.get_default_model(), 'sentence_embedding_converter', NLP_NODE_IDS.SENTENCE_EMBEDDINGS_CONVERTER, 'xx', False, Licenses.open_source, storage_ref) @@ -232,7 +232,7 @@ def add_chunk_embedding_converter( entities_col = 'entities' embed_provider_col = word_embedding_provider.info.spark_output_column_names[0] - c = ComponentUniverse.os_components[NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER] + c = jsl_id_to_empty_component(NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER) c.set_metadata(c.get_default_model(), NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER, NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER, 'xx', diff --git a/nlu/pipe/utils/pipe_utils.py b/nlu/pipe/utils/pipe_utils.py index 12f32d11..99d756b1 100644 --- a/nlu/pipe/utils/pipe_utils.py +++ b/nlu/pipe/utils/pipe_utils.py @@ -6,7 +6,7 @@ from nlu import Licenses from nlu.pipe.nlu_component import NluComponent from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils -from nlu.universe.component_universes import ComponentUniverse +from nlu.universe.component_universes import ComponentUniverse, jsl_id_to_empty_component from nlu.universe.feature_node_ids import NLP_NODE_IDS, NLP_HC_NODE_IDS, OCR_NODE_IDS from nlu.universe.feature_universes import NLP_FEATURES from nlu.universe.logic_universes import NLP_LEVELS, AnnoTypes @@ -223,14 +223,14 @@ def enforce_AT_schema_on_NER_processors_and_add_missing_NER_converters(pipe): if converter_to_update is None: if c.license == Licenses.hc: # TODO SET METADATA FIELDS HERE ON ANNO!! - converter_to_update = ComponentUniverse.hc_components[NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL] + converter_to_update = jsl_id_to_empty_component(NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL) converter_to_update.set_metadata(converter_to_update.get_default_model(), NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL, NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL, 'xx', False, Licenses.hc) else: # TODO SET METADATA FIELDS HERE ON ANNO!! 
- converter_to_update = ComponentUniverse.os_components[NLP_NODE_IDS.NER_CONVERTER] + converter_to_update = jsl_id_to_empty_component(NLP_NODE_IDS.NER_CONVERTER) converter_to_update.set_metadata(converter_to_update.get_default_model(), NLP_NODE_IDS.NER_CONVERTER, NLP_NODE_IDS.NER_CONVERTER, 'xx', False, Licenses.open_source) @@ -462,7 +462,7 @@ def configure_component_output_levels(pipe, new_output_level=''): if not PipeUtils.has_document_assembler(pipe): # When loaded from OCR, we might not have a documentAssembler in pipe pipe.is_fitted = False - document_assembler = ComponentUniverse.os_components[NLP_NODE_IDS.DOCUMENT_ASSEMBLER] + document_assembler = ComponentUniverse.components[NLP_NODE_IDS.DOCUMENT_ASSEMBLER]() document_assembler.set_metadata(document_assembler.get_default_model(), 'document_assembler', 'document_assembler', 'xx', False, Licenses.open_source) pipe.components.insert(0, document_assembler) @@ -471,7 +471,7 @@ def configure_component_output_levels(pipe, new_output_level=''): if not PipeUtils.has_sentence_detector(pipe): logger.info("Adding missing Sentence Detector") pipe.is_fitted = False - sentence_detector = ComponentUniverse.os_components[NLP_NODE_IDS.SENTENCE_DETECTOR_DL] + sentence_detector = ComponentUniverse.components[NLP_NODE_IDS.SENTENCE_DETECTOR_DL]() sentence_detector.set_metadata(sentence_detector.get_default_model(), 'detect_sentence', 'sentence_detector_dl', 'en', False, Licenses.open_source) insert_idx = PipeUtils.find_doc_assembler_idx_in_pipe(pipe) @@ -657,7 +657,7 @@ def replace_untrained_component_with_trained(nlu_pipe, spark_transformer_pipe): untrained_class_name = AnnoClassRef.JSL_anno2_py_class[trainable_c.jsl_anno_class_id] trained_model = PipeUtils.get_model_of_class_from_spark_pipe(spark_transformer_pipe, trained_class_name) - trained_component = ComponentUniverse.os_components[trainable_c.trained_mirror_anno].set_metadata( + trained_component = jsl_id_to_empty_component(trainable_c.trained_mirror_anno).set_metadata( trained_model, trainable_c.trained_mirror_anno, trainable_c.trained_mirror_anno, nlu_pipe.lang, False, Licenses.open_source) @@ -666,7 +666,7 @@ def replace_untrained_component_with_trained(nlu_pipe, spark_transformer_pipe): untrained_class_name = AnnoClassRef.JSL_anno_HC_ref_2_py_class[trainable_c.jsl_anno_class_id] trained_model = PipeUtils.get_model_of_class_from_spark_pipe(spark_transformer_pipe, trained_class_name) - trained_component = ComponentUniverse.hc_components[trainable_c.trained_mirror_anno].set_metadata( + trained_component = jsl_id_to_empty_component(trainable_c.trained_mirror_anno).set_metadata( trained_model, trainable_c.trained_mirror_anno, trainable_c.trained_mirror_anno, nlu_pipe.lang, False, Licenses.hc) diff --git a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/block_utils/entity_manifold_utils.py b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/block_utils/entity_manifold_utils.py index fbeb679b..7e480f3c 100644 --- a/nlu/pipe/viz/streamlit_viz/viz_building_blocks/block_utils/entity_manifold_utils.py +++ b/nlu/pipe/viz/streamlit_viz/viz_building_blocks/block_utils/entity_manifold_utils.py @@ -3,7 +3,7 @@ from nlu.components import embeddings_chunker from nlu.universe.feature_node_ids import NLP_NODE_IDS, NLP_HC_NODE_IDS from nlu.universe.logic_universes import AnnoTypes -from nlu.universe.component_universes import ComponentUniverse +from nlu.universe.component_universes import ComponentUniverse, jsl_id_to_empty_component from nlu.universe.universes import Licenses @@ -42,7 +42,7 @@ def 
insert_chunk_embedder_to_pipe_if_missing(pipe): if c.name == NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL: ner_conveter_c = c - chunker = ComponentUniverse.os_components[NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER] + chunker = jsl_id_to_empty_component(NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER) chunker.set_metadata( chunker.get_default_model(), 'chunker', 'chunker', 'xx', False, Licenses.open_source) diff --git a/nlu/universe/annotator_class_universe.py b/nlu/universe/annotator_class_universe.py index 82110d72..e463feeb 100644 --- a/nlu/universe/annotator_class_universe.py +++ b/nlu/universe/annotator_class_universe.py @@ -1,5 +1,6 @@ from typing import Dict -from nlu.universe.atoms import JslAnnoId, JslAnnoPyClass, JslAnnoJavaClass + +from nlu.universe.atoms import JslAnnoId, JslAnnoPyClass from nlu.universe.feature_node_ids import OCR_NODE_IDS, NLP_NODE_IDS, NLP_HC_NODE_IDS @@ -10,7 +11,7 @@ class AnnoClassRef: A_H = None # NLP_HC_ANNO A_N = NLP_NODE_IDS HC_A_N = NLP_HC_NODE_IDS - + # Map AnnoID to PyCLass JSL_anno2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = { A_N.BIG_TEXT_MATCHER: 'BigTextMatcher', @@ -171,7 +172,6 @@ class AnnoClassRef: HC_A_N.ENTITY_CHUNK_EMBEDDING: 'EntityChunkEmbeddings', } - JSL_anno_OCR_ref_2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = { OCR_NODE_IDS.IMAGE2TEXT: 'ImageToText', OCR_NODE_IDS.PDF2TEXT: 'PdfToText', @@ -183,6 +183,8 @@ class AnnoClassRef: } + + @staticmethod def get_os_pyclass_2_anno_id_dict(): # Flipped, maps PyClass to AnnoID @@ -204,3 +206,11 @@ def get_ocr_pyclass_2_anno_id_dict(): for k in AnnoClassRef.JSL_anno_OCR_ref_2_py_class} return JSL_OCR_py_class_2_anno_id + + +# Flipped, maps PyClass to AnnoID +AnnoClassRef.JSL_OS_py_class_2_anno_id: Dict[JslAnnoPyClass, JslAnnoId] = {AnnoClassRef.JSL_anno2_py_class[k]: k for k in AnnoClassRef.JSL_anno2_py_class} +AnnoClassRef.JSL_HC_py_class_2_anno_id: Dict[JslAnnoId, JslAnnoPyClass] = {AnnoClassRef.JSL_anno_HC_ref_2_py_class[k]: k for k in + AnnoClassRef.JSL_anno_HC_ref_2_py_class} +AnnoClassRef.JSL_OCR_py_class_2_anno_id: Dict[JslAnnoId, JslAnnoPyClass] = {AnnoClassRef.JSL_anno_OCR_ref_2_py_class[k]: k for k in + AnnoClassRef.JSL_anno_OCR_ref_2_py_class} \ No newline at end of file diff --git a/nlu/universe/component_universes.py b/nlu/universe/component_universes.py index 55484906..9bea5251 100644 --- a/nlu/universe/component_universes.py +++ b/nlu/universe/component_universes.py @@ -1,118 +1,189 @@ +from nlu.components.assertions.assertion_dl.assertion_dl import AssertionDL +from nlu.components.assertions.assertion_log_reg.assertion_log_reg import AssertionLogReg +from nlu.components.chunkers.contextual_parser.contextual_parser import ContextualParser +from nlu.components.chunkers.default_chunker.default_chunker import DefaultChunker +from nlu.components.chunkers.ngram.ngram import NGram +from nlu.components.classifiers.classifier_dl.classifier_dl import ClassifierDl +from nlu.components.classifiers.generic_classifier.generic_classifier import GenericClassifier +from nlu.components.classifiers.language_detector.language_detector import LanguageDetector +from nlu.components.classifiers.multi_classifier.multi_classifier import MultiClassifier +from nlu.components.classifiers.named_entity_recognizer_crf.ner_crf import NERDLCRF +from nlu.components.classifiers.ner.ner_dl import NERDL +from nlu.components.classifiers.ner_healthcare.ner_dl_healthcare import NERDLHealthcare +from nlu.components.classifiers.pos.part_of_speech_jsl import PartOfSpeechJsl +from 
nlu.components.classifiers.sentiment_detector.sentiment_detector import Sentiment +from nlu.components.classifiers.sentiment_dl.sentiment_dl import SentimentDl from nlu.components.classifiers.seq_albert.seq_albert import SeqAlbertClassifier +from nlu.components.classifiers.seq_bert.seq_bert_classifier import SeqBertClassifier from nlu.components.classifiers.seq_bert_medical.seq_bert_medical_classifier import SeqBertMedicalClassifier +from nlu.components.classifiers.seq_distilbert.seq_distilbert_classifier import SeqDilstilBertClassifier from nlu.components.classifiers.seq_distilbert_medical.seq_distilbert_medical_classifier import \ SeqDilstilBertMedicalClassifier from nlu.components.classifiers.seq_longformer.seq_longformer import SeqLongformerClassifier from nlu.components.classifiers.seq_roberta.seq_roberta import SeqRobertaClassifier from nlu.components.classifiers.seq_xlm_roberta.seq_xlm_roberta import SeqXlmRobertaClassifier from nlu.components.classifiers.seq_xlnet.seq_xlnet import SeqXlnetClassifier +from nlu.components.classifiers.token_albert.token_albert import TokenAlbert +from nlu.components.classifiers.token_bert.token_bert import TokenBert from nlu.components.classifiers.token_bert_healthcare.token_bert_healthcare import TokenBertHealthcare -from nlu.components.embeddings.deberta.deberta import Deberta -from nlu.components.embeddings.roberta.roberta import Roberta -from nlu.components.embeddings.word2vec.word2vec import Word2Vec -from nlu.components.embeddings_chunks.chunk_embedder.chunk_embedder import ChunkEmbedder -from nlu.components.classifiers.ner.ner_dl import NERDL -from nlu.components.chunkers.ngram.ngram import NGram -from nlu.components.classifiers.classifier_dl.classifier_dl import ClassifierDl -from nlu.components.relation_extractors.relation_extractor_dl.relation_extractor_dl import RelationExtractionDL -from nlu.components.seq2seqs.gpt2.gpt2 import GPT2 -from nlu.ocr_components.table_extractors.doc_table_extractor.doc2table import Doc2TextTable -from nlu.ocr_components.table_extractors.ppt_table_extractor.ppt2table import PPT2TextTable -from nlu.ocr_components.text_recognizers.doc2text.doc2text import Doc2Text -from nlu.ocr_components.text_recognizers.img2text.img2text import Img2Text -from nlu.ocr_components.text_recognizers.pdf2text.pdf2text import Pdf2Text - -from nlu.ocr_components.table_extractors.pdf_table_extractor.pdf2table import PDF2TextTable -from nlu.ocr_components.utils.binary2image.binary2image import Binary2Image -from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols -from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config -from nlu.universe.universes import Licenses, ComputeContexts -from nlu.universe.feature_universes import NLP_FEATURES -from nlu.universe.annotator_class_universe import AnnoClassRef -from nlu.pipe.nlu_component import NluComponent -from nlu.components.embeddings.distil_bert.distilbert import DistilBert -from nlu.components.embeddings.xlm.xlm import XLM -from nlu.components.embeddings.longformer.longformer import Longformer -from nlu.components.utils.sentence_embeddings.spark_nlp_sentence_embedding import SparkNLPSentenceEmbeddings -from nlu.components.dependency_untypeds.unlabeled_dependency_parser.unlabeled_dependency_parser import \ - UnlabeledDependencyParser +from nlu.components.classifiers.token_distilbert.token_distilbert import TokenDistilBert +from nlu.components.classifiers.token_longformer.token_longformer import 
TokenLongFormer +from nlu.components.classifiers.token_roberta.token_roberta import TokenRoBerta +from nlu.components.classifiers.token_xlm_roberta.token_xlmroberta import TokenXlmRoBerta +from nlu.components.classifiers.token_xlnet.token_xlnet import TokenXlnet +from nlu.components.classifiers.vivekn_sentiment.vivekn_sentiment_detector import ViveknSentiment +from nlu.components.classifiers.yake.yake import Yake +from nlu.components.deidentifiers.deidentifier.deidentifier import Deidentifier from nlu.components.dependency_typeds.labeled_dependency_parser.labeled_dependency_parser import \ LabeledDependencyParser -from nlu.components.utils.document_assembler.spark_nlp_document_assembler import SparkNlpDocumentAssembler -from nlu.components.utils.ner_to_chunk_converter.ner_to_chunk_converter import NerToChunkConverter -from nlu.components.sentence_detectors.pragmatic_sentence_detector.sentence_detector import PragmaticSentenceDetector -from nlu.components.sentence_detectors.deep_sentence_detector.deep_sentence_detector import SentenceDetectorDeep +from nlu.components.dependency_untypeds.unlabeled_dependency_parser.unlabeled_dependency_parser import \ + UnlabeledDependencyParser from nlu.components.embeddings.albert.spark_nlp_albert import SparkNLPAlbert -from nlu.components.embeddings.sentence_bert.BertSentenceEmbedding import BertSentence -from nlu.components.embeddings.doc2vec.doc2vec import Doc2Vec from nlu.components.embeddings.bert.spark_nlp_bert import SparkNLPBert +from nlu.components.embeddings.deberta.deberta import Deberta +from nlu.components.embeddings.distil_bert.distilbert import DistilBert +from nlu.components.embeddings.doc2vec.doc2vec import Doc2Vec from nlu.components.embeddings.elmo.spark_nlp_elmo import SparkNLPElmo -from nlu.components.embeddings.xlnet.spark_nlp_xlnet import SparkNLPXlnet -from nlu.components.embeddings.use.spark_nlp_use import SparkNLPUse from nlu.components.embeddings.glove.glove import Glove -from nlu.components.classifiers.multi_classifier.multi_classifier import MultiClassifier -from nlu.components.classifiers.yake.yake import Yake -from nlu.components.classifiers.language_detector.language_detector import LanguageDetector -from nlu.components.classifiers.named_entity_recognizer_crf.ner_crf import NERDLCRF -from nlu.components.classifiers.sentiment_dl.sentiment_dl import SentimentDl -from nlu.components.classifiers.vivekn_sentiment.vivekn_sentiment_detector import ViveknSentiment -from nlu.components.classifiers.pos.part_of_speech_jsl import PartOfSpeechJsl -from nlu.components.classifiers.seq_bert.seq_bert_classifier import SeqBertClassifier -from nlu.components.classifiers.seq_distilbert.seq_distilbert_classifier import SeqDilstilBertClassifier -from nlu.components.classifiers.token_bert.token_bert import TokenBert -from nlu.components.classifiers.token_distilbert.token_distilbert import TokenDistilBert -from nlu.components.classifiers.token_albert.token_albert import TokenAlbert -from nlu.components.classifiers.token_roberta.token_roberta import TokenRoBerta -from nlu.components.classifiers.token_xlm_roberta.token_xlmroberta import TokenXlmRoBerta -from nlu.components.classifiers.token_longformer.token_longformer import TokenLongFormer -from nlu.components.classifiers.token_xlnet.token_xlnet import TokenXlnet +from nlu.components.embeddings.longformer.longformer import Longformer +from nlu.components.embeddings.roberta.roberta import Roberta +from nlu.components.embeddings.sentence_bert.BertSentenceEmbedding import BertSentence from 
nlu.components.embeddings.sentence_xlm.sentence_xlm import Sentence_XLM -from nlu.components.stemmers.stemmer.spark_nlp_stemmer import SparkNLPStemmer -from nlu.components.normalizers.normalizer.spark_nlp_normalizer import SparkNLPNormalizer -from nlu.components.normalizers.document_normalizer.spark_nlp_document_normalizer import SparkNLPDocumentNormalizer +from nlu.components.embeddings.use.spark_nlp_use import SparkNLPUse +from nlu.components.embeddings.word2vec.word2vec import Word2Vec +from nlu.components.embeddings.xlm.xlm import XLM +from nlu.components.embeddings.xlnet.spark_nlp_xlnet import SparkNLPXlnet +from nlu.components.embeddings_chunks.chunk_embedder.chunk_embedder import ChunkEmbedder from nlu.components.lemmatizers.lemmatizer.spark_nlp_lemmatizer import SparkNLPLemmatizer -from nlu.components.stopwordscleaners.stopwordcleaner.nlustopwordcleaner import NLUStopWordcleaner -from nlu.components.spell_checkers.norvig_spell.norvig_spell_checker import NorvigSpellChecker +from nlu.components.normalizers.document_normalizer.spark_nlp_document_normalizer import SparkNLPDocumentNormalizer +from nlu.components.normalizers.drug_normalizer.drug_normalizer import DrugNorm +from nlu.components.normalizers.normalizer.spark_nlp_normalizer import SparkNLPNormalizer +from nlu.components.relation_extractors.relation_extractor.relation_extractor import RelationExtraction +from nlu.components.relation_extractors.relation_extractor_dl.relation_extractor_dl import RelationExtractionDL +from nlu.components.resolutions.sentence_entity_resolver.sentence_resolver import SentenceResolver +from nlu.components.sentence_detectors.deep_sentence_detector.deep_sentence_detector import SentenceDetectorDeep +from nlu.components.sentence_detectors.pragmatic_sentence_detector.sentence_detector import PragmaticSentenceDetector +from nlu.components.seq2seqs.gpt2.gpt2 import GPT2 +from nlu.components.seq2seqs.marian.marian import Marian +from nlu.components.seq2seqs.t5.t5 import T5 from nlu.components.spell_checkers.context_spell.context_spell_checker import ContextSpellChecker +from nlu.components.spell_checkers.norvig_spell.norvig_spell_checker import NorvigSpellChecker from nlu.components.spell_checkers.symmetric_spell.symmetric_spell_checker import SymmetricSpellChecker +from nlu.components.stemmers.stemmer.spark_nlp_stemmer import SparkNLPStemmer +from nlu.components.stopwordscleaners.stopwordcleaner.nlustopwordcleaner import NLUStopWordcleaner from nlu.components.tokenizers.default_tokenizer.default_tokenizer import DefaultTokenizer -from nlu.components.tokenizers.word_segmenter.word_segmenter import WordSegmenter -from nlu.components.chunkers.default_chunker.default_chunker import DefaultChunker -from nlu.components.seq2seqs.marian.marian import Marian -from nlu.components.seq2seqs.t5.t5 import T5 -from nlu.components.classifiers.sentiment_detector.sentiment_detector import Sentiment -from nlu.universe.logic_universes import NLP_LEVELS, AnnoTypes from nlu.components.tokenizers.regex_tokenizer.regex_tokenizer import RegexTokenizer +from nlu.components.tokenizers.word_segmenter.word_segmenter import WordSegmenter from nlu.components.utils.chunk_2_doc.doc_2_chunk import Chunk_2_Doc from nlu.components.utils.doc2chunk.doc_2_chunk import Doc_2_Chunk -from nlu.components.assertions.assertion_dl.assertion_dl import AssertionDL -from nlu.components.assertions.assertion_log_reg.assertion_log_reg import AssertionLogReg -from nlu.components.chunkers.contextual_parser.contextual_parser import ContextualParser -from 
nlu.components.classifiers.generic_classifier.generic_classifier import GenericClassifier -from nlu.components.classifiers.ner_healthcare.ner_dl_healthcare import NERDLHealthcare -from nlu.universe.annotator_class_universe import AnnoClassRef -from nlu.universe.universes import ComponentBackends, ComputeContexts -from nlu.universe.logic_universes import NLP_LEVELS, AnnoTypes -from nlu.components.deidentifiers.deidentifier.deidentifier import Deidentifier -from nlu.components.normalizers.drug_normalizer.drug_normalizer import DrugNorm -from nlu.components.relation_extractors.relation_extractor.relation_extractor import RelationExtraction -from nlu.components.resolutions.sentence_entity_resolver.sentence_resolver import SentenceResolver +from nlu.components.utils.document_assembler.spark_nlp_document_assembler import SparkNlpDocumentAssembler +from nlu.components.utils.ner_to_chunk_converter.ner_to_chunk_converter import NerToChunkConverter from nlu.components.utils.ner_to_chunk_converter_licensed.ner_to_chunk_converter_licensed import \ NerToChunkConverterLicensed +from nlu.components.utils.sentence_embeddings.spark_nlp_sentence_embedding import SparkNLPSentenceEmbeddings +from nlu.ocr_components.table_extractors.doc_table_extractor.doc2table import Doc2TextTable +from nlu.ocr_components.table_extractors.pdf_table_extractor.pdf2table import PDF2TextTable +from nlu.ocr_components.table_extractors.ppt_table_extractor.ppt2table import PPT2TextTable +from nlu.ocr_components.text_recognizers.doc2text.doc2text import Doc2Text +from nlu.ocr_components.text_recognizers.img2text.img2text import Img2Text +from nlu.ocr_components.text_recognizers.pdf2text.pdf2text import Pdf2Text +from nlu.ocr_components.utils.binary2image.binary2image import Binary2Image from nlu.pipe.col_substitution.col_substitution_HC import * +from nlu.pipe.col_substitution.col_substitution_OCR import substitute_recognized_text_cols from nlu.pipe.col_substitution.col_substitution_OS import * from nlu.pipe.extractors.extractor_configs_HC import * +from nlu.pipe.extractors.extractor_configs_HC import default_full_config +from nlu.pipe.extractors.extractor_configs_OCR import default_text_recognizer_config, default_binary_to_image_config from nlu.pipe.extractors.extractor_configs_OS import * -from nlu.universe.feature_node_ids import NLP_NODE_IDS, NLP_HC_NODE_IDS, OCR_NODE_IDS -from nlu.universe.feature_node_universes import NLP_HC_FEATURE_NODES, OCR_FEATURE_NODES -from nlu.universe.feature_universes import NLP_FEATURES, OCR_FEATURES, NLP_HC_FEATURES from nlu.pipe.nlu_component import NluComponent -from nlu.universe.universes import Licenses, ComputeContexts -from nlu.pipe.extractors.extractor_configs_HC import default_full_config +from nlu.universe.annotator_class_universe import AnnoClassRef +from nlu.universe.atoms import JslAnnoId, LicenseType, JslAnnoPyClass from nlu.universe.feature_node_ids import NLP_NODE_IDS, NLP_HC_NODE_IDS +from nlu.universe.feature_node_ids import OCR_NODE_IDS from nlu.universe.feature_node_universes import NLP_FEATURE_NODES +from nlu.universe.feature_node_universes import NLP_HC_FEATURE_NODES, OCR_FEATURE_NODES +from nlu.universe.feature_universes import NLP_FEATURES +from nlu.universe.logic_universes import NLP_LEVELS, AnnoTypes from nlu.universe.universes import ComponentBackends -from copy import copy +from nlu.universe.universes import Licenses, ComputeContexts + + +def anno_class_to_empty_component(anno_class) -> NluComponent: + """ + For a given anno-class returns NLU-Component which wraps the 
corresponding annotator class
+    but has no model loaded yet
+    :param anno_class: annotator class name to find a compatible NLU component for
+    :return: NluComponent which can load anno_class models
+    """
+    jsl_anno_id = anno_class_to_jsl_id(anno_class)
+    if jsl_anno_id not in ComponentUniverse.components:
+        raise ValueError(f'Invalid JSL-Anno-ID={jsl_anno_id}')
+    component = ComponentUniverse.components[jsl_anno_id]()
+    return component
+
+
+def jsl_id_to_empty_component(jsl_id) -> NluComponent:
+    """
+    Get NLU component with given JSL-ID with no model loaded onto it
+    :param jsl_id: identifier of component/annotator type
+    :return: NluComponent for jsl_id
+    """
+    return anno_class_to_empty_component(jsl_id_to_anno_class(jsl_id))
+
+
+def jsl_id_to_anno_class(jsl_id) -> JslAnnoPyClass:
+    """Returns anno_class name for jsl_id
+    :param jsl_id: JSL-Anno-ID of the annotator
+    :return: JslAnnoPyClass, class name of the annotator
+    """
+    if jsl_id in AnnoClassRef.JSL_anno2_py_class:
+        anno_class = AnnoClassRef.JSL_anno2_py_class[jsl_id]
+    elif jsl_id in AnnoClassRef.JSL_anno_HC_ref_2_py_class:
+        anno_class = AnnoClassRef.JSL_anno_HC_ref_2_py_class[jsl_id]
+    elif jsl_id in AnnoClassRef.JSL_anno_OCR_ref_2_py_class:
+        anno_class = AnnoClassRef.JSL_anno_OCR_ref_2_py_class[jsl_id]
+    else:
+        raise ValueError(f'Cannot find anno_class for jsl-id={jsl_id}')
+    return anno_class
+
+
+def anno_class_to_jsl_id(anno_class) -> JslAnnoId:
+    """Returns the JSL-Anno-ID for a given anno_class name.
+    Note that an anno which maps to a component with a default OS license
+    may still load a HC model; the NLU component must then be updated to the HC license.
+    :param anno_class: class name of the annotator
+    :return: JslAnnoId of the anno class
+    """
+    if anno_class in AnnoClassRef.get_os_pyclass_2_anno_id_dict():
+        jsl_anno_id = AnnoClassRef.get_os_pyclass_2_anno_id_dict()[anno_class]
+    elif anno_class in AnnoClassRef.get_hc_pyclass_2_anno_id_dict():
+        jsl_anno_id = AnnoClassRef.get_hc_pyclass_2_anno_id_dict()[anno_class]
+    elif anno_class in AnnoClassRef.get_ocr_pyclass_2_anno_id_dict():
+        jsl_anno_id = AnnoClassRef.get_ocr_pyclass_2_anno_id_dict()[anno_class]
+    else:
+        raise ValueError(f'Cannot get class metadata for invalid anno_class={anno_class}')
+    return jsl_anno_id
+
+
+def get_anno_class_metadata(anno_class) -> Tuple[JslAnnoId, LicenseType]:
+    """Returns the JSL-Anno-ID and default license type for a given anno_class name.
+ Note that an anno which maps to a component with default OS_license, + may load a HC model and nlu component must be updated to HC license then + :param anno_class: class name of the annotator + :return: Tuple, first entry JslAnnoID, second entry Default LicenseType + """ + if anno_class in AnnoClassRef.JSL_OS_py_class_2_anno_id: + jsl_anno_id = AnnoClassRef.JSL_OS_py_class_2_anno_id[anno_class] + license_type = Licenses.open_source + elif anno_class in AnnoClassRef.JSL_HC_py_class_2_anno_id: + jsl_anno_id = AnnoClassRef.JSL_HC_py_class_2_anno_id[anno_class] + license_type = Licenses.open_source + elif anno_class in AnnoClassRef.JSL_OCR_py_class_2_anno_id: + jsl_anno_id = AnnoClassRef.JSL_OCR_py_class_2_anno_id[anno_class] + license_type = Licenses.open_source + else: + raise ValueError(f'Cannot get class metadata for invalid anno_class={anno_class}') + return jsl_anno_id, license_type class ComponentUniverse: @@ -124,1079 +195,1148 @@ class ComponentUniverse: F = NLP_FEATURES L = NLP_LEVELS ACR = AnnoClassRef - os_components = { - A.CHUNK2DOC: copy(NluComponent( - name=A.CHUNK2DOC, - type=T.HELPER_ANNO, - get_default_model=Chunk_2_Doc.get_default_model, - pdf_extractor_methods={'default_full': default_full_config, }, # 'default': '', TODO no extractor - pdf_col_name_substitutor=substitute_doc2chunk_cols, - output_level=L.DOCUMENT, - node=NLP_FEATURE_NODES.nodes[A.CHUNK2DOC], - description='TODO', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.CHUNK2DOC, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.CHUNK2DOC], + # os_components = {} + # hc_components = {} + # ocr_components = {} + components = { + A.CHUNK2DOC: partial(NluComponent, + name=A.CHUNK2DOC, + type=T.HELPER_ANNO, + get_default_model=Chunk_2_Doc.get_default_model, + pdf_extractor_methods={'default_full': default_full_config, }, + # 'default': '', TODO no extractor + pdf_col_name_substitutor=substitute_doc2chunk_cols, + output_level=L.DOCUMENT, + node=NLP_FEATURE_NODES.nodes[A.CHUNK2DOC], + description='TODO', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.CHUNK2DOC, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.CHUNK2DOC], - )), - A.CHUNK_EMBEDDINGS_CONVERTER: copy(NluComponent( - name=A.CHUNK_EMBEDDINGS_CONVERTER, - type=T.HELPER_ANNO, - get_default_model=ChunkEmbedder.get_default_model, - pdf_extractor_methods={'default': default_chunk_embedding_config, 'default_full': default_full_config, }, - # TODO no extractor - pdf_col_name_substitutor=substitute_chunk_embed_cols, - output_level=L.CHUNK, - node=NLP_FEATURE_NODES.nodes[A.CHUNK_EMBEDDINGS_CONVERTER], - description='Convert Chunks to Doc type col', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.CHUNK_EMBEDDINGS_CONVERTER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.CHUNK_EMBEDDINGS_CONVERTER], - is_storage_ref_producer=True, - has_storage_ref=True, - )), + ), + A.CHUNK_EMBEDDINGS_CONVERTER: partial(NluComponent, + name=A.CHUNK_EMBEDDINGS_CONVERTER, + type=T.HELPER_ANNO, + get_default_model=ChunkEmbedder.get_default_model, + pdf_extractor_methods={'default': default_chunk_embedding_config, + 'default_full': default_full_config, }, + # TODO no extractor + 
pdf_col_name_substitutor=substitute_chunk_embed_cols, + output_level=L.CHUNK, + node=NLP_FEATURE_NODES.nodes[A.CHUNK_EMBEDDINGS_CONVERTER], + description='Convert Chunks to Doc type col', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.CHUNK_EMBEDDINGS_CONVERTER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.CHUNK_EMBEDDINGS_CONVERTER], + is_storage_ref_producer=True, + has_storage_ref=True, + ), A.CHUNK_TOKENIZER: 'TODO NOT INTEGRATED', - A.CHUNKER: copy(NluComponent( - name=A.CHUNKER, - type=T.CHUNK_CLASSIFIER, - get_default_model=DefaultChunker.get_default_model, - pdf_extractor_methods={'default': default_chunk_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_chunk_cols, - output_level=L.CHUNK, - node=NLP_FEATURE_NODES.nodes[A.CHUNKER], - description='Regex matcher that matches patters defined by part-of-speech (POS) tags', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.CHUNKER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.CHUNKER], + A.CHUNKER: partial(NluComponent, + name=A.CHUNKER, + type=T.CHUNK_CLASSIFIER, + get_default_model=DefaultChunker.get_default_model, + pdf_extractor_methods={'default': default_chunk_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_chunk_cols, + output_level=L.CHUNK, + node=NLP_FEATURE_NODES.nodes[A.CHUNKER], + description='Regex matcher that matches patters defined by part-of-speech (POS) tags', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.CHUNKER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.CHUNKER], - )), - A.CLASSIFIER_DL: copy(NluComponent( - name=A.CLASSIFIER_DL, - type=T.DOCUMENT_CLASSIFIER, - get_default_model=ClassifierDl.get_default_model, - get_pretrained_model=ClassifierDl.get_pretrained_model, - get_trainable_model=ClassifierDl.get_trainable_model, - pdf_extractor_methods={'default': default_classifier_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_classifier_dl_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.CLASSIFIER_DL], - description='Deep Learning based general classifier for many problems', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.CLASSIFIER_DL, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.CLASSIFIER_DL], - has_storage_ref=True, - is_storage_ref_consumer=True, - trainable_mirror_anno=A.TRAINABLE_CLASSIFIER_DL, - )), - A.TRAINABLE_CLASSIFIER_DL: copy(NluComponent( - name=A.TRAINABLE_CLASSIFIER_DL, - type=T.DOCUMENT_CLASSIFIER, - get_default_model=ClassifierDl.get_default_model, - get_pretrained_model=ClassifierDl.get_pretrained_model, - get_trainable_model=ClassifierDl.get_trainable_model, - pdf_extractor_methods={'default': default_classifier_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_classifier_dl_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.TRAINABLE_CLASSIFIER_DL], - description='Deep Learning based general classifier for many problems', - 
provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.TRAINABLE_CLASSIFIER_DL, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TRAINABLE_CLASSIFIER_DL], - has_storage_ref=True, - is_storage_ref_consumer=True, - trainable=True, - trained_mirror_anno=A.CLASSIFIER_DL, - )), - A.CONTEXT_SPELL_CHECKER: copy(NluComponent( - name=A.CONTEXT_SPELL_CHECKER, - type=T.SPELL_CHECKER, - get_default_model=ContextSpellChecker.get_default_model, - get_pretrained_model=ContextSpellChecker.get_pretrained_model, - get_trainable_model=ContextSpellChecker.get_default_trainable_model, - pdf_extractor_methods={'default': default_spell_context_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_spell_context_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.CONTEXT_SPELL_CHECKER], - description='Deep Learning based spell checker that uses context to predict correct corrections.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.CONTEXT_SPELL_CHECKER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.CONTEXT_SPELL_CHECKER], - trainable_mirror_anno=A.TRAINABLE_CONTEXT_SPELL_CHECKER, - )), + ), + A.CLASSIFIER_DL: partial(NluComponent, + name=A.CLASSIFIER_DL, + type=T.DOCUMENT_CLASSIFIER, + get_default_model=ClassifierDl.get_default_model, + get_pretrained_model=ClassifierDl.get_pretrained_model, + get_trainable_model=ClassifierDl.get_trainable_model, + pdf_extractor_methods={'default': default_classifier_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_classifier_dl_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[A.CLASSIFIER_DL], + description='Deep Learning based general classifier for many problems', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.CLASSIFIER_DL, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.CLASSIFIER_DL], + has_storage_ref=True, + is_storage_ref_consumer=True, + trainable_mirror_anno=A.TRAINABLE_CLASSIFIER_DL, + ), + A.TRAINABLE_CLASSIFIER_DL: partial(NluComponent, + name=A.TRAINABLE_CLASSIFIER_DL, + type=T.DOCUMENT_CLASSIFIER, + get_default_model=ClassifierDl.get_default_model, + get_pretrained_model=ClassifierDl.get_pretrained_model, + get_trainable_model=ClassifierDl.get_trainable_model, + pdf_extractor_methods={'default': default_classifier_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_classifier_dl_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[A.TRAINABLE_CLASSIFIER_DL], + description='Deep Learning based general classifier for many problems', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.TRAINABLE_CLASSIFIER_DL, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TRAINABLE_CLASSIFIER_DL], + has_storage_ref=True, + is_storage_ref_consumer=True, + trainable=True, + trained_mirror_anno=A.CLASSIFIER_DL, + ), + A.CONTEXT_SPELL_CHECKER: partial(NluComponent, + name=A.CONTEXT_SPELL_CHECKER, + type=T.SPELL_CHECKER, + get_default_model=ContextSpellChecker.get_default_model, + 
get_pretrained_model=ContextSpellChecker.get_pretrained_model, + get_trainable_model=ContextSpellChecker.get_default_trainable_model, + pdf_extractor_methods={'default': default_spell_context_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_spell_context_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.CONTEXT_SPELL_CHECKER], + description='Deep Learning based spell checker that uses context to predict correct corrections.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.CONTEXT_SPELL_CHECKER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.CONTEXT_SPELL_CHECKER], + trainable_mirror_anno=A.TRAINABLE_CONTEXT_SPELL_CHECKER, + ), A.DATE_MATCHER: 'TODO no Extractor Implemented', - A.UNTYPED_DEPENDENCY_PARSER: copy(NluComponent( - name=A.UNTYPED_DEPENDENCY_PARSER, - type=T.TOKEN_CLASSIFIER, - get_default_model=LabeledDependencyParser.get_default_model, - get_pretrained_model=LabeledDependencyParser.get_pretrained_model, - get_trainable_model=LabeledDependencyParser.get_default_trainable_model, - pdf_extractor_methods={'default': default_dep_typed_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_labled_dependency_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.UNTYPED_DEPENDENCY_PARSER], - description='todo', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.UNTYPED_DEPENDENCY_PARSER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.UNTYPED_DEPENDENCY_PARSER], - trainable_mirror_anno=A.TRAINABLE_DEP_PARSE_UN_TYPED, - )), - A.TYPED_DEPENDENCY_PARSER: copy(NluComponent( - name=A.TYPED_DEPENDENCY_PARSER, - type=T.TOKEN_CLASSIFIER, - get_default_model=UnlabeledDependencyParser.get_default_model, - get_pretrained_model=UnlabeledDependencyParser.get_pretrained_model, - get_trainable_model=UnlabeledDependencyParser.get_default_trainable_model, - pdf_extractor_methods={'default': default_dep_untyped_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_un_labled_dependency_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.TYPED_DEPENDENCY_PARSER], - description='todo', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.TYPED_DEPENDENCY_PARSER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TYPED_DEPENDENCY_PARSER], - trainable_mirror_anno=A.TRAINABLE_DEP_PARSE_TYPED, - )), - A.DOC2CHUNK: copy(NluComponent( - name=A.DOC2CHUNK, - type=T.HELPER_ANNO, - get_default_model=Doc_2_Chunk.get_default_model, - pdf_extractor_methods={'default': default_doc2chunk_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_doc2chunk_cols, - output_level=L.CHUNK, - node=NLP_FEATURE_NODES.nodes[A.DOC2CHUNK], - description='Converts Document type col to Chunk type col', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.DOC2CHUNK, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DOC2CHUNK], - )), - A.DOCUMENT_ASSEMBLER: copy(NluComponent( - name=A.DOCUMENT_ASSEMBLER, - type=T.HELPER_ANNO, - 
get_default_model=SparkNlpDocumentAssembler.get_default_model, - pdf_extractor_methods={'default': default_document_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_doc_assembler_cols, - output_level=L.DOCUMENT, - node=NLP_FEATURE_NODES.nodes[A.DOCUMENT_ASSEMBLER], - description='todo', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.DOCUMENT_ASSEMBLER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DOCUMENT_ASSEMBLER], - )), - A.DOCUMENT_NORMALIZER: copy(NluComponent( - name=A.DOCUMENT_NORMALIZER, - type=T.TEXT_NORMALIZER, - get_default_model=SparkNLPDocumentNormalizer.get_default_model, - pdf_extractor_methods={'default': default_norm_document_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_doc_norm_cols, - output_level=L.DOCUMENT, - node=NLP_FEATURE_NODES.nodes[A.DOCUMENT_NORMALIZER], - description='todo', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.DOCUMENT_NORMALIZER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DOCUMENT_NORMALIZER], - )), + A.UNTYPED_DEPENDENCY_PARSER: partial(NluComponent, + name=A.UNTYPED_DEPENDENCY_PARSER, + type=T.TOKEN_CLASSIFIER, + get_default_model=LabeledDependencyParser.get_default_model, + get_pretrained_model=LabeledDependencyParser.get_pretrained_model, + get_trainable_model=LabeledDependencyParser.get_default_trainable_model, + pdf_extractor_methods={'default': default_dep_typed_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_labled_dependency_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.UNTYPED_DEPENDENCY_PARSER], + description='todo', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.UNTYPED_DEPENDENCY_PARSER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.UNTYPED_DEPENDENCY_PARSER], + trainable_mirror_anno=A.TRAINABLE_DEP_PARSE_UN_TYPED, + ), + A.TYPED_DEPENDENCY_PARSER: partial(NluComponent, + name=A.TYPED_DEPENDENCY_PARSER, + type=T.TOKEN_CLASSIFIER, + get_default_model=UnlabeledDependencyParser.get_default_model, + get_pretrained_model=UnlabeledDependencyParser.get_pretrained_model, + get_trainable_model=UnlabeledDependencyParser.get_default_trainable_model, + pdf_extractor_methods={'default': default_dep_untyped_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_un_labled_dependency_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.TYPED_DEPENDENCY_PARSER], + description='todo', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.TYPED_DEPENDENCY_PARSER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TYPED_DEPENDENCY_PARSER], + trainable_mirror_anno=A.TRAINABLE_DEP_PARSE_TYPED, + ), + A.DOC2CHUNK: partial(NluComponent, + name=A.DOC2CHUNK, + type=T.HELPER_ANNO, + get_default_model=Doc_2_Chunk.get_default_model, + pdf_extractor_methods={'default': default_doc2chunk_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_doc2chunk_cols, + output_level=L.CHUNK, + node=NLP_FEATURE_NODES.nodes[A.DOC2CHUNK], + 
description='Converts Document type col to Chunk type col', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.DOC2CHUNK, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DOC2CHUNK], + ), + A.DOCUMENT_ASSEMBLER: partial(NluComponent, + name=A.DOCUMENT_ASSEMBLER, + type=T.HELPER_ANNO, + get_default_model=SparkNlpDocumentAssembler.get_default_model, + pdf_extractor_methods={'default': default_document_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_doc_assembler_cols, + output_level=L.DOCUMENT, + node=NLP_FEATURE_NODES.nodes[A.DOCUMENT_ASSEMBLER], + description='todo', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.DOCUMENT_ASSEMBLER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DOCUMENT_ASSEMBLER], + ), + A.DOCUMENT_NORMALIZER: partial(NluComponent, + name=A.DOCUMENT_NORMALIZER, + type=T.TEXT_NORMALIZER, + get_default_model=SparkNLPDocumentNormalizer.get_default_model, + pdf_extractor_methods={'default': default_norm_document_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_doc_norm_cols, + output_level=L.DOCUMENT, + node=NLP_FEATURE_NODES.nodes[A.DOCUMENT_NORMALIZER], + description='todo', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.DOCUMENT_NORMALIZER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DOCUMENT_NORMALIZER], + ), A.EMBEDDINGS_FINISHER: 'TODO NOT INTEGRATED', A.ENTITY_RULER: 'TODO NOT INTEGRATED', A.FINISHER: 'TODO NOT INTEGRATED', A.GRAPH_EXTRACTION: 'TODO NOT INTEGRATED', A.GRAPH_FINISHER: 'TODO NOT INTEGRATED', - A.LANGUAGE_DETECTOR_DL: copy(NluComponent( - name=A.LANGUAGE_DETECTOR_DL, - type=T.DOCUMENT_CLASSIFIER, - get_default_model=LanguageDetector.get_default_model, - get_pretrained_model=LanguageDetector.get_pretrained_model, - pdf_extractor_methods={'default': default_lang_classifier_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=None, # TODO no sub defined - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, # TODO sub-token actually(?) - node=NLP_FEATURE_NODES.nodes[A.LANGUAGE_DETECTOR_DL], - description='Get lemmatized base version of tokens', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.LANGUAGE_DETECTOR_DL, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.LANGUAGE_DETECTOR_DL], - )), - A.LEMMATIZER: copy(NluComponent( - name=A.LEMMATIZER, - type=T.TOKEN_NORMALIZER, - output_context=ComputeContexts.spark, - get_default_model=SparkNLPLemmatizer.get_default_model, - get_pretrained_model=SparkNLPLemmatizer.get_pretrained_model, - get_trainable_model=SparkNLPLemmatizer.get_default_trainable_model, - pdf_extractor_methods={'default': default_lemma_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_lem_cols, - output_level=L.TOKEN, # TODO sub-token actually(?) 
- node=NLP_FEATURE_NODES.nodes[A.LEMMATIZER], - description='Get lemmatized base version of tokens', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - jsl_anno_class_id=A.LEMMATIZER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.LEMMATIZER], - trainable_mirror_anno=A.TRAINABLE_LEMMATIZER - )), - A.MULTI_CLASSIFIER_DL: copy(NluComponent( - name=A.MULTI_CLASSIFIER_DL, - type=T.DOCUMENT_CLASSIFIER, - output_level=L.MULTI_TOKEN_CLASSIFIER, - get_default_model=MultiClassifier.get_default_model, - get_pretrained_model=MultiClassifier.get_pretrained_model, - get_trainable_model=MultiClassifier.get_default_trainable_model, - pdf_extractor_methods={'default': default_multi_classifier_dl_config, - 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_multi_classifier_dl_cols, - node=NLP_FEATURE_NODES.nodes[A.MULTI_CLASSIFIER_DL], - description='Deep Learning based general classifier for multi-label classification problem. I.e. problems, where one document may be labled with multiple labels at the same time.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.MULTI_CLASSIFIER_DL, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.MULTI_CLASSIFIER_DL], - has_storage_ref=True, - is_storage_ref_consumer=True, - trainable_mirror_anno=A.TRAINABLE_MULTI_CLASSIFIER_DL, - )), - A.TRAINABLE_MULTI_CLASSIFIER_DL: copy(NluComponent( - name=A.TRAINABLE_MULTI_CLASSIFIER_DL, - type=T.DOCUMENT_CLASSIFIER, - output_level=L.MULTI_TOKEN_CLASSIFIER, - get_default_model=MultiClassifier.get_default_model, - get_pretrained_model=MultiClassifier.get_pretrained_model, - get_trainable_model=MultiClassifier.get_default_trainable_model, - pdf_extractor_methods={'default': default_multi_classifier_dl_config, - 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_multi_classifier_dl_cols, - node=NLP_FEATURE_NODES.nodes[A.TRAINABLE_MULTI_CLASSIFIER_DL], - description='Trainable Deep Learning based general classifier for multi-label classification problem. I.e. problems, where one document may be labled with multiple labels at the same time.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.TRAINABLE_MULTI_CLASSIFIER_DL, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TRAINABLE_MULTI_CLASSIFIER_DL], - has_storage_ref=True, - is_storage_ref_consumer=True, - trainable=True, - trained_mirror_anno=A.CLASSIFIER_DL, - # Should be A.MULTI_CLASSIFIER_DL, but fitted class is actually classifier DL, special edge case - )), + A.LANGUAGE_DETECTOR_DL: partial(NluComponent, + name=A.LANGUAGE_DETECTOR_DL, + type=T.DOCUMENT_CLASSIFIER, + get_default_model=LanguageDetector.get_default_model, + get_pretrained_model=LanguageDetector.get_pretrained_model, + pdf_extractor_methods={'default': default_lang_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=None, # TODO no sub defined + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + # TODO sub-token actually(?) 
+ node=NLP_FEATURE_NODES.nodes[A.LANGUAGE_DETECTOR_DL],
+ description='Detect the language of a document',
+ provider=ComponentBackends.open_source,
+ license=Licenses.open_source,
+ computation_context=ComputeContexts.spark,
+ output_context=ComputeContexts.spark,
+ jsl_anno_class_id=A.LANGUAGE_DETECTOR_DL,
+ jsl_anno_py_class=ACR.JSL_anno2_py_class[A.LANGUAGE_DETECTOR_DL],
+ ),
+ A.LEMMATIZER: partial(NluComponent,
+ name=A.LEMMATIZER,
+ type=T.TOKEN_NORMALIZER,
+ output_context=ComputeContexts.spark,
+ get_default_model=SparkNLPLemmatizer.get_default_model,
+ get_pretrained_model=SparkNLPLemmatizer.get_pretrained_model,
+ get_trainable_model=SparkNLPLemmatizer.get_default_trainable_model,
+ pdf_extractor_methods={'default': default_lemma_config,
+ 'default_full': default_full_config, },
+ pdf_col_name_substitutor=substitute_lem_cols,
+ output_level=L.TOKEN, # TODO sub-token actually(?)
+ node=NLP_FEATURE_NODES.nodes[A.LEMMATIZER],
+ description='Get lemmatized base version of tokens',
+ provider=ComponentBackends.open_source,
+ license=Licenses.open_source,
+ computation_context=ComputeContexts.spark,
+ jsl_anno_class_id=A.LEMMATIZER,
+ jsl_anno_py_class=ACR.JSL_anno2_py_class[A.LEMMATIZER],
+ trainable_mirror_anno=A.TRAINABLE_LEMMATIZER
+ ),
+ A.MULTI_CLASSIFIER_DL: partial(NluComponent,
+ name=A.MULTI_CLASSIFIER_DL,
+ type=T.DOCUMENT_CLASSIFIER,
+ output_level=L.MULTI_TOKEN_CLASSIFIER,
+ get_default_model=MultiClassifier.get_default_model,
+ get_pretrained_model=MultiClassifier.get_pretrained_model,
+ get_trainable_model=MultiClassifier.get_default_trainable_model,
+ pdf_extractor_methods={'default': default_multi_classifier_dl_config,
+ 'default_full': default_full_config, },
+ pdf_col_name_substitutor=substitute_multi_classifier_dl_cols,
+ node=NLP_FEATURE_NODES.nodes[A.MULTI_CLASSIFIER_DL],
+ description='Deep Learning based general classifier for multi-label classification problems, i.e. problems where one document may be labeled with multiple labels at the same time.',
+ provider=ComponentBackends.open_source,
+ license=Licenses.open_source,
+ computation_context=ComputeContexts.spark,
+ output_context=ComputeContexts.spark,
+ jsl_anno_class_id=A.MULTI_CLASSIFIER_DL,
+ jsl_anno_py_class=ACR.JSL_anno2_py_class[A.MULTI_CLASSIFIER_DL],
+ has_storage_ref=True,
+ is_storage_ref_consumer=True,
+ trainable_mirror_anno=A.TRAINABLE_MULTI_CLASSIFIER_DL,
+ ),
+ A.TRAINABLE_MULTI_CLASSIFIER_DL: partial(NluComponent,
+ name=A.TRAINABLE_MULTI_CLASSIFIER_DL,
+ type=T.DOCUMENT_CLASSIFIER,
+ output_level=L.MULTI_TOKEN_CLASSIFIER,
+ get_default_model=MultiClassifier.get_default_model,
+ get_pretrained_model=MultiClassifier.get_pretrained_model,
+ get_trainable_model=MultiClassifier.get_default_trainable_model,
+ pdf_extractor_methods={'default': default_multi_classifier_dl_config,
+ 'default_full': default_full_config, },
+ pdf_col_name_substitutor=substitute_multi_classifier_dl_cols,
+ node=NLP_FEATURE_NODES.nodes[A.TRAINABLE_MULTI_CLASSIFIER_DL],
+ description='Trainable Deep Learning based general classifier for multi-label classification problem. I.e.
problems, where one document may be labled with multiple labels at the same time.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.TRAINABLE_MULTI_CLASSIFIER_DL, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.TRAINABLE_MULTI_CLASSIFIER_DL], + has_storage_ref=True, + is_storage_ref_consumer=True, + trainable=True, + trained_mirror_anno=A.CLASSIFIER_DL, + # Should be A.MULTI_CLASSIFIER_DL, but fitted class is actually classifier DL, special edge case + ), A.MULTI_DATE_MATCHER: 'TODO NOT INTEGRATED', - A.N_GRAMM_GENERATOR: copy(NluComponent( - name=A.N_GRAMM_GENERATOR, - type=T.CHUNK_CLASSIFIER, # Classify each n-gram wether they match Pattern or not - get_default_model=NGram.get_default_model, - pdf_extractor_methods={'default': default_ngram_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_ngram_cols, - output_level=L.CHUNK, - node=NLP_FEATURE_NODES.nodes[A.N_GRAMM_GENERATOR], - description='Extract N-Gram chunks from texts', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.N_GRAMM_GENERATOR, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.N_GRAMM_GENERATOR], - )), - A.NER_CONVERTER: copy(NluComponent( - name=A.NER_CONVERTER, - type=T.HELPER_ANNO, - get_default_model=NerToChunkConverter.get_default_model, - pdf_extractor_methods={'default': default_ner_converter_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_ner_converter_cols, - output_level=L.CHUNK, - node=NLP_FEATURE_NODES.nodes[A.NER_CONVERTER], - description='Convert NER-IOB tokens into concatenated strings (aka chunks)', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.NER_CONVERTER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.NER_CONVERTER], - )), - A.NER_CRF: copy(NluComponent( - name=A.NER_CRF, - type=T.TOKEN_CLASSIFIER, - output_level=L.TOKEN, - get_default_model=NERDLCRF.get_default_model, - get_pretrained_model=NERDLCRF.get_pretrained_model, - get_trainable_model=NERDLCRF.get_default_trainable_model, - pdf_extractor_methods={'default': '', 'default_full': default_full_config, }, - pdf_col_name_substitutor=None, # TODO - node=NLP_FEATURE_NODES.nodes[A.NER_CRF], - description='Classical NER model based on conditional random fields (CRF). Predicts IOB tags ', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.NER_CRF, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.NER_CRF], - trainable_mirror_anno=A.TRAINABLE_NER_CRF, - )), - A.NER_DL: copy(NluComponent( - name=A.NER_DL, - type=T.TOKEN_CLASSIFIER, - output_level=L.TOKEN, - get_default_model=NERDL.get_default_model, - get_pretrained_model=NERDL.get_pretrained_model, - get_trainable_model=NERDL.get_default_trainable_model, - pdf_extractor_methods={'default': default_NER_config, 'meta': meta_NER_config, - 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_ner_dl_cols, - node=NLP_FEATURE_NODES.nodes[A.NER_DL], - description='Deep Learning based NER model that predicts IOB tags. 
', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.NER_DL, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.NER_DL], - trainable_mirror_anno=A.TRAINABLE_NER_DL, - has_storage_ref=True, - is_storage_ref_consumer=True - )), - A.TRAINABLE_NER_DL: copy(NluComponent( - name=A.TRAINABLE_NER_DL, - type=T.TOKEN_CLASSIFIER, - get_default_model=NERDL.get_default_model, - get_pretrained_model=NERDL.get_pretrained_model, - get_trainable_model=NERDL.get_default_trainable_model, - pdf_extractor_methods={'default': default_NER_config, 'meta': meta_NER_config, - 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_ner_dl_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.TRAINABLE_NER_DL], - description='Deep Learning based NER model that predicts IOB tags. ', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.TRAINABLE_NER_DL, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TRAINABLE_NER_DL], - trained_mirror_anno=A.NER_DL, - trainable=True, - has_storage_ref=True, - is_storage_ref_consumer=True - )), + A.N_GRAMM_GENERATOR: partial(NluComponent, + name=A.N_GRAMM_GENERATOR, + type=T.CHUNK_CLASSIFIER, # Classify each n-gram wether they match Pattern or not + get_default_model=NGram.get_default_model, + pdf_extractor_methods={'default': default_ngram_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_ngram_cols, + output_level=L.CHUNK, + node=NLP_FEATURE_NODES.nodes[A.N_GRAMM_GENERATOR], + description='Extract N-Gram chunks from texts', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.N_GRAMM_GENERATOR, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.N_GRAMM_GENERATOR], + ), + A.NER_CONVERTER: partial(NluComponent, + name=A.NER_CONVERTER, + type=T.HELPER_ANNO, + get_default_model=NerToChunkConverter.get_default_model, + pdf_extractor_methods={'default': default_ner_converter_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_ner_converter_cols, + output_level=L.CHUNK, + node=NLP_FEATURE_NODES.nodes[A.NER_CONVERTER], + description='Convert NER-IOB tokens into concatenated strings (aka chunks)', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.NER_CONVERTER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.NER_CONVERTER], + ), + A.NER_CRF: partial(NluComponent, + name=A.NER_CRF, + type=T.TOKEN_CLASSIFIER, + output_level=L.TOKEN, + get_default_model=NERDLCRF.get_default_model, + get_pretrained_model=NERDLCRF.get_pretrained_model, + get_trainable_model=NERDLCRF.get_default_trainable_model, + pdf_extractor_methods={'default': '', 'default_full': default_full_config, }, + pdf_col_name_substitutor=None, # TODO + node=NLP_FEATURE_NODES.nodes[A.NER_CRF], + description='Classical NER model based on conditional random fields (CRF). 
Predicts IOB tags ', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.NER_CRF, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.NER_CRF], + trainable_mirror_anno=A.TRAINABLE_NER_CRF, + ), + A.NER_DL: partial(NluComponent, + name=A.NER_DL, + type=T.TOKEN_CLASSIFIER, + output_level=L.TOKEN, + get_default_model=NERDL.get_default_model, + get_pretrained_model=NERDL.get_pretrained_model, + get_trainable_model=NERDL.get_default_trainable_model, + pdf_extractor_methods={'default': default_NER_config, 'meta': meta_NER_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_ner_dl_cols, + node=NLP_FEATURE_NODES.nodes[A.NER_DL], + description='Deep Learning based NER model that predicts IOB tags. ', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.NER_DL, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.NER_DL], + trainable_mirror_anno=A.TRAINABLE_NER_DL, + has_storage_ref=True, + is_storage_ref_consumer=True + ), + A.TRAINABLE_NER_DL: partial(NluComponent, + name=A.TRAINABLE_NER_DL, + type=T.TOKEN_CLASSIFIER, + get_default_model=NERDL.get_default_model, + get_pretrained_model=NERDL.get_pretrained_model, + get_trainable_model=NERDL.get_default_trainable_model, + pdf_extractor_methods={'default': default_NER_config, 'meta': meta_NER_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_ner_dl_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.TRAINABLE_NER_DL], + description='Deep Learning based NER model that predicts IOB tags. ', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.TRAINABLE_NER_DL, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TRAINABLE_NER_DL], + trained_mirror_anno=A.NER_DL, + trainable=True, + has_storage_ref=True, + is_storage_ref_consumer=True + ), A.NER_OVERWRITER: 'TODO NOT INTEGRATED', - A.NORMALIZER: copy(NluComponent( - name=A.NORMALIZER, - type=T.TOKEN_NORMALIZER, - get_default_model=SparkNLPNormalizer.get_default_model, - get_pretrained_model=SparkNLPNormalizer.get_pretrained_model, - # get_trainable_model=SparkNLPLemmatizer.get_default_trainable_model, - pdf_extractor_methods={'default': default_norm_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_norm_cols, - output_level=L.TOKEN, # TODO sub-token actually(?) 
- node=NLP_FEATURE_NODES.nodes[A.NORMALIZER], - description='Get lemmatized base version of tokens', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.NORMALIZER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.NORMALIZER], - trainable_mirror_anno=A.TRAINABLE_NORMALIZER - )), - A.NORVIG_SPELL_CHECKER: copy(NluComponent( - name=A.NORVIG_SPELL_CHECKER, - type=T.SPELL_CHECKER, - get_default_model=NorvigSpellChecker.get_default_model, - get_pretrained_model=NorvigSpellChecker.get_pretrained_model, - get_trainable_model=NorvigSpellChecker.get_default_trainable_model, - pdf_extractor_methods={'default': default_spell_norvig_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_spell_norvig_cols, - output_level=L.TOKEN, # TODO sub-token actually - node=NLP_FEATURE_NODES.nodes[A.NORVIG_SPELL_CHECKER], - description='Norvig algorithm based Spell Checker', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.NORVIG_SPELL_CHECKER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.NORVIG_SPELL_CHECKER], - trainable_mirror_anno=A.TRAINABLE_NORVIG_SPELL_CHECKER - )), - A.POS: copy(NluComponent( - name=A.POS, - type=T.TOKEN_CLASSIFIER, - get_default_model=PartOfSpeechJsl.get_default_model, - get_pretrained_model=PartOfSpeechJsl.get_pretrained_model, - get_trainable_model=PartOfSpeechJsl.get_default_trainable_model, - pdf_extractor_methods={'default': default_POS_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_pos_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.POS], - description='todo', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.POS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.POS], - trainable_mirror_anno=A.TRAINABLE_POS, - )), - A.TRAINABLE_POS: copy(NluComponent( - name=A.TRAINABLE_POS, - type=T.TOKEN_CLASSIFIER, - get_default_model=PartOfSpeechJsl.get_default_model, - get_pretrained_model=PartOfSpeechJsl.get_pretrained_model, - get_trainable_model=PartOfSpeechJsl.get_default_trainable_model, - pdf_extractor_methods={'default': default_POS_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_pos_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.TRAINABLE_POS], - description='todo', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.TRAINABLE_POS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TRAINABLE_POS], - trained_mirror_anno=A.POS, - trainable=True - )), + A.NORMALIZER: partial(NluComponent, + name=A.NORMALIZER, + type=T.TOKEN_NORMALIZER, + get_default_model=SparkNLPNormalizer.get_default_model, + get_pretrained_model=SparkNLPNormalizer.get_pretrained_model, + # get_trainable_model=SparkNLPLemmatizer.get_default_trainable_model, + pdf_extractor_methods={'default': default_norm_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_norm_cols, + output_level=L.TOKEN, # TODO sub-token actually(?) 
+ node=NLP_FEATURE_NODES.nodes[A.NORMALIZER], + description='Get lemmatized base version of tokens', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.NORMALIZER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.NORMALIZER], + trainable_mirror_anno=A.TRAINABLE_NORMALIZER + ), + A.NORVIG_SPELL_CHECKER: partial(NluComponent, + name=A.NORVIG_SPELL_CHECKER, + type=T.SPELL_CHECKER, + get_default_model=NorvigSpellChecker.get_default_model, + get_pretrained_model=NorvigSpellChecker.get_pretrained_model, + get_trainable_model=NorvigSpellChecker.get_default_trainable_model, + pdf_extractor_methods={'default': default_spell_norvig_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_spell_norvig_cols, + output_level=L.TOKEN, # TODO sub-token actually + node=NLP_FEATURE_NODES.nodes[A.NORVIG_SPELL_CHECKER], + description='Norvig algorithm based Spell Checker', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.NORVIG_SPELL_CHECKER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.NORVIG_SPELL_CHECKER], + trainable_mirror_anno=A.TRAINABLE_NORVIG_SPELL_CHECKER + ), + A.POS: partial(NluComponent, + name=A.POS, + type=T.TOKEN_CLASSIFIER, + get_default_model=PartOfSpeechJsl.get_default_model, + get_pretrained_model=PartOfSpeechJsl.get_pretrained_model, + get_trainable_model=PartOfSpeechJsl.get_default_trainable_model, + pdf_extractor_methods={'default': default_POS_config, 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_pos_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.POS], + description='todo', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.POS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.POS], + trainable_mirror_anno=A.TRAINABLE_POS, + ), + A.TRAINABLE_POS: partial(NluComponent, + name=A.TRAINABLE_POS, + type=T.TOKEN_CLASSIFIER, + get_default_model=PartOfSpeechJsl.get_default_model, + get_pretrained_model=PartOfSpeechJsl.get_pretrained_model, + get_trainable_model=PartOfSpeechJsl.get_default_trainable_model, + pdf_extractor_methods={'default': default_POS_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_pos_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.TRAINABLE_POS], + description='todo', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.TRAINABLE_POS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TRAINABLE_POS], + trained_mirror_anno=A.POS, + trainable=True + ), A.RECURISVE_TOKENIZER: 'TODO NOT INTEGRATED', A.REGEX_MATCHER: 'TODO no Extractor Implemented', A.TRAINABLE_REGEX_MATCHER: 'TODO no Extractor Implemented', - A.REGEX_TOKENIZER: copy(NluComponent( - name=A.POS, - type=T.TOKEN_CLASSIFIER, - get_default_model=RegexTokenizer.get_default_model, - pdf_extractor_methods={'default': default_tokenizer_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_tokenizer_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.POS], - description='todo', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - 
computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.REGEX_TOKENIZER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.REGEX_TOKENIZER], - )), - A.SENTENCE_DETECTOR: copy(NluComponent( - name=A.SENTENCE_DETECTOR, - type=T.SENTENCE_DETECTOR, - get_default_model=PragmaticSentenceDetector.get_default_model, - pdf_extractor_methods={'default': default_sentence_detector_DL_config, - 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sentence_detector_dl_cols, - output_level=L.SENTENCE, - node=NLP_FEATURE_NODES.nodes[A.SENTENCE_DETECTOR], - description='Classical rule based Sentence Detector', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.SENTENCE_DETECTOR, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.SENTENCE_DETECTOR], - )), - A.SENTENCE_DETECTOR_DL: copy(NluComponent( - name=A.SENTENCE_DETECTOR_DL, - type=T.SENTENCE_DETECTOR, - get_default_model=SentenceDetectorDeep.get_default_model, - get_pretrained_model=SentenceDetectorDeep.get_pretrained_model, - # get_trainable_model=SentenceDetectorDeep.get_trainable_model, - pdf_extractor_methods={'default': default_sentence_detector_DL_config, - 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sentence_detector_dl_cols, - output_level=L.SENTENCE, - node=NLP_FEATURE_NODES.nodes[A.SENTENCE_DETECTOR_DL], - description='Deep Learning based sentence Detector', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.SENTENCE_DETECTOR_DL, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.SENTENCE_DETECTOR_DL], - trainable_mirror_anno=A.TRAINABLE_SENTENCE_DETECTOR_DL - )), - A.SENTENCE_EMBEDDINGS_CONVERTER: copy(NluComponent( - name=A.SENTENCE_EMBEDDINGS_CONVERTER, - type=T.DOCUMENT_EMBEDDING, - get_default_model=SparkNLPSentenceEmbeddings.get_default_model, - pdf_extractor_methods={'default': default_sentence_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sent_embed_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING, - node=NLP_FEATURE_NODES.nodes[A.SENTENCE_EMBEDDINGS_CONVERTER], - description='Converts Word Embeddings to Sentence/Document Embeddings', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.SENTENCE_EMBEDDINGS_CONVERTER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.SENTENCE_EMBEDDINGS_CONVERTER], - is_storage_ref_producer=True, - has_storage_ref=True - )), - A.STEMMER: copy(NluComponent( - name=A.STEMMER, - type=T.TOKEN_NORMALIZER, - get_default_model=SparkNLPStemmer.get_default_model, - pdf_extractor_methods={'default': default_stemm_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_stem_cols, - output_level=L.TOKEN, # TODO sub-token actually(?) 
- node=NLP_FEATURE_NODES.nodes[A.STEMMER], - description='Get stemmed base version of tokens', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.STEMMER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.STEMMER], - )), - A.STOP_WORDS_CLEANER: copy(NluComponent( - name=A.STOP_WORDS_CLEANER, - type=T.TEXT_NORMALIZER, - get_default_model=NLUStopWordcleaner.get_default_model, - get_pretrained_model=NLUStopWordcleaner.get_pretrained_model, - pdf_extractor_methods={'default': default_stopwords_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_stopwords_cols, - output_level=L.TOKEN, # TODO sub-token actually - node=NLP_FEATURE_NODES.nodes[A.STOP_WORDS_CLEANER], - description='Removes stopwords from text based on internal list of stop words.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.STOP_WORDS_CLEANER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.STOP_WORDS_CLEANER], - )), - A.SYMMETRIC_DELETE_SPELLCHECKER: copy(NluComponent( - name=A.SYMMETRIC_DELETE_SPELLCHECKER, - type=T.SPELL_CHECKER, - get_default_model=SymmetricSpellChecker.get_default_model, - get_pretrained_model=SymmetricSpellChecker.get_pretrained_model, - get_trainable_model=SymmetricSpellChecker.get_default_trainable_model, - pdf_extractor_methods={'default': default_spell_symmetric_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_spell_symm_cols, - output_level=L.TOKEN, # TODO sub-token actually - node=NLP_FEATURE_NODES.nodes[A.SYMMETRIC_DELETE_SPELLCHECKER], - description='Symmetric Spell Checker', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.SYMMETRIC_DELETE_SPELLCHECKER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.SYMMETRIC_DELETE_SPELLCHECKER], - trainable_mirror_anno=A.TRAINABLE_SYMMETRIC_DELETE_SPELLCHECKER - )), + A.REGEX_TOKENIZER: partial(NluComponent, + name=A.POS, + type=T.TOKEN_CLASSIFIER, + get_default_model=RegexTokenizer.get_default_model, + pdf_extractor_methods={'default': default_tokenizer_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_tokenizer_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.POS], + description='todo', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.REGEX_TOKENIZER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.REGEX_TOKENIZER], + ), + A.SENTENCE_DETECTOR: partial(NluComponent, + name=A.SENTENCE_DETECTOR, + type=T.SENTENCE_DETECTOR, + get_default_model=PragmaticSentenceDetector.get_default_model, + pdf_extractor_methods={'default': default_sentence_detector_DL_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sentence_detector_dl_cols, + output_level=L.SENTENCE, + node=NLP_FEATURE_NODES.nodes[A.SENTENCE_DETECTOR], + description='Classical rule based Sentence Detector', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.SENTENCE_DETECTOR, + 
jsl_anno_py_class=ACR.JSL_anno2_py_class[A.SENTENCE_DETECTOR], + ), + A.SENTENCE_DETECTOR_DL: partial(NluComponent, + name=A.SENTENCE_DETECTOR_DL, + type=T.SENTENCE_DETECTOR, + get_default_model=SentenceDetectorDeep.get_default_model, + get_pretrained_model=SentenceDetectorDeep.get_pretrained_model, + # get_trainable_model=SentenceDetectorDeep.get_trainable_model, + pdf_extractor_methods={'default': default_sentence_detector_DL_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sentence_detector_dl_cols, + output_level=L.SENTENCE, + node=NLP_FEATURE_NODES.nodes[A.SENTENCE_DETECTOR_DL], + description='Deep Learning based sentence Detector', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.SENTENCE_DETECTOR_DL, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.SENTENCE_DETECTOR_DL], + trainable_mirror_anno=A.TRAINABLE_SENTENCE_DETECTOR_DL + ), + A.SENTENCE_EMBEDDINGS_CONVERTER: partial(NluComponent, + name=A.SENTENCE_EMBEDDINGS_CONVERTER, + type=T.DOCUMENT_EMBEDDING, + get_default_model=SparkNLPSentenceEmbeddings.get_default_model, + pdf_extractor_methods={'default': default_sentence_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sent_embed_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING, + node=NLP_FEATURE_NODES.nodes[A.SENTENCE_EMBEDDINGS_CONVERTER], + description='Converts Word Embeddings to Sentence/Document Embeddings', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.SENTENCE_EMBEDDINGS_CONVERTER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.SENTENCE_EMBEDDINGS_CONVERTER], + is_storage_ref_producer=True, + has_storage_ref=True + ), + A.STEMMER: partial(NluComponent, + name=A.STEMMER, + type=T.TOKEN_NORMALIZER, + get_default_model=SparkNLPStemmer.get_default_model, + pdf_extractor_methods={'default': default_stemm_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_stem_cols, + output_level=L.TOKEN, # TODO sub-token actually(?) 
+ node=NLP_FEATURE_NODES.nodes[A.STEMMER], + description='Get stemmed base version of tokens', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.STEMMER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.STEMMER], + ), + A.STOP_WORDS_CLEANER: partial(NluComponent, + name=A.STOP_WORDS_CLEANER, + type=T.TEXT_NORMALIZER, + get_default_model=NLUStopWordcleaner.get_default_model, + get_pretrained_model=NLUStopWordcleaner.get_pretrained_model, + pdf_extractor_methods={'default': default_stopwords_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_stopwords_cols, + output_level=L.TOKEN, # TODO sub-token actually + node=NLP_FEATURE_NODES.nodes[A.STOP_WORDS_CLEANER], + description='Removes stopwords from text based on internal list of stop words.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.STOP_WORDS_CLEANER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.STOP_WORDS_CLEANER], + ), + A.SYMMETRIC_DELETE_SPELLCHECKER: partial(NluComponent, + name=A.SYMMETRIC_DELETE_SPELLCHECKER, + type=T.SPELL_CHECKER, + get_default_model=SymmetricSpellChecker.get_default_model, + get_pretrained_model=SymmetricSpellChecker.get_pretrained_model, + get_trainable_model=SymmetricSpellChecker.get_default_trainable_model, + pdf_extractor_methods={'default': default_spell_symmetric_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_spell_symm_cols, + output_level=L.TOKEN, # TODO sub-token actually + node=NLP_FEATURE_NODES.nodes[A.SYMMETRIC_DELETE_SPELLCHECKER], + description='Symmetric Spell Checker', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.SYMMETRIC_DELETE_SPELLCHECKER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.SYMMETRIC_DELETE_SPELLCHECKER], + trainable_mirror_anno=A.TRAINABLE_SYMMETRIC_DELETE_SPELLCHECKER + ), A.TEXT_MATCHER: 'TODO EXTRACTOR METHOD MISSING', # TODO A.TRAINABLE_TEXT_MATCHER: 'TODO EXTRACTOR METHOD MISSING', # TODO A.TOKEN2CHUNK: 'TODO NOT INTEGRATED', # TODO A.TOKEN_ASSEMBLER: 'TODO EXTRACTORS MISSING', # TODO - A.TOKENIZER: copy(NluComponent( - name=A.TOKENIZER, - type=T.TOKENIZER, - get_default_model=DefaultTokenizer.get_default_model, - pdf_extractor_methods={'default': default_tokenizer_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_tokenizer_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.TOKENIZER], - description='Default tokenizer', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.TOKENIZER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TOKENIZER], - )), - A.SENTIMENT_DL: copy(NluComponent( - name=A.SENTIMENT_DL, - type=T.DOCUMENT_CLASSIFIER, - get_default_model=SentimentDl.get_default_model, - get_pretrained_model=SentimentDl.get_pretrained_model, - get_trainable_model=SentimentDl.get_default_trainable_model, - pdf_extractor_methods={'default': default_sentiment_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sentiment_dl_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - 
node=NLP_FEATURE_NODES.nodes[A.SENTIMENT_DL], - description='Deep Learning based Sentiment Detector', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.SENTIMENT_DL, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.SENTIMENT_DL], - trainable_mirror_anno=A.TRAINABLE_SENTIMENT_DL, - is_storage_ref_consumer=True, - has_storage_ref=True - )), - A.TRAINABLE_SENTIMENT_DL: copy(NluComponent( - name=A.TRAINABLE_SENTIMENT_DL, - type=T.DOCUMENT_CLASSIFIER, - get_default_model=SentimentDl.get_default_model, - get_pretrained_model=SentimentDl.get_pretrained_model, - get_trainable_model=SentimentDl.get_default_trainable_model, - pdf_extractor_methods={'default': default_sentiment_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sentiment_dl_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.TRAINABLE_SENTIMENT_DL], - description='Deep Learning based Sentiment Detector', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.TRAINABLE_SENTIMENT_DL, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TRAINABLE_SENTIMENT_DL], - trained_mirror_anno=A.SENTIMENT_DL, - is_storage_ref_consumer=True, - has_storage_ref=True, - trainable=True - )), - A.SENTIMENT_DETECTOR: copy(NluComponent( - name=A.SENTIMENT_DETECTOR, - type=T.DOCUMENT_CLASSIFIER, - get_default_model=Sentiment.get_default_model, - # get_pretrained_model = Sentiment.get_pretrained_model, - get_trainable_model=Sentiment.get_default_trainable_model, - pdf_extractor_methods={'default': default_sentiment_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sentiment_dl_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.SENTIMENT_DETECTOR], - description='Rule based sentiment detector, which calculates a score based on predefined keywords', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.SENTIMENT_DETECTOR, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.SENTIMENT_DETECTOR], - trainable_mirror_anno=A.TRAINABLE_SENTIMENT, - )), - A.VIVEKN_SENTIMENT: copy(NluComponent( - name=A.VIVEKN_SENTIMENT, - type=T.DOCUMENT_CLASSIFIER, - get_default_model=ViveknSentiment.get_default_model, - get_pretrained_model=ViveknSentiment.get_pretrained_model, - get_trainable_model=ViveknSentiment.get_default_trainable_model, - pdf_extractor_methods={'default': default_sentiment_vivk_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sentiment_vivk_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.VIVEKN_SENTIMENT], - description='Sentiment detector based on the vivekn algorithm', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.VIVEKN_SENTIMENT, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.VIVEKN_SENTIMENT], - trainable_mirror_anno=A.TRAINABLE_VIVEKN_SENTIMENT - )), - A.WORD_EMBEDDINGS: copy(NluComponent( - name=A.WORD_EMBEDDINGS, - type=T.TOKEN_EMBEDDING, - get_default_model=Glove.get_default_model, - 
get_pretrained_model=Glove.get_pretrained_model, - pdf_extractor_methods={'default': default_word_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_word_embed_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.WORD_EMBEDDINGS], - description='Static Word Embeddings generator, i.e. Glove, etc..', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.WORD_EMBEDDINGS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.WORD_EMBEDDINGS], - is_storage_ref_producer=True, - has_storage_ref=True, - )), - A.WORD_SEGMENTER: copy(NluComponent( - name=A.WORD_SEGMENTER, - type=T.TOKENIZER, - get_default_model=WordSegmenter.get_default_model, - get_pretrained_model=WordSegmenter.get_pretrained_model, - get_trainable_model=WordSegmenter.get_default_model_for_lang, - pdf_extractor_methods={'default': default_word_segmenter_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_word_seg_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.WORD_SEGMENTER], - description='Segments non white space seperated text into tokens, like Chinese or Japanese. ', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.WORD_SEGMENTER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.WORD_SEGMENTER], - trainable_mirror_anno=A.TRAINABLE_WORD_SEGMENTER - )), - A.YAKE_KEYWORD_EXTRACTION: copy(NluComponent( - name=A.YAKE_KEYWORD_EXTRACTION, - type=T.CHUNK_CLASSIFIER, # TODO??? Classifies each chunks/ngram likelyhood of beeing a Ketyword - get_default_model=Yake.get_default_model, - pdf_extractor_methods={'default': default_yake_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_YAKE_cols, - output_level=L.CHUNK, # Actual sub-ngram/ngram filter - node=NLP_FEATURE_NODES.nodes[A.YAKE_KEYWORD_EXTRACTION], - description='Calculates probability of each n-gram beeing a keyword. Yields a selection of these n-grams with specific filters,i.e. 
length, probability, etc..', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.YAKE_KEYWORD_EXTRACTION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.YAKE_KEYWORD_EXTRACTION], - has_storage_ref=False, - is_storage_ref_consumer=False, - is_storage_ref_producer=False, - )), + A.TOKENIZER: partial(NluComponent, + name=A.TOKENIZER, + type=T.TOKENIZER, + get_default_model=DefaultTokenizer.get_default_model, + pdf_extractor_methods={'default': default_tokenizer_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_tokenizer_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.TOKENIZER], + description='Default tokenizer', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.TOKENIZER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TOKENIZER], + ), + A.SENTIMENT_DL: partial(NluComponent, + name=A.SENTIMENT_DL, + type=T.DOCUMENT_CLASSIFIER, + get_default_model=SentimentDl.get_default_model, + get_pretrained_model=SentimentDl.get_pretrained_model, + get_trainable_model=SentimentDl.get_default_trainable_model, + pdf_extractor_methods={'default': default_sentiment_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sentiment_dl_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[A.SENTIMENT_DL], + description='Deep Learning based Sentiment Detector', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.SENTIMENT_DL, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.SENTIMENT_DL], + trainable_mirror_anno=A.TRAINABLE_SENTIMENT_DL, + is_storage_ref_consumer=True, + has_storage_ref=True + ), + A.TRAINABLE_SENTIMENT_DL: partial(NluComponent, + name=A.TRAINABLE_SENTIMENT_DL, + type=T.DOCUMENT_CLASSIFIER, + get_default_model=SentimentDl.get_default_model, + get_pretrained_model=SentimentDl.get_pretrained_model, + get_trainable_model=SentimentDl.get_default_trainable_model, + pdf_extractor_methods={'default': default_sentiment_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sentiment_dl_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[A.TRAINABLE_SENTIMENT_DL], + description='Deep Learning based Sentiment Detector', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.TRAINABLE_SENTIMENT_DL, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TRAINABLE_SENTIMENT_DL], + trained_mirror_anno=A.SENTIMENT_DL, + is_storage_ref_consumer=True, + has_storage_ref=True, + trainable=True + ), + A.SENTIMENT_DETECTOR: partial(NluComponent, + name=A.SENTIMENT_DETECTOR, + type=T.DOCUMENT_CLASSIFIER, + get_default_model=Sentiment.get_default_model, + # get_pretrained_model = Sentiment.get_pretrained_model, + get_trainable_model=Sentiment.get_default_trainable_model, + pdf_extractor_methods={'default': default_sentiment_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sentiment_dl_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[A.SENTIMENT_DETECTOR], + 
description='Rule based sentiment detector, which calculates a score based on predefined keywords', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.SENTIMENT_DETECTOR, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.SENTIMENT_DETECTOR], + trainable_mirror_anno=A.TRAINABLE_SENTIMENT, + ), + A.VIVEKN_SENTIMENT: partial(NluComponent, + name=A.VIVEKN_SENTIMENT, + type=T.DOCUMENT_CLASSIFIER, + get_default_model=ViveknSentiment.get_default_model, + get_pretrained_model=ViveknSentiment.get_pretrained_model, + get_trainable_model=ViveknSentiment.get_default_trainable_model, + pdf_extractor_methods={'default': default_sentiment_vivk_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sentiment_vivk_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[A.VIVEKN_SENTIMENT], + description='Sentiment detector based on the vivekn algorithm', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.VIVEKN_SENTIMENT, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.VIVEKN_SENTIMENT], + trainable_mirror_anno=A.TRAINABLE_VIVEKN_SENTIMENT + ), + A.WORD_EMBEDDINGS: partial(NluComponent, + name=A.WORD_EMBEDDINGS, + type=T.TOKEN_EMBEDDING, + get_default_model=Glove.get_default_model, + get_pretrained_model=Glove.get_pretrained_model, + pdf_extractor_methods={'default': default_word_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_word_embed_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.WORD_EMBEDDINGS], + description='Static Word Embeddings generator, i.e. Glove, etc..', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.WORD_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.WORD_EMBEDDINGS], + is_storage_ref_producer=True, + has_storage_ref=True, + ), + A.WORD_SEGMENTER: partial(NluComponent, + name=A.WORD_SEGMENTER, + type=T.TOKENIZER, + get_default_model=WordSegmenter.get_default_model, + get_pretrained_model=WordSegmenter.get_pretrained_model, + get_trainable_model=WordSegmenter.get_default_model_for_lang, + pdf_extractor_methods={'default': default_word_segmenter_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_word_seg_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.WORD_SEGMENTER], + description='Segments non white space seperated text into tokens, like Chinese or Japanese. ', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.WORD_SEGMENTER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.WORD_SEGMENTER], + trainable_mirror_anno=A.TRAINABLE_WORD_SEGMENTER + ), + A.YAKE_KEYWORD_EXTRACTION: partial(NluComponent, + name=A.YAKE_KEYWORD_EXTRACTION, + type=T.CHUNK_CLASSIFIER, + # TODO??? 
Classifies each chunk's/n-gram's likelihood of being a keyword
+                                           get_default_model=Yake.get_default_model,
+                                           pdf_extractor_methods={'default': default_yake_config,
+                                                                  'default_full': default_full_config, },
+                                           pdf_col_name_substitutor=substitute_YAKE_cols,
+                                           output_level=L.CHUNK,  # Actual sub-ngram/ngram filter
+                                           node=NLP_FEATURE_NODES.nodes[A.YAKE_KEYWORD_EXTRACTION],
+                                           description='Calculates probability of each n-gram being a keyword. Yields a selection of these n-grams with specific filters, i.e. length, probability, etc.',
+                                           provider=ComponentBackends.open_source,
+                                           license=Licenses.open_source,
+                                           computation_context=ComputeContexts.spark,
+                                           output_context=ComputeContexts.spark,
+                                           jsl_anno_class_id=A.YAKE_KEYWORD_EXTRACTION,
+                                           jsl_anno_py_class=ACR.JSL_anno2_py_class[A.YAKE_KEYWORD_EXTRACTION],
+                                           has_storage_ref=False,
+                                           is_storage_ref_consumer=False,
+                                           is_storage_ref_producer=False,
+                                           ),
-    A.DOC2VEC: copy(NluComponent(
-        name=A.DOC2VEC,
-        type=T.TOKEN_EMBEDDING,
-        get_default_model=Doc2Vec.get_default_model,
-        get_trainable_model=Doc2Vec.get_trainable_model,
-        get_pretrained_model=Doc2Vec.get_pretrained_model,
-        pdf_extractor_methods={'default': default_sentence_embedding_config, 'default_full': default_full_config, },
-        pdf_col_name_substitutor=substitute_sent_embed_cols,
-        output_level=L.TOKEN,
-        node=NLP_FEATURE_NODES.nodes[A.DOC2VEC],
-        description='Trains a Word2Vec model that creates vector representations of words in a text corpus. The algorithm first constructs a vocabulary from the corpus and then learns vector representation of words in the vocabulary. The vector representation can be used as features in natural language processing and machine learning algorithms.',
-        provider=ComponentBackends.open_source,
-        license=Licenses.open_source,
-        computation_context=ComputeContexts.spark,
-        output_context=ComputeContexts.spark,
-        jsl_anno_class_id=A.DOC2VEC,
-        jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DOC2VEC],
-        has_storage_ref=True,
-        is_storage_ref_producer=True,
-        trainable_mirror_anno=A.TRAINABLE_DOC2VEC
-    )),
+    A.DOC2VEC: partial(NluComponent,
+                       name=A.DOC2VEC,
+                       type=T.TOKEN_EMBEDDING,
+                       get_default_model=Doc2Vec.get_default_model,
+                       get_trainable_model=Doc2Vec.get_trainable_model,
+                       get_pretrained_model=Doc2Vec.get_pretrained_model,
+                       pdf_extractor_methods={'default': default_sentence_embedding_config,
+                                              'default_full': default_full_config, },
+                       pdf_col_name_substitutor=substitute_sent_embed_cols,
+                       output_level=L.TOKEN,
+                       node=NLP_FEATURE_NODES.nodes[A.DOC2VEC],
+                       description='Trains a Word2Vec model that creates vector representations of words in a text corpus. The algorithm first constructs a vocabulary from the corpus and then learns vector representation of words in the vocabulary.
The vector representation can be used as features in natural language processing and machine learning algorithms.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.DOC2VEC, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DOC2VEC], + has_storage_ref=True, + is_storage_ref_producer=True, + trainable_mirror_anno=A.TRAINABLE_DOC2VEC + ), - A.TRAINABLE_DOC2VEC: copy(NluComponent( - name=A.TRAINABLE_DOC2VEC, - type=T.TOKEN_EMBEDDING, - get_default_model=Doc2Vec.get_default_model, - get_trainable_model=Doc2Vec.get_trainable_model, - get_pretrained_model=Doc2Vec.get_pretrained_model, - pdf_extractor_methods={'default': default_sentence_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sent_embed_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.TRAINABLE_DOC2VEC], - description='Trains a Word2Vec model that creates vector representations of words in a text corpus. The algorithm first constructs a vocabulary from the corpus and then learns vector representation of words in the vocabulary. The vector representation can be used as features in natural language processing and machine learning algorithms.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.TRAINABLE_DOC2VEC, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TRAINABLE_DOC2VEC], - has_storage_ref=True, - is_storage_ref_producer=True, - trained_mirror_anno=A.DOC2VEC, - trainable=True - )), + A.TRAINABLE_DOC2VEC: partial(NluComponent, + name=A.TRAINABLE_DOC2VEC, + type=T.TOKEN_EMBEDDING, + get_default_model=Doc2Vec.get_default_model, + get_trainable_model=Doc2Vec.get_trainable_model, + get_pretrained_model=Doc2Vec.get_pretrained_model, + pdf_extractor_methods={'default': default_sentence_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sent_embed_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.TRAINABLE_DOC2VEC], + description='Trains a Word2Vec model that creates vector representations of words in a text corpus. The algorithm first constructs a vocabulary from the corpus and then learns vector representation of words in the vocabulary. 
The vector representation can be used as features in natural language processing and machine learning algorithms.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.TRAINABLE_DOC2VEC, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.TRAINABLE_DOC2VEC], + has_storage_ref=True, + is_storage_ref_producer=True, + trained_mirror_anno=A.DOC2VEC, + trainable=True + ), ### ________ TRANSFORMERS BELOW _________ - A.ALBERT_EMBEDDINGS: copy(NluComponent( - name=A.ALBERT_EMBEDDINGS, - type=T.TOKEN_EMBEDDING, - get_default_model=SparkNLPAlbert.get_default_model, - get_pretrained_model=SparkNLPAlbert.get_pretrained_model, - pdf_extractor_methods={'default': default_word_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_word_embed_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.ALBERT_EMBEDDINGS], - description='ALBERT: A LITE BERT FOR SELF-SUPERVISED LEARNING OF LANGUAGE REPRESENTATIONS - Google Research, Toyota Technological Institute at Chicago', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.ALBERT_EMBEDDINGS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.ALBERT_EMBEDDINGS], - has_storage_ref=True, - is_storage_ref_producer=True, - )), + A.ALBERT_EMBEDDINGS: partial(NluComponent, + name=A.ALBERT_EMBEDDINGS, + type=T.TOKEN_EMBEDDING, + get_default_model=SparkNLPAlbert.get_default_model, + get_pretrained_model=SparkNLPAlbert.get_pretrained_model, + pdf_extractor_methods={'default': default_word_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_word_embed_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.ALBERT_EMBEDDINGS], + description='ALBERT: A LITE BERT FOR SELF-SUPERVISED LEARNING OF LANGUAGE REPRESENTATIONS - Google Research, Toyota Technological Institute at Chicago', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.ALBERT_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.ALBERT_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), - A.ALBERT_FOR_TOKEN_CLASSIFICATION: copy(NluComponent( - name=A.ALBERT_FOR_TOKEN_CLASSIFICATION, - type=T.TRANSFORMER_TOKEN_CLASSIFIER, - get_default_model=TokenAlbert.get_default_model, - get_pretrained_model=TokenAlbert.get_pretrained_model, - pdf_extractor_methods={'default': default_token_classifier_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, - output_level=L.TOKEN, # Handled like NER model - node=NLP_FEATURE_NODES.nodes[A.ALBERT_FOR_TOKEN_CLASSIFICATION], - description='AlbertForTokenClassification can load ALBERT Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.ALBERT_FOR_TOKEN_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.ALBERT_FOR_TOKEN_CLASSIFICATION], - )), - A.BERT_EMBEDDINGS: copy(NluComponent( - name=A.BERT_EMBEDDINGS, - type=T.TOKEN_EMBEDDING, - get_default_model=SparkNLPBert.get_default_model, - get_pretrained_model=SparkNLPBert.get_pretrained_model, - pdf_extractor_methods={'default': default_word_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_word_embed_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.BERT_EMBEDDINGS], - description='Token-level embeddings using BERT. BERT (Bidirectional Encoder Representations from Transformers) provides dense vector representations for natural language by using a deep, pre-trained neural network with the Transformer architecture.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.BERT_EMBEDDINGS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.BERT_EMBEDDINGS], - has_storage_ref=True, - is_storage_ref_producer=True, - )), - A.BERT_SENTENCE_EMBEDDINGS: copy(NluComponent( - name=A.BERT_SENTENCE_EMBEDDINGS, - type=T.DOCUMENT_EMBEDDING, - get_default_model=BertSentence.get_default_model, - get_pretrained_model=BertSentence.get_pretrained_model, - pdf_extractor_methods={'default': default_sentence_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sent_embed_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING, - node=NLP_FEATURE_NODES.nodes[A.BERT_SENTENCE_EMBEDDINGS], - description='Sentence-level embeddings using BERT. BERT (Bidirectional Encoder Representations from Transformers) provides dense vector representations for natural language by using a deep, pre-trained neural network with the Transformer architecture.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.BERT_SENTENCE_EMBEDDINGS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.BERT_SENTENCE_EMBEDDINGS], - has_storage_ref=True, - is_storage_ref_producer=True, - )), - A.BERT_FOR_TOKEN_CLASSIFICATION: copy(NluComponent( - name=A.BERT_FOR_TOKEN_CLASSIFICATION, - type=T.TRANSFORMER_TOKEN_CLASSIFIER, - get_default_model=TokenBert.get_default_model, - get_pretrained_model=TokenBert.get_pretrained_model, - pdf_extractor_methods={'default': default_token_classifier_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, - output_level=L.TOKEN, # Handled like NER model - node=NLP_FEATURE_NODES.nodes[A.BERT_FOR_TOKEN_CLASSIFICATION], - description='BertForTokenClassification can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.BERT_FOR_TOKEN_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.BERT_FOR_TOKEN_CLASSIFICATION], - )), + A.ALBERT_FOR_TOKEN_CLASSIFICATION: partial(NluComponent, + name=A.ALBERT_FOR_TOKEN_CLASSIFICATION, + type=T.TRANSFORMER_TOKEN_CLASSIFIER, + get_default_model=TokenAlbert.get_default_model, + get_pretrained_model=TokenAlbert.get_pretrained_model, + pdf_extractor_methods={'default': default_token_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, + output_level=L.TOKEN, # Handled like NER model + node=NLP_FEATURE_NODES.nodes[A.ALBERT_FOR_TOKEN_CLASSIFICATION], + description='AlbertForTokenClassification can load ALBERT Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.ALBERT_FOR_TOKEN_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.ALBERT_FOR_TOKEN_CLASSIFICATION], + ), + A.BERT_EMBEDDINGS: partial(NluComponent, + name=A.BERT_EMBEDDINGS, + type=T.TOKEN_EMBEDDING, + get_default_model=SparkNLPBert.get_default_model, + get_pretrained_model=SparkNLPBert.get_pretrained_model, + pdf_extractor_methods={'default': default_word_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_word_embed_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.BERT_EMBEDDINGS], + description='Token-level embeddings using BERT. BERT (Bidirectional Encoder Representations from Transformers) provides dense vector representations for natural language by using a deep, pre-trained neural network with the Transformer architecture.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.BERT_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.BERT_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), + A.BERT_SENTENCE_EMBEDDINGS: partial(NluComponent, + name=A.BERT_SENTENCE_EMBEDDINGS, + type=T.DOCUMENT_EMBEDDING, + get_default_model=BertSentence.get_default_model, + get_pretrained_model=BertSentence.get_pretrained_model, + pdf_extractor_methods={'default': default_sentence_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sent_embed_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING, + node=NLP_FEATURE_NODES.nodes[A.BERT_SENTENCE_EMBEDDINGS], + description='Sentence-level embeddings using BERT. 
BERT (Bidirectional Encoder Representations from Transformers) provides dense vector representations for natural language by using a deep, pre-trained neural network with the Transformer architecture.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.BERT_SENTENCE_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.BERT_SENTENCE_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), + A.BERT_FOR_TOKEN_CLASSIFICATION: partial(NluComponent, + name=A.BERT_FOR_TOKEN_CLASSIFICATION, + type=T.TRANSFORMER_TOKEN_CLASSIFIER, + get_default_model=TokenBert.get_default_model, + get_pretrained_model=TokenBert.get_pretrained_model, + pdf_extractor_methods={'default': default_token_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, + output_level=L.TOKEN, # Handled like NER model + node=NLP_FEATURE_NODES.nodes[A.BERT_FOR_TOKEN_CLASSIFICATION], + description='BertForTokenClassification can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.BERT_FOR_TOKEN_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.BERT_FOR_TOKEN_CLASSIFICATION], + ), - A.BERT_FOR_SEQUENCE_CLASSIFICATION: copy(NluComponent( - name=A.BERT_FOR_SEQUENCE_CLASSIFICATION, - type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, - get_default_model=SeqBertClassifier.get_default_model, - get_pretrained_model=SeqBertClassifier.get_pretrained_model, - pdf_extractor_methods={'default': default_classifier_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.BERT_FOR_SEQUENCE_CLASSIFICATION], - description='BertForSequenceClassification can load Bert Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.BERT_FOR_SEQUENCE_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.BERT_FOR_SEQUENCE_CLASSIFICATION], - )), - A.DISTIL_BERT_EMBEDDINGS: copy(NluComponent( - name=A.DISTIL_BERT_EMBEDDINGS, - type=T.TOKEN_EMBEDDING, - get_default_model=DistilBert.get_default_model, - get_pretrained_model=DistilBert.get_pretrained_model, - pdf_extractor_methods={'default': default_word_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_word_embed_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.DISTIL_BERT_EMBEDDINGS], - description='DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. 
It has 40% less parameters than bert-base-uncased, runs 60% faster while preserving over 95% of BERT’s performances as measured on the GLUE language understanding benchmark.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.DISTIL_BERT_EMBEDDINGS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DISTIL_BERT_EMBEDDINGS], - has_storage_ref=True, - is_storage_ref_producer=True, - )), - A.DISTIL_BERT_FOR_SEQUENCE_CLASSIFICATION: copy(NluComponent( - name=A.DISTIL_BERT_FOR_SEQUENCE_CLASSIFICATION, - type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, - get_default_model=SeqDilstilBertClassifier.get_default_model, - get_pretrained_model=SeqDilstilBertClassifier.get_pretrained_model, - pdf_extractor_methods={'default': default_classifier_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.DISTIL_BERT_FOR_SEQUENCE_CLASSIFICATION], - description='DistilBertForSequenceClassification can load DistilBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.DISTIL_BERT_FOR_SEQUENCE_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DISTIL_BERT_FOR_SEQUENCE_CLASSIFICATION], - )), - A.DISTIL_BERT_FOR_TOKEN_CLASSIFICATION: copy(NluComponent( - name=A.DISTIL_BERT_FOR_TOKEN_CLASSIFICATION, - type=T.TRANSFORMER_TOKEN_CLASSIFIER, - get_default_model=TokenDistilBert.get_default_model, - get_pretrained_model=TokenDistilBert.get_pretrained_model, - pdf_extractor_methods={'default': default_token_classifier_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.DISTIL_BERT_FOR_TOKEN_CLASSIFICATION], - description='DistilBertForTokenClassification can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.DISTIL_BERT_FOR_TOKEN_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DISTIL_BERT_FOR_TOKEN_CLASSIFICATION], - )), - A.ELMO_EMBEDDINGS: copy(NluComponent( - name=A.ELMO_EMBEDDINGS, - type=T.TOKEN_EMBEDDING, - get_default_model=SparkNLPElmo.get_default_model, - get_pretrained_model=SparkNLPElmo.get_pretrained_model, - pdf_extractor_methods={'default': default_word_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_word_embed_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.ELMO_EMBEDDINGS], - description='Word embeddings from ELMo (Embeddings from Language Models)), a language model trained on the 1 Billion Word Benchmark.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.ELMO_EMBEDDINGS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.ELMO_EMBEDDINGS], - has_storage_ref=True, - is_storage_ref_producer=True, - )), - A.LONGFORMER_EMBEDDINGS: copy(NluComponent( - name=A.LONGFORMER_EMBEDDINGS, - type=T.TOKEN_EMBEDDING, - get_default_model=Longformer.get_default_model, - get_pretrained_model=Longformer.get_pretrained_model, - pdf_extractor_methods={'default': default_word_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_word_embed_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.LONGFORMER_EMBEDDINGS], - description='Longformer is a transformer model for long documents. The Longformer model was presented in Longformer: The Long-Document Transformer by Iz Beltagy, Matthew E. Peters, Arman Cohan. longformer-base-4096 is a BERT-like model started from the RoBERTa checkpoint and pretrained for MLM on long documents. It supports sequences of length up to 4,096.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.LONGFORMER_EMBEDDINGS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.LONGFORMER_EMBEDDINGS], - has_storage_ref=True, - is_storage_ref_producer=True, - )), + A.BERT_FOR_SEQUENCE_CLASSIFICATION: partial(NluComponent, + name=A.BERT_FOR_SEQUENCE_CLASSIFICATION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=SeqBertClassifier.get_default_model, + get_pretrained_model=SeqBertClassifier.get_pretrained_model, + pdf_extractor_methods={'default': default_classifier_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[A.BERT_FOR_SEQUENCE_CLASSIFICATION], + description='BertForSequenceClassification can load Bert Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. 
for multi-class document classification tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.BERT_FOR_SEQUENCE_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.BERT_FOR_SEQUENCE_CLASSIFICATION], + ), + A.DISTIL_BERT_EMBEDDINGS: partial(NluComponent, + name=A.DISTIL_BERT_EMBEDDINGS, + type=T.TOKEN_EMBEDDING, + get_default_model=DistilBert.get_default_model, + get_pretrained_model=DistilBert.get_pretrained_model, + pdf_extractor_methods={'default': default_word_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_word_embed_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.DISTIL_BERT_EMBEDDINGS], + description='DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than bert-base-uncased, runs 60% faster while preserving over 95% of BERT’s performances as measured on the GLUE language understanding benchmark.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.DISTIL_BERT_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DISTIL_BERT_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), + A.DISTIL_BERT_FOR_SEQUENCE_CLASSIFICATION: partial(NluComponent, + name=A.DISTIL_BERT_FOR_SEQUENCE_CLASSIFICATION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=SeqDilstilBertClassifier.get_default_model, + get_pretrained_model=SeqDilstilBertClassifier.get_pretrained_model, + pdf_extractor_methods={ + 'default': default_classifier_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[ + A.DISTIL_BERT_FOR_SEQUENCE_CLASSIFICATION], + description='DistilBertForSequenceClassification can load DistilBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.DISTIL_BERT_FOR_SEQUENCE_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.DISTIL_BERT_FOR_SEQUENCE_CLASSIFICATION], + ), + A.DISTIL_BERT_FOR_TOKEN_CLASSIFICATION: partial(NluComponent, + name=A.DISTIL_BERT_FOR_TOKEN_CLASSIFICATION, + type=T.TRANSFORMER_TOKEN_CLASSIFIER, + get_default_model=TokenDistilBert.get_default_model, + get_pretrained_model=TokenDistilBert.get_pretrained_model, + pdf_extractor_methods={ + 'default': default_token_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[ + A.DISTIL_BERT_FOR_TOKEN_CLASSIFICATION], + description='DistilBertForTokenClassification can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.DISTIL_BERT_FOR_TOKEN_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.DISTIL_BERT_FOR_TOKEN_CLASSIFICATION], + ), + A.ELMO_EMBEDDINGS: partial(NluComponent, + name=A.ELMO_EMBEDDINGS, + type=T.TOKEN_EMBEDDING, + get_default_model=SparkNLPElmo.get_default_model, + get_pretrained_model=SparkNLPElmo.get_pretrained_model, + pdf_extractor_methods={'default': default_word_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_word_embed_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.ELMO_EMBEDDINGS], + description='Word embeddings from ELMo (Embeddings from Language Models), a language model trained on the 1 Billion Word Benchmark.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.ELMO_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.ELMO_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), + A.LONGFORMER_EMBEDDINGS: partial(NluComponent, + name=A.LONGFORMER_EMBEDDINGS, + type=T.TOKEN_EMBEDDING, + get_default_model=Longformer.get_default_model, + get_pretrained_model=Longformer.get_pretrained_model, + pdf_extractor_methods={'default': default_word_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_word_embed_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.LONGFORMER_EMBEDDINGS], + description='Longformer is a transformer model for long documents. The Longformer model was presented in Longformer: The Long-Document Transformer by Iz Beltagy, Matthew E. Peters, Arman Cohan. longformer-base-4096 is a BERT-like model started from the RoBERTa checkpoint and pretrained for MLM on long documents. It supports sequences of length up to 4,096.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.LONGFORMER_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.LONGFORMER_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), - A.LONGFORMER_FOR_TOKEN_CLASSIFICATION: copy(NluComponent( - name=A.LONGFORMER_FOR_TOKEN_CLASSIFICATION, - type=T.TRANSFORMER_TOKEN_CLASSIFIER, - get_default_model=TokenLongFormer.get_default_model, - get_pretrained_model=TokenLongFormer.get_pretrained_model, - pdf_extractor_methods={'default': default_token_classifier_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.LONGFORMER_FOR_TOKEN_CLASSIFICATION], - description='LongformerForTokenClassification can load Longformer Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.LONGFORMER_FOR_TOKEN_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.LONGFORMER_FOR_TOKEN_CLASSIFICATION], - )), - A.MARIAN_TRANSFORMER: copy(NluComponent( - name=A.MARIAN_TRANSFORMER, - type=T.DOCUMENT_CLASSIFIER, - get_default_model=Marian.get_default_model, - get_pretrained_model=Marian.get_pretrained_model, - pdf_extractor_methods={'default': default_marian_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_marian_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.MARIAN_TRANSFORMER], - description='Marian is an efficient, free Neural Machine Translation framework written in pure C++ with minimal dependencies. It is mainly being developed by the Microsoft Translator team. Many academic (most notably the University of Edinburgh and in the past the Adam Mickiewicz University in Poznań) and commercial contributors help with its development. MarianTransformer uses the models trained by MarianNMT.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.MARIAN_TRANSFORMER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.MARIAN_TRANSFORMER], - )), - A.ROBERTA_EMBEDDINGS: copy(NluComponent( - name=A.ROBERTA_EMBEDDINGS, - type=T.TOKEN_EMBEDDING, - get_default_model=Roberta.get_default_model, - get_pretrained_model=Roberta.get_pretrained_model, - pdf_extractor_methods={'default': default_word_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_word_embed_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.ROBERTA_EMBEDDINGS], - description='The RoBERTa model was proposed in RoBERTa: A Robustly Optimized BERT Pretraining Approach by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google’s BERT model released in 2018.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.ROBERTA_EMBEDDINGS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.ROBERTA_EMBEDDINGS], - has_storage_ref=True, - is_storage_ref_producer=True, - )), + A.LONGFORMER_FOR_TOKEN_CLASSIFICATION: partial(NluComponent, + name=A.LONGFORMER_FOR_TOKEN_CLASSIFICATION, + type=T.TRANSFORMER_TOKEN_CLASSIFIER, + get_default_model=TokenLongFormer.get_default_model, + get_pretrained_model=TokenLongFormer.get_pretrained_model, + pdf_extractor_methods={ + 'default': default_token_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[ + A.LONGFORMER_FOR_TOKEN_CLASSIFICATION], + description='LongformerForTokenClassification can load Longformer Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.LONGFORMER_FOR_TOKEN_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.LONGFORMER_FOR_TOKEN_CLASSIFICATION], + ), + A.MARIAN_TRANSFORMER: partial(NluComponent, + name=A.MARIAN_TRANSFORMER, + type=T.DOCUMENT_CLASSIFIER, + get_default_model=Marian.get_default_model, + get_pretrained_model=Marian.get_pretrained_model, + pdf_extractor_methods={'default': default_marian_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_marian_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[A.MARIAN_TRANSFORMER], + description='Marian is an efficient, free Neural Machine Translation framework written in pure C++ with minimal dependencies. It is mainly being developed by the Microsoft Translator team. Many academic (most notably the University of Edinburgh and in the past the Adam Mickiewicz University in Poznań) and commercial contributors help with its development. MarianTransformer uses the models trained by MarianNMT.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.MARIAN_TRANSFORMER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.MARIAN_TRANSFORMER], + ), + A.ROBERTA_EMBEDDINGS: partial(NluComponent, + name=A.ROBERTA_EMBEDDINGS, + type=T.TOKEN_EMBEDDING, + get_default_model=Roberta.get_default_model, + get_pretrained_model=Roberta.get_pretrained_model, + pdf_extractor_methods={'default': default_word_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_word_embed_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.ROBERTA_EMBEDDINGS], + description='The RoBERTa model was proposed in RoBERTa: A Robustly Optimized BERT Pretraining Approach by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google’s BERT model released in 2018.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.ROBERTA_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.ROBERTA_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), - A.ROBERTA_FOR_TOKEN_CLASSIFICATION: copy(NluComponent( - name=A.ROBERTA_FOR_TOKEN_CLASSIFICATION, - type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, - get_default_model=TokenRoBerta.get_default_model, - get_pretrained_model=TokenRoBerta.get_pretrained_model, - pdf_extractor_methods={'default': default_token_classifier_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, - output_level=L.TOKEN, # Handled like NER model - node=NLP_FEATURE_NODES.nodes[A.ROBERTA_FOR_TOKEN_CLASSIFICATION], - description='RoBertaForTokenClassification can load RoBERTa Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.ROBERTA_FOR_TOKEN_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.ROBERTA_FOR_TOKEN_CLASSIFICATION], - )), - # A.ROBERTA_SENTENCE_EMBEDDINGS: copy(NluComponent( # TODO not integrated + A.ROBERTA_FOR_TOKEN_CLASSIFICATION: partial(NluComponent, + name=A.ROBERTA_FOR_TOKEN_CLASSIFICATION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=TokenRoBerta.get_default_model, + get_pretrained_model=TokenRoBerta.get_pretrained_model, + pdf_extractor_methods={'default': default_token_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, + output_level=L.TOKEN, # Handled like NER model + node=NLP_FEATURE_NODES.nodes[A.ROBERTA_FOR_TOKEN_CLASSIFICATION], + description='RoBertaForTokenClassification can load RoBERTa Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.ROBERTA_FOR_TOKEN_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.ROBERTA_FOR_TOKEN_CLASSIFICATION], + ), + # A.ROBERTA_SENTENCE_EMBEDDINGS: partial(NluComponent, # TODO not integrated # name=A.ROBERTA_SENTENCE_EMBEDDINGS, # type=T.DOCUMENT_EMBEDDING, # get_default_model=BertSentence.get_default_model, @@ -1215,331 +1355,362 @@ class ComponentUniverse: # # has_storage_ref=True, # is_is_storage_ref_producer=True, - # )), - A.T5_TRANSFORMER: copy(NluComponent( - # TODO task based construction, i.e. get_preconfigured_model - name=A.T5_TRANSFORMER, - type=T.DOCUMENT_CLASSIFIER, - get_default_model=T5.get_default_model, - get_pretrained_model=T5.get_pretrained_model, - pdf_extractor_methods={'default': default_T5_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_T5_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.T5_TRANSFORMER], - description='T5 reconsiders all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. The text-to-text framework is able to use the same model, loss function, and hyper-parameters on any NLP task, including machine translation, document summarization, question answering, and classification tasks (e.g., sentiment analysis). 
T5 can even apply to regression tasks by training it to predict the string representation of a number instead of the number itself.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.T5_TRANSFORMER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.T5_TRANSFORMER], - )), - A.UNIVERSAL_SENTENCE_ENCODER: copy(NluComponent( - name=A.UNIVERSAL_SENTENCE_ENCODER, - type=T.DOCUMENT_EMBEDDING, - get_default_model=SparkNLPUse.get_default_model, - get_pretrained_model=SparkNLPUse.get_pretrained_model, - pdf_extractor_methods={'default': default_sentence_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sent_embed_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING, - node=NLP_FEATURE_NODES.nodes[A.UNIVERSAL_SENTENCE_ENCODER], - description='The Universal Sentence Encoder encodes text into high dimensional vectors that can be used for text classification, semantic similarity, clustering and other natural language tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.UNIVERSAL_SENTENCE_ENCODER, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.UNIVERSAL_SENTENCE_ENCODER], - has_storage_ref=True, - is_storage_ref_producer=True, - )), + # ), + A.T5_TRANSFORMER: partial(NluComponent, + # TODO task based construction, i.e. get_preconfigured_model + name=A.T5_TRANSFORMER, + type=T.DOCUMENT_CLASSIFIER, + get_default_model=T5.get_default_model, + get_pretrained_model=T5.get_pretrained_model, + pdf_extractor_methods={'default': default_T5_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_T5_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[A.T5_TRANSFORMER], + description='T5 reconsiders all NLP tasks into a unified text-to-text-format where the input and output are always text strings, in contrast to BERT-style models that can only output either a class label or a span of the input. The text-to-text framework is able to use the same model, loss function, and hyper-parameters on any NLP task, including machine translation, document summarization, question answering, and classification tasks (e.g., sentiment analysis). 
T5 can even apply to regression tasks by training it to predict the string representation of a number instead of the number itself.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.T5_TRANSFORMER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.T5_TRANSFORMER], + ), + A.UNIVERSAL_SENTENCE_ENCODER: partial(NluComponent, + name=A.UNIVERSAL_SENTENCE_ENCODER, + type=T.DOCUMENT_EMBEDDING, + get_default_model=SparkNLPUse.get_default_model, + get_pretrained_model=SparkNLPUse.get_pretrained_model, + pdf_extractor_methods={'default': default_sentence_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sent_embed_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING, + node=NLP_FEATURE_NODES.nodes[A.UNIVERSAL_SENTENCE_ENCODER], + description='The Universal Sentence Encoder encodes text into high dimensional vectors that can be used for text classification, semantic similarity, clustering and other natural language tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.UNIVERSAL_SENTENCE_ENCODER, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.UNIVERSAL_SENTENCE_ENCODER], + has_storage_ref=True, + is_storage_ref_producer=True, + ), - A.XLM_ROBERTA_EMBEDDINGS: copy(NluComponent( - name=A.XLM_ROBERTA_EMBEDDINGS, - type=T.TOKEN_EMBEDDING, - get_default_model=XLM.get_default_model, - get_pretrained_model=XLM.get_pretrained_model, - pdf_extractor_methods={'default': default_word_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_word_embed_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.XLM_ROBERTA_EMBEDDINGS], - description='The XLM-RoBERTa model was proposed in Unsupervised Cross-lingual Representation Learning at Scale by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook’s RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.XLM_ROBERTA_EMBEDDINGS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.XLM_ROBERTA_EMBEDDINGS], - has_storage_ref=True, - is_storage_ref_producer=True, - )), + A.XLM_ROBERTA_EMBEDDINGS: partial(NluComponent, + name=A.XLM_ROBERTA_EMBEDDINGS, + type=T.TOKEN_EMBEDDING, + get_default_model=XLM.get_default_model, + get_pretrained_model=XLM.get_pretrained_model, + pdf_extractor_methods={'default': default_word_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_word_embed_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.XLM_ROBERTA_EMBEDDINGS], + description='The XLM-RoBERTa model was proposed in Unsupervised Cross-lingual Representation Learning at Scale by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook’s RoBERTa model released in 2019. 
It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.XLM_ROBERTA_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.XLM_ROBERTA_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), - A.XLM_ROBERTA_FOR_TOKEN_CLASSIFICATION: copy(NluComponent( - name=A.XLM_ROBERTA_FOR_TOKEN_CLASSIFICATION, - type=T.TRANSFORMER_TOKEN_CLASSIFIER, - get_default_model=TokenXlmRoBerta.get_default_model, - get_pretrained_model=TokenXlmRoBerta.get_pretrained_model, - pdf_extractor_methods={'default': default_token_classifier_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.XLM_ROBERTA_FOR_TOKEN_CLASSIFICATION], - description='XlmRoBertaForTokenClassification can load XLM-RoBERTa Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.XLM_ROBERTA_FOR_TOKEN_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.XLM_ROBERTA_FOR_TOKEN_CLASSIFICATION], - )), - A.XLM_ROBERTA_SENTENCE_EMBEDDINGS: copy(NluComponent( - name=A.XLM_ROBERTA_SENTENCE_EMBEDDINGS, - type=T.DOCUMENT_EMBEDDING, - get_default_model=Sentence_XLM.get_default_model, - get_pretrained_model=Sentence_XLM.get_pretrained_model, - pdf_extractor_methods={'default': default_sentence_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sent_embed_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING, - node=NLP_FEATURE_NODES.nodes[A.XLM_ROBERTA_SENTENCE_EMBEDDINGS], - description='Sentence-level embeddings using XLM-RoBERTa. The XLM-RoBERTa model was proposed in Unsupervised Cross-lingual Representation Learning at Scale by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook’s RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.XLM_ROBERTA_SENTENCE_EMBEDDINGS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.XLM_ROBERTA_SENTENCE_EMBEDDINGS], - has_storage_ref=True, - is_storage_ref_producer=True, - )), - A.XLNET_EMBEDDINGS: copy(NluComponent( - name=A.XLNET_EMBEDDINGS, - type=T.TOKEN_EMBEDDING, - get_default_model=SparkNLPXlnet.get_default_model, - get_pretrained_model=SparkNLPXlnet.get_pretrained_model, - pdf_extractor_methods={'default': default_word_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_word_embed_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.XLNET_EMBEDDINGS], - description='XLNet is a new unsupervised language representation learning method based on a novel generalized permutation language modeling objective. 
Additionally, XLNet employs Transformer-XL as the backbone model, exhibiting excellent performance for language tasks involving long context. Overall, XLNet achieves state-of-the-art (SOTA) results on various downstream language tasks including question answering, natural language inference, sentiment analysis, and document ranking.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.XLNET_EMBEDDINGS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.XLNET_EMBEDDINGS], - has_storage_ref=True, - is_storage_ref_producer=True, - )), - A.XLNET_FOR_TOKEN_CLASSIFICATION: copy(NluComponent( - name=A.XLNET_FOR_TOKEN_CLASSIFICATION, - type=T.TRANSFORMER_TOKEN_CLASSIFIER, - get_default_model=TokenXlnet.get_default_model, - get_pretrained_model=TokenXlnet.get_pretrained_model, - pdf_extractor_methods={'default': default_token_classifier_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.XLNET_FOR_TOKEN_CLASSIFICATION], - description='XlnetForTokenClassification can load XLNet Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.XLNET_FOR_TOKEN_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.XLNET_FOR_TOKEN_CLASSIFICATION], - )), + A.XLM_ROBERTA_FOR_TOKEN_CLASSIFICATION: partial(NluComponent, + name=A.XLM_ROBERTA_FOR_TOKEN_CLASSIFICATION, + type=T.TRANSFORMER_TOKEN_CLASSIFIER, + get_default_model=TokenXlmRoBerta.get_default_model, + get_pretrained_model=TokenXlmRoBerta.get_pretrained_model, + pdf_extractor_methods={ + 'default': default_token_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[ + A.XLM_ROBERTA_FOR_TOKEN_CLASSIFICATION], + description='XlmRoBertaForTokenClassification can load XLM-RoBERTa Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.XLM_ROBERTA_FOR_TOKEN_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.XLM_ROBERTA_FOR_TOKEN_CLASSIFICATION], + ), + A.XLM_ROBERTA_SENTENCE_EMBEDDINGS: partial(NluComponent, + name=A.XLM_ROBERTA_SENTENCE_EMBEDDINGS, + type=T.DOCUMENT_EMBEDDING, + get_default_model=Sentence_XLM.get_default_model, + get_pretrained_model=Sentence_XLM.get_pretrained_model, + pdf_extractor_methods={'default': default_sentence_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sent_embed_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING, + node=NLP_FEATURE_NODES.nodes[A.XLM_ROBERTA_SENTENCE_EMBEDDINGS], + description='Sentence-level embeddings using XLM-RoBERTa. 
The XLM-RoBERTa model was proposed in Unsupervised Cross-lingual Representation Learning at Scale by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook’s RoBERTa model released in 2019. It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.XLM_ROBERTA_SENTENCE_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.XLM_ROBERTA_SENTENCE_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), + A.XLNET_EMBEDDINGS: partial(NluComponent, + name=A.XLNET_EMBEDDINGS, + type=T.TOKEN_EMBEDDING, + get_default_model=SparkNLPXlnet.get_default_model, + get_pretrained_model=SparkNLPXlnet.get_pretrained_model, + pdf_extractor_methods={'default': default_word_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_word_embed_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.XLNET_EMBEDDINGS], + description='XLNet is a new unsupervised language representation learning method based on a novel generalized permutation language modeling objective. Additionally, XLNet employs Transformer-XL as the backbone model, exhibiting excellent performance for language tasks involving long context. Overall, XLNet achieves state-of-the-art (SOTA) results on various downstream language tasks including question answering, natural language inference, sentiment analysis, and document ranking.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.XLNET_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.XLNET_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), + A.XLNET_FOR_TOKEN_CLASSIFICATION: partial(NluComponent, + name=A.XLNET_FOR_TOKEN_CLASSIFICATION, + type=T.TRANSFORMER_TOKEN_CLASSIFIER, + get_default_model=TokenXlnet.get_default_model, + get_pretrained_model=TokenXlnet.get_pretrained_model, + pdf_extractor_methods={'default': default_token_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.XLNET_FOR_TOKEN_CLASSIFICATION], + description='XlnetForTokenClassification can load XLNet Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.XLNET_FOR_TOKEN_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.XLNET_FOR_TOKEN_CLASSIFICATION], + ), - A.XLM_ROBERTA_FOR_SEQUENCE_CLASSIFICATION: copy(NluComponent( - name=A.XLM_ROBERTA_FOR_SEQUENCE_CLASSIFICATION, - type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, - get_default_model=SeqXlmRobertaClassifier.get_default_model, - get_pretrained_model=SeqXlmRobertaClassifier.get_pretrained_model, - pdf_extractor_methods={'default': default_classifier_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.XLM_ROBERTA_FOR_SEQUENCE_CLASSIFICATION], - description='XlmRoBertaForSequenceClassification can load XLM-RoBERTa Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification task', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.XLM_ROBERTA_FOR_SEQUENCE_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.XLM_ROBERTA_FOR_SEQUENCE_CLASSIFICATION], - )), - A.ROBERTA_FOR_SEQUENCE_CLASSIFICATION: copy(NluComponent( - name=A.ROBERTA_FOR_SEQUENCE_CLASSIFICATION, - type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, - get_default_model=SeqRobertaClassifier.get_default_model, - get_pretrained_model=SeqRobertaClassifier.get_pretrained_model, - pdf_extractor_methods={'default': default_classifier_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.ROBERTA_FOR_SEQUENCE_CLASSIFICATION], - description='RoBertaForSequenceClassification can load RoBERTa Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.ROBERTA_FOR_SEQUENCE_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.ROBERTA_FOR_SEQUENCE_CLASSIFICATION], - )), + A.XLM_ROBERTA_FOR_SEQUENCE_CLASSIFICATION: partial(NluComponent, + name=A.XLM_ROBERTA_FOR_SEQUENCE_CLASSIFICATION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=SeqXlmRobertaClassifier.get_default_model, + get_pretrained_model=SeqXlmRobertaClassifier.get_pretrained_model, + pdf_extractor_methods={ + 'default': default_classifier_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[ + A.XLM_ROBERTA_FOR_SEQUENCE_CLASSIFICATION], + description='XlmRoBertaForSequenceClassification can load XLM-RoBERTa Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. 
for multi-class document classification task', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.XLM_ROBERTA_FOR_SEQUENCE_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.XLM_ROBERTA_FOR_SEQUENCE_CLASSIFICATION], + ), + A.ROBERTA_FOR_SEQUENCE_CLASSIFICATION: partial(NluComponent, + name=A.ROBERTA_FOR_SEQUENCE_CLASSIFICATION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=SeqRobertaClassifier.get_default_model, + get_pretrained_model=SeqRobertaClassifier.get_pretrained_model, + pdf_extractor_methods={'default': default_classifier_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[ + A.ROBERTA_FOR_SEQUENCE_CLASSIFICATION], + description='RoBertaForSequenceClassification can load RoBERTa Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.ROBERTA_FOR_SEQUENCE_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.ROBERTA_FOR_SEQUENCE_CLASSIFICATION], + ), - A.LONGFORMER_FOR_SEQUENCE_CLASSIFICATION: copy(NluComponent( - name=A.LONGFORMER_FOR_SEQUENCE_CLASSIFICATION, - type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, - get_default_model=SeqLongformerClassifier.get_default_model, - get_pretrained_model=SeqLongformerClassifier.get_pretrained_model, - pdf_extractor_methods={'default': default_classifier_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.LONGFORMER_FOR_SEQUENCE_CLASSIFICATION], - description='RoBertaForSequenceClassification can load RoBERTa Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.LONGFORMER_FOR_SEQUENCE_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.LONGFORMER_FOR_SEQUENCE_CLASSIFICATION], - )), - A.ALBERT_FOR_SEQUENCE_CLASSIFICATION: copy(NluComponent( - name=A.ALBERT_FOR_SEQUENCE_CLASSIFICATION, - type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, - get_default_model=SeqAlbertClassifier.get_default_model, - get_pretrained_model=SeqAlbertClassifier.get_pretrained_model, - pdf_extractor_methods={'default': default_classifier_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.ALBERT_FOR_SEQUENCE_CLASSIFICATION], - description='AlbertForSequenceClassification can load ALBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. 
for multi-class document classification tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.ALBERT_FOR_SEQUENCE_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.ALBERT_FOR_SEQUENCE_CLASSIFICATION], - )), + A.LONGFORMER_FOR_SEQUENCE_CLASSIFICATION: partial(NluComponent, + name=A.LONGFORMER_FOR_SEQUENCE_CLASSIFICATION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=SeqLongformerClassifier.get_default_model, + get_pretrained_model=SeqLongformerClassifier.get_pretrained_model, + pdf_extractor_methods={ + 'default': default_classifier_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[ + A.LONGFORMER_FOR_SEQUENCE_CLASSIFICATION], + description='RoBertaForSequenceClassification can load RoBERTa Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.LONGFORMER_FOR_SEQUENCE_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.LONGFORMER_FOR_SEQUENCE_CLASSIFICATION], + ), + A.ALBERT_FOR_SEQUENCE_CLASSIFICATION: partial(NluComponent, + name=A.ALBERT_FOR_SEQUENCE_CLASSIFICATION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=SeqAlbertClassifier.get_default_model, + get_pretrained_model=SeqAlbertClassifier.get_pretrained_model, + pdf_extractor_methods={'default': default_classifier_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[ + A.ALBERT_FOR_SEQUENCE_CLASSIFICATION], + description='AlbertForSequenceClassification can load ALBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.ALBERT_FOR_SEQUENCE_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.ALBERT_FOR_SEQUENCE_CLASSIFICATION], + ), - A.XLNET_FOR_SEQUENCE_CLASSIFICATION: copy(NluComponent( - name=A.XLNET_FOR_SEQUENCE_CLASSIFICATION, - type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, - get_default_model=SeqXlnetClassifier.get_default_model, - get_pretrained_model=SeqXlnetClassifier.get_pretrained_model, - pdf_extractor_methods={'default': default_classifier_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.XLNET_FOR_SEQUENCE_CLASSIFICATION], - description='AlbertForSequenceClassification can load ALBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. 
for multi-class document classification tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.XLNET_FOR_SEQUENCE_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.XLNET_FOR_SEQUENCE_CLASSIFICATION], - )), + A.XLNET_FOR_SEQUENCE_CLASSIFICATION: partial(NluComponent, + name=A.XLNET_FOR_SEQUENCE_CLASSIFICATION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=SeqXlnetClassifier.get_default_model, + get_pretrained_model=SeqXlnetClassifier.get_pretrained_model, + pdf_extractor_methods={'default': default_classifier_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[A.XLNET_FOR_SEQUENCE_CLASSIFICATION], + description='AlbertForSequenceClassification can load ALBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.XLNET_FOR_SEQUENCE_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno2_py_class[ + A.XLNET_FOR_SEQUENCE_CLASSIFICATION], + ), - A.GPT2: copy(NluComponent( - name=A.GPT2, - type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, - get_default_model=GPT2.get_default_model, - get_pretrained_model=GPT2.get_pretrained_model, - pdf_extractor_methods={'default': default_gpt2_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_gpt2_cols, # TIODO TESt - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, - node=NLP_FEATURE_NODES.nodes[A.GPT2], - description='AlbertForSequenceClassification can load ALBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.GPT2, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.GPT2], - )), + A.GPT2: partial(NluComponent, + name=A.GPT2, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=GPT2.get_default_model, + get_pretrained_model=GPT2.get_pretrained_model, + pdf_extractor_methods={'default': default_gpt2_config, 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_gpt2_cols, # TIODO TESt + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + node=NLP_FEATURE_NODES.nodes[A.GPT2], + description='AlbertForSequenceClassification can load ALBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. 
for multi-class document classification tasks.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.GPT2, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.GPT2], + ), - A.WORD_2_VEC: copy(NluComponent( # TOOD - name=A.WORD_2_VEC, - type=T.TOKEN_EMBEDDING, - get_default_model=Word2Vec.get_default_model, - get_pretrained_model=Word2Vec.get_pretrained_model, - get_trainable_model=Word2Vec.get_trainable_model, - pdf_extractor_methods={'default': default_word_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_word_embed_cols, # TODO? - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.WORD_2_VEC], - description='We use Word2Vec implemented in Spark ML. It uses skip-gram model in our implementation and a hierarchical softmax method to train the model. The variable names in the implementation match the original C implementation.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.WORD_2_VEC, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.WORD_2_VEC], - has_storage_ref=True, - is_storage_ref_producer=True, - )), + A.WORD_2_VEC: partial(NluComponent, # TOOD + name=A.WORD_2_VEC, + type=T.TOKEN_EMBEDDING, + get_default_model=Word2Vec.get_default_model, + get_pretrained_model=Word2Vec.get_pretrained_model, + get_trainable_model=Word2Vec.get_trainable_model, + pdf_extractor_methods={'default': default_word_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_word_embed_cols, # TODO? + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.WORD_2_VEC], + description='We use Word2Vec implemented in Spark ML. It uses skip-gram model in our implementation and a hierarchical softmax method to train the model. The variable names in the implementation match the original C implementation.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.WORD_2_VEC, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.WORD_2_VEC], + has_storage_ref=True, + is_storage_ref_producer=True, + ), - A.DEBERTA_WORD_EMBEDDINGS: copy(NluComponent( - name=A.DEBERTA_WORD_EMBEDDINGS, - type=T.TOKEN_EMBEDDING, - get_default_model=Deberta.get_default_model, - get_pretrained_model=Deberta.get_pretrained_model, - pdf_extractor_methods={'default': default_word_embedding_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_word_embed_cols, - output_level=L.TOKEN, - node=NLP_FEATURE_NODES.nodes[A.DEBERTA_WORD_EMBEDDINGS], - description='Token-level embeddings using DeBERTa. The DeBERTa model was proposed in DeBERTa: Decoding-enhanced BERT with Disentangled Attention by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. 
It is based on Google’s BERT model released in 2018 and Facebook’s RoBERTa model released in 2019.', - provider=ComponentBackends.open_source, - license=Licenses.open_source, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=A.DEBERTA_WORD_EMBEDDINGS, - jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DEBERTA_WORD_EMBEDDINGS], - has_storage_ref=True, - is_storage_ref_producer=True, - )), + A.DEBERTA_WORD_EMBEDDINGS: partial(NluComponent, + name=A.DEBERTA_WORD_EMBEDDINGS, + type=T.TOKEN_EMBEDDING, + get_default_model=Deberta.get_default_model, + get_pretrained_model=Deberta.get_pretrained_model, + pdf_extractor_methods={'default': default_word_embedding_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_word_embed_cols, + output_level=L.TOKEN, + node=NLP_FEATURE_NODES.nodes[A.DEBERTA_WORD_EMBEDDINGS], + description='Token-level embeddings using DeBERTa. The DeBERTa model was proposed in DeBERTa: Decoding-enhanced BERT with Disentangled Attention by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It is based on Google’s BERT model released in 2018 and Facebook’s RoBERTa model released in 2019.', + provider=ComponentBackends.open_source, + license=Licenses.open_source, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=A.DEBERTA_WORD_EMBEDDINGS, + jsl_anno_py_class=ACR.JSL_anno2_py_class[A.DEBERTA_WORD_EMBEDDINGS], + has_storage_ref=True, + is_storage_ref_producer=True, + ), + ######### HEALTHCARE ############## - } - hc_components = { - # TODO THIS SHOULD BE A SEPERATED CLASS which ONLY INSTATIATE when LICENSE VALIDATE!!!> - H_A.ASSERTION_DL: copy(NluComponent( - name=H_A.ASSERTION_DL, - type=T.CHUNK_CLASSIFIER, - get_default_model=AssertionDL.get_default_model, - get_pretrained_model=AssertionDL.get_pretrained_model, - get_trainable_model=AssertionDL.get_default_trainable_model, - pdf_extractor_methods={'default': default_assertion_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_assertion_cols, - output_level=L.CHUNK, - node=NLP_HC_FEATURE_NODES.nodes[H_A.ASSERTION_DL], - description='Deep Learning based Assertion model that maps NER-Chunks into a pre-defined terminology.', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.ASSERTION_DL, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.ASSERTION_DL], - has_storage_ref=True, - is_storage_ref_consumer=True, - trainable_mirror_anno=H_A.TRAINABLE_ASSERTION_DL - )), - H_A.TRAINABLE_ASSERTION_DL: copy(NluComponent( - name=H_A.TRAINABLE_ASSERTION_DL, - type=T.CHUNK_CLASSIFIER, - get_default_model=AssertionDL.get_default_model, - get_pretrained_model=AssertionDL.get_pretrained_model, - get_trainable_model=AssertionDL.get_default_trainable_model, - pdf_extractor_methods={'default': default_assertion_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_assertion_cols, - output_level=L.CHUNK, - node=NLP_HC_FEATURE_NODES.nodes[H_A.TRAINABLE_ASSERTION_DL], - description='Trainable Deep Learning based Assertion model that maps NER-Chunks into a pre-defined terminology.', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.TRAINABLE_ASSERTION_DL, - 
jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.TRAINABLE_ASSERTION_DL], - has_storage_ref=True, - is_storage_ref_consumer=True, - trainable=True, - trained_mirror_anno=H_A.ASSERTION_DL)), - # H_A.ASSERTION_FILTERER: copy(NluComponent( # TODO not integrated + H_A.ASSERTION_DL: partial(NluComponent, + name=H_A.ASSERTION_DL, + type=T.CHUNK_CLASSIFIER, + get_default_model=AssertionDL.get_default_model, + get_pretrained_model=AssertionDL.get_pretrained_model, + get_trainable_model=AssertionDL.get_default_trainable_model, + pdf_extractor_methods={'default': default_assertion_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_assertion_cols, + output_level=L.CHUNK, + node=NLP_HC_FEATURE_NODES.nodes[H_A.ASSERTION_DL], + description='Deep Learning based Assertion model that maps NER-Chunks into a pre-defined terminology.', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.ASSERTION_DL, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.ASSERTION_DL], + has_storage_ref=True, + is_storage_ref_consumer=True, + trainable_mirror_anno=H_A.TRAINABLE_ASSERTION_DL + ), + H_A.TRAINABLE_ASSERTION_DL: partial(NluComponent, + name=H_A.TRAINABLE_ASSERTION_DL, + type=T.CHUNK_CLASSIFIER, + get_default_model=AssertionDL.get_default_model, + get_pretrained_model=AssertionDL.get_pretrained_model, + get_trainable_model=AssertionDL.get_default_trainable_model, + pdf_extractor_methods={'default': default_assertion_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_assertion_cols, + output_level=L.CHUNK, + node=NLP_HC_FEATURE_NODES.nodes[H_A.TRAINABLE_ASSERTION_DL], + description='Trainable Deep Learning based Assertion model that maps NER-Chunks into a pre-defined terminology.', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.TRAINABLE_ASSERTION_DL, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[ + H_A.TRAINABLE_ASSERTION_DL], + has_storage_ref=True, + is_storage_ref_consumer=True, + trainable=True, + trained_mirror_anno=H_A.ASSERTION_DL), + # H_A.ASSERTION_FILTERER: partial(NluComponent, # TODO not integrated # name=H_A.ASSERTION_FILTERER, # type=T.CHUNK_FILTERER, # get_default_model=AssertionDL.get_default_model, @@ -1560,100 +1731,106 @@ class ComponentUniverse: # has_storage_ref=True, # is_is_storage_ref_consumer=True, # trainable=True, - # trained_mirror_anno=H_A.ASSERTION_FILTERER)), AssertionLogReg - H_A.ASSERTION_LOG_REG: copy(NluComponent( - name=H_A.ASSERTION_LOG_REG, - type=T.CHUNK_CLASSIFIER, - get_default_model=AssertionLogReg.get_default_model, - get_pretrained_model=AssertionLogReg.get_pretrained_model, - get_trainable_model=AssertionLogReg.get_default_trainable_model, - pdf_extractor_methods={'default': default_assertion_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_assertion_cols, - output_level=L.CHUNK, - node=NLP_HC_FEATURE_NODES.nodes[H_A.ASSERTION_LOG_REG], - description='Classical ML based Assertion model that maps NER-Chunks into a pre-defined terminology.', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.ASSERTION_LOG_REG, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.ASSERTION_LOG_REG], - 
trained_mirror_anno=H_A.TRAINABLE_ASSERTION_LOG_REG)), - H_A.TRAINABLE_ASSERTION_LOG_REG: copy(NluComponent( - name=H_A.TRAINABLE_ASSERTION_LOG_REG, - type=T.CHUNK_CLASSIFIER, - get_default_model=AssertionLogReg.get_default_model, - get_pretrained_model=AssertionLogReg.get_pretrained_model, - get_trainable_model=AssertionLogReg.get_default_trainable_model, - pdf_extractor_methods={'default': default_assertion_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_assertion_cols, - output_level=L.CHUNK, - node=NLP_HC_FEATURE_NODES.nodes[H_A.TRAINABLE_ASSERTION_LOG_REG], - description='Classical ML based Assertion model that maps NER-Chunks into a pre-defined terminology.', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.TRAINABLE_ASSERTION_LOG_REG, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.TRAINABLE_ASSERTION_LOG_REG], - trained_mirror_anno=H_A.ASSERTION_LOG_REG)), + # trained_mirror_anno=H_A.ASSERTION_FILTERER), AssertionLogReg + H_A.ASSERTION_LOG_REG: partial(NluComponent, + name=H_A.ASSERTION_LOG_REG, + type=T.CHUNK_CLASSIFIER, + get_default_model=AssertionLogReg.get_default_model, + get_pretrained_model=AssertionLogReg.get_pretrained_model, + get_trainable_model=AssertionLogReg.get_default_trainable_model, + pdf_extractor_methods={'default': default_assertion_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_assertion_cols, + output_level=L.CHUNK, + node=NLP_HC_FEATURE_NODES.nodes[H_A.ASSERTION_LOG_REG], + description='Classical ML based Assertion model that maps NER-Chunks into a pre-defined terminology.', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.ASSERTION_LOG_REG, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.ASSERTION_LOG_REG], + trained_mirror_anno=H_A.TRAINABLE_ASSERTION_LOG_REG), + H_A.TRAINABLE_ASSERTION_LOG_REG: partial(NluComponent, + name=H_A.TRAINABLE_ASSERTION_LOG_REG, + type=T.CHUNK_CLASSIFIER, + get_default_model=AssertionLogReg.get_default_model, + get_pretrained_model=AssertionLogReg.get_pretrained_model, + get_trainable_model=AssertionLogReg.get_default_trainable_model, + pdf_extractor_methods={'default': default_assertion_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_assertion_cols, + output_level=L.CHUNK, + node=NLP_HC_FEATURE_NODES.nodes[H_A.TRAINABLE_ASSERTION_LOG_REG], + description='Classical ML based Assertion model that maps NER-Chunks into a pre-defined terminology.', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.TRAINABLE_ASSERTION_LOG_REG, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[ + H_A.TRAINABLE_ASSERTION_LOG_REG], + trained_mirror_anno=H_A.ASSERTION_LOG_REG), H_A.CHUNK2TOKEN: 'TODO not integrated', H_A.CHUNK_ENTITY_RESOLVER: 'Deprecated', H_A.TRAINABLE_CHUNK_ENTITY_RESOLVER: 'Deprecated', H_A.CHUNK_FILTERER: 'TODO not integrated', H_A.CHUNK_KEY_PHRASE_EXTRACTION: 'TODO not integrated', H_A.CHUNK_MERGE: 'TODO not integrated', - H_A.CONTEXTUAL_PARSER: copy(NluComponent( - name=H_A.CONTEXTUAL_PARSER, - type=T.CHUNK_CLASSIFIER, - get_default_model=ContextualParser.get_default_model, - get_trainable_model=ContextualParser.get_trainable_model, - 
pdf_extractor_methods={'default': default_full_config, 'default_full': default_full_config, }, - # TODO extractr method - pdf_col_name_substitutor=substitute_context_parser_cols, - output_level=L.CHUNK, - node=NLP_HC_FEATURE_NODES.nodes[H_A.CONTEXTUAL_PARSER], - description='Rule based entity extractor.', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.CONTEXTUAL_PARSER, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.CONTEXTUAL_PARSER], )), - H_A.DE_IDENTIFICATION: copy(NluComponent( - name=H_A.DE_IDENTIFICATION, - type=T.CHUNK_CLASSIFIER, - get_default_model=Deidentifier.get_default_model, - get_pretrained_model=Deidentifier.get_pretrained_model, - pdf_extractor_methods={'default': default_de_identification_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_de_identification_cols, - output_level=L.DOCUMENT, - node=NLP_HC_FEATURE_NODES.nodes[H_A.DE_IDENTIFICATION], - description='De-Identify named entity according to various Healthcare Data Protection standards', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.DE_IDENTIFICATION, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.DE_IDENTIFICATION], )), + H_A.CONTEXTUAL_PARSER: partial(NluComponent, + name=H_A.CONTEXTUAL_PARSER, + type=T.CHUNK_CLASSIFIER, + get_default_model=ContextualParser.get_default_model, + get_trainable_model=ContextualParser.get_trainable_model, + pdf_extractor_methods={'default': default_full_config, + 'default_full': default_full_config, }, + # TODO extractr method + pdf_col_name_substitutor=substitute_context_parser_cols, + output_level=L.CHUNK, + node=NLP_HC_FEATURE_NODES.nodes[H_A.CONTEXTUAL_PARSER], + description='Rule based entity extractor.', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.CONTEXTUAL_PARSER, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.CONTEXTUAL_PARSER], ), + H_A.DE_IDENTIFICATION: partial(NluComponent, + name=H_A.DE_IDENTIFICATION, + type=T.CHUNK_CLASSIFIER, + get_default_model=Deidentifier.get_default_model, + get_pretrained_model=Deidentifier.get_pretrained_model, + pdf_extractor_methods={'default': default_de_identification_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_de_identification_cols, + output_level=L.DOCUMENT, + node=NLP_HC_FEATURE_NODES.nodes[H_A.DE_IDENTIFICATION], + description='De-Identify named entity according to various Healthcare Data Protection standards', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.DE_IDENTIFICATION, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.DE_IDENTIFICATION], ), H_A.DOCUMENT_LOG_REG_CLASSIFIER: 'TODO not integrated', H_A.TRAINABLE_DOCUMENT_LOG_REG_CLASSIFIER: 'TODO not integrated', - H_A.DRUG_NORMALIZER: copy(NluComponent( - name=H_A.DRUG_NORMALIZER, - type=T.CHUNK_CLASSIFIER, - get_default_model=DrugNorm.get_default_model, - pdf_extractor_methods={'default': default_only_result_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_drug_normalizer_cols, - output_level=L.DOCUMENT, - node=NLP_HC_FEATURE_NODES.nodes[H_A.DRUG_NORMALIZER], - 
description='Normalizes raw clinical and crawled text which contains drug names into cleaned and standardized representation', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.DRUG_NORMALIZER, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.DRUG_NORMALIZER], )), - # H_A.FEATURES_ASSEMBLER: copy(NluComponent( # TODO partially integrated. featire mpde ,ossomg + H_A.DRUG_NORMALIZER: partial(NluComponent, + name=H_A.DRUG_NORMALIZER, + type=T.CHUNK_CLASSIFIER, + get_default_model=DrugNorm.get_default_model, + pdf_extractor_methods={'default': default_only_result_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_drug_normalizer_cols, + output_level=L.DOCUMENT, + node=NLP_HC_FEATURE_NODES.nodes[H_A.DRUG_NORMALIZER], + description='Normalizes raw clinical and crawled text which contains drug names into cleaned and standardized representation', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.DRUG_NORMALIZER, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.DRUG_NORMALIZER], ), + # H_A.FEATURES_ASSEMBLER: partial(NluComponent, # TODO partially integrated. featire mpde ,ossomg # name=H_A.FEATURES_ASSEMBLER, # type=T.HELPER_ANNO, # get_default_model=SparkNLPFeatureAssembler.get_default_model, @@ -1669,181 +1846,189 @@ class ComponentUniverse: # jsl_anno_class_id_id=H_A.FEATURES_ASSEMBLER, # jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.FEATURES_ASSEMBLER], # - H_A.GENERIC_CLASSIFIER: copy(NluComponent( - name=H_A.GENERIC_CLASSIFIER, - type=T.DOCUMENT_CLASSIFIER, - get_default_model=GenericClassifier.get_default_model, - get_trainable_model=GenericClassifier.get_default_model, - get_pretrained_model=GenericClassifier.get_default_model, - pdf_extractor_methods={'default': default_generic_classifier_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_generic_classifier_parser_cols, - output_level=L.DOCUMENT, - node=NLP_HC_FEATURE_NODES.nodes[H_A.GENERIC_CLASSIFIER], - description='Generic Deep Learning based tensorflow classifier', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.GENERIC_CLASSIFIER, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.GENERIC_CLASSIFIER], - trainable_mirror_anno=H_A.TRAINABLE_GENERIC_CLASSIFIER - )), - H_A.TRAINABLE_GENERIC_CLASSIFIER: copy(NluComponent( - name=H_A.TRAINABLE_GENERIC_CLASSIFIER, - type=T.DOCUMENT_CLASSIFIER, - get_default_model=GenericClassifier.get_default_model, - get_trainable_model=GenericClassifier.get_default_model, - get_pretrained_model=GenericClassifier.get_default_model, - pdf_extractor_methods={'default': default_generic_classifier_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_generic_classifier_parser_cols, - output_level=L.DOCUMENT, - node=NLP_HC_FEATURE_NODES.nodes[H_A.GENERIC_CLASSIFIER], - description='Generic Deep Learning based tensorflow classifier', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.TRAINABLE_GENERIC_CLASSIFIER, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.TRAINABLE_GENERIC_CLASSIFIER], + H_A.GENERIC_CLASSIFIER: 
partial(NluComponent, + name=H_A.GENERIC_CLASSIFIER, + type=T.DOCUMENT_CLASSIFIER, + get_default_model=GenericClassifier.get_default_model, + get_trainable_model=GenericClassifier.get_default_model, + get_pretrained_model=GenericClassifier.get_default_model, + pdf_extractor_methods={'default': default_generic_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_generic_classifier_parser_cols, + output_level=L.DOCUMENT, + node=NLP_HC_FEATURE_NODES.nodes[H_A.GENERIC_CLASSIFIER], + description='Generic Deep Learning based tensorflow classifier', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.GENERIC_CLASSIFIER, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.GENERIC_CLASSIFIER], + trainable_mirror_anno=H_A.TRAINABLE_GENERIC_CLASSIFIER + ), + H_A.TRAINABLE_GENERIC_CLASSIFIER: partial(NluComponent, + name=H_A.TRAINABLE_GENERIC_CLASSIFIER, + type=T.DOCUMENT_CLASSIFIER, + get_default_model=GenericClassifier.get_default_model, + get_trainable_model=GenericClassifier.get_default_model, + get_pretrained_model=GenericClassifier.get_default_model, + pdf_extractor_methods={'default': default_generic_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_generic_classifier_parser_cols, + output_level=L.DOCUMENT, + node=NLP_HC_FEATURE_NODES.nodes[H_A.GENERIC_CLASSIFIER], + description='Generic Deep Learning based tensorflow classifier', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.TRAINABLE_GENERIC_CLASSIFIER, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[ + H_A.TRAINABLE_GENERIC_CLASSIFIER], - trained_mirror_anno=H_A.GENERIC_CLASSIFIER - )), + trained_mirror_anno=H_A.GENERIC_CLASSIFIER + ), H_A.IOB_TAGGER: 'TODO not integrated', - H_A.MEDICAL_NER: copy(NluComponent( - name=H_A.MEDICAL_NER, - type=T.CHUNK_CLASSIFIER, - get_default_model=NERDLHealthcare.get_default_model, - get_trainable_model=NERDLHealthcare.get_default_trainable_model, - get_pretrained_model=NERDLHealthcare.get_pretrained_model, - pdf_extractor_methods={'default': default_ner_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_ner_dl_cols, - output_level=L.CHUNK, - node=NLP_HC_FEATURE_NODES.nodes[H_A.MEDICAL_NER], - description='Deep Learning based Medical Named Entity Recognizer (NER)', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.MEDICAL_NER, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.MEDICAL_NER], - trainable_mirror_anno=H_A.TRAINABLE_MEDICAL_NER, - has_storage_ref=True, - is_storage_ref_consumer=True - )), - H_A.TRAINABLE_MEDICAL_NER: copy(NluComponent( - name=H_A.TRAINABLE_MEDICAL_NER, - type=T.CHUNK_CLASSIFIER, - get_default_model=NERDLHealthcare.get_default_model, - get_trainable_model=NERDLHealthcare.get_default_model, - get_pretrained_model=NERDLHealthcare.get_default_model, - pdf_extractor_methods={'default': default_ner_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_ner_dl_cols, - output_level=L.CHUNK, - node=NLP_HC_FEATURE_NODES.nodes[H_A.TRAINABLE_MEDICAL_NER], - description='Trainable Deep Learning based Medical Named Entity Recognizer (NER)', - provider=ComponentBackends.hc, - 
license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.TRAINABLE_MEDICAL_NER, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.TRAINABLE_MEDICAL_NER], - trained_mirror_anno=H_A.TRAINABLE_MEDICAL_NER, - has_storage_ref=True, - is_storage_ref_consumer=True - )), + H_A.MEDICAL_NER: partial(NluComponent, + name=H_A.MEDICAL_NER, + type=T.CHUNK_CLASSIFIER, + get_default_model=NERDLHealthcare.get_default_model, + get_trainable_model=NERDLHealthcare.get_default_trainable_model, + get_pretrained_model=NERDLHealthcare.get_pretrained_model, + pdf_extractor_methods={'default': default_ner_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_ner_dl_cols, + output_level=L.CHUNK, + node=NLP_HC_FEATURE_NODES.nodes[H_A.MEDICAL_NER], + description='Deep Learning based Medical Named Entity Recognizer (NER)', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.MEDICAL_NER, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.MEDICAL_NER], + trainable_mirror_anno=H_A.TRAINABLE_MEDICAL_NER, + has_storage_ref=True, + is_storage_ref_consumer=True + ), + H_A.TRAINABLE_MEDICAL_NER: partial(NluComponent, + name=H_A.TRAINABLE_MEDICAL_NER, + type=T.CHUNK_CLASSIFIER, + get_default_model=NERDLHealthcare.get_default_model, + get_trainable_model=NERDLHealthcare.get_default_model, + get_pretrained_model=NERDLHealthcare.get_default_model, + pdf_extractor_methods={'default': default_ner_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_ner_dl_cols, + output_level=L.CHUNK, + node=NLP_HC_FEATURE_NODES.nodes[H_A.TRAINABLE_MEDICAL_NER], + description='Trainable Deep Learning based Medical Named Entity Recognizer (NER)', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.TRAINABLE_MEDICAL_NER, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.TRAINABLE_MEDICAL_NER], + trained_mirror_anno=H_A.TRAINABLE_MEDICAL_NER, + has_storage_ref=True, + is_storage_ref_consumer=True + ), H_A.NER_CHUNKER: 'TODO not integrated', - H_A.NER_CONVERTER_INTERNAL: copy(NluComponent( - name=H_A.NER_CONVERTER_INTERNAL, - type=T.HELPER_ANNO, - get_default_model=NerToChunkConverterLicensed.get_default_model, - pdf_extractor_methods={'default': default_NER_converter_licensed_config, - 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_ner_internal_converter_cols, - output_level=L.CHUNK, - node=NLP_HC_FEATURE_NODES.nodes[H_A.NER_CONVERTER_INTERNAL], - description='Convert NER-IOB tokens into concatenated strings (aka chunks)', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.NER_CONVERTER_INTERNAL, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.NER_CONVERTER_INTERNAL], + H_A.NER_CONVERTER_INTERNAL: partial(NluComponent, + name=H_A.NER_CONVERTER_INTERNAL, + type=T.HELPER_ANNO, + get_default_model=NerToChunkConverterLicensed.get_default_model, + pdf_extractor_methods={'default': default_NER_converter_licensed_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_ner_internal_converter_cols, + output_level=L.CHUNK, + node=NLP_HC_FEATURE_NODES.nodes[H_A.NER_CONVERTER_INTERNAL], + description='Convert 
NER-IOB tokens into concatenated strings (aka chunks)', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.NER_CONVERTER_INTERNAL, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[ + H_A.NER_CONVERTER_INTERNAL], - )), + ), H_A.NER_DISAMBIGUATOR: 'TODO not integrated', H_A.RELATION_NER_CHUNKS_FILTERER: 'TODO not integrated', H_A.RE_IDENTIFICATION: 'TODO not integrated', - H_A.RELATION_EXTRACTION: copy(NluComponent( - name=H_A.RELATION_EXTRACTION, - type=T.RELATION_CLASSIFIER, - get_default_model=RelationExtraction.get_default_model, - get_pretrained_model=RelationExtraction.get_pretrained_model, - get_trainable_model=RelationExtraction.get_default_trainable_model, - pdf_extractor_methods={'default': default_relation_extraction_config, - 'positional': default_relation_extraction_positional_config, - 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_relation_cols, - output_level=L.RELATION, - node=NLP_HC_FEATURE_NODES.nodes[H_A.RELATION_EXTRACTION], - description='Classical ML model for predicting relation ship between entity pairs', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.RELATION_EXTRACTION, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.RELATION_EXTRACTION], - trainable_mirror_anno=H_A.TRAINABLE_RELATION_EXTRACTION, - has_storage_ref=True, - is_storage_ref_consumer=True + H_A.RELATION_EXTRACTION: partial(NluComponent, + name=H_A.RELATION_EXTRACTION, + type=T.RELATION_CLASSIFIER, + get_default_model=RelationExtraction.get_default_model, + get_pretrained_model=RelationExtraction.get_pretrained_model, + get_trainable_model=RelationExtraction.get_default_trainable_model, + pdf_extractor_methods={'default': default_relation_extraction_config, + 'positional': default_relation_extraction_positional_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_relation_cols, + output_level=L.RELATION, + node=NLP_HC_FEATURE_NODES.nodes[H_A.RELATION_EXTRACTION], + description='Classical ML model for predicting relation ship between entity pairs', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.RELATION_EXTRACTION, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.RELATION_EXTRACTION], + trainable_mirror_anno=H_A.TRAINABLE_RELATION_EXTRACTION, + has_storage_ref=True, + is_storage_ref_consumer=True - )), - H_A.TRAINABLE_RELATION_EXTRACTION: copy(NluComponent( - name=H_A.TRAINABLE_RELATION_EXTRACTION, - type=T.RELATION_CLASSIFIER, - get_default_model=RelationExtraction.get_default_model, - get_pretrained_model=RelationExtraction.get_pretrained_model, - get_trainable_model=RelationExtraction.get_default_trainable_model, - pdf_extractor_methods={'default': default_relation_extraction_config, - 'positional': default_relation_extraction_positional_config, - 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_relation_cols, - output_level=L.RELATION, - node=NLP_HC_FEATURE_NODES.nodes[H_A.TRAINABLE_RELATION_EXTRACTION], - description='Trainable Classical ML model for predicting relation ship between entity pairs', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - 
jsl_anno_class_id=H_A.TRAINABLE_RELATION_EXTRACTION, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.TRAINABLE_RELATION_EXTRACTION], - trained_mirror_anno=H_A.RELATION_EXTRACTION, - trainable=True, - has_storage_ref=True, - is_storage_ref_consumer=True - )), - H_A.RELATION_EXTRACTION_DL: copy(NluComponent( - name=H_A.RELATION_EXTRACTION_DL, - type=T.RELATION_CLASSIFIER, - get_default_model=RelationExtractionDL.get_default_model, - get_pretrained_model=RelationExtractionDL.get_pretrained_model, - # get_trainable_model=RelationExtractionDL.get_default_trainable_model, - pdf_extractor_methods={'default': default_relation_extraction_config, - 'positional': default_relation_extraction_positional_config, - 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_relation_cols, - output_level=L.RELATION, - node=NLP_HC_FEATURE_NODES.nodes[H_A.RELATION_EXTRACTION_DL], - description='Deep Learning based model for predicting relation ship between entity pairs', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.RELATION_EXTRACTION_DL, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.RELATION_EXTRACTION_DL], + ), + H_A.TRAINABLE_RELATION_EXTRACTION: partial(NluComponent, + name=H_A.TRAINABLE_RELATION_EXTRACTION, + type=T.RELATION_CLASSIFIER, + get_default_model=RelationExtraction.get_default_model, + get_pretrained_model=RelationExtraction.get_pretrained_model, + get_trainable_model=RelationExtraction.get_default_trainable_model, + pdf_extractor_methods={'default': default_relation_extraction_config, + 'positional': default_relation_extraction_positional_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_relation_cols, + output_level=L.RELATION, + node=NLP_HC_FEATURE_NODES.nodes[H_A.TRAINABLE_RELATION_EXTRACTION], + description='Trainable Classical ML model for predicting relation ship between entity pairs', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.TRAINABLE_RELATION_EXTRACTION, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[ + H_A.TRAINABLE_RELATION_EXTRACTION], + trained_mirror_anno=H_A.RELATION_EXTRACTION, + trainable=True, + has_storage_ref=True, + is_storage_ref_consumer=True + ), + H_A.RELATION_EXTRACTION_DL: partial(NluComponent, + name=H_A.RELATION_EXTRACTION_DL, + type=T.RELATION_CLASSIFIER, + get_default_model=RelationExtractionDL.get_default_model, + get_pretrained_model=RelationExtractionDL.get_pretrained_model, + # get_trainable_model=RelationExtractionDL.get_default_trainable_model, + pdf_extractor_methods={'default': default_relation_extraction_config, + 'positional': default_relation_extraction_positional_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_relation_cols, + output_level=L.RELATION, + node=NLP_HC_FEATURE_NODES.nodes[H_A.RELATION_EXTRACTION_DL], + description='Deep Learning based model for predicting relation ship between entity pairs', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.RELATION_EXTRACTION_DL, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[ + H_A.RELATION_EXTRACTION_DL], - # trainable_mirror_anno=H_A.TRAINABLE_RELATION_EXTRACTION_DL - )), - # H_A.TRAINABLE_RELATION_EXTRACTION_DL: 
copy(NluComponent( # DOES NOT EXIST! + # trainable_mirror_anno=H_A.TRAINABLE_RELATION_EXTRACTION_DL + ), + # H_A.TRAINABLE_RELATION_EXTRACTION_DL: partial(NluComponent, # DOES NOT EXIST! # name=H_A.TRAINABLE_RELATION_EXTRACTION_DL, # type=T.RELATION_CLASSIFIER, # get_default_model=RelationExtractionDL.get_default_model, @@ -1862,244 +2047,253 @@ class ComponentUniverse: # # trained_mirror_anno=H_A.RELATION_EXTRACTION_DL, # trainable=True - # )), - H_A.SENTENCE_ENTITY_RESOLVER: copy(NluComponent( - name=H_A.SENTENCE_ENTITY_RESOLVER, - type=T.CHUNK_CLASSIFIER, - get_pretrained_model=SentenceResolver.get_pretrained_model, - get_trainable_model=SentenceResolver.get_default_trainable_model, - pdf_extractor_methods={'default': default_chunk_resolution_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sentence_resolution_cols, - output_level=L.RELATION, - node=NLP_HC_FEATURE_NODES.nodes[H_A.SENTENCE_ENTITY_RESOLVER], - description='Deep Learning based entity resolver which extracts resolved entities directly from Sentence Embedding. No NER model required.', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.SENTENCE_ENTITY_RESOLVER, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.SENTENCE_ENTITY_RESOLVER], - - trained_mirror_anno=H_A.TRAINABLE_SENTENCE_ENTITY_RESOLVER, - is_storage_ref_consumer=True, - has_storage_ref=True - )), - H_A.TRAINABLE_SENTENCE_ENTITY_RESOLVER: copy(NluComponent( - name=H_A.TRAINABLE_SENTENCE_ENTITY_RESOLVER, - type=T.CHUNK_CLASSIFIER, - get_pretrained_model=SentenceResolver.get_pretrained_model, - get_trainable_model=SentenceResolver.get_default_trainable_model, - pdf_extractor_methods={'default': default_chunk_resolution_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_sentence_resolution_cols, - output_level=L.RELATION, - node=NLP_HC_FEATURE_NODES.nodes[H_A.TRAINABLE_SENTENCE_ENTITY_RESOLVER], - description='Trainable Deep Learning based entity resolver which extracts resolved entities directly from Sentence Embedding. No NER model required.', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.TRAINABLE_SENTENCE_ENTITY_RESOLVER, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.TRAINABLE_SENTENCE_ENTITY_RESOLVER], - trained_mirror_anno=H_A.SENTENCE_ENTITY_RESOLVER, - is_storage_ref_consumer=True, - trainable=True, - has_storage_ref=True - )), - H_A.MEDICAL_BERT_FOR_TOKEN_CLASSIFICATION: copy(NluComponent( - name=H_A.MEDICAL_BERT_FOR_TOKEN_CLASSIFICATION, - type=T.TRANSFORMER_TOKEN_CLASSIFIER, - get_default_model=TokenBertHealthcare.get_default_model, - get_pretrained_model=TokenBertHealthcare.get_pretrained_model, - pdf_extractor_methods={'default': default_token_classifier_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, - output_level=L.TOKEN, # Handled like NER model - node=NLP_HC_FEATURE_NODES.nodes[H_A.MEDICAL_BERT_FOR_TOKEN_CLASSIFICATION], - description='MedicalBertForTokenClassification can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks.', - provider=ComponentBackends.open_source, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.MEDICAL_BERT_FOR_TOKEN_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.MEDICAL_BERT_FOR_TOKEN_CLASSIFICATION], - - )), - - H_A.MEDICAL_BERT_FOR_SEQUENCE_CLASSIFICATION: copy(NluComponent( - name=H_A.MEDICAL_BERT_FOR_SEQUENCE_CLASSIFICATION, - type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, - get_default_model=SeqBertMedicalClassifier.get_default_model, - get_pretrained_model=SeqBertMedicalClassifier.get_pretrained_model, - pdf_extractor_methods={'default': default_classifier_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, # Handled like NER model - node=NLP_HC_FEATURE_NODES.nodes[H_A.MEDICAL_BERT_FOR_SEQUENCE_CLASSIFICATION], - description='Custom Architecture John Snow labs developed, called MedicalBertForSequenceClassification. It can load BERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.MEDICAL_BERT_FOR_SEQUENCE_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.MEDICAL_BERT_FOR_SEQUENCE_CLASSIFICATION], - )), - - H_A.MEDICAL_DISTILBERT_FOR_SEQUENCE_CLASSIFICATION: copy(NluComponent( - name=H_A.MEDICAL_DISTILBERT_FOR_SEQUENCE_CLASSIFICATION, - type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, - get_default_model=SeqDilstilBertMedicalClassifier.get_default_model, - get_pretrained_model=SeqDilstilBertMedicalClassifier.get_pretrained_model, - pdf_extractor_methods={'default': default_classifier_dl_config, 'default_full': default_full_config, }, - pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, - output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, # Handled like NER model - node=NLP_HC_FEATURE_NODES.nodes[H_A.MEDICAL_DISTILBERT_FOR_SEQUENCE_CLASSIFICATION], - description='Custom Architecture John Snow labs developed, called MedicalDistilBertForSequenceClassification. It can load DistilBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.', - provider=ComponentBackends.hc, - license=Licenses.hc, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=H_A.MEDICAL_DISTILBERT_FOR_SEQUENCE_CLASSIFICATION, - jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[H_A.MEDICAL_DISTILBERT_FOR_SEQUENCE_CLASSIFICATION], - )) - - # MEDICAL_BERT_FOR_TOKEN_CLASSIFICATION fTOK - } - ocr_components = { - O_A.IMAGE2TEXT: copy(NluComponent( - name=O_A.IMAGE2TEXT, - type=T.TEXT_RECOGNIZER, - get_default_model=Img2Text.get_default_model, - pdf_extractor_methods={'default': default_text_recognizer_config}, - pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor - output_level=L.DOCUMENT, # TODO new output level IMG? Or treat as DOC? 
- node=OCR_FEATURE_NODES.nodes[O_A.IMAGE2TEXT], - description='Recognize text from image files', - provider=ComponentBackends.ocr, - license=Licenses.ocr, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=O_A.IMAGE2TEXT, - jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.IMAGE2TEXT], - - applicable_file_types=['JPEG', 'PNG', 'BMP', 'WBMP', 'GIF', 'JPG', '.TIFF'] - )), - - O_A.DOC2TEXT: copy(NluComponent( - name=O_A.DOC2TEXT, - type=T.TEXT_RECOGNIZER, - get_default_model=Doc2Text.get_default_model, - pdf_extractor_methods={'default': default_text_recognizer_config}, - pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor - output_level=L.DOCUMENT, # TODO new output level IMG? Or treat as DOC? - node=OCR_FEATURE_NODES.nodes[O_A.DOC2TEXT], - description='Recognize text from DOC/DOCX files', - provider=ComponentBackends.ocr, - license=Licenses.ocr, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=O_A.DOC2TEXT, - jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.DOC2TEXT], - - applicable_file_types=['DOC', 'DOCX'] - )), - - O_A.PDF2TEXT: copy(NluComponent( - name=O_A.PDF2TEXT, - type=T.TEXT_RECOGNIZER, - get_default_model=Pdf2Text.get_default_model, - pdf_extractor_methods={'default': default_text_recognizer_config}, - pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor - output_level=L.DOCUMENT, # TODO new output level IMG? Or treat as DOC? - node=OCR_FEATURE_NODES.nodes[O_A.PDF2TEXT], - description='Recognize text from PDF files', - provider=ComponentBackends.ocr, - license=Licenses.ocr, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=O_A.PDF2TEXT, - jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.PDF2TEXT], - applicable_file_types=['PDF'] - )), + # ), + H_A.SENTENCE_ENTITY_RESOLVER: partial(NluComponent, + name=H_A.SENTENCE_ENTITY_RESOLVER, + type=T.CHUNK_CLASSIFIER, + get_pretrained_model=SentenceResolver.get_pretrained_model, + get_trainable_model=SentenceResolver.get_default_trainable_model, + pdf_extractor_methods={'default': default_chunk_resolution_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sentence_resolution_cols, + output_level=L.RELATION, + node=NLP_HC_FEATURE_NODES.nodes[H_A.SENTENCE_ENTITY_RESOLVER], + description='Deep Learning based entity resolver which extracts resolved entities directly from Sentence Embedding. No NER model required.', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.SENTENCE_ENTITY_RESOLVER, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[ + H_A.SENTENCE_ENTITY_RESOLVER], - O_A.BINARY2IMAGE: copy(NluComponent( - name=O_A.BINARY2IMAGE, - type=T.HELPER_ANNO, - get_default_model=Binary2Image.get_default_model, - pdf_extractor_methods={'default': default_binary_to_image_config}, - pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor - output_level=L.DOCUMENT, # TODO new output level IMG? Or treat as DOC? 
- node=OCR_FEATURE_NODES.nodes[O_A.BINARY2IMAGE], - description='Convert binary image data to OCR image Spark struct representation', - provider=ComponentBackends.ocr, - license=Licenses.ocr, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=O_A.BINARY2IMAGE, - jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.BINARY2IMAGE], - applicable_file_types=['JPEG', 'PNG', 'BMP', 'WBMP', 'GIF', 'JPG', 'TIFF'] + trained_mirror_anno=H_A.TRAINABLE_SENTENCE_ENTITY_RESOLVER, + is_storage_ref_consumer=True, + has_storage_ref=True + ), + H_A.TRAINABLE_SENTENCE_ENTITY_RESOLVER: partial(NluComponent, + name=H_A.TRAINABLE_SENTENCE_ENTITY_RESOLVER, + type=T.CHUNK_CLASSIFIER, + get_pretrained_model=SentenceResolver.get_pretrained_model, + get_trainable_model=SentenceResolver.get_default_trainable_model, + pdf_extractor_methods={ + 'default': default_chunk_resolution_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_sentence_resolution_cols, + output_level=L.RELATION, + node=NLP_HC_FEATURE_NODES.nodes[ + H_A.TRAINABLE_SENTENCE_ENTITY_RESOLVER], + description='Trainable Deep Learning based entity resolver which extracts resolved entities directly from Sentence Embedding. No NER model required.', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.TRAINABLE_SENTENCE_ENTITY_RESOLVER, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[ + H_A.TRAINABLE_SENTENCE_ENTITY_RESOLVER], + trained_mirror_anno=H_A.SENTENCE_ENTITY_RESOLVER, + is_storage_ref_consumer=True, + trainable=True, + has_storage_ref=True + ), + H_A.MEDICAL_BERT_FOR_TOKEN_CLASSIFICATION: partial(NluComponent, + name=H_A.MEDICAL_BERT_FOR_TOKEN_CLASSIFICATION, + type=T.TRANSFORMER_TOKEN_CLASSIFIER, + get_default_model=TokenBertHealthcare.get_default_model, + get_pretrained_model=TokenBertHealthcare.get_pretrained_model, + pdf_extractor_methods={ + 'default': default_token_classifier_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_transformer_token_classifier_cols, + output_level=L.TOKEN, # Handled like NER model + node=NLP_HC_FEATURE_NODES.nodes[ + H_A.MEDICAL_BERT_FOR_TOKEN_CLASSIFICATION], + description='MedicalBertForTokenClassification can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. 
for Named-Entity-Recognition (NER) tasks.', + provider=ComponentBackends.open_source, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.MEDICAL_BERT_FOR_TOKEN_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[ + H_A.MEDICAL_BERT_FOR_TOKEN_CLASSIFICATION], - )), + ), - O_A.PDF2TEXT_TABLE: copy(NluComponent( - name=O_A.PDF2TEXT_TABLE, - type=T.TABLE_RECOGNIZER, - get_default_model=PDF2TextTable.get_default_model, - pdf_extractor_methods={'default': default_binary_to_image_config}, # TODO EXtractor - pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor - output_level=L.DOCUMENT, - node=OCR_FEATURE_NODES.nodes[O_A.PDF2TEXT_TABLE], - description='Extract Tables from PDFs with have highlightable text', - provider=ComponentBackends.ocr, - license=Licenses.ocr, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=O_A.PDF2TEXT_TABLE, - jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.PDF2TEXT_TABLE], - applicable_file_types=['PDF'] + H_A.MEDICAL_BERT_FOR_SEQUENCE_CLASSIFICATION: partial(NluComponent, + name=H_A.MEDICAL_BERT_FOR_SEQUENCE_CLASSIFICATION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=SeqBertMedicalClassifier.get_default_model, + get_pretrained_model=SeqBertMedicalClassifier.get_pretrained_model, + pdf_extractor_methods={ + 'default': default_classifier_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + # Handled like NER model + node=NLP_HC_FEATURE_NODES.nodes[ + H_A.MEDICAL_BERT_FOR_SEQUENCE_CLASSIFICATION], + description='Custom Architecture John Snow labs developed, called MedicalBertForSequenceClassification. It can load BERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.MEDICAL_BERT_FOR_SEQUENCE_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[ + H_A.MEDICAL_BERT_FOR_SEQUENCE_CLASSIFICATION], + ), - )), + H_A.MEDICAL_DISTILBERT_FOR_SEQUENCE_CLASSIFICATION: partial(NluComponent, + name=H_A.MEDICAL_DISTILBERT_FOR_SEQUENCE_CLASSIFICATION, + type=T.TRANSFORMER_SEQUENCE_CLASSIFIER, + get_default_model=SeqDilstilBertMedicalClassifier.get_default_model, + get_pretrained_model=SeqDilstilBertMedicalClassifier.get_pretrained_model, + pdf_extractor_methods={ + 'default': default_classifier_dl_config, + 'default_full': default_full_config, }, + pdf_col_name_substitutor=substitute_seq_bert_classifier_cols, + output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER, + # Handled like NER model + node=NLP_HC_FEATURE_NODES.nodes[ + H_A.MEDICAL_DISTILBERT_FOR_SEQUENCE_CLASSIFICATION], + description='Custom Architecture John Snow labs developed, called MedicalDistilBertForSequenceClassification. It can load DistilBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. 
for multi-class document classification tasks.', + provider=ComponentBackends.hc, + license=Licenses.hc, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=H_A.MEDICAL_DISTILBERT_FOR_SEQUENCE_CLASSIFICATION, + jsl_anno_py_class=ACR.JSL_anno_HC_ref_2_py_class[ + H_A.MEDICAL_DISTILBERT_FOR_SEQUENCE_CLASSIFICATION], + ), + ######### OCR ############## + O_A.IMAGE2TEXT: partial(NluComponent, + name=O_A.IMAGE2TEXT, + type=T.TEXT_RECOGNIZER, + get_default_model=Img2Text.get_default_model, + pdf_extractor_methods={'default': default_text_recognizer_config}, + pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor + output_level=L.DOCUMENT, # TODO new output level IMG? Or treat as DOC? + node=OCR_FEATURE_NODES.nodes[O_A.IMAGE2TEXT], + description='Recognize text from image files', + provider=ComponentBackends.ocr, + license=Licenses.ocr, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=O_A.IMAGE2TEXT, + jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.IMAGE2TEXT], - O_A.PPT2TEXT_TABLE: copy(NluComponent( - name=O_A.PPT2TEXT_TABLE, - type=T.TABLE_RECOGNIZER, - get_default_model=PPT2TextTable.get_default_model, - pdf_extractor_methods={'default': default_binary_to_image_config}, # TODO EXtractor - pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor - output_level=L.DOCUMENT, - node=OCR_FEATURE_NODES.nodes[O_A.PPT2TEXT_TABLE], - description='Extract Tables from PPT and PPTX files', - provider=ComponentBackends.ocr, - license=Licenses.ocr, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=O_A.PPT2TEXT_TABLE, - jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.PPT2TEXT_TABLE], - applicable_file_types=['PPT','PPTX'] - )), + applicable_file_types=['JPEG', 'PNG', 'BMP', 'WBMP', 'GIF', 'JPG', 'TIFF'] + ), + O_A.DOC2TEXT: partial(NluComponent, + name=O_A.DOC2TEXT, + type=T.TEXT_RECOGNIZER, + get_default_model=Doc2Text.get_default_model, + pdf_extractor_methods={'default': default_text_recognizer_config}, + pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor + output_level=L.DOCUMENT, # TODO new output level IMG? Or treat as DOC?
+ node=OCR_FEATURE_NODES.nodes[O_A.DOC2TEXT], + description='Recognize text from DOC/DOCX files', + provider=ComponentBackends.ocr, + license=Licenses.ocr, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=O_A.DOC2TEXT, + jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.DOC2TEXT], + applicable_file_types=['DOC', 'DOCX'] + ), - O_A.DOC2TEXT_TABLE: copy(NluComponent( - name=O_A.DOC2TEXT_TABLE, - type=T.TABLE_RECOGNIZER, - get_default_model=Doc2TextTable.get_default_model, - pdf_extractor_methods={'default': default_binary_to_image_config}, # TODO EXtractor - pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor - output_level=L.DOCUMENT, - node=OCR_FEATURE_NODES.nodes[O_A.DOC2TEXT_TABLE], - description='Extract Tables from PPT and PPTX files', - provider=ComponentBackends.ocr, - license=Licenses.ocr, - computation_context=ComputeContexts.spark, - output_context=ComputeContexts.spark, - jsl_anno_class_id=O_A.DOC2TEXT_TABLE, - jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.DOC2TEXT_TABLE], - applicable_file_types=['DOCX','DOC'] - )), + O_A.PDF2TEXT: partial(NluComponent, + name=O_A.PDF2TEXT, + type=T.TEXT_RECOGNIZER, + get_default_model=Pdf2Text.get_default_model, + pdf_extractor_methods={'default': default_text_recognizer_config}, + pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor + output_level=L.DOCUMENT, # TODO new output level IMG? Or treat as DOC? + node=OCR_FEATURE_NODES.nodes[O_A.PDF2TEXT], + description='Recognize text from PDF files', + provider=ComponentBackends.ocr, + license=Licenses.ocr, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=O_A.PDF2TEXT, + jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.PDF2TEXT], + applicable_file_types=['PDF'] + ), + O_A.BINARY2IMAGE: partial(NluComponent, + name=O_A.BINARY2IMAGE, + type=T.HELPER_ANNO, + get_default_model=Binary2Image.get_default_model, + pdf_extractor_methods={'default': default_binary_to_image_config}, + pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor + output_level=L.DOCUMENT, # TODO new output level IMG? Or treat as DOC? 
+ node=OCR_FEATURE_NODES.nodes[O_A.BINARY2IMAGE], + description='Convert binary image data to OCR image Spark struct representation', + provider=ComponentBackends.ocr, + license=Licenses.ocr, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=O_A.BINARY2IMAGE, + jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.BINARY2IMAGE], + applicable_file_types=['JPEG', 'PNG', 'BMP', 'WBMP', 'GIF', 'JPG', 'TIFF'] + ), + O_A.PDF2TEXT_TABLE: partial(NluComponent, + name=O_A.PDF2TEXT_TABLE, + type=T.TABLE_RECOGNIZER, + get_default_model=PDF2TextTable.get_default_model, + pdf_extractor_methods={'default': default_binary_to_image_config}, # TODO EXtractor + pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor + output_level=L.DOCUMENT, + node=OCR_FEATURE_NODES.nodes[O_A.PDF2TEXT_TABLE], + description='Extract Tables from PDFs which have highlightable text', + provider=ComponentBackends.ocr, + license=Licenses.ocr, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=O_A.PDF2TEXT_TABLE, + jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.PDF2TEXT_TABLE], + applicable_file_types=['PDF'] + ), + O_A.PPT2TEXT_TABLE: partial(NluComponent, + name=O_A.PPT2TEXT_TABLE, + type=T.TABLE_RECOGNIZER, + get_default_model=PPT2TextTable.get_default_model, + pdf_extractor_methods={'default': default_binary_to_image_config}, # TODO EXtractor + pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor + output_level=L.DOCUMENT, + node=OCR_FEATURE_NODES.nodes[O_A.PPT2TEXT_TABLE], + description='Extract Tables from PPT and PPTX files', + provider=ComponentBackends.ocr, + license=Licenses.ocr, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=O_A.PPT2TEXT_TABLE, + jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.PPT2TEXT_TABLE], + applicable_file_types=['PPT', 'PPTX'] + ), + O_A.DOC2TEXT_TABLE: partial(NluComponent, + name=O_A.DOC2TEXT_TABLE, + type=T.TABLE_RECOGNIZER, + get_default_model=Doc2TextTable.get_default_model, + pdf_extractor_methods={'default': default_binary_to_image_config}, # TODO EXtractor + pdf_col_name_substitutor=substitute_recognized_text_cols, # TODO substitor + output_level=L.DOCUMENT, + node=OCR_FEATURE_NODES.nodes[O_A.DOC2TEXT_TABLE], + description='Extract Tables from DOC and DOCX files', + provider=ComponentBackends.ocr, + license=Licenses.ocr, + computation_context=ComputeContexts.spark, + output_context=ComputeContexts.spark, + jsl_anno_class_id=O_A.DOC2TEXT_TABLE, + jsl_anno_py_class=ACR.JSL_anno_OCR_ref_2_py_class[O_A.DOC2TEXT_TABLE], + applicable_file_types=['DOCX', 'DOC'] + ), } diff --git a/nlu/universe/feature_resolutions.py b/nlu/universe/feature_resolutions.py index edf8133f..64f991a1 100644 --- a/nlu/universe/feature_resolutions.py +++ b/nlu/universe/feature_resolutions.py @@ -25,41 +25,41 @@ class FeatureResolutions: default_OS_resolutions = { NLP_FEATURES.DOCUMENT: ResolvedFeature('document_assembler', 'document_assembler', 'xx', False, - ComponentUniverse.os_components[NLP_NODE_IDS.DOCUMENT_ASSEMBLER]), + ComponentUniverse.components[NLP_NODE_IDS.DOCUMENT_ASSEMBLER]), NLP_FEATURES.TOKEN: ResolvedFeature('en.tokenize', 'spark_nlp_tokenizer', 'en', False, - ComponentUniverse.os_components[NLP_NODE_IDS.TOKENIZER]), + ComponentUniverse.components[NLP_NODE_IDS.TOKENIZER]), NLP_FEATURES.SENTENCE: ResolvedFeature('detect_sentence', 'sentence_detector_dl', 'en', False, -
ComponentUniverse.os_components[NLP_NODE_IDS.SENTENCE_DETECTOR_DL]), + ComponentUniverse.components[NLP_NODE_IDS.SENTENCE_DETECTOR_DL]), NLP_FEATURES.SENTENCE_EMBEDDINGS: ResolvedFeature('en.embed_sentence.small_bert_L2_128', 'sent_small_bert_L2_128', 'en', True, - ComponentUniverse.os_components[ + ComponentUniverse.components[ NLP_NODE_IDS.BERT_SENTENCE_EMBEDDINGS]), NLP_FEATURES.WORD_EMBEDDINGS: ResolvedFeature('en.embed.bert.small_L2_128', 'small_bert_L2_128', 'en', True, - ComponentUniverse.os_components[NLP_NODE_IDS.BERT_EMBEDDINGS]), + ComponentUniverse.components[NLP_NODE_IDS.BERT_EMBEDDINGS]), NLP_FEATURES.POS: ResolvedFeature('en.pos', 'pos_anc', 'en', True, - ComponentUniverse.os_components[NLP_NODE_IDS.POS]), + ComponentUniverse.components[NLP_NODE_IDS.POS]), NLP_FEATURES.NAMED_ENTITY_IOB: ResolvedFeature('en.ner.onto.bert.cased_base', 'onto_bert_base_cased', 'en', True, - ComponentUniverse.os_components[NLP_NODE_IDS.NER_DL]), + ComponentUniverse.components[NLP_NODE_IDS.NER_DL]), NLP_FEATURES.NAMED_ENTITY_CONVERTED: ResolvedFeature('ner_converter', 'ner_converter', 'xx', False, - ComponentUniverse.os_components[NLP_NODE_IDS.NER_CONVERTER]), + ComponentUniverse.components[NLP_NODE_IDS.NER_CONVERTER]), NLP_FEATURES.UNLABLED_DEPENDENCY: ResolvedFeature('en.dep.untyped', 'dependency_conllu', 'en', True, - ComponentUniverse.os_components[ + ComponentUniverse.components[ NLP_NODE_IDS.UNTYPED_DEPENDENCY_PARSER]), NLP_FEATURES.LABELED_DEPENDENCY: ResolvedFeature('en.dep.typed', 'dependency_typed_conllu', 'en', True, - ComponentUniverse.os_components[ + ComponentUniverse.components[ NLP_NODE_IDS.TYPED_DEPENDENCY_PARSER]), NLP_FEATURES.CHUNK: ResolvedFeature('en.chunk', 'default_chunker', 'xx', False, - ComponentUniverse.os_components[NLP_NODE_IDS.CHUNKER]), + ComponentUniverse.components[NLP_NODE_IDS.CHUNKER]), NLP_FEATURES.DOCUMENT_FROM_CHUNK: ResolvedFeature(NLP_NODE_IDS.CHUNK2DOC, NLP_NODE_IDS.CHUNK2DOC, 'xx', False, - ComponentUniverse.os_components[NLP_NODE_IDS.CHUNK2DOC]), + ComponentUniverse.components[NLP_NODE_IDS.CHUNK2DOC]), NLP_FEATURES.CHUNK_EMBEDDINGS: ResolvedFeature('en.embed_chunk', 'chunk_embeddings', 'xx', False, - ComponentUniverse.os_components[ + ComponentUniverse.components[ NLP_NODE_IDS.CHUNK_EMBEDDINGS_CONVERTER]), } @@ -67,11 +67,11 @@ class FeatureResolutions: # TODO we need ideal resolution for each lang and domain...! 
NLP_FEATURES.NAMED_ENTITY_IOB: ResolvedFeature('en.med_ner.jsl', 'ner_jsl', 'en', True, - ComponentUniverse.hc_components[NLP_HC_NODE_IDS.MEDICAL_NER]), + ComponentUniverse.components[NLP_HC_NODE_IDS.MEDICAL_NER]), NLP_FEATURES.NAMED_ENTITY_CONVERTED: ResolvedFeature(NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL, NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL, 'xx', False, - ComponentUniverse.hc_components[ + ComponentUniverse.components[ NLP_HC_NODE_IDS.NER_CONVERTER_INTERNAL]), } @@ -80,10 +80,10 @@ class FeatureResolutions: NLP_FEATURES.NAMED_ENTITY_CONVERTED: ResolvedFeature(NLP_NODE_IDS.DOC2CHUNK, NLP_NODE_IDS.DOC2CHUNK, 'xx', False, - ComponentUniverse.os_components[NLP_NODE_IDS.DOC2CHUNK]), + ComponentUniverse.components[NLP_NODE_IDS.DOC2CHUNK]), } default_OCR_resolutions = { OCR_FEATURES.OCR_IMAGE: ResolvedFeature(OCR_NODE_IDS.BINARY2IMAGE, OCR_NODE_IDS.BINARY2IMAGE, 'xx', False, - ComponentUniverse.ocr_components[OCR_NODE_IDS.BINARY2IMAGE]), + ComponentUniverse.components[OCR_NODE_IDS.BINARY2IMAGE]), } diff --git a/nlu/universe/feature_universes.py b/nlu/universe/feature_universes.py index 8f92a41a..97e4eb69 100644 --- a/nlu/universe/feature_universes.py +++ b/nlu/universe/feature_universes.py @@ -6,7 +6,6 @@ ### ____ Annotator Feature Representations ____ - class NLP_FEATURES(JslFeature): """ NLP Features diff --git a/nlu/utils/environment/offline_load_utils.py b/nlu/utils/environment/offline_load_utils.py index f1368634..ba89f3e2 100644 --- a/nlu/utils/environment/offline_load_utils.py +++ b/nlu/utils/environment/offline_load_utils.py @@ -2,7 +2,7 @@ import os from nlu.universe.annotator_class_universe import AnnoClassRef -from nlu.universe.component_universes import ComponentUniverse +from nlu.universe.component_universes import ComponentUniverse, jsl_id_to_empty_component from nlu.universe.universes import Licenses @@ -48,7 +48,7 @@ def verify_and_create_model(model_path: str): # construct_component_from_identifier('xx', nlu_ref = class_name, nlp_ref = class_name, anno_class_name=class_name) if class_name in os_annos.keys(): jsl_anno_id = os_annos[class_name] - nlu_component = ComponentUniverse.os_components[jsl_anno_id] + nlu_component = jsl_id_to_empty_component(jsl_anno_id) return nlu_component.set_metadata(m, jsl_anno_id, jsl_anno_id, 'xx', @@ -56,7 +56,7 @@ def verify_and_create_model(model_path: str): elif class_name in hc_annos.keys(): jsl_anno_id = hc_annos[class_name] - nlu_component = ComponentUniverse.hc_components[jsl_anno_id] + nlu_component = jsl_id_to_empty_component(jsl_anno_id) return nlu_component.set_metadata(m, jsl_anno_id, jsl_anno_id, 'xx', From 545f95026628783afc88cf9601912cd64886666a Mon Sep 17 00:00:00 2001 From: C-K-Loan Date: Mon, 11 Apr 2022 08:35:34 +0200 Subject: [PATCH 3/4] refactored streamlit imports --- nlu/pipe/pipeline.py | 62 ++++++++++++++------------------------------ 1 file changed, 19 insertions(+), 43 deletions(-) diff --git a/nlu/pipe/pipeline.py b/nlu/pipe/pipeline.py index 05a28110..f4d2dafa 100644 --- a/nlu/pipe/pipeline.py +++ b/nlu/pipe/pipeline.py @@ -18,7 +18,7 @@ from nlu.pipe.utils.output_level_resolution_utils import OutputLevelUtils from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils from nlu.universe.universes import Licenses -from nlu.utils.environment.env_utils import is_running_in_databricks +from nlu.utils.environment.env_utils import is_running_in_databricks, try_import_streamlit logger = logging.getLogger('nlu') @@ -617,11 +617,8 @@ def viz_streamlit(self, ) -> None: """Display Viz in streamlit""" - # try: from 
nlu.component_list.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler - try: - from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler - except ImportError: - print("You need to install Streamlit to run this functionality.") + try_import_streamlit() + from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler StreamlitVizBlockHandler.viz_streamlit_dashboard(self, text, model_selection, @@ -663,10 +660,8 @@ def viz_streamlit_token( show_text_input: bool = True, ): - try: - from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler - except ImportError: - print("You need to install Streamlit to run this functionality.") + try_import_streamlit() + from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler StreamlitVizBlockHandler.visualize_tokens_information(self, text, title, sub_title, show_feature_select, features, metadata, output_level, positions, set_wide_layout_CSS, generate_code_sample, key, @@ -691,10 +686,8 @@ def viz_streamlit_classes( show_infos: bool = True, show_logo: bool = True, ) -> None: - try: - from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler - except ImportError: - print("You need to install Streamlit to run this functionality.") + try_import_streamlit() + from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler StreamlitVizBlockHandler.visualize_classes(self, text, output_level, title, sub_title, metadata, positions, set_wide_layout_CSS, generate_code_sample, key, show_model_selector, model_select_position, show_infos, show_logo) @@ -712,10 +705,8 @@ def viz_streamlit_dep_tree( show_logo: bool = True, show_text_input: bool = True, ) -> None: - try: - from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler - except ImportError: - print("You need to install Streamlit to run this functionality.") + try_import_streamlit() + from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler StreamlitVizBlockHandler.visualize_dep_tree(self, text, title, sub_title, set_wide_layout_CSS, generate_code_sample, key, show_infos, show_logo, show_text_input, ) @@ -740,10 +731,8 @@ def viz_streamlit_ner( show_text_input: bool = True, ): - try: - from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler - except ImportError: - print("You need to install Streamlit to run this functionality.") + try_import_streamlit() + from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler StreamlitVizBlockHandler.visualize_ner(self, text, ner_tags, show_label_select, show_table, title, sub_title, colors, show_color_selector, set_wide_layout_CSS, generate_code_sample, key, model_select_position, show_model_select, show_infos, show_logo, @@ -773,10 +762,8 @@ def viz_streamlit_word_similarity( show_logo: bool = True, ): - try: - from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler - except ImportError: - print("You need to install Streamlit to run this functionality.") + try_import_streamlit() + from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler StreamlitVizBlockHandler.display_word_similarity(self, texts, threshold, title, sub_tile, write_raw_pandas, display_embed_information, similarity_matrix, show_algo_select, dist_metrics, set_wide_layout_CSS, generate_code_sample, key, @@ -810,10 +797,8 @@ def 
viz_streamlit_word_embed_manifold(self, show_logo: bool = True, n_jobs: Optional[int] = 3, # False ): - try: - from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler - except ImportError: - print("You need to install Streamlit to run this functionality.") + try_import_streamlit() + from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler StreamlitVizBlockHandler.viz_streamlit_word_embed_manifold(self, default_texts, title, @@ -862,10 +847,8 @@ def viz_streamlit_sentence_embed_manifold(self, show_logo: bool = True, n_jobs: Optional[int] = 3, # False ): - try: - from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler - except ImportError: - print("You need to install Streamlit to run this functionality.") + try_import_streamlit() + from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler StreamlitVizBlockHandler.viz_streamlit_sentence_embed_manifold(self, default_texts, title, @@ -907,10 +890,8 @@ def viz_streamlit_entity_embed_manifold(self, show_logo: bool = True, n_jobs: Optional[int] = 3, # False ): - try: - from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler - except ImportError: - print("You need to install Streamlit to run this functionality.") + try_import_streamlit() + from nlu.pipe.viz.streamlit_viz.streamlit_dashboard_OS import StreamlitVizBlockHandler StreamlitVizBlockHandler.viz_streamlit_entity_embed_manifold(self, default_texts, title, @@ -925,8 +906,3 @@ def viz_streamlit_entity_embed_manifold(self, show_infos, show_logo, n_jobs) - - def check_pyspark_pyarrow_optimization_compatibility(self): - # Only works for pyspark "3.1.2" - v = pyspark.version.__version__.split('.') - if int(v[0]) == 3 and int(v[1]) >= 1: return True From 9155ae3233ee3068791e765e67ef9418117193c8 Mon Sep 17 00:00:00 2001 From: C-K-Loan Date: Sun, 17 Apr 2022 12:24:13 +0200 Subject: [PATCH 4/4] fixed bad th storage ref --- nlu/spellbook.py | 55 ++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/nlu/spellbook.py b/nlu/spellbook.py index 3ae6211f..2d1a16f0 100644 --- a/nlu/spellbook.py +++ b/nlu/spellbook.py @@ -1,10 +1,4 @@ -from typing import Dict - -import sparknlp - -from nlu.universe.annotator_class_universe import AnnoClassRef from nlu.universe.feature_node_ids import NLP_NODE_IDS, NLP_HC_NODE_IDS, OCR_NODE_IDS -from nlu.universe.atoms import JslAnnoId class Spellbook: @@ -65,7 +59,6 @@ class Spellbook: # 'train.labeled_dependency_parser': '', # 'train.vivekn_sentiment': '', - } # Reference to all datasets for which we have pretrained models @@ -230,10 +223,10 @@ class Spellbook: # # 2.7.0 new aliases 't5': ('t5_base', 'model'), - 'summarize': ('t5_base', 'model', {'setTask':'"summarize: "'}), - 'grammar_correctness': ('t5_base', 'model',{'setTask':'"cola sentence: "' }), - 'answer_question': ('t5_base', 'model', {'setTask':'"question: "'}), - 'classify.sentiment_t5': ('t5_base','model',{'setTask':'"sst2 sentence: "'}), + 'summarize': ('t5_base', 'model', {'setTask': '"summarize: "'}), + 'grammar_correctness': ('t5_base', 'model', {'setTask': '"cola sentence: "'}), + 'answer_question': ('t5_base', 'model', {'setTask': '"question: "'}), + 'classify.sentiment_t5': ('t5_base', 'model', {'setTask': '"sst2 sentence: "'}), } # multi lang models @@ -2028,7 +2021,6 @@ class Spellbook: 'en.t5.small': 't5_small', 'en.t5.base': 't5_base', - # 2.7,1 and 2.7.2 ATIS classifier and ALIASES 
"en.classify.questions.atis": "classifierdl_use_atis", "en.classify.questions.airline": "classifierdl_use_atis", @@ -2181,10 +2173,10 @@ class Spellbook: 'en.classify.typos.distilbert': 'distilbert_token_classifier_typo_detector', # NLP 3.4.2 - 'en.embed.deberta_v3_xsmall':'deberta_v3_xsmall', - 'en.embed.deberta_v3_small':'deberta_v3_small', - 'en.embed.deberta_v3_base':'deberta_v3_base', - 'en.embed.deberta_v3_large':'deberta_v3_large', + 'en.embed.deberta_v3_xsmall': 'deberta_v3_xsmall', + 'en.embed.deberta_v3_small': 'deberta_v3_small', + 'en.embed.deberta_v3_base': 'deberta_v3_base', + 'en.embed.deberta_v3_large': 'deberta_v3_large', }, @@ -2767,7 +2759,7 @@ class Spellbook: }, 'xx': { - 'xx.embed.mdeberta_v3_base':'mdeberta_v3_base', + 'xx.embed.mdeberta_v3_base': 'mdeberta_v3_base', 'xx.embed.albert.indic': 'albert_indic', 'xx.ner.masakhaner.distilbert': 'distilbert_base_token_classifier_masakhaner', # 3.4.0 @@ -2835,7 +2827,6 @@ class Spellbook: # 'xx.use.xling_en_es': 'tfhub_use_xling_en_es', # 'xx.use.xling_en_fr': 'tfhub_use_xling_en_fr', # 'xx.use.xling_many': 'tfhub_use_xling_many', - 'xx.use.multi' # 2.7.0 marian translate model references "xx.swc.marian.translate_to.en": "opus_mt_swc_en", @@ -4198,7 +4189,7 @@ class Spellbook: { # 3.4.2 - 'en.med_ner.clinical_trials' : 'bert_sequence_classifier_rct_biobert', + 'en.med_ner.clinical_trials': 'bert_sequence_classifier_rct_biobert', # 3.4.1 'en.med_ner.supplement_clinical': 'ner_supplement_clinical', @@ -4555,8 +4546,8 @@ class Spellbook: 'es': { # 3.4.2 - 'es.med_ner.deid.generic.roberta' : 'ner_deid_generic_roberta_augmented', - 'es.med_ner.deid.subentity.roberta' : 'ner_deid_subentity_roberta_augmented', + 'es.med_ner.deid.generic.roberta': 'ner_deid_generic_roberta_augmented', + 'es.med_ner.deid.subentity.roberta': 'ner_deid_subentity_roberta_augmented', # 3.4.1 'es.embed.sciwiki_300d': 'embeddings_sciwiki_300d', @@ -4589,7 +4580,6 @@ class Spellbook: 'pdf2text': OCR_NODE_IDS.PDF2TEXT, 'doc2text': OCR_NODE_IDS.DOC2TEXT, - 'pdf2table': OCR_NODE_IDS.PDF2TEXT_TABLE, 'doc2table': OCR_NODE_IDS.DOC2TEXT_TABLE, 'ppt2table': OCR_NODE_IDS.PPT2TEXT_TABLE, @@ -4643,7 +4633,8 @@ class Spellbook: 'glove_6B_300': 'xx.embed.glove.6B_300', }, 'th': { - 'tfhub_use_multi_lg': 'xx.use.multi' + 'tfhub_use_multi_lg': 'xx.use.multi', + 'labse': 'xx.embed_sentence.labse', }, 'ur': { @@ -4835,17 +4826,17 @@ class Spellbook: 'default_chunker': 'Chunker', # HC 3.4.2 - 'bert_sequence_classifier_rct_biobert' : 'MedicalBertForSequenceClassification', - 'ner_deid_generic_augmented' : 'MedicalNerModel', - 'ner_deid_generic_roberta_augmented' : 'MedicalNerModel', - 'ner_deid_subentity_roberta_augmented' : 'MedicalNerModel', + 'bert_sequence_classifier_rct_biobert': 'MedicalBertForSequenceClassification', + 'ner_deid_generic_augmented': 'MedicalNerModel', + 'ner_deid_generic_roberta_augmented': 'MedicalNerModel', + 'ner_deid_subentity_roberta_augmented': 'MedicalNerModel', # NLP 3.4.2 - 'deberta_v3_xsmall':'DeBertaEmbeddings', - 'deberta_v3_small':'DeBertaEmbeddings', - 'deberta_v3_base':'DeBertaEmbeddings', - 'deberta_v3_large':'DeBertaEmbeddings', - 'mdeberta_v3_base':'DeBertaEmbeddings', + 'deberta_v3_xsmall': 'DeBertaEmbeddings', + 'deberta_v3_small': 'DeBertaEmbeddings', + 'deberta_v3_base': 'DeBertaEmbeddings', + 'deberta_v3_large': 'DeBertaEmbeddings', + 'mdeberta_v3_base': 'DeBertaEmbeddings', # NLP HC 3.4.1 'embeddings_sciwiki_300d': 'WordEmbeddingsModel', 'ner_deid_generic': 'MedicalNerModel',