From 4ca7c5f645d826ed9d9c60f81526ea49db209328 Mon Sep 17 00:00:00 2001
From: TURX
Date: Sat, 10 Dec 2022 22:09:17 -0600
Subject: [PATCH] shared sent segments, mnli on gpu, moverscore

minor: classical metrics, moverscore topk
fix: #6, #7, #11
rm: obsolete archived_experiments
tofix: moverscore truncation & corr, pipeline refactor to automodel
---
 README.md                           |  25 +++-
 archived_experiments/README.md      |   5 -
 archived_experiments/bao_tac2010.py | 144 --------------------
 archived_experiments/eval.py        | 204 ----------------------------
 bertscore_sentence/eval.py          |  36 +++--
 classic/eval.py                     |  49 +++++++
 dar_env.py                          |  13 +-
 mnli/eval.py                        |  19 +--
 requirements.txt                    |   4 +-
 topk/eval.py                        |  54 ++++----
 type_piece.py                       |  11 ++
 11 files changed, 138 insertions(+), 426 deletions(-)
 delete mode 100644 archived_experiments/README.md
 delete mode 100644 archived_experiments/bao_tac2010.py
 delete mode 100644 archived_experiments/eval.py
 create mode 100644 classic/eval.py
 create mode 100644 type_piece.py

diff --git a/README.md b/README.md
index da13a8f..5524408 100644
--- a/README.md
+++ b/README.md
@@ -14,17 +14,30 @@ Do not reinvent the wheel:
 
 # Approaches
 
-## Approach 0: just replacing human summaries with documents
+For all metrics, add the following to `env.py` of [EvalBase](https://github.com/SigmaWe/EvalBase), in addition to the changes specified in the "Usage" part of each approach below:
+
+```python
+import sys
+sys.path.append("/path/to/DocAsRef/")
+```
 
-Metrics: ROUGE, BERTScore, BLEURT
+## Approach 0: just replacing human summaries with documents
 
-Integrated into [EvalBase](https://github.com/SigmaWe/EvalBase)](https://github.com/SigmaWe/EvalBase)
+Metrics: BERTScore, ROUGE, BLEURT, MoverScore
 
-For non-integrated metrics, add the following, as well as changes specified in "Usage" part of other approaches, to `env.py` of EvalBase:
+Implemented in `/classic`
 
+Usage:
 ```python
-import sys
-sys.path.append("/path/to/DocAsRef/")
+import functools
+import classic.eval as classic
+metrics = {
+    "bertscore": classic.bertscore_compute,
+    "rouge": classic.rouge_compute,
+    "bleurt": classic.bleurt_compute,
+    "moverscore-1gram": functools.partial(classic.moverscore_compute, n_gram=1),
+    "moverscore-2gram": functools.partial(classic.moverscore_compute, n_gram=2),
+}
 ```
 
 ## Approach 1: sentence-level, better similarity metrics, and better weighting methods
diff --git a/archived_experiments/README.md b/archived_experiments/README.md
deleted file mode 100644
index 9497bc5..0000000
--- a/archived_experiments/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Archived Experiments
-
-For TAC2010 testing only
-
-Moved to https://github.com/SigmaWe/EvalBase
diff --git a/archived_experiments/bao_tac2010.py b/archived_experiments/bao_tac2010.py
deleted file mode 100644
index 44f0738..0000000
--- a/archived_experiments/bao_tac2010.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Running experiments in TAC 2010
-
-import pandas
-import pickle
-import json
-import sys
-sys.path.append("../SueNes/human/tac")
-import tac
-import os.path
-
-
-# 43 machine + 4 human summarizers per docsetID
-# Each docsetID has 10 articles
-
-def clean_text(s:str):
-    """Clean up the text in doc or summ in RealSumm dataset
-    including, removing HTML tags, unescap HTML control sequences
-    """
-    s = s.replace("", "")
-    s = s.replace("", "")
-    s = s.replace("\t", " ")
-    s = s.strip()
-    return s
-
-def merge_article_summary_score(articles, summaries, scores, debug=False ):
-    columns = [
-        "docsetID",
-        "System", # we need docsetID so later we can use groupby easily to average
across multiple documents corresponds to one summary - "ArticleText", "ReferenceSummary", "SystemSummary", - "Pyramid", "Linguistic", "Overall" # the human scores - ] - - counter = 0 - dataset_df = pandas.DataFrame(columns=columns) - for docID, summary_dict in summaries.items(): - for articleID in range(10): - ArticleText = " ".join(articles[docID][articleID]) - # sentences were cut into lists of strings in TAC - for System, summary_sentences in summary_dict.items(): - row = { - "docsetID": [docID], - "ArticleText": [ArticleText], - "System": [System], - "ReferenceSummary" : ["Place Holder"], # TODO where is TAC's reference summary? - "SystemSummary": [" ".join(summary_sentences)], - "Pyramid": [scores[docID][System][0] ], - "Linguistic": [scores[docID][System][1]], - "Overall": [scores[docID][System][2]] - } - - tmp_dataset_df = pandas.DataFrame.from_dict(row) - - dataset_df = pandas.concat([dataset_df, tmp_dataset_df], ignore_index=True) - counter += 1 - if debug and counter > 3: - break - - return dataset_df - -def load_tac(dataroot:str, debug=False): - """ - - We assume that you have fully recursively extracted the two files. - - [`GuidedSumm2010_eval.tgz`](https://tac.nist.gov/protected/past-aquaint-aquaint2/2010/GuidedSumm2010_eval.tgz) Downloadable from web, containing human evaluation results and system summaries. - - `TAC2010_Summarization_Documents.tgz` Emailed by NIST, containing the documents for which summaries are generated and rated. - Both files require you to apply to NIST for access. - - The _dataroot_ directory should have the following structure: - dataroot - ├── GuidedSumm2010_eval - │   ├── BE - │   │   ├── models - │   │   └── peers - │   ├── manual - │   │   ├── models - │   │   ├── peers - │   │   └── pyramids - │   └── ROUGE - │   ├── models - │   └── peers - └── TAC2010_Summarization_Documents - └── GuidedSumm10_test_docs_files - ├── D1001A - │   ├── D1001A-A - │   └── D1001A-B - ├── D1002A - │   ├── D1002A-A - │   └── D1002A-B - ├── D1003A - │   ├── D1003A-A - │   └── D1003A-B - ├── D1004A - │   ├── D1004A-A - │   └── D1004A-B - ... abridged ... 
- - """ - article_set_path = os.path.join(dataroot, "TAC2010_Summarization_Documents/GuidedSumm10_test_docs_files/") - summary_set_path = os.path.join(dataroot, "GuidedSumm2010_eval/ROUGE") - human_score_path = os.path.join(dataroot, "GuidedSumm2010_eval/manual") - - # rouge_score_path = os.path.join(dataroot, "GuidedSumm2010_eval/ROUGE/rouge_A.m.out") - - setIDs = ["A"] # we only use set A because set B is not applicable - sentence_delimiter = " " - summary_types = ["peers", "models"] - - articles = tac.get_articles(article_set_path, setIDs, sentence_delimiter) - # _,_,_ = get_statistics(articles) - - summaries = tac.get_summaries(summary_set_path, setIDs, sentence_delimiter, summary_types) - # sentence_delimiter, NOT IN USE - - scores = tac.get_scores(human_score_path, summary_types, setIDs) - - dataset_df = merge_article_summary_score(articles, summaries, scores, debug=debug) - - - return dataset_df - -if __name__ == "__main__": - - import pickle - debug= True - - if debug: - dataset_df = pickle.load(open('tac_df.pkl', 'rb')) - else: - dataset_df = load_tac("/media/forrest/12T_EasyStore1/data/NLP/resources/TAC_DUC/TAC2010", debug=debug) - pickle.dump(dataset_df, open("tac_df.pkl", 'wb')) - - import eval - - corr_df = eval.eval_summary_level(dataset_df, debug=debug, is_multi=True ) - with pandas.option_context('display.max_rows', None, - 'display.max_columns', None, - 'display.precision', 3, - ): - print(corr_df['average']) - - with open(f"result_tac2010.json", 'w') as f: - json_ugly = corr_df.to_json(orient="index") - json_parsed = json.loads(json_ugly) - f.write(json.dumps(json_parsed, indent=2)) diff --git a/archived_experiments/eval.py b/archived_experiments/eval.py deleted file mode 100644 index 96d4c03..0000000 --- a/archived_experiments/eval.py +++ /dev/null @@ -1,204 +0,0 @@ -import json - -import evaluate -import pandas - -import functools -import env -import numpy, scipy - -import sys -sys.path.append("../") -import bertscore_sentence.eval as bertscore_sentence - -import typing -import tqdm - - -# TODO -# ref_{free,based}_metrics is a dict {str:function} -# ref_based_metrics = { -# "bleurt": evaluate.load('bleurt', config_name='BLEURT-20', module_type='metric'), -# "rouge": functools.partial( evaluate.load("rouge"), use_aggregator=False) - -# } - -# ref_free_metrics = { -# "bertscore-sentence": bertscore_sentence -# } -# all metrics shall return a dict {metric_name: List[float]} - - -def model_eval( - sys_summaries: list, - ref_summaries: list, - docs: list, - models: typing.List[str], - approaches: typing.List[str]) -> pandas.DataFrame: - """Given a batch of samples, run various automated summary metrics to evaluate the quality of summaries. - """ - - # Create a placeholder multiindex Dataframe - # Each row corresponds to one (doc, sys) or (ref, sys) pair, i.e., one sample. - # columns are the metrics nested in 3 levels (approach, model, score_name). 
- index= pandas.MultiIndex.from_tuples([], names = ["approach", "model", "score_name"]) - batch_result_df = pandas.DataFrame((), columns =index) - - for model_name in models: - # print('Model: ' + model_name) - if model_name == 'bleurt': - model = evaluate.load('bleurt', config_name='BLEURT-20', module_type='metric') - elif model_name == 'bertscore-sentence': - model = bertscore_sentence - else: - model = evaluate.load(model_name) - - # calculate traditional (reference, system summary) pairs and new (document, system summary) pairs - for approach in approaches: - # print('Evaluating on ' + approach + ' approach') - cands = sys_summaries - refs = ref_summaries if approach == "trad" else docs - if model_name == 'bertscore': - model_result = model.compute(predictions=cands, references=refs, lang='en', use_fast_tokenizer=True) - elif model_name == 'rouge': - model_result = model.compute(predictions=cands, references=refs, use_aggregator=False) - elif model_name == 'bertscore-sentence': - model_result = model.compute(predictions=cands, references=refs) - else: - model_result = model.compute(predictions=cands, references=refs) - - # model_result is a dict, e.g., {'ROUGE-1': [0.1, 0.9, 0.8], 'ROUGE-2':[0.5, 0.7 0.8]} each item in a value-list corresponds to a (doc, sys summ) pair or a (ref summ, sys summ) pair. - for score_name, score_list in model_result.items(): - if score_name != "hashcode": - batch_result_df[approach, model_name, score_name] = score_list - - return batch_result_df - -def batched_corr(corr_df, human_scores, batch_result_df, corr_metrics, batchID): - """Compute the correlations between human scores and automated metric scores on batch of samples, each of which is a pair of (doc, sys summ) or (ref summ, sys summ) - - Iteratively add rows to corr_df. - """ - - corr_metric_mapping = {"pearsonr": scipy.stats.pearsonr, "spearmanr": scipy.stats.spearmanr} - for corr_metric in corr_metrics: - for aspect_name, human_score in human_scores.iteritems(): - for (approach, model, score_name) in batch_result_df.columns: - metric_score = batch_result_df[(approach, model, score_name)] - # FIXME: Why cannot I use the f-string below? - # cc = eval(f"scipy.stats.{corr_metric}")(human_score, metric_score)[0] - - cc = corr_metric_mapping[corr_metric](human_score, metric_score)[0] - - corr_df.loc[ - (corr_metric, aspect_name, approach, model, score_name), # row - batchID - ] = cc - return corr_df - -def pool_multidoc(batch_df: pandas.DataFrame, result_df: pandas.DataFrame): - """Pool muiltidocument evaluation results - """ - docsetID_and_System = batch_df[['docsetID', 'System']] - # print (docsetID_and_System.shape, result_df.shape) - - docsetID_and_System = docsetID_and_System.reset_index(drop=True) - # reset index from 0 because batch_df's index is a segment of a much longer index range - # if not reset index, cannot concat below without ignore_index due to misaligned indexes - # We do not use ignore_index below because it will otherwise reset multiindex column headers to 0 to N. 
- combined = pandas.concat([docsetID_and_System, result_df], axis=1) - - - combined_pooled = combined.groupby(['docsetID', 'System']).mean() - # combined_pooled = combined_pooled.drop(["index", 'docsetID', 'System'], axis=1) - - # Drop scores of the common summary - human_scores = batch_df.drop(['ArticleText', 'ReferenceSummary', - 'SystemSummary'], axis=1) - human_scores = batch_df.groupby(['docsetID', 'System']).mean() - - - # print (batch_df_new.shape, combined_pooled.shape) - - # The returned DataFrame does not have multi-indexed columns but has tuples as column names - return human_scores, combined_pooled - -# TODO: Default value shouldn't be tied to env -def eval_summary_level( - dataset_df:pandas.DataFrame, - exp_approaches: typing.List[str] = env.approaches, - exp_models: typing.List[str] = env.models, - corr_metrics: typing.List[str] = env.corr_metrics, - document_column: str = env.document_column, - docID_column: str = env.docID_column, # TODO: some in newsroom, realsumm, summeval have not supported this yet - system_summary_column: str = env.system_summary_column, - reference_summary_column: str = env.reference_summary_column, - human_metrics: typing.List[str] = env.human_metrics, - pre_calculated_metrics: typing.List[str] = [], # some datasets contain metric scores - debug = False, - is_multi = False, # multi-document summarization -): - """Get summary-level scores for system summaries using various scoring methods. - - Summary-level evaluation means that we compute corraltion for each document and then average across documents. For its definitions, see Eq. (1) of RealSumm paper EMNLP 2020 https://aclanthology.org/2020.emnlp-main.751.pdf - - """ - - # batching based on articles. Also saves memory. - # for articleID in df["ArticleID"].unique(): # summary-level, we so need to loop over articles - # print (articleID) - # batch = df [ df["ArticleID"] == articleID] - - index = pandas.MultiIndex.from_tuples( - [], - names = ["corr_metric", "aspect", "approach", "model", "score_name"]) - corr_df = pandas.DataFrame((), index= index) - # each COLUMN corresponds to one document/batchs - # An Index (per row) is nested in 5 levels: - # (corr_metric, aspect, approach, model, score_name) - # - # At the end, just average every row (axis=1) - # We could let the multilevel on columns, - # but the code will be slightly longer. 
- - for batchID, docID in enumerate(tqdm.tqdm ( dataset_df[docID_column].unique())): - - if debug: - if batchID > 2 : - break - - batch = dataset_df [ dataset_df[docID_column] == docID] - # without .to_numpy(), will run into issues starting from 2nd iteration - docs = batch[document_column].to_numpy() - sys_summs = batch[system_summary_column].to_numpy() - ref_summs = batch[reference_summary_column].to_numpy() - human_scores = batch[human_metrics] # a DF - - batch_result_df = model_eval(sys_summs, ref_summs, docs, exp_models, exp_approaches) - - if is_multi: # average the scores for multiple documents to the same reference - human_scores, batch_result_df = pool_multidoc(batch, batch_result_df) - - # batch_result_df[approach, model, score_name] ===> a list for each pair in the batch - - # Insert precalculated metrics - if isinstance(pre_calculated_metrics, list) and len(pre_calculated_metrics)> 0: - for score_name in pre_calculated_metrics: - batch_result_df["PreCalc","PreCalc",score_name] = batch[score_name].to_numpy() - - corr_df = batched_corr(corr_df, human_scores, batch_result_df, corr_metrics, batchID) - - final_corr_df = corr_df.mean(axis=1) - corr_df['average'] = final_corr_df # last column - - return corr_df - - -def eval_system_level(): - """Get system-level scores for system summaries using various scoring methods. - - System-level evaluation means that we compute corraltion for each system and then average across systems - - """ - - pass \ No newline at end of file diff --git a/bertscore_sentence/eval.py b/bertscore_sentence/eval.py index 1a3fef4..9119cf1 100644 --- a/bertscore_sentence/eval.py +++ b/bertscore_sentence/eval.py @@ -7,23 +7,21 @@ import numpy as np import torch from tqdm.auto import trange -from dar_env import nlp_spacy import functools import sentence_transformers +from type_piece import EvalPieces -def cos_sim_mat_f(cand, ref, embedder) -> np.ndarray: - def bert_encode(piece: str): - sentence_emb = list() - doc = nlp_spacy(piece) - doc_sents = [sent.text for sent in doc.sents] - for sentence in doc_sents: +def cos_sim_mat_f(cand_segments: typing.List[str], ref_segments: typing.List[str], embedder) -> np.ndarray: + def bert_encode(piece_segments: typing.List[str]): + sent_emb = list() + for sent in piece_segments: with torch.no_grad(): - sentence_emb.append(embedder.encode(sentence, convert_to_numpy=True)) - return sentence_emb, doc_sents + sent_emb.append(embedder.encode(sent, convert_to_numpy=True)) + return sent_emb - ref_sent_emb_list, ref_sents = bert_encode(ref) - cand_sent_emb_list, cand_sents = bert_encode(cand) + ref_sent_emb_list = bert_encode(ref_segments) + cand_sent_emb_list = bert_encode(cand_segments) ref_sent_emb = np.stack(ref_sent_emb_list, axis=0) cand_sent_emb = np.stack(cand_sent_emb_list, axis=0) numerators = np.inner(ref_sent_emb, cand_sent_emb) @@ -31,15 +29,15 @@ def bert_encode(piece: str): cand_sent_emb_norms = np.linalg.norm(cand_sent_emb, axis=1) denominators = np.outer(ref_sent_emb_norms, cand_sent_emb_norms) sim_mat = np.divide(numerators, denominators) - return sim_mat, cand_sents, ref_sents + return sim_mat -def score_np(predictions: typing.List[str], references: typing.List[str], sim_mat_f: typing.Callable) -> np.ndarray: - cands, refs = predictions, references # simple renaming. 
+def score_np(predictions: EvalPieces, references: EvalPieces, sim_mat_f: typing.Callable) -> np.ndarray: + cands, refs = predictions.segments_list, references.segments_list all_scores = np.empty((len(cands), 3)) for index in trange(len(cands), desc="bertscore-sentence {}".format(sim_mat_f.__name__), leave=False): # all pieces, len(cands) == len(refs) - sim_mat, cand_sents, ref_sents = sim_mat_f(cand=cands[index], ref=refs[index]) + sim_mat = sim_mat_f(cand_segments=cands[index], ref_segments=refs[index]) def sum_max(is_r: bool) -> float: if is_r: @@ -47,16 +45,16 @@ def sum_max(is_r: bool) -> float: else: return np.sum(np.max(sim_mat, axis=0)) # equals to np.sum(np.max(sim_mat.T, axis=1)) - R = (1 / len(ref_sents)) * sum_max(True) - P = (1 / len(cand_sents)) * sum_max(False) + R = (1 / len(refs[index])) * sum_max(True) + P = (1 / len(cands[index])) * sum_max(False) F = 2 * ((P * R) / (P + R)) all_scores[index, :] = np.array([P, R, F]) - del sim_mat + np.nan_to_num(all_scores, copy=False, nan=0, posinf=1, neginf=-1) return all_scores -def compute(predictions: typing.List[str], references: typing.List[str], sim_mat_f: typing.Optional[typing.Callable] = None, embedder: typing.Optional[sentence_transformers.SentenceTransformer] = None) -> typing.Dict: +def compute(predictions: EvalPieces, references: EvalPieces, sim_mat_f: typing.Optional[typing.Callable] = None, embedder: typing.Optional[sentence_transformers.SentenceTransformer] = None) -> typing.Dict: cands, refs = predictions, references # simple renaming if sim_mat_f is None: # cosine similarity by default sim_mat_f = functools.partial(cos_sim_mat_f, embedder=embedder) diff --git a/classic/eval.py b/classic/eval.py new file mode 100644 index 0000000..d9c7b8e --- /dev/null +++ b/classic/eval.py @@ -0,0 +1,49 @@ +import sys +from os import path +file_path = path.abspath(__file__) +sys.path.append(path.dirname(path.dirname(file_path))) + +from type_piece import EvalPieces +import typing +from dar_env import bertscore, rouge, bleurt, get_idf_dict, word_mover_score +import functools + + +bertscore_partial = functools.partial(bertscore.compute, lang='en', use_fast_tokenizer=True) +rouge_partial = functools.partial(rouge.compute, use_aggregator=False) +bleurt_partial = functools.partial(bleurt.compute) + + +def moverscore_partial(predictions: typing.List[str], references: typing.List[str], n_gram: int) -> typing.Dict: + # https://github.com/AIPHES/emnlp19-moverscore + idf_dict_hyp = get_idf_dict(predictions) # idf_dict_hyp = defaultdict(lambda: 1.) + idf_dict_ref = get_idf_dict(references) # idf_dict_ref = defaultdict(lambda: 1.) 
+ scores = word_mover_score(references, predictions, idf_dict_ref, idf_dict_hyp, \ + stop_words=[], n_gram=n_gram, remove_subwords=True) + return { "scores": scores } + + +def data_raw(predictions: EvalPieces, references: EvalPieces) -> typing.Tuple[typing.List[str], typing.List[str]]: + preds = predictions.raw_list + refs = references.raw_list + return preds, refs + + +def bertscore_compute(predictions: EvalPieces, references: EvalPieces) -> typing.Dict: + preds, refs = data_raw(predictions, references) + return bertscore_partial(predictions=preds, references=refs) + + +def rouge_compute(predictions: EvalPieces, references: EvalPieces) -> typing.Dict: + preds, refs = data_raw(predictions, references) + return rouge_partial(predictions=preds, references=refs) + + +def bleurt_compute(predictions: EvalPieces, references: EvalPieces) -> typing.Dict: + preds, refs = data_raw(predictions, references) + return bleurt_partial(predictions=preds, references=refs) + + +def moverscore_compute(predictions: EvalPieces, references: EvalPieces, n_gram: int) -> typing.Dict: + preds, refs = data_raw(predictions, references) + return moverscore_partial(predictions=preds, references=refs, n_gram=n_gram) diff --git a/dar_env.py b/dar_env.py index b6b66de..f2d2cd6 100644 --- a/dar_env.py +++ b/dar_env.py @@ -1,13 +1,16 @@ -import spacy +import os + +os.environ["MOVERSCORE_MODEL"] = "roberta-large" +os.environ["TOKENIZERS_PARALLELISM"] = "true" + from transformers import pipeline import sentence_transformers import evaluate +from moverscore_v2 import get_idf_dict, word_mover_score - -nlp_spacy = spacy.load("en_core_web_lg") -mnli_classifier_roberta = pipeline("text-classification", model="roberta-large-mnli", top_k=None) +mnli_classifier_roberta = pipeline("text-classification", model="roberta-large-mnli", top_k=None, device=0) mnli_classifier_roberta.__name__ = "roberta-large-mnli" -mnli_classifier_bart = pipeline("text-classification", model="facebook/bart-large-mnli", top_k=None) +mnli_classifier_bart = pipeline("text-classification", model="facebook/bart-large-mnli", top_k=None, device=0) mnli_classifier_bart.__name__ = "bart-large-mnli" sent_embedder_mpnet = sentence_transformers.SentenceTransformer("all-mpnet-base-v2") sent_embedder_mpnet.__name__ = "all-mpnet-base-v2" diff --git a/mnli/eval.py b/mnli/eval.py index a972ef5..07ffd0f 100644 --- a/mnli/eval.py +++ b/mnli/eval.py @@ -7,26 +7,19 @@ from bertscore_sentence import eval import numpy as np from mnli.sim import similarity -from dar_env import nlp_spacy import functools import transformers +from type_piece import EvalPieces -def mnli_sim_mat(cand: str, ref: str, classifier: transformers.Pipeline) -> np.ndarray: - def segmentation(piece: str): - doc = nlp_spacy(piece) - doc_sents = [sent.text for sent in doc.sents] - return doc_sents - - cand_sents = segmentation(cand) - ref_sents = segmentation(ref) - sent_pairs = [" ".join([x, y]) for x in ref_sents for y in cand_sents] - sim_mat = np.empty((len(ref_sents), len(cand_sents))) +def mnli_sim_mat(cand_segments: typing.List[str], ref_segments: typing.List[str], classifier: transformers.Pipeline) -> np.ndarray: + sent_pairs = [" ".join([x, y]) for x in ref_segments for y in cand_segments] + sim_mat = np.empty((len(ref_segments), len(cand_segments))) sim_mat.flat = similarity(sent_pairs, classifier) - return sim_mat, cand_sents, ref_sents + return sim_mat -def bertscore_sentence_compute(predictions: typing.List[str], references: typing.List[str], classifier: transformers.Pipeline) -> typing.Dict: +def 
bertscore_sentence_compute(predictions: EvalPieces, references: EvalPieces, classifier: transformers.Pipeline) -> typing.Dict: sim_mat_f = functools.partial(mnli_sim_mat, classifier=classifier) sim_mat_f.__name__ = " ".join(["mnli", classifier.__name__]) return eval.compute(predictions=predictions, references=references, sim_mat_f=sim_mat_f) diff --git a/requirements.txt b/requirements.txt index 484924d..23a4e40 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,5 +10,5 @@ pandas rouge_score bert_score git+https://github.com/google-research/bleurt.git -spacy -en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl +pyemd +git+https://github.com/AIPHES/emnlp19-moverscore.git diff --git a/topk/eval.py b/topk/eval.py index c713091..04987b5 100644 --- a/topk/eval.py +++ b/topk/eval.py @@ -4,42 +4,40 @@ sys.path.append(path.dirname(path.dirname(file_path))) import typing -from dar_env import nlp_spacy, bertscore, rouge, bleurt +import classic.eval as classic +from type_piece import EvalPieces -def extract_topk_doc(ref: str, topk: int) -> str: - doc = nlp_spacy(ref) - doc_sents = [sent.text for sent in doc.sents] - topk_sents = doc_sents[0:topk] +def extract_topk_doc(ref_segments: typing.List[str], topk: int) -> str: + topk_sents = ref_segments[0:topk] return " ".join(topk_sents) -def extract_topk(references: typing.List[str], topk: int) -> typing.List[str]: - return [extract_topk_doc(ref, topk) for ref in references] +def extract_topk(ref_segments_list: typing.List[typing.List[str]], topk: int) -> typing.List[str]: + return [extract_topk_doc(ref_segments, topk) for ref_segments in ref_segments_list] -def bertscore_compute(predictions: typing.List[str], references: typing.List[str], topk: int) -> typing.Dict: - refs = extract_topk(references, topk) - return bertscore.compute( - predictions=predictions, - references=refs, - lang='en', - use_fast_tokenizer=True - ) +def data_topk(predictions: EvalPieces, references: EvalPieces, topk: int) -> typing.Tuple[str, str]: + preds = predictions.raw_list + refs = extract_topk(references.segments_list, topk) + return preds, refs -def rouge_compute(predictions: typing.List[str], references: typing.List[str], topk: int) -> typing.Dict: - refs = extract_topk(references, topk) - return rouge.compute( - predictions=predictions, - references=refs, - use_aggregator=False - ) +def bertscore_compute(predictions: EvalPieces, references: EvalPieces, topk: int) -> typing.Dict: + preds, refs = data_topk(predictions, references, topk) + return classic.bertscore_partial(predictions=preds, references=refs) -def bleurt_compute(predictions: typing.List[str], references: typing.List[str], topk: int) -> typing.Dict: - refs = extract_topk(references, topk) - return bleurt.compute( - predictions=predictions, - references=refs - ) +def rouge_compute(predictions: EvalPieces, references: EvalPieces, topk: int) -> typing.Dict: + preds, refs = data_topk(predictions, references, topk) + return classic.rouge_partial(predictions=preds, references=refs) + + +def bleurt_compute(predictions: EvalPieces, references: EvalPieces, topk: int) -> typing.Dict: + preds, refs = data_topk(predictions, references, topk) + return classic.bleurt_partial(predictions=preds, references=refs) + + +def moverscore_compute(predictions: EvalPieces, references: EvalPieces, n_gram: int, topk: int) -> typing.Dict: + preds, refs = data_topk(predictions, references, topk) + return classic.moverscore_partial(predictions=preds, 
references=refs, n_gram=n_gram)
diff --git a/type_piece.py b/type_piece.py
new file mode 100644
index 0000000..66eabc9
--- /dev/null
+++ b/type_piece.py
@@ -0,0 +1,11 @@
+# For reference and type hints only; see EvalBase for the full implementation.
+
+import typing
+
+
+class EvalPieces:
+    raw_list: typing.List[str]
+    segments_list: typing.List[typing.List[str]]
+
+    def __init__(self):
+        raise Exception("read-only type stub; construct EvalPieces in EvalBase")
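
Below is a minimal wiring sketch of how the pieces introduced in this patch fit together. It assumes DocAsRef is on `sys.path` (as the README describes) and that EvalBase supplies the real `EvalPieces`; the `_Pieces` stand-in, the toy texts, and the metric choices are illustrative only and not part of the patch.

```python
# Illustrative sketch only. `_Pieces` is a hypothetical stand-in for
# EvalBase's EvalPieces: raw text plus pre-segmented sentences that all
# sentence-level metrics now share.
import sys
sys.path.append("/path/to/DocAsRef/")  # as described in the README

import classic.eval as classic
import bertscore_sentence.eval as bertscore_sentence
from dar_env import sent_embedder_mpnet  # note: dar_env loads the MNLI pipelines on GPU 0 per this patch


class _Pieces:
    def __init__(self, raw_list, segments_list):
        self.raw_list = raw_list            # one string per document or summary
        self.segments_list = segments_list  # one sentence list per document or summary


docs = _Pieces(
    raw_list=["The cat sat on the mat. It purred quietly."],
    segments_list=[["The cat sat on the mat.", "It purred quietly."]],
)
summaries = _Pieces(
    raw_list=["A cat rested on a mat."],
    segments_list=[["A cat rested on a mat."]],
)

# Approach 0: classic metrics on raw text, with the document as the reference
rouge_scores = classic.rouge_compute(predictions=summaries, references=docs)
mover_scores = classic.moverscore_compute(predictions=summaries, references=docs, n_gram=1)

# Approach 1: sentence-level BERTScore over the shared sentence segments
bss_scores = bertscore_sentence.compute(
    predictions=summaries, references=docs, embedder=sent_embedder_mpnet
)
```

In EvalBase, these `*_compute` functions would be registered in the `metrics` dict (wrapped with `functools.partial` where extra arguments such as `n_gram` or `topk` are needed), as shown in the README hunk above.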