From 4ca7c5f645d826ed9d9c60f81526ea49db209328 Mon Sep 17 00:00:00 2001
From: TURX
Date: Sat, 10 Dec 2022 22:09:17 -0600
Subject: [PATCH] shared sent segments, mnli on gpu, moverscore

minor: classical metrics, moverscore topk
fix: #6, #7, #11
rm: obsolete archived_experiments
tofix: moverscore truncation & corr, pipeline refactor to automodel
---
 README.md                           |  25 +++-
 archived_experiments/README.md      |   5 -
 archived_experiments/bao_tac2010.py | 144 --------------------
 archived_experiments/eval.py        | 204 ----------------------------
 bertscore_sentence/eval.py          |  36 +++--
 classic/eval.py                     |  49 +++++++
 dar_env.py                          |  13 +-
 mnli/eval.py                        |  19 +--
 requirements.txt                    |   4 +-
 topk/eval.py                        |  54 ++++----
 type_piece.py                       |  11 ++
 11 files changed, 138 insertions(+), 426 deletions(-)
 delete mode 100644 archived_experiments/README.md
 delete mode 100644 archived_experiments/bao_tac2010.py
 delete mode 100644 archived_experiments/eval.py
 create mode 100644 classic/eval.py
 create mode 100644 type_piece.py

diff --git a/README.md b/README.md
index da13a8f..5524408 100644
--- a/README.md
+++ b/README.md
@@ -14,17 +14,30 @@ Do not reinvent the wheel:
 
 # Approaches
 
-## Approach 0: just replacing human summaries with documents
+For all metrics, add the following to `env.py` of [EvalBase](https://github.com/SigmaWe/EvalBase), in addition to the changes specified in the "Usage" part of each approach below:
+
+```python
+import sys
+sys.path.append("/path/to/DocAsRef/")
+```
 
-Metrics: ROUGE, BERTScore, BLEURT
+## Approach 0: just replacing human summaries with documents
 
-Integrated into [EvalBase](https://github.com/SigmaWe/EvalBase)](https://github.com/SigmaWe/EvalBase)
+Metrics: BERTScore, ROUGE, BLEURT, MoverScore
 
-For non-integrated metrics, add the following, as well as changes specified in "Usage" part of other approaches, to `env.py` of EvalBase:
+Implemented in `/classic`
 
+Usage:
 ```python
-import sys
-sys.path.append("/path/to/DocAsRef/")
+import functools
+import classic.eval as classic
+metrics = {
+    "bertscore": classic.bertscore_compute,
+    "rouge": classic.rouge_compute,
+    "bleurt": classic.bleurt_compute,
+    "moverscore-1gram": functools.partial(classic.moverscore_compute, n_gram=1),
+    "moverscore-2gram": functools.partial(classic.moverscore_compute, n_gram=2),
+}
 ```
 
 ## Approach 1: sentence-level, better similarity metrics, and better weighting methods
diff --git a/archived_experiments/README.md b/archived_experiments/README.md
deleted file mode 100644
index 9497bc5..0000000
--- a/archived_experiments/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Archived Experiments
-
-For TAC2010 testing only
-
-Moved to https://github.com/SigmaWe/EvalBase
diff --git a/archived_experiments/bao_tac2010.py b/archived_experiments/bao_tac2010.py
deleted file mode 100644
index 44f0738..0000000
--- a/archived_experiments/bao_tac2010.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Running experiments in TAC 2010
-
-import pandas
-import pickle
-import json
-import sys
-sys.path.append("../SueNes/human/tac")
-import tac
-import os.path
-
-
-# 43 machine + 4 human summarizers per docsetID
-# Each docsetID has 10 articles
-
-def clean_text(s:str):
-    """Clean up the text in doc or summ in RealSumm dataset
-    including, removing HTML tags, unescap HTML control sequences
-    """
-    s = s.replace("", "")
-    s = s.replace("", "")
-    s = s.replace("\t", " ")
-    s = s.strip()
-    return s
-
-def merge_article_summary_score(articles, summaries, scores, debug=False ):
-    columns = [
-        "docsetID",
-        "System", # we need docsetID so later we can use groupby easily to average
across multiple documents corresponds to one summary - "ArticleText", "ReferenceSummary", "SystemSummary", - "Pyramid", "Linguistic", "Overall" # the human scores - ] - - counter = 0 - dataset_df = pandas.DataFrame(columns=columns) - for docID, summary_dict in summaries.items(): - for articleID in range(10): - ArticleText = " ".join(articles[docID][articleID]) - # sentences were cut into lists of strings in TAC - for System, summary_sentences in summary_dict.items(): - row = { - "docsetID": [docID], - "ArticleText": [ArticleText], - "System": [System], - "ReferenceSummary" : ["Place Holder"], # TODO where is TAC's reference summary? - "SystemSummary": [" ".join(summary_sentences)], - "Pyramid": [scores[docID][System][0] ], - "Linguistic": [scores[docID][System][1]], - "Overall": [scores[docID][System][2]] - } - - tmp_dataset_df = pandas.DataFrame.from_dict(row) - - dataset_df = pandas.concat([dataset_df, tmp_dataset_df], ignore_index=True) - counter += 1 - if debug and counter > 3: - break - - return dataset_df - -def load_tac(dataroot:str, debug=False): - """ - - We assume that you have fully recursively extracted the two files. - - [`GuidedSumm2010_eval.tgz`](https://tac.nist.gov/protected/past-aquaint-aquaint2/2010/GuidedSumm2010_eval.tgz) Downloadable from web, containing human evaluation results and system summaries. - - `TAC2010_Summarization_Documents.tgz` Emailed by NIST, containing the documents for which summaries are generated and rated. - Both files require you to apply to NIST for access. - - The _dataroot_ directory should have the following structure: - dataroot - ├── GuidedSumm2010_eval - │   ├── BE - │   │   ├── models - │   │   └── peers - │   ├── manual - │   │   ├── models - │   │   ├── peers - │   │   └── pyramids - │   └── ROUGE - │   ├── models - │   └── peers - └── TAC2010_Summarization_Documents - └── GuidedSumm10_test_docs_files - ├── D1001A - │   ├── D1001A-A - │   └── D1001A-B - ├── D1002A - │   ├── D1002A-A - │   └── D1002A-B - ├── D1003A - │   ├── D1003A-A - │   └── D1003A-B - ├── D1004A - │   ├── D1004A-A - │   └── D1004A-B - ... abridged ... 
- - """ - article_set_path = os.path.join(dataroot, "TAC2010_Summarization_Documents/GuidedSumm10_test_docs_files/") - summary_set_path = os.path.join(dataroot, "GuidedSumm2010_eval/ROUGE") - human_score_path = os.path.join(dataroot, "GuidedSumm2010_eval/manual") - - # rouge_score_path = os.path.join(dataroot, "GuidedSumm2010_eval/ROUGE/rouge_A.m.out") - - setIDs = ["A"] # we only use set A because set B is not applicable - sentence_delimiter = " " - summary_types = ["peers", "models"] - - articles = tac.get_articles(article_set_path, setIDs, sentence_delimiter) - # _,_,_ = get_statistics(articles) - - summaries = tac.get_summaries(summary_set_path, setIDs, sentence_delimiter, summary_types) - # sentence_delimiter, NOT IN USE - - scores = tac.get_scores(human_score_path, summary_types, setIDs) - - dataset_df = merge_article_summary_score(articles, summaries, scores, debug=debug) - - - return dataset_df - -if __name__ == "__main__": - - import pickle - debug= True - - if debug: - dataset_df = pickle.load(open('tac_df.pkl', 'rb')) - else: - dataset_df = load_tac("/media/forrest/12T_EasyStore1/data/NLP/resources/TAC_DUC/TAC2010", debug=debug) - pickle.dump(dataset_df, open("tac_df.pkl", 'wb')) - - import eval - - corr_df = eval.eval_summary_level(dataset_df, debug=debug, is_multi=True ) - with pandas.option_context('display.max_rows', None, - 'display.max_columns', None, - 'display.precision', 3, - ): - print(corr_df['average']) - - with open(f"result_tac2010.json", 'w') as f: - json_ugly = corr_df.to_json(orient="index") - json_parsed = json.loads(json_ugly) - f.write(json.dumps(json_parsed, indent=2)) diff --git a/archived_experiments/eval.py b/archived_experiments/eval.py deleted file mode 100644 index 96d4c03..0000000 --- a/archived_experiments/eval.py +++ /dev/null @@ -1,204 +0,0 @@ -import json - -import evaluate -import pandas - -import functools -import env -import numpy, scipy - -import sys -sys.path.append("../") -import bertscore_sentence.eval as bertscore_sentence - -import typing -import tqdm - - -# TODO -# ref_{free,based}_metrics is a dict {str:function} -# ref_based_metrics = { -# "bleurt": evaluate.load('bleurt', config_name='BLEURT-20', module_type='metric'), -# "rouge": functools.partial( evaluate.load("rouge"), use_aggregator=False) - -# } - -# ref_free_metrics = { -# "bertscore-sentence": bertscore_sentence -# } -# all metrics shall return a dict {metric_name: List[float]} - - -def model_eval( - sys_summaries: list, - ref_summaries: list, - docs: list, - models: typing.List[str], - approaches: typing.List[str]) -> pandas.DataFrame: - """Given a batch of samples, run various automated summary metrics to evaluate the quality of summaries. - """ - - # Create a placeholder multiindex Dataframe - # Each row corresponds to one (doc, sys) or (ref, sys) pair, i.e., one sample. - # columns are the metrics nested in 3 levels (approach, model, score_name). 
- index= pandas.MultiIndex.from_tuples([], names = ["approach", "model", "score_name"]) - batch_result_df = pandas.DataFrame((), columns =index) - - for model_name in models: - # print('Model: ' + model_name) - if model_name == 'bleurt': - model = evaluate.load('bleurt', config_name='BLEURT-20', module_type='metric') - elif model_name == 'bertscore-sentence': - model = bertscore_sentence - else: - model = evaluate.load(model_name) - - # calculate traditional (reference, system summary) pairs and new (document, system summary) pairs - for approach in approaches: - # print('Evaluating on ' + approach + ' approach') - cands = sys_summaries - refs = ref_summaries if approach == "trad" else docs - if model_name == 'bertscore': - model_result = model.compute(predictions=cands, references=refs, lang='en', use_fast_tokenizer=True) - elif model_name == 'rouge': - model_result = model.compute(predictions=cands, references=refs, use_aggregator=False) - elif model_name == 'bertscore-sentence': - model_result = model.compute(predictions=cands, references=refs) - else: - model_result = model.compute(predictions=cands, references=refs) - - # model_result is a dict, e.g., {'ROUGE-1': [0.1, 0.9, 0.8], 'ROUGE-2':[0.5, 0.7 0.8]} each item in a value-list corresponds to a (doc, sys summ) pair or a (ref summ, sys summ) pair. - for score_name, score_list in model_result.items(): - if score_name != "hashcode": - batch_result_df[approach, model_name, score_name] = score_list - - return batch_result_df - -def batched_corr(corr_df, human_scores, batch_result_df, corr_metrics, batchID): - """Compute the correlations between human scores and automated metric scores on batch of samples, each of which is a pair of (doc, sys summ) or (ref summ, sys summ) - - Iteratively add rows to corr_df. - """ - - corr_metric_mapping = {"pearsonr": scipy.stats.pearsonr, "spearmanr": scipy.stats.spearmanr} - for corr_metric in corr_metrics: - for aspect_name, human_score in human_scores.iteritems(): - for (approach, model, score_name) in batch_result_df.columns: - metric_score = batch_result_df[(approach, model, score_name)] - # FIXME: Why cannot I use the f-string below? - # cc = eval(f"scipy.stats.{corr_metric}")(human_score, metric_score)[0] - - cc = corr_metric_mapping[corr_metric](human_score, metric_score)[0] - - corr_df.loc[ - (corr_metric, aspect_name, approach, model, score_name), # row - batchID - ] = cc - return corr_df - -def pool_multidoc(batch_df: pandas.DataFrame, result_df: pandas.DataFrame): - """Pool muiltidocument evaluation results - """ - docsetID_and_System = batch_df[['docsetID', 'System']] - # print (docsetID_and_System.shape, result_df.shape) - - docsetID_and_System = docsetID_and_System.reset_index(drop=True) - # reset index from 0 because batch_df's index is a segment of a much longer index range - # if not reset index, cannot concat below without ignore_index due to misaligned indexes - # We do not use ignore_index below because it will otherwise reset multiindex column headers to 0 to N. 
- combined = pandas.concat([docsetID_and_System, result_df], axis=1) - - - combined_pooled = combined.groupby(['docsetID', 'System']).mean() - # combined_pooled = combined_pooled.drop(["index", 'docsetID', 'System'], axis=1) - - # Drop scores of the common summary - human_scores = batch_df.drop(['ArticleText', 'ReferenceSummary', - 'SystemSummary'], axis=1) - human_scores = batch_df.groupby(['docsetID', 'System']).mean() - - - # print (batch_df_new.shape, combined_pooled.shape) - - # The returned DataFrame does not have multi-indexed columns but has tuples as column names - return human_scores, combined_pooled - -# TODO: Default value shouldn't be tied to env -def eval_summary_level( - dataset_df:pandas.DataFrame, - exp_approaches: typing.List[str] = env.approaches, - exp_models: typing.List[str] = env.models, - corr_metrics: typing.List[str] = env.corr_metrics, - document_column: str = env.document_column, - docID_column: str = env.docID_column, # TODO: some in newsroom, realsumm, summeval have not supported this yet - system_summary_column: str = env.system_summary_column, - reference_summary_column: str = env.reference_summary_column, - human_metrics: typing.List[str] = env.human_metrics, - pre_calculated_metrics: typing.List[str] = [], # some datasets contain metric scores - debug = False, - is_multi = False, # multi-document summarization -): - """Get summary-level scores for system summaries using various scoring methods. - - Summary-level evaluation means that we compute corraltion for each document and then average across documents. For its definitions, see Eq. (1) of RealSumm paper EMNLP 2020 https://aclanthology.org/2020.emnlp-main.751.pdf - - """ - - # batching based on articles. Also saves memory. - # for articleID in df["ArticleID"].unique(): # summary-level, we so need to loop over articles - # print (articleID) - # batch = df [ df["ArticleID"] == articleID] - - index = pandas.MultiIndex.from_tuples( - [], - names = ["corr_metric", "aspect", "approach", "model", "score_name"]) - corr_df = pandas.DataFrame((), index= index) - # each COLUMN corresponds to one document/batchs - # An Index (per row) is nested in 5 levels: - # (corr_metric, aspect, approach, model, score_name) - # - # At the end, just average every row (axis=1) - # We could let the multilevel on columns, - # but the code will be slightly longer. 
- - for batchID, docID in enumerate(tqdm.tqdm ( dataset_df[docID_column].unique())): - - if debug: - if batchID > 2 : - break - - batch = dataset_df [ dataset_df[docID_column] == docID] - # without .to_numpy(), will run into issues starting from 2nd iteration - docs = batch[document_column].to_numpy() - sys_summs = batch[system_summary_column].to_numpy() - ref_summs = batch[reference_summary_column].to_numpy() - human_scores = batch[human_metrics] # a DF - - batch_result_df = model_eval(sys_summs, ref_summs, docs, exp_models, exp_approaches) - - if is_multi: # average the scores for multiple documents to the same reference - human_scores, batch_result_df = pool_multidoc(batch, batch_result_df) - - # batch_result_df[approach, model, score_name] ===> a list for each pair in the batch - - # Insert precalculated metrics - if isinstance(pre_calculated_metrics, list) and len(pre_calculated_metrics)> 0: - for score_name in pre_calculated_metrics: - batch_result_df["PreCalc","PreCalc",score_name] = batch[score_name].to_numpy() - - corr_df = batched_corr(corr_df, human_scores, batch_result_df, corr_metrics, batchID) - - final_corr_df = corr_df.mean(axis=1) - corr_df['average'] = final_corr_df # last column - - return corr_df - - -def eval_system_level(): - """Get system-level scores for system summaries using various scoring methods. - - System-level evaluation means that we compute corraltion for each system and then average across systems - - """ - - pass \ No newline at end of file diff --git a/bertscore_sentence/eval.py b/bertscore_sentence/eval.py index 1a3fef4..9119cf1 100644 --- a/bertscore_sentence/eval.py +++ b/bertscore_sentence/eval.py @@ -7,23 +7,21 @@ import numpy as np import torch from tqdm.auto import trange -from dar_env import nlp_spacy import functools import sentence_transformers +from type_piece import EvalPieces -def cos_sim_mat_f(cand, ref, embedder) -> np.ndarray: - def bert_encode(piece: str): - sentence_emb = list() - doc = nlp_spacy(piece) - doc_sents = [sent.text for sent in doc.sents] - for sentence in doc_sents: +def cos_sim_mat_f(cand_segments: typing.List[str], ref_segments: typing.List[str], embedder) -> np.ndarray: + def bert_encode(piece_segments: typing.List[str]): + sent_emb = list() + for sent in piece_segments: with torch.no_grad(): - sentence_emb.append(embedder.encode(sentence, convert_to_numpy=True)) - return sentence_emb, doc_sents + sent_emb.append(embedder.encode(sent, convert_to_numpy=True)) + return sent_emb - ref_sent_emb_list, ref_sents = bert_encode(ref) - cand_sent_emb_list, cand_sents = bert_encode(cand) + ref_sent_emb_list = bert_encode(ref_segments) + cand_sent_emb_list = bert_encode(cand_segments) ref_sent_emb = np.stack(ref_sent_emb_list, axis=0) cand_sent_emb = np.stack(cand_sent_emb_list, axis=0) numerators = np.inner(ref_sent_emb, cand_sent_emb) @@ -31,15 +29,15 @@ def bert_encode(piece: str): cand_sent_emb_norms = np.linalg.norm(cand_sent_emb, axis=1) denominators = np.outer(ref_sent_emb_norms, cand_sent_emb_norms) sim_mat = np.divide(numerators, denominators) - return sim_mat, cand_sents, ref_sents + return sim_mat -def score_np(predictions: typing.List[str], references: typing.List[str], sim_mat_f: typing.Callable) -> np.ndarray: - cands, refs = predictions, references # simple renaming. 
+def score_np(predictions: EvalPieces, references: EvalPieces, sim_mat_f: typing.Callable) -> np.ndarray: + cands, refs = predictions.segments_list, references.segments_list all_scores = np.empty((len(cands), 3)) for index in trange(len(cands), desc="bertscore-sentence {}".format(sim_mat_f.__name__), leave=False): # all pieces, len(cands) == len(refs) - sim_mat, cand_sents, ref_sents = sim_mat_f(cand=cands[index], ref=refs[index]) + sim_mat = sim_mat_f(cand_segments=cands[index], ref_segments=refs[index]) def sum_max(is_r: bool) -> float: if is_r: @@ -47,16 +45,16 @@ def sum_max(is_r: bool) -> float: else: return np.sum(np.max(sim_mat, axis=0)) # equals to np.sum(np.max(sim_mat.T, axis=1)) - R = (1 / len(ref_sents)) * sum_max(True) - P = (1 / len(cand_sents)) * sum_max(False) + R = (1 / len(refs[index])) * sum_max(True) + P = (1 / len(cands[index])) * sum_max(False) F = 2 * ((P * R) / (P + R)) all_scores[index, :] = np.array([P, R, F]) - del sim_mat + np.nan_to_num(all_scores, copy=False, nan=0, posinf=1, neginf=-1) return all_scores -def compute(predictions: typing.List[str], references: typing.List[str], sim_mat_f: typing.Optional[typing.Callable] = None, embedder: typing.Optional[sentence_transformers.SentenceTransformer] = None) -> typing.Dict: +def compute(predictions: EvalPieces, references: EvalPieces, sim_mat_f: typing.Optional[typing.Callable] = None, embedder: typing.Optional[sentence_transformers.SentenceTransformer] = None) -> typing.Dict: cands, refs = predictions, references # simple renaming if sim_mat_f is None: # cosine similarity by default sim_mat_f = functools.partial(cos_sim_mat_f, embedder=embedder) diff --git a/classic/eval.py b/classic/eval.py new file mode 100644 index 0000000..d9c7b8e --- /dev/null +++ b/classic/eval.py @@ -0,0 +1,49 @@ +import sys +from os import path +file_path = path.abspath(__file__) +sys.path.append(path.dirname(path.dirname(file_path))) + +from type_piece import EvalPieces +import typing +from dar_env import bertscore, rouge, bleurt, get_idf_dict, word_mover_score +import functools + + +bertscore_partial = functools.partial(bertscore.compute, lang='en', use_fast_tokenizer=True) +rouge_partial = functools.partial(rouge.compute, use_aggregator=False) +bleurt_partial = functools.partial(bleurt.compute) + + +def moverscore_partial(predictions: typing.List[str], references: typing.List[str], n_gram: int) -> typing.Dict: + # https://github.com/AIPHES/emnlp19-moverscore + idf_dict_hyp = get_idf_dict(predictions) # idf_dict_hyp = defaultdict(lambda: 1.) + idf_dict_ref = get_idf_dict(references) # idf_dict_ref = defaultdict(lambda: 1.) 
+ scores = word_mover_score(references, predictions, idf_dict_ref, idf_dict_hyp, \ + stop_words=[], n_gram=n_gram, remove_subwords=True) + return { "scores": scores } + + +def data_raw(predictions: EvalPieces, references: EvalPieces) -> typing.Tuple[typing.List[str], typing.List[str]]: + preds = predictions.raw_list + refs = references.raw_list + return preds, refs + + +def bertscore_compute(predictions: EvalPieces, references: EvalPieces) -> typing.Dict: + preds, refs = data_raw(predictions, references) + return bertscore_partial(predictions=preds, references=refs) + + +def rouge_compute(predictions: EvalPieces, references: EvalPieces) -> typing.Dict: + preds, refs = data_raw(predictions, references) + return rouge_partial(predictions=preds, references=refs) + + +def bleurt_compute(predictions: EvalPieces, references: EvalPieces) -> typing.Dict: + preds, refs = data_raw(predictions, references) + return bleurt_partial(predictions=preds, references=refs) + + +def moverscore_compute(predictions: EvalPieces, references: EvalPieces, n_gram: int) -> typing.Dict: + preds, refs = data_raw(predictions, references) + return moverscore_partial(predictions=preds, references=refs, n_gram=n_gram) diff --git a/dar_env.py b/dar_env.py index b6b66de..f2d2cd6 100644 --- a/dar_env.py +++ b/dar_env.py @@ -1,13 +1,16 @@ -import spacy +import os + +os.environ["MOVERSCORE_MODEL"] = "roberta-large" +os.environ["TOKENIZERS_PARALLELISM"] = "true" + from transformers import pipeline import sentence_transformers import evaluate +from moverscore_v2 import get_idf_dict, word_mover_score - -nlp_spacy = spacy.load("en_core_web_lg") -mnli_classifier_roberta = pipeline("text-classification", model="roberta-large-mnli", top_k=None) +mnli_classifier_roberta = pipeline("text-classification", model="roberta-large-mnli", top_k=None, device=0) mnli_classifier_roberta.__name__ = "roberta-large-mnli" -mnli_classifier_bart = pipeline("text-classification", model="facebook/bart-large-mnli", top_k=None) +mnli_classifier_bart = pipeline("text-classification", model="facebook/bart-large-mnli", top_k=None, device=0) mnli_classifier_bart.__name__ = "bart-large-mnli" sent_embedder_mpnet = sentence_transformers.SentenceTransformer("all-mpnet-base-v2") sent_embedder_mpnet.__name__ = "all-mpnet-base-v2" diff --git a/mnli/eval.py b/mnli/eval.py index a972ef5..07ffd0f 100644 --- a/mnli/eval.py +++ b/mnli/eval.py @@ -7,26 +7,19 @@ from bertscore_sentence import eval import numpy as np from mnli.sim import similarity -from dar_env import nlp_spacy import functools import transformers +from type_piece import EvalPieces -def mnli_sim_mat(cand: str, ref: str, classifier: transformers.Pipeline) -> np.ndarray: - def segmentation(piece: str): - doc = nlp_spacy(piece) - doc_sents = [sent.text for sent in doc.sents] - return doc_sents - - cand_sents = segmentation(cand) - ref_sents = segmentation(ref) - sent_pairs = [" ".join([x, y]) for x in ref_sents for y in cand_sents] - sim_mat = np.empty((len(ref_sents), len(cand_sents))) +def mnli_sim_mat(cand_segments: typing.List[str], ref_segments: typing.List[str], classifier: transformers.Pipeline) -> np.ndarray: + sent_pairs = [" ".join([x, y]) for x in ref_segments for y in cand_segments] + sim_mat = np.empty((len(ref_segments), len(cand_segments))) sim_mat.flat = similarity(sent_pairs, classifier) - return sim_mat, cand_sents, ref_sents + return sim_mat -def bertscore_sentence_compute(predictions: typing.List[str], references: typing.List[str], classifier: transformers.Pipeline) -> typing.Dict: +def 
bertscore_sentence_compute(predictions: EvalPieces, references: EvalPieces, classifier: transformers.Pipeline) -> typing.Dict: sim_mat_f = functools.partial(mnli_sim_mat, classifier=classifier) sim_mat_f.__name__ = " ".join(["mnli", classifier.__name__]) return eval.compute(predictions=predictions, references=references, sim_mat_f=sim_mat_f) diff --git a/requirements.txt b/requirements.txt index 484924d..23a4e40 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,5 +10,5 @@ pandas rouge_score bert_score git+https://github.com/google-research/bleurt.git -spacy -en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl +pyemd +git+https://github.com/AIPHES/emnlp19-moverscore.git diff --git a/topk/eval.py b/topk/eval.py index c713091..04987b5 100644 --- a/topk/eval.py +++ b/topk/eval.py @@ -4,42 +4,40 @@ sys.path.append(path.dirname(path.dirname(file_path))) import typing -from dar_env import nlp_spacy, bertscore, rouge, bleurt +import classic.eval as classic +from type_piece import EvalPieces -def extract_topk_doc(ref: str, topk: int) -> str: - doc = nlp_spacy(ref) - doc_sents = [sent.text for sent in doc.sents] - topk_sents = doc_sents[0:topk] +def extract_topk_doc(ref_segments: typing.List[str], topk: int) -> str: + topk_sents = ref_segments[0:topk] return " ".join(topk_sents) -def extract_topk(references: typing.List[str], topk: int) -> typing.List[str]: - return [extract_topk_doc(ref, topk) for ref in references] +def extract_topk(ref_segments_list: typing.List[typing.List[str]], topk: int) -> typing.List[str]: + return [extract_topk_doc(ref_segments, topk) for ref_segments in ref_segments_list] -def bertscore_compute(predictions: typing.List[str], references: typing.List[str], topk: int) -> typing.Dict: - refs = extract_topk(references, topk) - return bertscore.compute( - predictions=predictions, - references=refs, - lang='en', - use_fast_tokenizer=True - ) +def data_topk(predictions: EvalPieces, references: EvalPieces, topk: int) -> typing.Tuple[str, str]: + preds = predictions.raw_list + refs = extract_topk(references.segments_list, topk) + return preds, refs -def rouge_compute(predictions: typing.List[str], references: typing.List[str], topk: int) -> typing.Dict: - refs = extract_topk(references, topk) - return rouge.compute( - predictions=predictions, - references=refs, - use_aggregator=False - ) +def bertscore_compute(predictions: EvalPieces, references: EvalPieces, topk: int) -> typing.Dict: + preds, refs = data_topk(predictions, references, topk) + return classic.bertscore_partial(predictions=preds, references=refs) -def bleurt_compute(predictions: typing.List[str], references: typing.List[str], topk: int) -> typing.Dict: - refs = extract_topk(references, topk) - return bleurt.compute( - predictions=predictions, - references=refs - ) +def rouge_compute(predictions: EvalPieces, references: EvalPieces, topk: int) -> typing.Dict: + preds, refs = data_topk(predictions, references, topk) + return classic.rouge_partial(predictions=preds, references=refs) + + +def bleurt_compute(predictions: EvalPieces, references: EvalPieces, topk: int) -> typing.Dict: + preds, refs = data_topk(predictions, references, topk) + return classic.bleurt_partial(predictions=preds, references=refs) + + +def moverscore_compute(predictions: EvalPieces, references: EvalPieces, n_gram: int, topk: int) -> typing.Dict: + preds, refs = data_topk(predictions, references, topk) + return classic.moverscore_partial(predictions=preds, 
references=refs, n_gram=n_gram)
diff --git a/type_piece.py b/type_piece.py
new file mode 100644
index 0000000..66eabc9
--- /dev/null
+++ b/type_piece.py
@@ -0,0 +1,11 @@
+# For reference and type hints only; see EvalBase for the full implementation.
+
+import typing
+
+
+class EvalPieces:
+    raw_list: typing.List[str]
+    segments_list: typing.List[typing.List[str]]
+
+    def __init__(self):
+        raise Exception("read-only type stub; construct EvalPieces in EvalBase")
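
Below is a minimal wiring sketch of how the pieces introduced in this patch fit together. It assumes DocAsRef is on `sys.path` (as the README describes) and that EvalBase supplies the real `EvalPieces`; the `_Pieces` stand-in, the toy texts, and the metric choices are illustrative only and not part of the patch.

```python
# Illustrative sketch only. `_Pieces` is a hypothetical stand-in for
# EvalBase's EvalPieces: raw text plus pre-segmented sentences that all
# sentence-level metrics now share.
import sys
sys.path.append("/path/to/DocAsRef/")  # as described in the README

import classic.eval as classic
import bertscore_sentence.eval as bertscore_sentence
from dar_env import sent_embedder_mpnet  # note: dar_env loads the MNLI pipelines on GPU 0 per this patch


class _Pieces:
    def __init__(self, raw_list, segments_list):
        self.raw_list = raw_list            # one string per document or summary
        self.segments_list = segments_list  # one sentence list per document or summary


docs = _Pieces(
    raw_list=["The cat sat on the mat. It purred quietly."],
    segments_list=[["The cat sat on the mat.", "It purred quietly."]],
)
summaries = _Pieces(
    raw_list=["A cat rested on a mat."],
    segments_list=[["A cat rested on a mat."]],
)

# Approach 0: classic metrics on raw text, with the document as the reference
rouge_scores = classic.rouge_compute(predictions=summaries, references=docs)
mover_scores = classic.moverscore_compute(predictions=summaries, references=docs, n_gram=1)

# Approach 1: sentence-level BERTScore over the shared sentence segments
bss_scores = bertscore_sentence.compute(
    predictions=summaries, references=docs, embedder=sent_embedder_mpnet
)
```

In EvalBase, these `*_compute` functions would be registered in the `metrics` dict (wrapped with `functools.partial` where extra arguments such as `n_gram` or `topk` are needed), as shown in the README hunk above.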