diff --git a/BM_25_pyterrier.py b/BM_25_pyterrier.py
index 242fcd2..d8e4e89 100644
--- a/BM_25_pyterrier.py
+++ b/BM_25_pyterrier.py
@@ -4,32 +4,34 @@
 # for the next parts of the project
 
 import pyterrier as pt
-import pandas as pd
 import gzip
 import shutil
 import os as os
+from pyterrier.measures import *
+import pandas as pd
 
 # Init pyterrier
 pt.init()
 
 # Get MS Marco passages used in TREC-2019
 dataset = pt.get_dataset("trec-deep-learning-passages")
-
+print(dataset)
 # Get corpus
 pathCorpus = dataset.get_corpus()
+print(pathCorpus)
 
 # Get the index stemmed (Porter stemmer)
 path = dataset.get_index("terrier_stemmed")
 index = pt.IndexFactory.of(path)
 
 # Get the queries
-queries = dataset.get_topics("test-2019")
+queries = dataset.get_topics("test-2020")
 print("query examples")
 print(queries)
 print()
 
 # Get the qrels
-qrels = dataset.get_qrels("test-2019")
+qrels = dataset.get_qrels("test-2020")
 print("qrel examples:")
 print(qrels)
 print()
@@ -38,23 +40,25 @@
 bm25 = pt.BatchRetrieve(index, wmodel="BM25")
 
 # Run BM-25 on the whole test dataset
-pt.Experiment(
+results = pt.Experiment(
     [bm25],
     queries,
     qrels,
-    eval_metrics=["map", "recip_rank", "ndcg"],
+    eval_metrics=["map", "recip_rank", "ndcg", "recall"],
     save_dir="./",
     save_mode="overwrite",
     dataframe=True,
 )
+print(results)
 
 # Run BM-25 on a subset of queries
-queries_uni = queries.loc[queries["qid"] == str(156493)]
+queries_uni = queries.loc[queries["qid"] == "1037496"]
+print(queries_uni)
 pt.Experiment(
     [bm25],
     queries_uni,
     qrels,
-    eval_metrics=["map", "recip_rank", "ndcg"],
+    eval_metrics=["map", "recip_rank", "ndcg", "recall"],
     perquery=True,
     dataframe=True,
 )
@@ -79,3 +83,5 @@
 with open("retrieved.txt", "wb") as f_out:
     shutil.copyfileobj(f_in, f_out)
 os.remove("BR(BM25).res.gz")
+
+print("The ranking is generated!")
diff --git a/evaluate.py b/evaluate.py
deleted file mode 100644
index 58cbbf1..0000000
--- a/evaluate.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import sys
-from trectools import TrecQrel, TrecRun, TrecEval
-
-arguments = sys.argv
-
-print("format: evaluate.py retrieved.txt qrels.txt")
-print("Note: all files should be on the same local path")
-
-retrievedPath = "retrieved.txt"
-qrelsPath = "qrels.txt"
-if len(arguments) == 1:
-    print(
-        "Assuming the files are retrieved.txt and qrels.txt if no parameters are used"
-    )
-elif len(arguments) == 3:
-    retrievedPath = arguments[1]
-    qrelsPath = arguments[2]
-else:
-    print("Wrong number of parameters!")
-    exit()
-
-print()
-
-
-def evaluation():
-    r1 = TrecRun(retrievedPath)
-
-    qrelstrec = TrecQrel(qrelsPath)
-
-    te = TrecEval(r1, qrelstrec)
-
-    p100 = te.get_precision(depth=100)
-    rr = te.get_reciprocal_rank(depth=100)
-    ndcg = te.get_ndcg(depth=100)
-    map_ = te.get_map(depth=100)
-    print("relevant docs: ", te.get_relevant_documents())
-    print("retrieved docs: ", te.get_retrieved_documents())
-    print("retrieved relevant docs: ", te.get_relevant_retrieved_documents())
-    print()
-    print("P@100 : \t", p100)
-    print("MRR@100 : \t", rr)
-    print("NDCG@100 : \t", ndcg)
-    print("MAP@100 : \t", map_)
-    print()
-
-
-if __name__ == "__main__":
-    evaluation()
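Note: this patch removes the trectools-based evaluation script, but the same
metrics can still be reproduced for any saved TREC-format run. A minimal
sketch, assuming retrieved.txt (written by BM_25_pyterrier.py above) and a
local qrels.txt in standard TREC qrels format, reusing only the trectools
calls that appeared in the deleted script:

    from trectools import TrecQrel, TrecRun, TrecEval

    run = TrecRun("retrieved.txt")   # TREC-format run file saved above
    qrels = TrecQrel("qrels.txt")    # assumed local qrels file

    te = TrecEval(run, qrels)
    print("P@100:   ", te.get_precision(depth=100))
    print("MRR@100: ", te.get_reciprocal_rank(depth=100))
    print("NDCG@100:", te.get_ndcg(depth=100))
    print("MAP@100: ", te.get_map(depth=100))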
diff --git a/reranking.py b/reranking.py
new file mode 100644
index 0000000..09492ee
--- /dev/null
+++ b/reranking.py
@@ -0,0 +1,92 @@
+from sentence_transformers import CrossEncoder
+import pyterrier as pt
+import pandas as pd
+
+PATH_TO_TOP_1000 = "retrieved.txt"
+OUTPUT_PATH = "x.txt"
+
+# Init
+pd.set_option("display.max_rows", None)
+pd.set_option("display.max_colwidth", 150)
+pt.init()
+
+dataset = pt.get_dataset("trec-deep-learning-passages")
+
+# Get the previously retrieved top 1000 (by a baseline method)
+retrieved = pd.read_csv(PATH_TO_TOP_1000, sep=" ", header=None)
+retrieved.columns = ["qid", "Q0", "docID", "rank", "score", "system"]
+print(retrieved.dtypes)
+print(retrieved.head(n=5))
+print()
+
+# Get the queries
+queries = dataset.get_topics("test-2020")
+queries = queries.astype({"qid": "int64", "query": "string"})
+print(queries.dtypes)
+print("query examples")
+print(queries.head(n=5))
+print()
+
+
+# Get the text corpus
+pathCorpus = dataset.get_corpus()
+print(pathCorpus[0])
+print("Load CSV...")
+corpus = pd.read_csv(pathCorpus[0], sep="\t", header=None)
+corpus.columns = ["docno", "text"]
+corpus = corpus.astype({"text": "string"})
+print("corpus examples:")
+print(corpus.head(n=5))
+print()
+
+model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2", max_length=512)
+
+
+def getReranked(qid):
+    querytext = queries.loc[queries["qid"] == qid].iloc[0]["query"]
+    print("query text: ", querytext)
+    docIds = retrieved.loc[retrieved["qid"] == qid]["docID"]
+    print(docIds.head(n=5))
+    docs = corpus.loc[corpus["docno"].isin(docIds)]
+    print(docs.head(n=5))
+    print()
+
+    print("Predict...")
+    couples = [(querytext, docText) for docText in docs["text"]]
+    scores = model.predict(couples)
+    print(scores)
+
+    print("Sort...")
+    sorted_indices = [i[0] for i in sorted(enumerate(scores), key=lambda x: -x[1])]
+
+    top = docs.iloc[sorted_indices]
+    return top
+
+
+s = ""
+numberquery = 0
+for qid in retrieved["qid"].unique():
+    print(numberquery, "queries processed ...")
+    numberquery += 1
+    top = getReranked(qid)
+    i = 0
+    for index, row in top.iterrows():
+        s += (
+            str(qid)
+            + " "
+            + "Q0"
+            + " "
+            + str(row["docno"])
+            + " "
+            + str(i)
+            + " "
+            + str(1 / (i + 1))
+            + " "
+            + "BERT"
+            + "\n"
+        )
+        i += 1
+
+
+with open(OUTPUT_PATH, "w+") as file:
+    file.write(s)
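Note: reranking.py writes the reciprocal rank 1/(i+1) in the score column of
x.txt rather than the raw cross-encoder scores, so the model's score gaps and
ties are not preserved; emitting the sorted scores themselves would keep that
information, at the cost of also returning them from getReranked. As a
standalone sanity check of the scoring step, a minimal sketch that reuses only
the model name and predict() call from the script above; the query/passage
pairs are made-up toy data:

    from sentence_transformers import CrossEncoder

    model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-2-v2", max_length=512)

    # (query, passage) pairs; a higher score means the passage is judged more relevant
    pairs = [
        ("what is the capital of france", "Paris is the capital of France."),
        ("what is the capital of france", "Bananas are rich in potassium."),
    ]
    print(model.predict(pairs))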