diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..6778b04 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: 'github-actions' + directory: '/' + schedule: + interval: 'daily' diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..c6d849a --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,26 @@ +name: Release + +on: + push: + tags: + - "v[0-9].[0-9]+.[0-9]+*" + +jobs: + release-on-pypi: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Hatch + run: pip install hatch + + - name: Build + run: hatch build + + - name: Publish on PyPi + env: + HATCH_INDEX_USER: __token__ + HATCH_INDEX_AUTH: ${{ secrets.PYPI_API_TOKEN }} + run: hatch publish -y \ No newline at end of file diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..e497a6a --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,45 @@ +name: Test + +on: + push: + branches: + - main + pull_request: + +concurrency: + group: test-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + HF_API_TOKEN: ${{ secrets.HF_API_TOKEN }} + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-12] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + + steps: + - name: Support longpaths + if: matrix.os == 'windows-latest' + run: git config --system core.longpaths true + + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install 
--upgrade hatch + + - name: Lint + if: matrix.python-version == '3.9' && runner.os == 'Linux' + run: hatch run lint:all diff --git a/README.md b/README.md index 1b4d715..d67e74d 100644 --- a/README.md +++ b/README.md @@ -1 +1,64 @@ -# rrf \ No newline at end of file +## Performance Evaluation of Rankers and RRF Techniques for Retrieval Pipelines + +**Paper:** [Performance Evaluation of Rankers and RRF Techniques for Retrieval Pipelines](paper/rankers_rrf.pdf) + +In the intricate world of Long-form Question Answering (LFQA) and Retrieval Augmented Generation (RAG), making the most of the LLM’s context window is paramount. Any wasted space or repetitive content limits the depth and breadth of the answers we can extract and generate. It’s a delicate balancing act to lay out the content of the context window appropriately. + +With the addition of three rankers, viz., Diversity Ranker, Lost In The Middle Ranker, Similarity Rankers and RRF techniques, we aim to address these challenges and improve the answers generated by the LFQA/RAG pipelines. We have done a comparative study of adding different combinations of rankers in a Retrieval pipeline and evaluated the results on four metrics, viz., Normalized Discounted Cumulative Gain (NDCG), Mean Average Precision (MAP), Recall and Precision. + +In our study, we consider the following cases of retrieval: + +RAG Pipelines Taxonomy + +The following rankers were used: + +- **Diversity Ranker:** The Diversity Ranker enhances the diversity of the paragraphs selected for the context window. + +- **Lost In The Middle Ranker:** The Lost In The Middle Ranker optimizes the layout of the selected documents in the LLM’s context window. + +- **Transformers Similarity Ranker:** The Transformers Similarity Ranker ranks Documents based on how similar they are to the query. It uses a pre-trained cross-encoder model to embed both the query and the Documents. It then compares the embeddings to determine how similar they are. 
+
+**Dense Retrieval:**
+
+For Dense retrieval, `INSTRUCTOR-XL` and `all-mpnet-base-v2` models were employed.
+
+Dense Pipeline with Rankers
+
+**Hybrid Retrieval:**
+
+BM25 retrieval was used for Sparse retrieval in the Hybrid pipelines. The `bge-reranker-large` model was used in the Similarity Ranker, and `ms-marco-MiniLM-L-12-v2` for the Diversity Ranker.
+
+**Reciprocal Rank Fusion** (RRF) was used to combine the results for Hybrid retrieval.
+
+Hybrid Pipeline with Rankers
+
+## Usage
+
+To run the pipelines, you will need to clone this repository and install the required libraries.
+
+1. Install the `rrf` package:
+
+```bash
+git clone https://github.com/avnlp/rrf
+cd rrf
+pip install -e .
+```
+
+2. To add the data to an index in Pinecone using the INSTRUCTOR-XL embedding model:
+
+```bash
+cd src/rrf/indexing_pipelines/fiqa
+python pinecone_instructor_index.py
+```
+
+3. To run a specific pipeline you will have to go to that file path and then run the file.
+For example, running the pipeline that uses dense retrieval with a combination of Diversity Ranker, Lost In The Middle Ranker and Similarity Ranker:
+
+```bash
+cd src/rrf/pointwise/instructor_xl/fiqa/
+python dense_similarity_diversity_litm.py
+```
+
+## License
+
+The source files are distributed under the [MIT License](https://github.com/avnlp/rrf/blob/main/LICENSE). 
diff --git a/paper/rankers_rrf.pdf b/paper/rankers_rrf.pdf new file mode 100644 index 0000000..45b3739 Binary files /dev/null and b/paper/rankers_rrf.pdf differ diff --git a/plots/pipelines_taxonomy.png b/plots/pipelines_taxonomy.png new file mode 100644 index 0000000..7d18503 Binary files /dev/null and b/plots/pipelines_taxonomy.png differ diff --git a/plots/ranker_pipeline.jpg b/plots/ranker_pipeline.jpg new file mode 100644 index 0000000..2761665 Binary files /dev/null and b/plots/ranker_pipeline.jpg differ diff --git a/plots/rankers_dense_pipeline.png b/plots/rankers_dense_pipeline.png new file mode 100644 index 0000000..45a4ed3 Binary files /dev/null and b/plots/rankers_dense_pipeline.png differ diff --git a/plots/rankers_hybrid_pipeline.png b/plots/rankers_hybrid_pipeline.png new file mode 100644 index 0000000..9d65dff Binary files /dev/null and b/plots/rankers_hybrid_pipeline.png differ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..57cde92 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,181 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "rrf" +dynamic = ["version"] +description = 'Performance Evaluation of Rankers and RRF Techniques for Retrieval Pipelines' +readme = "README.md" +requires-python = ">=3.8" +license = "MIT" +keywords = ["RAG", "Retrieval", "Rankers", "LLMs", "RRF"] +authors = [ + { name = "Ashwin Mathur", email = "" }, + { name = "Varun Mathur", email = "" }, +] +maintainers = [ + { name = "Ashwin Mathur", email = "" }, + { name = "Varun Mathur", email = "" }, +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "License :: Freely Distributable", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + 
"Programming Language :: Python :: 3.10", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] + +dependencies = [ + "typing_extensions", + "haystack-ai", + "sentence-transformers", + "instructor-embedders-haystack", + "beir", + "pinecone-haystack", + "chroma-haystack", + "weaviate-haystack", + "llama-cpp-haystack", +] + + +[project.urls] +Documentation = "https://github.com/avnlp/rrf#readme" +Issues = "https://github.com/avnlp/rrf/issues" +Source = "https://github.com/avnlp/rrf" + +[tool.hatch.build.targets.wheel] +packages = ["src/rrf"] + +[tool.hatch.version] +path = "src/rrf/__about__.py" + +[tool.hatch.envs.default] +dependencies = ["coverage[toml]>=6.5", "coveralls", "pytest"] + +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +cov-report = ["- coverage combine", "coverage xml"] +cov = ["test-cov", "cov-report"] + +[[tool.hatch.envs.all.matrix]] +python = ["3.8", "3.9", "3.10", "3.11", "3.12"] + +[tool.hatch.envs.lint] +detached = true +dependencies = ["black>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243"] + +[tool.hatch.envs.lint.scripts] +typing = "mypy --install-types --non-interactive {args:src/rrf}" +style = ["ruff check {args:.}", "black --check --diff {args:.}"] +fmt = ["black {args:.}", "ruff check --fix {args:.}", "style"] +all = ["fmt", "typing"] + + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.black] +target-version = ["py37"] +line-length = 120 +skip-string-normalization = true + +[tool.ruff] +target-version = "py37" +line-length = 120 +lint.select = [ + "A", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "FBT", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +lint.ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, 
like `dict.get(... True)` + "FBT003", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + # Ignore print statements + "T201", + "E501", +] +lint.unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.lint.isort] +known-first-party = ["rrf"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.lint.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] + +[tool.coverage.run] +source_pkgs = ["rrf", "tests"] +branch = true +parallel = true +omit = ["src/rrf/__about__.py", "examples"] + +[tool.coverage.paths] +rrf = [ + "src/rrf", + "*/rrf/src/rrf", +] +tests = ["tests", "*rrf/tests"] + +[tool.coverage.report] +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = "-vv" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.mypy] +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = ["haystack.*", "pytest.*"] +ignore_missing_imports = true diff --git a/src/rrf/__about__.py b/src/rrf/__about__.py new file mode 100644 index 0000000..f102a9c --- /dev/null +++ b/src/rrf/__about__.py @@ -0,0 +1 @@ +__version__ = "0.0.1" diff --git a/src/rrf/__init__.py b/src/rrf/__init__.py new file mode 100644 index 0000000..5ce65f5 --- /dev/null +++ b/src/rrf/__init__.py @@ -0,0 +1,4 @@ +from rrf.beir_dataloader import BeirDataloader +from rrf.beir_evaluator import BeirEvaluator + +__all__ = ["BeirEvaluator", "BeirDataloader"] diff --git a/src/rrf/beir_dataloader.py b/src/rrf/beir_dataloader.py new file mode 100644 index 0000000..af6933a --- /dev/null +++ b/src/rrf/beir_dataloader.py @@ -0,0 +1,26 @@ +import os +from typing import Any, Dict, Optional, Tuple + +from beir import util +from beir.datasets.data_loader import GenericDataLoader + + +class 
BeirDataloader: + + def __init__(self, dataset: str): + self.dataset = dataset + + def download_and_unzip(self): + url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{self.dataset}.zip" + out_dir = os.path.join(os.getcwd(), "datasets") + self.data_path = util.download_and_unzip(url, out_dir) + print(f"Dataset downloaded here: {self.data_path}") + return self.data_path + + def load( + self, data_path: Optional[str] = None, split: str = "test" + ) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]: + if data_path: + self.data_path = data_path + corpus, queries, qrels = GenericDataLoader(self.data_path).load(split=split) + return corpus, queries, qrels diff --git a/src/rrf/beir_evaluator.py b/src/rrf/beir_evaluator.py new file mode 100644 index 0000000..91fc04a --- /dev/null +++ b/src/rrf/beir_evaluator.py @@ -0,0 +1,71 @@ +from typing import Dict, List, Tuple + +from pytrec_eval import RelevanceEvaluator + + +class BeirEvaluator: + + def __init__( + self, + qrels: Dict[str, Dict[str, int]], + results: Dict[str, Dict[str, float]], + k_values: List[int], + ignore_identical_ids: bool = True, # noqa: FBT001, FBT002 + ): + self.qrels = qrels + self.results = results + self.k_values = k_values + self.ignore_identical_ids = ignore_identical_ids + + def evaluate( + self, + ) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]: + if self.ignore_identical_ids: + print( + "For evaluation, we ignore identical query and document ids (default), please explicitly set " + "``ignore_identical_ids=False`` to ignore this." + ) + popped = [] + for qid, rels in self.results.items(): + for pid in list(rels): + if qid == pid: + self.results[qid].pop(pid) + popped.append(pid) + + ndcg = {} + _map = {} + recall = {} + precision = {} + + for k in self.k_values: + ndcg[f"NDCG@{k}"] = 0.0 + _map[f"MAP@{k}"] = 0.0 + recall[f"Recall@{k}"] = 0.0 + precision[f"P@{k}"] = 0.0 + + map_string = "map_cut." 
+ ",".join([str(k) for k in self.k_values]) + ndcg_string = "ndcg_cut." + ",".join([str(k) for k in self.k_values]) + recall_string = "recall." + ",".join([str(k) for k in self.k_values]) + precision_string = "P." + ",".join([str(k) for k in self.k_values]) + evaluator = RelevanceEvaluator(self.qrels, {map_string, ndcg_string, recall_string, precision_string}) + scores = evaluator.evaluate(self.results) + + for query_id in scores.keys(): + for k in self.k_values: + ndcg[f"NDCG@{k}"] += scores[query_id]["ndcg_cut_" + str(k)] + _map[f"MAP@{k}"] += scores[query_id]["map_cut_" + str(k)] + recall[f"Recall@{k}"] += scores[query_id]["recall_" + str(k)] + precision[f"P@{k}"] += scores[query_id]["P_" + str(k)] + + for k in self.k_values: + ndcg[f"NDCG@{k}"] = round(ndcg[f"NDCG@{k}"] / len(scores), 5) + _map[f"MAP@{k}"] = round(_map[f"MAP@{k}"] / len(scores), 5) + recall[f"Recall@{k}"] = round(recall[f"Recall@{k}"] / len(scores), 5) + precision[f"P@{k}"] = round(precision[f"P@{k}"] / len(scores), 5) + + for eval_metric in [ndcg, _map, recall, precision]: + print("\n") + for k in eval_metric.keys(): # type: ignore + print(f"{k}: {eval_metric[k]:.4f}") # type: ignore + + return ndcg, _map, recall, precision diff --git a/src/rrf/indexing_pipelines/fiqa/pinecone_instructor_index.py b/src/rrf/indexing_pipelines/fiqa/pinecone_instructor_index.py new file mode 100644 index 0000000..513be5a --- /dev/null +++ b/src/rrf/indexing_pipelines/fiqa/pinecone_instructor_index.py @@ -0,0 +1,40 @@ +from haystack import Document, Pipeline +from haystack.components.writers import DocumentWriter +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorDocumentEmbedder +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore + +from rrf import BeirDataloader + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents = 
[ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +doc_instruction = "Represent the financial document for retrieval:" + +embedder = InstructorDocumentEmbedder(model="hkunlp/instructor-xl", instruction=doc_instruction) +embedder.warm_up() + +indexing_pipeline = Pipeline() +indexing_pipeline.add_component("embedder", embedder) +indexing_pipeline.add_component("writer", DocumentWriter(document_store)) +indexing_pipeline.connect("embedder", "writer") + +indexing_pipeline.run({"embedder": {"documents": documents}}) diff --git a/src/rrf/indexing_pipelines/fiqa/pinecone_mpnet_index.py b/src/rrf/indexing_pipelines/fiqa/pinecone_mpnet_index.py new file mode 100644 index 0000000..088a4cd --- /dev/null +++ b/src/rrf/indexing_pipelines/fiqa/pinecone_mpnet_index.py @@ -0,0 +1,38 @@ +from haystack import Document, Pipeline +from haystack.components.embedders import SentenceTransformersDocumentEmbedder +from haystack.components.writers import DocumentWriter +from haystack.utils import Secret +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore + +from rrf import BeirDataloader + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +embedder = 
SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-mpnet-base-v2") +embedder.warm_up() + +indexing_pipeline = Pipeline() +indexing_pipeline.add_component("embedder", embedder) +indexing_pipeline.add_component("writer", DocumentWriter(document_store)) +indexing_pipeline.connect("embedder", "writer") + +indexing_pipeline.run({"embedder": {"documents": documents}}) diff --git a/src/rrf/pointwise/__init__.py b/src/rrf/pointwise/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/rrf/pointwise/instructor_xl/__init__.py b/src/rrf/pointwise/instructor_xl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/rrf/pointwise/instructor_xl/fiqa/__init__.py b/src/rrf/pointwise/instructor_xl/fiqa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense.py b/src/rrf/pointwise/instructor_xl/fiqa/dense.py new file mode 100644 index 0000000..8259b56 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense.py @@ -0,0 +1,59 @@ +from haystack import Document, Pipeline +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = 
PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.connect("text_embedder", "embedding_retriever") + + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run({"text_embedder": {"text": query}}) + output_docs = output["embedding_retriever"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_diversity.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_diversity.py new file mode 100644 index 0000000..794d8bb --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_diversity.py @@ -0,0 +1,65 @@ +from haystack import Document, Pipeline +from haystack.components.rankers import SentenceTransformersDiversityRanker +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), 
"title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "diversity_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run({"text_embedder": {"text": query}, "diversity_ranker": {"query": query}}) + output_docs = output["diversity_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_diversity_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_diversity_instructor_rag.py new file mode 100644 index 0000000..c5438ae --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_diversity_instructor_rag.py @@ -0,0 +1,92 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from 
haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.rankers import SentenceTransformersDiversityRanker +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder(model="hkunlp/instructor-xl", instruction=query_instruction) + +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + 
name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") + +dense_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +dense_pipeline.add_component( + instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +dense_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "diversity_ranker.documents") +dense_pipeline.connect("diversity_ranker.documents", "prompt_builder.documents") +dense_pipeline.connect("prompt_builder", "llm") +dense_pipeline.connect("llm.replies", "answer_builder.replies") +dense_pipeline.connect("diversity_ranker.documents", "answer_builder.documents") + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "diversity_ranker": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_diversity_litm.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_diversity_litm.py new file mode 100644 index 0000000..83e4cd4 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_diversity_litm.py @@ -0,0 +1,75 @@ +from haystack import Document, Pipeline +from haystack.components.rankers import ( + LostInTheMiddleRanker, + SentenceTransformersDiversityRanker, +) +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from 
haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") +dense_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "diversity_ranker.documents") +dense_pipeline.connect("diversity_ranker.documents", "litm_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "diversity_ranker": {"query": query}, + } + ) + 
output_docs = output["litm_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_diversity_litm_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_diversity_litm_instructor_rag.py new file mode 100644 index 0000000..c1e2e4f --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_diversity_litm_instructor_rag.py @@ -0,0 +1,97 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.rankers import LostInTheMiddleRanker, SentenceTransformersDiversityRanker +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, 
text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") +dense_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + +dense_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +dense_pipeline.add_component( + instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +dense_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "diversity_ranker.documents") +dense_pipeline.connect("diversity_ranker.documents", "litm_ranker.documents") +dense_pipeline.connect("litm_ranker.documents", "prompt_builder.documents") +dense_pipeline.connect("prompt_builder", "llm") +dense_pipeline.connect("llm.replies", "answer_builder.replies") +dense_pipeline.connect("litm_ranker.documents", "answer_builder.documents") + +answers: Dict[str, Any] = {} + +for query_id, query in 
tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "diversity_ranker": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_instructor_rag.py new file mode 100644 index 0000000..6101198 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_instructor_rag.py @@ -0,0 +1,89 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + + +dense_document_store = PineconeDocumentStore( + 
api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") + +dense_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +dense_pipeline.add_component( + instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +dense_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "prompt_builder.documents") +dense_pipeline.connect("prompt_builder", "llm") +dense_pipeline.connect("llm.replies", "answer_builder.replies") +dense_pipeline.connect("embedding_retriever.documents", "answer_builder.documents") + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_litm.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_litm.py new file mode 100644 index 0000000..bd188c2 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_litm.py @@ -0,0 +1,67 @@ +from haystack import 
Document, Pipeline +from haystack.components.rankers import ( + LostInTheMiddleRanker, +) +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +litm_ranker = LostInTheMiddleRanker(top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "litm_ranker.documents") + + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run({"text_embedder": {"text": query}}) + output_docs = output["litm_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + 
doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_litm_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_litm_instructor_rag.py new file mode 100644 index 0000000..968247d --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_litm_instructor_rag.py @@ -0,0 +1,90 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.rankers import LostInTheMiddleRanker +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + 
index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder(model="hkunlp/instructor-xl", instruction=query_instruction) + +litm_ranker = LostInTheMiddleRanker(top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + +dense_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +dense_pipeline.add_component( + instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +dense_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "litm_ranker.documents") +dense_pipeline.connect("litm_ranker.documents", "prompt_builder.documents") +dense_pipeline.connect("prompt_builder", "llm") +dense_pipeline.connect("llm.replies", "answer_builder.replies") +dense_pipeline.connect("litm_ranker.documents", "answer_builder.documents") + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity.py new file mode 100644 index 0000000..52a03f9 
--- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity.py @@ -0,0 +1,65 @@ +from haystack import Document, Pipeline +from haystack.components.rankers import TransformersSimilarityRanker +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "similarity_ranker.documents") + + +result_qrels_all = {} + +for query_id, query in 
tqdm(queries.items()): + output = dense_pipeline.run({"text_embedder": {"text": query}, "similarity_ranker": {"query": query}}) + output_docs = output["similarity_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_diversity.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_diversity.py new file mode 100644 index 0000000..0175a4b --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_diversity.py @@ -0,0 +1,74 @@ +from haystack import Document, Pipeline +from haystack.components.rankers import SentenceTransformersDiversityRanker, TransformersSimilarityRanker +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = 
InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +dense_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "similarity_ranker.documents") +dense_pipeline.connect("similarity_ranker.documents", "diversity_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "diversity_ranker": {"query": query}, + } + ) + output_docs = output["diversity_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_diversity_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_diversity_instructor_rag.py new file mode 100644 index 0000000..b77fea7 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_diversity_instructor_rag.py @@ -0,0 +1,95 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import 
PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.rankers import SentenceTransformersDiversityRanker, TransformersSimilarityRanker +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder(model="hkunlp/instructor-xl", instruction=query_instruction) + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + 
name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +dense_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") +dense_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +dense_pipeline.add_component( + instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +dense_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "similarity_ranker.documents") +dense_pipeline.connect("similarity_ranker.documents", "diversity_ranker.documents") +dense_pipeline.connect("diversity_ranker.documents", "prompt_builder.documents") +dense_pipeline.connect("prompt_builder", "llm") +dense_pipeline.connect("llm.replies", "answer_builder.replies") +dense_pipeline.connect("diversity_ranker.documents", "answer_builder.documents") + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "diversity_ranker": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_diversity_litm.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_diversity_litm.py new file mode 100644 index 0000000..d7849f3 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_diversity_litm.py @@ -0,0 +1,79 @@ +from haystack import Document, Pipeline +from 
haystack.components.rankers import ( + LostInTheMiddleRanker, + SentenceTransformersDiversityRanker, + TransformersSimilarityRanker, +) +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +dense_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") +dense_pipeline.add_component(instance=litm_ranker, 
name="litm_ranker") + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "similarity_ranker.documents") +dense_pipeline.connect("similarity_ranker.documents", "diversity_ranker.documents") +dense_pipeline.connect("diversity_ranker.documents", "litm_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "diversity_ranker": {"query": query}, + } + ) + output_docs = output["litm_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_diversity_litm_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_diversity_litm_instructor_rag.py new file mode 100644 index 0000000..fb627bd --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_diversity_litm_instructor_rag.py @@ -0,0 +1,106 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.rankers import ( + LostInTheMiddleRanker, + SentenceTransformersDiversityRanker, + TransformersSimilarityRanker, +) +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + 
+from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +dense_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") +dense_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + +dense_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +dense_pipeline.add_component( + 
instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +dense_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "similarity_ranker.documents") +dense_pipeline.connect("similarity_ranker.documents", "diversity_ranker.documents") +dense_pipeline.connect("diversity_ranker.documents", "litm_ranker.documents") +dense_pipeline.connect("litm_ranker.documents", "prompt_builder.documents") +dense_pipeline.connect("prompt_builder", "llm") +dense_pipeline.connect("llm.replies", "answer_builder.replies") +dense_pipeline.connect("litm_ranker.documents", "answer_builder.documents") + + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "diversity_ranker": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_instructor_rag.py new file mode 100644 index 0000000..6c322ca --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_instructor_rag.py @@ -0,0 +1,92 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.rankers import TransformersSimilarityRanker +from haystack.utils import Secret +from 
haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder(model="hkunlp/instructor-xl", instruction=query_instruction) + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") + +dense_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +dense_pipeline.add_component( + 
instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +dense_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "similarity_ranker.documents") +dense_pipeline.connect("similarity_ranker.documents", "prompt_builder.documents") +dense_pipeline.connect("prompt_builder", "llm") +dense_pipeline.connect("llm.replies", "answer_builder.replies") +dense_pipeline.connect("similarity_ranker.documents", "answer_builder.documents") + + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_litm.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_litm.py new file mode 100644 index 0000000..b14f3ac --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_litm.py @@ -0,0 +1,76 @@ +from haystack import Document, Pipeline +from haystack.components.rankers import ( + LostInTheMiddleRanker, + TransformersSimilarityRanker, +) +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, 
queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +dense_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "similarity_ranker.documents") +dense_pipeline.connect("similarity_ranker.documents", "litm_ranker.documents") + + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + } + ) + output_docs = output["litm_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git 
a/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_litm_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_litm_instructor_rag.py new file mode 100644 index 0000000..ed6e321 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/dense_similarity_litm_instructor_rag.py @@ -0,0 +1,95 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.rankers import LostInTheMiddleRanker, TransformersSimilarityRanker +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) 
+query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder(model="hkunlp/instructor-xl", instruction=query_instruction) + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +dense_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + +dense_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +dense_pipeline.add_component( + instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +dense_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "similarity_ranker.documents") +dense_pipeline.connect("similarity_ranker.documents", "litm_ranker.documents") +dense_pipeline.connect("litm_ranker.documents", "prompt_builder.documents") +dense_pipeline.connect("prompt_builder", "llm") +dense_pipeline.connect("llm.replies", "answer_builder.replies") +dense_pipeline.connect("litm_ranker.documents", "answer_builder.documents") + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git 
a/src/rrf/pointwise/instructor_xl/fiqa/sparse.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse.py new file mode 100644 index 0000000..ed5e9f3 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse.py @@ -0,0 +1,37 @@ +from haystack import Document, Pipeline +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +sparse_pipeline = Pipeline() +sparse_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output_docs = sparse_pipeline.run({"bm25_retriever": {"query": query}})["bm25_retriever"]["documents"] + doc_qrels = {doc.meta["corpus_id"]: doc.score for doc in output_docs} + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense.py new file mode 100644 index 0000000..8c04aa1 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense.py @@ -0,0 +1,75 @@ +from haystack import Document, Pipeline +from haystack.components.joiners import DocumentJoiner +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.document_stores.in_memory 
import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") + + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", 
"joiner") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run({"bm25_retriever": {"query": query}, "text_embedder": {"text": query}}) + output_docs = output["joiner"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_diversity.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_diversity.py new file mode 100644 index 0000000..98f5c76 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_diversity.py @@ -0,0 +1,86 @@ +from haystack import Document, Pipeline +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import SentenceTransformersDiversityRanker +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + + +dense_document_store = PineconeDocumentStore( + 
api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") +hybrid_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") + + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "diversity_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "diversity_ranker": {"query": query}, + } + ) + output_docs = output["diversity_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git 
a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_diversity_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_diversity_instructor_rag.py new file mode 100644 index 0000000..cf15ab7 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_diversity_instructor_rag.py @@ -0,0 +1,110 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import SentenceTransformersDiversityRanker +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +sparse_document_store = InMemoryDocumentStore() 
# Sparse side: index the BEIR corpus into the in-memory BM25 store.
sparse_document_store.write_documents(documents_corp)


# Dense side: embeddings are served from a pre-populated Pinecone index.
dense_document_store = PineconeDocumentStore(
    api_key=Secret.from_env_var("PINECONE_API_KEY"),
    environment="gcp-starter",
    index="fiqa",
    namespace="default",
    dimension=768,
)

# Retrieval components: BM25 + Instructor-XL dense retrieval, fused with
# reciprocal rank fusion, then diversity re-ranked before prompting.
sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10)
dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10)
query_instruction = "Represent the financial question for retrieving supporting documents:"
text_embedder = InstructorTextEmbedder(model="hkunlp/instructor-xl", instruction=query_instruction)
joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion")
diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10)

# Assemble the hybrid RAG pipeline: retrieve -> fuse -> diversify -> prompt -> generate.
hybrid_pipeline = Pipeline()
hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever")
hybrid_pipeline.add_component(instance=text_embedder, name="text_embedder")
hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever")
hybrid_pipeline.add_component(instance=joiner, name="joiner")
hybrid_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker")
hybrid_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder")
llm_generator = HuggingFaceLocalGenerator(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True},
)
hybrid_pipeline.add_component(instance=llm_generator, name="llm")
hybrid_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")


hybrid_pipeline.connect("bm25_retriever", "joiner")
hybrid_pipeline.connect("text_embedder", "embedding_retriever")
hybrid_pipeline.connect("embedding_retriever", "joiner")
hybrid_pipeline.connect("joiner.documents", "diversity_ranker.documents")
hybrid_pipeline.connect("diversity_ranker.documents", "prompt_builder.documents")
+hybrid_pipeline.connect("prompt_builder", "llm") +hybrid_pipeline.connect("llm.replies", "answer_builder.replies") +hybrid_pipeline.connect("diversity_ranker.documents", "answer_builder.documents") + + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "diversity_ranker": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_diversity_litm.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_diversity_litm.py new file mode 100644 index 0000000..1f43f8d --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_diversity_litm.py @@ -0,0 +1,90 @@ +from haystack import Document, Pipeline +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import ( + LostInTheMiddleRanker, + SentenceTransformersDiversityRanker, +) +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +sparse_document_store = 
InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") +hybrid_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") +hybrid_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "diversity_ranker.documents") +hybrid_pipeline.connect("diversity_ranker.documents", "litm_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "diversity_ranker": {"query": query}, + } + ) + output_docs = 
output["litm_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_diversity_litm_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_diversity_litm_instructor_rag.py new file mode 100644 index 0000000..4296fa7 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_diversity_litm_instructor_rag.py @@ -0,0 +1,109 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import LostInTheMiddleRanker, SentenceTransformersDiversityRanker +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) 
# Fetch the FiQA BEIR dataset and materialise corpus/queries/relevance judgements.
data_loader.download_and_unzip()
corpus, queries, qrels = data_loader.load()

# Wrap each corpus entry as a Haystack Document, keeping the BEIR corpus id
# in metadata so retrieved documents can be scored against qrels later.
documents_corp = []
for corpus_id, text_dict in corpus.items():
    doc = Document(
        content=text_dict["text"],
        meta={"corpus_id": str(corpus_id), "title": text_dict["title"]},
    )
    documents_corp.append(doc)

# Sparse side lives in memory; dense side is a pre-populated Pinecone index.
sparse_document_store = InMemoryDocumentStore()
sparse_document_store.write_documents(documents_corp)

dense_document_store = PineconeDocumentStore(
    api_key=Secret.from_env_var("PINECONE_API_KEY"),
    environment="gcp-starter",
    index="fiqa",
    namespace="default",
    dimension=768,
)

# Retrieval and ranking stages: BM25 + Instructor-XL embeddings fused with
# reciprocal rank fusion, diversity re-ranking, then Lost-In-The-Middle ordering.
sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10)
dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10)
query_instruction = "Represent the financial question for retrieving supporting documents:"
text_embedder = InstructorTextEmbedder(model="hkunlp/instructor-xl", instruction=query_instruction)
joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion")
diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10)
litm_ranker = LostInTheMiddleRanker(top_k=10)

# Assemble the hybrid pipeline; the generator consumes the ranked context.
hybrid_pipeline = Pipeline()
hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever")
hybrid_pipeline.add_component(instance=text_embedder, name="text_embedder")
hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever")
hybrid_pipeline.add_component(instance=joiner, name="joiner")
hybrid_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker")
hybrid_pipeline.add_component(instance=litm_ranker, name="litm_ranker")
hybrid_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder")
llm_generator = HuggingFaceLocalGenerator(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True},
)
hybrid_pipeline.add_component(instance=llm_generator, name="llm")
+hybrid_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "diversity_ranker.documents") +hybrid_pipeline.connect("diversity_ranker.documents", "litm_ranker.documents") +hybrid_pipeline.connect("litm_ranker.documents", "prompt_builder.documents") +hybrid_pipeline.connect("prompt_builder", "llm") +hybrid_pipeline.connect("llm.replies", "answer_builder.replies") +hybrid_pipeline.connect("litm_ranker.documents", "answer_builder.documents") + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "diversity_ranker": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_instructor_rag.py new file mode 100644 index 0000000..0c7d395 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_instructor_rag.py @@ -0,0 +1,104 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.joiners import DocumentJoiner +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders 
import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder(model="hkunlp/instructor-xl", instruction=query_instruction) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") 
+ +hybrid_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +hybrid_pipeline.add_component( + instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +hybrid_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "prompt_builder.documents") +hybrid_pipeline.connect("prompt_builder", "llm") +hybrid_pipeline.connect("llm.replies", "answer_builder.replies") +hybrid_pipeline.connect("joiner.documents", "answer_builder.documents") + + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_litm.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_litm.py new file mode 100644 index 0000000..0ae3ff8 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_litm.py @@ -0,0 +1,85 @@ +from haystack import Document, Pipeline +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import LostInTheMiddleRanker +from haystack.components.retrievers import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import 
PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + +litm_ranker = LostInTheMiddleRanker(top_k=10) + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") +hybrid_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "litm_ranker.documents") + +result_qrels_all = {} + 
+for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + } + ) + output_docs = output["litm_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_litm_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_litm_instructor_rag.py new file mode 100644 index 0000000..ddb46ee --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_litm_instructor_rag.py @@ -0,0 +1,109 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import LostInTheMiddleRanker +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + 
{% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder(model="hkunlp/instructor-xl", instruction=query_instruction) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + +litm_ranker = LostInTheMiddleRanker(top_k=10) + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") +hybrid_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + +hybrid_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +hybrid_pipeline.add_component( + instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +hybrid_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + + 
+hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "litm_ranker.documents") +hybrid_pipeline.connect("litm_ranker.documents", "prompt_builder.documents") +hybrid_pipeline.connect("prompt_builder", "llm") +hybrid_pipeline.connect("llm.replies", "answer_builder.replies") +hybrid_pipeline.connect("litm_ranker.documents", "answer_builder.documents") + + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity.py new file mode 100644 index 0000000..70c6233 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity.py @@ -0,0 +1,85 @@ +from haystack import Document, Pipeline +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import TransformersSimilarityRanker +from haystack.components.retrievers import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = 
data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") +hybrid_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") + + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "similarity_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + } + ) + output_docs = 
output["similarity_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_diversity.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_diversity.py new file mode 100644 index 0000000..0fd2621 --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_diversity.py @@ -0,0 +1,90 @@ +from haystack import Document, Pipeline +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import SentenceTransformersDiversityRanker, TransformersSimilarityRanker +from haystack.components.retrievers import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = 
PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") +hybrid_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +hybrid_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") + + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "similarity_ranker.documents") +hybrid_pipeline.connect("similarity_ranker.documents", "diversity_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "diversity_ranker": {"query": query}, + } + ) + output_docs = output["diversity_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels +evaluator = BeirEvaluator(qrels, 
result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_diversity_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_diversity_instructor_rag.py new file mode 100644 index 0000000..8f7f31e --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_diversity_instructor_rag.py @@ -0,0 +1,113 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import SentenceTransformersDiversityRanker, TransformersSimilarityRanker +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": 
text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder(model="hkunlp/instructor-xl", instruction=query_instruction) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") +hybrid_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +hybrid_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") + +hybrid_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +hybrid_pipeline.add_component( + instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +hybrid_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + +hybrid_pipeline.connect("bm25_retriever", 
"joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "similarity_ranker.documents") +hybrid_pipeline.connect("similarity_ranker.documents", "diversity_ranker.documents") +hybrid_pipeline.connect("diversity_ranker.documents", "prompt_builder.documents") +hybrid_pipeline.connect("prompt_builder", "llm") +hybrid_pipeline.connect("llm.replies", "answer_builder.replies") +hybrid_pipeline.connect("diversity_ranker.documents", "answer_builder.documents") + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "diversity_ranker": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_diversity_litm.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_diversity_litm.py new file mode 100644 index 0000000..c8af83f --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_diversity_litm.py @@ -0,0 +1,94 @@ +from haystack import Document, Pipeline +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import ( + LostInTheMiddleRanker, + SentenceTransformersDiversityRanker, + TransformersSimilarityRanker, +) +from haystack.components.retrievers import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from 
haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") +hybrid_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +hybrid_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") 
+hybrid_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "similarity_ranker.documents") +hybrid_pipeline.connect("similarity_ranker.documents", "diversity_ranker.documents") +hybrid_pipeline.connect("diversity_ranker.documents", "litm_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "diversity_ranker": {"query": query}, + } + ) + output_docs = output["litm_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_diversity_litm_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_diversity_litm_instructor_rag.py new file mode 100644 index 0000000..a3ed9dd --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_diversity_litm_instructor_rag.py @@ -0,0 +1,121 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import ( + LostInTheMiddleRanker, + SentenceTransformersDiversityRanker, + TransformersSimilarityRanker, +) +from haystack.components.retrievers.in_memory import 
InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +diversity_ranker = 
SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") +hybrid_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +hybrid_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") +hybrid_pipeline.add_component(instance=litm_ranker, name="litm_ranker") +hybrid_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +hybrid_pipeline.add_component( + instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +hybrid_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "similarity_ranker.documents") +hybrid_pipeline.connect("similarity_ranker.documents", "diversity_ranker.documents") +hybrid_pipeline.connect("diversity_ranker.documents", "litm_ranker.documents") +hybrid_pipeline.connect("litm_ranker.documents", "prompt_builder.documents") +hybrid_pipeline.connect("prompt_builder", "llm") +hybrid_pipeline.connect("llm.replies", "answer_builder.replies") +hybrid_pipeline.connect("litm_ranker.documents", "answer_builder.documents") + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + 
"text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "diversity_ranker": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_instructor_rag.py new file mode 100644 index 0000000..6edd01f --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_instructor_rag.py @@ -0,0 +1,108 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import TransformersSimilarityRanker +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, 
queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder(model="hkunlp/instructor-xl", instruction=query_instruction) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") +hybrid_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") + +hybrid_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +hybrid_pipeline.add_component( + instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +hybrid_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", 
"embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "similarity_ranker.documents") +hybrid_pipeline.connect("similarity_ranker.documents", "prompt_builder.documents") +hybrid_pipeline.connect("prompt_builder", "llm") +hybrid_pipeline.connect("llm.replies", "answer_builder.replies") +hybrid_pipeline.connect("similarity_ranker.documents", "answer_builder.documents") + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_litm.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_litm.py new file mode 100644 index 0000000..1bbeb8b --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_litm.py @@ -0,0 +1,94 @@ +from haystack import Document, Pipeline +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import ( + LostInTheMiddleRanker, + TransformersSimilarityRanker, +) +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, 
queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder( + model="hkunlp/instructor-xl", + instruction=query_instruction, +) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") +hybrid_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +hybrid_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "similarity_ranker.documents") +hybrid_pipeline.connect("similarity_ranker.documents", "litm_ranker.documents") + + +result_qrels_all = {} + +for query_id, 
query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + } + ) + output_docs = output["litm_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_litm_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_litm_instructor_rag.py new file mode 100644 index 0000000..b79345b --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_dense_similarity_litm_instructor_rag.py @@ -0,0 +1,118 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.joiners import DocumentJoiner +from haystack.components.rankers import LostInTheMiddleRanker, TransformersSimilarityRanker +from haystack.components.retrievers import InMemoryBM25Retriever +from haystack.document_stores.in_memory import InMemoryDocumentStore +from haystack.utils import Secret +from haystack_integrations.components.embedders.instructor_embedders import InstructorTextEmbedder +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + 
Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +query_instruction = "Represent the financial question for retrieving supporting documents:" +text_embedder = InstructorTextEmbedder(model="hkunlp/instructor-xl", instruction=query_instruction) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +joiner = DocumentJoiner(join_mode="reciprocal_rank_fusion") + +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +hybrid_pipeline = Pipeline() + +hybrid_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +hybrid_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +hybrid_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +hybrid_pipeline.add_component(instance=joiner, name="joiner") +hybrid_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +hybrid_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + +hybrid_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +hybrid_pipeline.add_component( + 
instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={ + "max_new_tokens": 1024, + "temperature": 0.5, + "do_sample": True, + }, + ), + name="llm", +) +hybrid_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + + +hybrid_pipeline.connect("bm25_retriever", "joiner") +hybrid_pipeline.connect("text_embedder", "embedding_retriever") +hybrid_pipeline.connect("embedding_retriever", "joiner") +hybrid_pipeline.connect("joiner.documents", "similarity_ranker.documents") +hybrid_pipeline.connect("similarity_ranker.documents", "litm_ranker.documents") +hybrid_pipeline.connect("litm_ranker.documents", "prompt_builder.documents") +hybrid_pipeline.connect("prompt_builder", "llm") +hybrid_pipeline.connect("llm.replies", "answer_builder.replies") +hybrid_pipeline.connect("litm_ranker.documents", "answer_builder.documents") + + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = hybrid_pipeline.run( + { + "bm25_retriever": {"query": query}, + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/instructor_xl/fiqa/sparse_instructor_rag.py b/src/rrf/pointwise/instructor_xl/fiqa/sparse_instructor_rag.py new file mode 100644 index 0000000..f27f9cc --- /dev/null +++ b/src/rrf/pointwise/instructor_xl/fiqa/sparse_instructor_rag.py @@ -0,0 +1,72 @@ +from typing import Any, Dict + +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.generators import HuggingFaceLocalGenerator +from haystack.components.retrievers.in_memory import InMemoryBM25Retriever +from haystack.document_stores.in_memory 
import InMemoryDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader + +prompt_template = """ + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + Answer the question based on the financial documents being provided.<|eot_id|><|start_header_id|>user<|end_header_id|> + Question: {{question}} + Documents: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + <|eot_id|> + """ + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +sparse_document_store = InMemoryDocumentStore() +sparse_document_store.write_documents(documents_corp) + +sparse_retriever = InMemoryBM25Retriever(document_store=sparse_document_store, top_k=10) + +sparse_pipeline = Pipeline() +sparse_pipeline.add_component(instance=sparse_retriever, name="bm25_retriever") +sparse_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") +sparse_pipeline.add_component( + instance=HuggingFaceLocalGenerator( + model="meta-llama/Meta-Llama-3-8B-Instruct", + generation_kwargs={"max_new_tokens": 1024, "temperature": 0.5, "do_sample": True}, + ), + name="llm", +) +sparse_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + + +sparse_pipeline.connect("bm25_retriever", "prompt_builder.documents") +sparse_pipeline.connect("prompt_builder", "llm") +sparse_pipeline.connect("llm.replies", "answer_builder.replies") +sparse_pipeline.connect("bm25_retriever", "answer_builder.documents") + + +answers: Dict[str, Any] = {} + +for query_id, query in tqdm(queries.items()): + output = sparse_pipeline.run( + { + "bm25_retriever": {"query": query}, + "prompt_builder": {"question": query}, + "answer_builder": {"query": query}, + } + ) + generated_answer = 
output["answer_builder"]["answers"][0] + answers[query_id] = generated_answer diff --git a/src/rrf/pointwise/mpnet/__init__.py b/src/rrf/pointwise/mpnet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/rrf/pointwise/mpnet/fiqa/__init__.py b/src/rrf/pointwise/mpnet/fiqa/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/rrf/pointwise/mpnet/fiqa/dense.py b/src/rrf/pointwise/mpnet/fiqa/dense.py new file mode 100644 index 0000000..2a7d702 --- /dev/null +++ b/src/rrf/pointwise/mpnet/fiqa/dense.py @@ -0,0 +1,55 @@ +from haystack import Document, Pipeline +from haystack.components.embedders import SentenceTransformersTextEmbedder +from haystack.utils import Secret +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-mpnet-base-v2") + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.connect("text_embedder", "embedding_retriever") + + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): 
+ output = dense_pipeline.run({"text_embedder": {"text": query}}) + output_docs = output["embedding_retriever"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/mpnet/fiqa/dense_diversity.py b/src/rrf/pointwise/mpnet/fiqa/dense_diversity.py new file mode 100644 index 0000000..6514810 --- /dev/null +++ b/src/rrf/pointwise/mpnet/fiqa/dense_diversity.py @@ -0,0 +1,58 @@ +from haystack import Document, Pipeline +from haystack.components.embedders import SentenceTransformersTextEmbedder +from haystack.components.rankers import SentenceTransformersDiversityRanker +from haystack.utils import Secret +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +text_embedder = SentenceTransformersTextEmbedder( + model="sentence-transformers/all-mpnet-base-v2", +) +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) +dense_pipeline = Pipeline() 
+dense_pipeline.add_component(instance=text_embedder, name="text_embedder") +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "diversity_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run({"text_embedder": {"text": query}, "diversity_ranker": {"query": query}}) + output_docs = output["diversity_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/mpnet/fiqa/dense_diversity_litm.py b/src/rrf/pointwise/mpnet/fiqa/dense_diversity_litm.py new file mode 100644 index 0000000..13ddcc8 --- /dev/null +++ b/src/rrf/pointwise/mpnet/fiqa/dense_diversity_litm.py @@ -0,0 +1,75 @@ +from haystack import Document, Pipeline +from haystack.components.embedders import SentenceTransformersTextEmbedder +from haystack.components.rankers import ( + LostInTheMiddleRanker, + SentenceTransformersDiversityRanker, + TransformersSimilarityRanker, +) +from haystack.utils import Secret +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() 
+] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +text_embedder = SentenceTransformersTextEmbedder( + model="sentence-transformers/all-mpnet-base-v2", +) +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") +dense_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "diversity_ranker.documents") +dense_pipeline.connect("diversity_ranker.documents", "litm_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "diversity_ranker": {"query": query}, + } + ) + output_docs = output["litm_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/mpnet/fiqa/dense_litm.py b/src/rrf/pointwise/mpnet/fiqa/dense_litm.py new file mode 100644 index 0000000..21ba4fa --- /dev/null +++ b/src/rrf/pointwise/mpnet/fiqa/dense_litm.py @@ -0,0 +1,66 @@ +from haystack import Document, Pipeline +from 
haystack.components.embedders import SentenceTransformersTextEmbedder +from haystack.components.rankers import LostInTheMiddleRanker +from haystack.utils import Secret +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +text_embedder = SentenceTransformersTextEmbedder( + model="sentence-transformers/all-mpnet-base-v2", +) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "litm_ranker.documents") + + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + } + ) + output_docs = output["litm_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) 
+ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/mpnet/fiqa/dense_similarity.py b/src/rrf/pointwise/mpnet/fiqa/dense_similarity.py new file mode 100644 index 0000000..1305130 --- /dev/null +++ b/src/rrf/pointwise/mpnet/fiqa/dense_similarity.py @@ -0,0 +1,59 @@ +from haystack import Document, Pipeline +from haystack.components.embedders import SentenceTransformersTextEmbedder +from haystack.components.rankers import TransformersSimilarityRanker +from haystack.utils import Secret +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +text_embedder = SentenceTransformersTextEmbedder( + model="sentence-transformers/all-mpnet-base-v2", +) +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", 
"similarity_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run({"text_embedder": {"text": query}, "similarity_ranker": {"query": query}}) + output_docs = output["similarity_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/mpnet/fiqa/dense_similarity_diversity.py b/src/rrf/pointwise/mpnet/fiqa/dense_similarity_diversity.py new file mode 100644 index 0000000..260d45c --- /dev/null +++ b/src/rrf/pointwise/mpnet/fiqa/dense_similarity_diversity.py @@ -0,0 +1,72 @@ +from haystack import Document, Pipeline +from haystack.components.embedders import SentenceTransformersTextEmbedder +from haystack.components.rankers import ( + SentenceTransformersDiversityRanker, + TransformersSimilarityRanker, +) +from haystack.utils import Secret +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +text_embedder = SentenceTransformersTextEmbedder( + 
model="sentence-transformers/all-mpnet-base-v2", +) +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +dense_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "similarity_ranker.documents") +dense_pipeline.connect("similarity_ranker.documents", "diversity_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "diversity_ranker": {"query": query}, + } + ) + output_docs = output["diversity_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/mpnet/fiqa/dense_similarity_diversity_litm.py b/src/rrf/pointwise/mpnet/fiqa/dense_similarity_diversity_litm.py new file mode 100644 index 0000000..f700047 --- /dev/null +++ b/src/rrf/pointwise/mpnet/fiqa/dense_similarity_diversity_litm.py @@ -0,0 +1,76 @@ +from haystack import Document, Pipeline +from haystack.components.embedders import SentenceTransformersTextEmbedder +from haystack.components.rankers import ( + LostInTheMiddleRanker, + SentenceTransformersDiversityRanker, + TransformersSimilarityRanker, +) +from haystack.utils import Secret +from 
haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) + +text_embedder = SentenceTransformersTextEmbedder( + model="sentence-transformers/all-mpnet-base-v2", +) +similarity_ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +diversity_ranker = SentenceTransformersDiversityRanker(model="cross-encoder/ms-marco-MiniLM-L-12-v2", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +dense_pipeline.add_component(instance=diversity_ranker, name="diversity_ranker") +dense_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "similarity_ranker.documents") +dense_pipeline.connect("similarity_ranker.documents", "diversity_ranker.documents") +dense_pipeline.connect("diversity_ranker.documents", "litm_ranker.documents") + +result_qrels_all = {} + +for query_id, 
query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + "diversity_ranker": {"query": query}, + } + ) + output_docs = output["litm_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate() diff --git a/src/rrf/pointwise/mpnet/fiqa/dense_similarity_litm.py b/src/rrf/pointwise/mpnet/fiqa/dense_similarity_litm.py new file mode 100644 index 0000000..0c32168 --- /dev/null +++ b/src/rrf/pointwise/mpnet/fiqa/dense_similarity_litm.py @@ -0,0 +1,72 @@ +from haystack import Document, Pipeline +from haystack.components.embedders import SentenceTransformersTextEmbedder +from haystack.components.rankers import ( + LostInTheMiddleRanker, + TransformersSimilarityRanker, +) +from haystack.utils import Secret +from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever +from haystack_integrations.document_stores.pinecone import PineconeDocumentStore +from tqdm import tqdm + +from rrf import BeirDataloader, BeirEvaluator + +dataset = "fiqa" +data_loader = BeirDataloader(dataset) +data_loader.download_and_unzip() +corpus, queries, qrels = data_loader.load() + +documents_corp = [ + Document( + content=text_dict["text"], + meta={"corpus_id": str(corpus_id), "title": text_dict["title"]}, + ) + for corpus_id, text_dict in corpus.items() +] + +dense_document_store = PineconeDocumentStore( + api_key=Secret.from_env_var("PINECONE_API_KEY"), + environment="gcp-starter", + index="fiqa", + namespace="default", + dimension=768, +) + +dense_retriever = PineconeEmbeddingRetriever(document_store=dense_document_store, top_k=10) +text_embedder = SentenceTransformersTextEmbedder( + model="sentence-transformers/all-mpnet-base-v2", +) +similarity_ranker = 
TransformersSimilarityRanker(model="BAAI/bge-reranker-large", top_k=10) +litm_ranker = LostInTheMiddleRanker(top_k=10) + +dense_pipeline = Pipeline() +dense_pipeline.add_component( + instance=text_embedder, + name="text_embedder", +) +dense_pipeline.add_component(instance=dense_retriever, name="embedding_retriever") +dense_pipeline.add_component(instance=similarity_ranker, name="similarity_ranker") +dense_pipeline.add_component(instance=litm_ranker, name="litm_ranker") + + +dense_pipeline.connect("text_embedder", "embedding_retriever") +dense_pipeline.connect("embedding_retriever.documents", "similarity_ranker.documents") +dense_pipeline.connect("similarity_ranker.documents", "litm_ranker.documents") + +result_qrels_all = {} + +for query_id, query in tqdm(queries.items()): + output = dense_pipeline.run( + { + "text_embedder": {"text": query}, + "similarity_ranker": {"query": query}, + } + ) + output_docs = output["litm_ranker"]["documents"] + doc_qrels = {} + for doc in output_docs: + doc_qrels[doc.meta["corpus_id"]] = doc.score + result_qrels_all[query_id] = doc_qrels + +evaluator = BeirEvaluator(qrels, result_qrels_all, [3, 5, 7, 10]) +ndcg, _map, recall, precision = evaluator.evaluate()