diff --git a/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb new file mode 100644 index 0000000..a20eaf9 --- /dev/null +++ b/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb @@ -0,0 +1,611 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b5718b88-0de5-4874-8a9b-e9cad42d1a86", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mongodb-developer/GenAI-Showcase/blob/main/notebooks/advanced_techniques/contextual_chunk_embedding.ipynb)\n", + "\n", + "[![View Article](https://img.shields.io/badge/View%20Article-blue)](https://www.mongodb.com/developer/products/atlas/contextual-chunk-embeddings/?utm_campaign=devrel&utm_source=cross-post&utm_medium=organic_social&utm_content=https%3A%2F%2Fgithub.com%2Fmongodb-developer%2FGenAI-Showcase&utm_term=apoorva.joshi)" + ] + }, + { + "cell_type": "markdown", + "id": "b93d31ee-ebe0-4393-88f9-7ce6b9dca241", + "metadata": {}, + "source": [ + "# Contextualized chunk embeddings: Combining local detail with global context\n", + "\n", + "This notebook shows you how to implement and evaluate Voyage AI's _voyage-context-3_ contextualized chunk embedding model." + ] + }, + { + "cell_type": "markdown", + "id": "21c19fbe-e42e-4a46-9928-f439eb54caf2", + "metadata": {}, + "source": [ + "## Step 1: Install required libraries\n", + "\n", + "- **datasets**: Python library for accessing datasets on the Hugging Face Hub\n", + "- **pdfplumber**: Python library to parse and analyze PDFs\n", + "- **langchain-text-splitters**: Text chunking utilities in LangChain\n", + "- **tiktoken**: Token counting and encoding library\n", + "- **voyageai**: Python library to interact with Voyage AI's APIs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91a9f9cb-d865-4451-b507-63be68291e0b", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -qU datasets==4.3.0 pdfplumber==0.11.7 langchain-text-splitters==0.3.11 voyageai==0.3.5 tiktoken==0.12.0" + ] + }, + { + "cell_type": "markdown", + "id": "e8a58c57-6b0d-4b72-aa60-325605a8ce6a", + "metadata": {}, + "source": [ + "## Step 2: Set up prerequisites\n", + "\n", + "Follow the steps [here](https://dashboard.voyageai.com/organization/api-keys) to obtain a Voyage AI API key."
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "d413f794-23b8-4b1b-98ae-2443a53457df", + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "import voyageai" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf6ff564-053c-4d2b-8a38-6a0b1a17a85e", + "metadata": {}, + "outputs": [], + "source": [ + "# Set Voyage AI API key as an environment variable\n", + "os.environ[\"VOYAGE_API_KEY\"] = getpass.getpass(\"Enter your VoyageAI API key:\")\n", + "# Initialize the Voyage AI client\n", + "voyage_client = voyageai.Client()" + ] + }, + { + "cell_type": "markdown", + "id": "4df97e59-d542-4851-81fe-d6a38988e609", + "metadata": {}, + "source": [ + "## Step 3: Download the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "3ec450d7-6d6f-42ec-ab7b-c007fd0c1c68", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "# Download a dataset from Hugging Face\n", + "docs = load_dataset(\"MongoDB/legal-docs\", split=\"train\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "2bdac794-1454-42d0-8c4e-fafce420042d", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the first PDF in the dataset\n", + "pdf = docs[0][\"pdf\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "f95f5434-9acf-491f-bca5-02b61fde9bd2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "40" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the number of pages in the PDF\n", + "len(pdf.pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "be8f15a9-573d-48d5-86a9-6c876d3a4aa1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Exhibit 10.2\\nExecution Version\\nINTELLECTUAL PROPERTY AGREEMENT\\nThis INTELLECTUAL PROPERTY AGREEMENT (this “Agreement”), dated as of December 31, 2018 (the “Effective Date”) is entered into by and\\nbetween Armstrong Flooring, Inc., a Delaware corporation (“Seller”) and AFI Licensing LLC, a Delaware limited liability company (“Licensing” and\\ntogether with Seller, “Arizona”) and AHF Holding, Inc. 
(formerly known as Tarzan HoldCo, Inc.), a Delaware corporation (“Buyer”) and Armstrong\\nHardwood Flooring Company, a Tennessee corporation (the “Company” and together with Buyer the “Buyer Entities”) (each of Arizona on the one hand\\nand the Buyer Entities on the other hand, a “Party” and collectively, the “Parties”).\\nWHEREAS, Seller and Buyer have entered into that certain Stock Purchase Agreement, dated November 14, 2018 (the “Stock Purchase\\nAgreement”); WHEREAS, pursuant to the Stock Purchase Agreement, Seller has agreed to sell and transfer, and Buyer has agreed to purchase and\\nacquire, all of Seller’s right, title and interest in and to Armstrong Wood Products, Inc., a Delaware corporation (“AWP”) and its Subsidiaries, the\\nCompany and HomerWood Hardwood Flooring Company, a Delaware corporation (“HHFC,” and together with the Company, the “Company\\nSubsidiaries” and together with AWP, the “Company Entities” and each a “Company Entity”) by way of a purchase by Buyer and sale by Seller of the\\nShares, all upon the terms and condition set forth therein;\\nWHEREAS, Arizona owns certain Copyrights, Know-How, Patents and Trademarks which may be used in the Company Field, and in connection\\nwith the transactions contemplated by the Stock Purchase Agreement the Company desires to acquire all of Arizona’s right, title and interest in and to\\nsuch Intellectual Property used exclusively in the Company Field, and obtain a license from Arizona to use other such Intellectual Property on the terms\\nand subject to the conditions set forth herein;\\nWHEREAS, Seller is signatory to the Trademark License Agreement pursuant to which Seller obtains a license to the Arizona Licensed\\nTrademarks;\\nWHEREAS, the Company desires to obtain a sublicense to use the Arizona Licensed Trademarks in the Company Field;\\nWHEREAS, Arizona has obtained consent from all counterparties to the Trademark License Agreement to grant to the Company the sublicenses\\nto the Arizona Licensed Trademarks included in this Agreement; and\\nWHEREAS, the Company Entities own certain Copyrights and Know-How which may be used in the Arizona Field, and in connection with the\\ntransactions contemplated by the Stock Purchase Agreement, Arizona desires to obtain a license from the Company Entities to use such Intellectual\\nProperty on the terms and subject to the conditions set forth herein.\\nNOW, THEREFORE, in consideration of the foregoing and the mutual agreements, provisions and covenants contained in this Agreement, and\\nfor other good and valuable consideration, the receipt and sufficiency of which are hereby acknowledged, the Parties hereby agree as follows:\\nSource: ARMSTRONG FLOORING, INC., 8-K, 1/7/2019'" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Preview the first page in the PDF\n", + "pdf.pages[0].extract_text()" + ] + }, + { + "cell_type": "markdown", + "id": "95ef4336-38a5-485c-81ee-075692e3dab5", + "metadata": {}, + "source": [ + "## Step 4: Chunk the PDF content" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "b45e326c-da92-4f2e-9cc7-dd32bd72c2e4", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_text_splitters import RecursiveCharacterTextSplitter" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "e8e3e365-7393-47bc-90e2-4df6fad34c05", + "metadata": {}, + "outputs": [], + "source": [ + "separators = [\"\\n\\n\", \"\\n\", \" \", \"\", \"#\", \"##\", \"###\"]\n", + 
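"# Note: separators are tried in order and \"\" matches anywhere, so the heading\n", + "# separators listed after it are never reached; chunks of up to ~200 tokens are\n", + "# produced by splitting on paragraph breaks, newlines, and spaces.\n", +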
"text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n", + " model_name=\"gpt-4\", separators=separators, chunk_size=200, chunk_overlap=0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "ba9771e9-c8ee-4cb8-a6fc-d3a14292e0b3", + "metadata": {}, + "outputs": [], + "source": [ + "chunked_docs = []\n", + "# Iterate through the documents\n", + "for doc_id, doc in enumerate(docs):\n", + " pages = doc[\"pdf\"].pages\n", + " # Keep track of chunk IDs per document\n", + " chunk_id = 0\n", + " # Iterate through the pages in each document\n", + " for page in pages:\n", + " chunks = text_splitter.split_text(page.extract_text())\n", + " for chunk in chunks:\n", + " chunked_docs.append(\n", + " {\"chunk\": chunk, \"chunk_id\": chunk_id, \"doc_id\": doc_id}\n", + " )\n", + " chunk_id += 1" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "c35afc6b-745e-4a2e-ba8a-73f328858bef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'chunk': 'Exhibit 10.2\\nExecution Version\\nINTELLECTUAL PROPERTY AGREEMENT\\nThis INTELLECTUAL PROPERTY AGREEMENT (this “Agreement”), dated as of December 31, 2018 (the “Effective Date”) is entered into by and\\nbetween Armstrong Flooring, Inc., a Delaware corporation (“Seller”) and AFI Licensing LLC, a Delaware limited liability company (“Licensing” and\\ntogether with Seller, “Arizona”) and AHF Holding, Inc. (formerly known as Tarzan HoldCo, Inc.), a Delaware corporation (“Buyer”) and Armstrong\\nHardwood Flooring Company, a Tennessee corporation (the “Company” and together with Buyer the “Buyer Entities”) (each of Arizona on the one hand\\nand the Buyer Entities on the other hand, a “Party” and collectively, the “Parties”).',\n", + " 'chunk_id': 0,\n", + " 'doc_id': 0}" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chunked_docs[0]" + ] + }, + { + "cell_type": "markdown", + "id": "a99b4582-0aa4-4783-80ad-7e689a0d04e4", + "metadata": {}, + "source": [ + "## Step 5: Embed the chunks" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "76c96323-c60c-42a0-883c-e3e9e0746949", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "47073960-ce27-4331-9447-9c69c46e8bb9", + "metadata": {}, + "outputs": [], + "source": [ + "def get_std_embeddings(input: List[str], input_type: str) -> List[List[float]]:\n", + " \"\"\"\n", + " Generate context-agnostic embeddings.\n", + "\n", + " Args:\n", + " input (List[str]): List of document chunks or query wrapped in a list\n", + " input_type: Either \"document\" or \"query\"\n", + "\n", + " Returns:\n", + " List[List[float]]: List of embedding vectors\n", + " \"\"\"\n", + " response = voyage_client.embed(input, model=\"voyage-3-large\", input_type=input_type)\n", + " return response.embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "0ce67a4e-8474-4610-a4af-b43b4a397e05", + "metadata": {}, + "outputs": [], + "source": [ + "def get_contextualized_embeddings(\n", + " input: List[List[str]], input_type: str\n", + ") -> List[List[float]]:\n", + " \"\"\"\n", + " Generate contextualized chunk embeddings.\n", + "\n", + " Args:\n", + " input (List[List[str]]): List of document chunks or query wrapped in a list of lists\n", + " input_type: Either \"document\" or \"query\"\n", + "\n", + " Returns:\n", + " List[List[float]]: List of embedding vectors\n", + " \"\"\"\n", + 
" response = voyage_client.contextualized_embed(\n", + " input, model=\"voyage-context-3\", input_type=input_type\n", + " )\n", + " return [emb for r in response.results for emb in r.embeddings]" + ] + }, + { + "cell_type": "markdown", + "id": "dedb68ed-43d5-4aee-8166-8c032bb41f49", + "metadata": {}, + "source": [ + "## Step 6: Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "c3e29bf9-4170-4109-8c0b-c5c4d38523e7", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "a2914983-bafb-404a-a005-c70b9eeba327", + "metadata": {}, + "outputs": [], + "source": [ + "queries = [\n", + " {\n", + " \"question\": \"Which state’s law governs the agreement between Armstrong Flooring and AHF Holding?\",\n", + " \"doc_id\": 0,\n", + " \"chunk_id\": 44,\n", + " },\n", + " {\n", + " \"question\": \"In the Armstrong-AHF agreement, how many days' notice is required to remedy a breach?\",\n", + " \"doc_id\": 0,\n", + " \"chunk_id\": 9,\n", + " },\n", + " {\n", + " \"question\": \"Where will disputes be resolved under the agreement between Armstrong Flooring and AHF Holding?\",\n", + " \"doc_id\": 0,\n", + " \"chunk_id\": 44,\n", + " },\n", + " {\n", + " \"question\": \"What happens if either party materially breaches the Armstrong-AHF Intellectual agreement?\",\n", + " \"doc_id\": 0,\n", + " \"chunk_id\": 35,\n", + " },\n", + " {\n", + " \"question\": \"Under the Armstrong Flooring-AHF Holding agreement, what is the minimum logo size?\",\n", + " \"doc_id\": 0,\n", + " \"chunk_id\": 94,\n", + " },\n", + " {\n", + " \"question\": \"When does Playa Hotels & Resorts' right of first offer expire?\",\n", + " \"doc_id\": 1,\n", + " \"chunk_id\": 4,\n", + " },\n", + " {\n", + " \"question\": \"Which state’s law governs the agreement between Hyatt Franchising Latin America and Playa Hotels & Resorts B.V.?\",\n", + " \"doc_id\": 1,\n", + " \"chunk_id\": 13,\n", + " },\n", + " {\n", + " \"question\": \"What countries can Hyatt Franchising Latin America and Playa develop Hyatt All-Inclusive Resorts in?\",\n", + " \"doc_id\": 1,\n", + " \"chunk_id\": 1,\n", + " },\n", + " {\n", + " \"question\": \"How many years of hotel experience must arbitrators have under the Hyatt-Playa agreement?\",\n", + " \"doc_id\": 1,\n", + " \"chunk_id\": 15,\n", + " },\n", + " {\n", + " \"question\": \"Where will arbitration take place for disputes under the Hyatt-Playa agreement?\",\n", + " \"doc_id\": 1,\n", + " \"chunk_id\": 15,\n", + " },\n", + " {\n", + " \"question\": \"When was the Quaker/Gulf Houghton agreement effective?\",\n", + " \"doc_id\": 2,\n", + " \"chunk_id\": 0,\n", + " },\n", + " {\n", + " \"question\": \"Which state’s law governs the Quaker/Gulf Houghton agreement?\",\n", + " \"doc_id\": 2,\n", + " \"chunk_id\": 18,\n", + " },\n", + " {\n", + " \"question\": \"What is the geographic scope of the Quaker/Gulf Houghton agreement?\",\n", + " \"doc_id\": 2,\n", + " \"chunk_id\": 9,\n", + " },\n", + " {\n", + " \"question\": \"What percentage of publicly traded securities can Gulf Houghton sellers own as passive investors?\",\n", + " \"doc_id\": 2,\n", + " \"chunk_id\": 9,\n", + " },\n", + " {\n", + " \"question\": \"How long must before Gulf Houghton sellers can hire former employees?\",\n", + " \"doc_id\": 2,\n", + " \"chunk_id\": 12,\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "766918f0-ffa3-4068-8a33-03e28f586924", + "metadata": {}, + "outputs": [], + "source": [ + 
"def calculate_metrics(query, chunk_embds, embd_type, k):\n", + " # Get query embeddings\n", + " if embd_type == \"standard\":\n", + " query_embd = get_std_embeddings([query[\"question\"]], \"query\")[0]\n", + " elif embd_type == \"contextual\":\n", + " query_embd = get_contextualized_embeddings([[query[\"question\"]]], \"query\")[0]\n", + " # Calculate pairwise dot product similarity\n", + " similarities = np.dot(chunk_embds, query_embd)\n", + " # Get indices of the top k by similarity\n", + " top_k_idxs = np.argsort(similarities)[::-1][:k]\n", + " # Get the top k most similar chunks\n", + " top_k_docs = [chunked_docs[i] for i in top_k_idxs]\n", + " rank = None\n", + " for i, doc in enumerate(top_k_docs):\n", + " # Check for golden chunk\n", + " if doc[\"doc_id\"] == query[\"doc_id\"] and doc[\"chunk_id\"] == query[\"chunk_id\"]:\n", + " rank = i + 1\n", + " break\n", + "\n", + " recall = 1 if rank else 0\n", + " return recall, rank" + ] + }, + { + "cell_type": "markdown", + "id": "70ccdfe0-363e-40d5-b7a1-c96b5bf7afae", + "metadata": {}, + "source": [ + "### Standard Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "5e0bcbdb-53cd-495e-b39a-4c08529d6ba4", + "metadata": {}, + "outputs": [], + "source": [ + "std_embds = get_std_embeddings([record[\"chunk\"] for record in chunked_docs], \"document\")" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "4222a5e2-45cd-4f33-9bdc-cf8b6a5330c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Which state’s law governs the agreement between Armstrong Flooring and AHF Holding?: 1\n", + "In the Armstrong-AHF agreement, how many days' notice is required to remedy a breach?: None\n", + "Where will disputes be resolved under the agreement between Armstrong Flooring and AHF Holding?: None\n", + "What happens if either party materially breaches the Armstrong-AHF Intellectual agreement?: 1\n", + "Under the Armstrong Flooring-AHF Holding agreement, what is the minimum logo size?: 2\n", + "When does Playa Hotels & Resorts' right of first offer expire?: 5\n", + "Which state’s law governs the agreement between Hyatt Franchising Latin America and Playa Hotels & Resorts B.V.?: 3\n", + "What countries can Hyatt Franchising Latin America and Playa develop Hyatt All-Inclusive Resorts in?: 2\n", + "How many years of hotel experience must arbitrators have under the Hyatt-Playa agreement?: 1\n", + "Where will arbitration take place for disputes under the Hyatt-Playa agreement?: 2\n", + "When was the Quaker/Gulf Houghton agreement effective?: 1\n", + "Which state’s law governs the Quaker/Gulf Houghton agreement?: 2\n", + "What is the geographic scope of the Quaker/Gulf Houghton agreement?: None\n", + "What percentage of publicly traded securities can Gulf Houghton sellers own as passive investors?: 1\n", + "How long must before Gulf Houghton sellers can hire former employees?: 2\n", + "Mean recall: 80.00%\n", + "Mean reciprocal rank: 53.56%\n" + ] + } + ], + "source": [ + "recalls = []\n", + "reciprocal_ranks = []\n", + "for query in queries:\n", + " recall, rank = calculate_metrics(query, std_embds, \"standard\", 5)\n", + " recalls.append(recall)\n", + " print(f\"{query['question']}: {rank}\")\n", + " reciprocal_ranks.append(1 / rank if rank else 0.0)\n", + "\n", + "print(f\"Mean recall: {np.mean(recalls) * 100:.2f}%\")\n", + "print(f\"Mean reciprocal rank: {np.mean(reciprocal_ranks) * 100:.2f}%\")" + ] + }, + { + "cell_type": "markdown", + "id": 
"7c6f40f1-d46b-4d04-b0ff-3e95cfb2278d", + "metadata": {}, + "source": [ + "### Contextualized Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "f1b618d5-8fe0-4eb0-ac54-465ced3d2000", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "d0c2ea78-ce96-4743-9b2a-2bc3ad054a3e", + "metadata": {}, + "outputs": [], + "source": [ + "# Convert chunked_docs to list of lists of document chunks-- one list per document\n", + "grouped_docs = defaultdict(list)\n", + "for chunk in chunked_docs:\n", + " grouped_docs[chunk[\"doc_id\"]].append(chunk[\"chunk\"])\n", + "\n", + "chunks_by_doc = list(grouped_docs.values())" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "a3117edc-ec2b-481c-a8d7-479080d11118", + "metadata": {}, + "outputs": [], + "source": [ + "ctxt_embds = get_contextualized_embeddings(chunks_by_doc, \"document\")" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "95d9c26f-be8c-4962-a437-8debf0cce199", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Which state’s law governs the agreement between Armstrong Flooring and AHF Holding?: 1\n", + "In the Armstrong-AHF agreement, how many days' notice is required to remedy a breach?: None\n", + "Where will disputes be resolved under the agreement between Armstrong Flooring and AHF Holding?: 2\n", + "What happens if either party materially breaches the Armstrong-AHF Intellectual agreement?: 1\n", + "Under the Armstrong Flooring-AHF Holding agreement, what is the minimum logo size?: 2\n", + "When does Playa Hotels & Resorts' right of first offer expire?: 5\n", + "Which state’s law governs the agreement between Hyatt Franchising Latin America and Playa Hotels & Resorts B.V.?: 1\n", + "What countries can Hyatt Franchising Latin America and Playa develop Hyatt All-Inclusive Resorts in?: 3\n", + "How many years of hotel experience must arbitrators have under the Hyatt-Playa agreement?: 1\n", + "Where will arbitration take place for disputes under the Hyatt-Playa agreement?: 1\n", + "When was the Quaker/Gulf Houghton agreement effective?: 1\n", + "Which state’s law governs the Quaker/Gulf Houghton agreement?: 1\n", + "What is the geographic scope of the Quaker/Gulf Houghton agreement?: 2\n", + "What percentage of publicly traded securities can Gulf Houghton sellers own as passive investors?: 1\n", + "How long must before Gulf Houghton sellers can hire former employees?: 1\n", + "Mean recall: 93.33%\n", + "Mean reciprocal rank: 73.56%\n" + ] + } + ], + "source": [ + "recalls = []\n", + "ranks = []\n", + "reciprocal_ranks = []\n", + "for query in queries:\n", + " recall, rank = calculate_metrics(query, ctxt_embds, \"contextual\", 5)\n", + " recalls.append(recall)\n", + " print(f\"{query['question']}: {rank}\")\n", + " reciprocal_ranks.append(1 / rank if rank else 0.0)\n", + "\n", + "print(f\"Mean recall: {np.mean(recalls) * 100:.2f}%\")\n", + "print(f\"Mean reciprocal rank: {np.mean(reciprocal_ranks) * 100:.2f}%\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "widgets": { + 
"application/vnd.jupyter.widget-state+json": { + "state": {} + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ruff.toml b/ruff.toml index 4faecee..88ec02d 100644 --- a/ruff.toml +++ b/ruff.toml @@ -24,6 +24,7 @@ ignore = [ "B007", # Loop control variable `index` not used within loop body "B008", # Do not perform function call `File` in argument defaults "B904", # Within an `except` clause, raise exceptions with `raise ... from err`" + "RUF001", # String contains ambiguous `’` "RUF005", # Consider iterable unpacking instead of concatenation" "RUF015", # Prefer `next(iter(queries.items()))` over single element slice "F841", # Local variable `full_text_search_result` is assigned to but never used"