@@ -7,6 +7,7 @@ metadata:
name: edp-ingestion-configmap
namespace: edp
data:
EMBEDDING_MODEL_NAME: {{ .Values.ingestion.config.embedding_model_name | quote }}
VECTOR_STORE: {{ .Values.ingestion.config.vector_store | quote }}
VECTOR_ALGORITHM: {{ .Values.ingestion.config.vector_algorithm | quote }}
VECTOR_DIMS: {{ .Values.ingestion.config.vector_dims | quote }}
3 changes: 3 additions & 0 deletions deployment/components/edp/values.yaml
@@ -26,6 +26,8 @@ edpOidcConfigUrl: "http://keycloak-http.auth.svc/realms/EnterpriseRAG/.well-know
edpOidcClientSecret: ""
bucketNameRegexFilter: '.*'
presignedUrlCredentialsSystemFallback: "false"
embedding_model_name: &embedding_model_name "BAAI/bge-base-en-v1.5"


minioApiDomain: &minioApiDomain "s3.erag.com"
minioBrowserDomain: &minioBrowserDomain "minio.erag.com"
@@ -895,6 +897,7 @@ ingestion:
tag: latest
config:
opeaLoggerLevel: "INFO" # "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"
embedding_model_name: *embedding_model_name # e.g., "BAAI/bge-base-en-v1.5"
# Vector Algorithm configuration
vector_algorithm: "FLAT" # "FLAT", "HNSW"
vector_dims: "768" # Depends on model used in embedding. For example bge-base-en-v1.5=768, bge-large-en-v1.5=1024
2 changes: 2 additions & 0 deletions deployment/components/gmc/values.yaml
@@ -221,6 +221,8 @@ images:
tag: *tag
pullPolicy: Always
envfile: "src/comps/retrievers/impl/microservice/.env"
envs:
EMBEDDING_MODEL_NAME: *embedding_model_name
ingestion-usvc:
image: "erag-ingestion"
repository: *repo
4 changes: 3 additions & 1 deletion deployment/roles/application/edp/templates/values.yaml.j2
@@ -12,6 +12,8 @@ proxy:
alternateTagging: {{ use_alternate_tagging }}
{% endif %}

embedding_model_name: &embedding_model_name {{ embedding_model_name }}

{% set storage = lookup('env', 'edp_storage_type') or edp.storageType if edp.storageType is defined else "minio" %}
{% if storage == "minio" %}
edpAccessKey: {{ EDP_MINIO_ACCESS_KEY }}
@@ -136,10 +138,10 @@ ingestion:
tag: {{ tag }}
repository: {{ registry }}
config:
embedding_model_name: *embedding_model_name
{% if edp.hierarchical_indices.enabled is true %}
use_hierarchical_indices: "True"
{% endif %}
config:
vector_dims: {{ vector_databases.vector_dims }}
vector_datatype: {{ vector_databases.vector_datatype }}
{% if edp.late_chunking.enabled is true %}
@@ -148,6 +148,7 @@ images:
vector_store: {{ vector_databases.vector_store }}
{% endif %}
envs:
EMBEDDING_MODEL_NAME: *embedding_model_name
{% if edp.hierarchical_indices.enabled is true %}
USE_HIERARCHICAL_INDICES: "True"
K_SUMMARIES: {{ edp.hierarchical_indices.kSummaries }}
3 changes: 2 additions & 1 deletion src/comps/vectorstores/utils/connectors/connector_redis.py
@@ -79,7 +79,8 @@ def _metadata_schema(self):
return metadata_schema

def _vector_schema(self, schema: dict, metadata_schema: Optional[dict]=None) -> IndexSchema:
index_name = f"{schema['algorithm'].lower()}_{schema['datatype'].lower()}_{schema['distance_metric'].lower()}_index"
model_name = sanitize_env(os.getenv("EMBEDDING_MODEL_NAME", "default")).replace("/", "_").replace("-", "_")
index_name = f"{model_name.lower()}_{schema['algorithm'].lower()}_{schema['datatype'].lower()}_{schema['distance_metric'].lower()}_{schema['dims']}_index"

data = {
"index": {
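For reference, a minimal standalone sketch of the model-aware index naming introduced above. It omits the `sanitize_env` call, and the schema values (FLAT / FLOAT32 / COSINE / 768) are assumed purely for illustration; the real values come from the vector-store configuration:

```python
# Minimal sketch of the new model-aware index naming, not the connector itself.
# Assumed schema values (FLAT / FLOAT32 / COSINE / 768) are for illustration only.
import os

def build_index_name(schema: dict) -> str:
    # Mirrors the naming logic added in connector_redis.py (without sanitize_env).
    model_name = os.getenv("EMBEDDING_MODEL_NAME", "default").replace("/", "_").replace("-", "_")
    return (
        f"{model_name.lower()}_{schema['algorithm'].lower()}_{schema['datatype'].lower()}_"
        f"{schema['distance_metric'].lower()}_{schema['dims']}_index"
    )

os.environ["EMBEDDING_MODEL_NAME"] = "BAAI/bge-base-en-v1.5"
print(build_index_name({"algorithm": "FLAT", "datatype": "FLOAT32",
                        "distance_metric": "COSINE", "dims": "768"}))
# -> baai_bge_base_en_v1.5_flat_float32_cosine_768_index
```

With the model name folded into the index name, switching embedding models yields a distinct Redis index instead of reusing one built with a different model's vectors.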
84 changes: 63 additions & 21 deletions src/tests/e2e/evals/evaluation/rag_eval/README.md
@@ -12,9 +12,9 @@
- [MultiHop (English dataset)](#multihop-english-dataset)
- [Evaluation](#evaluation)
- [Usage Guide](#usage-guide)
- [Tips to Control the Evaluation Scope](#tips-to-control-the-evaluation-scope)
- [Acknowledgements](#acknowledgements)


<!-- /TOC -->

## Introduction
@@ -187,7 +187,7 @@ This evaluation uses [yixuantt/MultiHopRAG](https://huggingface.co/datasets/yixu

### Evaluation

This section explains how to run the evaluation pipeline for Multihop dataset.
This section explains how to run the evaluation pipeline for the MultiHop dataset.

The evaluation script is located at `examples/eval_multihop.py`.

@@ -199,23 +199,42 @@ python eval_multihop.py --help

| **Argument** | **Default Value** | **Description** |
| ---------------------- |---------------------------------------------------|-------------------------------------------------------------------------------------------------|
| `--output_dir` | `./output` | Directory to save evaluation results |
| `--auth_file` | `deployment/ansible-logs/default_credentials.txt` | Path to credentials file with `KEYCLOAK_ERAG_ADMIN_USERNAME` and `KEYCLOAK_ERAG_ADMIN_PASSWORD` |
| `--cluster_config_file`| `deployment/inventory/sample/config.yaml` | Path to cluster configuration YAML file with deployment settings |
| `--dataset_path` | `multihop_dataset/MultiHopRAG.json` | Path to the evaluation dataset |
| `--docs_path` | `multihop_dataset/corpus.json` | Path to the documents for retrieval |
| `--limits` | `100` | Number of queries to evaluate (0 means evaluate all; default: 100) |
| `--ingest_docs` | *(flag)* | Ingest documents into the vector database (use only on first run) |
| `--generation_metrics` | *(flag)* | Compute text generation metrics (`BLEU`, `ROUGE`) |
| `--retrieval_metrics` | *(flag)* | Compute retrieval metrics (`Hits@K`, `MAP@K`, `MRR@K`) |
| `--skip_normalize` | *(flag)* | Skip 'None' separator normalization for exact 1:1 text matching |
| `--ragas_metrics` | *(flag)* | Compute RAGAS metrics (answer correctness, context precision, etc.) |
| `--resume_checkpoint` | *None* | Path to a checkpoint file to resume evaluation from previous state |
| `--keep_checkpoint` | *(flag)* | Keep the checkpoint file after evaluation (do not delete) |
| `--llm_judge_endpoint` | `http://localhost:8008` | URL of the LLM judge service; only used for RAGAS evaluation |
| `--embedding_endpoint` | `http://localhost:8090/embed` | URL of the embedding service endpoint, only used for RAGAS |
| `--temperature` | Read from RAG system config | Controls text generation randomness; defaults to RAG system setting if omitted. |
| `--max_new_tokens` | Read from RAG system config | Maximum tokens generated; defaults to RAG system setting if omitted. |
| `--output_dir` | `./output` | Directory to save evaluation results |
| `--auth_file` | `deployment/ansible-logs/default_credentials.txt` | Path to credentials file with `KEYCLOAK_ERAG_ADMIN_USERNAME` and `KEYCLOAK_ERAG_ADMIN_PASSWORD` |
| `--cluster_config_file`| `deployment/inventory/sample/config.yaml` | Path to cluster configuration YAML file with deployment settings |
| `--dataset_path` | `multihop_dataset/MultiHopRAG.json` | Path to the evaluation dataset |
| `--docs_path` | `multihop_dataset/corpus.json` | Path to the documents for retrieval |
| `--limits` | `100` | Number of queries to evaluate (0 means evaluate all; default: 100) |
| `--exclude_types` | *None* | Exclude queries by question type. Queries matching these question types will be skipped. Example: --exclude_types comparison_query |
| `--ingest_docs` | *(flag)* | Ingest documents into the vector database (use only on first run) |
| `--generation_metrics` | *(flag)* | Compute text generation metrics (`BLEU`, `ROUGE`) |
| `--retrieval_metrics` | *(flag)* | Compute retrieval metrics (`Hits@K`, `MAP@K`, `MRR@K`) |
| `--skip_normalize` | *(flag)* | Skip 'None' separator normalization for exact 1:1 text matching |
| `--ragas_metrics` | *(flag)* | Compute RAGAS metrics (answer correctness, context precision, etc.) |
| `--resume_checkpoint` | *None* | Path to a checkpoint file to resume evaluation from previous state |
| `--keep_checkpoint` | *(flag)* | Keep the checkpoint file after evaluation (do not delete) |
| `--llm_judge_endpoint` | `http://localhost:8008` | URL of the LLM judge service; only used for RAGAS evaluation |
| `--embedding_endpoint` | `http://localhost:8090/embed` | URL of the embedding service endpoint, only used for RAGAS |
| `--temperature` | Read from RAG system config | Controls text generation randomness; defaults to RAG system setting if omitted |
| `--max_new_tokens` | Read from RAG system config | Maximum tokens generated; defaults to RAG system setting if omitted |


> Note: If `--dataset_path` and `--docs_path` are set to their default values and the corresponding files are not found locally, they will be automatically downloaded at runtime from [yixuantt/MultiHopRAG](https://huggingface.co/datasets/yixuantt/MultiHopRAG) and saved to the expected local paths.
@@ -224,7 +243,7 @@ python eval_multihop.py --help

### Usage Guide

This section outlines how to run Multihop evaluation of the RAG pipeline using [examples/eval_multihop.py](examples/eval_multihop.py).
This section outlines how to run MultiHop evaluation of the RAG pipeline using [examples/eval_multihop.py](examples/eval_multihop.py).
- **Ingest Documents**

To ingest the MultiHop dataset into the RAG system, use the flag `--ingest_docs`:
@@ -250,7 +269,7 @@ This section outlines how to run Multihop evaluation of the RAG pipeline using [

_Metrics: BLEU, ROUGE, (LLM-score – not implemented yet)_

To evaluate the quality of RAG generated answers on Multihop queries, run:
To evaluate the quality of RAG-generated answers on MultiHop queries, run:

```bash
# First-time run (with document ingestion)
@@ -374,6 +393,29 @@ The evaluation results are stored in the output/ directory with detailed logs an

The query and its corresponding ground_truth_text originate from the yixuantt/MultiHopRAG dataset.

### Tips to Control the Evaluation Scope

**Controlling the Number of Queries with `--limits`:**

The `--limits` parameter allows you to control how many queries from the dataset are evaluated. This is particularly useful for quick testing during development to verify that the pipeline works correctly.

```bash
# Evaluate only the first 2 queries (quick test)
python eval_multihop.py --generation_metrics --limits 2
```

**Excluding Specific Query Types with `--exclude_types`:**

The dataset specified by `--dataset_path` (default: `multihop_dataset/MultiHopRAG.json`) contains queries along with their `question_type`. You can exclude specific query types from evaluation with the `--exclude_types` parameter. This is useful when you want to focus on particular aspects of your RAG system, for example to compute accuracy metrics separately for each query type and identify which types your pipeline handles better or worse (a small post-processing sketch follows the examples below).

```bash
# Exclude comparison queries from evaluation
python eval_multihop.py --retrieval_metrics --exclude_types comparison_query

# Exclude multiple query types (space-separated)
python eval_multihop.py --retrieval_metrics --exclude_types comparison_query inference_query
```
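
If you want per-type numbers rather than exclusion, a rough post-processing sketch like the one below groups per-query results by their logged `question_type` and averages the metrics for each type. The field names follow the logs added in this change, but the results file path and exact record shape are assumptions; adjust them to the files actually written under `output/`.

```python
# Rough sketch: per-question-type metric averages from saved evaluation results.
# Assumptions: "output/results.json" is a hypothetical path, and each record is a
# dict with a "question_type" string and a "metrics" mapping of numeric scores.
import json
from collections import defaultdict
from statistics import mean

with open("output/results.json") as f:
    results = json.load(f)

by_type = defaultdict(list)
for record in results:
    by_type[record.get("question_type", "unknown")].append(record)

for qtype, items in sorted(by_type.items()):
    # Collect every numeric metric seen for this question type and average it.
    metric_names = {name
                    for item in items
                    for name, value in item.get("metrics", {}).items()
                    if isinstance(value, (int, float))}
    averages = {name: round(mean(item["metrics"][name]
                                 for item in items
                                 if name in item.get("metrics", {})), 4)
                for name in metric_names}
    print(f"{qtype}: n={len(items)} {averages}")
```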


## Acknowledgements
This example is mostly adapted from the [MultiHop-RAG](https://github.com/yixuantt/MultiHop-RAG) repo; we thank the authors for their great work!
5 changes: 5 additions & 0 deletions src/tests/e2e/evals/evaluation/rag_eval/evaluator.py
@@ -140,6 +140,9 @@ def get_golden_context(self, data: dict):
def get_query(self, data: dict):
raise NotImplementedError("Depends on the specific dataset.")

def get_question_type(self, data: dict):
raise NotImplementedError("Depends on the specific dataset.")

def get_document(self, data: dict):
raise NotImplementedError("Depends on the specific dataset.")

@@ -164,6 +167,7 @@ def scoring(self, data: dict) -> dict:
},
"log": {
"query": self.get_query(data),
"query_type": self.get_question_type(data),
"generated_text": generated_text,
"ground_truth_text": ground_truth_text,
"evaluateDatetime": str(datetime.now()),
@@ -217,6 +221,7 @@ def scoring_retrieval(self, data: dict, normalize: bool = True) -> dict:
},
"log": {
"query": self.get_query(data),
"query_type": self.get_question_type(data),
"golden_context": golden_context,
"num_retrieved_documents": len(retrieved_documents),
"num_reranked_documents": len(reranked_documents),
42 changes: 39 additions & 3 deletions src/tests/e2e/evals/evaluation/rag_eval/examples/eval_multihop.py
@@ -41,6 +41,9 @@ def get_ground_truth_text(self, data: dict):
def get_query(self, data: dict):
return data["query"]

def get_question_type(self, data: dict):
return data.get("question_type") or "unknown"

def get_template(self):
return None

@@ -63,7 +66,7 @@ def evaluate(self, all_queries, arguments):
generated_text = self.send_request(query, arguments)
data["generated_text"] = generated_text

result = {"id": index, "uuid": self.get_uuid(query), **self.scoring(data)}
result = {"id": index, "uuid": self.get_uuid(query), "question_type": self.get_question_type(data), **self.scoring(data)}
logger.debug(f"Result for query {index}: {result}")
results.append(result)
index += 1
@@ -194,6 +197,7 @@ def get_retrieval_metrics(self, all_queries, arguments):

def prepare_ragas_record(self, data, arguments):
query = self.get_query(data)
question_type = self.get_question_type(data)
generated_text = self.send_request(query, arguments)

try:
@@ -204,6 +208,7 @@

return {
"query": query,
"question_type": question_type,
"generated_text": generated_text,
"ground_truth": self.get_ground_truth_text(data),
"golden_context": self.get_golden_context(data),
@@ -293,8 +298,9 @@ def get_ragas_metrics(self, all_queries, arguments):

# Store metadata for each query
query_metadata.append({
"query": result["query"],
"uuid": self.get_uuid(result["query"]),
"query": result["query"],
"question_type": result["question_type"],
"generated_text": result["generated_text"],
"ground_truth": result["ground_truth"],
"golden_context": result["golden_context"],
@@ -328,6 +334,7 @@
"ragas_metrics": score,
"log": {
"query": query_metadata[idx]["query"],
"question_type": query_metadata[idx]["question_type"],
"generated_text": query_metadata[idx]["generated_text"],
"ground_truth": query_metadata[idx]["ground_truth"],
"golden_context": query_metadata[idx]["golden_context"],
@@ -398,6 +405,7 @@ def args_parser():
parser.add_argument("--ragas_metrics", action="store_true", help="Whether to compute ragas metrics such as answer correctness, relevancy, semantic similarity, context precision, context recall , and faithfulness")
parser.add_argument("--skip_normalize", action="store_true", help="Skip normalization of 'None' separators in retrieval metrics. By default, normalization is enabled")
parser.add_argument("--limits", type=int, default=100, help="Number of queries to evaluate. Set to 0 to evaluate all provided queries")
parser.add_argument("--exclude_types", type=str, nargs='+', dest='exclude_types', help="Exclude queries by question type. Queries matching these question types will be skipped. Example: --exclude_types comparision_query")
parser.add_argument("--resume_checkpoint", type=str, help="Path to a checkpoint file to resume evaluation from previously saved progress")
parser.add_argument("--keep_checkpoint", action="store_true", help="Keep the checkpoint file after successful evaluation instead of deleting it")
parser.add_argument("--llm_judge_endpoint", type=str, default="http://localhost:8008", help="URL of the LLM judge service. Only used for RAGAS metrics")
@@ -475,6 +483,29 @@ def filter_category_null_queries(queries):

return [q for q in queries if q.get("question_type") != 'null_query']


def filter_queries_by_type(queries, exclude_types=None):
"""
Filter queries by excluding specific question types.

Args:
queries: List of query dictionaries
exclude_types: List of question types to exclude (if None, exclude none)

Returns:
Filtered list of queries
"""
if not exclude_types:
return queries

logger.info(f"Excluding question types: {exclude_types}")
# Normalize exclude_types to lowercase and strip whitespace for case-insensitive comparison
normalized_exclude_types = {qt.lower().strip() for qt in exclude_types}
filtered = [q for q in queries if q.get("question_type", "").lower().strip() not in normalized_exclude_types]

return filtered


def main():
args = args_parser()
logger.info(f"Running Multihop evaluation with arguments: {args.__dict__}")
@@ -536,8 +567,13 @@ def main():
all_queries = filter_category_null_queries(all_queries)
logger.info(f"Queries remaining: {len(all_queries)}")

# Filter by question type if specified
if args.exclude_types:
all_queries = filter_queries_by_type(all_queries, args.exclude_types)
logger.info(f"Queries after type filtering: {len(all_queries)}")

except Exception as e:
logger.error(f"Error filtering queries categorized as 'null_query': {e}")
logger.error(f"Error filtering queries: {e}")

if not all_queries:
logger.error("No queries remain after filtering 'null_query' category. Please check the dataset.")