# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
from typing import Any, Callable, Dict, List, Literal, Optional, Sequence

import numpy as np
from datasets import Dataset, load_dataset

from camel.agents import ChatAgent
from camel.benchmarks import BaseBenchmark
from camel.logger import get_logger
from camel.retrievers import AutoRetriever

logger = get_logger(__name__)


class RagasFields:
    r"""Constants for RAGAS evaluation field names."""

    INPUT_CONTEXT = "contexts"
    INPUT_QUESTION = "question"
    INPUT_ANSWER = "answer"


def annotate_dataset(
    dataset: Dataset,
    context_call: Optional[Callable[[Dict[str, Any]], List[str]]],
    answer_call: Optional[Callable[[Dict[str, Any]], str]],
) -> Dataset:
    r"""Annotate the dataset by adding contexts and answers using the
    provided functions.

    Args:
        dataset (Dataset): The input dataset to annotate.
        context_call (Optional[Callable[[Dict[str, Any]], List[str]]]):
            Function to generate the list of contexts for each example.
        answer_call (Optional[Callable[[Dict[str, Any]], str]]): Function to
            generate the answer for each example.

    Returns:
        Dataset: The annotated dataset with added contexts and/or answers.
    """

    def process_example(example: Dict[str, Any]) -> Dict[str, Any]:
        if context_call:
            example["contexts"] = context_call(example)
        if answer_call:
            example["answer"] = answer_call(example)
        return example

    return dataset.map(process_example)
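
# Illustrative sketch (not part of the original module): annotate_dataset can
# be exercised with constant callables on a toy Dataset; the values below are
# made up for demonstration.
#
#   ds = Dataset.from_dict({"question": ["What is RAG?"]})
#   annotated = annotate_dataset(
#       ds,
#       context_call=lambda ex: ["RAG pairs retrieval with generation."],
#       answer_call=lambda ex: "It grounds generation in retrieved context.",
#   )
#   # `annotated` now carries "contexts" and "answer" columns.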


def rmse(
    input_trues: Sequence[float],
    input_preds: Sequence[float],
) -> Optional[float]:
    r"""Calculate the Root Mean Squared Error (RMSE).

    Args:
        input_trues (Sequence[float]): Ground truth values.
        input_preds (Sequence[float]): Predicted values.

    Returns:
        Optional[float]: RMSE value, or None if the inputs have different
            lengths or contain no valid predictions.
    """
    if len(input_trues) != len(input_preds):
        logger.warning("Input lengths mismatch in RMSE calculation")
        return None

    trues = np.array(input_trues, dtype=float)
    preds = np.array(input_preds, dtype=float)

    # Ignore NaN values in predictions
    eval_idx = ~np.isnan(preds)
    if not np.any(eval_idx):
        logger.warning("No valid predictions for RMSE calculation")
        return None

    trues = trues[eval_idx]
    preds = preds[eval_idx]
    return float(np.sqrt(np.mean((preds - trues) ** 2)))
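
# Illustrative check: NaN predictions are masked before the error is
# computed, so only the last two pairs contribute below.
#
#   rmse([1.0, 0.0, 1.0], [float("nan"), 0.0, 0.5])
#   # -> sqrt(((0.0 - 0.0)**2 + (0.5 - 1.0)**2) / 2) ≈ 0.354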


def auroc(trues: Sequence[bool], preds: Sequence[float]) -> float:
    r"""Calculate the Area Under the Receiver Operating Characteristic
    Curve (AUROC).

    Args:
        trues (Sequence[bool]): Ground truth binary values.
        preds (Sequence[float]): Predicted probability values.

    Returns:
        float: AUROC score, or 0.5 (the random-classifier score) if there
            are no valid predictions.
    """
    from sklearn.metrics import roc_auc_score  # type: ignore[import-untyped]

    preds_arr = np.array(preds, dtype=float)
    # Ignore NaN values in predictions
    eval_idx = ~np.isnan(preds_arr)
    if not np.any(eval_idx):
        logger.warning("No valid predictions for AUROC calculation")
        return 0.5  # Score of a random classifier
    return float(
        roc_auc_score(np.array(trues)[eval_idx], preds_arr[eval_idx])
    )
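
# Illustrative check: a perfectly ranked set of predictions scores 1.0,
# while all-NaN predictions fall back to the random-classifier score 0.5.
#
#   auroc([False, False, True, True], [0.1, 0.2, 0.8, 0.9])  # -> 1.0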


def ragas_calculate_metrics(
    dataset: Dataset,
    pred_context_relevance_field: Optional[str],
    pred_faithfulness_field: Optional[str],
    metrics_to_evaluate: Optional[List[str]] = None,
    ground_truth_context_relevance_field: str = "relevance_score",
    ground_truth_faithfulness_field: str = "adherence_score",
) -> Dict[str, Optional[float]]:
    r"""Calculate RAGAS evaluation metrics.

    Args:
        dataset (Dataset): The dataset containing predictions and ground
            truth.
        pred_context_relevance_field (Optional[str]): Field name for
            predicted context relevance.
        pred_faithfulness_field (Optional[str]): Field name for predicted
            faithfulness.
        metrics_to_evaluate (Optional[List[str]]): List of metrics to
            evaluate. Defaults to ["context_relevancy", "faithfulness"].
        ground_truth_context_relevance_field (str): Field name for ground
            truth relevance.
        ground_truth_faithfulness_field (str): Field name for ground truth
            adherence.

    Returns:
        Dict[str, Optional[float]]: Dictionary of calculated metrics.
    """
    metrics_to_evaluate = metrics_to_evaluate or [
        "context_relevancy",
        "faithfulness",
    ]
    calculated_metrics: Dict[str, Optional[float]] = {}

    if (
        "context_relevancy" in metrics_to_evaluate
        and pred_context_relevance_field
    ):
        trues_relevance = dataset[ground_truth_context_relevance_field]
        preds_relevance = dataset[pred_context_relevance_field]
        calculated_metrics["relevance_rmse"] = rmse(
            trues_relevance, preds_relevance
        )

    if "faithfulness" in metrics_to_evaluate and pred_faithfulness_field:
        # Invert both sides so that AUROC measures hallucination detection:
        # a positive is a non-adherent (hallucinated) answer, and the score
        # is one minus the predicted faithfulness.
        trues_hallucination = ~np.array(
            dataset[ground_truth_faithfulness_field]
        )
        preds_hallucination = 1 - np.array(
            dataset[pred_faithfulness_field], dtype=float
        )
        calculated_metrics["hallucination_auroc"] = auroc(
            trues_hallucination.tolist(), preds_hallucination.tolist()
        )

    return calculated_metrics
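
# Illustrative sketch (values made up): with a two-row dataset, the
# faithfulness branch inverts adherence into a hallucination label and
# scores 1 - faithfulness against it.
#
#   ds = Dataset.from_dict({
#       "relevance_score": [1.0, 0.0],
#       "adherence_score": [True, False],
#       "context_relevancy": [0.9, 0.2],
#       "faithfulness": [0.95, 0.1],
#   })
#   ragas_calculate_metrics(ds, "context_relevancy", "faithfulness")
#   # -> {"relevance_rmse": ~0.158, "hallucination_auroc": 1.0}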


def ragas_evaluate_dataset(
    dataset: Dataset,
    contexts_field_name: Optional[str],
    answer_field_name: Optional[str],
    metrics_to_evaluate: Optional[List[str]] = None,
) -> Dataset:
    r"""Evaluate the dataset using RAGAS metrics.

    Args:
        dataset (Dataset): Input dataset to evaluate.
        contexts_field_name (Optional[str]): Field name containing contexts.
        answer_field_name (Optional[str]): Field name containing answers.
        metrics_to_evaluate (Optional[List[str]]): List of metrics to
            evaluate. Defaults to ["context_relevancy", "faithfulness"].

    Returns:
        Dataset: Dataset with added evaluation metrics.
    """
    from ragas import evaluate
    from ragas.metrics import (  # type: ignore[import-untyped]
        context_relevancy,
        faithfulness,
    )

    metrics_to_evaluate = metrics_to_evaluate or [
        "context_relevancy",
        "faithfulness",
    ]

    # Rename fields to the column names RAGAS expects, if necessary
    if (
        contexts_field_name
        and contexts_field_name != RagasFields.INPUT_CONTEXT
    ):
        dataset = dataset.rename_column(
            contexts_field_name, RagasFields.INPUT_CONTEXT
        )
    if answer_field_name and answer_field_name != RagasFields.INPUT_ANSWER:
        dataset = dataset.rename_column(
            answer_field_name, RagasFields.INPUT_ANSWER
        )

    metrics = []
    if "context_relevancy" in metrics_to_evaluate:
        metrics.append(context_relevancy)
    if "faithfulness" in metrics_to_evaluate:
        metrics.append(faithfulness)

    ragas_result = evaluate(dataset, metrics=metrics)
    return Dataset.from_pandas(ragas_result.to_pandas())
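
# Illustrative sketch (column names made up): fields that do not already
# match the "contexts"/"answer" names RAGAS expects are renamed first. Note
# that running this calls out to the LLM backend RAGAS is configured with.
#
#   ds = Dataset.from_dict({
#       "question": ["What is RAG?"],
#       "retrieved": [["RAG pairs retrieval with generation."]],
#       "response": ["It grounds generation in retrieved context."],
#   })
#   scored = ragas_evaluate_dataset(ds, "retrieved", "response")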


class RAGBenchBenchmark(BaseBenchmark):
    r"""RAGBench Benchmark for evaluating RAG performance.

    This benchmark uses the rungalileo/ragbench dataset to evaluate
    retrieval-augmented generation (RAG) systems. It measures context
    relevancy and faithfulness metrics as described in
    https://arxiv.org/abs/2407.11005.

    Args:
        processes (int, optional): Number of processes for parallel
            processing.
        subset (str, optional): Dataset subset to use (e.g., "hotpotqa").
        split (str, optional): Dataset split to use (e.g., "test").
    """

    def __init__(
        self,
        processes: int = 1,
        subset: Literal[
            "covidqa",
            "cuad",
            "delucionqa",
            "emanual",
            "expertqa",
            "finqa",
            "hagrid",
            "hotpotqa",
            "msmarco",
            "pubmedqa",
            "tatqa",
            "techqa",
        ] = "hotpotqa",
        split: Literal["train", "test", "validation"] = "test",
    ) -> None:
        super().__init__("ragbench", "rag_bench", "", processes)
        self.subset = subset
        self.split = split
        self.dataset: Optional[Dataset] = None

    def download(self):
        r"""Download the RAGBench dataset."""
        try:
            self.dataset = load_dataset(
                "rungalileo/ragbench", self.subset, split=self.split
            )
        except Exception as e:
            logger.error(f"Failed to download dataset: {e}")
            raise

    def load(self, force_download: bool = False):
        r"""Load the RAGBench dataset.

        Args:
            force_download (bool, optional): Whether to force download the
                data.
        """
        if force_download or self.dataset is None:
            logger.info(
                "%s dataset",
                "Force downloading" if force_download else "Loading",
            )
            self.download()

    def run(  # type: ignore[override, return]
        self,
        agent: ChatAgent,
        auto_retriever: AutoRetriever,
    ) -> Dict[str, Optional[float]]:
        r"""Run the benchmark evaluation.

        Args:
            agent (ChatAgent): Chat agent for generating answers.
            auto_retriever (AutoRetriever): Retriever for finding relevant
                contexts.

        Returns:
            Dict[str, Optional[float]]: Dictionary of evaluation metrics.
        """

        def context_call(example: Dict[str, Any]) -> List[str]:
            # Retrieve the single most relevant chunk from the example's
            # source documents.
            retrieved_info = auto_retriever.run_vector_retriever(
                query=example["question"],
                contents=example["documents"],
                top_k=1,
                return_detailed_info=True,
                similarity_threshold=0.5,
            )
            return [c["text"] for c in retrieved_info["Retrieved Context"]]

        def answer_call(example: Dict[str, Any]) -> str:
            # Pass the whole example (question plus retrieved contexts) to
            # the agent and return its answer.
            user_msg = str(example)
            assistant_response = agent.step(user_msg)
            return assistant_response.msg.content

        # Make sure the dataset has been loaded before annotating it
        if self.dataset is None:
            self.load()

        # Annotate the dataset with retrieved contexts and generated answers
        annotated_ds = annotate_dataset(
            self.dataset, context_call, answer_call
        )

        # Score the annotated dataset with RAGAS
        evaluated_ds = ragas_evaluate_dataset(
            annotated_ds,
            contexts_field_name="contexts",
            answer_field_name="answer",
            metrics_to_evaluate=["context_relevancy", "faithfulness"],
        )

        # Compare RAGAS predictions against the RAGBench ground truth
        return ragas_calculate_metrics(
            evaluated_ds,
            pred_context_relevance_field="context_relevancy",
            pred_faithfulness_field="faithfulness",
        )
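

if __name__ == "__main__":
    # Hedged usage sketch, not part of the original module: the ChatAgent
    # and AutoRetriever constructor arguments below are assumptions and may
    # need adjusting to your CAMEL version, model backend, and vector store.
    benchmark = RAGBenchBenchmark(subset="hotpotqa", split="test")
    benchmark.load()
    agent = ChatAgent("You are a helpful assistant.")  # assumed default model
    retriever = AutoRetriever()  # assumed default embedding and storage
    metrics = benchmark.run(agent, retriever)
    logger.info("RAGBench metrics: %s", metrics)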