/v1\" #\n",
+ "byom_model_id = \"meta/llama-3.1-8b-instruct\"\n",
+ "\n",
+ "byom_model_alias = \"review-generator\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c8ee67f3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Define both the models you want to use for data generation\n",
+ "model_configs_yaml = f\"\"\"\\\n",
+ "model_configs:\n",
+ " - alias: \"{model_alias}\"\n",
+ " inference_parameters:\n",
+ " max_tokens: 1024\n",
+ " temperature: 0.5\n",
+ " top_p: 1.0\n",
+ " model:\n",
+ " api_endpoint:\n",
+ " api_key: \"{api_key}\"\n",
+ " model_id: \"{model_id}\"\n",
+ " url: \"{endpoint}\"\n",
+ " - alias: \"{byom_model_alias}\"\n",
+ " inference_parameters:\n",
+ " max_tokens: 1024\n",
+ " temperature: 0.5\n",
+ " top_p: 1.0\n",
+ " model:\n",
+ " api_endpoint:\n",
+ " model_id: \"{byom_model_id}\"\n",
+ " url: \"{byom_endpoint}\"\n",
+ "\"\"\"\n",
+ "\n",
+ "config_builder = DataDesignerConfigBuilder(model_configs=model_configs_yaml)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e78f8070",
+ "metadata": {},
+ "source": [
+ "## 2. Configuration\n",
+ "\n",
+ "Let's define our source documents and the total number of evaluation pairs we want to generate. You can replace the document list with your own PDFs, web pages, or other text sources."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "fd6f9e64",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define source documents and total number of evaluation pairs to generate\n",
+ "# You can replace this with your own documents\n",
+ "DOCUMENT_LIST = [\"https://gretel-public-website.s3.us-west-2.amazonaws.com/datasets/rag_evals/databricks-state-of-data-ai-report.pdf\"]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e0c98449",
+ "metadata": {},
+ "source": [
+ "## 3. Document Processing\n",
+ "\n",
+ "Now we'll create a Document Processor class that handles loading and chunking the source documents. \n",
+ "\n",
+ "This class uses langchain's RecursiveCharacterTextSplitter and unstructured.io for robust document parsing."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bfec3608",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from typing import List, Union\n",
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+ "from unstructured.partition.auto import partition\n",
+ "from smart_open import open\n",
+ "import tempfile\n",
+ "import os\n",
+ "\n",
+ "class DocumentProcessor:\n",
+ " \"\"\"Handles loading and chunking source documents for RAG evaluation.\"\"\"\n",
+ " \n",
+ " def __init__(self, chunk_size: int = 4192, chunk_overlap: int = 200):\n",
+ " \"\"\"Initialize with configurable chunk size and overlap.\"\"\"\n",
+ " self.text_splitter = RecursiveCharacterTextSplitter(\n",
+ " chunk_size=chunk_size,\n",
+ " chunk_overlap=chunk_overlap,\n",
+ " length_function=len,\n",
+ " )\n",
+ "\n",
+ " def parse_document(self, uri: str) -> str:\n",
+ " \"\"\"Parse a single document from URI into raw text.\"\"\"\n",
+ " with open(uri, 'rb') as file:\n",
+ " content = file.read()\n",
+ " with tempfile.NamedTemporaryFile(delete=False) as temp_file:\n",
+ " temp_file.write(content)\n",
+ " temp_file.flush()\n",
+ " elements = partition(temp_file.name)\n",
+ "\n",
+ " os.unlink(temp_file.name)\n",
+ " return \"\\n\\n\".join([str(element) for element in elements])\n",
+ "\n",
+ " def process_documents(self, uris: Union[str, List[str]]) -> List[str]:\n",
+ " \"\"\"Process one or more documents into chunks for RAG evaluation.\"\"\"\n",
+ " if isinstance(uris, str):\n",
+ " uris = [uris]\n",
+ "\n",
+ " all_chunks = []\n",
+ " for uri in uris:\n",
+ " text = self.parse_document(uri)\n",
+ " chunks = self.text_splitter.split_text(text)\n",
+ " all_chunks.extend(chunks)\n",
+ "\n",
+ " return all_chunks"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7c44785c",
+ "metadata": {},
+ "source": [
+ "## 4. Data Models\n",
+ "\n",
+ "Let's define Pydantic models for structured output generation. These schemas will ensure our generated data has consistent structure and validation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "9cab035f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pydantic import BaseModel, Field\n",
+ "\n",
+ "class QAPair(BaseModel):\n",
+ " question: str = Field(\n",
+ " ..., description=\"A specific question related to the domain of the context\"\n",
+ " )\n",
+ " answer: str = Field(\n",
+ " ..., description=\"Either a context-supported answer or explanation of why the question cannot be answered\"\n",
+ " )\n",
+ " reasoning: str = Field(\n",
+ " ..., description=\"A clear and traceable explanation of the reasoning behind the answer\"\n",
+ " )"
+ ]
+ },
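+ {
+ "cell_type": "markdown",
+ "id": "qa-pair-schema-check-md",
+ "metadata": {},
+ "source": [
+ "As a quick sanity check, you can validate a hand-written record against the `QAPair` schema. This is a minimal sketch using only Pydantic; the example record below is illustrative and not taken from the generated dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "qa-pair-schema-check",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pydantic import ValidationError\n",
+ "\n",
+ "# A well-formed record passes validation.\n",
+ "example = QAPair(\n",
+ "    question=\"What does the report say about vector database adoption?\",\n",
+ "    answer=\"Customer usage of vector databases grew 377% year over year.\",\n",
+ "    reasoning=\"The context states 377% YoY growth in the number of customers using vector databases.\",\n",
+ ")\n",
+ "print(example)\n",
+ "\n",
+ "# A record that is missing required fields is rejected.\n",
+ "try:\n",
+ "    QAPair(question=\"Only a question, with no answer or reasoning\")\n",
+ "except ValidationError as err:\n",
+ "    print(f\"Validation failed as expected with {len(err.errors())} error(s)\")"
+ ]
+ },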
+ {
+ "cell_type": "markdown",
+ "id": "ada29f90",
+ "metadata": {},
+ "source": [
+ "## 5. Processing Documents and Setting Up Data Designer\n",
+ "\n",
+ "Now we'll process our document chunks and set up the Data Designer with our seed dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5325b303",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Process document chunks\n",
+ "processor = DocumentProcessor(chunk_size=4192, chunk_overlap=10)\n",
+ "chunks = processor.process_documents(DOCUMENT_LIST)\n",
+ "\n",
+ "# Create a seed DataFrame with the document chunks\n",
+ "seed_df = pd.DataFrame({\"context\": chunks})\n",
+ "\n",
+ "# Save to CSV\n",
+ "seed_df.to_csv(\"document_chunks.csv\", index=False)\n",
+ "print(\"Seed dataset \", seed_df.head())\n",
+ "print(\"Saved to document_chunks.csv\")"
+ ]
+ },
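+ {
+ "cell_type": "markdown",
+ "id": "chunk-length-check-md",
+ "metadata": {},
+ "source": [
+ "Optionally, take a quick look at the chunk count and length distribution to confirm that the chunk size and overlap settings are reasonable for your documents. This only inspects the seed DataFrame created above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "chunk-length-check",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Inspect how many chunks were produced and how long they are (in characters).\n",
+ "chunk_lengths = seed_df[\"context\"].str.len()\n",
+ "print(f\"Number of chunks: {len(seed_df)}\")\n",
+ "print(\n",
+ "    f\"Chunk length (chars): min={chunk_lengths.min()}, \"\n",
+ "    f\"median={int(chunk_lengths.median())}, max={chunk_lengths.max()}\"\n",
+ ")"
+ ]
+ },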
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "ab274279",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ "DataDesignerConfigBuilder(\n",
+ " seed_dataset: 'into-tutorials/seeding-with-a-dataset/document_chunks.csv'\n",
+ " seed_columns: ['context']\n",
+ ")\n",
+ "
\n",
+ "\n"
+ ],
+ "text/plain": [
+ "DataDesignerConfigBuilder(\n",
+ " seed_dataset: 'into-tutorials/seeding-with-a-dataset/document_chunks.csv'\n",
+ " seed_columns: ['context']\n",
+ ")"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "config_builder.with_seed_dataset(\n",
+ " repo_id=\"into-tutorials/seeding-with-a-dataset\",\n",
+ " filename=\"document_chunks.csv\",\n",
+ " dataset_path=\"/home/slikhite/Desktop/NVAIE/gretel-SDG/document_chunks.csv\",\n",
+ " sampling_strategy=\"shuffle\",\n",
+ " with_replacement=True,\n",
+ " datastore={\"endpoint\": \"http://localhost:3000/v1/hf\"},\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "280e2fec",
+ "metadata": {},
+ "source": [
+ "## 6. Adding Categorical Columns for Controlled Diversity\n",
+ "\n",
+ "Now we'll add categorical columns to control the diversity of our RAG evaluation pairs. We'll define:\n",
+ "\n",
+ "1. **Difficulty levels**: easy, medium, hard\n",
+ "\n",
+ "2. **Reasoning types**: factual recall, inferential reasoning, etc.\n",
+ "\n",
+ "3. **Question types**: answerable vs. unanswerable (with weighting)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "e3e27cac",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[12:16:26] [INFO] β
Validation passed\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ "DataDesignerConfigBuilder(\n",
+ " seed_dataset: 'into-tutorials/seeding-with-a-dataset/document_chunks.csv'\n",
+ " seed_columns: ['context']\n",
+ " sampler_columns: ['difficulty', 'reasoning_type', 'question_type']\n",
+ ")\n",
+ "
\n",
+ "\n"
+ ],
+ "text/plain": [
+ "DataDesignerConfigBuilder(\n",
+ " seed_dataset: 'into-tutorials/seeding-with-a-dataset/document_chunks.csv'\n",
+ " seed_columns: ['context']\n",
+ " sampler_columns: ['difficulty', 'reasoning_type', 'question_type']\n",
+ ")"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "config_builder.add_column(\n",
+ " C.SamplerColumn(\n",
+ " name=\"difficulty\",\n",
+ " type=P.SamplerType.CATEGORY,\n",
+ " params=P.CategorySamplerParams(\n",
+ " values=[\"easy\",\"medium\", \"hard\"],\n",
+ " description=\"The difficulty level of the question\"\n",
+ " )\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "config_builder.add_column(\n",
+ " C.SamplerColumn(\n",
+ " name=\"reasoning_type\",\n",
+ " type=P.SamplerType.CATEGORY,\n",
+ " params=P.CategorySamplerParams(\n",
+ " values=[\n",
+ " \"factual recall\",\n",
+ " \"inferential reasoning\",\n",
+ " \"comparative analysis\",\n",
+ " \"procedural understanding\",\n",
+ " \"cause and effect\"\n",
+ " ],\n",
+ " description=\"The type of reasoning required to answer the question\"\n",
+ " )\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "config_builder.add_column(\n",
+ " C.SamplerColumn(\n",
+ " name=\"question_type\",\n",
+ " type=P.SamplerType.CATEGORY,\n",
+ " params=P.CategorySamplerParams(\n",
+ " values=[\"answerable\", \"unanswerable\"],\n",
+ " # 10:1 ratio of answerable to unanswerable questions.\n",
+ " weights=[10, 1], \n",
+ " )\n",
+ " )\n",
+ ").validate()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "735cbbea",
+ "metadata": {},
+ "source": [
+ "## 7. Adding LLM-Structured Column for Q&A Pair Generation\n",
+ "\n",
+ "Now let's set up the core of our data generation: the Q&A pair column, which will produce structured questionβanswer pairs based on our document context and control parameters. The columns in the seed dataβ `context` in our caseβcan be used in the prompt for data generation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "ecf44d9e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[12:20:10] [INFO] β
Validation passed\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ "DataDesignerConfigBuilder(\n",
+ " seed_dataset: 'into-tutorials/seeding-with-a-dataset/document_chunks.csv'\n",
+ " seed_columns: ['context']\n",
+ " sampler_columns: ['difficulty', 'reasoning_type', 'question_type']\n",
+ " llm_structured_columns: ['qa_pair']\n",
+ " llm_judge_columns: ['eval_metrics']\n",
+ ")\n",
+ "
\n",
+ "\n"
+ ],
+ "text/plain": [
+ "DataDesignerConfigBuilder(\n",
+ " seed_dataset: 'into-tutorials/seeding-with-a-dataset/document_chunks.csv'\n",
+ " seed_columns: ['context']\n",
+ " sampler_columns: ['difficulty', 'reasoning_type', 'question_type']\n",
+ " llm_structured_columns: ['qa_pair']\n",
+ " llm_judge_columns: ['eval_metrics']\n",
+ ")"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Add Q&A pair generation column\n",
+ "config_builder.add_column(\n",
+ " C.LLMStructuredColumn(\n",
+ " name=\"qa_pair\",\n",
+ " system_prompt=( \n",
+ " \"You are an expert at generating high-quality RAG evaluation pairs. \"\n",
+ " \"You are very careful in assessing whether the question can be answered from the provided context. \"\n",
+ " ),\n",
+ " prompt=\"\"\"\\\n",
+ "{{context}}\n",
+ "\n",
+ "Generate a {{difficulty}} {{reasoning_type}} question-answer pair.\n",
+ "The question should be {{question_type}} using the provided context.\n",
+ "\n",
+ "For answerable questions:\n",
+ "- Ensure the answer is fully supported by the context\n",
+ "\n",
+ "For unanswerable questions:\n",
+ "- Keep the question topically relevant\n",
+ "- Make it clearly beyond the context's scope\n",
+ "\"\"\",\n",
+ " output_format=QAPair,\n",
+ " model_alias=byom_model_alias,\n",
+ " )\n",
+ ").validate()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "41e6cc02",
+ "metadata": {},
+ "source": [
+ "## 8. Adding Evaluation Metrics with Custom Rubrics\n",
+ "\n",
+ "To assess the quality of our generated Q&A pairs, we'll add evaluation metrics using detailed rubrics for scoring. \n",
+ "\n",
+ "We use Data Designer's `LLMJudgeColumn` for this, defining a set of custom Rubrics designed for our task."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "953bca63",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[12:20:14] [INFO] β
Validation passed\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ "DataDesignerConfigBuilder(\n",
+ " seed_dataset: 'into-tutorials/seeding-with-a-dataset/document_chunks.csv'\n",
+ " seed_columns: ['context']\n",
+ " sampler_columns: ['difficulty', 'reasoning_type', 'question_type']\n",
+ " llm_structured_columns: ['qa_pair']\n",
+ " llm_judge_columns: ['eval_metrics']\n",
+ ")\n",
+ "
\n",
+ "\n"
+ ],
+ "text/plain": [
+ "DataDesignerConfigBuilder(\n",
+ " seed_dataset: 'into-tutorials/seeding-with-a-dataset/document_chunks.csv'\n",
+ " seed_columns: ['context']\n",
+ " sampler_columns: ['difficulty', 'reasoning_type', 'question_type']\n",
+ " llm_structured_columns: ['qa_pair']\n",
+ " llm_judge_columns: ['eval_metrics']\n",
+ ")"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from nemo_microservices.beta.data_designer.config import params as P\\\n",
+ "\n",
+ "context_relevance_rubric = P.Rubric(\n",
+ " name=\"Context Relevance\",\n",
+ " description=\"Evaluates how relevant the answer is to the provided context\",\n",
+ " scoring={\n",
+ " \"5\": \"Perfect relevance to context with no extraneous information\",\n",
+ " \"4\": \"Highly relevant with minor deviations from context\",\n",
+ " \"3\": \"Moderately relevant but includes some unrelated information\",\n",
+ " \"2\": \"Minimally relevant with significant departure from context\",\n",
+ " \"1\": \"Almost entirely irrelevant to the provided context\"\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "answer_precision_rubric = P.Rubric(\n",
+ " name=\"Answer Precision\",\n",
+ " description=\"Evaluates the accuracy and specificity of the answer\",\n",
+ " scoring={\n",
+ " \"5\": \"Extremely precise with exact, specific information\",\n",
+ " \"4\": \"Very precise with minor imprecisions\",\n",
+ " \"3\": \"Adequately precise but could be more specific\",\n",
+ " \"2\": \"Imprecise with vague or ambiguous information\",\n",
+ " \"1\": \"Completely imprecise or inaccurate\"\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "answer_completeness_rubric = P.Rubric(\n",
+ " name=\"Answer Completeness\",\n",
+ " description=\"Evaluates how thoroughly the answer addresses all aspects of the question\",\n",
+ " scoring={\n",
+ " \"5\": \"Fully complete, addressing all aspects of the question\",\n",
+ " \"4\": \"Mostly complete with minor omissions\",\n",
+ " \"3\": \"Adequately complete but missing some details\",\n",
+ " \"2\": \"Substantially incomplete, missing important aspects\",\n",
+ " \"1\": \"Severely incomplete, barely addresses the question\"\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "hallucination_avoidance_rubric = P.Rubric(\n",
+ " name=\"Hallucination Avoidance\",\n",
+ " description=\"Evaluates the absence of made-up or incorrect information\",\n",
+ " scoring={\n",
+ " \"5\": \"No hallucinations, all information is factual and verifiable\",\n",
+ " \"4\": \"Minimal hallucinations that don't impact the core answer\",\n",
+ " \"3\": \"Some hallucinations that partially affect the answer quality\",\n",
+ " \"2\": \"Significant hallucinations that undermine the answer\",\n",
+ " \"1\": \"Severe hallucinations making the answer entirely unreliable\"\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "EVAL_METRICS_PROMPT_TEMPLATE = \"\"\"\\\n",
+ "You are an expert evaluator of question-answer pairs. Analyze the following Q&A pair and evaluate it objectively.\n",
+ "\n",
+ "For this {{difficulty}} {{reasoning_type}} Q&A pair:\n",
+ "{{qa_pair}}\n",
+ "\n",
+ "Take a deep breath and carefully evaluate each criterion based on the provided rubrics, considering the difficulty level and reasoning type indicated.\n",
+ "\"\"\"\n",
+ "\n",
+ "#use a different model for evaluation\n",
+ "config_builder.add_column(\n",
+ " C.LLMJudgeColumn(\n",
+ " name=\"eval_metrics\",\n",
+ " prompt=EVAL_METRICS_PROMPT_TEMPLATE,\n",
+ " rubrics=[context_relevance_rubric, answer_precision_rubric, answer_completeness_rubric, hallucination_avoidance_rubric],\n",
+ " model_alias=model_alias\n",
+ " )\n",
+ ").validate()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8fb3dc84",
+ "metadata": {},
+ "source": [
+ "## 9. Preview Sample Records\n",
+ "\n",
+ "Let's generate a preview to see what our data will look like before running the full generation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b55913d0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "preview = ndd.preview(config_builder, verbose_logging=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "b655a45f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " context | \n",
+ " difficulty | \n",
+ " reasoning_type | \n",
+ " question_type | \n",
+ " qa_pair | \n",
+ " judged_by_llm | \n",
+ " eval_metrics | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " shows that many data teams are choosing to bui... | \n",
+ " hard | \n",
+ " factual recall | \n",
+ " answerable | \n",
+ " {\"question\": \"What is the name of the two smal... | \n",
+ " True | \n",
+ " {'Context Relevance': {'reasoning': 'The answe... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Sciences experiences significant fluctuations ... | \n",
+ " easy | \n",
+ " factual recall | \n",
+ " unanswerable | \n",
+ " {\"question\": \"What is the approximate number o... | \n",
+ " True | \n",
+ " {'Context Relevance': {'reasoning': 'The answe... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " process of experimental testing, trying out di... | \n",
+ " hard | \n",
+ " procedural understanding | \n",
+ " answerable | \n",
+ " {\"question\": \"What percentage of companies are... | \n",
+ " True | \n",
+ " {'Context Relevance': {'reasoning': 'The answe... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " shows that many data teams are choosing to bui... | \n",
+ " easy | \n",
+ " inferential reasoning | \n",
+ " answerable | \n",
+ " {\"question\": \"What percentage of open source L... | \n",
+ " True | \n",
+ " {'Context Relevance': {'reasoning': 'The answe... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " models and GenAI, John Snow Labs is instrument... | \n",
+ " easy | \n",
+ " comparative analysis | \n",
+ " answerable | \n",
+ " {\"question\": \"What is the difference between S... | \n",
+ " True | \n",
+ " {'Context Relevance': {'reasoning': 'The answe... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " context difficulty \\\n",
+ "0 shows that many data teams are choosing to bui... hard \n",
+ "1 Sciences experiences significant fluctuations ... easy \n",
+ "2 process of experimental testing, trying out di... hard \n",
+ "3 shows that many data teams are choosing to bui... easy \n",
+ "4 models and GenAI, John Snow Labs is instrument... easy \n",
+ "\n",
+ " reasoning_type question_type \\\n",
+ "0 factual recall answerable \n",
+ "1 factual recall unanswerable \n",
+ "2 procedural understanding answerable \n",
+ "3 inferential reasoning answerable \n",
+ "4 comparative analysis answerable \n",
+ "\n",
+ " qa_pair judged_by_llm \\\n",
+ "0 {\"question\": \"What is the name of the two smal... True \n",
+ "1 {\"question\": \"What is the approximate number o... True \n",
+ "2 {\"question\": \"What percentage of companies are... True \n",
+ "3 {\"question\": \"What percentage of open source L... True \n",
+ "4 {\"question\": \"What is the difference between S... True \n",
+ "\n",
+ " eval_metrics \n",
+ "0 {'Context Relevance': {'reasoning': 'The answe... \n",
+ "1 {'Context Relevance': {'reasoning': 'The answe... \n",
+ "2 {'Context Relevance': {'reasoning': 'The answe... \n",
+ "3 {'Context Relevance': {'reasoning': 'The answe... \n",
+ "4 {'Context Relevance': {'reasoning': 'The answe... "
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# The preview dataset is available as a pandas DataFrame.\n",
+ "preview.dataset.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "d4364c13",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ " \n",
+ " Seed Columns \n",
+ "βββββββββββ³ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n",
+ "β Name β Value β\n",
+ "β‘ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ©\n",
+ "β context β shows that many data teams are choosing to build vs. buy. Companies increasingly invest β\n",
+ "β β β\n",
+ "β β in LLM tools, such as LangChain, to work with and build proprietary LLMs. Transformer- β\n",
+ "β β β\n",
+ "β β related libraries like Hugging Face are used to train LLMs, and still claim the highest β\n",
+ "β β β\n",
+ "β β adoption by number of customers. Use of these libraries grew 36% YoY. Together, these β\n",
+ "β β β\n",
+ "β β trend lines indicate a more sophisticated adoption of open source LLMs. β\n",
+ "β β β\n",
+ "β β 377% YoY growth in the number of customers β\n",
+ "β β β\n",
+ "β β using vector databases β\n",
+ "β β β\n",
+ "β β 23 β\n",
+ "β β β\n",
+ "β β Companies prefer smaller open source models β\n",
+ "β β β\n",
+ "β β USE OF OPEN SOURCE LLMs β\n",
+ "β β β\n",
+ "β β Figure 7: Relative adoption of Mistral and Meta Llama open source models in Databricksβ foundation β\n",
+ "β β model APIs. β\n",
+ "β β β\n",
+ "β β NOTE: Chart extended to May 19, 2024, to accommodate the Meta Llama 3 launch. β\n",
+ "β β β\n",
+ "β β STATE OF DATA + AI β\n",
+ "β β β\n",
+ "β β 24 24 β\n",
+ "β β β\n",
+ "β β One of the biggest benefits of open source LLMs is the ability to customize them for β\n",
+ "β β β\n",
+ "β β specific use cases β especially in enterprise settings. We often hear the question: Whatβs β\n",
+ "β β β\n",
+ "β β 76% β\n",
+ "β β β\n",
+ "β β the most popular open source model? In practice, customers often try many models and β\n",
+ "β β β\n",
+ "β β model families. We analyzed the open source model usage of Meta Llama and Mistral, the β\n",
+ "β β β\n",
+ "β β two biggest players. Our data shows that the open LLM space is fluid, with new state-of- β\n",
+ "β β β\n",
+ "β β of companies that use LLMs are choosing open source models, often alongside proprietary models. β\n",
+ "β β β\n",
+ "β β the-art models getting rapid adoption. β\n",
+ "β β β\n",
+ "β β With each model, there is a trade-off between cost, latency and performance. Together, β\n",
+ "β β β\n",
+ "β β 70% β\n",
+ "β β β\n",
+ "β β usage of the two smallest Meta Llama 2 models (7B and 13B) is significantly higher than the β\n",
+ "β β β\n",
+ "β β largest, Meta Llama 2 70B. Across Meta Llama 2, Llama 3 and Mistral users, 77% choose β\n",
+ "β β β\n",
+ "β β models with 13B parameters or fewer. This suggests that companies care significantly β\n",
+ "β β β\n",
+ "β β of companies that leverage GenAI are using tools, retrieval and vector databases to customize models. β\n",
+ "β β β\n",
+ "β β about cost and latency. β\n",
+ "β β β\n",
+ "β β COMPANIES ARE QUICK TO TRY NEW MODELS β\n",
+ "β β β\n",
+ "β β Meta Llama 3 launched on April 18, 2024. Within its first week, organizations already started β\n",
+ "β β β\n",
+ "β β leveraging it over other models and providers. Just 4 weeks after its launch, Llama 3 accounted for β\n",
+ "β β 39% of all open source LLM usage. β\n",
+ "β β β\n",
+ "β β Top GenAI Python Packages β\n",
+ "β β β\n",
+ "β β STATE OF DATA + AI β\n",
+ "β β β\n",
+ "β β 25 25 β\n",
+ "β β β\n",
+ "β β Generative AI β\n",
+ "β β β\n",
+ "β β Highly regulated industries are early adopters β\n",
+ "β β β\n",
+ "β β Highly regulated industries have the reputation of being risk averse β\n",
+ "β β β\n",
+ "β β and hesitant to adopt new technologies. There are multiple reasons, β\n",
+ "β β β\n",
+ "β β including strict compliance requirements, ingrained legacy systems β\n",
+ "β β β\n",
+ "β β that are costly to replace and the need for regulatory approval before β\n",
+ "β β β\n",
+ "β β implementation. β\n",
+ "β β β\n",
+ "β β While all industries are embracing new AI innovations, two highly β\n",
+ "β β β\n",
+ "β β regulated industries β Financial Services and Healthcare & Life β\n",
+ "β β β\n",
+ "β β Sciences β are keeping pace with, and often surpassing, their less- β\n",
+ "β β β\n",
+ "β β regulated counterparts. β\n",
+ "β β β\n",
+ "β β In December 2023, Databricks released foundation model APIs, β\n",
+ "β β β\n",
+ "β β providing instant access to popular open source LLMs, such as Meta β\n",
+ "β β β\n",
+ "β β Llama and MPT models. We expect the interest in open source to grow β\n",
+ "β β β\n",
+ "β β significantly as models continue to rapidly improve, as shown by the β\n",
+ "β β β\n",
+ "β β recent launches of Llama 3. β\n",
+ "β β β\n",
+ "β β STATE OF DATA + AI β\n",
+ "β β β\n",
+ "β β 26 β\n",
+ "β β β\n",
+ "β β HARNESSING OPEN LLMs FOR INDUSTRY-SPECIFIC NEEDS β\n",
+ "β β β\n",
+ "β β Manufacturing & Automotive and Healthcare & Life Sciences take the lead in adopting foundation model β\n",
+ "β β APIs with the highest average usage per customer. In manufacturing, β\n",
+ "β β β\n",
+ "β β supply chain optimization, quality control and efficiency are deemed the most promising β\n",
+ "β β β\n",
+ "β β use cases. β\n",
+ "β β β\n",
+ "β β A recent report from MIT Tech Review Insights shares that, among those surveyed, CIOs β\n",
+ "β β β\n",
+ "β β in Healthcare & Life Sciences believe GenAI will bring value to their organizations. Open β\n",
+ "β β β\n",
+ "β β source LLMs enable highly regulated industries like Healthcare & Life Sciences to integrate β\n",
+ "β β β\n",
+ "β β GenAI while maintaining the utmost control of their data. β\n",
+ "β β β\n",
+ "β β Figure 8: Manufacturing & Automotive and Healthcare & Life Sciences lead the adoption of foundation β\n",
+ "β β model APIs with the highest average usage per customer. β\n",
+ "β β β\n",
+ "β β NOTE: Date Range: January 2024 to March 2024. β\n",
+ "β β β\n",
+ "β β 27 β\n",
+ "β β β\n",
+ "β β CPUs vs. GPUs: Financial Servicesβ commitment to LLMs grows 88% in 6 months β\n",
+ "β β β\n",
+ "β β CPUs are general-purpose processors designed to handle a wide range of tasks quickly, β\n",
+ "βββββββββββ΄ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n",
+ " \n",
+ " \n",
+ " Generated Columns \n",
+ "ββββββββββββββββββ³βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n",
+ "β Name β Value β\n",
+ "β‘ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ©\n",
+ "β difficulty β hard β\n",
+ "ββββββββββββββββββΌβββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β reasoning_type β factual recall β\n",
+ "ββββββββββββββββββΌβββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β question_type β answerable β\n",
+ "ββββββββββββββββββΌβββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β qa_pair β { β\n",
+ "β β 'question': 'What is the name of the two smallest Meta Llama 2 models mentioned in the β\n",
+ "β β context?', β\n",
+ "β β 'answer': '7B and 13B', β\n",
+ "β β 'reasoning': 'According to the context, the two smallest Meta Llama 2 models are mentioned β\n",
+ "β β as having a significant higher usage compared to the largest model, Meta Llama 2 70B.' β\n",
+ "β β } β\n",
+ "ββββββββββββββββββ΄βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n",
+ " \n",
+ " \n",
+ " LLM-as-a-Judge: eval_metrics \n",
+ "ββββββββββββββββββββββββββββββ³ββββββββββββββββββββββββββββ³βββββββββββββββββββββββββββββ³ββββββββββββββββββββββββββββ\n",
+ "β Context Relevance β Answer Precision β Hallucination Avoidance β Answer Completeness β\n",
+ "β‘ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ©\n",
+ "β score: 5 β score: 5 β score: 5 β score: 5 β\n",
+ "β reasoning: The answer β reasoning: The answer β reasoning: The answer does β reasoning: The answer β\n",
+ "β directly references the β precisely names the two β not contain any made-up or β fully addresses the β\n",
+ "β context provided in the β smallest models, '7B' and β incorrect information; it β question by providing the β\n",
+ "β question, which mentions β '13B', which are the β accurately reflects the β names of the two smallest β\n",
+ "β the two smallest Meta β correct models mentioned β details given in the β models without any β\n",
+ "β Llama 2 models. β in the context. β context. β omissions. β\n",
+ "ββββββββββββββββββββββββββββββ΄ββββββββββββββββββββββββββββ΄βββββββββββββββββββββββββββββ΄ββββββββββββββββββββββββββββ\n",
+ " \n",
+ " [index: 0] \n",
+ "
\n"
+ ],
+ "text/plain": [
+ " \n",
+ "\u001b[3m Seed Columns \u001b[0m\n",
+ "βββββββββββ³ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n",
+ "β\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0mβ\u001b[1m \u001b[0m\u001b[1mValue \u001b[0m\u001b[1m \u001b[0mβ\n",
+ "β‘ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ©\n",
+ "β context β shows that many data teams are choosing to build vs. buy. Companies increasingly invest β\n",
+ "β β β\n",
+ "β β in LLM tools, such as LangChain, to work with and build proprietary LLMs. Transformer- β\n",
+ "β β β\n",
+ "β β related libraries like Hugging Face are used to train LLMs, and still claim the highest β\n",
+ "β β β\n",
+ "β β adoption by number of customers. Use of these libraries grew 36% YoY. Together, these β\n",
+ "β β β\n",
+ "β β trend lines indicate a more sophisticated adoption of open source LLMs. β\n",
+ "β β β\n",
+ "β β 377% YoY growth in the number of customers β\n",
+ "β β β\n",
+ "β β using vector databases β\n",
+ "β β β\n",
+ "β β 23 β\n",
+ "β β β\n",
+ "β β Companies prefer smaller open source models β\n",
+ "β β β\n",
+ "β β USE OF OPEN SOURCE LLMs β\n",
+ "β β β\n",
+ "β β Figure 7: Relative adoption of Mistral and Meta Llama open source models in Databricksβ foundation β\n",
+ "β β model APIs. β\n",
+ "β β β\n",
+ "β β NOTE: Chart extended to May 19, 2024, to accommodate the Meta Llama 3 launch. β\n",
+ "β β β\n",
+ "β β STATE OF DATA + AI β\n",
+ "β β β\n",
+ "β β 24 24 β\n",
+ "β β β\n",
+ "β β One of the biggest benefits of open source LLMs is the ability to customize them for β\n",
+ "β β β\n",
+ "β β specific use cases β especially in enterprise settings. We often hear the question: Whatβs β\n",
+ "β β β\n",
+ "β β 76% β\n",
+ "β β β\n",
+ "β β the most popular open source model? In practice, customers often try many models and β\n",
+ "β β β\n",
+ "β β model families. We analyzed the open source model usage of Meta Llama and Mistral, the β\n",
+ "β β β\n",
+ "β β two biggest players. Our data shows that the open LLM space is fluid, with new state-of- β\n",
+ "β β β\n",
+ "β β of companies that use LLMs are choosing open source models, often alongside proprietary models. β\n",
+ "β β β\n",
+ "β β the-art models getting rapid adoption. β\n",
+ "β β β\n",
+ "β β With each model, there is a trade-off between cost, latency and performance. Together, β\n",
+ "β β β\n",
+ "β β 70% β\n",
+ "β β β\n",
+ "β β usage of the two smallest Meta Llama 2 models (7B and 13B) is significantly higher than the β\n",
+ "β β β\n",
+ "β β largest, Meta Llama 2 70B. Across Meta Llama 2, Llama 3 and Mistral users, 77% choose β\n",
+ "β β β\n",
+ "β β models with 13B parameters or fewer. This suggests that companies care significantly β\n",
+ "β β β\n",
+ "β β of companies that leverage GenAI are using tools, retrieval and vector databases to customize models. β\n",
+ "β β β\n",
+ "β β about cost and latency. β\n",
+ "β β β\n",
+ "β β COMPANIES ARE QUICK TO TRY NEW MODELS β\n",
+ "β β β\n",
+ "β β Meta Llama 3 launched on April 18, 2024. Within its first week, organizations already started β\n",
+ "β β β\n",
+ "β β leveraging it over other models and providers. Just 4 weeks after its launch, Llama 3 accounted for β\n",
+ "β β 39% of all open source LLM usage. β\n",
+ "β β β\n",
+ "β β Top GenAI Python Packages β\n",
+ "β β β\n",
+ "β β STATE OF DATA + AI β\n",
+ "β β β\n",
+ "β β 25 25 β\n",
+ "β β β\n",
+ "β β Generative AI β\n",
+ "β β β\n",
+ "β β Highly regulated industries are early adopters β\n",
+ "β β β\n",
+ "β β Highly regulated industries have the reputation of being risk averse β\n",
+ "β β β\n",
+ "β β and hesitant to adopt new technologies. There are multiple reasons, β\n",
+ "β β β\n",
+ "β β including strict compliance requirements, ingrained legacy systems β\n",
+ "β β β\n",
+ "β β that are costly to replace and the need for regulatory approval before β\n",
+ "β β β\n",
+ "β β implementation. β\n",
+ "β β β\n",
+ "β β While all industries are embracing new AI innovations, two highly β\n",
+ "β β β\n",
+ "β β regulated industries β Financial Services and Healthcare & Life β\n",
+ "β β β\n",
+ "β β Sciences β are keeping pace with, and often surpassing, their less- β\n",
+ "β β β\n",
+ "β β regulated counterparts. β\n",
+ "β β β\n",
+ "β β In December 2023, Databricks released foundation model APIs, β\n",
+ "β β β\n",
+ "β β providing instant access to popular open source LLMs, such as Meta β\n",
+ "β β β\n",
+ "β β Llama and MPT models. We expect the interest in open source to grow β\n",
+ "β β β\n",
+ "β β significantly as models continue to rapidly improve, as shown by the β\n",
+ "β β β\n",
+ "β β recent launches of Llama 3. β\n",
+ "β β β\n",
+ "β β STATE OF DATA + AI β\n",
+ "β β β\n",
+ "β β 26 β\n",
+ "β β β\n",
+ "β β HARNESSING OPEN LLMs FOR INDUSTRY-SPECIFIC NEEDS β\n",
+ "β β β\n",
+ "β β Manufacturing & Automotive and Healthcare & Life Sciences take the lead in adopting foundation model β\n",
+ "β β APIs with the highest average usage per customer. In manufacturing, β\n",
+ "β β β\n",
+ "β β supply chain optimization, quality control and efficiency are deemed the most promising β\n",
+ "β β β\n",
+ "β β use cases. β\n",
+ "β β β\n",
+ "β β A recent report from MIT Tech Review Insights shares that, among those surveyed, CIOs β\n",
+ "β β β\n",
+ "β β in Healthcare & Life Sciences believe GenAI will bring value to their organizations. Open β\n",
+ "β β β\n",
+ "β β source LLMs enable highly regulated industries like Healthcare & Life Sciences to integrate β\n",
+ "β β β\n",
+ "β β GenAI while maintaining the utmost control of their data. β\n",
+ "β β β\n",
+ "β β Figure 8: Manufacturing & Automotive and Healthcare & Life Sciences lead the adoption of foundation β\n",
+ "β β model APIs with the highest average usage per customer. β\n",
+ "β β β\n",
+ "β β NOTE: Date Range: January 2024 to March 2024. β\n",
+ "β β β\n",
+ "β β 27 β\n",
+ "β β β\n",
+ "β β CPUs vs. GPUs: Financial Servicesβ commitment to LLMs grows 88% in 6 months β\n",
+ "β β β\n",
+ "β β CPUs are general-purpose processors designed to handle a wide range of tasks quickly, β\n",
+ "βββββββββββ΄ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n",
+ " \n",
+ " \n",
+ "\u001b[3m Generated Columns \u001b[0m\n",
+ "ββββββββββββββββββ³βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n",
+ "β\u001b[1m \u001b[0m\u001b[1mName \u001b[0m\u001b[1m \u001b[0mβ\u001b[1m \u001b[0m\u001b[1mValue \u001b[0m\u001b[1m \u001b[0mβ\n",
+ "β‘ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ©\n",
+ "β difficulty β hard β\n",
+ "ββββββββββββββββββΌβββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β reasoning_type β factual recall β\n",
+ "ββββββββββββββββββΌβββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β question_type β answerable β\n",
+ "ββββββββββββββββββΌβββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€\n",
+ "β qa_pair β \u001b[1m{\u001b[0m β\n",
+ "β β \u001b[32m'question'\u001b[0m: \u001b[32m'What is the name of the two smallest Meta Llama 2 models mentioned in the \u001b[0m β\n",
+ "β β \u001b[32mcontext?'\u001b[0m, β\n",
+ "β β \u001b[32m'answer'\u001b[0m: \u001b[32m'7B and 13B'\u001b[0m, β\n",
+ "β β \u001b[32m'reasoning'\u001b[0m: \u001b[32m'According to the context, the two smallest Meta Llama 2 models are mentioned\u001b[0m β\n",
+ "β β \u001b[32mas having a significant higher usage compared to the largest model, Meta Llama 2 70B.'\u001b[0m β\n",
+ "β β \u001b[1m}\u001b[0m β\n",
+ "ββββββββββββββββββ΄βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n",
+ " \n",
+ " \n",
+ "\u001b[3m LLM-as-a-Judge: eval_metrics \u001b[0m\n",
+ "ββββββββββββββββββββββββββββββ³ββββββββββββββββββββββββββββ³βββββββββββββββββββββββββββββ³ββββββββββββββββββββββββββββ\n",
+ "β\u001b[1m \u001b[0m\u001b[1mContext Relevance \u001b[0m\u001b[1m \u001b[0mβ\u001b[1m \u001b[0m\u001b[1mAnswer Precision \u001b[0m\u001b[1m \u001b[0mβ\u001b[1m \u001b[0m\u001b[1mHallucination Avoidance \u001b[0m\u001b[1m \u001b[0mβ\u001b[1m \u001b[0m\u001b[1mAnswer Completeness \u001b[0m\u001b[1m \u001b[0mβ\n",
+ "β‘ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ©\n",
+ "β score: 5 β score: 5 β score: 5 β score: 5 β\n",
+ "β reasoning: The answer β reasoning: The answer β reasoning: The answer does β reasoning: The answer β\n",
+ "β directly references the β precisely names the two β not contain any made-up or β fully addresses the β\n",
+ "β context provided in the β smallest models, '7B' and β incorrect information; it β question by providing the β\n",
+ "β question, which mentions β '13B', which are the β accurately reflects the β names of the two smallest β\n",
+ "β the two smallest Meta β correct models mentioned β details given in the β models without any β\n",
+ "β Llama 2 models. β in the context. β context. β omissions. β\n",
+ "ββββββββββββββββββββββββββββββ΄ββββββββββββββββββββββββββββ΄βββββββββββββββββββββββββββββ΄ββββββββββββββββββββββββββββ\n",
+ " \n",
+ " [index: 0] \n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Run this cell multiple times to cycle through the 10 preview records.\n",
+ "preview.display_sample_record()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "40099da2",
+ "metadata": {},
+ "source": [
+ "## 11. Generate the Full Dataset\n",
+ "\n",
+ "Now let's generate our full dataset of RAG evaluation pairs, analyze the coverage, and export it to a JSONL file for use in evaluating RAG systems. If you want to wait for the job to complete, set wait_until_done=True."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "cb57388d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[12:24:17] [INFO] π¨ Creating Data Designer generation job\n",
+ "[12:24:17] [INFO] |-- job_id: 3ca1af735043492e97687b2c2e630eaa\n",
+ "[12:24:19] [INFO] π² Sampling 20 records from input dataset *with replacement*\n",
+ "[12:24:19] [INFO] π² Using numerical samplers to generate 20 records across 3 columns\n",
+ "[12:24:19] [INFO] (πΎ + πΎ) Concatenating 2 datasets\n",
+ "[12:24:19] [INFO] π Preparing template to generate data column `qa_pair`\n",
+ "[12:24:19] [INFO] |-- model_alias: review-generator-local\n",
+ "[12:24:19] [INFO] Model config being used for model alias 'review-generator-local': \n",
+ "{\n",
+ " \"alias\": \"review-generator-local\",\n",
+ " \"model\": {\n",
+ " \"api_endpoint\": {\n",
+ " \"url\": \"http://10.110.20.111:8004/v1\",\n",
+ " \"model_id\": \"meta/llama-3.1-8b-instruct\",\n",
+ " \"provider_type\": \"openai\"\n",
+ " }\n",
+ " },\n",
+ " \"inference_parameters\": {\n",
+ " \"temperature\": 0.5,\n",
+ " \"top_p\": 1.0,\n",
+ " \"max_tokens\": 1024,\n",
+ " \"max_parallel_requests\": 4\n",
+ " },\n",
+ " \"is_reasoner\": false\n",
+ "}\n",
+ "[12:24:19] [INFO] π©Ί Running health check for model with alias 'review-generator-local'\n",
+ "[12:24:27] [INFO] βοΈ Performing LLM-as-a-Judge on 20 randomly sampled records out of 20.\n",
+ "[12:24:27] [INFO] Model config being used for model alias 'mistral-small': \n",
+ "{\n",
+ " \"alias\": \"mistral-small\",\n",
+ " \"model\": {\n",
+ " \"api_endpoint\": {\n",
+ " \"url\": \"https://integrate.api.nvidia.com/v1\",\n",
+ " \"model_id\": \"mistralai/mistral-small-24b-instruct\",\n",
+ " \"provider_type\": \"openai\"\n",
+ " }\n",
+ " },\n",
+ " \"inference_parameters\": {\n",
+ " \"temperature\": 0.5,\n",
+ " \"top_p\": 1.0,\n",
+ " \"max_tokens\": 1024,\n",
+ " \"max_parallel_requests\": 4\n",
+ " },\n",
+ " \"is_reasoner\": false\n",
+ "}\n",
+ "[12:24:27] [INFO] π©Ί Running health check for model with alias 'mistral-small'\n",
+ "[12:25:17] [INFO] Model config being used for model alias 'mistral-small': \n",
+ "{\n",
+ " \"alias\": \"mistral-small\",\n",
+ " \"model\": {\n",
+ " \"api_endpoint\": {\n",
+ " \"url\": \"https://integrate.api.nvidia.com/v1\",\n",
+ " \"model_id\": \"mistralai/mistral-small-24b-instruct\",\n",
+ " \"provider_type\": \"openai\"\n",
+ " }\n",
+ " },\n",
+ " \"inference_parameters\": {\n",
+ " \"temperature\": 0.5,\n",
+ " \"top_p\": 1.0,\n",
+ " \"max_tokens\": 1024,\n",
+ " \"max_parallel_requests\": 4\n",
+ " },\n",
+ " \"is_reasoner\": false\n",
+ "}\n",
+ "[12:25:17] [INFO] π©Ί Running health check for model with alias 'mistral-small'\n",
+ "[12:25:41] [INFO] Job '3ca1af735043492e97687b2c2e630eaa' completed successfully\n",
+ "[12:25:41] [INFO] π Dataset generation completed successfully.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Let's add an evaluation report to the dataset\n",
+ "config_builder.with_evaluation_report()\n",
+ "\n",
+ "# Generate the full dataset.\n",
+ "workflow_run = ndd.create(\n",
+ " config_builder, num_records=20, wait_until_done=True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "ac909600",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Generated dataset shape: (20, 7)\n",
+ "\n",
+ "Dataset exported to rag_evals.jsonl\n"
+ ]
+ }
+ ],
+ "source": [
+ "dataset = workflow_run.load_dataset()\n",
+ "\n",
+ "print(\"\\nGenerated dataset shape:\", dataset.shape)\n",
+ "\n",
+ "# Export the dataset to JSONL format.\n",
+ "dataset.to_json('rag_evals.jsonl', orient='records', lines=True)\n",
+ "print(\"\\nDataset exported to rag_evals.jsonl\")"
+ ]
+ },
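+ {
+ "cell_type": "markdown",
+ "id": "reload-jsonl-md",
+ "metadata": {},
+ "source": [
+ "To confirm the export, you can read the JSONL file back and flatten the nested `qa_pair` column into plain question/answer/reasoning columns. This is a minimal sketch using pandas; adapt it to whatever evaluation framework you use downstream."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "reload-jsonl",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "import pandas as pd\n",
+ "\n",
+ "# Read the exported evaluation pairs back in.\n",
+ "eval_df = pd.read_json(\"rag_evals.jsonl\", lines=True)\n",
+ "\n",
+ "# qa_pair may round-trip as a dict or as a JSON string depending on how it was\n",
+ "# serialized; normalize it to a dict either way before flattening.\n",
+ "qa_records = eval_df[\"qa_pair\"].apply(lambda v: json.loads(v) if isinstance(v, str) else v)\n",
+ "qa_flat = pd.json_normalize(qa_records.tolist())\n",
+ "\n",
+ "eval_df = pd.concat([eval_df.drop(columns=[\"qa_pair\"]), qa_flat], axis=1)\n",
+ "print(eval_df[[\"question\", \"answer\", \"difficulty\", \"reasoning_type\", \"question_type\"]].head())"
+ ]
+ },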
+ {
+ "cell_type": "markdown",
+ "id": "19c674f4",
+ "metadata": {},
+ "source": [
+ "## 12. Using Your RAG Evaluation Dataset\n",
+ "\n",
+ "Now that you've generated a diverse RAG evaluation dataset, here are some ways to use it:\n",
+ "\n",
+ "1. **Benchmarking**: Test your RAG system against these evaluation pairs to measure performance\n",
+ "\n",
+ "2. **Error Analysis**: Identify patterns in where your RAG system struggles\n",
+ "\n",
+ "3. **Optimization**: Use insights to tune retrieval and generation parameters\n",
+ "\n",
+ "4. **Regression Testing**: Track performance over time as you improve your system\n",
+ "\n",
+ "5. **Model Comparison**: Compare different LLMs, retrievers, or RAG architectures\n",
+ "\n",
+ "The JSONL file contains structured data with questions, ground truth answers, and quality metrics that you can use with most evaluation frameworks."
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "nemo-data-designer",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}