diff --git a/src/phoenix/experimental/evals/templates/__init__.py b/src/phoenix/experimental/evals/templates/__init__.py
index 7b7b1ba15a..9abf1a74cc 100644
--- a/src/phoenix/experimental/evals/templates/__init__.py
+++ b/src/phoenix/experimental/evals/templates/__init__.py
@@ -5,6 +5,8 @@
     HALLUCINATION_PROMPT_TEMPLATE,
     RAG_RELEVANCY_PROMPT_RAILS_MAP,
     RAG_RELEVANCY_PROMPT_TEMPLATE,
+    REFERENCE_LINK_CORRECTNESS_PROMPT_RAILS_MAP,
+    REFERENCE_LINK_CORRECTNESS_PROMPT_TEMPLATE,
     TOXICITY_PROMPT_RAILS_MAP,
     TOXICITY_PROMPT_TEMPLATE,
 )
@@ -35,4 +37,6 @@
     "RAG_RELEVANCY_PROMPT_TEMPLATE",
     "TOXICITY_PROMPT_RAILS_MAP",
     "TOXICITY_PROMPT_TEMPLATE",
+    "REFERENCE_LINK_CORRECTNESS_PROMPT_RAILS_MAP",
+    "REFERENCE_LINK_CORRECTNESS_PROMPT_TEMPLATE",
 ]
diff --git a/src/phoenix/experimental/evals/templates/default_templates.py b/src/phoenix/experimental/evals/templates/default_templates.py
index 4df840b2a8..b51b356dd5 100644
--- a/src/phoenix/experimental/evals/templates/default_templates.py
+++ b/src/phoenix/experimental/evals/templates/default_templates.py
@@ -305,6 +305,57 @@
 EXPLANATION:"""
+
+REFERENCE_LINK_CORRECTNESS_PROMPT_BASE_TEMPLATE = """
+You are given a conversation that contains questions by a CUSTOMER and you are
+trying to determine if the documentation page shared by the ASSISTANT correctly
+answers the CUSTOMER'S questions. We will give you the conversation between the
+CUSTOMER and the ASSISTANT and the text of the documentation returned:
+    [CONVERSATION AND QUESTION]:
+    {input}
+    ************
+    [DOCUMENTATION URL TEXT]:
+    {reference}
+    ************
+You should respond "correct" if the documentation text answers the question the
+CUSTOMER had in the conversation. If the documentation roughly answers the
+question, even in a general way, then please answer "correct". If there are
+multiple questions and a single question is answered, please still answer
+"correct". If the text does not answer the question in the conversation, or
+doesn't contain information that would allow you to answer the specific question,
+please answer "incorrect".
+"""
+REFERENCE_LINK_CORRECTNESS_PROMPT_TEMPLATE_WITH_EXPLANATION = """
+You are given a conversation that contains questions by a CUSTOMER and you are
+trying to determine if the documentation page shared by the ASSISTANT correctly
+answers the CUSTOMER'S questions. We will give you the conversation between the
+CUSTOMER and the ASSISTANT and the text of the documentation returned:
+    [CONVERSATION AND QUESTION]:
+    {input}
+    ************
+    [DOCUMENTATION URL TEXT]:
+    {reference}
+    ************
+Please read the text carefully, then write out in a step by step manner an
+EXPLANATION to show how to evaluate the correctness of the documentation text.
+Avoid simply stating the correct answer at the outset. Your response LABEL must
+be a single word, either "correct" or "incorrect", and should not contain any
+text or characters aside from that. "correct" means the documentation text
+answers the question the CUSTOMER had in the conversation. If the documentation
+roughly answers the question, even in a general way, then please answer "correct".
+If there are multiple questions and a single question is answered, please still
+answer "correct". If the text does not answer the question in the conversation,
+or doesn't contain information that would allow you to answer the specific
+question, please answer "incorrect".
+
+Example response:
+************
+EXPLANATION: An explanation of your reasoning for why the documentation text is correct or incorrect
+LABEL: "correct" or "incorrect"
+************
+
+EXPLANATION:"""
+REFERENCE_LINK_CORRECTNESS_PROMPT_RAILS_MAP = OrderedDict({True: "correct", False: "incorrect"})
+
 RAG_RELEVANCY_PROMPT_TEMPLATE = ClassificationTemplate(
     rails=list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
@@ -341,3 +392,9 @@
     template=CODE_READABILITY_PROMPT_BASE_TEMPLATE,
     explanation_template=CODE_READABILITY_PROMPT_TEMPLATE_WITH_EXPLANATION,
 )
+
+REFERENCE_LINK_CORRECTNESS_PROMPT_TEMPLATE = ClassificationTemplate(
+    rails=list(REFERENCE_LINK_CORRECTNESS_PROMPT_RAILS_MAP.values()),
+    template=REFERENCE_LINK_CORRECTNESS_PROMPT_BASE_TEMPLATE,
+    explanation_template=REFERENCE_LINK_CORRECTNESS_PROMPT_TEMPLATE_WITH_EXPLANATION,
+)
diff --git a/tutorials/evals/evaluate_reference_link_correctness_classifications.ipynb b/tutorials/evals/evaluate_reference_link_correctness_classifications.ipynb
new file mode 100644
index 0000000000..2bb48597e3
--- /dev/null
+++ b/tutorials/evals/evaluate_reference_link_correctness_classifications.ipynb
@@ -0,0 +1,1983 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RHNxZTMC8Ute"
+   },
+   "source": [
\n",
+ " \n",
+ "
\n",
+ " Docs\n",
+ " |\n",
+ " GitHub\n",
+ " |\n",
+ " Community\n",
+ "
\n", + " | Unnamed: 0 | \n", + "input | \n", + "url | \n", + "reference | \n", + "is_correct_ref_link | \n", + "
---|---|---|---|---|---|
15 | \n", + "20 | \n", + "Can I pass in my own metrics within Arize? | \n", + "https://docs.arize.com/arize/resources/integrations | \n", + "\\n\\n\\n\\n\\n\\nML Platforms - Arize Docs\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCommunity SlackAsk or search…⌃KLinksArize AIWhat is ML Observability?What is LLM Observability?QuickstartAll Tutorials/Notebooks🪄Sending Data GuidesWhat Is A Model SchemaHow To Send Delayed ActualsFAQ & Troubleshoot Data UploadTable Ingestion Tuning🔌Sending Data MethodsPython Pandas SDKUI Drag & DropGoogle Cloud Storage (GCS)AWS S3Azure Blob StorageGoogle BigQueryDatabricksSnowflake🔢Model TypesLarge Language Models (LLM)Binary ClassificationMulti-Class ClassificationRegressionTimeseries ForecastingRankingNatural Language Processing (NLP)Image ClassificationObject Detection🔔MonitorsGet Started With MonitorsPerformance MonitorsDrift MonitorsData Quality MonitorsNotifications & Integrations🔎TracingPerformance TracingDrift TracingData Quality Troubleshooting🖌EmbeddingsGenerate EmbeddingsEmbedding DriftEmbedding & Cluster AnalyzerEmbeddings for Tabular Data (Multivariate Drift)Embeddings FAQ🦙LLM (Large Language Models)LLM EvaluationsPrompt EngineeringTroubleshoot Retrieval with Vector StoresOpen AI Cluster SummarizationCapturing User FeedbackIntegrations💡Active Learning and Fine TuningAutomate Model RetrainingExport Data to Notebook🎨dashboardsCreate A Dashboard🧙♂Explainability & FairnessModel ExplainabilityBias Tracing (Fairness)🧩API ReferencePython SDKJava SDKR SDKRest APICustom Metrics Query LanguageGraphQL APIData API🏡On-Premise DeploymentOverview🔑Admin & SettingsSSO & RBAC (Role Based Access Control)Whitelisting📚ResourcesProduct FAQGlossaryML PlatformsAlgorithmiaAnyscaleAzure & DatabricksBentoMLCML (DVC)DeepnoteKafkaFeastGoogle Cloud MLHugging FaceLangChain 🦜🔗MLflowNeptunePaperspacePySparkRay Serve (Anyscale)SageMakerSpellUbiOpsWeights & BiasesCommon Industry Use Casesarize.comProduct Release NotesPhoenix OSSPowered By GitBookML PlatformsTutorials and blogs for integrations with the Arize platformInbound IntegrationsAlgorithmiaAlgorithmia is an MLOps platform with APIs to serve, host, and manage models. The Arize platform can easily integrate into Algorithmia to enable model observability, explainability, and monitoring.AnyscaleAnyscale Endpoints is a service enabling developers to integrate fast, cost-efficient, and scalable large language models (LLMs) into their applications using popular LLM APIs. Azure & DatabricksDatabricks is an open and unified data analytics platform for data engineering, data science, machine learning, and analytics. Surface and fix issues with ML models served on Azure with Arize. BentoMLLeverage Bento’s ML service platform to turn ML models into production-worthy prediction services. Once your model is in production, use Arize’s ML observability platform to attain the necessary visibility to keep your model in production.CML (DVC)DVC version controls ML projects. This tutorial runs through how to use Arize in a Continuous Integration and Continuous Deployment workflow for ML models. DeepnoteDeepnote is a new kind of Jupyter-compatible data science notebook with real-time collaboration and running in the cloud. The Arize platform can easily integrate with Deepnote to enable model observability, explainability, and monitoring while also allowing collaboration between team members.FeastFeast (i.e, Feature Store) is an operational data system for managing and serving machine learning features to models in production. 
Arize leverages Feast to visualize model performance, understand drift & data quality issues, and share insights as your Evaluation Store.Google Cloud MLHugging FaceHugging Face is a library offers both models on-demand in its Model Hub as well as APIs for you to fine-tune NLP models and serve them directly from Hugging Face.MLflowMLflow is an open source platform to manage the ML lifecycle, including experimentation, reproducibility, deployment, model registry. By integrating Arize and MLflow, you will be able to train, manage, and register your models while actively monitoring performance, data quality, and troubleshooting degradations across your models.NeptuneNeptune logs, stores, displays, and compares all your MLOps metadata for better experiment tracking. Arize leverages Neptune to visualize your production model performance, understand drift & data quality issues.Ray Serve (Anyscale)Ray Serve is an framework agnostic and scalable model serving library built on Ray. Arize helps you visualize your model performance, understand drift & data quality issues, and share insights learned from your models with Ray Serve.SageMakerSageMaker enables developers to create, train, and deploy machine-learning models in the cloud. Monitor and observe models deployed on SageMaker with Arize for data quality issues, performance checks, and drift. SpellSpell is an end-to-end ML platform that provides infrastructure for company to deploy and train models. Visualize your model's performance, understand drift & data quality issues, and share insights learned from your models deployed on Spell.UbiOpsUbiOps is an MLOps platform with APIs to deploy and serve models. The Arize platform can easily integrate with UbiOps to enable model observability, explainability, and monitoring.Weights & BiasesWeights and Biases helps you build better model by logging metrics and visualize your experiments before production. Arize helps you visualize your model performance, understand drift & data quality issues, and share insights learned from your models.Outbound IntegrationsPagerDutyArize supports an email integration with PagerDuty. This section reviews how to set it up in PagerDutyOpsGenieArize supports an email integration with OpsGenie for automatic notifications.SlackIntegrate with Slack via Slack's email feature. Set up your model's monitors to automatically notify your Slack channel when a monitor has been triggered.Amazon EventBridgeEvent-driven workflows that connect native AWS services with Arize's monitoring capabilities.Resources - PreviousGlossaryNextAlgorithmiaLast modified 1mo agoOn this pageInbound IntegrationsOutbound IntegrationsSupportResourcesGet Started Chat Us On SlackBlogSignup For Free[email protected]CourseBook A DemoSupportChat Us On Slack[email protected]ResourcesBlogCourseGet Started Signup For FreeBook A DemoCopyright © 2023 Arize AI, Inc\\n\\n\\n\\n\\n | \n", + "False | \n", + "
155 | \n", + "195 | \n", + "How do I use custom metrics on monitors? | \n", + "https://docs.arize.com/arize/sending-data-guides/table-ingestion-tuning | \n", + "\\n\\n\\n\\n\\n\\nTable Ingestion Tuning - Arize Docs\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCommunity SlackAsk or search…⌃KLinksArize AIWhat is ML Observability?What is LLM Observability?QuickstartAll Tutorials/Notebooks🪄Sending Data GuidesWhat Is A Model SchemaHow To Send Delayed ActualsFAQ & Troubleshoot Data UploadTable Ingestion Tuning🔌Sending Data MethodsPython Pandas SDKUI Drag & DropGoogle Cloud Storage (GCS)AWS S3Azure Blob StorageGoogle BigQueryDatabricksSnowflake🔢Model TypesLarge Language Models (LLM)Binary ClassificationMulti-Class ClassificationRegressionTimeseries ForecastingRankingNatural Language Processing (NLP)Image ClassificationObject Detection🔔MonitorsGet Started With MonitorsPerformance MonitorsDrift MonitorsData Quality MonitorsNotifications & Integrations🔎TracingPerformance TracingDrift TracingData Quality Troubleshooting🖌EmbeddingsGenerate EmbeddingsEmbedding DriftEmbedding & Cluster AnalyzerEmbeddings for Tabular Data (Multivariate Drift)Embeddings FAQ🦙LLM (Large Language Models)LLM EvaluationsPrompt EngineeringTroubleshoot Retrieval with Vector StoresOpen AI Cluster SummarizationCapturing User FeedbackIntegrations💡Active Learning and Fine TuningAutomate Model RetrainingExport Data to Notebook🎨dashboardsCreate A Dashboard🧙♂Explainability & FairnessModel ExplainabilityBias Tracing (Fairness)🧩API ReferencePython SDKJava SDKR SDKRest APICustom Metrics Query LanguageGraphQL APIData API🏡On-Premise DeploymentOverview🔑Admin & SettingsSSO & RBAC (Role Based Access Control)Whitelisting📚ResourcesProduct FAQGlossaryML PlatformsCommon Industry Use Casesarize.comProduct Release NotesPhoenix OSSPowered By GitBookTable Ingestion TuningTable Ingestion ParametersData is ingested from tables by querying your table or view periodically. There are a few parameters that control how much data is ingested, as well as how often. To see the defaults of these parameters, as well as to change them, click Query Parameters on Job Options.You will see the following 3 parameters with the current value displayed:Query CadenceThis parameter controls how often, in minutes, we should query your table. It is relative to the last time your table was queried, which you can see by clicking the Job ID which gives you a chronological list of queries to your table.Query Window SizeThis parameter controls how large, in hours, of a query window we should use: a query window is the time interval of your data, where time is given in the change_timestamp column you supplied when first configuring the job. The beginning of the query window is always the largest change_timestamp we have encountered while querying your table. The end of the query window is either specified in hours by this parameter, or if left to 0 as the default, means unbounded to the current time.This is useful if you need to limit the amount of data scanned per query. If your table is large, we recommend partitioning your data by the change_timestamp column, so this parameter gives you a way to limit the number of partitions scanned per query if cost is a concern.Row LimitThis parameter controls how many rows to ingest, at most, per query. 
Note if you specify a query window size that covers an interval of rows with less than the row limit, you may get less than the row limit number of rows.PreviousSending Data FAQNext - Sending Data MethodsPython Pandas SDKLast modified 2mo agoSupportResourcesGet Started Chat Us On SlackBlogSignup For Free[email protected]CourseBook A DemoSupportChat Us On Slack[email protected]ResourcesBlogCourseGet Started Signup For FreeBook A DemoCopyright © 2023 Arize AI, Inc\\n\\n\\n\\n\\n | \n", + "False | \n", + "
79 | \n", + "95 | \n", + "Is the entire data set copied when connecting to data in files? | \n", + "https://docs.arize.com/arize/api-reference/python-sdk | \n", + "\\n\\n\\n\\n\\n\\nPython SDK - Arize Docs\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCommunity SlackAsk or search…⌃KLinksArize AIWhat is ML Observability?What is LLM Observability?QuickstartAll Tutorials/Notebooks🪄Sending Data GuidesWhat Is A Model SchemaHow To Send Delayed ActualsFAQ & Troubleshoot Data UploadTable Ingestion Tuning🔌Sending Data MethodsPython Pandas SDKUI Drag & DropGoogle Cloud Storage (GCS)AWS S3Azure Blob StorageGoogle BigQueryDatabricksSnowflake🔢Model TypesLarge Language Models (LLM)Binary ClassificationMulti-Class ClassificationRegressionTimeseries ForecastingRankingNatural Language Processing (NLP)Image ClassificationObject Detection🔔MonitorsGet Started With MonitorsPerformance MonitorsDrift MonitorsData Quality MonitorsNotifications & Integrations🔎TracingPerformance TracingDrift TracingData Quality Troubleshooting🖌EmbeddingsGenerate EmbeddingsEmbedding DriftEmbedding & Cluster AnalyzerEmbeddings for Tabular Data (Multivariate Drift)Embeddings FAQ🦙LLM (Large Language Models)LLM EvaluationsPrompt EngineeringTroubleshoot Retrieval with Vector StoresOpen AI Cluster SummarizationCapturing User FeedbackIntegrations💡Active Learning and Fine TuningAutomate Model RetrainingExport Data to Notebook🎨dashboardsCreate A Dashboard🧙♂Explainability & FairnessModel ExplainabilityBias Tracing (Fairness)🧩API ReferencePython SDKPandas Batch LoggingSingle Record LoggingChangelogJava SDKR SDKRest APICustom Metrics Query LanguageGraphQL APIData API🏡On-Premise DeploymentOverview🔑Admin & SettingsSSO & RBAC (Role Based Access Control)Whitelisting📚ResourcesProduct FAQGlossaryML PlatformsCommon Industry Use Casesarize.comProduct Release NotesPhoenix OSSPowered By GitBookPython SDKArize AI for Model Monitoring, Troubleshooting, and Explainability Use the Arize Python package to monitor machine learning predictions to observe your ML models and their features, predicted labels, and actual labels with just a few lines of code.Installing the packagepip install arizeIn addition to the basic functionality installed by the command above, the Arize SDK has additional functionality that can be installed with some extra dependencies:Auto Embeddings minimum required for Auto EmbeddingsWith this extra module, Arize extracts the embeddings in the appropriate way depending on your use case, and we return it to you to include in your Pandas DataFrame. Learn more here. To install the Arize package including this functionality:pip install arize[AutoEmbeddings]LLM Evaluation minimum required for LLM EvaluationWith this extra module, Arize helps you calculate evaluation metrics for your LLM Generative tasks. Learn more here. To install the Arize package including this functionality:pip install arize[LLM_Evaluation]Mimic Explainer minimum required for Mimic ExplainerWith this extra module, Arize gives the user the option to pass a flag with their request to send data that would produce SHAP values using the surrogate explainability approach. Learn more here. To install the Arize package including this functionality:pip install arize[MimicExplainer]Logging OptionsThe Arize Python SDK offers 2 ways of logging data into the platform:Pandas Batch LoggingDesigned for logging a batch of your model inferences using Pandas DataFrames. 
Go to the following page for more information.Pandas Batch LoggingSingle Record LoggingDesigned for low latency, one-at-a-time, logging of your model inferences. Go to the following page for more information.Single Record LoggingEnd of Support TableMajor ReleaseFirst ReleasedLatestSupport7.xJune, 2023latestEnds January 1st, 20266.xJanuary, 20236.1.3Ends January 1st, 20255.xAugust, 20225.5.0Ends October 1st, 20244.xMarch, 20224.2.2Ends June 1st, 20243.xSeptember, 20213.4.0Ends April 1st, 20242.xMarch, 20212.2.1Ended July 1st, 20231.xJuly, 20201.2.1Ended March 1st, 20220.xMarch, 20200.0.20Ended March 1st, 2022Explainability & Fairness - PreviousBias Tracing (Fairness)NextPandas Batch LoggingLast modified 29d agoOn this pageInstalling the packageLogging OptionsEnd of Support TableSupportResourcesGet Started Chat Us On SlackBlogSignup For Free[email protected]CourseBook A DemoSupportChat Us On Slack[email protected]ResourcesBlogCourseGet Started Signup For FreeBook A DemoCopyright © 2023 Arize AI, Inc\\n\\n\\n\\n\\n | \n", + "False | \n", + "
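A minimal sketch (not part of this diff) of how the newly exported template might be exercised against a dataframe like the one previewed above, roughly along the lines of the tutorial notebook. It assumes the `llm_classify` and `OpenAIModel` helpers from `phoenix.experimental.evals` and an `OPENAI_API_KEY` in the environment; the toy dataframe and the `gpt-4` model choice are illustrative, not taken from the notebook.

```python
import pandas as pd

from phoenix.experimental.evals import OpenAIModel, llm_classify
from phoenix.experimental.evals.templates import (
    REFERENCE_LINK_CORRECTNESS_PROMPT_RAILS_MAP,
    REFERENCE_LINK_CORRECTNESS_PROMPT_TEMPLATE,
)

# Hypothetical benchmark frame with the same columns as the notebook's preview.
df = pd.DataFrame(
    {
        "input": [
            "CUSTOMER: How do I use custom metrics on monitors?",
            "CUSTOMER: Can I pass in my own metrics within Arize?",
        ],
        "reference": [
            "Table Ingestion Tuning - Arize Docs ... (text of the linked page)",
            "ML Platforms - Arize Docs ... (text of the linked page)",
        ],
        "is_correct_ref_link": [False, False],  # human ground-truth labels
    }
)

# The rails constrain the model's output to the two allowed labels.
rails = list(REFERENCE_LINK_CORRECTNESS_PROMPT_RAILS_MAP.values())  # ["correct", "incorrect"]

results = llm_classify(
    dataframe=df,  # {input} and {reference} are filled from the matching columns
    template=REFERENCE_LINK_CORRECTNESS_PROMPT_TEMPLATE,
    model=OpenAIModel(model_name="gpt-4"),
    rails=rails,
    provide_explanation=True,  # uses the *_WITH_EXPLANATION variant defined above
)

# Compare the predicted labels against the ground truth to estimate eval accuracy.
predicted_correct = results["label"] == "correct"
accuracy = (predicted_correct == df["is_correct_ref_link"]).mean()
print(f"accuracy: {accuracy:.2f}")
```

With `provide_explanation=True` the returned dataframe also carries an `explanation` column, which is what the notebook inspects when the predicted label disagrees with the human label.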