From 5c3734280dd364743ab47f61fcb54914e35951b5 Mon Sep 17 00:00:00 2001 From: JessicaXYWang <108437381+JessicaXYWang@users.noreply.github.com> Date: Mon, 20 May 2024 11:21:43 -0700 Subject: [PATCH] chore: update resource location (#2225) * update resource location * fix typo * update openai resources * fix bug * update nasaearth data * update openai resource name in notebooks * clean up --- .../form/FormOntologyLearnerSuite.scala | 6 +- .../openai/OpenAIChatCompletionSuite.scala | 4 +- .../openai/OpenAICompletionSuite.scala | 6 +- .../openai/OpenAIEmbeddingsSuite.scala | 4 +- .../microsoft/azure/synapse/ml/Secrets.scala | 3 +- .../Quickstart - Create Audiobooks.ipynb | 220 +++++++++--------- ...ent Question and Answering with PDFs.ipynb | 162 ++++++------- .../Explore Algorithms/OpenAI/Langchain.ipynb | 4 +- docs/Explore Algorithms/OpenAI/OpenAI.ipynb | 4 +- ...- OpenAI Embedding and GPU based KNN.ipynb | 4 +- .../Quickstart - OpenAI Embedding.ipynb | 20 +- ...kstart - Understand and Search Forms.ipynb | 8 +- 12 files changed, 221 insertions(+), 224 deletions(-) diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/form/FormOntologyLearnerSuite.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/form/FormOntologyLearnerSuite.scala index 472c6d8191..af55615ec4 100644 --- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/form/FormOntologyLearnerSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/form/FormOntologyLearnerSuite.scala @@ -26,9 +26,9 @@ class FormOntologyLearnerSuite extends EstimatorFuzzing[FormOntologyLearner] wit .setOutputCol("unified_ontology") lazy val urlDF: DataFrame = Seq( - "https://mmlsparkdemo.blob.core.windows.net/ignite2021/forms/2017/Invoice115991.pdf", - "https://mmlsparkdemo.blob.core.windows.net/ignite2021/forms/2018/Invoice119554.pdf", - "https://mmlsparkdemo.blob.core.windows.net/ignite2021/forms/2009/Invoice12241.pdf" + "https://mmlspark.blob.core.windows.net/publicwasb/form_test/Invoice115991.pdf", + "https://mmlspark.blob.core.windows.net/publicwasb/form_test/Invoice119554.pdf", + "https://mmlspark.blob.core.windows.net/publicwasb/form_test/Invoice12241.pdf" ).toDF("url") lazy val tableUrlDF: DataFrame = Seq( diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletionSuite.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletionSuite.scala index 4516ebcbea..079106493c 100644 --- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletionSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIChatCompletionSuite.scala @@ -15,13 +15,13 @@ class OpenAIChatCompletionSuite extends TransformerFuzzing[OpenAIChatCompletion] lazy val completion: OpenAIChatCompletion = new OpenAIChatCompletion() .setDeploymentName(deploymentNameGpt4) - .setCustomServiceName(openAIServiceNameGpt4) + .setCustomServiceName(openAIServiceName) .setApiVersion("2023-05-15") .setMaxTokens(5000) .setOutputCol("out") .setMessagesCol("messages") .setTemperature(0) - .setSubscriptionKey(openAIAPIKeyGpt4) + .setSubscriptionKey(openAIAPIKey) lazy val goodDf: DataFrame = Seq( diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletionSuite.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletionSuite.scala index cd14a58498..807426c468 100644 --- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletionSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAICompletionSuite.scala @@ -12,12 +12,10 @@ import org.apache.spark.sql.{DataFrame, Row} import org.scalactic.Equality trait OpenAIAPIKey { - lazy val openAIAPIKey: String = sys.env.getOrElse("OPENAI_API_KEY", Secrets.OpenAIApiKey) - lazy val openAIServiceName: String = sys.env.getOrElse("OPENAI_SERVICE_NAME", "synapseml-openai") + lazy val openAIAPIKey: String = sys.env.getOrElse("OPENAI_API_KEY_2", Secrets.OpenAIApiKey) + lazy val openAIServiceName: String = sys.env.getOrElse("OPENAI_SERVICE_NAME_2", "synapseml-openai-2") lazy val deploymentName: String = "gpt-35-turbo" lazy val modelName: String = "gpt-35-turbo" - lazy val openAIAPIKeyGpt4: String = sys.env.getOrElse("OPENAI_API_KEY_2", Secrets.OpenAIApiKeyGpt4) - lazy val openAIServiceNameGpt4: String = sys.env.getOrElse("OPENAI_SERVICE_NAME_2", "synapseml-openai-2") lazy val deploymentNameGpt4: String = "gpt-4" lazy val modelNameGpt4: String = "gpt-4" } diff --git a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIEmbeddingsSuite.scala b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIEmbeddingsSuite.scala index 9d4f71910b..53990ec04e 100644 --- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIEmbeddingsSuite.scala +++ b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/services/openai/OpenAIEmbeddingsSuite.scala @@ -34,12 +34,12 @@ class OpenAIEmbeddingsSuite extends TransformerFuzzing[OpenAIEmbedding] with Ope } lazy val embeddingExtra: OpenAIEmbedding = new OpenAIEmbedding() - .setSubscriptionKey(openAIAPIKeyGpt4) + .setSubscriptionKey(openAIAPIKey) .setDeploymentName("text-embedding-3-small") .setApiVersion("2024-03-01-preview") .setDimensions(100) .setUser("testUser") - .setCustomServiceName(openAIServiceNameGpt4) + .setCustomServiceName(openAIServiceName) .setTextCol("text") .setOutputCol("out") diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/Secrets.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/Secrets.scala index d36b0731e2..17eed8a668 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/Secrets.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/Secrets.scala @@ -54,8 +54,7 @@ object Secrets { } lazy val CognitiveApiKey: String = getSecret("cognitive-api-key") - lazy val OpenAIApiKey: String = getSecret("openai-api-key") - lazy val OpenAIApiKeyGpt4: String = getSecret("openai-api-key-2") + lazy val OpenAIApiKey: String = getSecret("openai-api-key-2") lazy val CustomSpeechApiKey: String = getSecret("custom-speech-api-key") lazy val ConversationTranscriptionUrl: String = getSecret("conversation-transcription-url") diff --git a/docs/Explore Algorithms/AI Services/Quickstart - Create Audiobooks.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Create Audiobooks.ipynb index c7c2543e00..b0739c2cae 100644 --- a/docs/Explore Algorithms/AI Services/Quickstart - Create Audiobooks.ipynb +++ b/docs/Explore Algorithms/AI Services/Quickstart - Create Audiobooks.ipynb @@ -2,36 +2,47 @@ "cells": [ { "cell_type": "markdown", - "source": [ - "# Create audiobooks using neural Text to speech" - ], "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "389e4a78-19aa-4c3f-9b7a-92e81f088168", "inputWidgets": {}, + "nuid": "389e4a78-19aa-4c3f-9b7a-92e81f088168", + "showTitle": false, "title": "" } - } + }, + "source": [ + "# Create audiobooks using neural Text to speech" + ] }, { "cell_type": "markdown", - "source": [ - "## Step 1: Load libraries and add service information" - ], "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "f320d6af-b255-4cb5-b60b-da840760713e", "inputWidgets": {}, + "nuid": "f320d6af-b255-4cb5-b60b-da840760713e", + "showTitle": false, "title": "" } - } + }, + "source": [ + "## Step 1: Load libraries and add service information" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ab422610-0438-4ca4-bd16-b45e90125294", + "showTitle": false, + "title": "" + } + }, + "outputs": [], "source": [ "from synapse.ml.core.platform import *\n", "\n", @@ -49,54 +60,54 @@ " secret_name=\"madtest-storage-key\", keyvault=\"mmlspark-build-keys\"\n", ")\n", "storage_account = \"anomalydetectiontest\"" - ], - "outputs": [], - "execution_count": null, + ] + }, + { + "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "ab422610-0438-4ca4-bd16-b45e90125294", "inputWidgets": {}, + "nuid": "10c83d0e-998f-4d72-a351-4ffab15f662c", + "showTitle": false, "title": "" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "## Step 2: Attach the storage account to hold the audio files" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "10c83d0e-998f-4d72-a351-4ffab15f662c", "inputWidgets": {}, + "nuid": "55b83038-e907-4101-a914-0a32825a9d03", + "showTitle": false, "title": "" } - } - }, - { - "cell_type": "code", + }, + "outputs": [], "source": [ "spark_key_setting = f\"fs.azure.account.key.{storage_account}.blob.core.windows.net\"\n", "spark.sparkContext._jsc.hadoopConfiguration().set(spark_key_setting, storage_key)" - ], - "outputs": [], + ] + }, + { + "cell_type": "code", "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "55b83038-e907-4101-a914-0a32825a9d03", "inputWidgets": {}, + "nuid": "625c7b1d-4034-4df2-b919-3775ac9c271c", + "showTitle": false, "title": "" } - } - }, - { - "cell_type": "code", + }, + "outputs": [], "source": [ "import os\n", "from os.path import exists, join\n", @@ -115,36 +126,37 @@ " mount_point=f\"/mnt/{storage_container}\",\n", " extra_configs={spark_key_setting: storage_key},\n", " )" - ], - "outputs": [], - "execution_count": null, + ] + }, + { + "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "625c7b1d-4034-4df2-b919-3775ac9c271c", "inputWidgets": {}, + "nuid": "381c3af7-e0e8-4a29-ae88-467e86a0e717", + "showTitle": false, "title": "" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "## Step 3: Read in text data" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "381c3af7-e0e8-4a29-ae88-467e86a0e717", "inputWidgets": {}, + "nuid": "56c8ebab-567f-4c1d-a2ea-1aeb5aefcf1e", + "showTitle": false, "title": "" - } - } - }, - { - "cell_type": "code", + }, + "collapsed": false + }, + "outputs": [], "source": [ "from pyspark.sql.functions import udf\n", "\n", @@ -163,41 +175,41 @@ ")\n", "\n", "display(df)" - ], - "outputs": [], - "execution_count": null, + ] + }, + { + "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "56c8ebab-567f-4c1d-a2ea-1aeb5aefcf1e", "inputWidgets": {}, + "nuid": "9fcb4305-a6d4-4f48-ac6f-cf4f863c7f5f", + "showTitle": false, "title": "" - }, - "collapsed": false - } - }, - { - "cell_type": "markdown", + } + }, "source": [ "## Step 4: Synthesize audio from text\n", "\n", "
\n", - "\n", + "\n", "
" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "9fcb4305-a6d4-4f48-ac6f-cf4f863c7f5f", "inputWidgets": {}, + "nuid": "2730c8cd-616a-4258-909d-912ea66d6446", + "showTitle": false, "title": "" - } - } - }, - { - "cell_type": "code", + }, + "collapsed": false + }, + "outputs": [], "source": [ "from synapse.ml.services.speech import TextToSpeech\n", "\n", @@ -213,37 +225,36 @@ "\n", "audio = tts.transform(df).cache()\n", "display(audio)" - ], - "outputs": [], - "execution_count": null, + ] + }, + { + "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "2730c8cd-616a-4258-909d-912ea66d6446", "inputWidgets": {}, + "nuid": "157a368a-d80b-4bf8-a5cb-c1f266be2f00", + "showTitle": false, "title": "" - }, - "collapsed": false - } - }, - { - "cell_type": "markdown", + } + }, "source": [ "## Step 5: Listen to an audio file" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { - "showTitle": false, "cellMetadata": {}, - "nuid": "157a368a-d80b-4bf8-a5cb-c1f266be2f00", "inputWidgets": {}, + "nuid": "7a0ad60f-5511-42ba-9882-e93f474f85e9", + "showTitle": false, "title": "" } - } - }, - { - "cell_type": "code", + }, + "outputs": [], "source": [ "from IPython.display import Audio\n", "\n", @@ -256,36 +267,25 @@ "\n", "\n", "Audio(filename=get_audio_file(1))" - ], - "outputs": [], - "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "showTitle": false, - "cellMetadata": {}, - "nuid": "7a0ad60f-5511-42ba-9882-e93f474f85e9", - "inputWidgets": {}, - "title": "" - } - } + ] } ], "metadata": { + "kernel_info": { + "name": "synapse_pyspark" + }, "kernelspec": { - "name": "synapse_pyspark", + "display_name": "Synapse PySpark", "language": "Python", - "display_name": "Synapse PySpark" + "name": "synapse_pyspark" }, "language_info": { "name": "python" }, - "kernel_info": { - "name": "synapse_pyspark" - }, "save_output": true, "synapse_widget": { - "version": "0.1", - "state": {} + "state": {}, + "version": "0.1" } }, "nbformat": 4, diff --git a/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb b/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb index 27211d1c08..0579bcc7e9 100644 --- a/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb +++ b/docs/Explore Algorithms/AI Services/Quickstart - Document Question and Answering with PDFs.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "a8a1541c29383520", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -21,11 +22,11 @@ }, "source": [ "# A Guide to Q&A on PDF Documents" - ], - "id": "a8a1541c29383520" + ] }, { "cell_type": "markdown", + "id": "802e72cb91971292", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -52,11 +53,11 @@ "1. Preprocessing PDF Documents: Learn how to load the PDF documents into a Spark DataFrame, read the documents using the [Azure AI Document Intelligence](https://azure.microsoft.com/products/ai-services/ai-document-intelligence) in Azure AI Services, and use SynapseML to split the documents into chunks.\n", "2. Embedding Generation and Storage: Learn how to generate embeddings for the chunks using SynapseML and [Azure OpenAI Services](https://azure.microsoft.com/products/ai-services/openai-service), store the embeddings in a vector store using [Azure Cognitive Search](https://azure.microsoft.com/products/search), and search the vector store to answer the user’s question.\n", "3. Question Answering Pipeline: Learn how to retrieve relevant document based on the user’s question and provide the answer using [Langchain](https://python.langchain.com/en/latest/index.html#)." - ], - "id": "802e72cb91971292" + ] }, { "cell_type": "markdown", + "id": "2bc3e2b42bff041c", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -71,21 +72,21 @@ }, "source": [ "We start by installing the necessary python libraries." - ], - "id": "2bc3e2b42bff041c" + ] }, { "cell_type": "code", "execution_count": null, + "id": "9e3b7e183bba5bfc", "metadata": {}, "outputs": [], "source": [ "%pip install openai==0.28.1 langchain==0.0.331" - ], - "id": "9e3b7e183bba5bfc" + ] }, { "cell_type": "markdown", + "id": "fb8f796d1fd622e6", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -105,11 +106,11 @@ }, "source": [ "### Step 1: Provide the keys for Azure AI Services and Azure OpenAI to authenticate the applications." - ], - "id": "fb8f796d1fd622e6" + ] }, { "cell_type": "markdown", + "id": "609142905ffbb2d7", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -129,12 +130,12 @@ }, "source": [ "To authenticate Azure AI Services and Azure OpenAI applications, you need to provide the respective API keys. Here is an example of how you can provide the keys in Python code. `find_secret()` function uses Azure Keyvault to get the API keys, however you can directly paste your own keys there." - ], - "id": "609142905ffbb2d7" + ] }, { "cell_type": "code", "execution_count": null, + "id": "8fbc0743f3a0f6ab", "metadata": {}, "outputs": [], "source": [ @@ -160,11 +161,11 @@ "cogsearch_api_key = find_secret(\n", " secret_name=\"azure-search-key\", keyvault=\"mmlspark-build-keys\"\n", ")" - ], - "id": "8fbc0743f3a0f6ab" + ] }, { "cell_type": "markdown", + "id": "906c53ccba03db4d", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -184,11 +185,11 @@ }, "source": [ "### Step 2: Load the PDF documents into a Spark DataFrame." - ], - "id": "906c53ccba03db4d" + ] }, { "cell_type": "markdown", + "id": "7b64938eebf6a881", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -203,25 +204,25 @@ }, "source": [ "For this tutorial, we will be using NASA's [Earth](https://www.nasa.gov/sites/default/files/atoms/files/earth_book_2019_tagged.pdf) and [Earth at Night](https://www.nasa.gov/sites/default/files/atoms/files/earth_at_night_508.pdf) e-books. To load PDF documents into a Spark DataFrame, you can use the ```spark.read.format(\"binaryFile\")``` method provided by Apache Spark." - ], - "id": "7b64938eebf6a881" + ] }, { "cell_type": "code", "execution_count": null, + "id": "4959c5737781149a", "metadata": {}, "outputs": [], "source": [ "from pyspark.sql.functions import udf\n", "from pyspark.sql.types import StringType\n", "\n", - "document_path = \"wasbs://public@synapseaisolutionsa.blob.core.windows.net/NASAEarth\" # path to your document\n", + "document_path = \"wasbs://publicwasb@mmlspark.blob.core.windows.net/NASAEarth\" # path to your document\n", "df = spark.read.format(\"binaryFile\").load(document_path).limit(10).cache()" - ], - "id": "4959c5737781149a" + ] }, { "cell_type": "markdown", + "id": "fd5b7e549b813d97", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -236,11 +237,11 @@ }, "source": [ "This code will read the PDF documents and create a Spark DataFrame named df with the contents of the PDFs. The DataFrame will have a schema that represents the structure of the PDF documents, including their textual content." - ], - "id": "fd5b7e549b813d97" + ] }, { "cell_type": "markdown", + "id": "28aa80718b187897", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -258,11 +259,11 @@ "\n", "\n", "" - ], - "id": "28aa80718b187897" + ] }, { "cell_type": "markdown", + "id": "f0d64237df1354a4", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -282,22 +283,22 @@ }, "source": [ "##### Display the raw data from the PDF documents" - ], - "id": "f0d64237df1354a4" + ] }, { "cell_type": "code", "execution_count": null, + "id": "393470a52e83b607", "metadata": {}, "outputs": [], "source": [ "# Show the dataframe without the content\n", "display(df.drop(\"content\"))" - ], - "id": "393470a52e83b607" + ] }, { "cell_type": "markdown", + "id": "d888040fcfdccd0a", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -317,11 +318,11 @@ }, "source": [ "### Step 3: Read the documents using Azure AI Document Intelligence." - ], - "id": "d888040fcfdccd0a" + ] }, { "cell_type": "markdown", + "id": "c46bf0a8029196f9", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -343,12 +344,12 @@ "We utilize [SynapseML](https://microsoft.github.io/SynapseML/), an ecosystem of tools designed to enhance the distributed computing framework [Apache Spark](https://github.com/apache/spark). SynapseML introduces advanced networking capabilities to the Spark ecosystem and offers user-friendly SparkML transformers for various [Azure AI Services](https://azure.microsoft.com/products/ai-services).\n", "\n", "Additionally, we employ AnalyzeDocument from Azure AI Services to extract the complete document content and present it in the designated columns called \"output_content\" and \"paragraph.\"" - ], - "id": "c46bf0a8029196f9" + ] }, { "cell_type": "code", "execution_count": null, + "id": "a198b14e6c20489d", "metadata": {}, "outputs": [], "source": [ @@ -372,11 +373,11 @@ " .withColumn(\"output_content\", col(\"result.analyzeResult.content\"))\n", " .withColumn(\"paragraphs\", col(\"result.analyzeResult.paragraphs\"))\n", ").cache()" - ], - "id": "a198b14e6c20489d" + ] }, { "cell_type": "markdown", + "id": "f2a40a2afcf95a9c", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -396,22 +397,22 @@ }, "source": [ "We can observe the analayzed Spark DataFrame named ```analyzed_df``` using the following code. Note that we drop the \"content\" column as it is not needed anymore." - ], - "id": "f2a40a2afcf95a9c" + ] }, { "cell_type": "code", "execution_count": null, + "id": "45d422f7dbc8de6d", "metadata": {}, "outputs": [], "source": [ "analyzed_df = analyzed_df.drop(\"content\")\n", "display(analyzed_df)" - ], - "id": "45d422f7dbc8de6d" + ] }, { "cell_type": "markdown", + "id": "6b8e05223a5f0953", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -431,11 +432,11 @@ }, "source": [ "### Step 4: Split the documents into chunks." - ], - "id": "6b8e05223a5f0953" + ] }, { "cell_type": "markdown", + "id": "5a5b64272230878c", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -455,12 +456,12 @@ }, "source": [ "After analyzing the document, we leverage SynapseML’s PageSplitter to divide the documents into smaller sections, which are subsequently stored in the “chunks” column. This allows for more granular representation and processing of the document content." - ], - "id": "5a5b64272230878c" + ] }, { "cell_type": "code", "execution_count": null, + "id": "34e3a57e64e81ba0", "metadata": {}, "outputs": [], "source": [ @@ -476,11 +477,11 @@ "\n", "splitted_df = ps.transform(analyzed_df)\n", "display(splitted_df)" - ], - "id": "34e3a57e64e81ba0" + ] }, { "cell_type": "markdown", + "id": "126d4367fa9bf899", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -500,12 +501,12 @@ }, "source": [ "Note that the chunks for each document are presented in a single row inside an array. In order to embed all the chunks in the following cells, we need to have each chunk in a separate row. To accomplish that, we first explode these arrays so there is only one chunk in each row, then filter the Spark DataFrame in order to only keep the path to the document and the chunk in a single row." - ], - "id": "126d4367fa9bf899" + ] }, { "cell_type": "code", "execution_count": null, + "id": "27dbf4dc20efc19e", "metadata": {}, "outputs": [], "source": [ @@ -517,11 +518,11 @@ " \"path\", \"chunk\"\n", ")\n", "display(exploded_df)" - ], - "id": "27dbf4dc20efc19e" + ] }, { "cell_type": "markdown", + "id": "12539a7efea29008", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -541,11 +542,11 @@ }, "source": [ "### Step 5: Generate Embeddings." - ], - "id": "12539a7efea29008" + ] }, { "cell_type": "markdown", + "id": "a512653b409a31b5", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -560,12 +561,12 @@ }, "source": [ "To produce embeddings for each chunk, we utilize both SynapseML and Azure OpenAI Service. By integrating the Azure OpenAI service with SynapseML, we can leverage the power of the Apache Spark distributed computing framework to process numerous prompts using the OpenAI service. This integration enables the SynapseML embedding client to generate embeddings in a distributed manner, enabling efficient processing of large volumes of data. If you're interested in applying large language models at a distributed scale using Azure OpenAI and Azure Synapse Analytics, you can refer to [this approach](https://microsoft.github.io/SynapseML/docs/Explore%20Algorithms/OpenAI/). For more detailed information on generating embeddings with Azure OpenAI, you can look [here]( https://learn.microsoft.com/azure/cognitive-services/openai/how-to/embeddings?tabs=console)." - ], - "id": "a512653b409a31b5" + ] }, { "cell_type": "code", "execution_count": null, + "id": "df686c1b62da8fde", "metadata": {}, "outputs": [], "source": [ @@ -584,11 +585,11 @@ "df_embeddings = embedding.transform(exploded_df)\n", "\n", "display(df_embeddings)" - ], - "id": "df686c1b62da8fde" + ] }, { "cell_type": "markdown", + "id": "35ebf08c47b1a6ff", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -608,11 +609,11 @@ }, "source": [ "### Step 6: Store the embeddings in Azure Cognitive Search Vector Store." - ], - "id": "35ebf08c47b1a6ff" + ] }, { "cell_type": "markdown", + "id": "9a5407b73888a5da", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -636,12 +637,12 @@ "Adding Chunked Documents and Embeddings: The second step involves adding the chunked documents, along with their corresponding embeddings, to the vector datastore. This allows for efficient storage and retrieval of the data using vector search capabilities.\n", "\n", "By following these steps, you can effectively store your chunked documents and their associated embeddings in the AzureCogSearch vector database, enabling seamless retrieval of relevant information through vector search functionality." - ], - "id": "9a5407b73888a5da" + ] }, { "cell_type": "code", "execution_count": null, + "id": "2a5f68ff786229b7", "metadata": {}, "outputs": [], "source": [ @@ -655,12 +656,12 @@ " ) # create index ID for ACS\n", " .withColumn(\"searchAction\", lit(\"upload\"))\n", ")" - ], - "id": "2a5f68ff786229b7" + ] }, { "cell_type": "code", "execution_count": null, + "id": "924f34c3e1612826", "metadata": {}, "outputs": [], "source": [ @@ -675,11 +676,11 @@ " keyCol=\"idx\",\n", " vectorCols=json.dumps([{\"name\": \"embeddings\", \"dimension\": 1536}]),\n", ")" - ], - "id": "924f34c3e1612826" + ] }, { "cell_type": "markdown", + "id": "12c516463ddf41f0", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -699,11 +700,11 @@ }, "source": [ "### Step 7: Ask a Question." - ], - "id": "12c516463ddf41f0" + ] }, { "cell_type": "markdown", + "id": "83f34faff00e6a43", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -723,23 +724,23 @@ }, "source": [ "After processing the document, we can proceed to pose a question. We will use [SynapseML](https://microsoft.github.io/SynapseML/docs/Explore%20Algorithms/OpenAI/Quickstart%20-%20OpenAI%20Embedding/) to convert the user's question into an embedding and then utilize cosine similarity to retrieve the top K document chunks that closely match the user's question. It's worth mentioning that alternative similarity metrics can also be employed." - ], - "id": "83f34faff00e6a43" + ] }, { "cell_type": "code", "execution_count": null, + "id": "6a8b461103d3c24e", "metadata": {}, "outputs": [], "source": [ "user_question = \"What did the astronaut Edgar Mitchell call Earth?\"\n", "retrieve_k = 2 # Retrieve the top 2 documents from vector database" - ], - "id": "6a8b461103d3c24e" + ] }, { "cell_type": "code", "execution_count": null, + "id": "34400fa63af3ca80", "metadata": {}, "outputs": [], "source": [ @@ -789,11 +790,11 @@ "# Generate embeddings for the question and retrieve the top k document chunks\n", "question_embedding = gen_question_embedding(user_question)\n", "output = retrieve_k_chunk(retrieve_k, question_embedding)" - ], - "id": "34400fa63af3ca80" + ] }, { "cell_type": "markdown", + "id": "7b14d74ade6d19d7", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -813,11 +814,11 @@ }, "source": [ "### Step 8: Respond to a User’s Question." - ], - "id": "7b14d74ade6d19d7" + ] }, { "cell_type": "markdown", + "id": "be48fedab0bc8e63", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -837,12 +838,12 @@ }, "source": [ "To provide a response to the user's question, we will utilize the [LangChain](https://python.langchain.com/en/latest/index.html) framework. With the LangChain framework we will augment the retrieved documents with respect to the user's question. Following this, we can request a response to the user's question from our framework." - ], - "id": "be48fedab0bc8e63" + ] }, { "cell_type": "code", "execution_count": null, + "id": "5c22f50db797d61d", "metadata": {}, "outputs": [], "source": [ @@ -856,11 +857,11 @@ "openai.api_base = aoai_endpoint\n", "openai.api_version = \"2022-12-01\"\n", "openai.api_key = aoai_key" - ], - "id": "5c22f50db797d61d" + ] }, { "cell_type": "markdown", + "id": "ca2c56887f7dd034", "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -875,12 +876,12 @@ }, "source": [ "We can now wrap up the Q&A journey by asking a question and checking the answer. You will see that Edgar Mitchell called Earth \"a sparkling blue and white jewel\"!" - ], - "id": "ca2c56887f7dd034" + ] }, { "cell_type": "code", "execution_count": null, + "id": "bf9fb76f9bd16298", "metadata": {}, "outputs": [], "source": [ @@ -920,8 +921,7 @@ "answer = qa_chain.run({\"context\": context, \"query\": user_question})\n", "\n", "print(answer)" - ], - "id": "bf9fb76f9bd16298" + ] } ], "metadata": { diff --git a/docs/Explore Algorithms/OpenAI/Langchain.ipynb b/docs/Explore Algorithms/OpenAI/Langchain.ipynb index 64579c2d70..78cebd956a 100644 --- a/docs/Explore Algorithms/OpenAI/Langchain.ipynb +++ b/docs/Explore Algorithms/OpenAI/Langchain.ipynb @@ -162,9 +162,9 @@ "outputs": [], "source": [ "openai_api_key = find_secret(\n", - " secret_name=\"openai-api-key\", keyvault=\"mmlspark-build-keys\"\n", + " secret_name=\"openai-api-key-2\", keyvault=\"mmlspark-build-keys\"\n", ")\n", - "openai_api_base = \"https://synapseml-openai.openai.azure.com/\"\n", + "openai_api_base = \"https://synapseml-openai-2.openai.azure.com/\"\n", "openai_api_version = \"2022-12-01\"\n", "openai_api_type = \"azure\"\n", "deployment_name = \"text-davinci-003\"\n", diff --git a/docs/Explore Algorithms/OpenAI/OpenAI.ipynb b/docs/Explore Algorithms/OpenAI/OpenAI.ipynb index 531a376143..efccf13565 100644 --- a/docs/Explore Algorithms/OpenAI/OpenAI.ipynb +++ b/docs/Explore Algorithms/OpenAI/OpenAI.ipynb @@ -88,12 +88,12 @@ "\n", "# Fill in the following lines with your service information\n", "# Learn more about selecting which embedding model to choose: https://openai.com/blog/new-and-improved-embedding-model\n", - "service_name = \"synapseml-openai\"\n", + "service_name = \"synapseml-openai-2\"\n", "deployment_name = \"gpt-35-turbo\"\n", "deployment_name_embeddings = \"text-embedding-ada-002\"\n", "\n", "key = find_secret(\n", - " secret_name=\"openai-api-key\", keyvault=\"mmlspark-build-keys\"\n", + " secret_name=\"openai-api-key-2\", keyvault=\"mmlspark-build-keys\"\n", ") # please replace this line with your key as a string\n", "\n", "assert key is not None and service_name is not None" diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding and GPU based KNN.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding and GPU based KNN.ipynb index 5b4f9cc3c1..6e90974a48 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding and GPU based KNN.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding and GPU based KNN.ipynb @@ -67,11 +67,11 @@ "\n", "# Fill in the following lines with your service information\n", "# Learn more about selecting which embedding model to choose: https://openai.com/blog/new-and-improved-embedding-model\n", - "service_name = \"synapseml-openai\"\n", + "service_name = \"synapseml-openai-2\"\n", "deployment_name_embeddings = \"text-embedding-ada-002\"\n", "\n", "key = find_secret(\n", - " secret_name=\"openai-api-key\", keyvault=\"mmlspark-build-keys\"\n", + " secret_name=\"openai-api-key-2\", keyvault=\"mmlspark-build-keys\"\n", ") # please replace this with your key as a string\n", "\n", "assert key is not None and service_name is not None" diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding.ipynb index 96d46b7824..6b973bab22 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - OpenAI Embedding.ipynb @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -63,11 +63,11 @@ "\n", "# Fill in the following lines with your service information\n", "# Learn more about selecting which embedding model to choose: https://openai.com/blog/new-and-improved-embedding-model\n", - "service_name = \"synapseml-openai\"\n", + "service_name = \"synapseml-openai-2\"\n", "deployment_name_embeddings = \"text-embedding-ada-002\"\n", "\n", "key = find_secret(\n", - " secret_name=\"openai-api-key\", keyvault=\"mmlspark-build-keys\"\n", + " secret_name=\"openai-api-key-2\", keyvault=\"mmlspark-build-keys\"\n", ") # please replace this with your key as a string\n", "\n", "assert key is not None and service_name is not None" @@ -95,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -148,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -200,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -250,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -307,7 +307,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -357,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -407,7 +407,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { diff --git a/docs/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.ipynb b/docs/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.ipynb index d2b58e36e2..e20786e3e0 100644 --- a/docs/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.ipynb +++ b/docs/Explore Algorithms/OpenAI/Quickstart - Understand and Search Forms.ipynb @@ -97,9 +97,9 @@ "search_index = \"form-demo-index-5\"\n", "\n", "openai_key = find_secret(\n", - " secret_name=\"openai-api-key\", keyvault=\"mmlspark-build-keys\"\n", + " secret_name=\"openai-api-key-2\", keyvault=\"mmlspark-build-keys\"\n", ") # Replace the call to find_secret with your key as a python string.\n", - "openai_service_name = \"synapseml-openai\"\n", + "openai_service_name = \"synapseml-openai-2\"\n", "openai_deployment_name = \"gpt-35-turbo\"\n", "openai_url = f\"https://{openai_service_name}.openai.azure.com/\"" ] @@ -157,7 +157,7 @@ "\n", "df2 = (\n", " spark.read.format(\"binaryFile\")\n", - " .load(\"wasbs://ignite2021@mmlsparkdemo.blob.core.windows.net/form_subset/*\")\n", + " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/form_subset/*\")\n", " .select(\"path\")\n", " .limit(10)\n", " .select(udf(blob_to_url, StringType())(\"path\").alias(\"url\"))\n", @@ -189,7 +189,7 @@ ] }, "source": [ - "" + "" ] }, {