diff --git a/.github/workflows/samples_flows_chat_chat_math_variant.yml b/.github/workflows/samples_flows_chat_chat_math_variant.yml new file mode 100644 index 00000000000..3c9ca7d6e4e --- /dev/null +++ b/.github/workflows/samples_flows_chat_chat_math_variant.yml @@ -0,0 +1,92 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: samples_flows_chat_chat_math_variant +on: + schedule: + - cron: "35 20 * * *" # Every day starting at 4:35 BJT + pull_request: + branches: [ main ] + paths: [ examples/flows/chat/chat-math-variant/**, examples/*requirements.txt, .github/workflows/samples_flows_chat_chat_math_variant.yml ] + workflow_dispatch: + +env: + IS_IN_CI_PIPELINE: "true" + +jobs: + samples_readme_ci: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Setup Python 3.9 environment + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Generate config.json + run: echo '${{ secrets.TEST_WORKSPACE_CONFIG_JSON_CANARY }}' > ${{ github.workspace }}/examples/config.json + - name: Prepare requirements + working-directory: examples + run: | + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi + - name: Prepare dev requirements + working-directory: examples + run: | + python -m pip install --upgrade pip + pip install -r dev_requirements.txt + - name: Refine .env file + working-directory: examples/flows/chat/chat-math-variant + run: | + AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} + AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + AOAI_API_ENDPOINT=$(echo ${AOAI_API_ENDPOINT//\//\\/}) + if [[ -e .env.example ]]; then + echo "env replacement" + sed -i -e "s//$AOAI_API_KEY/g" -e "s//$AOAI_API_ENDPOINT/g" .env.example + mv .env.example .env + fi + - name: 
Create run.yml + working-directory: examples/flows/chat/chat-math-variant + run: | + gpt_base=${{ secrets.AOAI_API_ENDPOINT_TEST }} + gpt_base=$(echo ${gpt_base//\//\\/}) + if [[ -e run.yml ]]; then + sed -i -e "s/\${azure_open_ai_connection.api_key}/${{ secrets.AOAI_API_KEY_TEST }}/g" -e "s/\${azure_open_ai_connection.api_base}/$gpt_base/g" run.yml + fi + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Extract Steps examples/flows/chat/chat-math-variant/README.md + working-directory: ${{ github.workspace }} + run: | + python scripts/readme/extract_steps_from_readme.py -f examples/flows/chat/chat-math-variant/README.md -o examples/flows/chat/chat-math-variant + - name: Cat script + working-directory: examples/flows/chat/chat-math-variant + run: | + cat bash_script.sh + - name: Run scripts + working-directory: examples/flows/chat/chat-math-variant + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME_CANARY }} + bash bash_script.sh + - name: Pip List for Debug + if : ${{ always() }} + working-directory: examples/flows/chat/chat-math-variant + run: | + pip list + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: artifact + path: examples/flows/chat/chat-math-variant/bash_script.sh \ No newline at end of file diff --git a/.github/workflows/samples_flows_evaluation_eval_chat_math.yml b/.github/workflows/samples_flows_evaluation_eval_chat_math.yml new file mode 100644 index 00000000000..21d1ef1d734 --- /dev/null +++ b/.github/workflows/samples_flows_evaluation_eval_chat_math.yml @@ -0,0 +1,92 @@ +# This code is autogenerated. 
+# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: samples_flows_evaluation_eval_chat_math +on: + schedule: + - cron: "52 19 * * *" # Every day starting at 3:52 BJT + pull_request: + branches: [ main ] + paths: [ examples/flows/evaluation/eval-chat-math/**, examples/*requirements.txt, .github/workflows/samples_flows_evaluation_eval_chat_math.yml ] + workflow_dispatch: + +env: + IS_IN_CI_PIPELINE: "true" + +jobs: + samples_readme_ci: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Setup Python 3.9 environment + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Generate config.json + run: echo '${{ secrets.TEST_WORKSPACE_CONFIG_JSON_CANARY }}' > ${{ github.workspace }}/examples/config.json + - name: Prepare requirements + working-directory: examples + run: | + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi + - name: Prepare dev requirements + working-directory: examples + run: | + python -m pip install --upgrade pip + pip install -r dev_requirements.txt + - name: Refine .env file + working-directory: examples/flows/evaluation/eval-chat-math + run: | + AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} + AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + AOAI_API_ENDPOINT=$(echo ${AOAI_API_ENDPOINT//\//\\/}) + if [[ -e .env.example ]]; then + echo "env replacement" + sed -i -e "s//$AOAI_API_KEY/g" -e "s//$AOAI_API_ENDPOINT/g" .env.example + mv .env.example .env + fi + - name: Create run.yml + working-directory: examples/flows/evaluation/eval-chat-math + run: | + gpt_base=${{ secrets.AOAI_API_ENDPOINT_TEST }} + gpt_base=$(echo ${gpt_base//\//\\/}) + if [[ -e run.yml ]]; then + sed -i -e "s/\${azure_open_ai_connection.api_key}/${{ secrets.AOAI_API_KEY_TEST }}/g" -e 
"s/\${azure_open_ai_connection.api_base}/$gpt_base/g" run.yml + fi + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Extract Steps examples/flows/evaluation/eval-chat-math/README.md + working-directory: ${{ github.workspace }} + run: | + python scripts/readme/extract_steps_from_readme.py -f examples/flows/evaluation/eval-chat-math/README.md -o examples/flows/evaluation/eval-chat-math + - name: Cat script + working-directory: examples/flows/evaluation/eval-chat-math + run: | + cat bash_script.sh + - name: Run scripts + working-directory: examples/flows/evaluation/eval-chat-math + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME_CANARY }} + bash bash_script.sh + - name: Pip List for Debug + if : ${{ always() }} + working-directory: examples/flows/evaluation/eval-chat-math + run: | + pip list + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: artifact + path: examples/flows/evaluation/eval-chat-math/bash_script.sh \ No newline at end of file diff --git a/.github/workflows/samples_getstarted_flowasfunction.yml b/.github/workflows/samples_getstarted_flowasfunction.yml new file mode 100644 index 00000000000..ebb390996c0 --- /dev/null +++ b/.github/workflows/samples_getstarted_flowasfunction.yml @@ -0,0 +1,52 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. 
+ +name: samples_getstarted_flowasfunction +on: + schedule: + - cron: "17 21 * * *" # Every day starting at 5:17 BJT + pull_request: + branches: [ main ] + paths: [ examples/**, .github/workflows/samples_getstarted_flowasfunction.yml ] + workflow_dispatch: + +env: + IS_IN_CI_PIPELINE: "true" + +jobs: + samples_notebook_ci: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Generate config.json + run: echo '${{ secrets.TEST_WORKSPACE_CONFIG_JSON_CANARY }}' > ${{ github.workspace }}/examples/config.json + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Setup Python 3.9 environment + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Prepare requirements + run: | + python -m pip install --upgrade pip + pip install -r ${{ github.workspace }}/examples/requirements.txt + pip install -r ${{ github.workspace }}/examples/dev_requirements.txt + - name: Create Aoai Connection + run: pf connection create -f ${{ github.workspace }}/examples/connections/azure_openai.yml --set api_key="${{ secrets.AOAI_API_KEY_TEST }}" api_base="${{ secrets.AOAI_API_ENDPOINT_TEST }}" + - name: Create new Aoai Connection + run: pf connection create -f ${{ github.workspace }}/examples/connections/azure_openai.yml --set api_key="${{ secrets.AOAI_API_KEY_TEST }}" api_base="${{ secrets.AOAI_API_ENDPOINT_TEST }}" name=new_ai_connection + - name: Test Notebook + working-directory: examples/tutorials/get-started + run: | + papermill -k python flow-as-function.ipynb flow-as-function.output.ipynb + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: artifact + path: examples/tutorials/get-started diff --git a/.github/workflows/samples_tutorials_flow_fine_tuning_evaluation_promptflow_quality_improvement.yml b/.github/workflows/samples_tutorials_flow_fine_tuning_evaluation_promptflow_quality_improvement.yml new file mode 100644 index 
00000000000..23ce45edc03 --- /dev/null +++ b/.github/workflows/samples_tutorials_flow_fine_tuning_evaluation_promptflow_quality_improvement.yml @@ -0,0 +1,92 @@ +# This code is autogenerated. +# Code is generated by running custom script: python3 readme.py +# Any manual changes to this file may cause incorrect behavior. +# Any manual changes will be overwritten if the code is regenerated. + +name: samples_tutorials_flow_fine_tuning_evaluation_promptflow_quality_improvement +on: + schedule: + - cron: "15 19 * * *" # Every day starting at 3:15 BJT + pull_request: + branches: [ main ] + paths: [ examples/**, .github/workflows/samples_tutorials_flow_fine_tuning_evaluation_promptflow_quality_improvement.yml ] + workflow_dispatch: + +env: + IS_IN_CI_PIPELINE: "true" + +jobs: + samples_readme_ci: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + - name: Setup Python 3.9 environment + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Generate config.json + run: echo '${{ secrets.TEST_WORKSPACE_CONFIG_JSON_CANARY }}' > ${{ github.workspace }}/examples/config.json + - name: Prepare requirements + working-directory: examples + run: | + if [[ -e requirements.txt ]]; then + python -m pip install --upgrade pip + pip install -r requirements.txt + fi + - name: Prepare dev requirements + working-directory: examples + run: | + python -m pip install --upgrade pip + pip install -r dev_requirements.txt + - name: Refine .env file + working-directory: examples/tutorials/flow-fine-tuning-evaluation + run: | + AOAI_API_KEY=${{ secrets.AOAI_API_KEY_TEST }} + AOAI_API_ENDPOINT=${{ secrets.AOAI_API_ENDPOINT_TEST }} + AOAI_API_ENDPOINT=$(echo ${AOAI_API_ENDPOINT//\//\\/}) + if [[ -e .env.example ]]; then + echo "env replacement" + sed -i -e "s/<your_AOAI_key>/$AOAI_API_KEY/g" -e "s/<your_AOAI_endpoint>/$AOAI_API_ENDPOINT/g" .env.example + mv .env.example .env + fi + - name: Create run.yml + working-directory: examples/tutorials/flow-fine-tuning-evaluation + run: 
| + gpt_base=${{ secrets.AOAI_API_ENDPOINT_TEST }} + gpt_base=$(echo ${gpt_base//\//\\/}) + if [[ -e run.yml ]]; then + sed -i -e "s/\${azure_open_ai_connection.api_key}/${{ secrets.AOAI_API_KEY_TEST }}/g" -e "s/\${azure_open_ai_connection.api_base}/$gpt_base/g" run.yml + fi + - name: Azure Login + uses: azure/login@v1 + with: + creds: ${{ secrets.AZURE_CREDENTIALS }} + - name: Extract Steps examples/tutorials/flow-fine-tuning-evaluation/promptflow-quality-improvement.md + working-directory: ${{ github.workspace }} + run: | + python scripts/readme/extract_steps_from_readme.py -f examples/tutorials/flow-fine-tuning-evaluation/promptflow-quality-improvement.md -o examples/tutorials/flow-fine-tuning-evaluation + - name: Cat script + working-directory: examples/tutorials/flow-fine-tuning-evaluation + run: | + cat bash_script.sh + - name: Run scripts + working-directory: examples/tutorials/flow-fine-tuning-evaluation + run: | + export aoai_api_key=${{secrets.AOAI_API_KEY_TEST }} + export aoai_api_endpoint=${{ secrets.AOAI_API_ENDPOINT_TEST }} + export test_workspace_sub_id=${{ secrets.TEST_WORKSPACE_SUB_ID }} + export test_workspace_rg=${{ secrets.TEST_WORKSPACE_RG }} + export test_workspace_name=${{ secrets.TEST_WORKSPACE_NAME_CANARY }} + bash bash_script.sh + - name: Pip List for Debug + if : ${{ always() }} + working-directory: examples/tutorials/flow-fine-tuning-evaluation + run: | + pip list + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: artifact + path: examples/tutorials/flow-fine-tuning-evaluation/bash_script.sh \ No newline at end of file diff --git a/README.md b/README.md index 3b49415b570..9bfc595d4af 100644 --- a/README.md +++ b/README.md @@ -83,15 +83,19 @@ Interact with your chatbot by running: (press `Ctrl + C` to end the session) pf flow test --flow ./my_chatbot --interactive ``` +**Core value: ensuring "High Quality” from prototype to production** + +Explore our [**15-minute 
tutorial**](examples/tutorials/flow-fine-tuning-evaluation/promptflow-quality-improvement.md) that guides you through prompt tuning ➡ batch testing ➡ evaluation, all designed to ensure high quality ready for production. + Next Step! Continue with the **Tutorial** 👇 section to delve deeper into prompt flow. ## Tutorial 🏃‍♂️ -Prompt Flow is a tool designed to **build high quality LLM apps**, the development process in prompt flow follows these steps: develop a flow, improve the flow quality, deploy the flow to production. +Prompt flow is a tool designed to **build high quality LLM apps**, the development process in prompt flow follows these steps: develop a flow, improve the flow quality, deploy the flow to production. ### Develop your own LLM apps -#### VS Code Extensionlogo +#### VS Code Extension We also offer a VS Code extension (a flow designer) for an interactive flow development experience with UI. diff --git a/examples/README.md b/examples/README.md index 52cd261ada3..c6de3b10fc2 100644 --- a/examples/README.md +++ b/examples/README.md @@ -31,6 +31,7 @@ | [distribute-flow-as-executable-app](tutorials/flow-deploy/distribute-flow-as-executable-app/README.md) | [![samples_tutorials_flow_deploy_distribute_flow_as_executable_app](https://github.com/microsoft/promptflow/actions/workflows/samples_tutorials_flow_deploy_distribute_flow_as_executable_app.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_tutorials_flow_deploy_distribute_flow_as_executable_app.yml) | This example demos how to package flow as a executable app | | [docker](tutorials/flow-deploy/docker/README.md) | [![samples_tutorials_flow_deploy_docker](https://github.com/microsoft/promptflow/actions/workflows/samples_tutorials_flow_deploy_docker.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_tutorials_flow_deploy_docker.yml) | This example demos how to deploy flow as a docker app | | 
[kubernetes](tutorials/flow-deploy/kubernetes/README.md) | [![samples_tutorials_flow_deploy_kubernetes](https://github.com/microsoft/promptflow/actions/workflows/samples_tutorials_flow_deploy_kubernetes.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_tutorials_flow_deploy_kubernetes.yml) | This example demos how to deploy flow as a Kubernetes app | +| [promptflow-quality-improvement](tutorials/flow-fine-tuning-evaluation/promptflow-quality-improvement.md) | [![samples_tutorials_flow_fine_tuning_evaluation_promptflow_quality_improvement](https://github.com/microsoft/promptflow/actions/workflows/samples_tutorials_flow_fine_tuning_evaluation_promptflow_quality_improvement.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_tutorials_flow_fine_tuning_evaluation_promptflow_quality_improvement.yml) | This tutorial is designed to enhance your understanding of improving flow quality through prompt tuning and evaluation | ### Flows ([flows](flows)) @@ -59,6 +60,7 @@ | path | status | description | ------|--------|------------- | [eval-basic](flows/evaluation/eval-basic/README.md) | [![samples_flows_evaluation_eval_basic](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_basic.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_basic.yml) | This example shows how to create a basic evaluation flow | +| [eval-chat-math](flows/evaluation/eval-chat-math/README.md) | [![samples_flows_evaluation_eval_chat_math](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_chat_math.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_chat_math.yml) | This example shows how to evaluate the answer of math questions, which can compare the output results with the standard answers numerically | | 
[eval-classification-accuracy](flows/evaluation/eval-classification-accuracy/README.md) | [![samples_flows_evaluation_eval_classification_accuracy](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_classification_accuracy.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_classification_accuracy.yml) | This is a flow illustrating how to evaluate the performance of a classification system | | [eval-entity-match-rate](flows/evaluation/eval-entity-match-rate/README.md) | [![samples_flows_evaluation_eval_entity_match_rate](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_entity_match_rate.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_entity_match_rate.yml) | This is a flow evaluates: entity match rate | | [eval-groundedness](flows/evaluation/eval-groundedness/README.md) | [![samples_flows_evaluation_eval_groundedness](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_groundedness.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_evaluation_eval_groundedness.yml) | This is a flow leverage llm to eval groundedness: whether answer is stating facts that are all present in the given context | @@ -70,6 +72,7 @@ | path | status | description | ------|--------|------------- | [basic-chat](flows/chat/basic-chat/README.md) | [![samples_flows_chat_basic_chat](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_chat_basic_chat.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_chat_basic_chat.yml) | This example shows how to create a basic chat flow | +| [chat-math-variant](flows/chat/chat-math-variant/README.md) | 
[![samples_flows_chat_chat_math_variant](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_chat_chat_math_variant.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_chat_chat_math_variant.yml) | This is a prompt tuning case with 3 prompt variants for math question answering | | [chat-with-pdf](flows/chat/chat-with-pdf/README.md) | [![samples_flows_chat_chat_with_pdf](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_chat_chat_with_pdf.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_chat_chat_with_pdf.yml) | This is a simple flow that allow you to ask questions about the content of a PDF file and get answers | | [chat-with-wikipedia](flows/chat/chat-with-wikipedia/README.md) | [![samples_flows_chat_chat_with_wikipedia](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_chat_chat_with_wikipedia.yml/badge.svg?branch=main)](https://github.com/microsoft/promptflow/actions/workflows/samples_flows_chat_chat_with_wikipedia.yml) | This flow demonstrates how to create a chatbot that can remember previous interactions and use the conversation history to generate next message | @@ -119,4 +122,4 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope ## Reference -* [Promptflow documentation](https://microsoft.github.io/promptflow/) +* [Promptflow documentation](https://microsoft.github.io/promptflow/) \ No newline at end of file diff --git a/examples/dev_requirements.txt b/examples/dev_requirements.txt index 026b3ba448a..1c53472949d 100644 --- a/examples/dev_requirements.txt +++ b/examples/dev_requirements.txt @@ -3,6 +3,7 @@ ipython_genutils ipykernel papermill keyrings.alt +black==23.7.0 black-nb pypandoc # for markdown reader pypandoc_binary # pypandoc pandoc backend diff --git a/examples/flows/chat/chat-math-variant/.gitignore b/examples/flows/chat/chat-math-variant/.gitignore new 
file mode 100644 index 00000000000..61bd7254f60 --- /dev/null +++ b/examples/flows/chat/chat-math-variant/.gitignore @@ -0,0 +1,5 @@ +.env +__pycache__/ +.promptflow/* +!.promptflow/flow.tools.json +.runs/ diff --git a/examples/flows/chat/chat-math-variant/.promptflow/flow.tools.json b/examples/flows/chat/chat-math-variant/.promptflow/flow.tools.json new file mode 100644 index 00000000000..74f6d91c16c --- /dev/null +++ b/examples/flows/chat/chat-math-variant/.promptflow/flow.tools.json @@ -0,0 +1,65 @@ +{ + "package": {}, + "code": { + "chat.jinja2": { + "type": "llm", + "inputs": { + "chat_history": { + "type": [ + "string" + ] + }, + "question": { + "type": [ + "string" + ] + } + }, + "source": "chat.jinja2" + }, + "chat_variant_1.jinja2": { + "type": "llm", + "inputs": { + "chat_history": { + "type": [ + "string" + ] + }, + "question": { + "type": [ + "string" + ] + } + }, + "source": "chat_variant_1.jinja2" + }, + "chat_variant_2.jinja2": { + "type": "llm", + "inputs": { + "chat_history": { + "type": [ + "string" + ] + }, + "question": { + "type": [ + "string" + ] + } + }, + "source": "chat_variant_2.jinja2" + }, + "extract_result.py": { + "type": "python", + "inputs": { + "input1": { + "type": [ + "string" + ] + } + }, + "source": "extract_result.py", + "function": "my_python_tool" + } + } +} \ No newline at end of file diff --git a/examples/flows/chat/chat-math-variant/README.md b/examples/flows/chat/chat-math-variant/README.md new file mode 100644 index 00000000000..b225d01a05f --- /dev/null +++ b/examples/flows/chat/chat-math-variant/README.md @@ -0,0 +1,48 @@ +# Test your prompt variants for chat with math +This is a prompt tuning case with 3 prompt variants for math question answering. + +By utilizing this flow, in conjunction with the `evaluation/eval-chat-math` flow, you can quickly grasp the advantages of prompt tuning and experimentation with prompt flow. 
Here we provide a [video](https://www.youtube.com/watch?v=gcIe6nk2gA4) and a [tutorial](../../../tutorials/flow-fine-tuning-evaluation/promptflow-quality-improvement.md) for you to get started. + +Tools used in this flow: +- `llm` tool +- custom `python` tool + +## Prerequisites + +Install promptflow sdk and other dependencies in this folder: +```bash +pip install -r requirements.txt +``` + +## Getting started + +### 1 Create connection for LLM tool to use +Go to "Prompt flow" "Connections" tab. Click on "Create" button, select one of LLM tool supported connection types and fill in the configurations. + +Currently, there are two connection types supported by LLM tool: "AzureOpenAI" and "OpenAI". If you want to use "AzureOpenAI" connection type, you need to create an Azure OpenAI service first. Please refer to [Azure OpenAI Service](https://azure.microsoft.com/en-us/products/cognitive-services/openai-service/) for more details. If you want to use "OpenAI" connection type, you need to create an OpenAI account first. Please refer to [OpenAI](https://platform.openai.com/) for more details. + +```bash +# Override keys with --set to avoid yaml file changes +pf connection create --file ../../../connections/azure_openai.yml --set api_key=<your_api_key> api_base=<your_api_base> --name open_ai_connection +``` + +Note in [flow.dag.yaml](flow.dag.yaml) we are using connection named `open_ai_connection`. +```bash +# show registered connection +pf connection show --name open_ai_connection +``` + +### 2 Start chatting + +```bash +# run chat flow with default question in flow.dag.yaml +pf flow test --flow . + +# run chat flow with new question +pf flow test --flow . --inputs question="2+5=?" + +# start an interactive chat session in CLI +pf flow test --flow . --interactive + +# start an interactive chat session in CLI with verbose info +pf flow test --flow . 
--interactive --verbose \ No newline at end of file diff --git a/examples/flows/chat/chat-math-variant/chat.jinja2 b/examples/flows/chat/chat-math-variant/chat.jinja2 new file mode 100644 index 00000000000..b29be64b174 --- /dev/null +++ b/examples/flows/chat/chat-math-variant/chat.jinja2 @@ -0,0 +1,13 @@ +system: +You are an assistant to calculate the answer to the provided math problems. +Please return the final numerical answer only, without any accompanying reasoning or explanation. + +{% for item in chat_history %} +user: +{{item.inputs.question}} +assistant: +{{item.outputs.answer}} +{% endfor %} + +user: +{{question}} diff --git a/examples/flows/chat/chat-math-variant/chat_variant_1.jinja2 b/examples/flows/chat/chat-math-variant/chat_variant_1.jinja2 new file mode 100644 index 00000000000..d54532b39df --- /dev/null +++ b/examples/flows/chat/chat-math-variant/chat_variant_1.jinja2 @@ -0,0 +1,23 @@ +system: +You are an assistant to calculate the answer to the provided math problems. +Please think step by step. +Return the final numerical answer only and any accompanying reasoning or explanation seperately as json format. + +user: +A jar contains two red marbles, three green marbles, ten white marbles and no other marbles. Two marbles are randomly drawn from this jar without replacement. What is the probability that these two marbles drawn will both be red? Express your answer as a common fraction. +assistant: +{Chain of thought: "The total number of marbles is $2+3+10=15$. The probability that the first marble drawn will be red is $2/15$. Then, there will be one red left, out of 14. Therefore, the probability of drawing out two red marbles will be: $$\\frac{2}{15}\\cdot\\frac{1}{14}=\\boxed{\\frac{1}{105}}$$.", "answer": "1/105"} +user: +Find the greatest common divisor of $7!$ and $(5!)^2.$ +assistant: +{"Chain of thought": "$$ \\begin{array} 7! 
&=& 7 \\cdot 6 \\cdot 5 \\cdot 4 \\cdot 3 \\cdot 2 \\cdot 1 &=& 2^4 \\cdot 3^2 \\cdot 5^1 \\cdot 7^1 \\\\ (5!)^2 &=& (5 \\cdot 4 \\cdot 3 \\cdot 2 \\cdot 1)^2 &=& 2^6 \\cdot 3^2 \\cdot 5^2 \\\\ \\text{gcd}(7!, (5!)^2) &=& 2^4 \\cdot 3^2 \\cdot 5^1 &=& \\boxed{720} \\end{array} $$.", "answer": "720"} +{% for item in chat_history %} + +user: +{{item.inputs.question}} +assistant: +{{item.outputs.answer}} +{% endfor %} + +user: +{{question}} \ No newline at end of file diff --git a/examples/flows/chat/chat-math-variant/chat_variant_2.jinja2 b/examples/flows/chat/chat-math-variant/chat_variant_2.jinja2 new file mode 100644 index 00000000000..35c65a9624f --- /dev/null +++ b/examples/flows/chat/chat-math-variant/chat_variant_2.jinja2 @@ -0,0 +1,39 @@ +system: +You are an assistant to calculate the answer to the provided math problems. +Please think step by step. +Return the final numerical answer only and any accompanying reasoning or explanation seperately as json format. + +user: +A jar contains two red marbles, three green marbles, ten white marbles and no other marbles. Two marbles are randomly drawn from this jar without replacement. What is the probability that these two marbles drawn will both be red? Express your answer as a common fraction. +assistant: +{Chain of thought: "The total number of marbles is $2+3+10=15$. The probability that the first marble drawn will be red is $2/15$. Then, there will be one red left, out of 14. Therefore, the probability of drawing out two red marbles will be: $$\\frac{2}{15}\\cdot\\frac{1}{14}=\\boxed{\\frac{1}{105}}$$.", "answer": "1/105"} +user: +Find the greatest common divisor of $7!$ and $(5!)^2.$ +assistant: +{"Chain of thought": "$$ \\begin{array} 7! 
&=& 7 \\cdot 6 \\cdot 5 \\cdot 4 \\cdot 3 \\cdot 2 \\cdot 1 &=& 2^4 \\cdot 3^2 \\cdot 5^1 \\cdot 7^1 \\\\ (5!)^2 &=& (5 \\cdot 4 \\cdot 3 \\cdot 2 \\cdot 1)^2 &=& 2^6 \\cdot 3^2 \\cdot 5^2 \\\\ \\text{gcd}(7!, (5!)^2) &=& 2^4 \\cdot 3^2 \\cdot 5^1 &=& \\boxed{720} \\end{array} $$.", "answer": "720"} +user: +A club has 10 members, 5 boys and 5 girls. Two of the members are chosen at random. What is the probability that they are both girls? +assistant: +{"Chain of thought": "There are $\\binom{10}{2} = 45$ ways to choose two members of the group, and there are $\\binom{5}{2} = 10$ ways to choose two girls. Therefore, the probability that two members chosen at random are girls is $\\dfrac{10}{45} = \\boxed{\\dfrac{2}{9}}$.", "answer": "2/9"} +user: +Allison, Brian and Noah each have a 6-sided cube. All of the faces on Allison's cube have a 5. The faces on Brian's cube are numbered 1, 2, 3, 4, 5 and 6. Three of the faces on Noah's cube have a 2 and three of the faces have a 6. All three cubes are rolled. What is the probability that Allison's roll is greater than each of Brian's and Noah's? Express your answer as a common fraction. +assistant: +{"Chain of thought": "Since Allison will always roll a 5, we must calculate the probability that both Brian and Noah roll a 4 or lower. The probability of Brian rolling a 4 or lower is $\\frac{4}{6} = \\frac{2}{3}$ since Brian has a standard die. Noah, however, has a $\\frac{3}{6} = \\frac{1}{2}$ probability of rolling a 4 or lower, since the only way he can do so is by rolling one of his 3 sides that have a 2. So, the probability of both of these independent events occurring is $\\frac{2}{3} \\cdot \\frac{1}{2} = \\boxed{\\frac{1}{3}}$.", "answer": "1/3"} +user: +Compute $\\dbinom{50}{2}$. 
+assistant: +{"Chain of thought": "$\\dbinom{50}{2} = \\dfrac{50!}{2!48!}=\\dfrac{50\\times 49}{2\\times 1}=\\boxed{1225}.$", "answer": "1225"} +user: +The set $S = \\{1, 2, 3, \\ldots , 49, 50\\}$ contains the first $50$ positive integers. After the multiples of 2 and the multiples of 3 are removed, how many integers remain in the set $S$? +assistant: +{"Chain of thought": "The set $S$ contains $25$ multiples of 2 (that is, even numbers). When these are removed, the set $S$ is left with only the odd integers from 1 to 49. At this point, there are $50-25=25$ integers in $S$. We still need to remove the multiples of 3 from $S$.\n\nSince $S$ only contains odd integers after the multiples of 2 are removed, we must remove the odd multiples of 3 between 1 and 49. These are 3, 9, 15, 21, 27, 33, 39, 45, of which there are 8. Therefore, the number of integers remaining in the set $S$ is $25 - 8 = \\boxed{17}$.", "answer": "17"} +{% for item in chat_history %} + +user: +{{item.inputs.question}} +assistant: +{{item.outputs.answer}} +{% endfor %} + +user: +{{question}} diff --git a/examples/flows/chat/chat-math-variant/data.jsonl b/examples/flows/chat/chat-math-variant/data.jsonl new file mode 100644 index 00000000000..ab5b5a72c23 --- /dev/null +++ b/examples/flows/chat/chat-math-variant/data.jsonl @@ -0,0 +1,20 @@ +{"question": "Compute $\\dbinom{16}{5}$.", "answer": "4368", "raw_answer": "$\\dbinom{16}{5}=\\dfrac{16\\times 15\\times 14\\times 13\\times 12}{5\\times 4\\times 3\\times 2\\times 1}=\\boxed{4368}.$"} +{"question": "Determine the number of ways to arrange the letters of the word PROOF.", "answer": "60", "raw_answer": "There are two O's and five total letters, so the answer is $\\dfrac{5!}{2!} = \\boxed{60}$."} +{"question": "23 people attend a party. Each person shakes hands with at most 22 other people. 
What is the maximum possible number of handshakes, assuming that any two people can shake hands at most once?", "answer": "253", "raw_answer": "Note that if each person shakes hands with every other person, then the number of handshakes is maximized. There are $\\binom{23}{2} = \\frac{(23)(22)}{2} = (23)(11) = 230+23 = \\boxed{253}$ ways to choose two people to form a handshake."} +{"question": "James has 7 apples. 4 of them are red, and 3 of them are green. If he chooses 2 apples at random, what is the probability that both the apples he chooses are green?", "answer": "1/7", "raw_answer": "There are $\\binom{7}{2}=21$ total ways for James to choose 2 apples from 7, but only $\\binom{3}{2}=3$ ways for him to choose 2 green apples. So, the probability that he chooses 2 green apples is $\\frac{3}{21}=\\boxed{\\frac{1}{7}}$."} +{"question": "We are allowed to remove exactly one integer from the list $$-1,0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,$$and then we choose two distinct integers at random from the remaining list. What number should we remove if we wish to maximize the probability that the sum of the two chosen numbers is 10?", "answer": "5", "raw_answer": "For each integer $x$ in the list besides 5, the integer $10-x$ is also in the list. So, for each of these integers, removing $x$ reduces the number of pairs of distinct integers whose sum is 10. However, there is no other integer in list that can be added to 5 to give 10, so removing 5 from the list will not reduce the number of pairs of distinct integers whose sum is 10.\n\nSince removing any integer besides 5 will reduce the number of pairs that add to 10, while removing 5 will leave the number of pairs that add to 10 unchanged, we have the highest probability of having a sum of 10 when we remove $\\boxed{5}$."} +{"question": "The numbers 1 through 25 are written on 25 cards with one number on each card. Sara picks one of the 25 cards at random. 
What is the probability that the number on her card will be a multiple of 2 or 5? Express your answer as a common fraction.", "answer": "3/5", "raw_answer": "There are $12$ even numbers and $5$ multiples of $5$ in the range $1$ to $25$. However, we have double-counted $10$ and $20$, which are divisible by both $2$ and $5$. So the number of good outcomes is $12+5-2=15$ and the probability is $\\frac{15}{25}=\\boxed{\\frac{3}{5}}$."} +{"question": "A bag has 3 red marbles and 5 white marbles. Two marbles are drawn from the bag and not replaced. What is the probability that the first marble is red and the second marble is white?", "answer": "15/56", "raw_answer": "The probability that the first is red is $\\dfrac38$. Now with 7 remaining, the probability that the second is white is $\\dfrac57$. The answer is $\\dfrac38 \\times \\dfrac57 = \\boxed{\\dfrac{15}{56}}$."} +{"question": "Find the largest prime divisor of 11! + 12!", "answer": "13", "raw_answer": "Since $12! = 12 \\cdot 11!$, we can examine the sum better by factoring $11!$ out of both parts: $$ 11! + 12! = 11! + 12 \\cdot 11! = 11!(1 + 12) = 11! \\cdot 13. $$Since no prime greater than 11 divides $11!$, $\\boxed{13}$ is the largest prime factor of $11! + 12!$."} +{"question": "These two spinners are divided into thirds and quarters, respectively. If each of these spinners is spun once, what is the probability that the product of the results of the two spins will be an even number? 
Express your answer as a common fraction.\n\n[asy]\n\nsize(5cm,5cm);\n\ndraw(Circle((0,0),1));\n\ndraw(Circle((3,0),1));\n\ndraw((0,0)--(0,1));\n\ndraw((0,0)--(-0.9,-0.47));\n\ndraw((0,0)--(0.9,-0.47));\n\ndraw((2,0)--(4,0));\n\ndraw((3,1)--(3,-1));\n\nlabel(\"$3$\",(-0.5,0.3));\n\nlabel(\"$4$\",(0.5,0.3));\n\nlabel(\"$5$\",(0,-0.5));\n\nlabel(\"$5$\",(2.6,-0.4));\n\nlabel(\"$6$\",(2.6,0.4));\n\nlabel(\"$7$\",(3.4,0.4));\n\nlabel(\"$8$\",(3.4,-0.4));\n\ndraw((0,0)--(0.2,0.8),Arrow);\n\ndraw((3,0)--(3.2,0.8),Arrow);\n\n[/asy]", "answer": "2/3", "raw_answer": "We will subtract the probability that the product is odd from 1 to get the probability that the product is even. In order for the product to be odd, we must have both numbers be odd. There are $2\\cdot2=4$ possibilities for this (a 3 or 5 is spun on the left spinner and a 5 or 7 on the right) out of a total of $3\\cdot4=12$ possibilities, so the probability that the product is odd is $4/12=1/3$. The probability that the product is even is $1-1/3=\\boxed{\\frac{2}{3}}$."} +{"question": "No two students in Mrs. Vale's 26-student mathematics class have the same two initials. Each student's first name and last name begin with the same letter. If the letter ``Y'' is considered a vowel, what is the probability of randomly picking a student whose initials are vowels? Express your answer as a common fraction.", "answer": "3/13", "raw_answer": "The students' initials are AA, BB, CC, $\\cdots$, ZZ, representing all 26 letters. The vowels are A, E, I, O, U, and Y, which are 6 letters out of the possible 26. So the probability of picking a student whose initials are vowels is $\\frac{6}{26}=\\boxed{\\frac{3}{13}}$."} +{"question": "What is the expected value of the roll of a standard 6-sided die?", "answer": "3.5", "raw_answer": "Each outcome of rolling a 6-sided die has probability $\\frac16$, and the possible outcomes are 1, 2, 3, 4, 5, and 6. 
So the expected value is $$ \\frac16(1) + \\frac16(2) + \\frac16(3) + \\frac16(4) + \\frac16(5) + \\frac16(6) = \\frac{21}{6} = \\boxed{3.5}. $$"} +{"question": "How many positive divisors of 30! are prime?", "answer": "10", "raw_answer": "The only prime numbers that divide $30!$ are less than or equal to 30. So 2, 3, 5, 7, 11, 13, 17, 19, 23, 29 are primes that divide $30!$, and there are $\\boxed{10}$ of these."} +{"question": "Marius is entering a wildlife photo contest, and wishes to arrange his seven snow leopards of different heights in a row. If the shortest two leopards have inferiority complexes and demand to be placed at the ends of the row, how many ways can he line up the leopards?", "answer": "240", "raw_answer": "There are two ways to arrange the shortest two leopards. For the five remaining leopards, there are $5!$ ways to arrange them.\n\nTherefore, the answer is $2\\times5!=\\boxed{240\\text{ ways.}}$"} +{"question": "My school's math club has 6 boys and 8 girls. I need to select a team to send to the state math competition. We want 6 people on the team. In how many ways can I select the team without restrictions?", "answer": "3003", "raw_answer": "With no restrictions, we are merely picking 6 students out of 14. This is $\\binom{14}{6} = \\boxed{3003}$."} +{"question": "Nathan will roll two six-sided dice. What is the probability that he will roll a number less than three on the first die and a number greater than three on the second die? Express your answer as a common fraction.", "answer": "1/6", "raw_answer": "For the first die to be less than three, it must be a 1 or a 2, which occurs with probability $\\frac{1}{3}$. For the second die to be greater than 3, it must be a 4 or a 5 or a 6, which occurs with probability $\\frac{1}{2}$. The probability of both of these events occuring, as they are independent, is $\\frac{1}{3} \\cdot \\frac{1}{2} = \\boxed{\\frac{1}{6}}$."} +{"question": "A Senate committee has 8 Republicans and 6 Democrats. 
In how many ways can we form a subcommittee with 3 Republicans and 2 Democrats?", "answer": "840", "raw_answer": "There are 8 Republicans and 3 spots for them, so there are $\\binom{8}{3} = 56$ ways to choose the Republicans. There are 6 Democrats and 2 spots for them, so there are $\\binom{6}{2} = 15$ ways to choose the Democrats. So there are $56 \\times 15 = \\boxed{840}$ ways to choose the subcommittee."} +{"question": "How many different positive, four-digit integers can be formed using the digits 2, 2, 9 and 9?", "answer": "6", "raw_answer": "We could go ahead and count these directly, but instead we could count in general and then correct for overcounting. That is, if we had 4 distinct digits, there would be $4! = 24$ orderings. However, we must divide by 2! once for the repetition of the digit 2, and divide by 2! for the repetition of the digit 9 (this should make sense because if the repeated digit were different we would have twice as many orderings). So, our answer is $\\frac{4!}{2!\\cdot 2!} = 2 \\cdot 3 = \\boxed{6}$."} +{"question": "I won a trip for four to the Super Bowl. I can bring three of my friends. I have 8 friends. In how many ways can I form my Super Bowl party?", "answer": "56", "raw_answer": "Order does not matter, so it is a combination. Choosing $3$ out of $8$ is $\\binom{8}{3}=\\boxed{56}.$"} +{"question": "Determine the number of ways to arrange the letters of the word MADAM.", "answer": "30", "raw_answer": "First we count the arrangements if all the letters are unique, which is $5!$. Then since the M's and the A's are not unique, we divide by $2!$ twice for the arrangements of M's and the arrangements of A's, for an answer of $\\dfrac{5!}{2! \\times 2!} = \\boxed{30}$."} +{"question": "A palindrome is a number that reads the same forwards and backwards, such as 3003. 
How many positive four-digit integers are palindromes?", "answer": "90", "raw_answer": "Constructing palindromes requires that we choose the thousands digit (which defines the units digit) and the hundreds digit (which defines the tens digit). Since there are 9 choices for the thousands digit, and 10 choices for the hundreds digit, creating $9 \\cdot 10 = \\boxed{90}$ palindromes."} \ No newline at end of file diff --git a/examples/flows/chat/chat-math-variant/extract_result.py b/examples/flows/chat/chat-math-variant/extract_result.py new file mode 100644 index 00000000000..5ce86dd507a --- /dev/null +++ b/examples/flows/chat/chat-math-variant/extract_result.py @@ -0,0 +1,19 @@ +from promptflow import tool +import json +import re + +# The inputs section will change based on the arguments of the tool function, after you save the code +# Adding type to arguments and return value will help the system show the types properly +# Please update the function name/signature per need + + +@tool +def my_python_tool(input1: str) -> str: + input1 = re.sub(r'[$\\!]', '', input1) + try: + json_answer = json.loads(input1) + answer = json_answer['answer'] + except Exception: + answer = input1 + + return answer diff --git a/examples/flows/chat/chat-math-variant/flow.dag.yaml b/examples/flows/chat/chat-math-variant/flow.dag.yaml new file mode 100644 index 00000000000..ba02e6a021d --- /dev/null +++ b/examples/flows/chat/chat-math-variant/flow.dag.yaml @@ -0,0 +1,75 @@ +environment: + python_requirements_txt: requirements.txt +inputs: + chat_history: + type: list + is_chat_history: true + default: [] + question: + type: string + is_chat_input: true + default: '1+1=?' 
+outputs: + answer: + type: string + reference: ${extract_result.output} + is_chat_output: true +nodes: +- name: chat + use_variants: true +- name: extract_result + type: python + source: + type: code + path: extract_result.py + inputs: + input1: ${chat.output} +node_variants: + chat: + default_variant_id: variant_0 + variants: + variant_0: + node: + type: llm + source: + type: code + path: chat.jinja2 + inputs: + deployment_name: gpt-4 + max_tokens: 256 + temperature: 0 + chat_history: ${inputs.chat_history} + question: ${inputs.question} + model: gpt-4 + connection: open_ai_connection + api: chat + variant_1: + node: + type: llm + source: + type: code + path: chat_variant_1.jinja2 + inputs: + deployment_name: gpt-4 + max_tokens: 256 + temperature: 0 + chat_history: ${inputs.chat_history} + question: ${inputs.question} + model: gpt-4 + connection: open_ai_connection + api: chat + variant_2: + node: + type: llm + source: + type: code + path: chat_variant_2.jinja2 + inputs: + deployment_name: gpt-4 + max_tokens: 256 + temperature: 0 + chat_history: ${inputs.chat_history} + question: ${inputs.question} + model: gpt-4 + connection: open_ai_connection + api: chat diff --git a/examples/flows/chat/chat-math-variant/requirements.txt b/examples/flows/chat/chat-math-variant/requirements.txt new file mode 100644 index 00000000000..34d068f5f1c --- /dev/null +++ b/examples/flows/chat/chat-math-variant/requirements.txt @@ -0,0 +1,2 @@ +promptflow +promptflow-tools \ No newline at end of file diff --git a/examples/flows/evaluation/eval-chat-math/README.md b/examples/flows/evaluation/eval-chat-math/README.md new file mode 100644 index 00000000000..2d29340ab86 --- /dev/null +++ b/examples/flows/evaluation/eval-chat-math/README.md @@ -0,0 +1,36 @@ +# Eval chat math + +This example shows how to evaluate the answer of math questions, which can compare the output results with the standard answers numerically. 
+ +Learn more on corresponding [tutorials](../../../tutorials/flow-fine-tuning-evaluation/promptflow-quality-improvement.md) + +Tools used in this flow: +- `python` tool + +## Prerequisites + +Install promptflow sdk and other dependencies in this folder: +```bash +pip install -r requirements.txt +``` + +### 1. Test flow with single line data + +Testing flow/node: +```bash +# test with default input value in flow.dag.yaml +pf flow test --flow . + +# test with flow inputs +pf flow test --flow . --inputs groundtruth=123 prediction=123 + +# test node with inputs +pf flow test --flow . --node line_process --inputs groundtruth=123 prediction=123 +``` + +### 2. create flow run with multi line data +There are two ways to evaluate an classification flow. + +```bash +pf run create --flow . --data ./data.jsonl --stream +``` \ No newline at end of file diff --git a/examples/flows/evaluation/eval-chat-math/aggregate.py b/examples/flows/evaluation/eval-chat-math/aggregate.py new file mode 100644 index 00000000000..537fd66cf53 --- /dev/null +++ b/examples/flows/evaluation/eval-chat-math/aggregate.py @@ -0,0 +1,37 @@ +from typing import List +from promptflow import tool +from promptflow import log_metric + + +@tool +def accuracy_aggregate(processed_results: List[int]): + + num_exception = 0 + num_correct = 0 + + for i in range(len(processed_results)): + if processed_results[i] == -1: + num_exception += 1 + elif processed_results[i] == 1: + num_correct += 1 + + num_total = len(processed_results) + accuracy = round(1.0 * num_correct / num_total, 2) + error_rate = round(1.0 * num_exception / num_total, 2) + + log_metric(key="accuracy", value=accuracy) + log_metric(key="error_rate", value=error_rate) + + return { + "num_total": num_total, + "num_correct": num_correct, + "num_exception": num_exception, + "accuracy": accuracy, + "error_rate": error_rate + } + + +if __name__ == "__main__": + numbers = [1, 1, 1, 1, 0, -1, -1] + accuracy = accuracy_aggregate(numbers) + print("The accuracy 
is", accuracy) diff --git a/examples/flows/evaluation/eval-chat-math/data.jsonl b/examples/flows/evaluation/eval-chat-math/data.jsonl new file mode 100644 index 00000000000..d1e184daf2b --- /dev/null +++ b/examples/flows/evaluation/eval-chat-math/data.jsonl @@ -0,0 +1,3 @@ +{"groundtruth": "10","prediction": "10"} +{"groundtruth": "253","prediction": "506"} +{"groundtruth": "1/3","prediction": "2/6"} \ No newline at end of file diff --git a/examples/flows/evaluation/eval-chat-math/flow.dag.yaml b/examples/flows/evaluation/eval-chat-math/flow.dag.yaml new file mode 100644 index 00000000000..d7e9a591ae3 --- /dev/null +++ b/examples/flows/evaluation/eval-chat-math/flow.dag.yaml @@ -0,0 +1,34 @@ +inputs: + groundtruth: + type: string + default: "10" + is_chat_input: false + prediction: + type: string + default: "10" + is_chat_input: false +outputs: + score: + type: string + reference: ${line_process.output} +nodes: +- name: line_process + type: python + source: + type: code + path: line_process.py + inputs: + groundtruth: ${inputs.groundtruth} + prediction: ${inputs.prediction} + use_variants: false +- name: aggregate + type: python + source: + type: code + path: aggregate.py + inputs: + processed_results: ${line_process.output} + aggregation: true + use_variants: false +node_variants: {} +$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json diff --git a/examples/flows/evaluation/eval-chat-math/line_process.py b/examples/flows/evaluation/eval-chat-math/line_process.py new file mode 100644 index 00000000000..454049e91af --- /dev/null +++ b/examples/flows/evaluation/eval-chat-math/line_process.py @@ -0,0 +1,55 @@ +from promptflow import tool + + +def string_to_number(raw_string: str) -> float: + ''' Try to parse the prediction string and groundtruth string to float number. + Support parse int, float, fraction and recognize non-numeric string with wrong format. 
+ Wrong format cases: 'the answer is \box{2/3}', '0, 5, or any number greater than 11', '4/7//9' + ''' + float_number = 0.0 + try: + float_number = float(raw_string) + except Exception: + if '/' in raw_string: + split_list = raw_string.split('/') + if len(split_list) == 2: + numerator, denominator = split_list + try: + float_number = float(numerator) / float(denominator) + except Exception: + return None + else: + return None + else: + return None + return float_number + + +@tool +def line_process(groundtruth: str, prediction: str) -> int: + pred_float = string_to_number(prediction) + '''Early stop''' + if (pred_float is None): + return -1 + gt_float = string_to_number(groundtruth) + if (gt_float is None): + return -1 + ''' both pred_float and gt_float are valid''' + if round(pred_float, 10) == round(gt_float, 10): + return 1 + else: + return -1 + + +if __name__ == "__main__": + processed_result = line_process("3/5", "6/10") + print("The processed result is", processed_result) + + processed_result = line_process("1/2", "0.5") + print("The processed result is", processed_result) + + processed_result = line_process("3", "5") + print("The processed result is", processed_result) + + processed_result = line_process("2/3", "the answer is \box{2/3}") + print("The processed result is", processed_result) diff --git a/examples/flows/evaluation/eval-chat-math/requirements.txt b/examples/flows/evaluation/eval-chat-math/requirements.txt new file mode 100644 index 00000000000..34d068f5f1c --- /dev/null +++ b/examples/flows/evaluation/eval-chat-math/requirements.txt @@ -0,0 +1,2 @@ +promptflow +promptflow-tools \ No newline at end of file diff --git a/examples/flows/standard/customer-intent-extraction/.promptflow/flow.tools.json b/examples/flows/standard/customer-intent-extraction/.promptflow/flow.tools.json index 45e1b31006c..a50b08fa0dd 100644 --- a/examples/flows/standard/customer-intent-extraction/.promptflow/flow.tools.json +++ 
b/examples/flows/standard/customer-intent-extraction/.promptflow/flow.tools.json @@ -1,38 +1,54 @@ { - "package": {}, - "code": { + "package": {}, + "code": { + "chat_prompt": { + "type": "prompt", + "inputs": { + "customer_info": { + "type": [ + "string" + ] + }, + "chat_history": { + "type": [ + "string" + ] + } + }, + "source": "user_intent_zero_shot.jinja2" + }, + "extract_intent_tool.py": { + "type": "python", + "inputs": { "chat_prompt": { - "type": "prompt", - "inputs": { - "customer_info": { - "type": [ - "string" - ] - }, - "chat_history": { - "type": [ - "string" - ] - } - }, - "source": "user_intent_zero_shot.jinja2" + "type": [ + "object" + ] + }, + "connection": { + "type": [ + "CustomConnection" + ] + } + }, + "source": "extract_intent_tool.py", + "function": "extract_intent_tool" + }, + "user_intent_zero_shot.jinja2": { + "type": "prompt", + "inputs": { + "customer_info": { + "type": [ + "string" + ] }, - "extract_intent_tool.py": { - "type": "python", - "inputs": { - "chat_prompt": { - "type": [ - "string" - ] - }, - "connection": { - "type": [ - "CustomConnection" - ] - } - }, - "function": "extract_intent_tool", - "source": "extract_intent_tool.py" + "history": { + "type": [ + "string" + ] } + }, + "source": "user_intent_zero_shot.jinja2" } + } } \ No newline at end of file diff --git a/examples/tutorials/flow-fine-tuning-evaluation/media/Screenshot-video.png b/examples/tutorials/flow-fine-tuning-evaluation/media/Screenshot-video.png new file mode 100644 index 00000000000..3ef8e124962 Binary files /dev/null and b/examples/tutorials/flow-fine-tuning-evaluation/media/Screenshot-video.png differ diff --git a/examples/tutorials/flow-fine-tuning-evaluation/media/realcase.png b/examples/tutorials/flow-fine-tuning-evaluation/media/realcase.png new file mode 100644 index 00000000000..18d9d2613c5 Binary files /dev/null and b/examples/tutorials/flow-fine-tuning-evaluation/media/realcase.png differ diff --git 
a/examples/tutorials/flow-fine-tuning-evaluation/media/visualization_chat_flow.png b/examples/tutorials/flow-fine-tuning-evaluation/media/visualization_chat_flow.png new file mode 100644 index 00000000000..6c17086f593 Binary files /dev/null and b/examples/tutorials/flow-fine-tuning-evaluation/media/visualization_chat_flow.png differ diff --git a/examples/tutorials/flow-fine-tuning-evaluation/media/visualization_eval_flow.png b/examples/tutorials/flow-fine-tuning-evaluation/media/visualization_eval_flow.png new file mode 100644 index 00000000000..49037815ef7 Binary files /dev/null and b/examples/tutorials/flow-fine-tuning-evaluation/media/visualization_eval_flow.png differ diff --git a/examples/tutorials/flow-fine-tuning-evaluation/promptflow-quality-improvement.md b/examples/tutorials/flow-fine-tuning-evaluation/promptflow-quality-improvement.md new file mode 100644 index 00000000000..4f0be44c38e --- /dev/null +++ b/examples/tutorials/flow-fine-tuning-evaluation/promptflow-quality-improvement.md @@ -0,0 +1,496 @@ +# Tutorial: How prompt flow helps on quality improvement + +This tutorial is designed to enhance your understanding of improving flow quality through prompt tuning and evaluation. + +Embark on a journey to overcome the inherent randomness of Language Models (LLMs) and enhance output reliability through **prompt fine-tuning** with this comprehensive tutorial. Explore how prompt flow can simplify this process, enabling you to swiftly build high-quality, LLM-native apps. + +Prompt fine-tuning involves optimizing the input prompts given to an LLM. This strategic adjustment helps the model to focus on specific information needed for a task, thereby improving the accuracy and reliability of the LLM's responses. + +When we talk about "high quality", it's not just about accuracy. It's equally important to strike a balance between the accuracy and the token cost of the LLM. 
Spend just 15 minutes with us to discover how prompt flow expedites the process of prompt tuning, testing, and evaluation, guiding you towards finding the ideal prompt **(accuracy ↑,token ↓)** + +comparison result + +## Video tutorial + +Before practicing, you can watch the video for a quick understand. This video shows how to use the **prompt flow VS code extension** to develop your chat flow, fine tune the prompt, batch test the flow, and evaluate the quality. + + + +## Hands-on practice + +* Option 1 - VS Code Extension: [Install the prompt flow extension](https://marketplace.visualstudio.com/items?itemName=prompt-flow.prompt-flow) in VS Code and follow the [video tutorial](https://youtu.be/gcIe6nk2gA4) above for a guided practice. +* Option 2 - CLI:Follow the steps below to gain hands-on experience with the prompt flow CLI. + +It's time to put theory into practice! Execute our sample and witness the effects. + +### Prerequisite + +Before moving ahead, ensure you've completed the [Quick Start](../../../README.md#get-started-with-prompt-flow-⚡) guidance. Ensure you have the following setup: +* [Install prompt flow](../../../README.md#installation) +* [Setup a connection for your API key](../../../README.md#quick-start-⚡) + +> ℹ️ For testing quickly, this tutorial uses CLI command. + +Clone the promptflow repository to your local machine: + +```shell +git clone https://github.com/microsoft/promptflow.git +``` + +Setup sample `open_ai_connection` connection +```bash +# Override keys with --set to avoid yaml file changes +pf connection create --file ../../connections/azure_openai.yml --set api_key= api_base= --name open_ai_connection +``` + +Next, let's get started with customizing the flow for a specific task. 
+ +### Customize the flow for a specific task + +In the `promptflow/examples/flows/chat` folder, you can see a `basic-chat` folder, which represents a chat template flow as same as the one you created in the [Quick Start](../../../README.md#get-started-with-prompt-flow-⚡) guidance. We'll use this flow as a starting point to build a math problem solver. + +```bash +cd ../../flows/chat/basic-chat/ +``` + +To enable your chatbot flow to solve math problems, you need to instruct the LLM about the task and target in the prompt. Open `chat.jinja2`, update the prompt as below: + +```jinja +system: +You are an assistant to calculate the answer to the provided math problems. +Please return the final numerical answer only, without any accompanying reasoning or explanation. + +{% for item in chat_history %} +user: +{{item.inputs.question}} +assistant: +{{item.outputs.answer}} +{% endfor %} + +user: +{{question}} +``` + +Before run, check your connection settings in `flow.dag.yaml` file. The default connection name is `open_ai_connection`, and the default model is `gpt-3.5-turbo`. If you have a different connection name or model, please modify the `flow.dag.yaml` file accordingly. + +>
+>(click to toggle details) For example, if you use Azure Open AI, please modify the `flow.dag.yaml` file to specify your connection and deployment +> +> Replace the 'node:' section with following content, specify the 'connection_name' to your Azure Open AI connection, and specify the 'deployment_name' to the model deployment you'd like to use. +> ```yaml +>nodes: +>- name: chat +> type: llm +> source: +> type: code +> path: chat.jinja2 +> inputs: +> deployment_name: #specify your deployment name +> max_tokens: '256' +> temperature: '0' +> chat_history: ${inputs.chat_history} +> question: ${inputs.question} +> api: chat +> connection: #specify your azure openai connection name +> ``` +
+ +Go back to the `promptflow/examples/flows/chat` path, run the following command to test the flow with a simple math problem: + +```bash +cd .. +pf flow test --flow ./basic-chat --inputs question="1+1=?" +``` + +This will yield the following output: +```json +{ + "answer": "2" +} +``` + +Sometime, the question may be challenging. Now, let's test it with a complex math problem, such as: + +```bash +pf flow test --flow ./basic-chat --inputs question="We are allowed to remove exactly one integer from the list $$-1,0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,$$and then we choose two distinct integers at random from the remaining list. What number should we remove if we wish to maximize the probability that the sum of the two chosen numbers is 10?" +``` + +The output is: +```json +{ + "answer": "-1" +} +``` +However, the correct answer is 5, so the output answer is incorrect! (Don't be surprised if you got the correct answer, as the randiness of LLM. You can try multiple times for different answers.) It indicates that we need to further evaluate the performance. Therefore, in the next step, we will test the flow with more math problems to better evaluate the quality. + +### Evaluate the quality of your prompt + +With prompt flow, you can quickly trigger a batch-run to test your prompt with a larger dataset, and evaluate the quality of the answers. + +There is a `data.jsonl` file in the `promptflow/examples/flows/chat/chat-math-variant` folder, which is a dataset containing 20 test data entries (a subset of [the Math Dataset](https://github.com/hendrycks/math/)). It includes the input question, the ground truth for numerical answer, and the reasoning (raw_answer). Here's one example: + +```json +{ + "question": "Determine the number of ways to arrange the letters of the word PROOF.", + "answer": "60", + "raw_answer": "There are two O's and five total letters, so the answer is $\\dfrac{5!}{2!} = \\boxed{60}$." 
+} + +``` + +Run the following command to test your prompt with this dataset: + +First, set the environment variable `base_run_name` to specify the run name. + +```bash +base_run_name="base_run" +``` + +
For Windows CMD users, run command in toggle
>ℹ️ The default model is `gpt-3.5-turbo`, let's try `gpt-4` to see if it's smarter to get better results. Use `--connections chat.connection=...` to specify.
For Windows CMD users, run command in toggle
+ +> ℹ️ The run name must be unique. Please specify a new name in `--name`. +> If you see "Run 'base_run' already exists.", you can specify another name. But please remember the name you specified, because you'll need it in the next step. + + +When it completes, you can run the following command to see the details of results: +> Specify the run name of your completed run in `--name` argument: + +```bash +pf run show-details --name $base_run_name +``` + +
For Windows CMD users, run command in toggle
+ +This can show the line by line input and output of the run: +``` ++----+---------------+-----------------+---------------+---------------+ +| | inputs.chat | inputs.question | inputs.line | outputs.ans | +| | _history | | _number | wer | ++====+===============+=================+===============+===============+ +| 0 | [] | Compute $\dbi | 0 | 4368 | +| | | nom{16}{5}$. | | | ++----+---------------+-----------------+---------------+---------------+ +| 1 | [] | Determine the | 1 | 60 | +| | | number of | | | +| | | ways to | | | +| | | arrange the | | | +| | | letters of | | | +| | | the word | | | +| | | PROOF. | | | ++----+---------------+-----------------+---------------+---------------+ +| .. | ... | ... |... | ... | +``` + +Next, create an **evaluation run** to calculate the accuracy of the answers based on the previous run. + +In the `promptflow/examples/flows/evaluation` folder, you can see a `eval-chat-math` folder, which represents an evaluation flow. We'll use this flow to evaluate the accuracy of the answers. + +```bash +cd ../evaluation +``` + +Run the following command to create an evaluation run: + +```bash +eval_run_name="eval_run" +pf run create --flow ./eval-chat-math --data ../chat/chat-math-variant/data.jsonl --column-mapping groundtruth='${data.answer}' prediction='${run.outputs.answer}' --stream --run $base_run_name --name $eval_run_name +``` + +
For Windows CMD users, run command in toggle
+ +> If needed, specify the run name which you want to evaluate in `--run` argument, and specify this evaluation run name in `--name` argument. + +Then get metrics of the `eval_run`: + +```bash +pf run show-metrics --name $eval_run_name +``` + +
+For Windows CMD users, run commnad in toggle + +```shell +pf run show-details --name %eval_run_name% +``` + +
+ +You can visualize and compare the output line by line of `base_run` and `eval_run` in a web browser: + +```bash +pf run visualize --name "$base_run_name,$eval_run_name" +``` + +
+For Windows CMD users, run command in toggle

+```shell
+pf run visualize --name "%base_run_name%,%eval_run_name%"
+```
+
+ +Because of the randomness of the LLM, the accuracy may vary. For example, in my run, the metrics are as follows: + +```json +{ + "accuracy": 0.35, + "error_rate": 0.65 +} +``` + +Oops! The accuracy isn't satisfactory. It's time to fine-tune your prompt for higher quality! + +### Fine-tuning your prompt and evaluate the improvement + +In the `/chat` folder, you can see a `chat-math-variant` folder, which represents a flow with two additional prompt variants compared to the original one you customized based on the `basic-chat`. + +In this sample flow, you'll find three Jinja files: +* `chat.jinja2` is the original prompt as same as the one you customized in `basic-chat`. +* `chat_variant_1.jinja2` and `chat_variant_2.jinja2` are the 2 additional prompt variants. + +We leverage the Chain of Thought (CoT) prompt engineering method to adjust the prompt. The goal is to activate the Language Model's reasoning capability of the questions, by providing a few CoT examples. + +
+Variant_1: 2 CoT examples

+```jinja
+system:
+You are an assistant to calculate the answer to the provided math problems.
+Please think step by step.
+Return the final numerical answer only and any accompanying reasoning or explanation separately as json format.
+user:
+A jar contains two red marbles, three green marbles, ten white marbles and no other marbles. Two marbles are randomly drawn from this jar without replacement. What is the probability that these two marbles drawn will both be red? Express your answer as a common fraction.
+assistant:
+{"Chain of thought": "The total number of marbles is $2+3+10=15$. The probability that the first marble drawn will be red is $2/15$. Then, there will be one red left, out of 14. Therefore, the probability of drawing out two red marbles will be: $$\\frac{2}{15}\\cdot\\frac{1}{14}=\\boxed{\\frac{1}{105}}$$.", "answer": "1/105"}
+user:
+Find the greatest common divisor of $7!$ and $(5!)^2.$
+assistant:
+{"Chain of thought": "$$ \\begin{array} 7! &=& 7 \\cdot 6 \\cdot 5 \\cdot 4 \\cdot 3 \\cdot 2 \\cdot 1 &=& 2^4 \\cdot 3^2 \\cdot 5^1 \\cdot 7^1 \\\\ (5!)^2 &=& (5 \\cdot 4 \\cdot 3 \\cdot 2 \\cdot 1)^2 &=& 2^6 \\cdot 3^2 \\cdot 5^2 \\\\ \\text{gcd}(7!, (5!)^2) &=& 2^4 \\cdot 3^2 \\cdot 5^1 &=& \\boxed{720} \\end{array} $$.", "answer": "720"}
+```
+ +
+Variant_2 : 6 CoT examples.

+```jinja
+system:
+You are an assistant to calculate the answer to the provided math problems.
+Please think step by step.
+Return the final numerical answer only and any accompanying reasoning or explanation separately as json format.

+user:
+A jar contains two red marbles, three green marbles, ten white marbles and no other marbles. Two marbles are randomly drawn from this jar without replacement. What is the probability that these two marbles drawn will both be red? Express your answer as a common fraction.
+assistant:
+{"Chain of thought": "The total number of marbles is $2+3+10=15$. The probability that the first marble drawn will be red is $2/15$. Then, there will be one red left, out of 14. Therefore, the probability of drawing out two red marbles will be: $$\\frac{2}{15}\\cdot\\frac{1}{14}=\\boxed{\\frac{1}{105}}$$.", "answer": "1/105"}
+user:
+Find the greatest common divisor of $7!$ and $(5!)^2.$
+assistant:
+{"Chain of thought": "$$ \\begin{array} 7! &=& 7 \\cdot 6 \\cdot 5 \\cdot 4 \\cdot 3 \\cdot 2 \\cdot 1 &=& 2^4 \\cdot 3^2 \\cdot 5^1 \\cdot 7^1 \\\\ (5!)^2 &=& (5 \\cdot 4 \\cdot 3 \\cdot 2 \\cdot 1)^2 &=& 2^6 \\cdot 3^2 \\cdot 5^2 \\\\ \\text{gcd}(7!, (5!)^2) &=& 2^4 \\cdot 3^2 \\cdot 5^1 &=& \\boxed{720} \\end{array} $$.", "answer": "720"}
+user:
+A club has 10 members, 5 boys and 5 girls. Two of the members are chosen at random. What is the probability that they are both girls?
+assistant:
+{"Chain of thought": "There are $\\binom{10}{2} = 45$ ways to choose two members of the group, and there are $\\binom{5}{2} = 10$ ways to choose two girls. Therefore, the probability that two members chosen at random are girls is $\\dfrac{10}{45} = \\boxed{\\dfrac{2}{9}}$.", "answer": "2/9"}
+user:
+Allison, Brian and Noah each have a 6-sided cube. All of the faces on Allison's cube have a 5. The faces on Brian's cube are numbered 1, 2, 3, 4, 5 and 6. 
Three of the faces on Noah's cube have a 2 and three of the faces have a 6. All three cubes are rolled. What is the probability that Allison's roll is greater than each of Brian's and Noah's? Express your answer as a common fraction.
+assistant:
+{"Chain of thought": "Since Allison will always roll a 5, we must calculate the probability that both Brian and Noah roll a 4 or lower. The probability of Brian rolling a 4 or lower is $\\frac{4}{6} = \\frac{2}{3}$ since Brian has a standard die. Noah, however, has a $\\frac{3}{6} = \\frac{1}{2}$ probability of rolling a 4 or lower, since the only way he can do so is by rolling one of his 3 sides that have a 2. So, the probability of both of these independent events occurring is $\\frac{2}{3} \\cdot \\frac{1}{2} = \\boxed{\\frac{1}{3}}$.", "answer": "1/3"}
+user:
+Compute $\\dbinom{50}{2}$.
+assistant:
+{"Chain of thought": "$\\dbinom{50}{2} = \\dfrac{50!}{2!48!}=\\dfrac{50\\times 49}{2\\times 1}=\\boxed{1225}.$", "answer": "1225"}
+user:
+The set $S = \\{1, 2, 3, \\ldots , 49, 50\\}$ contains the first $50$ positive integers. After the multiples of 2 and the multiples of 3 are removed, how many integers remain in the set $S$?
+assistant:
+{"Chain of thought": "The set $S$ contains $25$ multiples of 2 (that is, even numbers). When these are removed, the set $S$ is left with only the odd integers from 1 to 49. At this point, there are $50-25=25$ integers in $S$. We still need to remove the multiples of 3 from $S$.\n\nSince $S$ only contains odd integers after the multiples of 2 are removed, we must remove the odd multiples of 3 between 1 and 49. These are 3, 9, 15, 21, 27, 33, 39, 45, of which there are 8. Therefore, the number of integers remaining in the set $S$ is $25 - 8 = \\boxed{17}$.", "answer": "17"}
+```
+
+These two jinja files are specified in the `flow.dag.yaml` file, which defines the flow structure. You can see that the `chat` node has 3 variants, which point to these 3 Jinja files.

+### Test and evaluate your prompt variants

+First, you need to modify your flow to add two more prompt variants into the chat node, in addition to the existing default one. In the flow.dag.yaml file, you can see 3 variant definitions of the `chat` node, which point to these 3 Jinja files.

+Run the CLI command below to start the experiment: test all variants, evaluate them, get the visualized comparison results of the experiment.

+> ℹ️ By default, the connection is set to `open_ai_connection` and the model is set to `gpt-4` for each variant, as specified in the `flow.dag.yaml` file. However, you have the flexibility to specify a different connection and model by adding `--connections chat.connection=<connection_name> chat.deployment_name=<deployment_name>` in the test run command.

+Navigate to the `promptflow/examples/flows` folder:

+```bash
+cd ..
+```

+Set the environment variables `base_run_name` and `eval_run_name` to specify the run name.

+```bash
+base_run_name="base_run_variant_"
+eval_run_name="eval_run_variant_"
+```
+
+For Windows CMD users, run command in toggle

+```shell
+set base_run_name=base_run_variant_
+set eval_run_name=eval_run_variant_
+```
+
+ +Run the following command to test and evaluate the variants: + +```bash +# Test and evaluate variant_0: +# Test-run +pf run create --flow ./chat/chat-math-variant --data ./chat/chat-math-variant/data.jsonl --column-mapping question='${data.question}' chat_history=[] --variant '${chat.variant_0}' --stream --name "${base_run_name}0" +# Evaluate-run +pf run create --flow ./evaluation/eval-chat-math --data ./chat/chat-math-variant/data.jsonl --column-mapping groundtruth='${data.answer}' prediction='${run.outputs.answer}' --stream --run "${base_run_name}0" --name "${eval_run_name}0" + +# Test and evaluate variant_1: +# Test-run +pf run create --flow ./chat/chat-math-variant --data ./chat/chat-math-variant/data.jsonl --column-mapping question='${data.question}' chat_history=[] --variant '${chat.variant_1}' --stream --name "${base_run_name}1" +# Evaluate-run +pf run create --flow ./evaluation/eval-chat-math --data ./chat/chat-math-variant/data.jsonl --column-mapping groundtruth='${data.answer}' prediction='${run.outputs.answer}' --stream --run "${base_run_name}1" --name "${eval_run_name}1" + +# Test and evaluate variant_2: +# Test-run +pf run create --flow ./chat/chat-math-variant --data ./chat/chat-math-variant/data.jsonl --column-mapping question='${data.question}' chat_history=[] --variant '${chat.variant_2}' --stream --name "${base_run_name}2" +# Evaluate-run +pf run create --flow ./evaluation/eval-chat-math --data ./chat/chat-math-variant/data.jsonl --column-mapping groundtruth='${data.answer}' prediction='${run.outputs.answer}' --stream --run "${base_run_name}2" --name "${eval_run_name}2" +``` + + + +
+For Windows CMD users, run command in toggle

+```shell
+# Test and evaluate variant_0:
+# Test-run
+pf run create --flow ./chat/chat-math-variant --data ./chat/chat-math-variant/data.jsonl --column-mapping question='${data.question}' chat_history=[] --variant '${chat.variant_0}' --stream --name %base_run_name%0
+# Evaluate-run
+pf run create --flow ./evaluation/eval-chat-math --data ./chat/chat-math-variant/data.jsonl --column-mapping groundtruth='${data.answer}' prediction='${run.outputs.answer}' --stream --run %base_run_name%0 --name %eval_run_name%0

+# Test and evaluate variant_1:
+# Test-run
+pf run create --flow ./chat/chat-math-variant --data ./chat/chat-math-variant/data.jsonl --column-mapping question='${data.question}' chat_history=[] --variant '${chat.variant_1}' --stream --name %base_run_name%1
+# Evaluate-run
+pf run create --flow ./evaluation/eval-chat-math --data ./chat/chat-math-variant/data.jsonl --column-mapping groundtruth='${data.answer}' prediction='${run.outputs.answer}' --stream --run %base_run_name%1 --name %eval_run_name%1

+# Test and evaluate variant_2:
+# Test-run
+pf run create --flow ./chat/chat-math-variant --data ./chat/chat-math-variant/data.jsonl --column-mapping question='${data.question}' chat_history=[] --variant '${chat.variant_2}' --stream --name %base_run_name%2
+# Evaluate-run
+pf run create --flow ./evaluation/eval-chat-math --data ./chat/chat-math-variant/data.jsonl --column-mapping groundtruth='${data.answer}' prediction='${run.outputs.answer}' --stream --run %base_run_name%2 --name %eval_run_name%2
+```
+
+ +Get metrics of the all evaluations: + +```bash +pf run show-metrics --name "${eval_run_name}0" +pf run show-metrics --name "${eval_run_name}1" +pf run show-metrics --name "${eval_run_name}2" +``` + +You may get the familiar output like this: +``` +# eval_variant_0_run +{ + "accuracy": 0.3, + "error_rate": 0.7 +} +# eval_variant_1_run +{ + "accuracy": 0.9, + "error_rate": 0.1 +} +# eval_variant_2_run +{ + "accuracy": 0.9, + "error_rate": 0.1 +} +``` + +Visualize the results: + +```bash +pf run visualize --name "${base_run_name}0,${eval_run_name}0,${base_run_name}1,${eval_run_name}1,${base_run_name}2,${eval_run_name}2" +``` + +
+For Windows CMD users, run command in toggle

+```shell
+pf run visualize --name "%base_run_name%0,%eval_run_name%0,%base_run_name%1,%eval_run_name%1,%base_run_name%2,%eval_run_name%2"
+```
+
+
+Click the HTML link to get the experiment results. Clicking on a column in the **Output** table will allow you to view the snapshot of each line.

+The snapshot of chat flow:
+![Visualization chat flow](media/visualization_chat_flow.png)

+The snapshot of evaluation flow:
+![Visualization eval flow](media/visualization_eval_flow.png)

+Excellent! Now you can compare their performances and token costs, and choose the prompt that best suits your needs. We can see that variant_1 and variant_2 have the same accuracy, but variant_1 has a lower token cost (only 2 few shots rather than the 6 in variant_2). So variant_1 is the best choice for the quality and cost balance.

+### Conclusion

+Great! Now you can compare their performances and token costs to choose the prompt that best suits your needs. Upon comparison, we can observe that variant_1 and variant_2 have similar accuracy. However, variant_1 stands out as the better choice due to its lower token cost (2 few-shots vs. 6 few-shots).

+comparison result

+It is evident that adding more CoT examples in the prompt does not necessarily improve the accuracy further. Instead, we should identify the optimal point where the number of shots maximizes accuracy while minimizing cost.

+Just in a few steps, we identified that variant_1 strikes the ideal balance between quality and cost! This is where the value of prompt tuning and evaluation using prompt flow becomes apparent. With prompt flow, you can easily test and evaluate different prompt variants, enabling you to facilitate high quality LLM-native apps to production. 
\ No newline at end of file diff --git a/examples/tutorials/flow-in-pipeline/pipeline.ipynb b/examples/tutorials/flow-in-pipeline/pipeline.ipynb index 604bd58cd82..acf187b667e 100644 --- a/examples/tutorials/flow-in-pipeline/pipeline.ipynb +++ b/examples/tutorials/flow-in-pipeline/pipeline.ipynb @@ -127,7 +127,10 @@ "metadata": {}, "outputs": [], "source": [ - "data_input = Input(path=\"../../flows/standard/web-classification/data.jsonl\", type=AssetTypes.URI_FILE)\n", + "data_input = Input(\n", + " path=\"../../flows/standard/web-classification/data.jsonl\", type=AssetTypes.URI_FILE\n", + ")\n", + "\n", "\n", "@pipeline()\n", "def pipeline_func_with_flow(data):\n", diff --git a/examples/tutorials/get-started/flow-as-function.ipynb b/examples/tutorials/get-started/flow-as-function.ipynb index 01f9f43863b..6e8b1efd8be 100644 --- a/examples/tutorials/get-started/flow-as-function.ipynb +++ b/examples/tutorials/get-started/flow-as-function.ipynb @@ -16,7 +16,7 @@ "from promptflow import load_flow\n", "\n", "\n", - "flow_path=\"../../flows/standard/web-classification\"\n", + "flow_path = \"../../flows/standard/web-classification\"\n", "sample_url = \"https://www.youtube.com/watch?v=o5ZQyXaAv1g\"\n", "\n", "f = load_flow(source=flow_path)\n", @@ -86,7 +86,7 @@ " source=flow_path,\n", ")\n", "# need to create the connection\n", - "f.context.connections={\"classify_with_llm\": {\"connection\": \"new_ai_connection\"}}\n", + "f.context.connections = {\"classify_with_llm\": {\"connection\": \"new_ai_connection\"}}\n", "\n", "result = f(url=sample_url)\n", "\n", @@ -136,11 +136,11 @@ "result = f(\n", " chat_history=[\n", " {\n", - " \"inputs\": {\"chat_input\": \"Hi\"}, \n", - " \"outputs\": {\"chat_output\": \"Hello! How can I assist you today?\"}\n", + " \"inputs\": {\"chat_input\": \"Hi\"},\n", + " \"outputs\": {\"chat_output\": \"Hello! 
How can I assist you today?\"},\n", " }\n", " ],\n", - " question=\"How are you?\"\n", + " question=\"How are you?\",\n", ")\n", "\n", "\n", diff --git a/examples/tutorials/quick-start/media/logo_pf.png b/examples/tutorials/quick-start/media/logo_pf.png deleted file mode 100644 index a87ac5051df..00000000000 Binary files a/examples/tutorials/quick-start/media/logo_pf.png and /dev/null differ diff --git a/examples/tutorials/run-management/run-management.ipynb b/examples/tutorials/run-management/run-management.ipynb index bfff7ce0fd1..602390565e1 100644 --- a/examples/tutorials/run-management/run-management.ipynb +++ b/examples/tutorials/run-management/run-management.ipynb @@ -119,9 +119,7 @@ "base_run = load_run(\n", " source=\"../../flows/standard/web-classification/run.yml\",\n", " # override the default params in the YAML file\n", - " params_override=[\n", - " {\"column_mapping\": {\"url\": \"${data.url}\"}}\n", - " ]\n", + " params_override=[{\"column_mapping\": {\"url\": \"${data.url}\"}}],\n", ")\n", "\n", "# create the run\n", diff --git a/scripts/readme/readme.py b/scripts/readme/readme.py index 62317bcf4e4..6941d1a1c2d 100644 --- a/scripts/readme/readme.py +++ b/scripts/readme/readme.py @@ -53,7 +53,7 @@ def get_readme_description_first_sentence(readme) -> str: return "" -def write_readme(workflow_telemetry, readme_telemetry): +def write_readme(workflow_telemetries, readme_telemetries): global BRANCH ReadmeStepsManage.git_base_dir() @@ -88,7 +88,7 @@ def write_readme(workflow_telemetry, readme_telemetry): "notebooks": [], } - for workflow_telemetry in workflow_telemetry: + for workflow_telemetry in workflow_telemetries: notebook_name = f"{workflow_telemetry.name}.ipynb" gh_working_dir = workflow_telemetry.gh_working_dir pipeline_name = workflow_telemetry.workflow_name @@ -174,7 +174,7 @@ def write_readme(workflow_telemetry, readme_telemetry): # Adjust tutorial names: - for readme_telemetry in readme_telemetry: + for readme_telemetry in readme_telemetries: 
if readme_telemetry.readme_name.endswith("README.md"): notebook_name = readme_telemetry.readme_folder.split("/")[-1] else: @@ -300,7 +300,8 @@ def write_readme(workflow_telemetry, readme_telemetry): input_glob_readme = [ "examples/flows/**/README.md", "examples/connections/**/README.md", - "examples/tutorials/**/chat*.md", + "examples/tutorials/e2e-development/*.md", + "examples/tutorials/flow-fine-tuning-evaluation/*.md", "examples/tutorials/**/README.md", "examples/tools/use-cases/**/README.md" ]