Commit aa5ac62: more changes

AbeOmor committed Aug 23, 2023 (1 parent: b36f2e7)
Showing 42 changed files with 2,198 additions and 9 deletions.
93 changes: 93 additions & 0 deletions .github/workflows/run-eval-pf-pipeline.yml
@@ -0,0 +1,93 @@
name: Test and Evaluate Prompts with Promptflow

on:
  workflow_dispatch:
  push:
    branches: [ main ]

env:
  GROUP: ${{secrets.GROUP}}
  WORKSPACE: ${{secrets.WORKSPACE}}
  SUBSCRIPTION: ${{secrets.SUBSCRIPTION}}
  RUN_NAME: web_classification_variant_1_20230816_215600_605116
  EVAL_RUN_NAME: classification_accuracy_eval_default_20230821_111809_077086

jobs:
  login-and-run-and-evalpf:
    runs-on: ubuntu-latest
    steps:
      - name: check out repo
        uses: actions/checkout@v2
      - name: install az ml extension
        run: az extension add -n ml -y
      - name: azure login
        uses: azure/login@v1
        with:
          creds: ${{secrets.AZURE_CREDENTIALS}}
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.11.4'
      - name: list current directory
        run: ls
      - name: install promptflow
        run: pip install -r promptflow/web-classification/requirements.txt
      - name: run promptflow
        run: |
          pfazure run create -f promptflow/web-classification/run.yml --subscription ${{env.SUBSCRIPTION}} -g ${{env.GROUP}} -w ${{env.WORKSPACE}} --stream > promptflow/llmops-helper/run_info.txt
          cat promptflow/llmops-helper/run_info.txt
      - name: set run name
        run: |
          echo "RUN_NAME=$(python promptflow/llmops-helper/parse_run_output.py run_info.txt)" >> "$GITHUB_ENV"
      - name: show the current run name
        run: echo "Run name is:" ${{env.RUN_NAME}}
      - name: show promptflow results
        run: pfazure run show-details --name ${{env.RUN_NAME}} --subscription ${{env.SUBSCRIPTION}} -g ${{env.GROUP}} -w ${{env.WORKSPACE}}
      - name: run promptflow evaluations
        run: pfazure run create -f promptflow/web-classification/run_evaluation.yml --run ${{env.RUN_NAME}} --subscription ${{env.SUBSCRIPTION}} -g ${{env.GROUP}} -w ${{env.WORKSPACE}} --stream > promptflow/llmops-helper/eval_info.txt
      - name: get eval run name
        run: |
          echo "EVAL_RUN_NAME=$(python promptflow/llmops-helper/parse_run_output.py eval_info.txt)" >> "$GITHUB_ENV"
      - name: show promptflow details
        run: pfazure run show-details --name ${{env.EVAL_RUN_NAME}} --subscription ${{env.SUBSCRIPTION}} -g ${{env.GROUP}} -w ${{env.WORKSPACE}}
      - name: show promptflow metrics
        run: pfazure run show-metrics --name ${{env.EVAL_RUN_NAME}} --subscription ${{env.SUBSCRIPTION}} -g ${{env.GROUP}} -w ${{env.WORKSPACE}} > promptflow/llmops-helper/eval_result.json

  assert-and-register-model:
    needs: login-and-run-and-evalpf
    runs-on: ubuntu-latest
    steps:
      - name: check out repo
        uses: actions/checkout@v2
      - name: install az ml extension
        run: az extension add -n ml -y
      - name: azure login
        uses: azure/login@v1
        with:
          creds: ${{secrets.AZURE_CREDENTIALS}}
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.11.4'
      - name: set default subscription
        run: |
          az account set -s ${{env.SUBSCRIPTION}}
      - name: list current directory
        run: ls
      - name: install promptflow
        run: pip install -r promptflow/web-classification/requirements.txt
      - name: get assert eval results
        id: jobMetricAssert
        run: |
          export ASSERT=$(python promptflow/llmops-helper/assert.py result.json 0.6) # NOTE: <file>.json is the metrics file name and the decimal is the threshold for the assertion
          echo "::debug::Assert has returned the following value: $ASSERT"
          # assert.py will return True or False, but bash expects lowercase.
          if ${ASSERT,,} ; then
            echo "::debug::Prompt flow run met the quality bar and can be deployed."
            echo "::set-output name=result::true"
          else
            echo "::warning::Prompt flow run didn't meet the quality bar."
            echo "::set-output name=result::false"
          fi
      - name: register promptflow model
        if: ${{ steps.jobMetricAssert.outputs.result == 'true' }}
        run: az ml model create --file promptflow/deployment/model.yaml -g ${{env.GROUP}} -w ${{env.WORKSPACE}}
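
The steps above rely on two helper scripts, `promptflow/llmops-helper/parse_run_output.py` and `promptflow/llmops-helper/assert.py`, which are part of this commit but not shown in this excerpt. The sketch below only illustrates the logic the workflow expects from them, assuming the captured `pfazure` output contains a JSON-style `"name"` field and the metrics file contains an `accuracy` value; the committed scripts may differ.

```python
# Hypothetical sketch of the llmops-helper scripts (not the committed implementations).
# In the workflow, each script simply prints its result (the run name, or True/False)
# to stdout, which the `>> "$GITHUB_ENV"` and `${ASSERT,,}` lines consume.
import json
import re
from pathlib import Path


def parse_run_name(run_output_file: str) -> str:
    """parse_run_output.py: extract the run name from captured `pfazure run create --stream` output.

    Assumes the output contains a JSON-style line such as  "name": "<run-name>".
    """
    text = Path(run_output_file).read_text()
    match = re.search(r'"name":\s*"([^"]+)"', text)
    if not match:
        raise ValueError(f"No run name found in {run_output_file}")
    return match.group(1)


def meets_quality_bar(metrics_file: str, threshold: float) -> bool:
    """assert.py: compare the 'accuracy' metric from `pfazure run show-metrics` against a threshold."""
    with open(metrics_file) as f:
        metrics = json.load(f)
    return float(metrics.get("accuracy", 0.0)) >= threshold
```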
10 changes: 1 addition & 9 deletions README.md
@@ -1,14 +1,6 @@
# Project

> This repo has been populated by an initial template to help get you started. Please
> make sure to update the content to build a great experience for community-building.
As the maintainer of this project, please make a few updates:

- Improving this README.MD file to provide a great experience
- Updating SUPPORT.MD with content about this project's support experience
- Understanding the security reporting process in SECURITY.MD
- Remove this section from the README
Read more about how to get started with [LLMOps on Microsoft Official Docs](https://aka.ms/llmops_getting-started)

## Contributing

33 changes: 33 additions & 0 deletions promptflow/deployment/deployment.yaml
@@ -0,0 +1,33 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: blue
endpoint_name: web-classification-7272ff
model: azureml:web-classification-model:1
# You can also specify model files path inline
# path: examples/flows/chat/basic-chat
environment:
  image: mcr.microsoft.com/azureml/promptflow/promptflow-runtime:20230808.v1
  # inference config is used to build a serving container for online deployments
  inference_config:
    liveness_route:
      path: /health
      port: 8080
    readiness_route:
      path: /health
      port: 8080
    scoring_route:
      path: /score
      port: 8080
instance_type: Standard_E16s_v3
instance_count: 1
environment_variables:

  # "compute" mode is the default; to deploy in serving mode, set this env variable to "serving"
  PROMPTFLOW_RUN_MODE: serving

  # for pulling connections from workspace
  PRT_CONFIG_OVERRIDE: deployment.subscription_id=<sub-id>,deployment.resource_group=<resource-group>,deployment.workspace_name=<workspace-name>,deployment.endpoint_name=<endpoint-name>,deployment.deployment_name=<deployment-name>

  # (Optional) When there are multiple fields in the response, use this env variable to filter which fields are exposed in the response.
  # For example, if there are 2 flow outputs, "answer" and "context", and you only want "answer" in the endpoint response, set this env variable to '["answer"]'.
  # If you don't set this environment variable, all flow outputs are included in the endpoint response by default.
  # PROMPTFLOW_RESPONSE_INCLUDED_FIELDS: '["category", "evidence"]'
3 changes: 3 additions & 0 deletions promptflow/deployment/endpoint.yaml
@@ -0,0 +1,3 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
name: web-classification-endpoint
auth_mode: key
12 changes: 12 additions & 0 deletions promptflow/deployment/model.yaml
@@ -0,0 +1,12 @@
$schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
name: web-classification-model
path: ../web-classification
stage: Production
description: register web-classification flow folder as a custom model
properties:
  # In Azure ML studio UI, the endpoint detail Test tab needs this property to know it's from prompt flow
  azureml.promptflow.source_flow_id: web-classification

  # The following properties apply only to classification flows
  # The endpoint detail Test tab needs this property to know it's a classification flow
  azureml.promptflow.mode: classification
3 changes: 3 additions & 0 deletions promptflow/deployment/sample-request.json
@@ -0,0 +1,3 @@
{
"url": "https://www.microsoft.com/en-us/store/collections/xboxseriessconsoles?icid=CNav_Xbox_Series_S"
}
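
For a quick smoke test of the deployment defined above, the sample request can be posted to the endpoint's scoring route. The sketch below is an assumption-laden example: the scoring URI and key are placeholders you would copy from the endpoint's Consume page in Azure ML studio, and key-based auth is passed as a bearer token.

```python
# Hypothetical smoke test against the deployed endpoint; URI and key are placeholders.
import json

import requests

SCORING_URI = "https://<endpoint-name>.<region>.inference.ml.azure.com/score"  # placeholder
API_KEY = "<endpoint-key>"  # placeholder; auth_mode is "key" in endpoint.yaml

with open("promptflow/deployment/sample-request.json") as f:
    payload = json.load(f)

response = requests.post(
    SCORING_URI,
    json=payload,
    headers={"Authorization": f"Bearer {API_KEY}"},
)
response.raise_for_status()
print(response.json())  # flow outputs, e.g. the predicted category and evidence
```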
40 changes: 40 additions & 0 deletions promptflow/evaluation/basic-eval/README.md
@@ -0,0 +1,40 @@
# Basic Eval
This example shows how to create a basic evaluation flow.

Tools used in this flow:
- `python` tool

## Prerequisites

Install promptflow sdk and other dependencies in this folder:
```bash
pip install -r requirements.txt
```

## What you will learn

In this flow, you will learn
- how to compose a point-based evaluation flow, where you can calculate point-wise metrics.
- the way to log metrics, using `from promptflow import log_metric`; see the file [aggregate.py](aggregate.py).

### 1. Test flow with single-line data

Testing flow/node:
```bash
# test with default input value in flow.dag.yaml
pf flow test --flow .

# test with flow inputs
pf flow test --flow . --inputs groundtruth=ABC prediction=ABC

# test node with inputs
pf flow test --flow . --node line_process --inputs groundtruth=ABC prediction=ABC
```

### 2. Create flow run with multi-line data
There are two ways to evaluate a classification flow.

```bash
pf run create --flow . --data ./data.jsonl --stream
```
24 changes: 24 additions & 0 deletions promptflow/evaluation/basic-eval/aggregate.py
@@ -0,0 +1,24 @@
from typing import List

from promptflow import tool


@tool
def aggregate(processed_results: List[str]):
"""
This tool aggregates the processed result of all lines to the variant level and log metric for each variant.
:param processed_results: List of the output of line_process node.
"""

# Add your aggregation logic here
# aggregated_results should be a dictionary with the metric name as the key and the metric value as the value.
results_num = len(processed_results)
print(results_num)
print(processed_results)

# Log metric for each variant
from promptflow import log_metric
log_metric(key="results_num", value=results_num)

return results_num
1 change: 1 addition & 0 deletions promptflow/evaluation/basic-eval/data.jsonl
@@ -0,0 +1 @@
{"groundtruth": "Tomorrow's weather will be sunny.","prediction": "The weather will be sunny tomorrow."}
28 changes: 28 additions & 0 deletions promptflow/evaluation/basic-eval/flow.dag.yaml
@@ -0,0 +1,28 @@
inputs:
  groundtruth:
    type: string
    default: groundtruth
  prediction:
    type: string
    default: prediction
outputs:
  results:
    type: string
    reference: ${line_process.output}
nodes:
- name: line_process
  type: python
  source:
    type: code
    path: line_process.py
  inputs:
    groundtruth: ${inputs.groundtruth}
    prediction: ${inputs.prediction}
- name: aggregate
  type: python
  source:
    type: code
    path: aggregate.py
  inputs:
    processed_results: ${line_process.output}
  aggregation: true
14 changes: 14 additions & 0 deletions promptflow/evaluation/basic-eval/line_process.py
@@ -0,0 +1,14 @@
from promptflow import tool


@tool
def line_process(groundtruth: str, prediction: str):
"""
This tool processes the prediction of a single line and returns the processed result.
:param groundtruth: the groundtruth of a single line.
:param prediction: the prediction of a single line.
"""

# Add your line processing logic here
return "Correct" if groundtruth.lower() == prediction.lower() else "Incorrect"
5 changes: 5 additions & 0 deletions promptflow/evaluation/basic-eval/requirements.txt
@@ -0,0 +1,5 @@
--extra-index-url https://azuremlsdktestpypi.azureedge.net/promptflow/
promptflow
promptflow-tools
langchain
jinja2
@@ -0,0 +1,36 @@
{
  "package": {},
  "code": {
    "grade.py": {
      "name": "grade.py",
      "type": "python",
      "inputs": {
        "groundtruth": {
          "type": [
            "string"
          ]
        },
        "prediction": {
          "type": [
            "string"
          ]
        }
      },
      "source": "grade.py",
      "function": "grade"
    },
    "calculate_accuracy.py": {
      "name": "calculate_accuracy.py",
      "type": "python",
      "inputs": {
        "grades": {
          "type": [
            "object"
          ]
        }
      },
      "source": "calculate_accuracy.py",
      "function": "calculate_accuracy"
    }
  }
}
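
The manifest above references a `grade.py` tool (inputs `groundtruth` and `prediction`, function `grade`) whose source isn't shown in this excerpt. A plausible minimal implementation, mirroring the `line_process.py` pattern from the basic-eval flow and labeled here as an assumption rather than the committed file:

```python
# Hypothetical grade.py matching the interface declared in flow.tools.json.
from promptflow import tool


@tool
def grade(groundtruth: str, prediction: str):
    # Case-insensitive comparison of the predicted class against the groundtruth label.
    return "Correct" if groundtruth.lower() == prediction.lower() else "Incorrect"
```

Its "Correct"/"Incorrect" output is what `calculate_accuracy.py` (shown further down) counts when it logs the `accuracy` metric.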
38 changes: 38 additions & 0 deletions promptflow/evaluation/classification-accuracy-eval/README.md
@@ -0,0 +1,38 @@
# Classification Accuracy Evaluation

This is a flow illustrating how to evaluate the performance of a classification system. It compares each prediction to the groundtruth, assigns a "Correct" or "Incorrect" grade, and aggregates the results to produce metrics such as accuracy, which reflect how well the system classifies the data.

Tools used in this flow:
- `python` tool

## What you will learn

In this flow, you will learn
- how to compose a point-based evaluation flow, where you can calculate point-wise metrics.
- the way to log metrics, using `from promptflow import log_metric`; see the file [calculate_accuracy.py](calculate_accuracy.py).

### 1. Test flow/node

```bash
# test with default input value in flow.dag.yaml
pf flow test --flow .

# test with flow inputs
pf flow test --flow . --inputs groundtruth=APP prediction=APP

# test node with inputs
pf flow test --flow . --node grade --inputs groundtruth=groundtruth prediction=prediction
```

### 2. Create flow run with multi-line data
There are two ways to evaluate a classification flow.

```bash
pf run create --flow . --data ./data.jsonl --stream
```

### 3. Create run against another flow run

Learn more in [web-classification](../../standard/web-classification/README.md)

@@ -0,0 +1,17 @@
from typing import List

from promptflow import log_metric, tool


@tool
def calculate_accuracy(grades: List[str]):
    result = []
    for index in range(len(grades)):
        grade = grades[index]
        result.append(grade)

    # calculate accuracy for each variant
    accuracy = round((result.count("Correct") / len(result)), 2)
    log_metric("accuracy", accuracy)

    return result
3 changes: 3 additions & 0 deletions promptflow/evaluation/classification-accuracy-eval/data.jsonl
@@ -0,0 +1,3 @@
{"groundtruth": "App","prediction": "App"}
{"groundtruth": "Channel","prediction": "Channel"}
{"groundtruth": "Academic","prediction": "Academic"}