Merge branch 'main' into zahra-infra-deployment

microsoft · Jun 27, 2024 · ae54097 · ae54097
2 parents 92c5513 + ff30796
commit ae54097
Show file tree

Hide file tree

Showing 18 changed files with 156 additions and 40 deletions.
diff --git a/.env.sample b/.env.sample
@@ -37,6 +37,7 @@ AZURE_COSMOSDB_CONVERSATIONS_CONTAINER=conversations
 AZURE_COSMOSDB_ACCOUNT_KEY=
 AZURE_COSMOSDB_ENABLE_FEEDBACK=False
 # Chat with data: common settings
+DATASOURCE_TYPE=
 SEARCH_TOP_K=5
 SEARCH_STRICTNESS=3
 SEARCH_ENABLE_IN_DOMAIN=True

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -27,6 +27,22 @@ If applicable, add screenshots to help explain your problem.
  - Azure OpenAI model name and version (e.g. 'gpt-35-turbo-16k, version 0613')
  - Is chat history enabled?
  - Are you using data? If so, what data source? (e.g. Azure AI Search, Azure CosmosDB Mongo vCore, etc)
+- Verify the startup command and runtime configuration by showing the output of the following az CLI command:
+```
+az webapp show --name <app name> --resource-group <resource group name> --query "{startupCommand: siteConfig.appCommandLine, runtime: siteConfig.linuxFxVersion}"
+```
+
+**Logs**
+
+1. If the application deployment is failing, please share the deployment logs using the following az CLI command:
+```
+az webapp log deployment show --name <app name> --resource-group <rg name>
+```
+
+2. If the application is crashing after deployment, please share the application logs using the following az CLI command:
+```
+az webapp log tail --name <app name> --resource-group <resource group name>
+```
 
 **Additional context**
 Add any other context about the problem here.
diff --git a/.github/workflows/check-static-files.yml b/.github/workflows/check-static-files.yml
@@ -0,0 +1,51 @@
+name: Check changes to static files
+
+on:
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  changed_files:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: List changed frontend source files
+        uses: tj-actions/changed-files@v44
+        id: changed-frontend-src-files
+        with:
+          files: |
+             frontend/src/**/*.tsx
+             frontend/src/**/*.css
+
+      - name: List changed frontend static files
+        id: changed-frontend-static-files
+        if: steps.changed-frontend-src-files.outputs.files_changed == 'true'
+        uses: tj-actions/changed-files@v44
+        with:
+          files: |
+            static
+
+      - name: Check changed static files
+        if: steps.changed-frontend-src-files.outputs.files_changed == 'true' && steps.changed-frontend-static-files.outputs.files_changed == 'false'
+        run: echo "There were changes to the frontend code, but no corresponding changes to static files.  Please build the project locally, commit the static file changes and push the changes to try again." && exit 1
+
+      - name: List logo customizations
+        uses: tj-actions/changed-files@v44
+        id: changed-logo-files
+        with:
+          files: |
+             frontend/public/favicon.ico
+             frontend/src/assets
+
+      - name: Comment on logo customizations
+        if: steps.changed-logo-files.outputs.any_changed == 'true'
+        uses: mshick/add-pr-comment@v2
+        with:
+          message: |
+            "Hello contributor, it appears that your pull request contains some customizations to branding. We are unable to approve your PR at this time. If you believe this message to be in error, please correct these unsupported changes and try again, or reach out to one of the maintainers of the repo to discuss the changes."
+        
+      - name: Fail on logo customizations
+        if: steps.changed-logo-files.outputs.any_changed == 'true'
+        run: echo "Logos are customized, which is not a supported change." && exit 1
diff --git a/.github/workflows/check-test-updates.yml b/.github/workflows/check-test-updates.yml
diff --git a/.github/workflows/docker-image-build.yml b/.github/workflows/docker-image-build.yml
@@ -12,16 +12,6 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-    - name: Azure Container Registry Login
-      uses: Azure/docker-login@v1
-      with:
-        # Container registry username
-        username: ${{ secrets.SAMPLEAPP_ACR_USERNAME }}
-        # Container registry password
-        password: ${{ secrets.SAMPLEAPP_ACR_PASSWORD }}
-        # Container registry server url
-        login-server: sampleappaoaichatgpt.azurecr.io
-
     - uses: actions/checkout@v3
     - name: Build the Docker image
       run:         

diff --git a/.github/workflows/docker-image-publish.yml b/.github/workflows/docker-image-publish.yml
@@ -13,7 +13,7 @@ jobs:
 
     steps:
     - name: Azure Container Registry Login
-      uses: Azure/docker-login@v1
+      uses: Azure/docker-login@v2
       with:
         # Container registry username
         username: ${{ secrets.SAMPLEAPP_ACR_USERNAME }}

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -29,7 +29,22 @@ jobs:
     - name: Test with pytest
       run: |
         export PYTHONPATH=$(pwd)
-        pytest -v --show-capture=stdout -k "not integration"
+        coverage run --omit=tests/integration_tests -m pytest -v --show-capture=stdout -k "not integration"
+        coverage report -m
+        coverage xml
+
+    - name: Code Coverage Report
+      uses: irongut/CodeCoverageSummary@v1.3.0
+      with:
+        filename: coverage.xml
+        badge: true
+        fail_below_min: true
+        format: markdown
+        hide_branch_rate: false
+        hide_complexity: true
+        indicators: true
+        output: both
+        thresholds: '60 80'
 
   test_windows:
     runs-on:
@@ -47,4 +62,4 @@ jobs:
     - name: Test with pytest
       run: |
         $env:PYTHONPATH=$pwd
-        pytest -v --show-capture=stdout -k "not integration"
+        pytest -v --show-capture=stdout -k "not integration"
diff --git a/.github/workflows/stale-issues.yml b/.github/workflows/stale-issues.yml
@@ -0,0 +1,22 @@
+name: Close inactive issues
+on:
+  schedule:
+    - cron: "30 1 * * *"
+
+jobs:
+  close-issues:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v5
+        with:
+          days-before-issue-stale: 60
+          days-before-issue-close: 14
+          stale-issue-label: "stale"
+          stale-issue-message: "This issue is stale because it has been open for 60 days with no activity."
+          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
+          days-before-pr-stale: 60
+          days-before-pr-close: 14
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/README.md b/README.md
@@ -31,7 +31,7 @@ Please see the [section below](#add-an-identity-provider) for important informat
     These variables are required:
     - `AZURE_OPENAI_RESOURCE`
     - `AZURE_OPENAI_MODEL`
-    - `AZURE_OPENAI_KEY`
+    - `AZURE_OPENAI_KEY` (optional if using Entra ID)
 
     These variables are optional:
     - `AZURE_OPENAI_TEMPERATURE`
@@ -58,7 +58,7 @@ NOTE: You may find you need to set: MacOS: `export NODE_OPTIONS="--max-old-space
     - `DATASOURCE_TYPE` (should be set to `AzureCognitiveSearch`)
     - `AZURE_SEARCH_SERVICE`
     - `AZURE_SEARCH_INDEX`
-    - `AZURE_SEARCH_KEY`
+    - `AZURE_SEARCH_KEY` (optional if using Entra ID)
 
     These variables are optional:
     - `AZURE_SEARCH_USE_SEMANTIC_SEARCH`
@@ -188,6 +188,20 @@ The Citation panel is defined at the end of `frontend/src/pages/chat/Chat.tsx`.
 
 ```
 
+### Using Entra ID
+
+The app uses Azure OpenAI on your data [(see documentation)](https://learn.microsoft.com/en-us/azure/ai-services/openai/references/on-your-data). To enable Entra ID for intra-service authentication:
+
+1. Enable managed identity on Azure OpenAI
+2. Configure AI search to allow access from Azure OpenAI
+   1. Enable Role Based Access control on the used AI search instance [(see documentation)](https://learn.microsoft.com/en-us/azure/search/search-security-enable-roles)
+   2. Assign `Search Index Data Reader` and `Search Service Contributor` to the identity of the Azure OpenAI instance
+3. To use Entra ID authentication, leave `AZURE_SEARCH_KEY` and `AZURE_OPENAI_KEY` unset.
+4. Configure the webapp identity
+   1. Enable managed identity in the app service that hosts the webapp
+   2. Go to the Azure OpenAI instance and assign the role `Cognitive Services OpenAI User` to the identity of the webapp
+
+Note: RBAC assignments can take a few minutes before becoming effective.
 
 ### Best Practices
 We recommend keeping these best practices in mind:
@@ -207,24 +221,24 @@ Note: settings starting with `AZURE_SEARCH` are only needed when using Azure Ope
 | --- | --- | ------------- |
 |AZURE_SEARCH_SERVICE||The name of your Azure AI Search resource|
 |AZURE_SEARCH_INDEX||The name of your Azure AI Search Index|
-|AZURE_SEARCH_KEY||An **admin key** for your Azure AI Search resource|
+|AZURE_SEARCH_KEY||An **admin key** for your Azure AI Search resource.|
 |AZURE_SEARCH_USE_SEMANTIC_SEARCH|False|Whether or not to use semantic search|
 |AZURE_SEARCH_QUERY_TYPE|simple|Query type: simple, semantic, vector, vectorSimpleHybrid, or vectorSemanticHybrid. Takes precedence over AZURE_SEARCH_USE_SEMANTIC_SEARCH|
 |AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG||The name of the semantic search configuration to use if using semantic search.|
 |AZURE_SEARCH_TOP_K|5|The number of documents to retrieve from Azure AI Search.|
 |AZURE_SEARCH_ENABLE_IN_DOMAIN|True|Limits responses to only queries relating to your data.|
 |AZURE_SEARCH_CONTENT_COLUMNS||List of fields in your Azure AI Search index that contains the text content of your documents to use when formulating a bot response. Represent these as a string joined with "|", e.g. `"product_description|product_manual"`|
-|AZURE_SEARCH_FILENAME_COLUMN|| Field from your Azure AI Search index that gives a unique idenitfier of the source of your data to display in the UI.|
+|AZURE_SEARCH_FILENAME_COLUMN|| Field from your Azure AI Search index that gives a unique identifier of the source of your data to display in the UI.|
 |AZURE_SEARCH_TITLE_COLUMN||Field from your Azure AI Search index that gives a relevant title or header for your data content to display in the UI.|
 |AZURE_SEARCH_URL_COLUMN||Field from your Azure AI Search index that contains a URL for the document, e.g. an Azure Blob Storage URI. This value is not currently used.|
 |AZURE_SEARCH_VECTOR_COLUMNS||List of fields in your Azure AI Search index that contain vector embeddings of your documents to use when formulating a bot response. Represent these as a string joined with "|", e.g. `"product_description|product_manual"`|
 |AZURE_SEARCH_PERMITTED_GROUPS_COLUMN||Field from your Azure AI Search index that contains AAD group IDs that determine document-level access control.|
 |AZURE_SEARCH_STRICTNESS|3|Integer from 1 to 5 specifying the strictness for the model limiting responses to your data.|
-|AZURE_OPENAI_RESOURCE||the name of your Azure OpenAI resource|
+|AZURE_OPENAI_RESOURCE||the name of your Azure OpenAI resource (only one of AZURE_OPENAI_RESOURCE/AZURE_OPENAI_ENDPOINT is required)|
 |AZURE_OPENAI_MODEL||The name of your model deployment|
-|AZURE_OPENAI_ENDPOINT||The endpoint of your Azure OpenAI resource.|
+|AZURE_OPENAI_ENDPOINT||The endpoint of your Azure OpenAI resource (only one of AZURE_OPENAI_RESOURCE/AZURE_OPENAI_ENDPOINT is required)|
 |AZURE_OPENAI_MODEL_NAME|gpt-35-turbo-16k|The name of the model|
-|AZURE_OPENAI_KEY||One of the API keys of your Azure OpenAI resource|
+|AZURE_OPENAI_KEY||One of the API keys of your Azure OpenAI resource (optional if using Entra ID)|
 |AZURE_OPENAI_TEMPERATURE|0|What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. A value of 0 is recommended when using your data.|
 |AZURE_OPENAI_TOP_P|1.0|An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. We recommend setting this to 1.0 when using your data.|
 |AZURE_OPENAI_MAX_TOKENS|1000|The maximum number of tokens allowed for the generated answer.|

diff --git a/app.py b/app.py
@@ -126,7 +126,7 @@ def init_openai_client():
         aoai_api_key = app_settings.azure_openai.key
         ad_token_provider = None
         if not aoai_api_key:
-            logging.debug("No AZURE_OPENAI_KEY found, using Azure AD auth")
+            logging.debug("No AZURE_OPENAI_KEY found, using Azure Entra ID auth")
             ad_token_provider = get_bearer_token_provider(
                 DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
             )
@@ -207,7 +207,8 @@ def prepare_model_args(request_body, request_headers):
     user_json = None
     if (MS_DEFENDER_ENABLED):
         authenticated_user_details = get_authenticated_user_details(request_headers)
-        user_json = get_msdefender_user_json(authenticated_user_details, request_headers)
+        conversation_id = request_body.get("conversation_id", None)        
+        user_json = get_msdefender_user_json(authenticated_user_details, request_headers, conversation_id)
 
     model_args = {
         "messages": messages,
@@ -835,9 +836,9 @@ async def ensure_cosmos():
             return jsonify({"error": "CosmosDB is not working"}), 500
 
 
-async def generate_title(conversation_messages):
+async def generate_title(conversation_messages) -> str:
     ## make sure the messages are sorted by _ts descending
-    title_prompt = 'Summarize the conversation so far into a 4-word or less title. Do not use any quotation marks or punctuation. Respond with a json object in the format {{"title": string}}. Do not include any other commentary or description.'
+    title_prompt = "Summarize the conversation so far into a 4-word or less title. Do not use any quotation marks or punctuation. Do not include any other commentary or description."
 
     messages = [
         {"role": msg["role"], "content": msg["content"]}
@@ -846,14 +847,15 @@ async def generate_title(conversation_messages):
     messages.append({"role": "user", "content": title_prompt})
 
     try:
-        azure_openai_client = init_openai_client(use_data=False)
+        azure_openai_client = init_openai_client()
         response = await azure_openai_client.chat.completions.create(
             model=app_settings.azure_openai.model, messages=messages, temperature=1, max_tokens=64
         )
 
-        title = json.loads(response.choices[0].message.content)["title"]
+        title = response.choices[0].message.content
         return title
     except Exception as e:
+        logging.exception("Exception while generating title", e)
         return messages[-2]["content"]
 
 

diff --git a/backend/security/ms_defender_utils.py b/backend/security/ms_defender_utils.py
@@ -1,11 +1,14 @@
 import json
 
-def get_msdefender_user_json(authenticated_user_details, request_headers):
+def get_msdefender_user_json(authenticated_user_details, request_headers, conversation_id):
     auth_provider = authenticated_user_details.get('auth_provider')
-    source_ip = request_headers.get('X-Forwarded-For', request_headers.get('Remote-Addr', ''))
+    source_ip = request_headers.get('Remote-Addr', '')
+    header_names = ['User-Agent', 'X-Forwarded-For', 'Forwarded', 'X-Real-IP', 'True-Client-IP', 'CF-Connecting-IP']
     user_args = {
         "EndUserId": authenticated_user_details.get('user_principal_id'),
         "EndUserIdType": "EntraId" if auth_provider == "aad" else auth_provider,
         "SourceIp": source_ip.split(':')[0], #remove port
+        "SourceRequestHeaders": {header: request_headers[header] for header in header_names if header in request_headers},
+        "ConversationId": conversation_id,
     }
     return json.dumps(user_args)
diff --git a/frontend/src/components/Answer/Answer.tsx b/frontend/src/components/Answer/Answer.tsx
@@ -10,7 +10,7 @@ import remarkGfm from 'remark-gfm'
 import supersub from 'remark-supersub'
 import Plot from 'react-plotly.js'
 import { AskResponse, Citation, Feedback, historyMessageFeedback } from '../../api'
-import { XSSAllowTags } from '../../constants/xssAllowTags'
+import { XSSAllowTags, XSSAllowAttributes } from '../../constants/sanatizeAllowables'
 import { AppStateContext } from '../../state/AppProvider'
 
 import { parseAnswer } from './AnswerParser'
@@ -253,7 +253,7 @@ export const Answer = ({ answer, onCitationClicked, onExectResultClicked }: Prop
                 remarkPlugins={[remarkGfm, supersub]}
                 children={
                   SANITIZE_ANSWER
-                    ? DOMPurify.sanitize(parsedAnswer.markdownFormatText, { ALLOWED_TAGS: XSSAllowTags })
+                    ? DOMPurify.sanitize(parsedAnswer.markdownFormatText, { ALLOWED_TAGS: XSSAllowTags, ALLOWED_ATTR: XSSAllowAttributes })
                     : parsedAnswer.markdownFormatText
                 }
                 className={styles.answerText}

diff --git a/frontend/src/constants/xssAllowTags.ts → frontend/src/constants/sanatizeAllowables.ts b/frontend/src/constants/xssAllowTags.ts → frontend/src/constants/sanatizeAllowables.ts
@@ -42,3 +42,5 @@ export const XSSAllowTags = [
   'ol',
   'li'
 ]
+
+export const XSSAllowAttributes = ['href']
diff --git a/frontend/src/pages/chat/Chat.tsx b/frontend/src/pages/chat/Chat.tsx
@@ -13,7 +13,7 @@ import { nord } from 'react-syntax-highlighter/dist/esm/styles/prism'
 
 import styles from './Chat.module.css'
 import Contoso from '../../assets/Contoso.svg'
-import { XSSAllowTags } from '../../constants/xssAllowTags'
+import { XSSAllowTags } from '../../constants/sanatizeAllowables'
 
 import {
   ChatMessage,
@@ -32,7 +32,6 @@ import {
   CosmosDBStatus,
   ErrorMessage,
   ExecResults,
-  AzureSqlServerCodeExecResult
 } from "../../api";
 import { Answer } from "../../components/Answer";
 import { QuestionInput } from "../../components/QuestionInput";

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -11,4 +11,5 @@ pytest==7.4.0
 pytest-asyncio==0.23.2
 azure-storage-blob
 chardet
-azure-keyvault-secrets
+azure-keyvault-secrets
+coverage
diff --git a/static/assets/index-252b88f2.js → static/assets/index-2e11eaf6.js b/static/assets/index-252b88f2.js → static/assets/index-2e11eaf6.js
diff --git a/static/assets/index-252b88f2.js.map → static/assets/index-2e11eaf6.js.map b/static/assets/index-252b88f2.js.map → static/assets/index-2e11eaf6.js.map
diff --git a/static/index.html b/static/index.html
@@ -5,7 +5,7 @@
     <link rel="icon" type="image/x-icon" href="{{ favicon }}" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>{{ title }}</title>
-    <script type="module" crossorigin src="/assets/index-252b88f2.js"></script>
+    <script type="module" crossorigin src="/assets/index-2e11eaf6.js"></script>
     <link rel="stylesheet" href="/assets/index-61492790.css">
   </head>
   <body>