
Commit

Add URL prefix support in data preparation and hyperlink citation titles (#264)

Co-authored-by: Sarah Widder <[email protected]>
sarah-widder authored Oct 7, 2023
1 parent e9feb40 commit 970d366
Showing 13 changed files with 331 additions and 73 deletions.
46 changes: 44 additions & 2 deletions README.md
@@ -115,8 +115,50 @@ const getUserInfoList = async () => {
}
```

## Best Practices
Feel free to fork this repository and make your own modifications to the UX or backend logic. For example, you may want to expose some of the settings in `app.py` in the UI for users to try out different behaviors. We recommend keeping these best practices in mind:
## Common Customization Scenarios
Feel free to fork this repository and make your own modifications to the UX or backend logic. For example, you may want to change aspects of the chat display, or expose some of the settings in `app.py` in the UI for users to try out different behaviors.

### Updating the default chat logo and headers
The landing chat page logo and headers are specified in `frontend/src/pages/chat/Chat.tsx`:
```
<Stack className={styles.chatEmptyState}>
<img
src={Azure}
className={styles.chatIcon}
aria-hidden="true"
/>
<h1 className={styles.chatEmptyStateTitle}>Start chatting</h1>
<h2 className={styles.chatEmptyStateSubtitle}>This chatbot is configured to answer your questions</h2>
</Stack>
```
To update the logo, change `src={Azure}` to point to your own SVG file, which you can put in `frontend/src/assets`.
To update the headers, change the strings "Start chatting" and "This chatbot is configured to answer your questions" to your desired values.
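For example, here is a minimal sketch of a customized landing state, assuming you add a hypothetical `Contoso.svg` to `frontend/src/assets`:
```
// in frontend/src/pages/chat/Chat.tsx (sketch; Contoso.svg is a hypothetical asset)
import Contoso from "../../assets/Contoso.svg";

<Stack className={styles.chatEmptyState}>
    <img
        src={Contoso}
        className={styles.chatIcon}
        aria-hidden="true"
    />
    <h1 className={styles.chatEmptyStateTitle}>Ask Contoso</h1>
    <h2 className={styles.chatEmptyStateSubtitle}>Answers are generated from your company documents</h2>
</Stack>
```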

### Changing Citation Display
The Citation panel is defined at the end of `frontend/src/pages/chat/Chat.tsx`. The citations returned from Azure OpenAI On Your Data will include `content`, `title`, `filepath`, and in some cases `url`. You can customize the Citation section to use and display these as you like. For example, the "View Source" button will open the citation URL in a new tab when clicked:
```
const onViewSource = (citation: Citation) => {
if (citation.url) {
window.open(citation.url, "_blank");
}
};
<span
title={activeCitation.url}
tabIndex={0}
role="link"
onClick={() => onViewSource(activeCitation)}
onKeyDown={e => e.key === "Enter" || e.key === " " ? onViewSource(activeCitation) : null}
className={styles.viewSourceButton}
aria-label={activeCitation.url}
>
View Source
</span>
```
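Beyond the URL, you can surface other citation fields as well. As a rough sketch (not part of this commit), the `filepath` could be shown under the title when present, reusing the existing `citationPanelContent` style:
```
{activeCitation.filepath && (
    <div className={styles.citationPanelContent} tabIndex={0}>
        {activeCitation.filepath}
    </div>
)}
```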


### Best Practices
We recommend keeping these best practices in mind:

- Reset the chat session (clear chat) if the user changes any settings. Notify the user that their chat history will be lost (see the sketch after this list).
- Clearly communicate to the user what impact each setting will have on their experience.
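
For the first practice, a minimal sketch (not from this repository) of a settings handler, assuming a hypothetical `temperature` value held in React state alongside the existing `messages` state:
```
const onTemperatureChange = (newValue: number) => {
    // Warn the user, then clear the chat before applying the new value
    if (messages.length === 0 || window.confirm("Changing this setting will clear your chat history. Continue?")) {
        setMessages([]);          // reset the chat session
        setTemperature(newValue); // hypothetical setting state
    }
};
```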
24 changes: 24 additions & 0 deletions frontend/src/pages/chat/Chat.module.css
@@ -290,6 +290,11 @@
margin-bottom: 12px;
}

.citationPanelTitle:hover {
text-decoration: underline;
cursor: pointer;
}

.citationPanelContent {
font-style: normal;
font-weight: 400;
@@ -305,4 +310,23 @@
a {
padding-left: 5px;
padding-right: 5px;
}

.viewSourceButton {
font-style: normal;
font-weight: 600;
font-size: 12px;
line-height: 16px;
color: #115EA3;
flex-direction: row;
align-items: center;
padding: 4px 6px;
gap: 4px;
border: 1px solid #D1D1D1;
border-radius: 4px;
}

.viewSourceButton:hover {
text-decoration: underline;
cursor: pointer;
}
18 changes: 12 additions & 6 deletions frontend/src/pages/chat/Chat.tsx
@@ -43,7 +43,7 @@ const Chat = () => {
const chatMessageStreamEnd = useRef<HTMLDivElement | null>(null);
const [isLoading, setIsLoading] = useState<boolean>(false);
const [showLoadingMessage, setShowLoadingMessage] = useState<boolean>(false);
const [activeCitation, setActiveCitation] = useState<[content: string, id: string, title: string, filepath: string, url: string, metadata: string]>();
const [activeCitation, setActiveCitation] = useState<Citation>();
const [isCitationPanelOpen, setIsCitationPanelOpen] = useState<boolean>(false);
const abortFuncs = useRef([] as AbortController[]);
const [showAuthMessage, setShowAuthMessage] = useState<boolean>(true);
@@ -502,10 +502,16 @@ const Chat = () => {
}, [showLoadingMessage, processMessages]);

const onShowCitation = (citation: Citation) => {
setActiveCitation([citation.content, citation.id, citation.title ?? "", citation.filepath ?? "", "", ""]);
setActiveCitation(citation);
setIsCitationPanelOpen(true);
};

const onViewSource = (citation: Citation) => {
if (citation.url && !citation.url.includes("blob.core")) {
window.open(citation.url, "_blank");
}
};

const parseCitationFromMessage = (message: ChatMessage) => {
if (message?.role && message?.role === "tool") {
try {
@@ -668,23 +674,23 @@
/>
</Stack>
</div>
{messages && messages.length > 0 && isCitationPanelOpen && activeCitation && (
{/* Citation Panel */}
{messages && messages.length > 0 && isCitationPanelOpen && activeCitation && (
<Stack.Item className={styles.citationPanel} tabIndex={0} role="tabpanel" aria-label="Citations Panel">
<Stack aria-label="Citations Panel Header Container" horizontal className={styles.citationPanelHeaderContainer} horizontalAlign="space-between" verticalAlign="center">
<span aria-label="Citations" className={styles.citationPanelHeader}>Citations</span>
<IconButton iconProps={{ iconName: 'Cancel'}} aria-label="Close citations panel" onClick={() => setIsCitationPanelOpen(false)}/>
</Stack>
<h5 className={styles.citationPanelTitle} tabIndex={0}>{activeCitation[2]}</h5>
<h5 className={styles.citationPanelTitle} tabIndex={0} title={activeCitation.url && !activeCitation.url.includes("blob.core") ? activeCitation.url : activeCitation.title ?? ""} onClick={() => onViewSource(activeCitation)}>{activeCitation.title}</h5>
<div tabIndex={0}>
<ReactMarkdown
linkTarget="_blank"
className={styles.citationPanelContent}
children={activeCitation[0]}
children={activeCitation.content}
remarkPlugins={[remarkGfm]}
rehypePlugins={[rehypeRaw]}
/>
</div>

</Stack.Item>
)}
{(appStateContext?.state.isChatHistoryOpen && appStateContext?.state.isCosmosDBAvailable?.status !== CosmosDBStatus.NotConfigured) && <ChatHistoryPanel/>}
3 changes: 2 additions & 1 deletion requirements-dev.txt
@@ -7,4 +7,5 @@ tiktoken==0.4.0
langchain==0.0.292
bs4==0.0.1
urllib3==2.0.6
pytest==7.4.0
pytest==7.4.0
azure-storage-blob
22 changes: 22 additions & 0 deletions scripts/config_multiple_url.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[
{
"data_paths": [
{
"path": "data/source1",
"url_prefix": "https://<URL for source 1>.com/"
},
{
"path": "data/source2",
"url_prefix": "https://<URL for source 2>.com/"
}
],
"subscription_id": "<subscription id>",
"resource_group": "<resource group name>",
"search_service_name": "<search service name to use or create>",
"index_name": "<index name to use or create>",
"chunk_size": 1024,
"token_overlap": 128,
"semantic_config_name": "default",
"language": "<Language to support for example use 'en' for English. Checked supported languages here under lucene - https://learn.microsoft.com/en-us/azure/search/index-add-language-analyzers"
}
]
62 changes: 40 additions & 22 deletions scripts/data_preparation.py
@@ -13,7 +13,7 @@
from azure.search.documents import SearchClient
from tqdm import tqdm

from data_utils import chunk_directory
from data_utils import chunk_directory, chunk_blob_container

SUPPORTED_LANGUAGE_CODES = {
"ar": "Arabic",
@@ -229,7 +229,7 @@ def create_or_update_search_index(
"searchable": True,
"retrievable": True,
"dimensions": 1536,
"vectorSearchConfiguration": "default"
"vectorSearchConfiguration": vector_config_name
})

body["vectorSearch"] = {
@@ -365,26 +365,44 @@ def create_index(config, credential, form_recognizer_client=None, embedding_mode
if not create_or_update_search_index(service_name, subscription_id, resource_group, index_name, config["semantic_config_name"], credential, language, vector_config_name=config.get("vector_config_name", None)):
raise Exception(f"Failed to create or update index {index_name}")

# chunk directory
print("Chunking directory...")
add_embeddings = False
if config.get("vector_config_name") and embedding_model_endpoint:
add_embeddings = True
result = chunk_directory(config["data_path"], num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0),
azure_credential=credential, form_recognizer_client=form_recognizer_client, use_layout=use_layout, njobs=njobs,
add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint)

if len(result.chunks) == 0:
raise Exception("No chunks found. Please check the data path and chunk size.")

print(f"Processed {result.total_files} files")
print(f"Unsupported formats: {result.num_unsupported_format_files} files")
print(f"Files with errors: {result.num_files_with_errors} files")
print(f"Found {len(result.chunks)} chunks")

# upload documents to index
print("Uploading documents to index...")
upload_documents_to_index(service_name, subscription_id, resource_group, index_name, result.chunks, credential)
data_configs = []
if "data_path" in config:
data_configs.append({
"path": config["data_path"],
"url_prefix": config.get("url_prefix", None),
})
if "data_paths" in config:
data_configs.extend(config["data_paths"])

for data_config in data_configs:
# chunk directory
print(f"Chunking path {data_config['path']}...")
add_embeddings = False
if config.get("vector_config_name") and embedding_model_endpoint:
add_embeddings = True

if "blob.core" in data_config["path"]:
result = chunk_blob_container(data_config["path"], credential=credential, num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0),
azure_credential=credential, form_recognizer_client=form_recognizer_client, use_layout=use_layout, njobs=njobs,
add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint, url_prefix=data_config["url_prefix"])
elif os.path.exists(data_config["path"]):
result = chunk_directory(data_config["path"], num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0),
azure_credential=credential, form_recognizer_client=form_recognizer_client, use_layout=use_layout, njobs=njobs,
add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint, url_prefix=data_config["url_prefix"])
else:
raise Exception(f"Path {data_config['path']} does not exist and is not a blob URL. Please check the path and try again.")

if len(result.chunks) == 0:
raise Exception("No chunks found. Please check the data path and chunk size.")

print(f"Processed {result.total_files} files")
print(f"Unsupported formats: {result.num_unsupported_format_files} files")
print(f"Files with errors: {result.num_files_with_errors} files")
print(f"Found {len(result.chunks)} chunks")

# upload documents to index
print("Uploading documents to index...")
upload_documents_to_index(service_name, subscription_id, resource_group, index_name, result.chunks, credential)

# check if index is ready/validate index
print("Validating index...")
78 changes: 72 additions & 6 deletions scripts/data_utils.py
@@ -7,6 +7,8 @@
import re
import requests
import openai
import re
import tempfile
from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor
from dataclasses import dataclass
@@ -18,6 +20,7 @@
from azure.identity import DefaultAzureCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import ContainerClient
from bs4 import BeautifulSoup
from langchain.text_splitter import TextSplitter, MarkdownTextSplitter, RecursiveCharacterTextSplitter, PythonCodeTextSplitter
from tqdm import tqdm
@@ -450,6 +453,32 @@ class ChunkingResult:
    # some chunks might be skipped due to small number of tokens
skipped_chunks: int = 0

def extractStorageDetailsFromUrl(url):
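    """Split a blob storage URL into (storage_account, container_name, path)."""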
matches = re.fullmatch(r'https:\/\/([^\/.]*)\.blob\.core\.windows\.net\/([^\/]*)\/(.*)', url)
if not matches:
raise Exception(f"Not a valid blob storage URL: {url}")
return (matches.group(1), matches.group(2), matches.group(3))

def downloadBlobUrlToLocalFolder(blob_url, local_folder, credential):
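    """Download every blob under the given blob URL prefix into local_folder, preserving relative paths."""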
(storage_account, container_name, path) = extractStorageDetailsFromUrl(blob_url)
container_url = f'https://{storage_account}.blob.core.windows.net/{container_name}'
container_client = ContainerClient.from_container_url(container_url, credential=credential)
if path and not path.endswith('/'):
path = path + '/'

last_destination_folder = None
for blob in container_client.list_blobs(name_starts_with=path):
relative_path = blob.name[len(path):]
destination_path = os.path.join(local_folder, relative_path)
destination_folder = os.path.dirname(destination_path)
if destination_folder != last_destination_folder:
os.makedirs(destination_folder, exist_ok=True)
last_destination_folder = destination_folder
blob_client = container_client.get_blob_client(blob.name)
with open(file=destination_path, mode='wb') as local_file:
stream = blob_client.download_blob()
local_file.write(stream.readall())

def get_files_recursively(directory_path: str) -> List[str]:
"""Gets all files in the given directory recursively.
Args:
@@ -857,6 +886,44 @@ def process_file(
result =None
return result, is_error

def chunk_blob_container(
blob_url: str,
credential,
ignore_errors: bool = True,
num_tokens: int = 1024,
min_chunk_size: int = 10,
url_prefix = None,
token_overlap: int = 0,
extensions_to_process: List[str] = list(FILE_FORMAT_DICT.keys()),
form_recognizer_client = None,
use_layout = False,
njobs=4,
add_embeddings = False,
azure_credential = None,
embedding_endpoint = None
):
with tempfile.TemporaryDirectory() as local_data_folder:
print(f'Downloading {blob_url} to local folder')
downloadBlobUrlToLocalFolder(blob_url, local_data_folder, credential)
print(f'Downloaded.')

result = chunk_directory(
local_data_folder,
ignore_errors=ignore_errors,
num_tokens=num_tokens,
min_chunk_size=min_chunk_size,
url_prefix=url_prefix,
token_overlap=token_overlap,
extensions_to_process=extensions_to_process,
form_recognizer_client=form_recognizer_client,
use_layout=use_layout,
njobs=njobs,
add_embeddings=add_embeddings,
azure_credential=azure_credential,
embedding_endpoint=embedding_endpoint
)

return result


def chunk_directory(
@@ -954,14 +1021,13 @@

class SingletonFormRecognizerClient:
instance = None
url = os.getenv("FORM_RECOGNIZER_ENDPOINT")
key = os.getenv("FORM_RECOGNIZER_KEY")

def __new__(cls, *args, **kwargs):
if not cls.instance:
print("SingletonFormRecognizerClient: Creating instance of Form recognizer per process")
if cls.url and cls.key:
cls.instance = DocumentAnalysisClient(endpoint=cls.url, credential=AzureKeyCredential(cls.key))
url = os.getenv("FORM_RECOGNIZER_ENDPOINT")
key = os.getenv("FORM_RECOGNIZER_KEY")
if url and key:
cls.instance = DocumentAnalysisClient(endpoint=url, credential=AzureKeyCredential(key))
else:
print("SingletonFormRecognizerClient: Skipping since credentials not provided. Assuming NO form recognizer extensions(like .pdf) in directory")
cls.instance = object() # dummy object
@@ -972,4 +1038,4 @@ def __getstate__(self):

def __setstate__(self, state):
url, key = state
self.instance = DocumentAnalysisClient(endpoint=url, credential=AzureKeyCredential(key))
self.instance = DocumentAnalysisClient(endpoint=url, credential=AzureKeyCredential(key))