
Commit

Add URL prefix support in data preparation and hyperlink citation titles (#264)

Co-authored-by: Sarah Widder <[email protected]>
sarah-widder authored Oct 7, 2023
1 parent e9feb40 commit 970d366
Showing 13 changed files with 331 additions and 73 deletions.
46 changes: 44 additions & 2 deletions README.md
@@ -115,8 +115,50 @@ const getUserInfoList = async () => {
}
```

## Best Practices
Feel free to fork this repository and make your own modifications to the UX or backend logic. For example, you may want to expose some of the settings in `app.py` in the UI for users to try out different behaviors. We recommend keeping these best practices in mind:
## Common Customization Scenarios
Feel free to fork this repository and make your own modifications to the UX or backend logic. For example, you may want to change aspects of the chat display, or expose some of the settings in `app.py` in the UI for users to try out different behaviors.

### Updating the default chat logo and headers
The landing chat page logo and headers are specified in `frontend/src/pages/chat/Chat.tsx`:
```
<Stack className={styles.chatEmptyState}>
<img
src={Azure}
className={styles.chatIcon}
aria-hidden="true"
/>
<h1 className={styles.chatEmptyStateTitle}>Start chatting</h1>
<h2 className={styles.chatEmptyStateSubtitle}>This chatbot is configured to answer your questions</h2>
</Stack>
```
To update the logo, change `src={Azure}` to point to your own SVG file, which you can put in `frontend/src/assets`.
To update the headers, change the strings "Start chatting" and "This chatbot is configured to answer your questions" to your desired values.
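For example, here is a minimal sketch of a customized landing state, assuming you add a hypothetical `Contoso.svg` to `frontend/src/assets`:
```
// in frontend/src/pages/chat/Chat.tsx (sketch; Contoso.svg is a hypothetical asset)
import Contoso from "../../assets/Contoso.svg";

<Stack className={styles.chatEmptyState}>
    <img
        src={Contoso}
        className={styles.chatIcon}
        aria-hidden="true"
    />
    <h1 className={styles.chatEmptyStateTitle}>Ask Contoso</h1>
    <h2 className={styles.chatEmptyStateSubtitle}>Answers are generated from your company documents</h2>
</Stack>
```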

### Changing Citation Display
The Citation panel is defined at the end of `frontend/src/pages/chat/Chat.tsx`. The citations returned from Azure OpenAI On Your Data will include `content`, `title`, `filepath`, and in some cases `url`. You can customize the Citation section to use and display these as you like. For example, the "View Source" button will open the citation URL in a new tab when clicked:
```
const onViewSource = (citation: Citation) => {
if (citation.url) {
window.open(citation.url, "_blank");
}
};
<span
title={activeCitation.url}
tabIndex={0}
role="link"
onClick={() => onViewSource(activeCitation)}
onKeyDown={e => e.key === "Enter" || e.key === " " ? onViewSource(activeCitation) : null}
className={styles.viewSourceButton}
aria-label={activeCitation.url}
>
View Source
</span>
```
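Beyond the URL, you can surface other citation fields as well. As a rough sketch (not part of this commit), the `filepath` could be shown under the title when present, reusing the existing `citationPanelContent` style:
```
{activeCitation.filepath && (
    <div className={styles.citationPanelContent} tabIndex={0}>
        {activeCitation.filepath}
    </div>
)}
```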


### Best Practices
We recommend keeping these best practices in mind:

- Reset the chat session (clear chat) if the user changes any settings. Notify the user that their chat history will be lost (see the sketch after this list).
- Clearly communicate to the user what impact each setting will have on their experience.
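
For the first practice, a minimal sketch (not from this repository) of a settings handler, assuming a hypothetical `temperature` value held in React state alongside the existing `messages` state:
```
const onTemperatureChange = (newValue: number) => {
    // Warn the user, then clear the chat before applying the new value
    if (messages.length === 0 || window.confirm("Changing this setting will clear your chat history. Continue?")) {
        setMessages([]);          // reset the chat session
        setTemperature(newValue); // hypothetical setting state
    }
};
```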
24 changes: 24 additions & 0 deletions frontend/src/pages/chat/Chat.module.css
@@ -290,6 +290,11 @@
margin-bottom: 12px;
}

.citationPanelTitle:hover {
text-decoration: underline;
cursor: pointer;
}

.citationPanelContent {
font-style: normal;
font-weight: 400;
@@ -305,4 +310,23 @@
a {
padding-left: 5px;
padding-right: 5px;
}

.viewSourceButton {
font-style: normal;
font-weight: 600;
font-size: 12px;
line-height: 16px;
color: #115EA3;
flex-direction: row;
align-items: center;
padding: 4px 6px;
gap: 4px;
border: 1px solid #D1D1D1;
border-radius: 4px;
}

.viewSourceButton:hover {
text-decoration: underline;
cursor: pointer;
}
18 changes: 12 additions & 6 deletions frontend/src/pages/chat/Chat.tsx
@@ -43,7 +43,7 @@ const Chat = () => {
const chatMessageStreamEnd = useRef<HTMLDivElement | null>(null);
const [isLoading, setIsLoading] = useState<boolean>(false);
const [showLoadingMessage, setShowLoadingMessage] = useState<boolean>(false);
const [activeCitation, setActiveCitation] = useState<[content: string, id: string, title: string, filepath: string, url: string, metadata: string]>();
const [activeCitation, setActiveCitation] = useState<Citation>();
const [isCitationPanelOpen, setIsCitationPanelOpen] = useState<boolean>(false);
const abortFuncs = useRef([] as AbortController[]);
const [showAuthMessage, setShowAuthMessage] = useState<boolean>(true);
@@ -502,10 +502,16 @@ const Chat = () => {
}, [showLoadingMessage, processMessages]);

const onShowCitation = (citation: Citation) => {
setActiveCitation([citation.content, citation.id, citation.title ?? "", citation.filepath ?? "", "", ""]);
setActiveCitation(citation);
setIsCitationPanelOpen(true);
};

const onViewSource = (citation: Citation) => {
if (citation.url && !citation.url.includes("blob.core")) {
window.open(citation.url, "_blank");
}
};

const parseCitationFromMessage = (message: ChatMessage) => {
if (message?.role && message?.role === "tool") {
try {
@@ -668,23 +674,23 @@
/>
</Stack>
</div>
{messages && messages.length > 0 && isCitationPanelOpen && activeCitation && (
{/* Citation Panel */}
{messages && messages.length > 0 && isCitationPanelOpen && activeCitation && (
<Stack.Item className={styles.citationPanel} tabIndex={0} role="tabpanel" aria-label="Citations Panel">
<Stack aria-label="Citations Panel Header Container" horizontal className={styles.citationPanelHeaderContainer} horizontalAlign="space-between" verticalAlign="center">
<span aria-label="Citations" className={styles.citationPanelHeader}>Citations</span>
<IconButton iconProps={{ iconName: 'Cancel'}} aria-label="Close citations panel" onClick={() => setIsCitationPanelOpen(false)}/>
</Stack>
<h5 className={styles.citationPanelTitle} tabIndex={0}>{activeCitation[2]}</h5>
<h5 className={styles.citationPanelTitle} tabIndex={0} title={activeCitation.url && !activeCitation.url.includes("blob.core") ? activeCitation.url : activeCitation.title ?? ""} onClick={() => onViewSource(activeCitation)}>{activeCitation.title}</h5>
<div tabIndex={0}>
<ReactMarkdown
linkTarget="_blank"
className={styles.citationPanelContent}
children={activeCitation[0]}
children={activeCitation.content}
remarkPlugins={[remarkGfm]}
rehypePlugins={[rehypeRaw]}
/>
</div>

</Stack.Item>
)}
{(appStateContext?.state.isChatHistoryOpen && appStateContext?.state.isCosmosDBAvailable?.status !== CosmosDBStatus.NotConfigured) && <ChatHistoryPanel/>}
3 changes: 2 additions & 1 deletion requirements-dev.txt
@@ -7,4 +7,5 @@ tiktoken==0.4.0
langchain==0.0.292
bs4==0.0.1
urllib3==2.0.6
pytest==7.4.0
pytest==7.4.0
azure-storage-blob
22 changes: 22 additions & 0 deletions scripts/config_multiple_url.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[
{
"data_paths": [
{
"path": "data/source1",
"url_prefix": "https://<URL for source 1>.com/"
},
{
"path": "data/source2",
"url_prefix": "https://<URL for source 2>.com/"
}
],
"subscription_id": "<subscription id>",
"resource_group": "<resource group name>",
"search_service_name": "<search service name to use or create>",
"index_name": "<index name to use or create>",
"chunk_size": 1024,
"token_overlap": 128,
"semantic_config_name": "default",
"language": "<Language to support for example use 'en' for English. Checked supported languages here under lucene - https://learn.microsoft.com/en-us/azure/search/index-add-language-analyzers"
}
]
62 changes: 40 additions & 22 deletions scripts/data_preparation.py
@@ -13,7 +13,7 @@
from azure.search.documents import SearchClient
from tqdm import tqdm

from data_utils import chunk_directory
from data_utils import chunk_directory, chunk_blob_container

SUPPORTED_LANGUAGE_CODES = {
"ar": "Arabic",
@@ -229,7 +229,7 @@ def create_or_update_search_index(
"searchable": True,
"retrievable": True,
"dimensions": 1536,
"vectorSearchConfiguration": "default"
"vectorSearchConfiguration": vector_config_name
})

body["vectorSearch"] = {
@@ -365,26 +365,44 @@ def create_index(config, credential, form_recognizer_client=None, embedding_mode
if not create_or_update_search_index(service_name, subscription_id, resource_group, index_name, config["semantic_config_name"], credential, language, vector_config_name=config.get("vector_config_name", None)):
raise Exception(f"Failed to create or update index {index_name}")

# chunk directory
print("Chunking directory...")
add_embeddings = False
if config.get("vector_config_name") and embedding_model_endpoint:
add_embeddings = True
result = chunk_directory(config["data_path"], num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0),
azure_credential=credential, form_recognizer_client=form_recognizer_client, use_layout=use_layout, njobs=njobs,
add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint)

if len(result.chunks) == 0:
raise Exception("No chunks found. Please check the data path and chunk size.")

print(f"Processed {result.total_files} files")
print(f"Unsupported formats: {result.num_unsupported_format_files} files")
print(f"Files with errors: {result.num_files_with_errors} files")
print(f"Found {len(result.chunks)} chunks")

# upload documents to index
print("Uploading documents to index...")
upload_documents_to_index(service_name, subscription_id, resource_group, index_name, result.chunks, credential)
data_configs = []
if "data_path" in config:
data_configs.append({
"path": config["data_path"],
"url_prefix": config.get("url_prefix", None),
})
if "data_paths" in config:
data_configs.extend(config["data_paths"])

for data_config in data_configs:
# chunk directory
print(f"Chunking path {data_config['path']}...")
add_embeddings = False
if config.get("vector_config_name") and embedding_model_endpoint:
add_embeddings = True

if "blob.core" in data_config["path"]:
result = chunk_blob_container(data_config["path"], credential=credential, num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0),
azure_credential=credential, form_recognizer_client=form_recognizer_client, use_layout=use_layout, njobs=njobs,
add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint, url_prefix=data_config["url_prefix"])
elif os.path.exists(data_config["path"]):
result = chunk_directory(data_config["path"], num_tokens=config["chunk_size"], token_overlap=config.get("token_overlap",0),
azure_credential=credential, form_recognizer_client=form_recognizer_client, use_layout=use_layout, njobs=njobs,
add_embeddings=add_embeddings, embedding_endpoint=embedding_model_endpoint, url_prefix=data_config["url_prefix"])
else:
raise Exception(f"Path {data_config['path']} does not exist and is not a blob URL. Please check the path and try again.")

if len(result.chunks) == 0:
raise Exception("No chunks found. Please check the data path and chunk size.")

print(f"Processed {result.total_files} files")
print(f"Unsupported formats: {result.num_unsupported_format_files} files")
print(f"Files with errors: {result.num_files_with_errors} files")
print(f"Found {len(result.chunks)} chunks")

# upload documents to index
print("Uploading documents to index...")
upload_documents_to_index(service_name, subscription_id, resource_group, index_name, result.chunks, credential)

# check if index is ready/validate index
print("Validating index...")
78 changes: 72 additions & 6 deletions scripts/data_utils.py
@@ -7,6 +7,8 @@
import re
import requests
import openai
import re
import tempfile
from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor
from dataclasses import dataclass
@@ -18,6 +20,7 @@
from azure.identity import DefaultAzureCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import ContainerClient
from bs4 import BeautifulSoup
from langchain.text_splitter import TextSplitter, MarkdownTextSplitter, RecursiveCharacterTextSplitter, PythonCodeTextSplitter
from tqdm import tqdm
@@ -450,6 +453,32 @@ class ChunkingResult:
    # some chunks might be skipped due to small number of tokens
skipped_chunks: int = 0

def extractStorageDetailsFromUrl(url):
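    """Split a blob storage URL into (storage_account, container_name, path)."""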
matches = re.fullmatch(r'https:\/\/([^\/.]*)\.blob\.core\.windows\.net\/([^\/]*)\/(.*)', url)
if not matches:
raise Exception(f"Not a valid blob storage URL: {url}")
return (matches.group(1), matches.group(2), matches.group(3))

def downloadBlobUrlToLocalFolder(blob_url, local_folder, credential):
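    """Download every blob under the given blob URL prefix into local_folder, preserving relative paths."""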
(storage_account, container_name, path) = extractStorageDetailsFromUrl(blob_url)
container_url = f'https://{storage_account}.blob.core.windows.net/{container_name}'
container_client = ContainerClient.from_container_url(container_url, credential=credential)
if path and not path.endswith('/'):
path = path + '/'

last_destination_folder = None
for blob in container_client.list_blobs(name_starts_with=path):
relative_path = blob.name[len(path):]
destination_path = os.path.join(local_folder, relative_path)
destination_folder = os.path.dirname(destination_path)
if destination_folder != last_destination_folder:
os.makedirs(destination_folder, exist_ok=True)
last_destination_folder = destination_folder
blob_client = container_client.get_blob_client(blob.name)
with open(file=destination_path, mode='wb') as local_file:
stream = blob_client.download_blob()
local_file.write(stream.readall())

def get_files_recursively(directory_path: str) -> List[str]:
"""Gets all files in the given directory recursively.
Args:
@@ -857,6 +886,44 @@ def process_file(
result =None
return result, is_error

def chunk_blob_container(
blob_url: str,
credential,
ignore_errors: bool = True,
num_tokens: int = 1024,
min_chunk_size: int = 10,
url_prefix = None,
token_overlap: int = 0,
extensions_to_process: List[str] = list(FILE_FORMAT_DICT.keys()),
form_recognizer_client = None,
use_layout = False,
njobs=4,
add_embeddings = False,
azure_credential = None,
embedding_endpoint = None
):
with tempfile.TemporaryDirectory() as local_data_folder:
print(f'Downloading {blob_url} to local folder')
downloadBlobUrlToLocalFolder(blob_url, local_data_folder, credential)
print(f'Downloaded.')

result = chunk_directory(
local_data_folder,
ignore_errors=ignore_errors,
num_tokens=num_tokens,
min_chunk_size=min_chunk_size,
url_prefix=url_prefix,
token_overlap=token_overlap,
extensions_to_process=extensions_to_process,
form_recognizer_client=form_recognizer_client,
use_layout=use_layout,
njobs=njobs,
add_embeddings=add_embeddings,
azure_credential=azure_credential,
embedding_endpoint=embedding_endpoint
)

return result


def chunk_directory(
@@ -954,14 +1021,13 @@

class SingletonFormRecognizerClient:
instance = None
url = os.getenv("FORM_RECOGNIZER_ENDPOINT")
key = os.getenv("FORM_RECOGNIZER_KEY")

def __new__(cls, *args, **kwargs):
if not cls.instance:
print("SingletonFormRecognizerClient: Creating instance of Form recognizer per process")
if cls.url and cls.key:
cls.instance = DocumentAnalysisClient(endpoint=cls.url, credential=AzureKeyCredential(cls.key))
url = os.getenv("FORM_RECOGNIZER_ENDPOINT")
key = os.getenv("FORM_RECOGNIZER_KEY")
if url and key:
cls.instance = DocumentAnalysisClient(endpoint=url, credential=AzureKeyCredential(key))
else:
print("SingletonFormRecognizerClient: Skipping since credentials not provided. Assuming NO form recognizer extensions(like .pdf) in directory")
cls.instance = object() # dummy object
@@ -972,4 +1038,4 @@ def __getstate__(self):

def __setstate__(self, state):
url, key = state
self.instance = DocumentAnalysisClient(endpoint=url, credential=AzureKeyCredential(key))
self.instance = DocumentAnalysisClient(endpoint=url, credential=AzureKeyCredential(key))