From 44aeb806f069bdb1c5394c790cde0dfc732c0215 Mon Sep 17 00:00:00 2001 From: Rohan Marwaha <123789373+rohan-uiuc@users.noreply.github.com> Date: Tue, 19 Dec 2023 22:11:25 -0600 Subject: [PATCH] Citations improvement; Fix regressions from adding image support (#76) * Fixed file-type validation * Regression fixes: 1. Added page number for parsing and opening PDFs 2. Added all user text messages to context retrieval query 3. Added citation number to final list of sources. 4. Maintained a cache for links, creating them in real time instead of prefetching every link. * Fix regex escape issue in Chat component and prevent multiple stream closures - Escaped special characters in regex pattern to correctly match filenames in Chat.tsx. - Added flag to track and prevent multiple closures of the ReadableStream, avoiding 'stream already closed' errors. * Fixed regression issues 1. Fixed the "no message" issue in the retrieval API 2. Improved caching and citation link generation 3. Handled the stream so that it shuts down gracefully on the last chunk 4. Commented out some debugging logs to keep the console clear * Commented out debug logs * Commented out another log * Regression fixes: 1. Fixed changes incorrectly merged from main 2. Fixed clickable introductory statements introduced by the image support PR 3. 
Removed unused and commented code --- src/components/Chat/Chat.tsx | 186 +++++++++++++++++++------ src/components/Chat/ChatInput.tsx | 112 +-------------- src/components/Chat/ChatMessage.tsx | 2 +- src/pages/api/chat.ts | 44 +++--- src/pages/api/contextStuffingHelper.ts | 8 +- src/utils/apiUtils.ts | 3 +- src/utils/server/index.ts | 57 +++++--- 7 files changed, 215 insertions(+), 197 deletions(-) diff --git a/src/components/Chat/Chat.tsx b/src/components/Chat/Chat.tsx index 080035485..59f1e1c6b 100644 --- a/src/components/Chat/Chat.tsx +++ b/src/components/Chat/Chat.tsx @@ -92,6 +92,7 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => { } const [inputContent, setInputContent] = useState('') + const [cacheMetrics, setCacheMetrics] = useState({ hits: 0, misses: 0 }); useEffect(() => { if (courseMetadata?.banner_image_s3 && courseMetadata.banner_image_s3 !== '') { @@ -185,7 +186,7 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => { ...message, content: [ ...imageContent, - { type: 'text', text: 'Provide detailed description of the image(s) focusing on any text (OCR information), distinct objects, colors, and actions depicted. Include contextual information, subtle details, and specific terminologies relevant for semantic document retrieval.' } + { type: 'text', text: `"Provide a detailed description of the image(s), focusing exclusively on the elements and details that are visibly present. Include descriptions of text (OCR information), distinct objects, spatial relationships, colors, actions, annotations, labels, or significant color usage. Use specific, technical, or domain-specific terminology to accurately describe elements, particularly for specialized fields like medicine, agriculture, technology, etc. Classify the image into relevant categories and list key terms associated with that category. Identify and list potential keywords or key phrases that summarize the main elements and themes. 
If the image contains abstract or emotional content, infer the overall message or content. Emphasize the most prominent features first, moving to less significant details. Also, provide synonyms or related terms for technical aspects. DO NOT reference or mention any features, elements, or aspects that are absent in the image. The GOAL is to create a precise, focused, and keyword-rich description that encapsulates only the observable details, suitable for semantic document retrieval across various domains."` } ] } ], @@ -226,7 +227,7 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => { (message.content as Content[]).push({ type: 'text', text: `Image description: ${imgDesc}` }); } } catch (error) { - console.error('Error in chat.tsx running onResponseCompletion():', error); + console.error('Error in chat.tsx running handleImageContent():', error); controller.abort(); } finally { homeDispatch({ field: 'isImg2TextLoading', value: false }) @@ -237,15 +238,62 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => { const handleContextSearch = async (message: Message, selectedConversation: Conversation, searchQuery: string) => { if (getCurrentPageName() != 'gpt4') { + // Extract text from all user messages in the conversation const token_limit = OpenAIModels[selectedConversation?.model.id as OpenAIModelID].tokenLimit const useMQRetrieval = localStorage.getItem('UseMQRetrieval') === 'true'; const fetchContextsFunc = useMQRetrieval ? 
fetchMQRContexts : fetchContexts; await fetchContextsFunc(getCurrentPageName(), searchQuery, token_limit).then((curr_contexts) => { message.contexts = curr_contexts as ContextWithMetadata[] + console.log('message.contexts: ', message.contexts) }) } } + const generateCitationLink = async (context: ContextWithMetadata) => { + // Uncomment for debugging + // console.log('context: ', context); + if (context.url) { + return context.url; + } else if (context.s3_path) { + return fetchPresignedUrl(context.s3_path); + } + return ''; + } + + const getCitationLink = async (context: ContextWithMetadata, citationLinkCache: Map, citationIndex: number) => { + // console.log("Generating citation link for context: ", citationIndex, context.readable_filename) + const cachedLink = citationLinkCache.get(citationIndex); + if (cachedLink) { + setCacheMetrics((prevMetrics) => { + const newMetrics = { ...prevMetrics, hits: prevMetrics.hits + 1 }; + // Uncomment for debugging + // console.log(`Cache hit for citation index ${citationIndex}. Current cache hit ratio: ${(newMetrics.hits / (newMetrics.hits + newMetrics.misses)).toFixed(2)}`); + return newMetrics; + }); + return cachedLink; + } else { + setCacheMetrics((prevMetrics) => { + const newMetrics = { ...prevMetrics, misses: prevMetrics.misses + 1 }; + // Uncomment for debugging + // console.log(`Cache miss for citation index ${citationIndex}. 
Current cache hit ratio: ${(newMetrics.hits / (newMetrics.hits + newMetrics.misses)).toFixed(2)}`); + return newMetrics; + }); + const link = await generateCitationLink(context); + citationLinkCache.set(citationIndex, link); + return link; + } + } + + const resetCacheMetrics = () => { + // console.log(`Final cache hit ratio for the message: ${(cacheMetrics.hits / (cacheMetrics.hits + cacheMetrics.misses)).toFixed(2)}`); + console.log(`Final Cache metrics: ${JSON.stringify(cacheMetrics)}`); + setCacheMetrics({ hits: 0, misses: 0 }); + } + + function escapeRegExp(string: string) { + return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); // $& means the whole matched string + } + // THIS IS WHERE MESSAGES ARE SENT. const handleSend = useCallback( async (message: Message, deleteCount = 0, plugin: Plugin | null = null) => { @@ -397,57 +445,103 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => { let done = false let isFirst = true let text = '' - while (!done) { - if (stopConversationRef.current === true) { - controller.abort() - done = true - break - } - const { value, done: doneReading } = await reader.read() - done = doneReading - const chunkValue = decoder.decode(value) - text += chunkValue - if (isFirst) { - // isFirst refers to the first chunk of data received from the API (happens once for each new message from API) - isFirst = false - const updatedMessages: Message[] = [ - ...updatedConversation.messages, - { - role: 'assistant', - content: chunkValue, - contexts: message.contexts, - }, - ] - updatedConversation = { - ...updatedConversation, - messages: updatedMessages, + const citationLinkCache = new Map(); + try { + while (!done) { + if (stopConversationRef.current === true) { + controller.abort() + done = true + break } - homeDispatch({ - field: 'selectedConversation', - value: updatedConversation, - }) - } else { - const updatedMessages: Message[] = - updatedConversation.messages.map((message, index) => { - if (index === 
updatedConversation.messages.length - 1) { - return { - ...message, - content: text, - // responseTimeSec: // TODO: try to track this.. mostly in ChatMessage.tsx + const { value, done: doneReading } = await reader.read() + done = doneReading + const chunkValue = decoder.decode(value) + text += chunkValue + + if (isFirst) { + // isFirst refers to the first chunk of data received from the API (happens once for each new message from API) + isFirst = false + const updatedMessages: Message[] = [ + ...updatedConversation.messages, + { + role: 'assistant', + content: chunkValue, + contexts: message.contexts, + }, + ] + updatedConversation = { + ...updatedConversation, + messages: updatedMessages, + } + homeDispatch({ + field: 'selectedConversation', + value: updatedConversation, + }) + } else { + + const updatedMessagesPromises: Promise[] = updatedConversation.messages.map(async (message, index) => { + if (index === updatedConversation.messages.length - 1 && message.contexts) { + let content = text; + + // Identify all unique citation indices in the content + const citationIndices = new Set(); + const citationPattern = /\[(\d+)\](?!\([^)]*\))/g; + let match; + while ((match = citationPattern.exec(content)) !== null) { + citationIndices.add(parseInt(match[1] as string)); + } + + // Generate citation links only for the referenced indices + for (const citationIndex of citationIndices) { + const context = message.contexts[citationIndex - 1]; // Adjust index for zero-based array + if (context) { + const link = await getCitationLink(context, citationLinkCache, citationIndex); + const pageNumberMatch = content.match(new RegExp(`\\[${escapeRegExp(context.readable_filename)}, page: (\\d+)\\]\\(#\\)`)); + const pageNumber = pageNumberMatch ? 
`#page=${pageNumberMatch[1]}` : ''; + + // Replace citation index with link + content = content.replace(new RegExp(`\\[${citationIndex}\\](?!\\([^)]*\\))`, 'g'), `[${citationIndex}](${link}${pageNumber})`); + + // Replace filename with link + content = content.replace(new RegExp(`(\\b${citationIndex}\\.)\\s*\\[(.*?)\\]\\(\\#\\)`, 'g'), (match, index, filename) => { + return `${index} [${index} ${filename}](${link}${pageNumber})`; + }); + } } + // Uncomment for debugging + // console.log('content: ', content); + return { ...message, content }; } - return message + return message; + }); + + // Use Promise.all to wait for all promises to resolve + const updatedMessages = await Promise.all(updatedMessagesPromises); + + updatedConversation = { + ...updatedConversation, + messages: updatedMessages, + } + homeDispatch({ + field: 'selectedConversation', + value: updatedConversation, }) - updatedConversation = { - ...updatedConversation, - messages: updatedMessages, } - homeDispatch({ - field: 'selectedConversation', - value: updatedConversation, - }) } + } catch (error) { + console.error('Error reading from stream:', error); + homeDispatch({ field: 'loading', value: false }); + homeDispatch({ field: 'messageIsStreaming', value: false }); + return; + } finally { + // Reset cache metrics after each message + resetCacheMetrics(); + } + + if (!done) { + throw new Error('Stream ended prematurely'); } + saveConversation(updatedConversation) // todo: add clerk user info to onMessagereceived for logging. 
if (clerk_obj.isLoaded && clerk_obj.isSignedIn) { diff --git a/src/components/Chat/ChatInput.tsx b/src/components/Chat/ChatInput.tsx index bf939229d..4d5391305 100644 --- a/src/components/Chat/ChatInput.tsx +++ b/src/components/Chat/ChatInput.tsx @@ -357,111 +357,6 @@ export const ChatInput = ({ return validImageTypes.includes(`.${ext}`); } - // const uploadToS3 = async (file: File) => { - // if (!file) { - // console.error('No file provided for upload'); - // return; - // } - - // // Generate a unique file name using uuidv4 - // const uniqueFileName = `${uuidv4()}.${file.name.split('.').pop()}`; - // const s3_filepath = `courses/${courseName}/${uniqueFileName}`; // Define s3_filepath here - - // console.log('uploadToS3 called with uniqueFileName:', uniqueFileName); - // console.log('uploadToS3 called with s3_filepath:', s3_filepath); - - // // Prepare the request body for the API call - // // Prepare the request body for the API call - // const requestObject = { - // method: 'POST', - // headers: { - // 'Content-Type': 'application/json', - // }, - // body: JSON.stringify({ - // uniqueFileName: uniqueFileName, - // fileType: file.type, - // courseName: courseName, - // }), - // }; - - // try { - // // Call your API to get the presigned POST data - // const response = await fetch('/api/UIUC-api/uploadToS3', requestObject); - // if (!response.ok) { - // throw new Error(`HTTP error! 
Status: ${response.status}`); - // } - // const { post } = await response.json(); - - // // Use the presigned POST data to upload the file to S3 - // const formData = new FormData(); - // Object.entries(post.fields).forEach(([key, value]) => { - // formData.append(key, value as string); - // }); - // formData.append('file', file); - - // // Post the file to the S3 bucket using the presigned URL and form data - // const uploadResponse = await fetch(post.url, { - // method: 'POST', - // body: formData, - // }); - - // if (!uploadResponse.ok) { - // throw new Error('Failed to upload the file to S3'); - // } - - // // Construct the URL to the uploaded file using the response from the presigned POST - // const uploadedImageUrl = `https://${aws_config.bucketName}.s3.${aws_config.region}.amazonaws.com/${encodeURIComponent(s3_filepath)}`; - - // return uploadedImageUrl; - // } catch (error) { - // console.error('Error uploading file:', error); - // } - // }; - - - - const ingestFile = async (file: File | null) => { - if (!file) return; - - const fileExtension = file.name.slice(((file.name.lastIndexOf(".") - 1) >>> 0) + 2); - const uniqueFileName = `${uuidv4()}.${fileExtension}`; - - const queryParams = new URLSearchParams({ - courseName: courseName, - fileName: uniqueFileName, - }).toString(); - - const requestObject = { - method: 'GET', - headers: { - 'Content-Type': 'application/json', - }, - query: { - fileName: file.name, - courseName: courseName, - }, - } - - // Actually we CAN await here, just don't await this function. 
- console.log('right before call /ingest...') - const response = await fetch( - `/api/UIUC-api/ingest?${queryParams}`, - requestObject, - ) - - // check if the response was ok - if (response.ok) { - const data = await response.json() - // console.log(file.name as string + ' ingested successfully!!') - console.log('Success or Failure:', data) - return data - } else { - console.log('Error during ingest:', response.statusText) - console.log('Full Response message:', response) - return response - } - } - const showToastOnInvalidImage = useCallback(() => { notifications.show({ id: 'error-notification', @@ -664,6 +559,13 @@ export const ChatInput = ({ } }, []); + useEffect(() => { + setContent(inputContent) + if (textareaRef.current) { + textareaRef.current.focus() + } + }, [inputContent, textareaRef]) + // This is where we upload images and generate their presigned url async function uploadImageAndGetUrl(file: File, courseName: string): Promise { try { diff --git a/src/components/Chat/ChatMessage.tsx b/src/components/Chat/ChatMessage.tsx index 170c3ffad..804a57316 100644 --- a/src/components/Chat/ChatMessage.tsx +++ b/src/components/Chat/ChatMessage.tsx @@ -545,7 +545,7 @@ export const ChatMessage: FC = memo( const { href, title } = props; // console.log("href:", href); // console.log("title:", title); - console.log("children:", children); + // console.log("children:", children); const isCitationLink = /^\d+$/.test(children[0] as string); if (isCitationLink) { return ( diff --git a/src/pages/api/chat.ts b/src/pages/api/chat.ts index 6e692bcff..de6df0f74 100644 --- a/src/pages/api/chat.ts +++ b/src/pages/api/chat.ts @@ -80,27 +80,29 @@ const handler = async (req: Request): Promise => { // todo // } - // regular context stuffing - const stuffedPrompt = (await getStuffedPrompt( - course_name, - search_query, - contexts_arr, - token_limit, - )) as string - if (typeof messages[messages.length - 1]?.content === 'string') { - messages[messages.length - 1]!.content = 
stuffedPrompt; - } else if (Array.isArray(messages[messages.length - 1]?.content) && - (messages[messages.length - 1]!.content as Content[]).every(item => 'type' in item)) { - - const contentArray = messages[messages.length - 1]!.content as Content[]; - const textContentIndex = contentArray.findIndex(item => item.type === 'text') || 0; - - if (textContentIndex !== -1 && contentArray[textContentIndex]) { - // Replace existing text content with the new stuffed prompt - contentArray[textContentIndex] = { ...contentArray[textContentIndex], text: stuffedPrompt, type: 'text' }; - } else { - // Add new stuffed prompt if no text content exists - contentArray.push({ type: 'text', text: stuffedPrompt }); + else if (stream) { + // regular context stuffing + const stuffedPrompt = (await getStuffedPrompt( + course_name, + search_query, + contexts_arr, + token_limit, + )) as string + if (typeof messages[messages.length - 1]?.content === 'string') { + messages[messages.length - 1]!.content = stuffedPrompt; + } else if (Array.isArray(messages[messages.length - 1]?.content) && + (messages[messages.length - 1]!.content as Content[]).every(item => 'type' in item)) { + + const contentArray = messages[messages.length - 1]!.content as Content[]; + const textContentIndex = contentArray.findIndex(item => item.type === 'text') || 0; + + if (textContentIndex !== -1 && contentArray[textContentIndex]) { + // Replace existing text content with the new stuffed prompt + contentArray[textContentIndex] = { ...contentArray[textContentIndex], text: stuffedPrompt, type: 'text' }; + } else { + // Add new stuffed prompt if no text content exists + contentArray.push({ type: 'text', text: stuffedPrompt }); + } } } diff --git a/src/pages/api/contextStuffingHelper.ts b/src/pages/api/contextStuffingHelper.ts index 0cb62bb0f..6a67b5075 100644 --- a/src/pages/api/contextStuffingHelper.ts +++ b/src/pages/api/contextStuffingHelper.ts @@ -63,13 +63,13 @@ export async function getStuffedPrompt( At the end of your 
response, list the document title with a clickable link, like this: "[1]:[document_name]" Nothing else should prefixxed or suffixed to the citation or document name. - Suppose a document name is shared with you along with the number below like "27: www.pdf, page: 2" where 27 is the number and www.pdf is the document_name, then cite it in the response as follows: + Suppose a document name is shared with you along with the number below like "27: www.pdf, page: 2", "28: www.osd" where 27, 28 are numbers, www.pdf, www.osd are document_name, and 2 is the pageNumber, then cite it in the response as follows: """ The sky is blue. [27] The grass is green. [28] Relevant Sources: - 27. [document_name](#) - 28. [document_name](#) + 27. [www.pdf, page: 2](#) + 28. [www.osd](#) """ ONLY return the documents with relevant information and cited in the response. If there are no relevant sources, don't include the "Relevant Sources" section in response. Here are excerpts from the high-quality documents provided: @@ -78,7 +78,7 @@ export async function getStuffedPrompt( } let tokenCounter = encoding.encode( - prePrompt + '\n\nNow please respond to my query: ' + searchQuery, + prePrompt + '\n\nNow please respond to my conversation: ' + searchQuery, ).length const validDocs = [] for (const [index, d] of contexts.entries()) { diff --git a/src/utils/apiUtils.ts b/src/utils/apiUtils.ts index 7f5193a5f..6cd591a3f 100644 --- a/src/utils/apiUtils.ts +++ b/src/utils/apiUtils.ts @@ -108,10 +108,11 @@ export const uploadToS3 = async (file: File | null, course_name: string) => { } } -export async function fetchPresignedUrl(filePath: string) { +export async function fetchPresignedUrl(filePath: string, page?: string) { try { const response = await axios.post('/api/download', { filePath, + page, }) return response.data.url } catch (error) { diff --git a/src/utils/server/index.ts b/src/utils/server/index.ts index bc3b28e43..e879311c8 100644 --- a/src/utils/server/index.ts +++ 
b/src/utils/server/index.ts @@ -145,37 +145,56 @@ export const OpenAIStream = async ( if (stream) { console.log("Streaming response ") + let isStreamClosed = false; // Flag to track the state of the stream const apiStream = new ReadableStream({ async start(controller) { const onParse = (event: ParsedEvent | ReconnectInterval) => { if (event.type === 'event') { const data = event.data - try { - // console.log('data: ', data) // ! DEBUGGING - if (data.trim() !== "[DONE]") { - const json = JSON.parse(data) - if (json.choices[0].finish_reason != null) { - controller.close() - return - } - const text = json.choices[0].delta.content - const queue = encoder.encode(text) - controller.enqueue(queue) - } else { - controller.close() - return; + try { + // console.log('data: ', data) // ! DEBUGGING + if (data.trim() !== "[DONE]") { + const json = JSON.parse(data) + if (json.choices[0].finish_reason != null) { + if (!isStreamClosed) { + controller.close() + isStreamClosed = true; // Update the flag after closing the stream + } + return + } + const text = json.choices[0].delta.content + const queue = encoder.encode(text) + controller.enqueue(queue) + } else { + if (!isStreamClosed) { + controller.close() + isStreamClosed = true; // Update the flag after closing the stream + } + return; + } + } catch (e) { + if (!isStreamClosed) { + controller.error(e) + isStreamClosed = true; // Update the flag if an error occurs + } } - } catch (e) { - controller.error(e) } } - } const parser = createParser(onParse) - for await (const chunk of res.body as any) { - parser.feed(decoder.decode(chunk)) + try { + for await (const chunk of res.body as any) { + if (!isStreamClosed) { // Only feed the parser if the stream is not closed + parser.feed(decoder.decode(chunk)) + } + } + } catch (e) { + if (!isStreamClosed) { + controller.error(e) + isStreamClosed = true; + } } }, })