Merge main into sweep/fetchMQRContexts-toggle-switch

CAII-NCSA · Dec 8, 2023 · 25b05bb · 25b05bb
2 parents c7f21c5 + 760e6b6
commit 25b05bb
Show file tree

Hide file tree

Showing 6 changed files with 315 additions and 81 deletions.
diff --git a/src/components/Chat/Chat.tsx b/src/components/Chat/Chat.tsx
@@ -183,7 +183,9 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => {
             ...message,
             content: [
               ...imageContent,
-              { type: 'text', text: 'Provide detailed description of the image(s) focusing on any text (OCR information), distinct objects, colors, and actions depicted. Include contextual information, subtle details, and specific terminologies relevant for semantic document retrieval.' }
+              {
+                type: 'text', text: `"Provide a detailed description of the image(s), focusing exclusively on the elements and details that are visibly present. Include descriptions of text (OCR information), distinct objects, spatial relationships, colors, actions, annotations, labels, or significant color usage. Use specific, technical, or domain-specific terminology to accurately describe elements, particularly for specialized fields like medicine, agriculture, technology, etc. Classify the image into relevant categories and list key terms associated with that category. Identify and list potential keywords or key phrases that summarize the main elements and themes. If the image contains abstract or emotional content, infer the overall message or content. Emphasize the most prominent features first, moving to less significant details. Also, provide synonyms or related terms for technical aspects. DO NOT reference or mention any features, elements, or aspects that are absent in the image. The GOAL is to create a precise, focused, and keyword-rich description that encapsulates only the observable details, suitable for semantic document retrieval across various domains."`
+              }
             ]
           }
         ],
@@ -244,6 +246,18 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => {
     }
   }
 
+  // Resolves the URL that a citation for one retrieved context should link to.
+  //
+  // Preference order:
+  //   1. `context.url` when it is set (the context already carries a link),
+  //   2. otherwise the result of `fetchPresignedUrl(context.s3_path)` when an
+  //      S3 path is set,
+  //   3. otherwise the empty string.
+  //
+  // NOTE(review): `fetchPresignedUrl` is defined outside this hunk; presumably
+  // it returns a presigned download URL for the S3 object - confirm.
+  const generateCitationLink = async (context: ContextWithMetadata) => {
+    // Truthiness (rather than `!== ''`) so that a missing/null `url` coming
+    // from the backend falls through to the S3 path instead of being returned
+    // as `undefined`.
+    if (context.url) {
+      return context.url
+    }
+    if (context.s3_path) {
+      return await fetchPresignedUrl(context.s3_path)
+    }
+    return ''
+  }
+
   // THIS IS WHERE MESSAGES ARE SENT.
   const handleSend = useCallback(
     async (message: Message, deleteCount = 0, plugin: Plugin | null = null) => {
@@ -395,57 +409,89 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => {
           let done = false
           let isFirst = true
           let text = ''
-          while (!done) {
-            if (stopConversationRef.current === true) {
-              controller.abort()
-              done = true
-              break
-            }
-            const { value, done: doneReading } = await reader.read()
-            done = doneReading
-            const chunkValue = decoder.decode(value)
-            text += chunkValue
-            if (isFirst) {
-              // isFirst refers to the first chunk of data received from the API (happens once for each new message from API)
-              isFirst = false
-              const updatedMessages: Message[] = [
-                ...updatedConversation.messages,
-                {
-                  role: 'assistant',
-                  content: chunkValue,
-                  contexts: message.contexts,
-                },
-              ]
-              updatedConversation = {
-                ...updatedConversation,
-                messages: updatedMessages,
-              }
-              homeDispatch({
-                field: 'selectedConversation',
-                value: updatedConversation,
+          let citationLinks: { link: string, citationRegex: RegExp, readable_filename: string }[] = [];
+          if (message.contexts) {
+            citationLinks = await Promise.all(
+              message.contexts.map(async (context, index) => {
+                const link = await generateCitationLink(context)
+                const citationRegex = new RegExp(`\\[${context.readable_filename}\\]`, 'g')
+                return { link, citationRegex, readable_filename: context.readable_filename }
               })
-            } else {
-              const updatedMessages: Message[] =
-                updatedConversation.messages.map((message, index) => {
+            )
+          }
+          try {
+            while (!done) {
+              if (stopConversationRef.current === true) {
+                controller.abort()
+                done = true
+                break
+              }
+              const { value, done: doneReading } = await reader.read()
+              done = doneReading
+              const chunkValue = decoder.decode(value)
+              text += chunkValue
+              if (isFirst) {
+                // isFirst refers to the first chunk of data received from the API (happens once for each new message from API)
+                isFirst = false
+                const updatedMessages: Message[] = [
+                  ...updatedConversation.messages,
+                  {
+                    role: 'assistant',
+                    content: chunkValue,
+                    contexts: message.contexts,
+                  },
+                ]
+                updatedConversation = {
+                  ...updatedConversation,
+                  messages: updatedMessages,
+                }
+                homeDispatch({
+                  field: 'selectedConversation',
+                  value: updatedConversation,
+                })
+              } else {
+                const updatedMessages: Message[] = updatedConversation.messages.map((message, index) => {
                   if (index === updatedConversation.messages.length - 1) {
-                    return {
-                      ...message,
-                      content: text,
-                      // responseTimeSec: // TODO: try to track this.. mostly in ChatMessage.tsx
+                    let content = text
+                    if (message.contexts) {
+                      citationLinks.forEach(({ link, citationRegex, readable_filename }, index) => {
+                        const citationLink = `[${index + 1}](${link})`;
+                        const filenameLink = `${index + 1}. [${readable_filename}](${link})`;
+                      // This replaces placeholders with clickable links but Markdown rendering removes the placeholder and only shows the number.
+                        content = content.replace(new RegExp(`\\[${index + 1}\\](?!\\:\\s\\[)`, 'g'), citationLink);
+                        content = content.replace(new RegExp(`${index + 1}\\.\\s\\[${readable_filename}\\]\\(\\#\\)`, 'g'), filenameLink);
+                      })
+                      // Uncomment for debugging
+                      // console.log('content: ', content) 
+                      return {
+                        ...message,
+                        content,
+                      }
                     }
                   }
                   return message
                 })
-              updatedConversation = {
-                ...updatedConversation,
-                messages: updatedMessages,
+                updatedConversation = {
+                  ...updatedConversation,
+                  messages: updatedMessages,
+                }
+                homeDispatch({
+                  field: 'selectedConversation',
+                  value: updatedConversation,
+                })
               }
-              homeDispatch({
-                field: 'selectedConversation',
-                value: updatedConversation,
-              })
             }
+          } catch (error) {
+            console.error('Error reading from stream:', error);
+            homeDispatch({ field: 'loading', value: false });
+            homeDispatch({ field: 'messageIsStreaming', value: false });
+            return;
           }
+
+          if (!done) {
+            throw new Error('Stream ended prematurely');
+          }
+
           saveConversation(updatedConversation)
           // todo: add clerk user info to onMessagereceived for logging.
           if (clerk_obj.isLoaded && clerk_obj.isSignedIn) {
@@ -524,9 +570,8 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => {
         // Remove the existing image description
         (currentMessage.content as Content[]).splice(imgDescIndex, 1);
       }
-
-      handleSend(currentMessage, 2, null);
     }
+    handleSend(currentMessage as Message, 2, null);
   }, [currentMessage, handleSend]);
 
   const scrollToBottom = useCallback(() => {

diff --git a/src/components/Chat/ChatMessage.tsx b/src/components/Chat/ChatMessage.tsx
@@ -489,10 +489,10 @@ export const ChatMessage: FC<Props> = memo(
                 )}
               </div>
             ) : (
-              <div className="flex flex-row">
-                <div className="flex-1">
+                <div className="flex flex-row ">
+                  <div className="flex-1 max-w-full w-full overflow-hidden">
                   <MemoizedReactMarkdown
-                    className="dark:prose-invert prose flex-1"
+                      className="dark:prose-invert prose flex-1 linkMarkDown supMarkdown "
                     remarkPlugins={[remarkGfm, remarkMath]}
                     rehypePlugins={[rehypeMathjax]}
                     components={{
@@ -548,20 +548,53 @@ export const ChatMessage: FC<Props> = memo(
                           </td>
                         )
                       },
+                      // Custom renderer for markdown links.
+                      // `node` and `className` are destructured only to keep them out of
+                      // `props`; neither is forwarded to the rendered element.
+                      a({ node, className, children, ...props }) {
+                        const { href, title } = props;
+                        // A link whose first child is purely digits (e.g. "[3](...)") is
+                        // treated as a numbered citation; anything else is a regular link.
+                        const isCitationLink = /^\d+$/.test(children[0] as string);
+                        if (isCitationLink) {
+                          // Citation: a real anchor that opens in a new tab.
+                          return (
+                            <a
+                              id="styledLink"
+                              href={href}
+                              target="_blank"
+                              title={title}
+                              rel="noopener noreferrer"
+                              className={'supMarkdown'}
+                            >
+                              {children}
+                            </a>
+                          )
+                        } else {
+                          // Regular link: rendered as a button that opens the target in a
+                          // new tab. `noopener,noreferrer` mirrors the `rel` on the anchor
+                          // branch so the opened page gets no `window.opener` handle.
+                          return (
+                            <button
+                              id="styledLink"
+                              onClick={() => window.open(href, '_blank', 'noopener,noreferrer')}
+                              title={title}
+                              className={'linkMarkDown'}
+                            >
+                              {children}
+                            </button>
+                          )
+                        }
+                      },
                     }}
                   >
-                    {`${message.content}${messageIsStreaming &&
+                      {`${message.content}${messageIsStreaming &&
                       messageIndex ==
                       (selectedConversation?.messages.length ?? 0) - 1
                       ? '`▍`'
                       : ''
                       }`}
                   </MemoizedReactMarkdown>
-                  {message.contexts && message.contexts.length > 0 && (
+                    {/* {message.contexts && message.contexts.length > 0 && (
                     <Group variant="row" spacing="xs">
                       <ContextCards contexts={message.contexts} />
                     </Group>
-                  )}
+                  )} */}
                 </div>
 
                 <div className="ml-1 flex flex-col items-center justify-end gap-4 md:-mr-8 md:ml-0 md:flex-row md:items-start md:justify-start md:gap-1">

diff --git a/src/components/Markdown/CodeBlock.tsx b/src/components/Markdown/CodeBlock.tsx
@@ -72,14 +72,14 @@ export const CodeBlock: FC<Props> = memo(({ language, value }) => {
 
         <div className="flex items-center">
           <button
-            className="flex items-center gap-1.5 rounded bg-none p-1 text-xs text-white"
+            className="flex items-center gap-1.5 rounded bg-none p-1 text-xs text-white codeblock-button"
             onClick={copyToClipboard}
           >
             {isCopied ? <IconCheck size={18} /> : <IconClipboard size={18} />}
             {isCopied ? t('Copied!') : t('Copy code')}
           </button>
           <button
-            className="flex items-center rounded bg-none p-1 text-xs text-white"
+            className="flex items-center rounded bg-none p-1 text-xs text-white codeblock-button"
             onClick={downloadAsFile}
           >
             <IconDownload size={18} />

diff --git a/src/pages/api/contextStuffingHelper.ts b/src/pages/api/contextStuffingHelper.ts
@@ -27,16 +27,38 @@ export async function getStuffedPrompt(
       prePrompt =
         "Please answer the following question. Use the documents below, and ONLY the documents below, to answer the question. This is for the law domain and we train law students to stick to facts that are in the record. Do not improvise or use your world knowledge, stick to only the information provided and make heavy use of direct quotes instead of paraphrasing or summarizing. When citing the documents, always use Markdown footnotes in the react-markdown format. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Say that 'the topic is not discussed in these documents' when the answer is not directly available in the documents. If there are related documents, tell the user that they might be able to learn more in that document.\nHere's a few passages of the documents:\n"
     } else {
-      prePrompt =
-        "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
+      prePrompt = `Please analyze and respond to the following question using the excerpts from the provided documents. These documents can be pdf files or web pages.
+      Integrate relevant information from these documents, ensuring each reference is linked to the document's number.
+      Use Markdown to format citations as clickable links. Your response should be semi-formal. 
+      When quoting directly, cite with footnotes linked to the document number. 
+      Summarize or paraphrase other relevant information with inline citations, again referencing the document number. 
+      If the answer is not in the provided documents, state so. 
+      Conclude your response with a LIST of the document titles as clickable links, each linked to its respective document number.
+      ALWAYS follow the examples below:
+      If you're referencing the first document, insert a citation like this in your response: "[1]" 
+      At the end of your response, list the document title with a clickable link, like this: "[1]:[document_name]"
+      Nothing else should prefixxed or suffixed to the citation or document name. 
+      
+      Suppose a document name is shared with you along with the number below like "27: www.pdf, page: 2" where 27 is the number and www.pdf is the document_name, then cite it in the response as follows:
+      """
+      The sky is blue. [27] The grass is green. [28]
+      Relevant Sources:
+
+      27. [document_name](#)
+      28. [document_name](#)
+      """
+      ONLY return the documents with relevant information and cited in the response. If there are no relevant sources, don't include the "Relevant Sources" section in response.
+      Here are excerpts from the high-quality documents provided:
+      \n"`
+      // "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n"
     }
 
     let tokenCounter = encoding.encode(
       prePrompt + '\n\nNow please respond to my query: ' + searchQuery,
     ).length
     const validDocs = []
-    for (const d of contexts) {
-      const docString = `---\nDocument: ${d.readable_filename}${
+    for (const [index, d] of contexts.entries()) {
+      const docString = `---\n${index + 1}: ${d.readable_filename}${
         d.pagenumber ? ', page: ' + d.pagenumber : ''
       }\n${d.text}\n`
       const numTokens = encoding.encode(docString).length
@@ -45,7 +67,7 @@ export async function getStuffedPrompt(
       )
       if (tokenCounter + numTokens <= tokenLimit) {
         tokenCounter += numTokens
-        validDocs.push(d)
+        validDocs.push({ index, d })
       } else {
         continue
       }
@@ -54,8 +76,8 @@ export async function getStuffedPrompt(
     const separator = '---\n' // between each context
     const contextText = validDocs
       .map(
-        (d) =>
-          `Document: ${d.readable_filename}${
+        ({ index, d }) =>
+          `${index + 1}: ${d.readable_filename}${
             d.pagenumber ? ', page: ' + d.pagenumber : ''
           }\n${d.text}\n`,
       )