Skip to content

Commit

Permalink
Citations improvement; Fix regressions from adding image support (#76)
Browse files Browse the repository at this point in the history
* Fixing file-type validation checks

* Regression Fixes:
1. Added page number for parsing and opening pdfs
2. Added all user text messages to context retrieval query
3. Added citation number to final list of sources.
4. Maintaining a cache for links and creating them in real time instead of prefetching every link.

* Fix regex escape issue in Chat component and prevent multiple stream closures

- Escaped special characters in regex pattern to correctly match filenames in Chat.tsx.
- Added a flag to track and prevent multiple closures of the ReadableStream, avoiding 'stream already closed' errors.

* Fixed regression issues
1. No message in retrieval API fixed
2. Caching and citation link generation improvement
3. Handle stream to shut it down gracefully on last chunk
4. Commented some debugging logs to keep the console clear

* Commenting debug logs

* Commented another log

* Regression fixes:
1. Fixed changes incorrectly merged from main
2. Fixed clickable introductory statements introduced from image support PR
3. Removed unused and commented code
  • Loading branch information
rohan-uiuc authored Dec 20, 2023
1 parent 9d755ce commit 44aeb80
Show file tree
Hide file tree
Showing 7 changed files with 215 additions and 197 deletions.
186 changes: 140 additions & 46 deletions src/components/Chat/Chat.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => {
}

const [inputContent, setInputContent] = useState<string>('')
const [cacheMetrics, setCacheMetrics] = useState({ hits: 0, misses: 0 });

useEffect(() => {
if (courseMetadata?.banner_image_s3 && courseMetadata.banner_image_s3 !== '') {
Expand Down Expand Up @@ -185,7 +186,7 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => {
...message,
content: [
...imageContent,
{ type: 'text', text: 'Provide detailed description of the image(s) focusing on any text (OCR information), distinct objects, colors, and actions depicted. Include contextual information, subtle details, and specific terminologies relevant for semantic document retrieval.' }
{ type: 'text', text: `"Provide a detailed description of the image(s), focusing exclusively on the elements and details that are visibly present. Include descriptions of text (OCR information), distinct objects, spatial relationships, colors, actions, annotations, labels, or significant color usage. Use specific, technical, or domain-specific terminology to accurately describe elements, particularly for specialized fields like medicine, agriculture, technology, etc. Classify the image into relevant categories and list key terms associated with that category. Identify and list potential keywords or key phrases that summarize the main elements and themes. If the image contains abstract or emotional content, infer the overall message or content. Emphasize the most prominent features first, moving to less significant details. Also, provide synonyms or related terms for technical aspects. DO NOT reference or mention any features, elements, or aspects that are absent in the image. The GOAL is to create a precise, focused, and keyword-rich description that encapsulates only the observable details, suitable for semantic document retrieval across various domains."` }
]
}
],
Expand Down Expand Up @@ -226,7 +227,7 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => {
(message.content as Content[]).push({ type: 'text', text: `Image description: ${imgDesc}` });
}
} catch (error) {
console.error('Error in chat.tsx running onResponseCompletion():', error);
console.error('Error in chat.tsx running handleImageContent():', error);
controller.abort();
} finally {
homeDispatch({ field: 'isImg2TextLoading', value: false })
Expand All @@ -237,15 +238,62 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => {

/**
 * Retrieves contexts for `searchQuery` and stores them on `message.contexts`.
 *
 * Nothing is fetched when the current page name is 'gpt4'. Otherwise the
 * retrieval function is chosen from the 'UseMQRetrieval' localStorage flag
 * and called with the page name, the query and the model's token limit.
 *
 * @param message - Message that receives the retrieved contexts (mutated in place).
 * @param selectedConversation - Conversation whose model id selects the token limit.
 * @param searchQuery - Query text handed to the retrieval function.
 */
const handleContextSearch = async (message: Message, selectedConversation: Conversation, searchQuery: string) => {
  // Read the page name once so the guard and the fetch use the same value.
  const pageName = getCurrentPageName()
  if (pageName === 'gpt4') {
    return
  }

  const token_limit = OpenAIModels[selectedConversation?.model.id as OpenAIModelID].tokenLimit
  // The string 'true' under 'UseMQRetrieval' selects fetchMQRContexts; anything else selects fetchContexts.
  const useMQRetrieval = localStorage.getItem('UseMQRetrieval') === 'true';
  const fetchContextsFunc = useMQRetrieval ? fetchMQRContexts : fetchContexts;

  // Plain await instead of await + .then(): same ordering, one async style.
  const curr_contexts = await fetchContextsFunc(pageName, searchQuery, token_limit)
  message.contexts = curr_contexts as ContextWithMetadata[]
  console.log('message.contexts: ', message.contexts)
}

/**
 * Produces the link for a single citation context.
 *
 * Order of preference: the context's own `url`, then the result of
 * `fetchPresignedUrl` for its `s3_path`, then the empty string.
 *
 * @param context - Context whose `url` / `s3_path` fields are inspected.
 * @returns The chosen link, or '' when neither field is set.
 */
const generateCitationLink = async (context: ContextWithMetadata) => {
  // Uncomment for debugging
  // console.log('context: ', context);
  if (context.url) return context.url;
  return context.s3_path ? fetchPresignedUrl(context.s3_path) : '';
}

/**
 * Returns the link for a citation, reusing `citationLinkCache` when possible.
 *
 * On a hit the cached link is returned and the `hits` counter is incremented.
 * On a miss the link is produced by `generateCitationLink`, stored in the
 * cache under `citationIndex`, and the `misses` counter is incremented.
 *
 * @param context - Context the link is generated from on a cache miss.
 * @param citationLinkCache - Cache keyed by citation index; written on a miss.
 * @param citationIndex - Cache key for this citation.
 * @returns The cached or freshly generated link.
 */
const getCitationLink = async (context: ContextWithMetadata, citationLinkCache: Map<number, string>, citationIndex: number) => {
  // console.log("Generating citation link for context: ", citationIndex, context.readable_filename)
  const cachedLink = citationLinkCache.get(citationIndex);

  // Compare against undefined rather than testing truthiness: generateCitationLink
  // can return '' and that cached value must count as a hit, not be regenerated
  // (and counted as a miss) on every call.
  if (cachedLink !== undefined) {
    setCacheMetrics((prevMetrics) => {
      const newMetrics = { ...prevMetrics, hits: prevMetrics.hits + 1 };
      // Uncomment for debugging
      // console.log(`Cache hit for citation index ${citationIndex}. Current cache hit ratio: ${(newMetrics.hits / (newMetrics.hits + newMetrics.misses)).toFixed(2)}`);
      return newMetrics;
    });
    return cachedLink;
  }

  setCacheMetrics((prevMetrics) => {
    const newMetrics = { ...prevMetrics, misses: prevMetrics.misses + 1 };
    // Uncomment for debugging
    // console.log(`Cache miss for citation index ${citationIndex}. Current cache hit ratio: ${(newMetrics.hits / (newMetrics.hits + newMetrics.misses)).toFixed(2)}`);
    return newMetrics;
  });
  const link = await generateCitationLink(context);
  citationLinkCache.set(citationIndex, link);
  return link;
}

/**
 * Logs the current `cacheMetrics` value as JSON, then sets both counters back to zero.
 */
const resetCacheMetrics = () => {
  // console.log(`Final cache hit ratio for the message: ${(cacheMetrics.hits / (cacheMetrics.hits + cacheMetrics.misses)).toFixed(2)}`);
  // NOTE(review): cacheMetrics is read from the enclosing scope; if this runs from an
  // older closure the logged counters may lag behind the latest state — confirm.
  const metricsJson = JSON.stringify(cacheMetrics);
  console.log(`Final Cache metrics: ${metricsJson}`);

  const zeroed = { hits: 0, misses: 0 };
  setCacheMetrics(zeroed);
}

/**
 * Escapes every regular-expression metacharacter in the input so the result
 * can be embedded in a `RegExp` pattern and match the input literally.
 *
 * @param string - Raw text to escape.
 * @returns The text with each of . * + ? ^ $ { } ( ) | [ ] \ prefixed by a backslash.
 */
function escapeRegExp(string: string) {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  // The callback receives each matched metacharacter and returns it with a leading backslash.
  return string.replace(metaCharacters, (metaChar) => `\\${metaChar}`);
}

// THIS IS WHERE MESSAGES ARE SENT.
const handleSend = useCallback(
async (message: Message, deleteCount = 0, plugin: Plugin | null = null) => {
Expand Down Expand Up @@ -397,57 +445,103 @@ export const Chat = memo(({ stopConversationRef, courseMetadata }: Props) => {
let done = false
let isFirst = true
let text = ''
while (!done) {
if (stopConversationRef.current === true) {
controller.abort()
done = true
break
}
const { value, done: doneReading } = await reader.read()
done = doneReading
const chunkValue = decoder.decode(value)
text += chunkValue
if (isFirst) {
// isFirst refers to the first chunk of data received from the API (happens once for each new message from API)
isFirst = false
const updatedMessages: Message[] = [
...updatedConversation.messages,
{
role: 'assistant',
content: chunkValue,
contexts: message.contexts,
},
]
updatedConversation = {
...updatedConversation,
messages: updatedMessages,
const citationLinkCache = new Map<number, string>();
try {
while (!done) {
if (stopConversationRef.current === true) {
controller.abort()
done = true
break
}
homeDispatch({
field: 'selectedConversation',
value: updatedConversation,
})
} else {
const updatedMessages: Message[] =
updatedConversation.messages.map((message, index) => {
if (index === updatedConversation.messages.length - 1) {
return {
...message,
content: text,
// responseTimeSec: // TODO: try to track this.. mostly in ChatMessage.tsx
const { value, done: doneReading } = await reader.read()
done = doneReading
const chunkValue = decoder.decode(value)
text += chunkValue

if (isFirst) {
// isFirst refers to the first chunk of data received from the API (happens once for each new message from API)
isFirst = false
const updatedMessages: Message[] = [
...updatedConversation.messages,
{
role: 'assistant',
content: chunkValue,
contexts: message.contexts,
},
]
updatedConversation = {
...updatedConversation,
messages: updatedMessages,
}
homeDispatch({
field: 'selectedConversation',
value: updatedConversation,
})
} else {

const updatedMessagesPromises: Promise<Message>[] = updatedConversation.messages.map(async (message, index) => {
if (index === updatedConversation.messages.length - 1 && message.contexts) {
let content = text;

// Identify all unique citation indices in the content
const citationIndices = new Set<number>();
const citationPattern = /\[(\d+)\](?!\([^)]*\))/g;
let match;
while ((match = citationPattern.exec(content)) !== null) {
citationIndices.add(parseInt(match[1] as string));
}

// Generate citation links only for the referenced indices
for (const citationIndex of citationIndices) {
const context = message.contexts[citationIndex - 1]; // Adjust index for zero-based array
if (context) {
const link = await getCitationLink(context, citationLinkCache, citationIndex);
const pageNumberMatch = content.match(new RegExp(`\\[${escapeRegExp(context.readable_filename)}, page: (\\d+)\\]\\(#\\)`));
const pageNumber = pageNumberMatch ? `#page=${pageNumberMatch[1]}` : '';

// Replace citation index with link
content = content.replace(new RegExp(`\\[${citationIndex}\\](?!\\([^)]*\\))`, 'g'), `[${citationIndex}](${link}${pageNumber})`);

// Replace filename with link
content = content.replace(new RegExp(`(\\b${citationIndex}\\.)\\s*\\[(.*?)\\]\\(\\#\\)`, 'g'), (match, index, filename) => {
return `${index} [${index} ${filename}](${link}${pageNumber})`;
});
}
}
// Uncomment for debugging
// console.log('content: ', content);
return { ...message, content };
}
return message
return message;
});

// Use Promise.all to wait for all promises to resolve
const updatedMessages = await Promise.all(updatedMessagesPromises);

updatedConversation = {
...updatedConversation,
messages: updatedMessages,
}
homeDispatch({
field: 'selectedConversation',
value: updatedConversation,
})
updatedConversation = {
...updatedConversation,
messages: updatedMessages,
}
homeDispatch({
field: 'selectedConversation',
value: updatedConversation,
})
}
} catch (error) {
console.error('Error reading from stream:', error);
homeDispatch({ field: 'loading', value: false });
homeDispatch({ field: 'messageIsStreaming', value: false });
return;
} finally {
// Reset cache metrics after each message
resetCacheMetrics();
}

if (!done) {
throw new Error('Stream ended prematurely');
}

saveConversation(updatedConversation)
// todo: add clerk user info to onMessagereceived for logging.
if (clerk_obj.isLoaded && clerk_obj.isSignedIn) {
Expand Down
112 changes: 7 additions & 105 deletions src/components/Chat/ChatInput.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -357,111 +357,6 @@ export const ChatInput = ({
return validImageTypes.includes(`.${ext}`);
}

// const uploadToS3 = async (file: File) => {
// if (!file) {
// console.error('No file provided for upload');
// return;
// }

// // Generate a unique file name using uuidv4
// const uniqueFileName = `${uuidv4()}.${file.name.split('.').pop()}`;
// const s3_filepath = `courses/${courseName}/${uniqueFileName}`; // Define s3_filepath here

// console.log('uploadToS3 called with uniqueFileName:', uniqueFileName);
// console.log('uploadToS3 called with s3_filepath:', s3_filepath);

// // Prepare the request body for the API call
// // Prepare the request body for the API call
// const requestObject = {
// method: 'POST',
// headers: {
// 'Content-Type': 'application/json',
// },
// body: JSON.stringify({
// uniqueFileName: uniqueFileName,
// fileType: file.type,
// courseName: courseName,
// }),
// };

// try {
// // Call your API to get the presigned POST data
// const response = await fetch('/api/UIUC-api/uploadToS3', requestObject);
// if (!response.ok) {
// throw new Error(`HTTP error! Status: ${response.status}`);
// }
// const { post } = await response.json();

// // Use the presigned POST data to upload the file to S3
// const formData = new FormData();
// Object.entries(post.fields).forEach(([key, value]) => {
// formData.append(key, value as string);
// });
// formData.append('file', file);

// // Post the file to the S3 bucket using the presigned URL and form data
// const uploadResponse = await fetch(post.url, {
// method: 'POST',
// body: formData,
// });

// if (!uploadResponse.ok) {
// throw new Error('Failed to upload the file to S3');
// }

// // Construct the URL to the uploaded file using the response from the presigned POST
// const uploadedImageUrl = `https://${aws_config.bucketName}.s3.${aws_config.region}.amazonaws.com/${encodeURIComponent(s3_filepath)}`;

// return uploadedImageUrl;
// } catch (error) {
// console.error('Error uploading file:', error);
// }
// };



/**
 * Calls the ingest endpoint for an uploaded file.
 *
 * A fresh name (`<uuid>.<extension>`) is generated for the file and sent,
 * together with `courseName`, as URL query parameters of a GET request to
 * `/api/UIUC-api/ingest`.
 *
 * @param file - File to ingest; when null/undefined the function returns without a request.
 * @returns The parsed JSON body when the response is ok, otherwise the raw
 *          response object; `undefined` when no file was given.
 */
const ingestFile = async (file: File | null) => {
  if (!file) return;

  // Text after the last '.', or '' when the name has no dot or only a leading dot
  // (the unsigned shift turns a -1 / -2 offset into a huge index, so slice yields '').
  const fileExtension = file.name.slice(((file.name.lastIndexOf(".") - 1) >>> 0) + 2);
  const uniqueFileName = `${uuidv4()}.${fileExtension}`;

  const queryParams = new URLSearchParams({
    courseName: courseName,
    fileName: uniqueFileName,
  }).toString();

  // The parameters travel only in the URL query string. The former `query`
  // member was dropped: fetch does not read it, and its fileName (file.name)
  // disagreed with the uniqueFileName actually sent.
  const requestObject = {
    method: 'GET',
    headers: {
      'Content-Type': 'application/json',
    },
  }

  // Actually we CAN await here, just don't await this function.
  console.log('right before call /ingest...')
  const response = await fetch(
    `/api/UIUC-api/ingest?${queryParams}`,
    requestObject,
  )

  // check if the response was ok
  if (response.ok) {
    const data = await response.json()
    // console.log(file.name as string + ' ingested successfully!!')
    console.log('Success or Failure:', data)
    return data
  } else {
    console.log('Error during ingest:', response.statusText)
    console.log('Full Response message:', response)
    return response
  }
}

const showToastOnInvalidImage = useCallback(() => {
notifications.show({
id: 'error-notification',
Expand Down Expand Up @@ -664,6 +559,13 @@ export const ChatInput = ({
}
}, []);

// Whenever inputContent (or the textarea ref) changes: copy inputContent into the
// local content state and, if the textarea is mounted, give it focus.
useEffect(() => {
  setContent(inputContent)
  textareaRef.current?.focus()
}, [inputContent, textareaRef])

// This is where we upload images and generate their presigned url
async function uploadImageAndGetUrl(file: File, courseName: string): Promise<string> {
try {
Expand Down
2 changes: 1 addition & 1 deletion src/components/Chat/ChatMessage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,7 @@ export const ChatMessage: FC<Props> = memo(
const { href, title } = props;
// console.log("href:", href);
// console.log("title:", title);
console.log("children:", children);
// console.log("children:", children);
const isCitationLink = /^\d+$/.test(children[0] as string);
if (isCitationLink) {
return (
Expand Down
Loading

1 comment on commit 44aeb80

@vercel
Copy link

@vercel vercel bot commented on 44aeb80 Dec 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.