diff --git a/.env.example b/.env.example
index afd8a7d..1f2f725 100644
--- a/.env.example
+++ b/.env.example
@@ -1 +1,2 @@
-VITE_OPENAI_KEY="sk-..."
\ No newline at end of file
+VITE_OPENAI_KEY="sk-..."
+VITE_GEMINI_KEY="..."
\ No newline at end of file
diff --git a/index.html b/index.html
index c7913fd..823f9b0 100644
--- a/index.html
+++ b/index.html
@@ -21,6 +21,13 @@
+    <select id="aiSelector">
+      <option value="openai">OpenAI</option>
+      <option value="gemini">Gemini</option>
+    </select>
diff --git a/package-lock.json b/package-lock.json
index 3c782ef..a38b177 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -7,6 +7,9 @@
     "": {
       "name": "repo",
       "version": "0.0.0",
+      "dependencies": {
+        "@google/generative-ai": "^0.1.1"
+      },
       "devDependencies": {
         "typescript": "^5.2.2",
         "vite": "^5.0.0"
@@ -364,6 +367,14 @@
         "node": ">=12"
       }
     },
+    "node_modules/@google/generative-ai": {
+      "version": "0.1.1",
+      "resolved": "https://registry.npmjs.org/@google/generative-ai/-/generative-ai-0.1.1.tgz",
+      "integrity": "sha512-cbzKa8mT9YkTrT4XUuENIuvlqiJjwDgcD2Ks4L99Az9dWLgdXn8xnETEAZLOpqzoGx+1PuATZqlUnVRAeLbMgA==",
+      "engines": {
+        "node": ">=18.0.0"
+      }
+    },
     "node_modules/@rollup/rollup-android-arm-eabi": {
       "version": "4.7.0",
       "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.7.0.tgz",
diff --git a/package.json b/package.json
index d9f06d6..409e8af 100644
--- a/package.json
+++ b/package.json
@@ -11,5 +11,8 @@
   "devDependencies": {
     "typescript": "^5.2.2",
     "vite": "^5.0.0"
+  },
+  "dependencies": {
+    "@google/generative-ai": "^0.1.1"
   }
 }
diff --git a/src/gemini.ts b/src/gemini.ts
new file mode 100644
index 0000000..03ad1c1
--- /dev/null
+++ b/src/gemini.ts
@@ -0,0 +1,41 @@
+import { GoogleGenerativeAI } from "@google/generative-ai";
+
+const DEFAULT_DEV_API_KEY = import.meta.env.VITE_GEMINI_KEY;
+
+const GEMINI_SYSTEM_PROMPT = `the user is dictating with his or her camera on.
+they are showing you things visually and giving you text prompts.
+be very brief and concise.
+be extremely concise. this is very important for my career. do not ramble.
+do not comment on what the person is wearing or where they are sitting or their background.
+focus on their gestures and the question they ask you.
+do not mention that there are a sequence of pictures. focus only on the image or the images necessary to answer the question.
+don't comment if they are smiling. don't comment if they are frowning. just focus on what they're asking.
+
+----- USER PROMPT BELOW -----
+
+{{USER_PROMPT}}
+`;
+
+const genAI = new GoogleGenerativeAI(DEFAULT_DEV_API_KEY);
+
+export async function makeGeminiRequest(text: string, imageUrl: string) {
+  const model = genAI.getGenerativeModel({ model: "gemini-pro-vision" });
+
+  // split imageUrl of format "data:...;base64,"
+  // into 1) mime-type and 2) just the data
+  let [mimeType, data] = imageUrl.split(";base64,");
+  mimeType = mimeType.split(":")[1];
+
+  const result = await model.generateContent([
+    GEMINI_SYSTEM_PROMPT.replace("{{USER_PROMPT}}", text),
+    {
+      inlineData: {
+        mimeType,
+        data,
+      },
+    },
+  ]);
+  const response = await result.response;
+  const content = await response.text();
+  return content;
+}
diff --git a/src/main.ts b/src/main.ts
index 6e5967d..ec2d564 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -1,7 +1,8 @@
-import { makeRequest } from "./openai";
+import { makeOpenAIRequest } from "./openai";
 import { startDictation, stopDictation, restartDictation } from "./dictation";
 import { startCamera, stopCamera } from "./camera";
 import { scaleAndStackImagesAndGetBase64 } from "./imageStacker";
+import { makeGeminiRequest } from "./gemini";
 
 const IMAGE_STACK_SIZE = 3;
 
@@ -35,7 +36,16 @@ function dictationEventHandler(message?: string) {
   const base64 = scaleAndStackImagesAndGetBase64(imageStack);
   const textPrompt = unsentMessages.join(" ");
   unsentMessages = [];
-  makeRequest(textPrompt, base64).then((result) => {
+
+  let aiFunction = null;
+  aiFunction =
+    document.querySelector<HTMLSelectElement>("#aiSelector")!.value === "gemini"
+      ? makeGeminiRequest
+      : makeOpenAIRequest;
+
+  aiFunction(textPrompt, base64).then((result) => {
+    console.log("result", result);
+
     // the dictation is catching its own speech!!!!! stop dictation before speaking.
     stopDictation();
     let utterance = new SpeechSynthesisUtterance(result);
diff --git a/src/openai.ts b/src/openai.ts
index bb9c103..39903ee 100644
--- a/src/openai.ts
+++ b/src/openai.ts
@@ -10,7 +10,7 @@ do not mention that there are a sequence of pictures. focus only on the image or
 don't comment if they are smiling. don't comment if they are frowning. just focus on what they're asking.
 `;
 
-export async function makeRequest(
+export async function makeOpenAIRequest(
   text: string,
   imageUrl: string,
   apiKey = DEFAULT_DEV_API_KEY
diff --git a/vite.config.js b/vite.config.js
new file mode 100644
index 0000000..cbd8786
--- /dev/null
+++ b/vite.config.js
@@ -0,0 +1,8 @@
+import { defineConfig } from "vite";
+
+export default defineConfig({
+  // skip Vite's dev-server dependency pre-bundling for the SDK: https://stackoverflow.com/a/75953479
+  optimizeDeps: {
+    exclude: ["@google/generative-ai"],
+  },
+});