add gemini to the mix

gregsadetsky · Dec 13, 2023 · 6b9f70a · 6b9f70a
1 parent e3b4a41
commit 6b9f70a
Show file tree

Hide file tree

Showing 8 changed files with 85 additions and 4 deletions.
diff --git a/.env.example b/.env.example
@@ -1 +1,2 @@
-VITE_OPENAI_KEY="sk-..."
+VITE_OPENAI_KEY="sk-..."
+VITE_GEMINI_KEY="..."
diff --git a/index.html b/index.html
@@ -21,6 +21,13 @@
   <body>
     <button style='font-size: 24px;' id='letsGo'>Start</button>
 
+    <div style='margin-top:20px;margin-bottom:20px;'>
+      <select id='aiSelector'>
+        <option value="gemini">Gemini</option>
+        <option value="gpt">GPT-4</option>
+      </select>
+    </div>
+
     <video autoplay playsinline webkit-playsinline muted hidden></video>
     <canvas id='canvas' width='800' height='600'></canvas>
 

diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -11,5 +11,8 @@
   "devDependencies": {
     "typescript": "^5.2.2",
     "vite": "^5.0.0"
+  },
+  "dependencies": {
+    "@google/generative-ai": "^0.1.1"
   }
 }
diff --git a/src/gemini.ts b/src/gemini.ts
@@ -0,0 +1,41 @@
+import { GoogleGenerativeAI } from "@google/generative-ai";
+
+// API key for the Gemini client, read from the VITE_GEMINI_KEY environment
+// variable through Vite's import.meta.env.
+// NOTE(review): Vite exposes VITE_-prefixed variables to client code, so this
+// key presumably ends up in the browser bundle — confirm this is dev-only.
+const DEFAULT_DEV_API_KEY = import.meta.env.VITE_GEMINI_KEY;
+
+// Instruction text sent along with every request. The {{USER_PROMPT}}
+// placeholder at the end is replaced with the user's text by
+// makeGeminiRequest before the prompt is sent.
+const GEMINI_SYSTEM_PROMPT = `the user is dictating with his or her camera on.
+they are showing you things visually and giving you text prompts.
+be very brief and concise.
+be extremely concise. this is very important for my career. do not ramble.
+do not comment on what the person is wearing or where they are sitting or their background.
+focus on their gestures and the question they ask you.
+do not mention that there are a sequence of pictures. focus only on the image or the images necessary to answer the question.
+don't comment if they are smiling. don't comment if they are frowning. just focus on what they're asking.
+
+----- USER PROMPT BELOW -----
+
+{{USER_PROMPT}}
+`;
+
+// Single client instance, created once when this module is loaded and shared
+// by every call to makeGeminiRequest.
+const genAI = new GoogleGenerativeAI(DEFAULT_DEV_API_KEY);
+
+/**
+ * Sends the user's text and one image to the "gemini-pro-vision" model and
+ * resolves with the text of the model's reply.
+ *
+ * @param text - the user's prompt; substituted verbatim for the
+ *   `{{USER_PROMPT}}` placeholder in GEMINI_SYSTEM_PROMPT.
+ * @param imageUrl - a data URL of the form "data:<mime-type>;base64,<data>".
+ * @returns the text content of the model's response.
+ * @throws Error if `imageUrl` is not a base64 data URL with a mime-type and data.
+ */
+export async function makeGeminiRequest(
+  text: string,
+  imageUrl: string
+): Promise<string> {
+  const model = genAI.getGenerativeModel({ model: "gemini-pro-vision" });
+
+  // split imageUrl of format "data:<mime-type>;base64,<data>"
+  // into 1) mime-type and 2) just the data
+  const scheme = "data:";
+  const marker = ";base64,";
+  const markerAt = imageUrl.indexOf(marker);
+  if (!imageUrl.startsWith(scheme) || markerAt === -1) {
+    throw new Error("makeGeminiRequest: imageUrl must be a base64 data URL");
+  }
+  const mimeType = imageUrl.slice(scheme.length, markerAt);
+  const data = imageUrl.slice(markerAt + marker.length);
+  if (mimeType === "" || data === "") {
+    throw new Error("makeGeminiRequest: imageUrl is missing its mime-type or data");
+  }
+
+  // Use a replacer function rather than a replacement string: a string would
+  // have "$$", "$&", "$`" and "$'" inside the user's text interpreted as
+  // special replacement patterns instead of being inserted literally.
+  const prompt = GEMINI_SYSTEM_PROMPT.replace("{{USER_PROMPT}}", () => text);
+
+  const result = await model.generateContent([
+    prompt,
+    {
+      inlineData: {
+        mimeType,
+        data,
+      },
+    },
+  ]);
+  const response = await result.response;
+  return response.text();
+}
diff --git a/src/main.ts b/src/main.ts
@@ -1,7 +1,8 @@
-import { makeRequest } from "./openai";
+import { makeOpenAIRequest } from "./openai";
 import { startDictation, stopDictation, restartDictation } from "./dictation";
 import { startCamera, stopCamera } from "./camera";
 import { scaleAndStackImagesAndGetBase64 } from "./imageStacker";
+import { makeGeminiRequest } from "./gemini";
 
 const IMAGE_STACK_SIZE = 3;
 
@@ -35,7 +36,16 @@ function dictationEventHandler(message?: string) {
     const base64 = scaleAndStackImagesAndGetBase64(imageStack);
     const textPrompt = unsentMessages.join(" ");
     unsentMessages = [];
-    makeRequest(textPrompt, base64).then((result) => {
+
+    let aiFunction = null;
+    aiFunction =
+      document.querySelector("#aiSelector")!.value === "gemini"
+        ? makeGeminiRequest
+        : makeOpenAIRequest;
+
+    aiFunction(textPrompt, base64).then((result) => {
+      console.log("result", result);
+
       // the dictation is catching its own speech!!!!! stop dictation before speaking.
       stopDictation();
       let utterance = new SpeechSynthesisUtterance(result);

diff --git a/src/openai.ts b/src/openai.ts
@@ -10,7 +10,7 @@ do not mention that there are a sequence of pictures. focus only on the image or
 don't comment if they are smiling. don't comment if they are frowning. just focus on what they're asking.
 `;
 
-export async function makeRequest(
+export async function makeOpenAIRequest(
   text: string,
   imageUrl: string,
   apiKey = DEFAULT_DEV_API_KEY

diff --git a/vite.config.js b/vite.config.js
@@ -0,0 +1,8 @@
+import { defineConfig } from "vite";
+
+// Vite configuration for this project.
+export default defineConfig({
+  // Exclude @google/generative-ai from Vite's dependency pre-bundling.
+  // Background: https://stackoverflow.com/a/75953479
+  optimizeDeps: {
+    exclude: ["@google/generative-ai"],
+  },
+});