diff --git a/.env.example b/.env.example
index afd8a7d..1f2f725 100644
--- a/.env.example
+++ b/.env.example
@@ -1 +1,2 @@
-VITE_OPENAI_KEY="sk-..."
\ No newline at end of file
+VITE_OPENAI_KEY="sk-..."
+VITE_GEMINI_KEY="..."
\ No newline at end of file
diff --git a/index.html b/index.html
index c7913fd..823f9b0 100644
--- a/index.html
+++ b/index.html
@@ -21,6 +21,13 @@
+
+    <!-- model picker: choose which vision API answers (read by src/main.ts) -->
+    <select id="aiSelector">
+      <option value="openai">OpenAI</option>
+      <option value="gemini">Gemini</option>
+    </select>
+
diff --git a/package-lock.json b/package-lock.json
index 3c782ef..a38b177 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -7,6 +7,9 @@
"": {
"name": "repo",
"version": "0.0.0",
+ "dependencies": {
+ "@google/generative-ai": "^0.1.1"
+ },
"devDependencies": {
"typescript": "^5.2.2",
"vite": "^5.0.0"
@@ -364,6 +367,14 @@
"node": ">=12"
}
},
+ "node_modules/@google/generative-ai": {
+ "version": "0.1.1",
+ "resolved": "https://registry.npmjs.org/@google/generative-ai/-/generative-ai-0.1.1.tgz",
+ "integrity": "sha512-cbzKa8mT9YkTrT4XUuENIuvlqiJjwDgcD2Ks4L99Az9dWLgdXn8xnETEAZLOpqzoGx+1PuATZqlUnVRAeLbMgA==",
+ "engines": {
+ "node": ">=18.0.0"
+ }
+ },
"node_modules/@rollup/rollup-android-arm-eabi": {
"version": "4.7.0",
"resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.7.0.tgz",
diff --git a/package.json b/package.json
index d9f06d6..409e8af 100644
--- a/package.json
+++ b/package.json
@@ -11,5 +11,8 @@
"devDependencies": {
"typescript": "^5.2.2",
"vite": "^5.0.0"
+ },
+ "dependencies": {
+ "@google/generative-ai": "^0.1.1"
}
}
diff --git a/src/gemini.ts b/src/gemini.ts
new file mode 100644
index 0000000..03ad1c1
--- /dev/null
+++ b/src/gemini.ts
@@ -0,0 +1,47 @@
+import { GoogleGenerativeAI } from "@google/generative-ai";
+
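+// NOTE: Vite exposes only VITE_-prefixed env vars to client code, and the
+// value is inlined into the served bundle -- dev only, do not ship real keys.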
+const DEFAULT_DEV_API_KEY = import.meta.env.VITE_GEMINI_KEY;
+
+const GEMINI_SYSTEM_PROMPT = `the user is dictating with his or her camera on.
+they are showing you things visually and giving you text prompts.
+be very brief and concise.
+be extremely concise. this is very important for my career. do not ramble.
+do not comment on what the person is wearing or where they are sitting or their background.
+focus on their gestures and the question they ask you.
+do not mention that there are a sequence of pictures. focus only on the image or the images necessary to answer the question.
+don't comment if they are smiling. don't comment if they are frowning. just focus on what they're asking.
+
+----- USER PROMPT BELOW -----
+
+{{USER_PROMPT}}
+`;
+
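+// unlike makeOpenAIRequest, the key here is fixed once at module load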
+const genAI = new GoogleGenerativeAI(DEFAULT_DEV_API_KEY);
+
+export async function makeGeminiRequest(text: string, imageUrl: string) {
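+  // gemini-pro-vision is the multimodal variant: it accepts text and image parts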
+ const model = genAI.getGenerativeModel({ model: "gemini-pro-vision" });
+
+  // split a data URL like "data:image/jpeg;base64,AAAA..." into
+  // 1) its mime type ("image/jpeg") and 2) the raw base64 payload
+  let [mimeType, data] = imageUrl.split(";base64,");
+  mimeType = mimeType.split(":")[1];
+
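+  // generateContent takes an ordered array of parts; here a text prompt
+  // followed by a single inline base64 image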
+ const result = await model.generateContent([
+ GEMINI_SYSTEM_PROMPT.replace("{{USER_PROMPT}}", text),
+ {
+ inlineData: {
+ mimeType,
+ data,
+ },
+ },
+ ]);
+  const response = await result.response;
+  // text() returns the concatenated text parts of the first candidate
+  return response.text();
+}
diff --git a/src/main.ts b/src/main.ts
index 6e5967d..ec2d564 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -1,7 +1,8 @@
-import { makeRequest } from "./openai";
+import { makeOpenAIRequest } from "./openai";
import { startDictation, stopDictation, restartDictation } from "./dictation";
import { startCamera, stopCamera } from "./camera";
import { scaleAndStackImagesAndGetBase64 } from "./imageStacker";
+import { makeGeminiRequest } from "./gemini";
const IMAGE_STACK_SIZE = 3;
@@ -35,7 +36,16 @@ function dictationEventHandler(message?: string) {
const base64 = scaleAndStackImagesAndGetBase64(imageStack);
const textPrompt = unsentMessages.join(" ");
unsentMessages = [];
- makeRequest(textPrompt, base64).then((result) => {
+
+  // route the prompt to whichever backend the user picked in the dropdown
+  const aiFunction =
+    document.querySelector<HTMLSelectElement>("#aiSelector")!.value === "gemini"
+      ? makeGeminiRequest
+      : makeOpenAIRequest;
+
+ aiFunction(textPrompt, base64).then((result) => {
+ console.log("result", result);
+
// the dictation is catching its own speech!!!!! stop dictation before speaking.
stopDictation();
let utterance = new SpeechSynthesisUtterance(result);
diff --git a/src/openai.ts b/src/openai.ts
index bb9c103..39903ee 100644
--- a/src/openai.ts
+++ b/src/openai.ts
@@ -10,7 +10,7 @@ do not mention that there are a sequence of pictures. focus only on the image or
don't comment if they are smiling. don't comment if they are frowning. just focus on what they're asking.
`;
-export async function makeRequest(
+export async function makeOpenAIRequest(
text: string,
imageUrl: string,
apiKey = DEFAULT_DEV_API_KEY
diff --git a/vite.config.js b/vite.config.js
new file mode 100644
index 0000000..cbd8786
--- /dev/null
+++ b/vite.config.js
@@ -0,0 +1,8 @@
+import { defineConfig } from "vite";
+
+export default defineConfig({
+  // dev-server pre-bundling trips over this package; see https://stackoverflow.com/a/75953479
+ optimizeDeps: {
+ exclude: ["@google/generative-ai"],
+ },
+});