Skip to content

Commit

Permalink
add gemini to the mix
Browse files Browse the repository at this point in the history
  • Loading branch information
gregsadetsky committed Dec 13, 2023
1 parent e3b4a41 commit 6b9f70a
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 4 deletions.
3 changes: 2 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
VITE_OPENAI_KEY="sk-..."
VITE_OPENAI_KEY="sk-..."
VITE_GEMINI_KEY="..."
7 changes: 7 additions & 0 deletions index.html
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@
<body>
<button style='font-size: 24px;' id='letsGo'>Start</button>

<div style='margin-top:20px;margin-bottom:20px;'>
<select id='aiSelector'>
<option value="gemini">Gemini</option>
<option value="gpt">GPT-4</option>
</select>
</div>

<video autoplay playsinline webkit-playsinline muted hidden></video>
<canvas id='canvas' width='800' height='600'></canvas>

Expand Down
11 changes: 11 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,8 @@
"devDependencies": {
"typescript": "^5.2.2",
"vite": "^5.0.0"
},
"dependencies": {
"@google/generative-ai": "^0.1.1"
}
}
41 changes: 41 additions & 0 deletions src/gemini.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import { GoogleGenerativeAI } from "@google/generative-ai";

// API key for the Gemini client, read from the VITE_GEMINI_KEY environment variable.
// NOTE(review): Vite presumably inlines VITE_-prefixed variables into the client bundle,
// which would expose this key to anyone loading the page — confirm this is dev-only.
const DEFAULT_DEV_API_KEY = import.meta.env.VITE_GEMINI_KEY;

// Instruction preamble sent with every Gemini request. The {{USER_PROMPT}}
// placeholder on the last line is replaced with the user's text by makeGeminiRequest.
const GEMINI_SYSTEM_PROMPT = `the user is dictating with his or her camera on.
they are showing you things visually and giving you text prompts.
be very brief and concise.
be extremely concise. this is very important for my career. do not ramble.
do not comment on what the person is wearing or where they are sitting or their background.
focus on their gestures and the question they ask you.
do not mention that there are a sequence of pictures. focus only on the image or the images necessary to answer the question.
don't comment if they are smiling. don't comment if they are frowning. just focus on what they're asking.
----- USER PROMPT BELOW -----
{{USER_PROMPT}}
`;

// Single SDK client, constructed once at module load with the key above.
// NOTE(review): if VITE_GEMINI_KEY is unset the key is presumably undefined here —
// confirm how the SDK behaves in that case.
const genAI = new GoogleGenerativeAI(DEFAULT_DEV_API_KEY);

/**
 * Sends a text prompt together with one image to the "gemini-pro-vision"
 * model and returns the text of the model's reply.
 *
 * @param text - The user's prompt. It is substituted verbatim for the
 *   `{{USER_PROMPT}}` placeholder in `GEMINI_SYSTEM_PROMPT`.
 * @param imageUrl - A base64 data URL of the form
 *   `data:<mime-type>;base64,<data>`.
 * @returns The text content of the model's response.
 * @throws Error if `imageUrl` is not a well-formed base64 data URL.
 */
export async function makeGeminiRequest(
  text: string,
  imageUrl: string
): Promise<string> {
  const DATA_URL_SCHEME = "data:";
  const BASE64_MARKER = ";base64,";

  // Split "data:<mime-type>;base64,<data>" into its mime type and payload.
  // Validate before calling the API so a malformed URL fails with a clear
  // message instead of sending `undefined` fields to the service.
  const markerIndex = imageUrl.indexOf(BASE64_MARKER);
  if (!imageUrl.startsWith(DATA_URL_SCHEME) || markerIndex === -1) {
    throw new Error(
      'makeGeminiRequest: imageUrl must be a data URL of the form "data:<mime-type>;base64,<data>"'
    );
  }
  const mimeType = imageUrl.slice(DATA_URL_SCHEME.length, markerIndex);
  const data = imageUrl.slice(markerIndex + BASE64_MARKER.length);
  if (mimeType === "" || data === "") {
    throw new Error(
      "makeGeminiRequest: imageUrl is missing its mime type or its base64 data"
    );
  }

  // Use a replacer function so that "$&", "$$", etc. in the user's text are
  // inserted literally rather than interpreted as replacement patterns.
  const prompt = GEMINI_SYSTEM_PROMPT.replace("{{USER_PROMPT}}", () => text);

  const model = genAI.getGenerativeModel({ model: "gemini-pro-vision" });
  const result = await model.generateContent([
    prompt,
    {
      inlineData: {
        mimeType,
        data,
      },
    },
  ]);
  const response = await result.response;
  return response.text();
}
14 changes: 12 additions & 2 deletions src/main.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import { makeRequest } from "./openai";
import { makeOpenAIRequest } from "./openai";
import { startDictation, stopDictation, restartDictation } from "./dictation";
import { startCamera, stopCamera } from "./camera";
import { scaleAndStackImagesAndGetBase64 } from "./imageStacker";
import { makeGeminiRequest } from "./gemini";

const IMAGE_STACK_SIZE = 3;

Expand Down Expand Up @@ -35,7 +36,16 @@ function dictationEventHandler(message?: string) {
const base64 = scaleAndStackImagesAndGetBase64(imageStack);
const textPrompt = unsentMessages.join(" ");
unsentMessages = [];
makeRequest(textPrompt, base64).then((result) => {

let aiFunction = null;
aiFunction =
document.querySelector("#aiSelector")!.value === "gemini"
? makeGeminiRequest
: makeOpenAIRequest;

aiFunction(textPrompt, base64).then((result) => {
console.log("result", result);

// The dictation picks up the synthesized speech itself, so stop dictation before speaking.
stopDictation();
let utterance = new SpeechSynthesisUtterance(result);
Expand Down
2 changes: 1 addition & 1 deletion src/openai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ do not mention that there are a sequence of pictures. focus only on the image or
don't comment if they are smiling. don't comment if they are frowning. just focus on what they're asking.
`;

export async function makeRequest(
export async function makeOpenAIRequest(
text: string,
imageUrl: string,
apiKey = DEFAULT_DEV_API_KEY
Expand Down
8 changes: 8 additions & 0 deletions vite.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import { defineConfig } from "vite";

// Vite configuration for this project.
export default defineConfig({
// https://stackoverflow.com/a/75953479
// List @google/generative-ai under optimizeDeps.exclude so Vite leaves it out of
// its dependency optimization step (see the linked answer for the motivation).
optimizeDeps: {
exclude: ["@google/generative-ai"],
},
});

0 comments on commit 6b9f70a

Please sign in to comment.