Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 152 additions & 41 deletions electron/LLMHelper.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
import { GoogleGenerativeAI, GenerativeModel } from "@google/generative-ai"
import fs from "fs"
import sharp from "sharp"

// Shape of a non-streaming reply from Ollama's /api/generate endpoint
// (requests are sent with stream: false, so a single JSON object comes back).
interface OllamaResponse {
// Full generated text for the prompt.
response: string
// True once the model has finished generating.
done: boolean
}

// Reply shape for image-bearing /api/generate calls. Structurally identical
// to OllamaResponse; kept separate so the vision call site is self-describing.
interface OllamaVisionResponse {
// Full generated text describing/answering about the attached images.
response: string
// True once the model has finished generating.
done: boolean
}

export class LLMHelper {
private model: GenerativeModel | null = null
private readonly systemPrompt = `You are Wingman AI, a helpful, proactive assistant for any kind of problem or situation (not just coding). For any user input, analyze the situation, provide a clear problem statement, relevant context, and suggest several possible responses or actions the user could take next. Always explain your reasoning. Present your suggestions as a list of options or next steps.`
Expand All @@ -15,12 +21,12 @@ export class LLMHelper {

constructor(apiKey?: string, useOllama: boolean = false, ollamaModel?: string, ollamaUrl?: string) {
this.useOllama = useOllama

if (useOllama) {
this.ollamaUrl = ollamaUrl || "http://localhost:11434"
this.ollamaModel = ollamaModel || "gemma:latest" // Default fallback
this.ollamaModel = ollamaModel // Default fallback
console.log(`[LLMHelper] Using Ollama with model: ${this.ollamaModel}`)

// Auto-detect and use first available model if specified model doesn't exist
this.initializeOllamaModel()
} else if (apiKey) {
Expand Down Expand Up @@ -50,6 +56,7 @@ export class LLMHelper {
return text;
}


private async callOllama(prompt: string): Promise<string> {
try {
const response = await fetch(`${this.ollamaUrl}/api/generate`, {
Expand Down Expand Up @@ -80,6 +87,51 @@ export class LLMHelper {
}
}

/**
 * Heuristic check for whether the configured Ollama model can accept images.
 * Matches the model name against known vision-model substrings; this is a
 * best-effort allowlist, not a capability query against the Ollama server.
 */
private supportsOllamaVision(): boolean {
  // Guard: the constructor no longer falls back to a default model name, so
  // ollamaModel may be undefined until initializeOllamaModel() resolves one.
  // Without this check .toLowerCase() would throw a TypeError.
  if (!this.ollamaModel) {
    return false
  }
  const model = this.ollamaModel.toLowerCase()
  // NOTE(review): hardcoded hint list — new vision model families will need
  // entries here; querying the server for capabilities would be more robust.
  const visionHints = ["vision", "llava", "moondream", "pixtral", "minicpm", "gpt4o", "flux", "reka"]
  return visionHints.some((hint) => model.includes(hint))
}

/**
 * Sends a multimodal generate request to Ollama with base64-encoded images
 * attached, returning the model's text reply.
 * Throws early when the configured model has no vision support, and rethrows
 * (after logging) any transport or API-level failure.
 */
private async callOllamaVision(prompt: string, images: string[]): Promise<string> {
  if (!this.supportsOllamaVision()) {
    throw new Error(`Current Ollama model (${this.ollamaModel}) does not support image understanding. Switch to a vision-capable model such as llama3.2-vision, llava, gemma2:vision, etc.`)
  }

  try {
    // Non-streaming request so the reply arrives as a single JSON object.
    const payload = {
      model: this.ollamaModel,
      prompt,
      images,
      stream: false,
      options: {
        temperature: 0.4,
        top_p: 0.9
      }
    }

    const res = await fetch(`${this.ollamaUrl}/api/generate`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(payload)
    })

    if (!res.ok) {
      throw new Error(`Ollama vision API error: ${res.status} ${res.statusText}`)
    }

    const data: OllamaVisionResponse = await res.json()
    if (!data?.response) {
      throw new Error("Ollama vision API returned an empty response")
    }

    return data.response
  } catch (error) {
    console.error("[LLMHelper] Error calling Ollama vision endpoint:", error)
    throw error
  }
}

private async checkOllamaAvailable(): Promise<boolean> {
try {
const response = await fetch(`${this.ollamaUrl}/api/tags`)
Expand Down Expand Up @@ -123,15 +175,26 @@ export class LLMHelper {

public async extractProblemFromImages(imagePaths: string[]) {
try {
const imageParts = await Promise.all(imagePaths.map(path => this.fileToGenerativePart(path)))

const prompt = `${this.systemPrompt}\n\nYou are a wingman. Please analyze these images and extract the following information in JSON format:\n{
"problem_statement": "A clear statement of the problem or situation depicted in the images.",
"context": "Relevant background or context from the images.",
"suggested_responses": ["First possible answer or action", "Second possible answer or action", "..."],
"reasoning": "Explanation of why these suggestions are appropriate."
}\nImportant: Return ONLY the JSON object, without any markdown formatting or code blocks.`

if (this.useOllama) {
const imagePayloads = await Promise.all(
imagePaths.map(async (imagePath) => (await fs.promises.readFile(imagePath)).toString("base64"))
)
const text = await this.callOllamaVision(prompt, imagePayloads)
return JSON.parse(this.cleanJsonResponse(text))
}

if (!this.model) {
throw new Error("Gemini model is not initialized. Provide a Gemini API key or enable Ollama mode.")
}

const imageParts = await Promise.all(imagePaths.map(path => this.fileToGenerativePart(path)))
const result = await this.model.generateContent([prompt, ...imageParts])
const response = await result.response
const text = this.cleanJsonResponse(response.text())
Expand All @@ -153,13 +216,21 @@ export class LLMHelper {
}
}\nImportant: Return ONLY the JSON object, without any markdown formatting or code blocks.`

console.log("[LLMHelper] Calling Gemini LLM for solution...");
console.log("[LLMHelper] Calling LLM for solution...");
try {
const result = await this.model.generateContent(prompt)
console.log("[LLMHelper] Gemini LLM returned result.");
const response = await result.response
const text = this.cleanJsonResponse(response.text())
const parsed = JSON.parse(text)
const text = this.useOllama
? await this.callOllama(prompt)
: await (async () => {
if (!this.model) {
throw new Error("Gemini model is not initialized. Provide a Gemini API key or enable Ollama mode.")
}
const result = await this.model.generateContent(prompt)
const response = await result.response
return response.text()
})()

const cleaned = this.cleanJsonResponse(text)
const parsed = JSON.parse(cleaned)
console.log("[LLMHelper] Parsed LLM response:", parsed)
return parsed
} catch (error) {
Expand All @@ -170,8 +241,6 @@ export class LLMHelper {

public async debugSolutionWithImages(problemInfo: any, currentCode: string, debugImagePaths: string[]) {
try {
const imageParts = await Promise.all(debugImagePaths.map(path => this.fileToGenerativePart(path)))

const prompt = `${this.systemPrompt}\n\nYou are a wingman. Given:\n1. The original problem or situation: ${JSON.stringify(problemInfo, null, 2)}\n2. The current response or approach: ${currentCode}\n3. The debug information in the provided images\n\nPlease analyze the debug information and provide feedback in this JSON format:\n{
"solution": {
"code": "The code or main answer here.",
Expand All @@ -182,6 +251,21 @@ export class LLMHelper {
}
}\nImportant: Return ONLY the JSON object, without any markdown formatting or code blocks.`

if (this.useOllama) {
const imagePayloads = await Promise.all(
debugImagePaths.map(async (imagePath) => (await fs.promises.readFile(imagePath)).toString("base64"))
)
const text = await this.callOllamaVision(prompt, imagePayloads)
const parsed = JSON.parse(this.cleanJsonResponse(text))
console.log("[LLMHelper] Parsed debug LLM response:", parsed)
return parsed
}

if (!this.model) {
throw new Error("Gemini model is not initialized. Provide a Gemini API key or enable Ollama mode.")
}

const imageParts = await Promise.all(debugImagePaths.map(path => this.fileToGenerativePart(path)))
const result = await this.model.generateContent([prompt, ...imageParts])
const response = await result.response
const text = this.cleanJsonResponse(response.text())
Expand All @@ -195,61 +279,88 @@ export class LLMHelper {
}

/**
 * Describes an audio file on disk via Gemini and suggests follow-up actions.
 * Only supported in Gemini mode — Ollama's generate API has no audio input.
 * @param audioPath path to the audio file to analyze
 * @returns the model's natural-language answer plus a capture timestamp
 * @throws when Ollama mode is active or the Gemini model is not initialized
 */
public async analyzeAudioFile(audioPath: string) {
  if (this.useOllama) {
    throw new Error("Audio analysis is currently only supported when using Gemini. Switch off USE_OLLAMA or supply a Gemini API key.")
  }

  try {
    if (!this.model) {
      throw new Error("Gemini model is not initialized. Provide a Gemini API key or disable Ollama mode.")
    }

    const audioData = await fs.promises.readFile(audioPath)
    // Derive the MIME type from the file extension instead of always claiming
    // audio/mp3; unknown extensions keep the previous mp3 default, so existing
    // callers see no behavior change.
    const ext = audioPath.split(".").pop()?.toLowerCase()
    const mimeByExt: Record<string, string> = {
      wav: "audio/wav",
      ogg: "audio/ogg",
      m4a: "audio/mp4",
      aac: "audio/aac",
      flac: "audio/flac",
      mp3: "audio/mp3"
    }
    const audioPart = {
      inlineData: {
        data: audioData.toString("base64"),
        mimeType: (ext && mimeByExt[ext]) || "audio/mp3"
      }
    }
    const prompt = `${this.systemPrompt}\n\nDescribe this audio clip in a short, concise answer. In addition to your main answer, suggest several possible actions or responses the user could take next based on the audio. Do not return a structured JSON object, just answer naturally as you would to a user.`
    const result = await this.model.generateContent([prompt, audioPart])
    const response = await result.response
    const text = response.text()
    return { text, timestamp: Date.now() }
  } catch (error) {
    console.error("Error analyzing audio file:", error)
    throw error
  }
}

/**
 * Describes an audio clip supplied as base64 data (no filesystem read) via
 * Gemini, with the caller providing the MIME type directly.
 * @param data base64-encoded audio bytes
 * @param mimeType MIME type of the encoded audio
 * @returns the model's natural-language answer plus a capture timestamp
 */
public async analyzeAudioFromBase64(data: string, mimeType: string) {
  if (this.useOllama) {
    throw new Error("Audio analysis is currently only supported when using Gemini. Switch off USE_OLLAMA or supply a Gemini API key.")
  }

  try {
    if (!this.model) {
      throw new Error("Gemini model is not initialized. Provide a Gemini API key or disable Ollama mode.")
    }

    const prompt = `${this.systemPrompt}\n\nDescribe this audio clip in a short, concise answer. In addition to your main answer, suggest several possible actions or responses the user could take next based on the audio. Do not return a structured JSON object, just answer naturally as you would to a user and be concise.`
    const audioPart = {
      inlineData: { data, mimeType }
    }

    const generation = await this.model.generateContent([prompt, audioPart])
    const reply = await generation.response
    return { text: reply.text(), timestamp: Date.now() }
  } catch (error) {
    console.error("Error analyzing audio from base64:", error)
    throw error
  }
}

/**
 * Describes an image file and suggests follow-up actions. Works in both
 * Ollama mode (vision-capable models only, via callOllamaVision) and
 * Gemini mode.
 * @param imagePath path to the image file to analyze
 * @returns the model's natural-language answer plus a capture timestamp
 */
public async analyzeImageFile(imagePath: string) {
  try {
    const base64Image = (await fs.promises.readFile(imagePath)).toString("base64")
    const prompt = `${this.systemPrompt}\n\nDescribe the content of this image in a short, concise answer. In addition to your main answer, suggest several possible actions or responses the user could take next based on the image. Do not return a structured JSON object, just answer naturally as you would to a user. Be concise and brief.`

    if (this.useOllama) {
      // Ollama takes raw base64 images; no MIME type is needed.
      const text = await this.callOllamaVision(prompt, [base64Image])
      return { text: text.trim(), timestamp: Date.now() }
    }

    if (!this.model) {
      throw new Error("Gemini model is not initialized. Provide a Gemini API key or enable Ollama mode.")
    }

    // Pick a MIME type matching the file extension rather than always sending
    // image/png for any file; unknown extensions keep the previous png
    // default, so existing callers see no behavior change.
    const ext = imagePath.split(".").pop()?.toLowerCase()
    const mimeByExt: Record<string, string> = {
      jpg: "image/jpeg",
      jpeg: "image/jpeg",
      webp: "image/webp",
      gif: "image/gif",
      png: "image/png"
    }
    const imagePart = {
      inlineData: {
        data: base64Image,
        mimeType: (ext && mimeByExt[ext]) || "image/png"
      }
    }

    const result = await this.model.generateContent([prompt, imagePart])
    const response = await result.response
    const text = response.text()
    return { text, timestamp: Date.now() }
  } catch (error) {
    console.error("Error analyzing image file:", error)
    throw error
  }
}

Expand Down Expand Up @@ -357,4 +468,4 @@ export class LLMHelper {
return { success: false, error: error.message };
}
}
}
}
30 changes: 24 additions & 6 deletions electron/ScreenshotHelper.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@

import path from "node:path"
import fs from "node:fs"
import { app } from "electron"
import { app, desktopCapturer } from "electron"
import { v4 as uuidv4 } from "uuid"
import screenshot from "screenshot-desktop"

export class ScreenshotHelper {
private screenshotQueue: string[] = []
Expand All @@ -28,10 +27,10 @@ export class ScreenshotHelper {

// Create directories if they don't exist
if (!fs.existsSync(this.screenshotDir)) {
fs.mkdirSync(this.screenshotDir)
fs.mkdirSync(this.screenshotDir, { recursive: true })
}
if (!fs.existsSync(this.extraScreenshotDir)) {
fs.mkdirSync(this.extraScreenshotDir)
fs.mkdirSync(this.extraScreenshotDir, { recursive: true })
}
}

Expand Down Expand Up @@ -84,11 +83,30 @@ export class ScreenshotHelper {
// Add a small delay to ensure window is hidden
await new Promise(resolve => setTimeout(resolve, 100))

// Use Electron's desktopCapturer to get screen sources
const sources = await desktopCapturer.getSources({
types: ['screen'],
thumbnailSize: { width: 1024, height: 1024 }
})

if (sources.length === 0) {
throw new Error("No screen sources available")
}

// Get the primary screen (first source)
const primaryScreen = sources[0]
const image = primaryScreen.thumbnail

if (image.isEmpty()) {
throw new Error("Failed to capture screen - image is empty")
}

let screenshotPath = ""
const screenshotBuffer = image.toPNG()

if (this.view === "queue") {
screenshotPath = path.join(this.screenshotDir, `${uuidv4()}.png`)
await screenshot({ filename: screenshotPath })
await fs.promises.writeFile(screenshotPath, screenshotBuffer)

this.screenshotQueue.push(screenshotPath)
if (this.screenshotQueue.length > this.MAX_SCREENSHOTS) {
Expand All @@ -103,7 +121,7 @@ export class ScreenshotHelper {
}
} else {
screenshotPath = path.join(this.extraScreenshotDir, `${uuidv4()}.png`)
await screenshot({ filename: screenshotPath })
await fs.promises.writeFile(screenshotPath, screenshotBuffer)

this.extraScreenshotQueue.push(screenshotPath)
if (this.extraScreenshotQueue.length > this.MAX_SCREENSHOTS) {
Expand Down
5 changes: 4 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -155,5 +155,8 @@
"tailwind-merge": "^2.5.4",
"tesseract.js": "^5.0.5",
"uuid": "^11.0.3"
}
},
"bin": {
"start-cluely": "./start.sh"
}
}
3 changes: 3 additions & 0 deletions start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Launcher for Free Cluely: runs `npm start` from the repository root.
# Fail fast on errors, unset variables, and broken pipelines.
set -euo pipefail

# Resolve the project directory from the script's own location instead of a
# hardcoded per-user path (/home/nandita/...), so the launcher works from any
# checkout and when invoked via the package.json "bin" entry.
cd "$(dirname "$0")"
echo "Launching Free Cluely... hold tight!"
npm start