Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## 0.5.4 — 2025-12-08

### Added
- Browser mode: `--agent` flag enables ChatGPT Agent mode for agentic tasks (web browsing, code execution). Response capture waits for the send button to reappear, ensuring full agent output is captured after all actions complete.

### Changed
- Docs: README now explicitly warns against `pnpx @steipete/oracle` (pnpx cache breaks sqlite bindings); use `npx -y @steipete/oracle` instead. Thanks Xuanwo for flagging this.
- Browser uploads: stick to the single reliable file-input path (no drag/drop fallbacks), wait for the composer to render the new “N files” pill/remove-card UI before sending, and prefer non-image inputs. Thanks Peter for the repros and screenshots that caught the regressions.
Expand Down
2 changes: 2 additions & 0 deletions bin/oracle-cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ interface CliOptions extends OptionValues {
remoteChrome?: string;
browserPort?: number;
browserDebugPort?: number;
agent?: boolean;
remoteHost?: string;
remoteToken?: string;
copyMarkdown?: boolean;
Expand Down Expand Up @@ -380,6 +381,7 @@ program
new Option('--browser-inline-files', 'Paste files directly into the ChatGPT composer instead of uploading attachments.').default(false),
)
.addOption(new Option('--browser-bundle-files', 'Bundle all attachments into a single archive before uploading.').default(false))
.option('--agent', 'Enable ChatGPT Agent mode for agentic tasks (web browsing, code execution). Browser engine only.', false)
.option(
'--retain-hours <hours>',
'Prune stored sessions older than this many hours before running (set 0 to disable).',
Expand Down
1 change: 1 addition & 0 deletions docs/browser-mode.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ You can pass the same payload inline (`--browser-inline-cookies '<json or base64
- `--browser-url`: override ChatGPT base URL if needed.
- `--browser-inline-files`: paste resolved files directly into the composer instead of uploading them (debug fallback; useful when the attachment button is broken).
- `--browser-bundle-files`: bundle all resolved attachments into a single temp file before uploading (useful when you want one upload even with many files).
- `--agent`: enable ChatGPT Agent mode for agentic tasks (web browsing, code execution). When enabled, Oracle clicks the "+" button and selects "Agent" before submitting the prompt. Response capture waits for the send button to reappear, ensuring the full agent output is captured after all actions complete.
- sqlite bindings: automatic rebuilds now require `ORACLE_ALLOW_SQLITE_REBUILD=1`. Without it, the CLI logs instructions instead of running `pnpm rebuild` on your behalf.
- `--model`: the same flag used for API runs controls the ChatGPT picker. Pass descriptive labels such as `--model "ChatGPT 5.1 Instant"` when you want a specific browser variant; canonical API names (`gpt-5.1-pro`, `gpt-5.1`) still work and map to their default picker labels.
- Cookie sync is mandatory—if we can’t copy cookies from Chrome, the run exits early. Use the hidden `--browser-allow-cookie-errors` flag only when you’re intentionally running logged out (it skips the early exit but still warns).
Expand Down
116 changes: 116 additions & 0 deletions src/browser/actions/agentMode.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import type { ChromeClient, BrowserLogger } from '../types.js';
import { buildClickDispatcher } from './domEvents.js';

/**
 * Enables ChatGPT Agent mode by clicking the "+" button and selecting "Agent" from the menu.
 *
 * Evaluates the page-side script produced by `buildAgentModeExpression()` and maps the
 * `status` it reports onto a log line plus a typed result.
 *
 * @param Runtime - CDP Runtime domain used to evaluate the script in the page.
 * @param logger - Receives exactly one human-readable status line per call.
 * @returns The reported status. Anything unrecognised (including a script that threw inside
 *   the page, which yields no value) is reported as `'button-missing'`, so callers that only
 *   accept `'enabled'` / `'already-enabled'` treat it as a failure.
 */
export async function enableAgentMode(
  Runtime: ChromeClient['Runtime'],
  logger: BrowserLogger,
): Promise<{ status: 'enabled' | 'already-enabled' | 'not-found' | 'button-missing' }> {
  const { result, exceptionDetails } = await Runtime.evaluate({
    expression: buildAgentModeExpression(),
    awaitPromise: true,
    returnByValue: true,
  });

  const value = result?.value as { status?: string; debug?: string } | undefined;

  switch (value?.status) {
    case 'enabled':
      logger('Agent mode: enabled');
      return { status: 'enabled' };
    case 'already-enabled':
      logger('Agent mode: already active');
      return { status: 'already-enabled' };
    case 'not-found':
      logger(`Agent mode: option not found in menu${value.debug ? ` - ${value.debug}` : ''}`);
      return { status: 'not-found' };
    case 'button-missing':
      logger('Agent mode: plus button not found');
      return { status: 'button-missing' };
    default: {
      // A throwing page script does not reject the evaluate() call; CDP reports it through
      // `exceptionDetails` instead. Surface that (or the unrecognised status) so the failure
      // can be diagnosed rather than being logged as a bare "unexpected result".
      const detail =
        exceptionDetails?.exception?.description ??
        exceptionDetails?.text ??
        (value?.status === undefined ? undefined : `status ${JSON.stringify(value.status)}`);
      logger(`Agent mode: unexpected result${detail ? ` - ${detail}` : ''}`);
      return { status: 'button-missing' };
    }
  }
}

/**
 * Builds the page-side script (an async IIFE, as source text) that switches the composer
 * into Agent mode.
 *
 * The script:
 *  1. locates the composer "+" button and returns `{ status: 'button-missing' }` if absent;
 *  2. clicks it and waits for the menu to render;
 *  3. polls the open menus/popovers for an item whose text or `data-testid` contains
 *     "agent" (case-insensitive), re-clicking "+" every third attempt;
 *  4. clicks the first match and returns `{ status: 'enabled' }`, or gives up after all
 *     attempts with `{ status: 'not-found', debug }`, where `debug` holds the first
 *     100 characters of each menu that was found.
 *
 * The script never reports `'already-enabled'`.
 *
 * Assumes the code injected by `buildClickDispatcher()` defines `dispatchClickSequence(element)`.
 *
 * @returns JavaScript source intended for `Runtime.evaluate` with `awaitPromise: true`.
 */
function buildAgentModeExpression(): string {
  return `(async () => {
    ${buildClickDispatcher()}

    const PLUS_BUTTON_SELECTOR = 'button[data-testid="composer-plus-btn"]';
    // Every popper wrapper is matched here, including the ones mounted directly under
    // <body>, so no separate pass over body-level floating elements is required.
    const AGENT_MENU_CONTAINER_SELECTOR =
      '[role="menu"], [role="listbox"], [data-radix-menu-content], [data-radix-popper-content-wrapper], [data-state="open"]';
    const AGENT_MENU_ITEM_SELECTOR = 'button, [role="menuitem"], [role="option"], div[tabindex]';
    const AGENT_DEBUG_MENU_SELECTOR = '[role="menu"], [role="listbox"], [data-radix-menu-content]';
    const MENU_WAIT_MS = 500;
    const AGENT_POST_CLICK_WAIT_MS = 200;
    const MAX_ATTEMPTS = 10;
    const ATTEMPT_INTERVAL_MS = 300;
    const AGENT_REOPEN_EVERY = 3;

    const agentPause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

    const plusButton = document.querySelector(PLUS_BUTTON_SELECTOR);
    if (!plusButton) {
      return { status: 'button-missing' };
    }

    // Open the menu and give it time to render.
    dispatchClickSequence(plusButton);
    await agentPause(MENU_WAIT_MS);

    // NOTE(review): this is a substring match, so any item in an open container whose
    // label merely contains "agent" qualifies.
    const looksLikeAgentItem = (item) => {
      const text = (item.textContent || '').toLowerCase().trim();
      const testId = (item.getAttribute('data-testid') || '').toLowerCase();
      return text.includes('agent') || testId.includes('agent');
    };

    const findAgentOption = () => {
      for (const container of document.querySelectorAll(AGENT_MENU_CONTAINER_SELECTOR)) {
        for (const item of container.querySelectorAll(AGENT_MENU_ITEM_SELECTOR)) {
          if (looksLikeAgentItem(item)) {
            return item;
          }
        }
      }
      return null;
    };

    for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
      const agentOption = findAgentOption();
      if (agentOption) {
        dispatchClickSequence(agentOption);
        await agentPause(AGENT_POST_CLICK_WAIT_MS);
        return { status: 'enabled' };
      }

      // The menu might not be open yet; click "+" again every few attempts.
      if (attempt > 0 && attempt % AGENT_REOPEN_EVERY === 0) {
        dispatchClickSequence(plusButton);
      }
      await agentPause(ATTEMPT_INTERVAL_MS);
    }

    // Nothing matched: report what the menus contained to aid debugging.
    const menuTexts = Array.from(document.querySelectorAll(AGENT_DEBUG_MENU_SELECTOR))
      .map((menu) => (menu.textContent || '').slice(0, 100))
      .join(' | ');

    return {
      status: 'not-found',
      debug: menuTexts ? 'Found menus: ' + menuTexts : 'No menus found'
    };
  })()`;
}
69 changes: 58 additions & 11 deletions src/browser/actions/assistantResponse.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {
COPY_BUTTON_SELECTOR,
FINISHED_ACTIONS_SELECTOR,
STOP_BUTTON_SELECTOR,
SEND_BUTTON_SELECTOR,
} from '../constants.js';
import { delay } from '../utils.js';
import { logDomFailure, logConversationSnapshot, buildConversationDebugExpression } from '../domDebug.js';
Expand All @@ -17,17 +18,18 @@ export async function waitForAssistantResponse(
Runtime: ChromeClient['Runtime'],
timeoutMs: number,
logger: BrowserLogger,
options?: { agentMode?: boolean },
): Promise<{ text: string; html?: string; meta: { turnId?: string | null; messageId?: string | null } }> {
logger('Waiting for ChatGPT response');
const expression = buildResponseObserverExpression(timeoutMs);
const expression = buildResponseObserverExpression(timeoutMs, options?.agentMode);
const evaluationPromise = Runtime.evaluate({ expression, awaitPromise: true, returnByValue: true });
const raceReadyEvaluation = evaluationPromise.then(
(value) => ({ kind: 'evaluation' as const, value }),
(error) => {
throw { source: 'evaluation' as const, error };
},
);
const pollerPromise = pollAssistantCompletion(Runtime, timeoutMs).then(
const pollerPromise = pollAssistantCompletion(Runtime, timeoutMs, options?.agentMode).then(
(value) => {
if (!value) {
throw { source: 'poll' as const, error: new Error(ASSISTANT_POLL_TIMEOUT_ERROR) };
Expand Down Expand Up @@ -213,11 +215,13 @@ async function terminateRuntimeExecution(Runtime: ChromeClient['Runtime']): Prom
async function pollAssistantCompletion(
Runtime: ChromeClient['Runtime'],
timeoutMs: number,
agentMode?: boolean,
): Promise<{ text: string; html?: string; meta: { turnId?: string | null; messageId?: string | null } } | null> {
const watchdogDeadline = Date.now() + timeoutMs;
let previousLength = 0;
let stableCycles = 0;
const requiredStableCycles = 6;
// Agent mode needs more stable cycles since agents pause between actions
const requiredStableCycles = agentMode ? 15 : 6;
while (Date.now() < watchdogDeadline) {
const snapshot = await readAssistantSnapshot(Runtime);
const normalized = normalizeAssistantSnapshot(snapshot);
Expand All @@ -229,12 +233,27 @@ async function pollAssistantCompletion(
} else {
stableCycles += 1;
}
const [stopVisible, completionVisible] = await Promise.all([
const [stopVisible, sendVisible, completionVisible] = await Promise.all([
isStopButtonVisible(Runtime),
isSendButtonVisible(Runtime),
isCompletionVisible(Runtime),
]);
if (completionVisible || (!stopVisible && stableCycles >= requiredStableCycles)) {
return normalized;

if (agentMode) {
// In agent mode: require send button visible (meaning agent is done)
// and stop button gone, with stable content
if (sendVisible && !stopVisible && stableCycles >= requiredStableCycles) {
return normalized;
}
// Also accept if completion actions are visible
if (completionVisible && !stopVisible && sendVisible) {
return normalized;
}
} else {
// Standard mode: return when stop button is gone and response is stable
if (completionVisible || (!stopVisible && stableCycles >= requiredStableCycles)) {
return normalized;
}
}
} else {
previousLength = 0;
Expand All @@ -257,6 +276,18 @@ async function isStopButtonVisible(Runtime: ChromeClient['Runtime']): Promise<bo
}
}

/**
 * Reports whether an element matching `SEND_BUTTON_SELECTOR` is currently present in the page.
 *
 * Only DOM presence is checked (`document.querySelector`); no visibility or enabled-state
 * test is applied. Evaluation failures are deliberately treated as "not present" so callers
 * can keep polling.
 *
 * @param Runtime - CDP Runtime domain used to evaluate the probe in the page.
 * @returns `true` when the selector matches an element, otherwise `false`.
 */
async function isSendButtonVisible(Runtime: ChromeClient['Runtime']): Promise<boolean> {
  try {
    // JSON.stringify yields a correctly escaped JS string literal, so a selector that
    // contains quotes or backslashes cannot break the evaluated expression.
    const { result } = await Runtime.evaluate({
      expression: `Boolean(document.querySelector(${JSON.stringify(SEND_BUTTON_SELECTOR)}))`,
      returnByValue: true,
    });
    return Boolean(result?.value);
  } catch {
    return false;
  }
}

async function isCompletionVisible(Runtime: ChromeClient['Runtime']): Promise<boolean> {
try {
const { result } = await Runtime.evaluate({
Expand Down Expand Up @@ -311,13 +342,15 @@ function buildAssistantSnapshotExpression(): string {
})()`;
}

function buildResponseObserverExpression(timeoutMs: number): string {
function buildResponseObserverExpression(timeoutMs: number, agentMode?: boolean): string {
const selectorsLiteral = JSON.stringify(ANSWER_SELECTORS);
return `(() => {
${buildClickDispatcher()}
const SELECTORS = ${selectorsLiteral};
const STOP_SELECTOR = '${STOP_BUTTON_SELECTOR}';
const SEND_SELECTOR = '${SEND_BUTTON_SELECTOR}';
const FINISHED_SELECTOR = '${FINISHED_ACTIONS_SELECTOR}';
const AGENT_MODE = ${agentMode ? 'true' : 'false'};
const settleDelayMs = 800;
${buildAssistantExtractor('extractFromTurns')}

Expand Down Expand Up @@ -364,25 +397,39 @@ function buildResponseObserverExpression(timeoutMs: number): string {
});

const waitForSettle = async (snapshot) => {
const settleWindowMs = 5000;
const settleWindowMs = AGENT_MODE ? 30000 : 5000;
const settleIntervalMs = 400;
const deadline = Date.now() + settleWindowMs;
let latest = snapshot;
let lastLength = snapshot?.text?.length ?? 0;
let stableCycles = 0;
const requiredStableCycles = AGENT_MODE ? 15 : 3;
while (Date.now() < deadline) {
await new Promise((resolve) => setTimeout(resolve, settleIntervalMs));
const refreshed = extractFromTurns();
if (refreshed && (refreshed.text?.length ?? 0) >= lastLength) {
if (refreshed && (refreshed.text?.length ?? 0) > lastLength) {
latest = refreshed;
lastLength = refreshed.text?.length ?? lastLength;
stableCycles = 0;
} else {
stableCycles++;
}
const stopVisible = Boolean(document.querySelector(STOP_SELECTOR));
const sendVisible = Boolean(document.querySelector(SEND_SELECTOR));
const finishedVisible =
Boolean(document.querySelector(FINISHED_SELECTOR)) ||
Array.from(document.querySelectorAll('.markdown')).some((n) => (n.textContent || '').trim() === 'Done');

if (!stopVisible || finishedVisible) {
break;
if (AGENT_MODE) {
// In agent mode: wait for send button to appear (agent is done)
if (sendVisible && !stopVisible && stableCycles >= requiredStableCycles) {
break;
}
} else {
// Standard mode
if (!stopVisible || finishedVisible) {
break;
}
}
}
return latest ?? snapshot;
Expand Down
2 changes: 2 additions & 0 deletions src/browser/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ export const DEFAULT_BROWSER_CONFIG: ResolvedBrowserConfig = {
keepBrowser: false,
hideWindow: false,
desiredModel: DEFAULT_MODEL_TARGET,
agentMode: false,
debug: false,
allowCookieErrors: false,
remoteChrome: null,
Expand Down Expand Up @@ -63,6 +64,7 @@ export function resolveBrowserConfig(config: BrowserAutomationConfig | undefined
chromeProfile: config?.chromeProfile ?? DEFAULT_BROWSER_CONFIG.chromeProfile,
chromePath: config?.chromePath ?? DEFAULT_BROWSER_CONFIG.chromePath,
chromeCookiePath: config?.chromeCookiePath ?? DEFAULT_BROWSER_CONFIG.chromeCookiePath,
agentMode: config?.agentMode ?? DEFAULT_BROWSER_CONFIG.agentMode,
debug: config?.debug ?? DEFAULT_BROWSER_CONFIG.debug,
allowCookieErrors: config?.allowCookieErrors ?? envAllowCookieErrors ?? DEFAULT_BROWSER_CONFIG.allowCookieErrors,
manualLogin,
Expand Down
19 changes: 17 additions & 2 deletions src/browser/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import {
uploadAttachmentFile,
waitForAttachmentCompletion,
readAssistantSnapshot,
enableAgentMode,
} from './pageActions.js';
import { uploadAttachmentViaDataTransfer } from './actions/remoteFileTransfer.js';
import { estimateTokenCount, withRetries, delay } from './utils.js';
Expand Down Expand Up @@ -313,6 +314,13 @@ export async function runBrowserMode(options: BrowserRunOptions): Promise<Browse
await raceWithDisconnect(ensurePromptReady(Runtime, config.inputTimeoutMs, logger));
logger(`Prompt textarea ready (after model switch, ${promptText.length.toLocaleString()} chars queued)`);
}
// Enable agent mode if requested
if (config.agentMode) {
const agentResult = await raceWithDisconnect(enableAgentMode(Runtime, logger));
if (agentResult.status !== 'enabled' && agentResult.status !== 'already-enabled') {
throw new Error(`Failed to enable agent mode: ${agentResult.status}`);
}
}
const attachmentNames = attachments.map((a) => path.basename(a.path));
if (attachments.length > 0) {
if (!DOM) {
Expand All @@ -328,7 +336,7 @@ export async function runBrowserMode(options: BrowserRunOptions): Promise<Browse
}
await raceWithDisconnect(submitPrompt({ runtime: Runtime, input: Input, attachmentNames }, promptText, logger));
stopThinkingMonitor = startThinkingStatusMonitor(Runtime, logger, options.verbose ?? false);
const answer = await raceWithDisconnect(waitForAssistantResponse(Runtime, config.timeoutMs, logger));
const answer = await raceWithDisconnect(waitForAssistantResponse(Runtime, config.timeoutMs, logger, { agentMode: config.agentMode }));
answerText = answer.text;
answerHtml = answer.html ?? '';
const copiedMarkdown = await raceWithDisconnect(
Expand Down Expand Up @@ -733,6 +741,13 @@ async function runRemoteBrowserMode(
await ensurePromptReady(Runtime, config.inputTimeoutMs, logger);
logger(`Prompt textarea ready (after model switch, ${promptText.length.toLocaleString()} chars queued)`);
}
// Enable agent mode if requested
if (config.agentMode) {
const agentResult = await enableAgentMode(Runtime, logger);
if (agentResult.status !== 'enabled' && agentResult.status !== 'already-enabled') {
throw new Error(`Failed to enable agent mode: ${agentResult.status}`);
}
}

const attachmentNames = attachments.map((a) => path.basename(a.path));
if (attachments.length > 0) {
Expand All @@ -750,7 +765,7 @@ async function runRemoteBrowserMode(
}
await submitPrompt({ runtime: Runtime, input: Input, attachmentNames }, promptText, logger);
stopThinkingMonitor = startThinkingStatusMonitor(Runtime, logger, options.verbose ?? false);
const answer = await waitForAssistantResponse(Runtime, config.timeoutMs, logger);
const answer = await waitForAssistantResponse(Runtime, config.timeoutMs, logger, { agentMode: config.agentMode });
answerText = answer.text;
answerHtml = answer.html ?? '';

Expand Down
1 change: 1 addition & 0 deletions src/browser/pageActions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ export {
buildAssistantExtractorForTest,
buildConversationDebugExpressionForTest,
} from './actions/assistantResponse.js';
export { enableAgentMode } from './actions/agentMode.js';
Loading