diff --git a/examples/scrape-pricing-plans.ts b/examples/scrape-pricing-plans.ts
new file mode 100644
index 0000000..14d508a
--- /dev/null
+++ b/examples/scrape-pricing-plans.ts
@@ -0,0 +1,53 @@
+import { z } from 'zod'
+import { Api, invokable } from '../src/apis'
+import { OpenAiAgent } from '../src/chat-agents/open-ai'
+import { WorkGptRunner } from '../src/runners/workgpt'
+import { haltProgram } from '../src/runners/control'
+import { TextBrowser } from '../src/apis/text-browser'
+
+export class WorkGptControl extends Api {
+  @invokable({
+    usage: 'Finishes the program. Call when you have an answer.',
+    schema: z.object({
+      pricingPlans: z.array(
+        z.object({
+          planName: z.string(),
+          planAmount: z.string(),
+          planDescription: z.string().optional(),
+        })
+      ),
+    }),
+  })
+  onFinish(result: any) {
+    haltProgram(result)
+  }
+}
+
+async function main() {
+  const agent = new OpenAiAgent({
+    verbose: true,
+    temperature: 0,
+    model: 'gpt-4-0613',
+  })
+
+  const apis = [new TextBrowser(), new WorkGptControl()]
+
+  const runner = new WorkGptRunner({
+    agent,
+    apis,
+  })
+
+  const result = await runner.runWithDirective(
+    `
+    Your purpose is to extract pricing plans from SAAS service websites.
+    Follow the instructions below. Think step by step.
+    1. Navigate to hubspot.com
+    2. Find the pricing page for their CRM.
+    3. Extract the text of the page to understand the service's different pricing plans.
+    4. Call WorkGptControl.onFinish with the parsed pricing plans.`
+  )
+
+  console.log('Result', JSON.stringify(result, null, 2))
+}
+
+main()
diff --git a/src/apis/text-browser.ts b/src/apis/text-browser.ts
index 56f1dad..2bb8a52 100644
--- a/src/apis/text-browser.ts
+++ b/src/apis/text-browser.ts
@@ -5,12 +5,12 @@ import puppeteer, { Page } from 'puppeteer'
 
 export class TextBrowser extends Api {
   @invokable({
-    usage: `Useful for getting text contents of a website.`,
+    usage: `Useful for getting plain text contents of a website.`,
     schema: z.object({
       url: z.string(),
     }),
   })
-  async browse({ url }: { url: string }): Promise<string> {
+  async getSiteText({ url }: { url: string }): Promise<string> {
     const browser = await puppeteer.launch({ headless: 'new' })
     const page = await browser.newPage()
     await page.goto(url)
@@ -18,6 +18,61 @@ export class TextBrowser extends Api {
     await browser.close()
     return text ?? ''
   }
+
+  @invokable({
+    usage: `Get sitemap of a website. Useful for getting a list of pages of a website.`,
+    schema: z.object({
+      url: z.string(),
+    }),
+  })
+  async getSitemap({ url }: { url: string }) {
+    const browser = await puppeteer.launch({ headless: 'new' })
+    const base = new URL(url).hostname
+    const page = await browser.newPage()
+    await page.goto(url)
+    const links = await page.evaluate(() =>
+      Array.from(document.querySelectorAll('a[href]')).map((node) => ({
+        href: node.getAttribute('href'),
+        linkText: node.textContent,
+      }))
+    )
+    await browser.close()
+
+    const normalizedLinks: {
+      linkText: string
+      linkUrl: string
+    }[] = []
+
+    for (const { href, linkText } of links) {
+      if (!href || !linkText) {
+        continue
+      }
+
+      // Resolve against the page URL so relative hrefs are kept, not dropped.
+      let linkUrl: URL
+
+      try {
+        linkUrl = new URL(href, url)
+      } catch (e) {
+        continue
+      }
+
+      if (linkUrl.hostname !== base) {
+        continue
+      }
+
+      // Check uniqueness
+      if (normalizedLinks.some((link) => link.linkUrl === linkUrl.href)) {
+        continue
+      }
+
+      normalizedLinks.push({
+        linkText: linkText.trim(),
+        linkUrl: linkUrl.href,
+      })
+    }
+
+    return normalizedLinks
+  }
 }
 
 async function extractText(page: Page) {