Skip to content

Commit fd0a922

Browse files
committed
feat: Add usePuppeteerFetch option to QSource and DialoqbaseSettings
1 parent e92371f commit fd0a922

File tree

14 files changed, with 136 additions and 79 deletions.

14 files changed, with 136 additions and 79 deletions.

app/ui/src/routes/settings/application.tsx

+7
Original file line number | Diff line number | Diff line change
@@ -154,6 +154,13 @@ export default function SettingsApplicationRoot() {
154154
>
155155
<Switch />
156156
</Form.Item>
157+
<Form.Item
158+
label="Enhanced Website loader"
159+
name="usePuppeteerFetch"
160+
valuePropName="checked"
161+
>
162+
<Switch />
163+
</Form.Item>
157164
</div>
158165
<div className="bg-gray-50 border-x border-b rounded-b-md rounded-x-md px-4 py-3 text-right sm:px-6 dark:bg-[#141414] dark:border-gray-600">
159166
<button
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
-- AlterTable
2+
ALTER TABLE "DialoqbaseSettings" ADD COLUMN "usePuppeteerFetch" BOOLEAN DEFAULT false;

server/prisma/schema.prisma

+1
Original file line number | Diff line number | Diff line change
@@ -99,6 +99,7 @@ model DialoqbaseSettings {
9999
defaultChatModel String @default("gpt-3.5-turbo-dbase")
100100
defaultEmbeddingModel String @default("dialoqbase_eb_text-embedding-ada-002")
101101
ollamaURL String? @default("http://host.docker.internal:11434")
102+
usePuppeteerFetch Boolean? @default(false)
102103
}
103104

104105
model BotIntegration {

server/prisma/seed.ts

+58 -58
Original file line number | Diff line number | Diff line change
@@ -488,70 +488,70 @@ const removeTensorflowSupport = async () => {
488488
});
489489
};
490490

491-
const replaceOldEmbeddings = async () => {
492-
await prisma.bot.updateMany({
493-
where: {
494-
embedding: "openai",
495-
},
496-
data: {
497-
embedding: "dialoqbase_eb_text-embedding-ada-002",
498-
},
499-
});
491+
// const replaceOldEmbeddings = async () => {
492+
// await prisma.bot.updateMany({
493+
// where: {
494+
// embedding: "openai",
495+
// },
496+
// data: {
497+
// embedding: "dialoqbase_eb_text-embedding-ada-002",
498+
// },
499+
// });
500500

501-
await prisma.bot.updateMany({
502-
where: {
503-
embedding: "cohere",
504-
},
505-
data: {
506-
embedding: "dialoqbase_eb_small",
507-
},
508-
});
501+
// await prisma.bot.updateMany({
502+
// where: {
503+
// embedding: "cohere",
504+
// },
505+
// data: {
506+
// embedding: "dialoqbase_eb_small",
507+
// },
508+
// });
509509

510-
await prisma.bot.updateMany({
511-
where: {
512-
embedding: "transformer",
513-
},
514-
data: {
515-
embedding: "dialoqbase_eb_Xenova/all-MiniLM-L6-v2",
516-
},
517-
});
510+
// await prisma.bot.updateMany({
511+
// where: {
512+
// embedding: "transformer",
513+
// },
514+
// data: {
515+
// embedding: "dialoqbase_eb_Xenova/all-MiniLM-L6-v2",
516+
// },
517+
// });
518518

519-
await prisma.bot.updateMany({
520-
where: {
521-
embedding: "google-gecko",
522-
},
523-
data: {
524-
embedding: "dialoqbase_eb_models/embedding-gecko-001",
525-
},
526-
});
519+
// await prisma.bot.updateMany({
520+
// where: {
521+
// embedding: "google-gecko",
522+
// },
523+
// data: {
524+
// embedding: "dialoqbase_eb_models/embedding-gecko-001",
525+
// },
526+
// });
527527

528-
await prisma.bot.updateMany({
529-
where: {
530-
embedding: "jina-api",
531-
},
532-
data: {
533-
embedding: "dialoqbase_eb_jina-embeddings-v2-base-en",
534-
},
535-
});
528+
// await prisma.bot.updateMany({
529+
// where: {
530+
// embedding: "jina-api",
531+
// },
532+
// data: {
533+
// embedding: "dialoqbase_eb_jina-embeddings-v2-base-en",
534+
// },
535+
// });
536536

537-
await prisma.bot.updateMany({
538-
where: {
539-
embedding: "jina",
540-
},
541-
data: {
542-
embedding: "dialoqbase_eb_Xenova/jina-embeddings-v2-small-en",
543-
},
544-
});
537+
// await prisma.bot.updateMany({
538+
// where: {
539+
// embedding: "jina",
540+
// },
541+
// data: {
542+
// embedding: "dialoqbase_eb_Xenova/jina-embeddings-v2-small-en",
543+
// },
544+
// });
545545

546-
await prisma.bot.updateMany({
547-
where: {
548-
embedding: "google",
549-
},
550-
data: {
551-
embedding: "dialoqbase_eb_embedding-001",
552-
},
553-
});
554-
};
546+
// await prisma.bot.updateMany({
547+
// where: {
548+
// embedding: "google",
549+
// },
550+
// data: {
551+
// embedding: "dialoqbase_eb_embedding-001",
552+
// },
553+
// });
554+
// };
555555

556556
const updateGeminiStreamingToTrue = async () => {
557557
await prisma.dialoqbaseModels.update({

server/src/handlers/api/v1/admin/type.ts

+1
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@ export type UpdateDialoqbaseSettingsRequest = {
33
noOfBotsPerUser: number;
44
allowUserToCreateBots: boolean;
55
allowUserToRegister: boolean;
6+
usePuppeteerFetch: boolean;
67
};
78
};
89

server/src/loader/web.ts

+16 -3
Original file line number | Diff line number | Diff line change
@@ -1,31 +1,44 @@
11
import { BaseDocumentLoader } from "langchain/document_loaders/base";
22
import { Document } from "langchain/document";
33
import { websiteParser } from "../utils/website-parser";
4-
// import puppeteerFetch from "../utils/puppeteer-fetch";
4+
import puppeteerFetch, { closePuppeteer } from "../utils/puppeteer-fetch";
55

66
export interface WebLoaderParams {
77
url: string;
8+
usePuppeteerFetch?: boolean;
9+
doNotClosePuppeteer?: boolean;
810
}
911

1012
export class DialoqbaseWebLoader
1113
extends BaseDocumentLoader
1214
implements WebLoaderParams {
1315
url: string;
16+
usePuppeteerFetch?: boolean;
17+
doNotClosePuppeteer?: boolean;
1418

15-
constructor({ url }: WebLoaderParams) {
19+
constructor({ url, usePuppeteerFetch, doNotClosePuppeteer }: WebLoaderParams) {
1620
super();
1721
this.url = url;
22+
this.usePuppeteerFetch = usePuppeteerFetch;
23+
this.doNotClosePuppeteer = doNotClosePuppeteer;
1824
}
1925

2026
async _fetchHTML(): Promise<string> {
27+
if (this.usePuppeteerFetch) {
28+
console.log(`[DialoqbaseWebLoader] Using puppeteer to fetch ${this.url}`)
29+
const response = await puppeteerFetch(this.url, true);
30+
if (!this.doNotClosePuppeteer) {
31+
await closePuppeteer();
32+
}
33+
return response;
34+
}
2135
const response = await fetch(this.url);
2236
return await response.text();
2337
}
2438

2539
async load(): Promise<Document<Record<string, any>>[]> {
2640
const html = await this._fetchHTML();
2741
const text = websiteParser(html);
28-
console.log(text)
2942
const metadata = { source: this.url };
3043
return [new Document({ pageContent: text, metadata })];
3144
}

server/src/queue/controllers/crawl.controller.ts

+6 -1
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,13 @@ import { PrismaClient } from "@prisma/client";
22
import { QSource } from "../type";
33
import { crawl } from "../../utils/crawl";
44
import { websiteQueueController } from "./website.controller";
5+
import { closePuppeteer } from "../../utils/puppeteer-fetch";
56
const prisma = new PrismaClient();
67

78
export const crawlQueueController = async (source: QSource) => {
89
let maxDepth = source.maxDepth || 1;
910
let maxLinks = source.maxLinks || 1;
10-
const data = await crawl(source.content!, maxDepth, maxLinks);
11+
const data = await crawl(source.content!, maxDepth, maxLinks, source.usePuppeteerFetch);
1112
const links = Array.from(data?.links || []);
1213

1314
for (const link of links) {
@@ -27,6 +28,8 @@ export const crawlQueueController = async (source: QSource) => {
2728
embedding: source.embedding,
2829
chunkOverlap: source.chunkOverlap,
2930
chunkSize: source.chunkSize,
31+
usePuppeteerFetch: source.usePuppeteerFetch,
32+
doNotClosePuppeteer: true,
3033
},
3134
prisma
3235
);
@@ -41,4 +44,6 @@ export const crawlQueueController = async (source: QSource) => {
4144
},
4245
});
4346
}
47+
48+
await closePuppeteer()
4449
};

server/src/queue/controllers/website.controller.ts

+2
Original file line number | Diff line number | Diff line change
@@ -73,6 +73,8 @@ export const websiteQueueController = async (
7373
} else {
7474
const loader = new DialoqbaseWebLoader({
7575
url: source.content!,
76+
usePuppeteerFetch: source.usePuppeteerFetch,
77+
doNotClosePuppeteer: source.doNotClosePuppeteer,
7678
});
7779
docs = await loader.load();
7880
}

server/src/queue/index.ts

+2 -1
Original file line number | Diff line number | Diff line change
@@ -34,9 +34,10 @@ export default async function queueHandler(job: SandboxedJob) {
3434
status: "PROCESSING",
3535
},
3636
});
37-
const { chunkOverlap, chunkSize } = await getRagSettings(prisma);
37+
const { chunkOverlap, chunkSize , usePuppeteerFetch} = await getRagSettings(prisma);
3838
source.chunkOverlap = chunkOverlap;
3939
source.chunkSize = chunkSize;
40+
source.usePuppeteerFetch = usePuppeteerFetch;
4041
switch (source.type.toLowerCase()) {
4142
case "website":
4243
await websiteQueueController(source, prisma);

server/src/queue/type.ts

+2
Original file line number | Diff line number | Diff line change
@@ -6,4 +6,6 @@ export interface QSource extends BotSource {
66
maxLinks?: number;
77
chunkSize: number;
88
chunkOverlap: number;
9+
usePuppeteerFetch?: boolean;
10+
doNotClosePuppeteer?: boolean;
911
}

server/src/schema/api/v1/admin/index.ts

+2
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ export const dialoqbaseSettingsSchema: FastifySchema = {
2222
dynamicallyFetchOllamaModels: { type: "boolean" },
2323
hideDefaultModels: { type: "boolean" },
2424
ollamaURL: { type: "string" },
25+
usePuppeteerFetch: { type: "boolean" },
2526
},
2627
},
2728
};
@@ -47,6 +48,7 @@ export const updateDialoqbaseSettingsSchema: FastifySchema = {
4748
defaultEmbeddingModel: { type: "string" },
4849
hideDefaultModels: { type: "boolean" },
4950
ollamaURL: { type: "string" },
51+
usePuppeteerFetch: { type: "boolean" },
5052
},
5153
},
5254
response: {

server/src/utils/crawl.ts

+22 -13
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import axios from "axios";
2-
import { load } from "cheerio";
2+
import { CheerioAPI, load } from "cheerio";
3+
import puppeteerFetch from "./puppeteer-fetch";
34

45
type CrawlResult = {
56
links: Set<string>;
@@ -13,7 +14,8 @@ const queuedLinks: Set<string> = new Set();
1314
export const crawl = async (
1415
startUrl: string,
1516
maxDepth = 2,
16-
maxLinks = 20
17+
maxLinks = 20,
18+
usePuppeteerFetch = false
1719
): Promise<CrawlResult> => {
1820
const queue: { url: string; depth: number }[] = [{ url: startUrl, depth: 0 }];
1921
const fetchedLinks: Set<string> = new Set();
@@ -28,20 +30,27 @@ export const crawl = async (
2830
}
2931

3032
try {
31-
const response = await axios.get(url, {
32-
headers: {
33-
Accept: "text/html",
34-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
35-
},
36-
});
3733

38-
const contentType = response.headers['content-type'];
39-
if (!contentType || !contentType.includes("text/html")) {
40-
return;
41-
}
34+
let $: CheerioAPI;
35+
36+
if (usePuppeteerFetch) {
37+
const response = await puppeteerFetch(url);
38+
$ = load(response);
39+
} else {
40+
const response = await axios.get(url, {
41+
headers: {
42+
Accept: "text/html",
43+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
44+
},
45+
});
4246

43-
const $ = load(response.data);
47+
const contentType = response.headers['content-type'];
48+
if (!contentType || !contentType.includes("text/html")) {
49+
return;
50+
}
4451

52+
$ = load(response.data);
53+
}
4554
visitedLinks.add(url);
4655
fetchedLinks.add(url);
4756

server/src/utils/puppeteer-fetch.ts

+13 -3
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ let browser: Browser;
3030

3131

3232
const init = async () => {
33-
if (!browser) {
33+
if (!browser || !browser.connected) {
3434
browser = await puppeteer.launch({
3535
headless: true,
3636
args: ['--no-sandbox', '--disable-setuid-sandbox'],
@@ -52,10 +52,10 @@ const puppeteerFetch = async (url: string, useReadability = false) => {
5252
${executor}
5353
return executor();
5454
}())
55-
`) as { content?: string }
55+
`) as { content?: string, title?: string };
5656
if (resultArticle?.content) {
5757
await page.close();
58-
return resultArticle.content;
58+
return `<!DOCTYPE html><html><head><title>${resultArticle.title}</title></head><body>${resultArticle.content}</body></html>`
5959
}
6060
console.error(`[puppeteerFetch] Error fetching ${url}: Readability failed`);
6161
}
@@ -68,6 +68,16 @@ const puppeteerFetch = async (url: string, useReadability = false) => {
6868
}
6969
}
7070

71+
export const closePuppeteer = async () => {
72+
try {
73+
if (browser.connected) {
74+
await browser.close();
75+
}
76+
} catch (error) {
77+
console.error(`[closePuppeteer] Error closing browser: ${error.message}`);
78+
}
79+
}
80+
7181

7282

7383
export default puppeteerFetch;

server/src/utils/rag-settings.ts

+2
Original file line number | Diff line number | Diff line change
@@ -5,11 +5,13 @@ export const getRagSettings = async (prisma: PrismaClient) => {
55
select: {
66
defaultChunkSize: true,
77
defaultChunkOverlap: true,
8+
usePuppeteerFetch: true,
89
},
910
});
1011

1112
return {
1213
chunkSize: data?.defaultChunkSize || 1000,
1314
chunkOverlap: data?.defaultChunkOverlap || 200,
15+
usePuppeteerFetch: data?.usePuppeteerFetch
1416
};
1517
};

0 commit comments

Comments
 (0)