Skip to content

Commit 881e1c8

Browse files
authored
Merge pull request #272 from n4ze3m/next
v1.8.5
2 parents d7df246 + 83e634f commit 881e1c8

File tree

20 files changed

+1034
-116
lines changed

20 files changed

+1034
-116
lines changed

Dockerfile

+14-1
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,28 @@ RUN pnpm install
2525
RUN pnpm build
2626

2727
FROM node:18-slim
28+
2829
WORKDIR /app
2930

31+
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true
32+
33+
RUN apt-get update && apt-get install gnupg wget -y && \
34+
wget --quiet --output-document=- https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor > /etc/apt/trusted.gpg.d/google-archive.gpg && \
35+
sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' && \
36+
apt-get update && \
37+
apt-get install google-chrome-stable -y --no-install-recommends && \
38+
rm -rf /var/lib/apt/lists/*
39+
3040
RUN yarn config set registry https://registry.npmjs.org/
3141
RUN yarn config set network-timeout 1200000
3242

3343
RUN apt update && apt -y install --no-install-recommends ca-certificates git git-lfs openssh-client curl jq cmake sqlite3 openssl psmisc python3
44+
45+
3446
RUN apt -y install g++ make
35-
# RUN npm install -g node-gyp
47+
3648
RUN apt-get clean autoclean && apt-get autoremove --yes && rm -rf /var/lib/{apt,dpkg,cache,log}/
49+
3750
RUN npm --no-update-notifier --no-fund --global install pnpm
3851
# Copy API
3952
COPY --from=server /app/dist/ .

app/ui/package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "app",
33
"private": true,
4-
"version": "1.8.4",
4+
"version": "1.8.5",
55
"type": "module",
66
"scripts": {
77
"dev": "vite",

app/ui/src/routes/settings/application.tsx

+7
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,13 @@ export default function SettingsApplicationRoot() {
154154
>
155155
<Switch />
156156
</Form.Item>
157+
<Form.Item
158+
label="Enhanced Website loader"
159+
name="usePuppeteerFetch"
160+
valuePropName="checked"
161+
>
162+
<Switch />
163+
</Form.Item>
157164
</div>
158165
<div className="bg-gray-50 border-x border-b rounded-b-md rounded-x-md px-4 py-3 text-right sm:px-6 dark:bg-[#141414] dark:border-gray-600">
159166
<button

app/widget/src/hooks/useMessage.tsx

+5-5
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@ export type BotResponse = {
1515
};
1616

1717
const parsesStreamingResponse = (text: string) => {
18-
// event: chunk or result\ndata: been or object\n\n
19-
// console.log(`text: ${text}`);
2018
const REGEX = /event: (.+)\ndata: (.+)/g;
2119
const matches = text.matchAll(REGEX);
2220
const result = [];
@@ -149,18 +147,20 @@ export const useMessage = () => {
149147
if (type === "chunk") {
150148
const jsonMessage = JSON.parse(message);
151149
if (count === 0) {
152-
newMessage[appendingIndex].message = jsonMessage.message;
150+
newMessage[appendingIndex].message = jsonMessage.message + "▋";
153151
setMessages(newMessage);
154152
localStorage.setItem("DS_MESSAGE", JSON.stringify(newMessage));
155153
} else {
156-
newMessage[appendingIndex].message += jsonMessage.message;
154+
newMessage[appendingIndex].message =
155+
newMessage[appendingIndex].message.slice(0, -1) +
156+
jsonMessage.message +
157+
"▋";
157158
setMessages(newMessage);
158159
localStorage.setItem("DS_MESSAGE", JSON.stringify(newMessage));
159160
}
160161
count++;
161162
} else if (type === "result") {
162163
const responseData = JSON.parse(message) as BotResponse;
163-
console.log(responseData);
164164
newMessage[appendingIndex].message = responseData.bot.text;
165165
newMessage[appendingIndex].sources =
166166
responseData.bot.sourceDocuments;

package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "dialoqbase",
3-
"version": "1.8.4",
3+
"version": "1.8.5",
44
"description": "Create chatbots with ease",
55
"scripts": {
66
"ui:dev": "pnpm run --filter ui dev",

server/package.json

+6
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
"@langchain/community": "^0.0.35",
4747
"@langchain/google-genai": "^0.0.16",
4848
"@langchain/openai": "^0.0.18",
49+
"@mozilla/readability": "^0.5.0",
4950
"@prisma/client": "^5.9.1",
5051
"@slack/bolt": "^3.13.2",
5152
"@supabase/supabase-js": "^2.24.0",
@@ -77,6 +78,11 @@
7778
"pdf-parse": "^1.1.1",
7879
"pdfjs-dist": "^3.7.107",
7980
"pubsub-js": "^1.9.4",
81+
"puppeteer": "^22.11.0",
82+
"puppeteer-extra": "^3.3.6",
83+
"puppeteer-extra-plugin-block-resources": "^2.4.3",
84+
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
85+
"puppeteer-extra-plugin-stealth": "^2.11.2",
8086
"replicate": "^0.26.0",
8187
"sitemapper": "^3.2.6",
8288
"ts-node": "^10.9.1",
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
-- AlterTable
2+
ALTER TABLE "DialoqbaseSettings" ADD COLUMN "usePuppeteerFetch" BOOLEAN DEFAULT false;

server/prisma/schema.prisma

+1
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ model DialoqbaseSettings {
9999
defaultChatModel String @default("gpt-3.5-turbo-dbase")
100100
defaultEmbeddingModel String @default("dialoqbase_eb_text-embedding-ada-002")
101101
ollamaURL String? @default("http://host.docker.internal:11434")
102+
usePuppeteerFetch Boolean? @default(false)
102103
}
103104

104105
model BotIntegration {

server/prisma/seed.ts

+59-59
Original file line numberDiff line numberDiff line change
@@ -488,70 +488,70 @@ const removeTensorflowSupport = async () => {
488488
});
489489
};
490490

491-
const replaceOldEmbeddings = async () => {
492-
await prisma.bot.updateMany({
493-
where: {
494-
embedding: "openai",
495-
},
496-
data: {
497-
embedding: "dialoqbase_eb_text-embedding-ada-002",
498-
},
499-
});
491+
// const replaceOldEmbeddings = async () => {
492+
// await prisma.bot.updateMany({
493+
// where: {
494+
// embedding: "openai",
495+
// },
496+
// data: {
497+
// embedding: "dialoqbase_eb_text-embedding-ada-002",
498+
// },
499+
// });
500500

501-
await prisma.bot.updateMany({
502-
where: {
503-
embedding: "cohere",
504-
},
505-
data: {
506-
embedding: "dialoqbase_eb_small",
507-
},
508-
});
501+
// await prisma.bot.updateMany({
502+
// where: {
503+
// embedding: "cohere",
504+
// },
505+
// data: {
506+
// embedding: "dialoqbase_eb_small",
507+
// },
508+
// });
509509

510-
await prisma.bot.updateMany({
511-
where: {
512-
embedding: "transformer",
513-
},
514-
data: {
515-
embedding: "dialoqbase_eb_Xenova/all-MiniLM-L6-v2",
516-
},
517-
});
510+
// await prisma.bot.updateMany({
511+
// where: {
512+
// embedding: "transformer",
513+
// },
514+
// data: {
515+
// embedding: "dialoqbase_eb_Xenova/all-MiniLM-L6-v2",
516+
// },
517+
// });
518518

519-
await prisma.bot.updateMany({
520-
where: {
521-
embedding: "google-gecko",
522-
},
523-
data: {
524-
embedding: "dialoqbase_eb_models/embedding-gecko-001",
525-
},
526-
});
519+
// await prisma.bot.updateMany({
520+
// where: {
521+
// embedding: "google-gecko",
522+
// },
523+
// data: {
524+
// embedding: "dialoqbase_eb_models/embedding-gecko-001",
525+
// },
526+
// });
527527

528-
await prisma.bot.updateMany({
529-
where: {
530-
embedding: "jina-api",
531-
},
532-
data: {
533-
embedding: "dialoqbase_eb_jina-embeddings-v2-base-en",
534-
},
535-
});
528+
// await prisma.bot.updateMany({
529+
// where: {
530+
// embedding: "jina-api",
531+
// },
532+
// data: {
533+
// embedding: "dialoqbase_eb_jina-embeddings-v2-base-en",
534+
// },
535+
// });
536536

537-
await prisma.bot.updateMany({
538-
where: {
539-
embedding: "jina",
540-
},
541-
data: {
542-
embedding: "dialoqbase_eb_Xenova/jina-embeddings-v2-small-en",
543-
},
544-
});
537+
// await prisma.bot.updateMany({
538+
// where: {
539+
// embedding: "jina",
540+
// },
541+
// data: {
542+
// embedding: "dialoqbase_eb_Xenova/jina-embeddings-v2-small-en",
543+
// },
544+
// });
545545

546-
await prisma.bot.updateMany({
547-
where: {
548-
embedding: "google",
549-
},
550-
data: {
551-
embedding: "dialoqbase_eb_embedding-001",
552-
},
553-
});
554-
};
546+
// await prisma.bot.updateMany({
547+
// where: {
548+
// embedding: "google",
549+
// },
550+
// data: {
551+
// embedding: "dialoqbase_eb_embedding-001",
552+
// },
553+
// });
554+
// };
555555

556556
const updateGeminiStreamingToTrue = async () => {
557557
await prisma.dialoqbaseModels.update({
@@ -567,7 +567,7 @@ const updateGeminiStreamingToTrue = async () => {
567567
const main = async () => {
568568
await newModels();
569569
await removeTensorflowSupport();
570-
await replaceOldEmbeddings();
570+
// await replaceOldEmbeddings();
571571
await updateGeminiStreamingToTrue();
572572
};
573573

server/src/handlers/api/v1/admin/type.ts

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ export type UpdateDialoqbaseSettingsRequest = {
33
noOfBotsPerUser: number;
44
allowUserToCreateBots: boolean;
55
allowUserToRegister: boolean;
6+
usePuppeteerFetch: boolean;
67
};
78
};
89

server/src/loader/web.ts

+17-3
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,37 @@
11
import { BaseDocumentLoader } from "langchain/document_loaders/base";
22
import { Document } from "langchain/document";
33
import { websiteParser } from "../utils/website-parser";
4+
import puppeteerFetch, { closePuppeteer } from "../utils/puppeteer-fetch";
45

56
export interface WebLoaderParams {
67
url: string;
8+
usePuppeteerFetch?: boolean;
9+
doNotClosePuppeteer?: boolean;
710
}
811

912
export class DialoqbaseWebLoader
1013
extends BaseDocumentLoader
11-
implements WebLoaderParams
12-
{
14+
implements WebLoaderParams {
1315
url: string;
16+
usePuppeteerFetch?: boolean;
17+
doNotClosePuppeteer?: boolean;
1418

15-
constructor({ url }: WebLoaderParams) {
19+
constructor({ url, usePuppeteerFetch, doNotClosePuppeteer }: WebLoaderParams) {
1620
super();
1721
this.url = url;
22+
this.usePuppeteerFetch = usePuppeteerFetch;
23+
this.doNotClosePuppeteer = doNotClosePuppeteer;
1824
}
1925

2026
async _fetchHTML(): Promise<string> {
27+
if (this.usePuppeteerFetch) {
28+
console.log(`[DialoqbaseWebLoader] Using puppeteer to fetch ${this.url}`)
29+
const response = await puppeteerFetch(this.url, true);
30+
if (!this.doNotClosePuppeteer) {
31+
await closePuppeteer();
32+
}
33+
return response;
34+
}
2135
const response = await fetch(this.url);
2236
return await response.text();
2337
}

server/src/queue/controllers/crawl.controller.ts

+6-1
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@ import { PrismaClient } from "@prisma/client";
22
import { QSource } from "../type";
33
import { crawl } from "../../utils/crawl";
44
import { websiteQueueController } from "./website.controller";
5+
import { closePuppeteer } from "../../utils/puppeteer-fetch";
56
const prisma = new PrismaClient();
67

78
export const crawlQueueController = async (source: QSource) => {
89
let maxDepth = source.maxDepth || 1;
910
let maxLinks = source.maxLinks || 1;
10-
const data = await crawl(source.content!, maxDepth, maxLinks);
11+
const data = await crawl(source.content!, maxDepth, maxLinks, source.usePuppeteerFetch);
1112
const links = Array.from(data?.links || []);
1213

1314
for (const link of links) {
@@ -27,6 +28,8 @@ export const crawlQueueController = async (source: QSource) => {
2728
embedding: source.embedding,
2829
chunkOverlap: source.chunkOverlap,
2930
chunkSize: source.chunkSize,
31+
usePuppeteerFetch: source.usePuppeteerFetch,
32+
doNotClosePuppeteer: true,
3033
},
3134
prisma
3235
);
@@ -41,4 +44,6 @@ export const crawlQueueController = async (source: QSource) => {
4144
},
4245
});
4346
}
47+
48+
await closePuppeteer()
4449
};

server/src/queue/controllers/website.controller.ts

+10-3
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,16 @@ export const websiteQueueController = async (
1414
source: QSource,
1515
prisma: PrismaClient
1616
) => {
17-
const response = await axios.get(source.content!);
1817

19-
const type = response.headers["content-type"];
18+
let type = "text/html";
19+
20+
try {
21+
const response = await axios.get(source.content!);
22+
type = response.headers["content-type"];
23+
} catch (error) {
24+
console.error(`[websiteQueueController] Error fetching ${source.content}`);
25+
}
2026

21-
console.log("website type is", type);
2227

2328
if (type.includes("application/pdf")) {
2429
const response = await axios.get(source.content!, {
@@ -68,6 +73,8 @@ export const websiteQueueController = async (
6873
} else {
6974
const loader = new DialoqbaseWebLoader({
7075
url: source.content!,
76+
usePuppeteerFetch: source.usePuppeteerFetch,
77+
doNotClosePuppeteer: source.doNotClosePuppeteer,
7178
});
7279
docs = await loader.load();
7380
}

server/src/queue/index.ts

+2-1
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,10 @@ export default async function queueHandler(job: SandboxedJob) {
3434
status: "PROCESSING",
3535
},
3636
});
37-
const { chunkOverlap, chunkSize } = await getRagSettings(prisma);
37+
const { chunkOverlap, chunkSize , usePuppeteerFetch} = await getRagSettings(prisma);
3838
source.chunkOverlap = chunkOverlap;
3939
source.chunkSize = chunkSize;
40+
source.usePuppeteerFetch = usePuppeteerFetch;
4041
switch (source.type.toLowerCase()) {
4142
case "website":
4243
await websiteQueueController(source, prisma);

server/src/queue/type.ts

+2
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,6 @@ export interface QSource extends BotSource {
66
maxLinks?: number;
77
chunkSize: number;
88
chunkOverlap: number;
9+
usePuppeteerFetch?: boolean;
10+
doNotClosePuppeteer?: boolean;
911
}

0 commit comments

Comments
 (0)