-
Notifications
You must be signed in to change notification settings - Fork 0
/
webCrawler.ts
31 lines (27 loc) · 1.15 KB
/
webCrawler.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import * as puppeteer from "puppeteer";
/**
 * Strips every character that is not a letter, digit, or whitespace.
 *
 * @param input - String to sanitize.
 * @returns The input with all non-alphanumeric, non-whitespace characters removed.
 */
function removeNonAlphanumeric(input: string): string {
    return input.replace(/[^a-zA-Z0-9\s]/g, '');
}
// In the future this can be extended to crawl every page in the website hierarchy; for now it crawls a single page.
/**
 * Crawls a single page and returns its visible text, sanitized line by line.
 *
 * Launches a headless browser, navigates to `websiteUrl`, selects the whole
 * document and reads the selection back as plain text (approximating "the
 * text a user could copy from the page"), then splits that text into
 * trimmed, alphanumeric-only lines.
 *
 * @param websiteUrl - Absolute URL of the page to crawl.
 * @returns One `{ website, data }` record per non-empty line of page text,
 *          or `undefined` when no text could be extracted.
 * @throws Propagates puppeteer launch/navigation errors; the browser is
 *         always closed before the error escapes.
 */
async function crawlWebsite(websiteUrl: string): Promise<{ website: string; data: string }[] | undefined> {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto(websiteUrl);
        // Select the entire document and serialize the selection to text.
        const textContent = await page.$eval('*', (el) => {
            const selection = window.getSelection();
            const range = document.createRange();
            range.selectNode(el);
            selection?.removeAllRanges();
            selection?.addRange(range);
            return window.getSelection()?.toString();
        });
        // Attribute every line to the site root rather than the full page URL.
        // NOTE(review): the original computed this as split(".com")[0] + ".com",
        // which only worked for .com domains; URL.origin handles any TLD.
        const siteRoot = new URL(websiteUrl).origin;
        // Sanitize the data:
        // 1. Drop empty lines, trim the rest, strip non-alphanumeric characters.
        const result = textContent
            ?.split("\n")
            .filter(str => str.trim() !== "")
            .map(str => removeNonAlphanumeric(str.trim()));
        return result?.map(str => ({ "website": siteRoot, "data": str }));
    } finally {
        // Always release the browser, even when navigation/extraction throws
        // (the original leaked it on error and never awaited close()).
        await browser.close();
    }
}
export {crawlWebsite}