Skip to content

Commit

Permalink
Added downloadPdf call inside enqueueLinks
Browse files Browse the repository at this point in the history
  • Loading branch information
star-nox committed Jan 8, 2024
1 parent 6565d10 commit 536f180
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 5 deletions.
4 changes: 2 additions & 2 deletions config.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { Config } from "./src/config";

export const defaultConfig: Config = {
url: "https://extensionpubs.unl.edu/publication/9000016363639/windbreak-establishment/",
url: "https://www.builder.io/c/docs/developers",
match: "",
maxPagesToCrawl: 100,
maxPagesToCrawl: 50,
outputFileName: "output.json",
};
Binary file removed pdfs/ec711.pdf
Binary file not shown.
22 changes: 19 additions & 3 deletions src/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,15 @@ function downloadPdf(url: string) {
}

export function getPageHtml(page: Page, selector = "body") {

return page.evaluate((selector) => {
console.log(`Getting page HTML...`);
// Exclude header, footer, nav from scraping
const elementsToExclude = document.querySelectorAll('header, footer, nav');
elementsToExclude.forEach(element => element.remove());
// Check if the selector is an XPath
if (selector.startsWith("/")) {
console.log(`XPath: ${selector}`);
const elements = document.evaluate(
selector,
document,
Expand All @@ -45,6 +51,7 @@ export function getPageHtml(page: Page, selector = "body") {
return result ? result.textContent || "" : "";
} else {
// Handle as a CSS selector
console.log(`Selector: ${selector}`);
const el = document.querySelector(selector) as HTMLElement | null;
return el?.innerText || "";
}
Expand All @@ -70,8 +77,7 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {

export async function crawl(config: Config) {
configSchema.parse(config);
console.log(config)


if (config.url){
if (config.url.endsWith('.pdf')) {
console.log(`Downloading PDF: ${config.url}`);
Expand Down Expand Up @@ -105,8 +111,9 @@ export async function crawl(config: Config) {
});
}
}

page.on('console', message => console.log(`Page log: ${message.text()}`));
const html = await getPageHtml(page, config.selector);
//console.log(html);

// Save results as JSON to ./storage/datasets/default
await pushData({ title, url: request.loadedUrl, html });
Expand All @@ -120,6 +127,15 @@ export async function crawl(config: Config) {
await enqueueLinks({
globs:
typeof config.match === "string" ? [config.match] : config.match,
transformRequestFunction(req) {
// ignore all links ending with `.pdf`
if (req.url.endsWith('.pdf')) {
console.log(`Downloading PDF: ${req.url}`);
downloadPdf(req.url);
return false;
}
return req;
},
});
},
// Comment this option to scrape the full website.
Expand Down

0 comments on commit 536f180

Please sign in to comment.