Skip to content

Commit 9d868b5

Browse files
committed
feat: add markdown for smartscraper
1 parent 76f446f commit 9d868b5

File tree

9 files changed

+848
-12
lines changed

9 files changed

+848
-12
lines changed
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
import { smartScraper } from '../../index.js';
2+
import 'dotenv/config';
3+
4+
/**
5+
* Example: Using SmartScraper with HTML content
6+
*
7+
* This example demonstrates how to use the SmartScraper with local HTML content
8+
* instead of fetching from a URL. This is useful when you already have HTML
9+
* content (e.g., from another source) and want to extract structured data from it.
10+
*/
11+
12+
const apiKey = process.env.SGAI_APIKEY;
13+
14+
if (!apiKey) {
15+
console.error('❌ Error: SGAI_APIKEY environment variable is not set');
16+
console.log('💡 Please set your API key: export SGAI_APIKEY="your-api-key"');
17+
process.exit(1);
18+
}
19+
20+
// Sample HTML content (e.g., from a file or API response)
21+
const htmlContent = `
22+
<!DOCTYPE html>
23+
<html lang="en">
24+
<head>
25+
<meta charset="UTF-8">
26+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
27+
<title>Product Catalog</title>
28+
</head>
29+
<body>
30+
<div class="container">
31+
<h1>Product Catalog</h1>
32+
33+
<div class="product" data-id="1">
34+
<h2>Laptop Pro 15</h2>
35+
<div class="brand">TechCorp</div>
36+
<div class="price">$1,299.99</div>
37+
<div class="rating">4.5/5</div>
38+
<div class="stock">In Stock</div>
39+
<p class="description">High-performance laptop with 15-inch display, 16GB RAM, and 512GB SSD</p>
40+
</div>
41+
42+
<div class="product" data-id="2">
43+
<h2>Wireless Mouse Elite</h2>
44+
<div class="brand">PeripheralCo</div>
45+
<div class="price">$29.99</div>
46+
<div class="rating">4.8/5</div>
47+
<div class="stock">In Stock</div>
48+
<p class="description">Ergonomic wireless mouse with precision tracking</p>
49+
</div>
50+
51+
<div class="product" data-id="3">
52+
<h2>USB-C Hub Pro</h2>
53+
<div class="brand">ConnectTech</div>
54+
<div class="price">$49.99</div>
55+
<div class="rating">4.3/5</div>
56+
<div class="stock">Out of Stock</div>
57+
<p class="description">7-in-1 USB-C hub with HDMI, USB 3.0, and SD card reader</p>
58+
</div>
59+
60+
<div class="reviews">
61+
<h2>Customer Reviews</h2>
62+
<div class="review">
63+
<p class="text">"The Laptop Pro 15 is amazing! Fast and reliable."</p>
64+
<p class="author">- John D.</p>
65+
</div>
66+
<div class="review">
67+
<p class="text">"Great mouse, very comfortable for long work sessions."</p>
68+
<p class="author">- Sarah M.</p>
69+
</div>
70+
</div>
71+
72+
<div class="shipping-info">
73+
<h2>Shipping Information</h2>
74+
<p>Free shipping on orders over $50. Standard delivery takes 3-5 business days.</p>
75+
</div>
76+
</div>
77+
</body>
78+
</html>
79+
`;
80+
81+
/**
 * Runs the SmartScraper HTML example end to end.
 *
 * Prints a banner and the size of `htmlContent` in KB, then calls
 * `smartScraper` with a null URL and `htmlContent` in the `websiteHtml`
 * position, and pretty-prints the result as JSON.
 *
 * On any error the message (and the API response body, when the error
 * carries one) is written to stderr and the process exits with code 1.
 *
 * @returns {Promise<void>}
 */
async function runExample() {
  const divider = '='.repeat(60);

  console.log('🚀 SmartScraper HTML Example');
  console.log(divider);
  console.log('');

  try {
    console.log('📄 Processing HTML content...');
    const sizeInKb = Buffer.byteLength(htmlContent, 'utf8') / 1024;
    console.log(`📏 Content size: ${sizeInKb.toFixed(2)} KB`);
    console.log('');

    const prompt = 'Extract all products with their names, brands, prices, ratings, and stock status';

    console.log('🔍 Prompt:', prompt);
    console.log('⏳ Sending request to ScrapeGraph AI...');
    console.log('');

    // Positional arguments in the order smartScraper declares them.
    const scraperArgs = [
      apiKey,
      null, // url: must be null because HTML is supplied directly
      prompt,
      null, // schema (optional)
      null, // numberOfScrolls (not applicable for local HTML)
      null, // totalPages (not applicable for local HTML)
      null, // cookies (not applicable for local HTML)
      {}, // options
      false, // plain_text
      false, // renderHeavyJs (not applicable for local HTML)
      false, // stealth (not applicable for local HTML)
      htmlContent, // websiteHtml
      null, // websiteMarkdown
    ];
    const extracted = await smartScraper(...scraperArgs);

    console.log('✅ Success! Extraction completed.');
    console.log('');
    console.log('📊 Extracted Data:');
    console.log(divider);
    console.log(JSON.stringify(extracted, null, 2));
    console.log(divider);
  } catch (error) {
    console.error('❌ Error:', error.message);

    // Only errors that carry a `response` have a body worth printing.
    const apiResponse = error.response;
    if (apiResponse) {
      console.error('API Response:', apiResponse.data);
    }

    process.exit(1);
  }
}
128+
129+
console.log('💡 This example demonstrates:');
130+
console.log(' - Processing local HTML content');
131+
console.log(' - Extracting structured data from HTML');
132+
console.log(' - Using null for URL parameter when using HTML');
133+
console.log(' - Content size validation (max 2MB)');
134+
console.log('');
135+
136+
runExample();
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import { smartScraper } from '../../index.js';
2+
import 'dotenv/config';
3+
4+
/**
5+
* Example: Using SmartScraper with Markdown content
6+
*
7+
* This example demonstrates how to use the SmartScraper with local Markdown content
8+
* instead of fetching from a URL. This is useful when you already have markdown
9+
* content and want to extract structured data from it.
10+
*/
11+
12+
const apiKey = process.env.SGAI_APIKEY;
13+
14+
if (!apiKey) {
15+
console.error('❌ Error: SGAI_APIKEY environment variable is not set');
16+
console.log('💡 Please set your API key: export SGAI_APIKEY="your-api-key"');
17+
process.exit(1);
18+
}
19+
20+
// Sample markdown content (e.g., from a file or API response)
21+
const markdownContent = `
22+
# Product Catalog
23+
24+
## Featured Products
25+
26+
### Laptop Pro 15
27+
- **Brand**: TechCorp
28+
- **Price**: $1,299.99
29+
- **Rating**: 4.5/5
30+
- **In Stock**: Yes
31+
- **Description**: High-performance laptop with 15-inch display, 16GB RAM, and 512GB SSD
32+
33+
### Wireless Mouse Elite
34+
- **Brand**: PeripheralCo
35+
- **Price**: $29.99
36+
- **Rating**: 4.8/5
37+
- **In Stock**: Yes
38+
- **Description**: Ergonomic wireless mouse with precision tracking
39+
40+
### USB-C Hub Pro
41+
- **Brand**: ConnectTech
42+
- **Price**: $49.99
43+
- **Rating**: 4.3/5
44+
- **In Stock**: No
45+
- **Description**: 7-in-1 USB-C hub with HDMI, USB 3.0, and SD card reader
46+
47+
## Customer Reviews
48+
49+
> "The Laptop Pro 15 is amazing! Fast and reliable." - John D.
50+
51+
> "Great mouse, very comfortable for long work sessions." - Sarah M.
52+
53+
## Shipping Information
54+
55+
Free shipping on orders over $50. Standard delivery takes 3-5 business days.
56+
`;
57+
58+
/**
 * Runs the SmartScraper Markdown example end to end.
 *
 * Prints a banner and the size of `markdownContent` in KB, then calls
 * `smartScraper` with a null URL and `markdownContent` in the
 * `websiteMarkdown` position, and pretty-prints the result as JSON.
 *
 * On any error the message (and the API response body, when the error
 * carries one) is written to stderr and the process exits with code 1.
 *
 * @returns {Promise<void>}
 */
async function runExample() {
  const divider = '='.repeat(60);

  console.log('🚀 SmartScraper Markdown Example');
  console.log(divider);
  console.log('');

  try {
    console.log('📝 Processing Markdown content...');
    const sizeInKb = Buffer.byteLength(markdownContent, 'utf8') / 1024;
    console.log(`📏 Content size: ${sizeInKb.toFixed(2)} KB`);
    console.log('');

    const prompt = 'Extract all products with their names, brands, prices, ratings, and stock status';

    console.log('🔍 Prompt:', prompt);
    console.log('⏳ Sending request to ScrapeGraph AI...');
    console.log('');

    // Positional arguments in the order smartScraper declares them.
    const scraperArgs = [
      apiKey,
      null, // url: must be null because markdown is supplied directly
      prompt,
      null, // schema (optional)
      null, // numberOfScrolls (not applicable for markdown)
      null, // totalPages (not applicable for markdown)
      null, // cookies (not applicable for markdown)
      {}, // options
      false, // plain_text
      false, // renderHeavyJs (not applicable for markdown)
      false, // stealth (not applicable for markdown)
      null, // websiteHtml
      markdownContent, // websiteMarkdown
    ];
    const extracted = await smartScraper(...scraperArgs);

    console.log('✅ Success! Extraction completed.');
    console.log('');
    console.log('📊 Extracted Data:');
    console.log(divider);
    console.log(JSON.stringify(extracted, null, 2));
    console.log(divider);
  } catch (error) {
    console.error('❌ Error:', error.message);

    // Only errors that carry a `response` have a body worth printing.
    const apiResponse = error.response;
    if (apiResponse) {
      console.error('API Response:', apiResponse.data);
    }

    process.exit(1);
  }
}
105+
106+
console.log('💡 This example demonstrates:');
107+
console.log(' - Processing local Markdown content');
108+
console.log(' - Extracting structured data from markdown');
109+
console.log(' - Using null for URL parameter when using markdown');
110+
console.log(' - Content size validation (max 2MB)');
111+
console.log('');
112+
113+
runExample();

scrapegraph-js/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "scrapegraph-js",
33
"author": "ScrapeGraphAI",
4-
"version": "0.2.0",
4+
"version": "0.2.1",
55
"description": "Scrape and extract structured data from a webpage using ScrapeGraphAI's APIs. Supports cookies for authentication, infinite scrolling, and pagination.",
66
"repository": {
77
"type": "git",

scrapegraph-js/src/smartScraper.js

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,24 +8,55 @@ import { getMockResponse, createMockAxiosResponse } from './utils/mockResponse.j
88
/**
99
* Scrape and extract structured data from a webpage using ScrapeGraph AI.
1010
*
11+
* Supports three types of input (must provide exactly one):
12+
* - url: Scrape from a URL
13+
* - websiteHtml: Process local HTML content
14+
* - websiteMarkdown: Process local Markdown content
15+
*
1116
* @param {string} apiKey - Your ScrapeGraph AI API key
12-
* @param {string} url - The URL of the webpage to scrape
17+
* @param {string} url - The URL of the webpage to scrape (can be null if using websiteHtml or websiteMarkdown)
1318
* @param {string} prompt - Natural language prompt describing what data to extract
1419
* @param {Object} [schema] - Optional schema object defining the output structure
1520
* @param {number} [numberOfScrolls] - Optional number of times to scroll the page (0-100). If not provided, no scrolling will be performed.
1621
* @param {number} [totalPages] - Optional number of pages to scrape (1-10). If not provided, only the first page will be scraped.
1722
* @param {Object} [cookies] - Optional cookies object for authentication and session management
23+
* @param {Object} [options] - Optional configuration object
24+
* @param {boolean} [plain_text] - Optional flag to return plain text instead of structured data
1825
* @param {boolean} [renderHeavyJs] - Optional flag to enable heavy JavaScript rendering on the page
1926
* @param {boolean} [stealth] - Optional flag to enable stealth mode to avoid bot detection
27+
* @param {string} [websiteHtml] - Optional raw HTML content to process (max 2MB, mutually exclusive with url and websiteMarkdown)
28+
* @param {string} [websiteMarkdown] - Optional Markdown content to process (max 2MB, mutually exclusive with url and websiteHtml)
2029
* @returns {Promise<string>} Extracted data in JSON format matching the provided schema
21-
* @throws - Will throw an error in case of an HTTP failure.
30+
* @throws - Will throw an error in case of an HTTP failure or validation error.
2231
*/
23-
export async function smartScraper(apiKey, url, prompt, schema = null, numberOfScrolls = null, totalPages = null, cookies = null, options = {}, plain_text = false, renderHeavyJs = false, stealth = false) {
32+
export async function smartScraper(apiKey, url, prompt, schema = null, numberOfScrolls = null, totalPages = null, cookies = null, options = {}, plain_text = false, renderHeavyJs = false, stealth = false, websiteHtml = null, websiteMarkdown = null) {
2433
const { mock = null } = options;
2534

35+
// Validate that exactly one of url, websiteHtml, or websiteMarkdown is provided
36+
const inputsProvided = [url, websiteHtml, websiteMarkdown].filter(input => input !== null && input !== undefined).length;
37+
38+
if (inputsProvided === 0) {
39+
throw new Error('Exactly one of url, websiteHtml, or websiteMarkdown must be provided');
40+
}
41+
42+
if (inputsProvided > 1) {
43+
throw new Error('Only one of url, websiteHtml, or websiteMarkdown can be provided');
44+
}
45+
46+
// Validate content size for HTML and Markdown (max 2MB)
47+
const MAX_SIZE = 2 * 1024 * 1024; // 2MB in bytes
48+
49+
if (websiteHtml && Buffer.byteLength(websiteHtml, 'utf8') > MAX_SIZE) {
50+
throw new Error('websiteHtml content exceeds maximum size of 2MB');
51+
}
52+
53+
if (websiteMarkdown && Buffer.byteLength(websiteMarkdown, 'utf8') > MAX_SIZE) {
54+
throw new Error('websiteMarkdown content exceeds maximum size of 2MB');
55+
}
56+
2657
// Check if mock mode is enabled
2758
const useMock = mock !== null ? mock : isMockEnabled();
28-
59+
2960
if (useMock) {
3061
console.log('🧪 Mock mode active. Returning stub for smartScraper request');
3162
const mockConfig = getMockConfig();
@@ -41,11 +72,19 @@ export async function smartScraper(apiKey, url, prompt, schema = null, numberOfS
4172
};
4273

4374
const payload = {
44-
website_url: url,
4575
user_prompt: prompt,
4676
plain_text: plain_text,
4777
};
4878

79+
// Add the appropriate input source to the payload
80+
if (url) {
81+
payload.website_url = url;
82+
} else if (websiteHtml) {
83+
payload.website_html = websiteHtml;
84+
} else if (websiteMarkdown) {
85+
payload.website_markdown = websiteMarkdown;
86+
}
87+
4988
if (renderHeavyJs) {
5089
payload.render_heavy_js = renderHeavyJs;
5190
}

0 commit comments

Comments
 (0)