
Commit 2fad0b4

Feedback on the first version.
Add automatic proxy setting. Make input more user-friendly.

1 parent 810157c commit 2fad0b4

2 files changed: 6 additions & 14 deletions


.actor/input_schema.json

Lines changed: 3 additions & 10 deletions
@@ -8,28 +8,21 @@
        "type": "array",
        "description": "Product categories that will be included. (If no categories are explicitly specified, then all categories are considered.)",
        "prefill": ["Herramientas"],
-       "editor": "json"
+       "editor": "stringList"
      },
      "include_keywords": {
        "title": "Included keywords (or terms) for products",
        "type": "array",
        "description": "Only products that match at least one of the terms will be included in the search. Term matching is applied to all scraped attributes. (If no terms are specified, then all products are selected.)",
        "prefill": ["Taladro", "Soldadora"],
-       "editor": "json"
+       "editor": "stringList"
      },
      "exclude_keywords": {
        "title": "Excluded keywords (or terms) for products",
        "type": "array",
        "description": "Products that match any of the excluded terms on any of the product details will be filtered out, even if they match an include keyword. (If no terms are specified, then no products are excluded this way.)",
        "prefill": ["Stanley"],
-       "editor": "json"
-     },
-     "max_requests_per_crawl": {
-       "title": "Maximum requests per crawl",
-       "type": "integer",
-       "description": "Maximum number of HTTP requests that the crawler will send. (Set a small value during actor testing.)",
-       "default": 10,
-       "editor": "number"
+       "editor": "stringList"
      }
    }
  }
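Note: switching the editor from "json" to "stringList" changes only the UI widget for these array fields; the stored value should still arrive as a plain JSON array of strings, so the parsing in src/main.py keeps working unchanged. A minimal sketch of that assumption (example_input is a hypothetical stand-in for what Actor.get_input() returns):

# Hypothetical input as the platform would deliver it after the editor change.
example_input = {
    'desired_categories': ['Herramientas'],
    'include_keywords': ['Taladro', 'Soldadora'],
    'exclude_keywords': ['Stanley'],
}

# Same lowercasing the actor applies to each list:
include_keywords = {word.lower() for word in example_input.get('include_keywords', [])}
assert include_keywords == {'taladro', 'soldadora'}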

src/main.py

Lines changed: 3 additions & 4 deletions
@@ -4,7 +4,7 @@
 
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 
-from apify import Actor
+from apify import Actor, ProxyConfiguration
 
 ProductDetails = dict[str, str]

@@ -70,13 +70,12 @@ async def main() -> None:
     """Main entry point for RexScraper."""
     async with Actor:
         actor_input = await Actor.get_input() or {}
-        max_requests_per_crawl = actor_input.get('max_requests_per_crawl', 30)
         desired_categories = {category.lower() for category in actor_input.get('desired_categories', [])}
         include_keywords = {word.lower() for word in actor_input.get('include_keywords', [])}
         exclude_keywords = {word.lower() for word in actor_input.get('exclude_keywords', [])}
-        Actor.log.info(f'{desired_categories=}, {include_keywords=},{exclude_keywords=},{max_requests_per_crawl=}')
+        Actor.log.info(f'{desired_categories=}, {include_keywords=}, {exclude_keywords=}')

-        crawler = BeautifulSoupCrawler(max_requests_per_crawl=max_requests_per_crawl, _logger=Actor.log)
+        crawler = BeautifulSoupCrawler(_logger=Actor.log, proxy_configuration=ProxyConfiguration())

         @crawler.router.default_handler
         async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
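For context, a minimal sketch of how the new proxy wiring fits together, assuming the actor runs on the Apify platform (where, per this commit's intent, ProxyConfiguration() with no arguments falls back to the run's default proxy settings); the start URL and asyncio entry point are illustrative, not from the diff:

import asyncio

from apify import Actor, ProxyConfiguration
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler


async def main() -> None:
    async with Actor:
        # Assumption: a no-argument ProxyConfiguration picks up the proxy
        # settings available to the current actor run.
        crawler = BeautifulSoupCrawler(
            _logger=Actor.log,
            proxy_configuration=ProxyConfiguration(),
        )
        # Illustrative start URL; the real actor builds its own request list.
        await crawler.run(['https://example.com'])


if __name__ == '__main__':
    asyncio.run(main())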
