Commit 50af5cf

Feedback to first version. (#4)
Add automatic proxy setting. Make input more user friendly.
1 parent 810157c · commit 50af5cf

3 files changed (+17 −15 lines changed)

.actor/input_schema.json

Lines changed: 3 additions & 10 deletions
```diff
@@ -8,28 +8,21 @@
       "type": "array",
       "description": "Product categories that will be included. (If no categories are explicitly specified, then it will consider all categories.)",
       "prefill": ["Herramientas"],
-      "editor": "json"
+      "editor": "stringList"
     },
     "include_keywords": {
       "title": "Included keywords (or terms) for products",
       "type": "array",
       "description": "Only products that match at least one of the terms will be included in search. Term match is used on all scraped attributes. (If no terms are specified, then all products are selected.)",
       "prefill": ["Taladro", "Soldadora"],
-      "editor": "json"
+      "editor": "stringList"
     },
     "exclude_keywords": {
       "title": "Excluded keywords (or terms) for products",
       "type": "array",
       "description": "Products that match any of the excluded terms on any of the product details will be filtered out, even if they match an include keyword. (If no terms are specified, then no products are excluded this way.)",
       "prefill": ["Stanley"],
-      "editor": "json"
-    },
-    "max_requests_per_crawl": {
-      "title": "Maximum requests per crawl",
-      "type": "integer",
-      "description": "Maximum number of HTTP requests that the crawler will send. (Set a small value during actor testing.)",
-      "default": 10,
-      "editor": "number"
+      "editor": "stringList"
     }
   }
 }
```
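With the `stringList` editor, each of these arrays is edited as a plain list of strings in the Apify console instead of raw JSON. A minimal sketch of an input the updated schema accepts (values mirror the prefills above; the variable name is illustrative):

```python
# Sketch of an actor input matching the updated schema (illustrative values).
sample_input = {
    'desired_categories': ['Herramientas'],
    'include_keywords': ['Taladro', 'Soldadora'],
    'exclude_keywords': ['Stanley'],
    # 'max_requests_per_crawl' is gone: this commit removes it from the schema.
}
```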

README.md

Lines changed: 11 additions & 1 deletion
```diff
@@ -2,4 +2,14 @@
 
 Very simple Apify based product scraper for https://somosrex.com/
 
-Inputs to the actor are described in: ./.actor/input_schema.json
+**Inputs to the actor**:
+
+- **Desired categories**: List of product categories that will be used to search for products. Keeping this list empty will search all categories. (Not case-sensitive.)
+- **Include keywords**: The scraper will search all product fields for any of the keywords. If at least one match is found, the product is included. Keeping this list empty will include all products. (Not case-sensitive.)
+- **Exclude keywords**: The scraper will search all product fields for any of the keywords. If at least one match is found, the product is excluded. Keeping this list empty will not exclude any products. (Not case-sensitive.)
+
+If a product contains both one of the include keywords and one of the exclude keywords, it is excluded.
+(Simple order of evaluation: find products from the desired categories, then include a product if it contains any include keyword and does not contain any exclude keyword.)
+
+**Source code**: https://github.com/Pijukatel/actor-RexScraper
+Please report any issues or improvement proposals on GitHub.
```
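The evaluation order described in the new README text can be sketched as a small helper. `is_included` is illustrative, not a function from this commit, though `ProductDetails` matches the alias in src/main.py:

```python
ProductDetails = dict[str, str]


def is_included(product: ProductDetails, include_keywords: set[str], exclude_keywords: set[str]) -> bool:
    """Illustrative include/exclude filter; keyword sets are assumed lowercased."""
    # Keywords are matched case-insensitively against every scraped attribute.
    text = ' '.join(product.values()).lower()
    # An exclude match always wins, even when an include keyword also matches.
    if any(keyword in text for keyword in exclude_keywords):
        return False
    # An empty include list means every product is included.
    return not include_keywords or any(keyword in text for keyword in include_keywords)
```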

src/main.py

Lines changed: 3 additions & 4 deletions
```diff
@@ -4,7 +4,7 @@
 
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 
-from apify import Actor
+from apify import Actor, ProxyConfiguration
 
 ProductDetails = dict[str, str]
 
@@ -70,13 +70,12 @@ async def main() -> None:
     """Main entry point for RexScraper."""
     async with Actor:
         actor_input = await Actor.get_input() or {}
-        max_requests_per_crawl = actor_input.get('max_requests_per_crawl', 30)
         desired_categories = {category.lower() for category in actor_input.get('desired_categories', [])}
         include_keywords = {word.lower() for word in actor_input.get('include_keywords', [])}
         exclude_keywords = {word.lower() for word in actor_input.get('exclude_keywords', [])}
-        Actor.log.info(f'{desired_categories=}, {include_keywords=},{exclude_keywords=},{max_requests_per_crawl=}')
+        Actor.log.info(f'{desired_categories=}, {include_keywords=},{exclude_keywords=}')
 
-        crawler = BeautifulSoupCrawler(max_requests_per_crawl=max_requests_per_crawl, _logger=Actor.log)
+        crawler = BeautifulSoupCrawler(_logger=Actor.log, proxy_configuration=ProxyConfiguration())
 
         @crawler.router.default_handler
         async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
```
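The "automatic proxy setting" from the commit message is the no-argument `ProxyConfiguration()` passed to the crawler. A minimal, self-contained sketch of that wiring, assuming the actor runs on the Apify platform (where a default `ProxyConfiguration` picks up the platform's proxy settings) and using an illustrative start URL:

```python
import asyncio

from apify import Actor, ProxyConfiguration
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler


async def main() -> None:
    async with Actor:
        # No-argument ProxyConfiguration: the platform supplies the proxy
        # credentials, so no explicit proxy URLs are configured here.
        crawler = BeautifulSoupCrawler(
            _logger=Actor.log,
            proxy_configuration=ProxyConfiguration(),
        )
        await crawler.run(['https://somosrex.com/'])  # Illustrative entry point.


if __name__ == '__main__':
    asyncio.run(main())
```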
