Commit 50af5cf

Feedback to first version. (#4)
Add automatic proxy setting. Make input more user friendly.
1 parent 810157c · commit 50af5cf

3 files changed (+17 −15 lines changed)

.actor/input_schema.json

Lines changed: 3 additions & 10 deletions
```diff
@@ -8,28 +8,21 @@
       "type": "array",
       "description": "Product categories that will be included. (If no categories are explicitly specified, then it will consider all categories.)",
       "prefill": ["Herramientas"],
-      "editor": "json"
+      "editor": "stringList"
     },
     "include_keywords": {
       "title": "Included keywords (or terms) for products",
       "type": "array",
       "description": "Only products that match at least one of the terms will be included in search. Term match is used on all scraped attributes. (If no terms are specified, then all products are selected.)",
       "prefill": ["Taladro", "Soldadora"],
-      "editor": "json"
+      "editor": "stringList"
     },
     "exclude_keywords": {
       "title": "Excluded keywords (or terms) for products",
       "type": "array",
       "description": "Products that match any of the excluded terms on any of the product details will be filtered out, even if they match an include keyword. (If no terms are specified, then no products are excluded this way.)",
       "prefill": ["Stanley"],
-      "editor": "json"
-    },
-    "max_requests_per_crawl": {
-      "title": "Maximum requests per crawl",
-      "type": "integer",
-      "description": "Maximum number of HTTP requests that the crawler will send. (Set a small value during actor testing.)",
-      "default": 10,
-      "editor": "number"
+      "editor": "stringList"
     }
   }
 }
```
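With the `stringList` editor, each of these arrays is edited as a plain list of strings in the Apify console instead of raw JSON. A minimal sketch of an input the updated schema accepts (values mirror the prefills above; the variable name is illustrative):

```python
# Sketch of an actor input matching the updated schema (illustrative values).
sample_input = {
    'desired_categories': ['Herramientas'],
    'include_keywords': ['Taladro', 'Soldadora'],
    'exclude_keywords': ['Stanley'],
    # 'max_requests_per_crawl' is gone: this commit removes it from the schema.
}
```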

README.md

Lines changed: 11 additions & 1 deletion
```diff
@@ -2,4 +2,14 @@
 
 Very simple Apify based product scraper for https://somosrex.com/
 
-Inputs to the actor are described in: ./.actor/input_schema.json
+**Inputs to the actor**:
+
+- **Desired categories**: List of product categories that will be used to search for products. Keeping this list empty will search all categories. (Not case-sensitive.)
+- **Include keywords**: The scraper will search all product fields for any of the keywords. If at least one match is found, the product is included. Keeping this list empty will include all products. (Not case-sensitive.)
+- **Exclude keywords**: The scraper will search all product fields for any of the keywords. If at least one match is found, the product is excluded. Keeping this list empty will not exclude any products. (Not case-sensitive.)
+
+If a product contains both one of the include keywords and one of the exclude keywords, it is excluded.
+(Simple order of evaluation: find products from the desired categories, then include a product if it contains any include keyword and does not contain any exclude keyword.)
+
+**Source code**: https://github.com/Pijukatel/actor-RexScraper
+Please report any issues or improvement proposals on GitHub.
```
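The evaluation order described in the new README text can be sketched as a small helper. `is_included` is illustrative, not a function from this commit, though `ProductDetails` matches the alias in src/main.py:

```python
ProductDetails = dict[str, str]


def is_included(product: ProductDetails, include_keywords: set[str], exclude_keywords: set[str]) -> bool:
    """Illustrative include/exclude filter; keyword sets are assumed lowercased."""
    # Keywords are matched case-insensitively against every scraped attribute.
    text = ' '.join(product.values()).lower()
    # An exclude match always wins, even when an include keyword also matches.
    if any(keyword in text for keyword in exclude_keywords):
        return False
    # An empty include list means every product is included.
    return not include_keywords or any(keyword in text for keyword in include_keywords)
```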

src/main.py

Lines changed: 3 additions & 4 deletions
```diff
@@ -4,7 +4,7 @@
 
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 
-from apify import Actor
+from apify import Actor, ProxyConfiguration
 
 ProductDetails = dict[str, str]
 
@@ -70,13 +70,12 @@ async def main() -> None:
     """Main entry point for RexScraper."""
     async with Actor:
         actor_input = await Actor.get_input() or {}
-        max_requests_per_crawl = actor_input.get('max_requests_per_crawl', 30)
         desired_categories = {category.lower() for category in actor_input.get('desired_categories', [])}
         include_keywords = {word.lower() for word in actor_input.get('include_keywords', [])}
         exclude_keywords = {word.lower() for word in actor_input.get('exclude_keywords', [])}
-        Actor.log.info(f'{desired_categories=}, {include_keywords=},{exclude_keywords=},{max_requests_per_crawl=}')
+        Actor.log.info(f'{desired_categories=}, {include_keywords=},{exclude_keywords=}')
 
-        crawler = BeautifulSoupCrawler(max_requests_per_crawl=max_requests_per_crawl, _logger=Actor.log)
+        crawler = BeautifulSoupCrawler(_logger=Actor.log, proxy_configuration=ProxyConfiguration())
 
         @crawler.router.default_handler
         async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
```
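The "automatic proxy setting" from the commit message is the no-argument `ProxyConfiguration()` passed to the crawler. A minimal, self-contained sketch of that wiring, assuming the actor runs on the Apify platform (where a default `ProxyConfiguration` picks up the platform's proxy settings) and using an illustrative start URL:

```python
import asyncio

from apify import Actor, ProxyConfiguration
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler


async def main() -> None:
    async with Actor:
        # No-argument ProxyConfiguration: the platform supplies the proxy
        # credentials, so no explicit proxy URLs are configured here.
        crawler = BeautifulSoupCrawler(
            _logger=Actor.log,
            proxy_configuration=ProxyConfiguration(),
        )
        await crawler.run(['https://somosrex.com/'])  # Illustrative entry point.


if __name__ == '__main__':
    asyncio.run(main())
```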
