
Commit 53c9c76

Merge pull request #2 from Pijukatel/actor_draft
Simple scraping actor. Filtering options: included keywords, excluded keywords, category. Outputs product details and a single picture for each product that matches the search. Includes Apify deployment-relevant files in the .actor folder.
2 parents 7bc363f + dacf0c2 commit 53c9c76

8 files changed: +411 −2 lines

.actor/Dockerfile

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
FROM apify/actor-python:3.12

RUN rm -rf /usr/src/app/*
WORKDIR /usr/src/app

COPY pyproject.toml ./
COPY poetry.lock ./

RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing Poetry:" \
 && pip install --no-cache-dir poetry~=1.8 \
 && echo "Installing dependencies:" \
 && poetry config virtualenvs.create false \
 && poetry install --only main --no-interaction --no-ansi --no-root \
 && rm -rf /tmp/.poetry-cache \
 && echo "All installed Python packages:" \
 && pip freeze

COPY . ./

CMD ["python3", "-m", "src"]

.actor/actor.json

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
{
    "actorSpecification": 1,
    "name": "RexScraper",
    "version": "0.1",
    "buildTag": "latest",
    "environmentVariables": {}
}

.actor/input_schema.json

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
{
    "title": "RexScraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "desired_categories": {
            "title": "Categories to search",
            "type": "array",
            "description": "Product categories that will be included. (If no categories are explicitly specified, all categories are considered.)",
            "prefill": ["Herramientas"],
            "editor": "json"
        },
        "include_keywords": {
            "title": "Included keywords (or terms) for products",
            "type": "array",
            "description": "Only products that match at least one of the terms will be included in the search. Term matching is applied to all scraped attributes. (If no terms are specified, all products are selected.)",
            "prefill": ["Taladro", "Soldadora"],
            "editor": "json"
        },
        "exclude_keywords": {
            "title": "Excluded keywords (or terms) for products",
            "type": "array",
            "description": "Products that match any of the excluded terms in any of the product details will be filtered out, even if they match an include keyword. (If no terms are specified, no products are excluded this way.)",
            "prefill": ["Stanley"],
            "editor": "json"
        },
        "max_requests_per_crawl": {
            "title": "Maximum requests per crawl",
            "type": "integer",
            "description": "Maximum number of HTTP requests that the crawler will send. (Set a small value during actor testing.)",
            "default": 10,
            "editor": "number"
        }
    }
}
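
For illustration only, here is a sketch of the input that main() in src/main.py would receive via Actor.get_input() for this schema, written as a Python dict; the values are just the prefill examples from the schema, not real run data.

# Hypothetical sample input matching .actor/input_schema.json.
actor_input = {
    'desired_categories': ['Herramientas'],        # crawl only the matching top-level categories
    'include_keywords': ['Taladro', 'Soldadora'],  # keep products matching at least one term
    'exclude_keywords': ['Stanley'],               # drop products matching any of these terms
    'max_requests_per_crawl': 10,                  # upper bound on HTTP requests for the run
}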

README.md

Lines changed: 3 additions & 1 deletion
@@ -1,3 +1,5 @@
# actor-RexScraper

Very simple Apify-based product scraper for https://somosrex.com/

Inputs to the actor are described in: ./.actor/input_schema.json

poetry.lock

Lines changed: 222 additions & 1 deletion
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ readme = "README.md"
python = "^3.12"
setuptools = "^75.3.0"
apify = "^2.0.1"
+crawlee = {extras = ["beautifulsoup"], version = "^0.4.0"}


[tool.poetry.group.dev.dependencies]

src/__main__.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())
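
Usage note: since the Dockerfile's CMD is ["python3", "-m", "src"], this __main__.py is the process entry point inside the container; the same entry point can presumably be started locally with python -m src once the Poetry dependencies are installed.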

src/main.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
from __future__ import annotations

import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

from apify import Actor

ProductDetails = dict[str, str]


async def process_top_page(context: BeautifulSoupCrawlingContext, desired_categories: set[str]) -> None:
    """Enqueue links to category pages that are in the desired categories."""
    for category_element in context.soup.find_all('a', 'level-top'):
        category_selector = '.' + '.'.join(category_element['class'])
        category = category_element.text
        if not desired_categories or category.lower() in desired_categories:
            await context.enqueue_links(
                selector=category_selector,
                label=f'CATEGORY-{category}',
            )


async def process_category_page(context: BeautifulSoupCrawlingContext, category: str) -> None:
    """Enqueue product links and the pagination link to the next page of products of this category."""
    await context.enqueue_links(
        selector='.action.next',
        label=f'CATEGORY-{category}',
    )
    await context.enqueue_links(
        selector='.product-item-link',
        label=f'PRODUCT-{category}',
    )


def get_product_details(context: BeautifulSoupCrawlingContext, category: str) -> ProductDetails:
    """Scrape details of a specific product."""
    soup = context.soup
    details = {
        'sku': soup.find('span', {'itemprop': 'name'}).text,
        'category': category,
        'price': soup.find('div', 'product-info-price').find('span', 'price').text,
        'imageUrl': soup.select('.gallery-placeholder__image')[0]['src'].split('?')[0],
        'url': context.request.url,
        'description': soup.select('div.product.attribute.description > div.value')[0].text,
    }

    for detail_element in list(soup.select('.col.label')):
        detail_name = detail_element.text
        details[detail_name] = soup.find('td', {'data-th': detail_name}).text
    return details


def is_relevant(product_details: ProductDetails, include_keywords: set[str]) -> bool:
    """Return True if no include keywords are defined or if any of them match the product details."""
    return not include_keywords or product_includes_keyword(product_details, include_keywords)


def product_includes_keyword(product_details: ProductDetails, keywords: set[str]) -> bool:
    """Return True if any of the product details contains any of the keywords. Not case-sensitive."""
    for detail_text in product_details.values():
        for keyword in keywords:
            if keyword in detail_text.lower():
                return True
    return False


async def main() -> None:
    """Main entry point for RexScraper."""
    async with Actor:
        actor_input = await Actor.get_input() or {}
        max_requests_per_crawl = actor_input.get('max_requests_per_crawl', 30)
        # Input key matches 'desired_categories' from .actor/input_schema.json.
        desired_categories = {category.lower() for category in actor_input.get('desired_categories', [])}
        include_keywords = {word.lower() for word in actor_input.get('include_keywords', [])}
        exclude_keywords = {word.lower() for word in actor_input.get('exclude_keywords', [])}
        Actor.log.info(f'{desired_categories=}, {include_keywords=}, {exclude_keywords=}, {max_requests_per_crawl=}')

        crawler = BeautifulSoupCrawler(max_requests_per_crawl=max_requests_per_crawl, _logger=Actor.log)

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            match (context.request.label or '').split('-'):
                case ['PRODUCT', category_name]:
                    product_details = get_product_details(context, category_name)
                    if is_relevant(product_details, include_keywords) and not product_includes_keyword(
                        product_details,
                        exclude_keywords,
                    ):
                        await context.push_data(product_details)

                case ['CATEGORY', category_name]:
                    await process_category_page(context, category_name)
                case _:
                    await process_top_page(context, desired_categories)

        await crawler.run(['https://somosrex.com/'])


if __name__ == '__main__':
    asyncio.run(main())
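
As a quick illustration of the filtering logic above, here is a minimal sketch (the sample product dict and its values are invented for the example) showing how is_relevant and product_includes_keyword combine: a product is pushed to the dataset only if it matches at least one include keyword (or none are configured) and matches no exclude keyword.

# Assumes is_relevant and product_includes_keyword from src/main.py are importable.
# Hypothetical example data; field names mirror those produced by get_product_details.
sample_product = {
    'sku': 'Taladro percutor 550W',
    'category': 'Herramientas',
    'price': '$1,299',
    'description': 'Taladro de la marca Stanley con mandril de 13 mm.',
}

include_keywords = {'taladro', 'soldadora'}  # lower-cased, as main() does with the actor input
exclude_keywords = {'stanley'}

keep = is_relevant(sample_product, include_keywords) and not product_includes_keyword(
    sample_product, exclude_keywords
)
# keep is False here: 'taladro' matches an include keyword, but 'stanley' appears in the
# description, so the exclude filter drops the product.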
