
Commit 53c9c76

Merge pull request #2 from Pijukatel/actor_draft
Simple scraping actor. Filtering options: included keywords, excluded keywords, category. Outputs product details and a single picture for each product that matches the search. Includes Apify deployment-relevant files in the .actor folder.
2 parents 7bc363f + dacf0c2 commit 53c9c76

8 files changed: +411 −2 lines

.actor/Dockerfile

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
FROM apify/actor-python:3.12

RUN rm -rf /usr/src/app/*
WORKDIR /usr/src/app

COPY pyproject.toml ./
COPY poetry.lock ./

RUN echo "Python version:" \
 && python --version \
 && echo "Pip version:" \
 && pip --version \
 && echo "Installing Poetry:" \
 && pip install --no-cache-dir poetry~=1.8 \
 && echo "Installing dependencies:" \
 && poetry config virtualenvs.create false \
 && poetry install --only main --no-interaction --no-ansi --no-root \
 && rm -rf /tmp/.poetry-cache \
 && echo "All installed Python packages:" \
 && pip freeze

COPY . ./

CMD ["python3", "-m", "src"]

.actor/actor.json

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
{
    "actorSpecification": 1,
    "name": "RexScraper",
    "version": "0.1",
    "buildTag": "latest",
    "environmentVariables": {}
}

.actor/input_schema.json

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
{
    "title": "RexScraper",
    "type": "object",
    "schemaVersion": 1,
    "properties": {
        "desired_categories": {
            "title": "Categories to search",
            "type": "array",
            "description": "Product categories that will be included. (If no categories are explicitly specified, all categories are considered.)",
            "prefill": ["Herramientas"],
            "editor": "json"
        },
        "include_keywords": {
            "title": "Included keywords (or terms) for products",
            "type": "array",
            "description": "Only products that match at least one of the terms will be included in the search. Term matching is applied to all scraped attributes. (If no terms are specified, all products are selected.)",
            "prefill": ["Taladro", "Soldadora"],
            "editor": "json"
        },
        "exclude_keywords": {
            "title": "Excluded keywords (or terms) for products",
            "type": "array",
            "description": "Products that match any of the excluded terms in any of the product details will be filtered out, even if they match an include keyword. (If no terms are specified, no products are excluded this way.)",
            "prefill": ["Stanley"],
            "editor": "json"
        },
        "max_requests_per_crawl": {
            "title": "Maximum requests per crawl",
            "type": "integer",
            "description": "Maximum number of HTTP requests that the crawler will send. (Set a small value during actor testing.)",
            "default": 10,
            "editor": "number"
        }
    }
}
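
For illustration only, here is a sketch of the input that main() in src/main.py would receive via Actor.get_input() for this schema, written as a Python dict; the values are just the prefill examples from the schema, not real run data.

# Hypothetical sample input matching .actor/input_schema.json.
actor_input = {
    'desired_categories': ['Herramientas'],        # crawl only the matching top-level categories
    'include_keywords': ['Taladro', 'Soldadora'],  # keep products matching at least one term
    'exclude_keywords': ['Stanley'],               # drop products matching any of these terms
    'max_requests_per_crawl': 10,                  # upper bound on HTTP requests for the run
}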

README.md

Lines changed: 3 additions & 1 deletion
@@ -1,3 +1,5 @@
# actor-RexScraper

Very simple Apify-based product scraper for https://somosrex.com/

Inputs to the actor are described in: ./.actor/input_schema.json

poetry.lock

Lines changed: 222 additions & 1 deletion
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ readme = "README.md"
python = "^3.12"
setuptools = "^75.3.0"
apify = "^2.0.1"
+crawlee = {extras = ["beautifulsoup"], version = "^0.4.0"}


[tool.poetry.group.dev.dependencies]

src/__main__.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
import asyncio
import logging

from apify.log import ActorLogFormatter

from .main import main

handler = logging.StreamHandler()
handler.setFormatter(ActorLogFormatter())

apify_client_logger = logging.getLogger('apify_client')
apify_client_logger.setLevel(logging.INFO)
apify_client_logger.addHandler(handler)

apify_logger = logging.getLogger('apify')
apify_logger.setLevel(logging.DEBUG)
apify_logger.addHandler(handler)

asyncio.run(main())
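
Usage note: since the Dockerfile's CMD is ["python3", "-m", "src"], this __main__.py is the process entry point inside the container; the same entry point can presumably be started locally with python -m src once the Poetry dependencies are installed.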

src/main.py

Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
from __future__ import annotations

import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

from apify import Actor

ProductDetails = dict[str, str]


async def process_top_page(context: BeautifulSoupCrawlingContext, desired_categories: set[str]) -> None:
    """Enqueue links to category pages that are in the desired categories."""
    for category_element in context.soup.find_all('a', 'level-top'):
        category_selector = '.' + '.'.join(category_element['class'])
        category = category_element.text
        if not desired_categories or category.lower() in desired_categories:
            await context.enqueue_links(
                selector=category_selector,
                label=f'CATEGORY-{category}',
            )


async def process_category_page(context: BeautifulSoupCrawlingContext, category: str) -> None:
    """Enqueue product links and the pagination link to the next page of products of this category."""
    await context.enqueue_links(
        selector='.action.next',
        label=f'CATEGORY-{category}',
    )
    await context.enqueue_links(
        selector='.product-item-link',
        label=f'PRODUCT-{category}',
    )


def get_product_details(context: BeautifulSoupCrawlingContext, category: str) -> ProductDetails:
    """Scrape details of a specific product."""
    soup = context.soup
    details = {
        'sku': soup.find('span', {'itemprop': 'name'}).text,
        'category': category,
        'price': soup.find('div', 'product-info-price').find('span', 'price').text,
        'imageUrl': soup.select('.gallery-placeholder__image')[0]['src'].split('?')[0],
        'url': context.request.url,
        'description': soup.select('div.product.attribute.description > div.value')[0].text,
    }

    for detail_element in list(soup.select('.col.label')):
        detail_name = detail_element.text
        details[detail_name] = soup.find('td', {'data-th': detail_name}).text
    return details


def is_relevant(product_details: ProductDetails, include_keywords: set[str]) -> bool:
    """Return True if no include keywords are defined or if any of them match the product details."""
    return not include_keywords or product_includes_keyword(product_details, include_keywords)


def product_includes_keyword(product_details: ProductDetails, keywords: set[str]) -> bool:
    """Return True if any of the product details contains any of the keywords. Not case-sensitive."""
    for detail_text in product_details.values():
        for keyword in keywords:
            if keyword in detail_text.lower():
                return True
    return False


async def main() -> None:
    """Main entry point for RexScraper."""
    async with Actor:
        actor_input = await Actor.get_input() or {}
        max_requests_per_crawl = actor_input.get('max_requests_per_crawl', 30)
        # Input key matches 'desired_categories' from .actor/input_schema.json.
        desired_categories = {category.lower() for category in actor_input.get('desired_categories', [])}
        include_keywords = {word.lower() for word in actor_input.get('include_keywords', [])}
        exclude_keywords = {word.lower() for word in actor_input.get('exclude_keywords', [])}
        Actor.log.info(f'{desired_categories=}, {include_keywords=}, {exclude_keywords=}, {max_requests_per_crawl=}')

        crawler = BeautifulSoupCrawler(max_requests_per_crawl=max_requests_per_crawl, _logger=Actor.log)

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
            match (context.request.label or '').split('-'):
                case ['PRODUCT', category_name]:
                    product_details = get_product_details(context, category_name)
                    if is_relevant(product_details, include_keywords) and not product_includes_keyword(
                        product_details,
                        exclude_keywords,
                    ):
                        await context.push_data(product_details)

                case ['CATEGORY', category_name]:
                    await process_category_page(context, category_name)
                case _:
                    await process_top_page(context, desired_categories)

        await crawler.run(['https://somosrex.com/'])


if __name__ == '__main__':
    asyncio.run(main())
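
As a quick illustration of the filtering logic above, here is a minimal sketch (the sample product dict and its values are invented for the example) showing how is_relevant and product_includes_keyword combine: a product is pushed to the dataset only if it matches at least one include keyword (or none are configured) and matches no exclude keyword.

# Assumes is_relevant and product_includes_keyword from src/main.py are importable.
# Hypothetical example data; field names mirror those produced by get_product_details.
sample_product = {
    'sku': 'Taladro percutor 550W',
    'category': 'Herramientas',
    'price': '$1,299',
    'description': 'Taladro de la marca Stanley con mandril de 13 mm.',
}

include_keywords = {'taladro', 'soldadora'}  # lower-cased, as main() does with the actor input
exclude_keywords = {'stanley'}

keep = is_relevant(sample_product, include_keywords) and not product_includes_keyword(
    sample_product, exclude_keywords
)
# keep is False here: 'taladro' matches an include keyword, but 'stanley' appears in the
# description, so the exclude filter drops the product.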
