|
from __future__ import annotations

import asyncio

from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

from apify import Actor

ProductDetails = dict[str, str]


async def process_top_page(context: BeautifulSoupCrawlingContext, desired_categories: set[str]) -> None:
    """Enqueue links to category pages that match the desired categories (all categories if none are given)."""
    for category_element in context.soup.find_all('a', 'level-top'):
        category_selector = '.' + '.'.join(category_element['class'])
        category = category_element.text.strip()
        if not desired_categories or category.lower() in desired_categories:
            await context.enqueue_links(
                selector=category_selector,
                label=f'CATEGORY-{category}',
            )


async def process_category_page(context: BeautifulSoupCrawlingContext, category: str) -> None:
    """Enqueue product links and the pagination link to the next page of this category."""
    await context.enqueue_links(
        selector='.action.next',
        label=f'CATEGORY-{category}',
    )
    await context.enqueue_links(
        selector='.product-item-link',
        label=f'PRODUCT-{category}',
    )


def get_product_details(context: BeautifulSoupCrawlingContext, category: str) -> ProductDetails:
    """Scrape the details of a specific product."""
    soup = context.soup
    details = {
        'sku': soup.find('span', {'itemprop': 'name'}).text,
        'category': category,
        'price': soup.find('div', 'product-info-price').find('span', 'price').text,
        'imageUrl': soup.select('.gallery-placeholder__image')[0]['src'].split('?')[0],
        'url': context.request.url,
        'description': soup.select('div.product.attribute.description > div.value')[0].text,
    }

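    # Magento's "More Information" table pairs each '.col.label' cell with a
    # value cell whose 'data-th' attribute repeats the label text; copy every
    # such pair into the details dict.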
    for detail_element in soup.select('.col.label'):
        detail_name = detail_element.text
        details[detail_name] = soup.find('td', {'data-th': detail_name}).text
    return details


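# A defensive variant of the element lookups above, as a sketch only:
# `_find_text` is a hypothetical helper (not used elsewhere in this file)
# that returns a default instead of raising AttributeError when a lookup
# matches nothing, so one malformed product page cannot fail the request.
def _find_text(soup, *args, default='', **kwargs):
    element = soup.find(*args, **kwargs)
    return element.text if element is not None else default

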
def is_relevant(product_details: ProductDetails, include_keywords: set[str]) -> bool:
    """Return True if no include keywords are defined or if any of them match the product details."""
    return not include_keywords or product_includes_keyword(product_details, include_keywords)


def product_includes_keyword(product_details: ProductDetails, keywords: set[str]) -> bool:
    """Return True if any of the product details contains any of the keywords (case-insensitive)."""
    for detail_text in product_details.values():
        for keyword in keywords:
            if keyword in detail_text.lower():
                return True
    return False


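# Example actor input (illustrative values only; the keys match the
# `actor_input.get` calls in `main` below):
# {
#     "max_requests_per_crawl": 50,
#     "categories": ["dogs"],
#     "include_keywords": ["food"],
#     "exclude_keywords": ["toy"]
# }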
async def main() -> None:
    """Main entry point for RexScraper."""
    async with Actor:
        actor_input = await Actor.get_input() or {}
        max_requests_per_crawl = actor_input.get('max_requests_per_crawl', 30)
        desired_categories = {category.lower() for category in actor_input.get('categories', [])}
        include_keywords = {word.lower() for word in actor_input.get('include_keywords', [])}
        exclude_keywords = {word.lower() for word in actor_input.get('exclude_keywords', [])}
        Actor.log.info(f'{desired_categories=}, {include_keywords=}, {exclude_keywords=}, {max_requests_per_crawl=}')

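        # Pass the Actor's logger to the crawler so crawl progress shows up
        # in the Apify run log alongside the Actor's own messages.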
        crawler = BeautifulSoupCrawler(max_requests_per_crawl=max_requests_per_crawl, _logger=Actor.log)

        @crawler.router.default_handler
        async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
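            # Route by request label: 'PRODUCT-<category>' and
            # 'CATEGORY-<category>' are set by the enqueue calls above, while
            # the unlabeled start URL falls through to the top-page branch.
            # Splitting on the first '-' only keeps category names that
            # contain hyphens intact.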
            match (context.request.label or '').split('-', 1):
                case ['PRODUCT', category_name]:
                    product_details = get_product_details(context, category_name)
                    if is_relevant(product_details, include_keywords) and not product_includes_keyword(
                        product_details,
                        exclude_keywords,
                    ):
                        await context.push_data(product_details)

                case ['CATEGORY', category_name]:
                    await process_category_page(context, category_name)
                case _:
                    await process_top_page(context, desired_categories)

        await crawler.run(['https://somosrex.com/'])


if __name__ == '__main__':
    asyncio.run(main())