Skip to content

Commit 810157c

Browse files
authored
Add simplified view of dataset (#3)
1 parent 53c9c76 commit 810157c

File tree

3 files changed

+59
-3
lines changed

3 files changed

+59
-3
lines changed

.actor/actor.json

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,5 +3,8 @@
33
"name": "RexScraper",
44
"version": "0.1",
55
"buildTag": "latest",
6-
"environmentVariables": {}
6+
"environmentVariables": {},
7+
"storages": {
8+
"dataset": "./dataset_schema.json"
9+
}
710
}

.actor/dataset_schema.json

Lines changed: 52 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,52 @@
1+
{
2+
"actorSpecification": 1,
3+
"views": {
4+
"overview": {
5+
"title": "Overview",
6+
"transformation": {
7+
"fields": [
8+
"sku",
9+
"name",
10+
"category",
11+
"price",
12+
"imageUrl",
13+
"description",
14+
"url"
15+
]
16+
},
17+
"display": {
18+
"component": "table",
19+
"properties": {
20+
"sku": {
21+
"label": "sku",
22+
"format": "text"
23+
},
24+
"name": {
25+
"label": "name",
26+
"format": "text"
27+
},
28+
"category": {
29+
"label": "category",
30+
"format": "text"
31+
},
32+
"price": {
33+
"label": "price",
34+
"format": "text"
35+
},
36+
"imageUrl": {
37+
"label": "image",
38+
"format": "image"
39+
},
40+
"description": {
41+
"label": "description",
42+
"format": "text"
43+
},
44+
"url": {
45+
"label": "url",
46+
"format": "link"
47+
}
48+
}
49+
}
50+
}
51+
}
52+
}

src/main.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,8 @@ def get_product_details(context: BeautifulSoupCrawlingContext, category: str) ->
3737
"""Scrape details of specific product."""
3838
soup = context.soup
3939
details = {
40-
'sku': soup.find('span', {'itemprop': 'name'}).text,
40+
'name': soup.find('span', {'itemprop': 'name'}).text,
41+
'sku': soup.find('div', {'itemprop': 'sku'}).text,
4142
'category': category,
4243
'price': soup.find('div', 'product-info-price').find('span', 'price').text,
4344
'imageUrl': soup.select('.gallery-placeholder__image')[0]['src'].split('?')[0],
@@ -70,7 +71,7 @@ async def main() -> None:
7071
async with Actor:
7172
actor_input = await Actor.get_input() or {}
7273
max_requests_per_crawl = actor_input.get('max_requests_per_crawl', 30)
73-
desired_categories = {category.lower() for category in actor_input.get('categories', [])}
74+
desired_categories = {category.lower() for category in actor_input.get('desired_categories', [])}
7475
include_keywords = {word.lower() for word in actor_input.get('include_keywords', [])}
7576
exclude_keywords = {word.lower() for word in actor_input.get('exclude_keywords', [])}
7677
Actor.log.info(f'{desired_categories=}, {include_keywords=},{exclude_keywords=},{max_requests_per_crawl=}')

0 commit comments

Comments
 (0)