from typing import Set

import httpx
- import pyppeteer
import selectorlib
- from app.core.settings import settings
- from app.crud import shop_crud
- from app.schemas.shop import ShopConfigurationDB
+ from sqlalchemy import orm
+
+ from app.models import shopmodels
+ from app.service import shopservice
+ from app.settings import settings

logger = logging.getLogger(__name__)


class Scraper:
-     def __init__(self, config: ShopConfigurationDB) -> None:
+     def __init__(self, shop: shopmodels.ShopRead) -> None:
        self.protocol = "https"
-         self._config = config
-         self.base_url = f"{self.protocol}://{self._config.url}"
-         self.name = self._config.name
+         self._shop = shop
+         self.base_url = f"{self.protocol}://{self._shop.url}"
+         self.name = self._shop.name

        # Create Extractor for listing page
        self._listing_page_extractor = selectorlib.Extractor(
-             self._config.listing_page_selector
+             self._shop.listing_page_selector
        )

    def __repr__(self) -> str:
        return f"Scraper(name={self.name}, base_url={self.base_url})"

    def _build_query_url(self, q: str) -> str:
-         return self.base_url + self._config.query_url.format(query=q)
+         return self.base_url + self._shop.query_url.format(query=q)

    async def query_listings(
        self, client: httpx.AsyncClient, query: str, limit: int = 10
    ) -> dict:

        url = self._build_query_url(query)
-
-         # Render page with `pyppeteer` if needed
-         if self._config.render_javascript:
-             html = await render_page(url=url)
-         else:
-             html = await fetch_page(url=url, client=client)
+         html = await fetch_page(url=url, client=client)

        results = self._listing_page_extractor.extract(
            html, base_url=self.base_url
        ).get("items")
        if results:
            results = results[:limit]
        response_object = {
-             "id": self._config.id,
-             "name": self._config.name,
+             "id": self._shop.id,
+             "name": self._shop.name,
            "listings": results,
        }
        return response_object
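
As a reading aid, not part of the change itself: a minimal sketch of how the reworked constructor and query_listings might be exercised. It assumes it sits alongside the Scraper class above, that shopmodels.ShopRead is a Pydantic-style model exposing the fields referenced in the diff (url, name, id, listing_page_selector, query_url), and the demo function and query string are made up.

# Hedged usage sketch for the new Scraper(shop: shopmodels.ShopRead) signature.
import httpx

async def demo(shop: "shopmodels.ShopRead") -> dict:
    scraper = Scraper(shop)
    async with httpx.AsyncClient(verify=False) as client:
        # Returns {"id": ..., "name": ..., "listings": [...]} as assembled above.
        return await scraper.query_listings(client=client, query="laptop", limit=5)
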
@@ -62,32 +58,11 @@ async def fetch_page(url: str, client: httpx.AsyncClient):
    return html


- async def render_page(url: str) -> str:
-     """
-     Using ``pyppeteer`` load a web page and return HTML content.
-     """
-     options = {
-         "timeout": int(settings.scraper.PYPPETEER_TIMEOUT * 1000),
-         "waitUntil": "domcontentloaded",
-     }
-     browser = await pyppeteer.launch(
-         executablePath=settings.CHROME_BIN, ignoreHTTPSErrors=True, headless=True
-     )
-     page = await browser.newPage()
-     await page.goto(url, options=options)
-     await asyncio.sleep(settings.scraper.PYPPETEER_SLEEP)
-     html = await page.content()
-     await browser.close()
-     return html
-
-
async def query_scrapers(query: str, limit: int, include: Set[int]):
    """Query scrapers entry point."""

    # Only use one client for all requests
-     async with httpx.AsyncClient(
-         verify=False, timeout=settings.scraper.HTTPCLIENT_TIMEOUT
-     ) as client:
+     async with httpx.AsyncClient(verify=False,) as client:
        tasks = [
            scrapers[i].query_listings(client=client, query=query, limit=limit)
            for i in include
@@ -99,31 +74,13 @@ async def query_scrapers(query: str, limit: int, include: Set[int]):
scrapers = {}


- async def initialise():
+ def populate_scrapers(db_session: orm.Session):
    """
-     Reads all shop configurations from database and using each config initialise's a `Scraper` instance.
-     Local scraper dict is then used for quick lookup. Key is scraper id and value is `Scraper` instance.
-     Should be called every time new configurations are added to db.
+     Populate scraper configuration into memory for fast lookup.
    """
    global scrapers
    scrapers = {}
-     shops = await shop_crud.read_all()
-     for s in shops:
-         scrapers[s["id"]] = Scraper(config=ShopConfigurationDB(**s))
-
-
- # async def initialise():
- #     global scrapers
- #     """Reads scraper information from /etc and populates the database with shop configs."""
- #
- #     with open(settings.SHOPS_YAML_PATH) as fileobj:
- #         shopsconfig = yaml.safe_load_all(fileobj.read())
- #
- #     global scrapers
- #     for config in shopsconfig:
- #         shop = await shop_crud.read_by_name(config["name"])
- #         if not shop:
- #             logger.info(f"Shop not found adding: {config['name']}")
- #             shop = await shop_crud.create(ShopConfigurationSchema(**config))
- #
- #         scrapers[shop["id"]] = Scraper(config=ShopConfigurationDB(**shop))
+     shops = shopservice.get_multiple(db_session=db_session)
+     for shop in shops:
+         shop_model = shopmodels.ShopRead.from_orm(shop)
+         scrapers[shop.id] = Scraper(shop_model)
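
For orientation, not part of the diff: a minimal sketch of how the new synchronous populate_scrapers could be wired up with query_scrapers. The SessionLocal factory, the database URL, and the assumption that query_scrapers gathers and returns the per-shop results are all illustrative, not taken from this repository.

# Hedged wiring sketch; SessionLocal and the SQLite URL are assumptions.
import asyncio
from sqlalchemy import create_engine, orm

engine = create_engine("sqlite:///./app.db")      # assumed database URL
SessionLocal = orm.sessionmaker(bind=engine)

def refresh_scrapers() -> None:
    # Rebuild the in-memory `scrapers` dict from the shop table.
    db_session = SessionLocal()
    try:
        populate_scrapers(db_session=db_session)
    finally:
        db_session.close()

async def main() -> None:
    refresh_scrapers()
    # Query every registered shop; query_scrapers is assumed to gather and
    # return the response dicts built by Scraper.query_listings.
    results = await query_scrapers(query="laptop", limit=5, include=set(scrapers))
    print(results)

if __name__ == "__main__":
    asyncio.run(main())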