
fix closespider
unknown committed May 6, 2023
1 parent 63a1619 commit 8afef47
Showing 7 changed files with 113 additions and 39 deletions.
10 changes: 5 additions & 5 deletions aioscrapy/core/downloader/handlers/playwright/webdriver.py
@@ -41,10 +41,10 @@ def __init__(
self.on_response = on_response
self.user_agent = user_agent

self.driver: Playwright = None
self.browser: Browser = None
self.context: BrowserContext = None
self.page: Page = None
self.driver: Optional[Playwright] = None
self.browser: Optional[Browser] = None
self.context: Optional[BrowserContext] = None
self.page: Optional[Page] = None
self.url = None

async def setup(self):
@@ -67,7 +67,7 @@ async def setup(self):
context_args.update({'user_agent': self.user_agent})

self.driver = await async_playwright().start()
# self.browser = await getattr(self.driver, self.driver_type).launch(**browser_args)
self.browser = await getattr(self.driver, self.driver_type).launch(**browser_args)
self.browser = await self.driver.chromium.launch(**browser_args)
self.context = await self.browser.new_context(**context_args)
self.page = await self.context.new_page()
27 changes: 14 additions & 13 deletions aioscrapy/libs/extensions/closespider.py
@@ -4,6 +4,7 @@
See documentation in docs/topics/extensions.rst
"""
import asyncio
from typing import Optional
from collections import defaultdict

from aioscrapy import signals
@@ -26,13 +27,14 @@ def __init__(self, crawler):
raise NotConfigured

self.counter = defaultdict(int)
self.task: Optional[asyncio.tasks.Task] = None

if self.close_on.get('errorcount'):
crawler.signals.connect(self.error_count, signal=signals.spider_error)
if self.close_on.get('pagecount'):
crawler.signals.connect(self.page_count, signal=signals.response_received)
if self.close_on.get('timeout'):
crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(self.timeout_close, signal=signals.spider_opened)
if self.close_on.get('itemcount'):
crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
@@ -44,26 +46,25 @@ def from_crawler(cls, crawler):
async def error_count(self, failure, response, spider):
self.counter['errorcount'] += 1
if self.counter['errorcount'] == self.close_on['errorcount']:
asyncio.create_task(self.crawler.engine.close_spider(spider, 'closespider_errorcount'))
asyncio.create_task(self.crawler.engine.stop(reason='closespider_errorcount'))

async def page_count(self, response, request, spider):
self.counter['pagecount'] += 1
if self.counter['pagecount'] == self.close_on['pagecount']:
asyncio.create_task(self.crawler.engine.close_spider(spider, 'closespider_pagecount'))
asyncio.create_task(self.crawler.engine.stop(reason='closespider_pagecount'))

async def spider_opened(self, spider):
self.task = asyncio.create_task(self.timeout_close(spider))

async def timeout_close(self, spider):
await asyncio.sleep(self.close_on['timeout'])
asyncio.create_task(self.crawler.engine.close_spider(spider, reason='closespider_timeout'))

async def close():
await asyncio.sleep(self.close_on['timeout'])
asyncio.create_task(self.crawler.engine.stop(reason='closespider_timeout'))

self.task = asyncio.create_task(close())

async def item_scraped(self, item, spider):
self.counter['itemcount'] += 1
if self.counter['itemcount'] == self.close_on['itemcount']:
asyncio.create_task(self.crawler.engine.close_spider(spider, 'closespider_itemcount'))
asyncio.create_task(self.crawler.engine.stop(reason='closespider_itemcount'))

def spider_closed(self, spider):
task = getattr(self, 'task', False)
if task and not task.done():
task.cancel()
if self.task and not self.task.done():
self.task.cancel()
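
The timeout handling is the core of the commit: `timeout_close` is now connected directly to the `spider_opened` signal, the sleep-then-stop sequence lives in an inner `close()` coroutine whose task is stored on `self.task` (initialised to `None` in `__init__`), `spider_closed` cancels that task if it is still pending, and the shutdown calls switch from `engine.close_spider(spider, reason)` to `engine.stop(reason=...)`. A minimal, self-contained sketch of that pattern follows; `FakeEngine` and `TimeoutCloser` are illustrative names, not aioscrapy APIs.

import asyncio
from typing import Optional


class FakeEngine:
    """Stand-in for crawler.engine, just to show the control flow."""

    async def stop(self, reason: str) -> None:
        print(f"engine stopped: {reason}")


class TimeoutCloser:
    """Mirrors the patched extension: the sleep-then-stop logic lives in an
    inner coroutine, and its task is kept on self.task so it can be cancelled
    if the spider finishes before the timeout fires."""

    def __init__(self, engine: FakeEngine, timeout: float):
        self.engine = engine
        self.timeout = timeout
        self.task: Optional[asyncio.Task] = None

    async def timeout_close(self) -> None:
        async def close():
            await asyncio.sleep(self.timeout)
            asyncio.create_task(self.engine.stop(reason='closespider_timeout'))

        self.task = asyncio.create_task(close())

    def spider_closed(self) -> None:
        # Cancel the pending timeout when the spider closes for another reason.
        if self.task and not self.task.done():
            self.task.cancel()


async def main():
    closer = TimeoutCloser(FakeEngine(), timeout=0.1)
    await closer.timeout_close()
    await asyncio.sleep(0.3)  # the timeout fires and "stops" the engine
    closer.spider_closed()    # no-op here: the task already completed


asyncio.run(main())
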
@@ -6,24 +6,23 @@
logger = logging.getLogger(__name__)


class DemoMemorySpider(Spider):
name = 'DemoMemorySpider'
custom_settings = {
"USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
# 'DOWNLOAD_DELAY': 3,
# 'RANDOMIZE_DOWNLOAD_DELAY': True,
# 'CONCURRENT_REQUESTS': 1,
# 'LOG_LEVEL': 'INFO'
# 'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.disk.RFPDupeFilter',
"CLOSE_SPIDER_ON_IDLE": True,
'DOWNLOAD_HANDLERS': {
class DemoHttpxSpider(Spider):
name = 'DemoHttpxSpider'
custom_settings = dict(
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
# DOWNLOAD_DELAY=3,
# RANDOMIZE_DOWNLOAD_DELAY=True,
# CONCURRENT_REQUESTS=1,
LOG_LEVEL='INFO',
CLOSE_SPIDER_ON_IDLE=True,
DOWNLOAD_HANDLERS={
'http': 'aioscrapy.core.downloader.handlers.httpx.HttpxDownloadHandler',
'https': 'aioscrapy.core.downloader.handlers.httpx.HttpxDownloadHandler',
},
'HTTPX_CLIENT_SESSION_ARGS': {
HTTPX_CLIENT_SESSION_ARGS={
'http2': True
}
}
)

start_urls = ['https://quotes.toscrape.com']

@@ -59,4 +58,4 @@ async def process_item(self, item):


if __name__ == '__main__':
DemoMemorySpider.start()
DemoHttpxSpider.start()
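
For reference, the `http2` flag in `HTTPX_CLIENT_SESSION_ARGS` corresponds to `httpx.AsyncClient(http2=True)` (assuming the download handler forwards these kwargs to the client, which this diff does not show). A standalone check, requiring `pip install httpx[http2]`:

import asyncio

import httpx


async def main():
    # Same keyword that the spider's HTTPX_CLIENT_SESSION_ARGS carries.
    async with httpx.AsyncClient(http2=True) as client:
        resp = await client.get("https://quotes.toscrape.com")
        # http_version is "HTTP/2" only if the server negotiates it.
        print(resp.http_version, resp.status_code)


asyncio.run(main())
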
3 changes: 1 addition & 2 deletions example/singlespider/demo_memory.py
@@ -13,8 +13,7 @@ class DemoMemorySpider(Spider):
# 'DOWNLOAD_DELAY': 3,
# 'RANDOMIZE_DOWNLOAD_DELAY': True,
# 'CONCURRENT_REQUESTS': 1,
# 'LOG_LEVEL': 'INFO'
# 'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.disk.RFPDupeFilter',
# 'LOG_LEVEL': 'INFO',
"CLOSE_SPIDER_ON_IDLE": True,
}

4 changes: 2 additions & 2 deletions example/singlespider/demo_playwright.py
@@ -86,8 +86,8 @@ async def process_action(self, driver: PlaywrightDriver):
img_bytes = await driver.page.screenshot(type="jpeg", quality=50)
return img_bytes

async def process_item(self, item):
print(item)
# async def process_item(self, item):
# print(item)


if __name__ == '__main__':
6 changes: 3 additions & 3 deletions example/singlespider/demo_sink_mongo.py
@@ -6,8 +6,8 @@
logger = logging.getLogger(__name__)


class DemoMemorySpider(Spider):
name = 'DemoMemorySpider'
class DemoMongoSpider(Spider):
name = 'DemoMongoSpider'
custom_settings = {
"USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
# 'DOWNLOAD_DELAY': 3,
@@ -68,4 +68,4 @@ async def process_item(self, item):


if __name__ == '__main__':
DemoMemorySpider.start()
DemoMongoSpider.start()
75 changes: 75 additions & 0 deletions example/singlespider/demo_sink_mysql.py
@@ -0,0 +1,75 @@
import logging

from aioscrapy import Request
from aioscrapy.spiders import Spider

logger = logging.getLogger(__name__)


class DemoMysqlSpider(Spider):
    name = 'DemoMysqlSpider'
    custom_settings = dict(
        USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        # DOWNLOAD_DELAY=3,
        # RANDOMIZE_DOWNLOAD_DELAY=True,
        # CONCURRENT_REQUESTS=1,
        # LOG_LEVEL='INFO',
        # DUPEFILTER_CLASS='aioscrapy.dupefilters.disk.RFPDupeFilter',
        CLOSE_SPIDER_ON_IDLE=True,
        # MySQL connection parameters
        MYSQL_ARGS={
            'default': {
                'host': '127.0.0.1',
                'user': 'root',
                'password': 'root',
                'port': 3306,
                'charset': 'utf8mb4',
                'db': 'test',
            },
        },
        ITEM_PIPELINES={
            'aioscrapy.libs.pipelines.sink.MysqlPipeline': 100,
        },
        SAVE_CACHE_NUM=1000,  # write to MySQL in batches of 1000 items
        SAVE_CACHE_INTERVAL=10,  # flush the cache every 10 seconds
    )

    start_urls = ['https://quotes.toscrape.com']

    @staticmethod
    async def process_request(request, spider):
        """ request middleware """
        pass

    @staticmethod
    async def process_response(request, response, spider):
        """ response middleware """
        return response

    @staticmethod
    async def process_exception(request, exception, spider):
        """ exception middleware """
        pass

    async def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'save_table_name': 'article',  # table to write to
                'save_db_alias': 'default',  # MySQL alias to use, i.e. a key of MYSQL_ARGS
                # 'save_db_name': 'xxx',  # database to write to; defaults to the 'db' value of the chosen MYSQL_ARGS entry

                'author': quote.xpath('span/small/text()').get(),
                'text': quote.css('span.text::text').get(),
            }

        next_page = response.css('li.next a::attr("href")').get()
        if next_page is not None:
            # yield response.follow(next_page, self.parse)
            yield Request(f"https://quotes.toscrape.com{next_page}", callback=self.parse)

    async def process_item(self, item):
        print(item)


if __name__ == '__main__':
    DemoMysqlSpider.start()
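
The two cache settings describe batched writes: buffer items and flush either when the buffer reaches SAVE_CACHE_NUM or when SAVE_CACHE_INTERVAL elapses, whichever comes first. A hypothetical sketch of that behaviour (not aioscrapy's actual pipeline code; `BatchingWriter` is an illustrative name):

import asyncio
from typing import Any, Dict, List


class BatchingWriter:
    def __init__(self, cache_num: int = 1000, cache_interval: float = 10.0):
        self.cache_num = cache_num
        self.cache_interval = cache_interval
        self.buffer: List[Dict[str, Any]] = []
        self._flusher = asyncio.create_task(self._flush_periodically())

    async def add(self, item: Dict[str, Any]) -> None:
        self.buffer.append(item)
        if len(self.buffer) >= self.cache_num:
            await self.flush()

    async def flush(self) -> None:
        if not self.buffer:
            return
        batch, self.buffer = self.buffer, []
        print(f"writing {len(batch)} rows")  # a real pipeline would INSERT here

    async def _flush_periodically(self) -> None:
        while True:
            await asyncio.sleep(self.cache_interval)
            await self.flush()


async def main():
    writer = BatchingWriter(cache_num=3, cache_interval=1.0)
    for i in range(7):
        await writer.add({"row": i})   # flushes after the 3rd and 6th item
    await asyncio.sleep(1.2)           # periodic flush picks up the remainder
    writer._flusher.cancel()


asyncio.run(main())
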
