Hello, I'm facing a problem in my spider: the `DEPTH_LIMIT` setting doesn't seem to work.
```python
# settings.py
# depth settings for the crawler
# --------------------------------------------------
DEPTH_LIMIT = 5
DEPTH_STATS_VERBOSE = True
DEPTH_PRIORITY = 1
# --------------------------------------------------
```
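For reference, this is my understanding of how Scrapy's `DepthMiddleware` applies these settings (a simplified sketch paraphrased from `scrapy/spidermiddlewares/depth.py`, not the exact source):

```python
# Simplified sketch of scrapy.spidermiddlewares.depth.DepthMiddleware's
# per-request check (paraphrased, not verbatim Scrapy code).
def should_follow(request, response, maxdepth=5, prio=1):
    depth = response.meta['depth'] + 1  # child depth = parent depth + 1
    request.meta['depth'] = depth
    if prio:
        # DEPTH_PRIORITY: deeper requests get lower priority (breadth-first-ish)
        request.priority -= depth * prio
    if maxdepth and depth > maxdepth:
        # DEPTH_LIMIT: the request is dropped and Scrapy logs a DEBUG
        # line like "Ignoring link (depth > 5): <url>"
        return False
    return True
```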
```python
# /spider/dynamicspider.py
import json

import scrapy
from scrapy_splash import SplashRequest

# project-local helpers (import paths here are assumed, adjust to the project)
from ..items import DynamicWebMeta
from ..linkextractors import MYLxmlLinkExtractor


class DynamiccrawlerSpider(scrapy.Spider):
    name = 'dynamicCrawler'
    link_extractor = MYLxmlLinkExtractor()
    start_urls = []

    def get_seed_url(self, file_path):
        # read one JSON object per line and collect the unique seed URLs
        url_list = set()
        with open(file_path) as f:
            for i in f.readlines():
                seed = i.strip('\n')
                seedURL = json.loads(seed)['seedURL']
                url_list.add(seedURL)
        print('GET SEED Length: ', len(url_list), '--', url_list)
        return url_list

    def start_requests(self):
        self.start_urls = self.get_seed_url(file_path=self.settings.get('SEED_FILE_PATH'))
        for url in self.start_urls:
            yield SplashRequest(url,
                                callback=self.parse_result,
                                args={'wait': 30, 'timeout': 90,
                                      'images': 0, 'resource_timeout': 30},
                                dont_filter=True,
                                dont_process_response=True,
                                endpoint='render.html')

    def parse_result(self, response):
        print("DEPTH+++++++++++++++++++++++", response.request.meta['depth'])
        # scrape any page that contains a <table>
        table_list = response.xpath("//table").extract()
        if len(table_list) > 0:
            item = DynamicWebMeta()
            item['pageurl'] = response.request._original_url
            item['title'] = response.xpath("//title/text()").get()
            item['body'] = response.text
            yield item
        # follow every extracted link through Splash again
        links = self.link_extractor.extract_links(response)
        links_len = len(links)
        if links_len > 0:
            i = 0
            for link in links:
                i = i + 1
                print('{0}/{1}--Son link *******{2}'.format(i, links_len, link.url))
                yield SplashRequest(link.url,
                                    callback=self.parse_result,
                                    args={'wait': 30, 'timeout': 90,
                                          'images': 0, 'resource_timeout': 30},
                                    dont_process_response=True,
                                    endpoint='render.html')
```
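One way to see whether `DEPTH_LIMIT` ever fires is to scan the crawl log for the DEBUG message `DepthMiddleware` emits when it drops a request (sketch below; `crawl.log` is just a placeholder path):

```python
# Sketch: count requests dropped by DepthMiddleware. When DEPTH_LIMIT
# actually triggers, Scrapy logs a DEBUG line of the form
#   "Ignoring link (depth > 5): <url>"
# 'crawl.log' is a placeholder for the real log file path.
with open('crawl.log') as f:
    ignored = [line for line in f if 'Ignoring link (depth >' in line]
print('{} requests dropped by DEPTH_LIMIT'.format(len(ignored)))
```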
Log file:
```
{'BOT_NAME': 'dynamicTableScrapy',
 'DEPTH_LIMIT': 5,
 'DEPTH_PRIORITY': 1,
 'DEPTH_STATS_VERBOSE': True,
 'DOWNLOAD_DELAY': 10,
 'DUPEFILTER_CLASS': 'dynamicTableScrapy.mydupefilter.MyDupeFilter',
 'HTTPCACHE_ENABLED': True,
 'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',
 'NEWSPIDER_MODULE': 'dynamicTableScrapy.spiders',
 'SCHEDULER': 'scrapy_redis_bloomfilter.scheduler.Scheduler',
 'SPIDER_MODULES': ['dynamicTableScrapy.spiders'],
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'}
[scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
[scrapy.extensions.telnet] INFO: Telnet Password: 732cc755693aaef0
[scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2022-05-26 21:07:40 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'dynamicTableScrapy.middlewares.DynamictablescrapyDownloaderMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy_splash.SplashCookiesMiddleware',
 'scrapy_splash.SplashMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats',
 'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware']
[scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy_splash.SplashDeduplicateArgsMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'dynamicTableScrapy.middlewares.DynamictablescrapySpiderMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
[scrapy.middleware] INFO: Enabled item pipelines:
['dynamicTableScrapy.pipelines.MongoDBPipLine']
[scrapy.core.engine] INFO: Spider opened
..........
```
```
{'bloomfilter/filtered': 21704,
 'downloader/exception_count': 4,
 'downloader/exception_type_count/twisted.internet.error.TimeoutError': 4,
 'downloader/request_bytes': 1143999,
 'downloader/request_count': 1482,
 'downloader/request_method_count/POST': 1482,
 'downloader/response_bytes': 27034526,
 'downloader/response_count': 1478,
 'downloader/response_status_count/200': 420,
 'downloader/response_status_count/502': 129,
 'downloader/response_status_count/503': 864,
 'downloader/response_status_count/504': 65,
 'elapsed_time_seconds': 343.332559,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2022, 5, 26, 13, 13, 24, 317268),
 'httpcache/firsthand': 37,
 'httpcache/hit': 1441,
 'httpcache/miss': 41,
 'httpcache/store': 37,
 'httperror/response_ignored_count': 353,
 'httperror/response_ignored_status_count/502': 43,
 'httperror/response_ignored_status_count/503': 288,
 'httperror/response_ignored_status_count/504': 22,
 'item_scraped_count': 188,
 'log_count/DEBUG': 1673,
 'log_count/ERROR': 353,
 'log_count/INFO': 372,
 'log_count/WARNING': 2,
 'memusage/max': 123432960,
 'memusage/startup': 71020544,
 'request_depth_count/0': 420,
 'request_depth_count/1': 21706,
 'request_depth_max': 1,
 'response_received_count': 773,
 'retry/count': 709,
 'retry/max_reached': 353,
 'retry/reason_count/502 Bad Gateway': 86,
 'retry/reason_count/503 Service Unavailable': 576,
 'retry/reason_count/504 Gateway Time-out': 43,
 'retry/reason_count/twisted.internet.error.TimeoutError': 4,
 'scheduler/dequeued/redis': 2255,
 'scheduler/enqueued/redis': 2255,
 'splash/render.html/request_count': 773,
 'splash/render.html/response_count/200': 420,
 'splash/render.html/response_count/502': 129,
 'splash/render.html/response_count/503': 864,
 'splash/render.html/response_count/504': 65,
 'start_time': datetime.datetime(2022, 5, 26, 13, 7, 40, 984709),
 'urllength/request_ignored_count': 2}
[scrapy.core.engine] INFO: Spider closed (finished)
```
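One thing I notice in these stats: `request_depth_max` is only 1, and `bloomfilter/filtered` (21704) is almost equal to `request_depth_count/1` (21706). So it may be that nearly every depth-1 request is discarded by the Bloom-filter dedupe before any depth-2 request is ever scheduled, in which case `DEPTH_LIMIT = 5` would simply never be reached. A possible A/B check (my own idea, using Scrapy's stock scheduler and the dupefilter that scrapy-splash ships):

```python
import scrapy

# Hypothetical A/B check: rerun the spider with Scrapy's default
# scheduler and scrapy-splash's dupefilter, to see whether the
# Bloom-filter dedupe, rather than DEPTH_LIMIT, is what stops the
# crawl at depth 1.
class DepthCheckSpider(scrapy.Spider):
    name = 'dynamicCrawlerDepthCheck'  # hypothetical name for the test run
    custom_settings = {
        'SCHEDULER': 'scrapy.core.scheduler.Scheduler',             # Scrapy's default
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',  # from scrapy-splash
    }
```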
What should I do to make `DEPTH_LIMIT` work correctly?