From 94877ef533a2b1f160bea9d328376b49faf401bd Mon Sep 17 00:00:00 2001 From: Manu Date: Tue, 5 Apr 2016 13:15:47 +0800 Subject: [PATCH 1/2] Fix image search selectors for Bing and Google. --- GoogleScraper/parsing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GoogleScraper/parsing.py b/GoogleScraper/parsing.py index 09fd4b41..b2ea3281 100644 --- a/GoogleScraper/parsing.py +++ b/GoogleScraper/parsing.py @@ -404,7 +404,7 @@ class GoogleParser(Parser): image_search_selectors = { 'results': { 'de_ip': { - 'container': 'li#isr_mc', + 'container': '#isr_mc', 'result_container': 'div.rg_di', 'link': 'a.rg_l::attr(href)' }, @@ -626,7 +626,7 @@ class BingParser(Parser): 'ch_ip': { 'container': '#dg_c .imgres', 'result_container': '.dg_u', - 'link': 'a.dv_i::attr(m)' + 'link': 'a::attr(m)' }, } } From 8fe8a61acb96afd26b8682814fbaf24bf6a3d2f5 Mon Sep 17 00:00:00 2001 From: Manu Date: Tue, 5 Apr 2016 22:10:37 +0800 Subject: [PATCH 2/2] next page fixes? --- GoogleScraper/core.py | 2 +- GoogleScraper/search_engine_parameters.py | 2 +- GoogleScraper/selenium_mode.py | 21 ++++++++------------- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/GoogleScraper/core.py b/GoogleScraper/core.py index 9d2bf094..4e678b51 100755 --- a/GoogleScraper/core.py +++ b/GoogleScraper/core.py @@ -456,4 +456,4 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None): session.commit() if return_results: - return scraper_search + return session diff --git a/GoogleScraper/search_engine_parameters.py b/GoogleScraper/search_engine_parameters.py index a3db0505..ace7162e 100644 --- a/GoogleScraper/search_engine_parameters.py +++ b/GoogleScraper/search_engine_parameters.py @@ -182,7 +182,7 @@ """ bing_search_params = { - + 'adlt': 'off' } """ diff --git a/GoogleScraper/selenium_mode.py b/GoogleScraper/selenium_mode.py index 04e70ea4..193dc767 100644 --- a/GoogleScraper/selenium_mode.py +++ b/GoogleScraper/selenium_mode.py @@ -58,7 +58,7 @@ class SelScrape(SearchEngineScrape, threading.Thread): 'google': '#pnnext', 'yandex': '.pager__button_kind_next', 'bing': '.sb_pagN', - 'yahoo': '#pg-next', + 'yahoo': '.compPagination .next', 'baidu': '.n', 'ask': '#paging div a.txt3.l_nu', 'blekko': '', @@ -455,13 +455,16 @@ def _find_next_page_element(self): WebDriverWait(self.webdriver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector))) except (WebDriverException, TimeoutException) as e: self._save_debug_screenshot() - raise Exception('{}: Cannot locate next page element: {}'.format(self.name, str(e))) + # raise Exception('{}: Cannot locate next page element: {}'.format(self.name, str(e))) return self.webdriver.find_element_by_css_selector(selector) elif self.search_type == 'image': self.page_down() - return True + if self.search_engine_name == 'google': + return self.webdriver.find_element_by_css_selector('input._kvc') + else: + return True def wait_until_serp_loaded(self): """ @@ -599,17 +602,9 @@ def page_down(self): Used for next page in image search mode or when the next results are obtained by scrolling down a page. """ - js = ''' - var w = window, - d = document, - e = d.documentElement, - g = d.getElementsByTagName('body')[0], - y = w.innerHeight|| e.clientHeight|| g.clientHeight; - - window.scrollBy(0,y); - return y; - ''' + js = 'window.scrollTo(0,document.body.scrollHeight);' + time.sleep(5) self.webdriver.execute_script(js) def run(self):