diff --git a/README.md b/README.md
index e43f1bc..2333671 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,17 @@
 # googlesearch
-googlesearch is a Python library for searching Google, easily. googlesearch uses requests and BeautifulSoup4 to scrape Google.
+
+googlesearch is a Python library for searching Google, easily. googlesearch uses requests and BeautifulSoup4 to scrape Google.
 
 ## Installation
+
 To install, run the following command:
+
 ```bash
 python3 -m pip install googlesearch-python
 ```
 
 ## Usage
+
 To get results for a search term, simply use the search function in googlesearch. For example, to get results for "Google" in Google, just run the following program:
 ```python
 from googlesearch import search
@@ -15,17 +19,23 @@ search("Google")
 ```
 ## Additional options
+
 googlesearch supports a few additional options. By default, googlesearch returns 10 results. This can be changed. To get a 100 results on Google for example, run the following program.
+
 ```python
 from googlesearch import search
 search("Google", num_results=100)
 ```
+
 In addition, you can change the language google searches in. 
 For example, to get results in French run the following program:
+
 ```python
 from googlesearch import search
 search("Google", lang="fr")
 ```
+
 To extract more information, such as the description or the result URL, use an advanced search:
+
 ```python
 from googlesearch import search
 search("Google", advanced=True)
@@ -35,8 +45,30 @@ search("Google", advanced=True)
 # - url
 # - description
 ```
+
+To also fetch sponsored results along with general search results, use the sponsored parameter:
+
+```python
+from googlesearch import search
+search("Google", sponsored=True)
+# Returns a list of SearchResult
+# Properties:
+# - title
+# - url
+# - description
+# - sponsored
+```
+
+To convert the search results to a Pandas DataFrame, use the to_df function:
+
+```python
+from googlesearch import to_df
+df = to_df(advanced_results)
+```
+
 If requesting more than 100 results, googlesearch will send multiple requests to go through the pages. To increase the time between these requests, use `sleep_interval`:
+
 ```python
 from googlesearch import search
 search("Google", sleep_interval=5, num_results=200)
-```
\ No newline at end of file
+```
diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py
index 74e6564..500112d 100644
--- a/googlesearch/__init__.py
+++ b/googlesearch/__init__.py
@@ -4,7 +4,7 @@ from requests import get
 from .user_agents import get_useragent
 import urllib
-
+import pandas as pd
 
 
 def _req(term, results, lang, start, proxies, timeout):
     resp = get(
@@ -14,30 +14,48 @@
         },
         params={
             "q": term,
-            "num": results + 2,  # Prevents multiple requests
+            "num": results,
             "hl": lang,
             "start": start,
         },
         proxies=proxies,
         timeout=timeout,
     )
+    resp.raise_for_status()
     return resp
 
 
 
+def to_df(search_results):
+    data = []
+    for index, result in enumerate(search_results):
+        result_data = {
+            'index': index + 1,
+            'url': result.url,
+            'title': result.title,
+            'description': result.description
+        }
+        if hasattr(result, 'is_sponsored'):
+            result_data['is_sponsored'] = result.is_sponsored
+        data.append(result_data)
+
+    df = pd.DataFrame(data)
+
+    return df
 class SearchResult:
-    def __init__(self, url, title, description):
+    def __init__(self, url, title, description, is_sponsored=False):
         self.url = url
         self.title = title
         self.description = description
+        self.is_sponsored = is_sponsored
 
     def __repr__(self):
-        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
-
-
-def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5):
-    """Search the Google search engine"""
-
+        if self.is_sponsored:
+            return f"SearchResult(url={self.url}, title={self.title}, description={self.description}, is_sponsored={self.is_sponsored})"
+        else:
+            return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
+
+def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, sponsored=False):
     escaped_term = urllib.parse.quote_plus(term) # make 'site:xxx.xxx.xxx ' works.
 
     # Proxy
@@ -52,29 +70,52 @@ search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
     start = 0
     while start < num_results:
         # Send request
         resp = _req(escaped_term, num_results - start,
                     lang, start, proxies, timeout)
 
         # Parse
         soup = BeautifulSoup(resp.text, "html.parser")
+
+        # Check for sponsored results
+        if sponsored:
+            sponsored_block = soup.find_all("div", attrs={"class": "vdQmEd"})
+            if len(sponsored_block) == 0:
+                start += 1
+            for sponsored_result in sponsored_block:
+                link = (sponsored_result.find("a", href=True, attrs={"class": "sVXRqc"}) or {}).get("href")
+                title = sponsored_result.find("span", attrs={"class": "OSrXXb"})
+                description_box = sponsored_result.find(lambda tag: tag.name == 'span' and not tag.has_attr('class'))
+
+                description = description_box.text if description_box else None
+
+                if link and title and description:
+                    start += 1
+                    if advanced:
+                        yield SearchResult(link, title.text, description, True)
+                    else:
+                        yield link
+
+        # Check for not sponsored results
         result_block = soup.find_all("div", attrs={"class": "g"})
-        if len(result_block) ==0:
+        if len(result_block) == 0:
             start += 1
         for result in result_block:
             # Find link, title, description
-            link = result.find("a", href=True)
+            link = (result.find("a", href=True) or {}).get("href")
             title = result.find("h3")
             description_box = result.find(
                 "div", {"style": "-webkit-line-clamp:2"})
+
             if description_box:
                 description = description_box.text
             if link and title and description:
                 start += 1
                 if advanced:
-                    yield SearchResult(link["href"], title.text, description)
+                    yield SearchResult(link, title.text, description, False)
                 else:
-                    yield link["href"]
+                    yield link
+
         sleep(sleep_interval)
 
     if start == 0:
         return []
diff --git a/requirements.txt b/requirements.txt
index 56399db..db44518 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 beautifulsoup4>=4.9
 requests>=2.20
+pandas