forked from amineboutarfi/google_maps_scraper
Commit 0e8a69a (0 parents)
Showing 6 changed files with 153 additions and 0 deletions.
.gitignore
@@ -0,0 +1 @@
venv
README.md
@@ -0,0 +1,19 @@
# Google Maps Scraper

This is a simple scraper that uses Playwright to extract data from Google Maps.

This example is made for educational purposes.

The script is easy to customize.

Check both the Excel and CSV files (google_maps_data) to see what the final data looks like.

## To Install:
- (Optional: create & activate a virtual environment) `virtualenv venv`, then `source venv/bin/activate`

- `pip install -r requirements.txt`
- `playwright install chromium`

## To Run:
- `python3 main.py -l=<location> -s=<profession>`
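- For example (the search term and location here are just an illustration): `python3 main.py -l=london -s=dentist`
- If no arguments are passed, the script falls back to the default query `dentist new york`.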
google_maps_data.csv
@@ -0,0 +1,6 @@
name,address,website,phone_number
Temple Dental,"Fleet Street Clinic, 29 Fleet St, Temple, London EC4Y 1AA, Royaume-Uni",templedental.com,+44 20 8175 8892
The Welbeck Clinic - Cosmetic Dentist,"20 Welbeck St, London W1G 8ED, Royaume-Uni",thewelbeckclinic.co.uk,+44 20 7486 8100
London Dental Centre,"109 Lever St, London EC1V 3RQ, Royaume-Uni",thelondondentalcentre.co.uk,+44 20 7608 0806
Dental Smiles London Chalton Street,"30-32 Chalton St, London NW1 1JB, Royaume-Uni",dentalsmileslondon.co.uk,+44 20 3757 5272
Pall Mall Dental London,"15 Pall Mall, St. James's, London SW1Y 5LU, Royaume-Uni",pallmalldental.co.uk,+44 20 7766 7150
google_maps_data.xlsx
Binary file not shown.
main.py
@@ -0,0 +1,116 @@
"""This script serves as an example on how to use Python | ||
& Playwright to scrape/extract data from Google Maps""" | ||
|
||
from playwright.sync_api import sync_playwright | ||
from dataclasses import dataclass, asdict, field | ||
import pandas as pd | ||
import argparse | ||
|
||
|
||
@dataclass | ||
class Business: | ||
"""holds business data | ||
""" | ||
name: str = None | ||
address: str = None | ||
website: str = None | ||
phone_number: str = None | ||
|
||
@dataclass | ||
class BusinessList: | ||
"""holds list of Business objects, | ||
and save to both excel and csv | ||
""" | ||
business_list : list[Business] = field(default_factory=list) | ||
|
||
def dataframe(self): | ||
"""transform business_list to pandas dataframe | ||
Returns: pandas dataframe | ||
""" | ||
return pd.json_normalize((asdict(business) for business in self.business_list), sep="_") | ||
|
||
def save_to_excel(self, filename): | ||
"""saves pandas dataframe to excel (xlsx) file | ||
Args: | ||
filename (str): filename | ||
""" | ||
self.dataframe().to_excel(f'{filename}.xlsx', index=False) | ||
|
||
def save_to_csv(self, filename): | ||
"""saves pandas dataframe to csv file | ||
Args: | ||
filename (str): filename | ||
""" | ||
self.dataframe().to_csv(f'{filename}.csv', index=False) | ||
|
||
|
||
def main(): | ||
|
||
with sync_playwright() as p: | ||
|
||
browser = p.chromium.launch(headless=False) | ||
page = browser.new_page() | ||
|
||
page.goto('https://www.google.com/maps', timeout=60000) | ||
# wait is added for dev phase. can remove it in production | ||
page.wait_for_timeout(5000) | ||
|
||
page.locator('//input[@id="searchboxinput"]').fill(search_for) | ||
page.wait_for_timeout(3000) | ||
|
||
page.keyboard.press('Enter') | ||
page.wait_for_timeout(5000) | ||
|
||
###################################################### | ||
# need to add scrolling features here if needed # | ||
# lot of business data as explained in Youtube video # | ||
###################################################### | ||
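        # --- optional scrolling sketch (not part of the original script) ---
        # Google Maps only renders a handful of results until the results
        # panel is scrolled. A minimal approach, assuming the panel is the
        # element with role="feed" (that selector is an assumption and may
        # need adjusting), is to hover over it and keep scrolling until the
        # number of listings stops growing:
        #
        # page.locator('//div[@role="feed"]').hover()
        # previously_counted = 0
        # while True:
        #     page.mouse.wheel(0, 10000)
        #     page.wait_for_timeout(2000)
        #     found = page.locator('//div[@role="article"]').count()
        #     if found == previously_counted:
        #         break  # no new listings loaded; stop scrolling
        #     previously_counted = found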
        listings = page.locator('//div[@role="article"]').all()

        business_list = BusinessList()

        # getting the first five listings only
        for listing in listings[:5]:

            listing.click()
            page.wait_for_timeout(5000)

            name_xpath = '//h1[contains(@class, "fontHeadlineLarge")]/span[2]'
            address_xpath = '//button[@data-item-id="address"]//div[contains(@class, "fontBodyMedium")]'
            website_xpath = '//a[@data-item-id="authority"]//div[contains(@class, "fontBodyMedium")]'
            phone_number_xpath = '//button[contains(@data-item-id, "phone:tel:")]//div[contains(@class, "fontBodyMedium")]'

            business = Business()
            business.name = page.locator(name_xpath).inner_text()
            business.address = page.locator(address_xpath).inner_text()
            business.website = page.locator(website_xpath).inner_text()
            business.phone_number = page.locator(phone_number_xpath).inner_text()

            business_list.business_list.append(business)

        # saving to both Excel and CSV just to showcase both features
        business_list.save_to_excel('google_maps_data')
        business_list.save_to_csv('google_maps_data')

        browser.close()


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--search", type=str)
    parser.add_argument("-l", "--location", type=str)
    args = parser.parse_args()

    if args.location and args.search:
        search_for = f'{args.search} {args.location}'
    else:
        # in case no arguments are passed,
        # the scraper will search for this on Google Maps
        search_for = 'dentist new york'

    main(search_for)
requirements.txt
@@ -0,0 +1,11 @@
et-xmlfile==1.1.0
greenlet==2.0.1
numpy==1.24.2
openpyxl==3.1.1
pandas==1.5.3
playwright==1.30.0
pyee==9.0.4
python-dateutil==2.8.2
pytz==2022.7.1
six==1.16.0
typing_extensions==4.5.0