added files to repo
amineboutarfi committed Feb 18, 2023
0 parents commit 0e8a69a
Showing 6 changed files with 153 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
venv
19 changes: 19 additions & 0 deletions README.md
@@ -0,0 +1,19 @@
# Google Maps Scraper

This is a simple scraper that uses Playwright to extract data from Google Maps.

This example is made for educational purposes.

This script is easy to customize.

Check both the Excel & CSV files (google_maps_data) to see what the final data will look like.

## To Install:
- (Optional) Create and activate a virtual environment: `virtualenv venv`, then `source venv/bin/activate`

- `pip install -r requirements.txt`
- `playwright install chromium`

## To Run:
- `python3 main.py -l=<location> -s=<profession>`
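- For example (sample values only, matching the script's default query of `dentist new york`): `python3 main.py -s=dentist -l="new york"`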

6 changes: 6 additions & 0 deletions google_maps_data.csv
@@ -0,0 +1,6 @@
name,address,website,phone_number
Temple Dental,"Fleet Street Clinic, 29 Fleet St, Temple, London EC4Y 1AA, Royaume-Uni",templedental.com,+44 20 8175 8892
The Welbeck Clinic - Cosmetic Dentist,"20 Welbeck St, London W1G 8ED, Royaume-Uni",thewelbeckclinic.co.uk,+44 20 7486 8100
London Dental Centre,"109 Lever St, London EC1V 3RQ, Royaume-Uni",thelondondentalcentre.co.uk,+44 20 7608 0806
Dental Smiles London Chalton Street,"30-32 Chalton St, London NW1 1JB, Royaume-Uni",dentalsmileslondon.co.uk,+44 20 3757 5272
Pall Mall Dental London,"15 Pall Mall, St. James's, London SW1Y 5LU, Royaume-Uni",pallmalldental.co.uk,+44 20 7766 7150
Binary file added google_maps_data.xlsx
Binary file not shown.
116 changes: 116 additions & 0 deletions main.py
@@ -0,0 +1,116 @@
"""This script serves as an example on how to use Python
& Playwright to scrape/extract data from Google Maps"""

from playwright.sync_api import sync_playwright
from dataclasses import dataclass, asdict, field
import pandas as pd
import argparse


@dataclass
class Business:
    """holds business data"""

    name: str = None
    address: str = None
    website: str = None
    phone_number: str = None


@dataclass
class BusinessList:
    """holds a list of Business objects,
    and saves them to both Excel and CSV
    """

    business_list: list[Business] = field(default_factory=list)

    def dataframe(self):
        """transforms business_list to a pandas dataframe

        Returns: pandas dataframe
        """
        return pd.json_normalize(
            (asdict(business) for business in self.business_list), sep="_"
        )

    def save_to_excel(self, filename):
        """saves pandas dataframe to an Excel (xlsx) file

        Args:
            filename (str): filename
        """
        self.dataframe().to_excel(f'{filename}.xlsx', index=False)

    def save_to_csv(self, filename):
        """saves pandas dataframe to a csv file

        Args:
            filename (str): filename
        """
        self.dataframe().to_csv(f'{filename}.csv', index=False)


def main():

    with sync_playwright() as p:

        browser = p.chromium.launch(headless=False)
        page = browser.new_page()

        page.goto('https://www.google.com/maps', timeout=60000)
        # wait is added for the dev phase; it can be removed in production
        page.wait_for_timeout(5000)

        # search_for is set in the __main__ block below
        page.locator('//input[@id="searchboxinput"]').fill(search_for)
        page.wait_for_timeout(3000)

        page.keyboard.press('Enter')
        page.wait_for_timeout(5000)

        ######################################################
        # add scrolling here if you need more business data, #
        # as explained in the YouTube video (sketch below)   #
        ######################################################
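        # A possible scrolling sketch (an assumption, not part of the original script):
        # the results panel is assumed to expose role="feed"; hover over it so
        # mouse-wheel events scroll the listings, then scroll a few times, pausing
        # for new results to load. Uncomment to try it:
        #
        # page.hover('//div[@role="feed"]')
        # for _ in range(5):
        #     page.mouse.wheel(0, 10000)
        #     page.wait_for_timeout(2000)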

        listings = page.locator('//div[@role="article"]').all()

        business_list = BusinessList()

        # getting first five only
        for listing in listings[:5]:

            listing.click()
            page.wait_for_timeout(5000)

            name_xpath = '//h1[contains(@class, "fontHeadlineLarge")]/span[2]'
            address_xpath = '//button[@data-item-id="address"]//div[contains(@class, "fontBodyMedium")]'
            website_xpath = '//a[@data-item-id="authority"]//div[contains(@class, "fontBodyMedium")]'
            phone_number_xpath = '//button[contains(@data-item-id, "phone:tel:")]//div[contains(@class, "fontBodyMedium")]'

            business = Business()
            business.name = page.locator(name_xpath).inner_text()
            business.address = page.locator(address_xpath).inner_text()
            business.website = page.locator(website_xpath).inner_text()
            business.phone_number = page.locator(phone_number_xpath).inner_text()

            business_list.business_list.append(business)

        # saving to both excel and csv just to showcase the features.
        business_list.save_to_excel('google_maps_data')
        business_list.save_to_csv('google_maps_data')

        browser.close()


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--search", type=str)
    parser.add_argument("-l", "--location", type=str)
    args = parser.parse_args()

    if args.location and args.search:
        search_for = f'{args.search} {args.location}'
    else:
        # in case no arguments passed:
        # scraper will search for this on Google Maps
        search_for = 'dentist new york'

    main()
11 changes: 11 additions & 0 deletions requirements.txt
@@ -0,0 +1,11 @@
et-xmlfile==1.1.0
greenlet==2.0.1
numpy==1.24.2
openpyxl==3.1.1
pandas==1.5.3
playwright==1.30.0
pyee==9.0.4
python-dateutil==2.8.2
pytz==2022.7.1
six==1.16.0
typing_extensions==4.5.0
