added files to repo
amineboutarfi committed Feb 18, 2023
0 parents commit 0e8a69a
Showing 6 changed files with 153 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
venv
19 changes: 19 additions & 0 deletions README.md
@@ -0,0 +1,19 @@
# Google Maps Scraper

This is a simple scraper that uses Playwright to extract data from Google Maps.

This example is made for educational purposes.

This script is easy to customize.

Check both the Excel & CSV files (google_maps_data) to see what the final data will look like.

## To Install:
- (Optional) Create and activate a virtual environment: `virtualenv venv`, then `source venv/bin/activate`

- `pip install -r requirements.txt`
- `playwright install chromium`

## To Run:
- `python3 main.py -l=<location> -s=<profession>`
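- For example (sample values only, matching the script's default query of `dentist new york`): `python3 main.py -s=dentist -l="new york"`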

6 changes: 6 additions & 0 deletions google_maps_data.csv
@@ -0,0 +1,6 @@
name,address,website,phone_number
Temple Dental,"Fleet Street Clinic, 29 Fleet St, Temple, London EC4Y 1AA, Royaume-Uni",templedental.com,+44 20 8175 8892
The Welbeck Clinic - Cosmetic Dentist,"20 Welbeck St, London W1G 8ED, Royaume-Uni",thewelbeckclinic.co.uk,+44 20 7486 8100
London Dental Centre,"109 Lever St, London EC1V 3RQ, Royaume-Uni",thelondondentalcentre.co.uk,+44 20 7608 0806
Dental Smiles London Chalton Street,"30-32 Chalton St, London NW1 1JB, Royaume-Uni",dentalsmileslondon.co.uk,+44 20 3757 5272
Pall Mall Dental London,"15 Pall Mall, St. James's, London SW1Y 5LU, Royaume-Uni",pallmalldental.co.uk,+44 20 7766 7150
Binary file added google_maps_data.xlsx
Binary file not shown.
116 changes: 116 additions & 0 deletions main.py
@@ -0,0 +1,116 @@
"""This script serves as an example on how to use Python
& Playwright to scrape/extract data from Google Maps"""

from playwright.sync_api import sync_playwright
from dataclasses import dataclass, asdict, field
import pandas as pd
import argparse


@dataclass
class Business:
    """holds business data"""

    name: str = None
    address: str = None
    website: str = None
    phone_number: str = None


@dataclass
class BusinessList:
    """holds a list of Business objects,
    and saves them to both Excel and CSV
    """

    business_list: list[Business] = field(default_factory=list)

    def dataframe(self):
        """transforms business_list to a pandas dataframe

        Returns: pandas dataframe
        """
        return pd.json_normalize(
            (asdict(business) for business in self.business_list), sep="_"
        )

    def save_to_excel(self, filename):
        """saves pandas dataframe to an Excel (xlsx) file

        Args:
            filename (str): filename
        """
        self.dataframe().to_excel(f'{filename}.xlsx', index=False)

    def save_to_csv(self, filename):
        """saves pandas dataframe to a csv file

        Args:
            filename (str): filename
        """
        self.dataframe().to_csv(f'{filename}.csv', index=False)


def main():

    with sync_playwright() as p:

        browser = p.chromium.launch(headless=False)
        page = browser.new_page()

        page.goto('https://www.google.com/maps', timeout=60000)
        # wait is added for the dev phase; it can be removed in production
        page.wait_for_timeout(5000)

        # search_for is set in the __main__ block below
        page.locator('//input[@id="searchboxinput"]').fill(search_for)
        page.wait_for_timeout(3000)

        page.keyboard.press('Enter')
        page.wait_for_timeout(5000)

        ######################################################
        # add scrolling here if you need more business data, #
        # as explained in the YouTube video (sketch below)   #
        ######################################################
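        # A possible scrolling sketch (an assumption, not part of the original script):
        # the results panel is assumed to expose role="feed"; hover over it so
        # mouse-wheel events scroll the listings, then scroll a few times, pausing
        # for new results to load. Uncomment to try it:
        #
        # page.hover('//div[@role="feed"]')
        # for _ in range(5):
        #     page.mouse.wheel(0, 10000)
        #     page.wait_for_timeout(2000)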

        listings = page.locator('//div[@role="article"]').all()

        business_list = BusinessList()

        # getting first five only
        for listing in listings[:5]:

            listing.click()
            page.wait_for_timeout(5000)

            name_xpath = '//h1[contains(@class, "fontHeadlineLarge")]/span[2]'
            address_xpath = '//button[@data-item-id="address"]//div[contains(@class, "fontBodyMedium")]'
            website_xpath = '//a[@data-item-id="authority"]//div[contains(@class, "fontBodyMedium")]'
            phone_number_xpath = '//button[contains(@data-item-id, "phone:tel:")]//div[contains(@class, "fontBodyMedium")]'

            business = Business()
            business.name = page.locator(name_xpath).inner_text()
            business.address = page.locator(address_xpath).inner_text()
            business.website = page.locator(website_xpath).inner_text()
            business.phone_number = page.locator(phone_number_xpath).inner_text()

            business_list.business_list.append(business)

        # saving to both excel and csv just to showcase the features.
        business_list.save_to_excel('google_maps_data')
        business_list.save_to_csv('google_maps_data')

        browser.close()


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--search", type=str)
    parser.add_argument("-l", "--location", type=str)
    args = parser.parse_args()

    if args.location and args.search:
        search_for = f'{args.search} {args.location}'
    else:
        # in case no arguments passed:
        # scraper will search for this on Google Maps
        search_for = 'dentist new york'

    main()
11 changes: 11 additions & 0 deletions requirements.txt
@@ -0,0 +1,11 @@
et-xmlfile==1.1.0
greenlet==2.0.1
numpy==1.24.2
openpyxl==3.1.1
pandas==1.5.3
playwright==1.30.0
pyee==9.0.4
python-dateutil==2.8.2
pytz==2022.7.1
six==1.16.0
typing_extensions==4.5.0
