v2: added reviews scraping + scrolling

fabriciosprj · Apr 24, 2023 · 88deebc · 88deebc
1 parent 6e1875c
commit 88deebc
Show file tree

Hide file tree

Showing 7 changed files with 60 additions and 27 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
-venv
+venv
+dev
diff --git a/LICENSE.txt → LICENSE b/LICENSE.txt → LICENSE
diff --git a/README.md b/README.md
@@ -15,5 +15,5 @@ check both Excel & CSV files (google_maps_data) to see how final data will look
 - `playwright install chromium`
 
 ## to Run:
-- `python3 main.py -l=<location> -s=<profession>`
+- `python3 main.py -s=<what & where to search for> -t=<how many>`
 
diff --git a/google_maps_data.csv b/google_maps_data.csv
@@ -1,6 +1,7 @@
-name,address,website,phone_number
-The Welbeck Clinic,"20 Welbeck St, London W1G 8ED, Royaume-Uni",thewelbeckclinic.co.uk,+44 20 7486 8100
-French Dentist London,"71 Queen's Gate, South Kensington, London SW7 5JT, Royaume-Uni",drsadone.com,+44 20 7373 6899
-London Dental Centre,"109 Lever St, London EC1V 3RQ, Royaume-Uni",thelondondentalcentre.co.uk,+44 20 7608 0806
-Dental Smiles London Chalton Street,"30-32 Chalton St, London NW1 1JB, Royaume-Uni",dentalsmileslondon.co.uk,+44 20 3757 5272
-Pall Mall Dental London,"15 Pall Mall, St. James's, London SW1Y 5LU, Royaume-Uni",pallmalldental.co.uk,+44 20 7766 7150
+name,address,website,phone_number,reviews_count,reviews_average
+Dr Pascal MARIN,"5 Rue Crétet, 75009 Paris, France",doctolib.fr,+33 6 75 15 49 16,5,3.4
+Dr. Charlotte Parment,"cabinet médical ipso Saint Martin, 323 Rue Saint-Martin, 75003 Paris, France",ipso.paris,,4,5.0
+Dr Claire Paris,"86 Rue de l'Université, 75007 Paris, France",,+33 1 40 62 95 28,6,5.0
+Dr Nancy Salzman,"1 Av. de Lowendal, 75007 Paris, France",doctor-salzman.com,+33 1 45 63 18 43,35,4.8
+Docteur Franck Besse,"45 Rue de Lancry, 75010 Paris, France",doctolib.fr,+33 1 44 85 26 83,24,4.2
+Docteur Simon OHAYON- English speaking doctor- International medical center,"48 BIS Rue des Belles Feuilles, 75116 Paris, France",doctolib.fr,+33 6 58 80 18 38,94,4.1
diff --git a/google_maps_data.xlsx b/google_maps_data.xlsx
diff --git a/main.py b/main.py
@@ -15,6 +15,8 @@ class Business:
     address: str = None
     website: str = None
     phone_number: str = None
+    reviews_count: int = None
+    reviews_average: float = None
 
 @dataclass
 class BusinessList:
@@ -64,21 +66,24 @@ def main():
         page.keyboard.press('Enter')
         page.wait_for_timeout(5000)
 
-        #################  
-        ### scrolling ###
-        #################
+        # scrolling 
         page.hover('(//div[@role="article"])[1]')
-        # If you needed more data, change 7 by a bigger number
-        for i in range(7): 
+
+        while True:
             page.mouse.wheel(0, 10000)
             page.wait_for_timeout(3000)
-
-        listings = page.locator('//div[@role="article"]').all()
+
+            if page.locator('//div[@role="article"]').count() >= total:
+                listings = page.locator('//div[@role="article"]').all()[:total]
+                print(f'Total Scraped: {len(listings)}')
+                break
+            else:
+                print(f'Currently Scraped: ', page.locator('//div[@role="article"]').count())
 
         business_list = BusinessList()
 
-        # getting first five only
-        for listing in listings[:5]:
+        # scraping
+        for listing in listings:
 
             listing.click()
             page.wait_for_timeout(5000)
@@ -87,13 +92,33 @@ def main():
             address_xpath = '//button[@data-item-id="address"]//div[contains(@class, "fontBodyMedium")]'
             website_xpath = '//a[@data-item-id="authority"]//div[contains(@class, "fontBodyMedium")]'
             phone_number_xpath = '//button[contains(@data-item-id, "phone:tel:")]//div[contains(@class, "fontBodyMedium")]'
+            reviews_span_xpath = '//span[@role="img"]'
 
             business = Business()
-            business.name = page.locator(name_xpath).inner_text()
-            business.address = page.locator(address_xpath).inner_text()
-            business.website = page.locator(website_xpath).inner_text()
-            business.phone_number = page.locator(phone_number_xpath).inner_text()
-
+
+            if page.locator(name_xpath).count() > 0:
+                business.name = page.locator(name_xpath).inner_text()
+            else:
+                business.name = ''
+            if page.locator(address_xpath).count() > 0:
+                business.address = page.locator(address_xpath).inner_text()
+            else:
+                business.address = ''
+            if page.locator(website_xpath).count() > 0:
+                business.website = page.locator(website_xpath).inner_text()
+            else:
+                business.website = ''
+            if page.locator(phone_number_xpath).count() > 0:
+                business.phone_number = page.locator(phone_number_xpath).inner_text()
+            else:
+                business.phone_number = ''
+            if listing.locator(reviews_span_xpath).count() > 0:
+                business.reviews_average = float(listing.locator(reviews_span_xpath).get_attribute('aria-label').split()[0].replace(',','.').strip())
+                business.reviews_count = int(listing.locator(reviews_span_xpath).get_attribute('aria-label').split()[2].strip())
+            else:
+                business.reviews_average = ''
+                business.reviews_count = ''
+
             business_list.business_list.append(business)
 
         # saving to both excel and csv just to showcase the features.
@@ -107,14 +132,20 @@ def main():
 
     parser = argparse.ArgumentParser()
     parser.add_argument("-s", "--search", type=str)
-    parser.add_argument("-l", "--location", type=str)
+    parser.add_argument("-t", "--total", type=int)
     args = parser.parse_args()
 
-    if args.location and args.search:
-        search_for = f'{args.search}  {args.location}'
+    if args.search:
+        search_for = args.search
     else:
-        # in case no arguments passed:
-        # scraper will search for this on Google Maps
+        # in case no arguments are passed,
+        # the scraper will search by default for:
         search_for = 'dentist new york'
+
+    # total number of business listings to scrape. Default is 10
+    if args.total:
+        total = args.total
+    else:
+        total = 10
 
     main()
-Original file line number
+Diff line change
@@ -1 +1,2 @@
-    venv
+    venv
+    dev