Skip to content

Commit

Permalink
v2: added reviews scraping + scrolling
Browse files Browse the repository at this point in the history
  • Loading branch information
amineboutarfi committed Apr 24, 2023
1 parent 6e1875c commit 88deebc
Show file tree
Hide file tree
Showing 7 changed files with 60 additions and 27 deletions.
Binary file added .DS_Store
Binary file not shown.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
venv
venv
dev
File renamed without changes.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ check both Excel & CSV files (google_maps_data) to see how final data will look
- `playwright install chromium`

## to Run:
- `python3 main.py -l=<location> -s=<profession>`
- `python3 main.py -s=<what & where to search for> -t=<how many>`

13 changes: 7 additions & 6 deletions google_maps_data.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name,address,website,phone_number
The Welbeck Clinic,"20 Welbeck St, London W1G 8ED, Royaume-Uni",thewelbeckclinic.co.uk,+44 20 7486 8100
French Dentist London,"71 Queen's Gate, South Kensington, London SW7 5JT, Royaume-Uni",drsadone.com,+44 20 7373 6899
London Dental Centre,"109 Lever St, London EC1V 3RQ, Royaume-Uni",thelondondentalcentre.co.uk,+44 20 7608 0806
Dental Smiles London Chalton Street,"30-32 Chalton St, London NW1 1JB, Royaume-Uni",dentalsmileslondon.co.uk,+44 20 3757 5272
Pall Mall Dental London,"15 Pall Mall, St. James's, London SW1Y 5LU, Royaume-Uni",pallmalldental.co.uk,+44 20 7766 7150
name,address,website,phone_number,reviews_count,reviews_average
Dr Pascal MARIN,"5 Rue Crétet, 75009 Paris, France",doctolib.fr,+33 6 75 15 49 16,5,3.4
Dr. Charlotte Parment,"cabinet médical ipso Saint Martin, 323 Rue Saint-Martin, 75003 Paris, France",ipso.paris,,4,5.0
Dr Claire Paris,"86 Rue de l'Université, 75007 Paris, France",,+33 1 40 62 95 28,6,5.0
Dr Nancy Salzman,"1 Av. de Lowendal, 75007 Paris, France",doctor-salzman.com,+33 1 45 63 18 43,35,4.8
Docteur Franck Besse,"45 Rue de Lancry, 75010 Paris, France",doctolib.fr,+33 1 44 85 26 83,24,4.2
Docteur Simon OHAYON- English speaking doctor- International medical center,"48 BIS Rue des Belles Feuilles, 75116 Paris, France",doctolib.fr,+33 6 58 80 18 38,94,4.1
Binary file modified google_maps_data.xlsx
Binary file not shown.
69 changes: 50 additions & 19 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ class Business:
address: str = None
website: str = None
phone_number: str = None
reviews_count: int = None
reviews_average: float = None

@dataclass
class BusinessList:
Expand Down Expand Up @@ -64,21 +66,24 @@ def main():
page.keyboard.press('Enter')
page.wait_for_timeout(5000)

#################
### scrolling ###
#################
# scrolling
page.hover('(//div[@role="article"])[1]')
# If you needed more data, change 7 by a bigger number
for i in range(7):

while True:
page.mouse.wheel(0, 10000)
page.wait_for_timeout(3000)

listings = page.locator('//div[@role="article"]').all()

if page.locator('//div[@role="article"]').count() >= total:
listings = page.locator('//div[@role="article"]').all()[:total]
print(f'Total Scraped: {len(listings)}')
break
else:
print(f'Currently Scraped: ', page.locator('//div[@role="article"]').count())

business_list = BusinessList()

# getting first five only
for listing in listings[:5]:
# scraping
for listing in listings:

listing.click()
page.wait_for_timeout(5000)
Expand All @@ -87,13 +92,33 @@ def main():
address_xpath = '//button[@data-item-id="address"]//div[contains(@class, "fontBodyMedium")]'
website_xpath = '//a[@data-item-id="authority"]//div[contains(@class, "fontBodyMedium")]'
phone_number_xpath = '//button[contains(@data-item-id, "phone:tel:")]//div[contains(@class, "fontBodyMedium")]'
reviews_span_xpath = '//span[@role="img"]'

business = Business()
business.name = page.locator(name_xpath).inner_text()
business.address = page.locator(address_xpath).inner_text()
business.website = page.locator(website_xpath).inner_text()
business.phone_number = page.locator(phone_number_xpath).inner_text()


if page.locator(name_xpath).count() > 0:
business.name = page.locator(name_xpath).inner_text()
else:
business.name = ''
if page.locator(address_xpath).count() > 0:
business.address = page.locator(address_xpath).inner_text()
else:
business.address = ''
if page.locator(website_xpath).count() > 0:
business.website = page.locator(website_xpath).inner_text()
else:
business.website = ''
if page.locator(phone_number_xpath).count() > 0:
business.phone_number = page.locator(phone_number_xpath).inner_text()
else:
business.phone_number = ''
if listing.locator(reviews_span_xpath).count() > 0:
business.reviews_average = float(listing.locator(reviews_span_xpath).get_attribute('aria-label').split()[0].replace(',','.').strip())
business.reviews_count = int(listing.locator(reviews_span_xpath).get_attribute('aria-label').split()[2].strip())
else:
business.reviews_average = ''
business.reviews_count = ''

business_list.business_list.append(business)

# saving to both excel and csv just to showcase the features.
Expand All @@ -107,14 +132,20 @@ def main():

parser = argparse.ArgumentParser()
parser.add_argument("-s", "--search", type=str)
parser.add_argument("-l", "--location", type=str)
parser.add_argument("-t", "--total", type=int)
args = parser.parse_args()

if args.location and args.search:
search_for = f'{args.search} {args.location}'
if args.search:
search_for = args.search
else:
# in case no arguments passed:
# scraper will search for this on Google Maps
# in case no arguments passed
# the scraper will search by default for:
search_for = 'dentist new york'

# total number of business listings to scrape. Default is 10
if args.total:
total = args.total
else:
total = 10

main()

0 comments on commit 88deebc

Please sign in to comment.