-
Notifications
You must be signed in to change notification settings - Fork 6
/
001_Retrieve_all_urls.py
78 lines (66 loc) · 2.67 KB
/
001_Retrieve_all_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# -*- coding: utf-8 -*-
"""
*** To be able to run this script, selenium and google chrome webdriver are needed.
This script will:
1. retrieve the URL links for all districts in Bangkok, Thailand, listed on the hipflat website.
2. retrieve the URL links for all condominiums listed in each district.
3. save the data to condo_links_all.txt
"""
# Import packages
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from datetime import datetime
import time
import pandas as pd
import pickle as pk
# Create an 'instance' of the driver, using the chromedriver binary at CHROMEDRIVER_PATH.
# A new Chrome window should open up if options.headless = False (default);
# with options.headless = True, as set here, the browser runs without a visible window.
# NOTE(review): passing the driver path positionally and using the
# `chrome_options=` keyword looks like the older Selenium calling convention --
# confirm both are still accepted by the Selenium version installed.
CHROMEDRIVER_PATH = "./_chromedriver/chromedriver.exe"
options = Options()
options.headless = True
driver = webdriver.Chrome(CHROMEDRIVER_PATH, chrome_options=options)
# Open the main page, where all condos in Bangkok are grouped by district.
url ='https://www.hipflat.co.th/en/market/condo-bangkok-skik'
driver.get(url)
# Write function to scrape all links from the webpage.
def get_all_links(driver):
    """Collect the link targets of the directory entries on the current page.

    Args:
        driver: A Selenium WebDriver that has already loaded the page to scan.

    Returns:
        A list with the ``href`` value of each element whose class name is
        ``directories__lists-element-name``, in the order Selenium returns
        them. Elements that have no ``href`` attribute are skipped.
    """
    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements(by, value) call is used instead of the legacy
    # find_elements_by_class_name helper, which newer Selenium releases dropped.
    elements = driver.find_elements("class name", "directories__lists-element-name")
    hrefs = (element.get_attribute("href") for element in elements)
    # get_attribute yields None when the attribute is missing; a None entry
    # is not a usable link, so leave it out of the result.
    return [href for href in hrefs if href is not None]
# Scrape the district links from the main page into 'district_links',
# timing how long the scrape takes.
elapsed_template = 'Time elapsed (hh:mm:ss.ms) {}'
start_time = datetime.now()
district_links = get_all_links(driver)
time_elapsed = datetime.now() - start_time
print(elapsed_template.format(time_elapsed))
# Sanity check: print how many district links were found.
# Bangkok has 50 districts, see
# https://en.wikipedia.org/wiki/List_of_districts_of_Bangkok
district_count = len(district_links)
print(district_count)
# Re-run the function on every district page to retrieve all condo links.
# Each district's links are appended to 'condo_links' as their own list,
# so 'condo_links' ends up as a list of lists (one inner list per district).
start_time = datetime.now()
condo_links = []
# implicitly_wait - specifies the amount of time the driver should wait
# when searching for an element if it is not immediately present.
# The setting stays in effect for the driver session, so it is configured
# once here instead of on every loop iteration.
driver.implicitly_wait(10)
try:
    for district in district_links:
        # Progress output: districts scraped so far, then the URL being opened.
        print(len(condo_links), district)
        driver.get(district)
        condo_links.append(get_all_links(driver))
finally:
    # Nothing later in this script uses the browser. Quit it even when a page
    # fails, so no headless Chrome / chromedriver process is left running.
    driver.quit()
time_elapsed = datetime.now() - start_time
print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))
print("completed")
# 'condo_links' holds one list of links per district (a nested list).
# Flatten it into a single list, 'condo_links_all', that contains the
# elements of every sub-list.
from itertools import chain
condo_links_all = list(chain(*condo_links))
total_listings = len(condo_links_all)
print("Total condo listings = " + str(total_listings))
# The original run resulted in 2566 condo listings.
# Dump the retrieved links to a text file, one link per line.
with open("condo_links_all.txt", "w") as out_file:
    out_file.writelines(str(link) + "\n" for link in condo_links_all)
print("completed")