Glassdoor scraping scripts and update to indeed readme
Raghuram Pamidimarri authored and Raghuram Pamidimarri committed Mar 11, 2018
1 parent 29d3e2d commit c75dc12
Showing 5 changed files with 203 additions and 11 deletions.
6 changes: 6 additions & 0 deletions sandbox/glassdoor-scraping/GlassdoorCsvRow.py
@@ -0,0 +1,6 @@
class GlassdoorCsvRow:
    def __init__(self, title, location, url):
        self.title = title
        self.location = location
        self.url = url
    #enddef
23 changes: 23 additions & 0 deletions sandbox/glassdoor-scraping/README.md
@@ -0,0 +1,23 @@
How to scrape Glassdoor
-----------------------


## Command
python glassdoor-scraper.py
The command will write objects into S3 of the form salaries/title/location1/title_location1.json, salaries/title/location2/title_location2.json, etc. Files with the same structure are also written locally, in case we want to clean up or reprocess the data before uploading to S3. The local files also let us avoid re-scraping.
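
Each exported file is a JSON array of the scraped rows; the keys come from the script's Salary class, and the values below are only illustrative:

```json
[
    {
        "jobTitle": "Data Scientist",
        "company": "ExampleCorp",
        "meanPay": "$120,000"
    },
    {
        "jobTitle": "Senior Data Scientist",
        "company": "AnotherCo",
        "meanPay": "$141,000"
    }
]
```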

The script expects a file glassdoor-urls.csv to be present. This CSV should have rows with 3 columns - title, location, url. Note that the Glassdoor URLs are not easy to construct programmatically, so here are the steps to build this CSV (a sample is shown below the list):
1. Go to glassdoor.com, enter the job title and location, and hit enter.
2. Add the job title, location, and the URL that loads as a row in the CSV.
3. Name the file glassdoor-urls.csv and save it in this directory.
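
A sample glassdoor-urls.csv could look like the following (this row reuses the example city and URL from the commented-out options in the script; substitute the values from your own Glassdoor searches). Since Glassdoor URLs contain commas, quote the url column:

```csv
title,location,url
data-scientist,new-york-city,"https://www.glassdoor.com/Salaries/new-york-city-data-scientist-salary-SRCH_IL.0,13_IM615_KO14,28.htm"
```

The script strips the trailing `.htm` and appends `_IP<page>.htm` for pagination, so each URL should end in `.htm`.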

## How does this script scrape?
This script reuses a lot of code from https://github.com/ajbentley/glassdoor-salary-scraper.
That project is old and no longer worked, so I made changes on top of it: working around the Glassdoor login, updating the salary-page parsing, saving results to local disk, and uploading them to S3.

The script first logs into glassdoor.com with the credentials you set in the script (see the commented section for credentials). For every row in the CSV, it uses Selenium to launch a browser with that URL and parses out the salary, company, and location for every listing on the first 6 pages (~60 salary/company/location combinations).
Once this information is scraped, the data is first saved to local disk and then uploaded to S3.

## Next steps
glassdoor-urls-1.csv and glassdoor-urls-2.csv contain the titles and locations whose salaries we have already scraped. If there are other locations or job titles for which we want to scrape salaries, we can run the script again.
6 changes: 6 additions & 0 deletions sandbox/glassdoor-scraping/Salary.py
@@ -0,0 +1,6 @@
class Salary:
    def __init__(self, jobTitle, company, meanPay):
        self.jobTitle = jobTitle
        self.company = company
        self.meanPay = meanPay
    #enddef
144 changes: 144 additions & 0 deletions sandbox/glassdoor-scraping/glassdoor-scraper.py
@@ -0,0 +1,144 @@
import time
import json
import Salary
import csv
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from GlassdoorCsvRow import GlassdoorCsvRow
import boto3

username = "####" # your email here
password = "####" # your password here


# Manual options for the city, num pages to scrape, and URL
pages = 6
#cityName = "new-york-city"
#cityURL = "https://www.glassdoor.com/Salaries/new-york-city-data-scientist-salary-SRCH_IL.0,13_IM615_KO14,28.htm"
BUCKET_NAME = 'tech-salary-project'
s3 = boto3.resource('s3')

def obj_dict(obj):
    return obj.__dict__
#enddef

def json_export(data, location, title):
    # Write the scraped salaries to a local JSON file, then upload it to S3
    file_name = title + "_" + location + ".json"
    s3_key = "{}/{}/{}/{}".format("salaries", title, location, file_name)
    jsonFile = open(file_name, "w")
    jsonFile.write(json.dumps(data, indent=4, separators=(',', ': '), default=obj_dict))
    jsonFile.close()
    s3.Bucket(BUCKET_NAME).put_object(Key=s3_key, Body=open(file_name, 'rb'))
#enddef

def init_driver():
    #driver = webdriver.Chrome(executable_path = "./chromedriver")
    #NOTE: Replace the location where the chrome driver is installed
    driver = webdriver.Chrome('/Users/Raghu/Downloads/chromedriver')
    driver.wait = WebDriverWait(driver, 10)
    return driver
#enddef

def login(driver, username, password):
    driver.get("http://www.glassdoor.com/profile/login_input.htm")
    try:
        user_field = driver.wait.until(EC.presence_of_element_located(
            (By.NAME, "username")))
        pw_field = driver.find_element_by_css_selector("input[type='password']")
        #pw_field = driver.findElement(By.cssSelector("input[type='password']"));
        #pw_field = driver.find_element_by_class_name("signin-password")
        login_button = driver.find_element_by_xpath("//button[@type='submit']")
        #login_button = driver.find_element_by_id("signInBtn")
        user_field.send_keys(username)
        user_field.send_keys(Keys.TAB)
        time.sleep(random.randint(1100,2300)/1000.0)
        pw_field.send_keys(password)
        time.sleep(random.randint(1100,2300)/1000.0)
        login_button.click()
    except TimeoutException:
        print("TimeoutException! Username/password field or login button not found on glassdoor.com")
#enddef

def parse_salaries_HTML(salaries, data):
    # Pull the job title, employer name, and mean base pay out of each salary row
    for salary in salaries:
        jobTitle = "-"
        company = "-"
        meanPay = "-"
        jobTitle = salary.find("div", { "class" : "JobInfoStyle__jobTitle"}).find("a").getText().strip()
        company = salary.find("div", { "class" : "JobInfoStyle__employerName"}).getText().strip()
        try:
            meanPay = salary.find("div", { "class" : "JobInfoStyle__meanBasePay"}).find("span", {"class": "strong"}).getText().strip()
        except Exception as e:
            print(str(e))
            meanPay = 'xxx'
        r = Salary.Salary(jobTitle, company, meanPay)
        data.append(r)
    #endfor
    return data
#enddef

def get_data(driver, URL, startPage, endPage, data, refresh):
    # Recursively scrape pages startPage..endPage of the salary listing at URL
    if (startPage > endPage):
        return data
    #endif
    print "\nPage " + str(startPage) + " of " + str(endPage)
    currentURL = URL + "_IP" + str(startPage) + ".htm"
    time.sleep(random.randint(2100,2400)/1000.0)
    if (refresh):
        driver.get(currentURL)
        print "Getting " + currentURL
    #endif
    time.sleep(random.randint(2100,2300)/1000.0)
    HTML = driver.page_source
    soup = BeautifulSoup(HTML, "html.parser")
    salaries = soup.find("div", { "class" : ["salaryList"] }).find_all("div", { "class" : ["SalaryRowStyle__row"] })
    if (salaries):
        data = parse_salaries_HTML(salaries, data)
        print "Page " + str(startPage) + " scraped."
        if (startPage % 10 == 0):
            print "\nTaking a breather for a few seconds ..."
            time.sleep(10)
        #endif
        get_data(driver, URL, startPage + 1, endPage, data, True)
    else:
        print "Page could not be loaded..ignoring the page"
        time.sleep(3)
        #get_data(driver, URL, startPage, endPage, data, False)
    #endif
    return data
#enddef

if __name__ == "__main__":
    driver = init_driver()
    time.sleep(3)
    print "Logging into Glassdoor account ..."
    login(driver, username, password)
    time.sleep(10)
    print 'logged in..'
    print 'parsing the glassdoor-urls.csv file..'
    csvRows = []
    with open('glassdoor-urls.csv') as csvFile:
        reader = csv.DictReader(csvFile, delimiter=',', quotechar='"')
        for row in reader:
            csvRows.append(GlassdoorCsvRow(row['title'], row['location'], row['url']))

    print "\nStarting data scraping ..."
    for csvRow in csvRows:
        location = csvRow.location
        title = csvRow.title
        url = csvRow.url
        print 'location:{}'.format(location)
        print 'title:{}'.format(title)
        print 'url:{}'.format(url)
        # Strip the trailing ".htm" so get_data can append "_IP<page>.htm" for pagination
        data = get_data(driver, url[:-4], 1, pages, [], True)
        print "\nExporting data to " + title + "_" + location + ".json"
        json_export(data, location, title)

    driver.quit()
35 changes: 24 additions & 11 deletions sandbox/indeed-scraping/README.md
@@ -37,17 +37,30 @@ indeed.com has 2 types of urls - "classic" and "mobile" (which have a "/m/" in t


## Next steps
Run the script with these search queries - please add to this list and we can run this from different machines to parallelize the scraping.
Technical Product Manager
Technical Sales Engineer
Technical Support Engineer
Technical Product Marketing Manager
Software Development Engineer
Backend Software Engineer
Frontend Software Engineer
UX Researcher
Database Administrator
The job titles we have scraped so far are:
Data Scientist
Data Analyst
Software Engineering Manager
UX Engineer
Product Manager
Software Manager
Software Engineer
Database Administrator
UX Designer
Software Test Engineer
Devops
Sales Engineer
Mobile Engineer
Software Developer
Software Architect
QA Engineer
Quality Assurance Engineer
App Developer
Technical Sales
Frontend Engineer
Data Engineer
Hardware Engineer
Technical Program Manager
Software Consultant
Professional Services

If we need to scrape the latest jobs for any of these titles, or for any other job titles, we can run the script again.
