-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
67 lines (52 loc) · 2.02 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import re
# Open a page in a browser session and parse its HTML
def get_driver(link):
    """Load *link* in a Safari WebDriver session and return the parsed page.

    Args:
        link: URL to open in the browser.

    Returns:
        A ``BeautifulSoup`` tree built from the page source using the
        ``html.parser`` backend.
    """
    driver = webdriver.Safari()
    try:
        driver.get(link)
        content = driver.page_source
    finally:
        # Always shut the browser down, even when navigation fails, so an
        # error does not leave an orphaned WebDriver session behind.
        driver.quit()
    return BeautifulSoup(content, "html.parser")
# Find every job card on the page and store its details in a dictionary
def _node_text(node):
    """Return ``node.text``, or ``None`` when the lookup found no element."""
    return None if node is None else node.text


def _parse_salary(text):
    """Split a salary snippet into ``(minimum, maximum)`` strings.

    Args:
        text: Raw text of the salary element, or ``None`` if the job card
            has no salary element.

    Returns:
        A 2-tuple of strings. Both entries are ``"click link to find"`` when
        there is no salary text or it contains no ``$`` amount. The maximum
        is ``"none"`` when only a single amount is present.
    """
    placeholder = "click link to find"
    if text is None:
        return placeholder, placeholder
    # Keep only digits and "$", ".", "," so that every "$" starts one amount.
    # Whatever precedes the first "$" is never an amount, so it is dropped.
    amounts = re.sub(r"[^\d$.,]", "", text).split("$")[1:]
    if not amounts:
        return placeholder, placeholder
    if len(amounts) == 1:
        return amounts[0], "none"
    # Any amounts beyond the second are ignored.
    return amounts[0], amounts[1]


def scraping(page):
    """Extract every job card from *page* and append it to ``job_list``.

    Each ``div`` with class ``job_seen_beacon`` becomes one dictionary with
    the keys ``job_title``, ``salary_min``, ``salary_max``, ``company_name``,
    ``company_location`` and ``link``. A title, company or location element
    that is missing from a card is recorded as ``None`` instead of aborting
    the whole scrape; ``link`` is ``None`` when the title anchor or its
    ``href`` is missing.

    Args:
        page: Parsed page exposing ``find_all`` (e.g. the ``BeautifulSoup``
            tree returned by ``get_driver``).

    Returns:
        None. Results are appended to the module-level ``job_list``, which
        must already exist when this function is called.
    """
    base_url = "https://ca.indeed.com"
    for job in page.find_all("div", class_="job_seen_beacon"):
        # The title anchor supplies both the job title and the relative link,
        # so look it up once.
        link_tag = job.find("a", class_="jcs-JobTitle css-jspxzf eu4oa1w0")
        href = None if link_tag is None else link_tag.get("href")
        min_salary, max_salary = _parse_salary(
            _node_text(
                job.find("div", class_="metadata salary-snippet-container")
            )
        )
        job_list.append(
            {
                "job_title": _node_text(link_tag),
                "salary_min": min_salary,
                "salary_max": max_salary,
                "company_name": _node_text(
                    job.find("span", class_="css-1x7z1ps eu4oa1w0")
                ),
                "company_location": _node_text(
                    job.find("div", class_="css-t4u72d eu4oa1w0")
                ),
                "link": f"{base_url}{href}" if href else None,
            }
        )
# Shared accumulator: scraping() appends one dictionary per job card to it,
# so it has to exist before the first scraping() call below.
job_list = []

# Number of result pages to fetch. Change NUM_PAGES to scrape more or fewer
# pages; the "start" query parameter advances by RESULTS_PER_PAGE per page.
NUM_PAGES = 5
RESULTS_PER_PAGE = 10

for start in range(0, NUM_PAGES * RESULTS_PER_PAGE, RESULTS_PER_PAGE):
    page_html = get_driver(f"https://ca.indeed.com/jobs?q=computer+science&l=Victoria%2C+BC&radius=50&start={start}&vjk=74df27615b32ca3c")
    scraping(page_html)

# Write every collected job to a CSV file, one row per job.
df = pd.DataFrame(job_list)
df.to_csv("jobs_indeed_computer_science.csv")