init_scrape.py
import sys
import os
import os.path
import json
import sqlite3
import csv

import requests
from bs4 import BeautifulSoup

from settings import *

# The session is global so every request reuses the same logged-in session.
session = requests.Session()

# Initialise the database, dropping any previous copy.
if os.path.exists(DB):
    os.remove(DB)
conn = sqlite3.connect(DB)
conn.text_factory = str
c = conn.cursor()
# c.execute('''CREATE TABLE listings
#              (name text, id text, url text, apply text, zipcode text, wages text, description text, education text)''')
c.execute('''CREATE TABLE listings
             (name text, url text, location text, exp text, edu text, employment text, temp text, hours text, company text)''')


def scrape():
    # Walks the paginated job listings and scrapes each job's url, name, and
    # id number. The site returns 40 pages of results at 250 listings per
    # page (there should be more, but results are capped); the loop below is
    # limited to 2 pages for now.
    for n in range(1, 3):  # should be range(1, 41) for the full scrape
        # Swap the page= number into the listing URL from settings.
        page = PAGE_URL[:87] + str(n) + PAGE_URL[88:]
        r = session.get(page)
        soup = BeautifulSoup(r.content, "html.parser")
        listings = soup.find_all("dt")  # Each listing sits inside a dt tag
        for l in listings:
            # The a tag holds both the job name and its url.
            urls = l.find_all('a')
            for u in urls:
                # The href attribute of the tag holds the url.
                job_url = u['href']
                name = u.string  # The name is the string content of the a tag
                # The id number appears in parentheses inside the name.
                id_num = u.string[u.string.find('(') + 1:u.string.find(')')]
                # Step through to the job's own page.
                job_page = session.get(BASE_URL + job_url)
                job_soup = BeautifulSoup(job_page.content, "html.parser")
                # experience
                tags = job_soup.find_all(
                    'div', 'row attr-job-months_of_experience')
                exp = ""
                for t in tags:
                    children = t.contents
                    exp = children[1].contents[0]
                # education
                tags = job_soup.find_all(
                    'div', 'row attr-job-required_education_level_id')
                edu = ""
                for t in tags:
                    children = t.contents
                    edu = children[1].contents[0]
                # employment type
                tags = job_soup.find_all('div', 'row attr-job-employment_type')
                job_type = ""
                for t in tags:
                    children = t.contents
                    job_type = children[1].contents[0]
                # perm/temp
                tags = job_soup.find_all('div', 'row attr-job-position_type')
                pos_type = ""
                for t in tags:
                    children = t.contents
                    pos_type = children[1].contents[0]
                # hours
                tags = job_soup.find_all('div', 'row attr-job-average_hours')
                hours = ""
                for t in tags:
                    children = t.contents
                    hours = children[1].contents[0]
                # company
                tags = job_soup.find_all(
                    'div', 'row attr-job-company_name')
                comp = ""
                for t in tags:
                    children = t.contents
                    comp = children[1].contents[0]
                # # creds
                # tags = job_soup.find_all(
                #     'div', 'row attr-job-credential_description')
                # creds = ""
                # for t in tags:
                #     children = t.contents
                #     parts = children[1].contents
                #     print(len(parts))
                # physical address
                tags = job_soup.find_all(
                    'div', class_='row attr-job-physical_address')
                location = ""
                for t in tags:
                    children = t.contents
                    address_parts = children[1]
                    parts = address_parts.contents
                    # import pdb
                    # pdb.set_trace()
                    # Keep only the text pieces, dropping the <br> separators.
                    parts = filter(
                        lambda s: 'br>' not in str(s) and '<br' not in str(s),
                        parts)
                    # print(parts)
                    for part in parts:
                        location += str(part) + ", "
                # print(location)
                # TODO: scrape the description, zipcode, wages, education, etc.
                # and add them to the DB, using the blocks above as a model
                # (and what we did in the scraping workshop); see the sketch
                # just below.
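                # A sketch of scraping one more field (wages), following the
                # same pattern as the blocks above. The class name
                # 'row attr-job-salary' is a placeholder assumption: check the
                # job page markup and substitute the real class before using it.
                # tags = job_soup.find_all('div', 'row attr-job-salary')
                # wages = ""
                # for t in tags:
                #     children = t.contents
                #     wages = children[1].contents[0]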
                # Insert the scraped fields for this listing into the database.
                # c.execute(
                #     "INSERT INTO listings VALUES (?, ?, ?, 'TODO', 'TODO', 'TODO', 'TODO', 'TODO');", (name, id_num, job_url))
                c.execute(
                    "INSERT INTO listings VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);",
                    (name, BASE_URL + job_url, location, exp, edu, job_type,
                     pos_type, hours, comp))
    conn.commit()


def login():
    # Fetch the login page to pull the form's action URL, which can change
    # per session.
    soup = BeautifulSoup(session.get(LOGIN_URL).content, "html.parser")
    login_action = soup.find_all('form')[0]['action']
    login_data = dict(v_username=USER_NAME,
                      v_password=PASSWORD,
                      FormName='Form0',
                      fromlogin=1,
                      button='Log in')
    # Post the credentials so the shared session is logged in.
    r = session.post(BASE_URL + login_action, data=login_data)
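    # A post-login sanity check added as a sketch: this assumes a failed
    # request surfaces as a non-2xx status. A site-specific check (e.g.
    # looking for a "Log out" link in r.text) would be more reliable, but
    # depends on markup not shown here.
    r.raise_for_status()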


if __name__ == '__main__':
    login()
    scrape()
    # csvWriter = csv.writer(open("listings.csv", "w"))  # , delimiter='\t')
    # for row in c.execute('SELECT * FROM listings'):
    #     csvWriter.writerow([s.decode('utf-8', 'ignore') for s in row])
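    # A small working sketch of the CSV export the commented block above was
    # aiming at, assuming Python 3 where sqlite3 returns rows as str (so no
    # decode step is needed). The output filename "listings.csv" is carried
    # over from the commented code.
    with open("listings.csv", "w", newline="") as f:
        csv_writer = csv.writer(f)
        # Header row taken from the listings table schema above.
        csv_writer.writerow(["name", "url", "location", "exp", "edu",
                             "employment", "temp", "hours", "company"])
        for row in c.execute('SELECT * FROM listings'):
            csv_writer.writerow(row)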
    c.close()
    conn.close()