papersDownloader.py
#Muwei Zheng
#
#This program takes a csv file as input and downloads all papers
#listed in it into a given output folder. The file should have a
#header row like 'PaperName, URL, Conference, Year'.
#The Conference entry is the abbreviation used by the DBLP database,
#and the URL entry should also be the url provided by DBLP.
#Each row is additionally expected to have a tenth column that records
#the name of the downloaded pdf file; rows whose tenth column already
#ends in '.pdf' are skipped.
#The script can only recognize and handle the following conferences:
# ACM Conference on Computer and Communications Security,ccs
# AI & Security Workshop at CCS,ccsaisec
# USENIX Security Symposium,uss
# IEEE Symposium on Security and Privacy,sp
# Network and Distributed System Security Symposium,ndss
# APWG Symposium on Electronic Crime Research,ecrime
# Cyber Security Experimentation and Test Workshop at USENIX,usscset
# International Conference on Financial Cryptography and Data Security,fc (2012-2016)
#
#Example: python papersDownloader.py downloadTest.csv papers
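#
#For illustration only, a minimal sketch of what an input file might look
#like. The first four columns follow the header described above; the exact
#contents of columns 5-9 are an assumption (the script does not read them),
#the tenth column is where the script looks for an already-downloaded pdf
#name, and the URL and column names below are placeholders, not real DBLP
#entries.
#
#  PaperName,URL,Conference,Year,,,,,,PdfName
#  "Some Paper Title",http://example.org/paper-page,uss,2016,,,,,,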
import csv, os, subprocess, time, sys
from random import randint
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
#allow csv fields up to the maximum possible size
csv.field_size_limit(sys.maxsize)
#open the file given by the command line argument
fileName = sys.argv[1]
if not fileName.endswith('.csv'):
    print 'A valid input file should be in .csv format.'
    sys.exit()
#extract all paper info from the spreadsheet
paperInfo = []
with open(fileName, 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    reader.next() # remove header
    for row in reader:
        paperInfo.append(row)
#check whether the directory for storing the papers exists
folder = sys.argv[2]
if not folder.isalnum():
    print 'The output folder name should be alphanumeric with at least one character. System aborted.'
    sys.exit()
if folder not in os.listdir('./'):
    os.mkdir(folder)
#set up dictionaries of the FC conference (and its Bitcoin workshop) program webpages, keyed by year:
fc = {'2016':'http://fc16.ifca.ai/program.html',
      '2015':'http://fc15.ifca.ai/schedule.html',
      '2014':'http://fc14.ifca.ai/program.html', #the 2014 page has a different structure; handled separately below
      '2013':'http://fc13.ifca.ai/program.html',
      '2012':'http://fc12.ifca.ai/program.html'}
fcw = {'2016':'http://fc16.ifca.ai/bitcoin/program.html',
       '2015':'http://fc15.ifca.ai/bitcoin/schedule.html',
       '2014':'http://fc14.ifca.ai/bitcoin/program.html'}
#Start downloading papers
#set up chrome driver options:
#1. change the download folder
#2. change the download preference so that chrome downloads pdfs instead of previewing them in a new tab
prefs = {"download.default_directory": os.getcwd()+'/'+folder+'/',
         "plugins.always_open_pdf_externally": True}
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option("prefs", prefs)
#open chrome
driver = webdriver.Chrome(chrome_options=chromeOptions)
paperNum = len(os.listdir('./'+folder))
print 'start downloading papers...'
last_conf = ''
for paper in paperInfo:
    name, url, conf, year, pdfName = paper[0], paper[1], paper[2], paper[3], paper[9]
    #check if the paper has already been processed:
    if pdfName.endswith('.pdf'):
        continue
    elif pdfName != '':
        print 'strange pdf name:', name, ':', pdfName
    elem = ''
    #wait longer between consecutive requests to the same conference site
    if conf == last_conf:
        time.sleep(0.001*randint(20000, 40000))
    else:
        last_conf = conf
        time.sleep(0.001*randint(10000, 20000))
    try:
        #if the url already points at a pdf, download it directly
        if url.endswith('.pdf'):
            driver.get(url)
            continue
        if not conf == 'fc' and not conf == 'fcw': #conferences other than FC and its workshop
            driver.get(url)
            if conf == 'uss' or conf == 'usscset':
                elem = driver.find_element_by_class_name("file")
            elif conf in ['ccs', 'ccsaisec', 'imc']:
                #follow the direct pdf link from the citation_pdf_url meta tag
                page = driver.page_source
                ind = page.find('citation_pdf_url')
                start = page.find('http', ind)
                end = page.find('"', start)
                driver.get(page[start:end])
            elif conf == 'ndss':
                elem = driver.find_element_by_link_text("Download File")
            elif conf == 'sp':
                url = driver.current_url
                routes = url.split('/')
                paperID = routes[4]
                time.sleep(0.001*randint(4000, 6000))
                driver.get('http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber='+str(paperID))
            elif conf != 'fc':
                print 'Cannot handle conference:', conf
        else: #for the FC conference and its Bitcoin workshop
            if year not in fc.keys():
                print 'Need to add the', year, 'Financial Cryptography conference webpage url to the script.'
                print name
                continue
            if conf == 'fc':
                driver.get(fc[year])
            else:
                if year not in fcw.keys():
                    print 'Need to add the', year, 'Financial Cryptography Bitcoin and blockchain workshop webpage url to the script.'
                    print name
                    continue
                driver.get(fcw[year])
            page = driver.page_source
            #locate the paper by the first 10 characters of its title
            #(slicing is safe even when the title is shorter than 10 characters)
            ind = page.find(name[:10])
            if year != '2014':
                start = page.rfind('>', 0, ind)
                end = page.find('<', ind)
                elem = driver.find_element_by_link_text(page[start+1:end])
            else:
                #the 2014 program page links pdfs by relative path
                start = page.find('"', ind)
                end = page.find('"', start+1)
                driver.get('http://fc14.ifca.ai/'+page[start+1:end])
        if elem != '':
            time.sleep(0.001*randint(4000, 6000))
            elem.click()
        #check whether a new file appeared in the output folder
        if len(os.listdir('./'+folder)) == (paperNum+1):
            paperNum += 1
        elif len(os.listdir('./'+folder)) == paperNum:
            print 'Failed to download paper:', name
        else:
            print 'Error!'
    except Exception:
        print name
driver.close()
print ""
print 'finished downloading'