# pdfChecker2.py
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import csv, os
#helper method to extract content from pdf
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* with pdfminer.

    Every page yielded by ``PDFPage.get_pages`` is fed through a
    ``TextConverter`` that writes into an in-memory buffer.

    Args:
        path: filesystem path of the PDF file to read.

    Returns:
        The buffer contents after all pages have been processed.

    Raises:
        Whatever ``open`` or pdfminer raise; the file handle, the converter
        device and the buffer are released before the error propagates.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python-2-only file() builtin; the with-block
        # closes the handle even when a page fails to parse.
        with open(path, 'rb') as fp:
            # Empty page set, maxpages=0 and an empty password are the same
            # values the original call used (presumably "all pages, no
            # password" -- confirm against the pdfminer documentation).
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the device and buffer are closed below.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
# Collect every PDF in the 'papers' folder, ordered by modification time.
pdfsPath = './papers/'
pdfs = [fn for fn in os.listdir(pdfsPath) if fn.endswith('.pdf')]
# Sort on the modification time explicitly. Sorting whole os.stat() results
# compares st_mode and st_ino first, which does not give mtime order.
# The file name breaks ties so the order is deterministic.
pdfs.sort(key=lambda fn: (os.path.getmtime(os.path.join(pdfsPath, fn)), fn))

# Keywords to look for in each paper (only used by the disabled code below).
keys = ['dataset', 'Alexa', 'families', 'exploits', 'vulnerabilities', 'figures', 'survey']

# Per-paper results keyed by paper number; a dictionary is used because some
# paper numbers may not exist.
results = {}

# Raw text of every successfully converted paper, one entry per paper.
out = []

for pdf in pdfs:
    # The text before the first '.' of the file name must be the paper's
    # number (e.g. "12.pdf"); int() raises ValueError otherwise.
    ind = int(pdf[:pdf.index('.')])
    results[ind] = ['No', '\\']

    # Try up to three times in case the PDF is not converted completely.
    for t in range(3):
        content = convert_pdf_to_txt(os.path.join(pdfsPath, pdf))
        # Replace all the null characters with spaces.
        page = content.replace('\x00', ' ')
        # A conversion counts as complete when the word "references"
        # appears somewhere in the extracted text.
        check = page.lower().find('references')
        if check != -1:
            out.append("|||" + str(ind) + "|||" + page)
            break
        if t == 2:
            # Last attempt failed as well: report it and move on.
            print('Error! ' + pdf + ' :' + str(len(page)))
    print(pdf + ' done')

# ---------------------------------------------------------------------------
# Disabled keyword-evaluation / CSV-export code, kept for reference.
# NOTE(review): it indexes `results` with `i`, while the loop above names the
# paper number `ind`; reconcile the two before re-enabling.
#
# for key in keys:
#     if key in page:
#         if results[i][0] == 'No':
#             results[i][0] = 'Yes'
#             results[i][1] = ''
#         results[i].append(key)
#         index = page.index(key)
#         results[i][1] = results[i][1] + page[index-100:index+100]+'\n'
# results[i][1]= '"'+results[i][1]+'"'
# #append output to appropriate file
# out = []
# with open('sample.csv','rb') as csvfile:
#     reader = csv.reader(csvfile, delimiter=',', quotechar='"')
#     for row in reader:
#         out.append(row)
# for i in range(len(out)):
#     if i in results:
#         out[i] = out[i]+results[i]
# with open('sampleOut.csv','wb') as csvfile:
#     writer = csv.writer(csvfile,delimiter=',')
#     writer.writerows(out)
# ---------------------------------------------------------------------------

# Dump the collected raw text; the with-block closes the file even if the
# write fails.
with open('rawPapers.txt', 'w') as f:
    f.writelines(out)