-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathpdfChecker.py
135 lines (113 loc) · 4.14 KB
/
pdfChecker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#Muwei Zheng
#
#This script takes three arguments:
#1. The name of the .csv file that contains all papers info
#   (only referenced by the disabled matching step further down).
#2. The name of the folder that contains all pdf papers. It should be under the current directory.
#3. The name of the .csv file where we want to save the output.
#
#The script converts every pdf in the given folder into plain text and appends one
#row per pdf, [pdf_name, txt], to the output csv file. Fields are separated by ','
#and quoted with '|'; every '|' inside the text is replaced by a space first.
#
#Example: python pdfChecker.py MLpapers.csv papers papers.csv
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import csv, os, sys
csv.field_size_limit(sys.maxsize)
#helper method to extract content from pdf
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file with pdfminer.

    path: file system path of the PDF to read (opened in binary mode).

    Returns everything the TextConverter wrote to its in-memory buffer
    while the pages were processed, as a single string. The converter is
    created with codec 'utf-8' and default layout parameters.

    Any exception raised while opening or parsing the file propagates to
    the caller; the file handle, the converter and the buffer are closed
    in every case.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() instead of the Python-2-only file() builtin; the with-block
        # closes the PDF even when a page fails to parse.
        with open(path, 'rb') as fp:
            # Empty page-number set and maxpages=0: no page filter is
            # requested (presumably pdfminer reads both as "all pages" --
            # confirm against the pdfminer documentation).
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before it is closed in the finally clause.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
#list all pdf files in the folder given as second argument, oldest modification time first
# pdfsPath: relative path of that folder, always ending in '/'.
# pdfs:     bare file names (no directory part) of the PDFs found there.
pdfsPath = './' + sys.argv[2] + '/'
try:
    # endswith() rather than a substring test, so names such as
    # 'paper.pdf.bak' are not picked up as PDFs.
    pdfs = [fn for fn in os.listdir(pdfsPath) if fn.endswith('.pdf')]
    # Sort on st_mtime explicitly. Comparing whole os.stat() results orders
    # them by their leading fields (st_mode, st_ino, ...), which is not the
    # modification-time order this step is meant to produce. The file name
    # breaks ties so the order is deterministic.
    pdfs.sort(key=lambda fn: (os.stat(os.path.join(pdfsPath, fn)).st_mtime, fn))
except OSError:
    # Only file system errors (missing or unreadable folder) are reported
    # here; anything else, including Ctrl-C, propagates normally.
    print("Can't not locate folder %s" % sys.argv[2])
    sys.exit()
# NOTE: everything between the two ''' markers below is a bare string literal,
# so none of it is executed. It holds a disabled step that read the csv file
# named by sys.argv[1], added a 'PDF_file_name' column pairing each entry with
# a name from pdfs (by position), and wrote the result back to the same file.
# It is written for Python 2 (print statement, reader.next(), 'rb'/'wb' modes).
'''
#Match the pdf file names back to the entries in csv file
fileName = sys.argv[1]
if not fileName.endswith('csv'):
print 'A valid file should be in .csv format.'
sys.exit()
entries = []
with open(fileName, 'rb') as csvfile:
reader = csv.reader(csvfile, delimiter=',', quotechar='"')
header = reader.next()
for row in reader:
entries.append(row)
#check if the file is already exist. If it is, append new data to the old one
exist = False
new_add = []
if header[-1] == 'PDF_file_name':
exist = True
new_add = [entries.index(x) for x in entries if x[-1] == '']
new_add.sort()
print 'new_add:',len(new_add)
else:
header.append('PDF_file_name')
new_add = range(len(entries))
out = [header]
#the number of papers in the folder should equal to the number of entries in the csv file
if len(pdfs) != len(new_add):
print 'papers number not match. Abort program.'
print 'len(entries):',len(new_add),'\t','len(pdfs):', len(pdfs)
sys.exit()
for i in range(len(pdfs)):
entries[new_add[i]] += [pdfs[i],'new']
out.append(entries[new_add[i]])
out = out + [x for x in entries if x[-1] != 'new']
#write back to the csv file
with open(fileName,'wb') as csvfile:
writer = csv.writer(csvfile,delimiter=',')
writer.writerows(out)
'''
#convert all pdfs into plain text and append one [pdf_name, text] row per pdf to the output csv file
# The output file name (third argument) is checked before any conversion
# starts, so a bad argument is reported immediately instead of after every
# PDF has already been processed.
fileName = sys.argv[3]
if not fileName.endswith('.csv'):
    print('A valid file should be in .csv format.')
    sys.exit()

# out collects the rows to write: [pdf file name, cleaned text].
out = []
for pdf in pdfs:
    print(pdf)
    content = convert_pdf_to_txt(os.path.join(pdfsPath, pdf))
    # Blank out null characters, and every '|' so that '|' can serve as the
    # quote character of the csv file without ever occurring inside a field.
    page = content.replace('\x00', ' ').replace('|', ' ')
    # Each PDF is converted exactly once. An earlier retry (up to three
    # attempts when the text contained no 'references') had been disabled,
    # leaving a loop that always stopped after its first pass.
    out.append([pdf, page])
    print(' '.join([pdf, 'done']))

#write to a csv file
# Mode 'a' appends, so rows from earlier runs are kept.
with open(fileName, 'a') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|')
    writer.writerows(out)