forked from mortazazakeri/iust_deep_fuzz
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf_stream_extractor_1.py
124 lines (98 loc) · 4.71 KB
/
pdf_stream_extractor_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 8 15:13:22 2018
@author: Morteza
"""
import sys
import os
import subprocess
import re
def get_xref(pdf_file_path, mutool_path='D:\\afl\\mupdf-1.11-windows\\mutool.exe',
             mutool_command=' show -e ', mutool_object_number=' x'):
    """Return the two integers that follow the ``xref`` header in mutool's output.

    Runs ``<mutool_path> <mutool_command> <pdf_file_path>
    <mutool_object_number>`` and parses its standard output, which is
    expected to start with an ``xref`` line followed by two
    whitespace-separated integers.
    NOTE(review): that layout is what the default ``show -e <file> x``
    invocation is assumed to print -- confirm against the mutool build in use.

    Args:
        pdf_file_path: Path of the PDF file to inspect; passed to mutool as
            a single argument, so it may contain spaces.
        mutool_path: Path of the mutool executable.
        mutool_command: Whitespace-separated mutool sub-command and flags.
        mutool_object_number: Whitespace-separated object selectors appended
            after the file path (``x`` by default).

    Returns:
        A ``(start_index, end_index)`` tuple of ints, in the order the two
        numbers appear in the output.

    Raises:
        subprocess.CalledProcessError: If mutool exits with a non-zero status.
        OSError: If the mutool executable cannot be started.
        ValueError: If the output does not contain the expected header.
    """
    # Build an argument list and skip the shell, so that paths containing
    # spaces or shell metacharacters reach mutool unchanged.
    cmd = ([mutool_path] + mutool_command.split() + [pdf_file_path]
           + mutool_object_number.split())
    output = subprocess.check_output(cmd).decode()

    # Match on the header text instead of a fixed character offset: the
    # offset depends on whether the tool emits LF or CRLF line endings.
    match = re.match(r'\s*xref\s+(\d+)\s+(\d+)', output)
    if match is None:
        raise ValueError('unexpected xref output for ' + pdf_file_path)
    return int(match.group(1)), int(match.group(2))
def get_pdf_objects(pdf_file_path, mutool_path='D:\\afl\\mupdf-1.11-windows\\mutool.exe',
                    mutool_command=' show -e ', mutool_object_number=' x'):
    """Return mutool's textual dump of PDF objects with stream payloads removed.

    Runs ``<mutool_path> <mutool_command> <pdf_file_path>
    <mutool_object_number>`` and post-processes its standard output:

    * every span from a ``stream`` keyword through the next ``endstream``
      is collapsed to the bare word ``stream`` (the payload is dropped);
    * every object header of the form ``<digits> <digits> obj`` is replaced
      by ``obj``, so the result no longer carries object numbers.

    Args:
        pdf_file_path: Path of the PDF file to dump; passed to mutool as a
            single argument, so it may contain spaces.
        mutool_path: Path of the mutool executable.
        mutool_command: Whitespace-separated mutool sub-command and flags.
        mutool_object_number: Whitespace-separated object selectors appended
            after the file path.

    Returns:
        The processed dump as a ``str`` (line endings are left as emitted
        by mutool).

    Raises:
        subprocess.CalledProcessError: If mutool exits with a non-zero status.
        OSError: If the mutool executable cannot be started.
        UnicodeDecodeError: If the output is not valid UTF-8.
    """
    # Build an argument list and skip the shell, so that paths containing
    # spaces or shell metacharacters reach mutool unchanged.
    cmd = ([mutool_path] + mutool_command.split() + [pdf_file_path]
           + mutool_object_number.split())
    dump = subprocess.check_output(cmd).decode()

    # Non-greedy so each stream is matched up to its own `endstream`;
    # DOTALL because payloads span multiple lines.
    without_payloads = re.sub(r'stream.*?endstream', 'stream', dump,
                              flags=re.DOTALL)

    # Strip the "<number> <generation>" prefix from object headers.
    return re.sub(r'\d+ \d+ obj', 'obj', without_payloads)
def main(argv):
    """Dump the objects of every numbered corpus PDF into a ``.pdfobjects`` file.

    For each file id from 1 to 340 the function asks ``get_xref`` for the
    pair of indices of ``pdftc_mozilla_<id>.pdf``, requests every object
    number strictly between them from ``get_pdf_objects`` and writes the
    returned text to
    ``pdf_objects\\pdftc_mozilla_<id>_<end-1>_obj.pdfobjects`` under the
    corpus directory. A failure on one file is reported on stderr and the
    loop continues with the next file. A running total is printed at the end.

    Args:
        argv: Command-line arguments; accepted for the script entry point
            but not used.
    """
    corpus_dir = 'C:\\Users\\Morteza\\Desktop\\corpus_garbage\\mozilla'
    min_pdf_file_id = 1
    max_pdf_file_id = 341  # exclusive upper bound of the id range
    total_extracted_object = 0

    for pdf_file_id in range(min_pdf_file_id, max_pdf_file_id):
        file_tag = str(pdf_file_id).zfill(4)
        pdf_file_path = corpus_dir + '\\pdftc_mozilla_' + file_tag + '.pdf'
        try:
            start_index_integer, end_index_integer = get_xref(pdf_file_path)

            # Space-separated list of the object numbers strictly between
            # the two indices, with a leading and trailing space.
            mutool_object_number = ' ' + ''.join(
                str(index) + ' '
                for index in range(start_index_integer + 1, end_index_integer))
            object_seq = get_pdf_objects(
                pdf_file_path, mutool_object_number=mutool_object_number)

            object_file_path = (corpus_dir
                                + '\\pdf_objects\\pdftc_mozilla_'
                                + file_tag + '_'
                                + str(end_index_integer - 1)
                                + '_obj.pdfobjects')
            # newline='' writes the text exactly as received: the dump may
            # already contain CRLF, and default newline translation on
            # Windows would turn that into CR CR LF. UTF-8 mirrors the
            # decoding applied to the tool output.
            with open(object_file_path, 'w', encoding='utf-8',
                      newline='') as new_file:
                new_file.write(object_seq)

            total_extracted_object += (end_index_integer - 1)
            print('Extracting successfull: ', file_tag)
        except Exception as e:
            # Best-effort batch run: report the failing file and move on.
            print('Extracting error: ', file_tag, file=sys.stderr)
            print(str(e), file=sys.stderr)

    print('total_extracted_object: ', total_extracted_object)
# Start the batch extraction only when this file is executed as a script;
# importing the module must not trigger it.
if __name__ == "__main__":
    main(sys.argv)