archiveVersion.py
#!/usr/bin/env python
#coding=utf8
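"""Prepare an HTML email newsletter for mailing and web archiving.

Driven by a settings INI file (see the example near the ConfigObj call
below), this script:
  * writes an archive (web) version and a mail version of the original HTML,
  * adds a 1x1 tracking GIF and explicit width/height attributes to images,
  * optionally inlines CSS by POSTing the markup to the Premailer web service,
  * writes a plain-text version (from Premailer, or from lynx), and
  * publishes the archive version and its local assets to a web directory.

Usage: archiveVersion.py <settings.ini>
"""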
import sys
import re
import httplib, urllib, urllib2
from urlparse import urljoin, urlparse, urlunparse, ParseResult
import json
import os
from os.path import splitext
from PIL import Image, ImageDraw
from BeautifulSoup import BeautifulSoup, Tag
import subprocess
import cStringIO
from configobj import ConfigObj
import errno
import shutil
def FileExists(filename):
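    """Return True if filename exists and is readable, False otherwise."""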
    try:
        fp = open(filename)
    except IOError as e:
        if e.errno == errno.ENOENT:  # The file does not exist
            print "Warning: {0} does not exist".format(filename)
            return False
        elif e.errno == errno.EACCES:  # We don't have permission to read it
            print "Warning: Permission denied to access {0}".format(filename)
            return False
        else:
            raise
    else:
        fp.close()
        return True
def AppendFilename(filename, appendix):
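    """Insert appendix between the base name and the extension of filename,
    e.g. AppendFilename('news.html', '-archive') -> 'news-archive.html'.
    """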
    splitFilename = splitext(filename)
    return splitFilename[0] + appendix + splitFilename[1]
def TidyHTML(fileAsString, optDict):
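    """Pipe fileAsString through HTML Tidy with the options in optDict
    and return the cleaned markup, with CDATA wrappers stripped.
    """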
    optDictList = []
    for key, value in optDict.iteritems():
        optDictList.append(key)
        if value != '':
            optDictList.append(value)
    tidy = subprocess.Popen(['tidy'] + optDictList,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    results = tidy.communicate(fileAsString)[0]
    # Remove CDATA tags and return
    # TODO: Improve error handling for when Tidy doesn't work.
    if results != '':
        return results.replace('/*<![CDATA[*/\n', '').replace('/*]]>*/\n', '')
    else:
        sys.exit("There's a problem with tidy.")
def CleanDirPath(path):
""" Return a directory path that is terminated by '/'.
"""
def CleanPath(path):
if (os.path.split(path)[1] != ''):
return path + '/'
else:
return path
if urlparse(path).scheme == '': # This means that the path is local
return CleanPath(path)
else:
parsedURL = urlparse(path)
return urlunparse(ParseResult(
parsedURL.scheme,
parsedURL.netloc,
CleanPath(parsedURL.path),
'', '', ''
))
def PathWalk(basePath, targetPath):
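    """Yield the os.walk() entries for basePath and each directory on the
    way down to targetPath.
    """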
    for dirpath, dirnames, filenames in os.walk(basePath):
        if targetPath.startswith(dirpath):
            yield dirpath, dirnames, filenames
def WebDirPerms(basePath, targetPath):
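    """Make every directory from basePath down to targetPath world-readable
    and world-executable (mode 0755) so the web server can traverse it.
    """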
    for dirpath, dirnames, filenames in PathWalk(basePath, targetPath):
        os.chmod(dirpath, 0755)
# Create a dictionary to hold tidy options:
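# Keys are tidy command-line flags; a non-empty value is passed as that
# flag's argument (see TidyHTML above).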
tidyOptions = {
    '-i': '',
    '--tab-size': '2',
    '--wrap': '0',
    '--tidy-mark': 'n',
    '--merge-divs': 'n',
    '--doctype': 'strict',
    '--char-encoding': 'utf8',
    '--hide-comments': 'n',
    '--drop-proprietary-attributes': 'y',
    '--preserve-entities': 'y'
}
# Create a dictionary for all the filenames
filenames = {}
# Read settings from an INI file given as the first command-line argument
if len(sys.argv) < 2:
    sys.exit("Usage: {0} <settings.ini>".format(sys.argv[0]))
filenames['Settings'] = sys.argv[1]
settings = ConfigObj(filenames['Settings'])
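# For reference, the settings file is expected to provide at least the keys
# read below. A minimal example (all values here are illustrative only):
#
#   www-docs_root = ~/www-docs
#   web_path_root = http://www.example.com/
#   path_to_site_folder = newsletters/
#   original_HTML = newsletter.html
#   Subject = Example newsletter subject
#   publish_files = True
#   use_premailer = True
#   use_lynx_for_text = False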
# Clean paths from INI file
settings['www-docs_root'] = os.path.expanduser(CleanDirPath(settings['www-docs_root']))
settings['web_path_root'] = CleanDirPath(settings['web_path_root'])
settings['path_to_site_folder'] = CleanDirPath(settings['path_to_site_folder'])
filenames['Original'] = settings['original_HTML']
filenames['Archive'] = AppendFilename(filenames['Original'],"-archive")
filenames['Mail'] = AppendFilename(filenames['Original'],"-mail")
filenames['Text'] = splitext(filenames['Original'])[0] + ".txt"
filenames['Tracking GIF'] = splitext(filenames['Original'])[0] + ".gif"
#linkListFilename = splitext(filename)[0] + "-links.txt"
# Create a dictionary for URLs
urls = {}
# Set up the URL for the Archive page and images
if settings.as_bool('publish_files'):
    archivePath = urljoin(settings['web_path_root'], settings['path_to_site_folder'])
    urls['Archive'] = urljoin(archivePath, filenames['Archive'])
    urls['Base'] = archivePath
else:
    urls['Archive'] = "./" + filenames['Archive']
    urls['Base'] = "./"
# Create a dictionary for all the files
files = {}
files['Archive'] = open(filenames['Archive'], 'w')
# Read in the Original file. The mail file is opened later, when it is
# actually written.
with open(filenames['Original'], 'r') as originalFile:
    files['Original'] = originalFile.read()
# Insert the link to the archive version
files['Original'] = files['Original'].replace('*|ARCHIVE|*', urls['Archive'])
files['Original'] = TidyHTML(files['Original'], tidyOptions)
# Create a tracking pixel
img = Image.new('RGBA',(1,1))
draw = ImageDraw.Draw(img)
img.save(filenames['Tracking GIF'], 'GIF', transparency=0)
print "Adding a tracking pixel with name",
print '"' + filenames['Tracking GIF'] + '"...'
# Make the file into a Beautiful Soup
soup = BeautifulSoup(files['Original'])
# Replace the title of the document with the subject
titleTag = Tag(soup, "title")
titleTag.insert(0, settings['Subject'])
soup.title.replaceWith(titleTag)
# Make an image tag for the tracking pixel
trackingTag = Tag(soup, "img")
trackingTag['src'] = filenames['Tracking GIF']
#trackingTag['src'] = urljoin('./', filenames['Tracking GIF'])
# Insert the tag into the soup, right before the closing body tag
soup.body.insert(len(soup.body.contents), trackingTag)
# Find all img tags in the soup and add height and width attributes.
# If the file is local (relative), add the src to a list so that we
# know which files to publish.
filesToPublish = []
print "Adding image dimensions to all <img /> tags..."
for img in soup.findAll('img'):
    imgURL = img['src']
    # urlopen doesn't work with relative URLs, so test whether imgURL is relative
    if urlparse(imgURL).scheme == '':  # The image URL is relative (and local)
        if not FileExists(imgURL):
            continue  # Skip missing local images instead of crashing below
        dimens = Image.open(imgURL).size
        filesToPublish.append(imgURL)
    else:
        imgFile = cStringIO.StringIO(urllib2.urlopen(imgURL).read())
        dimens = Image.open(imgFile).size
    # Set image width and height
    img['width'] = dimens[0]
    img['height'] = dimens[1]
# Find all a tags in the soup. If the link is local, add the href to
# a list so we know which files to publish.
print "Finding other files to publish..."
for a in soup.findAll('a'):
    aURL = a.get('href')
    if aURL and urlparse(aURL).scheme == '':
        if FileExists(aURL):
            filesToPublish.append(aURL)
# Turn the soup back into a string
print 'Reformatting original HTML file...'
files['Original'] = str(soup)
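# Split out the "view this in your browser" block, delimited by what look
# like MailChimp-style conditional merge tags (*|IFNOT:ARCHIVE_PAGE|* ...
# *|END:IF|*): the archive copy drops the block entirely, while the mail
# copy keeps it with every link in it pointed at the archive URL.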
searchRegex = "<!-- \*\|IFNOT:ARCHIVE_PAGE\|\* -->.*<!-- \*\|END:IF\|\* -->"
m = re.search(searchRegex, files['Original'], re.DOTALL)
if m is None:
    sys.exit("Error: couldn't find the *|IFNOT:ARCHIVE_PAGE|* block in the HTML.")
archiveVersion = files['Original'][:m.start(0)] + files['Original'][m.end(0):]
files['Archive'].write(TidyHTML(archiveVersion, tidyOptions))
archiveSoup = BeautifulSoup(m.group(0))
for a in archiveSoup.findAll('a'):
    a['href'] = urls['Archive']
files['Original'] = TidyHTML(files['Original'][:m.start(0)] +
                             str(archiveSoup) +
                             files['Original'][m.end(0):],
                             tidyOptions)
# Close the files so that we can read some of them
for value in files.itervalues():
    if isinstance(value, file):  # If it's a file object, close it
        value.close()
# Create the content to send to Premailer
content = files['Original']
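# Premailer (premailer.dialect.ca) inlines the document's CSS into style
# attributes so the markup survives strict email clients; its JSON response
# links to both an HTML and a plain-text rendering of the document.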
if settings.as_bool('use_premailer'):
    # Premailer will put xmlns="http://www.w3.org/1999/xhtml" into the html tag
    # whether or not it's already there, so remove it from the content we're sending.
    #content = content.replace(' xmlns="http://www.w3.org/1999/xhtml"', '')
    # Now, send the email version to Premailer:
    print "Sending to Premailer..."
    urls['Premailer'] = "http://premailer.dialect.ca/api/0.1/documents"
    parameterDict = {'html': content, 'remove_comments': 'true',
                     'base_url': urls['Base']}
    if not settings.as_bool('publish_files'):
        del parameterDict['base_url']
    params = urllib.urlencode(parameterDict)
    conn = httplib.HTTPConnection("premailer.dialect.ca")
    conn.request("POST", urls['Premailer'], params)
    response = conn.getresponse()
    print "Response from Premailer received with status:",
    print response.status, response.reason
    premailerData = response.read()
    conn.close()
    # The Premailer response is JSON. Convert it to a Python dict.
    premailerDict = json.loads(premailerData)
    urls['Premailer HTML'] = premailerDict['documents']['html']
    urls['Premailer Text'] = premailerDict['documents']['txt']
    files['Premailer HTML'] = urllib2.urlopen(urls['Premailer HTML'])
    files['Premailer Text'] = urllib2.urlopen(urls['Premailer Text'])
    # We have to clean the CDATA tags from what we get from Premailer.
    CDATAmsgCount = 0
    with open(filenames['Mail'], 'w') as f:
        withinStyle = False
        for line in files['Premailer HTML']:
            # Only look for CDATA tags within the <style></style> section
            if line.strip().startswith('<style'):
                withinStyle = True  # We're in the style section
            if '</style>' in line:
                withinStyle = False  # We're not in the style section anymore
            if withinStyle:
                # If neither of these two tags is in the line, write the line.
                # This, in effect, "deletes" the two tags from the copy.
                if ('<![CDATA[' not in line) and (']]>' not in line):
                    f.write(line)
                else:
                    CDATAmsgCount += 1
                    if CDATAmsgCount == 1:
                        print "Removing <![CDATA[...]]>..."
            else:  # If we're not in the style section, write everything.
                f.write(line)
else:
    with open(filenames['Mail'], 'w') as f:
        print "Copying original HTML file to a file for mailing..."
        f.write(content)
# Write plain text versions
if settings.as_bool('use_premailer') and not settings.as_bool('use_lynx_for_text'):
    with open(filenames['Text'], 'w') as f:
        print "Creating plain text version from Premailer..."
        text = files['Premailer Text'].read()
        f.write(text)
else:
    optList = [
        '-display_charset=utf-8',
        '-width=1024',
        '-dump',
        '-stdin'
    ]
    htmlAsString = files['Original']  # Already a plain string at this point
    print "Creating plain text version from lynx..."
    lynx = subprocess.Popen(['lynx'] + optList,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    text = lynx.communicate(htmlAsString)[0]
    with open(filenames['Text'], 'w') as f:
        f.write(text)
### End Premailer section
# Close the files
for value in files.itervalues():
    if isinstance(value, file):  # If it's a file object, close it
        value.close()
filesToPublish.append(filenames['Archive'])
### Publish the files
print "Publishing files..."
sourcePath = os.getcwd()
publishPath = os.path.join(settings['www-docs_root'],
                           settings['path_to_site_folder'])
if not os.path.exists(publishPath):
    os.makedirs(publishPath)
# Walk the directories and make them web-viewable
WebDirPerms(settings['www-docs_root'], publishPath)
# Copy the files to the published directory
for item in filesToPublish:
    dirpath = os.path.dirname(item)
    filename = os.path.basename(item)
    publishDirpath = os.path.join(publishPath, dirpath)
    # Make directories in publishPath
    if not os.path.exists(publishDirpath):
        os.makedirs(publishDirpath)
        WebDirPerms(publishPath, publishDirpath)
    publishFilepath = os.path.join(publishDirpath, filename)
    shutil.copy(item, publishDirpath)
    os.chmod(publishFilepath, 0644)
print "Done."
# Display URL of Archive version (for debugging)
print '\nURL for Archive version:\n{0}\n'.format(urls['Archive'])
# sys.exit(0)