-
Notifications
You must be signed in to change notification settings - Fork 0
/
pagefromfile.py
366 lines (333 loc) · 13.4 KB
/
pagefromfile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
#!/usr/bin/python
#coding: utf-8
"""
This bot takes its input from a file that contains a number of
pages to be put on the wiki. The pages should all have the same
begin and end text (which may not overlap).
By default the text should have the intended title of the page
as the first text in bold (that is, between ''' and '''),
you can modify this behavior with command line options.
The default is not to include the begin and
end text in the page, if you want to include that text, use
the -include option.
Specific arguments:
-start:xxx Specify the text that marks the beginning of a page
-end:xxx Specify the text that marks the end of a page
-file:xxx Give the filename we are getting our material from
-include The beginning and end markers should be included
in the page.
-titlestart:xxx Use xxx in place of ''' for identifying the
beginning of page title
-titleend:xxx Use xxx in place of ''' for identifying the
end of page title
-notitle do not include the title, including titlestart, and
titleend, in the page
-nocontent:xxx If the existing page contains xxx, nothing is appended to it
(example: -nocontent:"{{infobox")
-summary:xxx Use xxx as the edit summary for the upload - if
a page exists, standard messages are appended
after xxx for appending, prepending, or replacement
-autosummary Use MediaWiki's autosummary when creating a new page,
overrides -summary in this case
-minor set minor edit flag on page edits
If the page to be uploaded already exists:
-safe do nothing (default)
-appendtop add the text to the top of it
-appendbottom add the text to the bottom of it
-force overwrite the existing page
"""
#
# (C) Andre Engels, 2004
# (C) Pywikipedia bot team, 2005-2010
#
# Distributed under the terms of the MIT license.
#
__version__='$Id$'
#
import re, codecs
import wikipedia as pywikibot
import config
class NoTitle(Exception):
    """Raised when a page entry contains no title.

    Attributes:
        offset: the value given to the constructor. PageFromFileReader
            passes the end position of the title-less entry, and adds it
            to its read position to skip that entry.
    """

    def __init__(self, offset):
        # Forward the offset to Exception so that args, str() and repr()
        # carry it; otherwise an uncaught NoTitle prints an empty message.
        Exception.__init__(self, offset)
        self.offset = offset
class PageFromFileRobot:
    """
    Responsible for writing pages to the wiki, with the titles and contents
    given by a PageFromFileReader.
    """

    # Default edit summary, used when no -summary text was supplied.
    msg = {
        'ar': u'استيراد تلقائي للمقالات',
        'de': u'Automatischer Import von Artikeln',
        'en': u'Automated import of articles',
        'fa': u'درونریزی خودکار مقالهها',
        'fr': u'Import automatique',
        'he': u'ייבוא ערכים אוטומטי',
        'ia': u'Importation automatic de articulos',
        'id': u'Impor artikel automatis',
        'it': u'Caricamento automatico',
        'ja': u'記事の自動取り込み',
        'ksh': u'Bot: automatesch huhjelaade',
        'mzn': u'ربوت:صفحه شه خاد به خاد دله دکته',
        'nl': u'Geautomatiseerde import',
        'no': u'bot: Automatisk import',
        'pl': u'Automatyczny import artykułów',
        'pt': u'Importação automática de artigos',
        'uk': u'Автоматичний імпорт статей',
        'zh': u'機器人: 自動匯入頁面',
    }

    # The following messages are appended to the edit summary when the page
    # already exists.
    msg_top = {
        'ar': u'كتابة على الأعلى',
        'de': u'ergänze am Anfang',
        'en': u'append on top',
        'fa': u'به بالا اضافه شد',
        'he': u'הוספה בראש הדף',
        'fr': u'rajouté en haut',
        'id': u'ditambahkan di atas',
        'it': u'aggiungo in cima',
        'ja': u'冒頭への追加',
        'ksh': u'un dofüürjesaz',
        'nl': u'bovenaan toegevoegd',
        'no': u'legger til øverst',
        'pl': u'dodaj na górze',
        'pt': u'adicionado no topo',
        'uk': u'додано зверху',
        'zh': u'機器人: 增加至最上層',
    }

    msg_bottom = {
        'ar': u'كتابة على الأسفل',
        'de': u'ergänze am Ende',
        'en': u'append on bottom',
        'fa': u'به پایین اضافه شد',
        'he': u'הוספה בתחתית הדף',
        'fr': u'rajouté en bas',
        'id': u'ditambahkan di bawah',
        'it': u'aggiungo in fondo',
        'ja': u'末尾への追加',
        'ksh': u'un aanjehange',
        'nl': u'onderaan toegevoegd',
        'no': u'legger til nederst',
        'pl': u'dodaj na dole',
        'pt': u'adicionando no fim',
        'uk': u'додано знизу',
        'zh': u'機器人: 增加至最底層',
    }

    msg_force = {
        'ar': u'تمت الكتابة على النص الموجود',
        'de': u'bestehender Text überschrieben',
        'en': u'existing text overwritten',
        'fa': u'متن جایگزین شد',
        'he': u'הטקסט הישן נמחק',
        'fr': u'texte existant écrasé',
        'id': u'menimpa teks yang ada',
        'it': u'sovrascritto il testo esistente',
        'ja': u'存在するテキストの上書き',
        'ksh': u'un komplët ußjetuusch',
        'nl': u'bestaande tekst overschreven',
        'no': u'erstatter eksisterende tekst',
        'pl': u'aktualny tekst nadpisany',
        'pt': u'sobrescrever texto',
        'uk': u'існуючий текст перезаписано',
        'zh': u'機器人: 覆寫已存在的文字',
    }

    def __init__(self, reader, force, append, summary, minor, autosummary,
                 dry, nocontents):
        """Store the reader and the upload options.

        reader      -- object whose run() yields (title, contents) pairs
        force       -- overwrite pages that already exist
        append      -- "Top", "Bottom" or None; where to add text on an
                       existing page
        summary     -- edit summary text, or a false value to use msg
        minor       -- passed to page.put() as minorEdit
        autosummary -- use an empty summary when creating a new page
        dry         -- only print what would be saved, do not save
        nocontents  -- marker text; an existing page containing it is not
                       appended to
        """
        self.reader = reader
        self.force = force
        self.append = append
        self.summary = summary
        self.minor = minor
        self.autosummary = autosummary
        self.dry = dry
        self.nocontents = nocontents

    def run(self):
        """Upload every (title, contents) pair produced by the reader."""
        for title, contents in self.reader.run():
            self.put(title, contents)

    @staticmethod
    def _has_marker(text, marker):
        """Return True if marker, or its lower-cased form, occurs in text.

        An empty marker never matches: str.find('') returns 0, so without
        this guard the default (empty) marker would match every page.
        """
        if not marker:
            return False
        return marker in text or marker.lower() in text

    def put(self, title, contents):
        """Save contents to the page called title, honouring the options.

        If the page exists, the text is added on top or at the bottom, the
        page is overwritten, or the page is left alone, depending on
        self.append and self.force. In dry mode the result is only printed.
        """
        mysite = pywikibot.getSite()
        page = pywikibot.Page(mysite, title)
        # Show the title of the page we're working on.
        # Highlight the title in purple.
        pywikibot.output(u">>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        if self.summary:
            comment = self.summary
        else:
            comment = pywikibot.translate(mysite, self.msg)

        comment_top = comment + " - " + pywikibot.translate(mysite,
                                                            self.msg_top)
        comment_bottom = comment + " - " + pywikibot.translate(mysite,
                                                               self.msg_bottom)
        comment_force = comment + " *** " + pywikibot.translate(mysite,
                                                                self.msg_force) + " ***"

        # Remove leading newlines (cause troubles when creating redirects)
        contents = contents.lstrip('\r\n')

        if page.exists():
            if self.append in ("Top", "Bottom"):
                # Fetch the current text once; it is needed both for the
                # marker check and for building the combined text.
                existing = page.get()
                if self._has_marker(existing, self.nocontents):
                    pywikibot.output(u'Page had %s so it is skipped'
                                     % (self.nocontents))
                    return
                if self.append == "Top":
                    pywikibot.output(
                        u"Page %s already exists, appending on top!" % title)
                    contents = contents + existing
                    comment = comment_top
                else:
                    pywikibot.output(
                        u"Page %s already exists, appending on bottom!"
                        % title)
                    contents = existing + contents
                    comment = comment_bottom
            elif self.force:
                pywikibot.output(u"Page %s already exists, ***overwriting!"
                                 % title)
                comment = comment_force
            else:
                pywikibot.output(u"Page %s already exists, not adding!"
                                 % title)
                return
        else:
            if self.autosummary:
                comment = ''
                pywikibot.setAction('')

        if self.dry:
            pywikibot.output(
                "*** Dry mode ***\n"
                "\03{lightpurple}title\03{default}: " + title + "\n"
                "\03{lightpurple}contents\03{default}:\n" + contents + "\n"
                "\03{lightpurple}comment\03{default}: " + comment + "\n")
            return

        try:
            page.put(contents, comment=comment, minorEdit=self.minor)
        except pywikibot.LockedPage:
            pywikibot.output(u"Page %s is locked; skipping." % title)
        except pywikibot.EditConflict:
            pywikibot.output(u'Skipping %s because of edit conflict' % title)
        except pywikibot.SpamfilterError as error:
            pywikibot.output(
                u'Cannot change %s because of spam blacklist entry %s'
                % (title, error.url))
class PageFromFileReader:
    """
    Responsible for reading the file.

    The run() method yields a (title, contents) tuple for each found page.
    """

    def __init__(self, filename, pageStartMarker, pageEndMarker,
                 titleStartMarker, titleEndMarker, include, notitle):
        """Store the file name and the marker settings.

        The four markers are inserted unescaped into regular expressions
        by findpage(). include keeps the page markers in the contents;
        notitle removes the first title occurrence from the contents.
        """
        self.filename = filename
        self.pageStartMarker = pageStartMarker
        self.pageEndMarker = pageEndMarker
        self.titleStartMarker = titleStartMarker
        self.titleEndMarker = titleEndMarker
        self.include = include
        self.notitle = notitle

    def run(self):
        """Read the whole file and yield (title, contents) for each page.

        Entries without a title are reported and skipped. If the file
        cannot be opened the error is printed and nothing is yielded.
        """
        pywikibot.output('Reading \'%s\'...' % self.filename)
        try:
            f = codecs.open(self.filename, 'r',
                            encoding=config.textfile_encoding)
        except IOError as err:
            print(err)
            return
        # Close the handle as soon as the text is in memory.
        try:
            text = f.read()
        finally:
            f.close()

        position = 0
        while True:
            try:
                length, title, contents = self.findpage(text[position:])
            except AttributeError:
                # findpage() signals "no further entry" with AttributeError.
                # position is still 0 only if no entry was ever matched.
                if not position:
                    pywikibot.output(u'\nStart or end marker not found.')
                else:
                    pywikibot.output(u'End of file.')
                break
            except NoTitle as err:
                pywikibot.output(u'\nNo title found - skipping a page.')
                position += err.offset
                continue

            position += length
            yield title, contents

    def findpage(self, text):
        """Find the first page entry in text.

        Returns a (end, title, contents) tuple, where end is the index in
        text just past the entry's end marker.

        Raises AttributeError if no start/end marker pair is found, and
        NoTitle (carrying the entry's end index) if the entry has no title.
        """
        pageR = re.compile(self.pageStartMarker + "(.*?)" + self.pageEndMarker,
                           re.DOTALL)
        titleR = re.compile(self.titleStartMarker + "(.*?)"
                            + self.titleEndMarker)

        location = pageR.search(text)
        if location is None:
            # run() relies on AttributeError to detect the end of input.
            raise AttributeError('page start or end marker not found')

        if self.include:
            contents = location.group()
        else:
            contents = location.group(1)

        title_match = titleR.search(contents)
        if title_match is None:
            raise NoTitle(location.end())
        title = title_match.group(1)
        if self.notitle:
            # Remove title (to allow creation of redirects)
            contents = titleR.sub('', contents, count=1)
        return location.end(), title, contents
def main():
    """Parse the command line, then build the reader and the bot and run it.

    Arguments come from pywikibot.handleArgs(); unknown ones are reported
    and ignored. pywikibot.simulate is passed to the bot as its dry flag.
    """
    # Adapt these to the file you are using. 'pageStartMarker' and
    # 'pageEndMarker' are the beginning and end of each entry. Take text that
    # should be included and does not occur elsewhere in the text.

    # TODO: make config variables for these.
    filename = "dict.txt"
    pageStartMarker = "{{-start-}}"
    pageEndMarker = "{{-stop-}}"
    titleStartMarker = u"'''"
    titleEndMarker = u"'''"
    nocontents = u""
    include = False
    force = False
    append = None
    notitle = False
    summary = None
    minor = False
    autosummary = False

    for arg in pywikibot.handleArgs():
        if arg.startswith("-start:"):
            pageStartMarker = arg[7:]
        elif arg.startswith("-end:"):
            pageEndMarker = arg[5:]
        elif arg.startswith("-file:"):
            filename = arg[6:]
        elif arg == "-include":
            include = True
        elif arg == "-appendtop":
            append = "Top"
        elif arg == "-appendbottom":
            append = "Bottom"
        elif arg == "-force":
            force = True
        elif arg == "-safe":
            force = False
            append = None
        elif arg == '-notitle':
            notitle = True
        elif arg == '-minor':
            minor = True
        elif arg.startswith('-nocontent:'):
            nocontents = arg[11:]
        elif arg.startswith('-nocontents:'):
            # Plural spelling, as used in the module docstring's example.
            nocontents = arg[12:]
        elif arg.startswith("-titlestart:"):
            titleStartMarker = arg[12:]
        elif arg.startswith("-titleend:"):
            titleEndMarker = arg[10:]
        elif arg.startswith("-summary:"):
            summary = arg[9:]
        elif arg == '-autosummary':
            autosummary = True
        else:
            pywikibot.output(u"Disregarding unknown argument %s." % arg)

    reader = PageFromFileReader(filename, pageStartMarker, pageEndMarker,
                                titleStartMarker, titleEndMarker, include,
                                notitle)
    bot = PageFromFileRobot(reader, force, append, summary, minor, autosummary,
                            pywikibot.simulate, nocontents)
    bot.run()
if __name__ == "__main__":
    # Script entry point: pywikibot.stopme() is called whether main()
    # returns normally or raises.
    try:
        main()
    finally:
        pywikibot.stopme()