-
Notifications
You must be signed in to change notification settings - Fork 0
/
makecat.py
314 lines (300 loc) · 10.7 KB
/
makecat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# -*- coding: UTF-8 -*-
"""
This bot takes as its argument (or, if no argument is given, asks for it), the
name of a new or existing category. It will then try to find new articles for
this category (pages linked to and from pages already in the category), asking
the user which pages to include and which not.
Arguments:
-nodates automatically skip all pages that are years or dates (years
only work AD, dates only for certain languages)
-forward only check pages linked from pages already in the category,
not pages linking to them. Is less precise but quite a bit
faster.
-exist only ask about pages that do actually exist; drop any
titles of non-existing pages silently. If -forward is chosen,
-exist is automatically implied.
-keepparent do not remove parent categories of the category to be
worked on.
-all work on all pages (default: only main namespace)
When running the bot, you will be shown a number of pages one by one. For each
page you can choose:
Y(es) - include the page
N(o) - do not include the page or
I(gnore) - do not include the page, but if you meet it again, ask again.
X - add the page, but do not check links to and from it
Other possibilities:
A(dd) - add another page, which may have been one that was included before
C(heck) - check links to and from the page, but do not add the page itself
R(emove) - remove a page that is already in the list
L(ist) - show current list of pages to include or to check
"""
# (C) Andre Engels, 2004
# (C) Pywikipedia bot team 2005-2010
#
# Distributed under the terms of the MIT license.
#
__version__='$Id$'
#
import sys, codecs, re
import date
import wikipedia as pywikibot
import catlib
# Per-language text, keyed by language code.  The main script passes this
# table to pywikibot.translate() together with the current site and appends
# the working category name to build the argument of pywikibot.setAction().
msg={
'ar':u'إنشاء أو تحديث التصنيف:',
'en':u'Creation or update of category:',
'es':u'Creación o actualiza de la categoría:',
'fa':u'ایجاد یا تصحیح رده:',
'fr':u'Création ou mise à jour de categorie:',
'he':u'יצירה או עדכון של קטגוריה:',
'ia':u'Creation o actualisation de categoria:',
'it':u'La creazione o laggiornamento di categoria:',
'nl':u'Aanmaak of uitbreiding van categorie:',
'nn':u'oppretting eller oppdatering av kategori:',
'no':u'opprettelse eller oppdatering av kategori:',
'pl':u'Stworzenie lub aktualizacja kategorii:',
'pt':u'Criando ou atualizando categoria:',
}
def rawtoclean(c):
    """Return the page named by the part of c's title before the first '|'.

    c is a 'raw' category object whose title may carry a '|'-separated
    suffix; the result is a pywikibot.Page on the global site `mysite`.
    """
    # Only the text in front of the first pipe is needed, so one split is
    # enough.
    clean_title = c.title().split('|', 1)[0]
    return pywikibot.Page(mysite, clean_title)
def isdate(s):
    """Return True iff s is a date or year.

    The decision is delegated to date.getAutoFormat() for the language of
    the current site: s counts as a date when the first element of the
    returned pair is not None.
    """
    language = pywikibot.getSite().language()
    format_name, _value = date.getAutoFormat(language, s)
    return format_name is not None
def needcheck(pl):
    """Return True if page pl should be queued for review.

    A page is rejected when
    - only the main namespace is worked on (global `main`) and pl is in
      another namespace,
    - pl is already a key of the global `checked` dictionary, or
    - dates are skipped (global `skipdates`) and the title is a date.
    """
    if main and pl.namespace() != 0:
        return False
    if pl in checked:
        return False
    # isdate() is only consulted when -nodates was requested.
    return not (skipdates and isdate(pl.title()))
def include(pl, checklinks=True, realinclude=True, linkterm=None):
    """Put page pl into the working category and queue its neighbours.

    Arguments:
    pl          - the page to handle
    checklinks  - if true, pages linked from / linking to pl are queued
    realinclude - if true, pl itself is edited so that it is in the
                  working category; if false only the links are followed
    linkterm    - optional sort key; when given, pl is filed under a copy
                  of the working category carrying that sort key

    Reads the module globals mysite, workingcat, parentcats, removeparent,
    checkforward and checkbackward, and appends to the globals tocheck and
    checked.
    """
    cl = checklinks
    if linkterm:
        actualworkingcat = catlib.Category(mysite, workingcat.title(),
                                           sortKey=linkterm)
    else:
        actualworkingcat = workingcat
    if realinclude:
        try:
            text = pl.get()
        except pywikibot.NoPage:
            # Nothing to edit; the links may still be followed below.
            pass
        except pywikibot.IsRedirectPage:
            # A redirect is not edited, but its links are always followed,
            # even when the caller passed checklinks=False.
            cl = True
        else:
            cats = pl.categories()
            if workingcat not in cats:
                for c in cats:
                    if removeparent and c in parentcats:
                        # Replace the first parent category found by the
                        # working category instead of adding a new link.
                        catlib.change_category(pl, c, actualworkingcat)
                        break
                else:
                    # for/else: reached only when no parent category was
                    # replaced, so the working category is appended.
                    pl.put(pywikibot.replaceCategoryLinks(
                        text, cats + [actualworkingcat]))
    if cl:
        if checkforward:
            for page2 in pl.linkedPages():
                if needcheck(page2):
                    tocheck.append(page2)
                    checked[page2] = page2
        if checkbackward:
            for refPage in pl.getReferences():
                if needcheck(refPage):
                    tocheck.append(refPage)
                    checked[refPage] = refPage
def exclude(pl, real_exclude=True):
    """Record page pl in the exclusion file.

    When real_exclude is false nothing is written and pl is not touched;
    otherwise the title of pl followed by a newline is written to the
    global file object `excludefile`.
    """
    if not real_exclude:
        return
    excludefile.write('%s\n' % pl.title())
def asktoadd(pl):
    """Ask the user what to do with page pl and act on the answer.

    Pages from another site than the global `mysite` are ignored.  For a
    redirect the target is queued instead (if needcheck() accepts it) and
    no question is asked.

    Otherwise the user is prompted until one of the terminating answers is
    given:
    y - include the page
    c - follow the links of the page without adding it
    z - include the page under a sort key asked from the user
    n - write the page to the exclusion file
    i - skip the page without recording it
    x - include the page without following its links
    The answers o (help), a (add another page), l (list the queue) and
    t (show the next 500 characters of the text) keep the prompt open.
    """
    if pl.site() != mysite:
        return
    if pl.isRedirectPage():
        pl2 = pl.getRedirectTarget()
        if needcheck(pl2):
            tocheck.append(pl2)
            checked[pl2] = pl2
        return
    # Number of characters shown by 't'; grows by 500 on every request.
    ctoshow = 500
    pywikibot.output(u'')
    pywikibot.output(u"==%s==" % pl.title())
    while True:
        answer = raw_input("y(es)/n(o)/i(gnore)/(o)ther options? ")
        if answer == 'y':
            include(pl)
            break
        elif answer == 'c':
            include(pl, realinclude=False)
            break
        elif answer == 'z':
            if pl.exists() and not pl.isRedirectPage():
                linkterm = pywikibot.input(
                    u"In what manner should it be alphabetized?")
                include(pl, linkterm=linkterm)
            else:
                include(pl)
            break
        elif answer == 'n':
            exclude(pl)
            break
        elif answer == 'i':
            exclude(pl, real_exclude=False)
            break
        elif answer == 'o':
            pywikibot.output(u"t: Give the beginning of the text of the page")
            pywikibot.output(
                u"z: Add under another title (as [[Category|Title]])")
            pywikibot.output(
                u"x: Add the page, but do not check links to and from it")
            pywikibot.output(u"c: Do not add the page, but do check links")
            pywikibot.output(u"a: Add another page")
            pywikibot.output(u"l: Give a list of the pages to check")
        elif answer == 'a':
            # pywikibot.input() is used (as for 'z') so that the title is
            # obtained the same way as the other free-text answers instead
            # of as an undecoded byte string from raw_input().
            pagetitle = pywikibot.input(u"Specify page to add:")
            page = pywikibot.Page(pywikibot.getSite(), pagetitle)
            # Dictionary membership, as in needcheck(); no key list needed.
            # NOTE(review): the page is not entered into `checked` here, so
            # it may be offered again later - confirm whether intended.
            if page not in checked:
                include(page)
        elif answer == 'x':
            if pl.exists():
                if pl.isRedirectPage():
                    pywikibot.output(
                        u"Redirect page. Will be included normally.")
                    include(pl, realinclude=False)
                else:
                    include(pl, checklinks=False)
            else:
                pywikibot.output(u"Page does not exist; not added.")
                exclude(pl, real_exclude=False)
            break
        elif answer == 'l':
            pywikibot.output(u"Number of pages still to check: %s"
                             % len(tocheck))
            pywikibot.output(u"Pages to be checked:")
            pywikibot.output(u" - ".join(page.title() for page in tocheck))
            pywikibot.output(u"==%s==" % pl.title())
        elif answer == 't':
            pywikibot.output(u"==%s==" % pl.title())
            try:
                pywikibot.output(u'' + pl.get(get_redirect=True)[0:ctoshow])
            except pywikibot.NoPage:
                pywikibot.output(u"Page does not exist.")
            ctoshow += 500
        else:
            pywikibot.output(u"Not understood.")
# Main script.  Parses the command line, loads the exclusion file of the
# working category, seeds the work queue from the category (or from a start
# page when the category is empty) and then asks the user about every queued
# page.  The names bound here (checked, tocheck, mysite, workingcat,
# parentcats, excludefile and the option flags) are the module globals read
# by the functions above.

# Bound before the try block so that the cleanup in `finally` can tell
# whether the exclusion file was ever opened.
excludefile = None
try:
    checked = {}
    skipdates = False
    checkforward = True
    checkbackward = True
    checkbroken = True
    removeparent = True
    main = True
    workingcatname = []
    tocheck = []
    for arg in pywikibot.handleArgs():
        if arg.startswith('-nodate'):
            skipdates = True
        elif arg.startswith('-forward'):
            checkbackward = False
            checkbroken = False
        elif arg.startswith('-exist'):
            checkbroken = False
        elif arg.startswith('-keepparent'):
            removeparent = False
        elif arg.startswith('-all'):
            main = False
        else:
            workingcatname.append(arg)
    if workingcatname:
        workingcatname = ' '.join(workingcatname)
    else:
        # pywikibot.input() rather than raw_input(): the name is inserted
        # into unicode format strings below, which a raw byte string with
        # non-ASCII characters would break.
        workingcatname = pywikibot.input(u"Which page to start with?")
    mysite = pywikibot.getSite()
    pywikibot.setAction(pywikibot.translate(mysite, msg) + ' ' + workingcatname)
    workingcat = catlib.Category(mysite,
                                 u'%s:%s'
                                 % (mysite.category_namespace(),
                                    workingcatname))
    filename = pywikibot.config.datafilepath(
        'category',
        pywikibot.UnicodeToAsciiHtml(workingcatname) + '_exclude.txt')
    # Only the open() call is guarded: an IOError while *reading* must not
    # be mistaken for a missing file, because the fallback opens the file
    # in 'w' mode and would wipe the recorded exclusions.
    try:
        f = codecs.open(filename, 'r', encoding=mysite.encoding())
    except IOError:
        # File does not exist
        excludefile = codecs.open(filename, 'w', encoding=mysite.encoding())
    else:
        try:
            for line in f.readlines():
                # remove trailing newlines and carriage returns
                title = line.rstrip('\r\n')
                if not title:
                    # A blank line does not name a page.
                    continue
                pl = pywikibot.Page(mysite, title)
                checked[pl] = pl
        finally:
            f.close()
        excludefile = codecs.open(filename, 'a', encoding=mysite.encoding())
    try:
        parentcats = workingcat.categories()
    except pywikibot.Error:
        parentcats = []
    # Do not include articles already in subcats; only checking direct subcats
    subcatlist = workingcat.subcategoriesList()
    if subcatlist:
        pywikibot.getall(mysite, subcatlist)
        for cat in subcatlist:
            for page in cat.articlesList():
                checked[page] = page
    articles = workingcat.articlesList()
    if articles:
        # Mark every current member first so that include() does not queue
        # members that are linked from other members.
        for pl in articles:
            checked[pl] = pl
        pywikibot.getall(mysite, articles)
        for pl in articles:
            include(pl)
    else:
        pywikibot.output(
            u"Category %s does not exist or is empty. Which page to start with?"
            % workingcatname)
        answer = pywikibot.input(u"(Default is [[%s]]):" % workingcatname)
        if not answer:
            answer = workingcatname
        pywikibot.output(u'' + answer)
        pl = pywikibot.Page(mysite, answer)
        tocheck = []
        checked[pl] = pl
        include(pl)
    # `loaded` counts how many pages at the head of the queue have already
    # been passed to pywikibot.getall(); a new batch of at most 50 is
    # requested when it reaches zero.
    loaded = 0
    while tocheck:
        if loaded == 0:
            loaded = min(len(tocheck), 50)
            pywikibot.getall(mysite, tocheck[:loaded])
        current = tocheck[0]
        # With -exist / -forward (checkbroken false) non-existing pages are
        # dropped silently.
        if checkbroken or current.exists():
            asktoadd(current)
        # The head is removed only after asktoadd() so that the 'l' answer
        # still counts the page being asked about.
        del tocheck[0]
        loaded -= 1
finally:
    pywikibot.stopme()
    if excludefile is not None:
        try:
            excludefile.close()
        except (IOError, OSError):
            # Best effort: a failure to close must not mask an exception
            # that is already propagating out of the script.
            pass