# -*- coding: utf-8 -*-
"""
Library to get and put pages on a MediaWiki wiki.
Contents of the library (objects and functions to be used outside)
Classes:
Page(site, title): A page on a MediaWiki site
DataPage(site, title): A Page object for the data repository.
ImagePage(site, title): An image descriptor Page
Site(lang, fam): A MediaWiki site
Factory functions:
Family(name): Import the named family
getSite(lang, fam): Return a Site instance
Exceptions:
Error: Base class for all exceptions in this module
NoUsername: Username is not in user-config.py
NoPage: Page does not exist on the wiki
NoSuchSite: Site does not exist
IsRedirectPage: Page is a redirect page
IsNotRedirectPage: Page is not a redirect page
LockedPage: Page is locked
SectionError: The section specified in the Page title does not exist
PageNotSaved: Saving the page has failed
EditConflict: PageNotSaved due to edit conflict while uploading
SpamfilterError: PageNotSaved due to MediaWiki spam filter
LongPageError: PageNotSaved due to length limit
ServerError: Got unexpected response from wiki server
BadTitle: Server responded with BadTitle
UserBlocked: Client's username or IP has been blocked
PageNotFound: Page not found in list
Objects:
get_throttle: Call to limit rate of read-access to wiki
put_throttle: Call to limit rate of write-access to wiki
Other functions:
getall(): Load a group of pages
handleArgs(): Process all standard command line arguments (such as
-family, -lang, -log and others)
translate(xx, dict): dict is a dictionary, giving text depending on
language, xx is a language. Returns the text in the most applicable
language for the xx: wiki
setAction(text): Use 'text' instead of "Wikipedia python library" in
edit summaries
setUserAgent(text): Sets the string being passed to the HTTP server as
the User-agent: header. The default is
'<script>/<revision> Pywikipediabot/1.0', where '<script>' is the file
name of the currently executing script and '<revision>' is the SVN
revision of Pywikipediabot.
output(text): Prints the text 'text' in the encoding of the user's
console. **Use this instead of "print" statements**
stdout(text): Prints to stdout **Use this for script results only!**
warning(text): Prints warnings.
error(text): Prints errors.
log(text): Prints general log messages.
critical(text): Prints critical errors.
debug(text): Prints debug information.
debugDump(): Prints huge debug information.
exception(msg): Prints exceptions and tracebacks.
input(text): Asks input from the user, printing the text 'text' first.
inputChoice: Shows user a list of choices and returns user's selection.
showDiff(oldtext, newtext): Prints the differences between oldtext and
newtext on the screen
Wikitext manipulation functions: each of these takes a unicode string
containing wiki text as its first argument, and returns a modified version
of the text unless otherwise noted --
replaceExcept: replace all instances of 'old' by 'new', skipping any
instances of 'old' within comments and other special text blocks
removeDisabledParts: remove text portions exempt from wiki markup
isDisabled(text,index): return boolean indicating whether text[index] is
within a non-wiki-markup section of text
decodeEsperantoX: decode Esperanto text using the x convention.
encodeEsperantoX: convert wikitext to the Esperanto x-encoding.
findmarker(text, startwith, append): return a string which is not part
of text
expandmarker(text, marker, separator): return marker string expanded
backwards to include separator occurrences plus whitespace
Wikitext manipulation functions for interlanguage links:
getLanguageLinks(text,xx): extract interlanguage links from text and
return in a dict
removeLanguageLinks(text): remove all interlanguage links from text
removeLanguageLinksAndSeparator(text, site, marker, separator = ''):
remove language links, whitespace, and preceding separators from text
replaceLanguageLinks(oldtext, new): remove the language links and
replace them with links from a dict like the one returned by
getLanguageLinks
interwikiFormat(links): convert a dict of interlanguage links to text
(using same dict format as getLanguageLinks)
interwikiSort(sites, inSite): sorts a list of sites according to interwiki
sort preference of inSite.
url2link: Convert urlname of a wiki page into interwiki link format.
Wikitext manipulation functions for category links:
getCategoryLinks(text): return list of Category objects corresponding
to links in text
removeCategoryLinks(text): remove all category links from text
replaceCategoryLinksAndSeparator(text, site, marker, separator = ''):
remove category links, whitespace, and preceding separators from text
replaceCategoryLinks(oldtext,new): replace the category links in oldtext by
those in a list of Category objects
replaceCategoryInPlace(text,oldcat,newtitle): replace a single link to
oldcat with a link to category given by newtitle
categoryFormat(links): return a string containing links to all
Categories in a list.
Unicode utility functions:
UnicodeToAsciiHtml: Convert unicode to a bytestring using HTML entities.
url2unicode: Convert url-encoded text to unicode using a site's encoding.
unicode2html: Ensure unicode string is encodable; if not, convert it to
ASCII for HTML.
html2unicode: Replace HTML entities in text with unicode characters.
stopme(): Call this when a bot is finished with, or no longer communicating
with, the wiki. It removes the bot from the list of running processes,
and thus no longer slows down other bot threads.
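Typical usage (an illustrative sketch, not a verbatim recipe; it assumes
a working user-config.py and network access):
    import wikipedia
    site = wikipedia.getSite('en', 'wikipedia')
    page = wikipedia.Page(site, u'Wikipedia:Sandbox')
    try:
        text = page.get()
    except wikipedia.NoPage:
        text = u''
    wikipedia.showDiff(text, text + u'\ntest')
    page.put(text + u'\ntest', comment=u'test edit')
    wikipedia.stopme()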
"""
#
# (C) Pywikipedia bot team, 2003-2013
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id$'
import os, sys
import httplib, socket, urllib, urllib2, cookielib
import traceback, pprint
import time, threading, Queue
import re, codecs, difflib
try:
from hashlib import md5
except ImportError: # Python 2.4 compatibility
from md5 import new as md5
import xml.sax, xml.sax.handler
import htmlentitydefs
import warnings
import unicodedata
import xmlreader
import externals # allow imports from externals
externals.check_setup('BeautifulSoup.py')
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup, SoupStrainer
import weakref
import logging, logging.handlers
try:
# For Python 2.6 and newer
import json
if not hasattr(json, 'loads'):
# 'json' can also be the name of the unrelated package from
# http://pypi.python.org/pypi/python-json
raise ImportError
except ImportError:
externals.check_setup('simplejson')
import simplejson as json
# Splitting the bot into library parts
from pywikibot.support import *
import config, login, query
from pywikibot import version
# Check Unicode support (is this a wide or narrow python build?)
# See http://www.python.org/doc/peps/pep-0261/
try:
unichr(66365) # a character beyond the Basic Multilingual Plane, needs 32-bit encoding
WIDEBUILD = True
except ValueError:
WIDEBUILD = False
from logging import DEBUG, INFO, WARNING, ERROR, CRITICAL
STDOUT = 16
VERBOSE = 18
INPUT = 25
# Format string for the default user agent.
USER_AGENT_FORMAT = '{script}/r{version[rev]} Pywikipediabot/1.0'
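# For example (a sketch; the revision number is hypothetical):
#   USER_AGENT_FORMAT.format(script='interwiki', version={'rev': '11073'})
#   returns 'interwiki/r11073 Pywikipediabot/1.0'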
SaxError = xml.sax._exceptions.SAXParseException
# Pre-compile re expressions
reNamespace = re.compile("^(.+?) *: *(.*)$")
Rwatch = re.compile(
r"<input type='hidden' value=\"(.*?)\" name=\"wpEditToken\"")
Rwatchlist = re.compile(r"<input tabindex='[\d]+' type='checkbox' "
r"name='wpWatchthis' checked='checked'")
Rlink = re.compile(r'\[\[(?P<title>[^\]\|\[]*)(\|[^\]]*)?\]\]')
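# e.g. (a sketch): Rlink.search(u'See [[Foo|the foo]].').group('title')
# returns u'Foo'; the optional second group captures the u'|the foo' label.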
# Page objects (defined here) represent the page itself, including its contents.
class Page(object):
"""Page: A MediaWiki page
Constructor has two required parameters:
1) The wiki Site on which the page resides [note that, if the
title is in the form of an interwiki link, the Page object may
have a different Site than this]
2) The title of the page as a unicode string
Optional parameters:
insite - the wiki Site where this link was found (to help decode
interwiki links)
defaultNamespace - A namespace to use if the link does not contain one
Methods available:
title : The name of the page, including namespace and
section if any
urlname : Title, in a form suitable for a URL
namespace : The namespace in which the page is found
section : The section of the page (the part of the title
after '#', if any)
sectionFreeTitle : Title, without the section part
site : The wiki this page is in
encoding : The encoding of the page
isAutoTitle : Title can be translated using the autoFormat method
autoFormat : Auto-format certain dates and other standard
format page titles
isCategory : True if the page is a category
isDisambig (*) : True if the page is a disambiguation page
isImage : True if the page is an image
isRedirectPage (*) : True if the page is a redirect, false otherwise
getRedirectTarget (*) : The page the page redirects to
isTalkPage : True if the page is in any "talk" namespace
toggleTalkPage : Return the talk page (if this is one, return the
non-talk page)
get (*) : The text of the page
getSections (*) : Retrieve page section heading and assign them to
the byte offset
latestRevision (*) : The page's current revision id
userName : Last user to edit page
userNameHuman : Last human (non-bot) user to edit page
isIpEdit : True if last editor was unregistered
editTime : Timestamp of the last revision to the page
previousRevision (*) : The revision id of the previous version
permalink (*) : The url of the permalink of the current version
getOldVersion(id) (*) : The text of a previous version of the page
getRestrictions : Returns a protection dictionary
getVersionHistory : Load the version history information from wiki
getVersionHistoryTable: Create a wiki table from the history data
fullVersionHistory : Return all past versions including wikitext
contributingUsers : Return set of users who have edited page
getCreator : Function to get the first editor of a page
getLatestEditors : Function to get the last editors of a page
exists (*) : True if the page actually exists, false otherwise
isEmpty (*) : True if the page contains 4 characters or less,
not counting interwiki and category links
interwiki (*) : The interwiki links from the page (list of Pages)
categories (*) : The categories the page is in (list of Pages)
linkedPages (*) : The normal pages linked from the page (list of
Pages)
imagelinks (*) : The pictures on the page (list of ImagePages)
templates (*) : All templates referenced on the page (list of
Pages)
templatesWithParams(*): All templates on the page, with list of parameters
getReferences : List of pages linking to the page
canBeEdited (*) : True if page is unprotected or user has edit
privileges
protection(*) : This page protection level
botMayEdit (*) : True if bot is allowed to edit page
put(newtext) : Saves the page
put_async(newtext) : Queues the page to be saved asynchronously
append(newtext) : Append to page section
watch : Add the page to the watchlist
unwatch : Remove the page from the watchlist
move : Move the page to another title
delete : Deletes the page (requires being logged in)
protect : Protect or unprotect a page (requires sysop status)
removeImage : Remove all instances of an image from this page
replaceImage : Replace all instances of an image with another
loadDeletedRevisions : Load all deleted versions of this page
getDeletedRevision : Return a particular deleted revision
markDeletedRevision : Mark a version to be undeleted, or not
undelete : Undelete past version(s) of the page
purgeCache : Purge page from server cache
(*) : This loads the page if it has not been loaded before; permalink might
even reload it if it has been loaded before
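Example (an illustrative sketch of the accessors listed above):
    site = getSite('en', 'wikipedia')
    page = Page(site, u'Talk:Sandbox#History')
    page.title()            # u'Talk:Sandbox#History'
    page.sectionFreeTitle() # u'Talk:Sandbox'
    page.section()          # u'History'
    page.namespace()        # 1
    page.isTalkPage()       # True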
"""
def __init__(self, site, title, insite=None, defaultNamespace=0):
"""Instantiate a Page object.
"""
try:
# if _editrestriction is True, it means that the page has been found
# to have an edit restriction, but we do not know yet whether the
# restriction affects us or not
self._editrestriction = False
if site is None or isinstance(site, basestring):
site = getSite(site)
self._site = site
if not insite:
insite = site
# Clean up the name, it can come from anywhere.
# Convert HTML entities to unicode
t = html2unicode(title)
# Convert URL-encoded characters to unicode
# Sometimes users copy the link to a site from one to another.
# Try both the source site and the destination site to decode.
try:
t = url2unicode(t, site=insite, site2=site)
except UnicodeDecodeError:
raise InvalidTitle(u'Bad page title : %s' % t)
# Normalize unicode string to a NFC (composed) format to allow
# proper string comparisons. According to
# http://svn.wikimedia.org/viewvc/mediawiki/branches/REL1_6/phase3/includes/normal/UtfNormal.php?view=markup
# the mediawiki code normalizes everything to NFC, not NFKC
# (which might result in information loss).
t = unicodedata.normalize('NFC', t)
if u'\ufffd' in t:
raise InvalidTitle("Title contains illegal char (\\uFFFD)")
# Replace underscores by spaces
t = t.replace(u"_", u" ")
# Replace multiple spaces with a single space
while u"  " in t: t = t.replace(u"  ", u" ")
# Strip spaces at both ends
t = t.strip()
# Remove left-to-right and right-to-left markers.
t = t.replace(u'\u200e', '').replace(u'\u200f', '')
if t.startswith(':'):
t = t[1:]
prefix = True
else:
prefix = False
self._namespace = defaultNamespace
#
# This code was adapted from Title.php : secureAndSplit()
#
# Namespace or interwiki prefix
while True:
m = reNamespace.match(t)
if not m:
# leading colon implies main namespace instead of default
if t.startswith(':'):
t = t[1:]
self._namespace = 0
elif prefix:
self._namespace = 0
else:
self._namespace = defaultNamespace
break
prefix = False
p = m.group(1)
lowerNs = p.lower()
ns = self._site.getNamespaceIndex(lowerNs)
if ns:
t = m.group(2)
self._namespace = ns
break
if lowerNs in self._site.family.langs.keys():
# Interwiki link
t = m.group(2)
# Redundant interwiki prefix to the local wiki
if lowerNs == self._site.lang:
if t == '':
raise Error("Can't have an empty self-link")
else:
self._site = getSite(lowerNs, self._site.family.name)
if t == '':
t = self._site.mediawiki_message('Mainpage')
elif lowerNs in self._site.family.get_known_families(site = self._site):
if self._site.family.get_known_families(site = self._site)[lowerNs] == self._site.family.name:
t = m.group(2)
else:
# This page is from a different family
if verbose:
output(u"Target link '%s' has different family '%s'" % (title, lowerNs))
if self._site.family.name in ['commons', 'meta']:
# When the source wiki is commons or meta,
# w:page redirects you to w:en:page
otherlang = 'en'
else:
otherlang = self._site.lang
familyName = self._site.family.get_known_families(site=self._site)[lowerNs]
if familyName in ['commons', 'meta']:
otherlang = familyName
try:
self._site = getSite(otherlang, familyName)
except ValueError:
raise NoPage("""\
%s is not a local page on %s, and the %s family is
not supported by PyWikipediaBot!"""
% (title, self._site, familyName))
t = m.group(2)
else:
# If there's no recognized interwiki or namespace,
# then let the colon expression be part of the title.
break
if not t:
raise InvalidTitle(u"Invalid title '%s'" % title )
sectionStart = t.find(u'#')
# But maybe there are magic words like {{#time|}}
# TODO: recognize magic word and templates inside links
# see http://la.wikipedia.org/w/index.php?title=997_Priska&diff=prev&oldid=1038880
if sectionStart > 0:
# Category pages do not have sections.
if self._namespace == 14:
raise InvalidTitle(u"Invalid section in category '%s'" % t)
else:
t, sec = t.split(u'#', 1)
self._section = sec.lstrip() or None
t = t.rstrip()
elif sectionStart == 0:
raise InvalidTitle(u"Invalid title starting with a #: '%s'" % t)
else:
self._section = None
if t:
if not self._site.nocapitalize:
t = t[:1].upper() + t[1:]
# reassemble the title from its parts
if self._namespace != 0:
t = u'%s:%s' % (self._site.namespace(self._namespace), t)
if self._section:
t += u'#' + self._section
self._title = t
self.editRestriction = None
self.moveRestriction = None
self._permalink = None
self._userName = None
self._comment = None
self._ipedit = None
self._editTime = None
self._startTime = '0'
# For the Flagged Revisions MediaWiki extension
self._revisionId = None
self._deletedRevs = None
except NoSuchSite:
raise
except:
if verbose:
output(u"Exception in Page constructor")
output(
u"site=%s, title=%s, insite=%s, defaultNamespace=%i"
% (site, title, insite, defaultNamespace)
)
raise
@property
def site(self):
"""Return the Site object for the wiki on which this Page resides."""
return self._site
@property
def image_repository(self):
"""Return the Site object for the image repository."""
return self.site.image_repository()
@property
def data_repository(self):
"""Return the Site object for the data repository."""
return self.site.data_repository()
def namespace(self):
"""Return the number of the namespace of the page.
Only recognizes those namespaces defined in family.py.
If not defined, it will return 0 (the main namespace).
"""
return self._namespace
def encoding(self):
"""Return the character encoding used on this Page's wiki Site."""
return self._site.encoding()
@deprecate_arg("decode", None)
def title(self, underscore=False, savetitle=False, withNamespace=True,
withSection=True, asUrl=False, asLink=False,
allowInterwiki=True, forceInterwiki=False, textlink=False,
as_filename=False):
"""Return the title of this Page, as a Unicode string.
@param underscore: if true, replace all ' ' characters with '_'
@param withNamespace: if false, omit the namespace prefix
@param withSection: if false, omit the section
@param asUrl: - not implemented yet -
@param asLink: if true, return the title in the form of a wikilink
@param allowInterwiki: (only used if asLink is true) if true, format
the link as an interwiki link if necessary
@param forceInterwiki: (only used if asLink is true) if true, always
format the link as an interwiki link
@param textlink: (only used if asLink is true) if true, place a ':'
before Category: and Image: links
@param as_filename: - not implemented yet -
@param savetitle: if True, encode any wiki syntax in the title.
"""
title = self._title
if not withNamespace and self.namespace() != 0:
title = title.split(':', 1)[1]
if asLink:
iw_target_site = getSite()
iw_target_family = getSite().family
if iw_target_family.interwiki_forward:
iw_target_family = pywikibot.Family(iw_target_family.interwiki_forward)
if allowInterwiki and (forceInterwiki or self._site != iw_target_site):
colon = ""
if textlink:
colon = ":"
if self._site.family != iw_target_family \
and self._site.family.name != self._site.lang:
title = u'[[%s%s:%s:%s]]' % (colon, self._site.family.name,
self._site.lang, title)
else:
title = u'[[%s%s:%s]]' % (colon, self._site.lang, title)
elif textlink and (self.isImage() or self.isCategory()):
title = u'[[:%s]]' % title
else:
title = u'[[%s]]' % title
if savetitle or asLink:
# Ensure there's no wiki syntax in the title
title = title.replace(u"''", u'%27%27')
if underscore:
title = title.replace(' ', '_')
if not withSection:
sectionName = self.section(underscore=underscore)
if sectionName:
title = title[:-len(sectionName)-1]
return title
#@deprecated("Page.title(withNamespace=False)")
def titleWithoutNamespace(self, underscore=False):
"""Return title of Page without namespace and without section."""
return self.title(underscore=underscore, withNamespace=False,
withSection=False)
def titleForFilename(self):
"""
Return the title of the page in a form suitable for a filename on
the user's file system.
"""
result = self.title()
# Replace characters that are not possible in file names on some
# systems.
# Spaces are possible on most systems, but are bad for URLs.
for forbiddenChar in ':*?/\\ ':
result = result.replace(forbiddenChar, '_')
return result
@deprecate_arg("decode", None)
def section(self, underscore = False):
"""Return the name of the section this Page refers to.
The section is the part of the title following a '#' character, if
any. If no section is present, return None.
"""
section = self._section
if section and underscore:
section = section.replace(' ', '_')
return section
def sectionFreeTitle(self, underscore=False):
"""Return the title of this Page, without the section (if any)."""
sectionName = self.section(underscore=underscore)
title = self.title(underscore=underscore)
if sectionName:
return title[:-len(sectionName)-1]
else:
return title
def urlname(self, withNamespace=True):
"""Return the Page title encoded for use in an URL."""
title = self.title(withNamespace=withNamespace, underscore=True)
encodedTitle = title.encode(self.site().encoding())
return urllib.quote(encodedTitle)
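# e.g. (a sketch): a page titled u'Foo bar' yields urlname() == 'Foo_bar';
# non-ASCII titles are percent-encoded in the site's encoding.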
def __str__(self):
"""Return a console representation of the pagelink."""
return self.title(asLink=True, forceInterwiki=True
).encode(config.console_encoding,
"xmlcharrefreplace")
def __unicode__(self):
return self.title(asLink=True, forceInterwiki=True)
def __repr__(self):
"""Return a more complete string representation."""
return "%s{%s}" % (self.__class__.__name__,
self.title(asLink=True).encode(config.console_encoding))
def __cmp__(self, other):
"""Test for equality and inequality of Page objects.
Page objects are "equal" if and only if they are on the same site
and have the same normalized title, including section if any.
Page objects are sortable by namespace first, then by title.
"""
if not isinstance(other, Page):
# especially, return -1 if other is None
return -1
if self._site == other._site:
return cmp(self._title, other._title)
else:
return cmp(self._site, other._site)
def __hash__(self):
# Pseudo method that makes it possible to store Page objects as keys
# in hash-tables. This relies on the fact that the string
# representation of an instance cannot change after construction.
return hash(unicode(self))
@deprecated("Page.title(asLink=True)")
def aslink(self, forceInterwiki=False, textlink=False, noInterwiki=False):
"""Return a string representation in the form of a wikilink.
If forceInterwiki is True, return an interwiki link even if it
points to the home wiki. If False, return an interwiki link only if
needed.
If textlink is True, always return a link in text form (that is,
interwiki links and internal links to the Category: and Image:
namespaces will be preceded by a : character).
DEPRECATED to merge to rewrite branch:
use self.title(asLink=True) instead.
"""
return self.title(asLink=True, forceInterwiki=forceInterwiki,
allowInterwiki=not noInterwiki, textlink=textlink)
def autoFormat(self):
"""Return (dictName, value) if title is in date.autoFormat dictionary.
Value can be a year, date, etc., and dictName is 'YearBC',
'Year_December', or another dictionary name. Please note that two
entries may have exactly the same autoFormat, but be in two
different namespaces, as some sites have categories with the
same names. Regular titles return (None, None).
"""
if not hasattr(self, '_autoFormat'):
import date
self._autoFormat = date.getAutoFormat(self.site().language(),
self.title(withNamespace=False))
return self._autoFormat
def isAutoTitle(self):
"""Return True if title of this Page is in the autoFormat dictionary."""
return self.autoFormat()[0] is not None
def get(self, force=False, get_redirect=False, throttle=True,
sysop=False, change_edit_time=True, expandtemplates=False):
"""Return the wiki-text of the page.
This will retrieve the page from the server if it has not been
retrieved yet, or if force is True. This can raise the following
exceptions that should be caught by the calling code:
@exception NoPage The page does not exist
@exception IsRedirectPage The page is a redirect. The argument of the
exception is the title of the page it
redirects to.
@exception SectionError The section does not exist on a page with
a # link
@param force reload all page attributes, including errors.
@param get_redirect return the redirect text, do not follow the
redirect, do not raise an exception.
@param sysop if the user has a sysop account, use it to
retrieve this page
@param change_edit_time if False, do not check this version for
changes before saving. This should be used only
if the page has been loaded previously.
@param expandtemplates all templates in the page content are fully
resolved too (if API is used).
"""
# NOTE: The following few NoPage exceptions could already be thrown at
# the Page() constructor. They are raised here instead for convenience,
# because all scripts are prepared for NoPage exceptions raised by
# get(), but not for such raised by the constructor.
# \ufffd represents a badly encoded character, the other characters are
# disallowed by MediaWiki.
for illegalChar in u'#<>[]|{}\n\ufffd':
if illegalChar in self.sectionFreeTitle():
if verbose:
output(u'Illegal character in %s!'
% self.title(asLink=True))
raise NoPage('Illegal character in %s!'
% self.title(asLink=True))
if self.namespace() == -1:
raise NoPage('%s is in the Special namespace!'
% self.title(asLink=True))
if self.site().isInterwikiLink(self.title()):
raise NoPage('%s is not a local page on %s!'
% (self.title(asLink=True), self.site()))
if force:
# When forcing, we retry the page no matter what:
# * Old exceptions and contents do not apply any more
# * Deleting _contents and _expandcontents to force reload
for attr in ['_redirarg', '_getexception',
'_contents', '_expandcontents',
'_sections']:
if hasattr(self, attr):
delattr(self, attr)
else:
# Make sure we re-raise an exception we got on an earlier attempt
if hasattr(self, '_redirarg') and not get_redirect:
raise IsRedirectPage, self._redirarg
elif hasattr(self, '_getexception'):
if self._getexception == IsRedirectPage and get_redirect:
pass
else:
raise self._getexception
# Make sure we did try to get the contents once
if expandtemplates:
attr = '_expandcontents'
else:
attr = '_contents'
if not hasattr(self, attr):
try:
contents = self._getEditPage(get_redirect=get_redirect, throttle=throttle, sysop=sysop,
expandtemplates = expandtemplates)
if expandtemplates:
self._expandcontents = contents
else:
self._contents = contents
hn = self.section()
if hn:
m = re.search("=+[ ']*%s[ ']*=+" % re.escape(hn),
self._contents)
if verbose and not m:
warning(u"Section does not exist: %s" % self)
# Store any exceptions for later reference
except NoPage:
self._getexception = NoPage
raise
except IsRedirectPage, arg:
self._getexception = IsRedirectPage
self._redirarg = arg
if not get_redirect:
raise
except SectionError:
self._getexception = SectionError
raise
except UserBlocked:
if self.site().loggedInAs(sysop=sysop):
raise UserBlocked(self.site(), unicode(self))
else:
if verbose:
output("The IP address is blocked, retry by login.")
self.site().forceLogin(sysop=sysop)
return self.get(force, get_redirect, throttle, sysop, change_edit_time)
if expandtemplates:
return self._expandcontents
return self._contents
def _getEditPage(self, get_redirect=False, throttle=True, sysop=False,
oldid=None, change_edit_time=True, expandtemplates=False):
"""Get the contents of the Page via API query
Do not use this directly, use get() instead.
Arguments:
oldid - Retrieve an old revision (by id), not the current one
get_redirect - Get the contents, even if it is a redirect page
expandtemplates - Fully resolve templates within page content
(if API is used)
This method returns the raw wiki text as a unicode string.
"""
if not self.site().has_api() or self.site().versionnumber() < 12:
return self._getEditPageOld(get_redirect, throttle, sysop, oldid, change_edit_time)
params = {
'action': 'query',
'titles': self.title(),
'prop': ['revisions', 'info'],
'rvprop': ['content', 'ids', 'flags', 'timestamp', 'user', 'comment', 'size'],
'rvlimit': 1,
#'talkid' valid for release > 1.12
#'url', 'readable' valid for release > 1.14
'inprop': ['protection', 'subjectid'],
#'intoken': 'edit',
}
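# The resulting API request is roughly (illustrative):
#   api.php?action=query&titles=<title>&prop=revisions|info
#       &rvprop=content|ids|flags|timestamp|user|comment|size
#       &rvlimit=1&inprop=protection|subjectid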
if oldid:
params['rvstartid'] = oldid
if expandtemplates:
params[u'rvexpandtemplates'] = u''
if throttle:
get_throttle()
textareaFound = False
# retrying loop is done by query.GetData
data = query.GetData(params, self.site(), sysop=sysop)
if 'error' in data:
raise RuntimeError("API query error: %s" % data)
if 'pages' not in data['query']:
raise RuntimeError("API query error, no pages found: %s" % data)
pageInfo = data['query']['pages'].values()[0]
if data['query']['pages'].keys()[0] == "-1":
if 'missing' in pageInfo:
raise NoPage(self.site(), unicode(self),
"Page does not exist. In rare cases, if you are certain the page does exist, look into overriding family.RversionTab")
elif 'invalid' in pageInfo:
raise BadTitle('BadTitle: %s' % self)
elif 'revisions' in pageInfo: #valid Title
lastRev = pageInfo['revisions'][0]
if isinstance(lastRev['*'], basestring):
textareaFound = True
# Page data arrived with 'revisions' in pageInfo, but
# lastRev['*'] was False instead of the content. The page itself
# had been deleted, but 'missing' was not in pageInfo as expected.
# For now raise ServerError(), though maybe it should be NoPage().
if not textareaFound:
if verbose:
output(unicode(pageInfo))
raise ServerError('ServerError: No textarea found in %s' % self)
self.editRestriction = ''
self.moveRestriction = ''
# Note: user may be hidden and mw returns 'userhidden' flag
if 'userhidden' in lastRev:
self._userName = None
else:
self._userName = lastRev['user']
self._ipedit = 'anon' in lastRev
try:
self._comment = lastRev['comment']
except KeyError:
self._comment = None
for restr in pageInfo['protection']:
if restr['type'] == 'edit':
self.editRestriction = restr['level']
elif restr['type'] == 'move':
self.moveRestriction = restr['level']
self._revisionId = lastRev['revid']
if change_edit_time:
self._editTime = parsetime2stamp(lastRev['timestamp'])
if "starttimestamp" in pageInfo:
self._startTime = parsetime2stamp(pageInfo["starttimestamp"])
self._isWatched = False # watch status cannot be determined via the API for now
pagetext = lastRev['*']
pagetext = pagetext.rstrip()
# pagetext must not be passed through decodeEsperantoX() when loaded via API
m = self.site().redirectRegex().match(pagetext)
if m:
# page text matches the redirect pattern
if self.section() and "#" not in m.group(1):
redirtarget = "%s#%s" % (m.group(1), self.section())
else:
redirtarget = m.group(1)
if get_redirect:
self._redirarg = redirtarget
else:
raise IsRedirectPage(redirtarget)
if self.section() and \
not does_text_contain_section(pagetext, self.section()):
try:
self._getexception
except AttributeError:
raise SectionError # Page has no section by this name
return pagetext
def _getEditPageOld(self, get_redirect=False, throttle=True, sysop=False,
oldid=None, change_edit_time=True):
"""Get the contents of the Page via the edit page."""
if verbose:
output(u'Getting page %s' % self.title(asLink=True))
path = self.site().edit_address(self.urlname())
if oldid:
path += "&oldid="+oldid
# Make sure Brion doesn't get angry by waiting if the last time a page
# was retrieved was not long enough ago.
if throttle:
get_throttle()
textareaFound = False
retry_idle_time = 1
while not textareaFound:
text = self.site().getUrl(path, sysop = sysop)
if "<title>Wiki does not exist</title>" in text:
raise NoSuchSite(u'Wiki %s does not exist yet' % self.site())
# Extract the actual text from the textarea
m1 = re.search('<textarea([^>]*)>', text)
m2 = re.search('</textarea>', text)
if m1 and m2:
i1 = m1.end()
i2 = m2.start()
textareaFound = True
else:
# Search for messages without a "view source" link (not used in new versions)
if self.site().mediawiki_message('whitelistedittitle') in text:
raise NoPage(u'Page editing is forbidden for anonymous '
u'users.')
elif self.site().has_mediawiki_message('nocreatetitle') and \
self.site().mediawiki_message('nocreatetitle') in text:
raise NoPage(self.site(), unicode(self))
# Bad title
elif 'var wgPageName = "Special:Badtitle";' in text \
or self.site().mediawiki_message('badtitle') in text:
raise BadTitle('BadTitle: %s' % self)
# find out if the username or IP has been blocked
elif self.site().isBlocked():
raise UserBlocked(self.site(), unicode(self))
# If there is no text area and the heading is 'View Source'
# but user is not blocked, the page does not exist, and is
# locked
elif self.site().mediawiki_message('viewsource') in text:
raise NoPage(self.site(), unicode(self))
# Some of the newest versions don't have a "view source" tag for
# non-existent pages.
# Also check the div class, because if the language is not English
# the bot cannot otherwise tell that the page is blocked.
elif self.site().mediawiki_message('badaccess') in text or \
"<div class=\"permissions-errors\">" in text:
raise NoPage(self.site(), unicode(self))
elif config.retry_on_fail:
if "<title>Wikimedia Error</title>" in text:
output( u"Wikimedia has technical problems; will retry in %i minutes." % retry_idle_time)
else:
output( unicode(text) )
# We assume that the server is down. Wait some time, then try again.
warning( u"No text area found on %s%s. Maybe the server is down. Retrying in %i minutes..." % (self.site().hostname(), path, retry_idle_time) )
time.sleep(retry_idle_time * 60)
# Next time wait longer, but not longer than half an hour
retry_idle_time *= 2
if retry_idle_time > 30:
retry_idle_time = 30
else:
output( u"Failed to access wiki")
sys.exit(1)
# Check for restrictions
m = re.search('var wgRestrictionEdit = \\["(\w+)"\\]', text)
if m:
if verbose:
debug(u"page is locked for group %s" % m.group(1))
self.editRestriction = m.group(1)
else:
self.editRestriction = ''
m = re.search('var wgRestrictionMove = \\["(\w+)"\\]', text)
if m:
self.moveRestriction = m.group(1)
else:
self.moveRestriction = ''
m = re.search('name=["\']baseRevId["\'] type=["\']hidden["\'] value="(\d+)"', text)
if m:
self._revisionId = m.group(1)
if change_edit_time:
# Get timestamps
m = re.search('value="(\d+)" name=["\']wpEdittime["\']', text)
if m:
self._editTime = m.group(1)
else:
self._editTime = "0"
m = re.search('value="(\d+)" name=["\']wpStarttime["\']', text)
if m:
self._startTime = m.group(1)
else:
self._startTime = "0"
# Find out if page actually exists. Only existing pages have a
# version history tab.
if self.site().family.RversionTab(self.site().language()):
# The family defines its own pattern for the version history tab
# (some families present it in another form).
RversionTab = re.compile(self.site().family.RversionTab(self.site().language()))
else:
RversionTab = re.compile(r'<li id="ca-history"><a href=".*?title=.*?&action=history".*?>.*?</a></li>', re.DOTALL)
matchVersionTab = RversionTab.search(text)
if not matchVersionTab and not self.site().family.name == 'wikitravel':
raise NoPage(self.site(), unicode(self),
"Page does not exist. In rare cases, if you are certain the page does exist, look into overriding family.RversionTab" )
# Look if the page is on our watchlist
matchWatching = Rwatchlist.search(text)
if matchWatching: