interwiki.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script to check language links for general pages. This works by downloading the
page, and using existing translations plus hints from the command line to
download the equivalent pages from other languages. All of these pages are
downloaded as well and checked for interwiki links recursively, until no new
links are encountered. A rationalization process then selects the
right interwiki links, and if this is unambiguous, the interwiki links in the
original page will be automatically updated and the modified page uploaded.
These command-line arguments can be used to specify which pages to work on:
&pagegenerators_help;
-days: Like -years, but runs through all date pages. Stops at
Dec 31. If the argument is given in the form -days:X,
it will start at month no. X through Dec 31. If the
argument is simply given as -days, it will run from
Jan 1 through Dec 31. E.g. for -days:9 it will run
from Sep 1 through Dec 31.
-years: run on all year pages in numerical order. Stop at year 2050.
If the argument is given in the form -years:XYZ, it
will run from [[XYZ]] through [[2050]]. If XYZ is a
negative value, it is interpreted as a year BC. If the
argument is simply given as -years, it will run from 1
through 2050.
This implies -noredirect.
-new: Work on the 100 newest pages. If given as -new:x, will work
on the x newest pages.
When multiple -namespace parameters are given, x pages are
inspected, and only the ones in the selected name spaces are
processed. Use -namespace:all for all namespaces. Without
-namespace, only article pages are processed.
This implies -noredirect.
-restore: restore a set of "dumped" pages the robot was working on
when it terminated. The dump file will be subsequently
removed.
-restore:all restore a set of "dumped" pages from all dump files for a given
family remaining in the "interwiki-dumps" directory. All
these dump files will subsequently be removed. If the
restore process is interrupted again, it saves all
unprocessed pages in one new dump file for the given site.
-continue: like restore, but after having gone through the dumped pages,
continue alphabetically starting at the last of the dumped
pages. The dump file will be subsequently removed.
-warnfile: used as -warnfile:filename, reads all warnings from the
given file that apply to the home wiki language,
and reads the rest of the warning as a hint. Then
treats all the mentioned pages. A quicker way to
implement warnfile suggestions without verifying them
against the live wiki is using the warnfile.py
script.
Additionally, these arguments can be used to restrict the bot to certain pages:
-namespace:n Number or name of namespace to process. The parameter can be
used multiple times. It works in combination with all other
parameters, except for the -start parameter. If you e.g.
want to iterate over all categories starting at M, use
-start:Category:M.
-number: used as -number:#, specifies that the robot should process
that number of pages and then stop. This is only useful in
combination with -start. The default is not to stop.
-until: used as -until:title, specifies that the robot should
process pages in wiki default sort order up to, and
including, "title" and then stop. This is only useful in
combination with -start. The default is not to stop.
Note: do not specify a namespace, even if -start has one.
-bracket only work on pages that have (in the home language)
parentheses in their title. All other pages are skipped.
(note: without ending colon)
-skipfile: used as -skipfile:filename, skip all links mentioned in
the given file. This does not work with -number!
-skipauto use to skip all pages that can be translated automatically,
like dates, centuries, months, etc.
(note: without ending colon)
-lack: used as -lack:xx with xx a language code: only work on pages
without links to language xx. You can also add a number nn
like -lack:xx:nn, so that the bot only works on pages with
at least nn interwiki links (the default value for nn is 1).
These arguments control miscellaneous bot behaviour:
-quiet Use this option to get less output
(note: without ending colon)
-async Put page on queue to be saved to wiki asynchronously. This
enables loading pages while saving is throttled and gives
better performance.
NOTE: For post-processing it always assumes that saving
the pages was successful.
(note: without ending colon)
-summary: Set an additional action summary message for the edit. This
could be used to further explain the bot action.
This will only be used in non-autonomous mode.
-hintsonly The bot does not ask for a page to work on, even if none of
the above page sources was specified. This will make the
first existing page of -hint or -hintfile slip in as the start
page, determining properties like namespace, disambiguation
state, and so on. When no existing page is found in the
hints, the bot does nothing.
Hitting return without input on the "Which page to check:"
prompt has the same effect as using -hintsonly.
Options like -back, -same or -wiktionary are in effect only
after a page has been found to work on.
(note: without ending colon)
These arguments are useful to provide hints to the bot:
-hint: used as -hint:de:Anweisung to give the robot a hint
where to start looking for translations. If no text
is given after the second ':', the name of the page
itself is used as the title for the hint, unless the
-hintnobracket command line option (see there) is also
selected.
There are some special hints, trying a number of languages
at once:
* all: All languages with at least ca. 100 articles.
* 10: The 10 largest languages (sites with most
articles). Analogous for any other natural
number.
* arab: All languages using the Arabic alphabet.
* cyril: All languages that use the Cyrillic alphabet.
* chinese: All Chinese dialects.
* latin: All languages using the Latin script.
* scand: All Scandinavian languages.
Names of families that forward their interlanguage links
to the wiki family being worked upon can be used (with
-family=wikipedia only), they are:
* commons: Interlanguage links of Wikimedia Commons.
* incubator: Links in pages on the Wikimedia Incubator.
* meta: Interlanguage links of named pages on Meta.
* species: Interlanguage links of the Wikispecies wiki.
* strategy: Links in pages on Wikimedia's strategy wiki.
* test: Take interwiki links from the Test Wikipedia.
Languages, groups and families having the same page title
can be combined, as -hint:5,scand,sr,pt,commons:New_York
-hintfile: similar to -hint, except that hints are taken from the given
file, enclosed in [[]] each, instead of the command line.
-askhints: for each page one or more hints are asked. See hint: above
for the format, one can for example give "en:something" or
"20:" as hint.
-repository Include data repository
-same looks over all 'serious' languages for the same title.
-same is equivalent to -hint:all:
(note: without ending colon)
-wiktionary: similar to -same, but will ONLY accept names that are
identical to the original. Also, if the title is not
capitalized, it will only go through other wikis without
automatic capitalization.
-untranslated: works normally on pages with at least one interlanguage
link; asks for hints for pages that have none.
-untranslatedonly: same as -untranslated, but pages which already have a
translation are skipped. Hint: do NOT use this in
combination with -start without a -number limit, because
you will go through the whole alphabet before any queries
are performed!
-showpage when asking for hints, always show the first bit of the text
of the page, rather than doing so only when being
asked for (by typing '?'). Only useful in combination
with a hint-asking option like -untranslated, -askhints
or -untranslatedonly.
(note: without ending colon)
-noauto Do not use the automatic translation feature for years and
dates, only use found links and hints.
(note: without ending colon)
-hintnobracket used to make the robot strip everything in brackets,
and surrounding spaces from the page name, before it is
used in a -hint:xy: where the page name has been left out,
or -hint:all:, -hint:10:, etc. without a name, or
an -askhint reply, where only a language is given.
These arguments define how much user confirmation is required:
-autonomous run automatically, do not ask any questions. If a question
-auto to an operator is needed, write the name of the page
to autonomous_problems.dat and continue on the next page.
(note: without ending colon)
-confirm ask for confirmation before any page is changed on the
live wiki. Without this argument, additions and
unambiguous modifications are made without confirmation.
(note: without ending colon)
-force do not ask permission to make "controversial" changes,
like removing a language because none of the found
alternatives actually exists.
(note: without ending colon)
-cleanup like -force but only removes interwiki links to non-existent
or empty pages.
-select ask for each link whether it should be included before
changing any page. This is useful if you want to remove
invalid interwiki links and if you do multiple hints of
which some might be correct and others incorrect. Combining
-select and -confirm is possible, but seems like overkill.
(note: without ending colon)
These arguments specify in which way the bot should follow interwiki links:
-noredirect do not follow redirects nor category redirects.
(note: without ending colon)
-initialredirect work on its target if a redirect or category redirect is
entered on the command line or by a generator (note: without
ending colon). It is recommended to use this option with the
-movelog pagegenerator.
-neverlink: used as -neverlink:xx where xx is a language code:
Disregard any links found to language xx. You can also
specify a list of languages to disregard, separated by
commas.
-ignore: used as -ignore:xx:aaa where xx is a language code, and
aaa is a page title to be ignored.
-ignorefile: similar to -ignore, except that the pages are taken from
the given file instead of the command line.
-localright do not follow interwiki links from pages other than the
starting page. (Warning! Should be used very sparingly,
only when you are sure you have first gotten the interwiki
links on the starting page exactly right).
(note: without ending colon)
-hintsareright do not follow interwiki links to sites for which hints
on existing pages are given. Note that hints given
interactively via the -askhint command line option
are only effective once they have been entered; thus
interwiki links on the starting page are followed
regardless of hints given when prompted.
(Warning! Should be used with caution!)
(note: without ending colon)
-back only work on pages that have no backlink from any other
language; if a backlink is found, all work on the page
will be halted. (note: without ending colon)
The following arguments are only important for users who have accounts for
multiple languages, and specify on which sites the bot should modify pages:
-localonly only work on the local wiki, not on other wikis in the
family I have a login at. (note: without ending colon)
-limittwo only update two pages - one in the local wiki (if logged-in)
and one in the top available one.
For example, if the local page has links to de and fr,
this option will make sure that only the local site and
the de: (larger) sites are updated. This option is useful
to quickly set two way links without updating all of the
wiki family's sites.
(note: without ending colon)
-whenneeded works like limittwo, but other languages are changed in the
following cases:
* If there are no interwiki links at all on the page
* If an interwiki link must be removed
* If an interwiki link must be changed and there has been
a conflict for this page
Optionally, -whenneeded can be given an additional number
(for example -whenneeded:3), in which case other languages
will be changed if there are that number or more links to
change or add. (note: without ending colon)
The following arguments influence how many pages the bot works on at once:
-array: The number of pages the bot tries to work on at once.
If the number of pages loaded is lower than this number,
a new set of pages is loaded from the starting wiki. The
default is 100, but can be changed in the config variable
interwiki_min_subjects
-query: The maximum number of pages that the bot will load at once.
Default value is 60.
Some configuration options can be used to change how this robot works:
interwiki_min_subjects: the minimum number of subjects that should be processed
at the same time.
interwiki_backlink: if set to True, all problems in foreign wikis will
be reported
interwiki_shownew: should interwiki.py display every new link it discovers?
interwiki_graph: output a graph PNG file on conflicts? You need pydot for
this: http://dkbza.org/pydot.html
interwiki_graph_format: the file format for interwiki graphs
without_interwiki: save file with local articles without interwikis
All these options can be changed through the user-config.py configuration file.
If interwiki.py is terminated before it is finished, it will write a dump file
to the interwiki-dumps subdirectory. The program will read it if invoked with
the "-restore" or "-continue" option, and finish all the subjects in that list.
After finishing, the dump file will be deleted. To run the interwiki bot on all
pages on a language, run it with option "-start:!", and if it takes so long
that you have to break it off, use "-continue" next time.
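
Typical invocations (illustrative only; they simply combine options that are
documented above):

    python interwiki.py -start:! -autonomous
    python interwiki.py -restore -async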
"""
#
# (C) Rob W.W. Hooft, 2003
# (C) Daniel Herding, 2004
# (C) Yuri Astrakhan, 2005-2006
# (C) xqt, 2009-2013
# (C) Pywikipedia bot team, 2007-2013
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id$'
#
import sys
import copy
import re
import os
import time
import codecs
import socket
import webbrowser
import wikipedia as pywikibot
import config
import catlib
import pagegenerators
from pywikibot import i18n
import interwiki_graph
import titletranslate
docuReplacements = {
'&pagegenerators_help;': pagegenerators.parameterHelp
}
class SaveError(pywikibot.Error):
"""
An attempt to save a page with changed interwiki has failed.
"""
class LinkMustBeRemoved(SaveError):
"""
An interwiki link has to be removed, but this can't be done because of user
preferences or because the user chose not to change the page.
"""
class GiveUpOnPage(pywikibot.Error):
"""
The user chose not to work on this page and its linked pages any more.
"""
# Subpage templates. Must be in lower case,
# whereas subpage itself must be case sensitive
moved_links = {
'ar': ([u'documentation', u'template documentation', u'شرح', u'توثيق'],
u'/doc'),
'bn': (u'documentation', u'/doc'),
'ca': (u'ús de la plantilla', u'/ús'),
'cs': (u'dokumentace', u'/doc'),
'da': (u'dokumentation', u'/doc'),
'de': (u'dokumentation', u'/Meta'),
'dsb': ([u'dokumentacija', u'doc'], u'/Dokumentacija'),
'en': ([u'documentation', u'template documentation', u'template doc',
u'doc', u'documentation, template'], u'/doc'),
'es': ([u'documentación', u'documentación de plantilla'], u'/doc'),
'eu': (u'txantiloi dokumentazioa', u'/dok'),
'fa': ([u'documentation', u'template documentation', u'template doc',
u'doc', u'توضیحات', u'زیرصفحه توضیحات'], u'/doc'),
# fi: no idea how to handle this type of subpage at :Metasivu:
'fi': (u'mallineohje', None),
'fr': ([u'/documentation', u'documentation', u'doc_modèle',
u'documentation modèle', u'documentation modèle compliqué',
u'documentation modèle en sous-page',
u'documentation modèle compliqué en sous-page',
u'documentation modèle utilisant les parserfunctions en sous-page',
],
u'/Documentation'),
'hsb': ([u'dokumentacija', u'doc'], u'/Dokumentacija'),
'hu': (u'sablondokumentáció', u'/doc'),
'id': (u'template doc', u'/doc'),
'ilo': (u'documentation', u'/doc'),
'ja': (u'documentation', u'/doc'),
'ka': (u'თარგის ინფო', u'/ინფო'),
'ko': (u'documentation', u'/설명문서'),
'ms': (u'documentation', u'/doc'),
'no': (u'dokumentasjon', u'/dok'),
'nn': (u'dokumentasjon', u'/dok'),
'pl': (u'dokumentacja', u'/opis'),
'pt': ([u'documentação', u'/doc'], u'/doc'),
'ro': (u'documentaţie', u'/doc'),
'ru': (u'doc', u'/doc'),
'simple': ([u'documentation',
u'template documentation',
u'template doc',
u'doc',
u'documentation, template'], u'/doc'),
'sk': (u'dokumentácia', u'/Dokumentácia'),
'sv': (u'dokumentation', u'/dok'),
'uk': ([u'документація', u'doc', u'documentation'], u'/Документація'),
'vi': (u'documentation', u'/doc'),
'zh': ([u'documentation', u'doc'], u'/doc'),
}
# A list of template names in different languages.
# Pages which contain these shouldn't be changed.
ignoreTemplates = {
'_default': [u'delete'],
'ar': [u'قيد الاستخدام'],
'cs': [u'Pracuje_se'],
'de': [u'inuse', 'in use', u'in bearbeitung', u'inbearbeitung',
u'löschen', u'sla',
u'löschantrag', u'löschantragstext',
u'falschschreibung',
u'obsolete schreibung', 'veraltete schreibweise'],
'en': [u'inuse', u'softredirect'],
'fa': [u'در دست ویرایش ۲', u'حذف سریع'],
'pdc': [u'lösche'],
'zh': [u'inuse'],
}
class Global(object):
"""
Container class for global settings.
Use of globals outside of this is to be avoided.
"""
autonomous = False
confirm = False
always = False
select = False
followredirect = True
initialredirect = False
force = False
cleanup = False
remove = []
maxquerysize = 60
same = False
skip = set()
skipauto = False
untranslated = False
untranslatedonly = False
auto = True
neverlink = []
showtextlink = 0
showtextlinkadd = 300
localonly = False
limittwo = False
strictlimittwo = False
needlimit = 0
ignore = []
parenthesesonly = False
rememberno = False
followinterwiki = True
minsubjects = config.interwiki_min_subjects
nobackonly = False
askhints = False
hintnobracket = False
hints = []
hintsareright = False
contentsondisk = config.interwiki_contents_on_disk
lacklanguage = None
minlinks = 0
quiet = False
restoreAll = False
async = False
summary = u''
repository = False
def readOptions(self, arg):
""" Read all commandline parameters for the global container """
if arg == '-noauto':
self.auto = False
elif arg.startswith('-hint:'):
self.hints.append(arg[6:])
elif arg.startswith('-hintfile'):
hintfilename = arg[10:]
if (hintfilename is None) or (hintfilename == ''):
hintfilename = pywikibot.input(
u'Please enter the hint filename:')
f = codecs.open(hintfilename, 'r', config.textfile_encoding)
# hint or title ends either before | or before ]]
R = re.compile(ur'\[\[(.+?)(?:\]\]|\|)')
for pageTitle in R.findall(f.read()):
self.hints.append(pageTitle)
f.close()
elif arg == '-force':
self.force = True
elif arg == '-cleanup':
self.cleanup = True
elif arg == '-same':
self.same = True
elif arg == '-wiktionary':
self.same = 'wiktionary'
elif arg == '-repository':
self.repository = True
elif arg == '-untranslated':
self.untranslated = True
elif arg == '-untranslatedonly':
self.untranslated = True
self.untranslatedonly = True
elif arg == '-askhints':
self.untranslated = True
self.untranslatedonly = False
self.askhints = True
elif arg == '-hintnobracket':
self.hintnobracket = True
elif arg == '-confirm':
self.confirm = True
elif arg == '-select':
self.select = True
elif arg == '-autonomous' or arg == '-auto':
self.autonomous = True
elif arg == '-noredirect':
self.followredirect = False
elif arg == '-initialredirect':
self.initialredirect = True
elif arg == '-localonly':
self.localonly = True
elif arg == '-limittwo':
self.limittwo = True
self.strictlimittwo = True
elif arg.startswith('-whenneeded'):
self.limittwo = True
self.strictlimittwo = False
try:
self.needlimit = int(arg[12:])
except KeyError:
pass
except ValueError:
pass
elif arg.startswith('-skipfile:'):
skipfile = arg[10:]
skipPageGen = pagegenerators.TextfilePageGenerator(skipfile)
for page in skipPageGen:
self.skip.add(page)
del skipPageGen
elif arg == '-skipauto':
self.skipauto = True
elif arg.startswith('-neverlink:'):
self.neverlink += arg[11:].split(",")
elif arg.startswith('-ignore:'):
self.ignore += [pywikibot.Page(None, p) for p in arg[8:].split(",")]
elif arg.startswith('-ignorefile:'):
ignorefile = arg[12:]
ignorePageGen = pagegenerators.TextfilePageGenerator(ignorefile)
for page in ignorePageGen:
self.ignore.append(page)
del ignorePageGen
elif arg == '-showpage':
self.showtextlink += self.showtextlinkadd
elif arg == '-graph':
# override configuration
config.interwiki_graph = True
elif arg == '-bracket':
self.parenthesesonly = True
elif arg == '-localright':
self.followinterwiki = False
elif arg == '-hintsareright':
self.hintsareright = True
elif arg.startswith('-array:'):
self.minsubjects = int(arg[7:])
elif arg.startswith('-query:'):
self.maxquerysize = int(arg[7:])
elif arg == '-back':
self.nobackonly = True
elif arg == '-quiet':
self.quiet = True
elif arg == '-async':
self.async = True
elif arg.startswith('-summary'):
if len(arg) == 8:
self.summary = pywikibot.input(
u'What summary do you want to use?')
else:
self.summary = arg[9:]
elif arg.startswith('-lack:'):
remainder = arg[6:].split(':')
self.lacklanguage = remainder[0]
if len(remainder) > 1:
self.minlinks = int(remainder[1])
else:
self.minlinks = 1
else:
return False
return True
class StoredPage(pywikibot.Page):
"""
Store the Page contents on disk to avoid using too much
memory when a large number of Page objects are loaded
at the same time.
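
Illustrative usage (a sketch, not taken from this script's own flow;
somePage stands for an existing pywikibot.Page object):

    wrapped = StoredPage(somePage)
    # Any text assigned to wrapped._contents (e.g. cached by Page.get())
    # is written to the shared shelve file instead of being kept in
    # memory; SPdelContents() drops it from that store again.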
"""
# Please prefix the class member names with SP
# to avoid possible name clashes with pywikibot.Page
# path to the shelve
SPpath = None
# shelve
SPstore = None
# attributes created by pywikibot.Page.__init__
SPcopy = ['_editrestriction',
'_site',
'_namespace',
'_section',
'_title',
'editRestriction',
'moveRestriction',
'_permalink',
'_userName',
'_ipedit',
'_editTime',
'_startTime',
'_revisionId',
'_deletedRevs']
def SPdeleteStore():
if StoredPage.SPpath:
del StoredPage.SPstore
os.unlink(StoredPage.SPpath)
SPdeleteStore = staticmethod(SPdeleteStore)
def __init__(self, page):
for attr in StoredPage.SPcopy:
setattr(self, attr, getattr(page, attr))
if not StoredPage.SPpath:
import shelve
index = 1
while True:
path = config.datafilepath('cache', 'pagestore' + str(index))
if not os.path.exists(path): break
index += 1
StoredPage.SPpath = path
StoredPage.SPstore = shelve.open(path)
self.SPkey = str(self)
self.SPcontentSet = False
def SPgetContents(self):
return StoredPage.SPstore[self.SPkey]
def SPsetContents(self, contents):
self.SPcontentSet = True
StoredPage.SPstore[self.SPkey] = contents
def SPdelContents(self):
if self.SPcontentSet:
del StoredPage.SPstore[self.SPkey]
_contents = property(SPgetContents, SPsetContents, SPdelContents)
class PageTree(object):
"""
Structure to manipulate a set of pages.
Allows filtering efficiently by Site.
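
Illustrative usage (a sketch; pageA and pageB stand for arbitrary
pywikibot.Page objects):

    tree = PageTree()
    tree.add(pageA)
    tree.add(pageB)
    for page in tree.filter(pageA.site):
        # yields that site's pages in the order they were added
        print page
    tree.remove(pageB)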
"""
def __init__(self):
# self.tree :
# Dictionary:
# keys: Site
# values: list of pages
# All pages found within Site are kept in
# self.tree[site]
# While using dict values would be faster for
# the remove() operation,
# keeping list values is important, because
# the order in which the pages were found matters:
# the earlier a page is found, the closer it is to the
# Subject.originPage. Chances are that pages found within
# 2 interwiki distance from the originPage are more related
# to the original topic than pages found later on, after
# 3, 4, 5 or more interwiki hops.
# Keeping this order is hence important for displaying an ordered
# list of pages to the user when they are asked to resolve
# conflicts.
self.tree = {}
self.size = 0
def filter(self, site):
"""
Iterates over pages that are in Site site
"""
try:
for page in self.tree[site]:
yield page
except KeyError:
pass
def __len__(self):
return self.size
def add(self, page):
site = page.site
if site not in self.tree:
self.tree[site] = []
self.tree[site].append(page)
self.size += 1
def remove(self, page):
try:
self.tree[page.site].remove(page)
self.size -= 1
except ValueError:
pass
def removeSite(self, site):
"""
Removes all pages from Site site
"""
try:
self.size -= len(self.tree[site])
del self.tree[site]
except KeyError:
pass
def siteCounts(self):
"""
Yields (Site, number of pages in site) pairs
"""
for site, d in self.tree.iteritems():
yield site, len(d)
def __iter__(self):
for site, plist in self.tree.iteritems():
for page in plist:
yield page
class Subject(object):
"""
Class to follow the progress of a single 'subject' (i.e. a page with
all its translations)
Subject is a transitive closure of the binary relation on Page:
"has_a_langlink_pointing_to".
A formal way to compute that closure would be:
With P a set of pages, NL ('NextLevel') a function on sets defined as:
NL(P) = { target | ∃ source ∈ P, target ∈ source.langlinks() }
pseudocode:
todo <- [originPage]
done <- []
while todo != []:
pending <- todo
todo <-NL(pending) / done
done <- NL(pending) U done
return done
There is, however, one limitation induced by the implementation:
to compute NL(P) efficiently, one has to load the page contents of
pages in P.
(Not only the langlinks have to be parsed from each Page, but we also want
to know if the Page is a redirect, a disambiguation, etc...)
Because of this, the pages in pending have to be preloaded.
However, because the pages in pending are likely to be in several sites
we cannot "just" preload them as a batch.
Instead of doing "pending <- todo" at each iteration, we have to elect a
Site, and we put in pending all the pages from todo that belong to that
Site:
Code becomes:
todo <- {originPage.site:[originPage]}
done <- []
while todo != {}:
site <- electSite()
pending <- todo[site]
preloadpages(site, pending)
todo[site] <- NL(pending) / done
done <- NL(pending) U done
return done
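
An illustrative Python rendering of this per-site loop (a sketch only;
electSite and preloadpages are hypothetical helpers standing in for the
scheduling and batch preloading the bot actually performs):

    def NL(pages):
        # every page reachable through one language link
        return set(target for source in pages
                   for target in source.langlinks())

    def closure(originPage):
        todo = {originPage.site: set([originPage])}
        done = set()
        while todo:
            site = electSite(todo)         # pick one site to work on
            pending = todo.pop(site)
            preloadpages(site, pending)    # batch-load page contents
            done |= pending                # these pages are now treated
            for target in NL(pending) - done:
                todo.setdefault(target.site, set()).add(target)
        return done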
Subject objects only operate on pages that should have been preloaded
before. In fact, at any time:
* todo contains new Pages that have not been loaded yet
* done contains Pages that have been loaded, and that have been treated.
* If batch preloadings are successful, Page._get() is never called from
this Object.
"""
def __init__(self, originPage=None, hints=None):
"""Constructor. Takes as arguments the Page on the home wiki
plus optionally a list of hints for translation"""
if globalvar.contentsondisk:
if originPage:
originPage = StoredPage(originPage)
# Remember the "origin page"
self.originPage = originPage
self.repoPage = None
# todo is a list of all pages that still need to be analyzed.
# Mark the origin page as todo.
self.todo = PageTree()
if originPage:
self.todo.add(originPage)
if globalvar.repository:
self.repoPage = pywikibot.DataPage(originPage)
self.todo.add(self.repoPage)
# done is a list of all pages that have been analyzed and that
# are known to belong to this subject.
self.done = PageTree()
# foundIn is a dictionary where pages are keys and lists of
# pages are values. It stores where we found each page.
# As we haven't yet found a page that links to the origin page, we
# start with an empty list for it.
if originPage:
self.foundIn = {self.originPage: []}
else:
self.foundIn = {}
# This is a list of all pages that are currently scheduled for
# download.
self.pending = PageTree()
if globalvar.hintsareright:
# This is a set of sites that we got hints to
self.hintedsites = set()
self.translate(hints, globalvar.hintsareright)
self.confirm = globalvar.confirm
self.problemfound = False
self.untranslated = None
self.hintsAsked = False
self.forcedStop = False
self.workonme = True
def getFoundDisambig(self, site):
"""
If we found a disambiguation on the given site while working on the
subject, this method returns it. If several have been found, the
first one will be returned.
Otherwise, None will be returned.
"""
for tree in [self.done, self.pending]:
for page in tree.filter(site):
if page.exists() and page.isDisambig():
return page
def getFoundNonDisambig(self, site):
"""
If we found a non-disambiguation on the given site while working on the
subject, this method returns it. If several have been found, the
first one will be returned.
Otherwise, None will be returned.
"""
for tree in [self.done, self.pending]:
for page in tree.filter(site):
if page.exists() and not page.isDisambig() and \
not page.isRedirectPage() and not page.isCategoryRedirect():
return page
def getFoundInCorrectNamespace(self, site):
"""
If we found a page that has the expected namespace on the given site
while working on the subject, this method returns it. If several
have been found, the first one will be returned.
Otherwise, None will be returned.
"""
for tree in [self.done, self.pending, self.todo]:
for page in tree.filter(site):
# -hintsonly: before we have an origin page, any namespace will
# do.
if self.originPage and \
page.namespace() == self.originPage.namespace():
if page.exists() and not \
page.isRedirectPage() and not page.isCategoryRedirect():
return page
def translate(self, hints=None, keephintedsites=False):
"""Add the given translation hints to the todo list"""
if globalvar.same and self.originPage:
if hints:
pages = titletranslate.translate(
self.originPage,
hints=hints + ['all:'],
auto=globalvar.auto,
removebrackets=globalvar.hintnobracket)
else:
pages = titletranslate.translate(
self.originPage,
hints=['all:'],
auto=globalvar.auto,
removebrackets=globalvar.hintnobracket)
else:
pages = titletranslate.translate(
self.originPage,
hints=hints,
auto=globalvar.auto,
removebrackets=globalvar.hintnobracket,
site=pywikibot.getSite())
for page in pages:
if globalvar.contentsondisk:
page = StoredPage(page)
self.todo.add(page)
self.foundIn[page] = [None]
if keephintedsites:
self.hintedsites.add(page.site)
def openSites(self):
"""
Iterator. Yields (site, count) pairs:
* site is a site where we still have work to do
* count is the number of items on that site that still need work
"""
return self.todo.siteCounts()
def whatsNextPageBatch(self, site):
"""
By calling this method, you 'promise' this instance that you will
preload all the 'site' Pages that are in the todo list.
This routine will return a list of pages that can be treated.
"""
# Bug check: there must be no work still in progress; we can't work on
# two different sites at the same time!
if len(self.pending) > 0:
raise "BUG: Can't start to work on %s; still working on %s" \
% (site, self.pending)
# Prepare a list of suitable pages
result = []
for page in self.todo.filter(site):
self.pending.add(page)
result.append(page)
self.todo.removeSite(site)
# If there are any, return them. Otherwise, nothing is in progress.
return result
def makeForcedStop(self, counter):
"""
Ends work on the page before the normal end.
"""
for site, count in self.todo.siteCounts():
counter.minus(site, count)
self.todo = PageTree()
self.forcedStop = True
def addIfNew(self, page, counter, linkingPage):
"""
Adds the pagelink given to the todo list, but only if we didn't know
it before. If it is added, update the counter accordingly.
Also remembers where we found the page, regardless of whether it had
already been found before or not.
Returns True if the page is new.
"""
if self.forcedStop:
return False
# cannot check backlink before we have an origin page
if globalvar.nobackonly and self.originPage:
if page == self.originPage:
try: