extract_wikilinks.py
#!/usr/bin/python
"""
Script to extract all wiki page names a certain HTML file points to in
interwiki-link format
The output can be used as input to interwiki.py.
This script takes a single file name argument, the file should be a HTML file
as captured from one of the wikipedia servers.
Arguments:
-bare Extract as internal links: [[Title]] instead of [[Family:xx:Title]]
-sorted Print the pages sorted alphabetically (default: the order in which
they occur in the HTML file)
"""
#
# (C) Rob W.W. Hooft, Andre Engels, 2003-2005
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id$'
#
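# Example (hypothetical capture file name):
#
#   python extract_wikilinks.py -sorted Main_Page.html
#
# prints one [[...]] link per line for every /wiki/ href found in the file.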
import sys, re

import wikipedia as pywikibot

# This bot does not contact the wiki, so there is no need to get it on the
# list of running bots.
pywikibot.stopme()
R = re.compile('/wiki/(.*?)" *')
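# E.g. the fragment '<a href="/wiki/Main_Page" title="Main Page">' yields
# the captured group 'Main_Page'.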
fn = None
sort_output = False
links = []
complete = True

for arg in pywikibot.handleArgs():
    if arg.startswith("-sorted"):
        sort_output = True
    elif arg.startswith("-bare"):
        complete = False
    elif fn:
        print "Ignoring argument %s" % arg
    else:
        fn = arg

if not fn:
    print "No file specified to get the links from"
    sys.exit(1)

mysite = pywikibot.getSite()

f = open(fn, 'r')
text = f.read()
f.close()

# Collect every /wiki/ link target, either as a full interwiki link
# ([[Family:xx:Title]]) or, with -bare, as a bare internal link ([[Title]]).
for hit in R.findall(text):
    if complete:
        links.append(mysite.linkto(hit))
    else:
        links.append("[[%s]]" % hit)

if sort_output:
    links.sort()
for page in links:
    print page
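
# The printed links can be redirected to a file and fed back to interwiki.py;
# a sketch, assuming interwiki.py accepts the standard -file: argument for
# reading [[...]]-style links from a text file:
#
#   python extract_wikilinks.py saved_page.html > links.txt
#   python interwiki.py -file:links.txt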