-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunzip_and_extract_based_on_IDs.py
149 lines (119 loc) · 4.96 KB
/
unzip_and_extract_based_on_IDs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
Created by Dimitri Perrin, September 2022
"""
import ast
import datetime
import glob
import os
import subprocess
# Where the downloaded .7z archives live, and where extracted data is written.
# Both paths are concatenated directly with file names, so keep the trailing
# slash. Double-check that they point to the right places before running.
folder_in = "/work/wikidata/"
folder_out = "/work/wikidata/extracted_files/"

# IDs of the pages whose content we want to pull out of the archives
pages = [
    264,
    198332,
    35458904,
]
def progress_message(txt):
    """
    Print a timestamped message.

    Parameters:
        txt (string): message to display

    Returns:
        None
    """
    # current local date and time, formatted as dd/mm/YYYY H:M:S
    timestamp = datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    # we print the time and the message, separated by a single space
    print(timestamp, txt)
def find_file(page_ID, folder=None):
    """
    Find which archive a given page comes from.

    Archive names end with the range of page IDs they contain, e.g.
    enwiki-20220201-pages-meta-history13.xml-p9940247p10047273.7z
    holds pages 9940247 to 10047273 (both included).

    Parameters:
        page_ID (int): ID for the page we want to extract
        folder (string): folder containing the archives, including the
            trailing separator; defaults to the module-level folder_in

    Returns:
        file (string): full path to the archive containing that page,
            or None if no archive covers that ID
    """
    if folder is None:
        folder = folder_in
    # we list every archive of the dump present in the folder
    for archive in glob.glob(folder + "enwiki-20220201-pages-meta-history*.7z"):
        # the last "-"-separated chunk looks like "p9940247p10047273.7z":
        # we drop the leading "p" and the ".7z" extension to keep the range
        id_range = archive.rstrip().split("-")[-1][1:-3]
        try:
            # int() is the right tool for plain decimal text; it also accepts
            # zero-padded numbers, which ast.literal_eval rejects
            first_ID, last_ID = (int(bound) for bound in id_range.split("p"))
        except ValueError:
            # the name does not end with a pXpY range, so we cannot match it
            continue
        # if page_ID is between the two bounds, we have found the file
        if first_ID <= page_ID <= last_ID:
            return archive
    # no archive covers this ID
    return None
def extract_XML(page_ID, filename, folder, report=None):
    """
    Extract from an XML file just the content we need for a given page.

    The file is read line by line. A page is expected to start with a <page>
    line, followed by <title>, <ns> and <id>, each on its own line. When the
    <id> matches page_ID, everything from <page> to the closing </page> is
    written to <folder><page_ID>.xml. That output file is always created, and
    is left empty if the page is not found.

    Parameters:
        page_ID (int): ID for the page we want to extract
        filename (string): full path to the XML file
        folder (string): full path to the output folder, including the
            trailing separator
        report (callable): function called with a progress message when the
            page is found; defaults to progress_message

    Returns:
        None
    """
    if report is None:
        report = progress_message
    out_path = f"{folder}{page_ID}.xml"
    # the encoding is explicit so that the result does not depend on the
    # locale of the machine (the dumps are expected to be UTF-8)
    with open(filename, "r", encoding="utf-8") as in_file, \
            open(out_path, "w", encoding="utf-8") as out_file:
        copying = False
        header_lines = 0
        title = ns = ""
        for line in in_file:
            tag = line.lstrip()
            if copying:
                # this check comes first, so that the <id> lines of revisions
                # and contributors inside our page are copied like any other
                out_file.write(line)
                # we are extracting only one page, so we can stop at its end
                if tag.startswith("</page>"):
                    break
            # we are reaching a new page, so we start a counter
            elif tag.startswith("<page>"):
                header_lines = 1
            # we save the title line in case we need it
            elif tag.startswith("<title>"):
                title = line
                header_lines += 1
            # we save the ns line in case we need it
            elif tag.startswith("<ns>"):
                ns = line
                header_lines += 1
            elif tag.startswith("<id>"):
                # revisions also have IDs: only the <id> coming right after
                # <page>, <title> and <ns> is the ID of the page itself
                if header_lines == 3:
                    # strip (not lstrip) to also drop the end-of-line
                    # character before slicing off <id> and </id>
                    current_ID = int(line.strip()[4:-5])
                    if current_ID == page_ID:
                        page_name = title.strip()[7:-8]
                        report(f"Saving page {page_name}")
                        # we write the header lines we have already consumed
                        out_file.write("<page>\n")
                        out_file.write(title)
                        out_file.write(ns)
                        out_file.write(line)
                        copying = True
                header_lines += 1
def main():
    """
    Main part of the program.

    Iterates over the list of pages. For each page, finds the correct
    archive, unzips it with 7za into folder_out, extracts the data for that
    page, and deletes the unzipped XML. A page is reported and skipped if no
    archive covers its ID or if 7za exits with a non-zero status.

    Returns:
        None
    """
    progress_message(f"Starting. Number of pages to process: {len(pages)}")
    for current_page in pages:
        progress_message(f"Processing page ID {current_page}")
        # first, we find which file contains the page we want
        current_file = find_file(current_page)
        if current_file is None:
            # without this check, None would reach subprocess.call below
            progress_message(f"No archive found for page ID {current_page}, skipping")
            continue
        progress_message(f"Extracting file {current_file}")
        status = subprocess.call(["7za", "-y", "e", current_file, "-o" + folder_out])
        if status != 0:
            progress_message(f"7za exited with status {status}, skipping page ID {current_page}")
            continue
        # the resulting file has the same name, without the .7z extension
        unzipped_file = folder_out + current_file[len(folder_in):-3]
        # next, we extract just the content for that page
        progress_message("Extracting XML content for the page")
        try:
            extract_XML(current_page, unzipped_file, folder_out)
        finally:
            # finally, we delete the unzipped XML as we are no longer using
            # it, even if the extraction failed (these files are large)
            progress_message(f"Deleting file {unzipped_file}")
            if os.path.exists(unzipped_file):
                os.remove(unzipped_file)
    progress_message("Done.")


# we only run the extraction when this file is executed as a script
if __name__ == "__main__":
    main()