-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxml_process_example.py
143 lines (101 loc) · 4.55 KB
/
xml_process_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
Created by Dimitri Perrin, September 2022
"""
import ast
import csv
import datetime
import glob
import os
import subprocess

from bs4 import BeautifulSoup as bs
# Input and output folders. Note: make sure you are using the correct folders.
# folder_in is prepended to each file name by plain string concatenation,
# so keep its trailing "/".
folder_in = "/work/wikidata/extracted_files/"
folder_out = "/work/wikidata/extracted_files/"

# List of files to process (names relative to folder_in).
# Hard-coded files
files = [
    "enwiki-20220201-pages-meta-history1.xml-p1p857.notext",
    "enwiki-20220201-pages-meta-history17.xml-p22110007p22238459.notext",
]
# Alternatively, `glob` can list every file matching a pattern. It returns
# paths that already start with folder_in, so that prefix is stripped here
# because the main loop prepends folder_in again:
# files = [path[len(folder_in):] for path in glob.glob(folder_in + "*.notext")]
def progress_message(txt):
    """
    Print a message prefixed with the current local date and time.

    Parameters:
    txt (string): message to display

    Returns:
    None
    """
    # the format spec is handed to strftime: dd/mm/YYYY H:M:S
    stamp = f"{datetime.datetime.now():%d/%m/%Y %H:%M:%S}"
    # timestamp and message, separated by a single space
    print(stamp, txt)
def _tag_text(tag):
    """Return the text content of a parsed tag, or "" when the tag is missing."""
    # get_text() gives the unescaped content ("&" rather than "&amp;"),
    # which slicing the serialised markup did not.
    return "" if tag is None else tag.get_text()


def _contributor_fields(revision):
    """
    Return (contributor value, contributor name) for one revision.

    A contributor with an <id> yields (user ID, username). Otherwise the
    <ip> text is used as the value and the name is "N/A". A revision with
    no <contributor> element, or one with neither <id> nor <ip>, yields
    ("", "N/A").
    """
    contributor = revision.find("contributor")
    if contributor is None:
        return "", "N/A"
    contributor_id = contributor.find("id")
    if contributor_id is not None:
        return _tag_text(contributor_id), _tag_text(contributor.find("username"))
    return _tag_text(contributor.find("ip")), "N/A"


def extract_data(filename, folder):
    """
    Function that reads from an XML file, extracts some data, and saves to CSV.
    For this example, we extract timestamp and contributor.

    One CSV row is written per revision, with the columns:
    Page ID, Page title, Revision ID, Timestamp, Contributor ID, Contributor name.
    The output file is named after the input file with ".csv" appended and
    is created inside `folder`.

    Parameters:
    filename (string): full path to the XML file
    folder (string): full path to the output folder

    Returns:
    None
    """
    # first, we read the whole file into a single string
    with open(filename, "r", encoding="utf-8") as in_file:
        content = in_file.read()
    bs_content = bs(content, "lxml")
    progress_message("Data in memory. Parsing.")

    # we extract the pages
    pages = bs_content.find_all("page")
    progress_message(f"{len(pages)} pages to process")

    # output file: <folder>/<input file name>.csv
    output_path = os.path.join(folder, os.path.basename(filename) + ".csv")

    # newline="" lets the csv writer control line endings itself
    with open(output_path, "w", encoding="utf-8", newline="") as out_file:
        # the csv module quotes fields containing commas, quotes or newlines
        # (page titles and usernames can contain them)
        writer = csv.writer(out_file, lineterminator="\n")
        # header line
        writer.writerow([
            "Page ID",
            "Page title",
            "Revision ID",
            "Timestamp",
            "Contributor ID",
            "Contributor name",
        ])

        for nb_page_processed, page in enumerate(pages, start=1):
            # find() returns the first match in document order, so this relies
            # on the page's own <id> appearing before its revisions
            title_value = _tag_text(page.find("title"))
            page_id_value = _tag_text(page.find("id"))

            for revision in page.find_all("revision"):
                # likewise, the first <id> inside a revision is taken as the
                # revision ID (it must precede the contributor's <id>)
                rev_id_value = _tag_text(revision.find("id"))
                rev_time_value = _tag_text(revision.find("timestamp"))
                contributor_value, contributor_name = _contributor_fields(revision)
                writer.writerow([
                    page_id_value,
                    title_value,
                    rev_id_value,
                    rev_time_value,
                    contributor_value,
                    contributor_name,
                ])

            # we show how much progress has been made
            if nb_page_processed % 100 == 0:
                progress_message(f"Processed {nb_page_processed} pages")
"""
Main part of the program.
Iterates over the list of files.
For each file, extracts data and saves to CSV.
"""
# `files` holds input file names, so the count reported here is a number of
# files (the per-file page count is reported by extract_data).
progress_message(f"Starting. Number of files to process: {len(files)}")
for current_file in files:
    progress_message(f"Processing file {current_file}")
    # the input path is built by concatenation: folder_in must end with "/"
    extract_data(folder_in + current_file, folder_out)
progress_message("Done.")