Skip to content

Commit cd8fec2

Browse files
committed
Create script to generate minimal yet sufficient hierarchy and alternative names
1 parent 071f4ce commit cd8fec2

File tree

4 files changed

+13011
-20
lines changed

4 files changed

+13011
-20
lines changed

build_test_data.py

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import mmap
2+
import os
3+
import re
4+
import sys
5+
6+
from tqdm import tqdm
7+
8+
from cities.conf import files
9+
10+
11+
def get_line_number(file_path):
    """Return the number of lines in the file at ``file_path``.

    The file is memory-mapped read-only and scanned with ``readline()``, so a
    final line without a trailing newline still counts as a line.

    :param file_path: path of the file whose lines are counted.
    :returns: the line count as an int; ``0`` for an empty file.
    """
    # Binary, read-only: counting lines must not require write permission
    with open(file_path, 'rb') as fp:
        # mmap cannot map a zero-length file, so handle that case up front
        if os.fstat(fp.fileno()).st_size == 0:
            return 0

        buf = mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            lines = 0
            while buf.readline():
                lines += 1
            return lines
        finally:
            # Release the mapping explicitly instead of waiting for GC
            buf.close()
18+
19+
20+
# The file to filter is taken from the first command-line argument
try:
    original_file = sys.argv[1]
except IndexError:
    original_file = None

# Bail early if we haven't been given a file to read. This check has to come
# before any path is derived from original_file, because os.path.basename()
# does not accept None and would crash before the message is printed
if original_file is None:
    print("You must specify the full original file (usually hierarchy.txt or "
          "alternativeNames.txt) as the first argument.\n\nExiting.")

    sys.exit(-1)

test_data_dir = os.path.join('test_project', 'data')

# The filtered copy is written next to the test data as "<basename>.new"
new_filename = '{}.new'.format(os.path.basename(original_file))
new_filepath = os.path.join(test_data_dir, new_filename)

# Luckily this regex applies to both hierarchy.txt and alternativeNames.txt
file_rgx = re.compile(r'^(?:[^\t]+\t){1}([^\t]+)\t(?:en|und|ru|adm|\t)',
                      re.IGNORECASE | re.UNICODE)

# Bail early if the file exists
if os.path.exists(new_filepath):
    print("This script writes {}, but that file already exists. Please move "
          "(or remove) that file and rerun this script.\n\nExiting.".format(
              new_filepath))

    sys.exit(-1)
48+
49+
# Read all of the affected geonameids. They are kept in a set because they are
# only used for membership tests against every line of the original file, and
# duplicate ids carry no extra information
geonameids = set()
for _type in ('region', 'subregion', 'city'):
    filename = files[_type]['filename']
    filepath = os.path.join(test_data_dir, filename)

    # Index of the geonameid field within this file type's field list
    column = files[_type]['fields'].index('geonameid')

    # Skip `column` tab-terminated fields, then capture the following field
    rgx = re.compile(r'^(?:[^\t]+\t){{{}}}([^\t\n]+)(?:[\t\n])'.format(column))

    # Only used to give the progress bar a total
    num_lines = get_line_number(filepath)

    with open(filepath, 'r') as f:
        # Not using .read() here causes f to be read as an iterable, which is
        # exactly what we want because the file may be large
        for line in tqdm(f, total=num_lines,
                         desc="Collecting geonameids from {}".format(filename)):
            m = rgx.match(line)

            if m:
                geonameids.add(m.group(1))
70+
71+
# For all of the collected geonameids, write out matching lines from the
# original file
with open(original_file, 'r') as rf:
    # Check for file existence again, immediately before we write to it
    if os.path.exists(new_filepath):
        print("This script writes {}, but that file already exists. Please "
              "move (or remove) that file and rerun this script.".format(
                  new_filepath))

        sys.exit(-1)

    # Only used to give the progress bar a total
    num_lines = get_line_number(original_file)

    # Build a hashed lookup once so the per-line membership test below does
    # not scan the whole id collection for every line of the original file
    wanted_ids = frozenset(geonameids)

    # Write out matching lines to the new file
    with open(new_filepath, 'a+') as wf:
        for line in tqdm(rf, total=num_lines,
                         desc="Writing geonameids"):
            m = file_rgx.match(line)

            if m and m.group(1) in wanted_ids:
                wf.write(line)

build_test_data.sh

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash

# Run build_test_data.py with the test project's Django settings.
# "$@" forwards the arguments exactly as given: paths containing spaces or
# glob characters stay intact, and when no argument is supplied nothing is
# passed, so the Python script can still print its own usage message.
DJANGO_SETTINGS_MODULE=test_project.test_app.settings python ./build_test_data.py "$@"

0 commit comments

Comments
 (0)