|
| 1 | +import mmap |
| 2 | +import os |
| 3 | +import re |
| 4 | +import sys |
| 5 | + |
| 6 | +from tqdm import tqdm |
| 7 | + |
| 8 | +from cities.conf import files |
| 9 | + |
| 10 | + |
def get_line_number(file_path):
    """Return the number of lines in the file at ``file_path``.

    The file is memory-mapped read-only and scanned with ``readline()``, so
    a final line that lacks a trailing newline is still counted.

    An empty file yields 0; it is handled separately because ``mmap``
    refuses to map a zero-length file.
    """
    # Binary, read-only access is all that counting needs; no write
    # permission on the file is required
    with open(file_path, 'rb') as fp:
        if os.fstat(fp.fileno()).st_size == 0:
            return 0

        buf = mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            lines = 0
            # readline() returns an empty bytes object at end of map
            while buf.readline():
                lines += 1
            return lines
        finally:
            # Release the mapping even if reading fails part-way
            buf.close()
| 18 | + |
| 19 | + |
# The file to filter is taken from the first command-line argument
try:
    original_file = sys.argv[1]
except IndexError:
    original_file = None

# Bail early if we haven't been given a file to read. This must happen before
# the output path is derived below, because os.path.basename(None) raises
# instead of letting us print a helpful message.
if original_file is None:
    print("You must specify the full original file (usually hierarchy.txt or "
          "alternativeNames.txt) as the first argument.\n\nExiting.")

    sys.exit(-1)

test_data_dir = os.path.join('test_project', 'data')

# The filtered copy is written next to the test data as "<original name>.new"
new_filename = '{}.new'.format(os.path.basename(original_file))
new_filepath = os.path.join(test_data_dir, new_filename)

# Luckily this regex applies to both hierarchy.txt and alternativeNames.txt
# It skips the first tab-separated field, captures the second one, and only
# matches when the third field starts with en/und/ru/adm (case-insensitive)
# or is empty (i.e. another tab follows immediately).
file_rgx = re.compile(r'^(?:[^\t]+\t){1}([^\t]+)\t(?:en|und|ru|adm|\t)',
                      re.IGNORECASE | re.UNICODE)

# Bail early if the file exists
if os.path.exists(new_filepath):
    print("This script writes {}, but that file already exists. Please move "
          "(or remove) that file and rerun this script.\n\nExiting.".format(
              new_filepath))

    sys.exit(-1)
| 48 | + |
# Read all of the affected geonameids. They are kept in a set because every
# line of the original file is later tested for membership against this
# collection, and a list would make each of those tests a linear scan.
geonameids = set()
for _type in ('region', 'subregion', 'city'):
    filename = files[_type]['filename']
    filepath = os.path.join(test_data_dir, filename)

    # Position of the geonameid column within this file's field list
    column = files[_type]['fields'].index('geonameid')

    # Skip `column` non-empty tab-separated fields, then capture the next one
    rgx = re.compile(r'^(?:[^\t]+\t){{{}}}([^\t\n]+)(?:[\t\n])'.format(column))

    # Only needed so that the progress bar knows its total
    num_lines = get_line_number(filepath)

    with open(filepath, 'r') as f:
        # Not using .read() here causes f to be read as an iterable, which is
        # exactly what we want because the file may be large
        for line in tqdm(f, total=num_lines,
                         desc="Collecting geonameids from {}".format(filename)):
            m = rgx.match(line)

            if m:
                geonameids.add(m.group(1))
| 70 | + |
# Copy each line of the original file whose captured id was collected above
# into the new file
with open(original_file, 'r') as source:
    # The output file may have appeared since the script started, so look
    # for it once more right before writing
    if os.path.exists(new_filepath):
        already_exists_msg = (
            "This script writes {}, but that file already exists. "
            "Please move (or remove) that file and rerun this script.")
        print(already_exists_msg.format(new_filepath))

        exit(-1)

    # Total line count, used only to size the progress bar
    num_lines = get_line_number(original_file)

    with open(new_filepath, 'a+') as target:
        for row in tqdm(source, total=num_lines, desc="Writing geonameids"):
            found = file_rgx.match(row)

            # Lines that do not match the pattern are dropped
            if not found:
                continue

            # So are lines whose id was not collected
            if found.group(1) not in geonameids:
                continue

            target.write(row)
0 commit comments