Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better parsing of planning applications #1354

Merged
merged 10 commits into from
Mar 26, 2024
4 changes: 3 additions & 1 deletion etl/planning_data/address_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ def generate_address(site_name, site_number, street_name, secondary_street_name)
street_name += " - with secondary road name: " + secondary_street_name

if site_number is not None and street_name is not None:
address = site_number + " " + street_name + " - " + site_name
address = site_number + " " + street_name
if site_name is not None:
address += " - " + site_name
# in some cases it results in duplication when site_name repeats
# address parts, but often it provides useful data

Expand Down
110 changes: 65 additions & 45 deletions etl/planning_data/obtain_livestream_data_and_load_into_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@ def main():
downloaded = 0
last_sort = None
search_after = []
unexpected_status_statistics = {}
while True:
data = query(search_after).json()
load_data_into_database(cursor, data)
unexpected_status_statistics = load_data_into_database(cursor, data, unexpected_status_statistics)
for entry in data["hits"]["hits"]:
downloaded += 1
last_sort = entry["sort"]
Expand All @@ -27,6 +28,14 @@ def main():
if search_after == last_sort:
break
search_after = last_sort
print("unexpected_status_statistics")
for code, occurences in unexpected_status_statistics.items():
print(code, "x" + str(occurences))
print()
print("popular values in unexpected_status_statistics")
for code, occurences in unexpected_status_statistics.items():
if occurences > 100:
print(code, "x" + str(occurences))
connection.commit()


Expand All @@ -50,7 +59,7 @@ def get_connection():
)


def load_data_into_database(cursor, data):
def load_data_into_database(cursor, data, unexpected_status_statistics):
if "timed_out" not in data:
print(json.dumps(data, indent=4))
print("timed_out field missing in provided data")
Expand All @@ -72,7 +81,8 @@ def load_data_into_database(cursor, data):
)
uprn = entry["_source"]["uprn"]
status_before_aliasing = entry["_source"]["status"]
status_info = process_status(status_before_aliasing, decision_date)
status_info = process_status(status_before_aliasing, decision_date, unexpected_status_statistics)
unexpected_status_statistics = status_info['unexpected_status_statistics']
status = status_info["status"]
status_explanation_note = status_info["status_explanation_note"]
planning_url = obtain_entry_link(
Expand Down Expand Up @@ -107,24 +117,7 @@ def load_data_into_database(cursor, data):
if len(entry["address"]) > maximum_address_length:
print("address is too long, shortening", entry["address"])
entry["address"] = entry["address"][0:maximum_address_length]
if date_in_future(entry["registered_with_local_authority_date"]):
print(
"registered_with_local_authority_date is treated as invalid:",
entry["registered_with_local_authority_date"],
)
# Brent-87_0946 has "valid_date": "23/04/9187"
entry["registered_with_local_authority_date"] = None

if date_in_future(entry["decision_date"]):
print("decision_date is treated as invalid:", entry["decision_date"])
entry["decision_date"] = None

if date_in_future(entry["last_synced_date"]):
print(
"last_synced_date is treated as invalid:", entry["last_synced_date"]
)
entry["last_synced_date"] = None

entry = throw_away_invalid_dates(entry)
if "Hackney" in application_id_with_borough_identifier:
if entry["application_url"] is not None:
if "https://" not in entry["application_url"]:
Expand All @@ -140,6 +133,25 @@ def load_data_into_database(cursor, data):
print()
show_dictionary(entry)
raise e
return unexpected_status_statistics


def throw_away_invalid_dates(entry):
for date_code in ["registered_with_local_authority_date", "decision_date", "last_synced_date"]:
if date_in_future(entry[date_code]):
print(
date_code + " is treated as invalid:",
entry[date_code],
)
# Brent-87_0946 has "valid_date": "23/04/9187"
entry[date_code] = None

if entry[date_code] is not None:
# not believable values
if entry[date_code] < datetime.datetime(1950, 1, 1):
print(date_code, "Unexpectedly early date, treating it as a missing date:", entry[date_code])
entry[date_code] = None
return entry


def date_in_future(date):
Expand Down Expand Up @@ -295,9 +307,6 @@ def parse_date_string_into_date_object(incoming):
date = datetime.datetime.strptime(
incoming, "%Y-%m-%dT%H:%M:%S.%fZ"
) # '2022-08-08T20:07:22.238Z'
if date < datetime.datetime(1950, 1, 1): # not believable values
print("Unexpectedly early date, treating it as a missing date:", date)
date = None
return date


Expand Down Expand Up @@ -355,14 +364,21 @@ def obtain_entry_link(provided_link, application_id):
# Richmond is simply broken


def process_status(status, decision_date):
def process_status(status, decision_date, unexpected_status_statistics):
status_length_limit = 50 # see migrations/034.planning_livestream_data.up.sql
if status is None or status.lower() in ["null", "not_mapped"]:
if status is None:
status = "Unknown"
canonical_status = status.lower().strip()
if canonical_status in ["null", "not_mapped", "", "not known"]:
status = "Unknown"
if status.lower() in ["application under consideration", "application received"]:
if canonical_status in ["application under consideration", "application received"]:
if decision_date is None:
status = "Submitted"
if status.lower() in [
else:
print(status, "but with", decision_date,
"marking as unknown status")
status = "Unknown"
if canonical_status in [
"refused",
"refusal",
"refusal (p)",
Expand All @@ -372,17 +388,18 @@ def process_status(status, decision_date):
"rejected",
]:
status = "Rejected"
if status.lower() in ["appeal received", "appeal in progress"]:
if canonical_status in ["appeal received", "appeal in progress", "refusal (appealed)", "refusal (p) (appealed)"]:
status = "Appeal In Progress"
if status.lower() in ["completed", "allowed", "approval", "approved"]:
if canonical_status in ["completed", "allowed", "allow", "approval", "approved"]:
status = "Approved"
if status.lower() in ["lapsed", "withdrawn"]:
if canonical_status in ["lapsed", "withdrawn"]:
status = "Withdrawn"
if len(status) > status_length_limit:
print("Status was too long and was skipped:", status)
return {
"status": "Processing failed",
"status_explanation_note": "status was unusually long and it was impossible to save it",
"unexpected_status_statistics": unexpected_status_statistics,
}
if status in [
"Submitted",
Expand All @@ -392,26 +409,29 @@ def process_status(status, decision_date):
"Withdrawn",
"Unknown",
]:
return {"status": status, "status_explanation_note": None}
if status in [
"No Objection to Proposal (OBS only)",
"Objection Raised to Proposal (OBS only)",
return {
"status": status,
"status_explanation_note": None,
"unexpected_status_statistics": unexpected_status_statistics,
}
if canonical_status in [
"no objection to proposal (obs only)",
"objection raised to proposal (obs only)",
]:
return {
"status": "Approved",
"status_explanation_note": "preapproved application, local authority is unable to reject it",
"unexpected_status_statistics": unexpected_status_statistics,
}
print("Unexpected status <" + status + ">")
if status not in [
"Not Required",
"SECS",
"Comment Issued",
"ALL DECISIONS ISSUED",
"Closed",
"Declined to Determine",
]:
print("New unexpected status " + status)
return {"status": status, "status_explanation_note": None}
if status not in unexpected_status_statistics:
unexpected_status_statistics[status] = 0
unexpected_status_statistics[status] += 1
return {
"status": status,
"status_explanation_note": None,
"unexpected_status_statistics": unexpected_status_statistics,
}


if __name__ == "__main__":
Expand Down
Loading