Skip to content

Commit

Permalink
Merge pull request #1354 from matkoniecz/fix/livestream
Browse files Browse the repository at this point in the history
Better parsing of planning applications
  • Loading branch information
matkoniecz authored Mar 26, 2024
2 parents 3d3200a + 60d9889 commit 7fcace4
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 46 deletions.
4 changes: 3 additions & 1 deletion etl/planning_data/address_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ def generate_address(site_name, site_number, street_name, secondary_street_name)
street_name += " - with secondary road name: " + secondary_street_name

if site_number is not None and street_name is not None:
address = site_number + " " + street_name + " - " + site_name
address = site_number + " " + street_name
if site_name is not None:
address += " - " + site_name
# in some cases it results in duplication when site_name repeats
# address parts, but often it provides useful data

Expand Down
110 changes: 65 additions & 45 deletions etl/planning_data/obtain_livestream_data_and_load_into_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@ def main():
downloaded = 0
last_sort = None
search_after = []
unexpected_status_statistics = {}
while True:
data = query(search_after).json()
load_data_into_database(cursor, data)
unexpected_status_statistics = load_data_into_database(cursor, data, unexpected_status_statistics)
for entry in data["hits"]["hits"]:
downloaded += 1
last_sort = entry["sort"]
Expand All @@ -27,6 +28,14 @@ def main():
if search_after == last_sort:
break
search_after = last_sort
print("unexpected_status_statistics")
for code, occurences in unexpected_status_statistics.items():
print(code, "x" + str(occurences))
print()
print("popular values in unexpected_status_statistics")
for code, occurences in unexpected_status_statistics.items():
if occurences > 100:
print(code, "x" + str(occurences))
connection.commit()


Expand All @@ -50,7 +59,7 @@ def get_connection():
)


def load_data_into_database(cursor, data):
def load_data_into_database(cursor, data, unexpected_status_statistics):
if "timed_out" not in data:
print(json.dumps(data, indent=4))
print("timed_out field missing in provided data")
Expand All @@ -72,7 +81,8 @@ def load_data_into_database(cursor, data):
)
uprn = entry["_source"]["uprn"]
status_before_aliasing = entry["_source"]["status"]
status_info = process_status(status_before_aliasing, decision_date)
status_info = process_status(status_before_aliasing, decision_date, unexpected_status_statistics)
unexpected_status_statistics = status_info['unexpected_status_statistics']
status = status_info["status"]
status_explanation_note = status_info["status_explanation_note"]
planning_url = obtain_entry_link(
Expand Down Expand Up @@ -107,24 +117,7 @@ def load_data_into_database(cursor, data):
if len(entry["address"]) > maximum_address_length:
print("address is too long, shortening", entry["address"])
entry["address"] = entry["address"][0:maximum_address_length]
if date_in_future(entry["registered_with_local_authority_date"]):
print(
"registered_with_local_authority_date is treated as invalid:",
entry["registered_with_local_authority_date"],
)
# Brent-87_0946 has "valid_date": "23/04/9187"
entry["registered_with_local_authority_date"] = None

if date_in_future(entry["decision_date"]):
print("decision_date is treated as invalid:", entry["decision_date"])
entry["decision_date"] = None

if date_in_future(entry["last_synced_date"]):
print(
"last_synced_date is treated as invalid:", entry["last_synced_date"]
)
entry["last_synced_date"] = None

entry = throw_away_invalid_dates(entry)
if "Hackney" in application_id_with_borough_identifier:
if entry["application_url"] is not None:
if "https://" not in entry["application_url"]:
Expand All @@ -140,6 +133,25 @@ def load_data_into_database(cursor, data):
print()
show_dictionary(entry)
raise e
return unexpected_status_statistics


def throw_away_invalid_dates(entry):
for date_code in ["registered_with_local_authority_date", "decision_date", "last_synced_date"]:
if date_in_future(entry[date_code]):
print(
date_code + " is treated as invalid:",
entry[date_code],
)
# Brent-87_0946 has "valid_date": "23/04/9187"
entry[date_code] = None

if entry[date_code] is not None:
# not believable values
if entry[date_code] < datetime.datetime(1950, 1, 1):
print(date_code, "Unexpectedly early date, treating it as a missing date:", entry[date_code])
entry[date_code] = None
return entry


def date_in_future(date):
Expand Down Expand Up @@ -295,9 +307,6 @@ def parse_date_string_into_date_object(incoming):
date = datetime.datetime.strptime(
incoming, "%Y-%m-%dT%H:%M:%S.%fZ"
) # '2022-08-08T20:07:22.238Z'
if date < datetime.datetime(1950, 1, 1): # not believable values
print("Unexpectedly early date, treating it as a missing date:", date)
date = None
return date


Expand Down Expand Up @@ -355,14 +364,21 @@ def obtain_entry_link(provided_link, application_id):
# Richmond is simply broken


def process_status(status, decision_date):
def process_status(status, decision_date, unexpected_status_statistics):
status_length_limit = 50 # see migrations/034.planning_livestream_data.up.sql
if status is None or status.lower() in ["null", "not_mapped"]:
if status is None:
status = "Unknown"
canonical_status = status.lower().strip()
if canonical_status in ["null", "not_mapped", "", "not known"]:
status = "Unknown"
if status.lower() in ["application under consideration", "application received"]:
if canonical_status in ["application under consideration", "application received"]:
if decision_date is None:
status = "Submitted"
if status.lower() in [
else:
print(status, "but with", decision_date,
"marking as unknown status")
status = "Unknown"
if canonical_status in [
"refused",
"refusal",
"refusal (p)",
Expand All @@ -372,17 +388,18 @@ def process_status(status, decision_date):
"rejected",
]:
status = "Rejected"
if status.lower() in ["appeal received", "appeal in progress"]:
if canonical_status in ["appeal received", "appeal in progress", "refusal (appealed)", "refusal (p) (appealed)"]:
status = "Appeal In Progress"
if status.lower() in ["completed", "allowed", "approval", "approved"]:
if canonical_status in ["completed", "allowed", "allow", "approval", "approved"]:
status = "Approved"
if status.lower() in ["lapsed", "withdrawn"]:
if canonical_status in ["lapsed", "withdrawn"]:
status = "Withdrawn"
if len(status) > status_length_limit:
print("Status was too long and was skipped:", status)
return {
"status": "Processing failed",
"status_explanation_note": "status was unusually long and it was impossible to save it",
"unexpected_status_statistics": unexpected_status_statistics,
}
if status in [
"Submitted",
Expand All @@ -392,26 +409,29 @@ def process_status(status, decision_date):
"Withdrawn",
"Unknown",
]:
return {"status": status, "status_explanation_note": None}
if status in [
"No Objection to Proposal (OBS only)",
"Objection Raised to Proposal (OBS only)",
return {
"status": status,
"status_explanation_note": None,
"unexpected_status_statistics": unexpected_status_statistics,
}
if canonical_status in [
"no objection to proposal (obs only)",
"objection raised to proposal (obs only)",
]:
return {
"status": "Approved",
"status_explanation_note": "preapproved application, local authority is unable to reject it",
"unexpected_status_statistics": unexpected_status_statistics,
}
print("Unexpected status <" + status + ">")
if status not in [
"Not Required",
"SECS",
"Comment Issued",
"ALL DECISIONS ISSUED",
"Closed",
"Declined to Determine",
]:
print("New unexpected status " + status)
return {"status": status, "status_explanation_note": None}
if status not in unexpected_status_statistics:
unexpected_status_statistics[status] = 0
unexpected_status_statistics[status] += 1
return {
"status": status,
"status_explanation_note": None,
"unexpected_status_statistics": unexpected_status_statistics,
}


if __name__ == "__main__":
Expand Down

0 comments on commit 7fcace4

Please sign in to comment.