Skip to content

Commit b12637c

Browse files
committed
fix: #1569 - Somerset Council
1 parent c0d2e23 commit b12637c

File tree

3 files changed

+112
-184
lines changed

uk_bin_collection/tests/input.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2088,10 +2088,11 @@
20882088
"SomersetCouncil": {
20892089
"postcode": "TA6 4AA",
20902090
"skip_get_url": true,
2091-
"uprn": "10090857775",
2091+
"house_number": "5",
20922092
"url": "https://www.somerset.gov.uk/",
2093+
"web_driver": "http://selenium:4444",
20932094
"wiki_name": "Somerset",
2094-
"wiki_note": "Provide your UPRN and postcode. Find your UPRN using [FindMyAddress](https://www.findmyaddress.co.uk/search).",
2095+
"wiki_note": "Provide your house number and postcode",
20952096
"LAD24CD": "E06000066"
20962097
},
20972098
"SouthAyrshireCouncil": {
Lines changed: 105 additions & 179 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1+
import datetime
2+
13
from bs4 import BeautifulSoup
4+
from selenium.webdriver.common.by import By
5+
from selenium.webdriver.support import expected_conditions as EC
6+
from selenium.webdriver.support.wait import WebDriverWait
27

38
from uk_bin_collection.uk_bin_collection.common import *
49
from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass
@@ -13,192 +18,113 @@ class CouncilClass(AbstractGetBinDataClass):
1318
"""
1419

1520
def parse_data(self, page: str, **kwargs) -> dict:
    """Scrape upcoming bin collections from the Somerset Council website.

    Drives a Selenium browser through the council's collection-days lookup
    (postcode search, then address selection) and parses the rendered
    results page with BeautifulSoup.

    Args:
        page: Not used by this implementation; the page is loaded through
            the web driver instead.
        **kwargs: Reads ``paon`` (house name/number), ``postcode``,
            ``web_driver`` (remote Selenium URL) and ``headless``.

    Returns:
        ``{"bins": [...]}`` where each entry holds a ``type`` and a
        ``collectionDate`` string formatted with ``date_format``. Each
        collection card contributes its next date and, when present, its
        "followed by" date.

    Raises:
        Exception: Any failure is printed and then re-raised unchanged.
    """
    data = {"bins": []}
    driver = None
    try:
        user_paon = kwargs.get("paon")
        user_postcode = kwargs.get("postcode")
        web_driver = kwargs.get("web_driver")
        headless = kwargs.get("headless")
        check_paon(user_paon)
        check_postcode(user_postcode)

        # Use a realistic user agent to help bypass Cloudflare
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        driver = create_webdriver(web_driver, headless, user_agent, __name__)
        driver.get("https://www.somerset.gov.uk/collection-days")

        # Wait for the postcode field to appear then populate it
        postcode_input = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.ID, "postcodeSearch"))
        )
        postcode_input.send_keys(user_postcode)

        # Click search button
        search_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "govuk-button"))
        )
        search_button.click()

        # Wait for the 'Select address' dropdown to appear and select the
        # option matching the house name/number
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (
                    By.XPATH,
                    "//select[@id='addressSelect']//option[contains(., '"
                    + user_paon
                    + "')]",
                )
            )
        ).click()

        # Wait for the collections table to appear
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located(
                (
                    By.XPATH,
                    "//h2[contains(@class,'mt-4') and contains(@class,'govuk-heading-s') and normalize-space(.)='Your next collections']",
                )
            )
        )

        soup = BeautifulSoup(driver.page_source, features="html.parser")
        current_year = datetime.now().year

        for collection in soup.find_all("div", {"class": "p-2"}):
            # Look inside *this* card only. Searching the whole document
            # would give every bin type the first card's dates.
            heading = collection.find("h3")
            next_div = collection.find("div", {"class": "fw-bold"})
            if heading is None or next_div is None:
                # NOTE(review): "p-2" looks like a generic utility class, so
                # divs that are not collection cards may match - confirm
                # against the live page.
                continue

            bin_type = heading.get_text()
            dated_texts = [(next_div.get_text().strip(), "%A %d %B")]

            following_div = collection.find(
                lambda t: (
                    t.name == "div"
                    and t.get_text(strip=True).lower().startswith("followed by")
                )
            )
            if following_div is not None:
                dated_texts.append(
                    (following_div.get_text().strip(), "followed by %A %d %B")
                )

            for text, fmt in dated_texts:
                # The page omits the year. Parsing it together with the
                # day/month avoids strptime's default year of 1900, which
                # rejects 29 February.
                parsed = datetime.strptime(f"{text} {current_year}", f"{fmt} %Y")
                collection_date = get_next_occurrence_from_day_month(parsed)
                data["bins"].append(
                    {
                        "type": bin_type,
                        "collectionDate": collection_date.strftime(date_format),
                    }
                )

    except Exception as e:
        # Report the failure, then let it propagate to the caller
        print(f"An error occurred: {e}")
        raise
    finally:
        # Ensure the driver is closed whether or not an exception occurred
        if driver:
            driver.quit()
    return data

wiki/Councils.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3295,14 +3295,15 @@ Note: Replace `XXXXXXXX` with your UPRN. You will need to use [FindMyAddress](ht
32953295

32963296
### Somerset
32973297
```commandline
3298-
python collect_data.py SomersetCouncil https://www.somerset.gov.uk/ -s -u XXXXXXXX -p "XXXX XXX"
3298+
python collect_data.py SomersetCouncil https://www.somerset.gov.uk/ -s -p "XXXX XXX" -n XX -w http://HOST:PORT/
32993299
```
33003300
Additional parameters:
33013301
- `-s` - skip get URL
3302-
- `-u` - UPRN
33033302
- `-p` - postcode
3303+
- `-n` - house number
3304+
- `-w` - remote Selenium web driver URL (required for Home Assistant)
33043305

3305-
Note: Provide your UPRN and postcode. Find your UPRN using [FindMyAddress](https://www.findmyaddress.co.uk/search).
3306+
Note: Provide your house number and postcode
33063307

33073308
---
33083309

0 commit comments

Comments
 (0)