1
+ import datetime
2
+
1
3
from bs4 import BeautifulSoup
4
+ from selenium .webdriver .common .by import By
5
+ from selenium .webdriver .support import expected_conditions as EC
6
+ from selenium .webdriver .support .wait import WebDriverWait
2
7
3
8
from uk_bin_collection .uk_bin_collection .common import *
4
9
from uk_bin_collection .uk_bin_collection .get_bin_data import AbstractGetBinDataClass
@@ -13,192 +18,113 @@ class CouncilClass(AbstractGetBinDataClass):
13
18
"""
14
19
15
20
def parse_data(self, page: str, **kwargs) -> dict:
    """Scrape Somerset Council bin-collection dates with Selenium.

    Navigates https://www.somerset.gov.uk/collection-days, enters the
    postcode, selects the address matching the house name/number, then
    parses the "Your next collections" cards from the rendered page.

    Keyword Args:
        paon: House name/number used to pick the address option (required).
        postcode: Postcode to search for (required).
        web_driver: Optional remote webdriver URL passed to create_webdriver.
        headless: Whether to run the browser headless.

    Returns:
        dict: ``{"bins": [{"type": ..., "collectionDate": ...}, ...]}`` with
        two entries (next + following collection) per bin type.

    Raises:
        Re-raises any exception after logging; the webdriver is always quit.
    """
    # Bind the datetime *class* explicitly: the module-level `import datetime`
    # would otherwise make `datetime.strptime` an AttributeError unless the
    # star import happens to rebind the name.
    from datetime import datetime

    driver = None
    try:
        data = {"bins": []}
        user_paon = kwargs.get("paon")
        user_postcode = kwargs.get("postcode")
        web_driver = kwargs.get("web_driver")
        headless = kwargs.get("headless")
        check_paon(user_paon)
        check_postcode(user_postcode)

        # Use a realistic user agent to help bypass Cloudflare
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        driver = create_webdriver(web_driver, headless, user_agent, __name__)
        driver.get("https://www.somerset.gov.uk/collection-days")

        # Wait for the postcode field to appear then populate it
        inputElement_postcode = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.ID, "postcodeSearch"))
        )
        inputElement_postcode.send_keys(user_postcode)

        # Click the search button once it is actually clickable
        find_address = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "govuk-button"))
        )
        find_address.click()

        # Wait for the 'Select address' dropdown and pick the option
        # containing the supplied house name/number
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (
                    By.XPATH,
                    "//select[@id='addressSelect']//option[contains(., '"
                    + user_paon
                    + "')]",
                )
            )
        ).click()

        # Wait for the collections section heading to render
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located(
                (
                    By.XPATH,
                    "//h2[contains(@class,'mt-4') and contains(@class,'govuk-heading-s') and normalize-space(.)='Your next collections']",
                )
            )
        )

        soup = BeautifulSoup(driver.page_source, features="html.parser")

        # Each bin type is rendered as a card div with class "p-2"
        collections = soup.find_all("div", {"class": "p-2"})

        for collection in collections:
            heading = collection.find("h3")
            if heading is None:
                # Not every "p-2" div is a collection card; skip non-cards
                # instead of raising AttributeError.
                continue
            bin_type = heading.get_text(strip=True)

            # BUG FIX: search within THIS card (collection.find), not the
            # whole page (soup.find) — the old code gave every bin type the
            # dates of the first card on the page.
            next_collection = collection.find(
                "div", {"class": "fw-bold"}
            ).get_text(strip=True)

            following_collection = collection.find(
                lambda t: (
                    t.name == "div"
                    and t.get_text(strip=True).lower().startswith("followed by")
                )
            ).get_text(strip=True)

            # Dates are published without a year, e.g. "Friday 14 June"
            next_collection_date = datetime.strptime(next_collection, "%A %d %B")
            following_collection_date = datetime.strptime(
                following_collection, "followed by %A %d %B"
            )

            # Anchor both dates to the current year, then roll forward to the
            # next real occurrence (handles the December→January wrap).
            current_date = datetime.now()
            next_collection_date = next_collection_date.replace(
                year=current_date.year
            )
            following_collection_date = following_collection_date.replace(
                year=current_date.year
            )

            next_collection_date = get_next_occurrence_from_day_month(
                next_collection_date
            )
            following_collection_date = get_next_occurrence_from_day_month(
                following_collection_date
            )

            data["bins"].append(
                {
                    "type": bin_type,
                    "collectionDate": next_collection_date.strftime(date_format),
                }
            )
            data["bins"].append(
                {
                    "type": bin_type,
                    "collectionDate": following_collection_date.strftime(
                        date_format
                    ),
                }
            )

    except Exception as e:
        # Log then propagate so the caller sees the failure
        print(f"An error occurred: {e}")
        raise
    finally:
        # Ensure the driver is closed regardless of an exception
        if driver:
            driver.quit()
    return data
0 commit comments