import polars as pl
import polars.selectors as cs
import re

# Polars can read all the sheets of an Excel workbook
# in one go and return a dictionary of data frames, but I want to
# add a column with the year. So I write this function
# that reads a single sheet and adds the column. I then
# map this function over a list of sheet names.
def read_excel(excel_file, sheet):
    out = pl.read_excel(
        source = excel_file,
        sheet_name = sheet,
        read_csv_options = {
            "skip_rows": 6,
            "has_header": True
        }
    ).with_columns(pl.lit(sheet).alias("year"))
    return out

# This function sets the excel_file argument so that I
# map over ’sheet’
def wrap_read_excel(sheet):
    out = read_excel(excel_file = "vente-maison-2010-2021.xlsx",
                     sheet = sheet)
    return out
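
# An equivalent approach (a sketch, not what I use below) would be
# functools.partial from the standard library, which also fixes the
# excel_file argument:
#
# from functools import partial
# wrap_read_excel = partial(read_excel, excel_file = "vente-maison-2010-2021.xlsx")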

# This creates a list of sheet names to map over
sheets = list(map(str, range(2010, 2022)))

# I can now map the function over the list of sheets
# and concatenate them into a single polars data frame
# using pl.concat.
raw_data = pl.concat(list(map(wrap_read_excel, sheets)))

# This function will be used below to clean the column names.
# If I were using Pandas, I could have used clean_columns from skimpy,
# but unfortunately that function doesn’t work with Polars DFs.
# So I write this little function to clean the column names instead.
def clean_names(string):
    # inspired by https://nadeauinnovations.com/post/2020/11/python-tricks-replace-all-non-alphanumeric-characters-in-a-string/
    clean_string = [s for s in string if s.isalnum() or s.isspace()]
    out = "".join(clean_string).lower()
    out = re.sub(r"\s+", "_", out)
    out = out.encode("ascii", "ignore").decode("utf-8")
    return out
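
# For example (the input string here is illustrative, not the exact
# header from the workbook):
#
# clean_names("Prix moyen annoncé en € courant")
# # -> "prix_moyen_annonc_en_courant"
# # the "€" is dropped, spaces become "_", and the ascii
# # encode/decode round-trip strips the accent from "é"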

# Rename the columns using the function defined above.
# Not as nice as skimpy.clean_columns, but works on Polars DataFrames.
raw_data = raw_data.select(pl.all().name.map(clean_names))

raw_data = (
    raw_data
    .rename(
        {
            "commune": "locality",
            "nombre_doffres": "n_offers",
            "prix_moyen_annonc_en_courant": "average_price_nominal_euros",
            "prix_moyen_annonc_au_m_en_courant": "average_price_m2_nominal_euros"
        }
    )
    .with_columns(
        cs.contains("average").cast(pl.Float64, strict = False)
    )
    .with_columns(
        # In some sheets it’s "Luxembourg", in others it’s "Luxembourg-Ville"
        pl.col("locality").str.replace_all("Luxembourg.*", "Luxembourg")
    )
    .with_columns(
        # In some sheets it’s "Pétange", in others it’s "Petange"
        pl.col("locality").str.replace_all("P.*tange", "Pétange")
    )
    .with_columns(
        pl.col("locality").str.strip_chars()
    )
)
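
# A note on strict = False above: values that cannot be parsed as floats
# become null instead of raising an error. A minimal sketch of the
# behaviour (the example values are made up):
#
# pl.Series(["1000", "*"]).cast(pl.Float64, strict = False)
# # -> [1000.0, null]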

# Always look at your data
(
raw_data
.filter(pl.col("average_price_nominal_euros").is_null())
)

# Remove rows with an empty locality
raw_data = (
    raw_data
    .filter(pl.col("locality").is_not_null())
)

# Only keep communes in the data

commune_level_data = (
    raw_data
    .filter(~pl.col("locality").str.contains("nationale|offre|Source"))
    # This is needed on Windows...
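    # ("\351", "\373" and "\344" are the octal escapes for "é", "û" and "ä",
    # which can come out mangled when the file is read there)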
    .with_columns(
        pl.col("locality").str.replace_all("\351", "é")
    )
    .with_columns(
        pl.col("locality").str.replace_all("\373", "û")
    )
    .with_columns(
        pl.col("locality").str.replace_all("\344", "ä")
    )
)

country_level = (
    raw_data
    .filter(pl.col("locality").str.contains("nationale"))
    .select(~cs.contains("n_offers"))
)

offers_country = (
    raw_data
    .filter(pl.col("locality").str.contains("Total d.offres"))
    .select(["year", "n_offers"])
)

country_level_data = (
    country_level.join(offers_country, on = "year")
    .with_columns(pl.lit("Grand-Duchy of Luxembourg").alias("locality"))
)
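
# Optional sanity check (my own addition, assuming one national row per
# year): the join should not duplicate any rows.
#
# assert country_level_data.get_column("year").n_unique() == country_level_data.height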

# For reference: if the data already had a year column, I could have read
# all the sheets in one go using the following code.

#datasets = pl.read_excel(
#    source = "vente-maison-2010-2021.xlsx",
#    sheet_id = 0,
#    read_csv_options = {
#    # Polars skips empty rows that come before any data by default, which is quite helpful.
#    # With Pandas, 10 rows would need to be skipped for sheets 2010 to 2020, but only 8 for sheet 2021;
#    # with Polars, because the empty rows get skipped automatically, the same 6 remaining rows
#    # must get skipped for every sheet. Check out the Excel file to see what I mean.
#    "skip_rows": 6,
#    "has_header": True,
#    # new_columns would be the preferred approach, but for some reason when using it on this Excel file,
#    # two more empty columns appear. So I could call them a and b and then remove them;
#    # this is what the commented line below does. However, I decided to apply a function
#    # that cleans the column names myself. It’s more complicated, but also more elegant as it would
#    # work for any number of columns and in any order.
#    # "new_columns": ["a", "b", "locality", "n_offers", "average_price_nominal_euros", "average_price_m2_nominal_euros"]
#    }
#)
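
# Even without a year column, the sheet names could supply it: when
# sheet_id = 0, pl.read_excel returns a {sheet_name: DataFrame} dict,
# so (a sketch, assuming the datasets dict from the code above):
#
# raw_data = pl.concat(
#     [df.with_columns(pl.lit(sheet).alias("year"))
#      for sheet, df in datasets.items()]
# )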

# We now need to scrape Wikipedia for a table of communes

from urllib.request import urlopen
from bs4 import BeautifulSoup
from pandas import read_html
from io import StringIO
# also need to install lxml

url = 'https://b-rodrigues.github.io/list_communes/'

html = urlopen(url)

tables = (
    BeautifulSoup(html, 'html.parser')
    .find_all("table")
)

current_communes_raw = read_html(StringIO(str(tables[1])))[0]

# current_communes_raw has a MultiIndex, so drop it
current_communes_raw.columns = current_communes_raw.columns.droplevel()

current_communes_pl = (
    pl.DataFrame(current_communes_raw)
    .select(pl.col("Name.1").alias("commune"))
    .with_columns(
        pl.col("commune").str.replace_all("\351", "é")
    )
    .with_columns(
        pl.col("commune").str.replace_all("\373", "û")
    )
    .with_columns(
        pl.col("commune").str.replace_all("\344", "ä")
    )
    .with_columns(
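        # (a note I’m adding: this strips a trailing footnote marker,
        # i.e. a space followed by a single character at the end)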
        pl.col("commune").str.replace_all(" .$", "")
    )
)

current_communes = list(current_communes_pl["commune"])

# Test whether all the communes are in our data.
# If the next expression returns an empty list,
# then we’re good.

(
commune_level_data
  .filter(~pl.col("locality").is_in(current_communes))
  .get_column("locality")
  .unique()
  .sort()
  .to_list()
)

# Need to also check former communes
url = 'https://b-rodrigues.github.io/former_communes/#Former_communes/'

html = urlopen(url)

tables = (
    BeautifulSoup(html, 'html.parser')
    .find_all("table")
)

# The third table (hence the ’2’ in tables[2]) is the one we need
former_communes_raw = read_html(StringIO(str(tables[2])))[0]

former_communes_pl = (
    pl.DataFrame(former_communes_raw)
    .with_columns(
        pl.col("Name").str.replace_all("\351", "é")
    )
    .with_columns(
        pl.col("Name").str.replace_all("\373", "û")
    )
    .with_columns(
        pl.col("Name").str.replace_all("\344", "ä")
    )
    .select(pl.col("Name").alias("commune"))
)

# Combine former and current communes

communes = (
    pl.concat([former_communes_pl, current_communes_pl])
    .get_column("commune")
    .unique()
    .sort()
    .to_list()
)

(
commune_level_data
  .filter(~pl.col("locality").is_in(communes))
  .get_column("locality")
  .unique()
  .sort()
  .to_list()
)

# Some communes are spelled differently in Wikipedia
# than in our data, so let’s correct the spelling
# of the Wikipedia ones
# ['Clémency', 'Erpeldange', 'Kaerjeng', 'Luxembourg', 'Pétange']

communes_clean = (
    pl.concat([former_communes_pl, current_communes_pl])
    .with_columns(
        pl.when(pl.col("commune").str.contains("Cl.mency"))
        .then(pl.lit("Clémency"))
        .otherwise(pl.col("commune")).alias("commune")
    )
    .with_columns(
        pl.when(pl.col("commune").str.contains("Erpeldange"))
        .then(pl.lit("Erpeldange"))
        .otherwise(pl.col("commune")).alias("commune")
    )
    .with_columns(
        pl.when(pl.col("commune").str.contains("City"))
        .then(pl.lit("Luxembourg"))
        .otherwise(pl.col("commune")).alias("commune")
    )
    .with_columns(
        pl.when(pl.col("commune").str.contains("K.*jeng"))
        .then(pl.lit("Kaerjeng"))
        .otherwise(pl.col("commune")).alias("commune")
    )
    .with_columns(
        pl.when(pl.col("commune").str.contains("P.*tange"))
        .then(pl.lit("Pétange"))
        .otherwise(pl.col("commune")).alias("commune")
    )
    .get_column("commune")
    .unique()
    .sort()
    .to_list()
)
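
# The same fixes could be written as one pass over (pattern, replacement)
# pairs with functools.reduce and str.replace_all; a sketch of my own,
# not the approach used above (note the ".*" anchors so that whole
# values get replaced, matching the when/then logic):
#
# from functools import reduce
# fixes = [("Cl.mency", "Clémency"), (".*Erpeldange.*", "Erpeldange"),
#          (".*City.*", "Luxembourg"), ("K.*jeng", "Kaerjeng"), ("P.*tange", "Pétange")]
# cleaned = reduce(lambda expr, f: expr.str.replace_all(f[0], f[1]),
#                  fixes, pl.col("commune"))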

# Test whether all the communes are in our data.
# If the next expression returns an empty list,
# then we’re good.

(
commune_level_data
  .filter(~pl.col("locality").is_in(communes_clean))
  .get_column("locality")
  .unique()
  .sort()
  .to_list()
)

# save data as csv

commune_level_data.write_csv("commune_level_data.csv")
country_level_data.write_csv("country_level_data.csv")