
Commit 31c6452

project start

committed Jan 14, 2024
1 parent f259875 commit 31c6452

6 files changed: +926 -3 lines changed

.github/workflows/render_publish.yaml (+13)

@@ -20,6 +20,19 @@ jobs:
           # uncomment below and fill to pin a version
           version: 1.3.353
 
+      - name: Setup R
+        uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: '4.3.2'
+
+      - name: Install packages
+        uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          packages: |
+            any::withr
+            any::ggplot2
+            any::knitr
+
       - name: Set up Python 3.10
         uses: actions/setup-python@v5
         with:

_quarto.yml (+2)

@@ -15,6 +15,8 @@ book:
   chapters:
     - index.qmd
     - preface.qmd
+    - setting_up.qmd
+    - project_start.qmd
     - references.qmd
   page-navigation: true

index.qmd (+6 -3)

@@ -52,11 +52,14 @@
 and you'll be able to buy a DRM-free Epub or PDF on
 [Leanpub](https://leanpub.com/)^[https://leanpub.com/] once there's more
 content.
 
-This Python edition is shorter than the R version. Here are the topics that
-I will cover:
+This is the *Python edition* of my book titled [Building reproducible analytical
+pipelines with R](https://raps-with-r.dev)^[https://raps-with-r.dev]. This means
+that a lot of the text is copied over, but all of the code and concepts are
+completely adapted to the Python programming language. This book is also
+shorter than the R version. Here are the topics that I will cover:
 
 - Dependency management with `pipenv`;
-- Some thoughts on functional programming;
+- Some thoughts on functional programming with Python;
 - Unit and assertive testing;
 - Build automation with `ploomber`;
 - Literate programming with Quarto;

project_start.qmd (+593)

Large diffs are not rendered by default.

scripts/save_data.py (+308)

@@ -0,0 +1,308 @@
import polars as pl
import polars.selectors as cs
import re

# Polars can read all the sheets of an Excel workbook
# in one go and return a dictionary of sheets, but I want to
# add a column with the year. So I write this function
# that reads a single sheet and adds the column. I then
# map this function over a list of sheet names.
def read_excel(excel_file, sheet):
    out = pl.read_excel(
        source = excel_file,
        sheet_name = sheet,
        read_csv_options = {
            "skip_rows": 6,
            "has_header": True
        }
    ).with_columns(pl.lit(sheet).alias("year"))
    return out

# This function sets the excel_file argument so that I
# only need to map over ’sheet’
def wrap_read_excel(sheet):
    out = read_excel(excel_file = "vente-maison-2010-2021.xlsx",
                     sheet = sheet)
    return out
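# (Aside, not in the original script: the same partial application
# could be written with functools.partial instead of an explicit
# wrapper -- a sketch only, assuming the same file name as above:
# from functools import partial
# wrap_read_excel = partial(read_excel, "vente-maison-2010-2021.xlsx"))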

# This creates the list of sheet names to map over
sheets = list(map(str, range(2010, 2022)))

# I can now map the function over the list of sheets
# and concatenate the results into a single Polars data frame
# using pl.concat.
raw_data = pl.concat(list(map(wrap_read_excel, sheets)))

# This function will be used below to clean the column names.
# If I was using Pandas, I could have used clean_columns from skimpy,
# but unfortunately that function doesn’t work with Polars DFs,
# so I write this little function to clean the column names instead.
def clean_names(string):
    # inspired by https://nadeauinnovations.com/post/2020/11/python-tricks-replace-all-non-alphanumeric-characters-in-a-string/
    clean_string = [s for s in string if s.isalnum() or s.isspace()]
    out = "".join(clean_string).lower()
    out = re.sub(r"\s+", "_", out)
    out = out.encode("ascii", "ignore").decode("utf-8")
    return out
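# (Aside, not in the original script: a worked example of what
# clean_names does. The French header below is illustrative:
# clean_names("Prix moyen annoncé en € courant")
# drops the "€", lowercases, replaces whitespace runs with "_" and
# strips the accented characters, returning
# "prix_moyen_annonc_en_courant")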

# This renames the columns using the function defined above.
# Not as nice as skimpy.clean_columns, but it works on Polars DataFrames
raw_data = raw_data.select(pl.all().name.map(clean_names))

raw_data = (
    raw_data
    .rename(
        {
            "commune": "locality",
            "nombre_doffres": "n_offers",
            "prix_moyen_annonc_en_courant": "average_price_nominal_euros",
            "prix_moyen_annonc_au_m_en_courant": "average_price_m2_nominal_euros"
        }
    )
    .with_columns(
        cs.contains("average").cast(pl.Float64, strict = False)
    )
    .with_columns(
        # In some sheets it’s "Luxembourg", in others it’s "Luxembourg-Ville"
        pl.col("locality").str.replace_all("Luxembourg.*", "Luxembourg")
    )
    .with_columns(
        # In some sheets it’s "Pétange", in others it’s "Petange"
        pl.col("locality").str.replace_all("P.*tange", "Pétange")
    )
    .with_columns(
        pl.col("locality").str.strip_chars()
    )
)

# Always look at your data: these are the rows with missing prices
(
    raw_data
    .filter(pl.col("average_price_nominal_euros").is_null())
)

# Remove rows with an empty locality
raw_data = (
    raw_data
    .filter(~pl.col("locality").is_null())
)

# Only keep communes in the data
commune_level_data = (
    raw_data
    .filter(~pl.col("locality").str.contains("nationale|offre|Source"))
    # This is needed on Windows...
    .with_columns(
        pl.col("locality").str.replace_all("\351", "é")
    )
    .with_columns(
        pl.col("locality").str.replace_all("\373", "û")
    )
    .with_columns(
        pl.col("locality").str.replace_all("\344", "ä")
    )
)

country_level = (
    raw_data
    .filter(pl.col("locality").str.contains("nationale"))
    .select(~cs.contains("n_offers"))
)

offers_country = (
    raw_data
    .filter(pl.col("locality").str.contains("Total d.offres"))
    .select(["year", "n_offers"])
)

country_level_data = (
    country_level.join(offers_country, on = "year")
    .with_columns(pl.lit("Grand-Duchy of Luxembourg").alias("locality"))
)

# If the data already had a year column, I could have read all the sheets
# in one go using the following code:

#datasets = pl.read_excel(
#    source = "vente-maison-2010-2021.xlsx",
#    sheet_id = 0,
#    read_csv_options = {
#        # Polars skips empty rows that come before any data by default, which
#        # is quite helpful: with Pandas, 10 rows would need to get skipped for
#        # sheets 2010 to 2020, but only 8 for sheet 2021. With Polars, because
#        # empty rows get skipped automatically, a uniform 6 rows must get
#        # skipped. Check out the Excel file to see what I mean.
#        "skip_rows": 6,
#        "has_header": True#,
#        # new_columns would be the preferred approach, but for some reason when
#        # using it on this Excel file, two more empty columns appear. I could
#        # call them a and b and then remove them; this is what the commented
#        # line below does. However, I decided to apply a function that cleans
#        # the column names myself. It’s more complicated, but also more elegant,
#        # as it works for any number of columns and in any order.
#        # "new_columns": ["a", "b", "locality", "n_offers", "average_price_nominal_euros", "average_price_m2_nominal_euros"]
#    }
#)
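# (Aside, not in the original script: with sheet_id = 0, pl.read_excel
# returns a dict mapping sheet names to DataFrames, so the year column
# could also be rebuilt from the dict keys -- a sketch only:
# dfs = pl.read_excel(source = "vente-maison-2010-2021.xlsx", sheet_id = 0,
#                     read_csv_options = {"skip_rows": 6, "has_header": True})
# raw_data = pl.concat(
#     [df.with_columns(pl.lit(year).alias("year")) for year, df in dfs.items()]
# ))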

# We now need to scrape Wikipedia for a table

from urllib.request import urlopen
from bs4 import BeautifulSoup
from pandas import read_html
from io import StringIO
# lxml also needs to be installed

url = 'https://b-rodrigues.github.io/list_communes/'

html = urlopen(url)

tables = (
    BeautifulSoup(html, 'html.parser')
    .find_all("table")
)

current_communes_raw = read_html(StringIO(str(tables[1])))[0]

# current_communes_raw has a MultiIndex, so drop one level
current_communes_raw.columns = current_communes_raw.columns.droplevel()

current_communes_pl = (
    pl.DataFrame(current_communes_raw)
    .select(pl.col("Name.1").alias("commune"))
    .with_columns(
        pl.col("commune").str.replace_all("\351", "é")
    )
    .with_columns(
        pl.col("commune").str.replace_all("\373", "û")
    )
    .with_columns(
        pl.col("commune").str.replace_all("\344", "ä")
    )
    .with_columns(
        pl.col("commune").str.replace_all(" .$", "")
    )
)

current_communes = list(current_communes_pl["commune"])

# Test whether all the communes are in our data.
# If the next expression returns an empty list,
# then we’re good.

(
    commune_level_data
    .filter(~pl.col("locality").is_in(current_communes))
    .get_column("locality")
    .unique()
    .sort()
    .to_list()
)

# We also need to check the former communes
url = 'https://b-rodrigues.github.io/former_communes/#Former_communes/'

html = urlopen(url)

tables = (
    BeautifulSoup(html, 'html.parser')
    .find_all("table")
)

# The third table (hence the ’2’ in tables[2]) is the one we need
former_communes_raw = read_html(StringIO(str(tables[2])))[0]

former_communes_pl = (
    pl.DataFrame(former_communes_raw)
    .with_columns(
        pl.col("Name").str.replace_all("\351", "é")
    )
    .with_columns(
        pl.col("Name").str.replace_all("\373", "û")
    )
    .with_columns(
        pl.col("Name").str.replace_all("\344", "ä")
    )
    .select(pl.col("Name").alias("commune"))
)

# Combine former and current communes

communes = (
    pl.concat([former_communes_pl, current_communes_pl])
    .get_column("commune")
    .unique()
    .sort()
    .to_list()
)

(
    commune_level_data
    .filter(~pl.col("locality").is_in(communes))
    .get_column("locality")
    .unique()
    .sort()
    .to_list()
)

# Some communes are spelled differently on Wikipedia than in our
# data, so let’s correct the spelling of the Wikipedia ones:
# ['Clémency', 'Erpeldange', 'Kaerjeng', 'Luxembourg', 'Pétange']

communes_clean = (
    pl.concat([former_communes_pl, current_communes_pl])
    .with_columns(
        pl.when(pl.col("commune").str.contains("Cl.mency"))
        .then(pl.lit("Clémency"))
        .otherwise(pl.col("commune")).alias("commune")
    )
    .with_columns(
        pl.when(pl.col("commune").str.contains("Erpeldange"))
        .then(pl.lit("Erpeldange"))
        .otherwise(pl.col("commune")).alias("commune")
    )
    .with_columns(
        pl.when(pl.col("commune").str.contains("City"))
        .then(pl.lit("Luxembourg"))
        .otherwise(pl.col("commune")).alias("commune")
    )
    .with_columns(
        pl.when(pl.col("commune").str.contains("K.*jeng"))
        .then(pl.lit("Kaerjeng"))
        .otherwise(pl.col("commune")).alias("commune")
    )
    .with_columns(
        pl.when(pl.col("commune").str.contains("P.*tange"))
        .then(pl.lit("Pétange"))
        .otherwise(pl.col("commune")).alias("commune")
    )
    .get_column("commune")
    .unique()
    .sort()
    .to_list()
)

# Test whether all the communes are in our data.
# If the next expression returns an empty list,
# then we’re good.

(
    commune_level_data
    .filter(~pl.col("locality").is_in(communes_clean))
    .get_column("locality")
    .unique()
    .sort()
    .to_list()
)
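# (Aside, not in the original script: in the spirit of assertive
# testing, this eyeball check could be made to fail loudly -- a sketch:
# assert commune_level_data.filter(
#     ~pl.col("locality").is_in(communes_clean)
# ).is_empty(), "some communes in the data are missing from the Wikipedia lists")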

# Save the data as csv
commune_level_data.write_csv("commune_level_data.csv")
country_level_data.write_csv("country_level_data.csv")

setting_up.qmd (+4)

@@ -0,0 +1,4 @@
# Setting up a development environment

I have to start with one of the hardest chapters of the book:
how to set up a development environment for Python.
