diff --git a/.gitignore b/.gitignore index 11676a02..673554e4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,9 +3,12 @@ _book/ node_modules/ *.pyc *.swp + book.pdf analysis-essentials.pdf shell/files/data-shell.zip build .ipynb_checkpoints -*~ \ No newline at end of file +*~ +/.idea + diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b4c5969c..f6e5de40 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: v5.0.0 hooks: - id: check-added-large-files - args: ['--maxkb=1000'] + args: [ '--maxkb=1000' ] - id: check-merge-conflict - id: check-case-conflict - id: check-symlinks @@ -21,16 +21,20 @@ repos: - id: nbqa-pyupgrade additional_dependencies: [ pyupgrade ] - args: [ --py38-plus ] -# -# - repo: https://github.com/ambv/black -# rev: 21.9b0 -# hooks: -# - id: black -# args: [ --line-length=120 ] -# + args: [ --py39-plus ] + - repo: https://github.com/kynan/nbstripout rev: 0.8.1 hooks: - id: nbstripout + args: [ --extra-keys=metadata.language_info.codemirror_mode.version metadata.kernelspec metadata.language_info.pygments_lexer metadata.language_info.version ] + + + +# needs rust, only activate if needed +# - repo: https://github.com/shssoichiro/oxipng +# rev: v9.1.2 +# hooks: +# - id: oxipng +# args: [ --best --strip all --quiet ] diff --git a/advanced-python/10Basics.ipynb b/advanced-python/10Basics.ipynb index 9301d4bd..c55798b7 100644 --- a/advanced-python/10Basics.ipynb +++ b/advanced-python/10Basics.ipynb @@ -391,7 +391,7 @@ "metadata": {}, "outputs": [], "source": [ - "{'a': 'b'}.get?" + "dict.get?" 
] }, { @@ -506,22 +506,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" + "nbconvert_exporter": "python" }, "nbsphinx": { "execute": "auto" diff --git a/advanced-python/11AdvancedPython.ipynb b/advanced-python/11AdvancedPython.ipynb index 2a1c253b..1d9bcd4b 100644 --- a/advanced-python/11AdvancedPython.ipynb +++ b/advanced-python/11AdvancedPython.ipynb @@ -965,22 +965,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" + "nbconvert_exporter": "python" } }, "nbformat": 4, diff --git a/advanced-python/12AdvancedClasses.ipynb b/advanced-python/12AdvancedClasses.ipynb index 41185a08..faeab09e 100644 --- a/advanced-python/12AdvancedClasses.ipynb +++ b/advanced-python/12AdvancedClasses.ipynb @@ -503,22 +503,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" + "nbconvert_exporter": "python" } }, "nbformat": 4, diff --git a/advanced-python/20DataAndPlotting.ipynb b/advanced-python/20DataAndPlotting.ipynb index df8e83ee..cec014d2 100644 --- 
a/advanced-python/20DataAndPlotting.ipynb +++ b/advanced-python/20DataAndPlotting.ipynb @@ -113,8 +113,9 @@ "metadata": {}, "outputs": [], "source": [ - "my_file = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", - " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64})\n", + "my_file = uproot.open(\n", + " \"https://cern.ch/starterkit/data/advanced-python-2018/real_data.root\"\n", + ")\n", "\n", "my_file.keys()" ] @@ -132,9 +133,9 @@ "metadata": {}, "outputs": [], "source": [ - "tree = my_file['DecayTree']\n", + "tree = my_file[\"DecayTree\"]\n", "# Get a numpy array containing the J/Ψ mass\n", - "tree['Jpsi_M'].array(library='np')" + "tree[\"Jpsi_M\"].array(library=\"np\")" ] }, { @@ -144,8 +145,8 @@ "outputs": [], "source": [ "# Load data as a pandas DataFrame\n", - "data_df = tree.arrays(library='pd')\n", - "my_file.close() # usually, it's better to open the file with a \"with\" statement -> needs no closing\n", + "data_df = tree.arrays(library=\"pd\")\n", + "my_file.close() # usually, it's better to open the file with a \"with\" statement -> closes automatically if outside block\n", "\n", "# Show the first 5 lines of the DataFrame\n", "data_df.head()" @@ -175,8 +176,8 @@ "outputs": [], "source": [ "# Start with a basic histogram\n", - "plt.hist(data_df['Jpsi_M'])\n", - "plt.xlabel('Jpsi mass')" + "plt.hist(data_df[\"Jpsi_M\"])\n", + "plt.xlabel(\"Jpsi mass\")" ] }, { @@ -224,8 +225,8 @@ "outputs": [], "source": [ "# plotting again\n", - "plt.hist(data_df['Jpsi_M'], bins=40)\n", - "plt.xlabel('Jpsi mass')" + "plt.hist(data_df[\"Jpsi_M\"], bins=40)\n", + "plt.xlabel(\"Jpsi mass\")" ] }, { @@ -239,8 +240,8 @@ "outputs": [], "source": [ "# And similar with mplhep\n", - "mplhep.histplot(*np.histogram(data_df['Jpsi_M'], bins=30))\n", - "plt.xlabel('Jpsi mass')" + "mplhep.histplot(*np.histogram(data_df[\"Jpsi_M\"], bins=30))\n", + "plt.xlabel(\"Jpsi mass\")" ] }, { @@ -256,13 +257,13 @@ "# ... 
except that it can do a lot more!\n", "\n", "# we need to histogram only once!\n", - "h, bins = np.histogram(data_df['Jpsi_M'], bins=50) # TRY OUT: change the binning\n", - "plt.subplots(1,2,figsize=(20, 6))\n", + "h, bins = np.histogram(data_df[\"Jpsi_M\"], bins=50) # TRY OUT: change the binning\n", + "plt.subplots(1, 2, figsize=(20, 6))\n", "plt.subplot(1, 2, 1)\n", "mplhep.histplot(h, bins, yerr=True) # error can also be array\n", "plt.subplot(1, 2, 2)\n", "half_binwidths = (bins[1] - bins[0]) / 2\n", - "mplhep.histplot(h, bins, histtype='errorbar', yerr=True, xerr=half_binwidths)" + "mplhep.histplot(h, bins, histtype=\"errorbar\", yerr=True, xerr=half_binwidths)" ] }, { @@ -272,10 +273,10 @@ "outputs": [], "source": [ "def plot_mass(df):\n", - " h, bins = np.histogram(df['Jpsi_M'], bins=100, range=[2.75, 3.5])\n", + " h, bins = np.histogram(df[\"Jpsi_M\"], bins=100, range=[2.75, 3.5])\n", " mplhep.histplot(h, bins, yerr=True) # feel free to adjust\n", " # You can also use LaTeX in the axis label\n", - " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", + " plt.xlabel(\"$J/\\\\psi$ mass [GeV]\")\n", " plt.xlim(bins[0], bins[-1])\n", "\n", "\n", @@ -296,8 +297,8 @@ "outputs": [], "source": [ "# When making the ROOT file we forgot to add some variables, no bother lets add them now!\n", - "data_df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", - "data_df.head()['Jpsi_eta']" + "data_df.eval(\"Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)\", inplace=True)\n", + "data_df.head()[\"Jpsi_eta\"]" ] }, { @@ -313,10 +314,10 @@ "metadata": {}, "outputs": [], "source": [ - "data_df.eval('mup_P = sqrt(mup_PX**2 + mup_PY**2 + mup_PZ**2)', inplace=True)\n", - "data_df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + "data_df.eval(\"mup_P = sqrt(mup_PX**2 + mup_PY**2 + mup_PZ**2)\", inplace=True)\n", + "data_df.eval(\"mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)\", inplace=True)\n", "# We can also get multiple columns at the same time\n", - 
"data_df.head()[['mum_P', 'mup_P']]" + "data_df.head()[[\"mum_P\", \"mup_P\"]]" ] }, { @@ -342,7 +343,7 @@ "outputs": [], "source": [ "plot_mass(data_df)\n", - "data_with_cuts_df = data_df.query('Jpsi_PT > 4')\n", + "data_with_cuts_df = data_df.query(\"Jpsi_PT > 4\")\n", "plot_mass(data_with_cuts_df)" ] }, @@ -353,10 +354,12 @@ "outputs": [], "source": [ "plot_mass(data_df)\n", - "data_with_cuts_df = data_df.query('Jpsi_PT > 4')\n", + "data_with_cuts_df = data_df.query(\"Jpsi_PT > 4\")\n", "plot_mass(data_with_cuts_df)\n", "# Lets add some PID cuts as well\n", - "data_with_cuts_df = data_df.query('(Jpsi_PT > 4) & ((mum_ProbNNmu > 0.9) & (mup_ProbNNmu > 0.9))')\n", + "data_with_cuts_df = data_df.query(\n", + " \"(Jpsi_PT > 4) & ((mum_ProbNNmu > 0.9) & (mup_ProbNNmu > 0.9))\"\n", + ")\n", "plot_mass(data_with_cuts_df)" ] }, @@ -375,19 +378,21 @@ "outputs": [], "source": [ "def plot_mass(df, **kwargs):\n", - " h, bins = np.histogram(df['Jpsi_M'], bins=100, range=[2.75, 3.5])\n", + " h, bins = np.histogram(df[\"Jpsi_M\"], bins=100, range=[2.75, 3.5])\n", " mplhep.histplot(h, bins, yerr=True, **kwargs) # feel free to adjust\n", " # You can also use LaTeX in the axis label\n", - " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", + " plt.xlabel(\"$J/\\\\psi$ mass [GeV]\")\n", " plt.xlim(bins[0], bins[-1])\n", "\n", "\n", - "plot_mass(data_df, label='No cuts', density=1)\n", - "data_with_cuts_df = data_df.query('Jpsi_PT > 4')\n", - "plot_mass(data_with_cuts_df, label='$J/\\\\psi$ p$_T$ only', density=1)\n", - "data_with_cuts_df = data_df.query('(Jpsi_PT > 4) & ((mum_ProbNNmu > 0.9) & (mup_ProbNNmu > 0.9))')\n", - "plot_mass(data_with_cuts_df, label='$J/\\\\psi$ p$_T$ and muon PID', density=1)\n", - "plt.legend(loc='best')" + "plot_mass(data_df, label=\"No cuts\", density=1)\n", + "data_with_cuts_df = data_df.query(\"Jpsi_PT > 4\")\n", + "plot_mass(data_with_cuts_df, label=\"$J/\\\\psi$ p$_T$ only\", density=1)\n", + "data_with_cuts_df = data_df.query(\n", + " \"(Jpsi_PT > 4) & 
((mum_ProbNNmu > 0.9) & (mup_ProbNNmu > 0.9))\"\n", + ")\n", + "plot_mass(data_with_cuts_df, label=\"$J/\\\\psi$ p$_T$ and muon PID\", density=1)\n", + "plt.legend(loc=\"best\")" ] }, { @@ -414,14 +419,16 @@ "source": [ "from python_lesson import check_truth\n", "\n", - "print('Originally the significance is')\n", + "print(\"Originally the significance is\")\n", "check_truth(data_df)\n", "\n", - "print('\\nCutting on pT gives us')\n", - "check_truth(data_df.query('Jpsi_PT > 4'))\n", + "print(\"\\nCutting on pT gives us\")\n", + "check_truth(data_df.query(\"Jpsi_PT > 4\"))\n", "\n", - "print('\\nCutting on pT and ProbNNmu gives us')\n", - "check_truth(data_df.query('(Jpsi_PT > 4) & ((mum_ProbNNmu > 0.9) & (mup_ProbNNmu > 0.9))'))" + "print(\"\\nCutting on pT and ProbNNmu gives us\")\n", + "check_truth(\n", + " data_df.query(\"(Jpsi_PT > 4) & ((mum_ProbNNmu > 0.9) & (mup_ProbNNmu > 0.9))\")\n", + ")" ] }, { @@ -443,14 +450,16 @@ "metadata": {}, "outputs": [], "source": [ - "with uproot.open('https://starterkit.web.cern.ch/starterkit/data/advanced-python-2018/simulated_data.root') as mc_file:\n", - " mc_df = mc_file['DecayTree'].arrays(library='pd')\n", + "with uproot.open(\n", + " \"https://starterkit.web.cern.ch/starterkit/data/advanced-python-2018/simulated_data.root\"\n", + ") as mc_file:\n", + " mc_df = mc_file[\"DecayTree\"].arrays(library=\"pd\")\n", "\n", "# mc_file = uproot.open('https://starterkit.web.cern.ch/starterkit/data/advanced-python-2018/simulated_data.root')\n", "# mc_df = mc_file['DecayTree'].arrays(library='pd')\n", - "mc_df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", - "mc_df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", - "mc_df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)" + "mc_df.eval(\"Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)\", inplace=True)\n", + "mc_df.eval(\"mup_P = sqrt(mup_PX**2 + mup_PY**2 + mup_PZ**2)\", inplace=True)\n", + "mc_df.eval(\"mum_P = sqrt(mum_PX**2 + 
mum_PY**2 + mum_PZ**2)\", inplace=True)" ] }, { @@ -470,7 +479,7 @@ "metadata": {}, "outputs": [], "source": [ - "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", + "bkg_df = data_df.query(\"~(3.0 < Jpsi_M < 3.2)\")\n", "plot_mass(bkg_df)" ] }, @@ -496,16 +505,16 @@ "metadata": {}, "outputs": [], "source": [ - "var = 'Jpsi_PT'\n", + "var = \"Jpsi_PT\"\n", "# let's first create the histograms\n", "hsig, bins = np.histogram(mc_df[var], bins=60)\n", "hbkg, bins = np.histogram(bkg_df[var], bins=bins) # using the same bins here\n", "# then plot them\n", - "mplhep.histplot((hsig, bins), label='MC Signal')\n", - "mplhep.histplot(hbkg, bins=bins, label='Data Bkg')\n", + "mplhep.histplot((hsig, bins), label=\"MC Signal\")\n", + "mplhep.histplot(hbkg, bins=bins, label=\"Data Bkg\")\n", "plt.xlabel(var)\n", "plt.xlim(bins[0], bins[-1])\n", - "plt.legend(loc='best')" + "plt.legend(loc=\"best\")" ] }, { @@ -517,11 +526,11 @@ "# Those are hard to compare!\n", "# We can add the density keyword argument to normalise the distributions\n", "\n", - "mplhep.histplot(hsig, bins=bins, label='MC Signal', density=1)\n", - "mplhep.histplot(hbkg, bins=bins, label='Data Bkg', density=1)\n", + "mplhep.histplot(hsig, bins=bins, label=\"MC Signal\", density=1)\n", + "mplhep.histplot(hbkg, bins=bins, label=\"Data Bkg\", density=1)\n", "plt.xlabel(var)\n", "plt.xlim(bins[0], bins[-1])\n", - "plt.legend(loc='best')" + "plt.legend(loc=\"best\")" ] }, { @@ -542,11 +551,14 @@ " hsig, bins = np.histogram(mc_df[var], bins=60, density=1)\n", " hbkg, bins = np.histogram(bkg_df[var], bins=bins, density=1)\n", "\n", - " mplhep.histplot((hsig, bins), label='MC Signal', )\n", - " mplhep.histplot(hbkg, bins=bins, label='Data Bkg')\n", + " mplhep.histplot(\n", + " (hsig, bins),\n", + " label=\"MC Signal\",\n", + " )\n", + " mplhep.histplot(hbkg, bins=bins, label=\"Data Bkg\")\n", " plt.xlabel(var)\n", " plt.xlim(bins[0], bins[-1])\n", - " plt.legend(loc='best')" + " plt.legend(loc=\"best\")" ] }, { @@ 
-646,22 +658,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" + "nbconvert_exporter": "python" } }, "nbformat": 4, diff --git a/advanced-python/30Classification.ipynb b/advanced-python/30Classification.ipynb index 9c07d9aa..22c99b49 100644 --- a/advanced-python/30Classification.ipynb +++ b/advanced-python/30Classification.ipynb @@ -19,7 +19,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "jupyter": { + "is_executing": true + } + }, "outputs": [], "source": [ "%store -r bkg_df\n", @@ -30,7 +34,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "jupyter": { + "is_executing": true + } + }, "outputs": [], "source": [ "import mplhep\n", @@ -650,22 +658,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" + "nbconvert_exporter": "python" } }, "nbformat": 4, diff --git a/advanced-python/31ClassificationExtension.ipynb b/advanced-python/31ClassificationExtension.ipynb index 3213818e..607bc71e 100644 --- a/advanced-python/31ClassificationExtension.ipynb +++ b/advanced-python/31ClassificationExtension.ipynb @@ -336,22 +336,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 
3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" + "nbconvert_exporter": "python" } }, "nbformat": 4, diff --git a/advanced-python/32BoostingToUniformity.ipynb b/advanced-python/32BoostingToUniformity.ipynb index ae62b845..eb29139c 100644 --- a/advanced-python/32BoostingToUniformity.ipynb +++ b/advanced-python/32BoostingToUniformity.ipynb @@ -237,22 +237,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" + "nbconvert_exporter": "python" }, "nbsphinx": { "execute": "never" diff --git a/advanced-python/33ModelTuning.ipynb b/advanced-python/33ModelTuning.ipynb deleted file mode 100644 index be6ea321..00000000 --- a/advanced-python/33ModelTuning.ipynb +++ /dev/null @@ -1,681 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model tuning setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#%store -r training_data\n", - "#%store -r training_columns\n", - "#%store -r bkg_df\n", - "#%store -r mc_df\n", - "#%store -r data_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#@title\n", - "#!pip install uproot\n", - "#!pip install sklearn\n", - "\n", - "import time\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import uproot\n", - "import xgboost as xgb\n", - "from matplotlib import pyplot as plt\n", - "from sklearn import metrics\n", - "from sklearn.ensemble import AdaBoostClassifier, 
GradientBoostingClassifier\n", - "from sklearn.metrics import auc, roc_curve\n", - "from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,\n", - " cross_validate, train_test_split)\n", - "from xgboost.sklearn import XGBClassifier" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Time and processing check for the lesson\n", - "stt = time.time()\n", - "stc = time.process_time()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_mass(df, label=\"\", norm=True):\n", - " counts, bins, _ = plt.hist(df['Jpsi_M'], label=label, bins=100, range=[2.75, 3.5], histtype='step', density=norm)\n", - " # You can also use LaTeX in the axis label\n", - " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", - " plt.xlim(bins[0], bins[-1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_comparision(var, mc_df, bkg_df):\n", - " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", - " _, bins, _ = plt.hist(bkg_df[var], bins=bins, histtype='step', label='Background', density=1)\n", - " plt.xlabel(var)\n", - " plt.xlim(bins[0], bins[-1])\n", - " plt.legend(loc='best')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_roc(bdt, training_data, training_columns, label=None):\n", - " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", - " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", - " area = auc(fpr, tpr)\n", - "\n", - " plt.plot([0, 1], [0, 1], color='grey', linestyle='--')\n", - " if label:\n", - " plt.plot(fpr, tpr, label=f'{label} (area = {area:.2f})')\n", - " else:\n", - " plt.plot(fpr, tpr, label=f'ROC curve (area = {area:.2f})')\n", - " plt.xlim(0.0, 1.0)\n", - " plt.ylim(0.0, 1.0)\n", - " plt.xlabel('False 
Positive Rate')\n", - " plt.ylabel('True Positive Rate')\n", - " plt.legend(loc='lower right')\n", - " # We can make the plot look nicer by forcing the grid to be square\n", - " plt.gca().set_aspect('equal', adjustable='box')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_significance(bdt, training_data, training_columns, label):\n", - " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", - " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", - "\n", - " n_sig = 1200\n", - " n_bkg = 23000\n", - " S = n_sig*tpr + (n_sig*tpr==0)*1\n", - " B = n_bkg*fpr + (n_bkg*tpr==0)*1\n", - " metric = S/np.sqrt(S+B)\n", - "\n", - " plt.plot(thresholds, metric, label=label)\n", - " plt.xlabel('BDT cut value')\n", - " plt.ylabel('$\\\\frac{S}{\\\\sqrt{S+B}}$')\n", - " plt.xlim(0, 1.0)\n", - "\n", - " optimum = np.max(metric)\n", - " optimal_cut = thresholds[np.argmax(metric)]\n", - " print(label, \": S/sqrt(S+B) =\", optimum, \" at x =\", optimal_cut)\n", - " plt.axvline(x=optimal_cut, color='black', linewidth=1.0, linestyle='--')\n", - "\n", - " return optimal_cut" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly\n", - "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", - " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", - "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", - " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", - " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", - "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", - "\n", - "for df in [mc_df, data_df, 
bkg_df]:\n", - " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", - " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", - " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", - "\n", - "bkg_df['catagory'] = 0 # Use 0 for background\n", - "mc_df['catagory'] = 1 # Use 1 for signal\n", - "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['IPdiff'] = np.abs(df['mum_PT'] - df['mup_PT'])\n", - "\n", - "training_columns = [\n", - " 'Jpsi_PT',\n", - " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", - " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Previously we trained an XGBClassifier with the default settings, with learning rate = 0.3 and maximum iterations = 100. This cut off to the training process may be limiting the performance of our model. We can monitor the performance of our model as a function of training iteration and stop the training when the gradient approximates zero. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X1, y1 = training_data[training_columns], training_data['catagory']\n", - "X_train, X_test, y_train, y_test = train_test_split(X1, y1)\n", - "# default train_size = 0.25, this can be varied to suit your data\n", - "\n", - "LR = 0.3 # the coefficient of step size decay, eta, has alias 'learning_rate' with default 0.3\n", - "\n", - "stime = time.time()\n", - "bdt = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_jobs=-1)\n", - "bdt.fit(training_data[training_columns], training_data['catagory'])\n", - "print(\"XGBoost --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGB'] = bdt.predict_proba(df[training_columns])[:,1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cross-validation\n", - "\n", - "Splitting the data into randomised subsets for training allows you to monitor your model's performance on the fly using the statistically independant remainder of your sample - this is called cross-validation (CV). We can see below that at the 100th iteration the metrics still show a trend of improvement." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def training_monitor(alg):\n", - "\n", - " # A model trained with eval_set and eval_metric will return evals_result\n", - " results = alg.evals_result()\n", - " epochs = len(results['validation_0']['logloss'])\n", - " x_axis = range(0, epochs)\n", - "\n", - " # Plotting logLoss as a function of training iteration\n", - " fig, ax = plt.subplots()\n", - " ax.plot(x_axis, results['validation_0']['logloss'], label='Train') # for each eval_set\n", - " if results['validation_1']: ax.plot(x_axis, results['validation_1']['logloss'], label='Test')\n", - " ax.legend()\n", - " plt.ylabel('LogLoss')\n", - " plt.title('LogLoss')\n", - " plt.show()\n", - "\n", - " # Plotting classification error as a function of training iteration\n", - " fig, ax = plt.subplots()\n", - " ax.plot(x_axis, results['validation_0']['error'], label='Train') # for each eval_set\n", - " if results['validation_1']: ax.plot(x_axis, results['validation_1']['error'], label='Test')\n", - " ax.legend()\n", - " plt.ylabel('Error')\n", - " plt.title('Error')\n", - " plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This involves training on less data but allows us to monitor progress to check if the model is becoming over-specific to our training sample. The minimisation of loss and classification error are common metrics for model assessment. As shown below, the cost to performance is negligible. If the test sample gradient were to invert this would be considered overtraining and is why monitoring performance without CV can be a time costly pitfall." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Defining a model with multi-threading set to maximum\n", - "bdt_cv = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1)\n", - "\n", - "# Model fitting with CV and printing out processing time\n", - "stime = time.time()\n", - "bdt_cv.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", - " eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)\n", - "print(\"\\nXGBoost cross-validation --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Writing model predictions out for data\n", - "training_monitor(bdt_cv)\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBcv'] = bdt_cv.predict_proba(df[training_columns])[:,1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Drawing plot of model respone for signal and background classes\n", - "plt.figure()\n", - "plot_comparision('XGB', mc_df, bkg_df)\n", - "plot_comparision('XGBcv', mc_df, bkg_df)\n", - "\n", - "# Drawing the signal efficiency vs background rejection curve (ROC)\n", - "plt.figure()\n", - "plot_roc(bdt, training_data, training_columns)\n", - "plot_roc(bdt_cv, training_data, training_columns)\n", - "\n", - "# Drawing signal significance comparison as a function of minimum cut on model response\n", - "plt.figure()\n", - "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", - "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### $k$-folding & early stopping\n", - "\n", - "Performing CV on each of a number, k, of ways to split your data gives you k models to choose from. Some choose to average the performance across the models from each fold as any instability might imply the model will not be reliable. 
The results below seem stable; each fold provides a consistant performance across multiple metrics, so we'll just choose the best one." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Defining the folds with a seed to test consistently\n", - "splits = 4 # to match 0.25 value of test_train_split default though this may not be optimal\n", - "kf = KFold(n_splits=splits, shuffle=True, random_state=123)\n", - "\n", - "# Printing processing time of the kfold cross-validation\n", - "stime = time.time()\n", - "for train, test in kf.split(X1):\n", - " X_train, X_test = X1.iloc[train], X1.iloc[test]\n", - " y_train, y_test = y1.iloc[train], y1.iloc[test]\n", - " bdt.fit(X_train,y_train)\n", - "print(\"\\nXGBoost k-folding --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Calculating scores of each fold using variety of CV-metrics\n", - "cv_acc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"accuracy\", n_jobs=-1)\n", - "cv_los = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"neg_log_loss\", n_jobs=-1)\n", - "cv_auc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"roc_auc\", n_jobs=-1)\n", - "\n", - "# Printing results and indicating best fold\n", - "print(\"accuracy: \",cv_acc, \" -> best fold =\", np.argmax(cv_acc) )\n", - "print(\"-logloss: \",cv_los, \" -> best fold =\", np.argmax(cv_los) )\n", - "print(\"roc_auc: \",cv_auc, \" -> best fold =\", np.argmax(cv_auc) )\n", - "bestfold = np.argmax(cv_acc)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Early stopping defines a maximum number of rounds the cross-validation metric (we'll use 'error'=1-accuracy) is allowed to not improve before training is terminated. As is standard, we will be reverting back to a 'previous best' model based on test sample score, this helps avoid overtraining. Early stopping prevents us training too many of extra models thus saving time. 
Set the limit too small though and your training might be cut off prematurely." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def modelfit(alg, metric, params, label, predictors, kfold, fbest, early_stop=10):\n", - "\n", - " # Loading data split inputs providing best fold result\n", - " for k, (train, test) in enumerate(kf.split(params)):\n", - " if (k==fbest):\n", - " X_train, X_test = params.iloc[train], params.iloc[test]\n", - " y_train, y_test = label.iloc[train], label.iloc[test]\n", - "\n", - " # Defining data in terms of training variables and class label\n", - " xgb_param = alg.get_xgb_params()\n", - " data = xgb.DMatrix(params, label=label, feature_names=predictors, nthread=-1)\n", - "\n", - " # Runs timed CV on our model using early stopping based on our metric\n", - " stime = time.time()\n", - " cvresult = xgb.cv(xgb_param,\n", - " data,\n", - " num_boost_round=alg.get_params()['n_estimators'],\n", - " #nfold=cv_folds, # to use in build folding\n", - " folds=kfold, # use -> ignores nfold\n", - " metrics=metric,\n", - " early_stopping_rounds=early_stop)\n", - " alg.set_params(n_estimators=cvresult.shape[0])\n", - " print(\"\\nXGBoost early-stop folding --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - " # Fitting the algorithm on the data with CV evaluation early stopping\n", - " stime = time.time()\n", - " alg.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", - " eval_set=[(X_train, y_train), (X_test, y_test)],\n", - " verbose=False, early_stopping_rounds=early_stop)\n", - " training_monitor(alg)\n", - " print(\"XGBoost early-stop limit --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - " # Predicting training set:\n", - " train_predictions = alg.predict(X_train)\n", - " test_predictions = alg.predict(X_test)\n", - "\n", - " # Printing model report:\n", - " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", - " print(\"Train Accuracy 
: \"+str(metrics.accuracy_score(y_train, train_predictions)))\n", - " print(\"Test Accuracy : \"+str(metrics.accuracy_score(y_test, test_predictions)))\n", - " return cvresult.shape[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function incorporates the k-folding CV and early stopping, saving not only the optimal model but also the index of its training iteration. This means, in our subsequent steps, we can apply an upper limit on training for models based on the convergence of the default hyperparameters, saving us some time. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Defining model with high maximum estimators for use with early stopping\n", - "bdt_es = XGBClassifier(learning_rate = LR, n_estimators=1000,\n", - " # Default values of other hyperparamters\n", - " #max_depth=6, min_child_weight=1,\n", - " #gamma=0, subsample=0.8,\n", - " #colsample_bytree=0.8, scale_pos_weight=1,\n", - " #objective='binary:logistic', # default for binary classification\n", - " #objective='mutli:softprob', num_class=3, # for multiclassifiers\n", - " seed=123, n_jobs=-1)\n", - "\n", - "# Timing the CV using early stopping\n", - "stime = time.time()\n", - "estimators = modelfit(bdt_es, \"error\", X1, y1, training_columns, kf, bestfold)\n", - "print(\"\\nmodelfit(bdt_es) --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Saving model predictions\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBes'] = bdt_es.predict_proba(df[training_columns])[:,1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This provides us with an improved model as well as a benchmark to test against in both performance and training efficiency. 
When training using new combinations of hyperparameters, the maximum number of estimators from our model report will cut off any new models improving more slowly than our default, while, for more efficient models, the early stopping will kick in." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Drawing plot to compare model response for signal and background classes\n", - "plt.figure()\n", - "plot_comparision('XGBcv', mc_df, bkg_df)\n", - "plot_comparision('XGBes', mc_df, bkg_df)\n", - "\n", - "# Drawing comaprison of the signal efficiency vs background rejection curve (ROC)\n", - "plt.figure()\n", - "plot_roc(bdt_cv, training_data, training_columns)\n", - "plot_roc(bdt_es, training_data, training_columns)\n", - "\n", - "# Drawing signal significance comparison as a function of minimum cut on model response\n", - "plt.figure()\n", - "bdt_cut_cv = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", - "bdt_cut_es = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hyperameter optimisation\n", - "\n", - "Below we provide a \"grid\" of hyperparameters, defining the structure of the trees and constraints on the learning, but there are many more values to choose from and a larger parameter space to be explored. These optimsations are very problem specific and their impact will have to be weighed against the computing resources and timeframe you have at your disposal. For the sake of expedient demonstration we are comparing the default parameters to only one predetermined variation in 2 parameters. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Define a function that performs a gridscan of HPs\n", - "\n", - "\n", - "def hpgridscan(alg, metric, params, label, kfold, fbest, early_stop=10):\n", - "\n", - " # Load data fold with best performance\n", - " for k, (train, test) in enumerate(kf.split(params)):\n", - " if (k==fbest):\n", - " X_train, X_test = params.iloc[train], params.iloc[test]\n", - " y_train, y_test = label.iloc[train], label.iloc[test]\n", - "\n", - " # Define a dictionary of numpy arrays for our HPs\n", - " params = {\n", - " 'max_depth':np.array([7]),\n", - " 'min_child_weight':np.array([3]),\n", - " #'max_depth':np.arange( 5, 9, 1 ),\n", - " #'min_child_weight':np.arange( 1, 5, 1 ),\n", - " ##'gamma':np.arange( 0.0, 1.0, 0.1 ),\n", - " ##'colsample_bytree':np.arange( 0.4, 1.0, 0.1 ),\n", - " ##'subsample':np.arange( 0.4, 1.0, 0.1 ),\n", - " ##'scale_pos_weight':np.arange( 0.4, 1.6, 0.1 )\n", - " }\n", - "\n", - " # Perform timed grid scan with established n_estimator cutoff and early stopping\n", - " stime = time.time()\n", - " gs = GridSearchCV(estimator=alg,\n", - " param_grid=params,\n", - " scoring=metric,\n", - " #iid=False,\n", - " cv=kf,\n", - " n_jobs=-1)\n", - " gs.fit(X_train, y_train, eval_metric=[\"logloss\",\"error\"],\n", - " eval_set=[(X_train, y_train), (X_test, y_test)],\n", - " verbose=False, early_stopping_rounds=early_stop)\n", - " print(\"XGBoost grid-scan --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - " # Return suggested parameters, performance and best model\n", - " training_monitor(gs.best_estimator_)\n", - " print(\"Suggestion:\", gs.best_params_)\n", - " print(\"Accuracy:\" ,gs.best_score_)\n", - " return gs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Running with estimators maximum for shortened training\n", - "bdt_st = XGBClassifier( learning_rate = LR, 
n_estimators=estimators,\n", - " seed=123, n_jobs=-1)\n", - "\n", - "# Running timed hyperparameter gridscan\n", - "stime = time.time()\n", - "gs = hpgridscan(bdt_st, \"accuracy\", X1, y1, kf, bestfold)\n", - "bdt_gs = gs.best_estimator_\n", - "print(\"\\nhpgridscan(bdt_st) --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "# Get model predictions\n", - "for df in [mc_df, bkg_df, data_df, training_data]:\n", - " df['XGBgs'] = bdt_gs.predict_proba(df[training_columns])[:,1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Even this naive grid scan, using the same fold as before for fair comparison, can provide significant improvements as demonstrated above. These may be pushed further by including more hyperparameters for a trade off with processing time. However, even with parrallisation these tasks can take hours or longer and might only provide improvement of O(>1%)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## We could define a model using optimal hyperparameters from our grid scan\n", - "#bdt_opt = XGBClassifier( learning_rate = LR, n_estimators=1000,\n", - "# max_depth=gs.best_params_['max_depth'],\n", - "# min_child_weight=gs.best_params_['min_child_weight'],\n", - "# seed=123, n_jobs=-1 )\n", - "\n", - "## Run with CV early stopping\n", - "#stime = time.time()\n", - "#estimators = modelfit(bdt_opt, 'error', X1, y1, training_columns, kf, bestfold)\n", - "#print(\"\\nmodelfit(bdt_opt) --- %s seconds ---\" % (time.time() - stime))\n", - "\n", - "## Get model predictions\n", - "#for df in [mc_df, bkg_df, data_df, training_data]:\n", - "# df['XGBopt'] = bdt_opt.predict_proba(df[training_columns])[:,1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Comapring model response from the end of last session to the end of this one\n", - "plt.figure()\n", - "plot_comparision('XGB', mc_df, 
bkg_df)\n", - "plot_comparision('XGBgs', mc_df, bkg_df)\n", - "\n", - "# Comparing model performance for each level of tuning\n", - "plt.figure()\n", - "plot_roc(bdt, training_data, training_columns)\n", - "plot_roc(bdt_cv, training_data, training_columns)\n", - "plot_roc(bdt_es, training_data, training_columns)\n", - "plot_roc(bdt_gs, training_data, training_columns)\n", - "#plot_roc(bdt_opt, training_data, training_columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Comparing the impact on projected performance at each stage of the tutorial\n", - "plt.figure()\n", - "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", - "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", - "bdt_es_cut = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")\n", - "bdt_gs_cut = plot_significance(bdt_gs, training_data, training_columns, \"bdt_gs\")\n", - "#bdt_opt_cut = plot_significance(bdt_opt, training_data, training_columns, \"bdt_opt\")\n", - "\n", - "# Comparing best cuts impact on mass for original and tuned model\n", - "plt.figure()\n", - "data_bdt_cut = data_df.query('XGB > %f' %bdt_cut )\n", - "plot_mass(data_bdt_cut, label='XGB default', norm=True)\n", - "data_gs_cut = data_df.query('XGBgs > %f' %bdt_gs_cut )\n", - "plot_mass(data_gs_cut, label='XGB tuned', norm=True)\n", - "plt.legend(loc='best')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Comparing our data sample's mass plot having applied the cut optimised for $\\sigma=\\frac{S}{\\sqrt{S+B}}$ from each BDT output, we can see how the improved model reduces relative background. 
However, while we define our signal training sample from MC you'll remember we defined our background training sample from the data !(3.0 < JPsi_M < 3.2).\n", - "\n", - "We can see shoulders at the edges of the regions where we define our background training sample in our data's mass spectrum now. Our training and validation samples include a subset of our data sample so there's potential that our model is learning the difference between MC and data and exploiting that or demonstrating overtraining on the 'previously seen' data (remember we could see our train and test samples beginning to diverge in our validation metrics with more iterations).\n", - "\n", - "Below you can see replotting the normalised mass distribution from just the data not included in training demonstrates no significant improvement. This is not ideal and might be addressed by choosing the setup of our training more carefully. For example, we could train using background from same-sign muon MC across the full mass range (a common practice in LHC experiments) or, using other libraries such as UGBoost to introduce a punishment to the training for introducing a depedance of efficiency on mass." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sig_df = data_df.query('(3.0 < Jpsi_M < 3.2)')\n", - "sig_bdt_cut = sig_df.query('XGB > %f' %bdt_cut )\n", - "plot_mass(sig_bdt_cut, label='XGB default', norm=True)\n", - "sig_gs_cut = sig_df.query('XGBgs > %f' %bdt_gs_cut )\n", - "plot_mass(sig_gs_cut, label='XGB tuned', norm=True)\n", - "plt.legend(loc='best')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also choose a higher learning rate to perform course scans of your space and decrease it again to retrain your final model. If you can afford to, it might be best to include learning rate itself as a parameter in your grid. With some libraries you can specify your choice of kernel. 
Both these choices will impact your optimal maximum number of iterations, so setting it sufficiently high and using early stopping might be a good strategy.\n", - "\n", - "For less exhaustive and non-discritised methods try smart combinations of the following to perform adaptive scans or build your own:\n", - "* sklearn.model_selection.RandomizedSearchCV\n", - "* sklearn.model_selection.GridSearchCV\n", - "\n", - "Moving to higher dimentional optimisation problems may require more sophisticated solutions:\n", - "* skopt.BayesSearchCV\n", - "* hyperopt.tpe" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Full stats plots saved here: bit.ly/LHCb_XGB_Tuning\n", - "Run with full stats by removing entrystop at max_events in cell 8." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Final lesson time and processing time check\n", - "print(\"Notebook real time --- %s seconds ---\" % (time.time() - stt))\n", - "print(\"Notebook CPU time --- %s seconds ---\" % (time.process_time() - stc))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/advanced-python/33ModelTuning.ipynbBKP b/advanced-python/33ModelTuning.ipynbBKP new file mode 100644 index 00000000..38e469e9 --- /dev/null +++ b/advanced-python/33ModelTuning.ipynbBKP @@ -0,0 +1,884 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model tuning setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#%store -r training_data\n", + "#%store -r 
training_columns\n", + "#%store -r bkg_df\n", + "#%store -r mc_df\n", + "#%store -r data_df" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#@title\n", + "#!pip install uproot\n", + "#!pip install sklearn\n", + "\n", + "import time\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import uproot\n", + "import xgboost as xgb\n", + "from matplotlib import pyplot as plt\n", + "from sklearn import metrics\n", + "from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier\n", + "from sklearn.metrics import auc, roc_curve\n", + "from sklearn.model_selection import (GridSearchCV, KFold, cross_val_score,\n", + " cross_validate, train_test_split)\n", + "from xgboost.sklearn import XGBClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Time and processing check for the lesson\n", + "stt = time.time()\n", + "stc = time.process_time()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_mass(df, label=\"\", norm=True):\n", + " counts, bins, _ = plt.hist(df['Jpsi_M'], label=label, bins=100, range=[2.75, 3.5], histtype='step', density=norm)\n", + " # You can also use LaTeX in the axis label\n", + " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", + " plt.xlim(bins[0], bins[-1])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_comparision(var, mc_df, bkg_df):\n", + " _, bins, _ = plt.hist(mc_df[var], bins=100, histtype='step', label='MC', density=1)\n", + " _, bins, _ = plt.hist(bkg_df[var], bins=bins, histtype='step', label='Background', density=1)\n", + " plt.xlabel(var)\n", + " plt.xlim(bins[0], bins[-1])\n", + " plt.legend(loc='best')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_roc(bdt, 
training_data, training_columns, label=None):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + " area = auc(fpr, tpr)\n", + "\n", + " plt.plot([0, 1], [0, 1], color='grey', linestyle='--')\n", + " if label:\n", + " plt.plot(fpr, tpr, label=f'{label} (area = {area:.2f})')\n", + " else:\n", + " plt.plot(fpr, tpr, label=f'ROC curve (area = {area:.2f})')\n", + " plt.xlim(0.0, 1.0)\n", + " plt.ylim(0.0, 1.0)\n", + " plt.xlabel('False Positive Rate')\n", + " plt.ylabel('True Positive Rate')\n", + " plt.legend(loc='lower right')\n", + " # We can make the plot look nicer by forcing the grid to be square\n", + " plt.gca().set_aspect('equal', adjustable='box')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_significance(bdt, training_data, training_columns, label):\n", + " y_score = bdt.predict_proba(training_data[training_columns])[:,1]\n", + " fpr, tpr, thresholds = roc_curve(training_data['catagory'], y_score)\n", + "\n", + " n_sig = 1200\n", + " n_bkg = 23000\n", + " S = n_sig*tpr + (n_sig*tpr==0)*1\n", + " B = n_bkg*fpr + (n_bkg*tpr==0)*1\n", + " metric = S/np.sqrt(S+B)\n", + "\n", + " plt.plot(thresholds, metric, label=label)\n", + " plt.xlabel('BDT cut value')\n", + " plt.ylabel('$\\\\frac{S}{\\\\sqrt{S+B}}$')\n", + " plt.xlim(0, 1.0)\n", + "\n", + " optimum = np.max(metric)\n", + " optimal_cut = thresholds[np.argmax(metric)]\n", + " print(label, \": S/sqrt(S+B) =\", optimum, \" at x =\", optimal_cut)\n", + " plt.axvline(x=optimal_cut, color='black', linewidth=1.0, linestyle='--')\n", + "\n", + " return optimal_cut" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_113570/96520257.py:9: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice 
from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", + "/tmp/ipykernel_113570/96520257.py:10: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + "/tmp/ipykernel_113570/96520257.py:11: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + "/tmp/ipykernel_113570/96520257.py:13: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " bkg_df['catagory'] = 0 # Use 0 for background\n", + "/tmp/ipykernel_113570/96520257.py:17: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df['IPdiff'] = 
np.abs(df['mum_PT'] - df['mup_PT'])\n" + ] + } + ], + "source": [ + "#max_entries = 1000 # try running with low stats for bug fixing your changes quickly\n", + "data_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root',\n", + " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", + "mc_df = uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/simulated_data.root',\n", + " )['DecayTree'].arrays(library='pd')#,entry_stop=max_entries)\n", + "bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')\n", + "\n", + "for df in [mc_df, data_df, bkg_df]:\n", + " df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)\n", + " df.eval('mup_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + " df.eval('mum_P = sqrt(mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)\n", + "\n", + "bkg_df['catagory'] = 0 # Use 0 for background\n", + "mc_df['catagory'] = 1 # Use 1 for signal\n", + "training_data = pd.concat([bkg_df, mc_df], copy=True, ignore_index=True)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['IPdiff'] = np.abs(df['mum_PT'] - df['mup_PT'])\n", + "\n", + "training_columns = [\n", + " 'Jpsi_PT',\n", + " 'mup_PT', 'mup_eta', 'mup_ProbNNmu', 'mup_IP',\n", + " 'mum_PT', 'mum_eta', 'mum_ProbNNmu', 'mum_IP',\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Previously we trained an XGBClassifier with the default settings, with learning rate = 0.3 and maximum iterations = 100. This cut off to the training process may be limiting the performance of our model. We can monitor the performance of our model as a function of training iteration and stop the training when the gradient approximates zero. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "XGBoost --- 37.32467699050903 seconds ---\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_113570/3009250180.py:13: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df['XGB'] = bdt.predict_proba(df[training_columns])[:,1]\n" + ] + } + ], + "source": [ + "X1, y1 = training_data[training_columns], training_data['catagory']\n", + "X_train, X_test, y_train, y_test = train_test_split(X1, y1)\n", + "# default train_size = 0.25, this can be varied to suit your data\n", + "\n", + "LR = 0.3 # the coefficient of step size decay, eta, has alias 'learning_rate' with default 0.3\n", + "\n", + "stime = time.time()\n", + "bdt = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_jobs=-1)\n", + "bdt.fit(training_data[training_columns], training_data['catagory'])\n", + "print(\"XGBoost --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGB'] = bdt.predict_proba(df[training_columns])[:,1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cross-validation\n", + "\n", + "Splitting the data into randomised subsets for training allows you to monitor your model's performance on the fly using the statistically independant remainder of your sample - this is called cross-validation (CV). We can see below that at the 100th iteration the metrics still show a trend of improvement." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def training_monitor(alg):\n", + "\n", + " # A model trained with eval_set and eval_metric will return evals_result\n", + " results = alg.evals_result()\n", + " epochs = len(results['validation_0']['logloss'])\n", + " x_axis = range(0, epochs)\n", + "\n", + " # Plotting logLoss as a function of training iteration\n", + " fig, ax = plt.subplots()\n", + " ax.plot(x_axis, results['validation_0']['logloss'], label='Train') # for each eval_set\n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['logloss'], label='Test')\n", + " ax.legend()\n", + " plt.ylabel('LogLoss')\n", + " plt.title('LogLoss')\n", + " plt.show()\n", + "\n", + " # Plotting classification error as a function of training iteration\n", + " fig, ax = plt.subplots()\n", + " ax.plot(x_axis, results['validation_0']['error'], label='Train') # for each eval_set\n", + " if results['validation_1']: ax.plot(x_axis, results['validation_1']['error'], label='Test')\n", + " ax.legend()\n", + " plt.ylabel('Error')\n", + " plt.title('Error')\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This involves training on less data but allows us to monitor progress to check if the model is becoming over-specific to our training sample. The minimisation of loss and classification error are common metrics for model assessment. As shown below, the cost to performance is negligible. If the test sample gradient were to invert, this would be considered overtraining, which is why monitoring performance without CV can be a time-costly pitfall."
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jonas/mambaforge/envs/anaessentials311/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [02:00:05] WARNING: /workspace/src/learner.cc:740: \n", + "Parameters: { \"n_threads\" } are not used.\n", + "\n", + " warnings.warn(smsg, UserWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "XGBoost cross-validation --- 50.098893880844116 seconds ---\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_113570/3527880945.py:13: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df['XGBcv'] = bdt_cv.predict_proba(df[training_columns])[:,1]\n" + ] + } + ], + "source": [ + "# Defining a model with multi-threading set to maximum\n", + "bdt_cv = XGBClassifier(learning_rate = LR, n_estimators=100, seed=123, n_threads=-1, eval_metric=[\"logloss\",\"error\"])\n", + "\n", + "# Model fitting with CV and printing out processing time\n", + "stime = time.time()\n", + "bdt_cv.fit(X_train, y_train,\n", + " eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)\n", + "print(\"\\nXGBoost cross-validation --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Writing model predictions out for data\n", + "training_monitor(bdt_cv)\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBcv'] = bdt_cv.predict_proba(df[training_columns])[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bdt : S/sqrt(S+B) = 15.405805978293522 at x = 0.80857503\n", + "bdt_cv : S/sqrt(S+B) = 15.179113634975664 at x = 0.7901888\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Drawing plot of model respone for signal and background classes\n", + "plt.figure()\n", + "plot_comparision('XGB', mc_df, bkg_df)\n", + "plot_comparision('XGBcv', mc_df, bkg_df)\n", + "\n", + "# Drawing the signal efficiency vs background rejection curve (ROC)\n", + "plt.figure()\n", + "plot_roc(bdt, training_data, training_columns)\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "\n", + "# Drawing signal significance comparison as a function of minimum cut on model response\n", + "plt.figure()\n", + "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", + "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### $k$-folding & early stopping\n", + "\n", + "Performing CV on each of a number, k, of ways to split your data gives you k models to choose from. Some choose to average the performance across the models from each fold as any instability might imply the model will not be reliable. The results below seem stable; each fold provides a consistant performance across multiple metrics, so we'll just choose the best one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "XGBoost k-folding --- 100.34803915023804 seconds ---\n", + "accuracy: [0.78412488 0.77760064 0.78477481 0.78856118] -> best fold = 3\n", + "-logloss: [-0.42945196 -0.43137575 -0.42765048 -0.42374993] -> best fold = 3\n", + "roc_auc: [0.87520356 0.87239133 0.8768386 0.87975731] -> best fold = 3\n" + ] + } + ], + "source": [ + "# Defining the folds with a seed to test consistently\n", + "splits = 4 # to match 0.25 value of test_train_split default though this may not be optimal\n", + "kf = KFold(n_splits=splits, shuffle=True, random_state=123)\n", + "\n", + "# Printing processing time of the kfold cross-validation\n", + "stime = time.time()\n", + "for train, test in kf.split(X1):\n", + " X_train, X_test = X1.iloc[train], X1.iloc[test]\n", + " y_train, y_test = y1.iloc[train], y1.iloc[test]\n", + " bdt.fit(X_train,y_train)\n", + "print(\"\\nXGBoost k-folding --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Calculating scores of each fold using variety of CV-metrics\n", + "cv_acc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"accuracy\", n_jobs=-1)\n", + "cv_los = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"neg_log_loss\", n_jobs=-1)\n", + "cv_auc = cross_val_score(bdt, X_test, y_test, cv=splits, scoring=\"roc_auc\", n_jobs=-1)\n", + "\n", + "# Printing results and indicating best fold\n", + "print(\"accuracy: \",cv_acc, \" -> best fold =\", np.argmax(cv_acc) )\n", + "print(\"-logloss: \",cv_los, \" -> best fold =\", np.argmax(cv_los) )\n", + "print(\"roc_auc: \",cv_auc, \" -> best fold =\", np.argmax(cv_auc) )\n", + "bestfold = np.argmax(cv_acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Early stopping defines a maximum number of rounds the cross-validation metric (we'll use 'error'=1-accuracy) is allowed to not improve before 
training is terminated. As is standard, we will be reverting back to a 'previous best' model based on test sample score; this helps avoid overtraining. Early stopping prevents us from training too many extra models, thus saving time. Set the limit too small, though, and your training might be cut off prematurely." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "jupyter": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "def modelfit(alg, metric, params, label, predictors, kfold, fbest, early_stop=10):\n", + "\n", + " # Loading data split inputs providing best fold result\n", + " for k, (train, test) in enumerate(kf.split(params)):\n", + " if (k==fbest):\n", + " X_train, X_test = params.iloc[train], params.iloc[test]\n", + " y_train, y_test = label.iloc[train], label.iloc[test]\n", + "\n", + " # Defining data in terms of training variables and class label\n", + " xgb_param = alg.get_xgb_params()\n", + " data = xgb.DMatrix(params, label=label, feature_names=predictors)\n", + "\n", + " # Runs timed CV on our model using early stopping based on our metric\n", + " stime = time.time()\n", + " cvresult = xgb.cv(xgb_param,\n", + " data,\n", + " num_boost_round=alg.get_params()['n_estimators'],\n", + " #nfold=cv_folds, # to use in build folding\n", + " folds=kfold, # use -> ignores nfold\n", + " metrics=metric)\n", + " alg.set_params(n_estimators=cvresult.shape[0])\n", + " print(\"\\nXGBoost early-stop folding --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Fitting the algorithm on the data with CV evaluation early stopping\n", + " stime = time.time()\n", + " alg.fit(X_train, y_train,\n", + " eval_set=[(X_train, y_train), (X_test, y_test)],\n", + " verbose=False)\n", + " training_monitor(alg)\n", + " print(\"XGBoost early-stop limit --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Predicting training set:\n", + " train_predictions = alg.predict(X_train)\n", + " test_predictions =
alg.predict(X_test)\n", + "\n", + " # Printing model report:\n", + " print(\"\\nModel Report : best iteration \"+str(cvresult.shape[0]))\n", + " print(\"Train Accuracy : \"+str(metrics.accuracy_score(y_train, train_predictions)))\n", + " print(\"Test Accuracy : \"+str(metrics.accuracy_score(y_test, test_predictions)))\n", + " return cvresult.shape[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function incorporates the k-folding CV and early stopping, saving not only the optimal model but also the index of its training iteration. This means, in our subsequent steps, we can apply an upper limit on training for models based on the convergence of the default hyperparameters, saving us some time. " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "XGBoost early-stop folding --- 41.88707709312439 seconds ---\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "KeyError", + "evalue": "'error'", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mKeyError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[15], line 13\u001B[0m\n\u001B[1;32m 11\u001B[0m \u001B[38;5;66;03m# Timing the CV using early stopping\u001B[39;00m\n\u001B[1;32m 12\u001B[0m stime \u001B[38;5;241m=\u001B[39m time\u001B[38;5;241m.\u001B[39mtime()\n\u001B[0;32m---> 13\u001B[0m estimators \u001B[38;5;241m=\u001B[39m \u001B[43mmodelfit\u001B[49m\u001B[43m(\u001B[49m\u001B[43mbdt_es\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43merror\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mX1\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43my1\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mtraining_columns\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mkf\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mbestfold\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 14\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;130;01m\\n\u001B[39;00m\u001B[38;5;124mmodelfit(bdt_es) --- \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m seconds ---\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;241m%\u001B[39m (time\u001B[38;5;241m.\u001B[39mtime() \u001B[38;5;241m-\u001B[39m stime))\n\u001B[1;32m 16\u001B[0m \u001B[38;5;66;03m# Saving model predictions\u001B[39;00m\n", + "Cell \u001B[0;32mIn[14], line 29\u001B[0m, in \u001B[0;36mmodelfit\u001B[0;34m(alg, metric, params, label, predictors, kfold, fbest, early_stop)\u001B[0m\n\u001B[1;32m 25\u001B[0m stime \u001B[38;5;241m=\u001B[39m time\u001B[38;5;241m.\u001B[39mtime()\n\u001B[1;32m 26\u001B[0m 
alg\u001B[38;5;241m.\u001B[39mfit(X_train, y_train,\n\u001B[1;32m 27\u001B[0m eval_set\u001B[38;5;241m=\u001B[39m[(X_train, y_train), (X_test, y_test)],\n\u001B[1;32m 28\u001B[0m verbose\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m)\n\u001B[0;32m---> 29\u001B[0m \u001B[43mtraining_monitor\u001B[49m\u001B[43m(\u001B[49m\u001B[43malg\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 30\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mXGBoost early-stop limit --- \u001B[39m\u001B[38;5;132;01m%s\u001B[39;00m\u001B[38;5;124m seconds ---\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;241m%\u001B[39m (time\u001B[38;5;241m.\u001B[39mtime() \u001B[38;5;241m-\u001B[39m stime))\n\u001B[1;32m 32\u001B[0m \u001B[38;5;66;03m# Predicting training set:\u001B[39;00m\n", + "Cell \u001B[0;32mIn[10], line 19\u001B[0m, in \u001B[0;36mtraining_monitor\u001B[0;34m(alg)\u001B[0m\n\u001B[1;32m 17\u001B[0m \u001B[38;5;66;03m# Plotting classification error as a function of training iteration\u001B[39;00m\n\u001B[1;32m 18\u001B[0m fig, ax \u001B[38;5;241m=\u001B[39m plt\u001B[38;5;241m.\u001B[39msubplots()\n\u001B[0;32m---> 19\u001B[0m ax\u001B[38;5;241m.\u001B[39mplot(x_axis, \u001B[43mresults\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mvalidation_0\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43merror\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m, label\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mTrain\u001B[39m\u001B[38;5;124m'\u001B[39m) \u001B[38;5;66;03m# for each eval_set\u001B[39;00m\n\u001B[1;32m 20\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m results[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mvalidation_1\u001B[39m\u001B[38;5;124m'\u001B[39m]: ax\u001B[38;5;241m.\u001B[39mplot(x_axis, 
results[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mvalidation_1\u001B[39m\u001B[38;5;124m'\u001B[39m][\u001B[38;5;124m'\u001B[39m\u001B[38;5;124merror\u001B[39m\u001B[38;5;124m'\u001B[39m], label\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mTest\u001B[39m\u001B[38;5;124m'\u001B[39m)\n\u001B[1;32m 21\u001B[0m ax\u001B[38;5;241m.\u001B[39mlegend()\n", + "\u001B[0;31mKeyError\u001B[0m: 'error'" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Defining model with high maximum estimators for use with early stopping\n", + "bdt_es = XGBClassifier(learning_rate = LR, n_estimators=1000,\n", + " # Default values of other hyperparamters\n", + " #max_depth=6, min_child_weight=1,\n", + " #gamma=0, subsample=0.8,\n", + " #colsample_bytree=0.8, scale_pos_weight=1,\n", + " #objective='binary:logistic', # default for binary classification\n", + " #objective='mutli:softprob', num_class=3, # for multiclassifiers\n", + " seed=123, nthread=-1)\n", + "\n", + "# Timing the CV using early stopping\n", + "stime = time.time()\n", + "estimators = modelfit(bdt_es, \"error\", X1, y1, training_columns, kf, bestfold)\n", + "print(\"\\nmodelfit(bdt_es) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Saving model predictions\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBes'] = bdt_es.predict_proba(df[training_columns])[:,1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This provides us with an improved model as well as a benchmark to test against in both performance and training efficiency. When training using new combinations of hyperparameters, the maximum number of estimators from our model report will cut off any new models improving more slowly than our default, while, for more efficient models, the early stopping will kick in." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drawing plot to compare model response for signal and background classes\n", + "plt.figure()\n", + "plot_comparision('XGBcv', mc_df, bkg_df)\n", + "plot_comparision('XGBes', mc_df, bkg_df)\n", + "\n", + "# Drawing comparison of the signal efficiency vs background rejection curve (ROC)\n", + "plt.figure()\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "plot_roc(bdt_es, training_data, training_columns)\n", + "\n", + "# Drawing signal significance comparison as a function of minimum cut on model response\n", + "plt.figure()\n", + "bdt_cut_cv = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", + "bdt_cut_es = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hyperparameter optimisation\n", + "\n", + "Below we provide a \"grid\" of hyperparameters, defining the structure of the trees and constraints on the learning, but there are many more values to choose from and a larger parameter space to be explored. These optimisations are very problem specific and their impact will have to be weighed against the computing resources and timeframe you have at your disposal. For the sake of expedient demonstration we are comparing the default parameters to only one predetermined variation in 2 parameters. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define a function that performs a gridscan of HPs\n", + "\n", + "\n", + "def hpgridscan(alg, metric, params, label, kfold, fbest, early_stop=10):\n", + "\n", + " # Load data fold with best performance\n", + " for k, (train, test) in enumerate(kf.split(params)):\n", + " if (k==fbest):\n", + " X_train, X_test = params.iloc[train], params.iloc[test]\n", + " y_train, y_test = label.iloc[train], label.iloc[test]\n", + "\n", + " # Define a dictionary of numpy arrays for our HPs\n", + " params = {\n", + " 'max_depth':np.array([7]),\n", + " 'min_child_weight':np.array([3]),\n", + " #'max_depth':np.arange( 5, 9, 1 ),\n", + " #'min_child_weight':np.arange( 1, 5, 1 ),\n", + " ##'gamma':np.arange( 0.0, 1.0, 0.1 ),\n", + " ##'colsample_bytree':np.arange( 0.4, 1.0, 0.1 ),\n", + " ##'subsample':np.arange( 0.4, 1.0, 0.1 ),\n", + " ##'scale_pos_weight':np.arange( 0.4, 1.6, 0.1 )\n", + " }\n", + "\n", + " # Perform timed grid scan with established n_estimator cutoff and early stopping\n", + " stime = time.time()\n", + " gs = GridSearchCV(estimator=alg,\n", + " param_grid=params,\n", + " scoring=metric,\n", + " #iid=False,\n", + " cv=kf,\n", + " eval_metric=[\"logloss\",\"error\"],\n", + " n_jobs=-1)\n", + " gs.fit(X_train, y_train,\n", + " eval_set=[(X_train, y_train), (X_test, y_test)],\n", + " verbose=False)\n", + " print(\"XGBoost grid-scan --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + " # Return suggested parameters, performance and best model\n", + " training_monitor(gs.best_estimator_)\n", + " print(\"Suggestion:\", gs.best_params_)\n", + " print(\"Accuracy:\" ,gs.best_score_)\n", + " return gs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Running with estimators maximum for shortened training\n", + "bdt_st = XGBClassifier( learning_rate = LR, n_estimators=estimators,\n", + " 
seed=123, n_jobs=-1)\n", + "\n", + "# Running timed hyperparameter gridscan\n", + "stime = time.time()\n", + "gs = hpgridscan(bdt_st, \"accuracy\", X1, y1, kf, bestfold)\n", + "bdt_gs = gs.best_estimator_\n", + "print(\"\\nhpgridscan(bdt_st) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "# Get model predictions\n", + "for df in [mc_df, bkg_df, data_df, training_data]:\n", + " df['XGBgs'] = bdt_gs.predict_proba(df[training_columns])[:,1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even this naive grid scan, using the same fold as before for fair comparison, can provide significant improvements as demonstrated above. These may be pushed further by including more hyperparameters for a trade-off with processing time. However, even with parallelisation these tasks can take hours or longer and might only provide improvement of O(>1%)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## We could define a model using optimal hyperparameters from our grid scan\n", + "#bdt_opt = XGBClassifier( learning_rate = LR, n_estimators=1000,\n", + "# max_depth=gs.best_params_['max_depth'],\n", + "# min_child_weight=gs.best_params_['min_child_weight'],\n", + "# seed=123, n_jobs=-1 )\n", + "\n", + "## Run with CV early stopping\n", + "#stime = time.time()\n", + "#estimators = modelfit(bdt_opt, 'error', X1, y1, training_columns, kf, bestfold)\n", + "#print(\"\\nmodelfit(bdt_opt) --- %s seconds ---\" % (time.time() - stime))\n", + "\n", + "## Get model predictions\n", + "#for df in [mc_df, bkg_df, data_df, training_data]:\n", + "# df['XGBopt'] = bdt_opt.predict_proba(df[training_columns])[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Comparing model response from the end of last session to the end of this one\n", + "plt.figure()\n", + "plot_comparision('XGB', mc_df, bkg_df)\n", + 
"plot_comparision('XGBgs', mc_df, bkg_df)\n", + "\n", + "# Comparing model performance for each level of tuning\n", + "plt.figure()\n", + "plot_roc(bdt, training_data, training_columns)\n", + "plot_roc(bdt_cv, training_data, training_columns)\n", + "plot_roc(bdt_es, training_data, training_columns)\n", + "plot_roc(bdt_gs, training_data, training_columns)\n", + "#plot_roc(bdt_opt, training_data, training_columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Comparing the impact on projected performance at each stage of the tutorial\n", + "plt.figure()\n", + "bdt_cut = plot_significance(bdt, training_data, training_columns, \"bdt\")\n", + "bdt_cv_cut = plot_significance(bdt_cv, training_data, training_columns, \"bdt_cv\")\n", + "bdt_es_cut = plot_significance(bdt_es, training_data, training_columns, \"bdt_es\")\n", + "bdt_gs_cut = plot_significance(bdt_gs, training_data, training_columns, \"bdt_gs\")\n", + "#bdt_opt_cut = plot_significance(bdt_opt, training_data, training_columns, \"bdt_opt\")\n", + "\n", + "# Comparing best cuts impact on mass for original and tuned model\n", + "plt.figure()\n", + "data_bdt_cut = data_df.query('XGB > %f' %bdt_cut )\n", + "plot_mass(data_bdt_cut, label='XGB default', norm=True)\n", + "data_gs_cut = data_df.query('XGBgs > %f' %bdt_gs_cut )\n", + "plot_mass(data_gs_cut, label='XGB tuned', norm=True)\n", + "plt.legend(loc='best')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Comparing our data sample's mass plot having applied the cut optimised for $\\sigma=\\frac{S}{\\sqrt{S+B}}$ from each BDT output, we can see how the improved model reduces relative background. 
However, while we define our signal training sample from MC you'll remember we defined our background training sample from the data !(3.0 < Jpsi_M < 3.2).\n", + "\n", + "We can see shoulders at the edges of the regions where we define our background training sample in our data's mass spectrum now. Our training and validation samples include a subset of our data sample so there's potential that our model is learning the difference between MC and data and exploiting that or demonstrating overtraining on the 'previously seen' data (remember we could see our train and test samples beginning to diverge in our validation metrics with more iterations).\n", + "\n", + "Below you can see that replotting the normalised mass distribution from just the data not included in training demonstrates no significant improvement. This is not ideal and might be addressed by choosing the setup of our training more carefully. For example, we could train using background from same-sign muon MC across the full mass range (a common practice in LHC experiments) or use other libraries such as UGBoost to introduce a penalty to the training for introducing a dependence of efficiency on mass." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sig_df = data_df.query('(3.0 < Jpsi_M < 3.2)')\n", + "sig_bdt_cut = sig_df.query('XGB > %f' %bdt_cut )\n", + "plot_mass(sig_bdt_cut, label='XGB default', norm=True)\n", + "sig_gs_cut = sig_df.query('XGBgs > %f' %bdt_gs_cut )\n", + "plot_mass(sig_gs_cut, label='XGB tuned', norm=True)\n", + "plt.legend(loc='best')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also choose a higher learning rate to perform coarse scans of your space and decrease it again to retrain your final model. If you can afford to, it might be best to include learning rate itself as a parameter in your grid. With some libraries you can specify your choice of kernel. 
Both these choices will impact your optimal maximum number of iterations, so setting it sufficiently high and using early stopping might be a good strategy.\n", + "\n", + "For less exhaustive and non-discretised methods try smart combinations of the following to perform adaptive scans or build your own:\n", + "* sklearn.model_selection.RandomizedSearchCV\n", + "* sklearn.model_selection.GridSearchCV\n", + "\n", + "Moving to higher-dimensional optimisation problems may require more sophisticated solutions:\n", + "* skopt.BayesSearchCV\n", + "* hyperopt.tpe" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Full stats plots saved here: bit.ly/LHCb_XGB_Tuning\n", + "Run with full stats by removing entrystop at max_events in cell 8." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Final lesson time and processing time check\n", + "print(\"Notebook real time --- %s seconds ---\" % (time.time() - stt))\n", + "print(\"Notebook CPU time --- %s seconds ---\" % (time.process_time() - stc))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:anaessentials311] *", + "language": "python", + "name": "python" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/advanced-python/40Histograms.ipynb b/advanced-python/40Histograms.ipynb index 2827a6a9..917384e4 100644 --- a/advanced-python/40Histograms.ipynb +++ b/advanced-python/40Histograms.ipynb @@ -547,22 +547,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, 
"file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" + "nbconvert_exporter": "python" } }, "nbformat": 4, diff --git a/advanced-python/45DemoReweighting.ipynb b/advanced-python/45DemoReweighting.ipynb index 02dd9a47..77215a8a 100644 --- a/advanced-python/45DemoReweighting.ipynb +++ b/advanced-python/45DemoReweighting.ipynb @@ -53,12 +53,10 @@ "columns = ['hSPD', 'pt_b', 'pt_phi', 'vchi2_b', 'mu_pt_sum']\n", "\n", "with uproot.open('https://starterkit.web.cern.ch/starterkit/data/advanced-python-2019/MC_distribution.root',\n", - " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", " ) as original_file:\n", " original_tree = original_file['tree']\n", " original = original_tree.arrays(library='pd')\n", "with uproot.open('https://starterkit.web.cern.ch/starterkit/data/advanced-python-2019/RD_distribution.root',\n", - " httpsource={'chunkbytes': 1024*1024, 'limitbytes': 33554432, 'parallel': 64}\n", " ) as target_file:\n", " target_tree = target_file['tree']\n", " target = target_tree.arrays(library='pd')\n", @@ -487,22 +485,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" + "nbconvert_exporter": "python" } }, "nbformat": 4, diff --git a/advanced-python/50LikelihoodInference.ipynb b/advanced-python/50LikelihoodInference.ipynb index 0ec7c601..536fc13f 100644 --- a/advanced-python/50LikelihoodInference.ipynb +++ b/advanced-python/50LikelihoodInference.ipynb @@ -41,7 +41,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "jupyter": { + "is_executing": true + } + }, 
"outputs": [], "source": [ "%store -r bkg_df\n", @@ -91,19 +95,7 @@ "metadata": {}, "outputs": [], "source": [ - "obs = zfit.Space('Jpsi_M', limits=(2.8, 3.5)) # defining the observable" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# bkg = zfit.Data.from_pandas(bkg_df['Jpsi_M'], obs=obs)\n", - "# OR\n", - "# obs_bkg = zfit.Space('Jpsi_M', limits=(2.8, 3.0)) + zfit.Space('Jpsi_M', limits=(3.2, 3.5))\n", - "# bkg_two = zfit.Data.from_pandas(data_df['Jpsi_M'], obs=obs_bkg)" + "obs = zfit.Space('Jpsi_M', 2.8, 3.5, label='$J/\\\\psi$ mass [GeV]') # defining the observable" ] }, { @@ -120,11 +112,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Difference of the two spaces\n", - "\n", - "While the first space is defined over the whole space from 2.8 to 3.5, the second consists of two distinct regions. Therefore we can use the original space and zfit applies the cut, the same as we did before to the `bkg_df`.\n", + "## Creating the model\n", "\n", - "The difference comes when using the normalization in the PDF: we can either normalize it over the whole range or only over part of it." + "In the following, we create an extended background and signal model and combine it.\n", + "zfit takes care of correctly normalizing and adding the PDFs and the yields." 
] }, { @@ -133,12 +124,12 @@ "metadata": {}, "outputs": [], "source": [ - "lambd = zfit.Parameter('lambda', -0.1, -2, 2)\n", - "bkg_yield = zfit.Parameter('bkg_yield', 5000, 0, 200000, step_size=1)\n", + "lambd = zfit.Parameter('lambda', -0.1, -2, 2, label=r\"$\\lambda$\") # label is optional\n", + "bkg_yield = zfit.Parameter('bkg_yield', 5000, 0, 200000, step_size=1, label=\"Bkg yield\")\n", "\n", "mu = zfit.Parameter('mu', 3.1, 2.9, 3.3)\n", "sigma = zfit.Parameter('sigma', 0.1, 0, 0.5)\n", - "sig_yield = zfit.Parameter('sig_yield', 200, 0, 10000, step_size=1)" + "sig_yield = zfit.Parameter('sig_yield', 200, 0, 10000, step_size=1, label=\"Signal yield\")" ] }, { @@ -147,8 +138,7 @@ "metadata": {}, "outputs": [], "source": [ - "bkg_pdf = zfit.pdf.Exponential(lambd, obs=obs)\n", - "bkg_pdf.set_yield(bkg_yield)" + "bkg_pdf = zfit.pdf.Exponential(lambd, obs=obs, extended=bkg_yield)" ] }, { @@ -157,8 +147,7 @@ "metadata": {}, "outputs": [], "source": [ - "sig_pdf = zfit.pdf.Gauss(obs=obs, mu=mu, sigma=sigma)\n", - "sig_pdf.set_yield(sig_yield)" + "sig_pdf = zfit.pdf.Gauss(obs=obs, mu=mu, sigma=sigma, extended=sig_yield)" ] }, { @@ -190,15 +179,15 @@ " if ax is None:\n", " ax = plt.gca()\n", "\n", - " lower, upper = data.space.limit1d\n", + " lower, upper = data.space.v1.limits\n", "\n", " # Creates and histogram of the data and plots it with mplhep.\n", - " counts, bin_edges = np.histogram(data.unstack_x(), bins=nbins)\n", - " mplhep.histplot(counts, bins=bin_edges, histtype=\"errorbar\", yerr=True,\n", + " binneddata = data.to_binned(nbins)\n", + " mplhep.histplot(binneddata, histtype=\"errorbar\", yerr=True,\n", " label=\"Data\", ax=ax, color=\"black\")\n", "\n", - " binwidth = np.diff(bin_edges)[0]\n", - " x = np.linspace(lower, upper, num=1000) # or tf.linspace\n", + " binwidth = binneddata.space.binning[0].widths[0] # all have the same width\n", + " x = np.linspace(lower, upper, num=1000)\n", "\n", " # Line plots of the total pdf and the sub-pdfs.\n", " y = 
model.ext_pdf(x) * binwidth\n", @@ -207,8 +196,8 @@ " ym = m.ext_pdf(x) * binwidth\n", " ax.plot(x, ym, label=l, color=c)\n", "\n", - " plt.xlabel('$J/\\\\psi$ mass [GeV]')\n", - " ax.set_title(data.data_range.obs[0])\n", + " plt.xlabel(data.space.labels[0])\n", + " ax.set_title(data.space.labels[0])\n", " ax.set_xlim(lower, upper)\n", " ax.legend(fontsize=15)\n", "\n", @@ -255,7 +244,7 @@ "metadata": {}, "outputs": [], "source": [ - "minimizer = zfit.minimize.Minuit()\n", + "minimizer = zfit.minimize.Minuit(gradient=\"zfit\") # to use the analytic gradient. Set it to true will use the iminuit one\n", "# minimizer = zfit.minimize.NLoptLBFGSV1() # can be changed but maybe not as powerful as iminuit\n", "# minimizer = zfit.minimize.ScipySLSQPV1()" ] @@ -393,22 +382,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" + "nbconvert_exporter": "python" } }, "nbformat": 4, diff --git a/advanced-python/60sPlot.ipynb b/advanced-python/60sPlot.ipynb index cfac4125..37337fda 100644 --- a/advanced-python/60sPlot.ipynb +++ b/advanced-python/60sPlot.ipynb @@ -880,22 +880,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" + "nbconvert_exporter": "python" } }, "nbformat": 4, diff --git a/advanced-python/70ScikitHEPUniverse.ipynb b/advanced-python/70ScikitHEPUniverse.ipynb index 98710b73..a146ec27 100644 --- 
a/advanced-python/70ScikitHEPUniverse.ipynb +++ b/advanced-python/70ScikitHEPUniverse.ipynb @@ -272,22 +272,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" + "nbconvert_exporter": "python" } }, "nbformat": 4, diff --git a/advanced-python/README.md b/advanced-python/README.md index 2dde2d2d..e56dac86 100644 --- a/advanced-python/README.md +++ b/advanced-python/README.md @@ -4,6 +4,8 @@ Welcome to the advanced Python tutorials of the starterkit. This lecture covers and the notebooks available may fill more than the scheduled lesson. However, they also serve as a knowledge base that one can always come back to lock up things. +[//]: # (33ModelTuning.ipynb was commented out below, fails, would need to be fixed) + ```eval_rst .. toctree:: :maxdepth: 3 @@ -17,7 +19,6 @@ a knowledge base that one can always come back to lock up things. 
30Classification.ipynb 31ClassificationExtension.ipynb 32BoostingToUniformity.ipynb - 33ModelTuning.ipynb 40Histograms.ipynb 45DemoReweighting.ipynb 50LikelihoodInference.ipynb diff --git a/environment.yml b/environment.yml index a10aefbc..3ed6c4f8 100644 --- a/environment.yml +++ b/environment.yml @@ -1,7 +1,9 @@ name: analysis-essentials channels: - conda-forge + - nodefaults dependencies: + - python ~=3.11.0 - boost-histogram - hep_ml - hepunits @@ -11,23 +13,26 @@ dependencies: - mplhep - nb_conda - nb_conda_kernels - - notebook<7.0.0 # fixes failed nb_conda install https://github.com/DeepLabCut/DeepLabCut/issues/2322 + - notebook # <7.0.0 # fixes failed nb_conda install https://github.com/DeepLabCut/DeepLabCut/issues/2322 - numpy - pandas - particle - pandoc - pip + - uv - scikit-learn - scipy - - uproot <5.0.0 # 5.0.0 breaks the httpsource argument with open, TODO upgrade (what's the equivalent?) in the "get_truth" function - - uproot3 + - uproot >=5.0.0 # 5.0.0 breaks the httpsource argument with open, TODO upgrade (what's the equivalent?) 
in the "get_truth" function + - aiohttp # needed for uproot http access + - requests # needed for uproot http access - vector - wget -# - xgboost - - zfit >=0.14.0 + # - xgboost - hepstats - pip: - - git+https://github.com/hsf-training/python-lesson.git - - formulate - - starterkit-ci - - xgboost + - zfit >=0.24.0 # to have the newest version, TensorFlow is a bit stuck: https://github.com/conda-forge/tensorflow-feedstock/pull/408 + - zfit-physics >=0.7.0 + - git+https://github.com/hsf-training/python-lesson.git + - formulate + - starterkit-ci + - xgboost diff --git a/git/fig/git-freshly-made-gitlab-repo.png b/git/fig/git-freshly-made-gitlab-repo.png index 41064276..0ed30de8 100644 Binary files a/git/fig/git-freshly-made-gitlab-repo.png and b/git/fig/git-freshly-made-gitlab-repo.png differ diff --git a/git/fig/github-add-collaborators.png b/git/fig/github-add-collaborators.png index 96922cc7..29faa14b 100644 Binary files a/git/fig/github-add-collaborators.png and b/git/fig/github-add-collaborators.png differ diff --git a/git/fig/github-change-repo-string.png b/git/fig/github-change-repo-string.png index 29720991..f8072d27 100644 Binary files a/git/fig/github-change-repo-string.png and b/git/fig/github-change-repo-string.png differ diff --git a/git/fig/github-create-repo-01.png b/git/fig/github-create-repo-01.png index 6dc6bf21..804d0d80 100644 Binary files a/git/fig/github-create-repo-01.png and b/git/fig/github-create-repo-01.png differ diff --git a/git/fig/github-create-repo-02.png b/git/fig/github-create-repo-02.png index d0c006f9..2da6a07b 100644 Binary files a/git/fig/github-create-repo-02.png and b/git/fig/github-create-repo-02.png differ diff --git a/git/fig/github-create-repo-03.png b/git/fig/github-create-repo-03.png index ea56feb0..16577fd9 100644 Binary files a/git/fig/github-create-repo-03.png and b/git/fig/github-create-repo-03.png differ diff --git a/git/fig/github-find-repo-string.png b/git/fig/github-find-repo-string.png index 00276599..b0144b2c 
100644 Binary files a/git/fig/github-find-repo-string.png and b/git/fig/github-find-repo-string.png differ diff --git a/git/fig/gitlab-add-collaborators.png b/git/fig/gitlab-add-collaborators.png index 39bec341..d436913c 100644 Binary files a/git/fig/gitlab-add-collaborators.png and b/git/fig/gitlab-add-collaborators.png differ diff --git a/git/fig/gitlab-ci-artefacts.png b/git/fig/gitlab-ci-artefacts.png index 493b63d7..de15a97c 100644 Binary files a/git/fig/gitlab-ci-artefacts.png and b/git/fig/gitlab-ci-artefacts.png differ diff --git a/git/fig/gitlab-ci-first-log.png b/git/fig/gitlab-ci-first-log.png index 3a5162cd..4804bf4c 100644 Binary files a/git/fig/gitlab-ci-first-log.png and b/git/fig/gitlab-ci-first-log.png differ diff --git a/git/fig/gitlab-ci-pipeline-link.png b/git/fig/gitlab-ci-pipeline-link.png index e05c3de9..2d3bf3f9 100644 Binary files a/git/fig/gitlab-ci-pipeline-link.png and b/git/fig/gitlab-ci-pipeline-link.png differ diff --git a/git/fig/gitlab-ci-view-log.png b/git/fig/gitlab-ci-view-log.png index 6365d567..e9029c28 100644 Binary files a/git/fig/gitlab-ci-view-log.png and b/git/fig/gitlab-ci-view-log.png differ diff --git a/git/fig/gitlab-ci-view-pipeline.png b/git/fig/gitlab-ci-view-pipeline.png index 6200cab5..b10365e5 100644 Binary files a/git/fig/gitlab-ci-view-pipeline.png and b/git/fig/gitlab-ci-view-pipeline.png differ diff --git a/git/fig/gitlab-create-repo-01.png b/git/fig/gitlab-create-repo-01.png index e53818d2..5a24d3e8 100644 Binary files a/git/fig/gitlab-create-repo-01.png and b/git/fig/gitlab-create-repo-01.png differ diff --git a/git/fig/gitlab-create-repo-02.png b/git/fig/gitlab-create-repo-02.png index 31db79ed..b60b525a 100644 Binary files a/git/fig/gitlab-create-repo-02.png and b/git/fig/gitlab-create-repo-02.png differ diff --git a/git/fig/gitlab-create-repo-03.png b/git/fig/gitlab-create-repo-03.png index 55eca77d..b47c3eef 100644 Binary files a/git/fig/gitlab-create-repo-03.png and b/git/fig/gitlab-create-repo-03.png 
differ diff --git a/git/fig/gitlab-find-repo-string.png b/git/fig/gitlab-find-repo-string.png index c990b425..06bec1f4 100644 Binary files a/git/fig/gitlab-find-repo-string.png and b/git/fig/gitlab-find-repo-string.png differ diff --git a/git/fig/gitlab-pr-close.png b/git/fig/gitlab-pr-close.png index 0366f465..6520fbf6 100644 Binary files a/git/fig/gitlab-pr-close.png and b/git/fig/gitlab-pr-close.png differ diff --git a/git/fig/gitlab-pr-commits.png b/git/fig/gitlab-pr-commits.png index 44acc14d..e0ee89a1 100644 Binary files a/git/fig/gitlab-pr-commits.png and b/git/fig/gitlab-pr-commits.png differ diff --git a/git/fig/gitlab-pr-diff.png b/git/fig/gitlab-pr-diff.png index 9a7378ff..e01e9802 100644 Binary files a/git/fig/gitlab-pr-diff.png and b/git/fig/gitlab-pr-diff.png differ diff --git a/git/fig/gitlab-pr-discussion.png b/git/fig/gitlab-pr-discussion.png index ef8d5fdf..44d4c265 100644 Binary files a/git/fig/gitlab-pr-discussion.png and b/git/fig/gitlab-pr-discussion.png differ diff --git a/git/fig/gitlab-pr-fork.png b/git/fig/gitlab-pr-fork.png index c3e61773..8dfb56c6 100644 Binary files a/git/fig/gitlab-pr-fork.png and b/git/fig/gitlab-pr-fork.png differ diff --git a/git/fig/gitlab-pr-merge.png b/git/fig/gitlab-pr-merge.png index 97c8d3bb..a879a065 100644 Binary files a/git/fig/gitlab-pr-merge.png and b/git/fig/gitlab-pr-merge.png differ diff --git a/git/fig/gitlab-pr-newmergerequest.png b/git/fig/gitlab-pr-newmergerequest.png index 587a33f6..af897372 100644 Binary files a/git/fig/gitlab-pr-newmergerequest.png and b/git/fig/gitlab-pr-newmergerequest.png differ diff --git a/git/fig/gitlab-pr-selectbranch.png b/git/fig/gitlab-pr-selectbranch.png index 0af33f43..683b9355 100644 Binary files a/git/fig/gitlab-pr-selectbranch.png and b/git/fig/gitlab-pr-selectbranch.png differ diff --git a/git/fig/gitlab-pr-title.png b/git/fig/gitlab-pr-title.png index 3d28fc92..e1b26240 100644 Binary files a/git/fig/gitlab-pr-title.png and b/git/fig/gitlab-pr-title.png differ 
diff --git a/git/fig/gitlab-pr-wheretofork.png b/git/fig/gitlab-pr-wheretofork.png index ca5e2581..6d5c7014 100644 Binary files a/git/fig/gitlab-pr-wheretofork.png and b/git/fig/gitlab-pr-wheretofork.png differ diff --git a/git/fig/phd101212s.png b/git/fig/phd101212s.png index c47c428b..535a1a0b 100644 Binary files a/git/fig/phd101212s.png and b/git/fig/phd101212s.png differ diff --git a/hsf_logo_angled.png b/hsf_logo_angled.png index 8c1dbc78..b44dc09b 100644 Binary files a/hsf_logo_angled.png and b/hsf_logo_angled.png differ diff --git a/python/01basics.ipynb b/python/01basics.ipynb index 451812bd..5c158344 100644 --- a/python/01basics.ipynb +++ b/python/01basics.ipynb @@ -3,9 +3,11 @@ { "cell_type": "markdown", "metadata": { + "editable": true, "slideshow": { "slide_type": "slide" - } + }, + "tags": [] }, "source": [ "# 1: Basics" @@ -54,6 +56,9 @@ "cell_type": "code", "execution_count": null, "metadata": { + "jupyter": { + "is_executing": true + }, "slideshow": { "slide_type": "subslide" } @@ -127,6 +132,9 @@ "cell_type": "code", "execution_count": null, "metadata": { + "jupyter": { + "is_executing": true + }, "slideshow": { "slide_type": "subslide" } @@ -139,7 +147,7 @@ "\n", "# several ways for strings\n", "c = \"hello\"\n", - "d = 'world'\n", + "d = \"world\"\n", "cd = \"welcome to this 'world' here\" # we can now use '' inside (or vice versa)\n", "e = \"\"\"hello world\"\"\" # which we can also wrap\n", "e2 = \"\"\"hello\n", @@ -153,6 +161,9 @@ "cell_type": "code", "execution_count": null, "metadata": { + "jupyter": { + "is_executing": true + }, "slideshow": { "slide_type": "subslide" } @@ -551,7 +562,12 @@ }, "outputs": [], "source": [ - "person = {'name': \"Jonas Eschle\", 'age': 42, 5: True, 11: \"hi\"} # we can use strings but also other elements\n", + "person = {\n", + " \"name\": \"Jonas Eschle\",\n", + " \"age\": 42,\n", + " 5: True,\n", + " 11: \"hi\",\n", + "} # we can use strings but also other elements\n", "print(person)" ] }, @@ -565,7 +581,7 @@ 
}, "outputs": [], "source": [ - "print(person['name'])\n", + "print(person[\"name\"])\n", "print(person[5])\n", "print(person[11])" ] @@ -591,7 +607,7 @@ }, "outputs": [], "source": [ - "person['age'] = '42.00001'" + "person[\"age\"] = \"42.00001\"" ] }, { @@ -615,7 +631,7 @@ }, "outputs": [], "source": [ - "person['alias'] = \"Mayou36\"\n", + "person[\"alias\"] = \"Mayou36\"\n", "print(person)" ] }, @@ -645,7 +661,7 @@ }, "outputs": [], "source": [ - "person['nationality']" + "person[\"nationality\"]" ] }, { @@ -669,7 +685,9 @@ }, "outputs": [], "source": [ - "hair_color = person.get('hair_color', 'unknown color') # the second argument gets returned if key is not in dict\n", + "hair_color = person.get(\n", + " \"hair_color\", \"unknown color\"\n", + ") # the second argument gets returned if key is not in dict\n", "print(hair_color)" ] }, @@ -1013,8 +1031,8 @@ }, "outputs": [], "source": [ - "a = 'spam'\n", - "list_a = [1, 5, 2, 'world', 1]\n", + "a = \"spam\"\n", + "list_a = [1, 5, 2, \"world\", 1]\n", "print(a, b)\n", "print(list_a, list_b)" ] @@ -1056,7 +1074,7 @@ }, "outputs": [], "source": [ - "list_a[1] = 'hello'\n", + "list_a[1] = \"hello\"\n", "print(list_a, list_b)" ] }, @@ -1096,7 +1114,7 @@ }, "outputs": [], "source": [ - "list_a[2] = 'my'\n", + "list_a[2] = \"my\"\n", "print(list_a)\n", "print(list_b)\n", "print(list_c)" @@ -1141,7 +1159,7 @@ }, "outputs": [], "source": [ - "list_c[2] = 'my' # make it the same as the other lists\n", + "list_c[2] = \"my\" # make it the same as the other lists\n", "print(list_a == list_c)" ] }, @@ -1220,7 +1238,7 @@ "list_of_squares = [i**2 for i in range(N)]\n", "sum_of_squares = sum(list_of_squares)\n", "\n", - "print('Sum of squares for', N, 'is', sum_of_squares)" + "print(\"Sum of squares for\", N, \"is\", sum_of_squares)" ] }, { @@ -1257,6 +1275,9 @@ "cell_type": "code", "execution_count": null, "metadata": { + "jupyter": { + "is_executing": true + }, "slideshow": { "slide_type": "subslide" } @@ -1278,7 +1299,7 @@ 
"outputs": [], "source": [ "N = 5\n", - "print('The square of', N, 'is', squares[N])" + "print(\"The square of\", N, \"is\", squares[N])" ] }, { @@ -1366,29 +1387,21 @@ "eta_low = 2\n", "eta_high = 5\n", "\n", - "cut_string = f'(PT > {pt_cut:.2f}) & ({eta_low} < ETA < {eta_high})'\n", + "cut_string = f\"(PT > {pt_cut:.2f}) & ({eta_low} < ETA < {eta_high})\"\n", "print(cut_string)" ] } ], "metadata": { "celltoolbar": "Tags", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "nbconvert_exporter": "python" }, "nbsphinx": { "execute": "auto" diff --git a/python/README.md b/python/README.md index 3d5f98ff..e8c22269 100644 --- a/python/README.md +++ b/python/README.md @@ -47,11 +47,11 @@ Sounds good? Then let’s get going! 
00scripts.ipynb 01basics.ipynb operators.md - numbers.md - strings.md - lists.md - dictionaries.md - conditions.md + numbers.ipynb + strings.ipynb + lists.ipynb + dictionaries.ipynb + conditions.ipynb methods.md scripting.md modules.md diff --git a/python/classes.ipynb b/python/classes.ipynb index 4fe59ea0..6c660277 100644 --- a/python/classes.ipynb +++ b/python/classes.ipynb @@ -484,22 +484,14 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { - "name": "ipython", - "version": 3 + "name": "ipython" }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "nbconvert_exporter": "python" } }, "nbformat": 4, diff --git a/python/conditions.ipynb b/python/conditions.ipynb new file mode 100644 index 00000000..3223d4cc --- /dev/null +++ b/python/conditions.ipynb @@ -0,0 +1,966 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conditions\n", + "\n", + "Sometimes, often while looping, you only want to do things depending on\n", + "something’s value. Specifying _conditions_ like this is pretty simple in Python.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pizzas = [\"Pineapple\", \"Cheese\", \"Pepperoni\", \"Hot dog\"]\n", + "for p in pizzas:\n", + " if p == \"Cheese\":\n", + " print(\"Nice pizza!\")\n", + " elif p == \"Pepperoni\":\n", + " print(\"Amazing pizza!\")\n", + " else:\n", + " print(\"Weird pizza.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Like the \"body\" of the `for` loop, called a _block_, the block in the `if`,\n", + "`elif`, and `else` statements must be indented. The convention we adopt is to\n", + "use four spaces for indentation.\n", + "\n", + "The `if` statement starts with `if` (duh!) 
and what follows is a _condition_.\n", + "If this condition isn’t met, the next `elif` (for \"else-if\") condition is\n", + "evaluated. If this also isn’t met, the `else` block is run. You can use as many\n", + "`elif` conditions as you like, or none at all, and the `else` block is optional.\n", + "\n", + "\n", + "Python evaluates a condition and sees whether it is truth-like or not. If it is\n", + "truth-like, the code in the block is run.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if pizzas[0] == \"Cheese\":\n", + " print(\"It is cheese, my dudes.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pizzas[0] == \"Cheese\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pizzas[1] == \"Cheese\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`False` and `True` are built-in constants. They\n", + "correspond to the possible values a boolean variable can have.\n", + "\n", + "The result of a comparison is `True` or `False`, and we can perform comparisons\n", + "using several operators, like `==` for equality, `!=` for inequality, `>` and\n", + "`<` for relative magnitude, and so on."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "True is False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "True is not False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "1 > 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(1 > 2) is False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This shows that we can combine comparison operators, just like with `+` and\n", + "friends. We can also use `and` to require multiple conditions, `or` to require\n", + "at least one, and `not` to negate a result.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x = 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "1 < x and x < 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "3 < x or 1 < x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "not 1 < x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that `and`, `or` and `not` have lower precedence than `>`, `<` and `==`, but\n", + "you can (and in general should) use parentheses to be more explicit.\n", + "\n", + "**Exercise**\n", + "\n", + "Play around with the booleans and try to answer the following! Can you\n", + " - use double negations\n", + " - convert a boolean to an integer\n", + " - use parentheses to create a more complicated expression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Of course, we can compare everything we have played around with so far."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x = [1, 2]\n", + "y = [1, 2]\n", + "z = {\"hero\": \"thor\"}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x == y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y == z" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For collection objects like lists, tuples, and dictionaries, we can easily ask\n", + "them if they contain something in particular using the `in` operator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "3 in x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "2 in y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"thor\" in z" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The last statement is `False` because `in` queries a dictionary’s _keys_, not \n", + "its values.
This is useful if you want to access a key in a dictionary that \n", + "might not exist:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "z[\"pizza\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "if \"pizza\" in z:\n", + " print(\"We have pizza\", z[\"pizza\"])\n", + "else:\n", + " print(\"No pizza :(\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Note that `in` doesn’t dive into nested collections, but only looks at the top\n", + "level." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "1 in [[1, 2], [3, 4]] # the elements are two lists" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "[1, 2] in [[1, 2], [3, 4]]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "\n", + "**Advanced (skip on first read)**\n", + "\n", + "Find the double-underscore method on lists and dictionaries that corresponds to\n", + "the `in` operator, and check that it does the same thing as the operator.\n", + "\n", + "**Solutions**\n", + "\n", + "Taking lists as an example, the built-in `dir` function can tell us what methods are\n", + "available. The `__contains__` method sounds promising."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = [1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x.__contains__(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x.__contains__(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Strings work a lot like lists, which makes sense because they are effectively\n", + "collections of single characters. This means we can also query string contents\n", + "with `in`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "fact = \"The best hero is Thor.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "\"Thor\" in fact" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "\"Iron Man\" in fact" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Truthiness\n", + "\n", + "It’s conventional not to explicitly compare a condition to `True`, because the\n", + "`if` statement already does that for us." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "if (\"Pineapple\" in pizzas) is True:\n", + " print(\"Weird.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "if \"Pineapple\" in pizzas:\n", + " print(\"Not weird.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Likewise, rather than comparing for False, we just use `not`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "if (\"Pineapple\" in pizzas) is False:\n", + " print(\"Weird.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "if \"Pineapple\" not in pizzas:\n", + " print(\"Not weird.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "\"Pineapple\" not in pizzas # not ('Pineapple' in pizzas)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "\"Pineapple\" not in pizzas" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The last two lines show that we can use `not in` for checking that something\n", + "_is not_ in a collection. This reads more naturally." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "All Python objects are truth-like unless they are the value `False`, the value\n", + "`None`, numeric zeros (such as `0` and `0.0`), or empty collections (such as `\"\"`, `[]`, `()`, `{}`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "if list() or dict() or tuple() or \"\":\n", + " print(\"You won’t see me!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The value `None`, which is available as the variable named `None`, is often\n", + "used as a placeholder for an empty value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "favourite = None\n", + "for p in pizzas:\n", + " if \"Olives\" in p:\n", + " favourite = p" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "if favourite:\n", + " print(f\"Found favourite: {favourite}\")\n", + "else:\n", + " print(\"No favourite :(\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "`None` behaves as a false-y value in conditions." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Conditions in loops\n", + "\n", + "`for` loops and comprehensions are the most common ways of iterating in Python.\n", + "We’ve already seen that using conditions in these can be useful."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "not_cheesy = [p for p in pizzas if \"cheese\" not in p.lower()]\n", + "not_cheesy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Another way of iterating is with `while`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "i = 5\n", + "while i > 0:\n", + " print(f\"T-minus {i} seconds\")\n", + " # Equivalent to `i = i - 1`\n", + " i -= 1\n", + "print(\"Blast off!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The `while` loop checks the condition, runs the block, and then re-checks the\n", + "condition. If we don’t do something in the loop to change the result of the\n", + "condition, we will end up looping forever!\n", + "\n", + "You can uncomment and run the cell below; you will then need to stop the kernel manually with the stop button."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Sometimes you want to stop iterating when some condition is met. You could\n", + "achieve this with a `while` loop." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ok = False\n", + "i = 0\n", + "while not ok:\n", + " ok = \"cheese\" in pizzas[i].lower()\n", + " # Equivalent to `i = i + 1`\n", + " i += 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "i" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "pizzas[i - 1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "It is not nice to have to keep track of these `ok` and `i` variables. Instead,\n", + "we can use a `for` loop, which feels much more natural when iterating over a\n", + "collection, and `break` to stop looping (we can also use a break with a `while` loop)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for pizza in pizzas:\n", + " if \"cheese\" in pizza.lower():\n", + " yum = pizza\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "yum" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython" + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/conditions.md b/python/conditions.md deleted file mode 100644 index 68f8686b..00000000 --- a/python/conditions.md +++ /dev/null @@ -1,319 +0,0 @@ -# Conditions - -Sometimes, often while looping, you only want to do things depending on -something’s value. Specifying _conditions_ like this is pretty simple in Python. - -```python ->>> pizzas = ['Pineapple', 'Cheese', 'Pepperoni', 'Hot dog'] ->>> for p in pizzas: -... if p == 'Cheese': -... print('Nice pizza!') -... elif p == 'Pepperoni': -... print('Amazing pizza!') -... else: -... print('Weird pizza.') -... -Weird pizza. -Nice pizza! -Amazing pizza! -Weird pizza. -``` - -Like the "body" of the `for` loop, called a _block_, the block in the `if`, -`elif`, and `else` statements must be indented. The convention we adopt is to -use four spaces for indentation. - -The `if` statement starts with `if` (duh!) 
and what follows is a _condition_. -If this condition isn’t met, the next `elif` (for "else-if") condition is -evaluated. If this also isn’t met, the `else` block is run. You can use as many -`elif` conditions as you like, or none at all, and the `else` block is optional. - -{% callout "Ternary conditional operator" %} - -You can use a succinct one-line syntax for conditional assignments like this: - -```python ->>> x = 'ok' if pizzas[0] == 'Cheese' else 'not ok' ->>> x -'not ok' -``` - -Make sure your line does not get too long in order not to impair its -readability! - -{% endcallout %} - - -Python evaluates a condition and sees whether it is truth-like or not. If it is -truth-like, the code in the block is run. - -```python ->>> if pizzas[0] == 'Cheese': -... print('It is cheese, my dudes.') -... ->>> pizzas[0] == 'Cheese' -False ->>> pizzas[1] == 'Cheese' -True -``` - -`False` and `True` are variables and they can actually be reassigned to some -other value (though it is quite pointless and dangerous to do that!) They -correspond to the possible values a boolean variable can have. Here is why you -should never touch those variables: - -```python ->>> True = 1 ->>> True -1 ->>> True = False ->>> True == False -True -``` - -The result of a comparison is `True` or `False`, and we can perform comparisons -using several operators, like `==` for equality, `!=` for inequality, `>` and -`<` for relative magnitude, and so on. - -```python ->>> True, False -(True, False) ->>> False -False ->>> True == False -False ->>> True != False -True ->>> 1 > 2 -False ->>> (1 > 2) == False -True -``` - -This shows that we can combine comparison operators, just like with `+` and -friends. We can also use `and` to require multiple conditions, `or` to require -at least one, and `not` to negate a result. 
- -```python -x = 2 ->>> 1 < x and x < 3 ->>> 3 < x or 1 < x -True ->>> not 1 < x -False -``` - -Note that `and`, `or` and `not` have lower precedence than `>`, `<` and `==`, but -you can use parentheses to be more explicit. - -Of course, we can compare everything we have played around with so far. - -```python ->>> x = [1, 2] ->>> y = [1, 2] ->>> z = {'hero': 'thor'} ->>> x == y -True ->>> y == z -False -``` - -For collection objects like lists, tuples, and dictionaries, we can easily ask -them if they contain something in particular using the `in` operator. - -```python ->>> 3 in x -False ->>> 2 in y -True ->>> 'thor' in z -False -``` - -The last statement is `False` because `in` queries a dictionaries _keys_, not -its values. This is useful if you want to access a key in a dictionary that -might not exist: - -```python ->>> z['pizza'] -Traceback (most recent call last): - File "", line 1, in -NameError: name 'z' is not defined ->>> if 'pizza' in z: -... print('We have pizza', z['pizza']) -... else -... print('No pizza :(') -``` - -Note that `in` doesn’t dive into nested collections, but only looks at the top -level. - -```python ->>> 1 in [[1, 2], [3, 4]] -False ->>> [1, 2] in [[1, 2], [3, 4]] -True -``` - -{% challenge "The `in` operator" %} - -Find the double-underscore method on lists and dictionaries that corresponds to -the `in` operator, and check that it does the same thing as the operator. - -{% solution "Solution" %} - -Taking lists as an example, the `dir` method can tell us what methods are -available. The `__contains__` method sounds promising. - -```python ->>> x= [1] ->>> x.__contains__(1) -True ->>> x.__contains__(2) -False -``` - -{% endsolution %} - -{% endchallenge %} - -Strings work a lot like lists, which makes sense because they are effectively -collections of single characters. This means we can also query string contents -with `in`. - -```python ->>> fact = 'The best hero is Thor.' 
->>> 'Thor' in fact -True ->>> 'Iron Man' in fact -False -``` - -## Truthiness - -It’s conventional not to explicitly compare a condition to `True`, because the -`if` statement already does that for us. - -```python ->>> if ('Pineapple' in pizzas) == True: -... print('Weird.') -... ->>> if 'Pineapple' in pizzas: -... print('Not weird.') -... -``` - -Likewise, rather than comparing for False, we just use `not`. - -```python ->>> if ('Pineapple' in pizzas) == False: -... print ('Weird.') -... ->>> if not 'Pineapple' in pizzas: -... print ('Not weird.') -... ->>> not 'Pineapple' in pizzas: ->>> 'Pineapple' not in pizzas: -``` - -The last two lines show that we can use `not in` for checking that something -_is not_ in a collection. This reads more naturally. - -All Python objects are truth-like unless they are the value `False`, the value -`None`, or are empty collections (such as `""`, `[]`, `()`, `{}`). - -```python ->>> if list() or dict() or tuple() or str(): -... print ("You won’t see me!") -``` - -The value `None`, which is available as the variable named `None`, is often -used as placeholder for an empty value. - -```python ->>> favourite = None ->>> for p in pizzas: -... if 'Olives' in p: -... favourite = p -... ->>> if favourite: -... print ('Found favourite: {0}'.format(favourite)) -... else: -... print ('No favourite :(') -No favourite :( -``` - -It behaves as false-y value in conditions. - -## Conditions in loops - -`for` loops and comprehensions are the most common ways of iterating in Python. -We’ve already seen that using conditions in these can be useful. - -```python ->>> not_cheesy = [p for p in pizzas if 'cheese' not in p.lower()] ->>> not_cheesy -['Pineapple', 'Pepperoni', 'Hot dog'] -``` - -Another way of iterating is with `while`. - -```python ->>> i = 5 ->>> while i > 0: -... print 'T-minus {0} seconds'.format(i) -... # Equivalent to `i = i - 1` -... i -= 1 -... 
print ('Blast off!') -T-minus 5 seconds -T-minus 4 seconds -T-minus 3 seconds -T-minus 2 seconds -T-minus 1 seconds -Blast off! -``` - -The `while` loop checks the condition, runs the block, and then re-checks the -condition. If we don’t do something in the loop to change the result of the -condition, we will end up looping forever! - -```python ->>> i = 5 ->>> while i > 0: -... print('All work and no play makes Jack a dull boy') -``` - -Because we do not change the value of `i` in the loop, the condition always -evaluates to `True`, so we’re stuck. You can stop Python running the code by -typing the `Ctrl-c` key combination. - -Sometimes you want to stop iterating when some condition is met. You could -achieve this with a `while` loop. - -```python ->>> ok = False ->>> i = 0 ->>> while not ok: -... ok = 'cheese' in pizza[i].lower() -... # Equivalent to `i = i + 1` -... i += 1 -... ->>> i -2 ->>> pizza[i - 1] -'Cheese' -``` - -It is not nice to have to keep track of these `ok` and `i` variables. Instead, -we can use a `for` loop, which feels much more natural when iterating over a -collection, and `break` to stop looping. - -```python ->>> for pizza in pizzas: -... if 'cheese' in pizza.lower(): -... yum = pizza -... break -... ->>> yum -'Cheese' -``` diff --git a/python/dictionaries.ipynb b/python/dictionaries.ipynb new file mode 100644 index 00000000..45e63655 --- /dev/null +++ b/python/dictionaries.ipynb @@ -0,0 +1,606 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Dictionaries\n", + "\n", + "You can think of lists as a _mapping_ from indices to values. 
The indices are\n", + "always integers and go from `0` to `len(the_list) - 1`, and the values are the\n", + "items.\n", + "\n", + "Dictionaries are collections, just like lists, but they have important\n", + "differences:\n", + "\n", + "* lists map sequential numeric indices to items, whereas dictionaries can map\n", + " most object types to any object,\n", + "* lists are _ordered_ collections of items, whereas dictionaries have no\n", + " ordering.\n", + "\n", + "Since anything can be used as index for an item, indices must be always\n", + "specified when creating a dictionary:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import math" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "d = {1: 0.5, 'excellent index': math.sin, 0.1: 2}\n", + "d[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "d['excellent index']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "d[0.1] = 3" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The \"indices\" of a dictionary are called **keys**, and the things they map to\n", + "are **values**. Together, each key-value pair is an **item**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "d.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "d.values()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "d" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "As you can see, the values of a dictionary can be whatever we like, and need not\n", + "be the same type of object.\n", + "\n", + "You can see that the order of the keys, values and items we get back are not the\n", + "same as the order we created the dictionary with. If you run the same example on\n", + "your own you might get a different ordering. This is what we mean when we define\n", + "dictionaries as _unordered collections_: when you iterate over its content, you\n", + "cannot rely on the ordering.\n", + "\n", + "It is however guaranteed that the _n_-th item returned by `keys()` corresponds\n", + "to the _n_-th item returned by `values()`. 
This allows the following example\n", + "to work flawlessly:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for key, value in zip(d.keys(), d.values()):\n", + " print(key, ':', value)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Of course, this could be considerably simpler just by using `items()`, which\n", + "gives us _tuples of key-value pairs_." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for key, value in d.items():\n", + " print(key, ':', value)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We can create dictionaries from lists of 2-item lists." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dict(enumerate(['a thing', 'another']))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "And also with _dictionary comprehensions_, in a similar manner to list\n", + "comprehensions, with the additional specification of the key." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "{i: i**i for i in range(5) if i != 3}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Dictionary keys\n", + "\n", + "There is no restriction on values a dictionary might hold, but there is on keys." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "numbers = [1, 4, 3]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dd = {}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "dd[numbers] = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In essence, keys must not be mutable. This includes numbers, strings, and\n", + "tuples, but not lists. This restriction is a trade-off that allows Python to\n", + "make accessing values in a dictionary by key very fast." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Advanced (skip on first read)**\n", + "Immutable data types in Python have a `__hash__()` function, you can test it\n", + "yourself:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "s = \"a string\"\n", + "s.__hash__()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "[A hashing function](https://en.wikipedia.org/wiki/Hash_function) creates an\n", + "encoded (but not unique) representation of the object as a number. When you\n", + "look up an item in a Python dictionary with `my_dict[\"my_key\"]`, what happens\n", + "internally is:\n", + "\n", + "* hash of `\"my_key\"` is calculated,\n", + "* this number is compared to every hash of every key in the dictionary, until a\n", + " match between the hashes is found,\n", + "* if two hashes match, _and_ the two objects are really identical, the\n", + " corresponding dictionary item is returned.\n", + "\n", + "Looking up numbers instead of strings or tuples is considerably faster, but\n", + "since two different strings can have the same hash, their content has to be\n", + "compared as well to really tell whether they are equal. If two hashes are\n", + "different on the other hand we are sure that the objects are different as well." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Iteration over dictionaries is over their keys." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for key in d:\n", + " print(key, ':', d[key])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We have already seen how to iterate over values (using `d.values()`) or keys\n", + "and values simultaneously (using `d.items()`)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Exercise**\n", + "Alphabet mapping\n", + "\n", + "Map each number from `0` to `25` to a letter of the alphabet with a dictionary\n", + "comprehension, starting with `0` for `a` and ending with `25` for `z`.\n", + "\n", + "You can get a string containing the letters of the alphabet like this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import string" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "string.ascii_lowercase" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "You can iterate over a string exactly like a list." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for character in string.ascii_lowercase:\n", + " print(character)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Then, create the \"reverse\" dictionary, again with a comprehension, mapping\n", + "letters to numbers." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Solution**\n", + "You need to have a list containing one number per letter, and to loop over that\n", + "list along with the characters in the string. This is exactly the same as\n", + "looping over items in a list alongside the index, so we can use `enumerate`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "alphabet_map = {i: c for i, c in enumerate(string.ascii_lowercase)}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We can create the inverse map by swapping the key and value in the\n", + "comprehension." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "reverse_map = {c: i for i, c in alphabet_map.items()}" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython" + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/dictionaries.md b/python/dictionaries.md deleted file mode 100644 index 88beb7c1..00000000 --- a/python/dictionaries.md +++ /dev/null @@ -1,214 +0,0 @@ -# Dictionaries - -You can think of lists as a _mapping_ from indices to values. The indices are -always integers and go from `0` to `len(the_list) - 1`, and the values are the -items. - -Dictionaries are collections, just like lists, but they have important -differences: - -* lists map sequential numeric indices to items, whereas dictionaries can map - most object types to any object, -* lists are _ordered_ collections of items, whereas dictionaries have no - ordering. - -Since anything can be used as index for an item, indices must be always -specified when creating a dictionary: - -```python ->>> d = {1: 0.5, 'excellent index': math.sin, 0.1: 2} ->>> d[1] -0.5 ->>> d['excellent index'] - ->>> d[0.1] = 3 -``` - -The "indices" of a dictionary are called **keys**, and the things they map to -are **values**. Together, each key-value pair is an **item**. - -```python ->>> d.keys() -dict_keys([1, 0.1, 'excellent index']) ->>> d.values() -dict_values([0.5, 2, ]) ->>> d -{0.1: 2, 1: 0.5, 'excellent index': } -``` - -As you can see, the values of a dictionary can be whatever we like, and need not -be the same type of object. - -You can see that the order of the keys, values and items we get back are not the -same as the order we created the dictionary with. 
If you run the same example on -your own you might get a different ordering. This is what we mean when we define -dictionaries as _unordered collections_: when you iterate over its content, you -cannot rely on the ordering. - -It is however guaranteed that the _n_-th item returned by `keys()` corresponds -to the _n_-th item returned by `values()`. This allows the following example -to work flawlessly: - -```python ->>> for key, value in zip(d.keys(), d.values()): -... print(key, ':', value) -... -1 : 0.5 -0.1 : 3 -excellent index : -``` - -Of course, this could be considerably simpler just by using `items()`, which -gives us _tuples of key-value pairs_. - -```python ->>> for key, value in d.items(): -... print(key, ':', value) -... -1 : 0.5 -0.1 : 3 -excellent index : -``` - -We can create dictionaries from lists of 2-item lists. - -```python ->>> dict(enumerate(['a thing', 'another'])) -{0: 'a thing', 1: 'another'} -``` - -And also with _dictionary comprehensions_, in a similar manner to list -comprehensions, with the additional specification of the key. - -```python ->>> {i: i**i for i in range(5) if i != 3} -{0: 1, 1: 1, 2: 4, 4: 256, 5: 3125} -``` - -Note that dictionary comprehensions require at least Python 2.7 to work. - - -## Dictionary keys - -There’s no restriction on values a dictionary might hold, but there is on keys. - -```python ->>> l = [1, 4, 3] ->>> dd = {} ->>> dd[l] = 0 -Traceback (most recent call last): - File "", line 1, in -TypeError: unhashable type: 'list' -``` - -In essence, keys must not be mutable. This includes numbers, strings, and -tuples, but not lists. This restriction is a trade-off that allows Python to -make accessing values in a dictionary by key very fast. 
- -{% callout "Hashing" %} - -Immutable data types in Python have a `__hash__()` function, you can test it -yourself: - -```python ->>> s = "a string" ->>> s.__hash__() --8411828025894108412 -``` - -[A hashing function](https://en.wikipedia.org/wiki/Hash_function) creates an -encoded (but not unique) representation of the object as a number. When you -look up an item in a Python dictionary with `my_dict["my_key"]`, what happens -internally is: - -* hash of `"my_key"` is calculated, -* this number is compared to every hash of every key in the dictionary, until a - match between the hashes is found, -* if two hashes match, _and_ the two objects are really identical, the - corresponding dictionary item is returned. - -Looking up numbers instead of strings or tuples is considerably faster, but -since two different strings can have the same hash, their content has to be -compared as well to really tell whether they are equal. If two hashes are -different on the other hand we are sure that the objects are different as well. - -{% endcallout %} - -Iteration over dictionaries is over their keys. - -```python ->>> for key in d: -... print key, ':' d[key] -... -1 : 0.5 -0.1 : 3 -excellent index : -``` - -We have already seen how to iterate over values (using `d.values()`) or keys -and values simultaneously (using `d.items()`). - -{% callout "On the efficiency of items()" %} - -In Python 2, using items copies the keys and values of a dictionary, and gives -you back those copies. This can be problematic for large dictionaries as the -amount of memory your program uses can double. Python 3 uses a much more -memory-efficient way of implementing items so that you don't have to worry. - -If you're having memory problems with using items in Python 2, you can use -`viewitems()` instead, which behaves the same way as items does in Python 3. 
- -Note that there are also similar methods for keys and values, called -`viewkeys()` and `viewvalues()`, and that all of these view methods are only -available from -Python 2.7. - -{% endcallout %} - -{% challenge "Alphabet mapping" %} - -Map each letter of the alphabet to a number with a dictionary comprehension, -starting with `0` for `a` and ending with `25` for `z`. - -You can get a string containing the letters of the alphabet like this: - -```python ->>> import string ->>> string.ascii_lowercase -'abcdefghijklmnopqrstuvwxyz' -``` - -You can iterate over a string exactly like a list. - -```python ->>> for character in string.ascii_lowercase: -... print character -... -a -b -... -z -``` -Then, create the "reverse" dictionary, again with a comprehension, mapping -letters to numbers. - -{% solution "Solution" %} - -You need to have a list containing one number per letter, and to loop over that -list along with the characters in the string. This is exactly the same as -looping over items in a list alongside the index, so we can use `enumerate`. - -```python ->>> alphabet_map = {i: c for i, c in enumerate(string.ascii_lowercase)} -``` - -We can create the inverse map by swapping the key and value in the -comprehension. 
- -```python ->>> reverse_map = {c: i for i, c in alphabet_map.items()} -``` - -{% endsolution %} - -{% endchallenge %} diff --git a/python/figs/B_flight_distance.png b/python/figs/B_flight_distance.png index b92304c7..eca41d0a 100644 Binary files a/python/figs/B_flight_distance.png and b/python/figs/B_flight_distance.png differ diff --git a/python/figs/B_flight_distance_v2.png b/python/figs/B_flight_distance_v2.png index 4632b297..d927291e 100644 Binary files a/python/figs/B_flight_distance_v2.png and b/python/figs/B_flight_distance_v2.png differ diff --git a/python/figs/B_flight_distance_v3.png b/python/figs/B_flight_distance_v3.png index d037bc63..b6ebb7b0 100644 Binary files a/python/figs/B_flight_distance_v3.png and b/python/figs/B_flight_distance_v3.png differ diff --git a/python/figs/B_flight_distance_with_cut_compare.png b/python/figs/B_flight_distance_with_cut_compare.png index 9c552e86..4af8e8ac 100644 Binary files a/python/figs/B_flight_distance_with_cut_compare.png and b/python/figs/B_flight_distance_with_cut_compare.png differ diff --git a/python/lists.ipynb b/python/lists.ipynb new file mode 100644 index 00000000..d25f8d7d --- /dev/null +++ b/python/lists.ipynb @@ -0,0 +1,1773 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Lists and looping\n", + "\n", + "Now things start to get _really_ interesting! Lists are collections of things \n", + "stored in a specific order. 
They can be defined literally by wrapping things in \n", + "square brackets `[]`, separating items with commas `,`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import math" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a = [4, 2, 9, 3]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Python lists can contain collections of whatever you like." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "excellent = [41, 'Hello', math.sin]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Each item in the list can be accessed by its _index_, its position in the list, \n", + "which starts at zero for the first item. Indexing by negative numbers starts \n", + "from the _last_ item of the list." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a[2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a[-1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We’ll get an error if we try to access an index that doesn’t exist:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "a[99]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Like strings, lists have a length which can be found with the built-in `len` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "len(a)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Unlike strings, lists are _mutable_, which means we can modify lists in-place, \n", + "without creating a new one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a.append(45)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "len(a)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We can see that lists are mutable because using the `append` method didn’t \n", + "print anything, and our variable `a` now has a different value.\n", + "\n", + "Because lists are mutable, we can use the special `del` keyword to remove \n", + "specific indices from the list." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "del a[-2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "*Good to know*\n", + "`del` is a language keyword representing an action, and not a function. The\n", + "syntactic difference is that functions take their arguments between parentheses,\n", + "such as `my_function(1, 2, 3)`, whereas `del` does not." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "You can retrieve sub-lists by using _slice_ notation whilst indexing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a[1:-1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This retrieves the part of list `a` starting from index `1` until _just before_ \n", + "index `-1`. The indexing is ‘exclusive’ in that it excludes the item of the \n", + "last index. This is the convention of indexing in Python.\n", + "\n", + "You can omit a number in the first or second indexing position, and Python will \n", + "assume you mean the start of the list (index zero) or the end of the list \n", + "(index `len(a)`, one past the last element), respectively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a[:-2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a[1:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a[:]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Slicing returns a copy of the list, so modifying the return value doesn’t \n", + "affect the original list." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "b = a[1:]\n", + "print(b)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "b[0] = 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We did something cool there by assigning a value to a specific index, `b[0] =\n", + "3`. The same trick works with slices." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "b[:2] = [99, 2, 78]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "b" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This is equivalent to _replacing_ a certain range (`:2`, or items at position 0\n", + "and 1) of the list `b` with other items from another list. Note that in our\n", + "example we replace 2 elements with 3. The same syntax might be used for\n", + "inserting elements at an arbitrary position in the list. 
If we want to insert\n", + "the number 6 between the 2 and the 78 in the list above, we would use:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "b[2:0] = [6]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "meaning _take out 0 elements from the list starting at position 2 and insert the\n", + "content of the list `[6]` in that position_." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Exercise**\n", + "\n", + "Slicing creates a copy, so what notation could you use to copy the full list?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Solution**\n", + "You need to slice from the very beginning to the very end of the list." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a[:]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This is equivalent to specifying the indices explicitly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a[0:len(a)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a == a[:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a is a" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a is a[:] # creates a copy!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Looping\n", + "\n", + "When you’ve got a collection of things, it’s pretty common to want to access \n", + "each one sequentially. This is called looping, or iterating, and is super easy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for item in a:\n", + " print(item)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We have to indent the code inside the `for` loop to tell Python that these\n", + "lines should be run for every iteration." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Indentation\n", + "\n", + "The `for` loop is a block, and every Python block requires indentation, unlike\n", + "other \"free-form\" languages such as C++ or Java. 
This means that Python will\n", + "throw an error if you don't indent:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "for i in b:\n", + " print(i)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Indentation must be consistent within the same block, so if you indent two lines\n", + "in the same `for` loop using a different number of spaces, Python will complain\n", + "once again:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "for i in b:\n", + " print(\"I am in a loop\")\n", + " print(i)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Indentation is necessary as Python does not use any keyword or symbol to\n", + "determine the end of a block (_e.g._ there is no `endfor`). As a side effect,\n", + "indentation forces you to make your code more readable!\n", + "\n", + "Note that it does not matter how many spaces you use for indentation. **As a\n", + "convention, we are using four spaces.**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The variable name `item` can be whatever we want, but its value is changed by \n", + "Python to be the element of the item we’re currently on, starting from the \n", + "first.\n", + "\n", + "Because lists are mutable, we can try to modify the length of the list whilst \n", + "we’re iterating." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a_copy = a[:]\n", + "for item in a_copy:\n", + " del a_copy[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a_copy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Intuitively, you might expect `a_copy` to be empty, but it’s not! The technical\n", + "reasons aren’t important, but this highlights an important rule: **never\n", + "modify the length of a list whilst iterating over it!** You won’t end up with\n", + "what you expect.\n", + "\n", + "You can, however, freely modify the _values_ of each item in the list whilst \n", + "looping. This is a very common use case." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a_copy = a[:]\n", + "i = 0\n", + "for item in a_copy:\n", + " a_copy[i] = 2*item\n", + " i += 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a_copy" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Keeping track of the current index ourselves, with `i` is annoying, but luckily \n", + "Python gives us a nicer way of doing that." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a_doubled = a[:]\n", + "for index, item in enumerate(a_doubled):\n", + " a_doubled[index] = 2 * item" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a_doubled" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "There’s a lot going on here. Firstly, note that Python lets you assign values \n", + "to multiple variables at the same time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "one, two = [34, 43]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(f\"one: {one}, two: {two}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "That’s already pretty cool! But then think about what happens if you had a list \n", + "where each item was another list, each containing two numbers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "nested = [[20, 29], [30, 34]]\n", + "for item in nested:\n", + " print(item)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "So, we can just assign each item in the sublist to separate variables in the \n", + "`for` statement." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for one, two in nested:\n", + " print(two, one)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Now we can understand a little better what `enumerate` does: for each item in \n", + "the list, it returns a new list containing the current index and the item." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "enumerate(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "list(enumerate(a))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "For more advanced reasons `enumerate` doesn’t return a list directly, but instead\n", + "something that the `for` statement knows how to iterate over (this is called a\n", + "[generator](https://wiki.python.org/moin/Generators) and for the moment you\n", + "don't need to know how it works). 
We can convert it to a list with the `list`\n", + "method when we want to see what it’s doing.\n", + "\n", + "This technique of looping over lists of lists lets us loop over two lists \n", + "simultaneously, using the `zip` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for item, item2 in zip(a, a_doubled):\n", + " print(item2, item)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Neat! As before, we can see what `zip` is doing explicitly by using `list`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "list(zip(a, a_doubled))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "You can see that the structure of the list that’s iterated over, the output of \n", + "`zip`, is identical to that for `enumerate`.\n", + "\n", + "Finally, we’ll take a quick look at the `range` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for i in range(0, 10):\n", + " print(i)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The arguments to `range` work just like slicing, the second argument is treated \n", + "exclusively, as its value is excluded from the output. Again like slicing, we \n", + "can specify a third argument as the step size for the iteration."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for i in range(0, 10, 2):\n", + " print(i)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "If you only give a single argument to `range`, it assumes you’ve given the end \n", + "value, and want a starting value of zero." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for i in range(5):\n", + " print(i)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This reads “give me a list of length 5, in steps of 1, starting from zero”.\n", + "\n", + "Now that we know how to easily generate sequences of numbers, we can write \n", + "`enumerate` by hand!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "for index, item in zip(range(len(a)), a):\n", + " print(index, item)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Just like before! When you see something cool like `enumerate`, it can be fun \n", + "trying to see how you’d accomplish something similar with different building \n", + "blocks." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## List comprehension (Sugar, can be skipped on first read)\n", + "\n", + "We’ve already made a new list from an existing one when we created `a_doubled`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a_doubled = a[:]\n", + "for index, item in enumerate(a_doubled):\n", + " a_doubled[index] = 2*item" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Creating a new list from an existing one is a common operation, so Python has a \n", + "shorthand syntax called _list comprehension_." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a_doubled = [2*item for item in a]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a_doubled" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Isn’t that beautiful?\n", + "\n", + "We can use the same multi-variable stuff we learnt whilst looping." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "[index*item for index, item in enumerate(a)]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We’re not restricted to creating new lists with the same structure as the \n", + "original." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "[[item, item*item] for item in a]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We can even filter out items from the original list using `if`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "[[item, item*item] for item in a if item % 2 == 0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "List comprehensions are a powerful way of succinctly creating new lists. But be \n", + "responsible; if you find you’re doing something complicated, it’s probably \n", + "better to write a full `for` loop." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Exercise**\n", + "Write a list comprehension yourself\n", + "\n", + "Compute the square of the magnitude of the sum of the following two \n", + "three-vectors, using a single list comprehension and the global `sum` method.\n", + "\n", + "It might help to first think about how you’d compute the quantity for a single \n", + "vector.\n", + "\n", + "Not sure what the `sum` method does? Ask for `help`!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "help(sum)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "kaon = [3.4, 4.3, 20.0]\n", + "pion = [1.4, 0.9, 19.8]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Solution**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The square magnitude is the sum of the squares of the components, where the \n", + "components are the sum of the two input vectors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "magsq = sum([(k + pi)**2 for k, pi in zip(kaon, pion)])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The square root of this is around 40.42." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Tuples\n", + "\n", + "A close relative of lists are tuples, which differ in that they cannot be \n", + "mutated after creation. You can create tuples literally using parentheses, or \n", + "convert things to tuples using the `tuple` method.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a = (3, 4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "del a[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "a.append(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Tuples are usually used to describe data whose length is meaningful in and of \n", + "itself. For example, you could express coordinates as a tuple." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "coords = (3.2, 0.1)\n", + "x, y = coords" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This is nice because it doesn’t make sense to append to an `$ (x, y) $` \n", + "coordinate, nor to ‘delete’ a dimension. 
Generally, it can be useful if the \n", + "data structure you’re using respects the _meaning_ of the data you’re storing.\n", + "\n", + "If you can’t think of a use for tuples yourself, it’s worth keeping in mind that \n", + "Python creates tuples for groups of things by default. We saw that \n", + "earlier when we used `enumerate`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "list(enumerate([4, 9]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each element of the list is a tuple." + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython" + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/lists.md b/python/lists.md deleted file mode 100644 index df6ee45f..00000000 --- a/python/lists.md +++ /dev/null @@ -1,520 +0,0 @@ -# Lists and looping - -Now things start to get _really_ interesting! Lists are collections of things -stored in a specific order. They can be defined literally by wrapping things in -square brackets `[]`, separating items with commas `,`. - -```python ->>> a = [4, 2, 9, 3] ->>> a -[4, 2, 9, 3] -``` - -Python lists can contain collections of whatever you like. - -``` ->>> excellent = [41, 'Hello', math.sin] -``` - -Each item in the list can be accessed by its _index_, its position in the list, -which starts at zero for the first item. Indexing by negative numbers starts -from the _last_ item of the list.
- -```python ->>> a[0] -4 ->>> a[2] -9 ->>> a[-1] -3 -``` - -We’ll get an error if we try to access an index that doesn’t exist: - -```python ->>> a[99] -Traceback (most recent call last): - File "", line 1, in -IndexError: list index out of range -``` - -Like strings, lists have a length which can be found with the `len` method. - -```python ->>> len(a) -4 -``` - -Unlike strings, lists are _mutable_, which means we can modify lists in-place, -without creating a new one. - -```python ->>> a.append(45) ->>> a -[4, 2, 9, 3, 45] ->>> len(a) -5 -``` - -We can see that lists are mutable because using the `append` method didn’t -print anything, and our variable `a` now has a different value. - -Because lists are mutable, we can use the special `del` keyword to remove -specific indices from the list. - -```python ->>> del a[-2] ->>> a -[4, 2, 9, 45] -``` - -{% callout "Functions and keywords" %} - -`del` is a language keyword representing an action, and not a function. The -syntactic difference is that functions take their arguments between parentheses, -such as `my_function(1, 2, 3)`, whereas `del` does not. - -{% endcallout %} - -You can retrieve sub-lists by using _slice_ notation whilst indexing. - -```python ->>> a[1:-1] -[2, 9] -``` - -This retrieves the part of list `a` starting from index `1` until _just before_ -index `-1`. The indexing is ‘exclusive’ in that it excludes the item of the -last index. This is the convention of indexing in Python. - -You can omit a number in the first or second indexing position, and Python will -assume you mean the first element (index zero) and last element (index -`len(array)`). - -```python ->>> a[:-2] -[4, 2 ->>> a[1:] -[2, 9, 45] ->>> a[:] -[4, 2, 9, 45] -``` - -Slicing returns a copy of the array, so modifying the return value doesn’t -affect the original array. 
- -```python ->>> b = a[1:] ->>> b -[2, 9, 45] ->>> b[0] = 3 ->>> b -[3, 9, 45] ->>> a -[4, 2, 9, 45] -``` - -We did something cool there by assigning a value to a specific index, `b[0] = -3`. The same trick works with slices. - -```python ->>> b[:2] = [99, 2, 78] ->>> b -[99, 2, 78, 45] -``` - -This is equivalent of _replacing_ a certain range (`:2`, or items at position 0 -and 1) of the list `b` with other items from another list. Note that in our -example we replace 2 elements with 3. The same syntax might be used for -inserting elements at an arbitrary position in the list. If we want to insert -the number 6 between the 2 and the 78 in the list above, we would use: - -```python ->>> b[2:0] = [6] ->>> b -[99, 2, 6, 78, 45] -``` - -meaning _take out 0 elements from the list starting a position 2 and insert the -content of the list `[6]` in that position_. - -{% challenge "Copying a list" %} - -Slicing creates a copy, so what notation could you use to copy the full list? - -{% solution "Solution" %} - -You need to slice from the very beginning to the very end of the list. -```python ->>> a[:] -[4, 2, 9, 45] -``` -This is equivalent to specifying the indices explicitly. -```python ->>> a[0:len(a)] -[4, 2, 9, 45] -``` - -{% endsolution %} - -{% endchallenge %} - -## Looping - -When you’ve got a collection of things, it’s pretty common to want to access -each one sequentially. This is called looping, or iterating, and is super easy. - -```python ->>> for item in a: -... print item -... -4 -2 -9 -45 -``` - -We have to indent the code inside the `for` loop to tell Python that these -lines should be run for every iteration. - -{% callout "Indentation in Python" %} - -The `for` loop is a block, and every Python block requires indentation, unlike -other "free-form" languages such as C++ or Java. This means that Python will -throw an error if you don't indent: - -```python ->>> for i in b: -... 
print(i) - File "", line 2 - print(i) - ^ -IndentationError: expected an indented block -``` - -Indentation must be consistent within the same block, so if you indent two lines -in the same `for` loop using a different number of spaces, Python will complain -once again: - -```python ->>> for i in b: -... print("I am in a loop") -... print(i) - File "", line 3 - print(i) - ^ -IndentationError: unexpected indent -``` - -Indentation is necessary as Python does not use any keyword or symbol to -determine the end of a block (_e.g._ there is no `endfor`). As a side effect, -indentation forces you to make your code more readable! - -Note that it does not matter how many spaces you use for indentation. **As a -convention, we are using four spaces.** - -{% endcallout %} - -The variable name `item` can be whatever we want, but its value is changed by -Python to be the element of the item we’re currently on, starting from the -first. - -Because lists are mutable, we can try to modify the length of the list whilst -we’re iterating. - -```python ->>> a_copy = a[:] ->>> for item in a_copy: -... del a_copy[0] -... ->>> a_copy -[9, 45] -``` - -Intuitively, you might expect `a_copy` to be empty, but it’s not! The technical -reasons aren’t important, but this highlights an important rule: **never -modify the length of a list whilst iterating over it!** You won’t end up with -what you expect. - -You can, however, freely modify the _values_ of each item in the list whilst -looping. This is a very common use case. - -```python ->>> a_copy = a[:] ->>> i = 0 ->>> for item in a_copy: -... a_copy[i] = 2*item -... i += 1 -... ->>> a_copy -[8, 4, 18, 90] -``` - -Keeping track of the current index ourselves, with `i` is annoying, but luckily -Python gives us a nicer way of doing that. - -```python ->>> a_doubled = a[:] ->>> for index, item in enumerate(a_doubled): -... a_doubled[index] = 2*item -... ->>> a_doubled -[8, 4, 18, 90] -``` - -There’s a lot going on here. 
Firstly, note that Python lets you assign values -to multiple variables at the same time. - -```python ->>> one, two = [34, 43] ->>> print(two, one) -43 34 -``` - -That’s already pretty cool! But then think about what happens if you had a list -where each item was another list, each containing two numbers. - -```python ->>> nested = [[20, 29], [30, 34]] ->>> for item in nested: -... print(item) -... -[20, 29] -[30, 34] -``` - -So, we can just assign each item in the sublist to separate variables in the -`for` statement. - -```python ->>> for one, two in nested: -... print(two, one) -... -29 20 -34 30 -``` - -Now we can understand a little better what `enumerate` does: for each item in -the list, it returns a new list containing the current index and the item. - -```python ->>> enumerate(a) - ->>> list(enumerate(a)) -[(0, 4), (1, 2), (2, 9), (3, 45)] -``` - -For performance reasons `enumerate` doesn’t return a list directly, but instead -something that the `for` statement knows how to iterate over (this is called a -[generator](https://wiki.python.org/moin/Generators) and for the moment you -don't need to know how it works). We can convert it to a list with the `list` -method when we want to see what’s it doing. - -This technique of looping over lists of lists lets us loop over two lists -simultaneously, using the `zip` method. - -```python ->>> for item, item2 in zip(a, a_doubled): -... print(item2, item) -... -8 4 -4 2 -18 9 -90 45 -``` - -Neat! As before, we can see what `zip` is doing explicitly by using `list`. - -```python ->>> list(zip(a, a_doubled)) -[(4, 8), (2, 4), (9, 18), (45, 90)] -``` - -You can see that the structure of the list that’s iterated over, the output of -`zip`, is identical to that for `enumerate`. - -Finally, we’ll take a quick look at the `range` method. 
- -```python ->>> for i in range(0, 10): - print(i) -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -``` - -The arguments to `range` work just like slicing, the second argument is treated -exclusively, as its value is excluded from the output. Again like slicing, we -can specify a third argument as the step size for the iteration. - -```python ->>> for i in range(0, 10, 2): - print(i) -0 -2 -4 -6 -8 -``` - -If you only give a single argument to `range`, it assumes you’ve given the end -value, and want a starting value of zero. - -```python ->>> for i in range(5): - print(i) -0 -1 -2 -3 -4 -``` - -This reads “give me a list of length 5, in steps of 1, starting from zero”. - -Now that we know how to easily generate sequences of numbers, we can write -`enumerate` by hand! - -```python ->>> for index, item in zip(range(len(a)), a): -... print(index, item) -... -0 4 -1 2 -2 9 -3 45 -``` - -Just like before! When you see something cool like `enumerate`, it can be fun -trying to see how you’d accomplish something similar with different building -blocks. - -## List comprehension - -We’ve already made a new list from an existing one when we created `a_doubled`. - -```python ->>> a_doubled = a[:] ->>> for index, item in enumerate(a_doubled): -... a_doubled[index] = 2*item -``` - -Creating a new list from an existing one is a common operation, so Python has a -shorthand syntax called _list comprehension_. - -```python ->>> a_doubled = [2*item for item in a] ->>> a_doubled -[8, 4, 18, 90] -``` - -Isn’t that beautiful? - -We can use the same multi-variable stuff we learnt whilst looping. - -```python ->>> [index*item for index, item in enumerate(a)] -[0, 2, 18, 135] -``` - -We’re not restricted to creating new lists with the same structure as the -original. - -```python ->>> [[item, item*item] for item in a] -[[4, 16], [2, 4], [9, 81], [45, 2025]] -``` - -We can even filter out items from the original list using `if`. 
- -```python ->>> [[item, item*item] for item in a if item % 2 == 0] -[[4, 16], [2, 4]] -``` - -List comprehensions are a powerful way of succinctly creating new lists. But be -responsible; if you find you’re doing something complicated, it’s probably -better to write a full `for` loop. - -## Tuples - -A close relative of lists are tuples, which differ in that they cannot be -mutated after creation. You can create tuples literally using parentheses, or -convert things to tuples using the `tuple` method. - -```python ->>> a = (3, 4) ->>> del a[0] -Traceback (most recent call last): - File "", line 1, in -TypeError: 'tuple' object doesn't support item deletion ->>> a.append(5) -Traceback (most recent call last): - File "", line 1, in -AttributeError: 'tuple' object has no attribute 'append' ->>> a[0] = 1 -Traceback (most recent call last): - File "", line 1, in -TypeError: 'tuple' object does not support item assignment ->>> b = tuple([65, 'yes']) ->>> b -(65, 'yes') -``` - -Tuples are usually used to describe data whose length is meaningful in and of -itself. For example, you could express coordinates as a tuple. - -```python ->>> coords = (3.2, 0.1) ->>> x, y = coords -``` - -This is nice because it doesn’t make sense to append to an `$ (x, y) $` -coordinate, nor to ‘delete’ a dimension. Generally, it can be useful if the -data structure you’re using respects the _meaning_ of the data you’re storing. - -If you can’t think of a use for tuples yourself, its worth keeping in mind that -Python creates tuples for groups of things by default. We saw that -earlier when we used `enumerate`. - -```python ->>> list(enumerate([4, 9])) -[(0, 4), (1, 9)] -``` - -Each element of the list is a tuple. - -{% challenge "Write a list comprehension yourself" %} - -Compute the square of the magnitude of the sum of the following two -three-vectors, using a single list comprehension and the global `sum` method. 
-```python ->>> kaon = [3.4, 4.3, 20.0] ->>> pion = [1.4, 0.9, 19.8] -``` -It might help to first think about how you’d compute the quantity for a single -vector. - -{% solution "Solution" %} - -Not sure what the `sum` method does? Ask for `help`! -```python ->>> help(sum) -``` -The square magnitude is the sum of the squares of the components, where the -components are the sum of the two input vectors. -```python ->>> magsq = sum([(k + pi)**2 for k, pi in zip(kaon, pion)]) -``` -The square root of this is around 40.42. - -{% endsolution %} - -{% endchallenge %} diff --git a/python/methods.ipynb b/python/methods.ipynb new file mode 100644 index 00000000..c27225fc --- /dev/null +++ b/python/methods.ipynb @@ -0,0 +1,1675 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Functions\n", + "Functions, or methods if they are associated with a class, take some input and return some output. \n", + "\n", + "They are the equivalent of a mathematical function $f(x) = y$, where the function `f` takes zero or more arguments and returns zero or more values. The best way to think about a function is that it is executed and _replaced_ by the return value, like a mathematical function. `f` is a function, `x` are values. `f(x)` is not a function anymore and isn't `x` either, it's whatever `y` is, the return value. Like mathematical functions, you could \"copy-paste\" the code-block of the function in the place (technically).\n", + "\n", + "We have already used lots of *functions*, like `len`, `abs` and `print` as well as *methods* like `append` from `list`, `get` from `dict` or `replace` from `str`. In this lesson we will start creating our own.\n", + "\n", + "As we have seen, methods can do a lot of stuff with very little typing. Methods are normally used to encapsulate small pieces of code that we want to reuse.\n", + "\n", + "Let’s rewrite `len` as an example."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def length(obj):\n", + " \"\"\"Return the number of elements in `obj`.\n", + " Args\n", + " ----\n", + " obj (iterable): Object the length will be calculated from.\n", + " Return\n", + " ------\n", + " int: number of elements in `obj`.\n", + " \"\"\"\n", + " i = 0\n", + " for _ in obj:\n", + " i += 1\n", + " return i" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "length" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "help(length)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "or viewing the docs view in your preferred editor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "length('A b c!')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "length(range(5))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "There’s a lot going on here, so we will break it down line-by-line.\n", + "\n", + "1. `def length(obj)`: methods are _defined_ using `def`, followed by a space,\n", + " and then the name you want to give the method. Inside the parentheses\n", + " after the name, we list the inputs, or _arguments_, that we want our method\n", + " to accept. In this case, we only need a single input: the thing we want to\n", + " compute the length of. Finally, there’s a colon at the end, just like with\n", + " a `for` or `if`, which means a _block_ of code follows (which must be\n", + " indented).\n", + " Names are conventionally in lowercase, with underscores separating words - snakecase.\n", + "3. 
`\"\"\"Return the number of elements in obj.\"\"\"`: This is the _docstring_. It’s\n", + " just a documentation string, defined literally with three double quotes so that we can\n", + " include linebreaks. By placing a string here, Python makes the string\n", + " available to use when we pass our function to `help` and in a lot of other places\n", + " like docs viewer of a decent editor or even allows to automatically generate \n", + " documents including HTML with the docs. Documenting your\n", + " functions is a very good idea! It makes it clear to others, and to\n", + " future-you, what the method is supposed to do.\n", + " The formatting of docstrings is standardized (there are 2-3 different ones).\n", + " As for code style, do not invent your own but make it easier for everyone\n", + " (including your future self).\n", + "4. The method block. This is the code that will run whenever you _call_ your\n", + " method, like `length([1])`. The code in the block has access to the\n", + " arguments and to any variables defined _before_ the method definition." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Remark: there are comments (with `#`) and docstrings. Both serve very different purposes:\n", + "\n", + " - comments `#` are for people who _read_ the code. Other developers that don't want to just\n", + " use your function but _change_ it. They can be short and serve the purpose to make the\n", + " code more readable. Typical example: adding a comment on a `- 1` or `+ 1` added somewhere,\n", + " such as ` len(x) - 1 # we don't need the border`. If a block of code implements a hard\n", + " to read algorithm, it is also appropriate to use several `#` lines to explain beforehand\n", + " what is going to happen.\n", + " _Never_ use triple quotes `\"\"\"` to make a large comment! Always use `#`, any decent\n", + " editor is able to (un)comment several lines at once.
(usually ctrl + /)\n", + " - Docstrings are for users. If someone imports your function, the docstrings tells\n", + " _how to use it_ and what it does exactly. It does, however, not contain any (unnecessary)\n", + " information about the implementation. It's for someone who will _not_ read the source code.\n", + " Example are functions that we used, like `len`: we never looked at the source code, but the\n", + " `help(len)` gave us all the information that we needed to *use* it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "x = 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def top_function():\n", + " \"\"\"Do something silly.\"\"\"\n", + " print(x)\n", + " print(y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "top_function()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "y = 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "top_function()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "In general, you should try to minimise the number of variables outside your\n", + " method that you use inside. 
It makes figuring out what the method does much\n", + " harder, as you have to look elsewhere in the code to find things out.\n", + " \n", + "4. `return i`: This defines the _output_ of the method, the thing that you get\n", + " back when you call the method. You don’t have to return anything, in which\n", + " case Python will implicitly make your function return `None`, or you can\n", + " return multiple things at once." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def no_return():\n", + " 1 + 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "no_return()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "no_return() is None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def such_output():\n", + " return 'wow', 'much clever', 213 # equivalent to: return ('wow', 'much clever', 213)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "You can see that returning multiple things implicitly means returning a tuple, so we can choose to assign one variable per value while calling the method."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "help(len)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def add(x, y):\n", + " \"\"\"Return the sum of x and y.\"\"\"\n", + " return x + y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "add(1, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "add(x=1, y=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "add(1, y=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "add(y=2, x=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "add(y=2, 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Specifying the argument’s name explicitly when calling a method is nice because\n", + "it reminds you what the argument is supposed to do. It also means you don’t\n", + "have to remember the order in which the arguments were defined, you can specify\n", + "_keyword arguments_ in any order. 
You can even mix _positional arguments_ with\n", + "keyword arguments, but any keyword arguments must come last.\n", + "The rule is simply: is it unambiguous? Then you can do it. Otherwise, it's not allowed.\n", + "\n", + "Using keyword arguments is particularly useful for arguments which act as\n", + "on/off flags, because it’s often not obvious what your `True` or `False` is\n", + "doing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def add(x, y, show):\n", + " \"\"\"Return the sum of x and y.\n", + " Optionally print the result before returning it.\n", + " \"\"\"\n", + " if show:\n", + " print(x + y)\n", + " return x + y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "_ = add(1, 2, True) # Hmm, what is True doing again?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "_ = add(1, 2, show=True) # Aha! Much clearer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "_remark on `_`: the character `_` is just a variable name like any other. By convention, it is used where a return value exists but is **deliberately** ignored because it won't be used. In contrast, just calling `add(...)` without the assignment is a \"code smell\": a possible bug, because why would someone call it and not use its value?_" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Always having to specify that flag is annoying. 
It would be much nicer if\n", + "`show` had a _default value_, so that we don’t _have_ to provide a value when\n", + "calling the method, but can optionally override it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def add(x, y, show=False):\n", + " \"\"\"Return the sum of x and y.\n", + " Optionally print the result before returning it.\n", + " \"\"\"\n", + " if show:\n", + " print(x + y)\n", + " return x + y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "_ = add(1, 2) # No printing!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "_ = add(1, 2, show=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Perfect.\n", + "\n", + "Of course, function arguments can be anything, even other functions!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def run_method(method, x):\n", + " \"\"\"Call `method` with `x`.\"\"\"\n", + " return method(x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "run_method(len, [1, 2, 3])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Exercise**\n", + "Methods returning methods\n", + "\n", + "What does this method do? 
Think about it, what _exactly_ happens. Be precise, discuss with your neighbours." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def make_incrementor(increment):\n", + " def func(var):\n", + " return var + increment\n", + " return func" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Solution**\n", + "\n", + "It returns a function whose `increment` value has been filled by the argument\n", + "to `make_incrementor`. If we called `make_incrementor(3)`, then `increment` has\n", + "the value 3, and we can fill in the returned method in our heads." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def func(var):\n", + " return var + 3" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "So when we call _this_ method, we’ll get back what we put in, but plus 3." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "increment_one = make_incrementor(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "increment_two = make_incrementor(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(increment_one(42), increment_two(42))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(make_incrementor(3)(42)) # Do it in one go!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## *args and **kwargs\n", + "\n", + "This is a brief introduction, for a more detailed explanation on the packing and unpacking of arguments, [see here](https://hsf-training.github.io/analysis-essentials/advanced-python/11AdvancedPython.html#Packing-and-unpacking-of-values)\n", + "\n", + "What if you like to accept an arbitrary number of arguments? For example, we\n", + "can also write a `total` method that takes two arguments." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def total(x, y):\n", + " \"\"\"Return the sum of the arguments.\"\"\"\n", + " return x + y" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "But what if we want to allow the caller to pass more than two arguments? It\n", + "would be tedious to define many arguments explicitly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def total(*args):\n", + " \"\"\"Return the sum of the arguments.\"\"\"\n", + " # For seeing what `*` does\n", + " print(f'Got {len(args)} arguments: {args}')\n", + " return sum(args)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "total(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "total(1, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "total(1, 2, 3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The `*args` syntax says “stuff any arguments into a tuple and call it `args`”.\n", + "This lets us capture any number of arguments. 
As `args` is a tuple, one could\n", + "loop over it, access a specific element, and so on.\n", + "\n", + "*remark: `args`, like `_`, is just a name that by convention is used in this way, but has no special function*\n", + "\n", + "We can also _expand_ lists into separate arguments with the same syntax when\n", + "_calling_ a method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def reverse_args(x, y):\n", + " return y, x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "l = ['a', 'b']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "raises-exception" + ] + }, + "outputs": [], + "source": [ + "reverse_args(l)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "reverse_args(*l)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "A similar syntax exists for keyword arguments." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def ages(**people):\n", + " \"\"\"Print people's information.\"\"\"\n", + " # For seeing what `**` does\n", + " print(f'Got {len(people)} arguments: {people}')\n", + " for person in people:\n", + " print(f'Person {person} is {people[person]}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ages(steve=31)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ages(steve=31, helen=70, zorblax=9963)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "As you can see from the debug print statement, `**people` is a dictionary\n", + "containing the keyword arguments we passed to the `ages` method. The keys of\n", + "the dictionary are the names of the argument as strings, and the values are the\n", + "values of the arguments. Just like for the `*` syntax, `**` can also be used to\n", + "expand a dictionary into keyword arguments." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "data = {'thor': 5000, 'yoda': -1}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ages(**data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The keyword arguments arrive in the function block in the same order in which\n", + "they were passed in the call (guaranteed since Python 3.6).\n", + "The `**` syntax effectively creates a dictionary, and dictionaries preserve insertion order (guaranteed since Python 3.7).\n", + "Only in older Python versions was this order not guaranteed." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Exercise**\n", + "The most generic method\n", + "\n", + "The most generic method would take any number of positional arguments _and_ any\n", + "number of keyword arguments. What would this method look like?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Solution**\n", + "\n", + "It would use both `*` and `**` syntax in defining the arguments."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def generic(*args, **kwargs):\n", + " print(f'Got args: {args}')\n", + " print(f'Got kwargs: {kwargs}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "data = {'bing': 'baz'}\n", + "generic(1, 2, 'abc', foo='bar', **data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Inline methods\n", + "\n", + "Some methods take other methods as arguments, like the built-in `map` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "map(str, range(5))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "`map` takes a function and an iterable, and applies the function to each element in\n", + "the iterable. It returns, however, an iterator (a lazy, generator-like object) that is, for advanced reasons, not actually evaluated yet. In most cases, you can loop over it just like over a `list` or `tuple`.\n", + "\n", + "To make sure it is evaluated, we can explicitly convert it to a container, _e.g._ a list with the results. We can define and then pass our own functions."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "list(map(str, range(5)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def cube(x):\n", + " \"\"\"Return the third power of x.\"\"\"\n", + " return x*x*x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "list(map(cube, range(5)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "For such a simple method, this is a lot of typing! We can use a `lambda` function to\n", + "define such simple methods inline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "list(map(lambda x: x*x*x, range(5)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The syntax of defining a `lambda` is like this:\n", + "```\n", + "lambda <args>: <expression>\n", + "```\n", + "`<args>` is a comma-separated set of variables that the `lambda` can take as\n", + "arguments, and `<expression>` is the code that is run. A `lambda`\n", + "automatically returns whatever the result of the expression is, you don’t need\n", + "a `return` (the `return` is _implicit_).\n", + "\n", + "Writing a `lambda` expression defines a method, which you can capture as a\n", + "variable just like any other object."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "div2 = lambda x: x / 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "div2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "list(map(div2, range(5)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Note, however, that _if we assign the function to a variable_, the generally preferred way is to use a normal function definition." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def div2(x):\n", + " return x / 2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Exercise**\n", + "Sum in quadrature\n", + "\n", + "Write a method that accepts an arbitrary number of arguments, and returns the\n", + "sum of the arguments computed in quadrature. A “sum in quadrature” is the\n", + "square root of the sum of the squares of each number. You should use `lambda`\n", + "to define a squaring and a square root function, and `map` to apply the\n", + "squaring method."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Solution**\n", + "We need a little square root method and a method to square its input." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "square = lambda x: x*x\n", + "sqrt = lambda x: x**0.5" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We then define a method that can accept any number of arguments using the\n", + "`*args` syntax, and use `map` to call the `square` method on the list of\n", + "arguments. Then we can call `sum` on the result, and then `sqrt`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def quadrature(*args):\n", + " \"\"\"Return the sum in quadrature of the arguments.\"\"\"\n", + " return sqrt(sum(map(square, args)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "quadrature(1, 1) # should be equal to sqrt(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "2**0.5" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Another good use case for `lambda` (remember, we can just define the function, it's more of a \"nice-to-have\") is the built-in `filter` method (see:\n", + "`help(filter)`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# filter and return the even numbers only\n", + "filter(lambda x: x % 2 == 0, range(10)) # returns again a generator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "list(filter(lambda x: x % 2 == 0, range(10)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Exercise**\n", + "List comprehension\n", + "\n", + "How would you rewrite the `filter` example above using a list comprehension?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "**Solution**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "[ x for x in range(10) if x % 2 == 0 ]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Generally, you should only use `lambda` methods to define little throw-away\n", + "methods. The main downside with using them is that you can’t attach a docstring\n", + "to them, and they become unwieldy when there’s complex logic.\n", + "\n", + "Golden rules:\n", + " - Make functions pure where possible (stateless and without side effects: the same input values will always return the same output). This is of course different for classes.\n", + " - Don't use globals (if at all avoidable).\n", + " - Do not alter input arguments if they are mutable. If it's convenient, make a copy of the object first (remember copies of lists?)\n", + " - Put a docstring there. Probably even before you implement your function. 
This makes it\n", + " not only to everyone else but also to you clear what comes in and what comes out.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython" + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/methods.md b/python/methods.md deleted file mode 100644 index 3e88c4b7..00000000 --- a/python/methods.md +++ /dev/null @@ -1,476 +0,0 @@ -# Functions - -Functions, or methods if they are associated with a class, take some input and return some output. We have -already used lots of functions, like `len`, `abs` and `print` as well as methods like `append` from `list`, `get` from `dict` or `replace` from `str`. In this lesson -we will start creating our own. - -As we have seen, methods can do a lot of stuff with very little typing. Methods -are normally used to encapsulate small pieces of code that we want to reuse. - -Let’s rewrite `len` as an example. - -```python ->>> def length(obj): -... """Return the number of elements in `obj`. -... -... Args -... ---- -... obj (iterable): Object the length will be calculated from. -... -... Return -... ------ -... int: number of elements in `obj`. -... """ -... i = 0 -... for _ in obj: -... i += 1 -... return i ->>> length - ->>> help(length) -Help on function length in module __main__: - -length(obj) - Return the number of elements in `obj`. - - Args - obj (iterable): Object the length will be calculated from. - - Return - int: number of elements in `obj`. - -or viewing the docs view in your preferred editor. - ->>> length('A b c!') -6 ->>> length(range(5)) -5 -``` - -There’s a lot going on here, so we will break it down line-by-line. - -1. 
`def length(obj)`: methods are _defined_ using `def`, followed by a space, - and then the name you want to give the method.[^1] Inside the parentheses - after the name, we list the inputs, or _arguments_, that we want our method - to accept. In this case, we only need a single input: the thing we want to - compute the length of. Finally, there’s a colon at the end, just like with - a `for` or `if`, which means a _block_ of code follows (which must be - indented). -2. `"""Return the number of elements in obj."""`: This is the _docstring_. It’s - just a documentation string, defined literally with three double quotes so that we can - include linebreaks. By placing a string here, Python makes the string - available to use when we pass our function to `help` and in a lot of other places - like docs viewer of a decent editor or even allows to automatically generate - documents including HTML with the docs. Documenting your - functions is a very good idea! It makes it clear to others, and to - future-you, what the method is supposed to do. -3. The method block. This is the code that will run whenever you _call_ your - method, like `length([1])`. The code in the block has access to the - arguments and to any variables defined _before_ the method definition. -```python ->>> x = 1 ->>> def top_function(): -... """Do something silly.""" -... print(x) -... print(y) -... ->>> y = 2 ->>> top_function() -1 -Traceback (most recent call last): - File "", line 1, in - File "", line 3, in top_function -NameError: global name 'y' is not defined -``` - In general, you should try to minimise the number of variables outside your - method that you use inside. It makes figuring out what the method does much - harder, as you have to look elsewhere in the code to find things out. -4. `return i`: This defines the _output_ of the method, the thing that you get - back when you call the method. 
You don’t have to return anything, in which - case Python will implicitly make your function return `None`, or you can - return multiple things at once. -```python ->>> def no_return(): -... 1 + 1 -... ->>> no_return() ->>> no_return() == None ->>> def such_output(): -... return 'wow', 'much clever', 213 -... ->>> such_output() -('wow', 'much clever', 213) ->>> a, b, c = such_output() ->>> b -'much clever' -``` - You can see that returning multiple things implicitly means returning a - tuple, so we can choose to assign one variable per value while calling the - method. - -[^1]: Names are conventionally in lowercase, with underscores separating words. - -Remark: there are comments (with `#`) and docstrings. Both serve a very different purpose - - - comments `#` are for people who _read_ the code. Other developers that don't want to just - use your function but _change_ it. They can be short and serve the purpose to make the - code more readable. Typical example: adding a comment on a `- 1` or `+ 1` added somewhere, - such as ` len(x) - 1 # we don't need the border`. If a block of code implements a hard - to read algorithm, it is also appropriate to use several `#` lines to explain beforehand - what is going to happen. - _Never_ use tripple quotes `"""` to make a large comment! Use always `#`, any decent - editor is able to (un)comment several lines at once. - - Docstrings are for users. If someone imports your function, the docstrings tells - _how to use it_ and what it does exactly. It does, however, not contain any (unnecessary) - information about the implementation. - - -Functions can be called in several ways. - -```python ->>> def add(x, y): -... """Return the sum of x and y.""" -... return x + y -... 
->>> add(1, 2) ->>> add(x=1, y=2) ->>> add(1, y=2) ->>> add(y=2, x=1) ->>> add(y=2, 1) - File "", line 1 -SyntaxError: non-keyword arg after keyword arg ->>> add(y=2, =1) - File "", line 1 - add(y=2, =1) - ^ -SyntaxError: invalid syntax -``` - -Specifying the argument’s name explicitly when calling a method is nice because -it reminds you what the argument is supposed to do. It also means you don’t -have to remember the order in which the arguments were defined, you can specify -_keyword arguments_ in any order. You can even mix _positional arguments_ with -keyword arguments, but any keyword arguments must come last. - -Using keyword arguments is particularly useful for arguments which act as -on/off flags, because it’s often not obvious what your `True` or `False` is -doing. - -```python ->>> def add(x, y, show): -... """Return the sum of x and y. -... -... Optionally print the result before returning it. -... """ -... if show: -... print(x + y) -... return x + y -... ->>> _ = add(1, 2, True) # Hmm, what is True doing again? -3 ->>> _ = add(1, 2, show=True) # Aha! Much clearer -``` - -Always having to specify that flag is annoying. It would be much nicer if -`show` had a _default value_, so that we don’t _have_ to provide a value when -calling the method, but can optionally override it. - -```python ->>> def add(x, y, show=False): -... """Return the sum of x and y. -... -... Optionally print the result before returning it. -... """ -... if show: -... print(x + y) -... return x + y -... ->>> _ = add(1, 2) # No printing! ->>> _ = add(1, 2, show=True) -3 -``` - -Perfect. - -Of course, function arguments can be anything, even other functions! - -```python ->>> def run_method(method, x): -... """Call `method` with `x`.""" -... return method(x) -... ->>> run_method(len, [1, 2, 3]) -3 -``` - -{% challenge "Methods returning methods" %} - -What does this method do? - -```python ->>> def make_incrementor(increment): -... def func(var): -... return var + increment -... 
return func -``` - -{% solution "Solution" %} - -It returns a function whose `increment` value has been filled by the argument -to `make_incrementor`. If we called `make_incrementor(3)`, then `increment` has -the value 3, and we can fill in the returned method in our heads. - -```python -def func(var): - return var + 3 -``` - -So when we call _this_ method, we’ll get back what we put in, but plus 3. - -```python ->>> increment_one = make_incrementator(1) ->>> increment_two = make_incrementator(2) ->>> print increment_one(42), increment_two(42) -43 44 ->>> print make_incrementator(3)(42) # Do it in one go! -45 -``` - -{% endsolution %} - -{% endchallenge %} - -What if you like to accept an arbitrary number of arguments? For example, we -can also write a `total` method that takes two arguments. - -```python ->>> def total(x, y): -... """Return the sum of the arguments.""" -... return x + y -... ->>> -``` - -But what if we want to allow the caller to pass more than two arguments? It -would be tedious to define many arguments explicitly. - -```python ->>> def total(*args): -... """Return the sum of the arguments.""" -... # For seeing what `*` does -... print('Got {0} arguments: {1}'.format(len(args), args)) -... return sum(args) -... ->>> total(1) -Got 1 arguments: (1,) -1 ->>> total(1, 2) -Got 2 arguments: (1, 2) -3 ->>> total(1, 2, 3) -Got 3 arguments: (1, 2, 3) -6 -``` - -The `*args` syntax says “stuff any arguments into a tuple and call it `args`”. -This let’s us capture any number of arguments. As `args` is a tuple, one could -loop over it, access a specific element, and so on. - -We can also _expand_ lists into separate arguments with the same syntax when -_calling_ a method. - -```python ->>> def reverse_args(x, y): -... return y, x -... 
->>> l = ['a', 'b'] ->>> reverse_args(l) -Traceback (most recent call last): - File "", line 1, in -TypeError: reverse_args() takes exactly 2 arguments (1 given) ->>> reverse_args(*l) -('b', 'a') -``` - -A similar syntax exists for keyword arguments. - -```python ->>> def ages(**people): -... """Print people's information.""" -... # For seeing what `**` does -... print('Got {0} arguments: {1}'.format(len(people), people)) -... for person in people: -... print('Person {0} is {1}'.format(person, people[person])) -... ->>> ages(steve=31) -Got 1 arguments: {'steve': 31} -Person steve is 31 ->>> ages(steve=31, helen=70, zorblax=9963) -Got 3 arguments: {'steve': 31, 'zorblax': 9963, 'helen': 70} -Person steve is 31 -Person zorblax is 9963 -Person helen is 70 -``` - -As you can see from the debug print statement, `**people` is a dictionary -containing the keyword arguments we passed to the `ages` method. The keys of -the dictionary are the names of the argument as strings, and the values are the -values of the arguments. Just like for the `*` syntax, `**` can also be used to -expand a dictionary into keyword arguments. - -```python ->>> d = {'thor': 5000, 'yoda': -1} ->>> ages(**d) -Got 2 arguments: {'yoda': -1, 'thor': 5000} -Person yoda is -1 -Person thor is 5000 -``` - -The order of the keyword arguments used to call the method are not necessarily -the same as those that the function block sees! -This is because dictionaries are unordered, and the `**` syntax effectively creates a dictionary. - - -{% challenge "The most generic method" %} - -The most generic method would take any number of positional arguments _and_ any -number of keyword arguments. What would this method look like? - -{% solution "Solution" %} - -It would use both `*` and `**` syntax in defining the arguments. -```python ->>> def generic(*args, **kwargs): -... print('Got args: {0}'.format(args)) -... print('Got kwargs: {0}'.format(kwargs)) -... 
->>> d = {'bing': 'baz'} ->>> generic(1, 2, 'abc', foo='bar', **d) -Got args: (1, 2, 'abc') -Got kwargs: {'bing': 'baz', 'foo': 'bar'} -``` - -{% endsolution %} - -{% endchallenge %} - -## Inline methods - -Some methods take other methods as arguments, like the built-in `map` method. - -```python ->>> map(str, range(5)) -['0', '1', '2', '3', '4'] -``` - -`map` takes a function and an iterable, and applies the function to each element in -the iterable. It returns a new list with the results. We can define and then pass -our own functions. - -```python ->>> def cube(x): -... """Return the third power of x.""" -... return x*x*x -... ->>> map(cube, range(5)) -[0, 1, 8, 27, 64] -``` - -For such a simple method, this is a lot of typing! We can use a `lambda` function to -define such simple methods inline. - -```python ->>> map(lambda x: x*x*x, range(5)) -[0, 1, 8, 27, 64] -``` - -The syntax of defining a `lambda` is like this: - -``` -lambda : -``` - -`` is a command-separate set of variables that the `lambda` can take as -arguments, and `` is the code that is run. A `lambda` -automatically returns whatever the result of the expression is, you don’t need -a `return` (the `return` is _implicit_). - -Writing a `lambda` statement defines a method, which you can capture as a -variable just like any other object. - -```python ->>> div2 = lambda x: x/2 ->>> div2 - at 0x7fc6b2207758> ->>> map(div2, range(5)) -[0.0, 0.5, 1.0, 1.5, 2.0] -``` - -Note that we got real numbers back because we are using Python 2 with `from __future__ import division`. - -{% challenge "Sum in quadrature" %} - -Write a method that accepts an arbitrary number of arguments, and returns the -sum of the arguments computed in quadrature. A “sum in quadrature” is the -square root of the sum of the squares of each number. You should use `lambda` -to define a squaring and a square root function, and `map` to apply the -squaring method. 
- -{% solution "Solution" %} - -We need a little square root method and a method to square its input. -```python ->>> square = lambda x: x*x ->>> sqrt = lambda x: x**0.5 -``` -We then define a method that can accept any number of arguments using the -`*args` syntax, and use `map` to call the `square` method on the list of -arguments. Then we can call `sum` on the result, and then `sqrt`. -```python ->>> def quadrature(*args): -... """Return the sum in quadrature of the arguments.""" -... return sqrt(sum(map(square, args))) -... ->>> quadrature(1, 1) # should be equal to sqrt(2) -1.4142135623730951 ->>> 2**0.5 -1.4142135623730951 -``` - -{% endsolution %} - -{% endchallenge %} - -Another use case for `lambda` is the built-in `filter` method (see: -`help(filter)`). - -```python ->>> filter(lambda x: x % 2 == 0, range(10)) # filter and return the even numbers only -[0, 2, 4, 6, 8] -``` - -{% challenge "List comprehension" %} - -How would you rewrite the `filter` example above using a list comprehension? - -{% solution "Solution" %} - -```python ->>> [ x for x in range(10) if x % 2 == 0 ] -[0, 2, 4, 6, 8] -``` - -{% endsolution %} - -{% endchallenge %} - -Generally, you should only use `lambda` methods to define little throw-away -methods. The main downside with using them is that you can’t attach a docstring -to them, and they become unwieldy when there’s complex logic. - -Golden rules: - - Make functions idempotent where possible (stateless, the same input values will return the same output). This is of course different for classes. - - Don't use globals (if anyhow avoidable). - - Do not alter the input argument if they are mutable. - - Put a docstring there. Probably even before you implement your function. This makes it - not only to everyone else but also to you clear what comes in and what comes out. 
diff --git a/python/modules.ipynb b/python/modules.ipynb new file mode 100644 index 00000000..bac0c7c6 --- /dev/null +++ b/python/modules.ipynb @@ -0,0 +1,894 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Modules\n", + "\n", + "Python comes with lots of useful stuff, which is provided with modules\n", + "(and submodules, see later).\n", + "We have already met the maths module, but did not talk about how we started using it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import math" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "math" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "math.pi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "math.sin(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The path after `from` might look different on your computer.\n", + "\n", + "So, `math` is a _module_, and this seems to behave a lot like other objects we\n", + "have met: it is a container with properties and methods attached that we can\n", + "access with the dot operator `.`. Actually, that is pretty much all there is to\n", + "them." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Using modules in your code: import\n", + "\n", + "The keyword `import`, usually specified at the beginning of your source code, is\n", + "used to tell Python what modules you want to make available to your current\n", + "code.\n", + "\n", + "There are different ways of specifying an import. The one we have seen already\n", + "simply makes the module available to you:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "random.uniform(0, 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The module `random` contains functions useful for random number generation: with\n", + "the `import` above, we have made the `random` module accessible, and everything\n", + "within that module is accessible via the syntax `random.<name>`.
For the record,\n", + "the `uniform(x,y)` method returns a pseudo-random number within the range\n", + "`$ [x,y] $`.\n", + "\n", + "Sometimes you want to make only one or more things from a given module\n", + "accessible: Python gives you the ability to import just those:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from random import choice, uniform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "uniform(0, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "choice([ 33, 56, 42, -1 ])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "\n", + "In this case the `uniform` and `choice` names are available _directly_, _i.e._\n", + "without using the `random` prefix. All other functions in the `random` module\n", + "are not available in this case. 
For the record, the `choice` function returns a\n", + "random element from a given collection.\n", + "\n", + "Another option is to import _all_ functions of a certain module and make them\n", + "available without a prefix, which you should *never* do (except in very special cases if you're writing a library):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# from random import * # don't run this :) never" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This is not recommended, as you generally do not know the full extent\n", + "of what you are importing, and you might end up with name clashes between your\n", + "current code and the imported module, as it will all be in the same namespace,\n", + "meaning directly available with no need for a `<module>.` syntax. \n", + "\n", + "Lastly, it is possible to import modules, or specific names from a module,\n", + "under an alias."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from random import uniform as uni" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "uni(0, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "np.arccos(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This option is useful when you need to assign shorter aliases to names you will\n", + "use frequently. 
In particular, the alias `np` for the `numpy` module will be\n", + "encountered a lot.\n", + "\n", + "Note that modules can have submodules, specified with extra dots `.`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "Path('..').absolute()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "When importing a module, its **submodules are not available by default and you\n", + "must import them explicitly**:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "os.getcwd()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "It is also possible to import several modules with a single import command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import math\n", + "import os\n", + "import sys" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "but this is [not recommended by the Python style 
guide](https://www.python.org/dev/peps/pep-0008/#imports), which\n", + "suggests using several import statements, one per module, as it improves\n", + "readability:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import math\n", + "import os\n", + "import sys" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "If you need to import several names from a single module, you can split an import\n", + "statement over multiple lines:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from math import e, exp, floor, log" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "floor(exp(log(e)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "However, while these are possibilities, importing the modules is the usual way to go. If you're unsure, just look around for good examples; remember, consistency is the key!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## The standard library\n", + "\n", + "The set of things that Python comes with, from all of the types of objects to\n", + "all of the different modules, is called the [standard library](https://docs.python.org/3/library/index.html).
It is\n", + "recommended to browse through the standard library documentation to see what is\n", + "available: Python is rich in standard modules, and you should reuse them as much\n", + "as possible instead of rewriting code on your own.\n", + "\n", + "Some of the categories for which standard modules are available are:\n", + "\n", + "* processing paths\n", + "* date and time manipulation\n", + "* mathematical functions\n", + "* parsing of certain file formats\n", + "* support for multiple processes and threads\n", + "* ...\n", + "\n", + "Use standard Python library modules with confidence: since they are part of any standard\n", + "Python distribution, your code will be easily portable." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Modules from PyPI\n", + "\n", + "Many external modules can be found on [PyPI](https://pypi.org/), the Python Package Index repository.\n", + "\n", + "If a certain module you need is not available on your distribution you can\n", + "easily install it with the `pip` shell command as seen in the previous lectures."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## (Advanced) Write your first Python module\n", + "\n", + "The simplest Python module you can write is just a `.py` file with some\n", + "functions inside:\n", + "\n", + "\n", + "```python\n", + "# myfirstmodule.py\n", + "\n", + "def one():\n", + " print('this is my first function')\n", + "\n", + "def two():\n", + " print('this is my second function')\n", + "```\n", + "\n", + "You can now fire up an `ipython` shell and use those functions right away (because it automatically picks up modules, i.e. `.py` files, inside the working directory):" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "```python\n", + ">>> import myfirstmodule\n", + ">>> myfirstmodule.one()\n", + "this is my first function\n", + ">>> myfirstmodule.two()\n", + "this is my second function\n", + "```\n", + "\n", + "By simply calling the file `myfirstmodule.py` we have made it available as a\n", + "module named `myfirstmodule` - given that the file is in the same directory\n", + "where we have launched the Python interpreter.\n", + "\n", + "### Module name restrictions\n", + "\n", + "Note that you cannot pick any name you want for a module! From the\n", + "[Python style guide](https://www.python.org/dev/peps/pep-0008/#package-and-module-names),\n", + "we gather that we should use \"short,\n", + "all-lowercase names\".
As a matter of fact, if we used dashes in the file name,\n", + "we would have ended up with a syntax error while trying to load it:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "```python\n", + ">>> import my-first-module\n", + " File \"\", line 1\n", + " import my-first-module\n", + " ^\n", + "SyntaxError: invalid syntax\n", + "```\n", + "\n", + "Python treats `-` as a minus and does not understand your intentions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "{% endcallout %}\n", + "\n", + "\n", + "## Write a structured module\n", + "\n", + "Let's now create a more structured module, with submodules and different files.\n", + "We can start from the `myfirstmodule.py` file and create a directory structure:\n", + "\n", + "```bash\n", + "$ mkdir yabba\n", + "$ cp myfirstmodule.py yabba/__init__.py\n", + "```\n", + "\n", + "We have reused the same file created before, copied it into a directory called\n", + "`yabba` and renamed it to `__init__.py`. 
The double underscore should ring a\n", + "bell: this is a Python special name, and it represents the \"main file\" within\n", + "a module, whereas the directory name now represents the module name.\n", + "\n", + "This means that our module is called `yabba`, and if we import it, functions\n", + "from `__init__.py` will be available:\n", + "\n", + "```python\n", + ">>> import yabba\n", + ">>> yabba.one()\n", + "this is my first function\n", + ">>> yabba.two()\n", + "this is my second function\n", + "```\n", + "\n", + "We can create an additional file inside the `yabba` directory, say\n", + "`yabba/extra.py` and have more functions there:\n", + "\n", + "```python\n", + "# yabba/extra.py\n", + "\n", + "def three():\n", + " print('this function will return the number three')\n", + " return 3\n", + "```\n", + "\n", + "We have effectively made `extra` a submodule of `yabba`. Let's try:\n", + "\n", + "```python\n", + ">>> import yabba\n", + ">>> list(filter(lambda x: not x.startswith('__'), dir(yabba)))\n", + "['one', 'two']\n", + ">>> import yabba.extra\n", + ">>> yabba.extra.three()\n", + "yabba.extra.three()\n", + "this function will return the number three\n", + "3\n", + "```\n", + "\n", + "{% challenge \"What have I done with the filter function?\" %}\n", + "\n", + "We have used the filter function above to list the functions we have defined\n", + "in our module. Can you describe in detail what the commands above do?\n", + "{% solution \"Solution\" %}\n", + "\n", + "The `dir(module)` command lists all _names_ (not necessarily functions, not\n", + "necessarily defined by us) contained in a given imported module. We have used the\n", + "`filter()` command to filter out all names starting with two underscores.
Every\n", + "item returned by `dir()` is passed as `x` to the lambda function which returns\n", + "`True` or `False`, determining whether the `filter()` function should keep or\n", + "discard the current element.\n", + "\n", + "{% endsolution %}\n", + "\n", + "{% endchallenge %}\n", + "\n", + "\n", + "## Run a module\n", + "\n", + "We can make a Python module that can be easily imported by other Python\n", + "programs, but we can also make it in a way that it can be run directly as a\n", + "Python script.\n", + "\n", + "Let's write this special module and call it `runnable.py`:\n", + "\n", + "```python\n", + "#!/usr/bin/env python\n", + "\n", + "long_format = False\n", + "\n", + "def print_label(label, msg):\n", + " if long_format:\n", + " out = '{0}: {1}'.format(label.upper(), str(msg))\n", + " else:\n", + " out = '{0}-{1}'.format(label[0].upper(), str(msg))\n", + " print(out)\n", + "\n", + "def debug(msg):\n", + " print_label('debug', msg)\n", + "\n", + "def warning(msg):\n", + " print_label('warning', msg)\n", + "\n", + "if __name__ == '__main__':\n", + " print('*** Testing print functions ***')\n", + " debug('This is a debug message')\n", + " long_format = True\n", + " warning('This is a warning message with a long label')\n", + "else:\n", + " print('Module {0} is being imported'.format(__name__))\n", + "```\n", + "\n", + "Now let's make it executable:\n", + "\n", + "```bash\n", + "$ chmod +x runnable.py\n", + "```\n", + "\n", + "It can now be run as a normal executable from your shell:\n", + "\n", + "```\n", + "$ ./runnable.py\n", + "*** Testing print functions ***\n", + "D-This is a debug message\n", + "WARNING: This is a warning message with a long label\n", + "```\n", + "\n", + "There are two things worth noting here.
First off, the first line is a\n", + "\"shebang\": it really has to be the _first_ line in a file (it cannot be the\n", + "second, or \"one of the first\", or the first non-empty) and it basically tells\n", + "your shell that your executable text file has to be interpreted by the current\n", + "Python interpreter. Just use this line as it is.\n", + "\n", + "Secondly, we notice we have a peculiar `if` condition with a block that gets\n", + "executed when we run the file. `__name__` is a special internal Python variable\n", + "which is set to the module name when the module is imported. When the module\n", + "is run, it is set to the special value `\"__main__\"`.\n", + "\n", + "The `else:` branch we have added is just to show what happens when you import\n", + "the module instead:\n", + "\n", + "```python\n", + ">>> import runnable\n", + "Module runnable is being imported\n", + ">>> runnable.warning('hey I can use it from here too')\n", + "W-hey I can use it from here too\n", + "```\n", + "\n", + "Now, the `if` condition is not necessary when you want to run the module - those\n", + "lines in the `if` block will be executed anyway.
It is however used to _prevent_\n", + "some lines from being executed when you import the file as a module.\n", + "\n", + "Please also note that module imports are typically _silent_, so the `else:`\n", + "condition with a printout would not exist in real life.\n", + "\n", + "\n", + "[stl]: https://docs.python.org/2/library/index.html\n", + "[pep8-import]: https://www.python.org/dev/peps/pep-0008/#imports\n", + "[pep8-modulenames]: https://www.python.org/dev/peps/pep-0008/#package-and-module-names\n", + "[pypi]: https://pypi.org/\n", + "[anaconda]: https://www.anaconda.com/distribution/\n", + "[lcg_virtualenv]: https://gitlab.cern.ch/cburr/lcg_virtualenv/\n" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython" + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/modules.md b/python/modules.md deleted file mode 100644 index 4a318657..00000000 --- a/python/modules.md +++ /dev/null @@ -1,430 +0,0 @@ -# Modules - -Python comes with lots of useful stuff, which is provided with modules -(and submodules, see later). -We have already met the maths module, but did not talk about how we started using it. - -```python ->>> import math ->>> math - ->>> math.pi -3.141592653589793 ->>> math.sin(1) -0.8414709848078965 -``` - -The path after `from` might look different on your computer. - -So, `math` is a _module_, and this seems to behave a lot like other objects we -have met: it is a container with properties and methods attached that we can -access with the dot operator `.`. Actually, that is pretty much all there is to -them. - - -## Using modules into your code: import - -The keyword `import`, usually specified at the beginning of your source code, is -used to tell Python what modules you want to make available to your current -code. - -There are different ways of specifying an import. 
The one we have seen already -simply makes the module available to you: - -```python ->>> import random ->>> random.uniform(0, 1) -0.5877109428927353 -``` - -The module `random` contains functions useful for random number generation: with -the `import` above, we have made the `random` module accessible, and everything -within that module is accessible via the syntax `random.`. For the record, -the `uniform(x,y)` method returns a pseudo-random number within the range -`$ [x,y] $`. - -Sometimes you want to make only one or more things from a given module -accessible: Python gives you the ability to import just those: - -```python ->>> from random import uniform, choice ->>> uniform(0, 1) -0.4059007502204043 ->>> choice([ 33, 56, 42, -1 ]) -42 -``` - -In this case the `uniform` and `choice` names are available _directly_, _i.e._ -without using the `random` prefix. All other functions in the `random` module -are not available in this case. For the record, the `choice` function returns a -random element from a given collection. - -Another option is to import _all_ functions of a certain module and make them -available without a prefix: - -```python ->>> from random import * ->>> gauss(0, 1) --1.639334770284028 -``` - -This is not that recommended as you generally do not know what is the extent -of what you are importing and you might end up with name clashes between your -current code and the imported module, as it will all be in the same namespace, -meaning directly available with no need for a `.` syntax. - -Lastly, it is possible to import modules, or specific names from a module, -under an alias. - -```python ->>> from random import uniform as uni ->>> uni(0, 1) -0.7288973406605329 ->>> import numpy as np -np.arccos(1) -0.0 -``` - -This option is useful when you need to assign shorter aliases to names you will -use frequently. In particular, the alias `np` for the `numpy` module will be -encountered a lot. 
- -Note that modules can have submodules, specified with extra dots `.`: - -```python ->>> from os.path import abspath ->>> abspath('..') -'/afs/cern.ch/user/d' -``` - -When importing a module, its **submodules are not available by default and you -must import them explicitly**: - -```python ->>> import os ->>> os.getcwd() -'/afs/cern.ch/user/d/dberzano' ->>> import os.path ->>> os.path.basename(os.getcwd()) -'dberzano' -``` - -Note that due to the current Python implementation of the `os` module, `os.path` -functions are _actually_ available _even without importing `os.path`. But just -`os`_. You cannot and should not rely on this implementation, which represents an exception -and might change in the future. Always import submodules explicitly! - -It is also possible to import several modules with a single import command: - -```python ->>> import os, sys, math -``` - -but this is [not recommended by the Python style guide][pep8-import], which -suggests to use several import statements, one per module, as it improves -readability: - -```python ->>> import os ->>> import sys ->>> import math -``` - -If you need to import several names from a single module, you can split an import -function over multiple lines: - -```python ->>> from math import ( -... exp, -... log, -... e, -... floor -... ) ->>> floor(exp(log(e))) -2.0 -``` - - -## The standard library - -The set of things that Python comes with, from all of the types of objects to -all of the different modules, is called the [standard library][stl]. It is -recommended to browse through the standard library documentation to see what is -available: Python is rich of standard modules, and you should reuse them as much -as possible instead of rewriting code on your own. - -Some of the categories for which standard modules are available are: - -* processing paths -* date and time manipulation -* mathematical functions -* parsing of certain file formats -* support for multiple processes and threads -* ... 
- -Use standard Python library modules with confidence: being part of any standard -Python distribution, your code will be easily portable. - - -## Modules from PyPi - -Many external modules can be found on [PyPi][pypi], the Python Package Index repository. -Some of those modules are -already part of some Python distributions (such as [Anaconda][anaconda], which -comes with more than a thousand science-oriented modules preinstalled). - -If a certain module you need is not available on your distribution you can -easily install it with the `pip` shell command. Since you typically do not have write -access to the standard Python installation's directories, `pip` allows you to -install modules only for yourself, under your current user's home directory. -It is recommended to set up in your shell startup script (such as `~/.bashrc`) -the following two lines telling once and for all where to install and search for -Python user modules: - -```bash -export PYTHONUSERBASE=$HOME/.local -export PATH=$PYTHONUSERBASE/bin:$PATH -``` - -Once you have done that, close your current terminal window and open a new one, -and you will be ready to use `pip`. We will see in a later lesson how to install -the `root_pandas` module with: - -```bash -pip install --user root_pandas -``` - -## Modules inside a virtual environment - -It is however usually preferable and safer to do everything inside a virtual environement. -The latter is like a copy of your current environement. Thus you can modify your virtual -environement (including installing/deleting/updating modules) without affecting your default -environement. If at some point you realize you have broken everything, you can always exit -the virtual environement and go back to the default lxplus one. 
- -To build a virtual environement based on LCG views, you can use [LCG_virtualenv][lcg_virtualenv]: - -```bash -git clone https://gitlab.cern.ch/cburr/lcg_virtualenv.git -./lcg_virtualenv/create_lcg_virtualenv myVenv -``` -To activate the virtual environement do: - -```bash -source myVenv/bin/activate -``` - -You can then install stuff with `pip`, like for instance `root_pandas`: - -```bash -pip install --upgrade root_pandas matplotlib -python -c 'import pandas; print(f"Got pandas from {pandas.__file__}")' -python -c 'import root_pandas; print(f"Got root_pandas from {root_pandas.__file__}")' -python -c 'import matplotlib; print(f"Got matplotlib from {matplotlib.__file__}")' -``` - -You can go back to the default environement using the `deactivate` command. - - -## Write your first Python module - -The simplest Python module you can write is just a `.py` file with some -functions inside: - -```python -# myfirstmodule.py - -def one(): - print('this is my first function') - -def two(): - print('this is my second function') -``` - -You can now fire an `ipython` shell and use those functions right away: - -```python ->>> import myfirstmodule ->>> myfirstmodule.one() -this is my first function ->>> myfirstmodule.two() -this is my second function -``` - -By simply calling the file `myfirstmodule.py` we have made it available as a -module named `myfirstmodule` - given that the file is in the same directory -where we have launched the Python interpreter. - -{% callout "Module name restrictions" %} - -Note that you cannot pick any name you want for a module! From the -[Python style guide][pep8-modulenames], we gather that we should use "short, -all-lowercase names". 
As a matter of fact, if we used dashes in the file name, -we would have ended up with a syntax error while trying to load it: - -```python ->>> import my-first-module - File "", line 1 - import my-first-module - ^ -SyntaxError: invalid syntax -``` - -Python treats `-` as a minus and does not understand your intentions. - -{% endcallout %} - - -## Write a structured module - -Let's now create a more structured module, with submodules and different files. -We can start from the `myfirstmodule.py` file and create a directory structure: - -```bash -$ mkdir yabba -$ cp myfirstmodule.py yabba/__init__.py -``` - -We have reused the same file created before, copied it into a directory called -`yabba` and renamed it to `__init__.py`. The double underscore should ring a -bell: this is a Python special name, and it represents the "main file" within -a module, whereas the directory name now represents the module name. - -This means that our module is called `yabba`, and if we import it, functions -from `__init__.py` will be available: - -```python ->>> import yabba ->>> yabba.one() -this is my first function ->>> yabba.two() -this is my second function -``` - -We can create an additional file inside the `yabba` directory, say -`yabba/extra.py` and have more functions there: - -```python -# yabba/extra.py - -def three(): - print 'this function will return the number three' - return 3 -``` - -We have effectively made `extra` a submodule of `yabba`. Let's try: - -```python ->>> import yabba ->>> filter(lambda x: not x.startswith('__'), dir(yabba)) -['one', 'two'] ->>> import yabba.extra ->>> yabba.extra.three() -yabba.extra.three() -this function will return the number three -3 -``` - -{% challenge "What have I done with the filter function?" %} - -We have used the filter function above to list the functions we have defined -in our module. Can you describe in detail what the commands above do? 
-{% solution "Solution" %} - -The `dir(module)` command lists all _names_ (not necessarily functions, not -necessarily defined by us) contained in a given imported module. We have used the -`filter()` command to filter out all names starting with two underscores. Every -item returned by `dir()` is passed as `x` to the lambda function which returns -`True` or `False`, determining whether the `filter()` function should keep or -discard the current element. - -{% endsolution %} - -{% endchallenge %} - - -## Run a module - -We can make a Python module that can be easily imported by other Python -programs, but we can also make it in a way that it can be run directly as a -Python script. - -Let's write this special module and call it `runnable.py`: - -```python -#!/usr/bin/env python - -long_format = False - -def print_label(label, msg): - if long_format: - out = '{0}: {1}'.format(label.upper(), str(msg)) - else: - out = '{0}-{1}'.format(label[0].upper(), str(msg)) - print out - -def debug(msg): - print_label('debug', msg) - -def warning(msg): - print_label('warning', msg) - -if __name__ == '__main__': - print '*** Testing print functions ***' - debug('This is a debug message') - long_format = True - warning('This is a warning message with a long label') -else: - print 'Module {0} is being imported'.format(__name__) -``` - -Now let's make it executable: - -```bash -$ chmod +x runnable.py -``` - -It can be now run as a normal executable from your shell: - -``` -$ ./runnable.py -*** Testing print functions *** -D-This is a debug message -WARNING: This is a warning message with a long label -``` - -There are two outstanding notions here. First off, the first line is a -"shebang": it really has to be the _first_ line in a file (it cannot be the -second, or "one of the first", or the first non-empty) and it basically tells -your shell that your executable text file has to be interpreted by the current -Python interpreter. Just use this line as it is. 
- -Secondly, we notice we have a peculiar `if` condition with a block that gets -executed when we run the file. `__name__` is a special internal Python variable -which is set to the module name in case the module is imported. When the module -is ran, it is set to the special value `"__main__"`. - -The `else:` condition we have added is just to show what happens when you import -the module instead: - -```python ->>> import runnable -Module runnable is being imported ->>> runnable.warning('hey I can use it from here too') -W-hey I can use it from here too -``` - -Now, the `if` condition is not necessary when you want to run the module - those -lines in the `if` block will be executed anyway. It is however used to _prevent_ -some lines from being executed when you import the file as a module. - -Please also note that module imports are typically _silent_, so the `else:` -condition with a printout would not exist in real life. - - -[stl]: https://docs.python.org/2/library/index.html -[pep8-import]: https://www.python.org/dev/peps/pep-0008/#imports -[pep8-modulenames]: https://www.python.org/dev/peps/pep-0008/#package-and-module-names -[pypi]: https://pypi.org/ -[anaconda]: https://www.anaconda.com/distribution/ -[lcg_virtualenv]: https://gitlab.cern.ch/cburr/lcg_virtualenv/ diff --git a/python/numbers.ipynb b/python/numbers.ipynb new file mode 100644 index 00000000..d622607d --- /dev/null +++ b/python/numbers.ipynb @@ -0,0 +1,293 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Numbers\n", + "\n", + "There’s nothing magical about numbers in Python, and we’ve already discovered \n", + "how we perform operations on them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "(2 * (1 + 3) - 5) / 0.5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "11 % 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Python also lets you manipulate complex numbers, using `j` to represent the \n", + "complex term." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a = 1 + 4j\n", + "b = 4 - 1j\n", + "a - b" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Complex numbers are objects, of course, and have some useful functions and \n", + "properties attached to them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a.conjugate()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a.imag" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a.real" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Somewhat confusingly, computing the magnitude of a complex number can be done \n", + "with the `abs` method, which is available globally." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "abs(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "(a.real**2 + a.imag**2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "(a.real**2 + a.imag**2) ** 0.5" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This also demonstrates the `**` operator, which for real numbers corresponds to \n", + "exponentiation.\n", + "\n", + "Each type of number can be created _literally_, like we’ve been doing, by just \n", + "typing the number into your shell or source code, and by using the corresponding \n", + "methods." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "int()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "float()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "complex()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython" + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/numbers.md b/python/numbers.md deleted file mode 100644 index d3c2aaf4..00000000 --- a/python/numbers.md +++ /dev/null @@ -1,135 +0,0 @@ -# Numbers - -There’s nothing magical about numbers in Python, and we’ve already discovered -how we perform operations on them. - -```python ->>> (2 * (1 + 3) - 5) / 0.5 -2 ->>> 11 % 4 -3 -``` - - -{% callout "Integer division in Python 2" %} - - -If for any reason (e.g. you want to use LHCb or Alice software) you have to use Python 2, -beware of that Python 2 has a few different _types_ of numbers, and they can -behave differently. - -```python ->>> 10/3 -3 ->>> 10.0/3.0 -3.3333333333333335 -``` - -Interesting. Something different happens when we use numbers with and without -decimal places! 
This occurs because numbers given with decimal places, like -`3.14` are _floats_, while those without, like `3`, are _integers_. - -For historical reasons, dividing two integers in Python 2 returns an integer, -where the intermediate result is always rounded down. Division using _at least -one_ float gives us the more intuitive answer. - -In Python 3, division with integers works the same way as with floats. You can -ask to have this behaviour in Python 2. - -```python ->>> from __future__ import division ->>> 3 / 4 -0.75 ->>> 3.0 / 4.0 -0.75 -``` - -Because the default behaviour in Python 2 is quite unintuitive, we recommend -using the `from __future__ import division` line everywhere. We’ll come to what exactly this -line is doing shortly. - -If you _do_ want a rounding division, you then can ask for it explicitly with -round, or, if you want an integer division (truncating, rounding towards 0): - -```python ->>> 5 / 3 -1.66666666667 - ->>> round(5 / 3) -2 - ->>> 5 // 3 -1 -``` - -{% endcallout %} - -{% callout "Operators" %} - -This behaviour can be explained in terms of operators and the double-underscore -methods. You can see that numbers have two methods for division: - -```python ->>> dir(1) -[..., - '__floordiv__', - ... - '__truediv__', - ...] -``` - -In Python 2, the `/` operator corresponded to the `__floordiv__` method when -used with integers, but the `__truediv__` operator when used with floats. In -Python 3, and when using the `from __future__ import division` line, the `/` -operator always uses the `__truediv__` method. - -{% endcallout %} - -Python also lets you manipulate complex numbers, using `j` to represent the -complex term. - -```python ->>> a = 1 + 4j ->>> b = 4 - 1j ->>> a - b -(-3+5j) -``` - -Complex numbers are objects, of course, and have some useful functions and -properties attached to them. 
- -```python ->>> a.conjugate() -(1-4j) ->>> a.imag -4.0 ->>> a.real -1.0 -``` - -Somewhat confusingly, computing the magnitude of a complex number can be done -with the `abs` method, which is available globally. - -```python ->>> abs(a) -4.123105625617661 ->>> import numpy as np ->>> np.sqrt(a.real**2 + a.imag**2) -4.123105625617661 -``` - -This also demonstrates the `**` operator, which for real numbers corresponds to -exponentiation. - -Each type of number can be created _literally_, like we’ve been doing, by just -typing the number into your shell or source code, and by using the correspond -methods. - -```python ->>> int() -0 ->>> float() -0.0 ->>> complex() -0j -``` diff --git a/python/strings.ipynb b/python/strings.ipynb new file mode 100644 index 00000000..79f8e71f --- /dev/null +++ b/python/strings.ipynb @@ -0,0 +1,689 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Strings\n", + "\n", + "Number objects are useful for storing values which are, well, numbers. But what \n", + "if we want to store a sentence? Enter _strings_!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "a = \"What's orange and sounds like a parrot?\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Strings can be joined with `+`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "b = 'A carrot'\n", + "a + ' ' + b" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "And they can be multiplied by numbers, amazingly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "c = 'omg'\n", + "10 * c" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We’ve specified strings _literally_, _in_ the source code, by wrapping the text \n", + "with single quotes or double quotes. There’s no difference; most people choose \n", + "one and stick with it.\n", + "\n", + "It can be useful to change if your text contains the quote character. If it \n", + "contains both, you can _escape_ the quote mark by preceding it with a \n", + "backslash. This tells Python that the quote is part of the string you want, and \n", + "not the ending quote." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "fact = \"Gary's favourite word is \\\"python\\\".\"\n", + "fact" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Python prints strings by surrounding them with _single_ quotes, so it escapes \n", + "the single quotes in our string. 
This is useful because we can copy-paste the \n", + "string into some Python code to use it somewhere else, without having to worry \n", + "about escaping things.\n", + "\n", + "We can create multi-line strings by using three quotation marks. \n", + "Conventionally, double quotations are usually used for these." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "long_fact = \"\"\"This is a long string.\n", + "Quite long indeed.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(long_fact)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Creating strings like this is useful when you want to include line breaks in \n", + "your string. You can also use `\\n` in strings to insert line breaks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "'This is a long string\\n\\nQuite long indeed.\\n'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We can convert things to strings by using the `str` method, which can also \n", + "create an _empty_ string for us." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "'A number: ' + str(999 - 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Strings are objects, and have lots of useful methods attached to them. If you \n", + "want to know how many characters are in a string, you use the global `len` \n", + "method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "b.upper()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "b.upper().lower()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "b.replace('carrot', 'parrot').replace(' ', '_')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "len(b)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "b" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Notice that none of these operations _modify_ the 
value of the `b` variable. \n", + "Operations on strings _always_ return _new_ strings. Strings are said to be \n", + "_immutable_ for this reason: you can never change a string, just make new ones." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Formatting\n", + "\n", + "One of the most common things you’ll find yourself doing with strings is \n", + "interleaving values into them. For example, you’ve finished an amazing \n", + "analysis, and want to print the results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "result1 = 42.0\n", + "result2 = 123.21\n", + "print('My results are: ' + str(result1) + ', ' + str(result2))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "\n", + "This is already quite ugly, and will only get worse with more results. We can \n", + "instead use an `f-string`: placing an `f` in front of the string (a \"formatted\" string) \n", + "lets us use the special `{}` placeholders to say where we want the values to go in the string." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "output = f'My results are: {result1} {result2}'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Instead, we can also just create a string without the `f` in front and later insert values. This is not only the more historical method, it also provides the ability to template a string and then use the `format` method that’s available on strings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "template = 'My results are: {}, {}'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(template.format(result1, result2))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Much better! We define the whole string at once, and then place the missing \n", + "values in later.\n", + "\n", + "We can add numbers inside the placeholders, `{0}` and `{1}`, which correspond to the indices \n", + "of the arguments passed to the `format` method, where `0` is the first \n", + "argument, `1` is the second, and so on. By referencing positions like this, we \n", + "can easily repeat placeholders in the string, but only pass the values once to \n", + "`format`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "template = 'My results are: {1}, {0}. The best is {0}, obviously.' # no need to start with 0 here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(template.format(result1, result2))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "You can also use _named_ placeholders, then passing the values to `format` \n", + "using the same name." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "template3 = 'My results are: {best}, {worst}. But the best is {best}, obviously.'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(template3.format(best=result1, worst=result2))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "But remember the `f-string`: if we don't need to do anything fancy, it's a lot more convenient to use (and it covers 99% of use cases)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "f'My results are: {result1}, {result2}. But the best is {result1}, obviously.'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(template3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "This is nice because it gives more meaning to what the placeholders are for.\n", + "\n", + "There’s [a lot you can do inside the placeholders](https://docs.python.org/3/tutorial/inputoutput.html#the-string-format-method), such as specifying that you want to format a number with a certain number of decimal places." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(f'This number is great: {result1:.3f}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "If you want to print a literal curly brace using `format` or an f-string, you will need to\n", + "escape it by doubling it, so that `{{` will become `{` and `}}` will become `}`.\n", + "Here's an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(f'This number will be surrounded by curly braces: {{{result1}}}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The innermost `{result1}` is replaced with the number, and `{{...}}` becomes `{...}`." + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython" + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/python/strings.md b/python/strings.md deleted file mode 100644 index 00020744..00000000 --- a/python/strings.md +++ /dev/null @@ -1,184 +0,0 @@ -# Strings - -Number objects are useful for storing values which are, well, numbers. But what -if we want to store a sentence? Enter _strings_! - -```python ->>> a = "What's orange and sounds like a parrot?" -``` - -Strings can be joined with `+`. - -```python ->>> b = 'A carrot' ->>> a + ' ' + b -'What's orange and sounds like a parrot? A carrot' -``` - -And they can be multiplied by numbers, amazingly. 
- -```python ->>> c = 'omg' ->>> 10*c -'omgomgomgomgomgomgomgomgomgomg' -``` - -We’ve specified strings _literally_, _in_ the source code, by wrapping the text -with singles quotes or double quotes. There’s no difference; most people choose -one and stick with it. - -It can be useful to change if your text contains the quote character. If it -contains both, you can _escape_ the quote mark by preceding it with a -backslash. This tells Python that the quote is part of the string you want, and -not the ending quote. - -```python ->>> fact = "Gary's favourite word is \"python\"." ->>> fact -'Gary\'s favourite word is "python".' -``` - -Python prints strings by surrounding them with _single_ quotes, so it escapes -the single quotes in our string. This is useful because we can copy-paste the -string into some Python code to use it somewhere else, without having to worry -about escaping things. - -We can create multi-line strings by using three quotation marks. -Conventionally, double quotations are usually used for these. - -```python ->>> long_fact = """This is a long string. -... -... Quite long indeed. -... """ ->>> print(long_fact) -This is a long string. - -Quite long indeed. - ->>> -``` - -Creating strings like this is useful when you want to include line breaks in -your string. You can also use `\n` in strings to insert line breaks. - -```python ->>> 'This is a long string\n\nQuite long indeed.\n' -``` - -We can convert things to strings by using the `str` method, which can also -create an _empty_ string for us. - -```python ->>> str() -'' ->>> 'A number: ' + str(999 - 1) -'A number: 998' -``` - -Strings are objects, and have lots of useful methods attached to them. If you -want to know how many characters are in a string, you use the global `len` -method. 
- -```python ->>> b.upper() -'A CARROT' ->>> b.upper().lower() -'a carrot' ->>> b.replace('carrot', 'parrot').replace(' ', '_') -'A_parrot' ->>> len(b) -8 ->>> b -'A carrot' -``` - -Notice that none of these operations _modify_ the value of the `b` variable. -Operations on strings _always_ return _new_ strings. Strings are said to be -_immutable_ for this reason: you can never change a string, just make new ones. - -## Formatting - -One of the most common things you’ll find yourself doing with strings is -interleaving values into them. For example, you’ve finished an amazing -analysis, and want to print the results. - -```python ->>> result1 = 123.0 ->>> result2 = 122.3 ->>> print('My results are: ' + str(result1) + ', ' + str(result2)) -My results are: 123.0, 122.3 -``` - -This is already quite ugly, and will only get worse with more results. We can -instead use the `format` method that’s available on strings, and use the -special `{}` placeholders to say where we want the values to go in the string. - -```python ->>> template = 'My results are: {0}, {1}' ->>> print(template.format(result1, result2)) -My results are: 123.0, 122.3 -``` - -Much better! We define the whole string at once, and then place the missing -values in later. - -The numbers inside the placeholders, `{0}` and `{1}`, correspond to the indices -of the arguments passed to the `format` method (but are not mandatory in newer Python), where `0` is the first -argument, `1` is the second, and so on. By referencing positions like this, we -can easily repeat placeholders in the string, but only pass the values once to -`format`. - -```python ->>> template2 = 'My results are: {0}, {1}. But the best is {0}, obviously.' ->>> print(template2.format(result1, result2)) -My results are: 123.0, 122.3. But the best is 123.0, obviously. -``` - -You can also use _named_ placeholders, then passing the values to `format` -using the same name. - -```python ->>> template3 = 'My results are: {best}, {worst}. 
But the best is {best}, obviously.' ->>> print(template3.format(best=result1, worst=result2)) -My results are: 123.0, 122.3. But the best is 123.0, obviously. -``` - -We can do even better! With Python 3.6+, there are so called f-strings that allow to directly enter a Python expression into the brackets. The syntax is to add an `f` in front of the string. - - -```python ->>> template3 = f'My results are: {result1}, {result2}. But the best is {result1}, obviously.' ->>> print(template3) -My results are: 123.0, 122.3. But the best is 123.0, obviously. -``` - - -This is nice because it gives more meaning to what the placeholders are for. - -There’s [a lot you can do inside the placeholders][strformat], such as specifying that you want to format a number with a certain number of decimal places. - -```python ->>> print('This number is great: {0:.3f}'.format(result1)) -This number is great: 123.000 -``` - -The same works again with f-strings. - -```python ->>> print(f'This number is great: {result1:.3f}') -This number is great: 123.000 -``` - -If you want to print a literal curly brace using `format`, you will need to -escape it by doubling it, so that `{{` will become `{` and `}}` will become `}`. -Here's an example: - -```python ->>> print('This number will be surrounded by curly braces: {{{0}}}'.format(123)) -This number will be surrounded by curly braces: {123} -``` - -The innermost `{0}` is replaced with the number, and `{{...}}` becomes `{...}`. 
- -[strformat]: https://pyformat.info/ diff --git a/shell/fig/cern_change_shell.png b/shell/fig/cern_change_shell.png index e3183835..bab34f00 100644 Binary files a/shell/fig/cern_change_shell.png and b/shell/fig/cern_change_shell.png differ diff --git a/shell/fig/nano-screenshot.png b/shell/fig/nano-screenshot.png index 50fb1710..9aca8356 100644 Binary files a/shell/fig/nano-screenshot.png and b/shell/fig/nano-screenshot.png differ diff --git a/shell/fig/redirects-and-pipes.png b/shell/fig/redirects-and-pipes.png index 3c595a59..a3332300 100644 Binary files a/shell/fig/redirects-and-pipes.png and b/shell/fig/redirects-and-pipes.png differ diff --git a/snakemake/img/DAG_multiple.png b/snakemake/img/DAG_multiple.png index 66d4c4b0..430ab062 100644 Binary files a/snakemake/img/DAG_multiple.png and b/snakemake/img/DAG_multiple.png differ diff --git a/snakemake/img/DAG_single-wide.png b/snakemake/img/DAG_single-wide.png index 1b2be4ac..7908d80f 100644 Binary files a/snakemake/img/DAG_single-wide.png and b/snakemake/img/DAG_single-wide.png differ diff --git a/snakemake/img/DAG_single.png b/snakemake/img/DAG_single.png index 7b6e80f4..aae2b33a 100644 Binary files a/snakemake/img/DAG_single.png and b/snakemake/img/DAG_single.png differ diff --git a/snakemake/img/Reporting_DAG.png b/snakemake/img/Reporting_DAG.png index 14359aaa..c30ed7f3 100644 Binary files a/snakemake/img/Reporting_DAG.png and b/snakemake/img/Reporting_DAG.png differ diff --git a/snakemake/img/Reporting_rule.png b/snakemake/img/Reporting_rule.png index dccd94ce..3060de92 100644 Binary files a/snakemake/img/Reporting_rule.png and b/snakemake/img/Reporting_rule.png differ diff --git a/snakemake/img/Reporting_stats.png b/snakemake/img/Reporting_stats.png index a3f71b04..508110e4 100644 Binary files a/snakemake/img/Reporting_stats.png and b/snakemake/img/Reporting_stats.png differ