diff --git a/.quarto/_freeze/site_libs/clipboard/clipboard.min.js b/.quarto/_freeze/site_libs/clipboard/clipboard.min.js index 41c6a0f..1103f81 100644 --- a/.quarto/_freeze/site_libs/clipboard/clipboard.min.js +++ b/.quarto/_freeze/site_libs/clipboard/clipboard.min.js @@ -1,7 +1,7 @@ /*! - * clipboard.js v2.0.10 + * clipboard.js v2.0.11 * https://clipboardjs.com/ * * Licensed MIT © Zeno Rocha */ -!function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.ClipboardJS=e():t.ClipboardJS=e()}(this,function(){return n={686:function(t,e,n){"use strict";n.d(e,{default:function(){return o}});var e=n(279),i=n.n(e),e=n(370),u=n.n(e),e=n(817),c=n.n(e);function a(t){try{return document.execCommand(t)}catch(t){return}}var f=function(t){t=c()(t);return a("cut"),t};var l=function(t){var e,n,o,r=1 - - - - - - -
-
-
- UoC Bioinformatics Training Facility -

Licensed CC BY 4.0

-
-
- - - - - - diff --git a/_extensions/cambiotraining/courseformat/img/university_crest_reversed 2.png b/_extensions/cambiotraining/courseformat/img/university_crest_reversed 2.png deleted file mode 100644 index 7ae0f63..0000000 Binary files a/_extensions/cambiotraining/courseformat/img/university_crest_reversed 2.png and /dev/null differ diff --git a/_extensions/cambiotraining/courseformat/theme 2.scss b/_extensions/cambiotraining/courseformat/theme 2.scss deleted file mode 100644 index e4fbccc..0000000 --- a/_extensions/cambiotraining/courseformat/theme 2.scss +++ /dev/null @@ -1,94 +0,0 @@ -/*-- Importing fa icons --*/ -@import url("https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css"); - - -/*-- scss:defaults --*/ - -// UoC green colour scheme (from dark to light) -// #003a41 -// #0c5963 -// #106470 -// #28828A -$navbar-bg: #106470; -$navbar-fg: white; -$body-color: black; -$link-color: #003a41; - -//make font size a bit larger than default -// $font-size-root: 1.2rem; -// $toc-font-size: 0.9rem; -// $sidebar-font-size: 0.9rem; -// $sidebar-font-size-section: 0.9rem !default; - -//tweak callout colours to match our scheme -//based on https://coolors.co/28828a-106470-003a41-d62828-f77f00 -$callout-color-note: #C6C5B9; -$callout-color-tip: #28828A; -$callout-color-caution: #D62828; -$callout-color-warning: #D62828; -$callout-color-important: #28828A; - - -/*-- scss:rules --*/ - -// headings colour -title, h1, h2, h3, h4, h5, h6 { - color: #003a41; -} - -// active section heading - affects content navigation bar on the left -div.sidebar-item-container .active { - font-weight: bold; -} - -// active links - affects header and TOC navigation bar -.sidebar .nav-link.active { - font-weight: bold; -} - -// active links on the header specifically -.navbar-dark .navbar-nav .nav-link.active { - color: white; - text-decoration: underline; -} - -// links on the header upon hover or after visited -.navbar-dark .navbar-nav .nav-link:hover, .navbar-dark 
.navbar-nav .nav-link:visited { - text-decoration: underline; - color: white; -} - -// font size on footer - make larger than default -.nav-footer, .nav-footer-left, .nav-footer-right, .nav-footer-center { - font-size: 1rem; -} - -// figure display -.figure { - text-align: center; - text-indent: 0; - border-bottom: 1px solid #dee2e6; -} - -.navbar-title { - padding-left: 20px; -} - - -// Download button - -// Style buttons - .btn { - background-color: #106470; - border: none; - color: white; - padding: 12px 30px; - cursor: pointer; - font-size: 20px; -} - -// Darker background on mouse-over -.btn:hover { - color: white; - background-color: #003a41; -} \ No newline at end of file diff --git a/_extensions/cambiotraining/courseformat/theme.scss b/_extensions/cambiotraining/courseformat/theme.scss index 2837ffc..5748643 100644 --- a/_extensions/cambiotraining/courseformat/theme.scss +++ b/_extensions/cambiotraining/courseformat/theme.scss @@ -28,6 +28,7 @@ $callout-color-caution: #D62828; $callout-color-warning: #D62828; $callout-color-important: #28828A; + /*-- scss:rules --*/ // headings colour @@ -40,13 +41,6 @@ div.sidebar-item-container .active { font-weight: bold; } -// part titles text bold -div.sidebar-item-container .text-start { - color: #003a41; - font-weight: bold; - font-size: 0.8rem; -} - // active links - affects header and TOC navigation bar .sidebar .nav-link.active { font-weight: bold; diff --git a/_extensions/cambiotraining/fontawesome/assets/css/latex-fontsize 2.css b/_extensions/cambiotraining/fontawesome/assets/css/latex-fontsize 2.css deleted file mode 100644 index 45545ec..0000000 --- a/_extensions/cambiotraining/fontawesome/assets/css/latex-fontsize 2.css +++ /dev/null @@ -1,30 +0,0 @@ -.fa-tiny { - font-size: 0.5em; -} -.fa-scriptsize { - font-size: 0.7em; -} -.fa-footnotesize { - font-size: 0.8em; -} -.fa-small { - font-size: 0.9em; -} -.fa-normalsize { - font-size: 1em; -} -.fa-large { - font-size: 1.2em; -} -.fa-Large { - font-size: 1.5em; 
-} -.fa-LARGE { - font-size: 1.75em; -} -.fa-huge { - font-size: 2em; -} -.fa-Huge { - font-size: 2.5em; -} diff --git a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-brands-400 2.woff2 b/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-brands-400 2.woff2 deleted file mode 100644 index 4d904aa..0000000 Binary files a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-brands-400 2.woff2 and /dev/null differ diff --git a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-regular-400 2.ttf b/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-regular-400 2.ttf deleted file mode 100644 index 23e3feb..0000000 Binary files a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-regular-400 2.ttf and /dev/null differ diff --git a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-regular-400 2.woff2 b/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-regular-400 2.woff2 deleted file mode 100644 index 80e3b12..0000000 Binary files a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-regular-400 2.woff2 and /dev/null differ diff --git a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-solid-900 2.ttf b/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-solid-900 2.ttf deleted file mode 100644 index da90824..0000000 Binary files a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-solid-900 2.ttf and /dev/null differ diff --git a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-solid-900 2.woff2 b/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-solid-900 2.woff2 deleted file mode 100644 index 360ba11..0000000 Binary files a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-solid-900 2.woff2 and /dev/null differ diff --git a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-v4compatibility 2.ttf b/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-v4compatibility 2.ttf deleted file mode 100644 index e9545ed..0000000 Binary files 
a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-v4compatibility 2.ttf and /dev/null differ diff --git a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-v4compatibility 2.woff2 b/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-v4compatibility 2.woff2 deleted file mode 100644 index db5b0b9..0000000 Binary files a/_extensions/cambiotraining/fontawesome/assets/webfonts/fa-v4compatibility 2.woff2 and /dev/null differ diff --git a/_extensions/cambiotraining/fontawesome/fontawesome 2.lua b/_extensions/cambiotraining/fontawesome/fontawesome 2.lua deleted file mode 100644 index 3939135..0000000 --- a/_extensions/cambiotraining/fontawesome/fontawesome 2.lua +++ /dev/null @@ -1,77 +0,0 @@ -local function ensureLatexDeps() - quarto.doc.useLatexPackage("fontawesome5") -end - -local function ensureHtmlDeps() - quarto.doc.addHtmlDependency({ - name = 'fontawesome6', - version = '0.1.0', - stylesheets = {'assets/css/all.css', 'assets/css/latex-fontsize.css'} - }) -end - -local function isEmpty(s) - return s == nil or s == '' -end - -local function isValidSize(size) - local validSizes = { - "tiny", - "scriptsize", - "footnotesize", - "small", - "normalsize", - "large", - "Large", - "LARGE", - "huge", - "Huge" - } - for _, v in ipairs(validSizes) do - if v == size then - return size - end - end - return "" -end - -return { - ["fa"] = function(args, kwargs) - - local group = "solid" - local icon = pandoc.utils.stringify(args[1]) - if #args > 1 then - group = icon - icon = pandoc.utils.stringify(args[2]) - end - - local title = pandoc.utils.stringify(kwargs["title"]) - if not isEmpty(title) then - title = " title=\"" .. title .. "\"" - end - - local size = pandoc.utils.stringify(kwargs["size"]) - - -- detect html (excluding epub which won't handle fa) - if quarto.doc.isFormat("html:js") then - ensureHtmlDeps() - if not isEmpty(size) then - size = " fa-" .. 
size - end - return pandoc.RawInline( - 'html', - "" - ) - -- detect pdf / beamer / latex / etc - elseif quarto.doc.isFormat("pdf") then - ensureLatexDeps() - if isEmpty(isValidSize(size)) then - return pandoc.RawInline('tex', "\\faIcon{" .. icon .. "}") - else - return pandoc.RawInline('tex', "{\\" .. size .. "\\faIcon{" .. icon .. "}}") - end - else - return pandoc.Null() - end - end -} diff --git a/_freeze/materials/glm-intro-glm/execute-results/html.json b/_freeze/materials/glm-intro-glm/execute-results/html.json index 559edc6..5b506cb 100644 --- a/_freeze/materials/glm-intro-glm/execute-results/html.json +++ b/_freeze/materials/glm-intro-glm/execute-results/html.json @@ -1,7 +1,8 @@ { - "hash": "982eda39568f3c70e84858134c3a3b54", + "hash": "1f7d400d1343625e53730f9d897d028e", "result": { - "markdown": "---\ntitle: \"Generalising your model\"\n---\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n\n## Putting the \"G\" into GLM\n\nIn the previous linear model example all the assumptions were met. But what if we have data where that isn't the case? For example, what if we have data where we _can't_ describe the relationship between the predictor and response variables in a linear way?\n\nOne of the ways we can deal with this is by using a **generalised linear model**, also abbreviated as GLM. In a way it's an extension of the linear model we discussed in the previous section. As with the normal linear model, the predictor variables in the model are in a linear combination, such as:\n\n$$\n\\beta_0 + \\beta_1X_1 + \\beta_2X_2 + \\beta_3X_3 + ...\n$$\n\nHere, the $\\beta_0$ value is the constant or intercept, whereas each subsequent $\\beta_i$ is a unique regression coefficient for each $X_i$ predictor variable. So far so good.\n\nHowever, the GLM makes the linear model more flexible in two ways:\n\n:::{.callout-important}\n1. In a standard linear model the linear combination (e.g. like we see above) becomes the predicted outcome value. 
With a GLM a transformation is specified, which turns the linear combination into the predicted outcome value. This is called a **link function**.\n2. A standard linear model assumes a continuous, normally distributed outcome, whereas **with GLM the outcome can be both continuous or integer**. Furthermore, the outcome does not have to be normally distributed. Indeed, **the outcome can follow a different kind of distribution**, such as binomial, Poisson, exponential etc.\n:::\n\nWe'll introduce each of these elements below, then illustrate how they are used in practice, using different types of data.\n\nThe link function and different distributions are closely...err, _linked_. To make sense of what the link function is doing it's useful to understand the different distributional assumptions. So we'll start with those.\n\n## Distributions\n\nIn the examples of a standard linear model we've seen that the residuals needed to be normally distributed. We've mainly used the Q-Q plot to assess this assumption of normality.\n\nBut what does \"normal\" actually mean? It assumes that the residuals are coming from a normal or Gaussian distribution. This distribution has a symmetrical bell-shape, where the centre is the mean, and half of the data are on either side of this.\n\nWe can see this in @fig-normdist. The mean of the normal distribution is indicated with the dashed blue line.\n\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n::: {.cell-output-display}\n![Normal distribution](glm-intro-glm_files/figure-html/fig-normdist-1.png){#fig-normdist width=672}\n:::\n:::\n\n\nWe can use the linear model we created previously, where we looked at the possible linear relationship between beak depth and beak length. This is based on measurements of _G. 
fortis_ beaks in 1975.\n\nThe individual values of the residuals from this linear model are shown in @fig-normdist, panel B (in red), with the corresponding theoretical normal distribution in the background. We can see that the residuals follow this distribution reasonably well, which matches our conclusions from the Q-Q plot (see @fig-fortis1975_lm_dgplots).\n\nAll this means is that assuming that these residuals may come from a normal distribution isn't such a daft suggestion after all.\n\nNow look at the example in @fig-beak_shape. This shows the classification of beak shape for a number of finches. Their beaks are either classed as `blunt` or `pointed`. Various (continuous) measurements were taken from each bird, with the beak length shown here.\n\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n::: {.cell-output-display}\n![Classification in beak shape](glm-intro-glm_files/figure-html/fig-beak_shape-1.png){#fig-beak_shape width=672}\n:::\n:::\n\n\nWe'll look into this example in more detail later. For now it's important to note that the response variable (the beak shape classification) is not continuous. Here it is a binary response (`blunt` or `pointed`). As a result, the assumptions for a regular linear model go out of the window. If we were foolish enough to fit a linear model to these data (see blue line in A), then the residuals would look rather non-normal (@fig-beak_shape B).\n\nSo what do we do? Well, the normal distribution is not the only one there is. In @fig-dist_examples there are a few examples of distributions (including the normal one).\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Different distributions](glm-intro-glm_files/figure-html/fig-dist_examples-1.png){#fig-dist_examples width=672}\n:::\n:::\n\n\nDifferent distributions are useful for different types of data. For example, a logistic distribution is particularly useful in the context of binary or proportional response data. 
The Poisson distribution is useful when we have count data as a response.\n\nIn order to understand how this can help us, we need to be aware of two more concepts: **linear predictors** and **link functions**.\n\n## Linear predictors\n\nThe nice thing about linear models is that the predictors are, well, linear. Straight lines make for easy interpretation of any potential relationship between predictor and response.\n\nAs mentioned before, predictors are in the form of a _linear combination_, where each predictor variable is multiplied by a coefficient and all the terms are added together:\n\n$$\n\\beta_0 + \\beta_1X_1 + \\beta_2X_2 + \\beta_3X_3 + ...\n$$\n\nFortunately, this is no different for generalised linear models! We still have a linear combination but, as we'll see, if the relationship is not linear then we need an additional step before we can model the data in this way.\n\nAt this point, we have two options at our disposal (well, there are more, but let's not muddy the waters too much).\n\n:::{.callout-important}\n\n1. Transform our data and use a normal linear model on the transformed data\n2. Transform the linear predictor\n:::\n\nThe first option, to **transform our data**, seems like a useful option and can work. It keeps things familiar (we'd still use a standard linear model) and so all is well with the world. Up to the point of interpreting the data. If we, for example, log-transform our data, how do we interpret this? After all, the predictions of the linear model are directly related to the outcome or response variable. Transforming the data is usually done so that the residuals of the linear model resemble a more normal distribution. An unwanted side-effect of this is that this also changes the ratio scale properties of the measured variables [@stevens1946].\n\nThe second option would be to **transform the linear predictor**. This enables us to map a *non-linear* outcome (or response variable) to a *linear* model. 
This transformation is done using a **link function**.\n\n## Link functions\n\nSimply put: link functions connect the predictors in our model to the response variables in a linear way. \n\nHowever, and similar to the standard linear model, there are two parts to each model:\n\n1. the coefficients for each predictor (linking each parameter to a predictor)\n2. the error or random component (which specifies a probability distribution)\n\nWhich link function you use depends on your analysis. Some common link functions and corresponding distributions are (adapted from [@glen2021]):\n\n| distribution | data type | link name |\n|--------------|---------------------|-----------|\n| binomial | binary / proportion | logit |\n| normal | any real number | identity |\n| poisson | count data | log |\n\n\nLet's again look at the earlier example of beak shape.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Beak shape classification](glm-intro-glm_files/figure-html/fig-beak_shapeclass-1.png){#fig-beak_shapeclass width=672}\n:::\n:::\n\n\nWe've seen the data in @fig-beak_shapeclass A before, where we had information on what beak shape our observed finches had, plotted against their beak length.\n\nLet's say we now want to make some **predictions** about what beak shape we would expect, *given* a certain beak length. In this scenario we'd need some way of modelling the response variable (beak shape; `blunt` or `pointed`) as a function of the predictor variable (beak length).\n\nThe issue we have is that the response variable is not continuous, but binary! We could fit a standard linear model to these data (blue line in @fig-beak_shape A) but this is really bad practice. Why? 
Well, what such a linear model represents is *the probability* - or how likely it is - that an observed finch has a pointed beak, given a certain beak length (@fig-beak_shapeclass B).\n\nSimply fitting a linear line through those data suggests that it is possible to have a higher than 1 and lower-than-zero probability that a beak would be pointed! That, of course, makes no sense. So, we can't describe these data as a linear relationship.\n\nInstead, we'll use a *logistic model* to analyse these data. We'll cover the practicalities of how to do this in more detail in a later chapter, but for now it's sufficient to realise that one of the ways we could model these data could look like this:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Logistic model for beak classification](glm-intro-glm_files/figure-html/fig-beak_shape_glm-1.png){#fig-beak_shape_glm width=672}\n:::\n:::\n\n\nUsing this sigmoidal curve ensures that our predicted probabilities do not exceed the $[0, 1]$ range.\n\nNow, what happened behind the scenes is that the generalised linear model has taken the linear predictor and transformed it using the *logit* link function. This links the non-linear response variable (beak shape) to a linear model, using beak length as a predictor.\n\nWe'll practice how to perform this analysis in the next section.\n\n## Key points\n\n::: {.callout-note}\n- GLMs allow us to map a non-linear outcome to a linear model\n- The link function determines *how* this occurs, transforming the linear predictor\n:::", + "engine": "knitr", + "markdown": "---\ntitle: \"Generalise your model\"\n---\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n\n## Putting the \"G\" into GLM\n\nIn the previous linear model example all the assumptions were met. But what if we have data where that isn't the case? 
For example, what if we have data where we _can't_ describe the relationship between the predictor and response variables in a linear way?\n\nOne of the ways we can deal with this is by using a **generalised linear model**, also abbreviated as GLM. In a way it's an extension of the linear model we discussed in the previous section. As with the normal linear model, the predictor variables in the model are in a linear combination, such as:\n\n$$\n\\beta_0 + \\beta_1X_1 + \\beta_2X_2 + \\beta_3X_3 + ...\n$$\n\nHere, the $\\beta_0$ value is the constant or intercept, whereas each subsequent $\\beta_i$ is a unique regression coefficient for each $X_i$ predictor variable. So far so good.\n\nHowever, the GLM makes the linear model more flexible in two ways:\n\n:::{.callout-important}\n1. In a standard linear model the linear combination (e.g. like we see above) becomes the predicted outcome value. With a GLM a transformation is specified, which turns the linear combination into the predicted outcome value. This is called a **link function**.\n2. A standard linear model assumes a continuous, normally distributed outcome, whereas **with GLM the outcome can be both continuous or integer**. Furthermore, the outcome does not have to be normally distributed. Indeed, **the outcome can follow a different kind of distribution**, such as binomial, Poisson, exponential etc.\n:::\n\nWe'll introduce each of these elements below, then illustrate how they are used in practice, using different types of data.\n\nThe link function and different distributions are closely...err, _linked_. To make sense of what the link function is doing it's useful to understand the different distributional assumptions. So we'll start with those.\n\n## Distributions\n\nIn the examples of a standard linear model we've seen that the residuals needed to be normally distributed. We've mainly used the Q-Q plot to assess this assumption of normality.\n\nBut what does \"normal\" actually mean? 
It assumes that the residuals are coming from a normal or Gaussian distribution. This distribution has a symmetrical bell-shape, where the centre is the mean, and half of the data are on either side of this.\n\nWe can see this in @fig-normdist. The mean of the normal distribution is indicated with the dashed blue line.\n\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n::: {.cell-output-display}\n![Normal distribution](glm-intro-glm_files/figure-html/fig-normdist-1.png){#fig-normdist width=672}\n:::\n:::\n\n\nWe can use the linear model we created previously, where we looked at the possible linear relationship between beak depth and beak length. This is based on measurements of _G. fortis_ beaks in 1975.\n\nThe individual values of the residuals from this linear model are shown in @fig-normdist, panel B (in red), with the corresponding theoretical normal distribution in the background. We can see that the residuals follow this distribution reasonably well, which matches our conclusions from the Q-Q plot (see @fig-fortis1975_lm_dgplots).\n\nAll this means is that assuming that these residuals may come from a normal distribution isn't such a daft suggestion after all.\n\nNow look at the example in @fig-beak_shape. This shows the classification of beak shape for a number of finches. Their beaks are either classed as `blunt` or `pointed`. Various (continuous) measurements were taken from each bird, with the beak length shown here.\n\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n::: {.cell-output-display}\n![Classification in beak shape](glm-intro-glm_files/figure-html/fig-beak_shape-1.png){#fig-beak_shape width=672}\n:::\n:::\n\n\nWe'll look into this example in more detail later. For now it's important to note that the response variable (the beak shape classification) is not continuous. Here it is a binary response (`blunt` or `pointed`). 
As a result, the assumptions for a regular linear model go out of the window. If we were foolish enough to fit a linear model to these data (see blue line in A), then the residuals would look rather non-normal (@fig-beak_shape B).\n\nSo what do we do? Well, the normal distribution is not the only one there is. In @fig-dist_examples there are a few examples of distributions (including the normal one).\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Different distributions](glm-intro-glm_files/figure-html/fig-dist_examples-1.png){#fig-dist_examples width=672}\n:::\n:::\n\n\nDifferent distributions are useful for different types of data. For example, a logistic distribution is particularly useful in the context of binary or proportional response data. The Poisson distribution is useful when we have count data as a response.\n\nIn order to understand how this can help us, we need to be aware of two more concepts: **linear predictors** and **link functions**.\n\n## Linear predictors\n\nThe nice thing about linear models is that the predictors are, well, linear. Straight lines make for easy interpretation of any potential relationship between predictor and response.\n\nAs mentioned before, predictors are in the form of a _linear combination_, where each predictor variable is multiplied by a coefficient and all the terms are added together:\n\n$$\n\\beta_0 + \\beta_1X_1 + \\beta_2X_2 + \\beta_3X_3 + ...\n$$\n\nFortunately, this is no different for generalised linear models! We still have a linear combination but, as we'll see, if the relationship is not linear then we need an additional step before we can model the data in this way.\n\nAt this point, we have two options at our disposal (well, there are more, but let's not muddy the waters too much).\n\n:::{.callout-important}\n\n1. Transform our data and use a normal linear model on the transformed data\n2. 
Transform the linear predictor\n:::\n\nThe first option, to **transform our data**, seems like a useful option and can work. It keeps things familiar (we'd still use a standard linear model) and so all is well with the world. Up to the point of interpreting the data. If we, for example, log-transform our data, how do we interpret this? After all, the predictions of the linear model are directly related to the outcome or response variable. Transforming the data is usually done so that the residuals of the linear model resemble a more normal distribution. An unwanted side-effect of this is that this also changes the ratio scale properties of the measured variables [@stevens1946].\n\nThe second option would be to **transform the linear predictor**. This enables us to map a *non-linear* outcome (or response variable) to a *linear* model. This transformation is done using a **link function**.\n\n## Link functions\n\nSimply put: link functions connect the predictors in our model to the response variables in a linear way. \n\nHowever, and similar to the standard linear model, there are two parts to each model:\n\n1. the coefficients for each predictor (linking each parameter to a predictor)\n2. the error or random component (which specifies a probability distribution)\n\nWhich link function you use depends on your analysis. 
Some common link functions and corresponding distributions are (adapted from [@glen2021]):\n\n| distribution | data type | link name |\n|--------------|---------------------|-----------|\n| binomial | binary / proportion | logit |\n| normal | any real number | identity |\n| poisson | count data | log |\n\n\nLet's again look at the earlier example of beak shape.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Beak shape classification](glm-intro-glm_files/figure-html/fig-beak_shapeclass-1.png){#fig-beak_shapeclass width=672}\n:::\n:::\n\n\nWe've seen the data in @fig-beak_shapeclass A before, where we had information on what beak shape our observed finches had, plotted against their beak length.\n\nLet's say we now want to make some **predictions** about what beak shape we would expect, *given* a certain beak length. In this scenario we'd need some way of modelling the response variable (beak shape; `blunt` or `pointed`) as a function of the predictor variable (beak length).\n\nThe issue we have is that the response variable is not continuous, but binary! We could fit a standard linear model to these data (blue line in @fig-beak_shape A) but this is really bad practice. Why? Well, what such a linear model represents is *the probability* - or how likely it is - that an observed finch has a pointed beak, given a certain beak length (@fig-beak_shapeclass B).\n\nSimply fitting a linear line through those data suggests that it is possible to have a higher than 1 and lower-than-zero probability that a beak would be pointed! That, of course, makes no sense. So, we can't describe these data as a linear relationship.\n\nInstead, we'll use a *logistic model* to analyse these data. 
We'll cover the practicalities of how to do this in more detail in a later chapter, but for now it's sufficient to realise that one of the ways we could model these data could look like this:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Logistic model for beak classification](glm-intro-glm_files/figure-html/fig-beak_shape_glm-1.png){#fig-beak_shape_glm width=672}\n:::\n:::\n\n\nUsing this sigmoidal curve ensures that our predicted probabilities do not exceed the $[0, 1]$ range.\n\nNow, what happened behind the scenes is that the generalised linear model has taken the linear predictor and transformed it using the *logit* link function. This links the non-linear response variable (beak shape) to a linear model, using beak length as a predictor.\n\nWe'll practice how to perform this analysis in the next section.\n\n## Key points\n\n::: {.callout-note}\n- GLMs allow us to map a non-linear outcome to a linear model\n- The link function determines *how* this occurs, transforming the linear predictor\n:::\n", "supporting": [ "glm-intro-glm_files" ], diff --git a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class-1.png b/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class-1.png deleted file mode 100644 index 20e2059..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class_glm-1.png b/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class_glm-1.png deleted file mode 100644 index a0ab558..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class_glm-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class_glm_fitted-1.png b/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class_glm_fitted-1.png deleted file mode 100644 index eeb2970..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class_glm_fitted-1.png and /dev/null differ diff --git 
a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class_glm_probs-1.png b/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class_glm_probs-1.png deleted file mode 100644 index ef5c155..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class_glm_probs-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class_residuals-1.png b/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class_residuals-1.png deleted file mode 100644 index de0ce85..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_class_residuals-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_classification-1.png b/_freeze/materials/glm-intro-glm/figure-html/fig-beak_classification-1.png deleted file mode 100644 index b03d8c0..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_classification-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_classification_nospecies-1.png b/_freeze/materials/glm-intro-glm/figure-html/fig-beak_classification_nospecies-1.png deleted file mode 100644 index 8c26902..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_classification_nospecies-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_nospecies_lm-1.png b/_freeze/materials/glm-intro-glm/figure-html/fig-beak_nospecies_lm-1.png deleted file mode 100644 index 441f1c4..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_nospecies_lm-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_residuals-1.png b/_freeze/materials/glm-intro-glm/figure-html/fig-beak_residuals-1.png deleted file mode 100644 index f5c6d87..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/fig-beak_residuals-1.png and /dev/null differ diff --git 
a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-16-1.png b/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-16-1.png deleted file mode 100644 index e4d6ec5..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-16-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-17-1.png b/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-17-1.png deleted file mode 100644 index 5d3810f..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-17-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-18-1.png b/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-18-1.png deleted file mode 100644 index 43eb304..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-18-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-19-1.png b/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-19-1.png deleted file mode 100644 index 43eb304..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-19-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-19-2.png b/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-19-2.png deleted file mode 100644 index c578ef1..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-19-2.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-20-1.png b/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-20-1.png deleted file mode 100644 index acf091e..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-20-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-20-2.png b/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-20-2.png deleted file mode 100644 index c578ef1..0000000 Binary files 
a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-20-2.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-21-1.png b/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-21-1.png deleted file mode 100644 index fa85561..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-21-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-22-1.png b/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-22-1.png deleted file mode 100644 index fa85561..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-22-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-23-1.png b/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-23-1.png deleted file mode 100644 index be54d74..0000000 Binary files a/_freeze/materials/glm-intro-glm/figure-html/unnamed-chunk-23-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-lm/execute-results/html.json b/_freeze/materials/glm-intro-lm/execute-results/html.json index 33f1dcd..2748d92 100644 --- a/_freeze/materials/glm-intro-lm/execute-results/html.json +++ b/_freeze/materials/glm-intro-lm/execute-results/html.json @@ -1,7 +1,8 @@ { - "hash": "e83986712323c48b5fc6d0d4e78cf9d2", + "hash": "aa21ab58e0aba387b606c6d912dda11a", "result": { - "markdown": "---\ntitle: \"Linear models\"\n---\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n\n## Data\n\nFor this example, we'll be using the several data sets about Darwin's finches. They are part of a long-term genetic and phenotypic study on the evolution of several species of finches. 
The exact details are less important for now, but there are data on multiple species where several phenotypic characteristics were measured (see @fig-finchphenotypes).\n\n![Finches phenotypes (courtesy of [HHMI BioInteractive](https://www.biointeractive.org))](images/finches-phenotypes.png){width=75% #fig-finchphenotypes}\n\n\n::: {.cell}\n\n:::\n\n\n## Exploring data\n\nIt's always a good idea to explore your data visually. Here we are focussing on the (potential) relationship between beak length (`blength`) and beak depth (`bdepth`).\n\nOur data contains measurements from two years (`year`) and two species (`species`). If we plot beak depth against beak length, colour our data by species and look across the two time points (1975 and 2012), we get the following graph:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Beak depth and length for _G. fortis_ and _G. scandens_](glm-intro-lm_files/figure-html/fig-finches_1975v2012-1.png){#fig-finches_1975v2012 width=672}\n:::\n:::\n\n\nIt seems that there is a potential linear relationship between beak depth and beak length. There are some differences between the two species and two time points with, what seems, more spread in the data in 2012. The data for both species also seem to be less separated than in 1975.\n\nFor the current purpose, we'll focus on one group of data: those of _Geospiza fortis_ in 1975.\n\n\n::: {.cell}\n\n:::\n\n\n## Linear model\n\nLet's look at the _G. fortis_ data more closely, assuming that the have a linear relationship. We can visualise that as follows:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Beak depth vs beak length _G. 
fortis_ (1975)](glm-intro-lm_files/figure-html/fig-lm_fortis_1975-1.png){#fig-lm_fortis_1975 width=672}\n:::\n:::\n\n\nIf you recall from the [Core statistics linear regression](https://cambiotraining.github.io/corestats/materials/cs3_practical_linear-regression.html) session, what we're doing here is assuming that there is a linear relationship between the response variable (in this case `bdepth`) and predictor variable (here, `blength`).\n\nWe can get more information on this linear relationship by defining a linear model, which has the form of:\n\n$$\nY = \\beta_0 + \\beta_1X\n$$\n\nwhere $Y$ is the response variable (the thing we're interested in), $X$ the predictor variable and $\\beta_0$ and $\\beta_1$ are model coefficients. \nMore explicitly for our data, we get:\n\n$$\nbeak\\ depth = \\beta_0 + \\beta_1 \\times beak\\ length\n$$\n\n\n::: {.cell}\n\n:::\n\n\nBut how do we find this model? The computer uses a method called **least-squares regression**. There are several steps involved in this.\n\n### Line of best fit\n\nThe computer tries to find the **line of best fit**. This is a linear line that best describes your data. We could draw a linear line through our cloud of data points in many ways, but the least-squares method converges to a single solution, where the **sum of squared residual deviations** is at its smallest.\n\nTo understand this a bit better, it's helpful to realise that each data point consists of a fitted value (the beak depth predicted by the model at a given beak length), combined with the error. The error is the difference between the fitted value and the data point.\n\nLet's look at this for one of the observations, for example finch 473:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Beak depth vs beak length (finch 473, 1975)](glm-intro-lm_files/figure-html/fig-finch473-1.png){#fig-finch473 width=672}\n:::\n:::\n\n\nObtaining the fitted value and error happens for each data point. 
All these residuals are then squared (to ensure that they are positive), and added together. This is the so-called sum-of-squares.\n\nYou can imagine that if you draw a line through the data that doesn't fit the data all that well, the error associated with each data point increases. The sum-of-squares then also increases. Equally, the closer the data are to the line, the smaller the error. This results in a smaller sum-of-squares.\n\nThe linear line where the sum-of-squares is at its smallest, is called the **line of best fit**. This line acts as a model for our data.\n\nFor finch 473 we have the following values:\n\n* the observed beak depth is 9.5 mm\n* the observed beak length is 10.5 mm\n* the fitted value is 9.11 mm\n* the error is 0.39 mm\n\n### Linear regression\n\nOnce we have the line of best fit, we can perform a **linear regression**. What we're doing with the regression, is asking:\n\n> Is the line of best fit a better predictor of our data than a horizontal line across the average value?\n\nVisually, that looks like this:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Regression: is the slope different from zero?](glm-intro-lm_files/figure-html/fig-lm_regression-1.png){#fig-lm_regression width=672}\n:::\n:::\n\n\nWhat we're actually testing is whether the _slope_ ($\\beta_1$) of the line of best fit is any different from zero.\n\nTo find the answer, we perform an ANOVA. This gives us a p-value of 1.68e-78.\n\nNeedless to say, this p-value is extremely small, and definitely smaller than any common significance threshold, such as $p < 0.05$. This suggests that beak length is a statistically significant predictor of beak depth.\n\nIn this case the model has an **intercept** ($\\beta_0$) of -0.34 and a **slope** ($\\beta_1$) of 0.9. We can use this to write a simple linear equation, describing our data. 
Remember that this takes the form of:\n\n$$\nY = \\beta_0 + \\beta_1X\n$$\n\nwhich in our case is\n\n$$\nbeak\\ depth = \\beta_0 + \\beta_1 \\times beak\\ length\n$$\n\nand gives us\n\n$$\nbeak\\ depth = -0.34 + 0.90 \\times beak\\ length\n$$\n\n### Assumptions\n\nIn example above we just got on with things once we suspected that there was a linear relationship between beak depth and beak length. However, for the linear regression to be valid, several assumptions need to be met. If any of those assumptions are violated, we can't trust the results. The following four assumptions need to be met, with a 5th point being a case of good scientific practice:\n\n1. Data should be linear\n2. Residuals are normally distributed\n3. Equality of variance\n4. The residuals are independent\n5. (no influential points)\n\nAs we did many times during the [Core statistics](https://cambiotraining.github.io/corestats/) sessions, we mainly rely on diagnostic plots to check these assumptions. For this particular model they look as follows:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Diagnostic plots for _G. fortis_ (1975) model](glm-intro-lm_files/figure-html/fig-fortis1975_lm_dgplots-1.png){#fig-fortis1975_lm_dgplots width=672}\n:::\n:::\n\n\nThese plots look very good to me. For a recap on how to interpret these plots, see [CS2: ANOVA](https://cambiotraining.github.io/corestats/materials/cs2_practical_anova.html).\n\nTaken together, we can see the relationship between beak depth and beak length as a linear one, described by a (linear) model that has a predicted value for each data point, and an associated error.", + "engine": "knitr", + "markdown": "---\ntitle: \"Linear models\"\n---\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n\n## Data\n\nFor this example, we'll be using the several data sets about Darwin's finches. They are part of a long-term genetic and phenotypic study on the evolution of several species of finches. 
The exact details are less important for now, but there are data on multiple species where several phenotypic characteristics were measured (see @fig-finchphenotypes).\n\n![Finches phenotypes (courtesy of [HHMI BioInteractive](https://www.biointeractive.org))](img/finches-phenotypes.png){width=75% #fig-finchphenotypes}\n\n\n::: {.cell}\n\n:::\n\n\n## Exploring data\n\nIt's always a good idea to explore your data visually. Here we are focussing on the (potential) relationship between beak length (`blength`) and beak depth (`bdepth`).\n\nOur data contains measurements from two years (`year`) and two species (`species`). If we plot beak depth against beak length, colour our data by species and look across the two time points (1975 and 2012), we get the following graph:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Beak depth and length for _G. fortis_ and _G. scandens_](glm-intro-lm_files/figure-html/fig-finches_1975v2012-1.png){#fig-finches_1975v2012 width=672}\n:::\n:::\n\n\nIt seems that there is a potential linear relationship between beak depth and beak length. There are some differences between the two species and two time points with, what seems, more spread in the data in 2012. The data for both species also seem to be less separated than in 1975.\n\nFor the current purpose, we'll focus on one group of data: those of _Geospiza fortis_ in 1975.\n\n\n::: {.cell}\n\n:::\n\n\n## Linear model\n\nLet's look at the _G. fortis_ data more closely, assuming that the have a linear relationship. We can visualise that as follows:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Beak depth vs beak length _G. 
fortis_ (1975)](glm-intro-lm_files/figure-html/fig-lm_fortis_1975-1.png){#fig-lm_fortis_1975 width=672}\n:::\n:::\n\n\nIf you recall from the [Core statistics linear regression](https://cambiotraining.github.io/corestats/materials/cs3_practical_linear-regression.html) session, what we're doing here is assuming that there is a linear relationship between the response variable (in this case `bdepth`) and predictor variable (here, `blength`).\n\nWe can get more information on this linear relationship by defining a linear model, which has the form of:\n\n$$\nY = \\beta_0 + \\beta_1X\n$$\n\nwhere $Y$ is the response variable (the thing we're interested in), $X$ the predictor variable and $\\beta_0$ and $\\beta_1$ are model coefficients. \nMore explicitly for our data, we get:\n\n$$\nbeak\\ depth = \\beta_0 + \\beta_1 \\times beak\\ length\n$$\n\n\n::: {.cell}\n\n:::\n\n\nBut how do we find this model? The computer uses a method called **least-squares regression**. There are several steps involved in this.\n\n### Line of best fit\n\nThe computer tries to find the **line of best fit**. This is a linear line that best describes your data. We could draw a linear line through our cloud of data points in many ways, but the least-squares method converges to a single solution, where the **sum of squared residual deviations** is at its smallest.\n\nTo understand this a bit better, it's helpful to realise that each data point consists of a fitted value (the beak depth predicted by the model at a given beak length), combined with the error. The error is the difference between the fitted value and the data point.\n\nLet's look at this for one of the observations, for example finch 473:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Beak depth vs beak length (finch 473, 1975)](glm-intro-lm_files/figure-html/fig-finch473-1.png){#fig-finch473 width=672}\n:::\n:::\n\n\nObtaining the fitted value and error happens for each data point. 
All these residuals are then squared (to ensure that they are positive), and added together. This is the so-called sum-of-squares.\n\nYou can imagine that if you draw a line through the data that doesn't fit the data all that well, the error associated with each data point increases. The sum-of-squares then also increases. Equally, the closer the data are to the line, the smaller the error. This results in a smaller sum-of-squares.\n\nThe linear line where the sum-of-squares is at its smallest, is called the **line of best fit**. This line acts as a model for our data.\n\nFor finch 473 we have the following values:\n\n* the observed beak depth is 9.5 mm\n* the observed beak length is 10.5 mm\n* the fitted value is 9.11 mm\n* the error is 0.39 mm\n\n### Linear regression\n\nOnce we have the line of best fit, we can perform a **linear regression**. What we're doing with the regression, is asking:\n\n> Is the line of best fit a better predictor of our data than a horizontal line across the average value?\n\nVisually, that looks like this:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Regression: is the slope different from zero?](glm-intro-lm_files/figure-html/fig-lm_regression-1.png){#fig-lm_regression width=672}\n:::\n:::\n\n\nWhat we're actually testing is whether the _slope_ ($\\beta_1$) of the line of best fit is any different from zero.\n\nTo find the answer, we perform an ANOVA. This gives us a p-value of 1.68e-78.\n\nNeedless to say, this p-value is extremely small, and definitely smaller than any common significance threshold, such as $p < 0.05$. This suggests that beak length is a statistically significant predictor of beak depth.\n\nIn this case the model has an **intercept** ($\\beta_0$) of -0.34 and a **slope** ($\\beta_1$) of 0.9. We can use this to write a simple linear equation, describing our data. 
Remember that this takes the form of:\n\n$$\nY = \\beta_0 + \\beta_1X\n$$\n\nwhich in our case is\n\n$$\nbeak\\ depth = \\beta_0 + \\beta_1 \\times beak\\ length\n$$\n\nand gives us\n\n$$\nbeak\\ depth = -0.34 + 0.90 \\times beak\\ length\n$$\n\n### Assumptions\n\nIn example above we just got on with things once we suspected that there was a linear relationship between beak depth and beak length. However, for the linear regression to be valid, several assumptions need to be met. If any of those assumptions are violated, we can't trust the results. The following four assumptions need to be met, with a 5th point being a case of good scientific practice:\n\n1. Data should be linear\n2. Residuals are normally distributed\n3. Equality of variance\n4. The residuals are independent\n5. (no influential points)\n\nAs we did many times during the [Core statistics](https://cambiotraining.github.io/corestats/) sessions, we mainly rely on diagnostic plots to check these assumptions. For this particular model they look as follows:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Diagnostic plots for _G. fortis_ (1975) model](glm-intro-lm_files/figure-html/fig-fortis1975_lm_dgplots-1.png){#fig-fortis1975_lm_dgplots width=672}\n:::\n:::\n\n\nThese plots look very good to me. 
For a recap on how to interpret these plots, see [CS2: ANOVA](https://cambiotraining.github.io/corestats/materials/cs2_practical_anova.html).\n\nTaken together, we can see the relationship between beak depth and beak length as a linear one, described by a (linear) model that has a predicted value for each data point, and an associated error.\n", "supporting": [ "glm-intro-lm_files" ], diff --git a/_freeze/materials/glm-intro-lm/figure-html/fig-lm_fortis_1975-1.png b/_freeze/materials/glm-intro-lm/figure-html/fig-lm_fortis_1975-1.png index 7fa5421..c3a927b 100644 Binary files a/_freeze/materials/glm-intro-lm/figure-html/fig-lm_fortis_1975-1.png and b/_freeze/materials/glm-intro-lm/figure-html/fig-lm_fortis_1975-1.png differ diff --git a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-10-1.png b/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-10-1.png deleted file mode 100644 index 36f368a..0000000 Binary files a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-10-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-11-1.png b/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-11-1.png deleted file mode 100644 index 36f368a..0000000 Binary files a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-11-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-13-1.png b/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-13-1.png deleted file mode 100644 index 36f368a..0000000 Binary files a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-13-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-15-1.png b/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-15-1.png deleted file mode 100644 index 36f368a..0000000 Binary files a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-15-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-4-1.png 
b/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-4-1.png deleted file mode 100644 index 5629054..0000000 Binary files a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-4-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-6-1.png b/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-6-1.png deleted file mode 100644 index 77af8b8..0000000 Binary files a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-6-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-7-1.png b/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-7-1.png deleted file mode 100644 index 8d97b61..0000000 Binary files a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-7-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-8-1.png b/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-8-1.png deleted file mode 100644 index 8d97b61..0000000 Binary files a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-8-1.png and /dev/null differ diff --git a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-9-1.png b/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-9-1.png deleted file mode 100644 index 2ac37a4..0000000 Binary files a/_freeze/materials/glm-intro-lm/figure-html/unnamed-chunk-9-1.png and /dev/null differ diff --git a/_freeze/materials/glm-practical-logistic-binary/execute-results/html.json b/_freeze/materials/glm-practical-logistic-binary/execute-results/html.json index af58c31..be4c132 100644 --- a/_freeze/materials/glm-practical-logistic-binary/execute-results/html.json +++ b/_freeze/materials/glm-practical-logistic-binary/execute-results/html.json @@ -1,7 +1,8 @@ { - "hash": "23304418a32ab6dbb7205b13a04ecfe2", + "hash": "34c6f67a8ac54c2d9cb418c133d33a2b", "result": { - "markdown": "---\ntitle: \"Binary response\"\n---\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n\n::: {.callout-tip}\n## Learning 
outcomes\n\n**Questions**\n\n- How do we analyse data with a binary outcome?\n- Can we test if our model is any good?\n- Be able to perform a logistic regression with a binary outcome\n- Predict outcomes of new data, based on a defined model\n\n**Objectives**\n\n- Be able to analyse binary outcome data\n- Understand different methods of testing model fit\n- Be able to make model predictions\n:::\n\n## Libraries and functions\n\n::: {.callout-note collapse=\"true\"}\n## Click to expand\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n### Libraries\n### Functions\n\n## Python\n\n### Libraries\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# A maths library\nimport math\n# A Python data analysis and manipulation tool\nimport pandas as pd\n\n# Python equivalent of `ggplot2`\nfrom plotnine import *\n\n# Statistical models, conducting tests and statistical data exploration\nimport statsmodels.api as sm\n\n# Convenience interface for specifying models using formula strings and DataFrames\nimport statsmodels.formula.api as smf\n```\n:::\n\n\n### Functions\n:::\n:::\n\nThe example in this section uses the following data set:\n\n`data/finches_early.csv`\n\nThese data come from an analysis of gene flow across two finch species [@lamichhaney2020]. They are slightly adapted here for illustrative purposes.\n\nThe data focus on two species, _Geospiza fortis_ and _G. scandens_. The original measurements are split by a uniquely timed event: a particularly strong El Niño event in 1983. This event changed the vegetation and food supply of the finches, allowing F1 hybrids of the two species to survive, whereas before 1983 they could not. The measurements are classed as `early` (pre-1983) and `late` (1983 onwards).\n\nHere we are looking only at the `early` data. 
We are specifically focussing on the beak shape classification, which we saw earlier in @fig-beak_shape_glm.\n\n## Load and visualise the data\n\nFirst we load the data, then we visualise it.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nearly_finches <- read_csv(\"data/finches_early.csv\")\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nearly_finches_py = pd.read_csv(\"data/finches_early.csv\")\n```\n:::\n\n\n:::\n\nLooking at the data, we can see that the `pointed_beak` column contains zeros and ones. These are actually yes/no classification outcomes and not numeric representations.\n\nWe'll have to deal with this soon. For now, we can plot the data:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggplot(early_finches,\n aes(x = factor(pointed_beak),\n y = blength)) +\n geom_boxplot()\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-6-1.png){width=672}\n:::\n:::\n\n\n## Python\n\nWe could just give Python the `pointed_beak` data directly, but then it would view the values as numeric. Which doesn't really work, because we have two groups as such: those with a pointed beak (`1`), and those with a blunt one (`0`).\n\nWe can force Python to temporarily covert the data to a factor, by making the `pointed_beak` column an `object` type. 
We can do this directly inside the `ggplot()` function.\n\n\n::: {.cell}\n\n```{.python .cell-code}\n(ggplot(early_finches_py,\n aes(x = early_finches_py.pointed_beak.astype(object),\n y = \"blength\")) +\n geom_boxplot())\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-7-1.png){width=614}\n:::\n:::\n\n:::\n\nIt looks as though the finches with blunt beaks generally have shorter beak lengths.\n\nWe can visualise that differently by plotting all the data points as a classic binary response plot:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggplot(early_finches,\n aes(x = blength, y = pointed_beak)) +\n geom_point()\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-8-3.png){width=672}\n:::\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\n(ggplot(early_finches_py,\n aes(x = \"blength\",\n y = \"pointed_beak\")) +\n geom_point())\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-9-1.png){width=614}\n:::\n:::\n\n\n:::\n\nThis presents us with a bit of an issue. 
We could fit a linear regression model to these data, although we already know that this is a bad idea...\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggplot(early_finches,\n aes(x = blength, y = pointed_beak)) +\n geom_point() +\n geom_smooth(method = \"lm\", se = FALSE)\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-10-3.png){width=672}\n:::\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\n(ggplot(early_finches_py,\n aes(x = \"blength\",\n y = \"pointed_beak\")) +\n geom_point() +\n geom_smooth(method = \"lm\",\n colour = \"blue\",\n se = False))\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-11-1.png){width=614}\n:::\n:::\n\n\n:::\n\nOf course this is rubbish - we can't have a beak classification outside the range of $[0, 1]$. It's either blunt (`0`) or pointed (`1`).\n\nBut for the sake of exploration, let's look at the assumptions:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlm_bks <- lm(pointed_beak ~ blength,\n data = early_finches)\n\nresid_panel(lm_bks,\n plots = c(\"resid\", \"qq\", \"ls\", \"cookd\"),\n smoother = TRUE)\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-12-3.png){width=672}\n:::\n:::\n\n\n## Python\n\nFirst, we create a linear model:\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# create a linear model\nmodel = smf.ols(formula = \"pointed_beak ~ blength\",\n data = early_finches_py)\n# and get the fitted parameters of the model\nlm_bks_py = model.fit()\n```\n:::\n\n\nNext, we can create the diagnostic plots:\n\n::: {.cell}\n\n```{.python .cell-code}\ndgplots(lm_bks_py)\n```\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n::: {.cell-output-display}\n![](images/dgplots/2023_08_07-02-26-41_PM_dgplots.png){width=805}\n:::\n:::\n\n\n:::\n\nThey're ~~pretty~~ extremely bad.\n\n- The 
response is not linear (Residual Plot, binary response plot, common sense).\n- The residuals do not appear to be distributed normally (Q-Q Plot)\n- The variance is not homogeneous across the predicted values (Location-Scale Plot)\n- But - there is always a silver lining - we don't have influential data points.\n\n## Creating a suitable model\n\nSo far we've established that using a simple linear model to describe a potential relationship between beak length and the probability of having a pointed beak is not a good idea. So, what _can_ we do?\n\nOne of the ways we can deal with binary outcome data is by performing a logistic regression. Instead of fitting a straight line to our data, and performing a regression on that, we fit a line that has an S shape. This avoids the model making predictions outside the $[0, 1]$ range.\n\nWe described our standard linear relationship as follows:\n\n$Y = \\beta_0 + \\beta_1X$\n\nWe can now map this to our non-linear relationship via the **logistic link function**:\n\n$Y = \\frac{\\exp(X)}{1 + \\exp(\\beta_0 + \\beta_1X)}$\n\nNote that the $\\beta_0 + \\beta_1X$ part is identical to the formula of a straight line.\n\nThe rest of the function is what makes the straight line curve into its characteristic S shape. \n\n:::{.callout-note collapse=true}\n## Euler's number ($\\exp$): would you like to know more?\n\nIn mathematics, $\\rm e$ represents a constant of around 2.718. Another notation is $\\exp$, which is often used when notations become a bit cumbersome. Here, I exclusively use the $\\exp$ notation for consistency.\n:::\n\n::: {.callout-important}\n## The logistic function\n\nThe shape of the logistic function is hugely influenced by the different parameters, in particular $\\beta_1$. 
The plots below show different situations, where $\\beta_0 = 0$ in all cases, but $\\beta_1$ varies.\n\nThe first plot shows the logistic function in its simplest form, with the others showing the effect of varying $\\beta_1$.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-17-1.png){width=672}\n:::\n:::\n\n\n* when $\\beta_1 = 1$, this gives the simplest logistic function\n* when $\\beta_1 = 0$ gives a horizontal line, with $Y = \\frac{\\exp(\\beta_0)}{1+\\exp(\\beta_0)}$\n* when $\\beta_1$ is negative flips the curve around, so it slopes down\n* when $\\beta_1$ is very large then the curve becomes extremely steep\n\n:::\n\nWe can fit such an S-shaped curve to our `early_finches` data set, by creating a generalised linear model.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nIn R we have a few options to do this, and by far the most familiar function would be `glm()`. Here we save the model in an object called `glm_bks`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nglm_bks <- glm(pointed_beak ~ blength,\n family = binomial,\n data = early_finches)\n```\n:::\n\n\nThe format of this function is similar to that used by the `lm()` function for linear models. The important difference is that we must specify the _family_ of error distribution to use. For logistic regression we must set the family to **binomial**.\n\nIf you forget to set the `family` argument, then the `glm()` function will perform a standard linear model fit, identical to what the `lm()` function would do.\n\n## Python\n\nIn Python we have a few options to do this, and by far the most familiar function would be `glm()`. 
Here we save the model in an object called `glm_bks_py`:\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# create a linear model\nmodel = smf.glm(formula = \"pointed_beak ~ blength\",\n family = sm.families.Binomial(),\n data = early_finches_py)\n# and get the fitted parameters of the model\nglm_bks_py = model.fit()\n```\n:::\n\n\nThe format of this function is similar to that used by the `ols()` function for linear models. The important difference is that we must specify the _family_ of error distribution to use. For logistic regression we must set the family to **binomial**. This is buried deep inside the `statsmodels` package and needs to be defined as `sm.families.Binomial()`.\n\n:::\n\n## Model output\n\nThat's the easy part done! The trickier part is interpreting the output. First of all, we'll get some summary information.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nsummary(glm_bks)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n\nCall:\nglm(formula = pointed_beak ~ blength, family = binomial, data = early_finches)\n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|) \n(Intercept) -43.410 15.250 -2.847 0.00442 **\nblength 3.387 1.193 2.839 0.00452 **\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 84.5476 on 60 degrees of freedom\nResidual deviance: 9.1879 on 59 degrees of freedom\nAIC: 13.188\n\nNumber of Fisher Scoring iterations: 8\n```\n:::\n:::\n\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nprint(glm_bks_py.summary())\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n Generalized Linear Model Regression Results \n==============================================================================\nDep. Variable: pointed_beak No. 
Observations: 61\nModel: GLM Df Residuals: 59\nModel Family: Binomial Df Model: 1\nLink Function: Logit Scale: 1.0000\nMethod: IRLS Log-Likelihood: -4.5939\nDate: Mon, 07 Aug 2023 Deviance: 9.1879\nTime: 14:26:42 Pearson chi2: 15.1\nNo. Iterations: 8 Pseudo R-squ. (CS): 0.7093\nCovariance Type: nonrobust \n==============================================================================\n coef std err z P>|z| [0.025 0.975]\n------------------------------------------------------------------------------\nIntercept -43.4096 15.250 -2.847 0.004 -73.298 -13.521\nblength 3.3866 1.193 2.839 0.005 1.049 5.724\n==============================================================================\n```\n:::\n:::\n\n\n:::\n\nThere’s a lot to unpack here, but for the purpose of today we are mostly focussing on the coefficients.\n\n::: {.panel-tabset group=\"language\"}\n## R\nThe coefficients can be found in the `Coefficients` block. The main numbers to extract from the output are the two numbers underneath `Estimate.Std`:\n\n```\nCoefficients:\n Estimate Std.\n(Intercept) -43.410\nblength 3.387 \n```\n\n## Python\n\nRight at the bottom is a table showing the model coefficients. The main numbers to extract from the output are the two numbers in the `coef` column:\n\n```\n======================\n coef\n----------------------\nIntercept -43.4096\nblength 3.3866\n======================\n```\n\n:::\n\nThese are the coefficients of the logistic model equation and need to be placed in the correct equation if we want to be able to calculate the probability of having a pointed beak for a given beak length.\n\nThe $p$ values at the end of each coefficient row merely show whether that particular coefficient is significantly different from zero. This is similar to the $p$ values obtained in the summary output of a linear model. 
As with continuous predictors in simple models, these $p$ values can be used to decide whether that predictor is important (so in this case beak length appears to be significant). However, these $p$ values aren’t great to work with when we have multiple predictor variables, or when we have categorical predictors with multiple levels (since the output will give us a $p$ value for each level rather than for the predictor as a whole).\n\nWe can use the coefficients to calculate the probability of having a pointed beak for a given beak length:\n\n$$ P(pointed \\ beak) = \\frac{\\exp(-43.41 + 3.39 \\times blength)}{1 + \\exp(-43.41 + 3.39 \\times blength)} $$\n\nHaving this formula means that we can calculate the probability of having a pointed beak for any beak length. How do we work this out in practice? \n\n::: {.panel-tabset group=\"language\"}\n## R\n\nWell, the probability of having a pointed beak if the beak length is large (for example 15 mm) can be calculated as follows:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nexp(-43.41 + 3.39 * 15) / (1 + exp(-43.41 + 3.39 * 15))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 0.9994131\n```\n:::\n:::\n\n\nIf the beak length is small (for example 10 mm), the probability of having a pointed beak is extremely low:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nexp(-43.41 + 3.39 * 10) / (1 + exp(-43.41 + 3.39 * 10))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 7.410155e-05\n```\n:::\n:::\n\n\n## Python\nWell, the probability of having a pointed beak if the beak length is large (for example 15 mm) can be calculated as follows:\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# import the math library\nimport math\n```\n:::\n\n::: {.cell}\n\n```{.python .cell-code}\nmath.exp(-43.41 + 3.39 * 15) / (1 + math.exp(-43.41 + 3.39 * 15))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n0.9994130595039192\n```\n:::\n:::\n\n\nIf the beak length is small (for example 10 mm), the probability of having a pointed beak is 
extremely low:\n\n\n::: {.cell}\n\n```{.python .cell-code}\nmath.exp(-43.41 + 3.39 * 10) / (1 + math.exp(-43.41 + 3.39 * 10))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n7.410155028945912e-05\n```\n:::\n:::\n\n:::\n\nWe can calculate the the probabilities for all our observed values and if we do that then we can see that the larger the beak length is, the higher the probability that a beak shape would be pointed. I'm visualising this together with the logistic curve, where the blue points are the calculated probabilities:\n\n::: {.callout-note collapse=true}\n## Code available here\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nglm_bks %>% \n augment(type.predict = \"response\") %>% \n ggplot() +\n geom_point(aes(x = blength, y = pointed_beak)) +\n geom_line(aes(x = blength, y = .fitted),\n linetype = \"dashed\",\n colour = \"blue\") +\n geom_point(aes(x = blength, y = .fitted),\n colour = \"blue\", alpha = 0.5) +\n labs(x = \"beak length (mm)\",\n y = \"Probability\")\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\n(ggplot(early_finches_py) +\n geom_point(aes(x = \"blength\", y = \"pointed_beak\")) +\n geom_line(aes(x = \"blength\", y = glm_bks_py.fittedvalues),\n linetype = \"dashed\",\n colour = \"blue\") +\n geom_point(aes(x = \"blength\", y = glm_bks_py.fittedvalues),\n colour = \"blue\", alpha = 0.5) +\n labs(x = \"beak length (mm)\",\n y = \"Probability\"))\n```\n:::\n\n:::\n:::\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Predicted probabilities for beak classification](glm-practical-logistic-binary_files/figure-html/fig-beak_class_glm_probs-2.png){#fig-beak_class_glm_probs width=672}\n:::\n:::\n\n\nThe graph shows us that, based on the data that we have and the model we used to make predictions about our response variable, the probability of seeing a pointed beak increases with beak length.\n\nShort beaks are more closely associated with the bluntly shaped beaks, whereas long beaks 
are more closely associated with the pointed shape. It's also clear that there is a range of beak lengths (around 13 mm) where the probability of getting one shape or another is much more even.\n\nThere is an interesting genetic story behind all this, but that will be explained in more detail in the full-day [Generalised linear models](https://cambiotraining.github.io/stats-glm/) course.\n\n## Exercises\n\n### Diabetes {#sec-exr_diabetes}\n\n:::{.callout-exercise}\n\n\n{{< level 2 >}}\n\n\n\nFor this exercise we'll be using the data from `data/diabetes.csv`.\n\nThis is a data set comprising 768 observations of three variables (one dependent and two predictor variables). This records the results of a diabetes test result as a binary variable (1 is a positive result, 0 is a negative result), along with the result of a glucose tolerance test and the diastolic blood pressure for each of 768 women. The variables are called `test_result`, `glucose` and `diastolic`.\n\nWe want to see if the `glucose` tolerance is a meaningful predictor for predictions on a diabetes test. To investigate this, do the following:\n\n1. Load and visualise the data\n2. Create a suitable model\n3. Determine if there are any statistically significant predictors\n4. Calculate the probability of a positive diabetes test result for a glucose tolerance test value of `glucose = 150`\n\n::: {.callout-answer collapse=\"true\"}\n## Answer\n\n#### Load and visualise the data\n\nFirst we load the data, then we visualise it.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndiabetes <- read_csv(\"data/diabetes.csv\")\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\ndiabetes_py = pd.read_csv(\"data/diabetes.csv\")\n```\n:::\n\n\n:::\n\nLooking at the data, we can see that the `test_result` column contains zeros and ones. These are yes/no test result outcomes and not actually numeric representations.\n\nWe'll have to deal with this soon. 
For now, we can plot the data, by outcome:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggplot(diabetes,\n aes(x = factor(test_result),\n y = glucose)) +\n geom_boxplot()\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-32-1.png){width=672}\n:::\n:::\n\n\n## Python\n\nWe could just give Python the `test_result` data directly, but then it would view the values as numeric. Which doesn't really work, because we have two groups as such: those with a negative diabetes test result, and those with a positive one.\n\nWe can force Python to temporarily covert the data to a factor, by making the `test_result` column an `object` type. We can do this directly inside the `ggplot()` function.\n\n\n::: {.cell}\n\n```{.python .cell-code}\n(ggplot(diabetes_py,\n aes(x = diabetes_py.test_result.astype(object),\n y = \"glucose\")) +\n geom_boxplot())\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-33-1.png){width=614}\n:::\n:::\n\n:::\n\nIt looks as though the patients with a positive diabetes test have slightly higher glucose levels than those with a negative diabetes test.\n\nWe can visualise that differently by plotting all the data points as a classic binary response plot:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggplot(diabetes,\n aes(x = glucose,\n y = test_result)) +\n geom_point()\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-34-3.png){width=672}\n:::\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\n(ggplot(diabetes_py,\n aes(x = \"glucose\",\n y = \"test_result\")) +\n geom_point())\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-35-1.png){width=614}\n:::\n:::\n\n\n:::\n\n#### Create a suitable model\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nWe'll 
use the `glm()` function to create a generalised linear model. Here we save the model in an object called `glm_dia`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nglm_dia <- glm(test_result ~ glucose,\n family = binomial,\n data = diabetes)\n```\n:::\n\n\nThe format of this function is similar to that used by the `lm()` function for linear models. The important difference is that we must specify the _family_ of error distribution to use. For logistic regression we must set the family to **binomial**.\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# create a linear model\nmodel = smf.glm(formula = \"test_result ~ glucose\",\n family = sm.families.Binomial(),\n data = diabetes_py)\n# and get the fitted parameters of the model\nglm_dia_py = model.fit()\n```\n:::\n\n\n:::\n\nLet's look at the model parameters:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nsummary(glm_dia)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n\nCall:\nglm(formula = test_result ~ glucose, family = binomial, data = diabetes)\n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|) \n(Intercept) -5.611732 0.442289 -12.69 <2e-16 ***\nglucose 0.039510 0.003398 11.63 <2e-16 ***\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 936.6 on 727 degrees of freedom\nResidual deviance: 752.2 on 726 degrees of freedom\nAIC: 756.2\n\nNumber of Fisher Scoring iterations: 4\n```\n:::\n:::\n\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nprint(glm_dia_py.summary())\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n Generalized Linear Model Regression Results \n==============================================================================\nDep. Variable: test_result No. 
Observations: 728\nModel: GLM Df Residuals: 726\nModel Family: Binomial Df Model: 1\nLink Function: Logit Scale: 1.0000\nMethod: IRLS Log-Likelihood: -376.10\nDate: Mon, 07 Aug 2023 Deviance: 752.20\nTime: 14:26:45 Pearson chi2: 713.\nNo. Iterations: 4 Pseudo R-squ. (CS): 0.2238\nCovariance Type: nonrobust \n==============================================================================\n coef std err z P>|z| [0.025 0.975]\n------------------------------------------------------------------------------\nIntercept -5.6117 0.442 -12.688 0.000 -6.479 -4.745\nglucose 0.0395 0.003 11.628 0.000 0.033 0.046\n==============================================================================\n```\n:::\n:::\n\n:::\n\nWe can see that `glucose` is a significant predictor for the `test_result` (the $p$ value is much smaller than 0.05).\n\nKnowing this, we're interested in the coefficients. We have an intercept of `-5.61` and `0.0395` for `glucose`. We can use these coefficients to write a formula that describes the potential relationship between the probability of having a positive test result, dependent on the glucose tolerance level value:\n\n$$ P(positive \\ test\\ result) = \\frac{\\exp(-5.61 + 0.04 \\times glucose)}{1 + \\exp(-5.61 + 0.04 \\times glucose)} $$\n\n#### Calculating probabilities\n\nUsing the formula above, we can now calculate the probability of having a positive test result, for a given `glucose` value. 
If we do this for `glucose = 150`, we get the following:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nexp(-5.61 + 0.04 * 150) / (1 + exp(-5.61 + 0.04 * 150))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 0.5962827\n```\n:::\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nmath.exp(-5.61 + 0.04 * 150) / (1 + math.exp(-5.61 + 0.04 * 150))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n0.5962826992967878\n```\n:::\n:::\n\n:::\n\nThis tells us that the probability of having a positive diabetes test result, given a glucose tolerance level of 150 is around 60%.\n\n:::\n:::\n\n## Key points\n\n::: {.callout-note}\n- We use a logistic regression to model a binary response\n- We can feed new observations into the model and get probabilities for the outcome\n:::\n",
  "engine": "knitr",
  "markdown": "---\ntitle: \"Binary response\"\n---\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n:::\n\n\n::: {.callout-tip}\n## Learning outcomes\n\n**Questions**\n\n- How do we analyse data with a binary outcome?\n- Can we test if our model is any good?\n- Be able to perform a logistic regression with a binary outcome\n- Predict outcomes of new data, based on a defined model\n\n**Objectives**\n\n- Be able to analyse binary outcome data\n- Understand different methods of testing model fit\n- Be able to make model predictions\n:::\n\n## Libraries and functions\n\n::: {.callout-note collapse=\"true\"}\n## Click to expand\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n### Libraries\n### Functions\n\n## Python\n\n### Libraries\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# A maths library\nimport math\n# A Python data analysis and manipulation tool\nimport pandas as pd\n\n# Python equivalent of `ggplot2`\nfrom plotnine import *\n\n# Statistical models, conducting tests and statistical data exploration\nimport statsmodels.api as sm\n\n# Convenience interface for specifying models using formula strings and 
DataFrames\nimport statsmodels.formula.api as smf\n```\n:::\n\n\n### Functions\n:::\n:::\n\nThe example in this section uses the following data set:\n\n`data/finches_early.csv`\n\nThese data come from an analysis of gene flow across two finch species [@lamichhaney2020]. They are slightly adapted here for illustrative purposes.\n\nThe data focus on two species, _Geospiza fortis_ and _G. scandens_. The original measurements are split by a uniquely timed event: a particularly strong El Niño event in 1983. This event changed the vegetation and food supply of the finches, allowing F1 hybrids of the two species to survive, whereas before 1983 they could not. The measurements are classed as `early` (pre-1983) and `late` (1983 onwards).\n\nHere we are looking only at the `early` data. We are specifically focussing on the beak shape classification, which we saw earlier in @fig-beak_shape_glm.\n\n## Load and visualise the data\n\nFirst we load the data, then we visualise it.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nearly_finches <- read_csv(\"data/finches_early.csv\")\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nearly_finches_py = pd.read_csv(\"data/finches_early.csv\")\n```\n:::\n\n\n:::\n\nLooking at the data, we can see that the `pointed_beak` column contains zeros and ones. These are actually yes/no classification outcomes and not numeric representations.\n\nWe'll have to deal with this soon. For now, we can plot the data:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggplot(early_finches,\n aes(x = factor(pointed_beak),\n y = blength)) +\n geom_boxplot()\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-6-1.png){width=672}\n:::\n:::\n\n\n## Python\n\nWe could just give Python the `pointed_beak` data directly, but then it would view the values as numeric. 
Which doesn't really work, because we have two groups as such: those with a pointed beak (`1`), and those with a blunt one (`0`).\n\nWe can force Python to temporarily covert the data to a factor, by making the `pointed_beak` column an `object` type. We can do this directly inside the `ggplot()` function.\n\n\n::: {.cell}\n\n```{.python .cell-code}\n(ggplot(early_finches_py,\n aes(x = early_finches_py.pointed_beak.astype(object),\n y = \"blength\")) +\n geom_boxplot())\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-7-1.png){width=614}\n:::\n:::\n\n:::\n\nIt looks as though the finches with blunt beaks generally have shorter beak lengths.\n\nWe can visualise that differently by plotting all the data points as a classic binary response plot:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggplot(early_finches,\n aes(x = blength, y = pointed_beak)) +\n geom_point()\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-8-3.png){width=672}\n:::\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\n(ggplot(early_finches_py,\n aes(x = \"blength\",\n y = \"pointed_beak\")) +\n geom_point())\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-9-1.png){width=614}\n:::\n:::\n\n\n:::\n\nThis presents us with a bit of an issue. 
We could fit a linear regression model to these data, although we already know that this is a bad idea...\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggplot(early_finches,\n aes(x = blength, y = pointed_beak)) +\n geom_point() +\n geom_smooth(method = \"lm\", se = FALSE)\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-10-3.png){width=672}\n:::\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\n(ggplot(early_finches_py,\n aes(x = \"blength\",\n y = \"pointed_beak\")) +\n geom_point() +\n geom_smooth(method = \"lm\",\n colour = \"blue\",\n se = False))\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-11-1.png){width=614}\n:::\n:::\n\n\n:::\n\nOf course this is rubbish - we can't have a beak classification outside the range of $[0, 1]$. It's either blunt (`0`) or pointed (`1`).\n\nBut for the sake of exploration, let's look at the assumptions:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlm_bks <- lm(pointed_beak ~ blength,\n data = early_finches)\n\nresid_panel(lm_bks,\n plots = c(\"resid\", \"qq\", \"ls\", \"cookd\"),\n smoother = TRUE)\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-12-3.png){width=672}\n:::\n:::\n\n\n## Python\n\nFirst, we create a linear model:\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# create a linear model\nmodel = smf.ols(formula = \"pointed_beak ~ blength\",\n data = early_finches_py)\n# and get the fitted parameters of the model\nlm_bks_py = model.fit()\n```\n:::\n\n\nNext, we can create the diagnostic plots:\n\n::: {.cell}\n\n```{.python .cell-code}\ndgplots(lm_bks_py)\n```\n:::\n\n::: {.cell}\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-15-1.png){width=96}\n:::\n:::\n\n::: {.cell}\n::: 
{.cell-output-display}\n![](images/dgplots/2024_01_15-10-25-49_AM_dgplots.png){width=804}\n:::\n:::\n\n\n:::\n\nThey're ~~pretty~~ extremely bad.\n\n- The response is not linear (Residual Plot, binary response plot, common sense).\n- The residuals do not appear to be distributed normally (Q-Q Plot)\n- The variance is not homogeneous across the predicted values (Location-Scale Plot)\n- But - there is always a silver lining - we don't have influential data points.\n\n## Creating a suitable model\n\nSo far we've established that using a simple linear model to describe a potential relationship between beak length and the probability of having a pointed beak is not a good idea. So, what _can_ we do?\n\nOne of the ways we can deal with binary outcome data is by performing a logistic regression. Instead of fitting a straight line to our data, and performing a regression on that, we fit a line that has an S shape. This avoids the model making predictions outside the $[0, 1]$ range.\n\nWe described our standard linear relationship as follows:\n\n$Y = \\beta_0 + \\beta_1X$\n\nWe can now map this to our non-linear relationship via the **logistic link function**:\n\n$Y = \\frac{\\exp(\\beta_0 + \\beta_1X)}{1 + \\exp(\\beta_0 + \\beta_1X)}$\n\nNote that the $\\beta_0 + \\beta_1X$ part is identical to the formula of a straight line.\n\nThe rest of the function is what makes the straight line curve into its characteristic S shape. \n\n:::{.callout-note collapse=true}\n## Euler's number ($\\exp$): would you like to know more?\n\nIn mathematics, $\\rm e$ represents a constant of around 2.718. Another notation is $\\exp$, which is often used when notations become a bit cumbersome. Here, I exclusively use the $\\exp$ notation for consistency.\n:::\n\n::: {.callout-important}\n## The logistic function\n\nThe shape of the logistic function is hugely influenced by the different parameters, in particular $\\beta_1$. 
The plots below show different situations, where $\\beta_0 = 0$ in all cases, but $\\beta_1$ varies.\n\nThe first plot shows the logistic function in its simplest form, with the others showing the effect of varying $\\beta_1$.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-17-1.png){width=672}\n:::\n:::\n\n\n* when $\\beta_1 = 1$, this gives the simplest logistic function\n* when $\\beta_1 = 0$ gives a horizontal line, with $Y = \\frac{\\exp(\\beta_0)}{1+\\exp(\\beta_0)}$\n* when $\\beta_1$ is negative flips the curve around, so it slopes down\n* when $\\beta_1$ is very large then the curve becomes extremely steep\n\n:::\n\nWe can fit such an S-shaped curve to our `early_finches` data set, by creating a generalised linear model.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nIn R we have a few options to do this, and by far the most familiar function would be `glm()`. Here we save the model in an object called `glm_bks`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nglm_bks <- glm(pointed_beak ~ blength,\n family = binomial,\n data = early_finches)\n```\n:::\n\n\nThe format of this function is similar to that used by the `lm()` function for linear models. The important difference is that we must specify the _family_ of error distribution to use. For logistic regression we must set the family to **binomial**.\n\nIf you forget to set the `family` argument, then the `glm()` function will perform a standard linear model fit, identical to what the `lm()` function would do.\n\n## Python\n\nIn Python we have a few options to do this, and by far the most familiar function would be `glm()`. 
Here we save the model in an object called `glm_bks_py`:\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# create a linear model\nmodel = smf.glm(formula = \"pointed_beak ~ blength\",\n family = sm.families.Binomial(),\n data = early_finches_py)\n# and get the fitted parameters of the model\nglm_bks_py = model.fit()\n```\n:::\n\n\nThe format of this function is similar to that used by the `ols()` function for linear models. The important difference is that we must specify the _family_ of error distribution to use. For logistic regression we must set the family to **binomial**. This is buried deep inside the `statsmodels` package and needs to be defined as `sm.families.Binomial()`.\n\n:::\n\n## Model output\n\nThat's the easy part done! The trickier part is interpreting the output. First of all, we'll get some summary information.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nsummary(glm_bks)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\nCall:\nglm(formula = pointed_beak ~ blength, family = binomial, data = early_finches)\n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|) \n(Intercept) -43.410 15.250 -2.847 0.00442 **\nblength 3.387 1.193 2.839 0.00452 **\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 84.5476 on 60 degrees of freedom\nResidual deviance: 9.1879 on 59 degrees of freedom\nAIC: 13.188\n\nNumber of Fisher Scoring iterations: 8\n```\n\n\n:::\n:::\n\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nprint(glm_bks_py.summary())\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n Generalized Linear Model Regression Results \n==============================================================================\nDep. Variable: pointed_beak No. 
Observations: 61\nModel: GLM Df Residuals: 59\nModel Family: Binomial Df Model: 1\nLink Function: Logit Scale: 1.0000\nMethod: IRLS Log-Likelihood: -4.5939\nDate: Mon, 15 Jan 2024 Deviance: 9.1879\nTime: 10:25:50 Pearson chi2: 15.1\nNo. Iterations: 8 Pseudo R-squ. (CS): 0.7093\nCovariance Type: nonrobust \n==============================================================================\n coef std err z P>|z| [0.025 0.975]\n------------------------------------------------------------------------------\nIntercept -43.4096 15.250 -2.847 0.004 -73.298 -13.521\nblength 3.3866 1.193 2.839 0.005 1.049 5.724\n==============================================================================\n```\n\n\n:::\n:::\n\n\n:::\n\nThere’s a lot to unpack here, but let's start with what we're familiar with: coefficients!\n\n## Parameter interpretation\n\n::: {.panel-tabset group=\"language\"}\n## R\nThe coefficients or parameters can be found in the `Coefficients` block. The main numbers to extract from the output are the two numbers underneath `Estimate.Std`:\n\n```\nCoefficients:\n Estimate Std.\n(Intercept) -43.410\nblength 3.387 \n```\n\n## Python\n\nRight at the bottom is a table showing the model coefficients. The main numbers to extract from the output are the two numbers in the `coef` column:\n\n```\n======================\n coef\n----------------------\nIntercept -43.4096\nblength 3.3866\n======================\n```\n\n:::\n\nThese are the coefficients of the logistic model equation and need to be placed in the correct equation if we want to be able to calculate the probability of having a pointed beak for a given beak length.\n\nThe $p$ values at the end of each coefficient row merely show whether that particular coefficient is significantly different from zero. This is similar to the $p$ values obtained in the summary output of a linear model. 
As with continuous predictors in simple models, these $p$ values can be used to decide whether that predictor is important (so in this case beak length appears to be significant). However, these $p$ values aren’t great to work with when we have multiple predictor variables, or when we have categorical predictors with multiple levels (since the output will give us a $p$ value for each level rather than for the predictor as a whole).\n\nWe can use the coefficients to calculate the probability of having a pointed beak for a given beak length:\n\n$$ P(pointed \\ beak) = \\frac{\\exp(-43.41 + 3.39 \\times blength)}{1 + \\exp(-43.41 + 3.39 \\times blength)} $$\n\nHaving this formula means that we can calculate the probability of having a pointed beak for any beak length. How do we work this out in practice? \n\n::: {.panel-tabset group=\"language\"}\n## R\n\nWell, the probability of having a pointed beak if the beak length is large (for example 15 mm) can be calculated as follows:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nexp(-43.41 + 3.39 * 15) / (1 + exp(-43.41 + 3.39 * 15))\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 0.9994131\n```\n\n\n:::\n:::\n\n\nIf the beak length is small (for example 10 mm), the probability of having a pointed beak is extremely low:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nexp(-43.41 + 3.39 * 10) / (1 + exp(-43.41 + 3.39 * 10))\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 7.410155e-05\n```\n\n\n:::\n:::\n\n\n## Python\nWell, the probability of having a pointed beak if the beak length is large (for example 15 mm) can be calculated as follows:\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# import the math library\nimport math\n```\n:::\n\n::: {.cell}\n\n```{.python .cell-code}\nmath.exp(-43.41 + 3.39 * 15) / (1 + math.exp(-43.41 + 3.39 * 15))\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n0.9994130595039192\n```\n\n\n:::\n:::\n\n\nIf the beak length is small (for example 10 mm), the probability of having a 
pointed beak is extremely low:\n\n\n::: {.cell}\n\n```{.python .cell-code}\nmath.exp(-43.41 + 3.39 * 10) / (1 + math.exp(-43.41 + 3.39 * 10))\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n7.410155028945912e-05\n```\n\n\n:::\n:::\n\n:::\n\nWe can calculate the the probabilities for all our observed values and if we do that then we can see that the larger the beak length is, the higher the probability that a beak shape would be pointed. I'm visualising this together with the logistic curve, where the blue points are the calculated probabilities:\n\n::: {.callout-note collapse=true}\n## Code available here\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nglm_bks %>% \n augment(type.predict = \"response\") %>% \n ggplot() +\n geom_point(aes(x = blength, y = pointed_beak)) +\n geom_line(aes(x = blength, y = .fitted),\n linetype = \"dashed\",\n colour = \"blue\") +\n geom_point(aes(x = blength, y = .fitted),\n colour = \"blue\", alpha = 0.5) +\n labs(x = \"beak length (mm)\",\n y = \"Probability\")\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\n(ggplot(early_finches_py) +\n geom_point(aes(x = \"blength\", y = \"pointed_beak\")) +\n geom_line(aes(x = \"blength\", y = glm_bks_py.fittedvalues),\n linetype = \"dashed\",\n colour = \"blue\") +\n geom_point(aes(x = \"blength\", y = glm_bks_py.fittedvalues),\n colour = \"blue\", alpha = 0.5) +\n labs(x = \"beak length (mm)\",\n y = \"Probability\"))\n```\n:::\n\n:::\n:::\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Predicted probabilities for beak classification](glm-practical-logistic-binary_files/figure-html/fig-beak_class_glm_probs-2.png){#fig-beak_class_glm_probs width=672}\n:::\n:::\n\n\nThe graph shows us that, based on the data that we have and the model we used to make predictions about our response variable, the probability of seeing a pointed beak increases with beak length.\n\nShort beaks are more closely associated with the bluntly shaped 
beaks, whereas long beaks are more closely associated with the pointed shape. It's also clear that there is a range of beak lengths (around 13 mm) where the probability of getting one shape or another is much more even.\n\n## Parameter estimation explained\n\nNow that we know how to interpret the coefficients or parameters, let's have a look at how they're actually determined. To understand this, we need to take a step back.\n\n* Sum of squares (OLS)\n* MLE\n* Deviance\n\n## Assumptions\n\n* GAMLSS\n\n## Exercises\n\n### Diabetes {#sec-exr_diabetes}\n\n:::{.callout-exercise}\n\n\n{{< level 2 >}}\n\n\n\nFor this exercise we'll be using the data from `data/diabetes.csv`.\n\nThis is a data set comprising 768 observations of three variables (one dependent and two predictor variables). This records the results of a diabetes test result as a binary variable (1 is a positive result, 0 is a negative result), along with the result of a glucose tolerance test and the diastolic blood pressure for each of 768 women. The variables are called `test_result`, `glucose` and `diastolic`.\n\nWe want to see if the `glucose` tolerance is a meaningful predictor for predictions on a diabetes test. To investigate this, do the following:\n\n1. Load and visualise the data\n2. Create a suitable model\n3. Determine if there are any statistically significant predictors\n4. Calculate the probability of a positive diabetes test result for a glucose tolerance test value of `glucose = 150`\n\n::: {.callout-answer collapse=\"true\"}\n## Answer\n\n#### Load and visualise the data\n\nFirst we load the data, then we visualise it.\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndiabetes <- read_csv(\"data/diabetes.csv\")\n```\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\ndiabetes_py = pd.read_csv(\"data/diabetes.csv\")\n```\n:::\n\n\n:::\n\nLooking at the data, we can see that the `test_result` column contains zeros and ones. 
These are yes/no test result outcomes and not actually numeric representations.\n\nWe'll have to deal with this soon. For now, we can plot the data, by outcome:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggplot(diabetes,\n aes(x = factor(test_result),\n y = glucose)) +\n geom_boxplot()\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-32-1.png){width=672}\n:::\n:::\n\n\n## Python\n\nWe could just give Python the `test_result` data directly, but then it would view the values as numeric. Which doesn't really work, because we have two groups as such: those with a negative diabetes test result, and those with a positive one.\n\nWe can force Python to temporarily covert the data to a factor, by making the `test_result` column an `object` type. We can do this directly inside the `ggplot()` function.\n\n\n::: {.cell}\n\n```{.python .cell-code}\n(ggplot(diabetes_py,\n aes(x = diabetes_py.test_result.astype(object),\n y = \"glucose\")) +\n geom_boxplot())\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-33-1.png){width=614}\n:::\n:::\n\n:::\n\nIt looks as though the patients with a positive diabetes test have slightly higher glucose levels than those with a negative diabetes test.\n\nWe can visualise that differently by plotting all the data points as a classic binary response plot:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggplot(diabetes,\n aes(x = glucose,\n y = test_result)) +\n geom_point()\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-34-3.png){width=672}\n:::\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\n(ggplot(diabetes_py,\n aes(x = \"glucose\",\n y = \"test_result\")) +\n geom_point())\n```\n\n::: 
{.cell-output-display}\n![](glm-practical-logistic-binary_files/figure-html/unnamed-chunk-35-1.png){width=614}\n:::\n:::\n\n\n:::\n\n#### Create a suitable model\n\n::: {.panel-tabset group=\"language\"}\n## R\n\nWe'll use the `glm()` function to create a generalised linear model. Here we save the model in an object called `glm_dia`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nglm_dia <- glm(test_result ~ glucose,\n family = binomial,\n data = diabetes)\n```\n:::\n\n\nThe format of this function is similar to that used by the `lm()` function for linear models. The important difference is that we must specify the _family_ of error distribution to use. For logistic regression we must set the family to **binomial**.\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\n# create a linear model\nmodel = smf.glm(formula = \"test_result ~ glucose\",\n family = sm.families.Binomial(),\n data = diabetes_py)\n# and get the fitted parameters of the model\nglm_dia_py = model.fit()\n```\n:::\n\n\n:::\n\nLet's look at the model parameters:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nsummary(glm_dia)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n\nCall:\nglm(formula = test_result ~ glucose, family = binomial, data = diabetes)\n\nCoefficients:\n Estimate Std. Error z value Pr(>|z|) \n(Intercept) -5.611732 0.442289 -12.69 <2e-16 ***\nglucose 0.039510 0.003398 11.63 <2e-16 ***\n---\nSignif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1\n\n(Dispersion parameter for binomial family taken to be 1)\n\n Null deviance: 936.6 on 727 degrees of freedom\nResidual deviance: 752.2 on 726 degrees of freedom\nAIC: 756.2\n\nNumber of Fisher Scoring iterations: 4\n```\n\n\n:::\n:::\n\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nprint(glm_dia_py.summary())\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n Generalized Linear Model Regression Results \n==============================================================================\nDep. Variable: test_result No. Observations: 728\nModel: GLM Df Residuals: 726\nModel Family: Binomial Df Model: 1\nLink Function: Logit Scale: 1.0000\nMethod: IRLS Log-Likelihood: -376.10\nDate: Mon, 15 Jan 2024 Deviance: 752.20\nTime: 10:25:54 Pearson chi2: 713.\nNo. Iterations: 4 Pseudo R-squ. (CS): 0.2238\nCovariance Type: nonrobust \n==============================================================================\n coef std err z P>|z| [0.025 0.975]\n------------------------------------------------------------------------------\nIntercept -5.6117 0.442 -12.688 0.000 -6.479 -4.745\nglucose 0.0395 0.003 11.628 0.000 0.033 0.046\n==============================================================================\n```\n\n\n:::\n:::\n\n:::\n\nWe can see that `glucose` is a significant predictor for the `test_result` (the $p$ value is much smaller than 0.05).\n\nKnowing this, we're interested in the coefficients. We have an intercept of `-5.61` and `0.0395` for `glucose`. We can use these coefficients to write a formula that describes the potential relationship between the probability of having a positive test result, dependent on the glucose tolerance level value:\n\n$$ P(positive \\ test\\ result) = \\frac{\\exp(-5.61 + 0.04 \\times glucose)}{1 + \\exp(-5.61 + 0.04 \\times glucose)} $$\n\n#### Calculating probabilities\n\nUsing the formula above, we can now calculate the probability of having a positive test result, for a given `glucose` value. 
If we do this for `glucose = 150`, we get the following:\n\n::: {.panel-tabset group=\"language\"}\n## R\n\n\n::: {.cell}\n\n```{.r .cell-code}\nexp(-5.61 + 0.04 * 150) / (1 + exp(-5.61 + 0.04 * 145))\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 0.6685441\n```\n\n\n:::\n:::\n\n\n## Python\n\n\n::: {.cell}\n\n```{.python .cell-code}\nmath.exp(-5.61 + 0.04 * 150) / (1 + math.exp(-5.61 + 0.04 * 145))\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n0.6685441044999503\n```\n\n\n:::\n:::\n\n:::\n\nThis tells us that the probability of having a positive diabetes test result, given a glucose tolerance level of 150 is around 67%.\n\n:::\n:::\n\n## Summary\n\n::: {.callout-tip}\n#### Key points\n\n- We use a logistic regression to model a binary response\n- We can feed new observations into the model and get probabilities for the outcome\n:::\n", "supporting": [ "glm-practical-logistic-binary_files" ], diff --git a/_freeze/materials/glm-practical-logistic-binary/figure-html/fig-beak_class_glm_probs-2.png b/_freeze/materials/glm-practical-logistic-binary/figure-html/fig-beak_class_glm_probs-2.png index 39f95c1..56a33c5 100644 Binary files a/_freeze/materials/glm-practical-logistic-binary/figure-html/fig-beak_class_glm_probs-2.png and b/_freeze/materials/glm-practical-logistic-binary/figure-html/fig-beak_class_glm_probs-2.png differ diff --git a/_freeze/materials/glm-practical-logistic-binary/figure-html/unnamed-chunk-15-1.png b/_freeze/materials/glm-practical-logistic-binary/figure-html/unnamed-chunk-15-1.png new file mode 100644 index 0000000..cc434e8 Binary files /dev/null and b/_freeze/materials/glm-practical-logistic-binary/figure-html/unnamed-chunk-15-1.png differ diff --git a/_freeze/materials/glm-practical-logistic-binary/figure-html/unnamed-chunk-18-1.png b/_freeze/materials/glm-practical-logistic-binary/figure-html/unnamed-chunk-18-1.png deleted file mode 100644 index acbede4..0000000 Binary files 
a/_freeze/materials/glm-practical-logistic-binary/figure-html/unnamed-chunk-18-1.png and /dev/null differ diff --git a/_freeze/materials/glm-practical-logistic-binary/figure-html/unnamed-chunk-27-1.png b/_freeze/materials/glm-practical-logistic-binary/figure-html/unnamed-chunk-27-1.png index 39f95c1..56a33c5 100644 Binary files a/_freeze/materials/glm-practical-logistic-binary/figure-html/unnamed-chunk-27-1.png and b/_freeze/materials/glm-practical-logistic-binary/figure-html/unnamed-chunk-27-1.png differ diff --git a/_freeze/materials/glm-practical-logistic-proportion/execute-results/html.json b/_freeze/materials/glm-practical-logistic-proportion/execute-results/html.json deleted file mode 100644 index 2365ddd..0000000 --- a/_freeze/materials/glm-practical-logistic-proportion/execute-results/html.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "hash": "d805fa496007b086c9180ea0de4632de", - "result": { - "markdown": "---\ntitle: \"Proportional response\"\n---\n\n::: {.cell}\n\n:::\n\n\n::: callout-note\n## Aims & objectives\n- How do I analyse proportion responses?\n- Be able to create a logistic model to test proportion response variables\n- Be able to plot the data and fitted curve\n- Assess the significance of the fit\n:::\n\n## Libraries and functions\n\n::: panel-tabset\n## tidyverse\n\n| Library | Description |\n|:-----------------------------------|:-----------------------------------|\n| `tidyverse` | A collection of R packages designed for data science |\n| `tidymodels` | A collection of packages for modelling and machine learning using tidyverse principles |\n:::\n\n## Datasets\n\n::: panel-tabset\n## Challenger\n\nThe example in this section uses the following data set:\n\n`data/challenger.csv`\n\nThese data, obtained from the [faraway package](https://www.rdocumentation.org/packages/faraway/versions/1.0.7), contain information related to the explosion of the USA Space Shuttle Challenger on 28 January, 1986. 
An investigation after the disaster traced back to certain joints on one of the two solid booster rockets, each containing O-rings that ensured no exhaust gases could escape from the booster.\n\nThe night before the launch was unusually cold, with temperatures below freezing. The final report suggested that the cold snap during the night made the o-rings stiff, and unable to adjust to changes in pressure. As a result, exhaust gases leaked away from the solid booster rockets, causing one of them to break loose and rupture the main fuel tank, leading to the final explosion.\n\nThe question we're trying to answer in this session is: based on the data from the previous flights, would it have been possible to predict the failure of most o-rings on the Challenger flight?\n:::\n\n## Visualise the data\n\nFirst, we read in the data:\n\n::: panel-tabset\n## tidyverse\n\n\n::: {.cell}\n\n```{.r .cell-code}\nchallenger <- read_csv(\"data/challenger.csv\")\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nRows: 23 Columns: 2\n── Column specification ────────────────────────────────────────────────────────\nDelimiter: \",\"\ndbl (2): temp, damage\n\nℹ Use `spec()` to retrieve the full column specification for this data.\nℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.\n```\n:::\n\n```{.r .cell-code}\nchallenger\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 23 × 2\n temp damage\n \n 1 53 5\n 2 57 1\n 3 58 1\n 4 63 1\n 5 66 0\n 6 67 0\n 7 67 0\n 8 67 0\n 9 68 0\n10 69 0\n# … with 13 more rows\n```\n:::\n:::\n\n:::\n\nThe data set contains several columns:\n\n1. `temp`, the launch temperature in degrees Fahrenheit\n2. 
`damage`, the number of o-rings that showed erosion\n\nBefore we have a further look at the data, let's calculate the proportion of damaged o-rings (`prop_damaged`) and the total number of o-rings (`total`) and update our data set.\n\n::: panel-tabset\n## tidyverse\n\n\n::: {.cell}\n\n```{.r .cell-code}\nchallenger <-\nchallenger %>%\n mutate(total = 6, # total number of o-rings\n intact = 6 - damage, # number of undamaged o-rings\n prop_damaged = damage / total) # proportion damaged o-rings\n\nchallenger\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 23 × 5\n temp damage total intact prop_damaged\n \n 1 53 5 6 1 0.833\n 2 57 1 6 5 0.167\n 3 58 1 6 5 0.167\n 4 63 1 6 5 0.167\n 5 66 0 6 6 0 \n 6 67 0 6 6 0 \n 7 67 0 6 6 0 \n 8 67 0 6 6 0 \n 9 68 0 6 6 0 \n10 69 0 6 6 0 \n# … with 13 more rows\n```\n:::\n:::\n\n:::\n\nPlotting the proportion of damaged o-rings against the launch temperature shows the following picture:\n\n::: panel-tabset\n## tidyverse\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggplot(challenger, aes(x = temp, y = prop_damaged)) +\n geom_point()\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-proportion_files/figure-html/unnamed-chunk-4-1.png){width=672}\n:::\n:::\n\n:::\n\nThe point on the left is the data point corresponding to the coldest flight experienced before the disaster, where five damaged o-rings were found. Fortunately, this did not result in a disaster.\n\nHere we'll explore if we could have predicted the failure of both o-rings on the Challenger flight, where the launch temperature was 31 degrees Fahrenheit.\n\n## Model building\n\nWe only have 23 data points in total. 
So we're building a model on not that much data - we should keep this in mind when we draw our conclusions!\n\n::: panel-tabset\n## tidyverse\n\nWe are using a logistic regression for a proportion response in this case, since we're interested in the proportion of o-rings that are damaged.\n\nThe `logistic_reg()` function we used in the binary response section does not work here, because it expects a binary (yes/no; positive/negative; 0/1 etc) response.\n\nTo deal with that, we are using the standard `linear_reg()` function, still using the `glm` or generalised linear model engine, with the family or error distribution set to *binomial* (as before).\n\nFirst we set the model specification:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nchl_mod <- linear_reg(mode = \"regression\") %>%\n set_engine(\"glm\", family = \"binomial\")\n```\n:::\n\n\nThen we fit the data. Fitting the data for proportion responses is a bit annoying, where you have to give the `glm` model a two-column matrix to specify the response variable.\n\nHere, the first column corresponds to the number of damaged o-rings, whereas the second column refers to the number of intact o-rings. We use the `cbind()` function to bind these two together into a matrix.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nchl_fit <- chl_mod %>% \n fit(cbind(damage, intact) ~ temp,\n data = challenger)\n```\n:::\n\n\nNext, we can have a closer look at the results:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nchl_fit %>% tidy()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 5\n term estimate std.error statistic p.value\n \n1 (Intercept) 11.7 3.30 3.54 0.000403 \n2 temp -0.216 0.0532 -4.07 0.0000478\n```\n:::\n:::\n\n\nWe can see that the p-values of the `intercept` and `temp` are significant. 
We can also use the intercept and `temp` coefficients to construct the logistic equation, which we can use to sketch the logistic curve.\n\n$$E(prop \\ failed\\ orings) = \\frac{\\exp{(11.66 - 0.22 \\times temp)}}{1 + \\exp{(11.66 - 0.22 \\times temp)}}$$\n\nLet's see how well our model would have performed if we would have fed it the data from the ill-fated Challenger launch.\n\nOne way of doing this it to generate a table with data for a range of temperatures, from 25 to 85 degrees Fahrenheit, in steps of 1. We can then use these data to generate the logistic curve, based on the fitted model.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmodel <- tibble(temp = seq(25, 85, 1))\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\n# get the predicted proportions for the curve\ncurve <- chl_fit %>% augment(new_data = model)\n\n# plot the curve and the original data\nggplot(curve, aes(temp, .pred)) +\n geom_line(colour = \"red\") +\n geom_point(data = challenger, aes(temp, prop_damaged)) +\n # add a vertical line at the disaster launch temperature\n geom_vline(xintercept = 31, linetype = \"dashed\")\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-proportion_files/figure-html/unnamed-chunk-9-1.png){width=672}\n:::\n:::\n\n\nIt seems that there was a high probability of both o-rings failing at that launch temperature. One thing that the graph shows is that there is a lot of uncertainty involved in this model.\n:::\n\n## Exercise - predicting failure\n\nThe data point at 53 degrees Fahrenheit is quite influential for the analysis. Remove this data point and repeat the analysis. 
Is there still a predicted link between launch temperature and o-ring failure?\n\n::: {.callout-caution collapse=\"true\"}\n## Answer\n\n::: panel-tabset\n## tidyverse\n\nFirst, we need to remove the influential data point:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nchallenger_new <- challenger %>% filter(temp != 53)\n```\n:::\n\n\nWe can reuse the model specification, but we do have to update our fit:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nchl_new_fit <- chl_mod %>% \n fit(cbind(damage, intact) ~ temp,\n data = challenger_new)\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\n# get the predicted proportions for the curve\ncurve_new <- chl_new_fit %>% augment(new_data = model)\n\n# plot the curve and the original data\nggplot(curve_new, aes(temp, .pred)) +\n geom_line(colour = \"red\") +\n geom_point(data = challenger_new, aes(temp, prop_damaged)) +\n # add a vertical line at the disaster launch temperature\n geom_vline(xintercept = 31, linetype = \"dashed\")\n```\n\n::: {.cell-output-display}\n![](glm-practical-logistic-proportion_files/figure-html/unnamed-chunk-12-1.png){width=672}\n:::\n:::\n\n\nThe prediction proportion of damaged o-rings is markedly less in this scenario, with a failure rate of around 80%. 
The original fitted curve already had quite some uncertainty associated with it, but the uncertainty of this model is much greater.\n:::\n:::\n\n## Key points\n\n::: callout-note\n- We can use a logistic model for proportion response variables\n:::\n", - "supporting": [ - "glm-practical-logistic-proportion_files" - ], - "filters": [ - "rmarkdown/pagebreak.lua" - ], - "includes": {}, - "engineDependencies": {}, - "preserve": {}, - "postProcess": true - } -} \ No newline at end of file diff --git a/_freeze/materials/glm-practical-logistic-proportion/figure-html/unnamed-chunk-12-1.png b/_freeze/materials/glm-practical-logistic-proportion/figure-html/unnamed-chunk-12-1.png deleted file mode 100644 index 539f801..0000000 Binary files a/_freeze/materials/glm-practical-logistic-proportion/figure-html/unnamed-chunk-12-1.png and /dev/null differ diff --git a/_freeze/materials/glm-practical-logistic-proportion/figure-html/unnamed-chunk-4-1.png b/_freeze/materials/glm-practical-logistic-proportion/figure-html/unnamed-chunk-4-1.png deleted file mode 100644 index b29684c..0000000 Binary files a/_freeze/materials/glm-practical-logistic-proportion/figure-html/unnamed-chunk-4-1.png and /dev/null differ diff --git a/_freeze/materials/glm-practical-logistic-proportion/figure-html/unnamed-chunk-9-1.png b/_freeze/materials/glm-practical-logistic-proportion/figure-html/unnamed-chunk-9-1.png deleted file mode 100644 index 04f4807..0000000 Binary files a/_freeze/materials/glm-practical-logistic-proportion/figure-html/unnamed-chunk-9-1.png and /dev/null differ diff --git a/_freeze/materials/glm-practical-poisson/execute-results/html.json b/_freeze/materials/glm-practical-poisson/execute-results/html.json deleted file mode 100644 index 4b7f5ee..0000000 --- a/_freeze/materials/glm-practical-poisson/execute-results/html.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "hash": "a16ca644410f89e952514c95d5941686", - "result": { - "markdown": "---\ntitle: \"Poisson regression\"\n---\n\n::: 
{.cell}\n\n:::\n\n\n::: callout-note\n## Aims & objectives\n- How do we analyse count data?\n- Be able to perform a poisson regression on count data\n:::\n\n## Libraries and functions\n\n::: panel-tabset\n## tidyverse\n\n| Library | Description |\n|:-----------------------------------|:-----------------------------------|\n| `tidyverse` | A collection of R packages designed for data science |\n| `tidymodels` | A collection of packages for modelling and machine learning using tidyverse principles |\n| `poissonreg` | Enables the `parsnip` package to fit various types of Poisson regression models |\n:::\n\n## Datasets\n\n::: panel-tabset\n## Islands\n\nThe example in this section uses the following data set:\n\n`data/islands.csv`\n\nThis is a data set comprising 35 observations of two variables (one dependent and one predictor). This records the number of species recorded on different small islands along with the area (km2) of the islands. The variables are `species` and `area`.\n\n## Seatbelts\n\nThe `seatbelts` data set is a multiple time-series data set that was commissioned by the Department of Transport in 1984 to measure differences in deaths before and after front seatbelt legislation was introduced on 31st January 1983. It provides monthly total numerical data on a number of incidents including those related to death and injury in Road Traffic Accidents (RTA's). 
The data set starts in January 1969 and observations run until December 1984.\n\nYou can find the file in `data/seatbelts.csv`\n:::\n\n## Visualise the data\n\nA good first step is always to explore your data prior to any further analysis.\n\n::: panel-tabset\n## tidyverse\n\nFirst, we load and inspect the data:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nislands <- read_csv(\"data/island.csv\")\n\nislands\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 35 × 2\n species area\n \n 1 114 12.1\n 2 130 13.4\n 3 113 13.7\n 4 109 14.5\n 5 118 16.8\n 6 136 19.0\n 7 149 19.6\n 8 162 20.6\n 9 145 20.9\n10 148 21.0\n# … with 25 more rows\n```\n:::\n:::\n\n\nLooking at the data, we can see that there are two columns: `species`, which contains the number of species recorded on each island and `area`, which contains the surface area of the island in square kilometers.\n\nWe can plot the data:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nislands %>% \n ggplot(aes(x = area, y = species)) +\n geom_point()\n```\n\n::: {.cell-output-display}\n![](glm-practical-poisson_files/figure-html/unnamed-chunk-3-1.png){width=672}\n:::\n:::\n\n:::\n\nIt looks as though `area` may have an effect on the number of species that we observe on each island. We note that the response variable is count data and so we try to construct a Poisson regression.\n\n## Model building\n\nTo create a poisson regression we do the following:\n\n::: panel-tabset\n## tidyverse\n\nAgain, similar to what we've done for the logistic models, we will use the `parsnip` package from the `tidymodels` library. Yes, the workflow still seems a bit faffy, but it provides a common syntax for a whole range of modelling libraries. This means that the syntax will stay the same as you do different kind of model comparisons.\n\nIf you haven't loaded `tidymodels` yet, now is a really good time. 
We also need to load `poissonreg`, which adds extra functionality to `parsnip`.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# install.packages(\"tidymodels\")\nlibrary(tidymodels)\n# install.packages(\"poissonreg\")\nlibrary(poissonreg)\n```\n:::\n\n\nRemember that the workflow in `parsnip` is a bit different to what we're used to so far. Using `parsnip` we approach things in a more systematic manner. We specify a model in three steps:\n\n1. **Specify the type of model based on its mathematical structure** (e.g., linear regression, logistic regression, poisson regression etc).\n\nFor example:\n\n- `linear_reg()` for linear regression\n- `logistic_reg()` for logistic regression\n- `poisson_reg()` for poisson regression (we'll be using that here)\n\n2. **When required, declare the mode of the model.** The mode reflects the type of prediction outcome. For numeric outcomes, the mode is *regression*; for qualitative outcomes, it is *classification.* If a model can only create one type of model, such as poisson regression, the mode is already set to, in this case, `mode = \"regression\"`.\n\n3. **Specify the engine for fitting the model.** This usually is the software package or library that should be used.\n\nFor example,\n\n- `\"lm\"` for linear models\n- `\"glm\"` for generalised linear models\n- `\"stan\"` for Bayesian inference\n\nYou can find out which engines can be used with the `show_engines()` function. The command `show_engines(\"poisson_reg\")` will give you the available engines for the `poisson_reg()` function.\n\nSo, we can create the model as follows:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nisl_mod <- poisson_reg() %>% \n set_mode(\"regression\") %>% \n set_engine(\"glm\")\n```\n:::\n\n\nNote that we are not actually specifying any of the variables just yet! All we've done is tell R what kind of model we're planning to use. 
If we want to see how `parsnip` converts this code to the package syntax, we can check this with `translate()`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nisl_mod %>% translate()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nPoisson Regression Model Specification (regression)\n\nComputational engine: glm \n\nModel fit template:\nstats::glm(formula = missing_arg(), data = missing_arg(), weights = missing_arg(), \n family = stats::poisson)\n```\n:::\n:::\n\n\nThis shows that we have a poisson regression model, where the outcome is going to be a regression. The model fit template tells us that we'll be using the `glm()` function from the `stats` package, which can take a `formula`, `data`, `weights` and `family` argument. The `family` argument is already set to poisson.\n\nThe model fit template tells us that we'll be using the `glm()` function from the `stats` package (`stats::glm`). This function has several arguments:\n\n1. a `formula`, which we'll specify later\n2. `data`, which we'll provide in a bit\n3. `weights`, if we want to add prior weights to our variable - we don't have to concern ourselves with this - and\n4. a `family` argument, which is already set to `poisson`\n\n::: callout-important\n## The `family` argument\n\nThe `family` argument gives us a description of the error distribution and link function that will be used in the model. 
For the `islands` data set we are looking at count outcome - which we can model using a *poisson* regression model.\n\nIf we'd want to specify it manually, then we'd use\n\n`set_engine(\"glm\", family = stats::poisson(link = \"log\"))`\n\nwhich sets the family to poisson, using a log link function.\n:::\n\nNow we've specified what kind of model we're planning to use, we can fit our data to it, using the `fit()` function:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nisl_fit <- isl_mod %>% \n fit(species ~ area,\n data = islands)\n```\n:::\n\n\nWe can look at the output directly, but I prefer to tidy the data up using the `tidy()` function from `broom` package:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nisl_fit %>% tidy()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 5\n term estimate std.error statistic p.value\n \n1 (Intercept) 4.24 0.0413 103. 0 \n2 area 0.0356 0.00125 28.6 2.73e-179\n```\n:::\n:::\n\n\nThe output is strikingly similar to the binomial models (who'd have guessed, eh?) and the main numbers to extract from the output are the two numbers in the `estimate` column.\n:::\n\nThese are the coefficients of the Poisson model equation and need to be placed in the following formula in order to estimate the expected number of species as a function of island size:\n\n$$ E(species) = \\exp(4.24 + 0.036 \\times area) $$\n\nInterpreting this requires a bit of thought (not much, but a bit).\n\nThe intercept coefficient ($\\beta_0$), 4.24, is related to the number of species we would expect on an island of zero area (this is statistics, not real life. You'd do well to remember that before you worry too much about what that even means). But in order to turn this number into something meaningful we have to exponentiate it. Since $\\exp(4.24) \\approx 70$, we can say that the baseline number of species the model expects on any island is 70. This isn't actually the interesting bit though.\n\nThe coefficient of `area` ($\\beta_1$) is the fun bit. 
For starters we can see that it is a positive number which does mean that increasing area leads to increasing numbers of species. Good so far - since this matches what we saw when we plotted our data. But what does the value 0.036 actually mean?\n\nWell, if we exponentiate it too we get $\\exp(0.036) \\approx 1.04$. This means that for every increase in area of 1 km2 (the original units of the `area` variable) the number of species on the island is multiplied by 1.04. So, an island of area 1 km2 is predicted to have $1.04 \\times 70 \\approx 72$ species.\n\nSo, in order to interpret Poisson coefficients, you have to exponentiate them.\n\n## Model predictions\n\nNow that we can interpret the Poisson coefficients, it would be good to see if using a poisson regression to describe these data is actually a good idea.\n\nVisualisation is always useful, so in order to get an idea of how our data fits a Poisson regression, we'll plot the Poisson regression curve. Next, we overlay our original data.\n\n::: panel-tabset\n## tidyverse\n\nFirst, we create a table that contains data for the curve, starting for an `area` with value 1 to 50, in steps of 1.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nmodel <- tibble(area = seq(1, 50, 1))\n```\n:::\n\n\nNext, we feed our model these data:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ncurve <- isl_fit %>% augment(new_data = model)\n```\n:::\n\n\nThis gives the predicted number of `species` for each given value of `area`. If we have a closer look at these data we can see that, for example, for an `area` with a surface area of 4 km2 the predicted number of species is around 80. 
Nice.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nhead(curve)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 6 × 2\n area .pred\n \n1 1 72.0\n2 2 74.6\n3 3 77.3\n4 4 80.1\n5 5 83.0\n6 6 86.0\n```\n:::\n:::\n\n\nUsing these data, we can now plot all the *predicted* number of species and overlay our original *measured* data.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggplot(curve, aes(area, .pred)) +\n geom_line(colour = \"red\") +\n geom_point(data = islands, aes(area, species))\n```\n\n::: {.cell-output-display}\n![](glm-practical-poisson_files/figure-html/unnamed-chunk-12-1.png){width=672}\n:::\n:::\n\n:::\n\nThat looks like a pretty decent fit, really. But of course we want to have a (slightly) less hand-wavy conclusion than that.\n\n## Goodness-of-fit\n\nWe can use the model's residual deviance to assess how much the predicted values differ from the observed. This gives us an idea of how well-specified the model is. When a model is \"true\", *i.e.* the model makes pretty accurate predictions, then we expect the residual deviance to be distributed as a $\\chi^2$ random variable with degrees of freedom equal to the model's residual degrees of freedom.\n\n::: panel-tabset\n## tidyverse\n\nWe can get these parameters as follows and we'll store them in a new object, so we can extract them in a bit.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nisl_fit %>% glance()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 1 × 8\n null.deviance df.null logLik AIC BIC deviance df.residual nobs\n \n1 857. 34 -139. 283. 286. 
30.4 33 35\n```\n:::\n\n```{.r .cell-code}\nisl_parameters <- isl_fit %>% glance()\n```\n:::\n\n\nThe values we are interested in are in the `deviance` and `df.residual` columns, respectively.\n\nNext, we use the `pchisq()` function to calculate the correct probability.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npchisq(isl_parameters$deviance,\n isl_parameters$df.residual,\n lower.tail = FALSE)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 0.595347\n```\n:::\n:::\n\n\nThis gives us a value of around 0.60. This suggests that this model is actually a pretty good one (if it wasn't then the value would be close to zero) and that the data are pretty well supported by the model.\n\n::: callout-important\nThe `pchisq()` function gives the lower tail probability that $\\chi^2 \\le x$ by default. We're actually interested in the probability that $\\chi^2 \\ge x$. These two probabilities must sum to one, so we get the upper tail probability by setting the argument `lower.tail = FALSE`. An alternative way would be to use the default, but do `1 - pchisq()`.\n:::\n\nFor Poisson models this has an extra interpretation. This can be used to assess whether we have significant overdispersion in our data. For a Poisson model to be appropriate we need that the variance of the data to be exactly the same as the mean of the data. If there is overdispersion then the data would spread out more for higher predicted values of species than for lower ones. Our visualisation shows that this isn't really happening. The spread is unlikely to be perfectly homogeneous, but we don't want the data to spread out too much.\n\nThe easy way to check this is to look at the ratio of the residual deviance to the residual degrees of freedom (in this case 0.922). For a Poisson model to be valid, this ratio should be about 1. 
If the ratio is significantly bigger than 1 then we say that we have over-dispersion in the model and we wouldn't be able to trust any of the significance testing using a Poisson regression.\n:::\n\n## Confidence intervals\n\nWe can also assess how reliable our model is by looking at the confidence intervals of the estimated parameters.\n\n::: panel-tabset\n## tidyverse\n\nWe extracted the parameters of the model by using\n\n\n::: {.cell}\n\n```{.r .cell-code}\nisl_fit %>% tidy()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 5\n term estimate std.error statistic p.value\n \n1 (Intercept) 4.24 0.0413 103. 0 \n2 area 0.0356 0.00125 28.6 2.73e-179\n```\n:::\n:::\n\n\nAlthough we focussed on the `estimate` column, we can see that the associated standard errors for each estimate is also given in the `std.error` column. We can use these values to calculate the 95% confidence intervals.\n\nWe can either do this by hand through multiplying the standard errors by 1.96. We can then subtract from (giving the lower confidence estimate) or add to (giving the higher confidence estimate) the estimated parameter. This gives a pretty decent approximation.\n\nBut then again, life is short, so we can just use the additional argument that is available for the `tidy()` function. You can look at what columns are returned, but I'm selecting the relevant ones here:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nisl_fit %>% tidy(conf.int = TRUE, # default is FALSE\n conf.level = 0.95) %>% # is the default\n select(term, estimate, conf.low, conf.high)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 4\n term estimate conf.low conf.high\n \n1 (Intercept) 4.24 4.16 4.32 \n2 area 0.0356 0.0332 0.0381\n```\n:::\n:::\n\n\nWhat we're interested in here is the confidence intervals for the `area` parameter. Before we delve into that, I'm also going to calculate the exponent for these confidence intervals. 
We can do this using the `exp()` function.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nisl_fit %>% tidy(conf.int = TRUE, # default is FALSE\n conf.level = 0.95) %>% # is the default\n select(term, estimate, conf.low, conf.high) %>% \n mutate(conf.low_exp = exp(conf.low),\n conf.high_exp = exp(conf.high))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 6\n term estimate conf.low conf.high conf.low_exp conf.high_exp\n \n1 (Intercept) 4.24 4.16 4.32 64.1 75.3 \n2 area 0.0356 0.0332 0.0381 1.03 1.04\n```\n:::\n:::\n\n\nThese values are a bit familiar, since we've previously determined based on the `area` coefficient that for each increase in square kilometer, the number of species increases by approximately 1.04.\n\nWhat these values tell us is that we can be 95% certain that for every increase in square kilometer of island area size, the number of species increases by a factor of somewhere between 1.034 and 1.039.\n\n::: callout-note\nIf there was no association between `area` and `species`, then the $\\beta_1$ coefficient would be zero. That would mean that the exponent would be ${e}^{\\beta_1}=1$. The interval that we calculated for ${e}^{\\beta_1}$ lies between 1.034 and 1.039 and therefore does not include 1, so the model with `area` is preferred over the model without `area`.\n\nEquivalently, the interval for $\\beta_1$ (0.033 - 0.038) does not include 0, again showing the significance of `area` as a predictor of `species`.\n:::\n:::\n\n## Exercise: Seatbelts\n\nI'd like you to do the following:\n\n1. Load the data\n2. Visualise the data and create a poisson regression model\n3. Plot the regression model on top of the data\n4. 
Assess if the model is a decent predictor for the number of fatalities\n\n::: {.callout-caution collapse=\"true\"}\n## Answer\n\n### Load the data\n\nFirst, we load the data.\n\n::: panel-tabset\n## tidyverse\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseatbelts <- read_csv(\"data/seatbelts.csv\")\n```\n\n::: {.cell-output .cell-output-stderr}\n```\nRows: 192 Columns: 10\n── Column specification ────────────────────────────────────────────────────────\nDelimiter: \",\"\nchr (1): month\ndbl (9): drivers_killed, drivers, front, rear, kms, petrol_price, van_killed...\n\nℹ Use `spec()` to retrieve the full column specification for this data.\nℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.\n```\n:::\n:::\n\n:::\n\n### Visualise the data\n\nThe data tracks the number of drivers killed in road traffic accidents, before and after the seatbelt law was introduced. The information on whether the law was in place is encoded in the `law` column as `0` (law not in place) or `1` (law in place).\n\nThere are many more observations when the law was *not* in place, so we need to keep this in mind when we're interpreting the data.\n\nFirst we have a look at the data comparing no law vs law:\n\n::: panel-tabset\n## tidyverse\n\nWe have to convert the `law` column to a factor, otherwise R will see it as numerical.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseatbelts %>% \n ggplot(aes(as_factor(law), drivers_killed)) +\n geom_boxplot()\n```\n\n::: {.cell-output-display}\n![](glm-practical-poisson_files/figure-html/unnamed-chunk-19-1.png){width=672}\n:::\n:::\n\n\nThe data are recorded by month and year, so we can also display the number of drivers killed by year:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseatbelts %>% \n ggplot(aes(year, drivers_killed)) +\n geom_point()\n```\n\n::: {.cell-output-display}\n![](glm-practical-poisson_files/figure-html/unnamed-chunk-20-1.png){width=672}\n:::\n:::\n\n:::\n\nThe data look a bit weird. 
There is quite some variation within years (keeping in mind that the data are aggregated monthly). The data also seems to wave around a bit... with some vague peaks (e.g. 1972 - 1973) and some troughs (e.g. around 1976).\n\nSo my initial thought is that these data are going to be a bit tricky to interpret. But that's OK.\n\n### Model building\n\nWe're dealing with count data, so we're going to use a poisson regression.\n\n::: panel-tabset\n## tidyverse\n\nAs before, we first define the model type.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nstb_mod <- poisson_reg() %>% \n set_mode(\"regression\") %>% \n set_engine(\"glm\")\n```\n:::\n\n\nAnd check that everything is in order.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nstb_mod %>% translate()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\nPoisson Regression Model Specification (regression)\n\nComputational engine: glm \n\nModel fit template:\nstats::glm(formula = missing_arg(), data = missing_arg(), weights = missing_arg(), \n family = stats::poisson)\n```\n:::\n:::\n\n\nNext, we fit our data to the model we just specified:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nstb_fit <- stb_mod %>% \n fit(drivers_killed ~ year,\n data = seatbelts)\n```\n:::\n\n\nAnd we can extract the estimated coefficients from these fitted data:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nstb_fit %>% tidy()\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 5\n term estimate std.error statistic p.value\n \n1 (Intercept) 37.2 2.80 13.3 2.62e-40\n2 year -0.0164 0.00142 -11.6 5.88e-31\n```\n:::\n:::\n\n:::\n\n### Visualise the model\n\nTo see if the model is actually a decent prediction for our data, we can plot it.\n\n::: panel-tabset\n## tidyverse\n\nTo do this, we create modelled values for our predictor variable `year`. 
Because we have monthly data, we create a sequence of \"years\" in 1/12th intervals - one for each month.\n\nNext, we ask the model to predict the number of drivers killed based on these values.\n\nLastly, we can plot those predicted values against the observed values in our data set.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# create the sequence of values that are used\n# in predicting number of deaths\nmodel <- tibble(year = seq(1969, 1984, (1/12)))\n\n# fit these data to the model\nstb_curve <- stb_fit %>% augment(new_data = model)\n\n# plot the predicted values\n# and overlay the observed values\nggplot(seatbelts, aes(year, drivers_killed)) +\n geom_point() +\n geom_point(data = stb_curve, aes(x = year, y = .pred),\n colour = \"red\")\n```\n\n::: {.cell-output-display}\n![](glm-practical-poisson_files/figure-html/unnamed-chunk-25-1.png){width=672}\n:::\n:::\n\n:::\n\nThat does not look like a very good fit, but then again the data look rather messy as well.\n\n### Goodness-of-fit\n\nLet's check the goodness-of-fit.\n\n::: panel-tabset\n## tidyverse\n\nFirst we store the parameter estimates in an object. Then we use the `pchisq()` function to calculate the probability that the residual deviance is actually distributed as a $\\chi^2$ random variable with degrees of freedom equal to the model's residual degrees of freedom.\n\nYes, you can read that sentence three times and still wonder what that really means. Suffice to say is that the outcome gives us a measure of how well-specified the model is.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nstb_parameter <- stb_fit %>% glance()\n\nstb_parameter\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 1 × 8\n null.deviance df.null logLik AIC BIC deviance df.residual nobs\n \n1 984. 191 -1062. 2127. 2134. 850. 
190 192\n```\n:::\n\n```{.r .cell-code}\npchisq(stb_parameter$deviance,\n stb_parameter$df.residual,\n lower.tail = FALSE)\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n[1] 3.12987e-84\n```\n:::\n:::\n\n:::\n\nWell, that's a bit of a blow. The probability value is extremely low, suggesting that the model is not very well-specified. Which kind of matches what we already saw in the plot. It might still be better than the null model (\"the data can be modelled as the average across all the observations\"), but we seem to be missing some parameters here.\n\n### Confidence intervals\n\nSimilar to the `islands` example, we can also calculate the confidence intervals associated with our estimated parameters.\n\n::: panel-tabset\n## tidyverse\n\n\n::: {.cell}\n\n```{.r .cell-code}\nstb_fit %>% tidy(conf.int = TRUE, # default is FALSE\n conf.level = 0.95) %>% # is the default\n select(term, estimate, conf.low, conf.high) %>% \n mutate(conf.low_exp = exp(conf.low),\n conf.high_exp = exp(conf.high))\n```\n\n::: {.cell-output .cell-output-stdout}\n```\n# A tibble: 2 × 6\n term estimate conf.low conf.high conf.low_exp conf.high_exp\n \n1 (Intercept) 37.2 31.7 42.7 5.78e+13 3.34e+18\n2 year -0.0164 -0.0191 -0.0136 9.81e- 1 9.86e- 1\n```\n:::\n:::\n\n\nWe're interested in the confidence interval for the `year` variable. Remember that if there is no association at all between `year` and `drivers_killed` then the parameter $e^{\\beta_1} = 1$.\n\nIn our case the interval we calculated for $e^{\\beta_1}$ lies between 0.981 and 0.986. 
This does not include 1, so it seems that the model that takes `year` into account is still preferred over a model that doesn't.\n:::\n:::\n\n## Key points\n\n::: callout-note\n- Poisson regression is useful when dealing with count data\n:::\n", - "supporting": [ - "glm-practical-poisson_files" - ], - "filters": [ - "rmarkdown/pagebreak.lua" - ], - "includes": {}, - "engineDependencies": {}, - "preserve": {}, - "postProcess": true - } -} \ No newline at end of file diff --git a/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-12-1.png b/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-12-1.png deleted file mode 100644 index 4238ca0..0000000 Binary files a/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-12-1.png and /dev/null differ diff --git a/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-19-1.png b/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-19-1.png deleted file mode 100644 index a0074bb..0000000 Binary files a/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-19-1.png and /dev/null differ diff --git a/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-20-1.png b/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-20-1.png deleted file mode 100644 index 900293b..0000000 Binary files a/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-20-1.png and /dev/null differ diff --git a/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-25-1.png b/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-25-1.png deleted file mode 100644 index c76990d..0000000 Binary files a/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-25-1.png and /dev/null differ diff --git a/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-3-1.png b/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-3-1.png deleted file mode 100644 index eeeee5f..0000000 Binary files 
a/_freeze/materials/glm-practical-poisson/figure-html/unnamed-chunk-3-1.png and /dev/null differ diff --git a/_freeze/site_libs/clipboard/clipboard.min.js b/_freeze/site_libs/clipboard/clipboard.min.js index 41c6a0f..1103f81 100644 --- a/_freeze/site_libs/clipboard/clipboard.min.js +++ b/_freeze/site_libs/clipboard/clipboard.min.js @@ -1,7 +1,7 @@ /*! - * clipboard.js v2.0.10 + * clipboard.js v2.0.11 * https://clipboardjs.com/ * * Licensed MIT © Zeno Rocha */ -!function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.ClipboardJS=e():t.ClipboardJS=e()}(this,function(){return n={686:function(t,e,n){"use strict";n.d(e,{default:function(){return o}});var e=n(279),i=n.n(e),e=n(370),u=n.n(e),e=n(817),c=n.n(e);function a(t){try{return document.execCommand(t)}catch(t){return}}var f=function(t){t=c()(t);return a("cut"),t};var l=function(t){var e,n,o,r=1 - + @@ -17,7 +17,7 @@ ul.task-list{list-style: none;} ul.task-list li input[type="checkbox"] { width: 0.8em; - margin: 0 0.8em 0.2em -1.6em; + margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ vertical-align: middle; } @@ -46,7 +46,12 @@ "collapse-after": 3, "panel-placement": "start", "type": "textbox", - "limit": 20, + "limit": 50, + "keyboard-shortcut": [ + "f", + "/", + "s" + ], "language": { "search-no-results-text": "No results", "search-matching-documents-text": "matching documents", @@ -55,8 +60,10 @@ "search-more-match-text": "more match in this document", "search-more-matches-text": "more matches in this document", "search-clear-button-title": "Clear", + "search-text-placeholder": "", "search-detached-cancel-button-title": "Cancel", - "search-submit-button-title": "Submit" + "search-submit-button-title": "Submit", + "search-label": "Search" } } @@ -67,9 +74,9 @@
-