diff --git a/R/preprocess_penguins.rds b/R/preprocess_penguins.rds index 4f8a805..c1a0d06 100644 Binary files a/R/preprocess_penguins.rds and b/R/preprocess_penguins.rds differ diff --git a/figures/cheem_tour.gif b/figures/cheem_tour.gif index 24516fb..5855f20 100644 Binary files a/figures/cheem_tour.gif and b/figures/cheem_tour.gif differ diff --git a/figures/grand_tour.gif b/figures/grand_tour.gif index 09eb4ac..516cdd3 100644 Binary files a/figures/grand_tour.gif and b/figures/grand_tour.gif differ diff --git a/figures/manual_tour.gif b/figures/manual_tour.gif index 97b89f0..8659f16 100644 Binary files a/figures/manual_tour.gif and b/figures/manual_tour.gif differ diff --git a/figures/pixel_based.png b/figures/pixel_based.png new file mode 100644 index 0000000..d127a8e Binary files /dev/null and b/figures/pixel_based.png differ diff --git a/images/index.rmd/unnamed-chunk-4-1.png b/images/index.rmd/unnamed-chunk-4-1.png index 11ef756..59ef737 100644 Binary files a/images/index.rmd/unnamed-chunk-4-1.png and b/images/index.rmd/unnamed-chunk-4-1.png differ diff --git a/images/index.rmd/unnamed-chunk-7-1.png b/images/index.rmd/unnamed-chunk-7-1.png index 9ad9b1a..1024a08 100644 Binary files a/images/index.rmd/unnamed-chunk-7-1.png and b/images/index.rmd/unnamed-chunk-7-1.png differ diff --git a/images/index.rmd/unnamed-chunk-7-2.png b/images/index.rmd/unnamed-chunk-7-2.png new file mode 100644 index 0000000..a083113 Binary files /dev/null and b/images/index.rmd/unnamed-chunk-7-2.png differ diff --git a/index.html b/index.html index e8ddbed..157a02d 100644 --- a/index.html +++ b/index.html @@ -1,10 +1,10 @@ - Multivariate data vis, dynamic linear projection, and their application to local explanations of models + Multivariate Data Visualization and Thinking - + @@ -39,34 +39,46 @@ class: title-slide count: false -# .monash-blue[Multivariate data vis, dynamic linear projection, and their application to local explanations of models] +# .monash-blue[Multivariate Data 
Visualization and Thinking] <br> -<h2 style="font-weight:900!important;">Monash EBS reading group</h2> +<h2 style="font-weight:900!important;">IFFRUG</h2> .bottom_abs.width100[ <br> *Nicholas Spyrison* <br> -Monash University <br> -[nspyrison.netlify.app](https://nspyrison.netlify.app/)<br>nicholas.syrison@monash.edu <br> -6 Dec 2021 <br><br> -Slides -- [nspyrison.github.io/pres_ebs](https://nspyrison.github.io/pres_ebs/#1) +[nspyrison.netlify.app](https://nspyrison.netlify.app/)<br>Nicholas.Syrison@iff.com <br> +IFF, Madison <br> +April 2023 <br><br> +Slides -- [nspyrison.github.io/pred_ebs](https://nspyrison.github.io/pred_ebs/#1) ] --- -# Terminology +# Terminology & bias -- *Variable* over attribute, column, measure -- *Observation* over item -- *Projection* over embedding +- Stats background, your preferred terms may vary +- __Variable__ - _p_, attribute, measure, column +- __Observation__ - _n_, item, instance, sample, repetition, row +- __Projection__ - an embedded space, of _d < p_ dimension +- __Dimension__ - Overloaded, but __p__ data or approximated space, __d__ projection - R, Grammar of Graphics, & __ggplot2__ <br> # Etiquette -- Please interrupt for clarifications & discussion +- Please interrupt for clarifications & elaboration - Tangential or extension questions at the end, time permitting --- +# Contents + +- Context and data +- Traditional techniques +- Dimension reduction +- Tours +- Cheem + +--- + # Context: data types <img src="./figures/munzner_datatypes.PNG" width="70%" style="display: block; margin: auto;" /> @@ -79,10 +91,10 @@ # Example data -- Palmer penguins -- penguins near Palmer Station, Antarctica +- Penguins near Palmer Station, Antarctica - 330 observations - X variables: 4 physical measurements -- species of penguin mapped to color & shape +- Species of penguin mapped to color & shape <table> <thead> @@ -142,13 +154,13 @@ --- -# Scatterplot matrix +# Scatterplot matrix (SPLOM, small multiples) <img 
src="images/index.rmd/unnamed-chunk-4-1.png" width="50%" style="display: block; margin: auto;" /> _Chambers, 1983_ ``` -GGally::ggpairs() +GGally::ggpairs(...) ``` - Scalability? @@ -161,7 +173,7 @@ <img src="images/index.rmd/unnamed-chunk-5-1.png" width="50%" style="display: block; margin: auto;" /> _Ocagne, 1885_ ``` -GGally::ggparcoord() +GGally::ggparcoord(...) ``` - Scalability? @@ -170,6 +182,27 @@ --- +# Bonus: observation-based (observation-mapped axes) + +<img src="./figures/pixel_based.png" width="80%" style="display: block; margin: auto;" /> + +- Scalability? Interpretability? Correlation? +- Asymmetric across variable order + +--- + +# Bonus: Chernoff faces + +- Scalability? Interpretability? Correlation? + +``` +tourr::animate_faces() ## Only for fun? +``` + +<img src="images/index.rmd/unnamed-chunk-7-1.png" width="50%" style="display: block; margin: auto;" /><img src="images/index.rmd/unnamed-chunk-7-2.png" width="50%" style="display: block; margin: auto;" /> + +--- + # Dimension reduction **Linear** @@ -205,11 +238,7 @@ “All non-linear projections are wrong, but some are useful.” <br>--- Anatasios Panagiotelis (play on George Box’s quote about models) - <!-- quote from: NUMBAT Seminar, 04/20/2020 --> -<!-- (borrowing George Box’s --> -<!-- quote about models) --> - --- class: transition @@ -220,55 +249,57 @@ # Linear projections -- intuition -You are already familiar with `\((p=3, d=2)\)` linear projections! +Good news; you are already familiar with `\((p=3, d=2)\)` linear projections! 
<img src="./figures/intuition_linear_proj.PNG" width="100%" style="display: block; margin: auto;" /> -LDA: _Fischer, 1936_, supervised cluster separation - Not all orientations hold interesting structure -- But information/structure could be gleaned from rotation +- But structural information could be gleaned from rotation --- -# Linear projections -- notation +# Linear projections -- rotation <br> <img src="./figures/linear_proj_wide.png" width="100%" style="display: block; margin: auto;" /> +LDA: _Fisher, 1936_, supervised cluster separation -<br> -Using linear combinations of variables we can find bases (orientation) that separate clusters the most (oLDA) -<br> +Given the distributions and orientations of different clusters, find a basis (rotation) that separates clusters the most -- -__Caveat:__ - Often times our output space is of the same dimensionality as our input space - - PCA returns a `\(pxp\)` basis - - The reduction happens when the space is approximated with fewer components, often involving guided, but _subjective_ selection (scree plot) + - e.g. PCA returns a `\(p \times p\)` basis + - The reduction happens when the space is approximated with fewer components, often involving guided, but _subjective_ selection. e.g. 
finding an "elbow" in a scree plot + - Or worse -- only showing PC1:2, with no recognition or discussion of the others --- -# Linear projections -- traditional approch +# Linear projections -- traditional process <br><br> -1) Scale each variable to [0, 1] or by standard deviations away from the mean<br> +1) Scale each variable to [0, 1] or by standard deviations away from the mean (z-score)<br> 2) Some people 'whiten' or 'sphere' transform the covariance matrix to an identity matrix; should be justified<br> -3) If `\(p\)` is sizable, say more than 10 or, may do an initial round of dimension reduction to get to a realistic view-space<br> - - Typically with PCA by eyeballing an elbow in a scree plot - - "We approximate 90% of the variation of our 20 variable in the first 5 principal components" -4) Visualize with data- or component- space with scatterplot matrix, _etc._ +3) If `\(p\)` is sizable, say more than 10 or so, one may first approximate the data in fewer components to get to a realistic dimensionality to view +- Typically with PCA by eyeballing an elbow in the scree plot +- "We approximate 90% of the variation of our 20 variables in the first 5 principal components" +- Alternative mathematical approximations: "intrinsic dimensionality estimation" +4) Visualize data- or component- space + +-- <br><br> -**Static, discrete, and orthogonal views** +Visualize _one rotation (or more if lucky);_ **static, not interactive** --- # Tours -- Instead of viewing only static orthogonal views, let's *animate* small changes in the basis over time +- Instead of viewing only static views, *animate* small changes in the basis over time - Object permanence between frames; can see observations and cluster moving together <img src="./figures/tour_frames.PNG" width="50%" style="display: block; margin: auto;" /> +*Buja et al., 2005* --- @@ -286,7 +317,8 @@ ggtour() + proto_point() + proto_basis() + - proto_origin() + proto_origin() %>% + animate_gganimate() ``` ] @@ -321,16 +353,20 @@ <td 
style="text-align:left;"> local </td> <td style="text-align:left;"> random bases within a local vacinity </td> </tr> + <tr> + <td style="text-align:left;"> _et al._ </td> + <td style="text-align:left;"> slicing, lensing </td> + </tr> </tbody> </table> -- -<br><br> -- The grand tour is good for EDA, but selects target frames randomly, no objective function. - - Bill length is important for distinguishing the ornage cluster -- Component spaces and grand tour have no means to "steer" the projection -- Look at it in more detail: control its contribution with a _manual tour_ +<br> +- The grand tour is good for EDA, but selects target frames randomly, no objective function +- Hypothesis; Bill length is important for distinguishing the orange cluster +- Issue; Component spaces and grand tour have no user interaction to "steer" the basis +- Response; Control its contribution with a _manual tour_ --- @@ -347,7 +383,8 @@ ggtour() + proto_point() + proto_basis() + - proto_origin() + proto_origin() %>% + animate_gganimate() ``` ] @@ -370,8 +407,8 @@ <td style="text-align:left;"> Wickham et al., 2011 </td> </tr> <tr> - <td style="text-align:left;"> {spinifex} </td> - <td style="text-align:left;"> Manual tours, basis function, display with ggplot2 to plotly/gganimate animations </td> + <td style="text-align:left;"> _{spinifex}_ </td> + <td style="text-align:left;"> Manual tours, compose animations exportable to plotly/gganimate </td> <td style="text-align:left;"> Spyrison &amp; Cook, 2020 </td> </tr> <tr> @@ -379,17 +416,35 @@ <td style="text-align:left;"> Diagnostic plots for projection pursuit (guided tour), tracing basis-paths </td> <td style="text-align:left;"> Zhang et al., 2021 </td> </tr> + <tr> + <td style="text-align:left;"> {liminal} </td> + <td style="text-align:left;"> Ensemble graphics with tSNE and tours side-by-side </td> + <td style="text-align:left;"> Lee, 2021 </td> + </tr> + <tr> + <td style="text-align:left;"> {loon.tour} </td> + <td style="text-align:left;"> 
Graphics display system 'loon' for tours </td> + <td style="text-align:left;"> Xu &amp; Oldford, 2021 </td> + </tr> + <tr> + <td style="text-align:left;"> _{cheem}_ </td> + <td style="text-align:left;"> Explore local explanations of non-linear models with the radial tour </td> + <td style="text-align:left;"> Spyrison, 2022 </td> + </tr> + <tr> + <td style="text-align:left;"> {detour} </td> + <td style="text-align:left;"> Alternative HTML with brushing and 3D proj, but no basis </td> + <td style="text-align:left;"> Hart &amp; Wang, 2022 </td> + </tr> </tbody> </table> -- -<br><br> -__Caveat:__ -- Tours needn't project to `\(d=2\)` +- Reminder: other geometric displays can be used when `\(d \neq 2\)` - 1D density curves - 3D scatterplot - - `\(d~|~d<p\)` Parallel coordinate plot, Chernoff faces + - `\(d\)`-dim scatterplot matrix, parallel coordinate plots --- @@ -407,13 +462,14 @@ --- class: transition -## Local explanation of a black-box models & application of radial tours, __cheem__ +## Local explanations of black-box models & application of radial tours, _cheem_ --- # Local explanation -- Approximation of the linear variable importance to the model in the vacinity of one observation of a model +- Given a non-linear ("black-box") model, how can we maintain interpretability? +- Approximation of the linear variable importance to the model in the vicinity of _one_ observation of a model <img src="./figures/lime_nonlinear.png" width="60%" style="display: block; margin: auto;" /> _Ribeiro, et al. (2017). 
Why Should I Trust You?_ @@ -423,14 +479,17 @@ .pull-left[ # SHAP values +- SHapley Additive exPlanation - FIFA 2020 data, 5000 observations, ~35 skill measurements aggregated to 9 variables, Y: wages [2020 Euros] -- Model: Random forest, regress wages given 9 skill measurements +- Model: Random forest regressing wages from 9 skill measurements - SHAP is a model-agnostic local explanation - Approximate linear variable importance at one observation; the median importance, permuting over combinations of the explanatory variables **The model has very different variable importance across the player position** ] +-- + .pull-right[ <img src="./figures/cheem_fifa_messi_dijk.png" width="100%" style="display: block; margin: auto;" /> ] @@ -439,8 +498,8 @@ # Cheem, concept -- Create a black-box model -- Extract explanations for every variable (constitutionally expensive) +- Create a non-linear model +- Extract explanations for every observation (computationally expensive) - __Global view__: approximate data- and attribution-space side-by-side - Explore with liked brushing, tooltips, interactive tabular display - Select a primary and comparison point to explore their explanation @@ -450,15 +509,17 @@ -- <br> -- Generalizable to all compatible model * explanation pairings -- Illustrate with random forests and a tree SHAP (reduced computational complexity) +__New__, at a GitHub server near you, + +- model- and local explanation-agnostic (BYO) +- Illustrated with random forests and a tree SHAP (reduced computational complexity) --- # Global View -
- +
+ - Select a primary and comparison point, typically misclassified and neighboring correctly classified - Use the SHAP values of the primary point as the basis, perform a 1D radial (manual) tour to interrogate the models explanation @@ -489,33 +550,33 @@ # Demo the app -<br><br> -Walk through regression case +Explore interactively with an R shiny application + +<br> +Local resources: ```r cheem::run_app() ``` <br> -- See a hosted shiny app (slightly outdated): [ebsmonash.shinyapps.io/cheem_initial/](https://ebsmonash.shinyapps.io/cheem_initial/) -- __cheem__ in package format, on CRAN soon [github.com/nspyrison/cheem](https://github.com/nspyrison/cheem) +- Externally hosted shiny app (outdated): [ebsmonash.shinyapps.io/cheem_initial/](https://ebsmonash.shinyapps.io/cheem_initial/) +- __cheem__ (back) on CRAN soon, available on GitHub: [github.com/nspyrison/cheem](https://github.com/nspyrison/cheem) --- # Cheem -- getting started +<br><br> ``` -## Github dependancies: -remotes::install_github('ModelOriented/treeshap') -remotes::install_github("nspyrison/spinifex") - ## Install cheem development version & its CRAN dependancies. remotes::install_github("nspyrison/cheem", dependencies = TRUE) -## Run the {cheem} app, serveral preprocessed datasets or bring your own +## Run the {cheem} app cheem::run_app() +## BYO data, model, local explanation: +?cheem::cheem_ls ``` - --- # Namesake @@ -530,13 +591,16 @@ # Acknowledgments <br><br> -Thanks to Professor Przemyslaw Biecek for his time and input suggesting SHAP and FIFA, in addition to the __DALEX__ package ecosystem and _Exploratory Model Analysis_ book +Thanks to Professor Przemyslaw Biecek for his guidance and input, in addition to the __DALEX__ package ecosystem and _Exploratory Model Analysis_ book <br> Thanks to Di Cook and Kim Marriott for their supervision <br> -This research was supported by an Australian government Research Training Program (RTP) scholarship. 
These slides were in __R__ using __rmarkdown__ and __xaringan__ *(R Core Team, 2021; Xie et al. 2018; Xie, 2018)* +This research was supported by an Australian government Research Training Program (RTP) scholarship. These slides were created in __R__ using __rmarkdown__ and __xaringan__ *(R Core Team, 2021; Xie et al. 2018; Xie, 2018)* + +<br> +_Tentatively,_ IFF may support continued development of related content: broader preprocessing functions, more/better geom-like displays --- background-size: cover @@ -545,16 +609,16 @@ # Thank you for attending <hr><br> -<h2 class="monash-blue" style="font-size: 20pt!important;">Multivariate data vis, dynamic linear projection, and their application to local explanations of models</h1> +<h2 class="monash-blue" style="font-size: 20pt!important;">Multivariate Data Visualization and Thinking</h2> <br> -<h3 style="font-weight:900!important;">Monash EBS reading group</h2> +<h3 style="font-weight:900!important;">IFFRUG</h3> .bottom_abs.width100[ <br> *Nicholas Spyrison* <br> -Monash University <br> -[nspyrison.netlify.app](https://nspyrison.netlify.app/)<br>nicholas.syrison@monash.edu <br> -6 Dec 2021 <br><br> -Slides -- [nspyrison.github.io/pres_ebs](https://nspyrison.github.io/pres_ebs/#1) +[nspyrison.netlify.app](https://nspyrison.netlify.app/)<br>Nicholas.Syrison@iff.com <br> +IFF, Madison <br> +April 2023 <br><br> +Slides -- [nspyrison.github.io/pred_ebs](https://nspyrison.github.io/pred_ebs/#1) ] @@ -612,6 +676,19 @@ deleted = true; }); })(); +// add `data-at-shortcutkeys` attribute to <body> to resolve conflicts with JAWS +// screen reader (see PR #262) +(function(d) { + let res = {}; + d.querySelectorAll('.remark-help-content table tr').forEach(tr => { + const t = tr.querySelector('td:nth-child(2)').innerText; + tr.querySelectorAll('td:first-child .key').forEach(key => { + const k = key.innerText; + if (/^[a-z]$/.test(k)) res[k] = t; // must be a single letter (key) + }); + }); + d.body.setAttribute('data-at-shortcutkeys', 
JSON.stringify(res)); +})(document); (function() { "use strict" // Replace