diff --git a/R/xgb-model.R b/R/xgb-model.R index f80ee3f..0691801 100644 --- a/R/xgb-model.R +++ b/R/xgb-model.R @@ -236,15 +236,16 @@ plot_shap <- function(shap_object = NULL, model_type = NULL, alpha = NULL) { mesh_shap = shap_object$S[, 3] ) process_shap %>% - ggplot2::ggplot(aes(mesh_fact, mesh_shap, color = habitat_fact)) + - ggplot2::geom_jitter(width = 3, alpha = alpha, size = 1.5) + + ggplot2::ggplot(ggplot2::aes(mesh_fact, mesh_shap, color = habitat_fact)) + + ggplot2::geom_jitter(width = 2, alpha = alpha, size = 1.5, show.legend = FALSE) + + ggplot2::geom_point(size = 0.1) + ggplot2::theme_minimal() + ggplot2::scale_x_continuous(n.breaks = 10) + ggplot2::geom_hline(yintercept = 0, linetype = 2, color = "grey50") + - ggplot2::scale_color_manual(values = c("#f28f3b", "grey50", "#ffd5c2", "#588b8b", "#c8553d", "#2d3047", "#93b7be"))+ - ggplot2::coord_cartesian(expand = FALSE)+ - ggplot2::labs(color = "Habitat") - + ggplot2::scale_color_manual(values = c("#f28f3b", "#c27ba0", "#ffd5c2", "#588b8b", "#c8553d", "#2d3047", "#007ea7")) + + ggplot2::coord_cartesian(expand = FALSE) + + ggplot2::labs(color = "Habitat") + + ggplot2::guides(color = ggplot2::guide_legend(override.aes = list(size = 1.75))) } else { process_shap <- dplyr::tibble( @@ -269,14 +270,15 @@ plot_shap <- function(shap_object = NULL, model_type = NULL, alpha = NULL) { process_shap %>% ggplot2::ggplot(aes(reorder(habitat_gear_fact, habitat_gear_shap), habitat_gear_shap, color = vessel_fact)) + - ggplot2::geom_jitter(width = 0.5, alpha = alpha, size = 1.5) + + ggplot2::geom_jitter(width = 0.2, alpha = alpha, size = 1.5, show.legend = FALSE) + + ggplot2::geom_point(size = 0.1) + ggplot2::theme_minimal() + ggplot2::geom_hline(yintercept = 0, linetype = 2, color = "grey50") + - ggplot2::scale_color_manual(values = c("grey50", "#bc4749"))+ - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) + - ggplot2::coord_cartesian(expand = FALSE)+ - ggplot2::labs(color = "Transport") - + 
ggplot2::scale_color_manual(values = c("grey50", "#bc4749")) + + ggplot2::coord_cartesian(expand = FALSE) + + ggplot2::labs(color = "Transport") + + ggplot2::guides(color = ggplot2::guide_legend(override.aes = list(size = 1.75))) + + ggplot2::coord_flip() } } @@ -288,6 +290,7 @@ plot_shap <- function(shap_object = NULL, model_type = NULL, alpha = NULL) { #' @param model_type A character string specifying the model type, passed to the `plot_shap` function. #' @param alpha The alpha value for geom_jitter in ggplot2, controlling point transparency. #' @param cols The number of columns in the plot layout. +#' @param drop_legend Whether to omit the legend panel from the final plot. Default is FALSE. #' #' @details #' The function uses the `shapviz` package for initial processing and then applies `plot_shap` to each model. @@ -299,7 +302,7 @@ plot_shap <- function(shap_object = NULL, model_type = NULL, alpha = NULL) { #' \dontrun{ #' plot_model_shaps(data_shaps = my_model_shaps, model_type = "gn", alpha = 0.2, cols = 2) #' } -plot_model_shaps <- function(data_shaps = NULL, model_type = NULL, alpha = 0.2, cols = 1) { +plot_model_shaps <- function(data_shaps = NULL, model_type = NULL, alpha = 0.2, cols = 1, drop_legend = FALSE) { sha <- shapviz::shapviz(data_shaps) shapviz_object <- purrr::map(sha, plot_shap, model_type = model_type, alpha = alpha) @@ -318,6 +321,7 @@ plot_model_shaps <- function(data_shaps = NULL, model_type = NULL, alpha = 0.2, legend.key.size = ggplot2::unit(0.8, "cm"), legend.title = ggplot2::element_text(size = 12) )) + combined_plots <- cowplot::plot_grid( plotlist = plots, ncol = cols, @@ -331,23 +335,34 @@ plot_model_shaps <- function(data_shaps = NULL, model_type = NULL, alpha = 0.2, if (model_type == "gn") { x_label <- cowplot::draw_label("Mesh size (mm)", x = 0.5, y = 0.05) + y_label <- cowplot::draw_label("SHAP value", x = 0.015, y = 0.5, angle = 90) } else { - x_label <- cowplot::draw_label("Habitat x Gear type ", x = 0.5, y = 0.05) + x_label <- 
cowplot::draw_label("Habitat x Gear type ", x = 0.015, y = 0.5, angle = 90) + y_label <- cowplot::draw_label("SHAP value", x = 0.5, y = 0.05) } - y_label <- cowplot::draw_label("SHAP value (impact on model output)", x = 0.015, y = 0.5, angle = 90) - - final_plot <- - cowplot::plot_grid( - combined_plots, - legend_plot, - ncol = 2, - rel_widths = c(1, 0.22), - scale = 0.9, - greedy = TRUE - ) + - x_label + - y_label + + if (drop_legend == TRUE) { + final_plot <- + cowplot::plot_grid(combined_plots, + scale = 0.9, + greedy = TRUE + ) + + x_label + + y_label + } else { + final_plot <- + cowplot::plot_grid( + combined_plots, + legend_plot, + ncol = 2, + rel_widths = c(1, 0.22), + scale = 0.9, + greedy = TRUE + ) + + x_label + + y_label + } final_plot } diff --git a/docs/404.html b/docs/404.html index 2809892..b8c5c44 100644 --- a/docs/404.html +++ b/docs/404.html @@ -23,7 +23,7 @@ - + diff --git a/docs/Timor-nutrient-sensitive-fisheries-management_files/figure-html/model-explanation-1.png b/docs/Timor-nutrient-sensitive-fisheries-management_files/figure-html/model-explanation-1.png deleted file mode 100644 index c59792c..0000000 Binary files a/docs/Timor-nutrient-sensitive-fisheries-management_files/figure-html/model-explanation-1.png and /dev/null differ diff --git a/docs/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-10-1.png b/docs/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-10-1.png new file mode 100644 index 0000000..a6aef42 Binary files /dev/null and b/docs/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-10-1.png differ diff --git a/docs/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-8-1.png b/docs/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-8-1.png new file mode 100644 index 0000000..4a886a7 Binary files /dev/null and 
b/docs/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-8-1.png differ diff --git a/docs/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-9-1.png b/docs/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-9-1.png new file mode 100644 index 0000000..0222dee Binary files /dev/null and b/docs/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/docs/data.html b/docs/data.html index ef7b87d..e9adfc3 100644 --- a/docs/data.html +++ b/docs/data.html @@ -23,7 +23,7 @@ - + diff --git a/docs/distribution.html b/docs/distribution.html index 1dcac69..774bfd1 100644 --- a/docs/distribution.html +++ b/docs/distribution.html @@ -23,7 +23,7 @@ - + diff --git a/docs/highlight.html b/docs/highlight.html index 5e9b472..99cae0a 100644 --- a/docs/highlight.html +++ b/docs/highlight.html @@ -23,7 +23,7 @@ - + @@ -186,8 +186,8 @@

3.1 Timor-Est SSF nutritional sce -
- +
+ diff --git a/docs/index.html b/docs/index.html index d51d9c6..d0a2bca 100644 --- a/docs/index.html +++ b/docs/index.html @@ -23,7 +23,7 @@ - + @@ -141,7 +141,7 @@

1 Content

diff --git a/docs/profiles.html b/docs/profiles.html index 82eecc8..d6065c7 100644 --- a/docs/profiles.html +++ b/docs/profiles.html @@ -23,7 +23,7 @@ - + @@ -170,10 +170,9 @@

5.2 Results

5.2.1 Clusters

The WSS analysis indicated that either 4 or 5 clusters were the best for organizing each subset of our data. We decided to use 5 clusters for all subsets to maintain uniformity across our analyses and to better represent the varied patterns in nutrient profiles.

-

The bar chart (Figure 5.1) displaying nutrient adequacy across nutrient profiles indicated the number of individuals meeting the Recommended Nutrient Intake (RNI) per 1kg of catch for various nutrients. The profiles are the result of k-means clustering, reflecting distinct groupings based on the type and quantity of nutrients present in the catch.

-

In the Atauro subsets (Panels a and b), there is a notable variation in the composition of nutrients. For example, some profiles are characterized by higher segments of protein and omega-3, suggestive of a catch rich in these particular nutrients. This is especially evident in the subset using gill nets, where the nutrient profile segments for protein and omega-3 are more prominent. -Contrastingly, the Mainland subsets (Panels c and d) display a different nutrient composition. Here, the catch appears to be less concentrated in protein and omega-3 but shows relatively larger segments for iron and zinc, indicating a variance in the type of marine life caught or perhaps the nutritional quality of these catches.

-

Across all panels, the nutrient profiles exhibit significant differences in the levels of calcium and vitamin A. Certain profiles show substantial segments representing these nutrients, while others are noticeably lacking, suggesting that the nutrient density of the catches can vary greatly within the same region and gear type.

+

The bar chart (Figure 5.1) displaying nutrient adequacy across nutrient profiles indicated the number of individuals meeting the Recommended Nutrient Intake (RNI) per 1kg of catch for various nutrients. The profiles are the result of k-means clustering, reflecting distinct groupings based on the type and quantity of nutrients present in the catch. +For the Atauro dataset using all gear types (Panel a), we observe diverse distributions of nutrient adequacy across the profiles. Specifically, clusters 3 and 5 exhibit a notably higher content of vitamin A relative to the other clusters, whereas calcium and protein appear more evenly distributed among all nutrient profiles. The distribution of zinc varies greatly, with cluster 1 showing the greatest concentration. Iron is most abundant in cluster 4, distinguishing it from the rest.

+

For the subset of data from Atauro using only gill net gear (Panel b), the distribution is characterized by higher proportions of calcium in clusters 2 and 4. Additionally, clusters 1 and 4 stand out due to their higher vitamin A content….etc…etc…

Distribution of nutrient adequacy across k-means clusters. The bar chart delineates the number of individuals meeting the Recommended Nutrient Intake (RNI) per 1kg of catch within identified k-means clusters. Each bar is categorized into six segments corresponding to the evaluated nutrients. The clusters are enumerated on the y-axis, each representing a group with a distinct nutritional profile as determined by the cluster analysis. The x-axis quantifies the count of individuals within each cluster that meet the RNI for the respective nutrients, underlining the variability in nutrient adequacy across clusters. Panels (a) through (d) compare these distributions across different fishing practices and locations, namely Atauro and the Mainland, using all gear types or exclusively gill nets.

@@ -189,8 +188,8 @@

5.2.1 Clusters

- +
+

Table 5.1: Results of PERMANOVA analysis assessing the homogeneity of nutrient profiles within fishing trip clusters. The analysis was conducted across four datasets: Atauro with all gears (atauro_AG), Atauro with gill nets (atauro_GN), Mainland with all gears (mainland_AG), and Mainland with gill nets (mainland_GN). For each dataset, the term ‘clusters’ represents the within-group sum of squares (SUMOFSQS), which measures the variance within the nutritional profiles, while ‘Residual’ represents the variance between nutritional profiles. Degrees of Freedom (DF), R-squared values (R2), and associated statistics indicate the strength and significance of the clustering. The R2 value quantifies the proportion of variance explained by the clusters.

@@ -206,11 +205,32 @@

5.2.2 XGBoost model

- +
+

Table 5.2: Performance Metrics for XGBoost Model Across Fishing Data Subsets. This table provides a comprehensive overview of the predictive performance of an XGBoost classification model for four distinct subsets of fishing data: Atauro with all gears (ATAURO AG), Atauro with gill nets (ATAURO GN), Mainland with all gears (MAINLAND AG), and Mainland with gill nets (MAINLAND GN). Key performance indicators include ROC-AUC (area under the receiver operating characteristic curve), accuracy, Kappa (kap), sensitivity (sens), specificity (spec), positive predictive value (ppv), negative predictive value (npv), Matthews correlation coefficient (mcc), Youden’s J index (j_index), balanced accuracy (bal_accuracy), detection prevalence, precision, recall, and F measure (f_meas). The metrics collectively reflect the model’s ability to discriminate between nutritional profiles, its overall accuracy, and the balance between the sensitivity and specificity for each subset.

-

Shap values results… -

+

The analysis of SHAP values from gill net models reveals the interaction between mesh size and habitat in predicting nutrient profiles. In the Atauro region, as depicted in Figure 5.4, smaller mesh sizes (below 40 mm) are consistently linked to a higher prediction of nutrient profile NP1 across various habitats, especially reefs, beaches, and mangroves. This suggests that smaller mesh sizes are generally effective across these diverse marine environments for predicting NP1.

+

For nutrient profile NP2, there is a noticeable increase in SHAP values within the 40 to 60 mm mesh size range, with reefs and beaches showing this pattern most clearly. This indicates that medium mesh sizes are particularly predictive of NP2 in these ecological settings.

+

Larger mesh sizes, specifically those between 60 and 70 mm, have been associated with nutrient profiles NP3 and NP4 across several habitats, including reefs, beaches, and mangroves. A more specific association is observed with mesh sizes between 70 and 80 mm, which are predominantly linked to predicting NP4. For the largest mesh sizes analyzed, nutrient profile NP5 emerges as the most likely prediction among the various profiles, especially in the Atauro data subset.

+

The SHAP values derived from the mainland data present a more varied pattern. Small mesh sizes (less than 35 mm) used in deep water and FAD environments are linked with the prediction of nutrient profiles NP3 and NP4, with the latter also being associated with reef and beach habitats. Mesh sizes in the range of 40 to 65 mm are strong predictors for nutrient profiles NP1 and NP5. Profile NP1 is most commonly predicted in reef and FAD settings, while NP5 is typically associated with deeper waters. At the larger end of the mesh size spectrum, nutrient profile NP2 becomes the most probable prediction, particularly when fishing occurs in deeper habitats.

+
+Differential influence of mesh size on nutritional profile predictions across habitats. The figure compiles subplots for five distinct nutrient profiles (NP1-NP5) as predicted by gill net XGBoost models, with each subplot showing the distribution of SHAP values across varying mesh sizes. Each data point is colored to represent different habitats: Beach, Deep, FAD, Mangrove, Reef, Seagrass and Traditional FAD, providing insight into the habitat-specific impact of mesh size on the predictive accuracy of the model. The x-axis delineates the mesh size range, while the y-axis quantifies the magnitude of the SHAP value, with positive values denoting a heightened probability of a nutrient profile's occurrence and negative values indicating a reduced probability. +

+Figure 5.4: Differential influence of mesh size on nutritional profile predictions across habitats. The figure compiles subplots for five distinct nutrient profiles (NP1-NP5) as predicted by gill net XGBoost models, with each subplot showing the distribution of SHAP values across varying mesh sizes. Each data point is colored to represent different habitats: Beach, Deep, FAD, Mangrove, Reef, Seagrass and Traditional FAD, providing insight into the habitat-specific impact of mesh size on the predictive accuracy of the model. The x-axis delineates the mesh size range, while the y-axis quantifies the magnitude of the SHAP value, with positive values denoting a heightened probability of a nutrient profile’s occurrence and negative values indicating a reduced probability. +

+
+

SHAP results of all gears models …

+
+Lore ipsum +

+Figure 5.5: Lore ipsum +

+
+
+Lore ipsum2 +

+Figure 5.6: Lore ipsum2 +

+
diff --git a/docs/reference-keys.txt b/docs/reference-keys.txt index 7561d89..eed0bf1 100644 --- a/docs/reference-keys.txt +++ b/docs/reference-keys.txt @@ -4,6 +4,9 @@ fig:unnamed-chunk-3 fig:unnamed-chunk-4 fig:unnamed-chunk-5 fig:model-settings +fig:unnamed-chunk-8 +fig:unnamed-chunk-9 +fig:unnamed-chunk-10 content data catch-weight-and-nutrional-content diff --git a/docs/references.html b/docs/references.html index 9edd43f..304e857 100644 --- a/docs/references.html +++ b/docs/references.html @@ -23,7 +23,7 @@ - + diff --git a/docs/search_index.json b/docs/search_index.json index 06b1239..629c76f 100644 --- a/docs/search_index.json +++ b/docs/search_index.json @@ -1 +1 @@ -[["index.html", "Modelling scenarios for nutrient-sensitive fisheries management 1 Content", " Modelling scenarios for nutrient-sensitive fisheries management Lore 2023-12-03 1 Content This book contains analyses and reports of the paper ‘Modelling scenarios for nutrient-sensitive fisheries management’. All data and code to generate the analyses are in organised in https://github.com/WorldFishCenter/timor.nutrients. "],["data.html", "2 Data 2.1 Catch weight and nutrional content 2.2 Checks and limitations", " 2 Data The research presented in this book relies on two primary sources of data: Recorded Catch (RC): This dataset comprises detailed records of fishing trips that were documented by data collectors in the coastal municipalities of East Timor starting from January 2018. Estimated Catch (EC): This dataset provides a broader view of catch data on a regional level. It is created by combining RC with additional information, including the frequency of fishing trips made by each fishing boat and the total number of boats surveyed (censused) in each municipality. This combination extrapolates the recorded catch data to a larger scale. 2.1 Catch weight and nutrional content The total estimated catch weight is determined by the number of individuals and the length range of each catch. 
Specifically, during the initial phase of the Peskas project (July 2017 - April 2019), the standard length measurement used was the fork length (FL), which later changed to the total length (TL) in the subsequent and current version of the project. We utilized the API service offered by the FishBase database to incorporate length-to-length and length-to-weight conversion tables, using information from survey landings to calculate the weight in grams based on the following formula: W = a × L^b Here, W represents the weight in grams, L is the total length (TL) in centimeters, and a and b are the conversion parameters obtained from FishBase for each fish species. The FishBase database provides length-to-length and length-to-weight relationships for over 5,000 fish species. Typically, there are multiple records for the parameters a and b for each species. Since the length measurements in Peskas’ first version pertained to FL, we initially standardized all length measurements to TL using the FishBase length-to-length conversion tables. Subsequently, we applied the TL-to-weight conversion tables to estimate the weights. The FishBase length-to-weight conversion tables offer species-level taxonomic resolution. To derive a singular length-to-weight relationship for each fish group, we calculated the median values of parameters a and b for all species within a particular fish group. To ensure relevance to the region of interest, we refined the species list using FAO country codes (https://www.fao.org/countryprofiles/iso3list/en/) pertinent to Timor-Leste and Indonesia (country codes 626 and 360, respectively). For instance, to ascertain the weight of a catch categorized under the fish group labeled ECN (representing the Echeneidae family), we first identified the species within ECN documented in Timor-Leste and Indonesia. 
After this, we computed the average values of the parameters a and b for the identified species, which in this case were Echeneis naucrates and Remora remora (as illustrated in the figure below). To address the scarcity of measured nutrient values for fish, which are typically limited to a few species and countries. To overcome this data limitation, MacNeil et al. developed a Bayesian hierarchical model that leverages both phylogenetic information and trait-based information to predict concentrations of seven essential nutrients: calcium, iron, omega-3 fatty acids, protein, selenium, vitamin A, and zinc for both marine and inland fish species globally (see Hicks et al. 2019). For each catch, the nutritional yield was calculated by combining the validated weight estimates for each fish group with the modelled nutrient concentrations. Specifically, we used the highest posterior predictive density values for each of the seven nutrients, which can be found in the repository (https://github.com/mamacneil/NutrientFishbase). For non-fish groups—including octopuses, squids, cockles, shrimps, crabs, and lobsters—nutritional yield information was not available in the NutrientFishbase repository models. We retrieved the necessary data for these groups from the Global food composition database, using the same methodological approach as for the fish groups to estimate their nutritional content. To represent the nutrient concentration associated with each fish group, we used the median value as a summarizing metric. Figure 2.1: Distribution of nutrients’ concentration for each fish group. Dots represent the median, bars represent the 95% confidence interval. 2.2 Checks and limitations Check groups with higher dispersion… Dow we need to narrow species grouping? 
"],["highlight.html", "3 Highlight statistics 3.1 Timor-Est SSF nutritional scenario", " 3 Highlight statistics 3.1 Timor-Est SSF nutritional scenario The table uses the EC dataset and summarizes the main statistics on nutrient supply for each region. Below is a description of each table’ column: MUNICIPALITY (POPULATION): Municipality and number of people > 5 years old in 2022. NUTRIENT: Nutrient of reference ANNUAL SUPPLY: Aggregated annual value in kg. These values represent municipal-level estimates based on the number of fishing boats recorded in the 2021 Timor-Leste boat census, average number of fishing trips per boat and average landing weight values for each fish group. N. PEOPLE SUPPLIED DAILY: It describes the number of people meeting the nutrient’ RNI for each municipality. RNI values used are the following: Selenium Zinc Protein Total -3 PUFA Calcium Iron Vitamin-A 0.000026 0.0049 46 2.939 1 0.0294 0.0005 The 20% of RNIs values was take as reference in consideration of the fact that an ‘adequate diet’ is expected to comprise 5 food group. RNIs were then converted from grams to kg (dividing by 1000) and the requirements was calculated as: \\(\\frac{Anuual\\ supply\\ (kg)}{(RNI\\times 0.20) \\ / 1000} /365\\) POPULATION MEETING RNI REQUIREMENTS: Percentage of the population meeting the RNI requirements in each municipality: \\(\\frac{Number\\ of\\ people\\ supplied\\ daily}{Municipality\\ population} \\times 100\\) "],["distribution.html", "4 Nutrients distribution 4.1 Fish groups 4.2 Habitat and gear type", " 4 Nutrients distribution This section presents the analyses that illustrates the distribution of nutrients within various components of small-scale fisheries in East Timor. 4.1 Fish groups Figure 4.1: The bar chart illustrates the cumulative contribution of various marine food sources to the Recommended Nutrient Intake (RNI) for six essential nutrients, based on a 100g portion size. 
The x-axis is scaled in percentage terms, with the 100% mark indicating the complete RNI for a reproductive-age woman. Each horizontal bar is a stacked representation, segmented by color to denote the specific nutrient contributions from marine food sources. The marine food sources are labeled on the y-axis, which allows for a comparative visualization of their nutrient profiles, highlighting the diversity in nutrient density and emphasizing their potential significance in dietary nutrition. 4.2 Habitat and gear type Figure 4.2: Sankey diagram showing the relative distribution of key nutrients across various marine habitats and the corresponding extraction by different fishing gear types used in Timor-Est small-scale fisheries. "],["profiles.html", "5 Timor SSF nutrient profiles 5.1 Methods 5.2 Results 5.3 Checks and limitations", " 5 Timor SSF nutrient profiles 5.1 Methods In this section, we identified recurrent nutritional profiles based on RC data, then, we predicted and explained the nutritional profiles on the basis of the fishing strategy and environmental factors. 5.1.1 Data analysis design and subset division As a first step we addressed the inherent imbalance in the RC data, a critical aspect for ensuring accurate and unbiased analysis. Notably, a substantial portion of the data, exceeding 40%, is from Atauro, with gill net being the most frequently reported gear type across all the municipalities. To mitigate the skew caused by this overrepresentation, we strategically divided the dataset into four distinct subsets: Atauro GN: Focused on data from Atauro using gill nets. Atauro AG: Included data from Atauro using fishing methods other than gill nets. Mainland GN: Comprised of gill net data from all municipalities excluding Atauro. Mainland AG: Encompassed data from all other municipalities using non-gill net fishing methods. This subdivision of the dataset was intended to reduce biases and enhance analytical precision. 
Furthermore, by isolating gill net data, we were able to specifically examine the impact of mesh size on the prediction of nutrient profiles in gill net catches, providing a more focused and detailed analysis of this gear type’s influence on nutritional outcomes. 5.1.2 Clustering and Classification Subsequently, we identified recurrent nutritional profiles for each dataset. We assessed the total within sum of square (WSS) of six nutrient concentrations—excluding selenium—to identify the optimal number of clusters (distinctive nutritional profiles). Once established the optimal number of clusters for each dataset, we proceeded with the K-means clustering method to organize the data into distinct groups based on similarities in nutrient concentrations. Each trip was grouped based on its nutrient concentration profile, thereby enabling us to discern patterns and categorize trips according to their nutritional profile. The K-means algorithm functions by assigning each data point to the nearest cluster, based on the mean value of the points in the cluster. This iterative process continues until the assignment of points to clusters no longer changes, indicating that the clusters are as distinct as possible. The result is a set of clusters that represent unique nutritional profiles, each characterized by a specific combination of nutrient concentrations. Subsequent to the clustering, we conducted Permutational Multivariate Analysis of Variance (PERMANOVA) to validate the clustering methodology across four distinct datasets: Atauro AG, Atauro GN, Mainland AG, and Mainland GN. PERMANOVA is a robust non-parametric statistical test that evaluates whether there are significant differences between groups. Unlike traditional ANOVA, PERMANOVA does not rely on assumptions of normality and is therefore suitable for ecological data, which often do not follow normal distributions. 
Our PERMANOVA analysis was conducted on each of the four subsets on a distance matrix representing pairwise dissimilarities in nutrient concentrations across all fishing trips. This approach allowed us to test the hypothesis that the nutrient profiles of fishing trips within the same cluster are more similar to each other than to trips in different clusters. Finally, we performed a XGBoost model to each data subset to predict the nutritional profiles based on the fishing strategy, habitat and season. We employed the XGBoost algorithm due to its effectiveness in preventing overfitting and its ability to highlight key predictors. We used mesh size, habitat, quarter of the year, and vessel type as predictors for gill net subsets. For other gear types, the models used habitat x gear interaction, habitat, gear type, quarter of the year, and vessel type as predictors. Model tuning was conducted dynamically, adjusting several parameters including the number of trees, tree depth, loss reduction, sample size, and early stopping. The 4 data subsets were split into training (80%) and testing (20%) sets, with 10-fold cross-validation applied to the training set for enhanced accuracy and generalizability. The models’ performance was assessed using accuracy, ROC AUC, sensitivity, and specificity, providing a comprehensive understanding of their ability to accurately distinguish between different nutritional profiles. The ROC curves and AUC values offered an additional layer of model effectiveness evaluation. We employed SHapley Additive exPlanations (SHAP) values to dissect and quantify the influence of various predictors on the nutritional profiles predicted by our XGBoost models. SHAP values, rooted in cooperative game theory, offer a nuanced approach to understanding machine learning model outputs. 
They decompose a model’s prediction into contributions from each feature, illuminating not only the significance of these features but also the direction of their impact on the prediction. Specifically, for subsets involving gill net fishing methods (Atauro GN and Mainland GN), our focus was on understanding the impact of mesh size. In contrast, for the other subsets (Atauro AG and Mainland AG), which included different fishing methods, we concentrated on analyzing how the habitat and gear type interacted and influenced the nutritional profile predictions. 5.2 Results 5.2.1 Clusters The WSS analysis indicated that either 4 or 5 clusters were the best for organizing each subset of our data. We decided to use 5 clusters for all subsets to maintain uniformity across our analyses and to better represent the varied patterns in nutrient profiles. The bar chart (Figure 5.1) displaying nutrient adequacy across nutrient profiles indicated the number of individuals meeting the Recommended Nutrient Intake (RNI) per 1kg of catch for various nutrients. The profiles are the result of k-means clustering, reflecting distinct groupings based on the type and quantity of nutrients present in the catch. In the Atauro subsets (Panels a and b), there is a notable variation in the composition of nutrients. For example, some profiles are characterized by higher segments of protein and omega-3, suggestive of a catch rich in these particular nutrients. This is especially evident in the subset using gill nets, where the nutrient profile segments for protein and omega-3 are more prominent. Contrastingly, the Mainland subsets (Panels c and d) display a different nutrient composition. Here, the catch appears to be less concentrated in protein and omega-3 but shows relatively larger segments for iron and zinc, indicating a variance in the type of marine life caught or perhaps the nutritional quality of these catches. 
Across all panels, the nutrient profiles exhibit significant differences in the levels of calcium and vitamin A. Certain profiles show substantial segments representing these nutrients, while others are noticeably lacking, suggesting that the nutrient density of the catches can vary greatly within the same region and gear type. Figure 5.1: Distribution of nutrient adequacy across k-means clusters. The bar chart delineates the number of individuals meeting the Recommended Nutrient Intake (RNI) per 1kg of catch within identified k-means clusters. Each bar is categorized into six segments corresponding to the evaluated nutrients. The clusters are enumerated on the y-axis, each representing a group with a distinct nutritional profile as determined by the cluster analysis. The x-axis quantifies the count of individuals within each cluster that meet the RNI for the respective nutrients, underlining the variability in nutrient adequacy across clusters. Panels (a) through (d) compare these distributions across different fishing practices and locations, namely Atauro and the Mainland, using all gear types or exclusively gill nets. The scatter plot from the k-means clustering (Figure 5.2) showed the distribution of nutrient profiles across different clusters in each data subset. The first two principal components explained a significant portion of the variance, indicating distinct groupings in nutrient profiles among the fishing trips. Figure 5.2: Nutritional profile clustering of fishing trips by region and gear type. Each plot presents a k-means clustering analysis of fishing trip observations, grouped by their nutritional contributions to the Recommended Nutrient Intake (RNI) for six nutrients. The four panels, labeled (a) through (d), display data subsets for Atauro and the Mainland, utilizing all gear types and gill nets specifically. 
The scatter plots within each panel are charted in a two-dimensional space defined by the first two principal components, with the axes denoting the percentage of explained variance. Points are color-coded to denote distinct nutritional profile clusters derived from the k-means algorithm. Convex hulls define the periphery of each cluster, providing insight into the cluster density and separation. Convex hulls around the clusters aid in visualizing the distribution and delineation of nutritional profile groupings across different fishing methods and geographic areas. The PERMANOVA analyses (Table 5.1) revealed statistically significant differences between clusters, suggesting robust groupings based on the nutrient profiles. The pseudo-F statistics were remarkably high in all cases, indicating strong differentiation between clusters. Specifically, the R² values were 0.86, 0.82, 0.85, and 0.92 for Atauro AG, Atauro GN, Mainland AG, and Mainland GN respectively, indicating that between 82% to 92% of the variance in nutrient concentrations was explained by the clusters. The high R² values underscore the distinctness of the clusters, reinforcing the validity of the K-means clustering. These findings were consistent across all the datasets, with p-values below 0.001, providing clear evidence to reject the null hypothesis of no difference between clusters. Hence, the PERMANOVA results robustly support the effectiveness of the K-means algorithm in capturing meaningful patterns in nutrient profiles. Table 5.1: Results of PERMANOVA analysis assessing the homogeneity of nutrient profiles within fishing trip clusters. The analysis was conducted across four datasets: Atauro with all gears (atauro_AG), Atauro with gill nets (atauro_GN), Mainland with all gears (mainland_AG), and Mainland with gill nets (mainland_GN). 
For each dataset, the term ‘clusters’ represents the within-group sum of squares (SUMOFSQS), which measures the variance within the nutritional profiles, while ‘Residual’ represents the variance between nutritional profiles Degrees of Freedom (DF), R-squared values (R2), and associated statistics indicate the strength and significance of the clustering. The R2 value quantifies the proportion of variance explained by the clusters. 5.2.2 XGBoost model In the analysis of the XGBoost model’s predictive performance, both quantitative and visual assessments were conducted, detailed in Table 5.2 and Figure 5.3, respectively. The Receiver Operating Characteristic (ROC) curves (see ML model interpretation) presented in Figure 5.3 offer a graphical evaluation of the model’s sensitivity and specificity across four subsets of fishing data, categorized by region and gear type. These curves plot the true positive rate against the false positive rate for each nutritional profile group identified within the data. An examination of the ROC curves reveals variability in the model’s ability to distinguish between nutritional profile groups. The areas under the curves (AUC) provide a numerical measure of the model’s discriminative power, with a value of 1 representing perfect prediction and 0.5 indicating no discriminative power. While none of the profile groups reach perfection, several demonstrate substantial AUC values, indicating a robust ability to classify observations accurately. In comparing these visual findings with the statistical data from Table 5.2, it is observed that subsets from Atauro (both with all gears and gill nets) yield higher AUC, accuracy, and kappa statistics, suggesting a more consistent and accurate classification of nutritional profiles. These subsets also show higher sensitivity and specificity, indicating a balanced predictive capability for identifying true positives and true negatives. 
Conversely, the Mainland subsets exhibit lower performance metrics, indicating a more challenging classification scenario. This is reflected in the ROC curves where the lines for the Mainland subsets are farther from the top-left corner, suggesting a lower true positive rate relative to the false positive rate compared to the Atauro subsets. The positive predictive value (PPV) and negative predictive value (NPV), which provide insight into the model’s precision and reliability, also align with the ROC curve analysis, showing higher values for the Atauro subsets. This indicates that when the model predicts a particular nutritional profile for these subsets, it is more likely to be correct. The Matthew’s correlation coefficient (MCC) values, a balanced measure of quality for binary classifications, corroborate the ROC analysis by indicating that the Atauro subsets maintain a higher quality of prediction across classes. In summary, the integrated analysis of Table 5.2 and Figure 5.3 reveals a differentiated performance of the XGBoost model across various subsets of fishing data. The model showcases commendable predictive strength in the Atauro subsets, with high AUC, accuracy, and kappa metrics indicating a reliable classification of nutritional profiles. The ROC curve analysis further supports this, with curves for Atauro subsets nearer to the desired top-left corner, denoting higher sensitivity and specificity. In contrast, the Mainland subsets, despite achieving moderate success, suggest an area for improvement, as seen by their relative distance from the optimal point on the ROC curves and lower performance metrics. This suggests that while the model is effective in identifying nutritional profiles in certain contexts, its performance is not uniformly high across all subsets. 
Figure 5.3: Receiver Operating Characteristic (ROC) Curves for evaluating the performance of a cluster-based XGBoost classification model across four distinct fishing datasets: Atauro with all gears (a), Atauro with gill nets (b), Mainland with all gears (c), and Mainland with gill nets (d). Each curve represents one of the five clusters obtained from the classification, with different colors marking each cluster. Data points on the curves indicate the trade-off between sensitivity (true positive rate) and 1-specificity (false positive rate) for each cluster. The proximity of the curves to the top-left corner reflects the accuracy of the model in classifying the nutritional profiles into the correct clusters. Table 5.2: Performance Metrics for XGBoost Model Across Fishing Data Subsets. This table provides a comprehensive overview of the predictive performance of an XGBoost classification model for four distinct subsets of fishing data: Atauro with all gears (ATAURO AG), Atauro with gill nets (ATAURO GN), Mainland with all gears (MAINLAND AG), and Mainland with gill nets (MAINLAND GN). Key performance indicators include ROC-AUC (area under the receiver operating characteristic curve), accuracy, Kappa (kap), sensitivity (sens), specificity (spec), positive predictive value (ppv), negative predictive value (npv), Matthew’s correlation coefficient (mcc), Youden’s J index (j_index), balanced accuracy (bal_accuracy), detection prevalence, precision, recall, and F measure (f_meas). The metrics collectively reflect the model’s ability to discriminate between nutritional profiles, its overall accuracy, and the balance between the sensitivity and specificity for each subset. Shap values results… 5.3 Checks and limitations Are we considering all the possible potential good predictors? 
"],["simple.html", "6 In simple terms 6.1 ML model interpretation 6.2 ML model explanation", " 6 In simple terms 6.1 ML model interpretation ROC Curve: The curve plots the true positive rate (sensitivity) against the false positive rate (1 - specificity) at various threshold settings. The true positive rate is on the y-axis, and the false positive rate is on the x-axis. Performance: A perfect classifier would have a point in the upper left corner of the graph, where the true positive rate is 1 (or 100%) and the false positive rate is 0. The closer the curve follows the left-hand border and then the top border of the ROC space, the more accurate the test. Diagonal Line: The dotted diagonal line represents a no-skill classifier (e.g., random guessing). A good classifier stays as far away from this line as possible (toward the upper left corner). Area Under the Curve (AUC): The area under each ROC curve (AUC) is a measure of the test’s accuracy. An AUC of 0.5 suggests no discrimination (no better than random chance), while an AUC of 1.0 suggests perfect discrimination. 6.2 ML model explanation SHAP values: help in understanding how each predictor in the dataset contributed to each particular prediction. A high positive SHAP value for a feature increases the probability of a certain prediction, while a high negative SHAP value decreases it. "],["references.html", "References", " References "],["404.html", "Page not found", " Page not found The page you requested cannot be found (perhaps it was moved or renamed). You may want to try searching to find the page's new location, or use the table of contents to find the page you are looking for. "]] +[["index.html", "Modelling scenarios for nutrient-sensitive fisheries management 1 Content", " Modelling scenarios for nutrient-sensitive fisheries management Lore 2023-12-04 1 Content This book contains analyses and reports of the paper ‘Modelling scenarios for nutrient-sensitive fisheries management’. 
All data and code to generate the analyses are organised in https://github.com/WorldFishCenter/timor.nutrients. "],["data.html", "2 Data 2.1 Catch weight and nutritional content 2.2 Checks and limitations", " 2 Data The research presented in this book relies on two primary sources of data: Recorded Catch (RC): This dataset comprises detailed records of fishing trips that were documented by data collectors in the coastal municipalities of East Timor starting from January 2018. Estimated Catch (EC): This dataset provides a broader view of catch data on a regional level. It is created by combining RC with additional information, including the frequency of fishing trips made by each fishing boat and the total number of boats surveyed (censused) in each municipality. This combination extrapolates the recorded catch data to a larger scale. 2.1 Catch weight and nutritional content The total estimated catch weight is determined by the number of individuals and the length range of each catch. Specifically, during the initial phase of the Peskas project (July 2017 - April 2019), the standard length measurement used was the fork length (FL), which later changed to the total length (TL) in the subsequent and current version of the project. We utilized the API service offered by the FishBase database to incorporate length-to-length and length-to-weight conversion tables, using information from survey landings to calculate the weight in grams based on the following formula: W = a × L^b Here, W represents the weight in grams, L is the total length (TL) in centimeters, and a and b are the conversion parameters obtained from FishBase for each fish species. The FishBase database provides length-to-length and length-to-weight relationships for over 5,000 fish species. Typically, there are multiple records for the parameters a and b for each species.
Since the length measurements in Peskas’ first version pertained to FL, we initially standardized all length measurements to TL using the FishBase length-to-length conversion tables. Subsequently, we applied the TL-to-weight conversion tables to estimate the weights. The FishBase length-to-weight conversion tables offer species-level taxonomic resolution. To derive a singular length-to-weight relationship for each fish group, we calculated the median values of parameters a and b for all species within a particular fish group. To ensure relevance to the region of interest, we refined the species list using FAO country codes (https://www.fao.org/countryprofiles/iso3list/en/) pertinent to Timor-Leste and Indonesia (country codes 626 and 360, respectively). For instance, to ascertain the weight of a catch categorized under the fish group labeled ECN (representing the Echeneidae family), we first identified the species within ECN documented in Timor-Leste and Indonesia. After this, we computed the average values of the parameters a and b for the identified species, which in this case were Echeneis naucrates and Remora remora (as illustrated in the figure below). Measured nutrient values for fish are scarce and typically limited to a few species and countries. To overcome this data limitation, MacNeil et al. developed a Bayesian hierarchical model that leverages both phylogenetic information and trait-based information to predict concentrations of seven essential nutrients: calcium, iron, omega-3 fatty acids, protein, selenium, vitamin A, and zinc for both marine and inland fish species globally (see Hicks et al. 2019). For each catch, the nutritional yield was calculated by combining the validated weight estimates for each fish group with the modelled nutrient concentrations.
Specifically, we used the highest posterior predictive density values for each of the seven nutrients, which can be found in the repository (https://github.com/mamacneil/NutrientFishbase). For non-fish groups—including octopuses, squids, cockles, shrimps, crabs, and lobsters—nutritional yield information was not available in the NutrientFishbase repository models. We retrieved the necessary data for these groups from the Global food composition database, using the same methodological approach as for the fish groups to estimate their nutritional content. To represent the nutrient concentration associated with each fish group, we used the median value as a summarizing metric. Figure 2.1: Distribution of nutrients’ concentration for each fish group. Dots represent the median, bars represent the 95% confidence interval. 2.2 Checks and limitations Check groups with higher dispersion… Do we need to narrow species grouping? "],["highlight.html", "3 Highlight statistics 3.1 Timor-Est SSF nutritional scenario", " 3 Highlight statistics 3.1 Timor-Est SSF nutritional scenario The table uses the EC dataset and summarizes the main statistics on nutrient supply for each region. Below is a description of each table column: MUNICIPALITY (POPULATION): Municipality and number of people > 5 years old in 2022. NUTRIENT: Nutrient of reference ANNUAL SUPPLY: Aggregated annual value in kg. These values represent municipal-level estimates based on the number of fishing boats recorded in the 2021 Timor-Leste boat census, average number of fishing trips per boat and average landing weight values for each fish group. N. PEOPLE SUPPLIED DAILY: It describes the number of people meeting the nutrient’s RNI for each municipality. RNI values used are the following: Selenium Zinc Protein Total omega-3 PUFA Calcium Iron Vitamin-A 0.000026 0.0049 46 2.939 1 0.0294 0.0005 20% of each RNI value was taken as the reference, in consideration of the fact that an ‘adequate diet’ is expected to comprise 5 food groups.
RNIs were then converted from grams to kg (dividing by 1000) and the number of people supplied daily was calculated as: \\(\\frac{Annual\\ supply\\ (kg)}{(RNI\\times 0.20) \\ / 1000} /365\\) POPULATION MEETING RNI REQUIREMENTS: Percentage of the population meeting the RNI requirements in each municipality: \\(\\frac{Number\\ of\\ people\\ supplied\\ daily}{Municipality\\ population} \\times 100\\) "],["distribution.html", "4 Nutrients distribution 4.1 Fish groups 4.2 Habitat and gear type", " 4 Nutrients distribution This section presents the analyses that illustrate the distribution of nutrients within various components of small-scale fisheries in East Timor. 4.1 Fish groups Figure 4.1: The bar chart illustrates the cumulative contribution of various marine food sources to the Recommended Nutrient Intake (RNI) for six essential nutrients, based on a 100g portion size. The x-axis is scaled in percentage terms, with the 100% mark indicating the complete RNI for a reproductive-age woman. Each horizontal bar is a stacked representation, segmented by color to denote the specific nutrient contributions from marine food sources. The marine food sources are labeled on the y-axis, which allows for a comparative visualization of their nutrient profiles, highlighting the diversity in nutrient density and emphasizing their potential significance in dietary nutrition. 4.2 Habitat and gear type Figure 4.2: Sankey diagram showing the relative distribution of key nutrients across various marine habitats and the corresponding extraction by different fishing gear types used in Timor-Est small-scale fisheries. "],["profiles.html", "5 Timor SSF nutrient profiles 5.1 Methods 5.2 Results 5.3 Checks and limitations", " 5 Timor SSF nutrient profiles 5.1 Methods In this section, we identified recurrent nutritional profiles based on RC data, then, we predicted and explained the nutritional profiles on the basis of the fishing strategy and environmental factors.
5.1.1 Data analysis design and subset division As a first step we addressed the inherent imbalance in the RC data, a critical aspect for ensuring accurate and unbiased analysis. Notably, a substantial portion of the data, exceeding 40%, is from Atauro, with gill net being the most frequently reported gear type across all the municipalities. To mitigate the skew caused by this overrepresentation, we strategically divided the dataset into four distinct subsets: Atauro GN: Focused on data from Atauro using gill nets. Atauro AG: Included data from Atauro using fishing methods other than gill nets. Mainland GN: Comprised of gill net data from all municipalities excluding Atauro. Mainland AG: Encompassed data from all other municipalities using non-gill net fishing methods. This subdivision of the dataset was intended to reduce biases and enhance analytical precision. Furthermore, by isolating gill net data, we were able to specifically examine the impact of mesh size on the prediction of nutrient profiles in gill net catches, providing a more focused and detailed analysis of this gear type’s influence on nutritional outcomes. 5.1.2 Clustering and Classification Subsequently, we identified recurrent nutritional profiles for each dataset. We assessed the total within-cluster sum of squares (WSS) of six nutrient concentrations—excluding selenium—to identify the optimal number of clusters (distinctive nutritional profiles). Once the optimal number of clusters for each dataset was established, we proceeded with the K-means clustering method to organize the data into distinct groups based on similarities in nutrient concentrations. Each trip was grouped based on its nutrient concentration profile, thereby enabling us to discern patterns and categorize trips according to their nutritional profile. The K-means algorithm functions by assigning each data point to the nearest cluster, based on the mean value of the points in the cluster.
This iterative process continues until the assignment of points to clusters no longer changes, indicating that the clusters are as distinct as possible. The result is a set of clusters that represent unique nutritional profiles, each characterized by a specific combination of nutrient concentrations. Subsequent to the clustering, we conducted Permutational Multivariate Analysis of Variance (PERMANOVA) to validate the clustering methodology across four distinct datasets: Atauro AG, Atauro GN, Mainland AG, and Mainland GN. PERMANOVA is a robust non-parametric statistical test that evaluates whether there are significant differences between groups. Unlike traditional ANOVA, PERMANOVA does not rely on assumptions of normality and is therefore suitable for ecological data, which often do not follow normal distributions. Our PERMANOVA analysis was conducted on each of the four subsets on a distance matrix representing pairwise dissimilarities in nutrient concentrations across all fishing trips. This approach allowed us to test the hypothesis that the nutrient profiles of fishing trips within the same cluster are more similar to each other than to trips in different clusters. Finally, we fitted an XGBoost model to each data subset to predict the nutritional profiles based on the fishing strategy, habitat and season. We employed the XGBoost algorithm due to its effectiveness in preventing overfitting and its ability to highlight key predictors. We used mesh size, habitat, quarter of the year, and vessel type as predictors for gill net subsets. For other gear types, the models used habitat x gear interaction, habitat, gear type, quarter of the year, and vessel type as predictors. Model tuning was conducted dynamically, adjusting several parameters including the number of trees, tree depth, loss reduction, sample size, and early stopping.
The 4 data subsets were split into training (80%) and testing (20%) sets, with 10-fold cross-validation applied to the training set for enhanced accuracy and generalizability. The models’ performance was assessed using accuracy, ROC AUC, sensitivity, and specificity, providing a comprehensive understanding of their ability to accurately distinguish between different nutritional profiles. The ROC curves and AUC values offered an additional layer of model effectiveness evaluation. We employed SHapley Additive exPlanations (SHAP) values to dissect and quantify the influence of various predictors on the nutritional profiles predicted by our XGBoost models. SHAP values, rooted in cooperative game theory, offer a nuanced approach to understanding machine learning model outputs. They decompose a model’s prediction into contributions from each feature, illuminating not only the significance of these features but also the direction of their impact on the prediction. Specifically, for subsets involving gill net fishing methods (Atauro GN and Mainland GN), our focus was on understanding the impact of mesh size. In contrast, for the other subsets (Atauro AG and Mainland AG), which included different fishing methods, we concentrated on analyzing how the habitat and gear type interacted and influenced the nutritional profile predictions. 5.2 Results 5.2.1 Clusters The WSS analysis indicated that either 4 or 5 clusters were the best for organizing each subset of our data. We decided to use 5 clusters for all subsets to maintain uniformity across our analyses and to better represent the varied patterns in nutrient profiles. The bar chart (Figure 5.1) displaying nutrient adequacy across nutrient profiles indicated the number of individuals meeting the Recommended Nutrient Intake (RNI) per 1kg of catch for various nutrients. The profiles are the result of k-means clustering, reflecting distinct groupings based on the type and quantity of nutrients present in the catch. 
For the Atauro dataset using all gear types (Panel a), we observe diverse distributions of nutrient adequacy across the profiles. Specifically, clusters 3 and 5 exhibit a notably higher content of vitamin A relative to the other clusters, whereas calcium and protein appear more evenly distributed among all nutrient profiles. The distribution of zinc varies greatly, with cluster 1 showing the greatest concentration. Iron is most abundant in cluster 4, distinguishing it from the rest. For the subset of data from Atauro using only gill net gear (Panel b), the distribution is characterized by higher proportions of calcium in clusters 2 and 4. Additionally, clusters 1 and 4 stand out due to their higher vitamin A content….etc…etc… Figure 5.1: Distribution of nutrient adequacy across k-means clusters. The bar chart delineates the number of individuals meeting the Recommended Nutrient Intake (RNI) per 1kg of catch within identified k-means clusters. Each bar is categorized into six segments corresponding to the evaluated nutrients. The clusters are enumerated on the y-axis, each representing a group with a distinct nutritional profile as determined by the cluster analysis. The x-axis quantifies the count of individuals within each cluster that meet the RNI for the respective nutrients, underlining the variability in nutrient adequacy across clusters. Panels (a) through (d) compare these distributions across different fishing practices and locations, namely Atauro and the Mainland, using all gear types or exclusively gill nets. The scatter plot from the k-means clustering (Figure 5.2) showed the distribution of nutrient profiles across different clusters in each data subset. The first two principal components explained a significant portion of the variance, indicating distinct groupings in nutrient profiles among the fishing trips. Figure 5.2: Nutritional profile clustering of fishing trips by region and gear type.
Each plot presents a k-means clustering analysis of fishing trip observations, grouped by their nutritional contributions to the Recommended Nutrient Intake (RNI) for six nutrients. The four panels, labeled (a) through (d), display data subsets for Atauro and the Mainland, utilizing all gear types and gill nets specifically. The scatter plots within each panel are charted in a two-dimensional space defined by the first two principal components, with the axes denoting the percentage of explained variance. Points are color-coded to denote distinct nutritional profile clusters derived from the k-means algorithm. Convex hulls define the periphery of each cluster, providing insight into the cluster density and separation. Convex hulls around the clusters aid in visualizing the distribution and delineation of nutritional profile groupings across different fishing methods and geographic areas. The PERMANOVA analyses (Table 5.1) revealed statistically significant differences between clusters, suggesting robust groupings based on the nutrient profiles. The pseudo-F statistics were remarkably high in all cases, indicating strong differentiation between clusters. Specifically, the R² values were 0.86, 0.82, 0.85, and 0.92 for Atauro AG, Atauro GN, Mainland AG, and Mainland GN respectively, indicating that between 82% and 92% of the variance in nutrient concentrations was explained by the clusters. The high R² values underscore the distinctness of the clusters, reinforcing the validity of the K-means clustering. These findings were consistent across all the datasets, with p-values below 0.001, providing clear evidence to reject the null hypothesis of no difference between clusters. Hence, the PERMANOVA results robustly support the effectiveness of the K-means algorithm in capturing meaningful patterns in nutrient profiles. Table 5.1: Results of PERMANOVA analysis assessing the homogeneity of nutrient profiles within fishing trip clusters.
The analysis was conducted across four datasets: Atauro with all gears (atauro_AG), Atauro with gill nets (atauro_GN), Mainland with all gears (mainland_AG), and Mainland with gill nets (mainland_GN). For each dataset, the term ‘clusters’ represents the within-group sum of squares (SUMOFSQS), which measures the variance within the nutritional profiles, while ‘Residual’ represents the variance between nutritional profiles. Degrees of Freedom (DF), R-squared values (R2), and associated statistics indicate the strength and significance of the clustering. The R2 value quantifies the proportion of variance explained by the clusters. 5.2.2 XGBoost model In the analysis of the XGBoost model’s predictive performance, both quantitative and visual assessments were conducted, detailed in Table 5.2 and Figure 5.3, respectively. The Receiver Operating Characteristic (ROC) curves (see ML model interpretation) presented in Figure 5.3 offer a graphical evaluation of the model’s sensitivity and specificity across four subsets of fishing data, categorized by region and gear type. These curves plot the true positive rate against the false positive rate for each nutritional profile group identified within the data. An examination of the ROC curves reveals variability in the model’s ability to distinguish between nutritional profile groups. The areas under the curves (AUC) provide a numerical measure of the model’s discriminative power, with a value of 1 representing perfect prediction and 0.5 indicating no discriminative power. While none of the profile groups reach perfection, several demonstrate substantial AUC values, indicating a robust ability to classify observations accurately. In comparing these visual findings with the statistical data from Table 5.2, it is observed that subsets from Atauro (both with all gears and gill nets) yield higher AUC, accuracy, and kappa statistics, suggesting a more consistent and accurate classification of nutritional profiles.
These subsets also show higher sensitivity and specificity, indicating a balanced predictive capability for identifying true positives and true negatives. Conversely, the Mainland subsets exhibit lower performance metrics, indicating a more challenging classification scenario. This is reflected in the ROC curves where the lines for the Mainland subsets are farther from the top-left corner, suggesting a lower true positive rate relative to the false positive rate compared to the Atauro subsets. The positive predictive value (PPV) and negative predictive value (NPV), which provide insight into the model’s precision and reliability, also align with the ROC curve analysis, showing higher values for the Atauro subsets. This indicates that when the model predicts a particular nutritional profile for these subsets, it is more likely to be correct. The Matthew’s correlation coefficient (MCC) values, a balanced measure of quality for binary classifications, corroborate the ROC analysis by indicating that the Atauro subsets maintain a higher quality of prediction across classes. In summary, the integrated analysis of Table 5.2 and Figure 5.3 reveals a differentiated performance of the XGBoost model across various subsets of fishing data. The model showcases commendable predictive strength in the Atauro subsets, with high AUC, accuracy, and kappa metrics indicating a reliable classification of nutritional profiles. The ROC curve analysis further supports this, with curves for Atauro subsets nearer to the desired top-left corner, denoting higher sensitivity and specificity. In contrast, the Mainland subsets, despite achieving moderate success, suggest an area for improvement, as seen by their relative distance from the optimal point on the ROC curves and lower performance metrics. This suggests that while the model is effective in identifying nutritional profiles in certain contexts, its performance is not uniformly high across all subsets. 
Figure 5.3: Receiver Operating Characteristic (ROC) Curves for evaluating the performance of a cluster-based XGBoost classification model across four distinct fishing datasets: Atauro with all gears (a), Atauro with gill nets (b), Mainland with all gears (c), and Mainland with gill nets (d). Each curve represents one of the five clusters obtained from the classification, with different colors marking each cluster. Data points on the curves indicate the trade-off between sensitivity (true positive rate) and 1-specificity (false positive rate) for each cluster. The proximity of the curves to the top-left corner reflects the accuracy of the model in classifying the nutritional profiles into the correct clusters. Table 5.2: Performance Metrics for XGBoost Model Across Fishing Data Subsets. This table provides a comprehensive overview of the predictive performance of an XGBoost classification model for four distinct subsets of fishing data: Atauro with all gears (ATAURO AG), Atauro with gill nets (ATAURO GN), Mainland with all gears (MAINLAND AG), and Mainland with gill nets (MAINLAND GN). Key performance indicators include ROC-AUC (area under the receiver operating characteristic curve), accuracy, Kappa (kap), sensitivity (sens), specificity (spec), positive predictive value (ppv), negative predictive value (npv), Matthew’s correlation coefficient (mcc), Youden’s J index (j_index), balanced accuracy (bal_accuracy), detection prevalence, precision, recall, and F measure (f_meas). The metrics collectively reflect the model’s ability to discriminate between nutritional profiles, its overall accuracy, and the balance between the sensitivity and specificity for each subset. The analysis of SHAP values from gill net models reveals the interaction between mesh size and habitat in predicting nutrient profiles. 
In the Atauro region, as depicted in Figure 5.4, smaller mesh sizes (below 40 mm) are consistently linked to a higher prediction of nutrient profile NP1 across various habitats, especially reefs, beaches, and mangroves. This suggests that smaller mesh sizes are generally effective across these diverse marine environments for predicting NP1. For nutrient profile NP2, there is a noticeable increase in SHAP values within the 40 to 60 mm mesh size range, with reefs and beaches showing this pattern most clearly. This indicates that medium mesh sizes are particularly predictive of NP2 in these ecological settings. Larger mesh sizes, specifically those between 60 and 70 mm, have been associated with nutrient profiles NP3 and NP4 across several habitats, including reefs, beaches, and mangroves. A more specific association is observed with mesh sizes between 70 and 80 mm, which are predominantly linked to predicting NP4. For the largest mesh sizes analyzed, nutrient profile NP5 emerges as the most likely prediction among the various profiles, especially in the Atauro data subset. The SHAP values derived from the mainland data present a more varied pattern. Small mesh sizes (less than 35 mm) used in deep water and FAD environments are linked with the prediction of nutrient profiles NP3 and NP4, with the latter also being associated with reef and beach habitats. Mesh sizes in the range of 40 to 65 mm are strong predictors for nutrient profiles NP1 and NP5. Profile NP1 is most commonly predicted in reef and FAD settings, while NP5 is typically associated with deeper waters. At the larger end of the mesh size spectrum, nutrient profile NP2 becomes the most probable prediction, particularly when fishing occurs in deeper habitats. Figure 5.4: Differential influence of mesh size on nutritional profile predictions across habitats. 
The figure compiles subplots for five distinct nutrient profiles (NP1-NP5) as predicted by gill net XGBoost models, with each subplot showing the distribution of SHAP values across varying mesh sizes. Each data point is colored to represent different habitats: Beach, Deep, FAD, Mangrove, Reef, Seagrass and Traditional FAD, providing insight into the habitat-specific impact of mesh size on the predictive accuracy of the model. The x-axis delineates the mesh size range, while the y-axis quantifies the magnitude of the SHAP value, with positive values denoting a heightened probability of a nutrient profile’s occurrence and negative values indicating a reduced probability. SHAP results of all gears models … Figure 5.5: Lore ipsum Figure 5.6: Lore ipsum2 5.3 Checks and limitations Are we considering all the possible potential good predictors? "],["simple.html", "6 In simple terms 6.1 ML model interpretation 6.2 ML model explanation", " 6 In simple terms 6.1 ML model interpretation ROC Curve: The curve plots the true positive rate (sensitivity) against the false positive rate (1 - specificity) at various threshold settings. The true positive rate is on the y-axis, and the false positive rate is on the x-axis. Performance: A perfect classifier would have a point in the upper left corner of the graph, where the true positive rate is 1 (or 100%) and the false positive rate is 0. The closer the curve follows the left-hand border and then the top border of the ROC space, the more accurate the test. Diagonal Line: The dotted diagonal line represents a no-skill classifier (e.g., random guessing). A good classifier stays as far away from this line as possible (toward the upper left corner). Area Under the Curve (AUC): The area under each ROC curve (AUC) is a measure of the test’s accuracy. An AUC of 0.5 suggests no discrimination (no better than random chance), while an AUC of 1.0 suggests perfect discrimination. 
6.2 ML model explanation SHAP values: help in understanding how each predictor in the dataset contributed to each particular prediction. A high positive SHAP value for a feature increases the probability of a certain prediction, while a high negative SHAP value decreases it. "],["references.html", "References", " References "],["404.html", "Page not found", " Page not found The page you requested cannot be found (perhaps it was moved or renamed). You may want to try searching to find the page's new location, or use the table of contents to find the page you are looking for. "]] diff --git a/docs/simple.html b/docs/simple.html index 114b8fe..6954abb 100644 --- a/docs/simple.html +++ b/docs/simple.html @@ -23,7 +23,7 @@ - + diff --git a/docs_book/04-profiles.Rmd b/docs_book/04-profiles.Rmd index 1015f76..55bf00b 100644 --- a/docs_book/04-profiles.Rmd +++ b/docs_book/04-profiles.Rmd @@ -24,7 +24,7 @@ This subdivision of the dataset was intended to reduce biases and enhance analyt ### Clustering and Classification -Subsequently, we identified recurrent nutritional profiles for each dataset. We assessed the total within sum of square (WSS) of six nutrient concentrations---excluding selenium---to identify the optimal number of clusters (distinctive nutritional profiles). Once established the optimal number of clusters for each dataset, we proceeded with the K-means clustering method to organize the data into distinct groups based on similarities in nutrient concentrations. Each trip was grouped based on its nutrient concentration profile, thereby enabling us to discern patterns and categorize trips according to their nutritional profile. +After data partition we identified recurrent nutritional profiles for each dataset. We assessed the total within sum of square (WSS) of six nutrient concentrations---excluding selenium---to identify the optimal number of clusters (distinctive nutritional profiles). 
Once the optimal number of clusters for each dataset was established, we proceeded with the K-means clustering method to organize the data into distinct groups based on similarities in nutrient concentrations. Each trip was grouped based on its nutrient concentration profile, thereby enabling us to discern patterns and categorize trips according to their nutritional profile. The K-means algorithm functions by assigning each data point to the nearest cluster, based on the mean value of the points in the cluster. This iterative process continues until the assignment of points to clusters no longer changes, indicating that the clusters are as distinct as possible. The result is a set of clusters that represent unique nutritional profiles, each characterized by a specific combination of nutrient concentrations. Subsequent to the clustering, we conducted Permutational Multivariate Analysis of Variance (PERMANOVA) to validate the clustering methodology across four distinct datasets: Atauro AG, Atauro GN, Mainland AG, and Mainland GN. PERMANOVA is a robust non-parametric statistical test that evaluates whether there are significant differences between groups. Unlike traditional ANOVA, PERMANOVA does not rely on assumptions of normality and is therefore suitable for ecological data, which often do not follow normal distributions. Our PERMANOVA analysis was conducted for each of the four subsets using a distance matrix representing pairwise dissimilarities in nutrient concentrations across all fishing trips. This approach allowed us to test the hypothesis that the nutrient profiles of fishing trips within the same cluster are more similar to each other than to trips in different clusters. @@ -43,11 +43,10 @@ Specifically, for subsets involving gill net fishing methods (Atauro GN and Main The WSS analysis indicated that either 4 or 5 clusters were the best for organizing each subset of our data. 
We decided to use 5 clusters for all subsets to maintain uniformity across our analyses and to better represent the varied patterns in nutrient profiles. The bar chart (Figure 5.1) displaying nutrient adequacy across nutrient profiles indicated the number of individuals meeting the Recommended Nutrient Intake (RNI) per 1kg of catch for various nutrients. The profiles are the result of k-means clustering, reflecting distinct groupings based on the type and quantity of nutrients present in the catch. +For the Atauro dataset using all gear types (Panel a), we observe diverse distributions of nutrient adequacy across the profiles. Specifically, clusters 3 and 5 exhibit a notably higher content of vitamin A relative to the other clusters, whereas calcium and protein appear more evenly distributed among all nutrient profiles. The distribution of zinc varies greatly, with cluster 1 showing the greatest concentration. Iron is most abundant in cluster 4, distinguishing it from the rest. -In the Atauro subsets (Panels a and b), there is a notable variation in the composition of nutrients. For example, some profiles are characterized by higher segments of protein and omega-3, suggestive of a catch rich in these particular nutrients. This is especially evident in the subset using gill nets, where the nutrient profile segments for protein and omega-3 are more prominent. -Contrastingly, the Mainland subsets (Panels c and d) display a different nutrient composition. Here, the catch appears to be less concentrated in protein and omega-3 but shows relatively larger segments for iron and zinc, indicating a variance in the type of marine life caught or perhaps the nutritional quality of these catches. +For the subset of data from Atauro using only gill net gear (Panel b), the distribution is characterized by higher proportions of calcium in clusters 2 and 4. Additionally, clusters 1 and 4 stand out due to their higher vitamin A content....etc...etc... 
-Across all panels, the nutrient profiles exhibit significant differences in the levels of calcium and vitamin A. Certain profiles show substantial segments representing these nutrients, while others are noticeably lacking, suggesting that the nutrient density of the catches can vary greatly within the same region and gear type. ```{r echo=FALSE, fig.cap="Distribution of nutrient adequacy across k-means clusters. The bar chart delineates the number of individuals meeting the Recommended Nutrient Intake (RNI) per 1kg of catch within identified k-means clusters. Each bar is categorized into six segments corresponding to the evaluated nutrients. The clusters are enumerated on the y-axis, each representing a group with a distinct nutritional profile as determined by the cluster analysis. The x-axis quantifies the count of individuals within each cluster that meet the RNI for the respective nutrients, underlining the variability in nutrient adequacy across clusters. Panels (a) through (d) compare these distributions across different fishing practices and locations, namely Atauro and the Mainland, using all gear types or exclusively gill nets.", fig.height=6, fig.width=8, message=FALSE, warning=FALSE} library(ggplot2) @@ -67,8 +66,10 @@ plot_bars <- function(x) { ggplot2::scale_x_continuous(n.breaks = 8) + ggplot2::theme(legend.position = "bottom") + ggplot2::labs(fill = "", x = "", y = "") + - ggplot2::theme(legend.position = "", - plot.margin = unit(c(0, 0, 0, 0), "cm")) + ggplot2::theme( + legend.position = "", + plot.margin = unit(c(0, 0, 0, 0), "cm") + ) } plots <- purrr::map(data$data_raw, plot_bars) @@ -105,7 +106,6 @@ final_plot <- y_label final_plot - ``` The scatter plot from the k-means clustering (Figure 5.2) showed the distribution of nutrient profiles across different clusters in each data subset. The first two principal components explained a significant portion of the variance, indicating distinct groupings in nutrient profiles among the fishing trips. 
@@ -124,10 +124,14 @@ plots <- plots <- lapply(plots, function(x) { x + ggplot2::theme_minimal(10) + - ggplot2::theme(legend.position = "none", - plot.margin = unit(c(0, 0, 0, 0), "cm"))+ - ggplot2::labs(fill = "Nutritional\nprofile", - color = "Nutritional\nprofile") + ggplot2::theme( + legend.position = "none", + plot.margin = unit(c(0, 0, 0, 0), "cm") + ) + + ggplot2::labs( + fill = "Nutritional\nprofile", + color = "Nutritional\nprofile" + ) }) legend_plot <- cowplot::get_legend(plots[[1]] + @@ -208,16 +212,18 @@ plots <- ) plots <- lapply(plots, function(x) { x + - ggplot2::theme(legend.position = "none", - plot.margin = unit(c(0, 0, 0, 0), "cm")) + + ggplot2::theme( + legend.position = "none", + plot.margin = unit(c(0, 0, 0, 0), "cm") + ) + ggplot2::labs(x = "", y = "", color = "Nutritional\nprofile") }) legend_plot <- cowplot::get_legend(plots[[1]] + - ggplot2::theme( - legend.position = "right", - legend.key.size = ggplot2::unit(0.8, "cm"), - legend.title = ggplot2::element_text(size = 12) - )) + ggplot2::theme( + legend.position = "right", + legend.key.size = ggplot2::unit(0.8, "cm"), + legend.title = ggplot2::element_text(size = 12) + )) combined_plots <- cowplot::plot_grid(plotlist = plots, ncol = 2, labels = "auto") x_label <- cowplot::draw_label("1 - Specificity", x = 0.5, y = 0.05) @@ -238,8 +244,7 @@ final_plot ``` ```{r, echo=FALSE, message=FALSE, warning=FALSE} - -models_metrics <- +models_metrics <- timor.nutrients::model_outputs %>% purrr::map(purrr::pluck(4)) %>% purrr::imap(~ summary(.x)) %>% @@ -247,9 +252,9 @@ models_metrics <- dplyr::select(-.estimator) %>% tidyr::pivot_wider(names_from = subset, values_from = .estimate) %>% dplyr::rename(metric = .metric) %>% - na.omit() + na.omit() -models_auc <- +models_auc <- timor.nutrients::model_outputs %>% purrr::map(purrr::pluck(8)) %>% dplyr::bind_rows(.id = "subset") %>% @@ -257,13 +262,15 @@ models_auc <- tidyr::pivot_wider(names_from = subset, values_from = estimate) 
-dplyr::bind_rows(models_auc, models_metrics) %>% - dplyr::bind_rows(models_auc, models_metrics) %>% - dplyr::rename("Atauro AG" = model_atauro_AG, - "Mainland AG" = model_timor_AG, - "Atauro GN" = model_atauro_GN, - "Mainland GN" = model_timor_GN) %>% - dplyr::mutate(dplyr::across(.cols = dplyr::where(is.numeric), ~ round(.x, 2))) %>% +dplyr::bind_rows(models_auc, models_metrics) %>% + dplyr::bind_rows(models_auc, models_metrics) %>% + dplyr::rename( + "Atauro AG" = model_atauro_AG, + "Mainland AG" = model_timor_AG, + "Atauro GN" = model_atauro_GN, + "Mainland GN" = model_timor_GN + ) %>% + dplyr::mutate(dplyr::across(.cols = dplyr::where(is.numeric), ~ round(.x, 2))) %>% reactable::reactable( theme = reactablefmtr::fivethirtyeight(centered = TRUE), defaultExpanded = TRUE, @@ -280,21 +287,130 @@ dplyr::bind_rows(models_auc, models_metrics) %>% ) ) ) - ``` Table 5.2: Performance Metrics for XGBoost Model Across Fishing Data Subsets. This table provides a comprehensive overview of the predictive performance of an XGBoost classification model for four distinct subsets of fishing data: Atauro with all gears (ATAURO AG), Atauro with gill nets (ATAURO GN), Mainland with all gears (MAINLAND AG), and Mainland with gill nets (MAINLAND GN). Key performance indicators include ROC-AUC (area under the receiver operating characteristic curve), accuracy, Kappa (kap), sensitivity (sens), specificity (spec), positive predictive value (ppv), negative predictive value (npv), Matthew's correlation coefficient (mcc), Youden's J index (j_index), balanced accuracy (bal_accuracy), detection prevalence, precision, recall, and F measure (f_meas). The metrics collectively reflect the model's ability to discriminate between nutritional profiles, its overall accuracy, and the balance between the sensitivity and specificity for each subset. -Shap values results... 
-```{r model-explanation, echo=FALSE, message=FALSE, warning=FALSE,fig.height=9, fig.width=7} -plot_model_shaps(timor.nutrients::shap_results$model_atauro_GN, - model_type = "gn", - cols = 1, - alpha = 0.75) +The analysis of SHAP values from gill net models reveals the interaction between mesh size and habitat in predicting nutrient profiles. In the Atauro region, as depicted in Figure 5.4, smaller mesh sizes (below 40 mm) are consistently linked to a higher prediction of nutrient profile NP1 across various habitats, especially reefs, beaches, and mangroves. This suggests that smaller mesh sizes are generally effective across these diverse marine environments for predicting NP1. + +For nutrient profile NP2, there is a noticeable increase in SHAP values within the 40 to 60 mm mesh size range, with reefs and beaches showing this pattern most clearly. This indicates that medium mesh sizes are particularly predictive of NP2 in these ecological settings. + +Larger mesh sizes, specifically those between 60 and 70 mm, have been associated with nutrient profiles NP3 and NP4 across several habitats, including reefs, beaches, and mangroves. A more specific association is observed with mesh sizes between 70 and 80 mm, which are predominantly linked to predicting NP4. For the largest mesh sizes analyzed, nutrient profile NP5 emerges as the most likely prediction among the various profiles, especially in the Atauro data subset. + +The SHAP values derived from the mainland data present a more varied pattern. Small mesh sizes (less than 35 mm) used in deep water and FAD environments are linked with the prediction of nutrient profiles NP3 and NP4, with the latter also being associated with reef and beach habitats. Mesh sizes in the range of 40 to 65 mm are strong predictors for nutrient profiles NP1 and NP5. Profile NP1 is most commonly predicted in reef and FAD settings, while NP5 is typically associated with deeper waters. 
At the larger end of the mesh size spectrum, nutrient profile NP2 becomes the most probable prediction, particularly when fishing occurs in deeper habitats. + +```{r, echo=FALSE, message=FALSE, warning=FALSE,fig.height=7, fig.width=8, fig.cap="Differential influence of mesh size on nutritional profile predictions across habitats. The figure compiles subplots for five distinct nutrient profiles (NP1-NP5) as predicted by gill net XGBoost models, with each subplot showing the distribution of SHAP values across varying mesh sizes. Each data point is colored to represent different habitats: Beach, Deep, FAD, Mangrove, Reef, Seagrass and Traditional FAD, providing insight into the habitat-specific impact of mesh size on the predictive accuracy of the model. The x-axis delineates the mesh size range, while the y-axis quantifies the magnitude of the SHAP value, with positive values denoting a heightened probability of a nutrient profile's occurrence and negative values indicating a reduced probability."} +# atauro gn +sha <- shapviz::shapviz(timor.nutrients::shap_results$model_atauro_GN) +shapviz_object <- purrr::map(sha, plot_shap, model_type = "gn", alpha = 0.3) + +plots <- lapply(shapviz_object, function(x) { + x + + ggplot2::theme( + legend.position = "none", + plot.margin = unit(c(0, 0, 0, 0), "cm") + ) + + ggplot2::labs(x = "", y = "") +}) + +combined_plots_atauro <- cowplot::plot_grid( + plotlist = plots, + ncol = 1, + label_fontface = "bold", + label_size = 6.5, + hjust = -0.5, + vjust = -0.5, + align = "hv", + labels = c("NP1", "NP2", "NP3", "NP4", "NP5") +) + +# timor gn +sha <- shapviz::shapviz(timor.nutrients::shap_results$model_timor_GN) +shapviz_object <- purrr::map(sha, plot_shap, model_type = "gn", alpha = 0.3) + +plots <- lapply(shapviz_object, function(x) { + x + + ggplot2::theme( + legend.position = "none", + plot.margin = unit(c(0, 0, 0, 0), "cm") + ) + + ggplot2::labs(x = "", y = "") +}) + +legend_plot <- cowplot::get_legend(plots[[1]] + + 
ggplot2::theme( + plot.margin = unit(c(0, 0, 0, 0), "cm"), + legend.title = ggplot2::element_text(size = 11), + legend.position = "bottom", + legend.direction = "horizontal", + legend.justification = "center", + legend.box.just = "bottom" + )) +combined_plots_timor <- cowplot::plot_grid( + plotlist = plots, + ncol = 1, + label_fontface = "bold", + label_size = 6.5, + hjust = -0.5, + vjust = -0.5, + align = "hv", + labels = c("NP1", "NP2", "NP3", "NP4", "NP5") +) + +gn_plots <- cowplot::plot_grid(combined_plots_atauro, + combined_plots_timor, + labels = c("Atauro GN", "Mainland GN"), + align = "hv", + label_size = 9, + scale = 0.9, + hjust = -1.5 +) + + +x_label <- cowplot::draw_label("Mesh size (mm)", x = 0.5, y = 0.025) +# } else { +# x_label <- cowplot::draw_label("Habitat x Gear type ", x = 0.5, y = 0.05) +# } +y_label <- cowplot::draw_label("SHAP value", x = 0.015, y = 0.5, angle = 90) + +body_plot <- + gn_plots + + x_label + + y_label +final_plot <- + cowplot::plot_grid( + body_plot, + legend_plot, + ncol = 1, + nrow = 2, + rel_heights = c(2.5, 0.15), + greedy = FALSE + ) +final_plot +``` + +SHAP results of all gears models ... + +```{r, echo=FALSE, message=FALSE, warning=FALSE,fig.height=7, fig.width=8, fig.cap="Lore ipsum"} +plot_model_shaps(timor.nutrients::shap_results$model_atauro_AG, + model_type = "ag", + cols = 2, + alpha = 0.25, + drop_legend = TRUE +) ``` +```{r, echo=FALSE, message=FALSE, warning=FALSE,fig.height=8, fig.width=9, fig.cap="Lore ipsum2"} +plot_model_shaps(timor.nutrients::shap_results$model_timor_AG, + model_type = "ag", + cols = 2, + alpha = 0.25, + drop_legend = TRUE +) +``` + + ## Checks and limitations - Are we considering all the possible potential good predictors? 
- diff --git a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-10-1.png b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-10-1.png index 55ef2e6..a6aef42 100644 Binary files a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-10-1.png and b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-10-1.png differ diff --git a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-11-1.png b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-11-1.png index 2a3f086..1cfe421 100644 Binary files a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-11-1.png and b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-11-1.png differ diff --git a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-12-1.png b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-12-1.png index 55ef2e6..6b3bef7 100644 Binary files a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-12-1.png and b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-12-1.png differ diff --git a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-13-1.png b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-13-1.png index 2a3f086..ea45f07 100644 Binary files a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-13-1.png and b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-13-1.png differ diff --git a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-16-1.png 
b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-16-1.png new file mode 100644 index 0000000..496142f Binary files /dev/null and b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-16-1.png differ diff --git a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-17-1.png b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-17-1.png new file mode 100644 index 0000000..6b3bef7 Binary files /dev/null and b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-17-1.png differ diff --git a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-6-1.png b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-6-1.png new file mode 100644 index 0000000..ce76702 Binary files /dev/null and b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-7-1.png b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-7-1.png new file mode 100644 index 0000000..55ef2e6 Binary files /dev/null and b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-7-1.png differ diff --git a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-8-1.png b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-8-1.png index 4438b5c..4a886a7 100644 Binary files a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-8-1.png and b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-8-1.png differ diff --git a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-9-1.png 
b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-9-1.png index ce76702..0222dee 100644 Binary files a/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-9-1.png and b/docs_book/Timor-nutrient-sensitive-fisheries-management_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/man/plot_model_shaps.Rd b/man/plot_model_shaps.Rd index 7d00c42..0a037d5 100644 --- a/man/plot_model_shaps.Rd +++ b/man/plot_model_shaps.Rd @@ -4,7 +4,13 @@ \alias{plot_model_shaps} \title{Plot SHAP Values for Multiple Models} \usage{ -plot_model_shaps(data_shaps = NULL, model_type = NULL, alpha = 0.2, cols = 1) +plot_model_shaps( + data_shaps = NULL, + model_type = NULL, + alpha = 0.2, + cols = 1, + drop_legend = FALSE +) } \arguments{ \item{data_shaps}{A data frame or list of data frames containing SHAP values and corresponding features.} @@ -14,6 +20,8 @@ plot_model_shaps(data_shaps = NULL, model_type = NULL, alpha = 0.2, cols = 1) \item{alpha}{The alpha value for geom_jitter in ggplot2, controlling point transparency.} \item{cols}{The number of columns in the plot layout.} + +\item{drop_legend}{Whether to return legend. Default is FALSE.} } \value{ A ggplot object representing the combined grid of SHAP value plots for different models.