diff --git a/.Rbuildignore b/.Rbuildignore index fc86f525..aa09cc1d 100755 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -3,3 +3,5 @@ ^data-raw$ ^.travis.yml ^appveyor\.yml$ +^doc$ +^Meta$ diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000..46523f94 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,75 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + schedule: + - cron: "5 5 5 * *" # run at 05:05 on 5th day of each month + +name: Build + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel'} + - {os: ubuntu-latest, r: 'devel'} + - {os: macOS-latest, r: 'release'} + - {os: windows-latest, r: 'release'} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + # on PR set branch to be able to push back, see https://github.com/actions/checkout/issues/124 + - uses: actions/checkout@v2 + if: github.event_name == 'pull_request' + with: + ref: ${{ github.event.pull_request.head.ref }} + + - uses: actions/checkout@v2 + if: github.event_name != 'pull_request' + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + use-public-rspm: true + + - uses: r-lib/actions/setup-pandoc@v2 + - uses: r-lib/actions/setup-tinytex@v2 + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: | + any::devtools + any::covr + + - name: Document + run: devtools::document() + shell: Rscript {0} + + - name: Commit and push documentation changes + if: ${{ (matrix.config.os == 'ubuntu-latest') && (matrix.config.r == 'release') }} + run: | + git config --local user.name "$GITHUB_ACTOR" + git 
config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" + git add man/\* NAMESPACE + git commit -m "Update documentation" || echo "No changes to commit" + git pull --ff-only + git push origin + + - name: Check package + run: devtools::check() + shell: Rscript {0} + + - name: Check coverage + run: covr::codecov(quiet = FALSE) + shell: Rscript {0} diff --git a/.github/workflows/pkgdown.yml b/.github/workflows/pkgdown.yml new file mode 100644 index 00000000..076fe718 --- /dev/null +++ b/.github/workflows/pkgdown.yml @@ -0,0 +1,41 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + release: + types: [published] + +name: Website + +jobs: + pkgdown: + runs-on: ubuntu-latest + # Only restrict concurrency for non-PR jobs + concurrency: + group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-pandoc@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::pkgdown, local::. 
+ + - name: Build site + run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) + shell: Rscript {0} + + - name: Deploy to GitHub pages 🚀 + uses: JamesIves/github-pages-deploy-action@4.1.4 + with: + clean: false + branch: gh-pages + folder: docs \ No newline at end of file diff --git a/.gitignore b/.gitignore index 600853c9..2c77472a 100755 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,7 @@ *.so *.dll vignettes/.build.timestamp +inst/doc +doc +Meta +docs/ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index f8b09bb0..00000000 --- a/.travis.yml +++ /dev/null @@ -1,22 +0,0 @@ -language: r -cache: packages -sudo: false -warnings_are_errors: false - -r_packages: -- covr - -os: -- linux - -after_success: -- Rscript -e 'library(covr); codecov()' - -notifications: - slack: - on_success: change - on_failure: change - rooms: - secure: BZLpMK+Hoh7Bv6CUqYvyebuxQiSi9KptXeCabjFBeJWPi7aLq09RNUI3w8QEsbnl2qmaLckDyaJBXLUvB1cb49Hp/q3obEh9wGBP5Y/Qbsi18pwHAFvJcZKAHVaTcNBBomvO4bYwopcZSmwiKzMdlk2BeYjbnPbf4DVo1QvGXqREw8R71Lk1oBgZ+KoienFkhdpG2pSECQV0iU6v+MbmwYBxV9k2B9bmySinxvvzwbV9nON/5dOYggPc6uIVCLbwaU3Y2c9ffzsXKGB/FrfYaWeYVRiZ3Gp2E6ZcAmK2AcjAfyij1IalU9D6AWBuKhHHxk+nJhyGK3tgvXlO1G3XTUBLhYgURvzTugLBJm0MU2KkTiSLxJEaZl+fUvoDuXmwwusZV8vZFHvwkdOBgnWFFcoH7ocy8OCfXFkFMDtpx+CwXFa/uBjLRhWmgwBXeIytHcfwHKVhgH7Vj7ShldgonTM0hcVoRldenXg22pK2W8MDcek16I9JZ6htJ/8f7doqOTNnepDIFHTBkpR3WQWd7GfdHwmxwwyJi5tJr3qpnkFAKaJhUAyNIPU5EU2JZTqGsxaYwLSI8xxIwMgzZTXijWIi5KXbsn85n1rpT9CsObLbROeYLngs9YvdarT6OvFIXSN9Oo05fW32iwvWuck001w8jn2pYD8u7jy5AMM1g4k= - template: - - "Build <%{build_url}|#%{build_number}> (<%{compare_url}|%{commit}>) of %{repository}@%{branch} by %{author} %{result} in %{duration}. 
%{message}" \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 459e02e0..98c333f1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: netdist Title: An implementation of the NetEMD alignment-free network distance measure -Version: 0.4.9000 +Version: 0.4.9100 Authors@R: person("Martin", "O'Reilly",email = "developer@mtspace.net", role = c("aut", "cre")) Description: An implementation of the NetEMD alignment-free network @@ -20,13 +20,15 @@ Imports: purrr, lpSolve, plyr, - dplyr, - Rcpp + pheatmap, + Rcpp Suggests: + dplyr, + phangorn, testthat, knitr, - phangorn, - rmarkdown -RoxygenNote: 6.0.1 + rmarkdown, + roxygen2 +RoxygenNote: 7.2.0 VignetteBuilder: knitr Encoding: UTF-8 diff --git a/Meta/vignette.rds b/Meta/vignette.rds new file mode 100644 index 00000000..eda91ed9 Binary files /dev/null and b/Meta/vignette.rds differ diff --git a/NAMESPACE b/NAMESPACE index 24fa6913..9b550b04 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,6 @@ # Generated by roxygen2: do not edit by hand export(NetEmdSmoothV2) -export(NetEmdSmooth) export(adaptive_breaks) export(area_between_dhist_ecmfs) export(as_smoothed_dhist) @@ -10,13 +9,15 @@ export(binned_densities_adaptive) export(count_graphlet_tuples) export(count_graphlet_tuples_ego) export(count_graphlets_ego) -export(count_graphlets_ego_scaled) export(count_graphlets_for_graph) export(count_graphlets_per_node) export(count_orbits_per_node) export(counts_from_observations) export(cross_comp_to_matrix) export(cross_comparison_spec) +export(density_binned_counts) +export(density_binned_counts_gp) +export(density_from_counts) export(dhist) export(dhist_ecmf) export(dhist_from_obs) @@ -25,6 +26,8 @@ export(dhist_mean_location) export(dhist_std) export(dhist_variance) export(ecmf_knots) +export(ego_network_density) +export(ego_to_graphlet_counts) export(emd) export(emd_cs) export(emd_fast_no_smoothing) @@ -46,17 +49,20 @@ export(min_emd) export(min_emd_exhaustive) export(min_emd_optimise) 
export(min_emd_optimise_fast) -export(netEMDSpeedTest) export(netEMDSpeedTestSmooth) -export(net_emd) -export(net_emds_for_all_graphs) export(netdis) +export(netdis.plot) export(netdis_centred_graphlet_counts) -export(netdis_centred_graphlet_counts_ego) -export(netdis_expected_graphlet_counts) -export(netdis_expected_graphlet_counts_ego) -export(netdis_expected_graphlet_counts_ego_fn) -export(netdis_for_all_graphs) +export(netdis_expected_counts) +export(netdis_many_to_many) +export(netdis_one_to_many) +export(netdis_one_to_one) +export(netdis_subtract_exp_counts) +export(netdis_uptok) +export(netemd.plot) +export(netemd_many_to_many) +export(netemd_one_to_one) +export(netemd_single_pair) export(normalise_dhist_mass) export(normalise_dhist_variance) export(orbit_key) @@ -64,11 +70,10 @@ export(orbit_to_graphlet_counts) export(read_simple_graph) export(read_simple_graphs) export(scale_graphlet_count) +export(scale_graphlet_counts_ego) export(shift_dhist) export(simplify_graph) +export(single_density_bin) export(sort_dhist) -export(zeros_to_ones) -import(Rcpp) -importFrom(Rcpp,evalCpp) importFrom(Rcpp,sourceCpp) useDynLib(netdist, .registration=TRUE) diff --git a/R/PlottingFunctions.R b/R/PlottingFunctions.R new file mode 100644 index 00000000..e525dabc --- /dev/null +++ b/R/PlottingFunctions.R @@ -0,0 +1,56 @@ + +#' Heatmap of Netdis many-to-many comparisons +#' +#' Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +#' +#' @param netdislist Default output of \code{netdis_many_to_many}. +#' +#' @param whatrow Selection of the row in \code{netdis_many_to_many$comp_spec} to be used for plotting. +#' +#' @param clustering_method Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default). +#' +#' @param main Title of the plot. +#' +#' @param docluster controls the order of the rows and columns. 
If TRUE (default) the rows and columns will be reordered to create the dendrogram. If FALSE, then only the heatmap is drawn. +#' +#' @return Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +#' @export + +netdis.plot <- function(netdislist,whatrow=c(1,2)[2],clustering_method="ward.D",main="Netdis",docluster=TRUE){ + adjmat <- cross_comp_to_matrix(measure = netdislist$netdis[whatrow,], cross_comparison_spec = netdislist$comp_spec) + vnames <- rownames(adjmat) + + legend1 <- seq(min(adjmat),max(adjmat),length.out = 5) + levels1 <- round(legend1,digits = 2) + pheatmap::pheatmap(mat = as.dist(adjmat),cluster_rows = docluster,cluster_cols = docluster,clustering_method = clustering_method,angle_col=45,main = main,treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) +} + + + + +#' Heatmap of NetEmd many-to-many comparisons +#' +#' Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +#' +#' @param netemdlist Default output of \code{netemd_many_to_many}. +#' +#' The input list must contain the elements \code{netemds} and \code{comp_spec}. +#' +#' @param clustering_method Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default). +#' +#' @param main Title of the plot. +#' +#' @param docluster controls the order of the rows and columns.
+#' @export + +netemd.plot <- function(netemdlist,clustering_method="ward.D",main="NetEmd",docluster=TRUE){ + adjmat <- cross_comp_to_matrix(measure = netemdlist$netemds, cross_comparison_spec = netemdlist$comp_spec) + vnames <- rownames(adjmat) + + legend1 <- seq(min(adjmat),max(adjmat),length.out = 5) + levels1 <- round(legend1,digits = 2) + pheatmap::pheatmap(mat = as.dist(adjmat),cluster_rows = docluster,cluster_cols = docluster,clustering_method = clustering_method,angle_col=45,main = main,treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) + +} diff --git a/R/RcppExports.R b/R/RcppExports.R index e718b542..ab9a7e71 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -29,16 +29,6 @@ emd_fast_no_smoothing <- function(locations1, values1, locations2, values2) { #' Compute EMD NULL -#' -#' @export -NetEmdSmooth <- function(loc1, val1, binWidth1, loc2, val2, binWidth2) { - .Call(`_netdist_NetEmdSmooth`, loc1, val1, binWidth1, loc2, val2, binWidth2) -} - -#' @title -#' Compute EMD -NULL - #' #' @export NetEmdSmoothV2 <- function(loc1, val1, binWidth1, loc2, val2, binWidth2) { diff --git a/R/data.R b/R/data.R index 43e2c487..cf5c5dff 100644 --- a/R/data.R +++ b/R/data.R @@ -1,8 +1,8 @@ #' Protein-protein interaction (PPI) networks for 5 microorganisms #' -#' A dataset containing the protein-protein interaction networks for the +#' A dataset containing the protein-protein interaction networks for the #' following 5 microorganisms -#'\itemize{ +#' \itemize{ #' \item EBV #' \itemize{ #' \item Common name: Epstein Barr virus @@ -33,11 +33,32 @@ #' \item Scientific name: Human alphaherpesvirus 3 #' \item TaxonomyID: 10335 #' } -#'} +#' } #' #' @format A list of \code{igraph} objects. #' @source \strong{PPI data (EBV, HSV-1, KSHV, VZV):} Fossum E, Friedel CC, Rajagopala SV, Titz B, Baiker A, Schmidt T, et al. (2009) Evolutionarily Conserved Herpesviral Protein Interaction Networks. 
PLoS Pathog 5(9): e1000570. \url{https://doi.org/10.1371/journal.ppat.1000570}. Data from Table S2 in the supporting information. #' @source \strong{PPI data (ECL):} Peregrín-Alvarez JM, Xiong X, Su C, Parkinson J (2009) The Modular Organization of Protein Interactions in Escherichia coli. PLoS Comput Biol 5(10): e1000523. \url{https://doi.org/10.1371/journal.pcbi.1000523} #' @source \strong{Taxonomy ground truth:} NCBI taxonomy database. \url{https://www.ncbi.nlm.nih.gov/taxonomy} #' @encoding UTF-8 -"virusppi" \ No newline at end of file +"virusppi" + + + + + +#' World trade networks from 1985–2014 +#' +#' The world trade data set consists of a small sample of world trade networks for the years 2001-2014, and pre-computed subgraph counts of a larger set of world trade networks (1985–2014). The world trade networks are based on the data set from [Feenstra et al., 2005] for the years 1962- 2000 and on the United Nations division COMTRADE [Division, 2015] for the years 2001-2014. +#' +#' \itemize{ +#' \item wtnets: List of \code{igraph} objects providing the world trade networks from 2001–2014. +#' \item Counts: Pre-computed graphlet counts for the world trade networks in the years 1985-2014. +#' } +#' +#' @format A list of two elements. The first element, 'wtnets', is a list of \code{igraph} objects providing a small sample of world trade networks from 2001–2014. The second element, 'Counts', is a list of pre-computed subgraph counts of world trade networks in the years 1985-2014. +#' @source \strong{World trade networks:}. United nations commodity trade statistics database (UN comtrade). http://comtrade.un.org/, 2015. +#' @source \strong{Subgraph Counts:} Feenstra RC,Lipsey RE, Deng H, Ma AC, and Mo H. (2005) World trade flows: 1962-2000. Technical report, National Bureau of Economic Research. (See also https://cid.econ.ucdavis.edu/wix.html). 
+#' +#' @encoding UTF-8 +"worldtradesub" + diff --git a/R/dhist.R b/R/dhist.R index af572073..a5261f76 100644 --- a/R/dhist.R +++ b/R/dhist.R @@ -1,17 +1,17 @@ # HISTOGRAM FUNCTIONS #' Discrete histogram constructor -#' -#' Creates a discrete histogram object of class \code{dhist}, with bin +#' +#' Creates a discrete histogram object of class \code{dhist}, with bin #' \code{locations} and \code{masses} set to the 1D numeric vectors provided. #' @param locations A 1D numeric vector specifying the discrete locations #' of the histogram bins -#' @param masses A 1D numeric vector specifying the mass present at each +#' @param masses A 1D numeric vector specifying the mass present at each #' location #' @param smoothing_window_width If greater than 0, the discrete histogram will #' be treated as having the mass at each location "smoothed" uniformly across #' a bin centred on the location and having width = \code{smoothing_window_width} #' (default = \code{0} - no smoothing) -#' @param sorted Whether or not to return a discrete histogram with locations +#' @param sorted Whether or not to return a discrete histogram with locations #' and masses sorted by ascending mass (default = \code{TRUE}) #' @return A sparse discrete histogram. Format is a \code{dhist} object, which #' is a list of class \code{dhist} with the following named elements: @@ -22,30 +22,32 @@ #' Note that locations where no mass is present are not included in the returned #' \code{dhist} object. Mass in these discrete histograms is treated as being #' present precisely at the specified location. Discrete histograms should not be used -#' for data where observations have been grouped into bins representing ranges +#' for data where observations have been grouped into bins representing ranges #' of observation values. 
#' @export dhist <- function(locations, masses, smoothing_window_width = 0, sorted = TRUE) { - if(!is_numeric_vector_1d(locations)) { + if (!is_numeric_vector_1d(locations)) { stop("Bin locations must be provided as a 1D numeric vector") } - if(!is_numeric_vector_1d(masses)) { + if (!is_numeric_vector_1d(masses)) { stop("Bin masses must be provided as a 1D numeric vector") } - if(length(locations) != length(masses)) { + if (length(locations) != length(masses)) { stop("The number of bin locations and masses provided must be equal") } - dhist <- list(locations = locations, masses = masses, - smoothing_window_width = smoothing_window_width) + dhist <- list( + locations = locations, masses = masses, + smoothing_window_width = smoothing_window_width + ) class(dhist) <- "dhist" - if(sorted == TRUE) { + if (sorted == TRUE) { dhist <- sort_dhist(dhist) } return(dhist) } #' Compare dhists -#' +#' #' Compares all fields of the dhist and only returns treu if they are all the #' same in both dhists #' @param dhist1 A discrete histogram as a \code{dhist} object @@ -53,23 +55,23 @@ dhist <- function(locations, masses, smoothing_window_width = 0, sorted = TRUE) `==.dhist` <- function(dhist1, dhist2) { class(dhist1) == class(dhist2) && all(mapply(`==`, dhist1$locations, dhist2$locations)) && - all(mapply(`==`, dhist1$masses, dhist2$masses)) && + all(mapply(`==`, dhist1$masses, dhist2$masses)) && dhist1$smoothing_window_width == dhist2$smoothing_window_width } -update_dhist <- +update_dhist <- function(dhist, locations = dhist$locations, masses = dhist$masses, smoothing_window_width = dhist$smoothing_window_width) { dhist$locations <- locations dhist$masses <- masses dhist$smoothing_window_width <- smoothing_window_width return(dhist) - } + } #' Set dhist smoothing -#' -#' Returns a "smoothed" copy of a \code{dhist} object with its -#' \code{smoothing_window_width} attribute set to the value provided +#' +#' Returns a "smoothed" copy of a \code{dhist} object with its +#' 
\code{smoothing_window_width} attribute set to the value provided #' \code{smoothing_window_width} parameter. #' @param dhist A discrete histogram as a \code{dhist} object #' @param smoothing_window_width If greater than 0, the discrete histogram will @@ -84,8 +86,8 @@ as_smoothed_dhist <- function(dhist, smoothing_window_width) { } #' Remove dhist smoothing -#' -#' Returns an "unsmoothed" copy of a \code{dhist} object with its +#' +#' Returns an "unsmoothed" copy of a \code{dhist} object with its #' \code{smoothing_window_width} attribute set to 0. #' @param dhist A discrete histogram as a \code{dhist} object #' @return A copy of a \code{dhist} object with its \code{smoothing_window_width} @@ -97,40 +99,40 @@ as_unsmoothed_dhist <- function(dhist) { } #' Check if an object is a \code{dhist} discrete histogram -#' -#' Checks if the input object is of class \code{dhist}. If \code{fast_check} is -#' \code{TRUE} then the only check is whether the object has a class attribute of +#' +#' Checks if the input object is of class \code{dhist}. If \code{fast_check} is +#' \code{TRUE} then the only check is whether the object has a class attribute of #' \code{dhist}. If \code{fast_check} is \code{FALSE} (default), then checks #' are also made to ensure that the object has the structure required of a -#' \code{dhist} object. +#' \code{dhist} object. 
#' @param x An arbitrary object -#' @param fast_check Boolean flag indicating whether to perform only a -#' superficial fast check limited to checking the object's class attribute +#' @param fast_check Boolean flag indicating whether to perform only a +#' superficial fast check limited to checking the object's class attribute #' is set to \code{dhist} (default = \code{TRUE}) #' @export is_dhist <- function(x, fast_check = TRUE) { # Quick check that relies on user not to construct variables with "dhist" class # that do not have the required elements - has_class_attr <-(class(x) == "dhist") - if(fast_check) { + has_class_attr <- (class(x) == "dhist") + if (fast_check) { # Early return if fast check requested return(has_class_attr) } # Otherwise check structure - has_locations <- purrr::contains(attr(x, "name"), "locations") - has_masses <- purrr::contains(attr(x, "name"), "masses") - # Require list with correct class and presence of 1D numeric vector named + has_locations <- purrr::has_element(attr(x, "name"), "locations") + has_masses <- purrr::has_element(attr(x, "name"), "masses") + # Require list with correct class and presence of 1D numeric vector named # elements "locations" and "masses" return(has_class_attr - && purrr::is_list(x) - && has_locations - && has_masses - && is_numeric_vector_1d(x$locations) - && is_numeric_vector_1d(x$masses)) + && purrr::is_list(x) + && has_locations + && has_masses + && is_numeric_vector_1d(x$locations) + && is_numeric_vector_1d(x$masses)) } #' Discrete histogram from observations (Pure R slow version) -#' +#' #' Generate a sparse discrete histogram from a set of discrete numeric observations #' @param observations A vector of discrete numeric observations #' @return A sparse discrete histogram. 
Format is a \code{dhist} object, which @@ -142,14 +144,16 @@ is_dhist <- function(x, fast_check = TRUE) { #' @export dhist_from_obs_slow <- function(observations) { # Require 1D numeric vector - if(!is_numeric_vector_1d(observations)) { + if (!is_numeric_vector_1d(observations)) { stop("Observations must be provided as a 1D numeric vector") } # Identify unique observations locations <- sort(unique(observations)) # Count occurences of each unique obervation - counts <- sapply(locations, function(location) {sum(observations == location)}) + counts <- sapply(locations, function(location) { + sum(observations == location) + }) # Construct histogram object hist <- dhist(locations = locations, masses = counts) return(hist) @@ -157,7 +161,7 @@ dhist_from_obs_slow <- function(observations) { #' Discrete histogram from observations -#' +#' #' Generate a sparse discrete histogram from a set of discrete numeric observations #' @param observations A vector of discrete numeric observations #' @return A sparse discrete histogram. 
Format is a \code{dhist} object, which @@ -169,25 +173,25 @@ dhist_from_obs_slow <- function(observations) { #' @export dhist_from_obs <- function(observations) { # Require 1D numeric vector - if(!is_numeric_vector_1d(observations)) { + if (!is_numeric_vector_1d(observations)) { stop("Observations must be provided as a 1D numeric vector") } - if (any(is.na(observations))) { - stop("NA observed in features") - } + if (any(is.na(observations))) { + stop("NA observed in features") + } results <- counts_from_observations(matrix(observations)) # Construct histogram object - hist <- dhist(locations = results[,1], masses = results[,2]) + hist <- dhist(locations = results[, 1], masses = results[, 2]) return(hist) } -#' Generate interpolating empirical cumulative mass function (ECMF) for +#' Generate interpolating empirical cumulative mass function (ECMF) for #' a discrete histogram -#' +#' #' @param dhist A discrete histogram as a \code{dhist} object #' @return An interpolating ECMF as an \code{approxfun} object. This function -#' will return the interpolated cumulative mass for a vector of arbitrary +#' will return the interpolated cumulative mass for a vector of arbitrary #' locations. If \code{dhist$smoothing_window_width} is zero, the ECMF will be #' piecewise constant. If \code{dhist$smoothing_window_width} is one, the ECMF #' will be piece-wise linear. If \code{dhist$smoothing_window_width} is any @@ -199,7 +203,7 @@ dhist_ecmf <- function(dhist) { # Determine cumulative mass at each location cum_mass <- cumsum(dhist$masses) # Generate ECMF - if(dhist$smoothing_window_width == 0) { + if (dhist$smoothing_window_width == 0) { # Avoid any issues with floating point equality comparison completely when # no smoothing is occurring x_knots <- dhist$locations @@ -215,22 +219,22 @@ dhist_ecmf <- function(dhist) { upper_limits <- dhist$locations + hw cum_mass_lower <- cum_mass cum_mass_upper <- cum_mass - # 2. 
Set lower limit cumulative masses to have the same value as the + # 2. Set lower limit cumulative masses to have the same value as the # upper limit of the previous location. This ensures constant interpolation # between the upper limit of one location and the lower limit of the next - cum_mass_lower <- c(0, utils::head(cum_mass_upper, num_locs -1)) - # 3. Identify upper limits within machine precision of the lower limit of + cum_mass_lower <- c(0, utils::head(cum_mass_upper, num_locs - 1)) + # 3. Identify upper limits within machine precision of the lower limit of # the next location - diff <- abs(utils::head(upper_limits, num_locs -1) - - utils::tail(lower_limits, num_locs -1)) + diff <- abs(utils::head(upper_limits, num_locs - 1) - + utils::tail(lower_limits, num_locs - 1)) tolerance <- .Machine$double.eps drop_indexes <- which(diff <= tolerance) - # 4. Drop upper limits and associated cumulative masses where a lower + # 4. Drop upper limits and associated cumulative masses where a lower # limit exists at the same location (to within machine precision). # NOTE: We need to skip this step entirely if there are no upper limits to # drop as vector[-0] returns an empty vector rather than all entries in the # vector. 
- if(length(drop_indexes) > 0) { + if (length(drop_indexes) > 0) { upper_limits <- upper_limits[-drop_indexes] cum_mass_upper <- cum_mass_upper[-drop_indexes] } @@ -247,9 +251,11 @@ dhist_ecmf <- function(dhist) { } # Construct ECMF max_mass <- cum_mass[length(cum_mass)] - dhist_ecmf <- stats::approxfun(x = x_knots, y = cum_mass, - method = interpolation_method, yleft = 0, - yright = max_mass, f = 0, ties = min) + dhist_ecmf <- stats::approxfun( + x = x_knots, y = cum_mass, + method = interpolation_method, yleft = 0, + yright = max_mass, f = 0, ties = min + ) class(dhist_ecmf) <- c("dhist_ecmf", class(dhist_ecmf)) attr(dhist_ecmf, "type") <- interpolation_method return(dhist_ecmf) @@ -257,30 +263,30 @@ dhist_ecmf <- function(dhist) { #' Get "knots" for discrete histogram empirical cumulative mass function #' (ECMF). The "knots" are the x-values at which the y-value of the ECDM changes -#' gradient (i.e. the x-values between which the ECMF does its constant or +#' gradient (i.e. the x-values between which the ECMF does its constant or #' linear interpolates) -#' -#' @param dhist_ecmf An object of class \code{dhist_ecmf}, returned from a call +#' +#' @param dhist_ecmf An object of class \code{dhist_ecmf}, returned from a call #' to the \code{dhist_ecmf} function -#' @return x_knots A list of "knots" for the ECMF, containing all x-values at +#' @return x_knots A list of "knots" for the ECMF, containing all x-values at #' which the y-value changes gradient (i.e. 
the x-values between which the ECMF #' does its constant or linear interpolation) #' @export ecmf_knots <- function(dhist_ecmf) { # dhist_ecmf is a stats::approxfun object and is either a piecewise constant - # or piece-wise linear function, with the x argument of the underlying + # or piece-wise linear function, with the x argument of the underlying # approxfun set to the inflexion points (or knots) of the pricewise function - # We simply recover the value of the x argument by evaluating "x" in the + # We simply recover the value of the x argument by evaluating "x" in the # environment of the dhist_ecmf approxfun - eval(expression(x), envir=environment(dhist_ecmf)) + eval(expression(x), envir = environment(dhist_ecmf)) } -#' Calculate area between two discrete histogram empirical cumulative +#' Calculate area between two discrete histogram empirical cumulative #' mass functions (ECMFs) -#' -#' @param dhist_ecmf1 An object of class \code{dhist_ecmf}, returned from a call +#' +#' @param dhist_ecmf1 An object of class \code{dhist_ecmf}, returned from a call #' to the \code{dhist_ecmf} function -#' @param dhist_ecmf2 An object of class \code{dhist_ecmf}, returned from a call +#' @param dhist_ecmf2 An object of class \code{dhist_ecmf}, returned from a call #' to the \code{dhist_ecmf} function #' @return area The area between the two discrete histogram ECMFs, calculated as #' the integral of the absolute difference between the two ECMFs @@ -289,7 +295,7 @@ area_between_dhist_ecmfs <- function(dhist_ecmf1, dhist_ecmf2) { # Ensure ECMFs have compatible types ecmf_type1 <- attr(dhist_ecmf1, "type") ecmf_type2 <- attr(dhist_ecmf2, "type") - if(ecmf_type1 != ecmf_type2) { + if (ecmf_type1 != ecmf_type2) { stop("ECMFs must have the same type") } ecmf_type <- ecmf_type1 @@ -305,7 +311,7 @@ area_between_dhist_ecmfs <- function(dhist_ecmf1, dhist_ecmf2) { x_lower <- utils::head(x, num_segs) x_upper <- utils::tail(x, num_segs) # Depending on the ECDF type, we calculate the area 
between ECMFs differently - if(ecmf_type == "constant") { + if (ecmf_type == "constant") { # Area of each rectangular segment between ECMFs is the absolute difference # between the ECMFs at the lower limit of the segment * the width of the # segement @@ -313,7 +319,7 @@ area_between_dhist_ecmfs <- function(dhist_ecmf1, dhist_ecmf2) { ecm_diff_lower <- utils::head(ecm_diff, num_segs) segment_width <- abs(x_upper - x_lower) segment_areas <- ecm_diff_lower * segment_width - } else if(ecmf_type == "linear") { + } else if (ecmf_type == "linear") { # -------------------------------------------------------------- # Determine area between pairs of linear segments from each ECMF # -------------------------------------------------------------- @@ -324,8 +330,8 @@ area_between_dhist_ecmfs <- function(dhist_ecmf1, dhist_ecmf2) { # Determine if ECMFs intersect within each segment. The linear segments from # each ECMF will only intersect if the ordering of the y-components of their # start and end endpoints are different (i.e. the ECMF with the y-component - # at the start of the segment has the higher y-component at the end of the - # segment). An equivalent expression of this condition is that the signs + # at the start of the segment has the higher y-component at the end of the + # segment). 
An equivalent expression of this condition is that the signs # of the differences between the y-components of the two linear ECMF # segments will differ at the start (lower x-bound) and end (upper x-bound) # of a segment @@ -344,16 +350,20 @@ area_between_dhist_ecmfs <- function(dhist_ecmf1, dhist_ecmf2) { x_diff <- x_upper - x_lower segment_areas <- rep(NaN, num_segs) # Use bowtie area for bowties - segment_areas[bowtie] <- - segment_area_bowtie(x_diff = x_diff[bowtie], - y_diff_lower = y_diff_lower[bowtie], - y_diff_upper = y_diff_upper[bowtie]) + segment_areas[bowtie] <- + segment_area_bowtie( + x_diff = x_diff[bowtie], + y_diff_lower = y_diff_lower[bowtie], + y_diff_upper = y_diff_upper[bowtie] + ) # Use trapezium area for other shapes (trapeziums, triangles and zero-area # co-linear) - segment_areas[trapezium] <- - segment_area_trapezium(x_diff = x_diff[trapezium], - y_diff_lower = y_diff_lower[trapezium], - y_diff_upper = y_diff_upper[trapezium]) + segment_areas[trapezium] <- + segment_area_trapezium( + x_diff = x_diff[trapezium], + y_diff_lower = y_diff_lower[trapezium], + y_diff_upper = y_diff_upper[trapezium] + ) } else { stop("ECMF type not recognised") } @@ -373,19 +383,19 @@ segment_area_bowtie <- function(x_diff, y_diff_lower, y_diff_upper) { # opposite signs and are not both zero. # See issue #21 for verification that this approach is equivalent to the # previous approach when the above conditions hold. 
- segment_area <- 0.5 * x_diff * (y_diff_lower^2 + y_diff_upper^2) / + segment_area <- 0.5 * x_diff * (y_diff_lower^2 + y_diff_upper^2) / (abs(y_diff_lower) + abs(y_diff_upper)) } #' Area between two offset Empirical Cumulative Mass Functions (ECMFs) -#' -#' @param ecmf1 An Empirical Cululative Mass Function (ECMF) object of class +#' +#' @param ecmf1 An Empirical Cululative Mass Function (ECMF) object of class #' \code{dhist_ecmf} -#' @param ecmf2 An Empirical Cululative Mass Function (ECMF) object of class +#' @param ecmf2 An Empirical Cululative Mass Function (ECMF) object of class #' \code{dhist_ecmf} #' @param offset An offset to add to all locations of the first ECMF. Postive #' offsets will shift the ECMF to the right and negative ones to the left. -#' @return area The area between the two ECMFs, calculated as the integral of +#' @return area The area between the two ECMFs, calculated as the integral of #' the absolute difference between the two ECMFs area_between_offset_ecmfs <- function(ecmf1, ecmf2, offset) { # Construct ECMFs for each normalised histogram @@ -395,11 +405,11 @@ area_between_offset_ecmfs <- function(ecmf1, ecmf2, offset) { } #' Sort discrete histogram -#' -#' Sort a discrete histogram so that locations are in increasing (default) or +#' +#' Sort a discrete histogram so that locations are in increasing (default) or #' decreasing order #' @param dhist A discrete histogram as a \code{dhist} object -#' @param decreasing Logical indicating whether histograms should be sorted in +#' @param decreasing Logical indicating whether histograms should be sorted in #' increasing (default) or decreasing order of location #' @export sort_dhist <- function(dhist, decreasing = FALSE) { @@ -410,8 +420,8 @@ sort_dhist <- function(dhist, decreasing = FALSE) { } #' Shift discrete histogram -#' -#' Shift the locations of a discrete histogram rightwards on the x-axis by the +#' +#' Shift the locations of a discrete histogram rightwards on the x-axis by the #' 
specified amount #' @param dhist A discrete histogram as a \code{dhist} object #' @param shift The distance to add to all locations @@ -423,7 +433,7 @@ shift_dhist <- function(dhist, shift) { } #' Calculate mean location for a discrete histogram -#' +#' #' Calculates mean location for a discrete histogram by taking a weighted sum #' of each location weighted by the fraction of the total histogram mass at that #' location. @@ -431,25 +441,25 @@ shift_dhist <- function(dhist, shift) { #' @return The mass-weighted mean location #' @export dhist_mean_location <- function(dhist) { - sum((dhist$masses/ sum(dhist$masses)) * dhist$locations) + sum((dhist$masses / sum(dhist$masses)) * dhist$locations) } #' Calculate variance of a discrete histogram -#' +#' #' Calculates variance directly from the discrete histogram by using locations -#' weighted by masses. -#' NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses +#' weighted by masses. +#' NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses #' may not represent counts so N is not necessarily known #' @param dhist A discrete histogram as a \code{dhist} object #' @return Variance of histogram #' @export dhist_variance <- function(dhist) { mean_centred_locations <- dhist$locations - dhist_mean_location(dhist) - # Variance is E[X^2] - E[X]. However, for mean-centred data, E[X] is zero, + # Variance is E[X^2] - E[X]. However, for mean-centred data, E[X] is zero, # so variance is simply E[X^2]. Centring prior to squaring also helps avoid - # any potential integer overfloww issues (R uses a signed 32-bit integer + # any potential integer overfloww issues (R uses a signed 32-bit integer # representation, so cannot handle integers over ~2.1 billion) - if(dhist$smoothing_window_width == 0) { + if (dhist$smoothing_window_width == 0) { # For unsmoothed discrete histograms, the mass associated with each location # is located precisely at the lcoation. Therefore cariance (i.e. 
E[X^2]) # is the mass-weighted sum of the mean-centred locations @@ -459,22 +469,22 @@ dhist_variance <- function(dhist) { # uniformly across a bin centred on the location with width = smoothing_window_width # Variance (i.e. E[X^2]) is therefore the mass-weighted sum of the integrals # of x^2 over the mean-centred bins at each location. - hw = dhist$smoothing_window_width / 2 + hw <- dhist$smoothing_window_width / 2 bin_lowers <- mean_centred_locations - hw bin_uppers <- mean_centred_locations + hw # See comment in issue #21 on Github repository for verification that E[X^2] # is calculated as below for a uniform bin - bin_x2_integrals <- (bin_lowers^2 + bin_uppers^2 + bin_lowers*bin_uppers) / 3 + bin_x2_integrals <- (bin_lowers^2 + bin_uppers^2 + bin_lowers * bin_uppers) / 3 variance <- sum(dhist$masses * bin_x2_integrals) / sum(dhist$masses) } return(variance) } #' Calculate standard deviation of a discrete histogram -#' -#' Calculates standard deviation directly from the discrete histogram by using +#' +#' Calculates standard deviation directly from the discrete histogram by using #' locations weighted by masses. -#' NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses +#' NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses #' may not represent counts so N is not necessarily known #' @param dhist A discrete histogram as a \code{dhist} object #' @return Standard deviation of histogram @@ -484,21 +494,21 @@ dhist_std <- function(dhist) { } #' Centre a discrete histogram around its mean location -#' -#' Centres a discrete histogram around its mass-weighted mean location by +#' +#' Centres a discrete histogram around its mass-weighted mean location by #' subtracting the mass-weighted mean from each location. 
#' @param dhist A discrete histogram as a \code{dhist} object #' @return The mass-weighted mean location #' @export mean_centre_dhist <- function(dhist) { centred_locations <- dhist$locations - dhist_mean_location(dhist) - dhist <- update_dhist(dhist,locations = centred_locations) + dhist <- update_dhist(dhist, locations = centred_locations) return(dhist) } #' Normalise a discrete histogram to unit mass -#' -#' Normalises a discrete histogram to unit mass by dividing each mass by the +#' +#' Normalises a discrete histogram to unit mass by dividing each mass by the #' total of the non-normalised masses #' @param dhist A discrete histogram as a \code{dhist} object #' @return A discrete histogram normalised to have mass 1 @@ -511,18 +521,18 @@ normalise_dhist_mass <- function(dhist) { } #' Normalise a discrete histogram to unit variance -#' +#' #' Normalises a discrete histogram to unit variance by dividing each centred -#' location by the standard deviation of the discrete histogram before +#' location by the standard deviation of the discrete histogram before #' decentering #' @param dhist A discrete histogram as a \code{dhist} object #' @return A discrete histogram normalised to have variance 1 #' @export normalise_dhist_variance <- function(dhist) { - # Special case for histograms with only one location and no smoothing. + # Special case for histograms with only one location and no smoothing. # Variance is zero / undefined so normalisation fails. 
Just return bin centres # unchanged - if(length(dhist$locations) == 1 && dhist$smoothing_window_width == 0) { + if (length(dhist$locations) == 1 && dhist$smoothing_window_width == 0) { dhist <- dhist } else { # Centre locations on mean, divide centred locations by standard deviation @@ -534,7 +544,7 @@ normalise_dhist_variance <- function(dhist) { dhist <- update_dhist(dhist, locations = normalised_locations) # If smoothing_window_width not zero, then update it to reflect the variance # normalisation - if(dhist$smoothing_window_width != 0) { + if (dhist$smoothing_window_width != 0) { normalised_smoothing_window_width <- dhist$smoothing_window_width / std_dev dhist <- update_dhist(dhist, smoothing_window_width = normalised_smoothing_window_width) } @@ -543,9 +553,9 @@ normalise_dhist_variance <- function(dhist) { } #' Harmonise a pair of discrete histograms to share a common set of locations -#' +#' #' Where a location only exists in one histogram, add this location to the other -#' histogram with zero mass. This ensures that all location exist in both +#' histogram with zero mass. This ensures that all location exist in both #' histograms. #' @param dhist1 A discrete histogram as a \code{dhist} object #' @param dhist2 A discrete histogram as a \code{dhist} object @@ -562,7 +572,7 @@ harmonise_dhist_locations <- function(dhist1, dhist2) { masses1 <- c(dhist1$masses, rep(0, length(missing_locations1))) masses2 <- c(dhist2$masses, rep(0, length(missing_locations2))) # Construct a new histogram using the dhist constructor to ensure that the - # harmonised histograms have the same properties as if they had been + # harmonised histograms have the same properties as if they had been # constructed with the additional bins in the first place # (e.g. 
sorted by location) dhist1 <- update_dhist(dhist1, locations = locations1, masses = masses1) @@ -571,7 +581,7 @@ harmonise_dhist_locations <- function(dhist1, dhist2) { } #' Check if 1D numeric vector -#' +#' #' Check if a variable is a 1D numeric vector by checking that: #' \itemize{ #' \item \code{is_numeric(input)}: Input is vector, matrix, array or list of numbers diff --git a/R/emd.R b/R/emd.R index e9eee6f3..6ea1c26f 100644 --- a/R/emd.R +++ b/R/emd.R @@ -1,29 +1,29 @@ -#' Minimum Earth Mover's Distance (EMD) -#' -#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete +#' Minimum Earth Mover's Distance (EMD) +#' +#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete #' histograms. This is the minimum EMD between the two histograms across all #' possible offsets of histogram 1 against histogram 2. #' @param dhist1 A \code{dhist} discrete histogram object #' @param dhist2 A \code{dhist} discrete histogram object -#' @param method The method to use to find the minimum EMD across all potential +#' @param method The method to use to find the minimum EMD across all potential #' offsets for each pair of histograms. Default is "optimise" to use -#' R's built-in \code{stats::optimise} method to efficiently find the offset -#' with the minimal EMD. However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' R's built-in \code{stats::optimise} method to efficiently find the offset +#' with the minimal EMD. However, this is not guaranteed to find the global +#' minimum if multiple local minima EMDs exist. You can alternatively specify the +#' "exhaustive" method, which will exhaustively evaluate the EMD between the #' histograms at all offsets that are candidates for the minimal EMD. 
#' @return Earth Mover's Distance between the two discrete histograms #' @export min_emd <- function(dhist1, dhist2, method = "optimise") { - # Require input to be a pair of "dhist" discrete histograms - if(!(is_dhist(dhist1) && is_dhist(dhist2))) { + # Require input to be a pair of "dhist" discrete histograms + if (!(is_dhist(dhist1) && is_dhist(dhist2))) { stop("All inputs must be 'dhist' discrete histogram objects") } - if(method == "optimise") { + if (method == "optimise") { return(min_emd_optimise_fast(dhist1, dhist2)) - } else if(method == "optimiseRonly"){ + } else if (method == "optimiseRonly") { return(min_emd_optimise(dhist1, dhist2)) - } else if(method == "exhaustive"){ + } else if (method == "exhaustive") { return(min_emd_exhaustive(dhist1, dhist2)) } else { stop("Method not recognised. Must be 'exhaustive' or ' optimise'") @@ -33,9 +33,9 @@ min_emd <- function(dhist1, dhist2, method = "optimise") { #' Minimum Earth Mover's Distance (EMD) using fast optimiser search -#' -#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete -#' histograms by minimising the offset parameter of the \code{emd} function +#' +#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete +#' histograms by minimising the offset parameter of the \code{emd} function #' using the built-in \code{stats::optimise} method. #' @param dhist1 A \code{dhist} discrete histogram object #' @param dhist2 A \code{dhist} discrete histogram object @@ -43,15 +43,14 @@ min_emd <- function(dhist1, dhist2, method = "optimise") { #' @export min_emd_optimise_fast <- function(dhist1, dhist2) { # Can we run the C++ fast implementation (only works with no smoothing)? 
- if ((dhist1$smoothing_window_width==0) && (dhist2$smoothing_window_width==0)) - { + if ((dhist1$smoothing_window_width == 0) && (dhist2$smoothing_window_width == 0)) { # Determine minimum and maximum offset of range in which histograms overlap # (based on sliding histogram 1) min_offset <- min(dhist2$locations) - max(dhist1$locations) max_offset <- max(dhist2$locations) - min(dhist1$locations) # Set lower and upper range for optimise algorithm to be somewhat wider than # range defined by the minimum and maximum offset. This guards against a - # couple of issues that arise if the optimise range is exactly min_offset + # couple of issues that arise if the optimise range is exactly min_offset # to max_offset # 1) If lower and upper are equal, the optimise method will throw an error # 2) It seems that optimise is not guaranteed to explore its lower and upper @@ -63,22 +62,24 @@ min_emd_optimise_fast <- function(dhist1, dhist2) { # Define a single parameter function to minimise emd as a function of offset val1 <- cumsum(dhist1$masses) val2 <- cumsum(dhist2$masses) - val1 <- val1/val1[length(val1)] - val2 <- val2/val2[length(val2)] - loc1=dhist1$locations - loc2=dhist2$locations - count=0 + val1 <- val1 / val1[length(val1)] + val2 <- val2 / val2[length(val2)] + loc1 <- dhist1$locations + loc2 <- dhist2$locations + count <- 0 emd_offset <- function(offset) { - temp1<- emd_fast_no_smoothing(loc1+offset,val1,loc2,val2) + temp1 <- emd_fast_no_smoothing(loc1 + offset, val1, loc2, val2) temp1 } # Get solution from optimiser - soln <- stats::optimise(emd_offset, lower = min_offset, upper = max_offset, - tol = .Machine$double.eps*1000) - # Return mnimum EMD and associated offset - min_emd <- soln$objective - min_offset <- soln$minimum - return(list(min_emd = min_emd, min_offset = min_offset)) + soln <- stats::optimise(emd_offset, + lower = min_offset, upper = max_offset, + tol = .Machine$double.eps * 1000 + ) + # Return mnimum EMD and associated offset + min_emd <- 
soln$objective + min_offset <- soln$minimum + return(list(min_emd = min_emd, min_offset = min_offset)) } else #if ((dhist1$smoothing_window_width==1) && (dhist2$smoothing_window_width==1)) { @@ -129,9 +130,9 @@ min_emd_optimise_fast <- function(dhist1, dhist2) { #' Minimum Earth Mover's Distance (EMD) using optimiser search -#' -#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete -#' histograms by minimising the offset parameter of the \code{emd} function +#' +#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete +#' histograms by minimising the offset parameter of the \code{emd} function #' using the built-in \code{stats::optimise} method. #' @param dhist1 A \code{dhist} discrete histogram object #' @param dhist2 A \code{dhist} discrete histogram object @@ -142,10 +143,10 @@ min_emd_optimise <- function(dhist1, dhist2) { # (based on sliding histogram 1) min_offset <- min(dhist2$locations) - max(dhist1$locations) max_offset <- max(dhist2$locations) - min(dhist1$locations) - + # Set lower and upper range for optimise algorithm to be somewhat wider than # range defined by the minimum and maximum offset. 
This guards against a - # couple of issues that arise if the optimise range is exactly min_offset + # couple of issues that arise if the optimise range is exactly min_offset # to max_offset # 1) If lower and upper are equal, the optimise method will throw and error # 2) It seems that optimise is not guaranteed to explore its lower and upper @@ -154,7 +155,7 @@ min_emd_optimise <- function(dhist1, dhist2) { buffer <- 0.1 min_offset <- min_offset - buffer max_offset <- max_offset + buffer - + # Define a single parameter function to minimise emd as a function of offset emd_offset <- function(offset) { # Construct ECMFs for each normalised histogram @@ -162,12 +163,13 @@ min_emd_optimise <- function(dhist1, dhist2) { ecmf2 <- dhist_ecmf(dhist2) area_between_dhist_ecmfs(ecmf1, ecmf2) } - + # Get solution from optimiser - soln <- stats::optimise(emd_offset, lower = min_offset, upper = max_offset, - tol = .Machine$double.eps*1000) - - browser() + soln <- stats::optimise(emd_offset, + lower = min_offset, upper = max_offset, + tol = .Machine$double.eps * 1000 + ) + # Return mnimum EMD and associated offset min_emd <- soln$objective min_offset <- soln$minimum @@ -175,20 +177,20 @@ min_emd_optimise <- function(dhist1, dhist2) { } #' Minimum Earth Mover's Distance (EMD) using exhaustive search -#' -#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete +#' +#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete #' histograms using an exhaustive search. 
-#' -#' When "sliding" two piecewise-linear empirical cumulative mass functions -#' (ECMFs) across each other to minimise the EMD between them, it is sufficient -#' to calculate the EMD at all offsets where any knots from the two ECMFs align +#' +#' When "sliding" two piecewise-linear empirical cumulative mass functions +#' (ECMFs) across each other to minimise the EMD between them, it is sufficient +#' to calculate the EMD at all offsets where any knots from the two ECMFs align #' to ensure that the offset with the global minimum EMD is found. #' -#'This is because of the piece-wise linear nature of the two ECMFs. Between any -#'two offsets where knots from the two ECMFs align, EMD will be either constant, -#'or uniformly increasing or decreasing. Therefore, there the EMD between two -#'sets of aligned knots cannot be smaller than the EMD at one or other of the -#'bounding offsets. +#' This is because of the piece-wise linear nature of the two ECMFs. Between any +#' two offsets where knots from the two ECMFs align, EMD will be either constant, +#' or uniformly increasing or decreasing. Therefore, the EMD between two +#' sets of aligned knots cannot be smaller than the EMD at one or other of the +#' bounding offsets. 
#' @param dhist1 A \code{dhist} discrete histogram object #' @param dhist2 A \code{dhist} discrete histogram object #' @return Earth Mover's Distance between the two discrete histograms @@ -203,17 +205,18 @@ min_emd_exhaustive <- function(dhist1, dhist2) { cur_offset <- 0 # 0 so that adding first step shift gives initial offset # Set state variables distance_matrix <- NULL - while(step_shift < Inf) { + while (step_shift < Inf) { dhist1 <- shift_dhist(dhist1, step_shift) cur_offset <- cur_offset + step_shift cur_emd <- emd(dhist1, dhist2) - if(cur_emd < min_emd) { + if (cur_emd < min_emd) { min_emd <- cur_emd min_offset <- cur_offset } - res <- shift_to_next_alignment(dhist1$locations, dhist2$locations, - distance_matrix_prev = distance_matrix, - shift_prev = step_shift) + res <- shift_to_next_alignment(dhist1$locations, dhist2$locations, + distance_matrix_prev = distance_matrix, + shift_prev = step_shift + ) step_shift <- res$shift distance_matrix <- res$distance_matrix } @@ -221,31 +224,31 @@ min_emd_exhaustive <- function(dhist1, dhist2) { } #' Minimum shift to next alignment of two location vectors -#' -#' Calculate minimum right shift of first location vector to make any pair of +#' +#' Calculate minimum right shift of first location vector to make any pair of #' locations from the two vectors equal #' @param x1 First location vector. This vector is being shifted rightwards #' @param x2 Second location vector. This vector is remaining unchanged. 
-#' @return Minimum non-zero right-shift to apply to x1 to align at least one +#' @return Minimum non-zero right-shift to apply to x1 to align at least one #' element of x1 with at least one element of x2 -shift_to_next_alignment <- function(x1, x2, distance_matrix_prev = NULL, +shift_to_next_alignment <- function(x1, x2, distance_matrix_prev = NULL, shift_prev = NULL) { - if(!is.null(distance_matrix_prev) && !is.null(shift_prev)) { + if (!is.null(distance_matrix_prev) && !is.null(shift_prev)) { # If both distance matrix and shift from previous step provided, use these # to more efficiently calculate distance matrix distance_matrix <- (distance_matrix_prev - shift_prev) } else { - # Otherwise calculate distance matrix from scratch by calculating the + # Otherwise calculate distance matrix from scratch by calculating the # distance from each x1 to each x2 # NOTE: outer() generates a matrix with the first vector mapped to rows and - # the second vector mapped to columns, so the rows will be x2 and the + # the second vector mapped to columns, so the rows will be x2 and the # columns x1 distance_matrix <- outer(x2, x1, "-") - } + } # Calculate the distance from each x1 to each x2 - # outer() generates a matrix with the first vector mapped to rows and the + # outer() generates a matrix with the first vector mapped to rows and the # second vector mapped to columns - # We're stepping x1 from left to right across x2, so drop all negative + # We're stepping x1 from left to right across x2, so drop all negative # distances. 
Also drop zero distances as we want to step to the next alingment # even when x1 and x2 are already aligned distance_matrix[distance_matrix <= 0] <- Inf @@ -254,8 +257,8 @@ shift_to_next_alignment <- function(x1, x2, distance_matrix_prev = NULL, return(list(shift = min(distance_matrix), distance_matrix = distance_matrix)) } -#' Earth Mover's Distance (EMD) -#' +#' Earth Mover's Distance (EMD) +#' #' Calculates the Earth Mover's Distance (EMD) between two discrete histograms #' @param dhist1 A \code{dhist} discrete histogram object #' @param dhist2 A \code{dhist} discrete histogram object @@ -263,20 +266,20 @@ shift_to_next_alignment <- function(x1, x2, distance_matrix_prev = NULL, #' @export emd <- function(dhist1, dhist2) { # Require the inputs to be "dhist" objects - if(!(is_dhist(dhist1) && is_dhist(dhist2))) { + if (!(is_dhist(dhist1) && is_dhist(dhist2))) { stop("All inputs must be 'dhist' discrete histogram objects") } - # Use efficient difference of cumulative histogram method that can also + # Use efficient difference of cumulative histogram method that can also # handle non-integer bin masses and location differences emd_cs(dhist1, dhist2) } #' Earth Mover's Distance (EMD) using the difference of cumulative sums method -#' +#' #' Takes two discrete histograms and calculates the Wasserstein / Earth Mover's -#' Distance between the two histograms by summing the absolute difference +#' Distance between the two histograms by summing the absolute difference #' between the two cumulative histograms. -#' @references +#' @references #' Calculation of the Wasserstein Distance Between Probability Distributions on the Line #' S. S. 
Vallender, Theory of Probability & Its Applications 1974 18:4, 784-786 #' \url{http://dx.doi.org/10.1137/1118101} @@ -294,11 +297,11 @@ emd_cs <- function(dhist1, dhist2) { } #' Earth Mover's Distance (EMD) using linear programming (LP) -#' -#' Takes two sets of histogram bin masses and bin centres and calculates the +#' +#' Takes two sets of histogram bin masses and bin centres and calculates the #' Earth Mover's Distance between the two histograms by solving the Transport #' Problem using linear programming. -#' +#' #' WARNING: Linear Programming solution will only give a correct answer if all #' masses and distances between bin centres are integers. #' @param bin_masses1 Bin masses for histogram 1 @@ -310,49 +313,51 @@ emd_cs <- function(dhist1, dhist2) { emd_lp <- function(bin_masses1, bin_masses2, bin_centres1, bin_centres2) { num_bins1 <- length(bin_masses1) num_bins2 <- length(bin_masses2) - + # Check inputs: All bins in each histogram must have a mass and centre, so # the bin_mass and bin_centre vectors for each histogram must have the same # length. 
- if(length(bin_centres1) != num_bins1) { + if (length(bin_centres1) != num_bins1) { stop("Number of bin masses and bin centres provided for histogram 1 must be equal") } - if(length(bin_centres2) != num_bins2) { + if (length(bin_centres2) != num_bins2) { stop("Number of bin masses and bin centres provided for histogram 2 must be equal") } - + # Generate cost matrix cost_mat <- cost_matrix(bin_centres1, bin_centres2) - - # Linear Programming solver requires all bin masses and transportation costs + + # Linear Programming solver requires all bin masses and transportation costs # to be integers to generate correct answer - if(!isTRUE(all.equal(bin_masses1, floor(bin_masses1)))) { + if (!isTRUE(all.equal(bin_masses1, floor(bin_masses1)))) { stop("All bin masses for histogram 1 must be integers for accurate Linear Programming solution") } - if(!isTRUE(all.equal(bin_masses2, floor(bin_masses2)))) { + if (!isTRUE(all.equal(bin_masses2, floor(bin_masses2)))) { stop("All bin masses for histogram 2 must be integers for accurate Linear Programming solution") } - if(!isTRUE(all.equal(cost_mat, floor(cost_mat)))) { + if (!isTRUE(all.equal(cost_mat, floor(cost_mat)))) { stop("All costs must be integers for accurate Linear Programming solution") - } + } row_signs <- rep("==", num_bins1) col_signs <- rep("<=", num_bins2) - s <- lpSolve::lp.transport(cost.mat = cost_mat, row.signs = row_signs, - col.signs = col_signs, row.rhs = bin_masses1, - col.rhs = bin_masses2) + s <- lpSolve::lp.transport( + cost.mat = cost_mat, row.signs = row_signs, + col.signs = col_signs, row.rhs = bin_masses1, + col.rhs = bin_masses2 + ) return(s$objval) } #' Inter-bin cost matrix from bin centres -#' -#' Generates a matrix for the cost of moving a unit of mass between each bin in +#' +#' Generates a matrix for the cost of moving a unit of mass between each bin in #' histogram 1 and each bin in histogram 2. 
#' @param bin_centres1 Bin centres for histogram 1 #' @param bin_centres2 Bin centres for histogram 2 #' @return Cost matrix cost_matrix <- function(bin_centres1, bin_centres2) { # Calculate distances between all bins in network 1 and all bins in network 2 - num_bins1 <- length(bin_centres1) + num_bins1 <- length(bin_centres1) num_bins2 <- length(bin_centres2) loc_mat1 <- matrix(bin_centres1, nrow = num_bins1, ncol = num_bins2, byrow = FALSE) loc_mat2 <- matrix(bin_centres2, nrow = num_bins1, ncol = num_bins2, byrow = TRUE) diff --git a/R/graph_binning.R b/R/graph_binning.R index 47a58bcd..3f8aa21b 100644 --- a/R/graph_binning.R +++ b/R/graph_binning.R @@ -1,97 +1,113 @@ -#' INTERNAL FUNCTION - Do not call directly -#' -#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to -#' generate a function for calculating expected ego-network graphlet counts -#' from the statistics of a provided reference graph. -#' Temporarily accessible during development. +#' binned_densities_adaptive +#' +#' Adaptive binning function guaranteeing a minimum number of entries in each +#' bin. +#' @param densities Density values to use for binning. +#' @param min_counts_per_interval Minimum count for each bin. +#' @param num_intervals Initial number of density bins to generate. 
#' TODO: Remove @export prior to publishing #' @export -binned_densities_adaptive <- function(densities, min_counts_per_interval, num_intervals) -{ - breaks <- adaptive_breaks(densities, min_count = min_counts_per_interval, - breaks = num_intervals) - interval_indexes <- interval_index(densities, breaks = breaks, - out_of_range_intervals = FALSE) - list(densities = densities, interval_indexes = interval_indexes, breaks = breaks) +binned_densities_adaptive <- function(densities, + min_counts_per_interval, + num_intervals) { + if (length(densities) < min_counts_per_interval) min_counts_per_interval <- length(densities) + breaks <- adaptive_breaks(densities, + min_count = min_counts_per_interval, + breaks = num_intervals + ) + interval_indexes <- interval_index(densities, + breaks = breaks, + out_of_range_intervals = FALSE + ) + list( + densities = densities, + interval_indexes = interval_indexes, + breaks = breaks + ) } #' Bin values into intervals based on the provided breaks -#' +#' #' @param x The values to be binned #' @param breaks The boundaries between bins -#' @param out_of_range_intervals If \code{TRUE}, "out of range" values lying -#' below the first break or above the last break will be assigned to additional -#' unbounded lower and upper extrema intervals. If \code{FALSE} these "out of -#' range" values will be assigned to intervals bounded by the lowest or +#' @param out_of_range_intervals If \code{TRUE}, "out of range" values lying +#' below the first break or above the last break will be assigned to additional +#' unbounded lower and upper extrema intervals. If \code{FALSE} these "out of +#' range" values will be assigned to intervals bounded by the lowest or #' uppermost break respectively. #' @return A vector of bin indexes, one per value provided #' @export interval_index <- function(x, breaks, out_of_range_intervals = FALSE) { - # Get indexes for the intervals each value falls into. 
Setting - # all.inside = TRUE ensures that the minimum and maximum values will be + # Get indexes for the intervals each value falls into. Setting + # all.inside = TRUE ensures that the minimum and maximum values will be # assigned to the intervals they bound. findInterval(x, breaks, all.inside = !out_of_range_intervals) } #' Generate a set of breaks that attempt to be evenly spaced while ensuring each #' interval has the specified minimum count -#' +#' #' Starts by binning the variable by the breaks provided in \code{breaks} (if #' \code{breaks} is a vector), or generating a set of \code{breaks} at uniformly -#' spaced intervals (if \code{breaks} is a single number). It then iteratively -#' merges intervals with counts lower than \code{min_count} by removing breaks +#' spaced intervals (if \code{breaks} is a single number). It then iteratively +#' merges intervals with counts lower than \code{min_count} by removing breaks #' until all remaining intervals have counts of at least \code{min_count}. -#' +#' #' @param x The variable to be binned #' @param min_count The minimum count for each bin #' @param breaks Either a vector containing an intital set of breaks or a single -#' number indicating how many uniformly spaced intervals to use when constructing -#' the initial set of breaks. If a single number is provided, the minumum break -#' will be the minimum value of x and the maximum break will be the maximum -#' value of x. +#' number indicating how many uniformly spaced intervals to use when +#' constructing the initial set of breaks. If a single number is provided, the +#' minumum break will be the minimum value of x and the maximum break will be +#' the maximum value of x. 
#' #' @export adaptive_breaks <- function(x, min_count, breaks) { - if(length(breaks) == 1) { + if (length(breaks) == 1) { # Similarly to base::cut, we interpret a single number in breaks as the # number of intervals required and generate these evenly spaced min_x <- min(x) max_x <- max(x) - breaks = seq(from = min_x, to = max_x, length.out = breaks + 1) + breaks <- seq(from = min_x, to = max_x, length.out = breaks + 1) } # There is one less interval than there are breaks num_intervals <- length(breaks) - 1 # Get indexes for the intervals each value of x falls into. x_interval_indexes <- interval_index(x, breaks) # Find the lowest interval with fewer than the minimum required count. - # Not all intervals are guaranteed to have members in x. If they don't, they - # won't appear in x_interval_indexes. We therefore append the full list of - # indexes prior to counting and subtract 1 from all counts afterwards to get + # Not all intervals are guaranteed to have members in x. If they don't, they + # won't appear in x_interval_indexes. 
We therefore append the full list of + # indexes prior to counting and subtract 1 from all counts afterwards to get # an accurate count that includes indexes with no members with zero counts all_interval_indexes <- 1:num_intervals - interval_index_counts <- plyr::count(c(x_interval_indexes, all_interval_indexes)) + interval_index_counts <- plyr::count( + c(x_interval_indexes, all_interval_indexes) + ) interval_index_counts$freq <- interval_index_counts$freq - 1 - + # Find the first interval with fewer members than the minimum specified count - merge_position <- Position(function(i) i < min_count, interval_index_counts$freq) - # Not all intervals are guaranteed to have members, so convert the index + merge_position <- Position( + function(i) i < min_count, + interval_index_counts$freq + ) + # Not all intervals are guaranteed to have members, so convert the index # provided by Position into an index into the full interval list and then add merge_interval_index <- interval_index_counts$x[merge_position] - if(is.na(merge_interval_index)) { + if (is.na(merge_interval_index)) { # If all intervals have at least the minimum count, return the breaks return(breaks) } else { # Remove a break to merge the low count interval with one of its neighbours # and recursively call this function if (merge_interval_index == num_intervals) { - # If low interval is last one, we can only merge with the previous interval - # so remove lower break for low interval + # If low interval is last one, we can only merge with the previous + # interval so remove lower break for low interval merge_break_index <- merge_interval_index } else { - # In all other cases merge low interval with next inteval by removing + # In all other cases merge low interval with next inteval by removing # upper breal for low interval merge_break_index <- merge_interval_index + 1 } return(adaptive_breaks(x, min_count, breaks[-merge_break_index])) } -} \ No newline at end of file +} diff --git a/R/measures_net_dis.R 
b/R/measures_net_dis.R index a4825638..d1120d03 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -1,343 +1,1210 @@ -#' Netdis between all graph pairs using provided Centred Graphlet Counts -#' @param centred_graphlet_counts List containing Centred Graphlet Counts for -#' all graphs being compared -#' @param graphlet_size The size of graphlets to use for the Netdis calculation -#' (only counts for graphlets of the specified size will be used). The size of -#' a graphlet is the number of nodes it contains. -#' @return Pairwise Netdis statistics between graphs calculated using centred -#' counts for graphlets of the specified size +#' Netdis between two graphs +#' +#' Calculates the different variants of the network dissimilarity statistic Netdis between two graphs. The variants currently supported are Netdis using a gold-standard network, Netdis using no expectations (\code{ref_graph = 0}), and Netdis using a Geometric Poisson approximation for the expectation (\code{ref_graph = NULL}). +#' +#' +#' @param graph_1 A simple graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{graphlet_counts_1} is provided. If both \code{graph_1} and \code{graphlet_counts_1} are not \code{NULL}, then only \code{graphlet_counts_1} will be considered. +#' +#' @param graph_2 A simple graph object from the \code{igraph} package. \code{graph_2} can be set to \code{NULL} (default) if \code{graphlet_counts_2} is provided. If both \code{graph_2} and \code{graphlet_counts_2} are not \code{NULL}, then only \code{graphlet_counts_2} will be considered. +#' +#' @param graphlet_counts_1 Pre-generated graphlet counts for the first query +#' graph. Matrix containing counts of each graphlet (columns) for +#' each ego-network (rows) in the input graph. Columns are labelled with +#' graphlet IDs and rows are labelled with the ID of the central node in each +#' ego-network.
As well as graphlet counts, each matrix must contain an +#' additional column labelled "N" including the node count for +#' each ego network. (default: NULL). +#' If the \code{graphlet_counts_1} argument is defined then +#' \code{graph_1} will not be used. These counts can be obtained with \code{count_graphlets_ego}. +#' +#' +#' @param graphlet_counts_2 Pre-generated graphlet counts for the second query +#' graph. Matrix containing counts of each graphlet (columns) for +#' each ego-network (rows) in the input graph. Columns are labelled with +#' graphlet IDs and rows are labelled with the ID of the central node in each +#' ego-network. As well as graphlet counts, each matrix must contain an +#' additional column labelled "N" including the node count for +#' each ego network. (default: NULL). +#' If the \code{graphlet_counts_2} argument is defined then +#' \code{graph_2} will not be used. These counts can be obtained with \code{count_graphlets_ego}. +#' +#' @param ref_graph Controls how expected counts are calculated. Either: +#' 1) A numeric value - used as a constant expected counts value for all query +#' graphs . +#' 2) A simplified \code{igraph} object - used as a reference graph from which +#' expected counts are calculated for all query graphs. +#' 3) NULL (Default) - Used for Netdis-GP, where the expected counts will be calculated based on the properties of the +#' query graphs themselves. (Geometric-Poisson approximation). +#' +#' @param graphlet_counts_ref Pre-generated reference graphlet counts. +#' Matrix containing counts of each graphlet (columns) for +#' each ego-network (rows) in the reference graph. Columns are labelled with +#' graphlet IDs and rows are labelled with the ID of the central node in each +#' ego-network. As well as graphlet counts, each matrix must contain an +#' additional column labelled "N" including the node count for +#' each ego network. (default: NULL). 
+#' If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not +#' be used. +#' +#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 (default) and 5 are supported. +#' +#' @param neighbourhood_size Ego network neighborhood size (default: 2). +#' +#' @param min_ego_nodes Filter ego networks which have fewer +#' than min_ego_nodes nodes (default: 3). +#' +#' @param min_ego_edges Filter ego networks which have fewer +#' than min_ego_edges edges (default: 1). +#' +#' @param binning_fn Function used to bin ego network densities. Takes edge \code{densities} +#' as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. +#' ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with +#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used +#' (Default: NULL). +#' +#' @param bin_counts_fn Function used to calculate expected graphlet counts in +#' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +#' (bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), it will apply +#' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the +#' values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' +#' @param exp_counts_fn Function used to map from binned reference counts to +#' expected counts for each graphlet in each ego network of the query graphs. +#' Takes \code{ego_networks}, \code{density_bin_breaks}, +#' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. 
+#' If \code{exp_counts_fn} is \code{NULL}, (default), it will apply +#' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the +#' values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' +#' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes +#' up to and including max_graphlet_size. +#' +#' @examples +#' require(netdist) +#' require(igraph) +#' #Set source directory for Virus PPI graph edge files stored in the netdist package. +#' source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +#' # Load query graphs as igraph objects +#' graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") +#' graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") +#' +#' #Netdis variant using the Geometric Poisson approximation to remove the background expectation of each network. +#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) #This option will focus on detecting more general and global discrepancies between the ego-network structures. +#' +#' #Comparing the networks via their observed ego counts without centering them (equivalent to using expectation equal to zero). This option, will focus on detecting small discrepancies. +#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) +#' +#' # Example of the use of netdis with a reference graph.This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard. +#' # Two lattice networks of different sizes are used for this example. 
+#' goldstd_1 <- graph.lattice(c(8,8)) #A reference net +#' goldstd_2 <- graph.lattice(c(44,44)) #A reference net +#' +#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) +#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) +#' +#' +#' #Providing pre-calculated subgraph counts. +#' +#' props_1 <- count_graphlets_ego(graph = graph_1) +#' props_2 <- count_graphlets_ego(graph = graph_2) +#' props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) +#' props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) +#' +#' #Netdis Geometric-Poisson. +#' netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = NULL) +#' +#' #Netdis Zero Expectation. +#' netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = 0) +#' +#' #Netdis using gold-standard network +#' netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_1) +#' netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_2) +#' @export +netdis_one_to_one <- function(graph_1 = NULL, + graph_2 = NULL, + ref_graph = 0, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, + graphlet_counts_1 = NULL, + graphlet_counts_2 = NULL, + graphlet_counts_ref= NULL) { + + ## ------------------------------------------------------------------------ + # Check arguments + if (is.null(graph_1) && is.null(graphlet_counts_1)) { + stop("One of graph_1 and graphlet_counts_1 must be supplied.") + } + if (is.null(graph_2) && is.null(graphlet_counts_2)) { + stop("One of graph_2 and graphlet_counts_2 must be supplied.") + } + ## ------------------------------------------------------------------------ + # Generate graphlet counts and bundle them into named list with format needed + # for netdis_many_to_many. 
+ + if (is.null(graphlet_counts_1)) { + graphlet_counts_1 <- count_graphlets_ego( + graph_1, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + } + rm(graph_1) + + if (is.null(graphlet_counts_2)) { + graphlet_counts_2 <- count_graphlets_ego( + graph_2, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + } + rm(graph_2) + + graphlet_counts <- list( + graph_1 = graphlet_counts_1, + graph_2 = graphlet_counts_2 + ) + + if(!is.null(ref_graph)){ + if (!is.numeric(ref_graph) && is.null(graphlet_counts_ref)) { + graphlet_counts_ref <- count_graphlets_ego( + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + ref_graph <- NULL + } + } + ## ------------------------------------------------------------------------ + # calculate netdis + result <- netdis_many_to_many( + graphs = NULL, + ref_graph = ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn, + graphlet_counts = graphlet_counts, + graphlet_counts_ref = graphlet_counts_ref + ) + + ## ------------------------------------------------------------------------ + # extract netdis statistics from list returned by netdis_many_to_many + result$netdis[, 1] +} + + +#' Netdis comparisons between one graph and many other graphs. +#' +#' @param graph_1 Query graph - this graph will be compared with +#' all graphs in graphs_compare. A simplified igraph graph object. +#' +#' @param graphs_compare Graphs graph_1 will be compared with. 
A named list of +#' simplified igraph graph objects. +#' +#' @param ref_graph Controls how expected counts are calculated. Either: +#' 1) A numeric value - used as a constant expected counts value for all query +#' graphs (DEFAULT: 0). +#' 2) A simplified \code{igraph} object - used as a reference graph from which +#' expected counts are calculated for all query graphs. +#' 3) NULL - Expected counts will be calculated based on the properties of the +#' query graphs themselves. +#' +#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 and 5 are supported. +#' +#' @param neighbourhood_size Ego network neighbourhood size. +#' +#' @param min_ego_nodes Filter ego networks which have fewer +#' than min_ego_nodes nodes. +#' +#' @param min_ego_edges Filter ego networks which have fewer +#' than min_ego_edges edges. +#' +#' @param binning_fn Function used to bin ego network densities. Takes edge \code{densities} +#' as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. +#' ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with +#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used +#' (Default: NULL). +#' +#' @param bin_counts_fn Function used to calculate expected graphlet counts in +#' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +#' (bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), +#' it will apply either the approach from the original Netdis paper, or the respective Geometric-Poisson +#' approximation; depending on the values of \code{ref_graph} and \code{graphlet_counts_ref}. 
+#' +#' @param exp_counts_fn Function used to map from binned reference counts to +#' expected counts for each graphlet in each ego network of the query graphs. +#' Takes \code{ego_networks}, \code{density_bin_breaks}, +#' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. +#' If \code{exp_counts_fn} is \code{NULL}, (default), it will apply +#' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the +#' values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' +#' +#' @param graphlet_counts_1 Pre-generated graphlet counts for the first query +#' graph. If the \code{graphlet_counts_1} argument is defined then +#' \code{graph_1} will not be used. +#' +#' @param graphlet_counts_compare Named list of pre-generated graphlet counts +#' for the remaining query graphs. If the \code{graphlet_counts_compare} +#' argument is defined then \code{graphs_compare} will not be used. +#' +#' @param graphlet_counts_ref Pre-generated reference graphlet counts. If the +#' \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not +#' be used. +#' +#' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes +#' up to and including max_graphlet_size #' @export -netdis_for_all_graphs <- function( - centred_graphlet_counts, graphlet_size, mc.cores = getOption("mc.cores", 2L)) { - comp_spec <- cross_comparison_spec(centred_graphlet_counts) - # NOTE: mcapply only works on unix-like systems with system level forking - # capability. This means it will work on Linux and OSX, but not Windows. 
- # For now, we just revert to single threaded operation on Windows - # TODO: Look into using the parLappy function on Windows - if(.Platform$OS.type != "unix") { - # Force cores to 1 if system is not unix-like as it will not support - # forking - mc.cores = 1 +netdis_one_to_many <- function(graph_1 = NULL, + graphs_compare = NULL, + ref_graph = 0, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, + graphlet_counts_1 = NULL, + graphlet_counts_compare = NULL, + graphlet_counts_ref= NULL) { + ## ------------------------------------------------------------------------ + # Check arguments + if (is.null(graph_1) && is.null(graphlet_counts_1)) { + stop("One of graph_1 and graphlet_counts_1 must be supplied.") } - netdis <- purrr::simplify(parallel::mcmapply(function(index_a, index_b) {netdis( - centred_graphlet_counts[[index_a]], centred_graphlet_counts[[index_b]], - graphlet_size = graphlet_size) - }, comp_spec$index_a, comp_spec$index_b, SIMPLIFY = FALSE)) - list(netdis = netdis, comp_spec = comp_spec) + if (is.null(graphs_compare) && is.null(graphlet_counts_compare)) { + stop("One of graph_2 and graphlet_counts_2 must be supplied.") + } + + ## ------------------------------------------------------------------------ + # Generate graphlet counts and bundle them into named list with format needed + # for netdis_many_to_many. 
+ + if (is.null(graphlet_counts_1)) { + graphlet_counts_1 <- count_graphlets_ego( + graph_1, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + } + rm(graph_1) + + if (is.null(graphlet_counts_compare)) { + graphlet_counts_compare <- purrr::map( + graphs_compare, + count_graphlets_ego, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + } + rm(graphs_compare) + + graphlet_counts <- append(graphlet_counts_compare, + list(graph_1 = graphlet_counts_1), + after = 0 + ) + + if(!is.null(ref_graph)){ + if (!is.numeric(ref_graph) && is.null(graphlet_counts_ref)) { + graphlet_counts_ref <- count_graphlets_ego( + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + ref_graph <- NULL + } + } + + ## ------------------------------------------------------------------------ + # calculate netdis + result <- netdis_many_to_many( + graphs = NULL, + ref_graph = ref_graph, + comparisons = "one-to-many", + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn, + graphlet_counts = graphlet_counts, + graphlet_counts_ref = graphlet_counts_ref + ) + + ## ------------------------------------------------------------------------ + # restructure netdis_many_to_many output + colnames(result$netdis) <- result$comp_spec$name_b + result$netdis } -#' Netdis + +#' Compute any of the Netdis variants between all graph pairs. 
+#' +#' @param graphs A named list of simplified igraph graph objects (undirected +#' graphs excluding loops, multiple edges), such as those +#' obtained by using \code{read_simple_graphs}. +#' +#' @param ref_graph Controls how expected counts are calculated. Either: +#' 1) A numeric value - used as a constant expected counts value for all query +#' graphs. +#' 2) A simplified \code{igraph} object - used as a reference graph from which +#' expected counts are calculated for all query graphs. +#' 3) NULL (default) - Expected counts will be calculated based on the properties of the +#' query graphs themselves. (Geometric-Poisson approximation). +#' +#' @param comparisons Which comparisons to perform between graphs. +#' Can be "many-to-many" (all pairwise combinations) or "one-to-many" +#' (compare first graph in graphs to all other graphs.) +#' +#' @param max_graphlet_size Generate graphlets up to this size. Currently only 4 (default) and 5 are supported. +#' +#' @param neighbourhood_size Ego network neighbourhood size (default 2). +#' +#' @param min_ego_nodes Filter ego networks which have fewer +#' than min_ego_nodes nodes (default 3). +#' +#' @param min_ego_edges Filter ego networks which have fewer +#' than min_ego_edges edges (default 1). +#' +#' @param binning_fn Function used to bin ego network densities. Takes edge \code{densities} +#' as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. +#' ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with +#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used (default: NULL). +#' +#' @param bin_counts_fn Function used to calculate expected graphlet counts in +#' each density bin. 
Takes \code{graphlet_counts}, \code{interval_indexes} +#' (bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), +#' it will apply either the approach from the original Netdis paper, or the respective Geometric-Poisson +#' approximation; depending on the values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' +#' @param exp_counts_fn Function used to map from binned reference counts to +#' expected counts for each graphlet in each ego network of the query graphs. +#' Takes \code{ego_networks}, \code{density_bin_breaks}, +#' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. +#' If \code{exp_counts_fn} is \code{NULL}, (default), it will apply +#' either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the +#' values of \code{ref_graph} and \code{graphlet_counts_ref}. +#' +#' @param graphlet_counts Pre-generated graphlet counts (default: NULL). If the +#' \code{graphlet_counts} argument is defined then \code{graphs} will not be +#' used. +#' A named list of matrices containing counts of each graphlet (columns) for +#' each ego-network (rows) in the input graph. Columns are labelled with +#' graphlet IDs and rows are labelled with the ID of the central node in each +#' ego-network. As well as graphlet counts, each matrix must contain an +#' additional column labelled "N" including the node count for +#' each ego network. +#' +#' @param graphlet_counts_ref Pre-generated reference graphlet counts (default: NULL). Matrix containing counts +#' of each graphlet (columns) for each ego-network (rows) in the input graph. Columns are labelled with +#' graphlet IDs and rows are labelled with the ID of the central node in each +#' ego-network. As well as graphlet counts, each matrix must contain an +#' additional column labelled "N" including the node count for +#' each ego network. 
+#' If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not +#' be used. #' +#' @return Netdis statistics between query graphs for graphlet sizes +#' up to and including max_graphlet_size. +#' +#' @export +netdis_many_to_many <- function(graphs = NULL, + ref_graph = NULL, + comparisons = "many-to-many", + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, + graphlet_counts = NULL, + graphlet_counts_ref = NULL) { + + ## ------------------------------------------------------------------------ + # Check arguments and set functions appropriately + if (is.null(graphs) && is.null(graphlet_counts)) { + stop("One of graphs and graphlet_counts must be supplied.") + } + + + # Set default binning_fn if none supplied + if (is.null(binning_fn)) { + binning_fn <- purrr::partial( + binned_densities_adaptive, + min_counts_per_interval = 5, + num_intervals = 100 + ) + } + + # If no ref_graph supplied, default to geometric poisson unless user-defined + # functions have been provided. + if (is.null(ref_graph) && is.null(graphlet_counts_ref)) { + if (is.null(bin_counts_fn)) { + bin_counts_fn <- density_binned_counts_gp + } + if (is.null(exp_counts_fn)) { + exp_counts_fn <- purrr::partial( + netdis_expected_counts, + scale_fn = NULL + ) + } + # If a ref_graph value supplied (including a constant), default to approach + # from original netdis paper, unless user-defined functions provided. + } else { + if (is.null(bin_counts_fn)) { + bin_counts_fn <- purrr::partial( + density_binned_counts, + agg_fn = mean, + scale_fn = scale_graphlet_counts_ego + ) + } + if (is.null(exp_counts_fn)) { + exp_counts_fn <- purrr::partial( + netdis_expected_counts, + scale_fn = count_graphlet_tuples + ) + } + } + + ## ------------------------------------------------------------------------ + # Generate ego networks and count graphlets for query graphs. 
+ # But if graphlet counts have already been provided we can skip this step. + if (is.null(graphlet_counts)) { + graphlet_counts <- purrr::map( + graphs, + count_graphlets_ego, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + } + rm(graphs) + + ## ------------------------------------------------------------------------ + # Centred counts + # If there are no graphlet_counts_ref, and a number has been passed as ref_graph, treat it as a constant expected + # counts value (e.g. if ref_graph = 0 then no centring of counts). + if (is.numeric(ref_graph) && length(ref_graph) == 1 && is.null(graphlet_counts_ref)) { + centred_graphlet_counts <- purrr::map( + graphlet_counts, + netdis_centred_graphlet_counts, + ref_ego_density_bins = NULL, + ref_binned_graphlet_counts = ref_graph, + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, + max_graphlet_size = max_graphlet_size + ) + + ## ------------------------------------------------------------------------ + # If there are no graphlet_counts_ref, and If a reference graph passed, use it to calculate expected counts for all + # query graphs. 
+ } else if (!is.null(ref_graph) || !is.null(graphlet_counts_ref)) { + + # Generate ego networks and calculate graphlet counts + # But if graphlet_counts_ref provided can skip this step + if (is.null(graphlet_counts_ref)) { + graphlet_counts_ref <- count_graphlets_ego( + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + } + rm(ref_graph) + + # Get ego-network densities + densities_ref <- ego_network_density(graphlet_counts_ref) + + # bin ref ego-network densities + binned_densities <- binning_fn(densities_ref) + + ref_ego_density_bins <- binned_densities$breaks + + # Average ref graphlet counts across density bins + ref_binned_graphlet_counts <- bin_counts_fn( + graphlet_counts_ref, + binned_densities$interval_indexes, + max_graphlet_size = max_graphlet_size + ) + + # Calculate centred counts using ref graph + centred_graphlet_counts <- purrr::map( + graphlet_counts, + netdis_centred_graphlet_counts, + ref_ego_density_bins = ref_ego_density_bins, + ref_binned_graphlet_counts = ref_binned_graphlet_counts, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn, + max_graphlet_size = max_graphlet_size + ) + + ## ------------------------------------------------------------------------ + # If no reference passed, calculate expected counts using query networks + # themselves. Geometric-Poisson GP #This is the function that creates an error for a graph with three connected nodes. 
+ } else { + centred_graphlet_counts <- purrr::map( + graphlet_counts, + netdis_centred_graphlet_counts, + ref_ego_density_bins = NULL, + ref_binned_graphlet_counts = NULL, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn, + max_graphlet_size = max_graphlet_size + ) + } + rm(graphlet_counts) + + ## ------------------------------------------------------------------------ + # Sum centred graphlet counts across all ego networks + sum_graphlet_counts <- lapply(centred_graphlet_counts, colSums) + + rm(centred_graphlet_counts) + + ## ------------------------------------------------------------------------ + # Generate pairwise comparisons + comp_spec <- cross_comparison_spec(sum_graphlet_counts, how = comparisons) + + ## ------------------------------------------------------------------------ + # Calculate netdis statistics + results <- parallel::mcmapply( + function(index_a, index_b) { + netdis_uptok( + sum_graphlet_counts[[index_a]], + sum_graphlet_counts[[index_b]], + max_graphlet_size = max_graphlet_size + ) + }, + comp_spec$index_a, + comp_spec$index_b, + SIMPLIFY = TRUE + ) + + list(netdis = results, comp_spec = comp_spec) +} + +#' Netdis - for one graphlet size +#' #' Calculate Netdis statistic between two graphs from their Centred Graphlet -#' Counts (generated using \code{netdis_centred_graphlet_counts}). -#' @param centred_graphlet_counts1 Centred Graphlet Counts for graph 1 -#' @param centred_graphlet_counts2 Centred Graphlet Counts for graph 2 +#' Counts (generated using \code{netdis_centred_graphlet_counts}) for graphlets +#' of size \code{graphlet_size}. +#' @param centred_graphlet_count_vector_1 Centred Graphlet Counts vector for graph 1 +#' @param centred_graphlet_count_vector_2 Centred Graphlet Counts vector for graph 2 #' @param graphlet_size The size of graphlets to use for the Netdis calculation #' (only counts for graphlets of the specified size will be used). 
The size of #' a graphlet is the number of nodes it contains. -#' @return Netdis statistic calculated using centred counts for graphlets of +#' @return Netdis statistic calculated using centred counts for graphlets of #' the specified size #' @export -netdis <- function(centred_graphlet_counts1, centred_graphlet_counts2, - graphlet_size) -{ - # Select subset of centred counts corresponding to graphlets of the +netdis <- function(centred_graphlet_count_vector_1, centred_graphlet_count_vector_2, + graphlet_size) { + # Select subset of centred counts corresponding to graphlets of the # specified size ids <- graphlet_ids_for_size(graphlet_size) - counts1 <- centred_graphlet_counts1[ids] - counts2 <- centred_graphlet_counts2[ids] + counts1 <- centred_graphlet_count_vector_1[ids] + counts2 <- centred_graphlet_count_vector_2[ids] # Calculate normalising constant - norm_const <- sum(counts1^2 / sqrt(counts1^2 + counts2^2),na.rm = TRUE) * - sum(counts2^2 / sqrt(counts1^2 + counts2^2),na.rm = TRUE) + norm_const <- sum(counts1^2 / sqrt(counts1^2 + counts2^2), na.rm = TRUE) * + sum(counts2^2 / sqrt(counts1^2 + counts2^2), na.rm = TRUE) # Calculate intermediate "netD" statistic that falls within range -1..1 - netds2 <- (1/sqrt(norm_const)) * sum((counts1 * counts2) / sqrt(counts1^2 + counts2^2),na.rm = TRUE) + netds2 <- (1 / sqrt(norm_const)) * + sum((counts1 * counts2) / + sqrt(counts1^2 + counts2^2), na.rm = TRUE) # Calculate corresponding "netd" Netdis statistic that falls within range 0..1 0.5 * (1 - netds2) -} +} -#' Scaled graphlet count for ego-networks -#' -#' Calculates graphlet counts for the n-step ego-network of each node in a graph, -#' scaled by dividing the graphlet counts for each ego-network by the total -#' number of possible groupings of nodes in the ego-network with the same number -#' of nodes as each graphlet. This scaling factor is choose(n, k), where n is the -#' number of nodes in the ego-network and k is the number of nodes in the graphlet. 
-#' @param graph A connected, undirected, simple graph as an \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. -#' @param neighbourhood_size The number of steps from the source node to include -#' nodes for each ego-network. -#' @param return_ego_networks If \code{TRUE}, return ego-networks alongside -#' graphlet counts to enable further processing. -#' @return If \code{return_ego_networks = FALSE}, returns an RxC matrix -#' containing counts of each graphlet (columns, C) for each ego-network in the -#' input graph (rows, R). Columns are labelled with graphlet IDs and rows are -#' labelled with the ID of the central node in each ego-network (if nodes in the -#' input graph are labelled). If \code{return_ego_networks = TRUE}, returns a -#' list with the following elements: -#' \itemize{ -#' \item \code{graphlet_counts}: A matrix containing graphlet counts for each -#' ego-network in the input graph as described above. -#' \item \code{ego_networks}: The ego-networks of the query graph. -#' } +#' Netdis - for all graphlet sizes up to max_graphlet_size +#' +#' Calculate Netdis statistic between two graphs from their Centred Graphlet +#' Counts (generated using \code{netdis_centred_graphlet_counts}) for all +#' graphlet sizes up to \code{max_graphlet_size}. +#' @param centred_graphlet_count_vector_1 Centred Graphlet Counts vector for graph 1 +#' @param centred_graphlet_count_vector_2 Centred Graphlet Counts vector for graph 2 +#' @param max_graphlet_size max graphlet size to calculate Netdis for. +#' The size of a graphlet is the number of nodes it contains. Netdis is +#' calculated for all graphlets from size 3 to size max_graphlet_size. Currently only 4 and 5 are supported. 
+#' @return Netdis statistic calculated using centred counts for graphlets of +#' the specified size #' @export -count_graphlets_ego_scaled <- function( - graph, max_graphlet_size, neighbourhood_size, return_ego_networks = FALSE) { - # Calculate ego-network graphlet counts, also returning the ego networks for - # use later in function - ego_data <- - count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - return_ego_networks = TRUE) - ego_graphlet_counts <- ego_data$graphlet_counts - ego_networks <- ego_data$ego_networks - # Scale ego-network graphlet counts by dividing by total number of k-tuples in - # ego-network (where k is graphlet size) - ego_graphlet_tuples <- - count_graphlet_tuples_ego(ego_networks, max_graphlet_size = max_graphlet_size) - ego_graphlet_counts <- scale_graphlet_count(ego_graphlet_counts, ego_graphlet_tuples) - # Return either graphlet counts, or graphlet counts and ego_networks - if(return_ego_networks) { - return(list(graphlet_counts = ego_graphlet_counts, - ego_networks = ego_networks)) - } else { - return(ego_graphlet_counts) +netdis_uptok <- function(centred_graphlet_count_vector_1, centred_graphlet_count_vector_2, + max_graphlet_size) { + if ((max_graphlet_size > 5) | (max_graphlet_size < 3)) { + stop("max_graphlet_size must be 3, 4 or 5.") } -} - -#' Generate Netdis centred graphlets counts by subtracting expected counts -#' -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. -#' @param neighbourhood_size The number of steps from the source node to include -#' nodes for each ego-network. -#' @param expected_ego_count_fn A function for generating expected ego-network -#' graphlet counts for a graph. 
This function should take a connected, -#' undirected, simple graph as an \code{igraph} object for its only argument. -#' Where \code{expected_ego_count_fn} is specific to particular values of -#' \code{max_graphlet_size} or \code{neighbourhood_size}, care should be taken -#' to ensure that the values of these parameters passed to this function are -#' consistent with those used when creating \code{expected_ego_count_fn}. -#' @return A vector with centred counts for each graphlet type -#' @export -netdis_centred_graphlet_counts <- function( - graph, max_graphlet_size, neighbourhood_size, expected_ego_count_fn = - NULL) { - # Get centred counts for each ego network - centred_counts <- netdis_centred_graphlet_counts_ego( - graph, max_graphlet_size, neighbourhood_size, expected_ego_count_fn) - # Sum centred counts over ego-networks - apply(centred_counts, MARGIN = 2, FUN = sum) + + netdis_statistics <- purrr::map(3:max_graphlet_size, + netdis, + centred_graphlet_count_vector_1 = centred_graphlet_count_vector_1, + centred_graphlet_count_vector_2 = centred_graphlet_count_vector_2 + ) + + netdis_statistics <- simplify2array(netdis_statistics) + + names(netdis_statistics) <- + sapply( + "netdis", + paste, + 3:max_graphlet_size, + sep = "" + ) + + netdis_statistics } -#' TODO: Remove @export prior to publishing +#' netdis_centred_graphlet_counts +#' +#' Calculate expected graphlet counts for each ego network in a query graph and +#' centre the actual counts by subtracting those calculated expected count +#' values. +#' @param graphlet_counts Ego network graphlet counts for a query graph +#' +#' @param ref_ego_density_bins Either a list of previously calculated ego +#' network density bin edges from a reference network, or \code{NULL}, in +#' which case density bins are generated using the query graph itself. 
+#' +#' @param ref_binned_graphlet_counts Either expected graphlet counts for each +#' ego network density bin from a reference network (a matrix with columns +#' labelled by graphlet ID and rows by density bin index), \code{NULL}, in +#' which case density binned counts are generated using the query graph itself, +#' or a constant numeric value to subtract from all graphlet counts. +#' +#' @param binning_fn Function used to bin ego network densities. Only needed if +#' \code{ref_ego_density_bins} and \code{ref_binned_graphlet_counts} are +#' \code{NULL}. Takes densities as its single argument, and returns a named list +#' including keys \code{breaks} (vector of bin edges) and \code{interval_indexes} +#' (density bin index for each ego network). +#' +#' @param bin_counts_fn Function used to calculate expected graphlet counts in +#' each density bin. Only needed if \code{ref_ego_density_bins} and +#' \code{ref_binned_graphlet_counts} are \code{NULL}. Takes +#' \code{graphlet_counts}, \code{interval_indexes} (bin indexes) and +#' \code{max_graphlet_size} as arguments. +#' +#' @param exp_counts_fn Function used to map from binned reference counts to +#' expected counts for each graphlet in each ego network of the query graphs. +#' Takes \code{ego_networks}, \code{density_bin_breaks}, +#' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. +#' +#' @param max_graphlet_size max graphlet size to calculate centred counts for. Currently only size 4 and 5 are supported. +#' +#' @return graphlet_counts minus exp_graphlet_counts for graphlets up to size +#' max_graphlet_size. 
#' @export -netdis_centred_graphlet_counts_ego <- function( - graph, max_graphlet_size, neighbourhood_size, expected_ego_count_fn = NULL, - min_ego_nodes = 3, min_ego_edges = 1) { - # Get unscaled ego-network graphlet counts - res <- count_graphlets_ego( - graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, return_ego_networks = TRUE) - actual_counts = res$graphlet_counts - ego_networks <- res$ego_networks +netdis_centred_graphlet_counts <- function( + graphlet_counts, + ref_ego_density_bins, + ref_binned_graphlet_counts, + binning_fn, + bin_counts_fn, + exp_counts_fn, + max_graphlet_size) { + + ## ------------------------------------------------------------------------ + # If a number has been passed as ref_binned_graphlet_counts, treat it as a + # constant expected counts value (e.g. if ref_binned_graphlet_counts = 0 + # then no centring of counts). + if (is.numeric(ref_binned_graphlet_counts) && + length(ref_binned_graphlet_counts) == 1) { + exp_graphlet_counts <- netdis_const_expected_counts( + graphlet_counts, + const = ref_binned_graphlet_counts + ) - # Drop ego-networks that don't have the minimum number of nodes or edges - drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { - (igraph::vcount(g) < min_ego_nodes) | (igraph::ecount(g) < min_ego_edges) - })) - actual_counts <- actual_counts[!drop_index,] - ego_networks <- ego_networks[!drop_index] + ## ------------------------------------------------------------------------ + # If reference bins and counts passed, use them to calculate + # expected counts + } else if (!is.null(ref_ego_density_bins) && + !is.null(ref_binned_graphlet_counts)) { + # Calculate expected graphlet counts (using ref + # graph ego network density bins) + exp_graphlet_counts <- exp_counts_fn( + graphlet_counts, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size = max_graphlet_size + ) - # Centre these counts by subtracting the expected counts - 
if(is.null(expected_ego_count_fn)) { - centred_counts = actual_counts - } else { - centred_counts <- actual_counts - expected_ego_count_fn(graph) - } - centred_counts + ## ------------------------------------------------------------------------ + # If NULL passed as ref bins and counts, calculate expected counts using + # query network itself. This should be GP. + } else if (is.null(ref_ego_density_bins) && + is.null(ref_binned_graphlet_counts)) { + # Get ego-network densities + densities <- ego_network_density(graphlet_counts) + + # bin ref ego-network densities + binned_densities <- binning_fn(densities) + + # extract bin breaks and indexes from binning results + ego_density_bin_breaks <- binned_densities$breaks + ego_density_bin_indexes <- binned_densities$interval_indexes + + # Calculate expected counts in each bin + binned_graphlet_counts <- bin_counts_fn( + graphlet_counts, + ego_density_bin_indexes, + max_graphlet_size = max_graphlet_size + ) + + # Calculate expected graphlet counts for each ego network + exp_graphlet_counts <- exp_counts_fn( + graphlet_counts, + ego_density_bin_breaks, + binned_graphlet_counts, + max_graphlet_size = max_graphlet_size + ) + + ## ------------------------------------------------------------------------ + # Invalid combination of ref_ego_density_bins and ref_binned_graphlet_counts + } else { + stop("Invalid combination of ref_ego_density_bins and + ref_binned_graphlet_counts. Options are: + - Both NULL: calculate expected counts using query network. + - Vector of bin edges and matrix of binned counts: Reference graph values + for calculating expected counts. 
+ - Constant numeric ref_binned_graphlet_counts: Use as constant expected + counts value.") + } + + ## ------------------------------------------------------------------------ + # Subtract expected counts from actual graphlet counts + netdis_subtract_exp_counts( + graphlet_counts, + exp_graphlet_counts, + max_graphlet_size + ) } -#' Generate Netdis expected graphlet count function -#' -#' Generates a function to calculate expected ego-network graphlet counts for -#' query graphs based on the statistics of a provided reference graph. -#' -#' Generates graphlet counts for all ego-networks in the supplied reference graph -#' and then averages these graphlet counts over density bins to generate -#' density-dependent reference graphlet counts. Prior to averaging, the graphlet -#' counts are scaled in a size-dependent manner to permit ego-networks with -#' similar densities but different sizes to be averaged together. -#' -#' Returns a function that uses the density-dependent reference graphlet -#' counts to generate expected graphlet counts for all ego-networks in a query -#' network. When doing so, it matches ego-networks to reference counts by -#' density and reverses the scaling that was applied to the original reference -#' counts in order to allow averaging across ego-networks with similar density -#' but different numbers of nodes. -#' @param graph A connected, undirected, simple reference graph as an -#' \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. -#' @param neighbourhood_size The number of steps from the source node to include -#' node in ego-network. -#' @return A function taking a connected, undirected, simple query graph as an -#' \code{igraph} object and returning an RxC matrix containing the expected -#' counts of each graphlet (columns, C) for each ego-network in the query graph -#' (rows, R). 
Columns are labelled with graphlet IDs and rows are labelled with -#' the ID of the central node in each ego-network (if nodes in the query graph -#' are labelled) + +#' netdis_subtract_exp_counts +#' +#' Subtract expected graphlet counts from actual graphlet counts. +#' +#' @param graphlet_counts Matrix of graphlet counts (columns) for a +#' number of ego networks (rows). +#' @param exp_graphlet_counts Matrix of expected graphlet counts (columns) for a +#' number of ego networks (rows). +#' @param max_graphlet_size Do the subtraction for graphlets up to this size. Currently only size 4 and 5 are supported. #' @export -netdis_expected_graphlet_counts_ego_fn <- function( - graph, max_graphlet_size, neighbourhood_size, - min_ego_nodes = 3, min_ego_edges = 1, - min_bin_count = 5, num_bins = 100) { - - # Calculate the scaled graphlet counts for all ego networks in the reference - # graph, also returning the ego networks themselves in order to calculate - # their densities - res <- count_graphlets_ego_scaled( - graph, max_graphlet_size, neighbourhood_size, return_ego_networks = TRUE) - scaled_graphlet_counts = res$graphlet_counts - ego_networks <- res$ego_networks - - # Drop ego-networks that don't have the minimum number of nodes or edges - drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { - (igraph::vcount(g) < min_ego_nodes) | (igraph::ecount(g) < min_ego_edges) - })) - scaled_graphlet_counts <- scaled_graphlet_counts[!drop_index,] - ego_networks <- ego_networks[!drop_index] - - # Get ego-network densities - densities <- purrr::simplify(purrr::map_dbl(ego_networks, igraph::edge_density)) - - # Adaptively bin ego-network densities - binned_densities <- binned_densities_adaptive( - densities, min_counts_per_interval = min_bin_count, num_intervals = num_bins) - - # Average graphlet counts across density bins - density_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - scaled_graphlet_counts, binned_densities$interval_indexes) - - #
Return a partially applied function with the key reference graph information - # built-in - purrr::partial( - netdis_expected_graphlet_counts_ego, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - density_breaks = binned_densities$breaks, - density_binned_reference_counts = density_binned_graphlet_counts) +netdis_subtract_exp_counts <- function( + graphlet_counts, + exp_graphlet_counts, + max_graphlet_size) { + + # select columns for graphlets up to size max_graphlet_size + id <- graphlet_key(max_graphlet_size)$id + graphlet_counts <- graphlet_counts[, id] + exp_graphlet_counts <- exp_graphlet_counts[, id] + + # Subtract expected counts from actual graphlet counts + graphlet_counts - exp_graphlet_counts } -#' INTERNAL FUNCTION - Do not call directly -#' -#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to -#' generate a function for calculating expected ego-network graphlet counts -#' from the statistics of a provided reference graph. -#' Temporarily accessible during development. -#' TODO: Remove @export prior to publishing +#' netdis_expected_counts +#' +#' Calculates expected graphlet counts for each ego network based on its density +#' and pre-calculated reference density bins and graphlet counts for each bin. +#' +#' @param graphlet_counts Matrix of graphlet and node counts (columns) for a +#' number of ego networks (rows). +#' @param density_breaks Density values defining bin edges. +#' @param density_binned_reference_counts Reference network graphlet counts for +#' each density bin. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. Currently only size 4 and 5 are supported.
+#' @param scale_fn Optional function to scale calculated expected counts, taking +#' \code{graphlet_counts} and \code{max_graphlet_size} as arguments, +#' and returning a scale factor that the looked up +#' \code{density_binned_reference_counts} values will be multiplied by. +#' #' @export -netdis_expected_graphlet_counts_ego <- function( - graph, max_graphlet_size, neighbourhood_size, - density_breaks, density_binned_reference_counts, - min_ego_nodes = 3, min_ego_edges = 1) { - # Generate ego-networks for query graph - ego_networks <- make_named_ego_graph(graph, neighbourhood_size) - # Drop ego-networks that don't have the minimum number of nodes or edges - drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { - (igraph::vcount(g) < min_ego_nodes) | (igraph::ecount(g) < min_ego_edges) - })) - ego_networks <- ego_networks[!drop_index] - # Map over query graph ego-networks, using reference graph statistics to +netdis_expected_counts <- function( + graphlet_counts, + density_breaks, + density_binned_reference_counts, + max_graphlet_size, + scale_fn = NULL) { + + + # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. 
- expected_graphlet_counts <- - purrr::map(ego_networks, netdis_expected_graphlet_counts, - max_graphlet_size = max_graphlet_size, - density_breaks = density_breaks, - density_binned_reference_counts = density_binned_reference_counts) - names(expected_graphlet_counts) <- names(ego_networks) - # Simplify list to array - t(simplify2array(expected_graphlet_counts)) + expected_graphlet_counts <- t(apply( + graphlet_counts, 1, netdis_expected_counts_ego, + max_graphlet_size = max_graphlet_size, + density_breaks = density_breaks, + density_binned_reference_counts = density_binned_reference_counts, + scale_fn = scale_fn + )) + + expected_graphlet_counts } +#' netdis_expected_counts_ego #' INTERNAL FUNCTION - Do not call directly -#' -#' Used by \code{netdis_expected_graphlet_counts_ego} to -#' calculate expected graphlet counts for a query graph ego-network from the -#' statistics of a provided reference graph. -#' Temporarily accessible during development. -#' TODO: Remove @export prior to publishing -#' @export -netdis_expected_graphlet_counts <- function( - graph, max_graphlet_size, density_breaks, density_binned_reference_counts) { +#' +#' Calculates expected graphlet counts for one ego network based on its density +#' and pre-calculated reference density bins and graphlet counts for each bin. +#' +#' @param graphlet_counts Node and graphlet counts for an ego network. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. Currently only size 4 and 5 are supported. +#' @param density_breaks Density values defining bin edges. +#' @param density_binned_reference_counts Reference network graphlet counts for +#' each density bin. 
+#' @param scale_fn Optional function to scale calculated expected counts, taking +#' \code{graphlet_counts} and \code{max_graphlet_size} as arguments, and +#' returning a scale factor that the looked up +#' \code{density_binned_reference_counts} values will be multiplied by. +#' +netdis_expected_counts_ego <- function(graphlet_counts, + max_graphlet_size, + density_breaks, + density_binned_reference_counts, + scale_fn = NULL) { + # Look up average scaled graphlet counts for graphs of similar density # in the reference graph - query_density <- igraph::edge_density(graph) + query_density <- density_from_counts(graphlet_counts) matched_density_index <- interval_index(query_density, density_breaks) - matched_reference_counts <- density_binned_reference_counts[matched_density_index,] - # Scale reference counts by multiplying the reference count for each graphlet - # by the number of possible sets of k nodes in the query graph, where k is the - # number of nodes in the graphlet - matched_reference_counts * count_graphlet_tuples(graph, max_graphlet_size) + + matched_reference_counts <- + density_binned_reference_counts[matched_density_index, ] + + if (!is.null(scale_fn)) { + # Scale reference counts e.g. by multiplying the + # reference count for each graphlet by the number + # of possible sets of k nodes in the query graph, + # where k is the number of nodes in the graphlet. + matched_reference_counts <- matched_reference_counts * + scale_fn(graphlet_counts, max_graphlet_size) + } + matched_reference_counts } -#' INTERNAL FUNCTION - Do not call directly -#' -#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to -#' generate a function for calculating expected ego-network graphlet counts -#' from the statistics of a provided reference graph. -#' Temporarily accessible during development. -#' TODO: Remove @export prior to publishing +#' mean_density_binned_graphlet_counts +#' +#' Calculate mean (by default) graphlet counts for ego networks in each density +#' bin.
+#' +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network in graphlet_counts. +#' @param agg_fn Function to aggregate counts in each bin +#' (default \code{agg_fn = mean}). +#' #' @export -mean_density_binned_graphlet_counts <- function( - graphlet_counts, density_interval_indexes) { - # The ego network graphlet counts are an E x G matrix with rows (E) representing - # ego networks and columns (G) representing graphlets. We want to calculate - # the mean count for each graphlet / density bin combination, so we will - # use tapply to average counts for each graphlet across density bins, using - # apply to map this operation over graphlets - mean_density_binned_graphlet_counts <- +mean_density_binned_graphlet_counts <- function(graphlet_counts, + density_interval_indexes, + agg_fn = mean) { + # The ego network graphlet counts are an E x G matrix with rows (E) + # representing ego networks and columns (G) representing graphlets. We want + # to calculate the mean count for each graphlet / density bin combination, + # so we will use tapply to average counts for each graphlet across density + # bins, using apply to map this operation over graphlets + mean_density_binned_graphlet_counts <- apply(graphlet_counts, MARGIN = 2, function(gc) { - tapply(gc, INDEX = density_interval_indexes, FUN = mean)}) + tapply(gc, INDEX = density_interval_indexes, FUN = agg_fn) + }) + + # if only 1 bin (i.e. no binning) will be left with a 1D list. + # convert it into a 2D list. + if (is.null(dim(mean_density_binned_graphlet_counts))) { + dim(mean_density_binned_graphlet_counts) <- + c(1, length(mean_density_binned_graphlet_counts)) + + colnames(mean_density_binned_graphlet_counts) <- + colnames(graphlet_counts) + } + + mean_density_binned_graphlet_counts } +#' For case where don't want to use binning, return a single bin which covers +#' the full range of possible density values (0 to 1). 
+#' @param densities Ego network density values (only used to return +#' a list of indexes of the required length.) +#' @export +single_density_bin <- function(densities) { + list( + densities = densities, + interval_indexes = rep(1, length(densities)), + breaks = c(0, 1) + ) +} +#' Used to calculate aggregated graphlet counts for each density bin. +#' +#' @param graphlet_counts Graphlet and node counts (columns) for a number of +#' ego_networks (rows). +#' @param density_interval_indexes Density bin index for +#' each ego network. +#' @param agg_fn Function to aggregate counts in each bin +#' (default \code{agg_fn = mean}). +#' @param scale_fn Optional function to apply a transformation/scaling +#' to the raw graphlet_counts. Must have arguments \code{graphlet_counts} and +#' \code{max_graphlet_size}, and return a transformed \code{graphlet_counts} +#' object with the same number of rows as the input, and columns for all +#' graphlets up to \code{max_graphlet_size}. +#' @param max_graphlet_size Optionally passed and used by scale_fn. +#' #' @export +density_binned_counts <- function(graphlet_counts, + density_interval_indexes, + agg_fn = mean, + scale_fn = NULL, + max_graphlet_size = NULL) { + if (!is.null(scale_fn)) { + # Scale ego-network graphlet counts e.g. + # by dividing by total number of k-tuples in + # ego-network (where k is graphlet size) + graphlet_counts <- scale_fn(graphlet_counts, + max_graphlet_size = max_graphlet_size + ) + } + + mean_density_binned_graphlet_counts(graphlet_counts, + density_interval_indexes, + agg_fn = agg_fn + ) +} + +#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY +#' Used by \code{density_binned_counts_gp} +#' Calculate expected counts with geometric poisson (Polya-Aeppli) +#' approximation for a single density bin. +#' @param bin_idx Density bin index to calculate expected counts for. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. 
+#' @param density_interval_indexes Density bin indexes for each ego network in +#' \code{graphlet_counts}. +#' @param max_graphlet_size Determines the maximum size of graphlets. Currently only size 4 and 5 are supported. +#' included in graphlet_counts. +exp_counts_bin_gp <- function(bin_idx, graphlet_counts, + density_interval_indexes, + max_graphlet_size) { + # extract ego networks belonging to input density bin index + counts <- graphlet_counts[density_interval_indexes == bin_idx, ] + + # mean graphlet counts in this density bin + means <- colMeans(counts) + + # subtract mean graphlet counts from actual graphlet counts + mean_sub_counts <- sweep(counts, 2, means) + + # variance in graphlet counts across ego networks in this density bin + Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) + + # Dealing with zero variance HERE + ind_zerovar <- (Vd_sq < .00000001) + if(sum(ind_zerovar) > 0) Vd_sq[ind_zerovar] <- 0.1 + + # GP theta parameter for each graphlet id in this density bin + theta_d <- 2 * means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) + + # GP lambda parameter for graphlet size k in this density bin + lambda_dk <- mean(2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]), + na.rm = TRUE + ) + + # Expected counts for graphlet size k in this density bin + exp_counts_dk <- append( + exp_counts_dk, + lambda_dk / theta_d[graphlet_idx] + ) + + } + + # Dealing with divisions by zero. + ind <- is.na(exp_counts_dk) + ind <- ind | is.infinite(exp_counts_dk) + if(sum(ind) > 0) exp_counts_dk[ind & ind_zerovar[-1]] <- 0 + + + exp_counts_dk +} + +#' Calculate expected counts in density bins using the +#' geometric poisson (Polya-Aeppli) approximation. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. 
+#' @param max_graphlet_size Determines the maximum size of graphlets. Currently only size 4 and 5 are supported. +#' included in graphlet_counts. +#' @export +density_binned_counts_gp <- function(graphlet_counts, + density_interval_indexes, + max_graphlet_size) { + + # calculate expected counts for each density bin index + nbins <- length(unique(density_interval_indexes)) + expected_counts_bin <- t(sapply( + 1:nbins, + exp_counts_bin_gp, + graphlet_counts = graphlet_counts, + density_interval_indexes = density_interval_indexes, + max_graphlet_size = max_graphlet_size + )) + + # remove NAs caused by bins with zero counts for a graphlet + expected_counts_bin[is.nan(expected_counts_bin)] <- 0 + + expected_counts_bin +} + +#' Create matrix of constant value to use as expected counts. +#' @param graphlet_counts Ego network graphlet counts matrix to create expected +#' counts for. +#' @param const Constant expected counts value to use. +#' @return Counts of value const with same shape and names as graphlet_counts. +netdis_const_expected_counts <- function(graphlet_counts, const) { + exp_counts <- graphlet_counts + exp_counts[, ] <- const + exp_counts +} + + +#' Replace zero values in a vector with ones. Used by +#' \code{scale_graphlet_count} to prevent divide by +#' zero errors. +#' @param v A vector. zeros_to_ones <- function(v) { zero_index <- which(v == 0) v[zero_index] <- 1 v } + +#' Divide graphlet counts by pre-computed scaling factor from +#' \code{count_graphlet_tuples} output. +#' @param graphlet_count Pre-computed graphlet counts. +#' @param graphlet_tuples Pre-computed \code{count_graphlet_tuples} output. #' @export scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { # Avoid divide by zero errors by replacing all zeros with ones in the # divisor - graphlet_count / zeros_to_ones(graphlet_tuples) + graphlet_count[, colnames(graphlet_tuples)] / zeros_to_ones(graphlet_tuples) } + +#' Run count_graphlet_tuples across pre-computed ego networks. 
+#' @param graphlet_counts Matrix of graphlet and node counts (columns) for a +#' number of ego networks (rows). +#' @param max_graphlet_size Determines the maximum size of graphlets included +#' in the tuple counts. Currently only size 4 and 5 are supported. #' @export -count_graphlet_tuples_ego <- function(ego_networks, max_graphlet_size) { - graphlet_tuple_counts <- - t(simplify2array(purrr::map(ego_networks, count_graphlet_tuples, - max_graphlet_size = max_graphlet_size))) +count_graphlet_tuples_ego <- function(graphlet_counts, max_graphlet_size) { + graphlet_tuple_counts <- + t(apply(graphlet_counts, 1, + count_graphlet_tuples, + max_graphlet_size = max_graphlet_size + )) + graphlet_tuple_counts } + +#' Calculate edge density for a single graph. +#' @param graphlet_counts Vector of pre-calculated graphlet, edge and node +#' counts. Must have named items "N" (node counts) and "G0" (edge counts). #' @export -count_graphlet_tuples <- function(graph, max_graphlet_size) { - graph_node_count <- igraph::vcount(graph) - graphlet_key <- graphlet_key(max_graphlet_size) - graphlet_node_counts <- graphlet_key$node_count - graphlet_tuple_counts <- choose(graph_node_count, graphlet_node_counts) - graphlet_tuple_counts <- stats::setNames(graphlet_tuple_counts, graphlet_key$id) - graphlet_tuple_counts +density_from_counts <- function(graphlet_counts) { + graphlet_counts["G0"] / choose(graphlet_counts["N"], 2) +} + +#' Calculate ego network edge densities. +#' @param graphlet_counts Matrix of pre-generated graphlet, edge and node counts +#' (columns) for each ego network (rows). Columns must include "N" (node counts) +#' and "G0" (edge counts). +#' @export +ego_network_density <- function(graphlet_counts) { + apply(graphlet_counts, 1, density_from_counts) } +#' Scale graphlet counts for an ego network by the n choose k possible +#' choices of k nodes in that ego-network, where n is the number of nodes +#' in the ego network and k is the number of nodes in the graphlet. 
+#' +#' @param graphlet_counts Pre-calculated graphlet counts for each ego_network. +#' @param max_graphlet_size Determines the maximum size of graphlets included +#' in graphlet_counts. Currently only size 4 and 5 are supported. +#' @return scaled graphlet counts. +#' @export +scale_graphlet_counts_ego <- function(graphlet_counts, + max_graphlet_size) { + ego_graphlet_tuples <- count_graphlet_tuples_ego( + graphlet_counts, + max_graphlet_size = max_graphlet_size + ) + + scaled_graphlet_counts <- scale_graphlet_count( + graphlet_counts, + ego_graphlet_tuples + ) + + return(scaled_graphlet_counts) +} +#' For each graphlet calculate the number of possible sets of k nodes in the +#' query graph, where k is the number of nodes in the graphlet. +#' +#' @param graph_graphlet_counts Node and graphlet counts for a single graph. +#' @param max_graphlet_size Determines the maximum size of graphlets included +#' in the tuple counts. Currently only size 4 and 5 are supported. +#' @export +count_graphlet_tuples <- function(graph_graphlet_counts, max_graphlet_size) { + # extract node counts from graph_graphlet_counts + N <- graph_graphlet_counts["N"] + + graphlet_key <- graphlet_key(max_graphlet_size) + graphlet_node_counts <- graphlet_key$node_count + + graphlet_tuple_counts <- choose(N, graphlet_node_counts) + + graphlet_tuple_counts <- stats::setNames( + graphlet_tuple_counts, + graphlet_key$id + ) +} diff --git a/R/measures_net_emd.R b/R/measures_net_emd.R index b45b667e..8246dfeb 100755 --- a/R/measures_net_emd.R +++ b/R/measures_net_emd.R @@ -1,116 +1,136 @@ -#' NetEMDs between all graph pairs using provided Graphlet-based Degree -#' Distributions -#' @param gdds List containing sets of Graphlet-based Degree Distributions for -#' all graphs being compared -#' @param method The method to use to find the minimum EMD across all potential -#' offsets for each pair of histograms. 
Default is "optimise" to use -#' R's built-in \code{stats::optimise} method to efficiently find the offset -#' with the minimal EMD. However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the -#' histograms at all offsets that are candidates for the minimal EMD. -#' @param return_details Logical indicating whether to return the individual -#' minimal EMDs and associated offsets for all pairs of histograms -#' @param smoothing_window_width Width of "top-hat" smoothing window to apply to -#' "smear" point masses across a finite width in the real domain. Default is 0, -#' which results in no smoothing. Care should be taken to select a -#' \code{smoothing_window_width} that is appropriate for the discrete domain -#' (e.g.for the integer domain a width of 1 is the natural choice) -#' @param mc.cores Number of cores to use for parallel processing. Defaults to -#' the \code{mc.cores} option set in the R environment. -#' @return NetEMD measures between all pairs of graphs for which GDDs -#' were provided. Format of returned data depends on the \code{return_details} -#' parameter. If set to FALSE, a list is returned with the following named -#' elements:\code{net_emd}: a vector of NetEMDs for each pair of graphs, -#' \code{comp_spec}: a comaprison specification table containing the graph names -#' and indices within the input GDD list for each pair of graphs compared. 
-#' If \code{return_details} is set to FALSE, the list also contains the following -#' matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD -#' used to compute the NetEMD, \code{min_offsets}: the associated offsets giving -#' the minimal EMD for each GDD -#' @export -net_emds_for_all_graphs <- function( - gdds, method = "optimise", smoothing_window_width = 0, - return_details = FALSE, mc.cores = getOption("mc.cores", 2L)) { - comp_spec <- cross_comparison_spec(gdds) - # NOTE: mcapply only works on unix-like systems with system level forking - # capability. This means it will work on Linux and OSX, but not Windows. - # For now, we just revert to single threaded operation on Windows - # TODO: Look into using the parLappy function on Windows - if(.Platform$OS.type != "unix") { - # Force cores to 1 if system is not unix-like as it will not support - # forking - mc.cores = 1 - } - num_features <- length(gdds[[1]]) - out <- purrr::simplify(parallel::mcmapply(function(index_a, index_b) {net_emd( - gdds[[index_a]], gdds[[index_b]], method = method, return_details = return_details, - smoothing_window_width = smoothing_window_width) - }, comp_spec$index_a, comp_spec$index_b, SIMPLIFY = FALSE, mc.cores = mc.cores)) - if(return_details) { - net_emds <- purrr::simplify(purrr::map(out, ~.$net_emd)) - min_emds <- matrix(purrr::simplify(purrr::map(out, ~.$min_emds)), ncol = num_features, byrow = TRUE) - colnames(min_emds) <- purrr::simplify(purrr::map(1:num_features, ~paste("MinEMD_O", .-1, sep = ""))) - min_offsets <- matrix(purrr::simplify(purrr::map(out, ~.$min_offsets)), ncol = num_features, byrow = TRUE) - colnames(min_offsets) <- purrr::simplify(purrr::map(1:num_features, ~paste("MinOffsets_O", .-1, sep = ""))) - min_offsets_std <- matrix(purrr::simplify(purrr::map(out, ~.$min_offsets_std)), ncol = num_features, byrow = TRUE) - colnames(min_offsets_std) <- purrr::simplify(purrr::map(1:num_features, ~paste("MinOffsetsStd_O", .-1, sep = ""))) - ret <- 
list(net_emds = net_emds, comp_spec = comp_spec, min_emds = min_emds, min_offsets = min_offsets,min_offsets_std = min_offsets_std) - } else { - net_emds <- out - ret <- list(net_emds = net_emds, comp_spec = comp_spec) - } -} - -#' NetEMD Network Earth Mover's Distance -#' -#' Calculates the mean minimum Earth Mover's Distance (EMD) between two sets of -#' discrete histograms after normalising each histogram to unit mass and variance. +#' NetEMD Network Earth Mover's Distance between a pair of networks. +#' +#' Calculates the network Earth Mover's Distance (EMD) between +#' two sets of network features. This is done by individually normalising the distribution +#' of each feature so that they have unit mass and unit variance. Then the minimun EMD between the same pair of features (one for each corresponding graph) is calculated by considering all possible translations of the feature distributions. Finally the average over all features is reported. #' This is calculated as follows: -#' 1. Normalise each histogram to have unit mass and unit variance -#' 2. Find the minimum EMD between each pair of histograms -#' 3. Take the average minimum EMD across all histogram pairs -#' @param dhists1 A \code{dhist} discrete histogram object or a list of such objects -#' @param dhists2 A \code{dhist} discrete histogram object or a list of such objects -#' @param method The method to use to find the minimum EMD across all potential +#' 1. Normalise each feature histogram to have unit mass and unit variance. +#' 2. For each feature, find the minimum EMD between each pair of histograms considering all possible histogram translations. +#' 3. Take the average minimum EMD across all features. +#' @param graph_1 A network/graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{dhists_1} is provided. +#' @param graph_2 A network/graph object from the \code{igraph} package. 
\code{graph_2} can be set to \code{NULL} (default) if \code{dhists_2} is provided. +#' @param dhists_1 Either, a \code{dhist} discrete histogram object, or list of such objects, or a matrix of network features (each column representing a feature). \code{dhists_1} can be set to \code{NULL} (default) if \code{graph_1} is provided. A \code{dhist} object can be obtained from \code{graph_features_to_histograms}. +#' @param dhists_2 Same as \code{dhists_1}. +#' @param method The method to be used to find the minimum EMD across all potential #' offsets for each pair of histograms. Default is "optimise" to use -#' R's built-in \code{stats::optimise} method to efficiently find the offset -#' with the minimal EMD. However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the -#' histograms at all offsets that are candidates for the minimal EMD. +#' R's built-in \code{stats::optimise} method to efficiently find the offset +#' with the minimal EMD. However, this is not guaranteed to find the global +#' minimum if multiple local minima EMDs exist. You can alternatively specify the +#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' histograms at all offsets that are candidates for the minimal EMD at the cost of computational time. #' @param return_details Logical indicating whether to return the individual -#' minimal EMDs and associated offsets for all pairs of histograms +#' minimal EMDs and associated offsets for all pairs of histograms. #' @param smoothing_window_width Width of "top-hat" smoothing window to apply to -#' "smear" point masses across a finite width in the real domain. Default is 0, -#' which results in no smoothing. 
Care should be taken to select a -#' \code{smoothing_window_width} that is appropriate for the discrete domain -#' (e.g.for the integer domain a width of 1 is the natural choice) -#' @return NetEMD measure for the two sets of discrete histograms -#' (\code{return_details = FALSE}) or a list with the following named elements -#' \code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_emds}: +#' "smear" point masses across a finite width in the real domain. Default is 0, +#' which results in no smoothing. Care should be taken to select a +#' \code{smoothing_window_width} that is appropriate for the discrete domain +#' (e.g.for the integer domain a width of 1 is the natural choice). +#' @param feature_type Type of graphlet-based feature to count: "graphlet" +#' counts the number of graphlets each node participates in; "orbit" (default) calculates +#' the number of graphlet orbits each node participates in. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. Possible values are 4, and 5 (default). +#' @param ego_neighbourhood_size The number of steps from the source node to +#' include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0. +#' @return NetEMD measure for the two sets of discrete histograms (or graphs). 
If
+#' (\code{return_details = TRUE}) then a list with the following named elements is returned
+#' \code{net_emd}: the NetEMD for the set of histogram pairs (or graphs), \code{min_emds}:
 #' the minimal EMD for each pair of histograms, \code{min_offsets}: the associated
 #' offsets giving the minimal EMD for each pair of histograms
+#' @examples
+#' require(igraph)
+#' graph_1 <- graph.lattice(c(8,8))
+#' graph_2 <- graph.lattice(c(44,44))
+#' netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5)
+#'
+#' #Providing a matrix of network features
+#' props_a= count_orbits_per_node(graph = graph_1,max_graphlet_size = 5)
+#' props_b= count_orbits_per_node(graph = graph_2,max_graphlet_size = 5)
+#'
+#' netemd_one_to_one(dhists_1=props_a, dhists_2=props_b,smoothing_window_width = 1)
+#'
+#' #Providing the network features as lists of dhist objects
+#' dhists_1<- graph_features_to_histograms(props_a)
+#' dhists_2<- graph_features_to_histograms(props_b)
+#'
+#' netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2)
+#'
+#'
+#' # A variation of NetEmd: Using the Laplacian spectrum
+#' #Laplacian
+#' Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE)
+#' Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE)
+#'
+#' #Normalized Laplacian
+#' NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE)
+#' NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE)
+#'
+#' #Spectra (This may take a couple of minutes).
+#' props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values)
+#' props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values)
+#'
+#' netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions.
If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +#' #' @export -net_emd <- function(dhists1, dhists2, method = "optimise", - return_details = FALSE, smoothing_window_width = 0) { +netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2=NULL, method = "optimise", + return_details = FALSE, smoothing_window_width = 0,feature_type="orbit",max_graphlet_size = 5,ego_neighbourhood_size = 0) { + ## ------------------------------------------------------------------------ + # Check arguments 1 + if (!igraph::is.igraph(graph_1) & is.null(dhists_1)) { + stop("One of graph_1 or dhists_1 must be supplied.") + } + if (!igraph::is.igraph(graph_2) & is.null(dhists_2)) { + stop("One of graph_2 or dhists_2 must be supplied.") + } + ## ------------------------------------------------------------------------ + # Check arguments 2 + # If dhists_1 is a matrix of network features then transform them to dhist objects. + if(is.matrix(dhists_1)){ + dhists_1 <- graph_features_to_histograms(dhists_1) + } + if(is.matrix(dhists_2)){ + dhists_2 <- graph_features_to_histograms(dhists_2) + } + ## ------------------------------------------------------------------------ + # Check arguments 3 + #If input is graph then get graphlet counts + if(igraph::is.igraph(graph_1)){ + if(!is.null(dhists_1)){warning("dhists_1 will be calculated from graph_1.")} + dhists_1 <- gdd(graph = graph_1, feature_type = feature_type, + max_graphlet_size = max_graphlet_size, + ego_neighbourhood_size = ego_neighbourhood_size + ) + } + if(igraph::is.igraph(graph_2)){ + if(!is.null(dhists_2)){warning("dhists_2 will be calculated from graph_2.")} + dhists_2 <- gdd(graph = graph_2, feature_type = feature_type, + max_graphlet_size = max_graphlet_size, + ego_neighbourhood_size = ego_neighbourhood_size + ) + } + + rm(graph_1,graph_2) + ## ------------------------------------------------------------------------ # Require either a pair of "dhist" 
discrete histograms or two lists of "dhist" # discrete histograms - pair_of_dhist_lists <- all(purrr::map_lgl(dhists1, is_dhist)) && all(purrr::map_lgl(dhists2, is_dhist)) + pair_of_dhist_lists <- all(purrr::map_lgl(dhists_1, is_dhist)) && all(purrr::map_lgl(dhists_2, is_dhist)) # If input is two lists of "dhist" discrete histograms, determine the minimum - # EMD and associated offset for pairs of histograms taken from the same + # EMD and associated offset for pairs of histograms taken from the same # position in each list - if(pair_of_dhist_lists) { - details <- purrr::map2(dhists1, dhists2, function(dhist1, dhist2) { - net_emd_single_pair(dhist1, dhist2, method = method, - smoothing_window_width = smoothing_window_width) - }) + if (pair_of_dhist_lists) { + details <- purrr::map2(dhists_1, dhists_2, function(dhist1, dhist2) { + netemd_single_pair(dhist1, dhist2, + method = method, + smoothing_window_width = smoothing_window_width + ) + }) # Collect the minimum EMDs and associated offsets for all histogram pairs min_emds <- purrr::simplify(purrr::transpose(details)$min_emd) min_offsets <- purrr::simplify(purrr::transpose(details)$min_offset) min_offsets_std <- purrr::simplify(purrr::transpose(details)$min_offset_std) - # The NetEMD is the arithmetic mean of the minimum EMDs for each pair of + # The NetEMD is the arithmetic mean of the minimum EMDs for each pair of # histograms arithmetic_mean <- sum(min_emds) / length(min_emds) net_emd <- arithmetic_mean @@ -118,8 +138,8 @@ net_emd <- function(dhists1, dhists2, method = "optimise", # the minumum EMD and associated offsets for the individual histograms # Note that the offsets represent shifts after the histograms have been # scaled to unit variance - if(return_details) { - return(list(net_emd = net_emd, min_emds = min_emds, min_offsets = min_offsets,min_offsets_std=min_offsets_std)) + if (return_details) { + return(list(net_emd = net_emd, min_emds = min_emds, min_offsets = min_offsets, min_offsets_std = 
min_offsets_std)) } else { return(arithmetic_mean) } @@ -127,24 +147,188 @@ net_emd <- function(dhists1, dhists2, method = "optimise", else { # Wrap each member of a single pair of histograms is a list and recursively # call this net_emd function. This ensures they are treated the same. - return(net_emd(list(dhists1), list(dhists2), method = method, - return_details = return_details, - smoothing_window_width = smoothing_window_width)) + return(netemd_one_to_one(dhists_1 = list(dhists_1), dhists_2 = list(dhists_2), + method = method, + return_details = return_details, + smoothing_window_width = smoothing_window_width + )) + } +} + + +#' NetEMDs between all graph pairs using provided Graphlet-based Degree +#' Distributions +#' @param graphs A list of network/graph objects from the \code{igraph} package. \code{graphs} can be set to \code{NULL} (default) if \code{dhists} is provided. +#' @param dhists A list whose elements contain either: A list of \code{dhist} discrete histogram objects for each graph, or a list a matrix of network features (each column representing a feature). \code{dhists} can be set to \code{NULL} (default) if \code{graphs} is provided. A \code{dhist} object can be obtained from \code{graph_features_to_histograms}. +#' @param method The method to use to find the minimum EMD across all potential +#' offsets for each pair of histograms. Default is "optimise" to use +#' R's built-in \code{stats::optimise} method to efficiently find the offset +#' with the minimal EMD. However, this is not guaranteed to find the global +#' minimum if multiple local minima EMDs exist. You can alternatively specify the +#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' histograms at all offsets that are candidates for the minimal EMD. 
+#' @param return_details Logical indicating whether to return the individual +#' minimal EMDs and associated offsets for all pairs of histograms +#' @param smoothing_window_width Width of "top-hat" smoothing window to apply to +#' "smear" point masses across a finite width in the real domain. Default is 0, +#' which results in no smoothing. Care should be taken to select a +#' \code{smoothing_window_width} that is appropriate for the discrete domain +#' (e.g.for the integer domain a width of 1 is the natural choice). +#' @param mc.cores Number of cores to use for parallel processing. Defaults to +#' the \code{mc.cores} option set in the R environment. +#' @param feature_type Type of graphlet-based feature to count: "graphlet" +#' counts the number of graphlets each node participates in; "orbit" (default) calculates +#' the number of graphlet orbits each node participates in. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. Possible values are 4, and 5 (default). +#' @param ego_neighbourhood_size The number of steps from the source node to +#' include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0. +#' @return NetEMD measures between all pairs of graphs for which features +#' were provided. Format of returned data depends on the \code{return_details} +#' parameter. If set to FALSE, a list is returned with the following named +#' elements:\code{net_emd}: a vector of NetEMDs for each pair of graphs, +#' \code{comp_spec}: a comparison specification table containing the graph names +#' and indices within the input GDD list for each pair of graphs compared. 
+#' If \code{return_details} is set to TRUE, the list also contains the following
+#' matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD
+#' used to compute the NetEMD, \code{min_offsets}: the associated offsets giving
+#' the minimal EMD for each GDD
+#' @export
+netemd_many_to_many<- function(graphs=NULL,dhists=NULL, method = "optimise", smoothing_window_width = 0,
+                               return_details = FALSE, mc.cores = getOption("mc.cores", 2L),feature_type="orbit",max_graphlet_size = 5,ego_neighbourhood_size = 0) {
+  if(max_graphlet_size > 4 & mc.cores > 1) print(paste("This function will compute orbits of graphlets up to size 5 using ", mc.cores," cores. Depending on the density and size of the graphs, this may lead to a large consumption of RAM."))
+
+  # NOTE: mcapply only works on unix-like systems with system level forking
+  # capability. This means it will work on Linux and OSX, but not Windows.
+  # For now, we just revert to single threaded operation on Windows
+  # TODO: Look into using the parLappy function on Windows
+  if (.Platform$OS.type != "unix") {
+    # Force cores to 1 if system is not unix-like as it will not support
+    # forking
+    mc.cores <- 1
+  }
+  ## ------------------------------------------------------------------------
+  # Check arguments 1
+  which_imput_type <- NULL
+  if(!is.null(graphs) & is.null(dhists)){
+    if ( !all(( unlist(sapply(X = graphs, FUN = igraph::is.igraph)) ) ) ) {
+      stop("Graphs need to be igraph graph objects, or a list of dhists network features should be supplied.")
+    }
+    which_imput_type <- "Graphs"
+  }
+  if (!is.null(dhists) ) {
+    if (all(( unlist(sapply(X = dhists, FUN = is.matrix)) ) ) ) {
+      which_imput_type <- "Matrix"
+    }
+    if ( all(( unlist(sapply(X = dhists, FUN =
+                               function(l){ all(( unlist(sapply(X = l, FUN = is_dhist)) ) ) }
+    )) ) ) ) {
+      which_imput_type <- "dhist"
+    }
+    if(is.null(which_imput_type)){
+      warning("dhists does not conform to a Matrix or dhist class for all elements/networks in the
list.") + } + } + ## ------------------------------------------------------------------------ + # Check arguments 2 + # If dhists is a list of matrices of network features then transform them to dhist objects. + if(which_imput_type == "Matrix"){ + dhists <- sapply(X = dhists,FUN = graph_features_to_histograms, simplify = FALSE ) + } + ## ------------------------------------------------------------------------ + # Check arguments 3 + #If input is graph then get graphlet counts + if(which_imput_type == "Graphs"){ + dhists <- parallel::mcmapply(gdd, graphs, + MoreArgs = + list( + feature_type = feature_type, + max_graphlet_size = max_graphlet_size, + ego_neighbourhood_size = ego_neighbourhood_size + ), + SIMPLIFY = FALSE, mc.cores = mc.cores + ) + } + rm(graphs) + ## ------------------------------------------------------------------------ + # Check arguments 4 + #cross_comparison_spec was coded to require names! + if(is.null(names(dhists))){ + names(dhists) <- paste("Net",1:length(dhists),sep = "") + } + ## ------------------------------------------------------------------------ + comp_spec <- cross_comparison_spec(dhists) + num_features <- length(dhists[[1]]) + out <- purrr::simplify(parallel::mcmapply(function(index_a, index_b) { + netemd_one_to_one(dhists_1 = dhists[[index_a]], dhists_2 = dhists[[index_b]], + method = method, return_details = return_details, + smoothing_window_width = smoothing_window_width + ) + }, comp_spec$index_a, comp_spec$index_b, SIMPLIFY = FALSE, mc.cores = mc.cores)) + if (return_details) { + net_emds <- purrr::simplify(purrr::map(out, ~ .$net_emd)) + min_emds <- matrix(purrr::simplify(purrr::map(out, ~ .$min_emds)), ncol = num_features, byrow = TRUE) + colnames(min_emds) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinEMD_O", . 
- 1, sep = ""))) + min_offsets <- matrix(purrr::simplify(purrr::map(out, ~ .$min_offsets)), ncol = num_features, byrow = TRUE) + colnames(min_offsets) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinOffsets_O", . - 1, sep = ""))) + min_offsets_std <- matrix(purrr::simplify(purrr::map(out, ~ .$min_offsets_std)), ncol = num_features, byrow = TRUE) + colnames(min_offsets_std) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinOffsetsStd_O", . - 1, sep = ""))) + ret <- list(netemds = net_emds, comp_spec = comp_spec, min_emds = min_emds, min_offsets = min_offsets, min_offsets_std = min_offsets_std) + } else { + net_emds <- out + ret <- list(netemds = net_emds, comp_spec = comp_spec) } + return(ret) } -net_emd_single_pair <- function(dhist1, dhist2, method = "optimise", - smoothing_window_width = 0) { - # Present dhists as smoothed or unsmoothed histograms depending on the value +#' Internal function to compute the minimum Earth Mover's Distance between standarized and translated histograms +#' +#' Calculates the minimum Earth Mover's Distance (EMD) between two +#' discrete histograms after normalising each histogram to unit mass and variance. +#' This is calculated as follows: +#' 1. Normalise each histogram to have unit mass and unit variance +#' 2. Find the minimum EMD between the histograms +#' @param dhists_1 A \code{dhist} discrete histogram object or a list of such objects +#' @param dhists_2 A \code{dhist} discrete histogram object or a list of such objects +#' @param method The method to use to find the minimum EMD across all potential +#' offsets for each pair of histograms. Default is "optimise" to use +#' R's built-in \code{stats::optimise} method to efficiently find the offset +#' with the minimal EMD. However, this is not guaranteed to find the global +#' minimum if multiple local minima EMDs exist. 
You can alternatively specify the +#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' histograms at all offsets that are candidates for the minimal EMD. +#' @param smoothing_window_width Width of "top-hat" smoothing window to apply to +#' "smear" point masses across a finite width in the real domain. Default is 0, +#' which results in no smoothing. Care should be taken to select a +#' \code{smoothing_window_width} that is appropriate for the discrete domain +#' (e.g.for the integer domain a width of 1 is the natural choice) +#' @return A list with the following named elements +#' \code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_offsets}: the associated +#' offsets giving the minimal EMD for each pair of histograms and \code{min_offset_std}: Offset used in the standardised histograms. +#' @examples +#' require(igraph) +#' goldstd_1 <- graph.lattice(c(8,8)) +#' goldstd_2 <- graph.lattice(c(44,44)) +#' props_1 <- count_orbits_per_node(graph = goldstd_1,max_graphlet_size = 5) +#' props_2 <- count_orbits_per_node(graph = goldstd_2,max_graphlet_size = 5) +#' dhists_1<- graph_features_to_histograms(props_1) +#' dhists_2<- graph_features_to_histograms(props_2) +#' # Obtain the minimum NetEMD_edges between the histograms +#' netemd_single_pair(dhists_1[[1]],dhists_2[[1]],method = "optimise",smoothing_window_width = 0) +#' @export +netemd_single_pair <- function(dhist1, dhist2, method = "optimise", + smoothing_window_width = 0) { + # Present dhists as smoothed or unsmoothed histograms depending on the value # of smoothing_window_width - # NOTE: This MUST be done prior to any variance normalisation as the - # calculation of variance differs depending on whether or not the histograms - # are smoothed (i.e. we need to ensure that the smoothing_window_width + # NOTE: This MUST be done prior to any variance normalisation as the + # calculation of variance differs depending on whether or not the histograms + # are smoothed (i.e. 
we need to ensure that the smoothing_window_width # attribute of the dhists is set to the smoothing_window_width parameter # provided by the caller) - # TODO: Consider moving the smoothing of histograms outside to the user's + # TODO: Consider moving the smoothing of histograms outside to the user's # calling code. It feels a bit untidy in here. - if(smoothing_window_width == 0) { + if (smoothing_window_width == 0) { dhist1 <- as_unsmoothed_dhist(dhist1) dhist2 <- as_unsmoothed_dhist(dhist2) } else { @@ -159,15 +343,15 @@ net_emd_single_pair <- function(dhist1, dhist2, method = "optimise", var1 <- dhist_variance(dhist1) var2 <- dhist_variance(dhist2) - # Mean centre histograms. This helps with numerical stability as, after + # Mean centre histograms. This helps with numerical stability as, after # variance normalisation, the differences between locations are often small. # We want to avoid calculating small differences between large numbers as # floating point precision issues can result in accumulating inaccuracies. # Mean-centering histograms results in variance normalised locations being # clustered around zero, rather than some potentially large mean location. - dhist1<-mean_centre_dhist(dhist1) - dhist2<-mean_centre_dhist(dhist2) - + dhist1 <- mean_centre_dhist(dhist1) + dhist2 <- mean_centre_dhist(dhist2) + # Normalise histogram to unit mass and unit variance dhist1_norm <- normalise_dhist_variance(normalise_dhist_mass(dhist1)) dhist2_norm <- normalise_dhist_variance(normalise_dhist_mass(dhist2)) @@ -175,10 +359,10 @@ net_emd_single_pair <- function(dhist1, dhist2, method = "optimise", # Calculate minimal EMD result <- min_emd(dhist1_norm, dhist2_norm, method = method) # As we mean-centred the histograms prior to passing to min_emd(), the offset - # returned is not the "true" offset for the supplied histograms. We report + # returned is not the "true" offset for the supplied histograms. We report # this as the "standardised" offset. 
- result$min_offset_std <- result$min_offset - # We report the "true" offset as the offset with no mean-centring, so need to + result$min_offset_std <- result$min_offset + # We report the "true" offset as the offset with no mean-centring, so need to # adjust to reverse the earlier mean-centring result$min_offset <- result$min_offset + mean2 - mean1 return(result) diff --git a/R/net_emd_speed_benchmark.R b/R/net_emd_speed_benchmark.R index 757811e8..922a2d03 100644 --- a/R/net_emd_speed_benchmark.R +++ b/R/net_emd_speed_benchmark.R @@ -1,35 +1,33 @@ -#' @export -netEMDSpeedTest <- function() -{ - ##load the data +netEMDSpeedTest <- function() { + ## load the data source_dir <- system.file(file.path("extdata", "random"), package = "netdist") print(source_dir) - edge_format = "ncol" - file_pattern = "" + edge_format <- "ncol" + file_pattern <- "" # source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # edge_format = "ncol" # file_pattern = ".txt" graphs <- read_simple_graphs(source_dir = source_dir, format = edge_format, pattern = file_pattern) - n1=names(graphs) - lab1=c() - gddBuildTime=c() - netEMDtime=c() + n1 <- names(graphs) + lab1 <- c() + gddBuildTime <- c() + netEMDtime <- c() for (i in 1:length(graphs)) { for (j in 1:(i)) { - g1=graphs[[i]] - g2=graphs[[j]] - lab1=append(lab1,paste(n1[i],n1[j],sep=',')) - print(paste(n1[i],n1[j],sep=',')) - fulltimeStart=Sys.time() - gdd1=gdd(g1) - gdd2=gdd(g2) - netEMDStart=Sys.time() - net_emd(gdd1,gdd2) - endTime=Sys.time() - gddBuildTime=append(gddBuildTime,as.double(netEMDStart-fulltimeStart)) - netEMDtime=append(netEMDtime,as.double(endTime-netEMDStart)) + g1 <- graphs[[i]] + g2 <- graphs[[j]] + lab1 <- append(lab1, paste(n1[i], n1[j], sep = ",")) + print(paste(n1[i], n1[j], sep = ",")) + fulltimeStart <- Sys.time() + gdd1 <- gdd(g1) + gdd2 <- gdd(g2) + netEMDStart <- Sys.time() + net_emd(gdd1, gdd2) + endTime <- Sys.time() + gddBuildTime <- append(gddBuildTime, as.double(netEMDStart - 
fulltimeStart)) + netEMDtime <- append(netEMDtime, as.double(endTime - netEMDStart)) } } list(gddBuildTime = gddBuildTime, netEMDtime = netEMDtime) @@ -70,4 +68,4 @@ netEMDSpeedTestSmooth <- function() } } list(gddBuildTime = gddBuildTime, netEMDtime = netEMDtime) -} \ No newline at end of file +} diff --git a/R/netdist_package.R b/R/netdist_package.R index b5f3704a..afc198d9 100644 --- a/R/netdist_package.R +++ b/R/netdist_package.R @@ -1,3 +1,3 @@ #' @useDynLib netdist, .registration=TRUE #' @importFrom Rcpp sourceCpp -NULL \ No newline at end of file +NULL diff --git a/R/orca_interface.R b/R/orca_interface.R index 191fe8d8..130595a9 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -1,9 +1,9 @@ #' Integer index edge list from igraph -#' +#' #' Takes a igraph graph object and generates an edgelist where each edge is #' represented by the integer indexes of its vertices. Note that, where a graph #' has isolated vertices, the indexes for these vertices will not be present -#' in the edge list. Where a graph has no isolated vertices, the edge list will +#' in the edge list. Where a graph has no isolated vertices, the edge list will #' include all vertex indexes from 1 to numVertices. 
#' @param graph An igraph graph object #' @return A 2 x numEdges edgelist with vertices labelled with integer indices @@ -11,7 +11,7 @@ #' the label for the vertice represented by index N in the edgelist #' @export graph_to_indexed_edges <- function(graph) { - # Use igraph method to get edge list with edges specified using vertex ID + # Use igraph method to get edge list with edges specified using vertex ID # (indexes) rather than names edges <- igraph::get.edgelist(graph, names = FALSE) # Convert edge list from numeric to integer @@ -23,150 +23,176 @@ graph_to_indexed_edges <- function(graph) { } #' Graph from integer index edge list -#' -#' Takes an integer indexed edgelist (where each edge is represented by the +#' +#' Takes an integer indexed edgelist (where each edge is represented by the #' integer indexes of its vertices) and converts it to an igraph format graph. #' If the edge list has a "vertex_names" attribute, this will be used to name #' the vertices in the resultant graph. -#' @param indexed_edges A 2 x numEdges edgelist with vertices labelled with +#' @param indexed_edges A 2 x numEdges edgelist with vertices labelled with #' integer indices, with an optional "vertex_names" attribute #' @return An igraph graph object #' @export indexed_edges_to_graph <- function(indexed_edges) { graph <- igraph::graph_from_edgelist(indexed_edges) - graph <- igraph::set.vertex.attribute(graph, name = "name", value = attr(indexed_edges, "vertex_names")) + graph <- igraph::set.vertex.attribute( + graph, + name = "name", + value = attr(indexed_edges, "vertex_names") + ) return(graph) } #' Read all graphs in a directory, simplifying as requested -#' +#' #' Reads graph data from all files in a directory matching the specified #' filename pattern. 
From each file, an a igraph graph object is constructed -#' and the requested subset of the following simplifications is made in the +#' and the requested subset of the following simplifications is made in the #' following order: #' 1. Makes the graph undirected #' 2. Removes loops (where both endpoints of an edge are the same vertex) -#' 3. Removes multiple edges (i.e. ensuring only one edge exists for each +#' 3. Removes multiple edges (i.e. ensuring only one edge exists for each #' pair of endpoints) -#' 4. Removes isolated vertices (i.e. vertices with no edges after the +#' 4. Removes isolated vertices (i.e. vertices with no edges after the #' previous alterations) #' @param source_dir Path to directory containing files with graph data -#' @param format Format of graph data. Any format supported by +#' @param format Format of graph data. Any format supported by #' \code{igraph::read_graph} can be used. #' @param pattern Pattern to use to filter filenames. Any pattern supported by #' \code{dir} can be used. #' @param as_undirected If TRUE make graph edges undirected #' @param remove_loops If TRUE, remove edgeds that connect a vertex to itself -#' @param remove_multiple If TRUE remove multiple edges connencting the same +#' @param remove_multiple If TRUE remove multiple edges connencting the same #' pair of vertices -#' @param remove_isolates If TRUE, remove vertices with no edges after the +#' @param remove_isolates If TRUE, remove vertices with no edges after the #' previous alterations have been made #' @return A named list of simplified igraph graph object, with the name of each #' graph set to the name of the file it was read from. +#' @examples +#' # Set source directory for Virus protein-protein interaction edge files stored in the netdist package. 
+#' source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +#' print(source_dir) +#' # Load query graphs as igraph objects +#' graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), format = "ncol") +#' graph_1 #' @export -read_simple_graphs <- function(source_dir, format = "ncol", pattern = "*", - as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE) { +read_simple_graphs <- function(source_dir, + format = "ncol", + pattern = "*", + as_undirected = TRUE, + remove_loops = TRUE, + remove_multiple = TRUE, + remove_isolates = TRUE) { # Get list of all filenames in directory that match the pattern file_names <- dir(source_dir, pattern = pattern) - # Read graph data from each matched file as an igraph format graph, + # Read graph data from each matched file as an igraph format graph, # simplifying as requested - graphs <- purrr::map(file_names, function(file_name) { - read_simple_graph(file = file.path(source_dir, file_name), format = format, - as_undirected = as_undirected, remove_loops = remove_loops, - remove_multiple = remove_multiple, - remove_isolates = remove_isolates) - }) - # Perform any requested simplifications graphs <- purrr::map( - graphs, simplify_graph, as_undirected = as_undirected, - remove_loops = remove_loops, remove_multiple = remove_multiple, - remove_isolates = remove_isolates) + file_names, + function(file_name) { + read_simple_graph( + file = file.path(source_dir, file_name), + format = format, + as_undirected = as_undirected, + remove_loops = remove_loops, + remove_multiple = remove_multiple, + remove_isolates = remove_isolates + ) + } + ) + # Name each graph with the name of the file it was read from (with any # extension moved) - names <- purrr::simplify(purrr::map(strsplit(file_names, "\\."), - function(s) { - if(length(s) == 1) { - s - } else { - paste(utils::head(s, -1), collapse = ".") - } - })) + names <- purrr::simplify( + purrr::map( + strsplit(file_names, "\\."), + 
function(s) { + if (length(s) == 1) { + s + } else { + paste(utils::head(s, -1), collapse = ".") + } + } + ) + ) attr(graphs, "names") <- names return(graphs) } #' Read a graph from file, simplifying as requested -#' +#' #' Reads graph data from file, constructing an a igraph graph object, making the #' requested subset of the following simplifications in the following order: #' 1. Makes the graph undirected #' 2. Removes loops (where both endpoints of an edge are the same vertex) -#' 3. Removes multiple edges (i.e. ensuring only one edge exists for each +#' 3. Removes multiple edges (i.e. ensuring only one edge exists for each #' pair of endpoints) -#' 4. Removes isolated vertices (i.e. vertices with no edges after the -#' previous alterations) +#' 4. Removes isolated vertices (i.e. vertices with no edges after the +#' previous alterations). #' @param file Path to file containing graph data -#' @param format Format of graph data. All formats supported by +#' @param format Format of graph data. All formats supported by #' \code{igraph::read_graph} are supported. #' @param as_undirected If TRUE make graph edges undirected #' @param remove_loops If TRUE, remove edgeds that connect a vertex to itself -#' @param remove_multiple If TRUE remove multiple edges connencting the same +#' @param remove_multiple If TRUE remove multiple edges connencting the same #' pair of vertices -#' @param remove_isolates If TRUE, remove vertices with no edges after the +#' @param remove_isolates If TRUE, remove vertices with no edges after the #' previous alterations have been made #' @return A simplified igraph graph object #' @export -read_simple_graph <- function(file, format, as_undirected = TRUE, - remove_loops = TRUE, remove_multiple = TRUE, +read_simple_graph <- function(file, format, as_undirected = TRUE, + remove_loops = TRUE, remove_multiple = TRUE, remove_isolates = TRUE) { # Read graph from file. 
NOTE: igraph only supported the "directed" argument - # for some formats, but passes it to formats that don't support it, which + # for some formats, but passes it to formats that don't support it, which # then throw an error - if(format %in% c("edgelist", "ncol", "lgl", "dimacs", "dl")) { + if (format %in% c("edgelist", "ncol", "lgl", "dimacs", "dl")) { graph <- igraph::read_graph(file = file, format = format, directed = TRUE) } else { graph <- igraph::read_graph(file = file, format = format) } # Perform any requested simplifications - simplify_graph(graph, as_undirected = as_undirected, - remove_loops = remove_loops, remove_multiple = remove_multiple, - remove_isolates = remove_isolates) + simplify_graph(graph, + as_undirected = as_undirected, + remove_loops = remove_loops, remove_multiple = remove_multiple, + remove_isolates = remove_isolates + ) } #' Simplify an igraph -#' +#' #' Takes a igraph graph object and makes the requested subset of the following #' simplifications in the following order: #' 1. Makes the graph undirected #' 2. Removes loops (where both endpoints of an edge are the same vertex) -#' 3. Removes multiple edges (i.e. ensuring only one edge exists for each +#' 3. Removes multiple edges (i.e. ensuring only one edge exists for each #' pair of endpoints) -#' 4. Removes isolated vertices (i.e. vertices with no edges after the +#' 4. Removes isolated vertices (i.e. 
vertices with no edges after the #' previous alterations) #' @param graph An graph or list of graphs in igraph format #' @param as_undirected If TRUE make graph edges undirected #' @param remove_loops If TRUE, remove edgeds that connect a vertex to itself -#' @param remove_multiple If TRUE remove multiple edges connencting the same +#' @param remove_multiple If TRUE remove multiple edges connencting the same #' pair of vertices -#' @param remove_isolates If TRUE, remove vertices with no edges after the +#' @param remove_isolates If TRUE, remove vertices with no edges after the #' previous alterations have been made #' @return A simplified igraph graph object #' @export -simplify_graph <- function(graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE) { - if(as_undirected) { +simplify_graph <- function(graph, as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE) { + if (as_undirected) { # Ensure graph is undirected graph <- igraph::as.undirected(graph, mode = "each") } - if(remove_loops || remove_multiple) { - # Remove loops (where both endpoints of an edge are the same vertex) and + if (remove_loops || remove_multiple) { + # Remove loops (where both endpoints of an edge are the same vertex) and # multiple edges (where two edges have the same endpoints [in the same order # for directed graphs]) - graph <- igraph::simplify(graph, remove.loops = remove_loops, - remove.multiple = remove_multiple) + graph <- igraph::simplify(graph, + remove.loops = remove_loops, + remove.multiple = remove_multiple + ) } - if(remove_isolates) { + if (remove_isolates) { # Remove vertices that have no edges connecting them to other vertices # NOTE: Vertices that only connect to themselves will only be removed if # their self-connecting edges have been removed by setting remove_loops to @@ -177,84 +203,88 @@ simplify_graph <- function(graph, as_undirected = TRUE, remove_loops = TRUE, return(graph) } -#' 
Convert a matrix of node level features to a discrete histogram for each feature -#' -#' Converts a matrix of node level features (e.g. for ORCA output this is counts -#' of each graphlet or orbit at each graph vertex) to -#' a set of discrete histograms (a histogram of counts for each distinct value -#' across all graph vertices for each feature with no binning) -#' @param A number of nodes (rows) by number of features (columns) matrix, where -#' the ij entry is the score of node i on feature j (e.g. for ORCA output this is -#' counts of each graphlet or orbit at each graph vertex) -#' @return Feature histograms: List of discrete histograms for each -#' feature +#' Convert a matrix of node level features to a "discrete histogram" for +#' each feature. +#' +#' Converts a matrix of node level features (e.g. counts +#' of multiple graphlets or orbits at each node) to +#' a set of histogram like objects (observed frequency distribution of each feature/column) +#' @param features_matrix A matrix whose rows represent nodes and whose columns represent different node level features. This means that entry ij provides the value of feature j for node i. +#' @return Feature histograms: List of "discrete histograms" for each +#' feature #' @export -graph_features_to_histograms <- function(featuresMatrix) { - apply(featuresMatrix, 2, dhist_from_obs) +graph_features_to_histograms <- function(features_matrix) { + apply(features_matrix, 2, dhist_from_obs) } -graph_features_to_histogramsSLOW <- function(featuresMatrix) { - apply(featuresMatrix, 2, dhist_from_obsSLOW) +graph_features_to_histogramsSLOW <- function(features_matrix) { + apply(features_matrix, 2, dhist_from_obsSLOW) } #' Graphlet-based degree distributions (GDDs) -#' -#' Generates graphlet-based degree distributions from \code{igraph} graph object, +#' +#' Short-cut function to create graphlet-based degree distributions from \code{igraph} graph object #' using the ORCA fast graphlet orbit counting package. 
-#' @param graph A connected, undirected, simple graph as an \code{igraph} object. +#' @param graph A connected, undirected, simple graph as an \code{igraph} object #' @param feature_type Type of graphlet-based feature to count: "graphlet" #' counts the number of graphlets each node participates in; "orbit" calculates #' the number of graphlet orbits each node participates in. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. -#' @param ego_neighbourhood_size The number of steps from the source node to include -#' nodes for each ego-network. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. Currently only size 4 and 5 are supported. +#' @param ego_neighbourhood_size The number of steps from the source node used to select the +#' neighboring nodes to be included in the source node ego-network. #' @return List of graphlet-based degree distributions, with each distribution #' represented as a \code{dhist} discrete histogram object. 
#' @export -gdd <- function(graph, feature_type = 'orbit', max_graphlet_size = 4, - ego_neighbourhood_size = 0){ +gdd <- function(graph, feature_type = "orbit", max_graphlet_size = 4, + ego_neighbourhood_size = 0) { graph <- simplify_graph(graph) - if(ego_neighbourhood_size > 0) { - if(feature_type != 'graphlet') { + if (ego_neighbourhood_size > 0) { + if (feature_type != "graphlet") { stop("Feature type not supported for ego-networks") } else { - out <- count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = ego_neighbourhood_size) + out <- count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = ego_neighbourhood_size + ) } - } else if(feature_type == "orbit") { + } else if (feature_type == "orbit") { out <- count_orbits_per_node(graph, max_graphlet_size = max_graphlet_size) - } else if(feature_type == "graphlet") { - out <- count_graphlets_per_node(graph, max_graphlet_size = max_graphlet_size) + } else if (feature_type == "graphlet") { + out <- count_graphlets_per_node(graph, + max_graphlet_size = max_graphlet_size + ) } else { - stop('gdd: unrecognised feature_type') + stop("gdd: unrecognised feature_type") } graph_features_to_histograms(out) } #' Count graphlet orbits for each node in a graph -#' -#' Calculates graphlet orbit counts for each node in an \code{igraph} graph -#' object, using the ORCA fast graphlet orbit counting package. -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. +#' +#' Calculates graphlet orbit counts for each node in an \code{igraph} graph +#' object, using the \code{orca} fast graphlet orbit counting package. +#' @param graph An undirected, simple graph as an \code{igraph} object. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. 
+#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. Currently only size 4 and 5 are supported. #' @return ORCA-format matrix containing counts of each graphlet #' orbit (columns) at each node in the graph (rows). #' @export count_orbits_per_node <- function(graph, max_graphlet_size) { - if(max_graphlet_size == 4) { + if (max_graphlet_size == 4) { orca_fn <- orca::count4 - } else if(max_graphlet_size == 5) { + } else if (max_graphlet_size == 5) { orca_fn <- orca::count5 } else { stop("Unsupported maximum graphlet size") } indexed_edges <- graph_to_indexed_edges(graph) num_edges <- dim(indexed_edges)[[1]] - if(num_edges >= 1) { + if (num_edges >= 1) { orbit_counts <- orca_fn(indexed_edges) } else { # ORCA functions expect at least one edge, so handle this case separately @@ -270,31 +300,35 @@ count_orbits_per_node <- function(graph, max_graphlet_size) { } #' Count graphlets for each node in a graph -#' -#' Calculates graphlet counts for each node in an \code{igraph} graph object, -#' using the ORCA fast graphlet orbit counting package. by summing orbits over +#' +#' Calculates graphlet counts for each node in an \code{igraph} graph object, +#' using the ORCA fast graphlet orbit counting package. by summing orbits over #' graphlets. -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. -#' @return ORCA-format matrix containing counts of each graphlet (columns) at +#' @param graph A connected, undirected, simple graph as an \code{igraph} object +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. Currently only size 4 and 5 are supported. 
+#' @return ORCA-format matrix containing counts of each graphlet (columns) at #' each node in the graph (rows). #' @export count_graphlets_per_node <- function(graph, max_graphlet_size) { - orbit_counts <- count_orbits_per_node(graph, max_graphlet_size = max_graphlet_size) + orbit_counts <- count_orbits_per_node(graph, + max_graphlet_size = max_graphlet_size + ) orbit_to_graphlet_counts(orbit_counts) } #' Count total number of graphlets in a graph -#' -#' Calculates total graphlet counts for a \code{igraph} graph object using the +#' +#' Calculates total graphlet counts for a \code{igraph} graph object using the #' ORCA fast graphlet orbit counting package. Per-node graphlet counts are #' calculated by summing orbits over graphlets. These are then divided by the -#' number of nodes comprising each graphlet to avoid counting the same graphlet +#' number of nodes comprising each graphlet to avoid counting the same graphlet #' multiple times. -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. +#' @param graph A connected, undirected, simple graph as an \code{igraph} object +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. Currently only size 4 and 5 are supported. #' @return Vector containing counts of each graphlet for the graph. 
#' @export count_graphlets_for_graph <- function(graph, max_graphlet_size) { @@ -302,52 +336,98 @@ count_graphlets_for_graph <- function(graph, max_graphlet_size) { # Sum graphlet counts over all nodes (rows) total_counts <- colSums(node_counts) # To ensure we only count each graphlet present in an ego network once, divide - # the graphlet counts by the number of nodes that contribute to + # the graphlet counts by the number of nodes that contribute to # each graphlet type nodes_per_graphlet <- graphlet_key(max_graphlet_size)$node_count - return(total_counts / nodes_per_graphlet) + total_counts <- total_counts / nodes_per_graphlet + + # add overall graph node count to total_counts + N <- igraph::vcount(graph) + total_counts <- c(N = N, total_counts) + total_counts } #' Ego-network graphlet counts -#' +#' #' Calculates graphlet counts for the n-step ego-network of each node in a graph -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. -#' @param neighbourhood_size The number of steps from the source node to include -#' nodes for each ego-network. -#' @param return_ego_networks If \code{TRUE}, return ego-networks alongside -#' graphlet counts to enable further processing. -#' @return If \code{return_ego_networks = FALSE}, returns an RxC matrix -#' containing counts of each graphlet (columns, C) for each ego-network in the -#' input graph (rows, R). Columns are labelled with graphlet IDs and rows are +#' @param graph An undirected, simple graph as an \code{igraph} object. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. Currently only size 4 (default) and 5 are supported. 
+#' @param neighbourhood_size The number of steps from the source node used to select the +#' neighboring nodes to be included in the source node ego-network. (Default 2). +#' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} +#' nodes are returned. (Default 3). +#' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} +#' edges are returned. (Default 1). +#' @param return_ego_networks If \code{TRUE}, return ego-networks alongside +#' graphlet counts to enable further processing. +#' @return If \code{return_ego_networks = FALSE}, returns an RxC matrix +#' containing counts of each graphlet (columns, C) for each ego-network in the +#' input graph (rows, R). Columns are labelled with graphlet IDs and rows are #' labelled with the ID of the central node in each ego-network (if nodes in the #' input graph are labelled). If \code{return_ego_networks = TRUE}, returns a #' list with the following elements: #' \itemize{ -#' \item \code{graphlet_counts}: A matrix containing graphlet counts for each +#' \item \code{graphlet_counts}: A matrix containing graphlet counts for each #' ego-network in the input graph as described above. #' \item \code{ego_networks}: The ego-networks of the query graph. 
#' } #' @export -count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size, +count_graphlets_ego <- function(graph, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, return_ego_networks = FALSE) { # Extract ego network for each node in original graph, naming each ego network # in the list with the name of the node the ego network is generated for - ego_networks <- make_named_ego_graph(graph, order = neighbourhood_size) + ego_networks <- make_named_ego_graph(graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + + # Generate graphlet counts for each node in each ego network + ego_graphlet_counts <- ego_to_graphlet_counts(ego_networks, max_graphlet_size) + + # Return either graphlet counts, or graphlet counts and ego_networks + if (return_ego_networks) { + return(list( + graphlet_counts = ego_graphlet_counts, + ego_networks = ego_networks + )) + } else { + return(ego_graphlet_counts) + } +} + +#' ego_to_graphlet_counts +#' +#' Calculates graphlet counts for previously generated ego networks. +#' @param ego_networks Named list of ego networks for a graph. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. Currently only size 4 and 5 are supported. +#' @return returns an RxC matrix +#' containing counts of each graphlet (columns, C) for each ego-network +#' (rows, R). +#' Columns are labelled with graphlet IDs and rows are +#' labelled with the ID of the central node in each ego-network. 
+#' @export +ego_to_graphlet_counts <- function(ego_networks, max_graphlet_size = 4) { # Generate graphlet counts for each node in each ego network (returns an ORCA # format graphlet count matrix for each ego network) - ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, - max_graphlet_size = max_graphlet_size) + ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, + max_graphlet_size = max_graphlet_size + ) + # Reshape the list of per node single row graphlet count matrices to a single # ORCA format graphlet count matrix with one row per node ego_graphlet_counts <- t(simplify2array(ego_graphlet_counts)) - # Return either graphlet counts, or graphlet counts and ego_networks - if(return_ego_networks) { - return(list(graphlet_counts = ego_graphlet_counts, ego_networks = ego_networks)) - } else { - return(ego_graphlet_counts) - } + + # Return graphlet counts + return(ego_graphlet_counts) } #' Get ego-networks for a graph as a named list @@ -358,18 +438,30 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size #' @param graph An \code{igraph} object #' @param order The number of steps from the source node to include #' nodes for each ego-network. -#' @param ... Additional parameters to be passed to the underlying +#' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} +#' nodes are returned. +#' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} +#' edges are returned. +#' @param ... Additional parameters to be passed to the underlying #' \code{igraph::make_ego_graph} function used. #' @export -make_named_ego_graph <- function(graph, order, ...) { +make_named_ego_graph <- function(graph, order, min_ego_nodes = 3, + min_ego_edges = 1, ...) { ego_networks <- igraph::make_ego_graph(graph, order, ...) 
names(ego_networks) <- igraph::V(graph)$name - ego_networks + + # Drop ego-networks that don't have the minimum number of nodes or edges + drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { + (igraph::vcount(g) < min_ego_nodes) | (igraph::ecount(g) < min_ego_edges) + })) + ego_networks <- ego_networks[!drop_index] + + return(ego_networks) } #' Orbit to graphlet counts -#' -#' Converts graphlet orbit counts at each vertex to graphlet counts at each +#' +#' Converts graphlet orbit counts at each vertex to graphlet counts at each #' vertex by summing over all orbits contained within each graphlet #' @param orbit_counts ORCA-format matrix containing counts of each graphlet #' orbit (columns) at each vertex in the graph (rows) @@ -378,31 +470,42 @@ make_named_ego_graph <- function(graph, order, ...) { #' @export orbit_to_graphlet_counts <- function(orbit_counts) { num_orbits <- dim(orbit_counts)[2] - # Indexes to select the orbit(s) that comprise each graphlet. Note that we - # define these in the zero-based indexing used in journal papers, but + # Indexes to select the orbit(s) that comprise each graphlet. 
Note that we + # define these in the zero-based indexing used in journal papers, but # need to add 1 to convert to the 1-based indexing used by R - if(num_orbits == 15) { + if (num_orbits == 15) { # Orbits for graphlets comprising up to 4 nodes max_nodes <- 4 - orbit_to_graphlet_map <- - purrr::map(list(0, 1:2, 3, 4:5, 6:7, 8, 9:11, 12:13, 14), - function(indexes){ indexes + 1}) - } else if(num_orbits == 73) { + orbit_to_graphlet_map <- + purrr::map( + list(0, 1:2, 3, 4:5, 6:7, 8, 9:11, 12:13, 14), + function(indexes) { + indexes + 1 + } + ) + } else if (num_orbits == 73) { # Orbits for graphlets comprising up to 5 nodes max_nodes <- 5 - orbit_to_graphlet_map <- - purrr::map(list(0, 1:2, 3, 4:5, 6:7, 8, 9:11, 12:13, 14, 15:17, 18:21, - 22:23, 24:26, 27:30, 31:33, 34, 35:38, 39:42, 43:44, - 45:48, 49:50, 51:53, 54:55, 56:58, 59:61, 62:64, - 65:67, 68:69, 70:71, 72), - function(indexes){ indexes + 1}) + orbit_to_graphlet_map <- + purrr::map( + list( + 0, 1:2, 3, 4:5, 6:7, 8, 9:11, 12:13, 14, 15:17, 18:21, + 22:23, 24:26, 27:30, 31:33, 34, 35:38, 39:42, 43:44, + 45:48, 49:50, 51:53, 54:55, 56:58, 59:61, 62:64, + 65:67, 68:69, 70:71, 72 + ), + function(indexes) { + indexes + 1 + } + ) } else { stop(("Unsupported number of orbits")) } # Sum counts across orbits in graphlets - graphlet_counts <- sapply(orbit_to_graphlet_map, function(indexes){ - rowSums(orbit_counts[,indexes, drop = FALSE])}) - if(dim(orbit_counts)[[1]] == 1) { + graphlet_counts <- sapply(orbit_to_graphlet_map, function(indexes) { + rowSums(orbit_counts[, indexes, drop = FALSE]) + }) + if (dim(orbit_counts)[[1]] == 1) { # If orbit counts has only a single row, sapply returns a vector # rather than a matrix, so convert to a matrix by adding dim dim(graphlet_counts) <- c(1, length(graphlet_counts)) @@ -413,65 +516,75 @@ orbit_to_graphlet_counts <- function(orbit_counts) { } #' Graphlet key -#' +#' #' Metdata about graphlet groups. 
-#' @param max_graphlet_size Maximum number of nodes graphlets can contain +#' @param max_graphlet_size Maximum number of nodes graphlets can contain. Currently only size 2 to 5 are supported. #' @return Metadata list with the following named fields: #' \itemize{ #' \item \code{max_nodes}: Maximum number of nodes graphlets can contain -#' \item \code{id}: ID of each graphlet in format Gn, where n is in range 0 to +#' \item \code{id}: ID of each graphlet in format Gn, where n is in range 0 to #' num_graphlets #' \item \code{node_count}: Number of nodes contained within each graphlet #' } #' @export graphlet_key <- function(max_graphlet_size) { - if(max_graphlet_size == 2) { + if (max_graphlet_size == 2) { node_count <- c(2) - } else if(max_graphlet_size == 3) { - node_count <- c(2, rep(3,2)) - } else if(max_graphlet_size == 4) { - node_count <- c(2, rep(3,2), rep(4,6)) + } else if (max_graphlet_size == 3) { + node_count <- c(2, rep(3, 2)) + } else if (max_graphlet_size == 4) { + node_count <- c(2, rep(3, 2), rep(4, 6)) } else if (max_graphlet_size == 5) { - node_count <- c(2, rep(3,2), rep(4,6), rep(5, 21)) + node_count <- c(2, rep(3, 2), rep(4, 6), rep(5, 21)) } else { stop("Unsupported maximum graphlet size") } - max_node_index <- length(node_count)-1 + max_node_index <- length(node_count) - 1 id <- purrr::simplify(purrr::map(0:max_node_index, function(index) { - paste('G', index, sep = "")})) - name <- - return(list(max_nodes = max_graphlet_size, id = id, node_count = node_count)) + paste("G", index, sep = "") + })) + name <- + return(list( + max_nodes = max_graphlet_size, + id = id, + node_count = node_count + )) } #' Orbit key -#' +#' #' Metdata about orbit groups. -#' @param max_graphlet_size Maximum number of nodes graphlets can contain +#' @param max_graphlet_size Maximum number of nodes graphlets can contain. Currently only size 2 to 5 are supported. 
#' @return Metadata list with the following named fields: #' \itemize{ #' \item \code{max_nodes}: Maximum number of nodes graphlets can contain -#' \item \code{id}: ID of each graphlet in format On, where n is in range 0 to +#' \item \code{id}: ID of each graphlet in format On, where n is in range 0 to #' num_orbits #' \item \code{node_count}: Number of nodes contained within each graphlet #' } #' @export orbit_key <- function(max_graphlet_size) { - if(max_graphlet_size == 2) { + if (max_graphlet_size == 2) { node_count <- c(2) - } else if(max_graphlet_size == 3) { - node_count <- c(2, rep(3,3)) - } else if(max_graphlet_size == 4) { - node_count <- c(2, rep(3,3), rep(4,11)) + } else if (max_graphlet_size == 3) { + node_count <- c(2, rep(3, 3)) + } else if (max_graphlet_size == 4) { + node_count <- c(2, rep(3, 3), rep(4, 11)) } else if (max_graphlet_size == 5) { - node_count <- c(2, rep(3,3), rep(4,11), rep(5, 58)) + node_count <- c(2, rep(3, 3), rep(4, 11), rep(5, 58)) } else { stop("Unsupported maximum graphlet size") } - max_node_index <- length(node_count)-1 + max_node_index <- length(node_count) - 1 id <- purrr::simplify(purrr::map(0:max_node_index, function(index) { - paste('O', index, sep = "")})) - name <- - return(list(max_nodes = max_graphlet_size, id = id, node_count = node_count)) + paste("O", index, sep = "") + })) + name <- + return(list( + max_nodes = max_graphlet_size, + id = id, + node_count = node_count + )) } #' Graphlet IDs for size @@ -483,99 +596,139 @@ orbit_key <- function(max_graphlet_size) { #' @export graphlet_ids_for_size <- function(graphlet_size) { graphlet_key <- graphlet_key(graphlet_size) - graphlet_key$id[graphlet_key$node_count==graphlet_size] + graphlet_key$id[graphlet_key$node_count == graphlet_size] } #' Load all graphs in a directory and calculates their Graphlet-based Degree #' Distributions (GDDs) -#' -#' Loads graphs from all files matching the given pattern in the given directory, -#' converts them to indexed edge lists 
compatible with the ORCA fast orbit -#' counting package and calculates the specified set of graphlet-based degree -#' distributions usingthe ORCA package. +#' +#' Loads graphs from all files matching the given pattern in the given +#' directory, converts them to indexed edge lists compatible with the ORCA fast +#' orbit counting package and calculates the specified set of graphlet-based +#' degree distributions using the ORCA package. #' @param source_dir Path to graph directory #' @param format Format of graph files #' @param pattern Filename pattern to match graph files -#' @param feature_type Type of graphlet-based degree distributions. Can be +#' @param feature_type Type of graphlet-based degree distributions. Can be #' \code{graphlet} to count graphlets or \code{orbit} to count orbits. #' @return A named list where each element contains a set of GDDs for a single -#' @param max_graphlet_size Maximum size of graphlets to use when generating GDD -#' @param ego_neighbourhood_size The number of steps from the source node to -#' include nodes for each ego-network. If set to 0, ego-networks will not be -#' used +#' @param max_graphlet_size Maximum size of graphlets to use when generating GDD. +#' Currently only size 4 and 5 are supported. +#' @param ego_neighbourhood_size The number of steps from the source node used to select the +#' neighboring nodes to be included in the source node ego-network. If set to 0, ego-networks will not be +#' used. #' @param mc.cores Number of cores to use for parallel processing. Defaults to #' the \code{mc.cores} option set in the R environment. #' @return A named list where each element contains a set of GDDs for a single -#' graph from the source directory. Each set of GDDs is itself a named list, +#' graph from the source directory. Each set of GDDs is itself a named list, #' where each GDD element is a \code{dhist} discrete histogram object. 
#' @export -gdd_for_all_graphs <- function( - source_dir, format = "ncol", pattern = ".txt", feature_type = "orbit", - max_graphlet_size = 4, ego_neighbourhood_size = 0, - mc.cores = getOption("mc.cores", 2L)) { +gdd_for_all_graphs <- function(source_dir, + format = "ncol", + pattern = ".txt", + feature_type = "orbit", + max_graphlet_size = 4, + ego_neighbourhood_size = 0, + mc.cores = getOption("mc.cores", 2L)) { # Create function to read graph from file and generate GDD graphs <- read_simple_graphs( - source_dir = source_dir, format = format, pattern = pattern) + source_dir = source_dir, format = format, pattern = pattern + ) # Calculate specified GDDs for each graph - # NOTE: mcapply only works on unix-like systems with system level forking + # NOTE: mcapply only works on unix-like systems with system level forking # capability. This means it will work on Linux and OSX, but not Windows. # For now, we just revert to single threaded operation on Windows # TODO: Look into using the parLappy function on Windows - if(.Platform$OS.type != "unix") { - # Force cores to 1 if system is not unix-like as it will not support + if (.Platform$OS.type != "unix") { + # Force cores to 1 if system is not unix-like as it will not support # forking - mc.cores = 1 + mc.cores <- 1 } - parallel::mcmapply(gdd, graphs, MoreArgs = - list(feature_type = feature_type, - max_graphlet_size = max_graphlet_size, - ego_neighbourhood_size = ego_neighbourhood_size), - SIMPLIFY = FALSE, mc.cores = mc.cores) + parallel::mcmapply(gdd, graphs, + MoreArgs = + list( + feature_type = feature_type, + max_graphlet_size = max_graphlet_size, + ego_neighbourhood_size = ego_neighbourhood_size + ), + SIMPLIFY = FALSE, mc.cores = mc.cores + ) } #' Generate a cross-comparison specification -#' -#' Creates a cross-comparison matrix with all possible pair-wise combinations +#' +#' Creates a cross-comparison matrix with pair-wise combinations #' of elements from the provided list. 
#' @param named_list A named list of items for which an exhaustive pair-wise #' cross-comparison is required. +#' @param how How to generate pair-wise combinations. Either "many-to-many" +#' (default) which generates all possible pair-wise combinations, or +#' "one-to-many" which generates all combinations between the first element +#' in named_list and the rest of the elements only. #' @return A matrix with one row for each possible pair-wise combination -#' of elements from the provided named list. The first and second columns -#' contain the names of the elements in the pair and the third and fourth -#'columns contain the indexes of these elements in the provided list. +#' of elements from the provided named list. The first and second columns +#' contain the names of the elements in the pair and the third and fourth +#' columns contain the indexes of these elements in the provided list. #' @export -cross_comparison_spec <- function(named_list) { - indexes <- as.data.frame(t(utils::combn(1:length(named_list),2))) - names <- as.data.frame(cbind(names(named_list)[indexes[,1]], - names(named_list)[indexes[,2]])) +cross_comparison_spec <- function(named_list, how = "many-to-many") { + if (how == "one-to-many") { + indexes <- data.frame( + rep(1, length(named_list) - 1), + 2:length(named_list) + ) + } else { + indexes <- as.data.frame(t(utils::combn(1:length(named_list), 2))) + } + + names <- as.data.frame(cbind( + names(named_list)[indexes[, 1]], + names(named_list)[indexes[, 2]] + )) spec <- cbind(names, indexes) colnames(spec) <- c("name_a", "name_b", "index_a", "index_b") return(spec) } #' Convert a pair-wise cross-comparison into a matrix format -#' +#' #' Converts a pair-wise cross-comparison into a matrix format #' @param measure A list of pair-wise comparison measiures #' @param cross_comparison_spec A cross-comparison specification generated #' using \code{cross_comparison_spec} #' @return A square symmetric matrix with a zero diagonal, with elements #' Cij 
and Cji populated from the element from \code{measure} corresponding to -#' the row of \code{cross_comparison_spec} with \code{index_a = i} and +#' the row of \code{cross_comparison_spec} with \code{index_a = i} and #' \code{index_b = j} #' @export cross_comp_to_matrix <- function(measure, cross_comparison_spec) { - num_items <- max(c(cross_comparison_spec$index_a, cross_comparison_spec$index_b)) - out <- matrix(data = 0, nrow = num_items, ncol = num_items); - out[cbind(cross_comparison_spec$index_a, cross_comparison_spec$index_b)] <- measure - out[cbind(cross_comparison_spec$index_b, cross_comparison_spec$index_a)] <- measure + num_items <- max(c( + cross_comparison_spec$index_a, + cross_comparison_spec$index_b + )) + out <- matrix(data = 0, nrow = num_items, ncol = num_items) + out[cbind( + cross_comparison_spec$index_a, + cross_comparison_spec$index_b + )] <- measure + out[cbind( + cross_comparison_spec$index_b, + cross_comparison_spec$index_a + )] <- measure row_labels <- rep("", num_items) - row_labels[cross_comparison_spec$index_a] <- as.character(cross_comparison_spec$name_a) - row_labels[cross_comparison_spec$index_b] <- as.character(cross_comparison_spec$name_b) + row_labels[cross_comparison_spec$index_a] <- as.character( + cross_comparison_spec$name_a + ) + row_labels[cross_comparison_spec$index_b] <- as.character( + cross_comparison_spec$name_b + ) rownames(out) <- row_labels col_labels <- rep("", num_items) - col_labels[cross_comparison_spec$index_a] <- as.character(cross_comparison_spec$name_a) - col_labels[cross_comparison_spec$index_b] <- as.character(cross_comparison_spec$name_b) + col_labels[cross_comparison_spec$index_a] <- as.character( + cross_comparison_spec$name_a + ) + col_labels[cross_comparison_spec$index_b] <- as.character( + cross_comparison_spec$name_b + ) colnames(out) <- col_labels return(out) } diff --git a/R/utility_functions.R b/R/utility_functions.R index b5b1cf5b..25c58b6a 100755 --- a/R/utility_functions.R +++ 
b/R/utility_functions.R @@ -1,8 +1,8 @@ # VECTOR FUNCTIONS rotl_vec <- function(vec, lshift) { num_els <- length(vec) - select_mask <- ((1:num_els + lshift) %% num_els) - select_mask[select_mask==0] <- num_els + select_mask <- ((1:num_els + lshift) %% num_els) + select_mask[select_mask == 0] <- num_els return(vec[select_mask]) } diff --git a/README.md b/README.md index 43b3164f..39989dcb 100755 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # Network Comparison -An R package implementing the Netdis and NetEMD alignment-free network comparison measures. +An R package implementing the Netdis and NetEMD alignment-free network comparison measures. + ### :warning: BETA: Package under construction (pre-release) :warning: Until this package hits release 1.0 anything can change with no notice. [![Project Status: WIP - Initial development is in progress, but there has not yet been a stable, usable release suitable for the public.](http://www.repostatus.org/badges/latest/wip.svg)](http://www.repostatus.org/#wip) [![GitHub release](https://img.shields.io/github/release/alan-turing-institute/network-comparison.svg)](https://github.com/alan-turing-institute/network-comparison/releases/latest) -[![Travis](https://img.shields.io/travis/alan-turing-institute/network-comparison/master.svg)](https://travis-ci.org/alan-turing-institute/network-comparison/branches) -[![Appveyor](https://ci.appveyor.com/api/projects/status/jn1a36c22vjw1l4d/branch/master?svg=true)](https://ci.appveyor.com/project/alan-turing-institute/network-comparison/branch/master) +[![Build](https://github.com/alan-turing-institute/network-comparison/actions/workflows/build.yml/badge.svg)](https://github.com/alan-turing-institute/network-comparison/actions/workflows/build.yml) [![Codecov](https://img.shields.io/codecov/c/github/alan-turing-institute/network-comparison/master.svg)](https://codecov.io/gh/alan-turing-institute/network-comparison?branch=master) 
[![license](https://img.shields.io/github/license/alan-turing-institute/network-comparison.svg)](https://github.com/alan-turing-institute/network-comparison/edit/master/LICENSE) diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index 0232e911..00000000 --- a/appveyor.yml +++ /dev/null @@ -1,50 +0,0 @@ -# DO NOT CHANGE the "init" and "install" sections below - -# Download script file from GitHub -init: - ps: | - $ErrorActionPreference = "Stop" - Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" - Import-Module '..\appveyor-tool.ps1' - -install: - ps: Bootstrap - -# Adapt as necessary starting from here - -build_script: - - travis-tool.sh install_deps - -test_script: - - travis-tool.sh run_tests - -on_failure: - - 7z a failure.zip *.Rcheck\* - - appveyor PushArtifact failure.zip - -artifacts: - - path: '*.Rcheck\**\*.log' - name: Logs - - - path: '*.Rcheck\**\*.out' - name: Logs - - - path: '*.Rcheck\**\*.fail' - name: Logs - - - path: '*.Rcheck\**\*.Rout' - name: Logs - - - path: '\*_*.tar.gz' - name: Bits - - - path: '\*_*.zip' - name: Bits - -notifications: - - provider: Slack - on_build_success: false - on_build_failure: false - on_build_status_changed: true - incoming_webhook: - secure: k56hsY5K++kvpU/bhF5r2YmgwSeTdGJVY8qnT8z9qRug/zgo50Ev8lB9kN4FT3vNJQpfmkDBOOlXe6XURHzaJPeTtDKKciAsXQe48IVkFVw= diff --git a/data-raw/virus.R b/data-raw/virus.R index 9fb046f6..0d4d42a2 100644 --- a/data-raw/virus.R +++ b/data-raw/virus.R @@ -7,12 +7,12 @@ load_virus_data <- function(filename) { read_simple_graph(file = file.path(data_dir, filename), format = "ncol") } -virusppi <- list(EBV = load_virus_data("EBV.txt"), - ECL = load_virus_data("ECL.txt"), - `HSV-1` = load_virus_data("HSV-1.txt"), - KSHV = load_virus_data("KSHV.txt"), - VZV = load_virus_data("VZV.txt") - ) +virusppi <- list( + EBV = load_virus_data("EBV.txt"), + ECL = load_virus_data("ECL.txt"), + `HSV-1` = 
load_virus_data("HSV-1.txt"), + KSHV = load_virus_data("KSHV.txt"), + VZV = load_virus_data("VZV.txt") +) devtools::use_data(virusppi, overwrite = TRUE) - diff --git a/data/worldtradesub.rda b/data/worldtradesub.rda new file mode 100644 index 00000000..125df511 Binary files /dev/null and b/data/worldtradesub.rda differ diff --git a/doc/V-Menu.R b/doc/V-Menu.R new file mode 100644 index 00000000..40d07382 --- /dev/null +++ b/doc/V-Menu.R @@ -0,0 +1,6 @@ +## ---- include = FALSE--------------------------------------------------------- +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) + diff --git a/doc/V-Menu.Rmd b/doc/V-Menu.Rmd new file mode 100644 index 00000000..93723f7b --- /dev/null +++ b/doc/V-Menu.Rmd @@ -0,0 +1,29 @@ +--- +title: "Netdis Vignette's Menu" +date: "`10-06-2020`" +author: "Luis Ospina-Forero" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{V-Menu} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + + +# Netdis introductory Vignettes + +This package contains multiple vignettes that illustrate the simple ways of employing the network comparison methods 'Netdis', 'NetEmd' and their variants (e.g. `Netdis Geometric-Poisson'), and also the large flexibility of these methodologies. The following is a list of the available vignettes: + + +* [Default, simple and quick use usage 1: pairwise comparisons](default_pairwise_usage.html): +* [Default, simple and quick use usage 2: precomputed properties](PreComputedProps.html): +* [Default, simple and quick use usage 3: many to many comparions](ManyToMany.html): + + diff --git a/doc/V-Menu.html b/doc/V-Menu.html new file mode 100644 index 00000000..35637614 --- /dev/null +++ b/doc/V-Menu.html @@ -0,0 +1,260 @@ + + + + + + + + + + + + + + + +Netdis Vignette’s Menu + + + + + + + + + + + + + + + + + + + +

Netdis Vignette’s Menu

+

Luis Ospina-Forero

+

10-06-2020

+ + + +
+

Netdis introductory Vignettes

+

This package contains multiple vignettes that illustrate the simple ways of employing the network comparison methods ‘Netdis’, ‘NetEmd’ and their variants (e.g. `Netdis Geometric-Poisson’), and also the large flexibility of these methodologies. The following is a list of the available vignettes:

+ +
+ + + + + + + + + + + diff --git a/doc/default_pairwise_usage.R b/doc/default_pairwise_usage.R new file mode 100644 index 00000000..a56afaa6 --- /dev/null +++ b/doc/default_pairwise_usage.R @@ -0,0 +1,88 @@ +## ---- include = FALSE--------------------------------------------------------- +knitr::opts_chunk$set( +collapse = TRUE, +comment = "#>" +) + +## ---- packages, message= FALSE------------------------------------------------ +# Load packages/libraries +library("netdist") +library("igraph") + +## ---- graphs,fig.align='center'----------------------------------------------- +# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + + +# Load query graphs as igraph objects +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +graph_1 + +# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +graph_2 + +#A simple visualization of the graphs. +plot(graph_1,vertex.size=0.5,vertex.label=NA) +plot(graph_2,vertex.size=0.5,vertex.label=NA) + +## ---- netemd,fig.align='center'----------------------------------------------- +# Set source directory for Virus protein-protein interaction network edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as igraph objects +# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# One to one NetEmd comparison. 
+netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",smoothing_window_width = 1)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. + +## ---- netemdEigen,fig.align='center'------------------------------------------ +#Laplacian +Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE) +Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE) + +#Normalized Laplacian +NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE) +NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE) + +#Spectra (This may take a couple of minutes). +props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) +props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) + +netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. 
+ +## ----netdisgoldstand,fig.align='center'--------------------------------------- +# Lattice graphs to be used as gold-standard as a reference point comparison +goldstd_1 <- igraph::graph.lattice(c(8,8)) #Graph with 8^2 nodes +goldstd_2 <- igraph::graph.lattice(c(44,44)) #Graph with 44^2 nodes + +plot(goldstd_1,vertex.size=0.8,vertex.label=NA) +plot(goldstd_2,vertex.size=0.5,vertex.label=NA) + + +# Netdis using the goldstd_1 graph as gold-standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) + +# Netdis using the goldstd_2 graph as gold-standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) + +## ---- netdisGP---------------------------------------------------------------- +#Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) + +## ----netdiszero--------------------------------------------------------------- +#Netdis using no expectations (or equivalently, expectation equal to zero). +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) + + diff --git a/doc/default_pairwise_usage.Rmd b/doc/default_pairwise_usage.Rmd new file mode 100644 index 00000000..e5ff1e64 --- /dev/null +++ b/doc/default_pairwise_usage.Rmd @@ -0,0 +1,219 @@ +--- +title: "Default, simple and quick use usage 1: pairwise comparisons" +date: "`10-06-2020`" +author: "Luis Ospina-Forero" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{default_pairwise_usage} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( +collapse = TRUE, +comment = "#>" +) +``` + +# Introduction + +The `netdist` package currently considers two broad methodologies for network comparison, namely Netdis and NetEmd. 
Netdis considers multiple variants (via background expectations) to capture the dissimilarity between the local structure of networks exhibited by the occurrence of small subgraphs. NetEmd is also a method to capture the dissimilarity between networks using subgraph counts, but it has also been defined for any type of network features; for example eigen distributions. The variants of Netdis are controlled by the input selected for the background expectations, whereas the variants of NetEmd are controlled directly by the user in the selection of the network features being compared (by default this package uses subgraph counts). + +The following shows a quick introduction to the most simple functions of the package, and to some of the variants of Netdis and NetEmd. + +# Load required packages/libraries +```{r, packages, message= FALSE} +# Load packages/libraries +library("netdist") +library("igraph") +``` + +## Load graphs included in the netdist package +```{r, graphs,fig.align='center'} +# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + + +# Load query graphs as igraph objects +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +graph_1 + +# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +graph_2 + +#A simple visualization of the graphs. +plot(graph_1,vertex.size=0.5,vertex.label=NA) +plot(graph_2,vertex.size=0.5,vertex.label=NA) +``` + + + +# Compare two networks via NetEmd. + +## What is NetEmd? +(Extracted from Wegner et al. 
(2017)): +NetEmd is based on the idea that the information encapsulated in the shape of the degree distribution and other network properties which reflect the topological organization of the network. From an abstract point of view, NetEmd views the shape of a distribution as a property that is invariant under linear deformations i.e$.$ translations and re-scalings of the axis. + +Based on these NetEmd uses the following measure between distributions $p$ and $q$ that are supported on $\mathbb{R}$ and have non-zero, finite variances: +\begin{equation}\label{emdmet} +EMD^*(p,q)=\mathrm{inf}_{c\in\mathbb{R}}\left( EMD\big(\tilde{p}(\cdot+c),\tilde{q}(\cdot)\big)\right), +\end{equation} +where $EMD$ is the earth mover's distance and $\tilde{p}$ and $\tilde{q}$ are the distributions obtained by rescaling $p$ and $q$ to have variance 1. More precisely, $\tilde{p}$ is the distribution obtained from $p$ by the transformation $x\rightarrow \frac{x}{\sigma(p)}$, where $\sigma(p)$ is the standard deviation of $p$. For probability distributions $p$ and $q$ with support in $\mathbb{R}$ and bounded absolute first moment, the $EMD$ between $p$ and $q$ is given by $EMD(p,q)=\int_{-\infty}^\infty|F(x)-G(x)|\,\mathrm{d}x$, where $F$ and $G$ are the cumulative distribution functions of $p$ and $q$ respectively. + +Now, for two networks $G$ and $G'$ and for a given set $T=\{t_1,t_2,...,t_m\}$ of network features, the $NetEmd$ measure corresponding to $T$ is: +\begin{equation}\label{eq:def_netemd} +NetEmd_T(G,G')=\frac{1}{m}\sum_{j=1}^{m} NetEmd_{t_j} (G,G'), +\end{equation} +where +\begin{equation} +NetEmd_{t_i} (G,G')=EMD^*(p_{t_i}(G),p_{t_i}(G')), +\end{equation} +and where $p_{t_i}(G)$ and $p_{t_i}(G')$ are the distributions of ${t_i}$ on $G$ and $G'$ respectively. $NetEmd_{t_i}$ can be shown to be a pseudometric between graphs for any feature $t$, that is it is non-negative, symmetric and satisfies the triangle inequality. + + +## Comparing two graphs with NetEmd. 
+```{r, netemd,fig.align='center'} +# Set source directory for Virus protein-protein interaction network edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as igraph objects +# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# One to one NetEmd comparison. +netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",smoothing_window_width = 1)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +``` + +## Comparing two graphs with NetEmd via their Laplacian spectrum. +```{r, netemdEigen,fig.align='center'} +#Laplacian +Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE) +Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE) + +#Normalized Laplacian +NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE) +NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE) + +#Spectra (This may take a couple of minutes). +props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) +props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) + +netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. 
+``` +------------------------- + +# Compare two networks via Netdis and its variants. + +## What is Netdis? +(Extracted from Ali et al. (2014)): Netdis counts small subgraphs $w$ on $k$ nodes for all 2-step ego-networks, $k=3,4,5$. These counts are centred by subtracting the expected number of counts $E_w$. These centred counts of each network are then compared thus leading to the Netdis statistic. + +Netdis is constructed as follows: + +Let $N_{w,i}(G)$ be the number of induced occurrences of small graphs $w$ in the 2-step ego network of vertex $i$. Now, bin all 2-step ego-networks of network $G$ according to their network density. Let $E_w(G,d)$ be the expected number of occurrences of $w$ in an ego-network whose density falls in density bin $d$. For a given network $G$ compute the centred subgraph counts as +\[ +S_w(G)=\sum\limits_{i }{\bigg (N_{w,i}(G)- E_w(G, \rho(i)) \bigg )}, +\] +where $i$ is a node in $G$ and $\rho(i)$ the density bin of the 2-step ego-network of node $i$. + +Now, to compare networks $G_1$ and $G_2$, set +$$ +\displaystyle +netD_2^S(k) = \tfrac{1}{ \sqrt{ M(k)} } \sum\limits_{w \in A(k)} +\bigg ({ \tfrac{S_w(G_1) S_w(G_2)} {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +}\bigg ), \quad k=3,4, 5, +$$ +where $A(k)$ is the set of connected subgraphs of size $k$, and where $M(k)$ is a normalising constant so that $netD_2^S(k)\in[-1,1]$. $M(k)$ is equal to +\[ +M(k) = \sum\limits_{w \in A(k)} +\left( \tfrac{ S_w(G_1)^2 }{\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} \right) +\sum\limits_{w \in A(k)} +\left(\tfrac{ S_w(G_2)^2 } {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +\right) +. +\] +The corresponding Netdis statistic is defined as +$$Netdis(k)=netd_2^S(k)=\tfrac{1}{2}(1-netD_2^S(k)) \in [0,1].$$ +Small values of Netdis suggest higher `similarity' between the networks. By default Netdis uses subgraphs on $k=4$ nodes. + + +## Using netdis with a gold-standard graph to obtain $E_w$. 
+The selection of a gold-standard graph as a substitute for $E_w$ could be done when such graph is known to be a good proxy for $E_w$, or alternatively as a good reference point for the comparison. This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard (summarized in $E_w$). + +```{r,netdisgoldstand,fig.align='center'} +# Lattice graphs to be used as gold-standard as a reference point comparison +goldstd_1 <- igraph::graph.lattice(c(8,8)) #Graph with 8^2 nodes +goldstd_2 <- igraph::graph.lattice(c(44,44)) #Graph with 44^2 nodes + +plot(goldstd_1,vertex.size=0.8,vertex.label=NA) +plot(goldstd_2,vertex.size=0.5,vertex.label=NA) + + +# Netdis using the goldstd_1 graph as gold-standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) + +# Netdis using the goldstd_2 graph as gold-standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) +``` + +## Netdis-GP: Using a Geometric-Poisson approximation + +(Extracted from Ospina-Forero et al. (2018)): Instead of considering an approximation based on an observed gold-standard network whose selection may be difficult, $E_w$ is computed, independently for each graph, based on a Geometric-Poisson (GP) approximation for the distribution of the number of occurrences of subgraph $w$. It assumes that $N_{w,i} \sim GP(\lambda^{\rho(i)}_k, \theta^{\rho(i)}_w)$, where $\lambda^{\rho(i)}_k$ is the Poisson parameter indexed by the size of subgraph $w$ and the density bin $\rho(i)$; and where $\theta^{\rho(i)}_w$ is the geometric parameter indexed by subgraph $w$ and density bin $\rho(i)$. $E_w(G, \rho(i))$ is taken as the mean of the GP approximation, i.e. $\lambda^{\rho(i)}_k/\theta^{\rho(i)}_w$. 
+ +As $\lambda^{\rho(i)}_k$ and $\theta^{\rho(i)}_w$ are not known, they are estimated as follows: +Let $x_{w,d}^j$ be the number of subgraphs $w$ on the 2-step ego-network $j$ of density bin $d$, and let +\[ +\bar{X}_{w,d}=\frac{1}{q} \sum_{j=1}^q x_{w,d}^j, \qquad V^2_{w,d}=\frac{1}{q-1} \sum_{j=1}^q (x_{w,d}^j - \bar{X}_{w,d})^2 +, +\] +where $q$ is the number of ego-networks in density bin $d$. Then, +\[ +\hat{\lambda}^{d}_{k}= \frac{1}{l} \sum_{h \in A(k)} \frac{2 (\bar{X}_{h,d})^2}{V^2_{h,d}+\bar{X}_{h,d}} , \qquad \hat{\theta}^{d}_w= \frac{2\bar{X}_{w,d}}{V^2_{w,d}+\bar{X}_{w,d}}, +\] +where $l$ is the number of connected subgraphs of size $k$, for example, $l=6$ for $k=4$. These estimators are based on the moment estimators of a GP random variable and the proposal made by (Picard et al.(2008)), where the total count of each individual subgraph could be thought as the sum of the total subgraph counts over multiple ``clumps'' of edges that appear across the network. + +This variant focuses on detecting more general and global discrepancies between the ego-network structures. + +```{r, netdisGP} +#Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) +``` + + +## Using Netdis with no expectation ($E_w=0$) +Comparing the networks via their observed ego counts without centering them, (equivalent to using expectation equal to zero). This variant thus focuses on detecting small discrepancies between the networks. + + +```{r,netdiszero} +#Netdis using no expectations (or equivalently, expectation equal to zero). +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) + +``` + +------------------------- + + + + + + +# Bibliography + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* L. Ospina-Forero, C. M. Deane, and G. Reinert. 
Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. + +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008. \ No newline at end of file diff --git a/doc/default_pairwise_usage.html b/doc/default_pairwise_usage.html new file mode 100644 index 00000000..13f24329 --- /dev/null +++ b/doc/default_pairwise_usage.html @@ -0,0 +1,529 @@ + + + + + + + + + + + + + + + +Default, simple and quick use usage 1: pairwise comparisons + + + + + + + + + + + + + + + + + + + + + + +

Default, simple and quick use usage 1: pairwise comparisons

+

Luis Ospina-Forero

+

10-06-2020

+ + + +
+

Introduction

+

The netdist package currently considers two broad methodologies for network comparison, namely Netdis and NetEmd. Netdis considers multiple variants (via background expectations) to capture the dissimilarity between the local structure of networks exhibited by the occurrence of small subgraphs. NetEmd is also a method to capture the dissimilarity between networks using subgraph counts, but it has also been defined for any type of network features; for example eigen distributions. The variants of Netdis are controlled by the input selected for the background expectations, whereas the variants of NetEmd are controlled directly by the user in the selection of the network features being compared (by default this package uses subgraph counts).

+

The following shows a quick introduction to the most simple functions of the package, and to some of the variants of Netdis and NetEmd.

+
+
+

Load required packages/libraries

+
# Load packages/libraries
+library("netdist")
+library("igraph")
+
+

Load graphs included in the netdist package

+
# Set source directory for Virus protein-protein interaction edge files stored in the netdist package.
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+
+# Load query graphs as igraph objects
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges.
+graph_1
+#> IGRAPH f8527ed UN-- 60 208 -- 
+#> + attr: name (v/c)
+#> + edges from f8527ed (vertex names):
+#>  [1] A73  --BALF3 A73  --BARF0 A73  --BBLF2 A73  --BDRF1 A73  --BFRF4
+#>  [6] A73  --BGLF2 A73  --BGLF3 A73  --BGLF5 A73  --BLLF2 A73  --BTRF1
+#> [11] BALF3--BBLF2 BALF3--BDRF1 BALF3--BFRF4 BALF3--BGLF5 BALF3--BTRF1
+#> [16] BALF3--BALF1 BALF3--BALF2 BALF3--BORF1 BALF3--BALF4 BALF3--BFLF2
+#> [21] BALF3--BPLF1 BALF3--BALF5 BALF3--BBLF4 BALF3--BDLF2 BALF3--BdRF1
+#> [26] BALF3--BERF3 BALF3--BHRF1 BALF3--LMP2A BARF0--BBLF2 BARF0--BFRF4
+#> [31] BARF0--BSRF1 BARF0--BALF4 BARF0--BPLF1 BARF0--BALF5 BARF0--BDLF2
+#> [36] BARF0--BdRF1 BARF0--BERF3 BARF0--BGLF1 BARF0--LMP2A BBLF2--BDRF1
+#> + ... omitted several edges
+
+# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges.
+graph_2
+#> IGRAPH cf97448 UN-- 1941 3989 -- 
+#> + attr: name (v/c)
+#> + edges from cf97448 (vertex names):
+#>  [1] B1882--B1888 B1882--B1945 B1882--B1946 B1882--B1886 B1882--B1887
+#>  [6] B1882--B1939 B1882--B1938 B1882--B1884 B1882--B1883 B1882--B3210
+#> [11] B1882--B1881 B1882--B4355 B1882--B1922 B1882--B1890 B1882--B1889
+#> [16] B1888--B1886 B1888--B1887 B1888--B1884 B1888--B1883 B1888--B1881
+#> [21] B1888--B4355 B1888--B1890 B1888--B1889 B1888--B1421 B1888--B3072
+#> [26] B1888--B1885 B0728--B0729 B0728--B0724 B0728--B0726 B0728--B0727
+#> [31] B0729--B0724 B0729--B3734 B0729--B0726 B0729--B0727 B0729--B0720
+#> [36] B0729--B3236 B1812--B3360 B1812--B1260 B1812--B1261 B1812--B1263
+#> + ... omitted several edges
+
+#A simple visualization of the graphs.
+plot(graph_1,vertex.size=0.5,vertex.label=NA)
+

+
plot(graph_2,vertex.size=0.5,vertex.label=NA)
+

+
+
+
+

Compare two networks via NetEmd.

+
+

What is NetEmd?

+

(Extracted from Wegner et al. (2017)): NetEmd is based on the idea that the information encapsulated in the shape of the degree distribution and other network properties which reflect the topological organization of the network. From an abstract point of view, NetEmd views the shape of a distribution as a property that is invariant under linear deformations i.e\(.\) translations and re-scalings of the axis.

+

Based on these NetEmd uses the following measure between distributions \(p\) and \(q\) that are supported on \(\mathbb{R}\) and have non-zero, finite variances: \[\begin{equation}\label{emdmet} +EMD^*(p,q)=\mathrm{inf}_{c\in\mathbb{R}}\left( EMD\big(\tilde{p}(\cdot+c),\tilde{q}(\cdot)\big)\right), +\end{equation}\] where \(EMD\) is the earth mover’s distance and \(\tilde{p}\) and \(\tilde{q}\) are the distributions obtained by rescaling \(p\) and \(q\) to have variance 1. More precisely, \(\tilde{p}\) is the distribution obtained from \(p\) by the transformation \(x\rightarrow \frac{x}{\sigma(p)}\), where \(\sigma(p)\) is the standard deviation of \(p\). For probability distributions \(p\) and \(q\) with support in \(\mathbb{R}\) and bounded absolute first moment, the \(EMD\) between \(p\) and \(q\) is given by \(EMD(p,q)=\int_{-\infty}^\infty|F(x)-G(x)|\,\mathrm{d}x\), where \(F\) and \(G\) are the cumulative distribution functions of \(p\) and \(q\) respectively.

+

Now, for two networks \(G\) and \(G'\) and for a given set \(T=\{t_1,t_2,...,t_m\}\) of network features, the \(NetEmd\) measure corresponding to \(T\) is: \[\begin{equation}\label{eq:def_netemd} +NetEmd_T(G,G')=\frac{1}{m}\sum_{j=1}^{m} NetEmd_{t_j} (G,G'), +\end{equation}\] where \[\begin{equation} +NetEmd_{t_i} (G,G')=EMD^*(p_{t_i}(G),p_{t_i}(G')), +\end{equation}\] and where \(p_{t_i}(G)\) and \(p_{t_i}(G')\) are the distributions of \({t_i}\) on \(G\) and \(G'\) respectively. \(NetEmd_{t_i}\) can be shown to be a pseudometric between graphs for any feature \(t\), that is it is non-negative, symmetric and satisfies the triangle inequality.

+
+
+

Comparing two graphs with NetEmd.

+
# Set source directory for Virus protein-protein interaction network edge files stored in the netdist package.
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+# Load query graphs as igraph objects
+# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges.
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges.
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+# One to one NetEmd comparison.
+netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",smoothing_window_width = 1)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended.
+#> [1] 0.5163894
+
+
+

Comparing two graphs with NetEmd via their Laplacian spectrum.

+
#Laplacian
+Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE)
+Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE)
+
+#Normalized Laplacian
+NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE)
+NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE)
+
+#Spectra (This may take a couple of minutes).
+props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) 
+props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) 
+
+netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended.
+#> [1] 0.1818771
+
+
+
+
+

Compare two networks via Netdis and its variants.

+
+

What is Netdis?

+

(Extracted from Ali et al. (2014)): Netdis counts small subgraphs \(w\) on \(k\) nodes for all 2-step ego-networks, \(k=3,4,5\). These counts are centred by subtracting the expected number of counts \(E_w\). These centred counts of each network are then compared, thus leading to the Netdis statistic.

+

Netdis is constructed as follows:

+

Let \(N_{w,i}(G)\) be the number of induced occurrences of small graphs \(w\) in the 2-step ego network of vertex \(i\). Now, bin all 2-step ego-networks of network \(G\) according to their network density. Let \(E_w(G,d)\) be the expected number of occurrences of \(w\) in an ego-network whose density falls in density bin \(d\). For a given network \(G\) compute the centred subgraph counts as \[ +S_w(G)=\sum\limits_{i }{\bigg (N_{w,i}(G)- E_w(G, \rho(i)) \bigg )}, +\] where \(i\) is a node in \(G\) and \(\rho(i)\) the density bin of the 2-step ego-network of node \(i\).

+

Now, to compare networks \(G_1\) and \(G_2\), set \[ +\displaystyle +netD_2^S(k) = \tfrac{1}{ \sqrt{ M(k)} } \sum\limits_{w \in A(k)} +\bigg ({ \tfrac{S_w(G_1) S_w(G_2)} {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +}\bigg ), \quad k=3,4, 5, +\] where \(A(k)\) is the set of connected subgraphs of size \(k\), and where \(M(k)\) is a normalising constant so that \(netD_2^S(k)\in[-1,1]\). \(M(k)\) is equal to \[ +M(k) = \sum\limits_{w \in A(k)} +\left( \tfrac{ S_w(G_1)^2 }{\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} \right) +\sum\limits_{w \in A(k)} +\left(\tfrac{ S_w(G_2)^2 } {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +\right) +. +\] The corresponding Netdis statistic is defined as \[Netdis(k)=netd_2^S(k)=\tfrac{1}{2}(1-netD_2^S(k)) \in [0,1].\] Small values of Netdis suggest higher “similarity” between the networks. By default Netdis uses subgraphs on \(k=4\) nodes.

+
+
+

Using netdis with a gold-standard graph to obtain \(E_w\).

+

The selection of a gold-standard graph as a substitute for \(E_w\) could be done when such a graph is known to be a good proxy for \(E_w\), or alternatively as a good reference point for the comparison. This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard (summarized in \(E_w\)).

+
# Lattice graphs to be used as gold-standard as a reference point comparison
+goldstd_1 <- igraph::graph.lattice(c(8,8)) #Graph with 8^2 nodes
+goldstd_2 <- igraph::graph.lattice(c(44,44)) #Graph with 44^2 nodes
+
+plot(goldstd_1,vertex.size=0.8,vertex.label=NA)
+

+
plot(goldstd_2,vertex.size=0.5,vertex.label=NA)
+

+

+
+# Netdis using the goldstd_1 graph as gold-standard reference point
+netdis_one_to_one(graph_1= graph_1, graph_2= graph_2,  ref_graph = goldstd_1)
+#>   netdis3   netdis4 
+#> 0.1422771 0.2517043
+
+# Netdis using the goldstd_2 graph as gold-standard reference point
+netdis_one_to_one(graph_1= graph_1, graph_2= graph_2,  ref_graph = goldstd_2)
+#>   netdis3   netdis4 
+#> 0.1401654 0.2505384
+
+
+

Netdis-GP: Using a Geometric-Poisson approximation

+

(Extracted from Ospina-Forero et al. (2018)): Instead of considering an approximation based on an observed gold-standard network whose selection may be difficult, \(E_w\) is computed, independently for each graph, based on a Geometric-Poisson (GP) approximation for the distribution of the number of occurrences of subgraph \(w\). It assumes that \(N_{w,i} \sim GP(\lambda^{\rho(i)}_k, \theta^{\rho(i)}_w)\), where \(\lambda^{\rho(i)}_k\) is the Poisson parameter indexed by the size of subgraph \(w\) and the density bin \(\rho(i)\); and where \(\theta^{\rho(i)}_w\) is the geometric parameter indexed by subgraph \(w\) and density bin \(\rho(i)\). \(E_w(G, \rho(i))\) is taken as the mean of the GP approximation, i.e. \(\lambda^{\rho(i)}_k/\theta^{\rho(i)}_w\).

+

As \(\lambda^{\rho(i)}_k\) and \(\theta^{\rho(i)}_w\) are not known, they are estimated as follows: Let \(x_{w,d}^j\) be the number of subgraphs \(w\) on the 2-step ego-network \(j\) of density bin \(d\), and let \[ +\bar{X}_{w,d}=\frac{1}{q} \sum_{j=1}^q x_{w,d}^j, \qquad V^2_{w,d}=\frac{1}{q-1} \sum_{j=1}^q (x_{w,d}^j - \bar{X}_{w,d})^2 +, +\] where \(q\) is the number of ego-networks in density bin \(d\). Then, \[ +\hat{\lambda}^{d}_{k}= \frac{1}{l} \sum_{h \in A(k)} \frac{2 (\bar{X}_{h,d})^2}{V^2_{h,d}+\bar{X}_{h,d}} , \qquad \hat{\theta}^{d}_w= \frac{2\bar{X}_{w,d}}{V^2_{w,d}+\bar{X}_{w,d}}, +\] where \(l\) is the number of connected subgraphs of size \(k\), for example, \(l=6\) for \(k=4\). These estimators are based on the moment estimators of a GP random variable and the proposal made by Picard et al. (2008), where the total count of each individual subgraph could be thought of as the sum of the total subgraph counts over multiple “clumps” of edges that appear across the network.

+

This variant focuses on detecting more general and global discrepancies between the ego-network structures.

+
#Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. 
+netdis_one_to_one(graph_1= graph_1, graph_2= graph_2,  ref_graph = NULL)
+#>   netdis3   netdis4 
+#> 0.8822527 0.1892716
+
+
+

Using Netdis with no expectation (\(E_w=0\))

+

Comparing the networks via their observed ego counts without centring them (equivalent to using an expectation equal to zero). This variant thus focuses on detecting small discrepancies between the networks.

+
#Netdis using no expectations (or equivalently, expectation equal to zero).
+netdis_one_to_one(graph_1= graph_1, graph_2= graph_2,  ref_graph = 0)
+#>    netdis3    netdis4 
+#> 0.00761545 0.02106628
+
+
+
+
+

Bibliography

+
    +
  • W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014.

  • +
  • L. Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018.

  • +
  • A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017.

  • +
  • F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008.

  • +
+
+ + + + + + + + + + + diff --git a/doc/dendrogram_example_net_dis.R b/doc/dendrogram_example_net_dis.R new file mode 100644 index 00000000..17c93627 --- /dev/null +++ b/doc/dendrogram_example_net_dis.R @@ -0,0 +1,87 @@ +## ----------------------------------------------------------------------------- +library("netdist") +edge_format = "ncol" +# Load reference graph (used for Netdis. Not required for NetEMD) +ref_path = file.path(system.file(file.path("extdata", "random"), + package = "netdist"), + "ER_1250_10_1") +ref_graph <- read_simple_graph(ref_path, format = edge_format) + +# Set source directory and file properties for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), + package = "netdist") +edge_format <- "ncol" +file_pattern <- "*" + +# Load all graphs in the source folder matching the filename pattern +query_graphs <- read_simple_graphs(source_dir, + format = edge_format, + pattern = file_pattern) +print(names(query_graphs)) + +## ----------------------------------------------------------------------------- +# Set the maximum graphlet size to compute counts for +max_graphlet_size <- 4 +neighbourhood_size <- 2 + +## ----------------------------------------------------------------------------- + +# Calculate netdis measure for graphlets up to size max_graphlet_size +netdis_result <- netdis_many_to_many(query_graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size) + +# Netdis measure for graphlets of size 3 +res3 <- netdis_result$netdis["netdis3", ] +netdis3_mat <- cross_comp_to_matrix(res3, netdis_result$comp_spec) + +print("Netdis: graphlet size = 3") +print(netdis3_mat) + +# Netdis measure for graphlets of size 4 +res4 <- netdis_result$netdis["netdis4", ] +netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec) + +print("Netdis: graphlet size = 4") +print(netdis4_mat) + +## ----------------------------------------------------------------------------- +graphdists 
<- as.dist(netdis4_mat) +par(mfrow = c(1, 2)) +cex <- 1 + +# Dendrogram based on Netdis measure for graphlets of size 3 +title <- paste("Netdis: graphlet size = ", 3, sep = "") +plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) + +# Dendrogram based on Netdis measure for graphlets of size 4 +title = paste("Netdis: graphlet size = ", 4, sep = "") +plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) + +## ----------------------------------------------------------------------------- +cex <- 1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title <- paste("Netdis: graphlet size = ", 3, sep = "") +heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) + +## ----------------------------------------------------------------------------- +cex <- 1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title <- paste("Netdis: graphlet size = ", 4, sep = "") +heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) + diff --git a/vignettes/Quick_start_net_dis.Rmd b/doc/dendrogram_example_net_dis.Rmd similarity index 56% rename from vignettes/Quick_start_net_dis.Rmd rename to doc/dendrogram_example_net_dis.Rmd index b64bd751..e8dce40e 100644 --- a/vignettes/Quick_start_net_dis.Rmd +++ b/doc/dendrogram_example_net_dis.Rmd @@ -1,10 +1,10 @@ --- -title: "Quick start guide for Netdis" +title: "Dendrogram example for Netdis" author: "Martin O'Reilly" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Quick start for Netdis} + %\VignetteIndexEntry{Dendrogram example for Netdis} 
%\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -31,17 +31,21 @@ properties. This can be avoided by setting the relevant `as_undirected`, library("netdist") edge_format = "ncol" # Load reference graph (used for Netdis. Not required for NetEMD) -ref_path = file.path(system.file(file.path("extdata", "random"), package = "netdist"), "ER_1250_10_1") +ref_path = file.path(system.file(file.path("extdata", "random"), + package = "netdist"), + "ER_1250_10_1") ref_graph <- read_simple_graph(ref_path, format = edge_format) # Set source directory and file properties for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -edge_format = "ncol" -file_pattern = "*" +source_dir <- system.file(file.path("extdata", "VRPINS"), + package = "netdist") +edge_format <- "ncol" +file_pattern <- "*" # Load all graphs in the source folder matching the filename pattern -query_graphs <- read_simple_graphs(source_dir, format = edge_format, - pattern = file_pattern) +query_graphs <- read_simple_graphs(source_dir, + format = edge_format, + pattern = file_pattern) print(names(query_graphs)) ``` @@ -54,60 +58,67 @@ max_graphlet_size <- 4 neighbourhood_size <- 2 ``` -## Generate a function to generate expected graphlet counts -Use `netdis_expected_graphlet_counts_ego_fn` to generate a function that -calculates expected ego-network graphlet counts for query graphs based on the -statistics of a provided reference graph. 
- +## Generate NetDis measures between each pair of query graphs ```{r} -expected_count_fn <- netdis_expected_graphlet_counts_ego_fn( - ref_graph, max_graphlet_size, neighbourhood_size) -``` -## Generate centred graphlet counts for a set of query graphs -```{r} -centred_counts <- purrr::map(query_graphs, netdis_centred_graphlet_counts, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - expected_ego_count_fn = expected_count_fn) -``` +# Calculate netdis measure for graphlets up to size max_graphlet_size +netdis_result <- netdis_many_to_many(query_graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size) -## Generate NetDis measures between each pair of query graphs -```{r} # Netdis measure for graphlets of size 3 -res3 <- netdis_for_all_graphs(centred_counts, 3) -netdis3_mat <- cross_comp_to_matrix(res3$netdis, res3$comp_spec) +res3 <- netdis_result$netdis["netdis3", ] +netdis3_mat <- cross_comp_to_matrix(res3, netdis_result$comp_spec) + +print("Netdis: graphlet size = 3") +print(netdis3_mat) + # Netdis measure for graphlets of size 4 -res4 <- netdis_for_all_graphs(centred_counts, 4) -netdis4_mat <- cross_comp_to_matrix(res4$netdis, res4$comp_spec) -netdis4_mat +res4 <- netdis_result$netdis["netdis4", ] +netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec) + +print("Netdis: graphlet size = 4") +print(netdis4_mat) ``` ## Generate dendrograms ```{r} -graphdists<-as.dist(netdis4_mat) -par(mfrow=c(1,2)) -cex=1 +graphdists <- as.dist(netdis4_mat) +par(mfrow = c(1, 2)) +cex <- 1 + # Dendrogram based on Netdis measure for graphlets of size 3 -title = paste("Netdis: graphlet size = ", 3, sep = "") -plot(phangorn::upgma(as.dist(netdis3_mat), method="average"), use.edge.length=FALSE, - edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, - cex.sub=cex, cex=cex) +title <- paste("Netdis: graphlet size = ", 3, sep = "") +plot(phangorn::upgma(as.dist(netdis3_mat), 
method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) + # Dendrogram based on Netdis measure for graphlets of size 4 title = paste("Netdis: graphlet size = ", 4, sep = "") -plot(phangorn::upgma(as.dist(netdis4_mat), method="average"), use.edge.length=FALSE, - edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, - cex.sub=cex, cex=cex) +plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) ``` ```{r} -cex=1.5 +cex <- 1.5 col <- colorRampPalette(colors = c("blue","white"))(100) -title = paste("Netdis: graphlet size = ", 3, sep = "") -heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) +title <- paste("Netdis: graphlet size = ", 3, sep = "") +heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) ``` ```{r} -cex=1.5 +cex <- 1.5 col <- colorRampPalette(colors = c("blue","white"))(100) -title = paste("Netdis: graphlet size = ", 4, sep = "") -heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) +title <- paste("Netdis: graphlet size = ", 4, sep = "") +heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) ``` \ No newline at end of file diff --git a/doc/dendrogram_example_net_dis.html b/doc/dendrogram_example_net_dis.html new file mode 100644 index 00000000..afa0ef35 --- /dev/null +++ b/doc/dendrogram_example_net_dis.html @@ -0,0 +1,445 @@ + + + + + + + + + + + + + + + + +Dendrogram example for Netdis + + + + + + + + + + + + + + + + + + + + + + +

Dendrogram example for Netdis

+

Martin O’Reilly

+

2020-07-13

+ + + +
+

Virus PPI example for Netdis

+
+

Load graphs

+

Use read_simple_graphs to read graph data from all files in a directory that match a specific filename pattern in a format suitable for calculating graphlet-based feature counts using the ORCA package. We use igraph::read_graph to read graph data from files, so we support all file formats it supports. See the help for igraph::read_graph for a list of supported values for the format parameter, and the igraph documentation for descriptions of each of the supported file formats.

+

The ORCA package we use to efficiently calculate graphlet and orbit counts requires that graphs are undirected, simple (i.e. have no self-loops or multiple edges) and connected (i.e. have no isolated vertices). Therefore, by default, graphs loaded by read_simple_graphs will be coerced to have the above properties. This can be avoided by setting the relevant as_undirected, remove_loops, remove_multiple or remove_isolates parameters to FALSE.

+
library("netdist")
+edge_format = "ncol"
+# Load reference graph (used for Netdis. Not required for NetEMD)
+ref_path = file.path(system.file(file.path("extdata", "random"),
+                                 package = "netdist"),
+                     "ER_1250_10_1")
+ref_graph <- read_simple_graph(ref_path, format = edge_format)
+
+# Set source directory and file properties for Virus PPI graph edge files
+source_dir <- system.file(file.path("extdata", "VRPINS"),
+                          package = "netdist")
+edge_format <- "ncol"
+file_pattern <- "*"
+
+# Load all graphs in the source folder matching the filename pattern
+query_graphs <- read_simple_graphs(source_dir,
+                                   format = edge_format, 
+                                   pattern = file_pattern)
+print(names(query_graphs))
+
## [1] "EBV"   "ECL"   "HSV-1" "KSHV"  "VZV"
+

In this example we will use counts of graphlets containing up to 4 nodes and consider ego-network neighbourhoods of size 2 (i.e. the immediate neighbours of each node plus their immediate neighbours).

+
# Set the maximum graphlet size to compute counts for
+max_graphlet_size <- 4
+neighbourhood_size <- 2
+
+
+
+

Generate NetDis measures between each pair of query graphs

+
# Calculate netdis measure for graphlets up to size max_graphlet_size
+netdis_result <- netdis_many_to_many(query_graphs,
+                                     ref_graph,
+                                     max_graphlet_size = max_graphlet_size,
+                                     neighbourhood_size = neighbourhood_size)
+
+# Netdis measure for graphlets of size 3
+res3 <- netdis_result$netdis["netdis3", ]
+netdis3_mat <- cross_comp_to_matrix(res3, netdis_result$comp_spec)
+
+print("Netdis: graphlet size = 3")
+
## [1] "Netdis: graphlet size = 3"
+
print(netdis3_mat)
+
##               EBV       ECL        HSV-1         KSHV          VZV
+## EBV   0.000000000 0.1846655 0.0082642217 0.0100538469 0.0067775779
+## ECL   0.184665514 0.0000000 0.2065761911 0.2091240549 0.2075471192
+## HSV-1 0.008264222 0.2065762 0.0000000000 0.0001335756 0.0001748254
+## KSHV  0.010053847 0.2091241 0.0001335756 0.0000000000 0.0005964448
+## VZV   0.006777578 0.2075471 0.0001748254 0.0005964448 0.0000000000
+
# Netdis measure for graphlets of size 4
+res4 <- netdis_result$netdis["netdis4", ]
+netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec)
+
+print("Netdis: graphlet size = 4")
+
## [1] "Netdis: graphlet size = 4"
+
print(netdis4_mat)
+
##              EBV       ECL      HSV-1       KSHV        VZV
+## EBV   0.00000000 0.1749835 0.16526412 0.01969246 0.15971116
+## ECL   0.17498347 0.0000000 0.29176120 0.22155786 0.41716144
+## HSV-1 0.16526412 0.2917612 0.00000000 0.07602426 0.03434187
+## KSHV  0.01969246 0.2215579 0.07602426 0.00000000 0.13115524
+## VZV   0.15971116 0.4171614 0.03434187 0.13115524 0.00000000
+
+
+

Generate dendrograms

+
graphdists <- as.dist(netdis4_mat)
+par(mfrow = c(1, 2))
+cex <- 1
+
+# Dendrogram based on Netdis measure for graphlets of size 3
+title <- paste("Netdis: graphlet size = ", 3, sep = "")
+plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"),
+     use.edge.length = FALSE, 
+     edge.width = cex*2,
+     main = title,
+     cex.lab = cex, cex.axis = cex,
+     cex.main = cex, cex.sub = cex,
+     cex = cex)
+
+# Dendrogram based on Netdis measure for graphlets of size 4
+title = paste("Netdis: graphlet size = ", 4, sep = "")
+plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"),
+     use.edge.length = FALSE, 
+     edge.width = cex*2,
+     main = title,
+     cex.lab = cex, cex.axis = cex,
+     cex.main = cex, cex.sub = cex,
+     cex = cex)
+

+
cex <- 1.5
+col <- colorRampPalette(colors = c("blue","white"))(100)
+title <- paste("Netdis: graphlet size = ", 3, sep = "")
+heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title,
+        cexRow = cex, cexCol = cex, symm = TRUE)
+

+
cex <- 1.5
+col <- colorRampPalette(colors = c("blue","white"))(100)
+title <- paste("Netdis: graphlet size = ", 4, sep = "")
+heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title,
+        cexRow = cex, cexCol = cex, symm = TRUE)
+

+
+ + + + + + + + + + + diff --git a/vignettes/Quick_start_net_emd.R b/doc/dendrogram_example_net_emd.R similarity index 65% rename from vignettes/Quick_start_net_emd.R rename to doc/dendrogram_example_net_emd.R index e997af69..6a677202 100644 --- a/vignettes/Quick_start_net_emd.R +++ b/doc/dendrogram_example_net_emd.R @@ -1,4 +1,4 @@ -## ---- fig.show='hold'---------------------------------------------------- +## ---- fig.show='hold'--------------------------------------------------------- library("netdist") # Set source directory and file properties for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") @@ -29,7 +29,7 @@ names(virus_gdds) # of unit width (smoothing_window_width = 1). Returns a named list containing: # (i) the NetEMDs and (ii) a table containing the graph names and indices # within the input GDD list for each pair of graphs compared. -res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) +res <- netemd_many_to_many(dhists= virus_gdds, smoothing_window_width = 0) # You can also specify method = "fixed_step" to use the much slower method of # exhaustively evaluating the EMD at all offsets separated by a fixed step. @@ -40,6 +40,27 @@ res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) # normalised to unit variance # Convert to matrix for input to dendrogram method -netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec) +netemd_mat <- cross_comp_to_matrix(res$netemds, res$comp_spec) netemd_mat +## ----------------------------------------------------------------------------- +cex=1 +title = paste("NetEMD: max graphlet size = ", 4, sep = "") +plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, + edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, + cex.sub=cex, cex=cex) + +# The gdd_for_all_graphs and netemd_many_to_many functions will run in +# parallel using multiple threads where supported. 
The number of threads +# used is determined by the global R option "mc.cores". You can inspect the +# current value of this using options("mc.cores") and set it with +# options("mc.cores" = ). To fully utilise a modern consumer +# processor, this should be set to 2x the number of available processor +# cores as each core supports two threads. + +## ----------------------------------------------------------------------------- +cex=1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title = paste("NetEMD: max graphlet size = ", 4, sep = "") +heatmap(netemd_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) + diff --git a/vignettes/Quick_start_net_emd.Rmd b/doc/dendrogram_example_net_emd.Rmd similarity index 91% rename from vignettes/Quick_start_net_emd.Rmd rename to doc/dendrogram_example_net_emd.Rmd index 73283132..4a70fb8d 100644 --- a/vignettes/Quick_start_net_emd.Rmd +++ b/doc/dendrogram_example_net_emd.Rmd @@ -1,10 +1,10 @@ --- -title: "Quick start guide for NetEMD" +title: "Dendrogram example for NetEMD" author: "Martin O'Reilly" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Quick start for NetEMD} + %\VignetteIndexEntry{Dendrogram example for NetEMD} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -40,7 +40,7 @@ names(virus_gdds) # of unit width (smoothing_window_width = 1). Returns a named list containing: # (i) the NetEMDs and (ii) a table containing the graph names and indices # within the input GDD list for each pair of graphs compared. -res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) +res <- netemd_many_to_many(dhists= virus_gdds, smoothing_window_width = 0) # You can also specify method = "fixed_step" to use the much slower method of # exhaustively evaluating the EMD at all offsets separated by a fixed step. 
@@ -51,7 +51,7 @@ res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) # normalised to unit variance # Convert to matrix for input to dendrogram method -netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec) +netemd_mat <- cross_comp_to_matrix(res$netemds, res$comp_spec) netemd_mat ``` @@ -62,7 +62,7 @@ plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FAL edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, cex.sub=cex, cex=cex) -# The gdd_for_all_graphs and net_emds_for_all_graphs functions will run in +# The gdd_for_all_graphs and netemd_many_to_many functions will run in # parallel using multiple threads where supported. The number of threads # used is determined by the global R option "mc.cores". You can inspect the # current value of this using options("mc.cores") and set it with diff --git a/doc/dendrogram_example_net_emd.html b/doc/dendrogram_example_net_emd.html new file mode 100644 index 00000000..68e141bb --- /dev/null +++ b/doc/dendrogram_example_net_emd.html @@ -0,0 +1,409 @@ + + + + + + + + + + + + + + + + +Dendrogram example for NetEMD + + + + + + + + + + + + + + + + + + + + + + +

Dendrogram example for NetEMD

+

Martin O’Reilly

+

2020-12-10

+ + + +
+

Virus PPI example for NetEMD

+
library("netdist")
+# Set source directory and file properties for Virus PPI graph edge files
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+edge_format = "ncol"
+file_pattern = ".txt"
+
+# Calculate graphlet-based degree distributions for all orbits in graphlets 
+# comprising up to 4 nodes for all graphs. This only needs to be done once 
+# per graph (feature_type = "orbit", max_graphlet_size = 4).. 
+# If feature_type is set to "feature_type", orbit counts for orbits in the
+# same graphlet will be summed to generate graphlet counts
+# If max_graphlet_size is set to 5, graphlet-based degree distributions will  
+# be calculated for graphlets comprising up to 5 nodes.
+virus_gdds <- gdd_for_all_graphs(
+  source_dir = source_dir, format = edge_format, pattern = file_pattern, 
+  feature_type = "orbit", max_graphlet_size = 4)
+names(virus_gdds)
+
## [1] "EBV"   "ECL"   "HSV-1" "KSHV"  "VZV"
+
# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- 
+# based degree distributions using the default fast "optimise" method and no
+# smoothing (default). The "optimise" method uses the built-in R optimise
+# function to efficiently find the offset with the minimum EMD, but is not
+# guaranteed to find the global minimum if EMD as a function of offset
+# is non-convex and/or multimodal. The smoothing window width determines 
+# whether to calculate the NetEMD from the unaltered discrete GDD histograms
+# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" 
+# smoothing by "smearing" the discrete GDD histogram point masses across bins 
+# of unit width (smoothing_window_width = 1). Returns a named list containing:
+# (i) the NetEMDs and (ii) a table containing the graph names and indices 
+# within the input GDD list for each pair of graphs compared.
+res <- netemd_many_to_many(dhists= virus_gdds, smoothing_window_width = 0)
+
## [1] "This function will compute orbits of graphlets up to size 5 using  2  cores. Depending on the density and size of the graphs, this may lead to a large compsumption of RAM."
+
# You can also specify method = "fixed_step" to use the much slower method of 
+# exhaustively evaluating the EMD at all offsets separated by a fixed step. 
+# The default step size is 1/2 the the minimum spacing between locations in 
+# either histogram after normalising to unit variance. However, you can 
+# specifiy your own fixed step using the optional "step_size" parameter.
+# Note that this step size is applied to the histograms after they have been 
+# normalised to unit variance
+
+# Convert to matrix for input to dendrogram method
+netemd_mat <- cross_comp_to_matrix(res$netemds, res$comp_spec)
+netemd_mat
+
##             EBV       ECL     HSV-1      KSHV       VZV
+## EBV   0.0000000 0.4876039 0.1662892 0.1607293 0.1994605
+## ECL   0.4876039 0.0000000 0.3986281 0.4024176 0.4029344
+## HSV-1 0.1662892 0.3986281 0.0000000 0.1581520 0.2164003
+## KSHV  0.1607293 0.4024176 0.1581520 0.0000000 0.2323936
+## VZV   0.1994605 0.4029344 0.2164003 0.2323936 0.0000000
+
cex=1
+title = paste("NetEMD: max graphlet size = ", 4, sep = "")
+plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, 
+     edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, 
+     cex.sub=cex, cex=cex)
+

+
# The gdd_for_all_graphs and netemd_many_to_many functions will run in 
+# parallel using multiple threads where supported. The number of threads
+# used is determined by the global R option "mc.cores". You can inspect the 
+# current value of this using options("mc.cores") and set it with 
+# options("mc.cores" = <num_cores>). To fully utilise a modern consumer
+# processor, this should be set to 2x the number of available processor 
+# cores as each core supports two threads.
+
cex=1.5
+col <- colorRampPalette(colors = c("blue","white"))(100)
+title = paste("NetEMD: max graphlet size = ", 4, sep = "")
+heatmap(netemd_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE)
+

+
+ + + + + + + + + + + diff --git a/doc/netdis_2graphs_polya-aeppli.R b/doc/netdis_2graphs_polya-aeppli.R new file mode 100644 index 00000000..4d73ce38 --- /dev/null +++ b/doc/netdis_2graphs_polya-aeppli.R @@ -0,0 +1,180 @@ +## ----------------------------------------------------------------------------- +# Load libraries +library("netdist") +library("purrr") + +## ----------------------------------------------------------------------------- +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + + +## ----------------------------------------------------------------------------- +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +## ----------------------------------------------------------------------------- +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + + +## ----------------------------------------------------------------------------- +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) + + +## 
----------------------------------------------------------------------------- + +# Get ego-network densities +densities_1 <- ego_network_density(graphlet_counts_1) +densities_2 <- ego_network_density(graphlet_counts_2) + +# Adaptively bin ego-network densities +binned_densities_1 <- binned_densities_adaptive(densities_1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ego_density_bins_1 <- binned_densities_1$breaks + +binned_densities_2 <- binned_densities_adaptive(densities_2, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ego_density_bins_2 <- binned_densities_2$breaks + +## ----------------------------------------------------------------------------- + +#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY +#' Calculate expected counts with geometric poisson (Polya-Aeppli) +#' approximation for a single density bin. +#' @param bin_idx Density bin index to calculate expected counts for. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +exp_counts_bin_gp <- function(bin_idx, graphlet_counts, + density_interval_indexes, + mean_binned_graphlet_counts, + max_graphlet_size) { + counts <- graphlet_counts[density_interval_indexes == bin_idx, ] + means <- mean_binned_graphlet_counts[bin_idx, ] + + mean_sub_counts <- sweep(counts, 2, means) + + Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) + theta_d <- 2 * means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) + + lambda_dk <- mean(2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]), + na.rm = TRUE) + + exp_counts_dk <- append(exp_counts_dk, + lambda_dk / theta_d[graphlet_idx]) + } + + exp_counts_dk +} + +#' Calculate expected counts in density bins using the +#' geometric poisson (Polya-Aeppli) approximation. 
+#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +#' @param max_graphlet_size Determines the maximum size of graphlets +#' included in graphlet_counts. +#' @export +density_binned_counts_gp <- function(graphlet_counts, + density_interval_indexes, + max_graphlet_size) { + + mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + graphlet_counts, + density_interval_indexes) + + nbins <- length(unique(density_interval_indexes)) + expected_counts_bin <- t(sapply(1:nbins, + exp_counts_bin_gp, + graphlet_counts = graphlet_counts, + density_interval_indexes = density_interval_indexes, + mean_binned_graphlet_counts = mean_binned_graphlet_counts, + max_graphlet_size = max_graphlet_size)) + + # deal with NAs caused by bins with zero counts for a graphlet + expected_counts_bin[is.nan(expected_counts_bin)] <- 0 + + expected_counts_bin +} + +binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1, + binned_densities_1$interval_indexes, + max_graphlet_size) + +binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, + binned_densities_2$interval_indexes, + max_graphlet_size) + +## ----------------------------------------------------------------------------- +# Calculate expected graphlet counts for each ego network +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, + ego_density_bins_1, + binned_graphlet_counts_1, + max_graphlet_size, + scale_fn = NULL) + + +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, + ego_density_bins_2, + binned_graphlet_counts_2, + max_graphlet_size, + scale_fn = NULL) +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) + +centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) + +## 
----------------------------------------------------------------------------- +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) + +## ----------------------------------------------------------------------------- + +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) + diff --git a/doc/netdis_2graphs_polya-aeppli.Rmd b/doc/netdis_2graphs_polya-aeppli.Rmd new file mode 100644 index 00000000..ac4c1191 --- /dev/null +++ b/doc/netdis_2graphs_polya-aeppli.Rmd @@ -0,0 +1,211 @@ +--- +title: "Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation" +author: "Martin O'Reilly, Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis - 2 graphs with GP Approximation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Load graphs +```{r} +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. 
+max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 +``` + +## Generate ego networks +```{r} +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +``` + +## Count graphlets in ego networks +```{r} +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) + +``` + +## Bin ego networks by density +```{r} + +# Get ego-network densities +densities_1 <- ego_network_density(graphlet_counts_1) +densities_2 <- ego_network_density(graphlet_counts_2) + +# Adaptively bin ego-network densities +binned_densities_1 <- binned_densities_adaptive(densities_1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ego_density_bins_1 <- binned_densities_1$breaks + +binned_densities_2 <- binned_densities_adaptive(densities_2, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ego_density_bins_2 <- binned_densities_2$breaks +``` + +## Calculate expected graphlet counts in each bin using geometric poisson approximation +```{r} + +#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY +#' Calculate expected counts with geometric poisson (Polya-Aeppli) +#' approximation for a single density bin. +#' @param bin_idx Density bin index to calculate expected counts for. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. 
+#' @param density_interval_indexes Density bin index for +#' each ego network. +exp_counts_bin_gp <- function(bin_idx, graphlet_counts, + density_interval_indexes, + mean_binned_graphlet_counts, + max_graphlet_size) { + counts <- graphlet_counts[density_interval_indexes == bin_idx, ] + means <- mean_binned_graphlet_counts[bin_idx, ] + + mean_sub_counts <- sweep(counts, 2, means) + + Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) + theta_d <- 2 * means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) + + lambda_dk <- mean(2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]), + na.rm = TRUE) + + exp_counts_dk <- append(exp_counts_dk, + lambda_dk / theta_d[graphlet_idx]) + } + + exp_counts_dk +} + +#' Calculate expected counts in density bins using the +#' geometric poisson (Polya-Aeppli) approximation. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +#' @param max_graphlet_size Determines the maximum size of graphlets +#' included in graphlet_counts. 
+#' @export +density_binned_counts_gp <- function(graphlet_counts, + density_interval_indexes, + max_graphlet_size) { + + mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + graphlet_counts, + density_interval_indexes) + + nbins <- length(unique(density_interval_indexes)) + expected_counts_bin <- t(sapply(1:nbins, + exp_counts_bin_gp, + graphlet_counts = graphlet_counts, + density_interval_indexes = density_interval_indexes, + mean_binned_graphlet_counts = mean_binned_graphlet_counts, + max_graphlet_size = max_graphlet_size)) + + # deal with NAs caused by bins with zero counts for a graphlet + expected_counts_bin[is.nan(expected_counts_bin)] <- 0 + + expected_counts_bin +} + +binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1, + binned_densities_1$interval_indexes, + max_graphlet_size) + +binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, + binned_densities_2$interval_indexes, + max_graphlet_size) +``` + +## Centre graphlet counts of query graphs using binned expected counts +```{r} +# Calculate expected graphlet counts for each ego network +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, + ego_density_bins_1, + binned_graphlet_counts_1, + max_graphlet_size, + scale_fn = NULL) + + +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, + ego_density_bins_2, + binned_graphlet_counts_2, + max_graphlet_size, + scale_fn = NULL) +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) + +centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) +``` + + +## Sum centred graphlet counts across all ego networks +```{r} +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) +``` + +## Calculate netdis statistics +```{r} + +netdis_result 
<- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) +``` \ No newline at end of file diff --git a/doc/netdis_2graphs_polya-aeppli.html b/doc/netdis_2graphs_polya-aeppli.html new file mode 100644 index 00000000..6ddf6bc1 --- /dev/null +++ b/doc/netdis_2graphs_polya-aeppli.html @@ -0,0 +1,530 @@ + + + + + + + + + + + + + + + + +Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation + + + + + + + + + + + + + + + + + + + + + + +

Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation

+

Martin O’Reilly, Jack Roberts

+

2020-07-13

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
## 
+## Attaching package: 'purrr'
+
## The following objects are masked from 'package:igraph':
+## 
+##     compose, simplify
+
+
+

Load graphs

+
# Set source directory for Virus PPI graph edge files
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+# Load query graphs
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Ego network density binning parameters
+min_bin_count <- 5
+num_bins <- 100
+
+
+

Generate ego networks

+
# Get ego networks for query graphs and reference graph
+ego_1 <- make_named_ego_graph(graph_1, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+ego_2 <- make_named_ego_graph(graph_2, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+
+

Count graphlets in ego networks

+
# Count graphlets for ego networks in query and reference graphs
+graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size)
+graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size)
+
+
+

Bin ego networks by density

+
# Get ego-network densities
+densities_1 <- ego_network_density(graphlet_counts_1)
+densities_2 <- ego_network_density(graphlet_counts_2)
+
+# Adaptively bin ego-network densities
+binned_densities_1 <- binned_densities_adaptive(densities_1, 
+                                                min_counts_per_interval = min_bin_count, 
+                                                num_intervals = num_bins)
+
+ego_density_bins_1 <- binned_densities_1$breaks
+
+binned_densities_2 <- binned_densities_adaptive(densities_2, 
+                                                min_counts_per_interval = min_bin_count, 
+                                                num_intervals = num_bins)
+
+ego_density_bins_2 <- binned_densities_2$breaks
+
+
+

Calculate expected graphlet counts in each bin using geometric poisson approximation

+
#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY
+#' Calculate expected counts with geometric poisson (Polya-Aeppli)
+#' approximation for a single density bin.
+#' @param bin_idx Density bin index to calculate expected counts for.
+#' @param graphlet_counts Graphlet counts for a number of ego_networks.
+#' @param density_interval_indexes Density bin index for
+#' each ego network.
+exp_counts_bin_gp <- function(bin_idx, graphlet_counts,
+                              density_interval_indexes,
+                              mean_binned_graphlet_counts,
+                              max_graphlet_size) {
+  # Select the graphlet counts of the ego networks whose density falls in
+  # this bin, and the per-graphlet mean counts for the bin.
+  # NOTE(review): if a bin ever holds exactly one ego network this row
+  # subset collapses to a vector (no drop = FALSE), so sweep()/nrow() below
+  # would fail -- presumably the adaptive binning's min_counts_per_interval
+  # guarantees several rows per bin; verify.
+  counts <- graphlet_counts[density_interval_indexes == bin_idx, ]
+  means <- mean_binned_graphlet_counts[bin_idx, ]
+  
+  # Deviation of each ego network's counts from the bin mean (column-wise).
+  mean_sub_counts <- sweep(counts, 2, means)
+  
+  # Sample variance per graphlet, then the geometric-Poisson (Polya-Aeppli)
+  # parameter estimate theta = 2*mean / (variance + mean) for each graphlet.
+  Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1)
+  theta_d <- 2 * means / (Vd_sq + means)
+  
+  # Build the expected counts one graphlet size at a time: lambda is
+  # averaged across all graphlets of size k (2*mean^2/(var+mean) = theta*mean),
+  # and the GP expected count for each graphlet is lambda / theta.
+  exp_counts_dk <- vector()
+  for (k in 2:max_graphlet_size) {
+    graphlet_idx <- graphlet_ids_for_size(k)
+    
+    lambda_dk <- mean(2 * means[graphlet_idx]^2 /
+                        (Vd_sq[graphlet_idx] + means[graphlet_idx]),
+                      na.rm = TRUE)
+    
+    exp_counts_dk <- append(exp_counts_dk,
+                            lambda_dk / theta_d[graphlet_idx])
+  }
+  
+  # Named numeric vector of expected counts for graphlets of sizes
+  # 2..max_graphlet_size, in graphlet-id order.
+  exp_counts_dk
+}
+
+#' Calculate expected counts in density bins using the
+#' geometric poisson (Polya-Aeppli) approximation.
+#' @param graphlet_counts Graphlet counts for a number of ego_networks.
+#' @param density_interval_indexes Density bin index for
+#' each ego network.
+#' @param max_graphlet_size Determines the maximum size of graphlets
+#' included in graphlet_counts.
+#' @export
+density_binned_counts_gp <- function(graphlet_counts,
+                                     density_interval_indexes,
+                                     max_graphlet_size) {
+
+  # Per-bin mean counts for each graphlet, used as the GP location input.
+  mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
+    graphlet_counts,
+    density_interval_indexes)
+
+  # One row of expected counts per density bin.
+  # NOTE(review): 1:nbins assumes bin indexes are the contiguous integers
+  # 1..nbins -- confirm this invariant holds for the binning functions used.
+  # NOTE(review): sapply's result shape depends on its input; t() relies on
+  # it returning a matrix here (i.e. >1 graphlet column) -- vapply would be
+  # more robust, but is a behavioral change left to the package source.
+  nbins <- length(unique(density_interval_indexes))
+  expected_counts_bin <- t(sapply(1:nbins,
+                                  exp_counts_bin_gp,
+                                  graphlet_counts = graphlet_counts,
+                                  density_interval_indexes = density_interval_indexes,
+                                  mean_binned_graphlet_counts = mean_binned_graphlet_counts,
+                                  max_graphlet_size = max_graphlet_size))
+
+  # deal with NAs caused by bins with zero counts for a graphlet
+  # (these surface as NaN from 0/0 in the GP estimates, hence is.nan)
+  expected_counts_bin[is.nan(expected_counts_bin)] <- 0
+
+  expected_counts_bin
+}
+
+binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1,
+                                                     binned_densities_1$interval_indexes,
+                                                     max_graphlet_size)
+
+binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2,
+                                                     binned_densities_2$interval_indexes,
+                                                     max_graphlet_size)
+
+
+

Centre graphlet counts of query graphs using binned expected counts

+
# Calculate expected graphlet counts for each ego network
+exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, 
+                                                                 ego_density_bins_1, 
+                                                                 binned_graphlet_counts_1,
+                                                                 max_graphlet_size,
+                                                                 scale_fn = NULL)
+
+
+exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, 
+                                                                 ego_density_bins_2, 
+                                                                 binned_graphlet_counts_2,
+                                                                 max_graphlet_size,
+                                                                 scale_fn = NULL)
+# Centre graphlet counts by subtracting expected counts
+centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1,
+                                                        exp_graphlet_counts_1,
+                                                        max_graphlet_size)
+
+centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2,
+                                                        exp_graphlet_counts_2,
+                                                        max_graphlet_size)
+
+
+

Sum centred graphlet counts across all ego networks

+
sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1)
+
+sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2)
+
+
+

Calculate netdis statistics

+
netdis_result <- netdis_uptok(sum_graphlet_counts_1, 
+                              sum_graphlet_counts_2, 
+                              max_graphlet_size)
+
+print(netdis_result)
+
##   netdis3   netdis4 
+## 0.8822527 0.1892716
+
+ + + + + + + + + + + diff --git a/doc/netdis_customisations.R b/doc/netdis_customisations.R new file mode 100644 index 00000000..2c5c0071 --- /dev/null +++ b/doc/netdis_customisations.R @@ -0,0 +1,98 @@ +## ----------------------------------------------------------------------------- +# Load libraries +library("netdist") +library("purrr") + +## ----------------------------------------------------------------------------- +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + + +## ----------------------------------------------------------------------------- +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + +## ----------------------------------------------------------------------------- + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +print(results$netdis) +print(results$comp_spec) + +## ----------------------------------------------------------------------------- + +binning_fn <- purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 10, + num_intervals = 50) + + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn) + +print(results$netdis) +print(results$comp_spec) + + + +## 
----------------------------------------------------------------------------- +bin_counts_fn <- density_binned_counts_gp + +exp_counts_fn <- purrr::partial(netdis_expected_counts, + scale_fn = NULL) + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) + +## ----------------------------------------------------------------------------- +binning_fn <- single_density_bin +bin_counts_fn <- density_binned_counts +exp_counts_fn <- netdis_expected_counts + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) + diff --git a/doc/netdis_customisations.Rmd b/doc/netdis_customisations.Rmd new file mode 100644 index 00000000..0c1c724a --- /dev/null +++ b/doc/netdis_customisations.Rmd @@ -0,0 +1,122 @@ +--- +title: "Usage of netdis with binning and expected counts customisations." +author: "Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis function customisations} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. 
+max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +``` + +## Load query graphs +```{r} +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +``` + +## Default Expected Counts with Reference Graph +```{r} + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +print(results$netdis) +print(results$comp_spec) +``` + +## With Modified Binning Parameters +```{r} + +binning_fn <- purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 10, + num_intervals = 50) + + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn) + +print(results$netdis) +print(results$comp_spec) + + +``` + +## With Modified Expected Counts: Geometric Poisson +```{r} +bin_counts_fn <- density_binned_counts_gp + +exp_counts_fn <- purrr::partial(netdis_expected_counts, + scale_fn = NULL) + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) +``` + +## With Modified Expected Counts: Simple Mean +```{r} +binning_fn 
<- single_density_bin +bin_counts_fn <- density_binned_counts +exp_counts_fn <- netdis_expected_counts + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) +``` \ No newline at end of file diff --git a/doc/netdis_customisations.html b/doc/netdis_customisations.html new file mode 100644 index 00000000..6b0ea449 --- /dev/null +++ b/doc/netdis_customisations.html @@ -0,0 +1,507 @@ + + + + + + + + + + + + + + + + +Usage of netdis with binning and expected counts customisations. + + + + + + + + + + + + + + + + + + + + + + +

Usage of netdis with binning and expected counts customisations.

+

Jack Roberts

+

2020-07-13

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Reference graph
+ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
+                        package = "netdist")
+ref_graph <- read_simple_graph(ref_path, format = "ncol")
+
+
+

Load query graphs

+
source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
+
+
+

Default Expected Counts with Reference Graph

+
# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges)
+
+print(results$netdis)
+
##              [,1]        [,2]       [,3]        [,4]      [,5]      [,6]
+## netdis3 0.1846655 0.008264222 0.01005385 0.006777578 0.2065762 0.2091241
+## netdis4 0.1749835 0.165264120 0.01969246 0.159711160 0.2917612 0.2215579
+##              [,7]         [,8]         [,9]        [,10]
+## netdis3 0.2075471 0.0001335756 0.0001748254 0.0005964448
+## netdis4 0.4171614 0.0760242643 0.0343418653 0.1311552411
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+
+

With Modified Binning Parameters

+
binning_fn <- purrr::partial(binned_densities_adaptive,
+                             min_counts_per_interval = 10,
+                             num_intervals = 50)
+
+
+# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges,
+                               binning_fn = binning_fn)
+
+print(results$netdis)
+
##               [,1]        [,2]        [,3]        [,4]      [,5]      [,6]
+## netdis3 0.08499773 0.005900766 0.009547675 0.007177066 0.1078916 0.1144589
+## netdis4 0.20037679 0.045244760 0.018904439 0.112043371 0.3361503 0.2631420
+##              [,7]         [,8]         [,9]        [,10]
+## netdis3 0.1101426 0.0006494388 2.478794e-05 0.0004097632
+## netdis4 0.4818139 0.0274434372 3.227187e-02 0.0928126401
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+
+

With Modified Expected Counts: Geometric Poisson

+
bin_counts_fn <- density_binned_counts_gp
+
+exp_counts_fn <- purrr::partial(netdis_expected_counts,
+                                scale_fn = NULL)
+
+# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph = NULL,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges,
+                               bin_counts_fn = bin_counts_fn,
+                               exp_counts_fn = exp_counts_fn)
+
+print(results$netdis)
+
##              [,1]      [,2]      [,3]       [,4]       [,5]       [,6]
+## netdis3 0.8822527 0.9101084 0.8838054 0.96266771 0.04173551 0.03585169
+## netdis4 0.1892716 0.5735233 0.3719671 0.04604718 0.60270399 0.20370737
+##               [,7]         [,8]        [,9]       [,10]
+## netdis3 0.06271238 0.0004211575 0.005364888 0.009114229
+## netdis4 0.12978637 0.7173089685 0.487688692 0.371848474
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+
+

With Modified Expected Counts: Simple Mean

+
binning_fn <- single_density_bin
+bin_counts_fn <- density_binned_counts
+exp_counts_fn <- netdis_expected_counts
+
+# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph = NULL,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges,
+                               binning_fn = binning_fn,
+                               bin_counts_fn = bin_counts_fn,
+                               exp_counts_fn = exp_counts_fn)
+
+print(results$netdis)
+
##              [,1]      [,2]      [,3]       [,4]      [,5]      [,6]      [,7]
+## netdis3 0.3116860 0.8254261 0.8768637 0.04053921 0.8531485 0.8226894 0.2353732
+## netdis4 0.9592365 0.2009423 0.7974697 0.21688688 0.7734930 0.2144558 0.8030030
+##               [,8]      [,9]     [,10]
+## netdis3 0.01970843 0.8288649 0.9167543
+## netdis4 0.39992007 0.3300305 0.6301018
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+ + + + + + + + + + + diff --git a/doc/netdis_pairwise_comparisons.R b/doc/netdis_pairwise_comparisons.R new file mode 100644 index 00000000..8261d7a4 --- /dev/null +++ b/doc/netdis_pairwise_comparisons.R @@ -0,0 +1,74 @@ +## ----------------------------------------------------------------------------- +# Load libraries +library("netdist") +library("purrr") + +## ----------------------------------------------------------------------------- +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + + +## ----------------------------------------------------------------------------- +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Calculate netdis statistics +netdis_one_to_one(graph_1, graph_2, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +## ----------------------------------------------------------------------------- +# Load query graphs +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +graph_1 <- graphs$EBV +graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] + +# Calculate netdis statistics +netdis_one_to_many(graph_1, graphs_compare, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = 
min_ego_nodes, + min_ego_edges = min_ego_edges) + +## ----------------------------------------------------------------------------- +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +print(results$netdis) +print(results$comp_spec) + diff --git a/doc/netdis_pairwise_comparisons.Rmd b/doc/netdis_pairwise_comparisons.Rmd new file mode 100644 index 00000000..d5809c4d --- /dev/null +++ b/doc/netdis_pairwise_comparisons.Rmd @@ -0,0 +1,94 @@ +--- +title: "Usage of netdis interfaces for different pairwise comparison options." +author: "Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis pairwise comparisons} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. 
+max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +``` + +## Compare two graphs +```{r} +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Calculate netdis statistics +netdis_one_to_one(graph_1, graph_2, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) +``` + +## Compare one graph to many other graphs +```{r} +# Load query graphs +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +graph_1 <- graphs$EBV +graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] + +# Calculate netdis statistics +netdis_one_to_many(graph_1, graphs_compare, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) +``` + +## Do pairwise netdis calculations for many graphs +```{r} +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +print(results$netdis) +print(results$comp_spec) +``` \ No newline at 
end of file diff --git a/doc/netdis_pairwise_comparisons.html b/doc/netdis_pairwise_comparisons.html new file mode 100644 index 00000000..1f0f90a4 --- /dev/null +++ b/doc/netdis_pairwise_comparisons.html @@ -0,0 +1,439 @@ + + + + + + + + + + + + + + + + +Usage of netdis interfaces for different pairwise comparison options. + + + + + + + + + + + + + + + + + + + + + + +

Usage of netdis interfaces for different pairwise comparison options.

+

Jack Roberts

+

2020-07-13

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Ego network density binning parameters
+min_bin_count <- 5
+num_bins <- 100
+
+# Reference graph
+ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
+                        package = "netdist")
+ref_graph <- read_simple_graph(ref_path, format = "ncol")
+
+
+

Compare two graphs

+
# Load query graphs
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+# Calculate netdis statistics
+netdis_one_to_one(graph_1, graph_2,
+                  ref_graph,
+                  max_graphlet_size = max_graphlet_size,
+                  neighbourhood_size = neighbourhood_size,
+                  min_ego_nodes = min_ego_nodes,
+                  min_ego_edges = min_ego_edges)
+
##   netdis3   netdis4 
+## 0.1846655 0.1749835
+
+
+

Compare one graph to many other graphs

+
# Load query graphs
+graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
+graph_1 <- graphs$EBV
+graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")]
+
+# Calculate netdis statistics
+netdis_one_to_many(graph_1, graphs_compare,
+                   ref_graph,
+                   max_graphlet_size = max_graphlet_size,
+                   neighbourhood_size = neighbourhood_size,
+                   min_ego_nodes = min_ego_nodes,
+                   min_ego_edges = min_ego_edges)
+
##               ECL       HSV-1       KSHV         VZV
+## netdis3 0.1846655 0.008264222 0.01005385 0.006777578
+## netdis4 0.1749835 0.165264120 0.01969246 0.159711160
+
+
+

Do pairwise netdis calculations for many graphs

+
# Load query graphs
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
+
+# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges)
+
+print(results$netdis)
+
##              [,1]        [,2]       [,3]        [,4]      [,5]      [,6]
+## netdis3 0.1846655 0.008264222 0.01005385 0.006777578 0.2065762 0.2091241
+## netdis4 0.1749835 0.165264120 0.01969246 0.159711160 0.2917612 0.2215579
+##              [,7]         [,8]         [,9]        [,10]
+## netdis3 0.2075471 0.0001335756 0.0001748254 0.0005964448
+## netdis4 0.4171614 0.0760242643 0.0343418653 0.1311552411
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+ + + + + + + + + + + diff --git a/doc/quickstart_netdis_2graphs.R b/doc/quickstart_netdis_2graphs.R new file mode 100644 index 00000000..a8189f71 --- /dev/null +++ b/doc/quickstart_netdis_2graphs.R @@ -0,0 +1,121 @@ +## ----------------------------------------------------------------------------- +# Load libraries +library("netdist") +library("purrr") + +## ----------------------------------------------------------------------------- +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + + +## ----------------------------------------------------------------------------- +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +## ----------------------------------------------------------------------------- +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +## ----------------------------------------------------------------------------- +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) + +## 
----------------------------------------------------------------------------- +# Load reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) + +# Scale ego-network graphlet counts by dividing by total number of k-tuples in +# ego-network (where k is graphlet size) +scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, + max_graphlet_size) + + +# Get ego-network densities +densities_ref <- ego_network_density(graphlet_counts_ref) + +# Adaptively bin ref ego-network densities +binned_densities <- binned_densities_adaptive(densities_ref, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ref_ego_density_bins <- binned_densities$breaks + +# Average ref graphlet counts across density bins +ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + scaled_graphlet_counts_ref, + binned_densities$interval_indexes) + + +## ----------------------------------------------------------------------------- +# Calculate expected graphlet counts (using ref graph ego network density bins) +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn=count_graphlet_tuples) + + +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn=count_graphlet_tuples) + +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) + +centred_graphlet_counts_2 <- 
netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) + +## ----------------------------------------------------------------------------- +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) + +## ----------------------------------------------------------------------------- + +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) + diff --git a/doc/quickstart_netdis_2graphs.Rmd b/doc/quickstart_netdis_2graphs.Rmd new file mode 100644 index 00000000..a55f1d82 --- /dev/null +++ b/doc/quickstart_netdis_2graphs.Rmd @@ -0,0 +1,150 @@ +--- +title: "Quick start guide for Netdis - 2 graphs" +author: "Martin O'Reilly, Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Quick start for Netdis - 2 graphs} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Load graphs +```{r} +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. 
+max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 +``` + +## Generate ego networks +```{r} +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) +``` + +## Count graphlets in ego networks +```{r} +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) +``` + +## Use a reference graph to calculate expected graphlet counts in ego network density bins +```{r} +# Load reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) + +# Scale ego-network graphlet counts by dividing by total number of k-tuples in +# ego-network (where k is graphlet size) +scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, + max_graphlet_size) + + +# Get ego-network densities +densities_ref <- ego_network_density(graphlet_counts_ref) + +# Adaptively bin ref ego-network densities +binned_densities <- binned_densities_adaptive(densities_ref, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ref_ego_density_bins <- binned_densities$breaks + +# 
Average ref graphlet counts across density bins +ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + scaled_graphlet_counts_ref, + binned_densities$interval_indexes) + +``` + + +## Centre graphlet counts of query graphs based on statistics of reference graph +```{r} +# Calculate expected graphlet counts (using ref graph ego network density bins) +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn=count_graphlet_tuples) + + +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn=count_graphlet_tuples) + +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) + +centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) +``` + +## Sum centred graphlet counts across all ego networks +```{r} +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) +``` + +## Calculate netdis statistics +```{r} + +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) +``` diff --git a/doc/quickstart_netdis_2graphs.html b/doc/quickstart_netdis_2graphs.html new file mode 100644 index 00000000..0a96700a --- /dev/null +++ b/doc/quickstart_netdis_2graphs.html @@ -0,0 +1,468 @@ + + + + + + + + + + + + + + + + +Quick start guide for Netdis - 2 graphs + + + + + + + + + + + + + + + + + + + + + + +

Quick start guide for Netdis - 2 graphs

+

Martin O’Reilly, Jack Roberts

+

2020-07-13

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
+
+

Load graphs

+
# Set source directory for Virus PPI graph edge files
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+# Load query graphs
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Ego network density binning parameters
+min_bin_count <- 5
+num_bins <- 100
+
+
+

Generate ego networks

+
# Get ego networks for query graphs and reference graph
+ego_1 <- make_named_ego_graph(graph_1, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+ego_2 <- make_named_ego_graph(graph_2, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+
+

Count graphlets in ego networks

+
# Count graphlets for ego networks in query and reference graphs
+graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size)
+graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size)
+
+
+

Use a reference graph to calculate expected graphlet counts in ego network density bins

+
# Load reference graph
+ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
+                        package = "netdist")
+ref_graph <- read_simple_graph(ref_path, format = "ncol")
+
+ego_ref <- make_named_ego_graph(ref_graph, 
+                                order = neighbourhood_size, 
+                                min_ego_nodes = min_ego_nodes, 
+                                min_ego_edges = min_ego_edges)
+
+graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size)
+
+# Scale ego-network graphlet counts by dividing by total number of k-tuples in
+# ego-network (where k is graphlet size)
+scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, 
+                                                        max_graphlet_size)
+
+
+# Get ego-network densities
+densities_ref <- ego_network_density(graphlet_counts_ref)
+
+# Adaptively bin ref ego-network densities
+binned_densities <- binned_densities_adaptive(densities_ref, 
+                                              min_counts_per_interval = min_bin_count, 
+                                              num_intervals = num_bins)
+
+ref_ego_density_bins <- binned_densities$breaks
+
+# Average ref graphlet counts across density bins
+ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
+                                  scaled_graphlet_counts_ref, 
+                                  binned_densities$interval_indexes)
+
+
+

Centre graphlet counts of query graphs based on statistics of reference graph

+
# Calculate expected graphlet counts (using ref graph ego network density bins)
+exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, 
+                                                                 ref_ego_density_bins, 
+                                                                 ref_binned_graphlet_counts,
+                                                                 max_graphlet_size,
+                                                                 scale_fn=count_graphlet_tuples)
+
+
+exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, 
+                                                                 ref_ego_density_bins, 
+                                                                 ref_binned_graphlet_counts,
+                                                                 max_graphlet_size,
+                                                                 scale_fn=count_graphlet_tuples)
+
+# Centre graphlet counts by subtracting expected counts
+centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1,
+                                                        exp_graphlet_counts_1,
+                                                        max_graphlet_size)
+
+centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2,
+                                                        exp_graphlet_counts_2,
+                                                        max_graphlet_size)
+
+
+

Sum centred graphlet counts across all ego networks

+
sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1)
+
+sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2)
+
+
+

Calculate netdis statistics

+
netdis_result <- netdis_uptok(sum_graphlet_counts_1, 
+                              sum_graphlet_counts_2, 
+                              max_graphlet_size)
+
+print(netdis_result)
+
##   netdis3   netdis4 
+## 0.1846655 0.1749835
+
+ + + + + + + + + + + diff --git a/man/adaptive_breaks.Rd b/man/adaptive_breaks.Rd index 497cfc31..d94b0e74 100644 --- a/man/adaptive_breaks.Rd +++ b/man/adaptive_breaks.Rd @@ -13,15 +13,15 @@ adaptive_breaks(x, min_count, breaks) \item{min_count}{The minimum count for each bin} \item{breaks}{Either a vector containing an intital set of breaks or a single -number indicating how many uniformly spaced intervals to use when constructing -the initial set of breaks. If a single number is provided, the minumum break -will be the minimum value of x and the maximum break will be the maximum -value of x.} +number indicating how many uniformly spaced intervals to use when +constructing the initial set of breaks. If a single number is provided, the +minumum break will be the minimum value of x and the maximum break will be +the maximum value of x.} } \description{ Starts by binning the variable by the breaks provided in \code{breaks} (if \code{breaks} is a vector), or generating a set of \code{breaks} at uniformly -spaced intervals (if \code{breaks} is a single number). It then iteratively -merges intervals with counts lower than \code{min_count} by removing breaks +spaced intervals (if \code{breaks} is a single number). It then iteratively +merges intervals with counts lower than \code{min_count} by removing breaks until all remaining intervals have counts of at least \code{min_count}. 
} diff --git a/man/area_between_dhist_ecmfs.Rd b/man/area_between_dhist_ecmfs.Rd index d5320d15..f123ec5b 100644 --- a/man/area_between_dhist_ecmfs.Rd +++ b/man/area_between_dhist_ecmfs.Rd @@ -2,16 +2,16 @@ % Please edit documentation in R/dhist.R \name{area_between_dhist_ecmfs} \alias{area_between_dhist_ecmfs} -\title{Calculate area between two discrete histogram empirical cumulative +\title{Calculate area between two discrete histogram empirical cumulative mass functions (ECMFs)} \usage{ area_between_dhist_ecmfs(dhist_ecmf1, dhist_ecmf2) } \arguments{ -\item{dhist_ecmf1}{An object of class \code{dhist_ecmf}, returned from a call +\item{dhist_ecmf1}{An object of class \code{dhist_ecmf}, returned from a call to the \code{dhist_ecmf} function} -\item{dhist_ecmf2}{An object of class \code{dhist_ecmf}, returned from a call +\item{dhist_ecmf2}{An object of class \code{dhist_ecmf}, returned from a call to the \code{dhist_ecmf} function} } \value{ @@ -19,6 +19,6 @@ area The area between the two discrete histogram ECMFs, calculated as the integral of the absolute difference between the two ECMFs } \description{ -Calculate area between two discrete histogram empirical cumulative +Calculate area between two discrete histogram empirical cumulative mass functions (ECMFs) } diff --git a/man/area_between_offset_ecmfs.Rd b/man/area_between_offset_ecmfs.Rd index 1c55beb7..9dc8149d 100644 --- a/man/area_between_offset_ecmfs.Rd +++ b/man/area_between_offset_ecmfs.Rd @@ -7,17 +7,17 @@ area_between_offset_ecmfs(ecmf1, ecmf2, offset) } \arguments{ -\item{ecmf1}{An Empirical Cululative Mass Function (ECMF) object of class +\item{ecmf1}{An Empirical Cululative Mass Function (ECMF) object of class \code{dhist_ecmf}} -\item{ecmf2}{An Empirical Cululative Mass Function (ECMF) object of class +\item{ecmf2}{An Empirical Cululative Mass Function (ECMF) object of class \code{dhist_ecmf}} \item{offset}{An offset to add to all locations of the first ECMF. 
Postive offsets will shift the ECMF to the right and negative ones to the left.} } \value{ -area The area between the two ECMFs, calculated as the integral of +area The area between the two ECMFs, calculated as the integral of the absolute difference between the two ECMFs } \description{ diff --git a/man/as_smoothed_dhist.Rd b/man/as_smoothed_dhist.Rd index 935b1043..1dfc0c14 100644 --- a/man/as_smoothed_dhist.Rd +++ b/man/as_smoothed_dhist.Rd @@ -18,7 +18,7 @@ A copy of a \code{dhist} object with its \code{smoothing_window_width} attribute set to the value provided \code{smoothing_window_width} parameter. } \description{ -Returns a "smoothed" copy of a \code{dhist} object with its -\code{smoothing_window_width} attribute set to the value provided +Returns a "smoothed" copy of a \code{dhist} object with its +\code{smoothing_window_width} attribute set to the value provided \code{smoothing_window_width} parameter. } diff --git a/man/as_unsmoothed_dhist.Rd b/man/as_unsmoothed_dhist.Rd index 0ce52261..d4304810 100644 --- a/man/as_unsmoothed_dhist.Rd +++ b/man/as_unsmoothed_dhist.Rd @@ -14,6 +14,6 @@ A copy of a \code{dhist} object with its \code{smoothing_window_width} attribute set to 0. } \description{ -Returns an "unsmoothed" copy of a \code{dhist} object with its +Returns an "unsmoothed" copy of a \code{dhist} object with its \code{smoothing_window_width} attribute set to 0. 
} diff --git a/man/binned_densities_adaptive.Rd b/man/binned_densities_adaptive.Rd index cb8dd9ce..369f00e3 100644 --- a/man/binned_densities_adaptive.Rd +++ b/man/binned_densities_adaptive.Rd @@ -2,14 +2,19 @@ % Please edit documentation in R/graph_binning.R \name{binned_densities_adaptive} \alias{binned_densities_adaptive} -\title{INTERNAL FUNCTION - Do not call directly} +\title{binned_densities_adaptive} \usage{ binned_densities_adaptive(densities, min_counts_per_interval, num_intervals) } +\arguments{ +\item{densities}{Density values to use for binning.} + +\item{min_counts_per_interval}{Minimum count for each bin.} + +\item{num_intervals}{Initial number of density bins to generate. +TODO: Remove @export prior to publishing} +} \description{ -Used by \code{netdis_expected_graphlet_counts_ego_fn} to -generate a function for calculating expected ego-network graphlet counts -from the statistics of a provided reference graph. -Temporarily accessible during development. -TODO: Remove @export prior to publishing +Adaptive binning function guaranteeing a minimum number of entries in each +bin. } diff --git a/man/cost_matrix.Rd b/man/cost_matrix.Rd index 5ae538e0..10d28f75 100755 --- a/man/cost_matrix.Rd +++ b/man/cost_matrix.Rd @@ -15,6 +15,6 @@ cost_matrix(bin_centres1, bin_centres2) Cost matrix } \description{ -Generates a matrix for the cost of moving a unit of mass between each bin in +Generates a matrix for the cost of moving a unit of mass between each bin in histogram 1 and each bin in histogram 2. 
} diff --git a/man/count_graphlet_tuples.Rd b/man/count_graphlet_tuples.Rd new file mode 100644 index 00000000..4109039a --- /dev/null +++ b/man/count_graphlet_tuples.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{count_graphlet_tuples} +\alias{count_graphlet_tuples} +\title{For each graphlet calculate the number of possible sets of k nodes in the +query graph, where k is the number of nodes in the graphlet.} +\usage{ +count_graphlet_tuples(graph_graphlet_counts, max_graphlet_size) +} +\arguments{ +\item{graph_graphlet_counts}{Node and graphlet counts for a single graph.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets included +in the tuple counts. Currently only size 4 and 5 are supported.} +} +\description{ +For each graphlet calculate the number of possible sets of k nodes in the +query graph, where k is the number of nodes in the graphlet. +} diff --git a/man/count_graphlet_tuples_ego.Rd b/man/count_graphlet_tuples_ego.Rd new file mode 100644 index 00000000..1bd70670 --- /dev/null +++ b/man/count_graphlet_tuples_ego.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{count_graphlet_tuples_ego} +\alias{count_graphlet_tuples_ego} +\title{Run count_graphlet_tuples across pre-computed ego networks.} +\usage{ +count_graphlet_tuples_ego(graphlet_counts, max_graphlet_size) +} +\arguments{ +\item{graphlet_counts}{Matrix of graphlet and node counts (columns) for a +number of ego networks (rows).} + +\item{max_graphlet_size}{Determines the maximum size of graphlets included +in the tuple counts. Currently only size 4 and 5 are supported.} +} +\description{ +Run count_graphlet_tuples across pre-computed ego networks. 
+} diff --git a/man/count_graphlets_ego.Rd b/man/count_graphlets_ego.Rd index 262e616e..5ae422c8 100644 --- a/man/count_graphlets_ego.Rd +++ b/man/count_graphlets_ego.Rd @@ -4,30 +4,43 @@ \alias{count_graphlets_ego} \title{Ego-network graphlet counts} \usage{ -count_graphlets_ego(graph, max_graphlet_size = 4, neighbourhood_size, - return_ego_networks = FALSE) +count_graphlets_ego( + graph, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + return_ego_networks = FALSE +) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graph}{An undirected, simple graph as an \code{igraph} object.} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted. Currently only size 4 (default) and 5 are supported.} -\item{neighbourhood_size}{The number of steps from the source node to include -nodes for each ego-network.} +\item{neighbourhood_size}{The number of steps from the source node used to select the +neighboring nodes to be included in the source node ego-network. (Default 2).} -\item{return_ego_networks}{If \code{TRUE}, return ego-networks alongside +\item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} +nodes are returned. (Default 3).} + +\item{min_ego_edges}{Only ego networks with at least \code{min_ego_edges} +edges are returned. (Default 1).} + +\item{return_ego_networks}{If \code{TRUE}, return ego-networks alongside graphlet counts to enable further processing.} } \value{ -If \code{return_ego_networks = FALSE}, returns an RxC matrix -containing counts of each graphlet (columns, C) for each ego-network in the -input graph (rows, R). 
Columns are labelled with graphlet IDs and rows are +If \code{return_ego_networks = FALSE}, returns an RxC matrix +containing counts of each graphlet (columns, C) for each ego-network in the +input graph (rows, R). Columns are labelled with graphlet IDs and rows are labelled with the ID of the central node in each ego-network (if nodes in the input graph are labelled). If \code{return_ego_networks = TRUE}, returns a list with the following elements: \itemize{ - \item \code{graphlet_counts}: A matrix containing graphlet counts for each + \item \code{graphlet_counts}: A matrix containing graphlet counts for each ego-network in the input graph as described above. \item \code{ego_networks}: The ego-networks of the query graph. } diff --git a/man/count_graphlets_ego_scaled.Rd b/man/count_graphlets_ego_scaled.Rd deleted file mode 100644 index fb6eb069..00000000 --- a/man/count_graphlets_ego_scaled.Rd +++ /dev/null @@ -1,41 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R -\name{count_graphlets_ego_scaled} -\alias{count_graphlets_ego_scaled} -\title{Scaled graphlet count for ego-networks} -\usage{ -count_graphlets_ego_scaled(graph, max_graphlet_size, neighbourhood_size, - return_ego_networks = FALSE) -} -\arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} - -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} - -\item{neighbourhood_size}{The number of steps from the source node to include -nodes for each ego-network.} - -\item{return_ego_networks}{If \code{TRUE}, return ego-networks alongside -graphlet counts to enable further processing.} -} -\value{ -If \code{return_ego_networks = FALSE}, returns an RxC matrix -containing counts of each graphlet (columns, C) for each ego-network in the -input graph (rows, R). 
Columns are labelled with graphlet IDs and rows are -labelled with the ID of the central node in each ego-network (if nodes in the -input graph are labelled). If \code{return_ego_networks = TRUE}, returns a -list with the following elements: -\itemize{ - \item \code{graphlet_counts}: A matrix containing graphlet counts for each - ego-network in the input graph as described above. - \item \code{ego_networks}: The ego-networks of the query graph. -} -} -\description{ -Calculates graphlet counts for the n-step ego-network of each node in a graph, -scaled by dividing the graphlet counts for each ego-network by the total -number of possible groupings of nodes in the ego-network with the same number -of nodes as each graphlet. This scaling factor is choose(n, k), where n is the -number of nodes in the ego-network and k is the number of nodes in the graphlet. -} diff --git a/man/count_graphlets_for_graph.Rd b/man/count_graphlets_for_graph.Rd index 56db3c7d..587a7d7d 100644 --- a/man/count_graphlets_for_graph.Rd +++ b/man/count_graphlets_for_graph.Rd @@ -7,18 +7,19 @@ count_graphlets_for_graph(graph, max_graphlet_size) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graph}{A connected, undirected, simple graph as an \code{igraph} object} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted. Currently only size 4 and 5 are supported.} } \value{ Vector containing counts of each graphlet for the graph. } \description{ -Calculates total graphlet counts for a \code{igraph} graph object using the +Calculates total graphlet counts for a \code{igraph} graph object using the ORCA fast graphlet orbit counting package. 
Per-node graphlet counts are calculated by summing orbits over graphlets. These are then divided by the -number of nodes comprising each graphlet to avoid counting the same graphlet +number of nodes comprising each graphlet to avoid counting the same graphlet multiple times. } diff --git a/man/count_graphlets_per_node.Rd b/man/count_graphlets_per_node.Rd index c8353aed..8260a92e 100644 --- a/man/count_graphlets_per_node.Rd +++ b/man/count_graphlets_per_node.Rd @@ -7,17 +7,18 @@ count_graphlets_per_node(graph, max_graphlet_size) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graph}{A connected, undirected, simple graph as an \code{igraph} object} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted. Currently only size 4 and 5 are supported.} } \value{ -ORCA-format matrix containing counts of each graphlet (columns) at +ORCA-format matrix containing counts of each graphlet (columns) at each node in the graph (rows). } \description{ -Calculates graphlet counts for each node in an \code{igraph} graph object, -using the ORCA fast graphlet orbit counting package. by summing orbits over +Calculates graphlet counts for each node in an \code{igraph} graph object, +using the ORCA fast graphlet orbit counting package. by summing orbits over graphlets. 
} diff --git a/man/count_orbits_per_node.Rd b/man/count_orbits_per_node.Rd index d8233c32..0566eba8 100644 --- a/man/count_orbits_per_node.Rd +++ b/man/count_orbits_per_node.Rd @@ -7,16 +7,17 @@ count_orbits_per_node(graph, max_graphlet_size) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graph}{A undirected, simple graph as an \code{igraph} object.} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted. Currently only size 4 and 5 are supported.} } \value{ ORCA-format matrix containing counts of each graphlet orbit (columns) at each node in the graph (rows). } \description{ -Calculates graphlet orbit counts for each node in an \code{igraph} graph -object, using the ORCA fast graphlet orbit counting package. +Calculates graphlet orbit counts for each node in an \code{igraph} graph +object, using the \code{orca} fast graphlet orbit counting package. 
} diff --git a/man/counts_from_observations.Rd b/man/counts_from_observations.Rd index 501af571..ba548ecb 100644 --- a/man/counts_from_observations.Rd +++ b/man/counts_from_observations.Rd @@ -9,3 +9,6 @@ counts_from_observations(features) \arguments{ \item{features}{A Matrix with doubles.} } +\description{ +Count number of occurrences +} diff --git a/man/cross_comp_to_matrix.Rd b/man/cross_comp_to_matrix.Rd index d26bfa48..7b76edd0 100644 --- a/man/cross_comp_to_matrix.Rd +++ b/man/cross_comp_to_matrix.Rd @@ -15,7 +15,7 @@ using \code{cross_comparison_spec}} \value{ A square symmetric matrix with a zero diagonal, with elements Cij and Cji populated from the element from \code{measure} corresponding to -the row of \code{cross_comparison_spec} with \code{index_a = i} and +the row of \code{cross_comparison_spec} with \code{index_a = i} and \code{index_b = j} } \description{ diff --git a/man/cross_comparison_spec.Rd b/man/cross_comparison_spec.Rd index 790a74b5..35ab245f 100644 --- a/man/cross_comparison_spec.Rd +++ b/man/cross_comparison_spec.Rd @@ -4,19 +4,24 @@ \alias{cross_comparison_spec} \title{Generate a cross-comparison specification} \usage{ -cross_comparison_spec(named_list) +cross_comparison_spec(named_list, how = "many-to-many") } \arguments{ \item{named_list}{A named list of items for which an exhaustive pair-wise cross-comparison is required.} + +\item{how}{How to generate pair-wise combinations. Either "many-to-many" +(default) which generates all possible pair-wise combinations, or +"one-to-many" which generates all combinations between the first element +in named_list and the rest of the elements only.} } \value{ A matrix with one row for each possible pair-wise combination -of elements from the provided named list. The first and second columns -contain the names of the elements in the pair and the third and fourth +of elements from the provided named list. 
The first and second columns +contain the names of the elements in the pair and the third and fourth columns contain the indexes of these elements in the provided list. } \description{ -Creates a cross-comparison matrix with all possible pair-wise combinations +Creates a cross-comparison matrix with pair-wise combinations of elements from the provided list. } diff --git a/man/density_binned_counts.Rd b/man/density_binned_counts.Rd new file mode 100644 index 00000000..91bf7eca --- /dev/null +++ b/man/density_binned_counts.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{density_binned_counts} +\alias{density_binned_counts} +\title{Used to calculate aggregated graphlet counts for each density bin.} +\usage{ +density_binned_counts( + graphlet_counts, + density_interval_indexes, + agg_fn = mean, + scale_fn = NULL, + max_graphlet_size = NULL +) +} +\arguments{ +\item{graphlet_counts}{Graphlet and node counts (columns) for a number of +ego_networks (rows).} + +\item{density_interval_indexes}{Density bin index for +each ego network.} + +\item{agg_fn}{Function to aggregate counts in each bin +(default \code{agg_fn = mean}).} + +\item{scale_fn}{Optional function to apply a transformation/scaling +to the raw graphlet_counts. Must have arguments \code{graphlet_counts} and +\code{max_graphlet_size}, and return a transformed \code{graphlet_counts} +object with the same number of rows as the input, and columns for all +graphlets up to \code{max_graphlet_size}.} + +\item{max_graphlet_size}{Optionally passed and used by scale_fn.} +} +\description{ +Used to calculate aggregated graphlet counts for each density bin. 
+} diff --git a/man/density_binned_counts_gp.Rd b/man/density_binned_counts_gp.Rd new file mode 100644 index 00000000..0dcd867e --- /dev/null +++ b/man/density_binned_counts_gp.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{density_binned_counts_gp} +\alias{density_binned_counts_gp} +\title{Calculate expected counts in density bins using the +geometric poisson (Polya-Aeppli) approximation.} +\usage{ +density_binned_counts_gp( +  graphlet_counts, +  density_interval_indexes, +  max_graphlet_size +) +} +\arguments{ +\item{graphlet_counts}{Graphlet counts for a number of ego_networks.} + +\item{density_interval_indexes}{Density bin index for +each ego network.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets +included in graphlet_counts. Currently only size 4 and 5 are supported.} +} +\description{ +Calculate expected counts in density bins using the +geometric poisson (Polya-Aeppli) approximation. +} diff --git a/man/density_from_counts.Rd b/man/density_from_counts.Rd new file mode 100644 index 00000000..6436ca4e --- /dev/null +++ b/man/density_from_counts.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{density_from_counts} +\alias{density_from_counts} +\title{Calculate edge density for a single graph.} +\usage{ +density_from_counts(graphlet_counts) +} +\arguments{ +\item{graphlet_counts}{Vector of pre-calculated graphlet, edge and node +counts. Must have named items "N" (node counts) and "G0" (edge counts).} +} +\description{ +Calculate edge density for a single graph. 
+} diff --git a/man/dhist.Rd b/man/dhist.Rd index cb9e6890..4566df12 100644 --- a/man/dhist.Rd +++ b/man/dhist.Rd @@ -10,7 +10,7 @@ dhist(locations, masses, smoothing_window_width = 0, sorted = TRUE) \item{locations}{A 1D numeric vector specifying the discrete locations of the histogram bins} -\item{masses}{A 1D numeric vector specifying the mass present at each +\item{masses}{A 1D numeric vector specifying the mass present at each location} \item{smoothing_window_width}{If greater than 0, the discrete histogram will @@ -18,7 +18,7 @@ be treated as having the mass at each location "smoothed" uniformly across a bin centred on the location and having width = \code{smoothing_window_width} (default = \code{0} - no smoothing)} -\item{sorted}{Whether or not to return a discrete histogram with locations +\item{sorted}{Whether or not to return a discrete histogram with locations and masses sorted by ascending mass (default = \code{TRUE})} } \value{ @@ -31,10 +31,10 @@ is a list of class \code{dhist} with the following named elements: Note that locations where no mass is present are not included in the returned \code{dhist} object. Mass in these discrete histograms is treated as being present precisely at the specified location. Discrete histograms should not be used -for data where observations have been grouped into bins representing ranges +for data where observations have been grouped into bins representing ranges of observation values. } \description{ -Creates a discrete histogram object of class \code{dhist}, with bin +Creates a discrete histogram object of class \code{dhist}, with bin \code{locations} and \code{masses} set to the 1D numeric vectors provided. 
} diff --git a/man/dhist_ecmf.Rd b/man/dhist_ecmf.Rd index 132b5f23..447f0966 100644 --- a/man/dhist_ecmf.Rd +++ b/man/dhist_ecmf.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/dhist.R \name{dhist_ecmf} \alias{dhist_ecmf} -\title{Generate interpolating empirical cumulative mass function (ECMF) for +\title{Generate interpolating empirical cumulative mass function (ECMF) for a discrete histogram} \usage{ dhist_ecmf(dhist) @@ -12,13 +12,13 @@ dhist_ecmf(dhist) } \value{ An interpolating ECMF as an \code{approxfun} object. This function -will return the interpolated cumulative mass for a vector of arbitrary +will return the interpolated cumulative mass for a vector of arbitrary locations. If \code{dhist$smoothing_window_width} is zero, the ECMF will be piecewise constant. If \code{dhist$smoothing_window_width} is one, the ECMF will be piece-wise linear. If \code{dhist$smoothing_window_width} is any other value, the ECMF will not correctly represent the cumulative mass. } \description{ -Generate interpolating empirical cumulative mass function (ECMF) for +Generate interpolating empirical cumulative mass function (ECMF) for a discrete histogram } diff --git a/man/dhist_std.Rd b/man/dhist_std.Rd index 544d6e6b..120064c5 100644 --- a/man/dhist_std.Rd +++ b/man/dhist_std.Rd @@ -13,8 +13,8 @@ dhist_std(dhist) Standard deviation of histogram } \description{ -Calculates standard deviation directly from the discrete histogram by using +Calculates standard deviation directly from the discrete histogram by using locations weighted by masses. -NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses +NOTE: Does not apply bias correction (i.e. 
N-1 denominator) as masses may not represent counts so N is not necessarily known } diff --git a/man/dhist_variance.Rd b/man/dhist_variance.Rd index 6ea354f5..f35c7eb9 100644 --- a/man/dhist_variance.Rd +++ b/man/dhist_variance.Rd @@ -14,7 +14,7 @@ Variance of histogram } \description{ Calculates variance directly from the discrete histogram by using locations -weighted by masses. -NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses +weighted by masses. +NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses may not represent counts so N is not necessarily known } diff --git a/man/ecmf_knots.Rd b/man/ecmf_knots.Rd index aed18724..5d75167d 100644 --- a/man/ecmf_knots.Rd +++ b/man/ecmf_knots.Rd @@ -4,23 +4,23 @@ \alias{ecmf_knots} \title{Get "knots" for discrete histogram empirical cumulative mass function (ECMF). The "knots" are the x-values at which the y-value of the ECDM changes -gradient (i.e. the x-values between which the ECMF does its constant or +gradient (i.e. the x-values between which the ECMF does its constant or linear interpolates)} \usage{ ecmf_knots(dhist_ecmf) } \arguments{ -\item{dhist_ecmf}{An object of class \code{dhist_ecmf}, returned from a call +\item{dhist_ecmf}{An object of class \code{dhist_ecmf}, returned from a call to the \code{dhist_ecmf} function} } \value{ -x_knots A list of "knots" for the ECMF, containing all x-values at +x_knots A list of "knots" for the ECMF, containing all x-values at which the y-value changes gradient (i.e. the x-values between which the ECMF does its constant or linear interpolation) } \description{ Get "knots" for discrete histogram empirical cumulative mass function (ECMF). The "knots" are the x-values at which the y-value of the ECDM changes -gradient (i.e. the x-values between which the ECMF does its constant or +gradient (i.e. 
the x-values between which the ECMF does its constant or linear interpolates) } diff --git a/man/ego_network_density.Rd b/man/ego_network_density.Rd new file mode 100644 index 00000000..43114243 --- /dev/null +++ b/man/ego_network_density.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{ego_network_density} +\alias{ego_network_density} +\title{Calculate ego network edge densities.} +\usage{ +ego_network_density(graphlet_counts) +} +\arguments{ +\item{graphlet_counts}{Matrix of pre-generated graphlet, edge and node counts +(columns) for each ego network (rows). Columns must include "N" (node counts) +and "G0" (edge counts).} +} +\description{ +Calculate ego network edge densities. +} diff --git a/man/ego_to_graphlet_counts.Rd b/man/ego_to_graphlet_counts.Rd new file mode 100644 index 00000000..87c95330 --- /dev/null +++ b/man/ego_to_graphlet_counts.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/orca_interface.R +\name{ego_to_graphlet_counts} +\alias{ego_to_graphlet_counts} +\title{ego_to_graphlet_counts} +\usage{ +ego_to_graphlet_counts(ego_networks, max_graphlet_size = 4) +} +\arguments{ +\item{ego_networks}{Named list of ego networks for a graph.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted. Currently only size 4 and 5 are supported.} +} +\value{ +returns an RxC matrix +containing counts of each graphlet (columns, C) for each ego-network +(rows, R). +Columns are labelled with graphlet IDs and rows are +labelled with the ID of the central node in each ego-network. +} +\description{ +Calculates graphlet counts for previously generated ego networks. 
+} diff --git a/man/emd_cs.Rd b/man/emd_cs.Rd index 112b3977..1e580760 100755 --- a/man/emd_cs.Rd +++ b/man/emd_cs.Rd @@ -16,7 +16,7 @@ Earth Mover's Distance between the two input histograms } \description{ Takes two discrete histograms and calculates the Wasserstein / Earth Mover's -Distance between the two histograms by summing the absolute difference +Distance between the two histograms by summing the absolute difference between the two cumulative histograms. } \references{ diff --git a/man/emd_fast_no_smoothing.Rd b/man/emd_fast_no_smoothing.Rd index eae8b8fe..66421f81 100644 --- a/man/emd_fast_no_smoothing.Rd +++ b/man/emd_fast_no_smoothing.Rd @@ -16,3 +16,7 @@ emd_fast_no_smoothing(locations1, values1, locations2, values2) \item{values2}{Cumulative masses for ECDF 2} } +\description{ +Compute Earth Mover's Distance (EMD) between two Empirical Cumulative +Density Functions (ECDFs) +} diff --git a/man/emd_lp.Rd b/man/emd_lp.Rd index 9c90e3b9..f934de99 100755 --- a/man/emd_lp.Rd +++ b/man/emd_lp.Rd @@ -19,7 +19,7 @@ emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2) Earth Mover's Distance between the two input histograms } \description{ -Takes two sets of histogram bin masses and bin centres and calculates the +Takes two sets of histogram bin masses and bin centres and calculates the Earth Mover's Distance between the two histograms by solving the Transport Problem using linear programming. 
} diff --git a/man/exp_counts_bin_gp.Rd b/man/exp_counts_bin_gp.Rd new file mode 100644 index 00000000..dc3bc3c5 --- /dev/null +++ b/man/exp_counts_bin_gp.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{exp_counts_bin_gp} +\alias{exp_counts_bin_gp} +\title{INTERNAL FUNCTION - DO NOT CALL DIRECTLY +Used by \code{density_binned_counts_gp} +Calculate expected counts with geometric poisson (Polya-Aeppli) +approximation for a single density bin.} +\usage{ +exp_counts_bin_gp( +  bin_idx, +  graphlet_counts, +  density_interval_indexes, +  max_graphlet_size +) +} +\arguments{ +\item{bin_idx}{Density bin index to calculate expected counts for.} + +\item{graphlet_counts}{Graphlet counts for a number of ego_networks.} + +\item{density_interval_indexes}{Density bin indexes for each ego network in +\code{graphlet_counts}.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets +included in graphlet_counts. Currently only size 4 and 5 are supported.} +} +\description{ +INTERNAL FUNCTION - DO NOT CALL DIRECTLY +Used by \code{density_binned_counts_gp} +Calculate expected counts with geometric poisson (Polya-Aeppli) +approximation for a single density bin. 
+} diff --git a/man/gdd.Rd b/man/gdd.Rd index 83dc624e..7444bbe7 100644 --- a/man/gdd.Rd +++ b/man/gdd.Rd @@ -4,27 +4,32 @@ \alias{gdd} \title{Graphlet-based degree distributions (GDDs)} \usage{ -gdd(graph, feature_type = "orbit", max_graphlet_size = 4, - ego_neighbourhood_size = 0) +gdd( + graph, + feature_type = "orbit", + max_graphlet_size = 4, + ego_neighbourhood_size = 0 +) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graph}{A connected, undirected, simple graph as an \code{igraph} object} \item{feature_type}{Type of graphlet-based feature to count: "graphlet" counts the number of graphlets each node participates in; "orbit" calculates the number of graphlet orbits each node participates in.} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted. Currently only size 4 and 5 are supported.} -\item{ego_neighbourhood_size}{The number of steps from the source node to include -nodes for each ego-network.} +\item{ego_neighbourhood_size}{The number of steps from the source node used to select the +neighboring nodes to be included in the source node ego-network.} } \value{ List of graphlet-based degree distributions, with each distribution represented as a \code{dhist} discrete histogram object. } \description{ -Generates graphlet-based degree distributions from \code{igraph} graph object, +Short-cut function to create graphlet-based degree distributions from \code{igraph} graph object using the ORCA fast graphlet orbit counting package. 
} diff --git a/man/gdd_for_all_graphs.Rd b/man/gdd_for_all_graphs.Rd index 51f4380d..7e6956bf 100644 --- a/man/gdd_for_all_graphs.Rd +++ b/man/gdd_for_all_graphs.Rd @@ -5,9 +5,15 @@ \title{Load all graphs in a directory and calculates their Graphlet-based Degree Distributions (GDDs)} \usage{ -gdd_for_all_graphs(source_dir, format = "ncol", pattern = ".txt", - feature_type = "orbit", max_graphlet_size = 4, - ego_neighbourhood_size = 0, mc.cores = getOption("mc.cores", 2L)) +gdd_for_all_graphs( + source_dir, + format = "ncol", + pattern = ".txt", + feature_type = "orbit", + max_graphlet_size = 4, + ego_neighbourhood_size = 0, + mc.cores = getOption("mc.cores", 2L) +) } \arguments{ \item{source_dir}{Path to graph directory} @@ -16,14 +22,15 @@ gdd_for_all_graphs(source_dir, format = "ncol", pattern = ".txt", \item{pattern}{Filename pattern to match graph files} -\item{feature_type}{Type of graphlet-based degree distributions. Can be +\item{feature_type}{Type of graphlet-based degree distributions. Can be \code{graphlet} to count graphlets or \code{orbit} to count orbits.} -\item{max_graphlet_size}{Maximum size of graphlets to use when generating GDD} +\item{max_graphlet_size}{Maximum size of graphlets to use when generating GDD. +Currently only size 4 and 5 are supported.} -\item{ego_neighbourhood_size}{The number of steps from the source node to -include nodes for each ego-network. If set to 0, ego-networks will not be -used} +\item{ego_neighbourhood_size}{The number of steps from the source node used to select the +neighboring nodes to be included in the source node ego-network. If set to 0, ego-networks will not be +used.} \item{mc.cores}{Number of cores to use for parallel processing. 
Defaults to the \code{mc.cores} option set in the R environment.} @@ -32,12 +39,12 @@ the \code{mc.cores} option set in the R environment.} A named list where each element contains a set of GDDs for a single A named list where each element contains a set of GDDs for a single -graph from the source directory. Each set of GDDs is itself a named list, +graph from the source directory. Each set of GDDs is itself a named list, where each GDD element is a \code{dhist} discrete histogram object. } \description{ -Loads graphs from all files matching the given pattern in the given directory, -converts them to indexed edge lists compatible with the ORCA fast orbit -counting package and calculates the specified set of graphlet-based degree -distributions usingthe ORCA package. +Loads graphs from all files matching the given pattern in the given +directory, converts them to indexed edge lists compatible with the ORCA fast +orbit counting package and calculates the specified set of graphlet-based +degree distributions using the ORCA package. } diff --git a/man/graph_features_to_histograms.Rd b/man/graph_features_to_histograms.Rd index a003a710..ae526218 100644 --- a/man/graph_features_to_histograms.Rd +++ b/man/graph_features_to_histograms.Rd @@ -2,22 +2,20 @@ % Please edit documentation in R/orca_interface.R \name{graph_features_to_histograms} \alias{graph_features_to_histograms} -\title{Convert a matrix of node level features to a discrete histogram for each feature} +\title{Convert a matrix of node level features to a "discrete histogram" for +each feature.} \usage{ -graph_features_to_histograms(featuresMatrix) +graph_features_to_histograms(features_matrix) } \arguments{ -\item{A}{number of nodes (rows) by number of features (columns) matrix, where -the ij entry is the score of node i on feature j (e.g. 
for ORCA output this is -counts of each graphlet or orbit at each graph vertex)} +\item{features_matrix}{A matrix whose rows represent nodes and whose columns represent different node level features. This means that entry ij provides the value of feature j for node i.} } \value{ -Feature histograms: List of discrete histograms for each +Feature histograms: List of "discrete histograms" for each feature } \description{ -Converts a matrix of node level features (e.g. for ORCA output this is counts -of each graphlet or orbit at each graph vertex) to -a set of discrete histograms (a histogram of counts for each distinct value -across all graph vertices for each feature with no binning) +Converts a matrix of node level features (e.g. for example counts +of multiple graphlets or orbits at each node) to +a set of histogram like objects (observed frequency distribution of each feature/column) } diff --git a/man/graph_to_indexed_edges.Rd b/man/graph_to_indexed_edges.Rd index fa72273d..02aa7cc0 100644 --- a/man/graph_to_indexed_edges.Rd +++ b/man/graph_to_indexed_edges.Rd @@ -18,6 +18,6 @@ the label for the vertice represented by index N in the edgelist Takes a igraph graph object and generates an edgelist where each edge is represented by the integer indexes of its vertices. Note that, where a graph has isolated vertices, the indexes for these vertices will not be present -in the edge list. Where a graph has no isolated vertices, the edge list will +in the edge list. Where a graph has no isolated vertices, the edge list will include all vertex indexes from 1 to numVertices. } diff --git a/man/graphlet_key.Rd b/man/graphlet_key.Rd index c2b4ce84..23f2289b 100644 --- a/man/graphlet_key.Rd +++ b/man/graphlet_key.Rd @@ -7,13 +7,13 @@ graphlet_key(max_graphlet_size) } \arguments{ -\item{max_graphlet_size}{Maximum number of nodes graphlets can contain} +\item{max_graphlet_size}{Maximum number of nodes graphlets can contain. 
Currently only size 2 to 5 are supported.} } \value{ Metadata list with the following named fields: \itemize{ \item \code{max_nodes}: Maximum number of nodes graphlets can contain - \item \code{id}: ID of each graphlet in format Gn, where n is in range 0 to + \item \code{id}: ID of each graphlet in format Gn, where n is in range 0 to num_graphlets \item \code{node_count}: Number of nodes contained within each graphlet } diff --git a/man/harmonise_dhist_locations.Rd b/man/harmonise_dhist_locations.Rd index 6b941442..18b7e34a 100644 --- a/man/harmonise_dhist_locations.Rd +++ b/man/harmonise_dhist_locations.Rd @@ -16,6 +16,6 @@ Harmonised histograms } \description{ Where a location only exists in one histogram, add this location to the other -histogram with zero mass. This ensures that all location exist in both +histogram with zero mass. This ensures that all location exist in both histograms. } diff --git a/man/indexed_edges_to_graph.Rd b/man/indexed_edges_to_graph.Rd index c51e223c..971da377 100644 --- a/man/indexed_edges_to_graph.Rd +++ b/man/indexed_edges_to_graph.Rd @@ -7,14 +7,14 @@ indexed_edges_to_graph(indexed_edges) } \arguments{ -\item{indexed_edges}{A 2 x numEdges edgelist with vertices labelled with +\item{indexed_edges}{A 2 x numEdges edgelist with vertices labelled with integer indices, with an optional "vertex_names" attribute} } \value{ An igraph graph object } \description{ -Takes an integer indexed edgelist (where each edge is represented by the +Takes an integer indexed edgelist (where each edge is represented by the integer indexes of its vertices) and converts it to an igraph format graph. If the edge list has a "vertex_names" attribute, this will be used to name the vertices in the resultant graph. 
diff --git a/man/interval_index.Rd b/man/interval_index.Rd index 2c8a539e..94a5f40a 100644 --- a/man/interval_index.Rd +++ b/man/interval_index.Rd @@ -12,9 +12,9 @@ interval_index(x, breaks, out_of_range_intervals = FALSE) \item{breaks}{The boundaries between bins} \item{out_of_range_intervals}{If \code{TRUE}, "out of range" values lying -below the first break or above the last break will be assigned to additional -unbounded lower and upper extrema intervals. If \code{FALSE} these "out of -range" values will be assigned to intervals bounded by the lowest or +below the first break or above the last break will be assigned to additional +unbounded lower and upper extrema intervals. If \code{FALSE} these "out of +range" values will be assigned to intervals bounded by the lowest or uppermost break respectively.} } \value{ diff --git a/man/is_dhist.Rd b/man/is_dhist.Rd index 429a4b3e..1f550d6f 100644 --- a/man/is_dhist.Rd +++ b/man/is_dhist.Rd @@ -9,13 +9,13 @@ is_dhist(x, fast_check = TRUE) \arguments{ \item{x}{An arbitrary object} -\item{fast_check}{Boolean flag indicating whether to perform only a -superficial fast check limited to checking the object's class attribute +\item{fast_check}{Boolean flag indicating whether to perform only a +superficial fast check limited to checking the object's class attribute is set to \code{dhist} (default = \code{TRUE})} } \description{ -Checks if the input object is of class \code{dhist}. If \code{fast_check} is -\code{TRUE} then the only check is whether the object has a class attribute of +Checks if the input object is of class \code{dhist}. If \code{fast_check} is +\code{TRUE} then the only check is whether the object has a class attribute of \code{dhist}. If \code{fast_check} is \code{FALSE} (default), then checks are also made to ensure that the object has the structure required of a \code{dhist} object. 
diff --git a/man/make_named_ego_graph.Rd b/man/make_named_ego_graph.Rd index 9bfeca25..8fc1868a 100644 --- a/man/make_named_ego_graph.Rd +++ b/man/make_named_ego_graph.Rd @@ -4,7 +4,7 @@ \alias{make_named_ego_graph} \title{Get ego-networks for a graph as a named list} \usage{ -make_named_ego_graph(graph, order, ...) +make_named_ego_graph(graph, order, min_ego_nodes = 3, min_ego_edges = 1, ...) } \arguments{ \item{graph}{An \code{igraph} object} @@ -12,7 +12,13 @@ make_named_ego_graph(graph, order, ...) \item{order}{The number of steps from the source node to include nodes for each ego-network.} -\item{...}{Additional parameters to be passed to the underlying +\item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} +nodes are returned.} + +\item{min_ego_edges}{Only ego networks with at least \code{min_ego_edges} +edges are returned.} + +\item{...}{Additional parameters to be passed to the underlying \code{igraph::make_ego_graph} function used.} } \description{ diff --git a/man/mean_centre_dhist.Rd b/man/mean_centre_dhist.Rd index 933e68c9..b4b18ae1 100644 --- a/man/mean_centre_dhist.Rd +++ b/man/mean_centre_dhist.Rd @@ -13,6 +13,6 @@ mean_centre_dhist(dhist) The mass-weighted mean location } \description{ -Centres a discrete histogram around its mass-weighted mean location by +Centres a discrete histogram around its mass-weighted mean location by subtracting the mass-weighted mean from each location. 
}
diff --git a/man/mean_density_binned_graphlet_counts.Rd b/man/mean_density_binned_graphlet_counts.Rd
index 079c99f0..cd2d0ad0 100644
--- a/man/mean_density_binned_graphlet_counts.Rd
+++ b/man/mean_density_binned_graphlet_counts.Rd
@@ -2,14 +2,24 @@
% Please edit documentation in R/measures_net_dis.R
\name{mean_density_binned_graphlet_counts}
\alias{mean_density_binned_graphlet_counts}
-\title{INTERNAL FUNCTION - Do not call directly}
+\title{mean_density_binned_graphlet_counts}
\usage{
-mean_density_binned_graphlet_counts(graphlet_counts, density_interval_indexes)
+mean_density_binned_graphlet_counts(
+  graphlet_counts,
+  density_interval_indexes,
+  agg_fn = mean
+)
+}
+\arguments{
+\item{graphlet_counts}{Graphlet counts for a number of ego_networks.}
+
+\item{density_interval_indexes}{Density bin index for
+each ego network in graphlet_counts.}
+
+\item{agg_fn}{Function to aggregate counts in each bin
+(default \code{agg_fn = mean}).}
}
\description{
-Used by \code{netdis_expected_graphlet_counts_ego_fn} to
-generate a function for calculating expected ego-network graphlet counts
-from the statistics of a provided reference graph.
-Temporarily accessible during development.
-TODO: Remove @export prior to publishing
+Calculate mean (by default) graphlet counts for ego networks in each density
+bin.
}
diff --git a/man/min_emd.Rd b/man/min_emd.Rd
index 3b78a6b8..2799f48a 100644
--- a/man/min_emd.Rd
+++ b/man/min_emd.Rd
@@ -11,19 +11,19 @@ min_emd(dhist1, dhist2, method = "optimise")
\item{dhist2}{A \code{dhist} discrete histogram object}
-\item{method}{The method to use to find the minimum EMD across all potential
+\item{method}{The method to use to find the minimum EMD across all potential
offsets for each pair of histograms. Default is "optimise" to use
-R's built-in \code{stats::optimise} method to efficiently find the offset
-with the minimal EMD. However, this is not guaranteed to find the global
-minimum if multiple local minima EMDs exist.
You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the +R's built-in \code{stats::optimise} method to efficiently find the offset +with the minimal EMD. However, this is not guaranteed to find the global +minimum if multiple local minima EMDs exist. You can alternatively specify the +"exhaustive" method, which will exhaustively evaluate the EMD between the histograms at all offsets that are candidates for the minimal EMD.} } \value{ Earth Mover's Distance between the two discrete histograms } \description{ -Calculates the minimum Earth Mover's Distance (EMD) between two discrete +Calculates the minimum Earth Mover's Distance (EMD) between two discrete histograms. This is the minimum EMD between the two histograms across all possible offsets of histogram 1 against histogram 2. } diff --git a/man/min_emd_exhaustive.Rd b/man/min_emd_exhaustive.Rd index ca20cae5..cd02830b 100644 --- a/man/min_emd_exhaustive.Rd +++ b/man/min_emd_exhaustive.Rd @@ -15,18 +15,18 @@ min_emd_exhaustive(dhist1, dhist2) Earth Mover's Distance between the two discrete histograms } \description{ -Calculates the minimum Earth Mover's Distance (EMD) between two discrete +Calculates the minimum Earth Mover's Distance (EMD) between two discrete histograms using an exhaustive search. } \details{ -When "sliding" two piecewise-linear empirical cumulative mass functions -(ECMFs) across each other to minimise the EMD between them, it is sufficient -to calculate the EMD at all offsets where any knots from the two ECMFs align +When "sliding" two piecewise-linear empirical cumulative mass functions +(ECMFs) across each other to minimise the EMD between them, it is sufficient +to calculate the EMD at all offsets where any knots from the two ECMFs align to ensure that the offset with the global minimum EMD is found. -This is because of the piece-wise linear nature of the two ECMFs. 
Between any +This is because of the piece-wise linear nature of the two ECMFs. Between any two offsets where knots from the two ECMFs align, EMD will be either constant, -or uniformly increasing or decreasing. Therefore, there the EMD between two -sets of aligned knots cannot be smaller than the EMD at one or other of the +or uniformly increasing or decreasing. Therefore, there the EMD between two +sets of aligned knots cannot be smaller than the EMD at one or other of the bounding offsets. } diff --git a/man/min_emd_optimise.Rd b/man/min_emd_optimise.Rd index 25eb0e9e..19b04b1d 100644 --- a/man/min_emd_optimise.Rd +++ b/man/min_emd_optimise.Rd @@ -15,7 +15,7 @@ min_emd_optimise(dhist1, dhist2) Earth Mover's Distance between the two discrete histograms } \description{ -Calculates the minimum Earth Mover's Distance (EMD) between two discrete -histograms by minimising the offset parameter of the \code{emd} function +Calculates the minimum Earth Mover's Distance (EMD) between two discrete +histograms by minimising the offset parameter of the \code{emd} function using the built-in \code{stats::optimise} method. } diff --git a/man/min_emd_optimise_fast.Rd b/man/min_emd_optimise_fast.Rd index 8f9931e8..ecc1da25 100644 --- a/man/min_emd_optimise_fast.Rd +++ b/man/min_emd_optimise_fast.Rd @@ -15,7 +15,7 @@ min_emd_optimise_fast(dhist1, dhist2) Earth Mover's Distance between the two discrete histograms } \description{ -Calculates the minimum Earth Mover's Distance (EMD) between two discrete -histograms by minimising the offset parameter of the \code{emd} function +Calculates the minimum Earth Mover's Distance (EMD) between two discrete +histograms by minimising the offset parameter of the \code{emd} function using the built-in \code{stats::optimise} method. 
} diff --git a/man/net_emd.Rd b/man/net_emd.Rd deleted file mode 100755 index ef9d5851..00000000 --- a/man/net_emd.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_emd.R -\name{net_emd} -\alias{net_emd} -\title{NetEMD Network Earth Mover's Distance} -\usage{ -net_emd(dhists1, dhists2, method = "optimise", return_details = FALSE, - smoothing_window_width = 0) -} -\arguments{ -\item{dhists1}{A \code{dhist} discrete histogram object or a list of such objects} - -\item{dhists2}{A \code{dhist} discrete histogram object or a list of such objects} - -\item{method}{The method to use to find the minimum EMD across all potential -offsets for each pair of histograms. Default is "optimise" to use -R's built-in \code{stats::optimise} method to efficiently find the offset -with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the -histograms at all offsets that are candidates for the minimal EMD.} - -\item{return_details}{Logical indicating whether to return the individual -minimal EMDs and associated offsets for all pairs of histograms} - -\item{smoothing_window_width}{Width of "top-hat" smoothing window to apply to -"smear" point masses across a finite width in the real domain. Default is 0, -which results in no smoothing. 
Care should be taken to select a -\code{smoothing_window_width} that is appropriate for the discrete domain -(e.g.for the integer domain a width of 1 is the natural choice)} -} -\value{ -NetEMD measure for the two sets of discrete histograms -(\code{return_details = FALSE}) or a list with the following named elements -\code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_emds}: -the minimal EMD for each pair of histograms, \code{min_offsets}: the associated -offsets giving the minimal EMD for each pair of histograms -} -\description{ -Calculates the mean minimum Earth Mover's Distance (EMD) between two sets of -discrete histograms after normalising each histogram to unit mass and variance. -This is calculated as follows: - 1. Normalise each histogram to have unit mass and unit variance - 2. Find the minimum EMD between each pair of histograms - 3. Take the average minimum EMD across all histogram pairs -} diff --git a/man/netdis.Rd b/man/netdis.Rd index 64843d89..cc091357 100644 --- a/man/netdis.Rd +++ b/man/netdis.Rd @@ -2,24 +2,29 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis} \alias{netdis} -\title{Netdis} +\title{Netdis - for one graphlet size} \usage{ -netdis(centred_graphlet_counts1, centred_graphlet_counts2, graphlet_size) +netdis( + centred_graphlet_count_vector_1, + centred_graphlet_count_vector_2, + graphlet_size +) } \arguments{ -\item{centred_graphlet_counts1}{Centred Graphlet Counts for graph 1} +\item{centred_graphlet_count_vector_1}{Centred Graphlet Counts vector for graph 1} -\item{centred_graphlet_counts2}{Centred Graphlet Counts for graph 2} +\item{centred_graphlet_count_vector_2}{Centred Graphlet Counts vector for graph 2} \item{graphlet_size}{The size of graphlets to use for the Netdis calculation (only counts for graphlets of the specified size will be used). 
The size of a graphlet is the number of nodes it contains.} } \value{ -Netdis statistic calculated using centred counts for graphlets of +Netdis statistic calculated using centred counts for graphlets of the specified size } \description{ Calculate Netdis statistic between two graphs from their Centred Graphlet -Counts (generated using \code{netdis_centred_graphlet_counts}). +Counts (generated using \code{netdis_centred_graphlet_counts}) for graphlets +of size \code{graphlet_size}. } diff --git a/man/netdis.plot.Rd b/man/netdis.plot.Rd new file mode 100644 index 00000000..44cc33ed --- /dev/null +++ b/man/netdis.plot.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlottingFunctions.R +\name{netdis.plot} +\alias{netdis.plot} +\title{Heatmap of Netdis many-to-many comparisons} +\usage{ +netdis.plot( + netdislist, + whatrow = c(1, 2)[2], + clustering_method = "ward.D", + main = "Nedis", + docluster = TRUE +) +} +\arguments{ +\item{netdislist}{Default output of \code{netdis_many_to_many}.} + +\item{whatrow}{Selection of the row in \code{netdis_many_to_many$comp_spec} to be used for plotting.} + +\item{clustering_method}{Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default).} + +\item{main}{Title of the plot.} + +\item{docluster}{controls the order of the rows and columns. If TRUE (default) the rows and columns will be reordered to create the dendrogram. If FALSE, then only the heatmap is drawn.} +} +\value{ +Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +} +\description{ +Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. 
+} diff --git a/man/netdis_centred_graphlet_counts.Rd b/man/netdis_centred_graphlet_counts.Rd index 71f0e93d..eb3779cd 100644 --- a/man/netdis_centred_graphlet_counts.Rd +++ b/man/netdis_centred_graphlet_counts.Rd @@ -2,31 +2,56 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_centred_graphlet_counts} \alias{netdis_centred_graphlet_counts} -\title{Generate Netdis centred graphlets counts by subtracting expected counts} +\title{netdis_centred_graphlet_counts} \usage{ -netdis_centred_graphlet_counts(graph, max_graphlet_size, neighbourhood_size, - expected_ego_count_fn = NULL) +netdis_centred_graphlet_counts( + graphlet_counts, + ref_ego_density_bins, + ref_binned_graphlet_counts, + binning_fn, + bin_counts_fn, + exp_counts_fn, + max_graphlet_size +) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graphlet_counts}{Ego network graphlet counts for a query graph} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +\item{ref_ego_density_bins}{Either a list of previously calculated ego +network density bin edges from a reference network, or \code{NULL}, in +which case density bins are generated using the query graph itself.} -\item{neighbourhood_size}{The number of steps from the source node to include -nodes for each ego-network.} +\item{ref_binned_graphlet_counts}{Either expected graphlet counts for each +ego network density bin from a reference network (a matrix with columns +labelled by graphlet ID and rows by density bin index), \code{NULL}, in +which case density binned counts are generated using the query graph itself, +or a constant numeric value to subtract from all graphlet counts.} -\item{expected_ego_count_fn}{A function for generating expected ego-network -graphlet counts for a graph. 
This function should take a connected, -undirected, simple graph as an \code{igraph} object for its only argument. -Where \code{expected_ego_count_fn} is specific to particular values of -\code{max_graphlet_size} or \code{neighbourhood_size}, care should be taken -to ensure that the values of these parameters passed to this function are -consistent with those used when creating \code{expected_ego_count_fn}.} +\item{binning_fn}{Function used to bin ego network densities. Only needed if +\code{ref_ego_density_bins} and \code{ref_binned_graphlet_counts} are +\code{NULL}. Takes densities as its single argument, and returns a named list +including keys \code{breaks} (vector of bin edges) and \code{interval_indexes} +(density bin index for each ego network).} + +\item{bin_counts_fn}{Function used to calculate expected graphlet counts in +each density bin. Only needed if \code{ref_ego_density_bins} and +\code{ref_binned_graphlet_counts} are \code{NULL}. Takes +\code{graphlet_counts}, \code{interval_indexes} (bin indexes) and +\code{max_graphlet_size} as arguments.} + +\item{exp_counts_fn}{Function used to map from binned reference counts to +expected counts for each graphlet in each ego network of the query graphs. +Takes \code{ego_networks}, \code{density_bin_breaks}, +\code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments.} + +\item{max_graphlet_size}{max graphlet size to calculate centred counts for. Currently only size 4 and 5 are supported.} } \value{ -A vector with centred counts for each graphlet type +graphlet_counts minus exp_graphlet_counts for graphlets up to size +max_graphlet_size. } \description{ -Generate Netdis centred graphlets counts by subtracting expected counts +Calculate expected graphlet counts for each ego network in a query graph and +centre the actual counts by subtracting those calculated expected count +values. 
} diff --git a/man/netdis_centred_graphlet_counts_ego.Rd b/man/netdis_centred_graphlet_counts_ego.Rd deleted file mode 100644 index a6e00bcb..00000000 --- a/man/netdis_centred_graphlet_counts_ego.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R -\name{netdis_centred_graphlet_counts_ego} -\alias{netdis_centred_graphlet_counts_ego} -\title{TODO: Remove @export prior to publishing} -\usage{ -netdis_centred_graphlet_counts_ego(graph, max_graphlet_size, neighbourhood_size, - expected_ego_count_fn = NULL, min_ego_nodes = 3, min_ego_edges = 1) -} -\description{ -TODO: Remove @export prior to publishing -} diff --git a/man/netdis_const_expected_counts.Rd b/man/netdis_const_expected_counts.Rd new file mode 100644 index 00000000..5eb624a6 --- /dev/null +++ b/man/netdis_const_expected_counts.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_const_expected_counts} +\alias{netdis_const_expected_counts} +\title{Create matrix of constant value to use as expected counts.} +\usage{ +netdis_const_expected_counts(graphlet_counts, const) +} +\arguments{ +\item{graphlet_counts}{Ego network graphlet counts matrix to create expected +counts for.} + +\item{const}{Constant expected counts value to use.} +} +\value{ +Counts of value const with same shape and names as graphlet_counts. +} +\description{ +Create matrix of constant value to use as expected counts. 
+}
diff --git a/man/netdis_expected_counts.Rd b/man/netdis_expected_counts.Rd
new file mode 100644
index 00000000..f9ee0968
--- /dev/null
+++ b/man/netdis_expected_counts.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/measures_net_dis.R
+\name{netdis_expected_counts}
+\alias{netdis_expected_counts}
+\title{netdis_expected_counts}
+\usage{
+netdis_expected_counts(
+  graphlet_counts,
+  density_breaks,
+  density_binned_reference_counts,
+  max_graphlet_size,
+  scale_fn = NULL
+)
+}
+\arguments{
+\item{graphlet_counts}{Matrix of graphlet and node counts (columns) for a
+number of ego networks (rows).}
+
+\item{density_breaks}{Density values defining bin edges.}
+
+\item{density_binned_reference_counts}{Reference network graphlet counts for
+each density bin.}
+
+\item{max_graphlet_size}{Determines the maximum size of graphlets to count.
+Only graphlets containing up to \code{max_graphlet_size} nodes are counted. Currently only size 4 and 5 are supported.}
+
+\item{scale_fn}{Optional function to scale calculated expected counts, taking
+\code{graphlet_counts} and \code{max_graphlet_size} as arguments,
+and returning a scale factor that the looked up
+\code{density_binned_reference_counts} values will be multiplied by.}
+}
+\description{
+Calculates expected graphlet counts for each ego network based on its density
+and pre-calculated reference density bins and graphlet counts for each bin.
+} diff --git a/man/netdis_expected_counts_ego.Rd b/man/netdis_expected_counts_ego.Rd new file mode 100644 index 00000000..b565ecab --- /dev/null +++ b/man/netdis_expected_counts_ego.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_expected_counts_ego} +\alias{netdis_expected_counts_ego} +\title{netdis_expected_counts_ego +INTERNAL FUNCTION - Do not call directly} +\usage{ +netdis_expected_counts_ego( + graphlet_counts, + max_graphlet_size, + density_breaks, + density_binned_reference_counts, + scale_fn = NULL +) +} +\arguments{ +\item{graphlet_counts}{Node and graphlet counts for an ego network.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes are counted. Currently only size 4 and 5 are supported.} + +\item{density_breaks}{Density values defining bin edges.} + +\item{density_binned_reference_counts}{Reference network graphlet counts for +each density bin.} + +\item{scale_fn}{Optional function to scale calculated expected counts, taking +\code{graphlet_counts} and \code{max_graphlet_size} as arguments, and +returning a scale factor that the looked up +\code{density_binned_reference_counts} values will be multiplied by.} +} +\description{ +Calculates expected graphlet counts for one ego network based on its density +and pre-calculated reference density bins and graphlet counts for each bin. 
+} diff --git a/man/netdis_expected_graphlet_counts.Rd b/man/netdis_expected_graphlet_counts.Rd deleted file mode 100644 index 896936ea..00000000 --- a/man/netdis_expected_graphlet_counts.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R -\name{netdis_expected_graphlet_counts} -\alias{netdis_expected_graphlet_counts} -\title{INTERNAL FUNCTION - Do not call directly} -\usage{ -netdis_expected_graphlet_counts(graph, max_graphlet_size, density_breaks, - density_binned_reference_counts) -} -\description{ -Used by \code{netdis_expected_graphlet_counts_ego} to -calculate expected graphlet counts for a query graph ego-network from the -statistics of a provided reference graph. -Temporarily accessible during development. -TODO: Remove @export prior to publishing -} diff --git a/man/netdis_expected_graphlet_counts_ego.Rd b/man/netdis_expected_graphlet_counts_ego.Rd deleted file mode 100644 index 73cf94cf..00000000 --- a/man/netdis_expected_graphlet_counts_ego.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R -\name{netdis_expected_graphlet_counts_ego} -\alias{netdis_expected_graphlet_counts_ego} -\title{INTERNAL FUNCTION - Do not call directly} -\usage{ -netdis_expected_graphlet_counts_ego(graph, max_graphlet_size, - neighbourhood_size, density_breaks, density_binned_reference_counts, - min_ego_nodes = 3, min_ego_edges = 1) -} -\description{ -Used by \code{netdis_expected_graphlet_counts_ego_fn} to -generate a function for calculating expected ego-network graphlet counts -from the statistics of a provided reference graph. -Temporarily accessible during development. 
-TODO: Remove @export prior to publishing -} diff --git a/man/netdis_expected_graphlet_counts_ego_fn.Rd b/man/netdis_expected_graphlet_counts_ego_fn.Rd deleted file mode 100644 index 5655b8f0..00000000 --- a/man/netdis_expected_graphlet_counts_ego_fn.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R -\name{netdis_expected_graphlet_counts_ego_fn} -\alias{netdis_expected_graphlet_counts_ego_fn} -\title{Generate Netdis expected graphlet count function} -\usage{ -netdis_expected_graphlet_counts_ego_fn(graph, max_graphlet_size, - neighbourhood_size, min_ego_nodes = 3, min_ego_edges = 1, - min_bin_count = 5, num_bins = 100) -} -\arguments{ -\item{graph}{A connected, undirected, simple reference graph as an -\code{igraph} object.} - -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} - -\item{neighbourhood_size}{The number of steps from the source node to include -node in ego-network.} -} -\value{ -A function taking a connected, undirected, simple query graph as an -\code{igraph} object and returning an RxC matrix containing the expected -counts of each graphlet (columns, C) for each ego-network in the query graph -(rows, R). Columns are labelled with graphlet IDs and rows are labelled with -the ID of the central node in each ego-network (if nodes in the query graph -are labelled) -} -\description{ -Generates a function to calculate expected ego-network graphlet counts for -query graphs based on the statistics of a provided reference graph. -} -\details{ -Generates graphlet counts for all ego-networks in the supplied reference graph -and then averages these graphlet counts over density bins to generate -density-dependent reference graphlet counts. 
Prior to averaging, the graphlet -counts are scaled in a size-dependent manner to permit ego-networks with -similar densities but different sizes to be averaged together. - -Returns a function that uses the density-dependent reference graphlet -counts to generate expected graphlet counts for all ego-networks in a query -network. When doing so, it matches ego-networks to reference counts by -density and reverses the scaling that was applied to the original reference -counts in order to allow averaging across ego-networks with similar density -but different numbers of nodes. -} diff --git a/man/netdis_for_all_graphs.Rd b/man/netdis_for_all_graphs.Rd deleted file mode 100644 index e8ebe66d..00000000 --- a/man/netdis_for_all_graphs.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R -\name{netdis_for_all_graphs} -\alias{netdis_for_all_graphs} -\title{Netdis between all graph pairs using provided Centred Graphlet Counts} -\usage{ -netdis_for_all_graphs(centred_graphlet_counts, graphlet_size, - mc.cores = getOption("mc.cores", 2L)) -} -\arguments{ -\item{centred_graphlet_counts}{List containing Centred Graphlet Counts for -all graphs being compared} - -\item{graphlet_size}{The size of graphlets to use for the Netdis calculation -(only counts for graphlets of the specified size will be used). 
The size of -a graphlet is the number of nodes it contains.} -} -\value{ -Pairwise Netdis statistics between graphs calculated using centred -counts for graphlets of the specified size -} -\description{ -Netdis between all graph pairs using provided Centred Graphlet Counts -} diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd new file mode 100644 index 00000000..ef60e0f6 --- /dev/null +++ b/man/netdis_many_to_many.Rd @@ -0,0 +1,93 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_many_to_many} +\alias{netdis_many_to_many} +\title{Compute any of the Netdis variants between all graph pairs.} +\usage{ +netdis_many_to_many( + graphs = NULL, + ref_graph = NULL, + comparisons = "many-to-many", + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, + graphlet_counts = NULL, + graphlet_counts_ref = NULL +) +} +\arguments{ +\item{graphs}{A named list of simplified igraph graph objects (undirected +graphs excluding loops, multiple edges), such as those +obtained by using \code{read_simple_graphs}.} + +\item{ref_graph}{Controls how expected counts are calculated. Either: +1) A numeric value - used as a constant expected counts value for all query +graphs. +2) A simplified \code{igraph} object - used as a reference graph from which +expected counts are calculated for all query graphs. +3) NULL (default) - Expected counts will be calculated based on the properties of the +query graphs themselves. (Geometric-Poisson approximation).} + +\item{comparisons}{Which comparisons to perform between graphs. +Can be "many-to-many" (all pairwise combinations) or "one-to-many" +(compare first graph in graphs to all other graphs.)} + +\item{max_graphlet_size}{Generate graphlets up to this size. 
Currently only 4 (default) and 5 are supported.} + +\item{neighbourhood_size}{Ego network neighbourhood size (default 2).} + +\item{min_ego_nodes}{Filter ego networks which have fewer +than min_ego_nodes nodes (default 3).} + +\item{min_ego_edges}{Filter ego networks which have fewer +than min_ego_edges edges (default 1).} + +\item{binning_fn}{Function used to bin ego network densities. Takes edge \code{densities} +as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. +ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with +\code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used (default: NULL).} + +\item{bin_counts_fn}{Function used to calculate expected graphlet counts in +each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +(bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), +it will apply either the approach from the original Netdis paper, or the respective Geometric-Poisson +approximation; depending on the values of \code{ref_graph} and \code{graphlet_counts_ref}.} + +\item{exp_counts_fn}{Function used to map from binned reference counts to +expected counts for each graphlet in each ego network of the query graphs. +Takes \code{ego_networks}, \code{density_bin_breaks}, +\code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. +If \code{exp_counts_fn} is \code{NULL}, (default), it will apply +either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the +values of \code{ref_graph} and \code{graphlet_counts_ref}.} + +\item{graphlet_counts}{Pre-generated graphlet counts (default: NULL). 
If the +\code{graphlet_counts} argument is defined then \code{graphs} will not be +used. +A named list of matrices containing counts of each graphlet (columns) for +each ego-network (rows) in the input graph. Columns are labelled with +graphlet IDs and rows are labelled with the ID of the central node in each +ego-network. As well as graphlet counts, each matrix must contain an +additional column labelled "N" including the node count for +each ego network.} + +\item{graphlet_counts_ref}{Pre-generated reference graphlet counts (default: NULL). Matrix containing counts +of each graphlet (columns) for each ego-network (rows) in the input graph. Columns are labelled with +graphlet IDs and rows are labelled with the ID of the central node in each +ego-network. As well as graphlet counts, each matrix must contain an +additional column labelled "N" including the node count for +each ego network. +If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not +be used.} +} +\value{ +Netdis statistics between query graphs for graphlet sizes +up to and including max_graphlet_size. +} +\description{ +Compute any of the Netdis variants between all graph pairs. 
+} diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd new file mode 100644 index 00000000..c50cd4f4 --- /dev/null +++ b/man/netdis_one_to_many.Rd @@ -0,0 +1,86 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_one_to_many} +\alias{netdis_one_to_many} +\title{Netdis comparisons between one graph and many other graphs.} +\usage{ +netdis_one_to_many( + graph_1 = NULL, + graphs_compare = NULL, + ref_graph = 0, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, + graphlet_counts_1 = NULL, + graphlet_counts_compare = NULL, + graphlet_counts_ref = NULL +) +} +\arguments{ +\item{graph_1}{Query graph - this graph will be compared with +all graphs in graphs_compare. A simplified igraph graph object.} + +\item{graphs_compare}{Graphs graph_1 will be compared with. A named list of +simplified igraph graph objects.} + +\item{ref_graph}{Controls how expected counts are calculated. Either: +1) A numeric value - used as a constant expected counts value for all query +graphs (DEFAULT: 0). +2) A simplified \code{igraph} object - used as a reference graph from which +expected counts are calculated for all query graphs. +3) NULL - Expected counts will be calculated based on the properties of the +query graphs themselves.} + +\item{max_graphlet_size}{Generate graphlets up to this size. Currently only 4 and 5 are supported.} + +\item{neighbourhood_size}{Ego network neighbourhood size.} + +\item{min_ego_nodes}{Filter ego networks which have fewer +than min_ego_nodes nodes.} + +\item{min_ego_edges}{Filter ego networks which have fewer +than min_ego_edges edges.} + +\item{binning_fn}{Function used to bin ego network densities. 
Takes edge \code{densities} +as its single argument, and returns a named list including the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to (one element per +ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with +\code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used +(Default: NULL).} + +\item{bin_counts_fn}{Function used to calculate expected graphlet counts in +each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +(bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), + it will apply either the approach from the original Netdis paper, or the respective Geometric-Poisson + approximation; depending on the values of \code{ref_graph} and \code{graphlet_counts_ref}.} + +\item{exp_counts_fn}{Function used to map from binned reference counts to +expected counts for each graphlet in each ego network of the query graphs. +Takes \code{ego_networks}, \code{density_bin_breaks}, +\code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. +If \code{exp_counts_fn} is \code{NULL}, (default), it will apply +either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the +values of \code{ref_graph} and \code{graphlet_counts_ref}.} + +\item{graphlet_counts_1}{Pre-generated graphlet counts for the first query +graph. If the \code{graphlet_counts_1} argument is defined then +\code{graph_1} will not be used.} + +\item{graphlet_counts_compare}{Named list of pre-generated graphlet counts +for the remaining query graphs. If the \code{graphlet_counts_compare} +argument is defined then \code{graphs_compare} will not be used.} + +\item{graphlet_counts_ref}{Pre-generated reference graphlet counts. 
If the +\code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not +be used.} +} +\value{ +Netdis statistics between graph_1 and graph_2 for graphlet sizes +up to and including max_graphlet_size +} +\description{ +Netdis comparisons between one graph and many other graphs. +} diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd new file mode 100644 index 00000000..28b8a259 --- /dev/null +++ b/man/netdis_one_to_one.Rd @@ -0,0 +1,143 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_one_to_one} +\alias{netdis_one_to_one} +\title{Netdis between two graphs} +\usage{ +netdis_one_to_one( + graph_1 = NULL, + graph_2 = NULL, + ref_graph = 0, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, + graphlet_counts_1 = NULL, + graphlet_counts_2 = NULL, + graphlet_counts_ref = NULL +) +} +\arguments{ +\item{graph_1}{A simple graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{graphlet_counts_1} is provided. If both \code{graph_1} and \code{graphlet_counts_1} are not \code{NULL}, then only \code{graphlet_counts_1} will be considered.} + +\item{graph_2}{A simple graph object from the \code{igraph} package. \code{graph_2} can be set to \code{NULL} (default) if \code{graphlet_counts_2} is provided. If both \code{graph_2} and \code{graphlet_counts_2} are not \code{NULL}, then only \code{graphlet_counts_2} will be considered.} + +\item{ref_graph}{Controls how expected counts are calculated. Either: +1) A numeric value - used as a constant expected counts value for all query +graphs . +2) A simplified \code{igraph} object - used as a reference graph from which +expected counts are calculated for all query graphs. 
+3) NULL (Default) - Used for Netdis-GP, where the expected counts will be calculated based on the properties of the +query graphs themselves. (Geometric-Poisson approximation).} + +\item{max_graphlet_size}{Generate graphlets up to this size. Currently only 4 (default) and 5 are supported.} + +\item{neighbourhood_size}{Ego network neighborhood size (default: 2).} + +\item{min_ego_nodes}{Filter ego networks which have fewer +than min_ego_nodes nodes (default: 3).} + +\item{min_ego_edges}{Filter ego networks which have fewer +than min_ego_edges edges (default: 1).} + +\item{binning_fn}{Function used to bin ego network densities. Takes edge \code{densities} +as its single argument, and returns a named list including, the input \code{densities}, the resulting bin \code{breaks} (vector of density bin limits), and the vector \code{interval_indexes} which states to what bin each of the individual elements in \code{densities} belongs to. +ego network). If \code{NULL}, then the method \code{binned_densities_adaptive} with +\code{min_counts_per_interval = 5} and \code{num_intervals = 100} is used +(Default: NULL).} + +\item{bin_counts_fn}{Function used to calculate expected graphlet counts in +each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +(bin indexes) and \code{max_graphlet_size} as arguments. If \code{bin_counts_fn} is \code{NULL}, (default), it will apply +either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the +values of \code{ref_graph} and \code{graphlet_counts_ref}.} + +\item{exp_counts_fn}{Function used to map from binned reference counts to +expected counts for each graphlet in each ego network of the query graphs. +Takes \code{ego_networks}, \code{density_bin_breaks}, +\code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. 
+If \code{exp_counts_fn} is \code{NULL}, (default), it will apply +either the approach from the original Netdis paper, or the respective Geometric-Poisson approximation; depending on the +values of \code{ref_graph} and \code{graphlet_counts_ref}.} + +\item{graphlet_counts_1}{Pre-generated graphlet counts for the first query +graph. Matrix containing counts of each graphlet (columns) for +each ego-network (rows) in the input graph. Columns are labelled with +graphlet IDs and rows are labelled with the ID of the central node in each +ego-network. As well as graphlet counts, each matrix must contain an +additional column labelled "N" including the node count for +each ego network. (default: NULL). +If the \code{graphlet_counts_1} argument is defined then +\code{graph_1} will not be used. These counts can be obtained with \code{count_graphlets_ego}.} + +\item{graphlet_counts_2}{Pre-generated graphlet counts for the second query +graph. Matrix containing counts of each graphlet (columns) for +each ego-network (rows) in the input graph. Columns are labelled with +graphlet IDs and rows are labelled with the ID of the central node in each +ego-network. As well as graphlet counts, each matrix must contain an +additional column labelled "N" including the node count for +each ego network. (default: NULL). +If the \code{graphlet_counts_2} argument is defined then +\code{graph_2} will not be used. These counts can be obtained with \code{count_graphlets_ego}.} + +\item{graphlet_counts_ref}{Pre-generated reference graphlet counts. +Matrix containing counts of each graphlet (columns) for +each ego-network (rows) in the reference graph. Columns are labelled with +graphlet IDs and rows are labelled with the ID of the central node in each +ego-network. As well as graphlet counts, each matrix must contain an +additional column labelled "N" including the node count for +each ego network. (default: NULL). 
+If the \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not +be used.} +} +\value{ +Netdis statistics between graph_1 and graph_2 for graphlet sizes +up to and including max_graphlet_size. +} +\description{ +Calculates the different variants of the network dissimilarity statistic Netdis between two graphs. The variants currently supported are Netdis using a gold-standard network, Netdis using no expectations (\code{ref_graph = 0}), and Netdis using a Geometric Poisson approximation for the expectation (\code{ref_graph = NULL}). +} +\examples{ +require(netdist) +require(igraph) +#Set source directory for Virus PPI graph edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +# Load query graphs as igraph objects +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") + +#Netdis variant using the Geometric Poisson approximation to remove the background expectation of each network. +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) #This option will focus on detecting more general and global discrepancies between the ego-network structures. + +#Comparing the networks via their observed ego counts without centering them (equivalent to using expectation equal to zero). This option will focus on detecting small discrepancies. +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) + +# Example of the use of netdis with a reference graph. This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard. +# Two lattice networks of different sizes are used for this example. 
+ goldstd_1 <- graph.lattice(c(8,8)) #A reference net + goldstd_2 <- graph.lattice(c(44,44)) #A reference net + + netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) + netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) + + + #Providing pre-calculated subgraph counts. + + props_1 <- count_graphlets_ego(graph = graph_1) + props_2 <- count_graphlets_ego(graph = graph_2) + props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) + props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) + +#Netdis Geometric-Poisson. +netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = NULL) + +#Netdis Zero Expectation. +netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = 0) + +#Netdis using gold-standard network +netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_1) +netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_2) +} diff --git a/man/netdis_subtract_exp_counts.Rd b/man/netdis_subtract_exp_counts.Rd new file mode 100644 index 00000000..c03aa42f --- /dev/null +++ b/man/netdis_subtract_exp_counts.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_subtract_exp_counts} +\alias{netdis_subtract_exp_counts} +\title{netdis_subtract_exp_counts} +\usage{ +netdis_subtract_exp_counts( + graphlet_counts, + exp_graphlet_counts, + max_graphlet_size +) +} +\arguments{ +\item{graphlet_counts}{Matrix of graphlet counts (columns) for a +number of ego networks (rows).} + +\item{exp_graphlet_counts}{Matrix of expected graphlet counts (columns) for a +number of ego networks (rows).} + +\item{max_graphlet_size}{Do the subtraction for graphlets up to this size. Currently only size 4 and 5 are supported.} +} +\description{ +Subtract expected graphlet counts from actual graphlet counts. 
+} diff --git a/man/netdis_uptok.Rd b/man/netdis_uptok.Rd new file mode 100644 index 00000000..3ab881c9 --- /dev/null +++ b/man/netdis_uptok.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_uptok} +\alias{netdis_uptok} +\title{Netdis - for all graphlet sizes up to max_graphlet_size} +\usage{ +netdis_uptok( + centred_graphlet_count_vector_1, + centred_graphlet_count_vector_2, + max_graphlet_size +) +} +\arguments{ +\item{centred_graphlet_count_vector_1}{Centred Graphlet Counts vector for graph 1} + +\item{centred_graphlet_count_vector_2}{Centred Graphlet Counts vector for graph 2} + +\item{max_graphlet_size}{max graphlet size to calculate Netdis for. +The size of a graphlet is the number of nodes it contains. Netdis is +calculated for all graphlets from size 3 to size max_graphlet_size. Currently only 4 and 5 are supported.} +} +\value{ +Netdis statistic calculated using centred counts for graphlets of +the specified size +} +\description{ +Calculate Netdis statistic between two graphs from their Centred Graphlet +Counts (generated using \code{netdis_centred_graphlet_counts}) for all +graphlet sizes up to \code{max_graphlet_size}. +} diff --git a/man/netemd.plot.Rd b/man/netemd.plot.Rd new file mode 100644 index 00000000..269e8010 --- /dev/null +++ b/man/netemd.plot.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PlottingFunctions.R +\name{netemd.plot} +\alias{netemd.plot} +\title{Heatmap of NetEmd many-to-many comparisons} +\usage{ +netemd.plot( + netemdlist, + clustering_method = "ward.D", + main = "NetEmd", + docluster = TRUE +) +} +\arguments{ +\item{clustering_method}{Clustering method as allowed in the \code{pheatmap} function from the \code{pheatmap} package. The dendrogram will appear if \code{docluster} is TRUE (default).} + +\item{main}{Title of the plot.} + +\item{docluster}{controls the order of the rows and columns. 
If TRUE (default) the rows and columns will be reordered to create the dendrogram. If FALSE, then only the heatmap is drawn.} + +\item{netemdlist}{Default output of \code{netemd_many_to_many}.} + +\item{whatrow}{Selection of the row in \code{netemd_many_to_many$comp_spec} to be used for plotting.} +} +\value{ +Provides a heat map and dendrogram for the network comparisons via \code{pheatmap}. +} +\description{ +Provides a heatmap and dendrogram for the network comparisons via \code{pheatmap}. +} diff --git a/man/net_emds_for_all_graphs.Rd b/man/netemd_many_to_many.Rd similarity index 51% rename from man/net_emds_for_all_graphs.Rd rename to man/netemd_many_to_many.Rd index 071d9af2..0e7e8da9 100644 --- a/man/net_emds_for_all_graphs.Rd +++ b/man/netemd_many_to_many.Rd @@ -1,51 +1,71 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/measures_net_emd.R -\name{net_emds_for_all_graphs} -\alias{net_emds_for_all_graphs} -\title{NetEMDs between all graph pairs using provided Graphlet-based Degree +\name{netemd_many_to_many} +\alias{netemd_many_to_many} +\title{NetEMDs between all graph pairs using provided Graphlet-based Degree Distributions} \usage{ -net_emds_for_all_graphs(gdds, method = "optimise", - smoothing_window_width = 0, return_details = FALSE, - mc.cores = getOption("mc.cores", 2L)) +netemd_many_to_many( + graphs = NULL, + dhists = NULL, + method = "optimise", + smoothing_window_width = 0, + return_details = FALSE, + mc.cores = getOption("mc.cores", 2L), + feature_type = "orbit", + max_graphlet_size = 5, + ego_neighbourhood_size = 0 +) } \arguments{ -\item{gdds}{List containing sets of Graphlet-based Degree Distributions for -all graphs being compared} +\item{graphs}{A list of network/graph objects from the \code{igraph} package. 
\code{graphs} can be set to \code{NULL} (default) if \code{dhists} is provided.} -\item{method}{The method to use to find the minimum EMD across all potential +\item{dhists}{A list whose elements contain either: A list of \code{dhist} discrete histogram objects for each graph, or a list a matrix of network features (each column representing a feature). \code{dhists} can be set to \code{NULL} (default) if \code{graphs} is provided. A \code{dhist} object can be obtained from \code{graph_features_to_histograms}.} + +\item{method}{The method to use to find the minimum EMD across all potential offsets for each pair of histograms. Default is "optimise" to use -R's built-in \code{stats::optimise} method to efficiently find the offset -with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the +R's built-in \code{stats::optimise} method to efficiently find the offset +with the minimal EMD. However, this is not guaranteed to find the global +minimum if multiple local minima EMDs exist. You can alternatively specify the +"exhaustive" method, which will exhaustively evaluate the EMD between the histograms at all offsets that are candidates for the minimal EMD.} \item{smoothing_window_width}{Width of "top-hat" smoothing window to apply to -"smear" point masses across a finite width in the real domain. Default is 0, -which results in no smoothing. Care should be taken to select a -\code{smoothing_window_width} that is appropriate for the discrete domain -(e.g.for the integer domain a width of 1 is the natural choice)} +"smear" point masses across a finite width in the real domain. Default is 0, +which results in no smoothing. 
Care should be taken to select a +\code{smoothing_window_width} that is appropriate for the discrete domain +(e.g.for the integer domain a width of 1 is the natural choice).} \item{return_details}{Logical indicating whether to return the individual minimal EMDs and associated offsets for all pairs of histograms} \item{mc.cores}{Number of cores to use for parallel processing. Defaults to the \code{mc.cores} option set in the R environment.} + +\item{feature_type}{Type of graphlet-based feature to count: "graphlet" +counts the number of graphlets each node participates in; "orbit" (default) calculates +the number of graphlet orbits each node participates in.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted. Possible values are 4, and 5 (default).} + +\item{ego_neighbourhood_size}{The number of steps from the source node to +include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0.} } \value{ -NetEMD measures between all pairs of graphs for which GDDs +NetEMD measures between all pairs of graphs for which features were provided. Format of returned data depends on the \code{return_details} parameter. If set to FALSE, a list is returned with the following named -elements:\code{net_emd}: a vector of NetEMDs for each pair of graphs, -\code{comp_spec}: a comaprison specification table containing the graph names +elements:\code{net_emd}: a vector of NetEMDs for each pair of graphs, +\code{comp_spec}: a comparison specification table containing the graph names and indices within the input GDD list for each pair of graphs compared. 
If \code{return_details} is set to FALSE, the list also contains the following -matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD +matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD used to compute the NetEMD, \code{min_offsets}: the associated offsets giving the minimal EMD for each GDD } \description{ -NetEMDs between all graph pairs using provided Graphlet-based Degree +NetEMDs between all graph pairs using provided Graphlet-based Degree Distributions } diff --git a/man/netemd_one_to_one.Rd b/man/netemd_one_to_one.Rd new file mode 100644 index 00000000..59f4b04e --- /dev/null +++ b/man/netemd_one_to_one.Rd @@ -0,0 +1,107 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_emd.R +\name{netemd_one_to_one} +\alias{netemd_one_to_one} +\title{NetEMD Network Earth Mover's Distance between a pair of networks.} +\usage{ +netemd_one_to_one( + graph_1 = NULL, + graph_2 = NULL, + dhists_1 = NULL, + dhists_2 = NULL, + method = "optimise", + return_details = FALSE, + smoothing_window_width = 0, + feature_type = "orbit", + max_graphlet_size = 5, + ego_neighbourhood_size = 0 +) +} +\arguments{ +\item{graph_1}{A network/graph object from the \code{igraph} package. \code{graph_1} can be set to \code{NULL} (default) if \code{dhists_1} is provided.} + +\item{graph_2}{A network/graph object from the \code{igraph} package. \code{graph_2} can be set to \code{NULL} (default) if \code{dhists_2} is provided.} + +\item{dhists_1}{Either, a \code{dhist} discrete histogram object, or list of such objects, or a matrix of network features (each column representing a feature). \code{dhists_1} can be set to \code{NULL} (default) if \code{graph_1} is provided. 
A \code{dhist} object can be obtained from \code{graph_features_to_histograms}.} + +\item{dhists_2}{Same as \code{dhists_1}.} + +\item{method}{The method to be used to find the minimum EMD across all potential +offsets for each pair of histograms. Default is "optimise" to use +R's built-in \code{stats::optimise} method to efficiently find the offset +with the minimal EMD. However, this is not guaranteed to find the global +minimum if multiple local minima EMDs exist. You can alternatively specify the +"exhaustive" method, which will exhaustively evaluate the EMD between the +histograms at all offsets that are candidates for the minimal EMD at the cost of computational time.} + +\item{return_details}{Logical indicating whether to return the individual +minimal EMDs and associated offsets for all pairs of histograms.} + +\item{smoothing_window_width}{Width of "top-hat" smoothing window to apply to +"smear" point masses across a finite width in the real domain. Default is 0, +which results in no smoothing. Care should be taken to select a +\code{smoothing_window_width} that is appropriate for the discrete domain +(e.g.for the integer domain a width of 1 is the natural choice).} + +\item{feature_type}{Type of graphlet-based feature to count: "graphlet" +counts the number of graphlets each node participates in; "orbit" (default) calculates +the number of graphlet orbits each node participates in.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted. Possible values are 4, and 5 (default).} + +\item{ego_neighbourhood_size}{The number of steps from the source node to +include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0.} +} +\value{ +NetEMD measure for the two sets of discrete histograms (or graphs). 
If +(\code{return_details = FALSE}) then a list with the following named elements is returned +\code{net_emd}: the NetEMD for the set of histogram pairs (or graphs), \code{min_emds}: +the minimal EMD for each pair of histograms, \code{min_offsets}: the associated +offsets giving the minimal EMD for each pair of histograms +} +\description{ +Calculates the network Earth Mover's Distance (EMD) between +two sets of network features. This is done by individually normalising the distribution +of each feature so that they have unit mass and unit variance. Then the minimum EMD between the same pair of features (one for each corresponding graph) is calculated by considering all possible translations of the feature distributions. Finally the average over all features is reported. +This is calculated as follows: + 1. Normalise each feature histogram to have unit mass and unit variance. + 2. For each feature, find the minimum EMD between each pair of histograms considering all possible histogram translations. + 3. Take the average minimum EMD across all features. 
+} +\examples{ + require(igraph) + graph_1 <- graph.lattice(c(8,8)) + graph_2 <- graph.lattice(c(44,44)) + netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5) + + #Providing a matrix of network features + props_a= count_orbits_per_node(graph = graph_1,max_graphlet_size = 5) + props_b= count_orbits_per_node(graph = graph_2,max_graphlet_size = 5) + + netemd_one_to_one(dhists_1=props_a, dhists_2=props_b,smoothing_window_width = 1) + + #Providing the network features as lists of dhist objects + dhists_1<- graph_features_to_histograms(props_a) + dhists_2<- graph_features_to_histograms(props_b) + + netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2) + + + # A variation of NetEmd: Using the Laplacian spectrum + #Laplacian + Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE) + Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE) + + #Normalized Laplacian + NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE) + NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE) + + #Spectra (This may take a couple of minutes). + props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) + props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) + + netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. 
+ +} diff --git a/man/netemd_single_pair.Rd b/man/netemd_single_pair.Rd new file mode 100644 index 00000000..1158d2f5 --- /dev/null +++ b/man/netemd_single_pair.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_emd.R +\name{netemd_single_pair} +\alias{netemd_single_pair} +\title{Internal function to compute the minimum Earth Mover's Distance between standardized and translated histograms} +\usage{ +netemd_single_pair( + dhist1, + dhist2, + method = "optimise", + smoothing_window_width = 0 +) +} +\arguments{ +\item{method}{The method to use to find the minimum EMD across all potential +offsets for each pair of histograms. Default is "optimise" to use +R's built-in \code{stats::optimise} method to efficiently find the offset +with the minimal EMD. However, this is not guaranteed to find the global +minimum if multiple local minima EMDs exist. You can alternatively specify the +"exhaustive" method, which will exhaustively evaluate the EMD between the +histograms at all offsets that are candidates for the minimal EMD.} + +\item{smoothing_window_width}{Width of "top-hat" smoothing window to apply to +"smear" point masses across a finite width in the real domain. Default is 0, +which results in no smoothing. Care should be taken to select a +\code{smoothing_window_width} that is appropriate for the discrete domain +(e.g.for the integer domain a width of 1 is the natural choice)} + +\item{dhist1}{A \code{dhist} discrete histogram object or a list of such objects} + +\item{dhist2}{A \code{dhist} discrete histogram object or a list of such objects} +} +\value{ +A list with the following named elements +\code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_offsets}: the associated +offsets giving the minimal EMD for each pair of histograms and \code{min_offset_std}: Offset used in the standardised histograms. 
+} +\description{ +Calculates the minimum Earth Mover's Distance (EMD) between two +discrete histograms after normalising each histogram to unit mass and variance. +This is calculated as follows: + 1. Normalise each histogram to have unit mass and unit variance + 2. Find the minimum EMD between the histograms +} +\examples{ + require(igraph) + goldstd_1 <- graph.lattice(c(8,8)) + goldstd_2 <- graph.lattice(c(44,44)) + props_1 <- count_orbits_per_node(graph = goldstd_1,max_graphlet_size = 5) + props_2 <- count_orbits_per_node(graph = goldstd_2,max_graphlet_size = 5) + dhists_1<- graph_features_to_histograms(props_1) + dhists_2<- graph_features_to_histograms(props_2) + # Obtain the minimum NetEMD_edges between the histograms + netemd_single_pair(dhists_1[[1]],dhists_2[[1]],method = "optimise",smoothing_window_width = 0) +} diff --git a/man/normalise_dhist_mass.Rd b/man/normalise_dhist_mass.Rd index d4856d76..c732c203 100644 --- a/man/normalise_dhist_mass.Rd +++ b/man/normalise_dhist_mass.Rd @@ -13,6 +13,6 @@ normalise_dhist_mass(dhist) A discrete histogram normalised to have mass 1 } \description{ -Normalises a discrete histogram to unit mass by dividing each mass by the +Normalises a discrete histogram to unit mass by dividing each mass by the total of the non-normalised masses } diff --git a/man/normalise_dhist_variance.Rd b/man/normalise_dhist_variance.Rd index 2e3b65d9..ce7d0537 100644 --- a/man/normalise_dhist_variance.Rd +++ b/man/normalise_dhist_variance.Rd @@ -14,6 +14,6 @@ A discrete histogram normalised to have variance 1 } \description{ Normalises a discrete histogram to unit variance by dividing each centred -location by the standard deviation of the discrete histogram before +location by the standard deviation of the discrete histogram before decentering } diff --git a/man/orbit_key.Rd b/man/orbit_key.Rd index de20fd9f..79d8afb4 100644 --- a/man/orbit_key.Rd +++ b/man/orbit_key.Rd @@ -7,13 +7,13 @@ orbit_key(max_graphlet_size) } \arguments{ 
-\item{max_graphlet_size}{Maximum number of nodes graphlets can contain} +\item{max_graphlet_size}{Maximum number of nodes graphlets can contain. Currently only size 2 to 5 are supported.} } \value{ Metadata list with the following named fields: \itemize{ \item \code{max_nodes}: Maximum number of nodes graphlets can contain - \item \code{id}: ID of each graphlet in format On, where n is in range 0 to + \item \code{id}: ID of each graphlet in format On, where n is in range 0 to num_orbits \item \code{node_count}: Number of nodes contained within each graphlet } diff --git a/man/orbit_to_graphlet_counts.Rd b/man/orbit_to_graphlet_counts.Rd index e763c12d..835ecb6e 100644 --- a/man/orbit_to_graphlet_counts.Rd +++ b/man/orbit_to_graphlet_counts.Rd @@ -15,6 +15,6 @@ An ORCA-style matrix containing counts of each graphlet (columns) at each vertex in the graph (rows) } \description{ -Converts graphlet orbit counts at each vertex to graphlet counts at each +Converts graphlet orbit counts at each vertex to graphlet counts at each vertex by summing over all orbits contained within each graphlet } diff --git a/man/read_simple_graph.Rd b/man/read_simple_graph.Rd index 77f7aef6..a44ada7f 100644 --- a/man/read_simple_graph.Rd +++ b/man/read_simple_graph.Rd @@ -4,23 +4,29 @@ \alias{read_simple_graph} \title{Read a graph from file, simplifying as requested} \usage{ -read_simple_graph(file, format, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE) +read_simple_graph( + file, + format, + as_undirected = TRUE, + remove_loops = TRUE, + remove_multiple = TRUE, + remove_isolates = TRUE +) } \arguments{ \item{file}{Path to file containing graph data} -\item{format}{Format of graph data. All formats supported by +\item{format}{Format of graph data. 
All formats supported by \code{igraph::read_graph} are supported.} \item{as_undirected}{If TRUE make graph edges undirected} \item{remove_loops}{If TRUE, remove edgeds that connect a vertex to itself} -\item{remove_multiple}{If TRUE remove multiple edges connencting the same +\item{remove_multiple}{If TRUE remove multiple edges connencting the same pair of vertices} -\item{remove_isolates}{If TRUE, remove vertices with no edges after the +\item{remove_isolates}{If TRUE, remove vertices with no edges after the previous alterations have been made} } \value{ @@ -31,8 +37,8 @@ Reads graph data from file, constructing an a igraph graph object, making the requested subset of the following simplifications in the following order: 1. Makes the graph undirected 2. Removes loops (where both endpoints of an edge are the same vertex) - 3. Removes multiple edges (i.e. ensuring only one edge exists for each + 3. Removes multiple edges (i.e. ensuring only one edge exists for each pair of endpoints) - 4. Removes isolated vertices (i.e. vertices with no edges after the - previous alterations) + 4. Removes isolated vertices (i.e. vertices with no edges after the + previous alterations). } diff --git a/man/read_simple_graphs.Rd b/man/read_simple_graphs.Rd index 05624ae3..57d45b86 100644 --- a/man/read_simple_graphs.Rd +++ b/man/read_simple_graphs.Rd @@ -4,14 +4,20 @@ \alias{read_simple_graphs} \title{Read all graphs in a directory, simplifying as requested} \usage{ -read_simple_graphs(source_dir, format = "ncol", pattern = "*", - as_undirected = TRUE, remove_loops = TRUE, remove_multiple = TRUE, - remove_isolates = TRUE) +read_simple_graphs( + source_dir, + format = "ncol", + pattern = "*", + as_undirected = TRUE, + remove_loops = TRUE, + remove_multiple = TRUE, + remove_isolates = TRUE +) } \arguments{ \item{source_dir}{Path to directory containing files with graph data} -\item{format}{Format of graph data. Any format supported by +\item{format}{Format of graph data. 
Any format supported by \code{igraph::read_graph} can be used.} \item{pattern}{Pattern to use to filter filenames. Any pattern supported by @@ -21,10 +27,10 @@ read_simple_graphs(source_dir, format = "ncol", pattern = "*", \item{remove_loops}{If TRUE, remove edgeds that connect a vertex to itself} -\item{remove_multiple}{If TRUE remove multiple edges connencting the same +\item{remove_multiple}{If TRUE remove multiple edges connencting the same pair of vertices} -\item{remove_isolates}{If TRUE, remove vertices with no edges after the +\item{remove_isolates}{If TRUE, remove vertices with no edges after the previous alterations have been made} } \value{ @@ -34,12 +40,20 @@ graph set to the name of the file it was read from. \description{ Reads graph data from all files in a directory matching the specified filename pattern. From each file, an a igraph graph object is constructed -and the requested subset of the following simplifications is made in the +and the requested subset of the following simplifications is made in the following order: 1. Makes the graph undirected 2. Removes loops (where both endpoints of an edge are the same vertex) - 3. Removes multiple edges (i.e. ensuring only one edge exists for each + 3. Removes multiple edges (i.e. ensuring only one edge exists for each pair of endpoints) - 4. Removes isolated vertices (i.e. vertices with no edges after the + 4. Removes isolated vertices (i.e. vertices with no edges after the previous alterations) } +\examples{ +# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. 
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +print(source_dir) +# Load query graphs as igraph objects +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +graphs +} diff --git a/man/scale_graphlet_count.Rd b/man/scale_graphlet_count.Rd new file mode 100644 index 00000000..8a313c27 --- /dev/null +++ b/man/scale_graphlet_count.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{scale_graphlet_count} +\alias{scale_graphlet_count} +\title{Divide graphlet counts by pre-computed scaling factor from +\code{count_graphlet_tuples} output.} +\usage{ +scale_graphlet_count(graphlet_count, graphlet_tuples) +} +\arguments{ +\item{graphlet_count}{Pre-computed graphlet counts.} + +\item{graphlet_tuples}{Pre-computed \code{count_graphlet_tuples} output.} +} +\description{ +Divide graphlet counts by pre-computed scaling factor from +\code{count_graphlet_tuples} output. +} diff --git a/man/scale_graphlet_counts_ego.Rd b/man/scale_graphlet_counts_ego.Rd new file mode 100644 index 00000000..477a1ec5 --- /dev/null +++ b/man/scale_graphlet_counts_ego.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{scale_graphlet_counts_ego} +\alias{scale_graphlet_counts_ego} +\title{Scale graphlet counts for an ego network by the n choose k possible +choices of k nodes in that ego-network, where n is the number of nodes +in the ego network and k is the number of nodes in the graphlet.} +\usage{ +scale_graphlet_counts_ego(graphlet_counts, max_graphlet_size) +} +\arguments{ +\item{graphlet_counts}{Pre-calculated graphlet counts for each ego_network.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets included +in graphlet_counts. Currently only sizes 4 and 5 are supported.} +} +\value{ +scaled graphlet counts. 
+} +\description{ +Scale graphlet counts for an ego network by the n choose k possible +choices of k nodes in that ego-network, where n is the number of nodes +in the ego network and k is the number of nodes in the graphlet. +} diff --git a/man/shift_dhist.Rd b/man/shift_dhist.Rd index 8ace8c44..967d569c 100644 --- a/man/shift_dhist.Rd +++ b/man/shift_dhist.Rd @@ -15,6 +15,6 @@ shift_dhist(dhist, shift) A shifted discrete histogram as a \code{dhist} object } \description{ -Shift the locations of a discrete histogram rightwards on the x-axis by the +Shift the locations of a discrete histogram rightwards on the x-axis by the specified amount } diff --git a/man/shift_to_next_alignment.Rd b/man/shift_to_next_alignment.Rd index c558f142..af80eab1 100644 --- a/man/shift_to_next_alignment.Rd +++ b/man/shift_to_next_alignment.Rd @@ -4,8 +4,7 @@ \alias{shift_to_next_alignment} \title{Minimum shift to next alignment of two location vectors} \usage{ -shift_to_next_alignment(x1, x2, distance_matrix_prev = NULL, - shift_prev = NULL) +shift_to_next_alignment(x1, x2, distance_matrix_prev = NULL, shift_prev = NULL) } \arguments{ \item{x1}{First location vector. This vector is being shifted rightwards} @@ -13,10 +12,10 @@ shift_to_next_alignment(x1, x2, distance_matrix_prev = NULL, \item{x2}{Second location vector. 
This vector is remaining unchanged.} } \value{ -Minimum non-zero right-shift to apply to x1 to align at least one +Minimum non-zero right-shift to apply to x1 to align at least one element of x1 with at least one element of x2 } \description{ -Calculate minimum right shift of first location vector to make any pair of +Calculate minimum right shift of first location vector to make any pair of locations from the two vectors equal } diff --git a/man/simplify_graph.Rd b/man/simplify_graph.Rd index fa4f716a..454daf23 100644 --- a/man/simplify_graph.Rd +++ b/man/simplify_graph.Rd @@ -4,8 +4,13 @@ \alias{simplify_graph} \title{Simplify an igraph} \usage{ -simplify_graph(graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE) +simplify_graph( + graph, + as_undirected = TRUE, + remove_loops = TRUE, + remove_multiple = TRUE, + remove_isolates = TRUE +) } \arguments{ \item{graph}{An graph or list of graphs in igraph format} @@ -14,10 +19,10 @@ simplify_graph(graph, as_undirected = TRUE, remove_loops = TRUE, \item{remove_loops}{If TRUE, remove edgeds that connect a vertex to itself} -\item{remove_multiple}{If TRUE remove multiple edges connencting the same +\item{remove_multiple}{If TRUE remove multiple edges connencting the same pair of vertices} -\item{remove_isolates}{If TRUE, remove vertices with no edges after the +\item{remove_isolates}{If TRUE, remove vertices with no edges after the previous alterations have been made} } \value{ @@ -28,8 +33,8 @@ Takes a igraph graph object and makes the requested subset of the following simplifications in the following order: 1. Makes the graph undirected 2. Removes loops (where both endpoints of an edge are the same vertex) - 3. Removes multiple edges (i.e. ensuring only one edge exists for each + 3. Removes multiple edges (i.e. ensuring only one edge exists for each pair of endpoints) - 4. Removes isolated vertices (i.e. vertices with no edges after the + 4. Removes isolated vertices (i.e. 
vertices with no edges after the previous alterations) } diff --git a/man/single_density_bin.Rd b/man/single_density_bin.Rd new file mode 100644 index 00000000..2c3cd29f --- /dev/null +++ b/man/single_density_bin.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{single_density_bin} +\alias{single_density_bin} +\title{For case where don't want to use binning, return a single bin which covers +the full range of possible density values (0 to 1).} +\usage{ +single_density_bin(densities) +} +\arguments{ +\item{densities}{Ego network density values (only used to return +a list of indexes of the required length.)} +} +\description{ +For case where don't want to use binning, return a single bin which covers +the full range of possible density values (0 to 1). +} diff --git a/man/sort_dhist.Rd b/man/sort_dhist.Rd index 9a881332..a5b0331f 100644 --- a/man/sort_dhist.Rd +++ b/man/sort_dhist.Rd @@ -9,10 +9,10 @@ sort_dhist(dhist, decreasing = FALSE) \arguments{ \item{dhist}{A discrete histogram as a \code{dhist} object} -\item{decreasing}{Logical indicating whether histograms should be sorted in +\item{decreasing}{Logical indicating whether histograms should be sorted in increasing (default) or decreasing order of location} } \description{ -Sort a discrete histogram so that locations are in increasing (default) or +Sort a discrete histogram so that locations are in increasing (default) or decreasing order } diff --git a/man/virusppi.Rd b/man/virusppi.Rd index fa16de16..6c905b2c 100644 --- a/man/virusppi.Rd +++ b/man/virusppi.Rd @@ -5,7 +5,9 @@ \name{virusppi} \alias{virusppi} \title{Protein-protein interaction (PPI) networks for 5 microorganisms} -\format{A list of \code{igraph} objects.} +\format{ +A list of \code{igraph} objects. +} \source{ \strong{PPI data (EBV, HSV-1, KSHV, VZV):} Fossum E, Friedel CC, Rajagopala SV, Titz B, Baiker A, Schmidt T, et al. 
(2009) Evolutionarily Conserved Herpesviral Protein Interaction Networks. PLoS Pathog 5(9): e1000570. \url{https://doi.org/10.1371/journal.ppat.1000570}. Data from Table S2 in the supporting information. @@ -17,7 +19,7 @@ virusppi } \description{ -A dataset containing the protein-protein interaction networks for the +A dataset containing the protein-protein interaction networks for the following 5 microorganisms \itemize{ \item EBV diff --git a/man/worldtradesub.Rd b/man/worldtradesub.Rd new file mode 100644 index 00000000..aa537cac --- /dev/null +++ b/man/worldtradesub.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\encoding{UTF-8} +\name{worldtradesub} +\alias{worldtradesub} +\title{World trade networks from 1985–2014} +\format{ +A list of two elements. The first element, 'wtnets', is a list of \code{igraph} objects providing a small sample of world trade networks from 2001–2014. The second element, 'Counts', is a list of pre-computed subgraph counts of world trade networks in the years 1985-2014. +} +\source{ +\strong{World trade networks:} United Nations commodity trade statistics database (UN comtrade). http://comtrade.un.org/, 2015. + +\strong{Subgraph Counts:} Feenstra RC, Lipsey RE, Deng H, Ma AC, and Mo H. (2005) World trade flows: 1962-2000. Technical report, National Bureau of Economic Research. (See also https://cid.econ.ucdavis.edu/wix.html). +} +\usage{ +worldtradesub +} +\description{ +The world trade data set consists of a small sample of world trade networks for the years 2001-2014, and pre-computed subgraph counts of a larger set of world trade networks (1985–2014). The world trade networks are based on the data set from [Feenstra et al., 2005] for the years 1962-2000 and on the United Nations division COMTRADE [Division, 2015] for the years 2001-2014. 
+} +\details{ +\itemize{ + \item wtnets: List of \code{igraph} objects providing the world trade networks from 2001–2014. + \item Counts: Pre-computed graphlet counts for the world trade networks in the years 1985-2014. + } +} +\keyword{datasets} diff --git a/man/zeros_to_ones.Rd b/man/zeros_to_ones.Rd new file mode 100644 index 00000000..a33206bf --- /dev/null +++ b/man/zeros_to_ones.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{zeros_to_ones} +\alias{zeros_to_ones} +\title{Replace zero values in a vector with ones. Used by +\code{scale_graphlet_count} to prevent divide by +zero errors.} +\usage{ +zeros_to_ones(v) +} +\arguments{ +\item{v}{A vector.} +} +\description{ +Replace zero values in a vector with ones. Used by +\code{scale_graphlet_count} to prevent divide by +zero errors. +} diff --git a/src/Makevars b/src/Makevars index 25761e11..fe240994 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,2 @@ CXX_STD = CXX11 -PKG_CPPFLAGS += -fno-fast-math -msse2 -mfpmath=sse -mstackrealign +PKG_CPPFLAGS += -fno-fast-math -msse2 -mstackrealign diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 99086611..0ee8eaf6 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,6 +5,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // counts_from_observations NumericMatrix counts_from_observations(NumericMatrix features); RcppExport SEXP _netdist_counts_from_observations(SEXP featuresSEXP) { @@ -30,22 +35,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// NetEmdSmooth -double NetEmdSmooth(NumericVector loc1, NumericVector val1, double binWidth1, NumericVector loc2, NumericVector val2, double binWidth2); -RcppExport SEXP _netdist_NetEmdSmooth(SEXP loc1SEXP, SEXP val1SEXP, SEXP binWidth1SEXP, SEXP loc2SEXP, SEXP val2SEXP, SEXP binWidth2SEXP) { 
-BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< NumericVector >::type loc1(loc1SEXP); - Rcpp::traits::input_parameter< NumericVector >::type val1(val1SEXP); - Rcpp::traits::input_parameter< double >::type binWidth1(binWidth1SEXP); - Rcpp::traits::input_parameter< NumericVector >::type loc2(loc2SEXP); - Rcpp::traits::input_parameter< NumericVector >::type val2(val2SEXP); - Rcpp::traits::input_parameter< double >::type binWidth2(binWidth2SEXP); - rcpp_result_gen = Rcpp::wrap(NetEmdSmooth(loc1, val1, binWidth1, loc2, val2, binWidth2)); - return rcpp_result_gen; -END_RCPP -} // NetEmdSmoothV2 double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, NumericVector loc2, NumericVector val2, double binWidth2); RcppExport SEXP _netdist_NetEmdSmoothV2(SEXP loc1SEXP, SEXP val1SEXP, SEXP binWidth1SEXP, SEXP loc2SEXP, SEXP val2SEXP, SEXP binWidth2SEXP) { @@ -68,9 +57,8 @@ RcppExport SEXP run_testthat_tests(); static const R_CallMethodDef CallEntries[] = { {"_netdist_counts_from_observations", (DL_FUNC) &_netdist_counts_from_observations, 1}, {"_netdist_emd_fast_no_smoothing", (DL_FUNC) &_netdist_emd_fast_no_smoothing, 4}, - {"_netdist_NetEmdSmooth", (DL_FUNC) &_netdist_NetEmdSmooth, 6}, {"_netdist_NetEmdSmoothV2", (DL_FUNC) &_netdist_NetEmdSmoothV2, 6}, - {"run_testthat_tests", (DL_FUNC) &run_testthat_tests, 0}, + {"run_testthat_tests", (DL_FUNC) &run_testthat_tests, 0}, {NULL, NULL, 0} }; diff --git a/src/fastSmooth.cpp b/src/fastSmooth.cpp deleted file mode 100644 index b2d43847..00000000 --- a/src/fastSmooth.cpp +++ /dev/null @@ -1,171 +0,0 @@ -// Enable C++11 -// [[Rcpp::plugins(cpp11)]] -#include -#include -#include -#include -#include -using namespace Rcpp; - -// [[Rcpp::plugins("cpp11")]] - -//' @title -//' Compute EMD -////' -////' @param loc1 numeric vector. -////' @param val1 numeric vector. -////' @param loc2 numeric vector. -////' @param val2 numeric vector. 
-//' -//' @export -// [[Rcpp::export]] -double NetEmdSmooth(NumericVector loc1,NumericVector val1,double binWidth1,NumericVector loc2,NumericVector val2,double binWidth2) -{ - //init - double res=0; - double curVal1,curVal2; - double curPos; - double temp1; - int count; - int i,j,k; - //place start of windows before - //start of histogram so we can start the loop - // stores the result - res=0; - //TODO be worried about adding lots of small numbers - - // current location on hist 1 and hist 2 - i=0; - j=0; - // hist1 variables - double loc1SegStart=loc1[0]; //- start of a Segment in x - double loc1SegEnd=loc1[0]+binWidth1; //- end of a Segment in x - double loc1SegValStart=0; //- start of a Segment in y - double loc1SegValEnd=val1[0]; //- end of a Segment in y - - // hist2 variables - double loc2SegStart=loc2[0]; - double loc2SegEnd=loc2[0]+binWidth2; - double loc2SegValStart=0; - double loc2SegValEnd=val2[0]; - - double curStartVal; // start value in y - double curEndVal; // end value in y - double loc1Start; // start value in x hist1 - double loc2Start; // start value in x hist2 - double loc1End; // end value in x hist1 - double loc2End; // end value in x hist2 - double h; - res=0; - // set as 0 as at bottom of hist - curStartVal=0; - - // need to know if first y segment ends with hist1 or hist2 - // Need to set the first start locations - // Commented this section as they are both set to zero - if (loc1SegValStart #include #include +#include "emd_fast_no_smoothing.h" // add_element_kahan() +#include "fastSmoothV2.h" + using namespace Rcpp; -//compute segment -inline double get_segment(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) +double bowtie_area(double length, double val1_start, double val1_end, + double val2_start, double val2_end) +{ + double midPoint = (val1_start - val2_start) / + ((val2_end - val2_start) - (val1_end - val1_start)); + + const double midValue = val1_start + midPoint * (val1_end - val1_start); 
+ + midPoint = midPoint * length; + + double topTriangle = 0.5 * midPoint * (midValue - val1_start); + double topRectangle = midPoint * (val1_start - val2_start); + double bottomTriangle = 0.5 * midPoint * (midValue - val2_start); + + double res = topTriangle + topRectangle - bottomTriangle; + + topTriangle = 0.5 * (length - midPoint) * (val2_end - midValue); + topRectangle = 0; // midPoint*(val1_start-val2_start); + bottomTriangle = 0.5 * (length - midPoint) * (val1_end - midValue); + + res += topTriangle + topRectangle - bottomTriangle; + return res; +} + +// Compute the unsigned area between two line segments +// assumes that val1_end > val1_start and val2_end > val2_start +double get_segment(double start, double end, double val1_start, + double val1_end, double val2_start, double val2_end) { - double length; - length = end - start; + const double length = end - start; + double topTriangle; double topRectangle; double bottomTriangle; double midPoint; double midValue; - double res=0; - if (val1_start > val2_start) + double res = 0; + + bool both_differences_positive = val1_start > val2_start && val1_end >= val2_end; + bool both_differences_negative = val1_start <= val2_start && val1_end <= val2_end; + + if (both_differences_positive || both_differences_negative) { - if (val1_end >= val2_end) - { - // They are in the same order no bowtie - // seg1 is above seg2 - // triangle of seg1 - topTriangle = 0.5*length*(val1_end-val1_start); - // rectangle between seg1 and seg2 - topRectangle = length*(val1_start-val2_start); - // triangle of seg2 (to be removed) - bottomTriangle = 0.5*length*(val2_end-val2_start); - return topTriangle+topRectangle-bottomTriangle; - } - else - { - //bowtie - // lets make this really simple as the compiler - // will combine the expressions as needed - midPoint = (val1_start-val2_start)/((val2_end-val2_start) - (val1_end-val1_start)); - midValue = val1_start + midPoint*(val1_end-val1_start); - midPoint = midPoint*length; - - topTriangle = 
0.5*midPoint*(midValue-val1_start); - topRectangle = midPoint*(val1_start-val2_start); - bottomTriangle = 0.5*midPoint*(midValue-val2_start); - - res = topTriangle+topRectangle-bottomTriangle; - - topTriangle = 0.5*(length-midPoint)*(val2_end-midValue); - topRectangle = 0; // midPoint*(val1_start-val2_start); - bottomTriangle = 0.5*(length - midPoint)*(val1_end - midValue); - res += topTriangle+topRectangle-bottomTriangle; - return res; - } + // They are in the same order: no bowtie + // triangle of seg1 + topTriangle = 0.5 * length * (val1_end - val1_start); + // rectangle between seg1 and seg2 + topRectangle = length * (val1_start - val2_start); + // triangle of seg2 (to be removed) + bottomTriangle = 0.5 * length * (val2_end - val2_start); + + const double sign = both_differences_positive?1.0:-1.0; + return sign * (topTriangle + topRectangle - bottomTriangle); + } + else if (val1_start > val2_start) { // bowtie, first case + return bowtie_area(length, val1_start, val1_end, val2_start, val2_end); + } + else { // bowtie, second case + return bowtie_area(length, val2_start, val2_end, val1_start, val1_end); } - else - { - if (val1_end > val2_end) - { - //bowtie - // Find the point where they cross. 
- // (Solution of linear equations) - midPoint = (val2_start-val1_start)/((val1_end-val1_start) - (val2_end-val2_start)); - midValue = val2_start + midPoint*(val2_end-val2_start); - midPoint = midPoint*length; - - topTriangle = 0.5*midPoint*(midValue-val2_start); - topRectangle = midPoint*(val2_start-val1_start); - bottomTriangle = 0.5*midPoint*(midValue-val1_start); - - res = topTriangle+topRectangle-bottomTriangle; - - topTriangle = 0.5*(length-midPoint)*(val1_end-midValue); - topRectangle = 0; // midPoint*(val1_start-val2_start); - bottomTriangle = 0.5*(length - midPoint)*(val2_end - midValue); - res += topTriangle+topRectangle-bottomTriangle; - return res; - - } - else // same order - { - // seg2 is above seg1 - // Triangle seg2 above seg1 - topTriangle = 0.5*length*(val2_end-val2_start); - // rectangle between seg2 and seg1 - topRectangle = length*(val2_start-val1_start); - // Seg1 triangle to be removed - bottomTriangle = 0.5*length*(val1_end-val1_start); - return topTriangle+topRectangle-bottomTriangle; - } - } } // cut down and compute segment -inline double get_segment_constrained(double seg1L1, double seg1L2, double seg2L1, double seg2L2, double seg1V1, double seg1V2, double seg2V1, double seg2V2) +double get_segment_constrained(double seg1L1, double seg1L2, + double seg2L1, double seg2L2, + double seg1V1, double seg1V2, + double seg2V1, double seg2V2) { - //We have a valid range - double valStart1, valEnd1, valStart2, valEnd2; - double start,end; - start = std::max(seg1L1,seg2L1); - end = std::min(seg1L2,seg2L2); - if (startcurSeg2Loc3) - {break;} - } - } - else - { - // loc2 starts before loc1 so lets deal with those segments first - // Fix the position of Seg2 and then interate over Seg1 until we have all - // of the segments of Seg1 before Seg2 starts. 
- curSeg2Loc1=minLoc; - curSeg2Loc2=minLoc; - curSeg2Loc3=loc2[0]; - curSeg2Val1=0; - curSeg2Val2=0; - // Set this value so we can update in the lopp - curSeg1Val2=0; - for (index1=0;index1curSeg1Loc3) - {break;} - } - } - // Add both the overlapping sections and the non overlapping section on the right - // Note we reiterate over the first few sections loc1 - // Could store where we are upto from above to save time - // Reset Val counter - curSeg1Val2=0; - for (index1=0;index1curSeg1Loc3) - {break;} - } + // The OverlappingSegments iterator returns pairs of the left + // indices of overlapping endpoints: it->first is the index into + // loc1, it->second is the index into loc2. When the smallest + // element in one of these vectors is larger than the current + // element of the other, an 'index' of -1 is returned. + + // Hist 1 + // Start of the gradient section in Seg1 + double curSeg1Loc1 = segs.loc1_left(it->first); + + // End of the gradient section in Seg1 + double curSeg1Loc2 = segs.loc1_mid(it->first); + + // End of the flat section in Seg1 + double curSeg1Loc3 = segs.loc1_right(it->first); + + // Start and end values in Seg1: val1 gives the values at *right* + // endpoints of the segments. A value of 0.0 is used before the + // first segment. + double curSeg1Val1 = (it->first > 0) ? val1[it->first - 1] : 0.0; + double curSeg1Val2 = (it->first >= 0) ? val1[it->first] : 0.0; + + // Hist 2 + // Start of the gradient section in Seg1 + double curSeg2Loc1 = segs.loc2_left(it->second); + + // End of the gradient section in Seg1 + double curSeg2Loc2 = segs.loc2_mid(it->second); + + // End of the flat section in Seg1 + double curSeg2Loc3 = segs.loc2_right(it->second); + + // Start and end values in Seg2: val2 gives the values at *right* + // endpoints of the segments. A value of 0.0 is used before the + // first segment. + double curSeg2Val1 = (it->second > 0) ? val2[it->second - 1] : 0.0; + double curSeg2Val2 = (it->second >= 0) ? 
val2[it->second] : 0.0; + + double element = get_double_segment_constrained( + curSeg1Loc1, curSeg1Loc2, curSeg1Loc3, curSeg1Val1, curSeg1Val2, + curSeg2Loc1, curSeg2Loc2, curSeg2Loc3, curSeg2Val1, curSeg2Val2); + + add_element_kahan(res, element, compensation); } + return res; } diff --git a/src/fastSmoothV2.h b/src/fastSmoothV2.h new file mode 100644 index 00000000..77e3d461 --- /dev/null +++ b/src/fastSmoothV2.h @@ -0,0 +1,236 @@ +// Enable C++11 +// [[Rcpp::plugins(cpp11)]] + +#ifndef FASTSMOOTHV2_H +#define FASTSMOOTHV2_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace Rcpp; + +class OverlappingSegments { + + // The two sequences of left-hand segment endpoints + const NumericVector& loc1; + const NumericVector& loc2; + const double binWidth1, binWidth2; + + // shorter names for loc1.size() and loc2.size() + const long N1, N2; + + // the minimum and maximum values over both sequences; + double minloc, maxloc; + +public: + OverlappingSegments(NumericVector& loc1_, NumericVector& loc2_, + double binWidth1_ = 1.0, double binWidth2_ = 1.0) + : loc1(loc1_), loc2(loc2_), + binWidth1(binWidth1_), binWidth2(binWidth2_), + N1(loc1.size()), N2(loc2.size()) + { + if (N1 == 0 || N2 == 0) + throw std::invalid_argument("Input vectors must be nonempty"); + + for (int i = 0; i < N1 - 1; i++) + if (loc1[i] > loc1[i + 1] + binWidth1) + throw std::invalid_argument( + "Elements of loc1 must be sorted in ascending order, " + "with elements separated by at least binWidth1"); + + for (int i = 0; i < N2 - 1; i++) + if (loc2[i] > loc2[i + 1] + binWidth2) + throw std::invalid_argument( + "Elements of loc2 must be sorted in ascending order, " + "with elements separated by at least binWidth2"); + + minloc = std::min(loc1[0], loc2[0]); + maxloc = std::max(loc1[N1 - 1] + binWidth1, loc2[N2 - 1] + binWidth2); + } + + // left, mid and right locations of the segments, for loc1 and loc2 + double loc1_left(long i) 
const { + return (i >= 0) ? loc1[i] : minloc; + } + + double loc1_mid(long i) const { + return (i >= 0) ? (loc1[i] + binWidth1) : minloc; + } + + double loc1_right(long i) const { + return (i + 1 < N1) ? loc1[i + 1] : maxloc; + } + + double loc2_left(long i) const { + return (i >= 0) ? loc2[i] : minloc; + } + + double loc2_mid(long i) const { + return (i >= 0) ? (loc2[i] + binWidth2) : minloc; + } + + double loc2_right(long i) const { + return (i + 1 < N2) ? loc2[i + 1] : maxloc; + } + + // Does interval i (from the first collection of segments) overlap + // interval j (from the second)? + bool intervals_overlap(long i, long j) const { + return (loc1_left(i) < loc2_right(j) && loc2_left(j) < loc1_right(i)); + } + + // OverlappingSegments iterator + class iterator { + const OverlappingSegments& segs; + + // the current iteration state + std::pair idx; + + public: + typedef std::pair value_type; + typedef void difference_type; + typedef value_type* pointer; + typedef value_type& reference; + typedef std::input_iterator_tag iterator_category; + + // Iterate over pairs of indices (i,j) into the sequences loc1 and + // loc2, where the intervals [loc1[i], loc1[i+1]] and [loc2[j], + // loc2[j+1]] overlap. + // + // These indices are returned from the iterator as + // std::pair. + // + // A sequence has an implicit segment from minloc (with index -1) + // to its zeroth element. The elements loc1[0] and loc2[0] are + // compared to determine whether, for either sequence, this + // initial implicit segment overlaps the zeroth segment of the + // other one. If both sequences start with the same value, the + // iteration starts at (0,0). + // + explicit iterator(const OverlappingSegments& segs_) + : segs(segs_) + { + if (segs.loc1[0] < segs.loc2[0]) { + idx.first = 0; + idx.second = -1; + } + else if (segs.loc1[0] == segs.loc2[0]) { + idx.first = 0; + idx.second = 0; + } + else { + idx.first = -1; + idx.second = 0; + } + } + + // Is the current iterator at one-past-the-end? 
Equivalent to an + // equality comparison with segs.end(). + bool at_end() const { + return idx.first == segs.N1 && idx.second == segs.N2 - 1; + } + + // Update the current iterator to point to one-past-the-end + iterator& advance_to_end() { + idx.first = segs.N1; + idx.second = segs.N2 - 1; + return *this; + } + + iterator& operator++() { +#if !NDEBUG + // Verify precondition + if (!segs.intervals_overlap(idx.first, idx.second)) { + throw std::logic_error("Iterator precondition not satisfied: " + "current intervals do not overlap"); + } +#endif + + // Advance the second segment if it would still overlap the first + // + // The condition below is equivalent to + // idx.second < N2 - 1 && intervals_overlap(idx.first, idx.second + 1) + // given that we know (by the precondition) that + // loc1_left(idx.first) < loc2_right(idx.second) + // and therefore that + // loc1_left(idx.first) < loc2_right(idx.second + 1), + // + if (idx.second < segs.N2 - 1 + && segs.loc2_left(idx.second + 1) < segs.loc1_right(idx.first)) { + idx.second++; + } + // Could not advance the second segment above: advance the first instead, + // and the second as well if they share an endpoint + else { + if (idx.second < segs.N2 - 1 + && segs.loc2_left(idx.second + 1) == segs.loc1_right(idx.first)) { + idx.second++; + } + idx.first++; + } + +#if !NDEBUG + // Verify postcondition + if (!(at_end() || segs.intervals_overlap(idx.first, idx.second))) { + throw std::logic_error("Iterator postcondition not satisfied: " + "current intervals do not overlap (not at end)"); + } +#endif + + return *this; + } + + iterator operator++(int) { + iterator res = *this; + operator++(); + return res; + } + + value_type operator*() const { return idx; } + + const value_type *operator->() const { return &idx; } + + friend bool operator==(const iterator& lhs, const iterator& rhs) { + return lhs.idx == rhs.idx; + } + + friend bool operator!=(const iterator& lhs, const iterator& rhs) { + return !(lhs == rhs); + } + }; + + 
iterator begin() { return iterator(*this); } + iterator end() { return iterator(*this).advance_to_end(); } +}; + + +double bowtie_area(double length, double val1_start, double val1_end, + double val2_start, double val2_end); + +double get_segment(double start, double end, double val1_start, + double val1_end, double val2_start, double val2_end); + +double get_segment_constrained(double seg1L1, double seg1L2, + double seg2L1, double seg2L2, + double seg1V1, double seg1V2, + double seg2V1, double seg2V2); + +double get_double_segment_constrained( + double seg1Loc1, double seg1Loc2, double seg1Loc3, + double seg1Val1, double seg1Val2, + double seg2Loc1, double seg2Loc2, double seg2Loc3, + double seg2Val1, double seg2Val2); + +double NetEmdSmoothV2(NumericVector loc1, NumericVector val1, double binWidth1, + NumericVector loc2, NumericVector val2, double binWidth2); + +#endif // FASTSMOOTHV2_H diff --git a/src/fastSmoothV2_old.cpp b/src/fastSmoothV2_old.cpp deleted file mode 100644 index 8eb72d8c..00000000 --- a/src/fastSmoothV2_old.cpp +++ /dev/null @@ -1,287 +0,0 @@ -// Enable C++11 -// [[Rcpp::plugins(cpp11)]] -#include -#include -#include -#include -#include -using namespace Rcpp; - -//compute segment -inline double get_segment(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) -{ - double length; - length = end - start; - double topTriangle; - double topRectangle; - double bottomTriangle; - double midPoint; - double midValue; - double res=0; - if (val1_start > val2_start) - { - if (val1_end >= val2_end) - { - // They are in the same order no bowtie - topTriangle = 0.5*length*(val1_end-val1_start); - topRectangle = length*(val1_start-val2_start); - bottomTriangle = 0.5*length*(val2_end-val2_start); - return topTriangle+topRectangle-bottomTriangle; - } - else - { - //bowtie - // lets make this really simple as the compiler - // will combine the expressions as needed - midPoint = (val1_start-val2_start)/((val2_end-val2_start) 
- (val1_end-val1_start)); - midValue = val1_start + midPoint*(val1_end-val1_start); - midPoint = midPoint*length; -// std::cout << "\n midPoint: " << midPoint << " midValue: " << midValue << "\n"; - - topTriangle = 0.5*midPoint*(midValue-val1_start); - topRectangle = midPoint*(val1_start-val2_start); - bottomTriangle = 0.5*midPoint*(midValue-val2_start); - - res = topTriangle+topRectangle-bottomTriangle; - - topTriangle = 0.5*(length-midPoint)*(val2_end-midValue); - topRectangle = 0; // midPoint*(val1_start-val2_start); - bottomTriangle = 0.5*(length - midPoint)*(val1_end - midValue); - res += topTriangle+topRectangle-bottomTriangle; - return res; - } - } - else - { - if (val1_end > val2_end) - { -// std::cout << "\n Path3"; - //bowtie - midPoint = (val2_start-val1_start)/((val1_end-val1_start) - (val2_end-val2_start)); - midValue = val2_start + midPoint*(val2_end-val2_start); - midPoint = midPoint*length; -// std::cout << "\n midPoint: " << midPoint << " midValue: " << midValue << "\n"; - - topTriangle = 0.5*midPoint*(midValue-val2_start); - topRectangle = midPoint*(val2_start-val1_start); - bottomTriangle = 0.5*midPoint*(midValue-val1_start); - - res = topTriangle+topRectangle-bottomTriangle; - - topTriangle = 0.5*(length-midPoint)*(val1_end-midValue); - topRectangle = 0; // midPoint*(val1_start-val2_start); - bottomTriangle = 0.5*(length - midPoint)*(val2_end - midValue); - res += topTriangle+topRectangle-bottomTriangle; - return res; - - } - else // same order - { -// std::cout << "\n Path4"; - topTriangle = 0.5*length*(val2_end-val2_start); - topRectangle = length*(val2_start-val1_start); - bottomTriangle = 0.5*length*(val1_end-val1_start); - return topTriangle+topRectangle-bottomTriangle; - } - } -} - -// cut down and compute segment -inline double get_segment_constrained(double seg1L1, double seg1L2, double seg2L1, double seg2L2, double seg1V1, double seg1V2, double seg2V1, double seg2V2) -{ - //We have a valid range - double valStart1, valEnd1, valStart2, 
valEnd2; - double start,end; - start = std::max(seg1L1,seg2L1); - end = std::min(seg1L2,seg2L2); - if (start +// header file. +#include "fastSmoothV2.h" +#include "emd_fast_no_smoothing.h" + +#include +#include +#include + +// Helper function to test tolerance +bool within_toleranceV2(double actual, double expected, double tolerance) { + if(actual > expected) { + return ((actual - expected) <= tolerance); + } + else { + return ((expected - actual) <= tolerance); + } +} + +double simpleSlowArea(double startx,double endx,double starty1,double endy1,double starty2,double endy2) +{ + // Making this step size smaller + double step = (endx-startx)/100000000.0; + double curX; + double curY1; + double curY2; + double res = 0; + for (int i=0;i<100000000;i++) + { + curX = startx + i*step; + curY1 = starty1 +(endy1-starty1)*i/100000000.0; + curY2 = starty2 +(endy2-starty2)*i/100000000.0; + res += step*std::abs(curY1-curY2); + } + return res; +} + +void runSegmentConstraintTest(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) +{ + double tempVal1; + double tempVal2; + tempVal1 = get_segment_constrained(start,end,start,end,val1_start,val1_end,val2_start,val2_end); + tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); + std::cout << "\n"; + std::cout << "segment constrained " << tempVal1 << " simpleResult " << tempVal2 << "\n"; + expect_true(within_toleranceV2(tempVal1,tempVal2,0.0001)); +} + + +void runSegmentTest(double start,double end,double val1_start,double val1_end,double val2_start,double val2_end) +{ + double tempVal1; + double tempVal2; + tempVal1 = get_segment(start,end, val1_start,val1_end,val2_start,val2_end); + tempVal2 = simpleSlowArea(start,end, val1_start,val1_end,val2_start,val2_end); + std::cout << "\n"; + std::cout << "segment test " << tempVal1 << " simpleResult " << tempVal2 << "\n"; + expect_true(within_toleranceV2(tempVal1,tempVal2,0.0001)); +} + +context("emd_fast_smoothing segment 
constrain simple") { + test_that("emd_fast_smoothing segment constrain simple") { + // Two upward linear segments + runSegmentConstraintTest(0.0,1.0,0.0,1.0,0.0,1.0); + // One upward one down linear segments + runSegmentConstraintTest(0.0,1.0,0.0,1.0,1.0,0.0); + runSegmentConstraintTest(0.0,1.0,1.0,0.0,0.0,1.0); + // Two down linear segments + runSegmentConstraintTest(0.0,1.0,1.0,0.0,1.0,0.0); + // One flat one up segments + runSegmentConstraintTest(0.0,1.0,0.0,0.0,0.0,1.0); + runSegmentConstraintTest(0.0,1.0,0.0,1.0,0.0,0.0); + // One flat one down segments + runSegmentConstraintTest(0.0,1.0,1.0,0.0,0.0,0.0); + runSegmentConstraintTest(0.0,1.0,0.0,0.0,1.0,0.0); + // Different gradients segments + runSegmentConstraintTest(0.0,1.0,0.0,3.0,0.0,0.0); + runSegmentConstraintTest(0.0,1.0,0.0,0.0,0.0,3.0); + // Different gradients segments + runSegmentConstraintTest(0.0,1.0,2.0,4.0,1.0,2.0); + runSegmentConstraintTest(0.0,1.0,1.0,2.0,2.0,3.0); +}} + +context("emd_fast_smoothing segment full") { + test_that("emd_fast_smoothing segment full") { + // Two upward linear segments + runSegmentTest(0.0,1.0,0.0,1.0,0.0,1.0); + // One upward one down linear segments + runSegmentTest(0.0,1.0,0.0,1.0,1.0,0.0); + runSegmentTest(0.0,1.0,1.0,0.0,0.0,1.0); + // Two down linear segments + runSegmentTest(0.0,1.0,1.0,0.0,1.0,0.0); + // One flat one up segments + runSegmentTest(0.0,1.0,0.0,0.0,0.0,1.0); + runSegmentTest(0.0,1.0,0.0,1.0,0.0,0.0); + // One flat one down segments + runSegmentTest(0.0,1.0,1.0,0.0,0.0,0.0); + runSegmentTest(0.0,1.0,0.0,0.0,1.0,0.0); + // Different gradients segments + runSegmentTest(0.0,1.0,0.0,3.0,0.0,0.0); + runSegmentTest(0.0,1.0,0.0,0.0,0.0,3.0); + // Different gradients segments + runSegmentTest(0.0,1.0,2.0,4.0,1.0,2.0); + runSegmentTest(0.0,1.0,1.0,2.0,2.0,3.0); +}} + +template +void runIntervalOverlapTest(Container1T& actual, Container2T& expected) +{ + std::cout << "Left endpoints of overlapping intervals:" << std::endl; + for (std::pair p : actual) + 
std::cout << p.first << ", " << p.second << std::endl; + + std::cout << "Expected:" << std::endl; + for (std::pair p : expected) + std::cout << p.first << ", " << p.second << std::endl; + + bool result = std::equal(actual.begin(), actual.end(), expected.begin()); + + std::cout << "Same? " << std::boolalpha << result << std::endl; + std::cout << "~~~~~~~~~~\n"; + + expect_true(result); +} + +context("emd_fast_smooth overlapping interval iterator") { + test_that("emd_fast_smooth overlapping interval iterator") { + { + NumericVector xs {1.0, 3.0, 5.0}; + NumericVector ys {2.0, 4.0, 6.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, -1}, {0, 0}, {1, 0}, {1, 1}, {2, 1}, {2, 2}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {1.0, 3.0}; + NumericVector ys {4.0, 6.0, 8.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, -1}, {1, -1}, {1, 0}, {1, 1}, {1, 2}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {5.0, 5.5}; + NumericVector ys {4.0, 6.0, 8.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {-1, 0}, {0, 0}, {1, 0}, {1, 1}, {1, 2}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {1.0, 2.0}; + NumericVector ys {1.0, 3.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, 0}, {1, 0}, {1, 1}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {1.0, 3.0}; + NumericVector ys {1.0, 2.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, 0}, {0, 1}, {1, 1}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {1.0, 2.0}; + NumericVector ys {1.5, 2.0, 3.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, -1}, {0, 0}, {1, 1}, {1, 2}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {1.0, 2.0}; + NumericVector ys {1.0, 2.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, 0}, 
{1, 1}}; + runIntervalOverlapTest(actual, expected); + } + + { + NumericVector xs {1.0, 2.0}; + NumericVector ys {1.5, 2.0}; + OverlappingSegments actual(xs, ys); + std::vector > expected { + {0, -1}, {0, 0}, {1, 1}}; + runIntervalOverlapTest(actual, expected); + } + } +} diff --git a/tests/testthat.R b/tests/testthat.R index 76669a6d..bc52a947 100755 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,4 @@ library(testthat) library(netdist) -test_check("netdist") \ No newline at end of file +test_check("netdist") diff --git a/tests/testthat/test-cpp.R b/tests/testthat/test-cpp.R index 37ffa32b..8c682729 100644 --- a/tests/testthat/test-cpp.R +++ b/tests/testthat/test-cpp.R @@ -1,4 +1,4 @@ context("C++") test_that("Catch unit tests pass", { - expect_cpp_tests_pass("netdist") + expect_cpp_tests_pass("netdist") }) diff --git a/tests/testthat/test_dhist.R b/tests/testthat/test_dhist.R index 4aae5e28..37dc59b2 100644 --- a/tests/testthat/test_dhist.R +++ b/tests/testthat/test_dhist.R @@ -1,6 +1,6 @@ context("dhist: Discrete histogram from observations") test_that("discrete_hist generates correct discrete histograms for random integer observations", { - # Method for generating random observations containing specific locations a + # Method for generating random observations containing specific locations a # specific number of times random_observations <- function(locations, counts) { # Construct vector containing each location replicated "count" times @@ -8,266 +8,314 @@ test_that("discrete_hist generates correct discrete histograms for random intege # Randomise the order of the observations sample(observations, size = length(observations), replace = FALSE) } - + set.seed(2684) num_tests <- 100 - + run_test <- function() { # Set parameters for generation of random observation sets num_observations <- 100 - location_range <- -(num_observations*3):(num_observations*3) - # Do not allow zero counts as these locations will not be present in the + location_range <- 
-(num_observations * 3):(num_observations * 3) + # Do not allow zero counts as these locations will not be present in the # observations generated from the locations and counts count_range <- 1:10 - + # Generate random observation sets locations <- sample(location_range, num_observations, replace = FALSE) counts <- sample(count_range, num_observations, replace = TRUE) - + # Construct vector containing each location replicated "count" times observations_orig <- purrr::simplify(purrr::map2(locations, counts, rep)) # Randomise the order of the observations observations <- sample(observations_orig, size = length(observations_orig), replace = FALSE) - + # Generate discrete histograms hist <- dhist_from_obs(observations) - - # discrete_hist will drop bins with zero counts, so remove these from the - # expected data (not necessary now we've restricted counts to be >= 1, but + + # discrete_hist will drop bins with zero counts, so remove these from the + # expected data (not necessary now we've restricted counts to be >= 1, but # the bug where we generated test locations with zero counts was so annoying # to identify that we're going with a belt and braces approach) non_zero_count_indexes <- counts != 0 expected_locations <- locations[non_zero_count_indexes] expected_counts <- counts[non_zero_count_indexes] - # dhist_from_obs will return results with bins ordered by ascending location, + # dhist_from_obs will return results with bins ordered by ascending location, # so sort expected data to match sorted_locations <- sort(expected_locations, index.return = TRUE) sorted_location_indexes <- sorted_locations$ix expected_locations <- expected_locations[sorted_location_indexes] expected_counts <- expected_counts[sorted_location_indexes] - - # Check that histogram locations and counts match those used to generate the + + # Check that histogram locations and counts match those used to generate the # observations expect_true(all.equal(hist$locations, expected_locations)) 
expect_true(all.equal(hist$masses, expected_counts)) } - - for(i in 1:num_tests) { + + for (i in 1:num_tests) { run_test() } }) context("dhist: constructor, equality operator and as_* transformation functions") test_that("dhist constuctor has correct locations and masses (default smoothing, unsorted)", { - locations1 = c(7, 42, 1, 21, 101, 9) - masses1 = c(15, 12, 16, 13, 11, 14) + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) actual1 <- dhist(locations = locations1, masses = masses1, sorted = FALSE) - locations2 = c(3, 0, -62, 7, 16, -58) - masses2 = c(23, 24, 26, 22, 21, 25) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) actual2 <- dhist(locations = locations2, masses = masses2, sorted = FALSE) - + expected_class <- "dhist" expected_smoothing_window_width <- 0 - - expected1 = list(locations = locations1, masses = masses1, - smoothing_window_width = expected_smoothing_window_width) + + expected1 <- list( + locations = locations1, masses = masses1, + smoothing_window_width = expected_smoothing_window_width + ) class(expected1) <- expected_class - - expected2 = list(locations = locations2, masses = masses2, - smoothing_window_width = expected_smoothing_window_width) + + expected2 <- list( + locations = locations2, masses = masses2, + smoothing_window_width = expected_smoothing_window_width + ) class(expected2) <- expected_class - + expect_equal(actual1, expected1) expect_equal(actual2, expected2) }) test_that("dhist constuctor has correct locations and masses (default smoothing, sorted)", { - locations1 = c(7, 42, 1, 21, 101, 9) - masses1 = c(15, 12, 16, 13, 11, 14) + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) actual1 <- dhist(locations = locations1, masses = masses1, sorted = TRUE) - locations2 = c(3, 0, -62, 7, 16, -58) - masses2 = c(23, 24, 26, 22, 21, 25) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) actual2 <- dhist(locations = 
locations2, masses = masses2, sorted = TRUE) - + expected_class <- "dhist" expected_smoothing_window_width <- 0 - - expected1 = list(locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13 ,12, 11), - smoothing_window_width = expected_smoothing_window_width) + + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = expected_smoothing_window_width + ) class(expected1) <- expected_class - - expected2 = list(locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = expected_smoothing_window_width) + + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = expected_smoothing_window_width + ) class(expected2) <- expected_class - + expect_equal(actual1, expected1) expect_equal(actual2, expected2) }) test_that("dhist constuctor has correct locations and masses (default smoothing, default sorting)", { - locations1 = c(7, 42, 1, 21, 101, 9) - masses1 = c(15, 12, 16, 13, 11, 14) + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) actual1 <- dhist(locations = locations1, masses = masses1) - locations2 = c(3, 0, -62, 7, 16, -58) - masses2 = c(23, 24, 26, 22, 21, 25) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) actual2 <- dhist(locations = locations2, masses = masses2) - + expected_class <- "dhist" expected_smoothing_window_width <- 0 - - expected1 = list(locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13 ,12, 11), - smoothing_window_width = expected_smoothing_window_width) + + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = expected_smoothing_window_width + ) class(expected1) <- expected_class - - expected2 = list(locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = 
expected_smoothing_window_width) + + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = expected_smoothing_window_width + ) class(expected2) <- expected_class - + expect_equal(actual1, expected1) expect_equal(actual2, expected2) }) test_that("dhist constuctor has correct locations and masses (specified smoothing, unsorted)", { smoothing_window_width <- 1 - - locations1 = c(7, 42, 1, 21, 101, 9) - masses1 = c(15, 12, 16, 13, 11, 14) - actual1 <- dhist(locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width, sorted = FALSE) - locations2 = c(3, 0, -62, 7, 16, -58) - masses2 = c(23, 24, 26, 22, 21, 25) - actual2 <- dhist(locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width, sorted = FALSE) - + + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width, sorted = FALSE + ) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width, sorted = FALSE + ) + expected_class <- "dhist" - - expected1 = list(locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width) + + expected1 <- list( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width + ) class(expected1) <- expected_class - - expected2 = list(locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width) + + expected2 <- list( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width + ) class(expected2) <- expected_class - + expect_equal(actual1, expected1) expect_equal(actual2, expected2) }) test_that("dhist constuctor has correct locations and masses (specified smoothing, 
sorted)", { smoothing_window_width <- 1 - - locations1 = c(7, 42, 1, 21, 101, 9) - masses1 = c(15, 12, 16, 13, 11, 14) - actual1 <- dhist(locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width, sorted = TRUE) - locations2 = c(3, 0, -62, 7, 16, -58) - masses2 = c(23, 24, 26, 22, 21, 25) - actual2 <- dhist(locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width, sorted = TRUE) - + + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width, sorted = TRUE + ) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width, sorted = TRUE + ) + expected_class <- "dhist" - - expected1 = list(locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13 ,12, 11), - smoothing_window_width = smoothing_window_width) + + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = smoothing_window_width + ) class(expected1) <- expected_class - - expected2 = list(locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = smoothing_window_width) + + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = smoothing_window_width + ) class(expected2) <- expected_class - + expect_equal(actual1, expected1) expect_equal(actual2, expected2) }) test_that("dhist constuctor has correct locations and masses (specified smoothing, default sorting)", { smoothing_window_width <- 1 - - locations1 = c(7, 42, 1, 21, 101, 9) - masses1 = c(15, 12, 16, 13, 11, 14) - actual1 <- dhist(locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width) - locations2 = c(3, 
0, -62, 7, 16, -58) - masses2 = c(23, 24, 26, 22, 21, 25) - actual2 <- dhist(locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width) - + + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width + ) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width + ) + expected_class <- "dhist" - - expected1 = list(locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13 ,12, 11), - smoothing_window_width = smoothing_window_width) + + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = smoothing_window_width + ) class(expected1) <- expected_class - - expected2 = list(locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = smoothing_window_width) + + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = smoothing_window_width + ) class(expected2) <- expected_class - + expect_equal(actual1, expected1) expect_equal(actual2, expected2) }) test_that("as_smoothed_dhist sets smoothing_window_width correctly", { - dhist_pre <- dhist(locations <- c(7, 42, 1, 21, 101, 9), - masses = c(15, 12, 16, 13, 11, 14)) + dhist_pre <- dhist(locations <- c(7, 42, 1, 21, 101, 9), + masses = c(15, 12, 16, 13, 11, 14) + ) expected_smoothing_window_width_pre <- 0 expected_smoothing_window_width_post <- 1 - - expect_equal(dhist_pre$smoothing_window_width, - expected_smoothing_window_width_pre) + + expect_equal( + dhist_pre$smoothing_window_width, + expected_smoothing_window_width_pre + ) dhist_post <- as_smoothed_dhist(dhist_pre, expected_smoothing_window_width_post) - 
expect_equal(dhist_post$smoothing_window_width, - expected_smoothing_window_width_post) + expect_equal( + dhist_post$smoothing_window_width, + expected_smoothing_window_width_post + ) }) test_that("as_unsmoothed_dhist sets smoothing_window_width correctly", { - dhist_pre <- dhist(locations <- c(7, 42, 1, 21, 101, 9), - masses = c(15, 12, 16, 13, 11, 14), - smoothing_window_width <- 1) + dhist_pre <- dhist(locations <- c(7, 42, 1, 21, 101, 9), + masses = c(15, 12, 16, 13, 11, 14), + smoothing_window_width <- 1 + ) expected_smoothing_window_width_pre <- 1 expected_smoothing_window_width_post <- 0 - - expect_equal(dhist_pre$smoothing_window_width, - expected_smoothing_window_width_pre) + + expect_equal( + dhist_pre$smoothing_window_width, + expected_smoothing_window_width_pre + ) dhist_post <- as_smoothed_dhist(dhist_pre, expected_smoothing_window_width_post) - expect_equal(dhist_post$smoothing_window_width, - expected_smoothing_window_width_post) + expect_equal( + dhist_post$smoothing_window_width, + expected_smoothing_window_width_post + ) }) test_that("Identical dhists are considered equal", { - dhist1 <- dhist(locations <- c(7, 42, 1, 21, 101, 9), - masses = c(15, 12, 16, 13, 11, 14), - smoothing_window = 0) + dhist1 <- dhist(locations <- c(7, 42, 1, 21, 101, 9), + masses = c(15, 12, 16, 13, 11, 14), + smoothing_window = 0 + ) dhist2 <- dhist1 expect_true(dhist1 == dhist2) }) test_that("Non-identical dhists are NOT considered equal", { - dhist1 <- dhist(locations <- c(7, 42, 1, 21, 101, 9), - masses = c(15, 12, 16, 13, 11, 14), - smoothing_window = 0) - + dhist1 <- dhist(locations <- c(7, 42, 1, 21, 101, 9), + masses = c(15, 12, 16, 13, 11, 14), + smoothing_window = 0 + ) + # Change a single element of the locations field dhist2_one_location_mismatch <- dhist1 dhist2_one_location_mismatch$locations[3] <- dhist2_one_location_mismatch$locations[1] + 1 expect_false(dhist1 == dhist2_one_location_mismatch) - + # Change a single element of the masses field 
dhist2_one_mass_mismatch <- dhist1 dhist2_one_mass_mismatch$masses[2] <- dhist2_one_mass_mismatch$masses[1] + 1 expect_false(dhist1 == dhist2_one_mass_mismatch) - + # Change the smoothing window field dhist2_smoothing_mismatch <- dhist1 dhist2_smoothing_mismatch$smoothing_window_width <- 1 expect_false(dhist1 == dhist2_smoothing_mismatch) - + # Change class dhist2_class_mismatch <- dhist1 attr(dhist2_class_mismatch, "class") <- "mismatch" @@ -276,21 +324,21 @@ test_that("Non-identical dhists are NOT considered equal", { context("dhist: Discrete histogram variance") test_that("dhist_variance difference for smoothed and unsmoothed dhists is smoothing_window_width^2 / 12", { - dhist <- dhist(locations <- c(7, 42, 1, 21, 101, 9), masses = c(15, 12, 16, 13, 11, 14)) + dhist <- dhist(locations <- c(7, 42, 1, 21, 101, 9), masses = c(15, 12, 16, 13, 11, 14)) # Be careful: ensure that no smoothing window width results in overlapping bins smoothing_window_width_A <- 1 smoothing_window_width_B <- 2 dhist_unsmoothed <- as_unsmoothed_dhist(dhist) dhist_smoothed_A <- as_smoothed_dhist(dhist, smoothing_window_width_A) dhist_smoothed_B <- as_smoothed_dhist(dhist, smoothing_window_width_B) - + var_unsmoothed <- dhist_variance(dhist_unsmoothed) var_smoothed_A <- dhist_variance(dhist_smoothed_A) var_smoothed_B <- dhist_variance(dhist_smoothed_B) - + expected_var_smoothed_A <- var_unsmoothed + ((smoothing_window_width_A^2) / 12) expected_var_smoothed_B <- var_unsmoothed + ((smoothing_window_width_B^2) / 12) - + expect_equal(var_smoothed_A, expected_var_smoothed_A) expect_equal(var_smoothed_B, expected_var_smoothed_B) }) @@ -298,23 +346,27 @@ test_that("dhist_variance difference for smoothed and unsmoothed dhists is smoot test_that("dhist_variance returns sigma^2 for unsmoothed normal histograms", { num_hists <- 5 num_bins <- 100001 - + mus <- runif(num_hists, -10, 10) sigmas <- runif(num_hists, 0, 10) - - rand_locations <- function(mu, sigma) {return(seq(mu - 5 * sigma, mu + 5 * 
sigma, length.out = num_bins))} - + + rand_locations <- function(mu, sigma) { + return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) + } + rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { locations <- rand_locations(mu, sigma) masses <- dnorm(locations, mean = mu, sd = sigma) return(dhist(masses = masses, locations = locations)) }) - + actuals <- purrr::map_dbl(rand_dhists, dhist_variance) - expected <- purrr::map_dbl(sigmas, function(sigma) {return(sigma^2)}) - + expected <- purrr::map_dbl(sigmas, function(sigma) { + return(sigma^2) + }) + expect_equalish <- function(actual, expected) { - scaled_diff <- abs(actual - expected)/min(actual, expected) + scaled_diff <- abs(actual - expected) / min(actual, expected) max_diff <- 1e-4 return(expect_lte(scaled_diff, max_diff)) } @@ -326,25 +378,35 @@ test_that("normalise_dhist_mass output sums to 1", { # Generate histograms with random masses (no centres needed for this test) num_hists <- 10 num_bins <- 100 - + mass_min <- 0 mass_max <- 100 - rand_bin_masses <- function() {return(runif(num_bins, mass_min, mass_max))} + rand_bin_masses <- function() { + return(runif(num_bins, mass_min, mass_max)) + } bin_mass_lists <- replicate(num_hists, rand_bin_masses(), simplify = FALSE) # Locations are unimportant as they do not affect mass normalisation locations <- 1:num_bins smoothing_window_width <- 1 - + normalised_dhists <- purrr::map(bin_mass_lists, function(masses) { - normalise_dhist_mass(dhist(masses = masses, locations = locations, - smoothing_window_width = smoothing_window_width)) + normalise_dhist_mass(dhist( + masses = masses, locations = locations, + smoothing_window_width = smoothing_window_width + )) }) expected_total_mass <- 1 # Check total masses match expectations - purrr::map_dbl(normalised_dhists, function(dhist) {expect_equal(sum(dhist$masses), expected_total_mass)}) + purrr::map_dbl(normalised_dhists, function(dhist) { + expect_equal(sum(dhist$masses), expected_total_mass) + }) # 
Check other histogram properties unchanged - purrr::walk(normalised_dhists, function(dhist) {expect_equal(dhist$locations, locations)}) - purrr::walk(normalised_dhists, function(dhist) {expect_equal(dhist$smoothing_window_width, smoothing_window_width)}) + purrr::walk(normalised_dhists, function(dhist) { + expect_equal(dhist$locations, locations) + }) + purrr::walk(normalised_dhists, function(dhist) { + expect_equal(dhist$smoothing_window_width, smoothing_window_width) + }) }) context("dhist: Discrete histogram variance normalisation") @@ -352,60 +414,92 @@ test_that("normalise_histogram_variance output has variance of 1 for random inte # Generate histograms with random masses and random centres num_hists <- 10 num_bins <- 70 - + mass_min <- 0 mass_max <- 100 - rand_masses <- function() {return(runif(num_bins, mass_min, mass_max))} - + rand_masses <- function() { + return(runif(num_bins, mass_min, mass_max)) + } + centre_min <- -30 centre_max <- 70 - rand_locations <- function() {return(round(sample(centre_min:centre_max, num_bins), digits = 0))} - + rand_locations <- function() { + return(round(sample(centre_min:centre_max, num_bins), digits = 0)) + } + rand_dhists <- replicate(num_hists, dhist(masses = rand_masses(), locations = rand_locations()), simplify = FALSE) - + smoothing_window_width <- 1 rand_dhists_unsmoothed <- purrr::map(rand_dhists, as_unsmoothed_dhist) rand_dhists_smoothed <- purrr::map(rand_dhists, as_smoothed_dhist, smoothing_window_width = smoothing_window_width) - + expected_post_norm_smoothing_windows <- purrr::map_dbl(rand_dhists_smoothed, function(dhist) { - smoothing_window_width/dhist_std(dhist) - }) - - actual_dhist_unsmoothed <- purrr::map(rand_dhists_unsmoothed, function(dhist) {normalise_dhist_variance(dhist)}) - actual_dhist_smoothed <- purrr::map(rand_dhists_smoothed, function(dhist) {normalise_dhist_variance(dhist)}) + smoothing_window_width / dhist_std(dhist) + }) + + actual_dhist_unsmoothed <- purrr::map(rand_dhists_unsmoothed, 
function(dhist) { + normalise_dhist_variance(dhist) + }) + actual_dhist_smoothed <- purrr::map(rand_dhists_smoothed, function(dhist) { + normalise_dhist_variance(dhist) + }) expected_variance <- 1 # Check variance of normalised hostograms is as expected - purrr::walk(actual_dhist_unsmoothed, function(dhist) {expect_equal(dhist_variance(dhist), expected_variance)}) - purrr::walk(actual_dhist_smoothed, function(dhist) {expect_equal(dhist_variance(dhist), expected_variance)}) + purrr::walk(actual_dhist_unsmoothed, function(dhist) { + expect_equal(dhist_variance(dhist), expected_variance) + }) + purrr::walk(actual_dhist_smoothed, function(dhist) { + expect_equal(dhist_variance(dhist), expected_variance) + }) # Check smoothing window is as expected (0 for unsmoothe; smoothing_window_width/sigma for smoothed) - purrr::walk(actual_dhist_unsmoothed, function(dhist) {expect_equal(dhist$smoothing_window_width, 0)}) - purrr::walk2(actual_dhist_smoothed, expected_post_norm_smoothing_windows, - function(dhist, sww) {expect_equal(dhist$smoothing_window_width, sww)}) + purrr::walk(actual_dhist_unsmoothed, function(dhist) { + expect_equal(dhist$smoothing_window_width, 0) + }) + purrr::walk2( + actual_dhist_smoothed, expected_post_norm_smoothing_windows, + function(dhist, sww) { + expect_equal(dhist$smoothing_window_width, sww) + } + ) # Check masses unaltered - purrr::walk2(actual_dhist_unsmoothed, rand_dhists_unsmoothed, - function(actual, expected) {expect_equal(actual$masses, expected$masses)}) - purrr::walk2(actual_dhist_smoothed, rand_dhists_smoothed, - function(actual, expected) {expect_equal(actual$masses, expected$masses)}) + purrr::walk2( + actual_dhist_unsmoothed, rand_dhists_unsmoothed, + function(actual, expected) { + expect_equal(actual$masses, expected$masses) + } + ) + purrr::walk2( + actual_dhist_smoothed, rand_dhists_smoothed, + function(actual, expected) { + expect_equal(actual$masses, expected$masses) + } + ) }) test_that("normalise_histogram_variance output has 
variance of 1 for normal histograms", { num_hists <- 5 num_bins <- 100001 - + mus <- runif(num_hists, -10, 10) sigmas <- runif(num_hists, 0, 10) - - rand_locations <- function(mu, sigma) {return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins))} - + + rand_locations <- function(mu, sigma) { + return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) + } + rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { locations <- rand_locations(mu, sigma) masses <- dnorm(locations, mean = mu, sd = sigma) return(dhist(masses = masses, locations = locations)) }) - actuals <- purrr::map(rand_dhists, function(dhist) {dhist_variance(normalise_dhist_variance(dhist))}) + actuals <- purrr::map(rand_dhists, function(dhist) { + dhist_variance(normalise_dhist_variance(dhist)) + }) expected <- 1 - purrr::map_dbl(actuals, function(actual) {expect_equal(actual, expected)}) + purrr::map_dbl(actuals, function(actual) { + expect_equal(actual, expected) + }) }) context("dhist: Sort dhist") @@ -416,12 +510,12 @@ test_that("sort_dhist works", { class(dhist1) <- "dhist" dhist2 <- list(locations = c(3, 0, -62, 7, 16, -58), masses = c(23, 24, 26, 22, 21, 25)) class(dhist2) <- "dhist" - - expected1 = list(locations = c(1, 7, 9, 21, 42, 101), masses = c(16, 15, 14, 13 ,12, 11)) + + expected1 <- list(locations = c(1, 7, 9, 21, 42, 101), masses = c(16, 15, 14, 13, 12, 11)) class(expected1) <- "dhist" - expected2 = list(locations = c(-62, -58, 0, 3, 7, 16), masses = c(26, 25, 24, 23, 22, 21)) + expected2 <- list(locations = c(-62, -58, 0, 3, 7, 16), masses = c(26, 25, 24, 23, 22, 21)) class(expected2) <- "dhist" - + actual1 <- sort_dhist(dhist1) actual2 <- sort_dhist(dhist2) @@ -433,7 +527,7 @@ context("dhist: ECMF") test_that("dhist_ecmf returns correct step function when smoothing_window_width is zero", { dhist1 <- dhist(locations = c(1, 2, 4, 7, 11, 16, 22), masses = c(21, 22, 23, 27, 31, 36, 42)) dhist1_unsmoothed <- as_unsmoothed_dhist(dhist1) - + ecmf1 <- 
dhist_ecmf(dhist1) actual_knots1 <- ecmf_knots(ecmf1) actual_knots_ecds1 <- ecmf1(actual_knots1) @@ -441,14 +535,14 @@ test_that("dhist_ecmf returns correct step function when smoothing_window_width actual_inter_knots_ecds1 <- ecmf1(inter_knots_x) extra_knots <- c(actual_knots1[1] - 1, actual_knots1[length(actual_knots1)] + 1) actual_extra_knots_ecds1 <- ecmf1(extra_knots) - + cum_masses1 <- cumsum(dhist1$masses) max_cum_mass <- cum_masses1[length(cum_masses1)] expected_knots_ecds1 <- cum_masses1 - expected_inter_knots_ecds1 <- head(expected_knots_ecds1, length(expected_knots_ecds1) -1) + expected_inter_knots_ecds1 <- head(expected_knots_ecds1, length(expected_knots_ecds1) - 1) expected_extra_knots_ecds1 <- c(0, max_cum_mass) expected_knots1 <- dhist1$locations - + expect_equal(actual_knots1, expected_knots1) expect_equal(actual_knots_ecds1, expected_knots_ecds1) expect_equal(actual_inter_knots_ecds1, expected_inter_knots_ecds1) @@ -457,7 +551,7 @@ test_that("dhist_ecmf returns correct step function when smoothing_window_width context("dhist: Area between ECMFs (simple integer dhists)") test_that("area_between_dhist_ecmfs returns correct value for simple integer dhists", { - # Example dhists constructed by hand to result in lots of "bowtie" segments + # Example dhists constructed by hand to result in lots of "bowtie" segments # for smoothed ECMFs and to allow expected areas to be calculated by hand # Unsmoothed locations are on an integer grid, smoothed bin edges are on a # half-integer grid @@ -465,18 +559,18 @@ test_that("area_between_dhist_ecmfs returns correct value for simple integer dhi # Smoothed ECMF crossing points are on a quarter-integer grid dhistA <- dhist(locations = c(1, 3, 4), masses = c(2, 1, 1)) dhistB <- dhist(locations = c(0, 2, 4, 5), masses = c(0.5, 2, 0.5, 1)) - + # Set up smoothed and unsmoothed versions of histograms smoothing_window_width <- 1 dhistA_unsmoothed <- as_unsmoothed_dhist(dhistA) dhistB_unsmoothed <- as_unsmoothed_dhist(dhistB) 
dhistA_smoothed <- as_smoothed_dhist(dhistA, smoothing_window_width) dhistB_smoothed <- as_smoothed_dhist(dhistB, smoothing_window_width) - + # Set expected area expected_area_unsmoothed <- 4 expected_area_smoothed <- 3 - + # Generate ecmfs ecmfA_unsmoothed <- dhist_ecmf(dhistA_unsmoothed) ecmfB_unsmoothed <- dhist_ecmf(dhistB_unsmoothed) @@ -486,7 +580,7 @@ test_that("area_between_dhist_ecmfs returns correct value for simple integer dhi # Calculate area between ECMFs actual_area_unsmoothed <- area_between_dhist_ecmfs(ecmfA_unsmoothed, ecmfB_unsmoothed) actual_area_smoothed <- area_between_dhist_ecmfs(ecmfA_smoothed, ecmfB_smoothed) - + # Compare caculated areas with expected areas expect_equal(actual_area_unsmoothed, expected_area_unsmoothed) expect_equal(actual_area_smoothed, expected_area_smoothed) @@ -494,29 +588,29 @@ test_that("area_between_dhist_ecmfs returns correct value for simple integer dhi context("dhist: Area between ECMFs (non-integer normalised dhists)") test_that("area_between_dhist_ecmfs returns correct value for non-integer normalised dhists", { - - # Previous simple integer grid where both histograms have been separately - # normalised to unit mass and variance. Has locations and masses at a range + + # Previous simple integer grid where both histograms have been separately + # normalised to unit mass and variance. Has locations and masses at a range # of floating point locations. Has bowties, triangles and trapeziums. 
dhistA <- dhist(locations = c(1, 3, 4), masses = c(2, 1, 1)) dhistB <- dhist(locations = c(0, 2, 4, 5), masses = c(0.5, 2, 0.5, 1)) dhistA <- normalise_dhist_mass(normalise_dhist_variance(dhistA)) dhistB <- normalise_dhist_mass(normalise_dhist_variance(dhistB)) - + # Set up smoothed and unsmoothed versions of histograms smoothing_window_width <- 1 dhistA_unsmoothed <- as_unsmoothed_dhist(dhistA) dhistB_unsmoothed <- as_unsmoothed_dhist(dhistB) dhistA_smoothed <- as_smoothed_dhist(dhistA, smoothing_window_width) dhistB_smoothed <- as_smoothed_dhist(dhistB, smoothing_window_width) - + # Generate ecmfs ecmfA_unsmoothed <- dhist_ecmf(dhistA_unsmoothed) ecmfB_unsmoothed <- dhist_ecmf(dhistB_unsmoothed) ecmfA_smoothed <- dhist_ecmf(dhistA_smoothed) ecmfB_smoothed <- dhist_ecmf(dhistB_smoothed) - + # Define some functions to make calculation of manually measured areas easier rectangle_area <- function(width, height) { return(width * height) @@ -532,42 +626,46 @@ test_that("area_between_dhist_ecmfs returns correct value for non-integer normal # Actual grid counts preserved in data to facilitate less tedious manual # checking if required # --- Unsmoothed --- - area_A_unsmoothed <- rectangle_area(width = 10*0.02, height = 12.5*0.01) - area_B_unsmoothed <- rectangle_area(width = 50.5*0.02, height = 37.5*0.01) - area_C_unsmoothed <- rectangle_area(width = 26*0.02, height = 12.5*0.01) - area_D_unsmoothed <- rectangle_area(width = 34.5*0.02, height = 12.5*0.01) - area_E_unsmoothed <- rectangle_area(width = 26.5*0.02, height = 25*0.01) - expected_area_unsmoothed <- - sum(area_A_unsmoothed, area_B_unsmoothed, area_C_unsmoothed, - area_D_unsmoothed, area_E_unsmoothed) + area_A_unsmoothed <- rectangle_area(width = 10 * 0.02, height = 12.5 * 0.01) + area_B_unsmoothed <- rectangle_area(width = 50.5 * 0.02, height = 37.5 * 0.01) + area_C_unsmoothed <- rectangle_area(width = 26 * 0.02, height = 12.5 * 0.01) + area_D_unsmoothed <- rectangle_area(width = 34.5 * 0.02, height = 12.5 * 0.01) 
+ area_E_unsmoothed <- rectangle_area(width = 26.5 * 0.02, height = 25 * 0.01) + expected_area_unsmoothed <- + sum( + area_A_unsmoothed, area_B_unsmoothed, area_C_unsmoothed, + area_D_unsmoothed, area_E_unsmoothed + ) # --- Smoothed --- - area_A_smoothed <- triangle_area(base = 2.75*0.01, height = 6.5*0.02) - area_B_smoothed <- triangle_area(base = 2.75*0.01, height = 3*0.02) - area_C_smoothed <- triangle_area(base = 18.5*0.01, height = 21*0.02) - area_D_smoothed <- trapezium_area(side_a = 18.5*0.01, side_b = 37.5*0.01, height = 14.5*0.02) - area_E_smoothed <- trapezium_area(side_a = 37.5*0.01, side_b = 37.5*0.01, height = 16*0.02) - area_F_smoothed <- triangle_area(base = 37.5*0.01, height = 22.5*0.02) - area_G_smoothed <- triangle_area(base = 7.5*0.01, height = 8*0.02) - area_H_smoothed <- triangle_area(base = 7.5*0.01, height = 11*0.02) - area_I_smoothed <- triangle_area(base = 12.5*0.01, height = 19.5*0.02) - area_J_smoothed <- trapezium_area(side_a = 12.5*0.01, side_b = 20*0.01, height = 30.5*0.02) - area_K_smoothed <- trapezium_area(side_a = 20*0.01, side_b = 18*0.01, height = 8*0.02) - area_L_smoothed <- triangle_area(base = 18*0.01, height = 22*0.02) - expected_area_smoothed <- - sum(area_A_smoothed, area_B_smoothed, area_C_smoothed, area_D_smoothed, - area_E_smoothed, area_F_smoothed, area_G_smoothed, area_H_smoothed, - area_I_smoothed, area_J_smoothed, area_K_smoothed, area_L_smoothed) - + area_A_smoothed <- triangle_area(base = 2.75 * 0.01, height = 6.5 * 0.02) + area_B_smoothed <- triangle_area(base = 2.75 * 0.01, height = 3 * 0.02) + area_C_smoothed <- triangle_area(base = 18.5 * 0.01, height = 21 * 0.02) + area_D_smoothed <- trapezium_area(side_a = 18.5 * 0.01, side_b = 37.5 * 0.01, height = 14.5 * 0.02) + area_E_smoothed <- trapezium_area(side_a = 37.5 * 0.01, side_b = 37.5 * 0.01, height = 16 * 0.02) + area_F_smoothed <- triangle_area(base = 37.5 * 0.01, height = 22.5 * 0.02) + area_G_smoothed <- triangle_area(base = 7.5 * 0.01, height = 8 * 0.02) + 
area_H_smoothed <- triangle_area(base = 7.5 * 0.01, height = 11 * 0.02) + area_I_smoothed <- triangle_area(base = 12.5 * 0.01, height = 19.5 * 0.02) + area_J_smoothed <- trapezium_area(side_a = 12.5 * 0.01, side_b = 20 * 0.01, height = 30.5 * 0.02) + area_K_smoothed <- trapezium_area(side_a = 20 * 0.01, side_b = 18 * 0.01, height = 8 * 0.02) + area_L_smoothed <- triangle_area(base = 18 * 0.01, height = 22 * 0.02) + expected_area_smoothed <- + sum( + area_A_smoothed, area_B_smoothed, area_C_smoothed, area_D_smoothed, + area_E_smoothed, area_F_smoothed, area_G_smoothed, area_H_smoothed, + area_I_smoothed, area_J_smoothed, area_K_smoothed, area_L_smoothed + ) + # Calculate area between ECMFs actual_area_unsmoothed <- area_between_dhist_ecmfs(ecmfA_unsmoothed, ecmfB_unsmoothed) actual_area_smoothed <- area_between_dhist_ecmfs(ecmfA_smoothed, ecmfB_smoothed) - + # Compare caculated areas with expected areas expect_equalish_manual <- function(actual, expected, relative_tolerance) { relative_diff <- abs(actual - expected) / expected expect_lte(relative_diff, relative_tolerance) } - + # Given manual measurement of areas between curves, consider area correct # if actual and expected areas are within 1% of each other expect_equalish_manual(actual_area_unsmoothed, expected_area_unsmoothed, 0.01) @@ -578,7 +676,7 @@ context("dhist: Harmonise dhist locations") test_that("harmonise_dhist_locations works A", { dhist1 <- dhist(masses = c(11, 12, 13), locations = c(1, 3, 5), smoothing_window_width = 1, sorted = FALSE) dhist2 <- dhist(masses = c(21, 22, 23), locations = c(2, 4, 6), smoothing_window_width = 1, sorted = FALSE) - + expected <- list( dhist1 = dhist(masses = c(11, 12, 13, 0, 0, 0), locations = c(1, 3, 5, 2, 4, 6), smoothing_window_width = 1, sorted = FALSE), dhist2 = dhist(masses = c(21, 22, 23, 0, 0, 0), locations = c(2, 4, 6, 1, 3, 5), smoothing_window_width = 1, sorted = FALSE) @@ -590,7 +688,7 @@ test_that("harmonise_dhist_locations works A", { 
test_that("harmonise_dhist_locations works B", { dhist1 <- dhist(masses = c(1, 1, 1), locations = c(1, 3, 5), smoothing_window_width = 1, sorted = FALSE) dhist2 <- dhist(masses = c(1, 1, 1), locations = c(4, 5, 6), smoothing_window_width = 1, sorted = FALSE) - + expected <- list( dhist1 = dhist(masses = c(1, 1, 1, 0, 0), locations = c(1, 3, 5, 4, 6), smoothing_window_width = 1, sorted = FALSE), dhist2 = dhist(masses = c(1, 1, 1, 0, 0), locations = c(4, 5, 6, 1, 3), smoothing_window_width = 1, sorted = FALSE) diff --git a/tests/testthat/test_emd.R b/tests/testthat/test_emd.R index 59145923..960a00c0 100644 --- a/tests/testthat/test_emd.R +++ b/tests/testthat/test_emd.R @@ -1,275 +1,336 @@ context("EMD: Cost matrix") # COST_MATRIX: Property-based tests -test_that("cost_matrix returns all zeros when all bin locations are identical",{ +test_that("cost_matrix returns all zeros when all bin locations are identical", { bin_centres1 <- c(1, 1, 1, 1, 1, 1, 1) bin_centres2 <- bin_centres1 - expected <- matrix(0, nrow = length(bin_centres1), - ncol = length(bin_centres2)) + expected <- matrix(0, + nrow = length(bin_centres1), + ncol = length(bin_centres2) + ) expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) }) test_that("cost_matrix returns zeros along diagonal when both sets of bin locations are the same", { - bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) - bin_centres2 <- bin_centres1 - expected <- rep(0, length(bin_centres1)) - expect_equal(diag(cost_matrix(bin_centres1, bin_centres2)), expected) - }) + bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) + bin_centres2 <- bin_centres1 + expected <- rep(0, length(bin_centres1)) + expect_equal(diag(cost_matrix(bin_centres1, bin_centres2)), expected) +}) test_that("cost_matrix returns zeros along diagonal and taxicab distance from all zeros for all other elements when both sets of bin locations are the same and are a sequence of consecutive integers", { - bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 
1, 2, 3, 4, 5) - bin_centres2 <- bin_centres1 - num_bins <- length(bin_centres1) - expected <- toeplitz(1:num_bins)-1 - expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) - }) + bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) + bin_centres2 <- bin_centres1 + num_bins <- length(bin_centres1) + expected <- toeplitz(1:num_bins) - 1 + expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) +}) test_that("cost_matrix is correct size when the two histograms are of different lengths", { - bin_centres1 <- c(1, 2, 3, 4, 5, 6, 7) - bin_centres2 <- c(8, 9, 10) - - cm <- cost_matrix(bin_centres1, bin_centres2) - - expect_equal(nrow(cm), length(bin_centres1)) - expect_equal(ncol(cm), length(bin_centres2)) - }) + bin_centres1 <- c(1, 2, 3, 4, 5, 6, 7) + bin_centres2 <- c(8, 9, 10) + + cm <- cost_matrix(bin_centres1, bin_centres2) + + expect_equal(nrow(cm), length(bin_centres1)) + expect_equal(ncol(cm), length(bin_centres2)) +}) context("EMD: EMD") # EMD: Property-based tests test_that("EMD methods return 0 when comparing a 1D feature distribution to - itself",{ - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) - bin_masses2 <- bin_masses1 - bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) - bin_centres2 <- bin_centres1 - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - - expected <- 0 - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - }) + itself", { + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) + bin_masses2 <- bin_masses1 + bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) + bin_centres2 <- bin_centres1 + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + + expected <- 0 + expect_equal(emd_lp( + 
bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) +}) test_that("EMD methods return numBins/2 when offsetting a symmetric discrete triangle distribution by 1", { - cost_fn <- function(triangle_width) { - move_dist <- ceiling((triangle_width+1)/2) - num_moves <- ceiling(triangle_width/2) - return(move_dist * num_moves) - } - - # Triangle(4, even), shifting by changing masses - bin_masses1 <- c(0, 1, 2, 3, 4, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 0, 1, 2, 3, 4, 4, 3, 2, 1) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - num_nonzero_bins <- sum(bin_masses1 > 0) - expected <- cost_fn(num_nonzero_bins) - emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(4, even), shifting by changing centres - bin_masses1 <- c(0, 1, 2, 3, 4, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 1, 2, 3, 4, 4, 3, 2, 1, 0) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) + 1 - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - num_nonzero_bins <- sum(bin_masses1 > 0) - expected <- cost_fn(num_nonzero_bins) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(5, odd), shifting by changing masses - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 0, 1, 2, 3, 4, 5, 4, 3, 2, 1) - 
bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - expected <- cost_fn(sum(bin_masses1 > 0)) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(5, odd), shifting by changing masses - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) + 1 - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - expected <- cost_fn(sum(bin_masses1 > 0)) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(5, even), shifting by changing masses - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - expected <- cost_fn(sum(bin_masses1 > 0)) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(5, even), shifting by changing centres - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) + 1 - histogram1 <- dhist(masses = 
bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - expected <- cost_fn(sum(bin_masses1 > 0)) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(6, odd), shifting by changing masses - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - expected <- cost_fn(sum(bin_masses1 > 0)) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(6, odd), shifting by changing centres - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 0) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) + 1 - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - expected <- cost_fn(sum(bin_masses1 > 0)) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - }) + cost_fn <- function(triangle_width) { + move_dist <- ceiling((triangle_width + 1) / 2) + num_moves <- ceiling(triangle_width / 2) + return(move_dist * num_moves) + } + + # Triangle(4, even), shifting by changing masses + bin_masses1 <- c(0, 1, 2, 3, 4, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 0, 1, 2, 3, 4, 4, 3, 2, 1) + bin_centres1 <- 1:length(bin_masses1) + 
bin_centres2 <- 1:length(bin_masses2) + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + num_nonzero_bins <- sum(bin_masses1 > 0) + expected <- cost_fn(num_nonzero_bins) + emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(4, even), shifting by changing centres + bin_masses1 <- c(0, 1, 2, 3, 4, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 1, 2, 3, 4, 4, 3, 2, 1, 0) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + 1 + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + num_nonzero_bins <- sum(bin_masses1 > 0) + expected <- cost_fn(num_nonzero_bins) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(5, odd), shifting by changing masses + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 0, 1, 2, 3, 4, 5, 4, 3, 2, 1) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + expected <- cost_fn(sum(bin_masses1 > 0)) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(5, odd), shifting by changing masses + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) + bin_centres1 <- 
1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + 1 + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + expected <- cost_fn(sum(bin_masses1 > 0)) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(5, even), shifting by changing masses + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + expected <- cost_fn(sum(bin_masses1 > 0)) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(5, even), shifting by changing centres + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + 1 + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + expected <- cost_fn(sum(bin_masses1 > 0)) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(6, odd), shifting by changing masses + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + histogram1 <- 
dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + expected <- cost_fn(sum(bin_masses1 > 0)) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(6, odd), shifting by changing centres + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 0) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + 1 + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + expected <- cost_fn(sum(bin_masses1 > 0)) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) +}) test_that("EMD methods return same result for densely and sparsely specified bins", { - sparse_bin_masses1 <- c(1, 1, 1, 1, 1, 1) - sparse_bin_masses2 <- c(1, 1, 1, 1, 1, 1) - sparse_bin_centres1 <- c(1, 2, 4, 7, 11, 16) - sparse_bin_centres2 <- c(21, 22, 24, 27, 31, 36) - sparse_histogram1 <- dhist(masses = sparse_bin_masses1, - locations = sparse_bin_centres1) - sparse_histogram2 <- dhist(masses = sparse_bin_masses2, - locations = sparse_bin_centres2) - - dense_bin_centres1 <- 1:36 - dense_bin_centres2 <- dense_bin_centres1 - bin_mass_sequence <- c(1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1) - bin_mass_padding <- rep(0, length(dense_bin_centres1) - - length(bin_mass_sequence)) - dense_bin_masses1 <- c(bin_mass_sequence, bin_mass_padding) - dense_bin_masses2 <- c(bin_mass_padding, bin_mass_sequence) - dense_histogram1 <- dhist(masses = dense_bin_masses1, - locations = dense_bin_centres1) - dense_histogram2 <- dhist(masses = dense_bin_masses2, - locations = 
dense_bin_centres2) - - expect_equal(emd_lp(dense_bin_masses1, dense_bin_masses2, - dense_bin_centres1, dense_bin_centres2), - emd_lp(sparse_bin_masses1, sparse_bin_masses2, - sparse_bin_centres1, sparse_bin_centres2)) - expect_equal(emd_cs(dense_histogram1, dense_histogram2), - emd_cs(sparse_histogram1,sparse_histogram2)) - expect_equal(emd(dense_histogram1, dense_histogram2), - emd(sparse_histogram1, sparse_histogram2)) - }) + sparse_bin_masses1 <- c(1, 1, 1, 1, 1, 1) + sparse_bin_masses2 <- c(1, 1, 1, 1, 1, 1) + sparse_bin_centres1 <- c(1, 2, 4, 7, 11, 16) + sparse_bin_centres2 <- c(21, 22, 24, 27, 31, 36) + sparse_histogram1 <- dhist( + masses = sparse_bin_masses1, + locations = sparse_bin_centres1 + ) + sparse_histogram2 <- dhist( + masses = sparse_bin_masses2, + locations = sparse_bin_centres2 + ) + + dense_bin_centres1 <- 1:36 + dense_bin_centres2 <- dense_bin_centres1 + bin_mass_sequence <- c(1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1) + bin_mass_padding <- rep(0, length(dense_bin_centres1) - + length(bin_mass_sequence)) + dense_bin_masses1 <- c(bin_mass_sequence, bin_mass_padding) + dense_bin_masses2 <- c(bin_mass_padding, bin_mass_sequence) + dense_histogram1 <- dhist( + masses = dense_bin_masses1, + locations = dense_bin_centres1 + ) + dense_histogram2 <- dhist( + masses = dense_bin_masses2, + locations = dense_bin_centres2 + ) + + expect_equal( + emd_lp( + dense_bin_masses1, dense_bin_masses2, + dense_bin_centres1, dense_bin_centres2 + ), + emd_lp( + sparse_bin_masses1, sparse_bin_masses2, + sparse_bin_centres1, sparse_bin_centres2 + ) + ) + expect_equal( + emd_cs(dense_histogram1, dense_histogram2), + emd_cs(sparse_histogram1, sparse_histogram2) + ) + expect_equal( + emd(dense_histogram1, dense_histogram2), + emd(sparse_histogram1, sparse_histogram2) + ) +}) test_that("EMD methods return same result when order of densely specified bins is changed", { - bin_masses1 <- c(1, 1, 1, 1, 0, 0, 0, 0, 0) - bin_masses2 <- c(0, 0, 0, 0, 0, 1, 1, 1, 1) - 
bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - - permuted_indexes1 <- sample(1:length(bin_masses1), replace = FALSE) - permuted_indexes2 <- sample(1:length(bin_masses2), replace = FALSE) - - permuted_bin_masses1 <- bin_masses1[permuted_indexes1] - permuted_bin_centres1 <- bin_centres1[permuted_indexes1] - permuted_bin_masses2 <- bin_masses2[permuted_indexes2] - permuted_bin_centres2 <- bin_centres2[permuted_indexes2] - permuted_histogram1 <- dhist(masses = permuted_bin_masses1, - locations = permuted_bin_centres1) - permuted_histogram2 <- dhist(masses = permuted_bin_masses2, - locations = permuted_bin_centres2) - - expect_equal(emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2), - emd_lp(permuted_bin_masses1, permuted_bin_masses2, - permuted_bin_centres1, permuted_bin_centres2)) - expect_equal(emd_cs(histogram1, histogram2), - emd_cs(permuted_histogram1, permuted_histogram2)) - expect_equal(emd(histogram1, histogram2), - emd(permuted_histogram1, permuted_histogram2)) - }) + bin_masses1 <- c(1, 1, 1, 1, 0, 0, 0, 0, 0) + bin_masses2 <- c(0, 0, 0, 0, 0, 1, 1, 1, 1) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + + permuted_indexes1 <- sample(1:length(bin_masses1), replace = FALSE) + permuted_indexes2 <- sample(1:length(bin_masses2), replace = FALSE) + + permuted_bin_masses1 <- bin_masses1[permuted_indexes1] + permuted_bin_centres1 <- bin_centres1[permuted_indexes1] + permuted_bin_masses2 <- bin_masses2[permuted_indexes2] + permuted_bin_centres2 <- bin_centres2[permuted_indexes2] + permuted_histogram1 <- dhist( + masses = permuted_bin_masses1, + locations = permuted_bin_centres1 + ) + permuted_histogram2 <- dhist( + 
masses = permuted_bin_masses2, + locations = permuted_bin_centres2 + ) + + expect_equal( + emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2), + emd_lp( + permuted_bin_masses1, permuted_bin_masses2, + permuted_bin_centres1, permuted_bin_centres2 + ) + ) + expect_equal( + emd_cs(histogram1, histogram2), + emd_cs(permuted_histogram1, permuted_histogram2) + ) + expect_equal( + emd(histogram1, histogram2), + emd(permuted_histogram1, permuted_histogram2) + ) +}) test_that("EMD methods return same result when order of sparsely specified bins is changed", { - bin_masses1 <- c(1, 1, 1, 1, 1, 1) - bin_masses2 <- c(1, 1, 1, 1, 1, 1) - bin_centres1 <- c(1, 2, 4, 8, 16, 32) - bin_centres2 <- c(-32, -16, -8, -4, -2, -1) - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - - permuted_indexes1 <- sample(1:length(bin_masses1), replace = FALSE) - permuted_indexes2 <- sample(1:length(bin_masses2), replace = FALSE) - - permuted_bin_masses1 <- bin_masses1[permuted_indexes1] - permuted_bin_centres1 <- bin_centres1[permuted_indexes1] - permuted_bin_masses2 <- bin_masses2[permuted_indexes2] - permuted_bin_centres2 <- bin_centres2[permuted_indexes2] - permuted_histogram1 <- dhist(masses = permuted_bin_masses1, - locations = permuted_bin_centres1) - permuted_histogram2 <- dhist(masses = permuted_bin_masses2, - locations = permuted_bin_centres2) - - expect_equal(emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2), - emd_lp(permuted_bin_masses1, permuted_bin_masses2, - permuted_bin_centres1, permuted_bin_centres2)) - expect_equal(emd_cs(histogram1, histogram2), - emd_cs(permuted_histogram1, permuted_histogram2)) - expect_equal(emd(histogram1, histogram2), - emd(permuted_histogram1, permuted_histogram2)) - }) + bin_masses1 <- c(1, 1, 1, 1, 1, 1) + bin_masses2 <- c(1, 1, 1, 1, 1, 1) + bin_centres1 <- c(1, 2, 4, 8, 16, 32) + bin_centres2 <- c(-32, -16, -8, -4, -2, -1) + histogram1 <- 
dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + + permuted_indexes1 <- sample(1:length(bin_masses1), replace = FALSE) + permuted_indexes2 <- sample(1:length(bin_masses2), replace = FALSE) + + permuted_bin_masses1 <- bin_masses1[permuted_indexes1] + permuted_bin_centres1 <- bin_centres1[permuted_indexes1] + permuted_bin_masses2 <- bin_masses2[permuted_indexes2] + permuted_bin_centres2 <- bin_centres2[permuted_indexes2] + permuted_histogram1 <- dhist( + masses = permuted_bin_masses1, + locations = permuted_bin_centres1 + ) + permuted_histogram2 <- dhist( + masses = permuted_bin_masses2, + locations = permuted_bin_centres2 + ) + + expect_equal( + emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2), + emd_lp( + permuted_bin_masses1, permuted_bin_masses2, + permuted_bin_centres1, permuted_bin_centres2 + ) + ) + expect_equal( + emd_cs(histogram1, histogram2), + emd_cs(permuted_histogram1, permuted_histogram2) + ) + expect_equal( + emd(histogram1, histogram2), + emd(permuted_histogram1, permuted_histogram2) + ) +}) context("EMD: Next step") test_that("next_step gives correct shift and matrix for simple x1, x2", { x1 <- c(-3000, -2000, -1000, 0, 1000, 2000, 3000, 4000) x2 <- c(-3100, -2100, -1100, 10, 100, 1100, 2100, 3100, 4100) - + expected_shift <- 10 expected_distance_matrix <- rbind( c(-100, -1100, -2100, -3100, -4100, -5100, -6100, -7100), @@ -282,7 +343,7 @@ test_that("next_step gives correct shift and matrix for simple x1, x2", { c(6100, 5100, 4100, 3100, 2100, 1100, 100, -900), c(7100, 6100, 5100, 4100, 3100, 2100, 1100, 100) ) - expected_distance_matrix[expected_distance_matrix<=0] <- Inf + expected_distance_matrix[expected_distance_matrix <= 0] <- Inf expected <- list(shift = 10, distance_matrix = expected_distance_matrix) actual <- shift_to_next_alignment(x1, x2) expect_equal(actual, expected) @@ -295,7 +356,7 @@ test_that("next_step gives correct shift for random x1, x2", { 
x1_min <- -1000 x1_max <- 1000 x1_prec <- 10 - x1 <- unique(sort(trunc(runif(27, x1_min, x1_max)/x1_prec) * x1_prec)) + x1 <- unique(sort(trunc(runif(27, x1_min, x1_max) / x1_prec) * x1_prec)) # Initialise x2 to a copy of x1 with all elements shifted right by 40% of # the minimum spacing between elements std_shift <- 0.4 * x1_prec @@ -304,7 +365,7 @@ test_that("next_step gives correct shift for random x1, x2", { min_shift <- 0.1 * x1_prec x2_rand_ind <- trunc(runif(1, 1, length(x2) + 1)) x2[x2_rand_ind] <- x2[x2_rand_ind] - std_shift + min_shift - + expected_shift <- min_shift actual_shift <- shift_to_next_alignment(x1, x2)$shift expect_equal(actual_shift, expected_shift) @@ -314,14 +375,14 @@ test_that("next_step gives correct shift for random x1, x2", { context("EMD: MinEMD") test_that("min_emd_ methods correctly compare a non-offset 1D feature - distribution to itself",{ + distribution to itself", { bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) bin_masses2 <- bin_masses1 bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) bin_centres2 <- bin_centres1 dhist1 <- dhist(masses = bin_masses1, locations = bin_centres1) dhist2 <- dhist(masses = bin_masses2, locations = bin_centres2) - + expected <- list(min_emd = 0, min_offset = 0) # Check min_emd_optimise actual_optimise <- min_emd_exhaustive(dhist1, dhist2) @@ -332,8 +393,8 @@ test_that("min_emd_ methods correctly compare a non-offset 1D feature }) test_that("min_emd_ methods correctly compare an offset 1D feature - distribution to itself",{ - offset = 10 + distribution to itself", { + offset <- 10 bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) bin_masses2 <- bin_masses1 bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) @@ -344,7 +405,7 @@ test_that("min_emd_ methods correctly compare an offset 1D feature bin_centres2 <- bin_centres1 + offset dhist1 <- dhist(masses = bin_masses1, locations = bin_centres1) dhist2 <- dhist(masses = bin_masses2, locations = bin_centres2) - + expected <- list(min_emd = 
0, min_offset = offset) # Check min_emd_optimise actual_optimise <- min_emd_exhaustive(dhist1, dhist2) diff --git a/tests/testthat/test_fastEMD.R b/tests/testthat/test_fastEMD.R index cd091401..d8c15b83 100644 --- a/tests/testthat/test_fastEMD.R +++ b/tests/testthat/test_fastEMD.R @@ -83,6 +83,31 @@ getVal <- function(x1,v1,w1,x2,v2,w2) integrate(f1,mi123,ma123) } +getRvalue <- function(x1,v1,w1,x2,v2,w2) +{ + v1s <- c(v1[1],diff(v1)) + v2s <- c(v2[1],diff(v2)) + d1 = dhist(x1+w1/2,v1s,smoothing_window_width=w1) + d2 = dhist(x2+w2/2,v2s,smoothing_window_width=w2) + d1ec <- dhist_ecmf(d1) + d2ec <- dhist_ecmf(d2) + area_between_dhist_ecmfs(d1ec,d2ec) +} + + +compareRonlyVsOpt <- function(x1,v1,w1,x2,v2,w2) +{ + v1s <- c(v1[1],diff(v1)) + v2s <- c(v2[1],diff(v2)) + d1 = dhist(x1,v1s,smoothing_window_width=w1) + d2 = dhist(x2,v2s,smoothing_window_width=w2) + res1 <- netdist::netemd_single_pair(d1,d2,method = "optimise") + res2 <- netdist::netemd_single_pair(d1,d2,method = "optimiseRonly") + expect_lt(abs(res1$min_emd-res2$min_emd),10**(-4)) + c(res1$min_emd,res2$min_emd) +} + + test_that("3 element test", { @@ -264,16 +289,22 @@ test_that("many element test Mixture ", { f1 <- makeFunction(x1,v1,w1,x2,v2,w2) top1 = max(x2[length(x2)],x1[length(x1)])+max(w1,w2) bottom1 = min(x2[1],x1[1]) + + q1 = compareRonlyVsOpt(x1,v1,w1,x2,v2,w2) + res2 <- 0 res2 <- res2 + integrate(f1,bottom1,top1,abs.tol=0.000000001,subdivisions = 100000000)[[1]] res1 <- NetEmdSmoothV2(x1,v1,w1,x2,v2,w2) + + res3 <- getRvalue(x1,v1,w1,x2,v2,w2) + # print(c(q1,res1,res2,res3)) # if (abs(res1-res2)>0.001) # { # browser() # } # Swapped to percentage error - expect_lt(abs(res1-res2),10**(-3)) + expect_lt(abs(res1-res3),10**(-3)) } } @@ -306,8 +337,8 @@ test_that("Old failure case", { attr(d2,'class') <- "dhist" d2$locations <- c(0,1,2,3) d2$masses <- c(8634,1242,114,10) - sq1 <- net_emd_single_pair(d1,d2,method='optimise',smoothing_window_width = 1) - sq2 <- 
net_emd_single_pair(d1,d2,method='optimiseRonly',smoothing_window_width = 1) + sq1 <- netemd_single_pair(d1,d2,method='optimise',smoothing_window_width = 1) + sq2 <- netemd_single_pair(d1,d2,method='optimiseRonly',smoothing_window_width = 1) expect_lt(abs(sq1$min_emd-sq2$min_emd),10**(-4)) }) @@ -338,4 +369,42 @@ test_that("Old Failure Case 2 reverse", { res1 <- integrate(f1,min(min(x1),min(x2)),max(max(x1),max(x2))+max(w1,w2))[[1]] res2 <- NetEmdSmoothV2(x1,v1,w1,x2,v2,w2) expect_lt(abs(res2-res1),10**(-4)) -}) \ No newline at end of file +}) + + +test_that("equal distributions moving upwards",{ + x2 = 1:10 + v2 = 1:10 + w2 = 0.5 + x1 = 1:10 + v1 = c(1,2) + v1 = 1:10 + w1 = 0.5 + for (i in 0:10) + { + res2 <- NetEmdSmoothV2(x1,v1+i,w1,x2,v2,w2) + res3 <- getRvalue(x1,v1+i,w1,x2,v2,w2) + print(c(i,res2,res3)) + expect_lt(abs(res2-9.25*i),10**(-4)) + expect_lt(abs(res2-res3),10**(-4)) + } + } + ) + +test_that("equal distributions moving upwards diff width",{ + x2 = 1:10 + v2 = 1:10 + w2 = 0.5 + x1 = 1:10 + v1 = c(1,2) + v1 = 1:10 + w1 = 0.25 + for (i in 0:10) + { + res2 <- NetEmdSmoothV2(x1,v1+i,w1,x2,v2,w2) + res3 <- getRvalue(x1,v1+i,w1,x2,v2,w2) + print(c(i,res2,res3)) + expect_lt(abs(res2-res3),10**(-4)) + } + } + ) \ No newline at end of file diff --git a/tests/testthat/test_graph_binning.R b/tests/testthat/test_graph_binning.R index 958efa80..b54d4796 100644 --- a/tests/testthat/test_graph_binning.R +++ b/tests/testthat/test_graph_binning.R @@ -1,67 +1,79 @@ context("Graph binning: Adaptive binning") test_that("adaptive_breaks merges 2 lowest bins where only first bin is below minimum", { min_count <- 5 - x <- c(1.5, rep(2.2, min_count), rep(3.5, min_count), rep(4.5, min_count), - rep(5.5, min_count), rep(6.5, min_count + 1)) + x <- c( + 1.5, rep(2.2, min_count), rep(3.5, min_count), rep(4.5, min_count), + rep(5.5, min_count), rep(6.5, min_count + 1) + ) initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = 
initial_breaks) final_breaks_expected <- c(1, 3, 4, 5, 6, 7) - + expect_equal(final_breaks_actual, final_breaks_expected) }) test_that("adaptive_breaks merges 3 lowest bins where lowest 2 combined are below minimum", { min_count <- 5 - x <- c(1.5, rep(2.2, 2), rep(3.5, min_count), rep(4.5, min_count), - rep(5.5, min_count), rep(6.5, min_count + 1)) + x <- c( + 1.5, rep(2.2, 2), rep(3.5, min_count), rep(4.5, min_count), + rep(5.5, min_count), rep(6.5, min_count + 1) + ) initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 4, 5, 6, 7) - + expect_equal(final_breaks_actual, final_breaks_expected) }) test_that("adaptive_breaks merges pair of bins in middle", { min_count <- 5 - x <- c(rep(1.6, min_count), rep(2.2, min_count), rep(3.5, 2), rep(4.5, 3), - rep(5.5, min_count), rep(6.5, min_count + 1)) + x <- c( + rep(1.6, min_count), rep(2.2, min_count), rep(3.5, 2), rep(4.5, 3), + rep(5.5, min_count), rep(6.5, min_count + 1) + ) initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 2, 3, 5, 6, 7) - + expect_equal(final_breaks_actual, final_breaks_expected) }) test_that("adaptive_breaks merges two spearated pairs of bins in middle", { min_count <- 5 - x <- c(rep(1.6, min_count), rep(2.2, 2), rep(3.5, 3), rep(4.5, min_count), - rep(5.5, 3), rep(6.5, 2), rep(7.8, min_count)) + x <- c( + rep(1.6, min_count), rep(2.2, 2), rep(3.5, 3), rep(4.5, min_count), + rep(5.5, 3), rep(6.5, 2), rep(7.8, min_count) + ) initial_breaks <- 1:8 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 2, 4, 5, 7, 8) - + expect_equal(final_breaks_actual, final_breaks_expected) }) test_that("adaptive_breaks merges 2 uppermost bins where both are below minimum", { min_count <- 5 - x <- c(rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), - rep(4.5, 
min_count), rep(5.5, 2), rep(6.5, 3)) + x <- c( + rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), + rep(4.5, min_count), rep(5.5, 2), rep(6.5, 3) + ) initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) - final_breaks_expected <- c(1, 2,3, 4, 5, 7) - + final_breaks_expected <- c(1, 2, 3, 4, 5, 7) + expect_equal(final_breaks_actual, final_breaks_expected) }) test_that("adaptive_breaks merges 2 uppermost bins where only last bin is below minimum", { min_count <- 5 - x <- c(rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), - rep(4.5, min_count), rep(5.5, min_count), rep(6.5, 3)) + x <- c( + rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), + rep(4.5, min_count), rep(5.5, min_count), rep(6.5, 3) + ) initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 2, 3, 4, 5, 7) - + expect_equal(final_breaks_actual, final_breaks_expected) }) @@ -71,7 +83,7 @@ test_that("adaptive_breaks merges bins with no members with the next bin", { initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 2, 6, 7) - + expect_equal(final_breaks_actual, final_breaks_expected) }) @@ -81,23 +93,27 @@ test_that("adaptive_breaks merges 2 bins below minimum, plus the empty bins betw initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 2, 6, 7) - + expect_equal(final_breaks_actual, final_breaks_expected) }) context("Graph binning: Adaptively binned densities") test_that("binned_densities_adaptive works", { # Helper function - test_binning <- function(densities, min_counts_per_interval, num_intervals, + test_binning <- function(densities, min_counts_per_interval, num_intervals, breaks, expected_breaks, expected_interval_indexes) { # Set up expected 
output - expected <- list(densities = densities, - interval_indexes = expected_interval_indexes, - breaks = expected_breaks) + expected <- list( + densities = densities, + interval_indexes = expected_interval_indexes, + breaks = expected_breaks + ) # Calculate actual output actual <- binned_densities_adaptive( - densities, min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals) + densities, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) # Check actual matches expected expect_equal(actual, expected) } @@ -105,27 +121,32 @@ test_that("binned_densities_adaptive works", { densities <- c(0, 0.099, 0.2, 0.299, 0.4, 0.49, 0.6, 0.699, 0.8, 0.899, 1.0) min_counts_per_interval <- 2 num_intervals <- 100 - expected_breaks <-c(0, 0.1, 0.3, 0.5, 0.7, 1.0) + expected_breaks <- c(0, 0.1, 0.3, 0.5, 0.7, 1.0) expected_interval_indexes <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5) - test_binning(densities, min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals, expected_breaks = expected_breaks, - expected_interval_indexes = expected_interval_indexes) + test_binning(densities, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals, expected_breaks = expected_breaks, + expected_interval_indexes = expected_interval_indexes + ) # Test 2: densities <- c(0, 0.012, 0.099, 0.201, 0.299, 0.402, 0.49, 0.596, 0.699, 0.803, 0.899, 1.0) min_counts_per_interval <- 2 num_intervals <- 100 - expected_breaks <-c(0, 0.02, 0.21, 0.41, 0.6, 0.81, 1.0) + expected_breaks <- c(0, 0.02, 0.21, 0.41, 0.6, 0.81, 1.0) expected_interval_indexes <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6) - test_binning(densities, min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals, expected_breaks = expected_breaks, - expected_interval_indexes = expected_interval_indexes) + test_binning(densities, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals, 
expected_breaks = expected_breaks, + expected_interval_indexes = expected_interval_indexes + ) }) -expected_binned_graphlet_counts <- +expected_binned_graphlet_counts <- function(graphs, binning_fn, max_graphlet_size) { binned_graphs <- binning_fn(graphs) - ref_counts <- purrr::map(binned_graphs$graphs, count_graphlets_for_graph, - max_graphlet_size) + ref_counts <- purrr::map( + binned_graphs$graphs, count_graphlets_for_graph, + max_graphlet_size + ) ref_counts } - diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index f045651f..ac1a68c2 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -2,33 +2,41 @@ context("Measures Netdis: Graphlet tuples") test_message <- paste("count_graphlet_tuples and count_graphlet_tuples_ego give", - "choose(node_count, graphlet_size) for each graph + graphlet", - "combination",sep = " ") + "choose(node_count, graphlet_size) for each graph + graphlet", + "combination", + sep = " " + ) test_that(test_message, { - # Create some test graphs with known node counts (this is the only graph + # Create some test graphs with known node counts (this is the only graph # property we care about for this test) graph_n11 <- igraph::erdos.renyi.game(11, p = 1, type = "gnp") graph_n37 <- igraph::erdos.renyi.game(37, p = 1, type = "gnp") graph_n73 <- igraph::erdos.renyi.game(73, p = 1, type = "gnp") + + # calculate graphlet counts object using previously tested function + graphlet_counts_n11 <- count_graphlets_for_graph(graph_n11, 5) + graphlet_counts_n37 <- count_graphlets_for_graph(graph_n37, 5) + graphlet_counts_n73 <- count_graphlets_for_graph(graph_n73, 5) + # Calculate expected graph tuple count for graphlets of various sizes. 
There # is 1 graphlet of size 1, 2 of size 3, 6 of size 4, and 21 of size 5 graphlet_tuple_counts <- function(n, max_graphlet_size) { - if(max_graphlet_size >= 2) { + if (max_graphlet_size >= 2) { tuple_counts <- rep(choose(n, 2), 1) } - if(max_graphlet_size >= 3) { + if (max_graphlet_size >= 3) { tuple_counts <- c(tuple_counts, rep(choose(n, 3), 2)) } - if(max_graphlet_size >= 4) { + if (max_graphlet_size >= 4) { tuple_counts <- c(tuple_counts, rep(choose(n, 4), 6)) } - if(max_graphlet_size >= 5) { + if (max_graphlet_size >= 5) { tuple_counts <- c(tuple_counts, rep(choose(n, 5), 21)) } tuple_counts <- setNames(tuple_counts, graphlet_key(max_graphlet_size)$id) tuple_counts } - + # === TEST count_graphlet_tuples === # Generate expected tuple counts for graphlets up to size 4 and 5 expected_tuple_count_n11_gs4 <- graphlet_tuple_counts(11, 4) @@ -37,13 +45,15 @@ test_that(test_message, { expected_tuple_count_n11_gs5 <- graphlet_tuple_counts(11, 5) expected_tuple_count_n37_gs5 <- graphlet_tuple_counts(37, 5) expected_tuple_count_n73_gs5 <- graphlet_tuple_counts(73, 5) + # Generate actual tuple counts for graphlets up to size 4 and 5 - actual_tuple_count_n11_gs4 <- count_graphlet_tuples(graph_n11, 4) - actual_tuple_count_n37_gs4 <- count_graphlet_tuples(graph_n37, 4) - actual_tuple_count_n73_gs4 <- count_graphlet_tuples(graph_n73, 4) - actual_tuple_count_n11_gs5 <- count_graphlet_tuples(graph_n11, 5) - actual_tuple_count_n37_gs5 <- count_graphlet_tuples(graph_n37, 5) - actual_tuple_count_n73_gs5 <- count_graphlet_tuples(graph_n73, 5) + actual_tuple_count_n11_gs4 <- count_graphlet_tuples(graphlet_counts_n11, 4) + actual_tuple_count_n37_gs4 <- count_graphlet_tuples(graphlet_counts_n37, 4) + actual_tuple_count_n73_gs4 <- count_graphlet_tuples(graphlet_counts_n73, 4) + actual_tuple_count_n11_gs5 <- count_graphlet_tuples(graphlet_counts_n11, 5) + actual_tuple_count_n37_gs5 <- count_graphlet_tuples(graphlet_counts_n37, 5) + actual_tuple_count_n73_gs5 <- 
count_graphlet_tuples(graphlet_counts_n73, 5) + # Compare expected tuple counts with actual expect_equal(expected_tuple_count_n11_gs4, actual_tuple_count_n11_gs4) expect_equal(expected_tuple_count_n37_gs4, actual_tuple_count_n37_gs4) @@ -51,24 +61,27 @@ test_that(test_message, { expect_equal(expected_tuple_count_n11_gs5, actual_tuple_count_n11_gs5) expect_equal(expected_tuple_count_n37_gs5, actual_tuple_count_n37_gs5) expect_equal(expected_tuple_count_n73_gs5, actual_tuple_count_n73_gs5) - + # === TEST count_graphlet_tuples_ego === - # NOTE: This test is not amazing, as graphlet_tuple_counts_ego is very similar + # NOTE: This test is not amazing, as graphlet_tuple_counts_ego is very similar # to the method under test. However, it's a simple method so maybe that's ok? - graphlet_tuple_counts_ego <- function(ego_networks, max_graphlet_size) { - t(sapply(ego_networks, FUN = function(g) { - graphlet_tuple_counts(length(igraph::V(g)), max_graphlet_size)})) + graphlet_tuple_counts_ego <- function(graphlet_counts_ego, max_graphlet_size) { + t(apply(graphlet_counts_ego, 1, + count_graphlet_tuples, + max_graphlet_size = max_graphlet_size + )) } - # Generate ego networks for each graph - graph_n11_ego1 <- make_named_ego_graph(graph_n11, order = 1) - graph_n37_ego1 <- make_named_ego_graph(graph_n37, order = 1) - graph_n73_ego1 <- make_named_ego_graph(graph_n73, order = 1) - graph_n11_ego2 <- make_named_ego_graph(graph_n11, order = 2) - graph_n37_ego2 <- make_named_ego_graph(graph_n37, order = 2) - graph_n73_ego2 <- make_named_ego_graph(graph_n73, order = 2) + # Generate ego network graphlet counts for each graph + graph_n11_ego1 <- count_graphlets_ego(graph_n11, neighbourhood_size = 1) + graph_n37_ego1 <- count_graphlets_ego(graph_n37, neighbourhood_size = 1) + graph_n73_ego1 <- count_graphlets_ego(graph_n73, neighbourhood_size = 1) + graph_n11_ego2 <- count_graphlets_ego(graph_n11, neighbourhood_size = 2) + graph_n37_ego2 <- count_graphlets_ego(graph_n37, neighbourhood_size 
= 2) + graph_n73_ego2 <- count_graphlets_ego(graph_n73, neighbourhood_size = 2) + # Generate expected tuple counts for graphlets up to size 4 and 5 # 1. For ego-networks of order 1 - expected_tuple_count_n11_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego1, 4) + expected_tuple_count_n11_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego1, 4) expected_tuple_count_n37_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n37_ego1, 4) expected_tuple_count_n73_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n73_ego1, 4) expected_tuple_count_n11_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n11_ego1, 5) @@ -81,7 +94,7 @@ test_that(test_message, { expected_tuple_count_n11_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n11_ego2, 5) expected_tuple_count_n37_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n37_ego2, 5) expected_tuple_count_n73_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n73_ego2, 5) - + # Calculate actual tuple counts # 1. For ego-networks of order 1 actual_tuple_count_n11_ego1_gs4 <- count_graphlet_tuples_ego(graph_n11_ego1, 4) @@ -97,9 +110,8 @@ test_that(test_message, { actual_tuple_count_n11_ego2_gs5 <- count_graphlet_tuples_ego(graph_n11_ego2, 5) actual_tuple_count_n37_ego2_gs5 <- count_graphlet_tuples_ego(graph_n37_ego2, 5) actual_tuple_count_n73_ego2_gs5 <- count_graphlet_tuples_ego(graph_n73_ego2, 5) - + # Compare expected with actual - # 1. 
For ego-networks of order 1 expect_equal(expected_tuple_count_n11_ego1_gs4, actual_tuple_count_n11_ego1_gs4) expect_equal(expected_tuple_count_n37_ego1_gs4, actual_tuple_count_n37_ego1_gs4) expect_equal(expected_tuple_count_n73_ego1_gs4, actual_tuple_count_n73_ego1_gs4) @@ -115,156 +127,386 @@ test_that(test_message, { expect_equal(expected_tuple_count_n73_ego2_gs5, actual_tuple_count_n73_ego2_gs5) }) -context("Measures Netdis: Ego-network scaled graphlet outputs for manually verified networks") -test_that("Ego-network 4-node graphlet counts match manually verified totals",{ +context("Measures Netdis: Ego-network density values match those for manually verified networks") +test_that("Ego-network 4-node density values match manually verified totals", { # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - + + # Set parameters for test + max_graphlet_size <- 4 + min_counts_per_interval <- 2 + num_intervals <- 100 + min_ego_edges <- 0 + min_ego_nodes <- 0 + + # Set node and graphlet labels to use for row and col names in expected counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + + # Set manually verified ego-network node counts and edge densities + # 1 . 
Ego-networks of order 1 + expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) + expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) + names(expected_densities_o1) <- node_labels + # Order 1 expected densities should be: + # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 + # 2. Ego-networks of order 2 + expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) + expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) + names(expected_densities_o2) <- node_labels + # Order 2 expected densities should be: + # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 + + # Generate order 1 and 2 ego network graphlet counts + # with previously tested function + graphlet_counts_ego_o1 <- count_graphlets_ego(graph, + neighbourhood_size = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + graphlet_counts_ego_o2 <- count_graphlets_ego(graph, + neighbourhood_size = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + + # Calculate densities + actual_densities_o1 <- ego_network_density(graphlet_counts_ego_o1) + actual_densities_o2 <- ego_network_density(graphlet_counts_ego_o2) + + # Check densities match expected values + expect_equal(actual_densities_o1, expected_densities_o1) + expect_equal(actual_densities_o2, expected_densities_o2) +}) + +context("Measures Netdis: Ego-network density-binned reference counts for manually verified networks") +test_that("Ego-network 4-node density-binned reference counts match manually verified totals", { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + 
c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set parameters for test + max_graphlet_size <- 4 + min_counts_per_interval <- 2 + num_intervals <- 100 + # Set node and graphlet labels to use for row and col names in expected counts node_labels <- igraph::V(graph)$name graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - + + # Set manually verified ego-network node counts and edge densities + # 1 . Ego-networks of order 1 + expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) + expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) + # Order 1 expected densities should be: + # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 + # 2. Ego-networks of order 2 + expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) + expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) + # Order 2 expected densities should be: + # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 + + # Set manually verified density bins for ego-networks + # 1. 
Ego-networks of order 1 + expected_breaks_o1 <- c(0.5, 0.605, 0.705, 1) + expected_interval_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) + expected_binned_densities_o1 <- list( + densities = expected_densities_o1, + interval_indexes = expected_interval_indexes_o1, + breaks = expected_breaks_o1 + ) + # Check binned densities are as expected + actual_binned_densities_o1 <- binned_densities_adaptive( + expected_densities_o1, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) + expect_equal(actual_binned_densities_o1, expected_binned_densities_o1) + # 2. Ego-networks of order 2 + expected_min_break_o2 <- 1 / 3 + expected_max_break_o2 <- 0.6 + expected_initial_interval_o2 <- + (expected_max_break_o2 - expected_min_break_o2) / (num_intervals) # 0.00266666667 + expected_breaks_o2 <- expected_min_break_o2 + (expected_initial_interval_o2 * c(0, 9, 50, 63, 100)) + expected_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) + expected_binned_densities_o2 <- list( + densities = expected_densities_o2, + interval_indexes = expected_interval_indexes_o2, + breaks = expected_breaks_o2 + ) + # Check binned densities are as expected + actual_binned_densities_o2 <- binned_densities_adaptive( + expected_densities_o2, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) + expect_equal(actual_binned_densities_o2, expected_binned_densities_o2) + + # Set manually verified scaled ego-network graphlet counts + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # 1-step ego networks + expected_counts_o1 <- rbind( + c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), + c(7, 
3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) + ) + rownames(expected_counts_o1) <- node_labels + colnames(expected_counts_o1) <- graphlet_labels + # 2-step ego networks + expected_counts_o2 <- rbind( + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) + ) + rownames(expected_counts_o2) <- node_labels + colnames(expected_counts_o2) <- graphlet_labels + + # Calculate binned average expected counts based on manually verified counts + # and density bins + # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 + mean_counts_bin1_o1 <- (expected_counts_o1[1, ] + expected_counts_o1[2, ]) / 2 + mean_counts_bin2_o1 <- (expected_counts_o1[6, ] + expected_counts_o1[7, ] + + expected_counts_o1[8, ]) / 3 + mean_counts_bin3_o1 <- (expected_counts_o1[3, ] + expected_counts_o1[4, ] + + expected_counts_o1[5, ] + expected_counts_o1[9, ] + + expected_counts_o1[10, ]) / 5 + expected_mean_density_binned_counts_o1 <- rbind( + mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 + ) + rownames(expected_mean_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 2, 3, 1, 3, 2, 2, 4, 4, 4 + mean_counts_bin1_o2 <- (expected_counts_o2[1, ] + expected_counts_o2[4, ]) / 2 + 
mean_counts_bin2_o2 <- (expected_counts_o2[2, ] + expected_counts_o2[6, ] + + expected_counts_o2[7, ]) / 3 + mean_counts_bin3_o2 <- (expected_counts_o2[3, ] + expected_counts_o2[5, ]) / 2 + mean_counts_bin4_o2 <- (expected_counts_o2[8, ] + expected_counts_o2[9, ] + + expected_counts_o2[10, ]) / 3 + expected_mean_density_binned_counts_o2 <- rbind( + mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, + mean_counts_bin4_o2 + ) + rownames(expected_mean_density_binned_counts_o2) <- 1:4 + + # Calculate actual output of function under test + actual_mean_density_binned_counts_o1 <- mean_density_binned_graphlet_counts( + expected_counts_o1, expected_interval_indexes_o1 + ) + actual_mean_density_binned_counts_o2 <- mean_density_binned_graphlet_counts( + expected_counts_o2, expected_interval_indexes_o2 + ) + + # Check actual output vs expected + expect_equal( + actual_mean_density_binned_counts_o1, + expected_mean_density_binned_counts_o1 + ) + expect_equal( + actual_mean_density_binned_counts_o2, + expected_mean_density_binned_counts_o2 + ) +}) + +context("Measures Netdis: scale_graphlet_counts_ego for manually verified networks") +test_that("Ego-network 4-node graphlet counts match manually verified totals", { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set node and graphlet labels to use for row and col names in expected counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 max_graphlet_size <- 4 
- actual_counts_order_1 <- - count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1) - actual_counts_order_2 <- - count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2) + min_ego_edges <- 0 + min_ego_nodes <- 0 + + # Use previously tested functions to generate ego networks and calculate graphlet + # counts. + # ego nets + ego_networks_o1 <- make_named_ego_graph(graph, + order = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + ego_networks_o2 <- make_named_ego_graph(graph, + order = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + + # graphlet counts + graphlet_counts_o1 <- + ego_to_graphlet_counts(ego_networks_o1, + max_graphlet_size = max_graphlet_size + ) + graphlet_counts_o2 <- + ego_to_graphlet_counts(ego_networks_o2, + max_graphlet_size = max_graphlet_size + ) + + + # Calculate scaled counts with scale_graphlet_counts_ego + # (function to test). + actual_counts_o1 <- + scale_graphlet_counts_ego(graphlet_counts_o1, + max_graphlet_size = max_graphlet_size + ) + actual_counts_o2 <- + scale_graphlet_counts_ego(graphlet_counts_o2, + max_graphlet_size = max_graphlet_size + ) + graphlet_key <- graphlet_key(max_graphlet_size) k <- graphlet_key$node_count # Set manually verified counts # 1-step ego networks - expected_counts_order_1 <- rbind( + expected_counts_o1 <- rbind( c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), c(6, 0, 4, 0, 0, 0, 0, 0, 1) 
/ zeros_to_ones(choose(4, k)), c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) ) - rownames(expected_counts_order_1) <- node_labels - colnames(expected_counts_order_1) <- graphlet_labels + rownames(expected_counts_o1) <- node_labels + colnames(expected_counts_o1) <- graphlet_labels # 2-step ego networks - expected_counts_order_2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10 , k)), - c( 8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7 , k)), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5 , k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8 , k)), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5 , k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8 , k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8 , k)), - c(11, 10, 5, 10 ,0 ,1, 8, 0, 1) / zeros_to_ones(choose(7 , k)), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)) - ) - rownames(expected_counts_order_2) <- node_labels - colnames(expected_counts_order_2) <- graphlet_labels - - # Test that actual counts match expected with only counts requested (default) - expect_equal(actual_counts_order_1, expected_counts_order_1) - expect_equal(actual_counts_order_2, expected_counts_order_2) - - # Test that actual counts and returned ego networks match expected - # 1. Define expected - expected_ego_networks_order_1 <- make_named_ego_graph(graph, order = 1) - expected_ego_networks_order_2 <- make_named_ego_graph(graph, order = 2) - expected_counts_with_networks_order_1 <- - list(graphlet_counts = expected_counts_order_1, - ego_networks = expected_ego_networks_order_1) - expected_counts_with_networks_order_2 <- - list(graphlet_counts = expected_counts_order_2, - ego_networks = expected_ego_networks_order_2) - # 2. 
Calculate actual - actual_counts_with_networks_order_1 <- - count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, return_ego_networks = TRUE) - actual_counts_with_networks_order_2 <- - count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, return_ego_networks = TRUE) - - # 3. Compare - # Comparison is not implemented for igraph objects, so convert all igraphs to - # indexed edge list and then compare. Do in-situ replacement of igraphs with - # indexed edge lists to ensure we are checking full properties of returned - # objects (i.e. named lists with matching elements). - # 3a. Convert expected and actual ego networks from igraphs to indexed edges - expected_counts_with_networks_order_1$ego_networks <- - purrr::map(expected_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges) - expected_counts_with_networks_order_2$ego_networks <- - purrr::map(expected_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges) - actual_counts_with_networks_order_1$ego_networks <- - purrr::map(actual_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges) - actual_counts_with_networks_order_2$ego_networks <- - purrr::map(actual_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges) - # 3b. 
Do comparison - expect_equal(actual_counts_with_networks_order_1, - expected_counts_with_networks_order_1) - expect_equal(actual_counts_with_networks_order_2, - expected_counts_with_networks_order_2) + expected_counts_o2 <- rbind( + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) + ) + rownames(expected_counts_o2) <- node_labels + colnames(expected_counts_o2) <- graphlet_labels + + # Test that actual counts match expected + expect_equal(actual_counts_o1, expected_counts_o1) + expect_equal(actual_counts_o2, expected_counts_o2) }) -context("Measures Netdis: Ego-network density-binned reference counts for manually verified networks") -test_that("Ego-network 4-node density-binned reference counts match manually verified totals",{ +context("Measures Netdis: Ego-network density-binned counts for manually verified networks") +test_that("density_binned_counts output matches manually verified totals with different scaling and aggregation functions", { # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + 
c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - + # Set parameters for test - max_graphlet_size = 4 + max_graphlet_size <- 4 min_counts_per_interval <- 2 num_intervals <- 100 - + min_ego_edges <- 0 + min_ego_nodes <- 0 + # Set node and graphlet labels to use for row and col names in expected counts node_labels <- igraph::V(graph)$name graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") # Set manually verified ego-network node counts and edge densities - #1 . Ego-networks of order 1 + # 1 . Ego-networks of order 1 expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) @@ -278,7 +520,7 @@ test_that("Ego-network 4-node density-binned reference counts match manually ver expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) # Order 2 expected densities should be: # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 - + # Set manually verified density bins for ego-networks # 1. Ego-networks of order 1 expected_breaks_o1 <- c(0.5, 0.605, 0.705, 1) @@ -288,15 +530,10 @@ test_that("Ego-network 4-node density-binned reference counts match manually ver interval_indexes = expected_interval_indexes_o1, breaks = expected_breaks_o1 ) - # Check binned densities are as expected - actual_binned_densities_o1 <- binned_densities_adaptive( - expected_densities_o1, min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals) - expect_equal(actual_binned_densities_o1, expected_binned_densities_o1) # 2. 
Ego-networks of order 2 - expected_min_break_o2 <- 1/3 + expected_min_break_o2 <- 1 / 3 expected_max_break_o2 <- 0.6 - expected_initial_interval_o2 <- + expected_initial_interval_o2 <- (expected_max_break_o2 - expected_min_break_o2) / (num_intervals) # 0.00266666667 expected_breaks_o2 <- expected_min_break_o2 + (expected_initial_interval_o2 * c(0, 9, 50, 63, 100)) expected_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) @@ -305,12 +542,7 @@ test_that("Ego-network 4-node density-binned reference counts match manually ver interval_indexes = expected_interval_indexes_o2, breaks = expected_breaks_o2 ) - # Check binned densities are as expected - actual_binned_densities_o2 <- binned_densities_adaptive( - expected_densities_o2, min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals) - expect_equal(actual_binned_densities_o2, expected_binned_densities_o2) - + # Set manually verified scaled ego-network graphlet counts graphlet_key <- graphlet_key(max_graphlet_size) k <- graphlet_key$node_count @@ -322,7 +554,7 @@ test_that("Ego-network 4-node density-binned reference counts match manually ver c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) @@ -331,80 +563,224 @@ test_that("Ego-network 4-node density-binned reference counts match manually ver colnames(expected_counts_o1) <- graphlet_labels # 2-step ego networks expected_counts_o2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10 , k)), - c( 8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7 , k)), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0) / 
zeros_to_ones(choose(5 , k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8 , k)), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5 , k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8 , k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8 , k)), - c(11, 10, 5, 10 ,0 ,1, 8, 0, 1) / zeros_to_ones(choose(7 , k)), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)) + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) ) rownames(expected_counts_o2) <- node_labels colnames(expected_counts_o2) <- graphlet_labels - + # Calculate binned average expected counts based on manually verified counts # and density bins # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 - mean_counts_bin1_o1 <- (expected_counts_o1[1,] + expected_counts_o1[2,]) / 2 - mean_counts_bin2_o1 <- (expected_counts_o1[6,] + expected_counts_o1[7,] + - expected_counts_o1[8,]) / 3 - mean_counts_bin3_o1 <- (expected_counts_o1[3,] + expected_counts_o1[4,] + - expected_counts_o1[5,] + expected_counts_o1[9,] + - expected_counts_o1[10,]) / 5 + mean_counts_bin1_o1 <- (expected_counts_o1[1, ] + expected_counts_o1[2, ]) / 2 + mean_counts_bin2_o1 <- (expected_counts_o1[6, ] + expected_counts_o1[7, ] + + expected_counts_o1[8, ]) / 3 + mean_counts_bin3_o1 <- (expected_counts_o1[3, ] + expected_counts_o1[4, ] + + 
expected_counts_o1[5, ] + expected_counts_o1[9, ] + + expected_counts_o1[10, ]) / 5 expected_mean_density_binned_counts_o1 <- rbind( mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 ) rownames(expected_mean_density_binned_counts_o1) <- 1:3 # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 - mean_counts_bin1_o2 <- (expected_counts_o2[1,] + expected_counts_o2[4,]) / 2 - mean_counts_bin2_o2 <- (expected_counts_o2[2,] + expected_counts_o2[6,] + - expected_counts_o2[7,]) / 3 - mean_counts_bin3_o2 <- (expected_counts_o2[3,] + expected_counts_o2[5,]) / 2 - mean_counts_bin4_o2 <- (expected_counts_o2[8,] + expected_counts_o2[9,] + - expected_counts_o2[10,]) / 3 + mean_counts_bin1_o2 <- (expected_counts_o2[1, ] + expected_counts_o2[4, ]) / 2 + mean_counts_bin2_o2 <- (expected_counts_o2[2, ] + expected_counts_o2[6, ] + + expected_counts_o2[7, ]) / 3 + mean_counts_bin3_o2 <- (expected_counts_o2[3, ] + expected_counts_o2[5, ]) / 2 + mean_counts_bin4_o2 <- (expected_counts_o2[8, ] + expected_counts_o2[9, ] + + expected_counts_o2[10, ]) / 3 expected_mean_density_binned_counts_o2 <- rbind( - mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, + mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, mean_counts_bin4_o2 ) rownames(expected_mean_density_binned_counts_o2) <- 1:4 - - # Calculate actual output of function under test - actual_mean_density_binned_counts_o1 <- mean_density_binned_graphlet_counts( - expected_counts_o1, expected_interval_indexes_o1) - actual_mean_density_binned_counts_o2 <- mean_density_binned_graphlet_counts( - expected_counts_o2, expected_interval_indexes_o2) - + + # density_binned_counts with default arguments should give + # mean graphlet count in each density bin + actual_density_binned_counts_o1 <- density_binned_counts( + expected_counts_o1, + expected_interval_indexes_o1 + ) + + actual_density_binned_counts_o2 <- density_binned_counts( + expected_counts_o2, + expected_interval_indexes_o2 + ) + + # 
Check actual output vs expected + expect_equal( + actual_density_binned_counts_o1, + expected_mean_density_binned_counts_o1 + ) + expect_equal( + actual_density_binned_counts_o2, + expected_mean_density_binned_counts_o2 + ) + + # Calculate max binned counts based on manually verified counts + # and density bins + # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 + # apply(x, 2, max): returns max of each column in x + max_counts_bin1_o1 <- apply(rbind(expected_counts_o1[1, ], expected_counts_o1[2, ]), 2, max) + max_counts_bin2_o1 <- apply(rbind( + expected_counts_o1[6, ], expected_counts_o1[7, ], + expected_counts_o1[8, ] + ), 2, max) + max_counts_bin3_o1 <- apply(rbind( + expected_counts_o1[3, ], expected_counts_o1[4, ], + expected_counts_o1[5, ], expected_counts_o1[9, ], + expected_counts_o1[10, ] + ), 2, max) + + expected_max_density_binned_counts_o1 <- rbind( + max_counts_bin1_o1, max_counts_bin2_o1, max_counts_bin3_o1 + ) + rownames(expected_max_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 + max_counts_bin1_o2 <- apply(rbind(expected_counts_o2[1, ], expected_counts_o2[4, ]), 2, max) + max_counts_bin2_o2 <- apply(rbind( + expected_counts_o2[2, ], expected_counts_o2[6, ], + expected_counts_o2[7, ] + ), 2, max) + max_counts_bin3_o2 <- apply(rbind(expected_counts_o2[3, ], expected_counts_o2[5, ]), 2, max) + max_counts_bin4_o2 <- apply(rbind( + expected_counts_o2[8, ], expected_counts_o2[9, ], + expected_counts_o2[10, ] + ), 2, max) + + expected_max_density_binned_counts_o2 <- rbind( + max_counts_bin1_o2, max_counts_bin2_o2, max_counts_bin3_o2, + max_counts_bin4_o2 + ) + rownames(expected_max_density_binned_counts_o2) <- 1:4 + + # density_binned_counts with agg_fn = max should give + # max graphlet count in each density bin + agg_fn <- max + scale_fn <- NULL + + actual_max_density_binned_counts_o1 <- density_binned_counts( + expected_counts_o1, + expected_interval_indexes_o1, + agg_fn = 
agg_fn, + scale_fn = scale_fn + ) + + actual_max_density_binned_counts_o2 <- density_binned_counts( + expected_counts_o2, + expected_interval_indexes_o2, + agg_fn = agg_fn, + scale_fn = scale_fn + ) + + # Check actual output vs expected + expect_equal( + actual_max_density_binned_counts_o1, + expected_max_density_binned_counts_o1 + ) + expect_equal( + actual_max_density_binned_counts_o2, + expected_max_density_binned_counts_o2 + ) + + # density_binned_counts with scale_fn = scale_graphlet_counts_ego + # should give mean graphlet counts in each density bin scaled by + # count_graphlet_tuples. + agg_fn <- mean + scale_fn <- scale_graphlet_counts_ego + + # calculate expected counts using previously tested function + expected_scaled_counts_o1 <- + scale_graphlet_counts_ego(expected_counts_o1, + max_graphlet_size = max_graphlet_size + ) + expected_scaled_counts_o2 <- + scale_graphlet_counts_ego(expected_counts_o2, + max_graphlet_size = max_graphlet_size + ) + + # calculate mean expected counts using expected density bins + mean_scaled_counts_bin1_o1 <- (expected_scaled_counts_o1[1, ] + expected_scaled_counts_o1[2, ]) / 2 + mean_scaled_counts_bin2_o1 <- (expected_scaled_counts_o1[6, ] + expected_scaled_counts_o1[7, ] + + expected_scaled_counts_o1[8, ]) / 3 + mean_scaled_counts_bin3_o1 <- (expected_scaled_counts_o1[3, ] + expected_scaled_counts_o1[4, ] + + expected_scaled_counts_o1[5, ] + expected_scaled_counts_o1[9, ] + + expected_scaled_counts_o1[10, ]) / 5 + expected_scaled_density_binned_counts_o1 <- rbind( + mean_scaled_counts_bin1_o1, mean_scaled_counts_bin2_o1, mean_scaled_counts_bin3_o1 + ) + rownames(expected_scaled_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 + mean_scaled_counts_bin1_o2 <- (expected_scaled_counts_o2[1, ] + expected_scaled_counts_o2[4, ]) / 2 + mean_scaled_counts_bin2_o2 <- (expected_scaled_counts_o2[2, ] + expected_scaled_counts_o2[6, ] + + expected_scaled_counts_o2[7, ]) / 3 + 
mean_scaled_counts_bin3_o2 <- (expected_scaled_counts_o2[3, ] + expected_scaled_counts_o2[5, ]) / 2 + mean_scaled_counts_bin4_o2 <- (expected_scaled_counts_o2[8, ] + expected_scaled_counts_o2[9, ] + + expected_scaled_counts_o2[10, ]) / 3 + expected_scaled_density_binned_counts_o2 <- rbind( + mean_scaled_counts_bin1_o2, mean_scaled_counts_bin2_o2, mean_scaled_counts_bin3_o2, + mean_scaled_counts_bin4_o2 + ) + rownames(expected_scaled_density_binned_counts_o2) <- 1:4 + + # Calculate scaled binned counts with density_binned_counts (function to test) + actual_scaled_density_binned_counts_o1 <- density_binned_counts( + expected_counts_o1, + expected_interval_indexes_o1, + agg_fn = agg_fn, + scale_fn = scale_fn, + max_graphlet_size = max_graphlet_size + ) + + actual_scaled_density_binned_counts_o2 <- density_binned_counts( + expected_counts_o2, + expected_interval_indexes_o2, + agg_fn = agg_fn, + scale_fn = scale_fn, + max_graphlet_size = max_graphlet_size + ) + # Check actual output vs expected - expect_equal(actual_mean_density_binned_counts_o1, - expected_mean_density_binned_counts_o1) - expect_equal(actual_mean_density_binned_counts_o2, - expected_mean_density_binned_counts_o2) + expect_equal( + actual_scaled_density_binned_counts_o1, + expected_scaled_density_binned_counts_o1 + ) + expect_equal( + actual_scaled_density_binned_counts_o2, + expected_scaled_density_binned_counts_o2 + ) }) context("Measures Netdis: Expected graphlet counts") -test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { +test_that("netdis_expected_counts_ego works for graphlets up to 4 nodes", { # Helper function to generate graphs with known density and number of nodes rand_graph <- function(num_nodes, density) { max_edges <- choose(num_nodes, 2) num_edges <- density * max_edges - igraph::erdos.renyi.game(num_nodes, num_edges , "gnm", - loops = FALSE, directed = FALSE) + igraph::erdos.renyi.game(num_nodes, num_edges, "gnm", + loops = FALSE, directed = FALSE + ) } # 
Set up some dummy reference density breaks and scaled reference counts density_breaks <- c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0) scaled_reference_counts <- rbind( - c( 1, 2, 3, 4, 5, 6, 7, 8, 9), + c(1, 2, 3, 4, 5, 6, 7, 8, 9), c(11, 12, 13, 14, 15, 16, 17, 18, 19), c(21, 22, 23, 24, 25, 26, 27, 28, 29), c(31, 32, 33, 34, 35, 36, 37, 38, 39), c(41, 42, 43, 44, 45, 46, 47, 48, 49), c(51, 52, 53, 54, 55, 56, 57, 58, 59), - c(61, 62, 63, 64, 65, 66, 67, 68 ,69), + c(61, 62, 63, 64, 65, 66, 67, 68, 69), c(71, 72, 73, 74, 75, 76, 77, 78, 79), - c(81, 82, 83, 84, 85, 86 ,87, 88, 89), + c(81, 82, 83, 84, 85, 86, 87, 88, 89), c(91, 92, 93, 94, 95, 96, 97, 98, 99) ) graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") @@ -412,66 +788,111 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { rownames(scaled_reference_counts) <- 1:10 graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) names(graphlet_sizes) <- graphlet_labels - max_graphlet_size = 4 - + max_graphlet_size <- 4 + # Generate some test graphs densities <- c(0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95) density_indexes <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) num_nodes <- rep(120, 10) graphs <- purrr::map2(num_nodes, densities, rand_graph) - + graphlet_counts <- purrr::map(graphs, + count_graphlets_for_graph, + max_graphlet_size = max_graphlet_size + ) + # WITH scale_fn = NULL (bin counts directly with no scaling) + # Helper function to calculate expected expected graphlet counts + expected_expected_graphlet_counts_fn <- function(density_index) { + scaled_reference_counts[density_index, ] + } + # Determine expected and actual expected graphlet counts + expected_expected_graphlet_counts <- + purrr::map(density_indexes, expected_expected_graphlet_counts_fn) + actual_expected_graphlet_counts <- + purrr::map(graphlet_counts, netdis_expected_counts_ego, + max_graphlet_size = max_graphlet_size, + density_breaks = density_breaks, + 
density_binned_reference_counts = scaled_reference_counts, + scale_fn = NULL + ) + + # Loop over each graph and compare expected with actual + # NOTE: v2.0.0 of testthat library made a breaking change that means using + # map, mapply etc can cause failures under certain conditions + # See: https://github.com/r-lib/testthat/releases/tag/v2.0.0 + for (i in 1:length(actual_expected_graphlet_counts)) { + expect_equal( + actual_expected_graphlet_counts[i], + expected_expected_graphlet_counts[i] + ) + } + + # WITH scale_fn = count_graphlet_tuples (default netdis from paper) # Helper function to calculate expected expected graphlet counts expected_expected_graphlet_counts_fn <- function(density_index, node_count) { - reference_counts <- scaled_reference_counts[density_index,] + reference_counts <- scaled_reference_counts[density_index, ] reference_counts * choose(node_count, graphlet_sizes) - } # Determine expected and actual expected graphlet counts - expected_expected_graphlet_counts <- + expected_expected_graphlet_counts <- purrr::map2(density_indexes, num_nodes, expected_expected_graphlet_counts_fn) - actual_expected_graphlet_counts <- - purrr::map(graphs, netdis_expected_graphlet_counts, - max_graphlet_size = max_graphlet_size, - density_breaks = density_breaks, - density_binned_reference_counts = scaled_reference_counts) + actual_expected_graphlet_counts <- + purrr::map(graphlet_counts, netdis_expected_counts_ego, + max_graphlet_size = max_graphlet_size, + density_breaks = density_breaks, + density_binned_reference_counts = scaled_reference_counts, + scale_fn = count_graphlet_tuples + ) # Loop over each graph and compare expected with actual # NOTE: v2.0.0 of testthat library made a breaking change that means using # map, mapply etc can cause failures under certain conditions # See: https://github.com/r-lib/testthat/releases/tag/v2.0.0 - for(i in 1:length(actual_expected_graphlet_counts)) { - expect_equal(actual_expected_graphlet_counts[i], - 
expected_expected_graphlet_counts[i]) + for (i in 1:length(actual_expected_graphlet_counts)) { + expect_equal( + actual_expected_graphlet_counts[i], + expected_expected_graphlet_counts[i] + ) } }) -test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes", { +test_that("netdis_expected_counts works for graphlets up to 4 nodes", { # Helper function to generate graphs with known density and number of nodes # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) - max_graphlet_size = 4 - # Make graph ego networks - ego_networks_o1 <- make_named_ego_graph(graph, order = 1) - ego_networks_o2 <- make_named_ego_graph(graph, order = 2) + max_graphlet_size <- 4 + min_ego_edges <- 0 + min_ego_nodes <- 0 + + # Get ego network graphlet counts + graphlet_counts_ego_o1 <- count_graphlets_ego(graph, + neighbourhood_size = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + graphlet_counts_ego_o2 <- count_graphlets_ego(graph, + neighbourhood_size = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) # Set manually-verified node counts and densities # 1. 
Ego-networks of order 1 num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) @@ -491,24 +912,28 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes density_indexes_o2 <- c(4, 4, 5, 4, 5, 5, 5, 6, 6, 6) # Set dummy reference counts scaled_reference_counts <- rbind( - c( 1, 2, 3, 4, 5, 6, 7, 8, 9), + c(1, 2, 3, 4, 5, 6, 7, 8, 9), c(11, 12, 13, 14, 15, 16, 17, 18, 19), c(21, 22, 23, 24, 25, 26, 27, 28, 29), c(31, 32, 33, 34, 35, 36, 37, 38, 39), c(41, 42, 43, 44, 45, 46, 47, 48, 49), c(51, 52, 53, 54, 55, 56, 57, 58, 59), - c(61, 62, 63, 64, 65, 66, 67, 68 ,69), + c(61, 62, 63, 64, 65, 66, 67, 68, 69), c(71, 72, 73, 74, 75, 76, 77, 78, 79), - c(81, 82, 83, 84, 85, 86 ,87, 88, 89), + c(81, 82, 83, 84, 85, 86, 87, 88, 89), c(91, 92, 93, 94, 95, 96, 97, 98, 99) ) + colnames(scaled_reference_counts) <- graphlet_labels expected_dims <- dim(scaled_reference_counts) - min_ego_nodes = 3 - min_ego_edges = 1 - + min_ego_nodes <- 3 + min_ego_edges <- 1 + + #------------------------------------------------------- + # With scale_fn = count_graphlet_tuples (default netdis paper) + #------------------------------------------------------- # Helper function to calculate expected expected graphlet counts expected_expected_graphlet_counts_fn <- function(density_index, node_count) { - reference_counts <- scaled_reference_counts[density_index,] + reference_counts <- scaled_reference_counts[density_index, ] reference_counts * choose(node_count, graphlet_sizes) } # Calculate expected graphlet counts. 
NOTE: We expect a matrix with graphlet @@ -527,350 +952,505 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels # Set row labels to ego network names - rownames(expected_expected_graphlet_counts_ego_o1) <- names(ego_networks_o1) - rownames(expected_expected_graphlet_counts_ego_o2) <- names(ego_networks_o2) - # Drop rows for nodes with ewer than minumum required nodes and edges in ego - # network - expected_expected_graphlet_counts_ego_o1 <- - expected_expected_graphlet_counts_ego_o1[ - (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), - ] - expected_expected_graphlet_counts_ego_o2 <- - expected_expected_graphlet_counts_ego_o2[ - (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), - ] - + rownames(expected_expected_graphlet_counts_ego_o1) <- rownames(graphlet_counts_ego_o1) + rownames(expected_expected_graphlet_counts_ego_o2) <- rownames(graphlet_counts_ego_o2) + # Calculate actual output of function under test - actual_expected_graphlet_counts_ego_o1 <- - netdis_expected_graphlet_counts_ego( - graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, density_breaks = breaks, - density_binned_reference_counts = scaled_reference_counts, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges) - actual_expected_graphlet_counts_ego_o2 <- - netdis_expected_graphlet_counts_ego( - graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, density_breaks = breaks, - density_binned_reference_counts = scaled_reference_counts, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges) - - # Compare actual to expected - expect_equal(actual_expected_graphlet_counts_ego_o1, - actual_expected_graphlet_counts_ego_o1) - expect_equal(actual_expected_graphlet_counts_ego_o2, - expected_expected_graphlet_counts_ego_o2) -}) + 
actual_expected_graphlet_counts_ego_o1 <- + netdis_expected_counts( + graphlet_counts_ego_o1, + breaks, + scaled_reference_counts, + max_graphlet_size, + scale_fn = count_graphlet_tuples + ) + actual_expected_graphlet_counts_ego_o2 <- + netdis_expected_counts( + graphlet_counts_ego_o2, + breaks, + scaled_reference_counts, + max_graphlet_size, + scale_fn = count_graphlet_tuples + ) -test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 nodes", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + # Compare actual to expected + expect_equal( + actual_expected_graphlet_counts_ego_o1, + expected_expected_graphlet_counts_ego_o1 ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) - names(graphlet_sizes) <- graphlet_labels - max_graphlet_size = 4 - # Make graph ego networks - ego_networks_o1 <- make_named_ego_graph(graph, order = 1) - ego_networks_o2 <- make_named_ego_graph(graph, order = 2) - # Set manually-verified node counts and densities - # 1. Ego-networks of order 1 - num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) - num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) - max_edges_o1 <- choose(num_nodes_o1, 2) - densities_o1 <- num_edges_o1 / max_edges_o1 - # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 - # 2. 
Ego-networks of order 2 - num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) - num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) - max_edges_o2 <- choose(num_nodes_o2, 2) - densities_o2 <- num_edges_o2 / max_edges_o2 - # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 - # Set manually determined density breaks and indexes, based on a min bin count - # of 2 and an initial request for 100 bins - min_bin_count = 2 - num_bins = 100 - num_breaks = num_bins + 1 - min_density_o1 <- 0.5 - max_density_o1 <- 1.0 - breaks_o1 <- seq(min_density_o1, max_density_o1,length.out = num_breaks)[c(1, 22, 42, 101)] - density_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) - min_density_o2 <- 1/3 - max_density_o2 <- 0.6 - breaks_o2 <- seq(min_density_o2, max_density_o2,length.out = num_breaks)[c(1, 10, 51, 64, 101)] - density_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) - # Guard against errors in manually determined breaks and indexes by checking - # against already tested code. This also lets us ensure we handle densities - # falling exactly on a bin boundary the same as the code under test. 
- comp_binned_densities_o1 <- binned_densities_adaptive( - densities_o1, min_counts_per_interval = min_bin_count, - num_intervals = num_bins) - comp_binned_densities_o2 <- binned_densities_adaptive( - densities_o2, min_counts_per_interval = min_bin_count, - num_intervals = num_bins) - expect_equal(comp_binned_densities_o1, - list(densities = densities_o1, - interval_indexes = density_indexes_o1, - breaks = breaks_o1)) - expect_equal(comp_binned_densities_o2, - list(densities = densities_o2, - interval_indexes = density_indexes_o2, - breaks = breaks_o2)) - - # Set manually verified scaled ego-network graphlet counts - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # 1-step ego networks - scaled_reference_counts_o1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) + expect_equal( + actual_expected_graphlet_counts_ego_o2, + expected_expected_graphlet_counts_ego_o2 ) - # 2-step ego networks - scaled_reference_counts_o2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10 , k)), - c( 8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7 , k)), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5 , k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8 , k)), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5 , k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8 , k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / 
zeros_to_ones(choose(8 , k)), - c(11, 10, 5, 10 ,0 ,1, 8, 0, 1) / zeros_to_ones(choose(7 , k)), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)) - ) - min_ego_nodes = 3 - min_ego_edges = 1 - # Drop rows for nodes with ewer than minumum required nodes and edges in ego - # network - scaled_reference_counts_o1 <- - scaled_reference_counts_o1[ - (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), - ] - scaled_reference_counts_o2 <- - scaled_reference_counts_o2[ - (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), - ] - density_indexes_o1 <- density_indexes_o1[ - (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges) - ] - density_indexes_o2 <- density_indexes_o2[ - (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges) - ] - # Average manually verified scaled reference counts across density bins - density_binned_reference_counts_o1 <- rbind( - (scaled_reference_counts_o1[1,] + scaled_reference_counts_o1[2,]) / 2, - (scaled_reference_counts_o1[4,] + scaled_reference_counts_o1[5,] + - scaled_reference_counts_o1[6,]) / 3, - ( scaled_reference_counts_o1[3,] + - scaled_reference_counts_o1[7,] + - scaled_reference_counts_o1[8,]) / 3 - ) - rownames(density_binned_reference_counts_o1) <- 1:3 - density_binned_reference_counts_o2 <- rbind( - (scaled_reference_counts_o2[1,] + scaled_reference_counts_o2[4,]) / 2, - (scaled_reference_counts_o2[2,] + scaled_reference_counts_o2[6,] + - scaled_reference_counts_o2[7,]) / 3, - (scaled_reference_counts_o2[3,] + scaled_reference_counts_o2[5,]) / 2, - (scaled_reference_counts_o2[8,] + scaled_reference_counts_o2[9,] + - scaled_reference_counts_o2[10,]) / 3 - ) - rownames(density_binned_reference_counts_o2) <- 1:4 - - # Helper functions to calculate expected expected graphlet counts - expected_expected_graphlet_counts_o1_fn <- function(density_index, node_count) { - reference_counts <- 
density_binned_reference_counts_o1[density_index,] - reference_counts * choose(node_count, graphlet_sizes) - } - expected_expected_graphlet_counts_o2_fn <- function(density_index, node_count) { - reference_counts <- density_binned_reference_counts_o2[density_index,] - reference_counts * choose(node_count, graphlet_sizes) + + #------------------------------------------------------- + # With scale_fn = NULL (take reference counts directly) + #------------------------------------------------------- + # Helper function to calculate expected expected graphlet counts + expected_expected_graphlet_counts_fn <- function(density_index) { + scaled_reference_counts[density_index, ] } - # Calculate expected graphlet counts - expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( - density_indexes_o1, num_nodes_o1[(num_nodes_o1 >= min_ego_nodes)], - expected_expected_graphlet_counts_o1_fn + # Calculate expected graphlet counts. NOTE: We expect a matrix with graphlet + # types as columns and ego networks for nodes in graph as rows + expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map( + density_indexes_o1, expected_expected_graphlet_counts_fn ))) - rownames(expected_expected_graphlet_counts_ego_o1) <- - names(ego_networks_o1[(num_nodes_o1 >= min_ego_nodes)]) - expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( - density_indexes_o2, num_nodes_o2[(num_nodes_o2 >= min_ego_nodes)], - expected_expected_graphlet_counts_o2_fn + expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map( + density_indexes_o2, expected_expected_graphlet_counts_fn ))) - rownames(expected_expected_graphlet_counts_ego_o2) <- - names(ego_networks_o2[(num_nodes_o2 >= min_ego_nodes)]) - - # Sanity check manually derived expected expected counts by comparing against - # pre-tested fully applied expected_graphlet_counts_ego function - expect_equal(expected_expected_graphlet_counts_ego_o1, - netdis_expected_graphlet_counts_ego( - graph, 
max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - density_breaks = breaks_o1, - density_binned_reference_counts_o1, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges - )) - expect_equal(expected_expected_graphlet_counts_ego_o2, - netdis_expected_graphlet_counts_ego( - graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, - density_breaks = breaks_o2, - density_binned_reference_counts_o2, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges - )) - - # Generate partially applied functions using function under test - actual_expected_graphlet_counts_ego_fn_o1 <- - netdis_expected_graphlet_counts_ego_fn( - graph, max_graphlet_size = max_graphlet_size, neighbourhood_size = 1, - min_bin_count = min_bin_count, num_bins = num_bins) - actual_expected_graphlet_counts_ego_fn_o2 <- - netdis_expected_graphlet_counts_ego_fn( - graph, max_graphlet_size = max_graphlet_size, neighbourhood_size = 2, - min_bin_count = min_bin_count, num_bins = num_bins) - # Generate actual expected accounts by applying generated functions to test - # graph - actual_expected_graphlet_counts_ego_o1 <- - actual_expected_graphlet_counts_ego_fn_o1(graph) - actual_expected_graphlet_counts_ego_o2 <- - actual_expected_graphlet_counts_ego_fn_o2(graph) - + # Sanity check for expected output shape. 
Should be matrix with graphlet types + # as columns and nodes as rows + expect_equal(dim(expected_expected_graphlet_counts_ego_o1), expected_dims) + expect_equal(dim(expected_expected_graphlet_counts_ego_o2), expected_dims) + # Set column labels to graphlet names + colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels + colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels + # Set row labels to ego network names + rownames(expected_expected_graphlet_counts_ego_o1) <- rownames(graphlet_counts_ego_o1) + rownames(expected_expected_graphlet_counts_ego_o2) <- rownames(graphlet_counts_ego_o2) + + # Calculate actual output of function under test + actual_expected_graphlet_counts_ego_o1 <- + netdis_expected_counts( + graphlet_counts_ego_o1, + breaks, + scaled_reference_counts, + max_graphlet_size, + scale_fn = NULL + ) + actual_expected_graphlet_counts_ego_o2 <- + netdis_expected_counts( + graphlet_counts_ego_o2, + breaks, + scaled_reference_counts, + max_graphlet_size, + scale_fn = NULL + ) + # Compare actual to expected - expect_equal(actual_expected_graphlet_counts_ego_o1, - expected_expected_graphlet_counts_ego_o1) - expect_equal(actual_expected_graphlet_counts_ego_o2, - expected_expected_graphlet_counts_ego_o2) + expect_equal( + actual_expected_graphlet_counts_ego_o1, + expected_expected_graphlet_counts_ego_o1 + ) + expect_equal( + actual_expected_graphlet_counts_ego_o2, + expected_expected_graphlet_counts_ego_o2 + ) +}) + +context("Netdis: Statistic calculation") +test_that("netdis statistic function output matches manually verified result", { + # arbitrary counts of correct size for graphlets up to size 5 + counts_1 <- c( + 11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, + 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14 + ) + counts_2 <- c( + 12, 11, 6, 10, 15, 7, 10, 8, 7, 7, 7, 13, 9, 14, 7, 12, + 7, 10, 9, 11, 7, 7, 11, 8, 10, 14, 8, 16, 14, 10 + ) + + # add graphlet names + ids <- graphlet_key(5)$id + names(counts_1) <- 
ids + names(counts_2) <- ids + + # manually verified results + expected_netdis_3 <- 0.03418796 + expected_netdis_4 <- 0.02091792 + expected_netdis_5 <- 0.03826385 + + # check function to test + actual_netdis_3 <- netdis(counts_1, counts_2, 3) + actual_netdis_4 <- netdis(counts_1, counts_2, 4) + actual_netdis_5 <- netdis(counts_1, counts_2, 5) + + expect_equal(expected_netdis_3, actual_netdis_3) + expect_equal(expected_netdis_4, actual_netdis_4) + expect_equal(expected_netdis_5, actual_netdis_5) +}) +test_that("netdis_uptok gives expected netdis result for graphlets up to size k", { + # arbitrary counts of correct size for graphlets up to size 5 + counts_1 <- c( + 11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, + 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14 + ) + counts_2 <- c( + 12, 11, 6, 10, 15, 7, 10, 8, 7, 7, 7, 13, 9, 14, 7, 12, + 7, 10, 9, 11, 7, 7, 11, 8, 10, 14, 8, 16, 14, 10 + ) + + # add graphlet names + ids <- graphlet_key(5)$id + names(counts_1) <- ids + names(counts_2) <- ids + + # manually verified results + expected_netdis <- c(0.03418796, 0.02091792, 0.03826385) + names(expected_netdis) <- c("netdis3", "netdis4", "netdis5") + + # check function to test + actual_netdis <- netdis_uptok(counts_1, counts_2, 5) + + expect_equal(expected_netdis, actual_netdis) }) -context("Measures Netdis: Centered graphlet counts") -test_that("netdis_centred_graphlet_counts_ego is correct", { - # Set up small sample networks each with each graphlet represented in at least - # one ego network - ref_elist <- rbind( +context("Netdis: full calculation pipeline") +test_that("netdis_many_to_many gives expected result", { + # Set source directory for Virus PPI graph edge files + source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + + # Load query and reference graphs + graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + graphs <- graphs[c("EBV", "ECL", "HSV-1", "KSHV")] + + ref_path <- 
system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist" + ) + ref_graph <- read_simple_graph(ref_path, format = "ncol") + + # set parameters + max_graphlet_size <- 4 + neighbourhood_size <- 2 + min_ego_nodes <- 3 + min_ego_edges <- 1 + + # manually verified results + # $netdis + # [,1] [,2] [,3] [,4] [,5] [,6] + # netdis3 0.1846655 0.008264222 0.01005385 0.2065762 0.2091241 0.0001335756 + # netdis4 0.1749835 0.165264120 0.01969246 0.2917612 0.2215579 0.0760242643 + # + # $comp_spec + # name_a name_b index_a index_b + # 1 EBV ECL 1 2 + # 2 EBV HSV-1 1 3 + # 3 EBV KSHV 1 4 + # 4 ECL HSV-1 2 3 + # 5 ECL KSHV 2 4 + # 6 HSV-1 KSHV 3 4 + expected_netdis_netdis <- matrix(nrow = 2, ncol = 6) + expected_netdis_netdis[1, ] <- c( + 0.1846655, 0.008264222, 0.01005385, + 0.2065762, 0.2091241, 0.0001335756 + ) + expected_netdis_netdis[2, ] <- c( + 0.1749835, 0.165264120, 0.01969246, + 0.2917612, 0.2215579, 0.0760242643 + ) + rownames(expected_netdis_netdis) <- c("netdis3", "netdis4") + + expected_netdis_comp_spec <- cross_comparison_spec( + list( + "EBV" = c(), + "ECL" = c(), + "HSV-1" = c(), + "KSHV" = c() + ) + ) + + expected_netdis <- list( + netdis = expected_netdis_netdis, + comp_spec = expected_netdis_comp_spec + ) + + + # Calculate netdis statistics + actual_netdis <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + + # Check results as expected + expect_equal(expected_netdis, actual_netdis, tolerance = .001, scale = 1) +}) + +context("Netdis: functions for different pairwise comparisons") +test_that("netdis_one_to_one gives expected result", { + # Set source directory for Virus PPI graph edge files + source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + + # Load query and reference graphs + graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol" + ) 
+ + graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol" + ) + + ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist" + ) + ref_graph <- read_simple_graph(ref_path, format = "ncol") + + # set parameters + max_graphlet_size <- 4 + neighbourhood_size <- 2 + min_ego_nodes <- 3 + min_ego_edges <- 1 + + # manually verified results + expected_netdis <- c(0.1846655, 0.1749835) + names(expected_netdis) <- c("netdis3", "netdis4") + + # check function to test + actual_netdis <- netdis_one_to_one(graph_1, + graph_2, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + + expect_equal(expected_netdis, actual_netdis, tolerance = .001, scale = 1) +}) +test_that("netdis_one_to_many gives expected result", { + # Set source directory for Virus PPI graph edge files + source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + + # Load query and reference graphs + graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + graph_1 <- graphs$EBV + graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] + + ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist" + ) + ref_graph <- read_simple_graph(ref_path, format = "ncol") + + # set parameters + max_graphlet_size <- 4 + neighbourhood_size <- 2 + min_ego_nodes <- 3 + min_ego_edges <- 1 + + # manually verified results + # ECL HSV-1 KSHV VZV + # netdis3 0.1846655 0.008264222 0.01005385 0.006777578 + # netdis4 0.1749835 0.165264120 0.01969246 0.159711160 + expected_netdis <- matrix(nrow = 2, ncol = 4) + colnames(expected_netdis) <- c("ECL", "HSV-1", "KSHV", "VZV") + rownames(expected_netdis) <- c("netdis3", "netdis4") + expected_netdis[1, ] <- c(0.1846655, 0.008264222, 0.01005385, 0.006777578) + expected_netdis[2, ] <- c(0.1749835, 0.165264120, 0.01969246, 0.159711160) + + # 
Calculate netdis statistics + actual_netdis <- netdis_one_to_many(graph_1, graphs_compare, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + + # Check results as expected + expect_equal(expected_netdis, actual_netdis, tolerance = .001, scale = 1) +}) + +context("Netdis: error if no query graphs or graphlet counts provided") +test_that("netdis functions error when no query graphs provided", { + # dummy values to use for other parameters + ref_graph <- NULL + max_graphlet_size <- 4 + neighbourhood_size <- 2 + min_ego_nodes <- 3 + min_ego_edges <- 1 + comparisons <- "many-to-many" + + elist <- rbind( c("n1", "n2"), - c("n1", "n3"), + c("n2", "n3"), c("n1", "n4"), - c("n1", "n5"), + c("n2", "n5"), c("n1", "n6"), - c("n2", "n7"), - c("n2", "n8"), - c("n2", "n9"), - c("n9", "n10"), - c("n10", "n11"), - c("n11", "n12"), - c("n11", "n13"), - c("n2", "n14"), - c("n8", "n14"), - c("n12", "n15"), - c("n12", "n16"), - c("n15", "n17"), - c("n12", "n18"), - c("n15", "n18"), - c("n16", "n17"), - c("n16", "n18"), - c("n17", "n18"), - c("n16", "n19"), - c("n16", "n20"), - c("n16", "n21"), - c("n19", "n20"), - c("n19", "n21"), - c("n15", "n22"), - c("n15", "n23"), - c("n15", "n24"), - c("n22", "n23"), - c("n22", "n24"), - c("n23", "n24") - ) - ref_graph <- igraph::graph_from_edgelist(ref_elist, directed = FALSE) - - query_elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") - ) - query_graph <- igraph::graph_from_edgelist(query_elist, directed = FALSE) - - max_graphlet_size = 4 - # Use pre-tested functions to generate ego-network graphlet counts - # 1. 
Reference graph ego-network graphlet counts - ref_o1 <- count_graphlets_ego( - ref_graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, return_ego_networks = TRUE) - ego_counts_ref_o1 <- ref_o1$graphlet_counts - ego_networks_ref_o1 <- ref_o1$ego_networks - density_ref_o1 <- sapply(ego_networks_ref_o1, igraph::edge_density) - - ref_o2 <- count_graphlets_ego( - ref_graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, return_ego_networks = TRUE) - ego_counts_ref_o2 <- ref_o2$graphlet_counts - ego_networks_ref_o2 <- ref_o2$ego_networks - density_ref_o2 <- sapply(ego_networks_ref_o2, igraph::edge_density) - - # 2. Query graph ego-network graphlet countsa - query_o1 <- count_graphlets_ego( - query_graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, return_ego_networks = TRUE) - ego_counts_query_o1 <- query_o1$graphlet_counts - ego_networks_query_o1 <- query_o1$ego_networks - density_query_o1 <- sapply(ego_networks_query_o1, igraph::edge_density) - - query_o2 <- count_graphlets_ego( - query_graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, return_ego_networks = TRUE) - ego_counts_query_o2 <- query_o2$graphlet_counts - ego_networks_query_o2 <- query_o2$ego_networks - density_query_o2 <- sapply(ego_networks_query_o2, igraph::edge_density) - - centred_counts_k4 <- function(query_graphlet_count, ref_graphlet_count, - query_node_counts, ref_node_count, - min_nodes, min_edges, - min_bin_count, num_bins) { - graphlet_node_counts_k4 <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) - # 1. Calculate scaling factors for each reference and query graphlet count - # These are nCk, where n is the number of nodes in the network and - # k is the number of nodes in the graphlet - ref_scale_factor <- sapply( - graphlet_node_counts_k4, FUN <- function(k) {choose(ref_node_count, k)}) - query_scale_factor <- sapply( - graphlet_node_counts_k4, FUN <- function(k) {choose(query_node_count, k)}) - # 2. 
Calculate scaled reference counts by dividing by ref_scale_factor - ref_scaled_graphlet_count <- query_graphlet_count / ref_scale_factor - # - } -}) \ No newline at end of file + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # one to one function + expect_error( + netdis_one_to_one( + graph_1 = graph, + graph_2 = NULL, + ref_graph = ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + graphlet_counts_1 = NULL, + graphlet_counts_2 = NULL + ) + ) + expect_error( + netdis_one_to_one( + graph_1 = NULL, + graph_2 = graph, + ref_graph = ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + graphlet_counts_1 = NULL, + graphlet_counts_2 = NULL + ) + ) + + # one to many function + expect_error( + netdis_one_to_many( + graph_1 = graph, + graphs_compare = NULL, + ref_graph = ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + graphlet_counts_1 = NULL, + graphlet_counts_compare = NULL + ) + ) + expect_error( + netdis_one_to_many( + graph_1 = NULL, + graphs_compare = list(graph_1 = graph, graph_2 = graph), + ref_graph = ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + graphlet_counts_1 = NULL, + graphlet_counts_compare = NULL + ) + ) + # many to many function + expect_error( + netdis_many_to_many( + graphs = NULL, + comparisons = comparisons, + ref_graph = ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, 
+ min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + graphlet_counts = NULL + ) + ) +}) + +context("Netdis: constant expected counts") +test_that("netdis_centred_graphlet_counts correctly interprets numeric + ref_binned_graphlet_counts value", { + # dummy counts + graphlet_counts <- rbind( + c(1, 2, 3, 4, 5, 6, 7, 8, 9), + c(11, 12, 13, 14, 15, 16, 17, 18, 19), + c(21, 22, 23, 24, 25, 26, 27, 28, 29), + c(31, 32, 33, 34, 35, 36, 37, 38, 39), + c(41, 42, 43, 44, 45, 46, 47, 48, 49), + c(51, 52, 53, 54, 55, 56, 57, 58, 59), + c(61, 62, 63, 64, 65, 66, 67, 68, 69), + c(71, 72, 73, 74, 75, 76, 77, 78, 79), + c(81, 82, 83, 84, 85, 86, 87, 88, 89), + c(91, 92, 93, 94, 95, 96, 97, 98, 99) + ) + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + colnames(graphlet_counts) <- graphlet_labels + rownames(graphlet_counts) <- 1:10 + max_graphlet_size <- 4 + + # netdis_centred_graphlet_counts with ref_binned_graphlet_counts=0 should + # perform no centring, i.e. 
centred_counts should equal input graphlet_counts + centred_counts <- netdis_centred_graphlet_counts( + graphlet_counts = graphlet_counts, + ref_ego_density_bins = NULL, + ref_binned_graphlet_counts = 0, + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, + max_graphlet_size = 4 + ) + + expect_equal(centred_counts, graphlet_counts) +}) + +context("Netdis: Geometric Poisson Approximation") +test_that("netdis_one_to_one gives expected result when using geometric Poisson + approximation", { + # Set source directory for Virus PPI graph edge files + source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + + # Load query and reference graphs + graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol" + ) + + graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol" + ) + + # set parameters + max_graphlet_size <- 4 + neighbourhood_size <- 2 + min_ego_nodes <- 3 + min_ego_edges <- 1 + + # manually verified result for graphlets of size 4 + # verified using a different implementation of geometric poisson with these + # networks. 
+ expected_netdis4 <- 0.1892716 + + # check function to test + actual_netdis <- netdis_one_to_one(graph_1, + graph_2, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + ) + + expect_equal(expected_netdis4, actual_netdis[["netdis4"]], + tolerance = .0001, scale = 1 + ) +}) + +context("Netdis: error if max_graphlet_size is not 3, 4 or 5") +test_that("netdis_uptok errors for unsupported max_graphlet_size", { + # dummy counts values + counts_1 <- c( + 11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, + 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14 + ) + counts_2 <- c( + 12, 11, 6, 10, 15, 7, 10, 8, 7, 7, 7, 13, 9, 14, 7, 12, + 7, 10, 9, 11, 7, 7, 11, 8, 10, 14, 8, 16, 14, 10 + ) + ids <- graphlet_key(5)$id + names(counts_1) <- ids + names(counts_2) <- ids + + # graphlet size greater than 5 + expect_error(netdis_uptok(counts_1, counts_2, 6)) + + # graphlet size less than 3 + expect_error(netdis_uptok(counts_1, counts_2, 2)) +}) + +context("Netdis: works correctly when using a single density bin") +test_that("netdis single density bin works correctly", { + # TODO +}) diff --git a/tests/testthat/test_measures_net_emd.R b/tests/testthat/test_measures_net_emd.R index b3a4ea3d..6a99c71b 100644 --- a/tests/testthat/test_measures_net_emd.R +++ b/tests/testthat/test_measures_net_emd.R @@ -1,65 +1,92 @@ - self_net_emd <- function(histogram, shift, method) { - net_emd(histogram, shift_dhist(histogram, shift), method = method) - } - expected <- 0 - - locations <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) - masses <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) - histogram <- dhist(locations = locations, masses = masses) - - expect_equal(self_net_emd(histogram, shift = 1, "optimise"), expected) - expect_equal(self_net_emd(histogram, shift = 1, "exhaustive"), expected) - expect_equal(self_net_emd(histogram, shift = 0.5, "optimise"), expected) - 
expect_equal(self_net_emd(histogram, shift = 0.5, "exhaustive"), expected) - expect_equal(self_net_emd(histogram, shift = 0.1, "optimise"), expected) - expect_equal(self_net_emd(histogram, shift = 0.1, "exhaustive"), expected) - expect_equal(self_net_emd(histogram, shift = 0.05, "optimise"), expected) - expect_equal(self_net_emd(histogram, shift = 0.05, "exhaustive"), expected) - expect_equal(self_net_emd(histogram, shift = 0.01, "optimise"), expected) - expect_equal(self_net_emd(histogram, shift = 0.01, "exhaustive"), expected) - expect_equal(self_net_emd(histogram, shift = 0, "optimise"), expected) - expect_equal(self_net_emd(histogram, shift = 0, "exhaustive"), expected) - - expect_self_net_emd_correct <- function(histogram, shift, method, - return_details = FALSE) { - self_net_emd <- net_emd(histogram, shift_dhist(histogram, shift), - method = method, return_details = return_details) - loc=histogram$locations - mass=histogram$masses - var=sum(loc*loc*mass)/sum(mass)-(sum(loc*mass)/sum(mass))^2 - expected <- list(net_emd = 0, min_emds = 0, min_offsets = shift, - min_offsets_std = 0) - expect_equal(self_net_emd, expected) - } - - locations <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) - masses <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) - histogram <- dhist(locations = locations, masses = masses) - expect_self_net_emd_correct(histogram, shift = 1, "optimise", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 1, "exhaustive", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.5, "optimise", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.5, "exhaustive", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.1, "optimise", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.1, "exhaustive", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.05, "optimise", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 
0.05, "exhaustive", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.01, "optimise", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.01, "exhaustive", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0, "optimise", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0, "exhaustive", - return_details = TRUE) +self_net_emd <- function(histogram, shift, method) { + netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), method = method) +} +expected <- 0 + +locations <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) +masses <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) +histogram <- dhist(locations = locations, masses = masses) + +expect_equal(self_net_emd(histogram, shift = 1, "optimise"), expected) +expect_equal(self_net_emd(histogram, shift = 1, "exhaustive"), expected) +expect_equal(self_net_emd(histogram, shift = 0.5, "optimise"), expected) +expect_equal(self_net_emd(histogram, shift = 0.5, "exhaustive"), expected) +expect_equal(self_net_emd(histogram, shift = 0.1, "optimise"), expected) +expect_equal(self_net_emd(histogram, shift = 0.1, "exhaustive"), expected) +expect_equal(self_net_emd(histogram, shift = 0.05, "optimise"), expected) +expect_equal(self_net_emd(histogram, shift = 0.05, "exhaustive"), expected) +expect_equal(self_net_emd(histogram, shift = 0.01, "optimise"), expected) +expect_equal(self_net_emd(histogram, shift = 0.01, "exhaustive"), expected) +expect_equal(self_net_emd(histogram, shift = 0, "optimise"), expected) +expect_equal(self_net_emd(histogram, shift = 0, "exhaustive"), expected) + +expect_self_netemd_correct <- function(histogram, shift, method, + return_details = FALSE) { + self_net_emd <- netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), + method = method, return_details = return_details + ) + loc <- histogram$locations + mass <- histogram$masses + var <- sum(loc * loc * mass) / sum(mass) - 
(sum(loc * mass) / sum(mass))^2 + expected <- list( + net_emd = 0, min_emds = 0, min_offsets = shift, + min_offsets_std = 0 + ) + expect_equal(self_net_emd, expected) +} + +locations <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) +masses <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) +histogram <- dhist(locations = locations, masses = masses) +expect_self_netemd_correct(histogram, + shift = 1, "optimise", + return_details = TRUE +) +expect_self_netemd_correct(histogram, + shift = 1, "exhaustive", + return_details = TRUE +) +expect_self_netemd_correct(histogram, + shift = 0.5, "optimise", + return_details = TRUE +) +expect_self_netemd_correct(histogram, + shift = 0.5, "exhaustive", + return_details = TRUE +) +expect_self_netemd_correct(histogram, + shift = 0.1, "optimise", + return_details = TRUE +) +expect_self_netemd_correct(histogram, + shift = 0.1, "exhaustive", + return_details = TRUE +) +expect_self_netemd_correct(histogram, + shift = 0.05, "optimise", + return_details = TRUE +) +expect_self_netemd_correct(histogram, + shift = 0.05, "exhaustive", + return_details = TRUE +) +expect_self_netemd_correct(histogram, + shift = 0.01, "optimise", + return_details = TRUE +) +expect_self_netemd_correct(histogram, + shift = 0.01, "exhaustive", + return_details = TRUE +) +expect_self_netemd_correct(histogram, + shift = 0, "optimise", + return_details = TRUE +) +expect_self_netemd_correct(histogram, + shift = 0, "exhaustive", + return_details = TRUE +) test_that("net_emd returns 0 when comparing any normal histogram against itself (no offset)", { num_hists <- 5 @@ -70,7 +97,7 @@ test_that("net_emd returns 0 when comparing any normal histogram against itself rand_locations <- function(mu, sigma) { return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) - } + } rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { locations <- rand_locations(mu, sigma) @@ -80,23 +107,22 @@ test_that("net_emd returns 0 when comparing any normal histogram against itself expected <- 0 
actuals_opt <- purrr::map(rand_dhists, function(dhist) { - net_emd(dhist, dhist, method = "optimise") - }) + netemd_one_to_one(dhists_1 = dhist, dhists_2 = dhist, method = "optimise") + }) purrr::walk(actuals_opt, function(actual) { expect_equal(actual, expected) - }) + }) actuals_exhaustive_default <- purrr::map(rand_dhists, function(dhist) { - net_emd(dhist, dhist, method = "exhaustive") - }) + netemd_one_to_one(dhists_1 = dhist, dhists_2 = dhist, method = "exhaustive") + }) purrr::walk(actuals_exhaustive_default, function(actual) { expect_equal(actual, expected) - }) + }) }) test_that("net_emd returns 0 when comparing any normal histogram randomly offset against itself", { - num_hists <- 2 num_bins <- 101 num_offsets <- 3 @@ -107,7 +133,7 @@ test_that("net_emd returns 0 when comparing any normal histogram randomly offset rand_locations <- function(mu, sigma) { return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) - } + } rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { locations <- rand_locations(mu, sigma) @@ -117,31 +143,40 @@ test_that("net_emd returns 0 when comparing any normal histogram randomly offset offset_lists <- replicate(num_hists, offsets, simplify = FALSE) - net_emd_offset_self <- function(dhist, offsets, method) { - net_emds <- purrr::map_dbl(offsets, function(offset) { - net_emd(dhist, shift_dhist(dhist, offset), method = method)}) - return(net_emds) + netemd_offset_self <- function(dhist, offsets, method) { + netemds <- purrr::map_dbl(offsets, function(offset) { + netemd_one_to_one(dhists_1 = dhist, dhists_2 = shift_dhist(dhist, offset), method = method) + }) + return(netemds) } expected <- 0 - actuals_list_opt <- purrr::map2(rand_dhists, offset_lists, - function(dhist, offsets) { - net_emd_offset_self(dhist, offsets, method = "optimise")}) + actuals_list_opt <- purrr::map2( + rand_dhists, offset_lists, + function(dhist, offsets) { + netemd_offset_self(dhist, offsets, method = "optimise") + } + ) 
purrr::walk(actuals_list_opt, function(actuals) { - purrr::walk(actuals, function(actual) { - expect_equal(actual, expected)}) + purrr::walk(actuals, function(actual) { + expect_equal(actual, expected) + }) }) - actuals_list_exhaustive <- purrr::map2(rand_dhists, offset_lists, - function(dhist, offsets) { - net_emd_offset_self(dhist, offsets, method = "exhaustive")}) + actuals_list_exhaustive <- purrr::map2( + rand_dhists, offset_lists, + function(dhist, offsets) { + netemd_offset_self(dhist, offsets, method = "exhaustive") + } + ) purrr::walk(actuals_list_exhaustive, function(actuals) { - purrr::walk(actuals, function(actual) {expect_equal(actual, expected)}) + purrr::walk(actuals, function(actual) { + expect_equal(actual, expected) + }) }) }) test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any normal histogram randomly offset against itself", { - num_hists <- 2 num_bins <- 101 num_offsets <- 3 @@ -151,7 +186,8 @@ test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any offsets <- runif(num_offsets, -10, 10) rand_locations <- function(mu, sigma) { - return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins))} + return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) + } rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { locations <- rand_locations(mu, sigma) @@ -161,29 +197,33 @@ test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any offset_lists <- replicate(num_hists, offsets, simplify = FALSE) - expect_self_net_emd_correct <- + expect_self_netemd_correct <- function(histogram, shift, method, return_details = FALSE) { - self_net_emd <- net_emd(histogram, shift_dhist(histogram, shift), - method, return_details) - loc=histogram$locations - mass=histogram$masses - var=sum(loc*loc*mass)/sum(mass)-(sum(loc*mass)/sum(mass))^2 - expected <- list(net_emd = 0, min_emds = 0, min_offsets = shift, - min_offsets_std = 0) - expect_equal(self_net_emd, expected) - } + self_net_emd <- 
netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift),method = method, return_details = return_details + ) + loc <- histogram$locations + mass <- histogram$masses + var <- sum(loc * loc * mass) / sum(mass) - (sum(loc * mass) / sum(mass))^2 + expected <- list( + net_emd = 0, min_emds = 0, min_offsets = shift, + min_offsets_std = 0 + ) + expect_equal(self_net_emd, expected) + } purrr::walk2(rand_dhists, offset_lists, function(dhist, offsets) { - purrr::walk(offsets, function(offset){ - expect_self_net_emd_correct(dhist, offset, "optimise", - return_details = TRUE) + purrr::walk(offsets, function(offset) { + expect_self_netemd_correct(dhist, offset, "optimise", + return_details = TRUE + ) }) }) purrr::walk2(rand_dhists, offset_lists, function(dhist, offsets) { - purrr::walk(offsets, function(offset){ - expect_self_net_emd_correct(dhist, offset, "exhaustive", - return_details = TRUE) + purrr::walk(offsets, function(offset) { + expect_self_netemd_correct(dhist, offset, "exhaustive", + return_details = TRUE + ) }) }) }) @@ -192,47 +232,46 @@ test_that("net_emd returns analytically derived non-zero solutions for distribut where the analytical solution is known", { # Helper functions to create dhists for a given value of "p" two_bin_dhist <- function(p) { - dhist(locations = c(0, 1), masses = c(p, 1-p)) + dhist(locations = c(0, 1), masses = c(p, 1 - p)) } three_bin_dhist <- function(p) { - dhist(locations = c(-1, 0, 1), masses = c(0.5*p*(1-p), 1-(p*(1-p)), 0.5*p*(1-p))) + dhist(locations = c(-1, 0, 1), masses = c(0.5 * p * (1 - p), 1 - (p * (1 - p)), 0.5 * p * (1 - p))) } - + # Helper function to test actual vs expected test_pair <- function(p, expected) { dhistA <- two_bin_dhist(p) dhistB <- three_bin_dhist(p) - expect_equal(net_emd(dhistA, dhistB, method = "exhaustive"), expected, tolerance = 1e-12) + expect_equal(netemd_one_to_one(dhists_1 = dhistA, dhists_2 = dhistB, method = "exhaustive"), expected, tolerance = 1e-12) # Even setting the 
stats::optimise method tolerance to machine double precision, the # optimised NetEMD is ~1e-09, so set a slightly looser tolerance here - expect_equal(net_emd(dhistA, dhistB, method = "optimise"), expected, tolerance = 1e-08) + expect_equal(netemd_one_to_one(dhists_1 = dhistA, dhists_2 = dhistB, method = "optimise"), expected, tolerance = 1e-08) } - + # Test for p values with analytically calculated NetEMD - test_pair(1/2, 1) - test_pair(1/3, 1/sqrt(2)) - test_pair(1/5, 1/2) - + test_pair(1 / 2, 1) + test_pair(1 / 3, 1 / sqrt(2)) + test_pair(1 / 5, 1 / 2) }) context("Measures NetEMD: Virus PPI (EMD)") # EMD and NET_EMD: Virus PPI datasets test_that("emd return 0 when comparing graphlet orbit degree distributions of virus PPI graphs to themselves", { - # Load viurs PPI network data in ORCA-compatible edge list format - data_indexes <- 1:length(virusppi) - data_names <- attr(virusppi, "name") + # Load viurs PPI network data in ORCA-compatible edge list format + data_indexes <- 1:length(virusppi) + data_names <- attr(virusppi, "name") - # Calculate graphlet-based degree distributions up to graphlet order 4 - virus_gdd <- purrr::map(virusppi, gdd) + # Calculate graphlet-based degree distributions up to graphlet order 4 + virus_gdd <- purrr::map(virusppi, gdd) - # Map over virus PPI networks - purrr::walk(virus_gdd, function(gdd) { - purrr::walk(gdd, function(gdd_Ox) { - expect_equal(emd(gdd_Ox, gdd_Ox), 0) - }) - }) - }) + # Map over virus PPI networks + purrr::walk(virus_gdd, function(gdd) { + purrr::walk(gdd, function(gdd_Ox) { + expect_equal(emd(gdd_Ox, gdd_Ox), 0) + }) + }) +}) context("Measures NetEMD: Virus PPI (NetEMD)") test_that("net_emd return 0 when comparing graphlet orbit degree distributions @@ -253,8 +292,10 @@ test_that("net_emd return 0 when comparing graphlet orbit degree distributions # Map over virus PPI networks purrr::walk(virus_gdd, function(gdd) { purrr::walk(gdd, function(gdd_Ox) { - expect_equalish(net_emd(gdd_Ox, gdd_Ox, method = "optimise", 
- smoothing_window_width = 0), 0) + expect_equalish(netemd_one_to_one(dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, + method = "optimise", + smoothing_window_width = 0 + ), 0) }) }) }) @@ -263,102 +304,118 @@ context("Measures NetEMD: Random graphs (EMD)") # EMD and NET_EMD: Random graph datasets test_that("emd return 0 when comparing graphlet orbit degree distributions of random graphs to themselves", { - # Load random graph data in ORCA-compatible edge list format - random_graphs <- read_simple_graphs( - system.file(package = "netdist", "extdata", "random"), - format = "ncol", pattern = "*") - data_indexes <- 1:length(random_graphs) - data_names <- attr(random_graphs, "name") - - # Calculate graphlet-based degree distributions up to graphlet order 4 - random_gdd <- purrr::map(random_graphs, gdd) - - # Map over random graphs - purrr::walk(random_gdd, function(gdd) { - purrr::walk(gdd, function(gdd_Ox) { - expect_equal(emd(gdd_Ox, gdd_Ox), 0) - }) - }) - }) + # Load random graph data in ORCA-compatible edge list format + random_graphs <- read_simple_graphs( + system.file(package = "netdist", "extdata", "random"), + format = "ncol", pattern = "*" + ) + data_indexes <- 1:length(random_graphs) + data_names <- attr(random_graphs, "name") + + # Calculate graphlet-based degree distributions up to graphlet order 4 + random_gdd <- purrr::map(random_graphs, gdd) + + # Map over random graphs + purrr::walk(random_gdd, function(gdd) { + purrr::walk(gdd, function(gdd_Ox) { + expect_equal(emd(gdd_Ox, gdd_Ox), 0) + }) + }) +}) context("Measures NetEMD: Random graphs (NetEMD)") test_that("net_emd return 0 when comparing graphlet orbit degree distributions of random graphs to themselves", { - # Load random graph data in ORCA-compatible edge list format - random_graphs <- read_simple_graphs( - system.file(package = "netdist", "extdata", "random"), - format = "ncol", pattern = "*") - data_indexes <- 1:length(random_graphs) - data_names <- attr(random_graphs, "name") - - # Calculate 
graphlet-based degree distributions up to graphlet order 4 - random_gdd <- purrr::map(random_graphs, gdd) - - expect_equalish <- function(actual, expected) { - diff <- abs(actual - expected) - max_diff <- 1e-12 - return(expect_lte(diff, max_diff)) - } - - # Map over random graphs - purrr::walk(random_gdd, function(gdd) { - purrr::walk(gdd, function(gdd_Ox) { - expect_equalish(net_emd(gdd_Ox, gdd_Ox, method = "optimise", - smoothing_window_width = 0), 0) - }) - }) - }) + # Load random graph data in ORCA-compatible edge list format + random_graphs <- read_simple_graphs( + system.file(package = "netdist", "extdata", "random"), + format = "ncol", pattern = "*" + ) + data_indexes <- 1:length(random_graphs) + data_names <- attr(random_graphs, "name") + + # Calculate graphlet-based degree distributions up to graphlet order 4 + random_gdd <- purrr::map(random_graphs, gdd) + + expect_equalish <- function(actual, expected) { + diff <- abs(actual - expected) + max_diff <- 1e-12 + return(expect_lte(diff, max_diff)) + } + + # Map over random graphs + purrr::walk(random_gdd, function(gdd) { + purrr::walk(gdd, function(gdd_Ox) { + expect_equalish(netemd_one_to_one(dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, + method = "optimise", + smoothing_window_width = 0 + ), 0) + }) + }) +}) context("Measures NetEMD: All graphs in directory") -test_that("net_emds_for_all_graphs works", { +test_that("netemd_many_to_many works", { # Set source directory and file properties for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - edge_format = "ncol" - file_pattern = ".txt" + edge_format <- "ncol" + file_pattern <- ".txt" # Set number of threads to use at once for parallel processing. 
- num_threads = getOption("mc.cores", 2L) + num_threads <- getOption("mc.cores", 2L) # Use previously tested GDD code to generate inputs to function under test gdds_orbits_g4 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "orbit", max_graphlet_size = 4) + feature_type = "orbit", max_graphlet_size = 4 + ) gdds_orbits_g5 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "orbit", max_graphlet_size = 5) + feature_type = "orbit", max_graphlet_size = 5 + ) gdds_graphlets_g4 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "graphlet", max_graphlet_size = 4) + feature_type = "graphlet", max_graphlet_size = 4 + ) gdds_graphlets_g5 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "graphlet", max_graphlet_size = 5) + feature_type = "graphlet", max_graphlet_size = 5 + ) gdds_graphlets_g4_e1 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "graphlet", max_graphlet_size = 4, ego_neighbourhood_size = 1) + feature_type = "graphlet", max_graphlet_size = 4, ego_neighbourhood_size = 1 + ) gdds_graphlets_g5_e1 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 1) + feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 1 + ) gdds_graphlets_g4_e2 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "graphlet", max_graphlet_size = 4, ego_neighbourhood_size = 2) + feature_type = "graphlet", max_graphlet_size = 4, ego_neighbourhood_size = 2 + ) gdds_graphlets_g5_e2 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "graphlet", 
max_graphlet_size = 5, ego_neighbourhood_size = 2) + feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 2 + ) # Use previously tested NetEMD function to generate expected NetEMD scores # individually and combine into expected output for code under test - expected_net_emd_fn<- function(gdds) { - list(net_emds = c(net_emd(gdds$EBV, gdds$ECL), net_emd(gdds$EBV, gdds$HSV), - net_emd(gdds$EBV, gdds$KSHV), net_emd(gdds$EBV, gdds$VZV), - net_emd(gdds$ECL, gdds$HSV), net_emd(gdds$ECL, gdds$KSHV), - net_emd(gdds$ECL, gdds$VZV), net_emd(gdds$HSV, gdds$KSHV), - net_emd(gdds$HSV, gdds$VZV), net_emd(gdds$KSHV, gdds$VZV)), - comp_spec = cross_comparison_spec(gdds)) + expected_netemd_fn <- function(gdds) { + list( + netemds = c( + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$ECL), netemd_one_to_one(dhists_1 =gdds$EBV, dhists_2 = gdds$HSV), + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$KSHV), netemd_one_to_one(dhists_1 =gdds$EBV, dhists_2 = gdds$VZV), + netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$HSV), netemd_one_to_one(dhists_1 =gdds$ECL, dhists_2 = gdds$KSHV), + netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 =gdds$HSV, dhists_2 = gdds$KSHV), + netemd_one_to_one(dhists_1 = gdds$HSV, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 =gdds$KSHV, dhists_2 = gdds$VZV) + ), + comp_spec = cross_comparison_spec(gdds) + ) } # Comparison function for clarity compare_fn <- function(gdds) { - expect_equal(net_emds_for_all_graphs(gdds), expected_net_emd_fn(gdds)) + expect_equal(netemd_many_to_many(dhists=gdds), expected_netemd_fn(gdds)) } # Map over test parameters, comparing actual gdds to expected @@ -374,5 +431,3 @@ test_that("net_emds_for_all_graphs works", { compare_fn(gdds_graphlets_g4_e2) compare_fn(gdds_graphlets_g5_e2) }) - - diff --git a/tests/testthat/test_orca_interface.R b/tests/testthat/test_orca_interface.R index fadcb428..5c48dbc4 100644 --- a/tests/testthat/test_orca_interface.R 
+++ b/tests/testthat/test_orca_interface.R @@ -3,73 +3,105 @@ test_that("Graph to indexed edge list round trip conversion works", { data_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") g_orig <- igraph::read_graph(file = file.path(data_dir, "EBV.txt"), format = "ncol") g_rtrip <- netdist::indexed_edges_to_graph(graph_to_indexed_edges(g_orig)) - expect_true(all.equal(igraph::get.edgelist(g_orig),igraph::get.edgelist(g_orig))) + expect_true(all.equal(igraph::get.edgelist(g_orig), igraph::get.edgelist(g_orig))) }) context("ORCA interface: Graphlet key") test_that("graphlet_key gives correct output for all supported max graphlet sizes", { correct_graphlet_key_2 <- list(max_nodes = 2, id = c("G0"), node_count = c(2)) - correct_graphlet_key_3 <- list(max_nodes = 3, id = c("G0", "G1", "G2"), - node_count = c(2, 3, 3)) - correct_graphlet_key_4 = list(max_nodes = 4, - id = c("G0", "G1", "G2", "G3", "G4", "G5", "G6", - "G7", "G8"), - node_count = c(2, 3, 3, 4, 4, 4, 4, 4, 4)) - correct_graphlet_key_5 <- list(max_nodes = 5, - id = c("G0", "G1", "G2", "G3", "G4", "G5", "G6", - "G7", "G8", "G9", "G10", "G11", "G12", - "G13", "G14", "G15", "G16", "G17", - "G18", "G19", "G20", "G21", "G22", - "G23", "G24", "G25", "G26", "G27", - "G28", "G29"), - node_count = c(2, 3, 3, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5)) + correct_graphlet_key_3 <- list( + max_nodes = 3, id = c("G0", "G1", "G2"), + node_count = c(2, 3, 3) + ) + correct_graphlet_key_4 <- list( + max_nodes = 4, + id = c( + "G0", "G1", "G2", "G3", "G4", "G5", "G6", + "G7", "G8" + ), + node_count = c(2, 3, 3, 4, 4, 4, 4, 4, 4) + ) + correct_graphlet_key_5 <- list( + max_nodes = 5, + id = c( + "G0", "G1", "G2", "G3", "G4", "G5", "G6", + "G7", "G8", "G9", "G10", "G11", "G12", + "G13", "G14", "G15", "G16", "G17", + "G18", "G19", "G20", "G21", "G22", + "G23", "G24", "G25", "G26", "G27", + "G28", "G29" + ), + node_count = c( + 2, 3, 3, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 
5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5 + ) + ) expect_equal(graphlet_key(2), correct_graphlet_key_2) expect_equal(graphlet_key(3), correct_graphlet_key_3) expect_equal(graphlet_key(4), correct_graphlet_key_4) expect_equal(graphlet_key(5), correct_graphlet_key_5) - }) +}) test_that("graphlet_key gives error for unsupported max graphlet sizes", { max_size_too_low <- c(1, 0, -1, -2, -3, -4, -5, -6) max_size_too_high <- c(6, 7, 8, 9, 10) max_size_not_int <- c(2.5, 3.5, 4.5) - purrr::map(max_size_too_low, function(s) {expect_error(graphlet_key(s))}) - purrr::map(max_size_too_high, function(s) {expect_error(graphlet_key(s))}) - purrr::map(max_size_not_int, function(s) {expect_error(graphlet_key(s))}) + purrr::map(max_size_too_low, function(s) { + expect_error(graphlet_key(s)) + }) + purrr::map(max_size_too_high, function(s) { + expect_error(graphlet_key(s)) + }) + purrr::map(max_size_not_int, function(s) { + expect_error(graphlet_key(s)) + }) }) context("ORCA interface: Orbit key") test_that("orbit_key gives correct output for all supported max graphlet sizes", { correct_orbit_key_2 <- list(max_nodes = 2, id = c("O0"), node_count = c(2)) - correct_orbit_key_3 <- list(max_nodes = 3, id = c("O0", "O1", "O2", "O3"), - node_count = c(2, 3, 3, 3)) - correct_orbit_key_4 = list(max_nodes = 4, - id = c("O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", - "O10", "O11", "O12", "O13", "O14"), - node_count = c(2, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4)) - correct_orbit_key_5 <- list(max_nodes = 5, - id = c("O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", - "O10", "O11", "O12", "O13", "O14", "O15", "O16", "O17", - "O18", "O19", "O20", "O21", "O22", - "O23", "O24", "O25", "O26", "O27", "O28", "O29", - "O30", "O31", "O32", "O33", "O34", "O35", "O36", "O37", - "O38", "O39", "O40", "O41", "O42", "O43", "O44", "O45", - "O46", "O47", "O48", "O49", "O50", "O51", "O52", "O53", - "O54", "O55", "O56", "O57", "O58", "O59", "O60", "O61", - "O62", "O63", 
"O64", "O65", "O66", "O67", "O68", "O69", - "O70", "O71", "O72"), - node_count = c(2, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5)) + correct_orbit_key_3 <- list( + max_nodes = 3, id = c("O0", "O1", "O2", "O3"), + node_count = c(2, 3, 3, 3) + ) + correct_orbit_key_4 <- list( + max_nodes = 4, + id = c( + "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", + "O10", "O11", "O12", "O13", "O14" + ), + node_count = c( + 2, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 + ) + ) + correct_orbit_key_5 <- list( + max_nodes = 5, + id = c( + "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", + "O10", "O11", "O12", "O13", "O14", "O15", "O16", "O17", + "O18", "O19", "O20", "O21", "O22", + "O23", "O24", "O25", "O26", "O27", "O28", "O29", + "O30", "O31", "O32", "O33", "O34", "O35", "O36", "O37", + "O38", "O39", "O40", "O41", "O42", "O43", "O44", "O45", + "O46", "O47", "O48", "O49", "O50", "O51", "O52", "O53", + "O54", "O55", "O56", "O57", "O58", "O59", "O60", "O61", + "O62", "O63", "O64", "O65", "O66", "O67", "O68", "O69", + "O70", "O71", "O72" + ), + node_count = c( + 2, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5 + ) + ) expect_equal(orbit_key(2), correct_orbit_key_2) expect_equal(orbit_key(3), correct_orbit_key_3) expect_equal(orbit_key(4), correct_orbit_key_4) @@ -79,26 +111,32 @@ test_that("orbit_key gives correct output for all supported max graphlet sizes", context("ORCA interface: Graph cross comparison") test_that("cross_comparison_spec works for virus PPI data", { # Load viurs PPI network data in ORCA-compatible edge list format - expected_name_A <- c(rep("EBV", 4), rep("ECL", 3), rep("HSV-1", 
2), - rep("KSHV", 1), rep("VZV", 0)) + expected_name_A <- c( + rep("EBV", 4), rep("ECL", 3), rep("HSV-1", 2), + rep("KSHV", 1), rep("VZV", 0) + ) expected_index_A <- c(rep(1, 4), rep(2, 3), rep(3, 2), rep(4, 1), rep(5, 0)) - expected_name_B <- c(c("ECL", "HSV-1", "KSHV", "VZV"), c("HSV-1", "KSHV", "VZV"), - c("KSHV", "VZV"), c("VZV")) + expected_name_B <- c( + c("ECL", "HSV-1", "KSHV", "VZV"), c("HSV-1", "KSHV", "VZV"), + c("KSHV", "VZV"), c("VZV") + ) expected_index_B <- c(c(2, 3, 4, 5), c(3, 4, 5), c(4, 5), c(5)) - expected <- as.data.frame(cbind(expected_name_A, expected_name_B, - expected_index_A, expected_index_B)) + expected <- as.data.frame(cbind( + expected_name_A, expected_name_B, + expected_index_A, expected_index_B + )) colnames(expected) <- c("name_a", "name_b", "index_a", "index_b") - + actual <- cross_comparison_spec(virusppi) - + matched_output <- function(actual, expected) { dims_match <- identical(dim(as.matrix(expected)), dim(as.matrix(actual))) data_matches <- identical(as.matrix(expected), as.matrix(actual)) headers_match <- identical(colnames(expected), colnames(actual)) return(dims_match && data_matches && headers_match) } - - # Check that actual output matches one of the two acceptable outputs at each + + # Check that actual output matches one of the two acceptable outputs at each # cell expect_true(matched_output(actual, expected)) }) @@ -107,35 +145,47 @@ context("ORCA interface: Orbit count wrapper") test_that("Single and zero node graphs are gracefully handled", { single_node_graph <- igraph::graph_from_adjacency_matrix(0, mode = "undirected") zero_node_graph <- igraph::delete.vertices(single_node_graph, 1) - names4 <- c("O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", - "O10", "O11", "O12", "O13", "O14") - names5 <- c(names4, c("O15", "O16", "O17", "O18", "O19", "O20", "O21", "O22", - "O23", "O24", "O25", "O26", "O27", "O28", "O29", - "O30", "O31", "O32", "O33", "O34", "O35", "O36", "O37", - "O38", "O39", "O40", "O41", "O42", 
"O43", "O44", "O45", - "O46", "O47", "O48", "O49", "O50", "O51", "O52", "O53", - "O54", "O55", "O56", "O57", "O58", "O59", "O60", "O61", - "O62", "O63", "O64", "O65", "O66", "O67", "O68", "O69", - "O70", "O71", "O72")) + names4 <- c( + "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", + "O10", "O11", "O12", "O13", "O14" + ) + names5 <- c(names4, c( + "O15", "O16", "O17", "O18", "O19", "O20", "O21", "O22", + "O23", "O24", "O25", "O26", "O27", "O28", "O29", + "O30", "O31", "O32", "O33", "O34", "O35", "O36", "O37", + "O38", "O39", "O40", "O41", "O42", "O43", "O44", "O45", + "O46", "O47", "O48", "O49", "O50", "O51", "O52", "O53", + "O54", "O55", "O56", "O57", "O58", "O59", "O60", "O61", + "O62", "O63", "O64", "O65", "O66", "O67", "O68", "O69", + "O70", "O71", "O72" + )) expected_zero_node_counts4 <- matrix(0, nrow = 0, ncol = length(names4)) colnames(expected_zero_node_counts4) <- names4 expected_zero_node_counts5 <- matrix(0, nrow = 0, ncol = length(names5)) colnames(expected_zero_node_counts5) <- names5 - + expected_single_node_counts4 <- matrix(0, nrow = 1, ncol = length(names4)) colnames(expected_single_node_counts4) <- names4 expected_single_node_counts5 <- matrix(0, nrow = 1, ncol = length(names5)) colnames(expected_single_node_counts5) <- names5 - - expect_equal(expected_zero_node_counts4, - count_orbits_per_node(zero_node_graph, max_graphlet_size = 4)) - expect_equal(expected_zero_node_counts5, - count_orbits_per_node(zero_node_graph, max_graphlet_size = 5)) - - expect_equal(expected_single_node_counts4, - count_orbits_per_node(single_node_graph, max_graphlet_size = 4)) - expect_equal(expected_single_node_counts5, - count_orbits_per_node(single_node_graph, max_graphlet_size = 5)) + + expect_equal( + expected_zero_node_counts4, + count_orbits_per_node(zero_node_graph, max_graphlet_size = 4) + ) + expect_equal( + expected_zero_node_counts5, + count_orbits_per_node(zero_node_graph, max_graphlet_size = 5) + ) + + expect_equal( + 
expected_single_node_counts4, + count_orbits_per_node(single_node_graph, max_graphlet_size = 4) + ) + expect_equal( + expected_single_node_counts5, + count_orbits_per_node(single_node_graph, max_graphlet_size = 5) + ) }) context("ORCA interface: Simplify graph") @@ -153,14 +203,14 @@ test_that("simplify_graph works", { rownames(adj_mat) <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7") colnames(adj_mat) <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7") graph <- igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed") - + # Helper functions to amend adjacency matrix to generate simplified graphs remove_loops <- function(adj_mat) { diag(adj_mat) <- 0 return(adj_mat) } remove_multiples <- function(adj_mat) { - adj_mat[adj_mat>1] <- 1 + adj_mat[adj_mat > 1] <- 1 return(adj_mat) } remove_isolates <- function(adj_mat) { @@ -170,143 +220,191 @@ test_that("simplify_graph works", { adj_mat <- adj_mat[keep_nodes, keep_nodes] return(adj_mat) } - + # Check "do nothing" option works expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed")), + igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) # Check directed -> undirected works expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(adj_mat, mode = "plus")), + igraph::graph_from_adjacency_matrix(adj_mat, mode = "plus") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) - + # 1: Check DIRECTED simplifications # 1a. 
Loop removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) # 1b. Multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 1c. Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) # 1ab. 
Loop + multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 1ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) # 1bc. Multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) # 1abc. 
Loop + multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) - + # 2: Check UNDIRECTED simplifications individually # 2a. Loop removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "plus")), + igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "plus") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) - # 2b. Multiple edge removal (use mode = "max" to avoid generating multiple + # 2b. Multiple edge removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 2c. 
Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus")), + igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) - # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple + # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 2ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) - # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple + # 2bc. 
Multiple + isolate removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = TRUE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) - # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple + # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE)) - ) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE + )) + ) }) context("GDD: test simplify graph") @@ -324,14 +422,14 @@ test_that("gdd simplifies works", { rownames(adj_mat) <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7") colnames(adj_mat) <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7") graph <- igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed") - + # Helper functions to amend adjacency matrix to generate simplified graphs remove_loops <- function(adj_mat) { diag(adj_mat) <- 0 return(adj_mat) } remove_multiples <- function(adj_mat) { - adj_mat[adj_mat>1] <- 1 + adj_mat[adj_mat > 1] <- 1 return(adj_mat) } remove_isolates 
<- function(adj_mat) { @@ -341,182 +439,229 @@ test_that("gdd simplifies works", { adj_mat <- adj_mat[keep_nodes, keep_nodes] return(adj_mat) } - + # Check "do nothing" option works - t1<-gdd(igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed")) - t2<-gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = FALSE)) - expect_equal(t1,t2) + t1 <- gdd(igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed")) + t2 <- gdd(simplify_graph( + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = FALSE + )) + expect_equal(t1, t2) # Check directed -> undirected works expect_equal( gdd( - igraph::graph_from_adjacency_matrix(adj_mat, mode = "plus")), + igraph::graph_from_adjacency_matrix(adj_mat, mode = "plus") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) - + # 1: Check DIRECTED simplifications # 1a. Loop removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) # 1b. 
Multiple edge removal expect_equal( -gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed")), + gdd( + igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 1c. Isolate edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) # 1ab. Loop + multiple edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 1ac. Loop + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) # 1bc. 
Multiple + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) # 1abc. Loop + multiple + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) - + # 2: Check UNDIRECTED simplifications individually # 2a. Loop removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "plus")), + igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "plus") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) - # 2b. Multiple edge removal (use mode = "max" to avoid generating multiple + # 2b. 
Multiple edge removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 2c. Isolate edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus")), + igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) - # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple + # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 2ac. 
Loop + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) - # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple + # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = TRUE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) - # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple + # 2abc. 
Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE)) - ) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE + )) + ) }) context("Features to Histograms Test ") test_that("Features to Histograms Test", { - #basic test - c1<-matrix(c(1,2,3,4,5),nrow=5) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(1,2,3,4,5)) - expect_equal(res[[1]]$masses,c(1,1,1,1,1)) - #multiple - c1<-matrix(c(1,1,3,4,5),nrow=5) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(1,3,4,5)) - expect_equal(res[[1]]$masses,c(2,1,1,1)) - #non-integer - c1<-matrix(c(0.1,0.1,0.3,0.4,0.5),nrow=5) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(0.1,0.3,0.4,0.5)) - expect_equal(res[[1]]$masses,c(2,1,1,1)) - #Negative - c1<-matrix(c(0.1,-0.1,0.3,-0.4,0.5),nrow=5) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(-0.4,-0.1,0.1,0.3,0.5)) - expect_equal(res[[1]]$masses,c(1,1,1,1,1)) - #negative multiple - c1<-matrix(c(0.1,-0.1,0.3,-0.4,0.5,-0.4),nrow=6) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(-0.4,-0.1,0.1,0.3,0.5)) - expect_equal(res[[1]]$masses,c(2,1,1,1,1)) - #small (testing machine precision) - c1<-matrix(c(10^-8,10^-9,10^-2,10^3,10^-8,10^-10),nrow=6) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(10^-10,10^-9,10^-8,10^-2,10^3)) - expect_equal(res[[1]]$masses,c(1,1,2,1,1)) - #irrational - 
c1<-matrix(c(pi,sqrt(2),sqrt(2)/pi,sqrt(3),sqrt(2),sqrt(2)/pi),nrow=6) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(sqrt(2)/pi,sqrt(2),sqrt(3),pi)) - expect_equal(res[[1]]$masses,c(2,2,1,1)) + # basic test + c1 <- matrix(c(1, 2, 3, 4, 5), nrow = 5) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(1, 2, 3, 4, 5)) + expect_equal(res[[1]]$masses, c(1, 1, 1, 1, 1)) + # multiple + c1 <- matrix(c(1, 1, 3, 4, 5), nrow = 5) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(1, 3, 4, 5)) + expect_equal(res[[1]]$masses, c(2, 1, 1, 1)) + # non-integer + c1 <- matrix(c(0.1, 0.1, 0.3, 0.4, 0.5), nrow = 5) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(0.1, 0.3, 0.4, 0.5)) + expect_equal(res[[1]]$masses, c(2, 1, 1, 1)) + # Negative + c1 <- matrix(c(0.1, -0.1, 0.3, -0.4, 0.5), nrow = 5) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(-0.4, -0.1, 0.1, 0.3, 0.5)) + expect_equal(res[[1]]$masses, c(1, 1, 1, 1, 1)) + # negative multiple + c1 <- matrix(c(0.1, -0.1, 0.3, -0.4, 0.5, -0.4), nrow = 6) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(-0.4, -0.1, 0.1, 0.3, 0.5)) + expect_equal(res[[1]]$masses, c(2, 1, 1, 1, 1)) + # small (testing machine precision) + c1 <- matrix(c(10^-8, 10^-9, 10^-2, 10^3, 10^-8, 10^-10), nrow = 6) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(10^-10, 10^-9, 10^-8, 10^-2, 10^3)) + expect_equal(res[[1]]$masses, c(1, 1, 2, 1, 1)) + # irrational + c1 <- matrix(c(pi, sqrt(2), sqrt(2) / pi, sqrt(3), sqrt(2), sqrt(2) / pi), nrow = 6) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(sqrt(2) / pi, sqrt(2), sqrt(3), pi)) + expect_equal(res[[1]]$masses, c(2, 2, 1, 1)) }) @@ -538,20 +683,22 @@ test_that("read_simple_graph works", { graph <- igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed") # Save graph to temp 
directory path <- file.path(tempdir(), "read_simple_graph_test_input.txt") - format = "graphml" + format <- "graphml" igraph::write_graph(graph, path, format = format) # Sanity check round trip of graph to file and back check_graph <- igraph::read_graph(file = path, format = format) - expect_equal(igraph::as_adjacency_matrix(graph), - igraph::as_adjacency_matrix(check_graph)) - + expect_equal( + igraph::as_adjacency_matrix(graph), + igraph::as_adjacency_matrix(check_graph) + ) + # Helper functions to amend adjacency matrix to generate simplified graphs remove_loops <- function(adj_mat) { diag(adj_mat) <- 0 return(adj_mat) } remove_multiples <- function(adj_mat) { - adj_mat[adj_mat>1] <- 1 + adj_mat[adj_mat > 1] <- 1 return(adj_mat) } remove_isolates <- function(adj_mat) { @@ -561,142 +708,190 @@ test_that("read_simple_graph works", { adj_mat <- adj_mat[keep_nodes, keep_nodes] return(adj_mat) } - + # Check "do nothing" option works expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) # Check directed -> undirected works expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(adj_mat, mode = "plus")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(adj_mat, mode = "plus") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = 
FALSE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) - + # 1: Check DIRECTED simplifications # 1a. Loop removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) # 1b. Multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 1c. Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = TRUE)) + igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) # 1ab. 
Loop + multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 1ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = TRUE)) + igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) # 1bc. Multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = TRUE)) + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) # 1abc. 
Loop + multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE)) + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) - + # 2: Check UNDIRECTED simplifications individually # 2a. Loop removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "plus")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = FALSE)) - ) - # 2b. Multiple edge removal (use mode = "max" to avoid generating multiple + igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "plus") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = FALSE + )) + ) + # 2b. 
Multiple edge removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 2c. Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = TRUE)) - ) - # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple + igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = TRUE + )) + ) + # 2ab. 
Loop + multiple edge removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 2ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = TRUE)) - ) - # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple + igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = TRUE + )) + ) + # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = TRUE)) - ) - # 2abc. 
Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = TRUE + )) + ) + # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE)) + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) }) @@ -715,20 +910,20 @@ test_that("read_simple_files works (all files in a directory)", { colnames(adj_mat) <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7") graph <- igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed") # Save graphs to temp directory - format = "graphml" + format <- "graphml" base_dir <- tempdir() - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_1.txt"), format = format) - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_2.txt"), format = format) - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_3.txt"), format = format) - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_4.txt"), format = format) - + igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_1.txt"), format = format) + igraph::write_graph(graph, file = 
file.path(base_dir, "oltw54387eNS_2.txt"), format = format) + igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_3.txt"), format = format) + igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_4.txt"), format = format) + # Helper functions to amend adjacency matrix to generate simplified graphs remove_loops <- function(adj_mat) { diag(adj_mat) <- 0 return(adj_mat) } remove_multiples <- function(adj_mat) { - adj_mat[adj_mat>1] <- 1 + adj_mat[adj_mat > 1] <- 1 return(adj_mat) } remove_isolates <- function(adj_mat) { @@ -738,24 +933,31 @@ test_that("read_simple_files works (all files in a directory)", { adj_mat <- adj_mat[keep_nodes, keep_nodes] return(adj_mat) } - + # No simplification graphs_actual <- read_simple_graphs( - base_dir, format = format, pattern = "oltw54387eNS*", as_undirected = FALSE, - remove_loops = FALSE, remove_multiple = FALSE, remove_isolates = FALSE) - purrr::walk(graphs_actual, ~expect_equal( - igraph::as_adjacency_matrix(.x), igraph::as_adjacency_matrix(graph))) - + base_dir, + format = format, pattern = "oltw54387eNS*", as_undirected = FALSE, + remove_loops = FALSE, remove_multiple = FALSE, remove_isolates = FALSE + ) + purrr::walk(graphs_actual, ~ expect_equal( + igraph::as_adjacency_matrix(.x), igraph::as_adjacency_matrix(graph) + )) + # Full ORCA compatible simplification graphs_actual <- read_simple_graphs( - base_dir, format = format, pattern = "oltw54387eNS*", as_undirected = TRUE, - remove_loops = TRUE, remove_multiple = TRUE, remove_isolates = TRUE) - purrr::walk(graphs_actual, ~expect_equal( + base_dir, + format = format, pattern = "oltw54387eNS*", as_undirected = TRUE, + remove_loops = TRUE, remove_multiple = TRUE, remove_isolates = TRUE + ) + purrr::walk(graphs_actual, ~ expect_equal( igraph::as_adjacency_matrix(.x), igraph::as_adjacency_matrix( igraph::as.undirected( igraph::graph_from_adjacency_matrix( - remove_isolates(remove_multiples(remove_loops(adj_mat)))) - )) + 
remove_isolates(remove_multiples(remove_loops(adj_mat))) + ) + ) + ) )) }) @@ -786,7 +988,7 @@ test_that("orbit_to_graphlet_counts summation works", { g16_indexes <- c(35:38) + 1 g17_indexes <- c(39:42) + 1 g18_indexes <- c(43:44) + 1 - g19_indexes <- c(45:48) +1 + g19_indexes <- c(45:48) + 1 g20_indexes <- c(49:50) + 1 g21_indexes <- c(51:53) + 1 g22_indexes <- c(54:55) + 1 @@ -829,21 +1031,25 @@ test_that("orbit_to_graphlet_counts summation works", { g28_counts <- rowSums(orbit_counts_5[, g28_indexes, drop = FALSE]) g29_counts <- rowSums(orbit_counts_5[, g29_indexes, drop = FALSE]) # Define expected graphlet count matrix for graphlets up to 5 nodes - expected_graphlet_counts_5 <- - cbind(g0_counts, g1_counts, g2_counts, g3_counts, g4_counts, g5_counts, - g6_counts, g7_counts, g8_counts, g9_counts, g10_counts, g11_counts, - g12_counts, g13_counts, g14_counts, g15_counts, g16_counts, - g17_counts, g18_counts, g19_counts, g20_counts, g21_counts, - g22_counts, g23_counts, g24_counts, g25_counts, g26_counts, - g27_counts, g28_counts, g29_counts) - colnames(expected_graphlet_counts_5) <- - c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9","G10", - "G11", "G12", "G13", "G14", "G15", "G16", "G17", "G18", "G19", - "G20", "G21", "G22", "G23", "G24", "G25", "G26", "G27", "G28", - "G29") + expected_graphlet_counts_5 <- + cbind( + g0_counts, g1_counts, g2_counts, g3_counts, g4_counts, g5_counts, + g6_counts, g7_counts, g8_counts, g9_counts, g10_counts, g11_counts, + g12_counts, g13_counts, g14_counts, g15_counts, g16_counts, + g17_counts, g18_counts, g19_counts, g20_counts, g21_counts, + g22_counts, g23_counts, g24_counts, g25_counts, g26_counts, + g27_counts, g28_counts, g29_counts + ) + colnames(expected_graphlet_counts_5) <- + c( + "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", + "G11", "G12", "G13", "G14", "G15", "G16", "G17", "G18", "G19", + "G20", "G21", "G22", "G23", "G24", "G25", "G26", "G27", "G28", + "G29" + ) # Define epected graphlet 
count matrix for graphlets up to 4 nodes by selecting # a subset of the matrix for graphlets up to 5 nodes - expected_graphlet_counts_4 <- expected_graphlet_counts_5[,1:9] + expected_graphlet_counts_4 <- expected_graphlet_counts_5[, 1:9] # Calculate actual graphlet counts from functions under test actual_graphlet_counts_4 <- orbit_to_graphlet_counts(orbit_counts_4) actual_graphlet_counts_5 <- orbit_to_graphlet_counts(orbit_counts_5) @@ -854,281 +1060,429 @@ test_that("orbit_to_graphlet_counts summation works", { context("ORCA interface: Named ego networks") test_that("make_named_ego_graph labels each ego-network with the correct node name", { - # Helper function to sort edgelists in consistent order + # Helper function to sort edgelists in consistent order sort_edge_list <- function(edge_list) { - edge_list[order(edge_list[,1],edge_list[,2], decreasing = FALSE),] + edge_list[order(edge_list[, 1], edge_list[, 2], decreasing = FALSE), ] } # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - # The expectation below is based on igraph::graph_from_edgelist adding nodes + # The expectation below is based on igraph::graph_from_edgelist adding nodes # in the order they appear in the edge list, and igraph::V returning them # in this same order - expected_node_names <- c("n1","n2","n3","n4","n5","n6","n7","n8","n9","n10") - + expected_node_names <- 
c("n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10") + # Expected edgelists for ego networks of order 1 expected_ego_elist_n1_o1 <- rbind( - c("n1","n2"), - c("n1","n4"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6") + c("n1", "n2"), + c("n1", "n4"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6") ) expected_ego_elist_n2_o1 <- rbind( - c("n1","n2"), - c("n1","n4"), - c("n2","n3"), - c("n2","n4"), - c("n2","n5") + c("n1", "n2"), + c("n1", "n4"), + c("n2", "n3"), + c("n2", "n4"), + c("n2", "n5") ) expected_ego_elist_n3_o1 <- rbind( - c("n2","n3") + c("n2", "n3") ) expected_ego_elist_n4_o1 <- rbind( - c("n1","n2"), - c("n1","n4"), - c("n1","n6"), - c("n2","n4"), - c("n4","n6") + c("n1", "n2"), + c("n1", "n4"), + c("n1", "n6"), + c("n2", "n4"), + c("n4", "n6") ) expected_ego_elist_n5_o1 <- rbind( - c("n2","n5") + c("n2", "n5") ) expected_ego_elist_n6_o1 <- rbind( - c("n1","n4"), - c("n1","n6"), - c("n4","n6"), - c("n6","n8") + c("n1", "n4"), + c("n1", "n6"), + c("n4", "n6"), + c("n6", "n8") ) expected_ego_elist_n7_o1 <- rbind( - c("n1","n7"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n7"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) expected_ego_elist_n8_o1 <- rbind( - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) expected_ego_elist_n9_o1 <- rbind( - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) expected_ego_elist_n10_o1 <- rbind( - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") - ) - + c("n7", "n8"), + c("n7", 
"n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + # Test ego-networks of order 1. # We compare edgelists as igraphs do not implement comparison order <- 1 + min_ego_nodes <- 0 + min_ego_edges <- 0 + expected_ego_elists_o1 <- list( - n1 = dplyr::arrange(data.frame(expected_ego_elist_n1_o1), X1, X2), - n2 = dplyr::arrange(data.frame(expected_ego_elist_n2_o1), X1, X2), - n3 = dplyr::arrange(data.frame(expected_ego_elist_n3_o1), X1, X2), - n4 = dplyr::arrange(data.frame(expected_ego_elist_n4_o1), X1, X2), - n5 = dplyr::arrange(data.frame(expected_ego_elist_n5_o1), X1, X2), - n6 = dplyr::arrange(data.frame(expected_ego_elist_n6_o1), X1, X2), - n7 = dplyr::arrange(data.frame(expected_ego_elist_n7_o1), X1, X2), - n8 = dplyr::arrange(data.frame(expected_ego_elist_n8_o1), X1, X2), - n9 = dplyr::arrange(data.frame(expected_ego_elist_n9_o1), X1, X2), + n1 = dplyr::arrange(data.frame(expected_ego_elist_n1_o1), X1, X2), + n2 = dplyr::arrange(data.frame(expected_ego_elist_n2_o1), X1, X2), + n3 = dplyr::arrange(data.frame(expected_ego_elist_n3_o1), X1, X2), + n4 = dplyr::arrange(data.frame(expected_ego_elist_n4_o1), X1, X2), + n5 = dplyr::arrange(data.frame(expected_ego_elist_n5_o1), X1, X2), + n6 = dplyr::arrange(data.frame(expected_ego_elist_n6_o1), X1, X2), + n7 = dplyr::arrange(data.frame(expected_ego_elist_n7_o1), X1, X2), + n8 = dplyr::arrange(data.frame(expected_ego_elist_n8_o1), X1, X2), + n9 = dplyr::arrange(data.frame(expected_ego_elist_n9_o1), X1, X2), n10 = dplyr::arrange(data.frame(expected_ego_elist_n10_o1), X1, X2) ) # Generate actual ego-networks and convert to edge lists for comparison - actual_ego_elists_o1 <- - purrr::map(make_named_ego_graph(graph, order), function(g) { - dplyr::arrange(data.frame(igraph::as_edgelist(g)), X1, X2) - }) + actual_ego_elists_o1 <- + purrr::map( + make_named_ego_graph(graph, order, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ), + function(g) { + 
dplyr::arrange(data.frame(igraph::as_edgelist(g)), X1, X2) + } + ) expect_equal(actual_ego_elists_o1, expected_ego_elists_o1) }) context("ORCA interface: Graphlet counts") test_that("count_graphlets_for_graph works", { - # Set up a small sample network with at least that contains at least one of + # Set up a small sample network with at least that contains at least one of # each graphlet elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - + # Setgraphlet labels to use for names in expected counts - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - + graphlet_labels <- c("N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + # Manually verified graphlet counts - expected_counts <- c(15, 18,6, 21,3,1, 11, 1, 1) + expected_counts <- c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1) names(expected_counts) <- graphlet_labels - + # Test actual_counts <- count_graphlets_for_graph(graph, max_graphlet_size = 4) expect_equal(expected_counts, actual_counts) }) context("ORCA interface: Ego-network graphlet counts") -test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manually verified totals for test graph",{ +test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manually verified totals for test graph", { # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - 
c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - + # Set node and graphlet labels to use for row and col names in expected counts node_labels <- igraph::V(graph)$name - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - + graphlet_labels <- c("N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + max_graphlet_size <- 4 graphlet_key <- graphlet_key(max_graphlet_size) k <- graphlet_key$node_count # Set manually verified counts # 1-step ego networks expected_counts_order_1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0), - c(5, 5, 1, 0, 2, 0, 2, 0, 0), - c(1, 0, 0, 0, 0, 0, 0, 0, 0), - c(5, 2, 2, 0, 0, 0, 0, 1, 0), - c(1, 0, 0, 0, 0, 0, 0, 0, 0), - c(4, 2, 1, 0, 0, 0, 1, 0, 0), - c(7, 3, 4, 0, 0, 0, 3, 0, 1), - c(7, 3, 4, 0, 0, 0, 3, 0, 1), - c(6, 0, 4, 0, 0, 0, 0, 0, 1), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) + c(5, 6, 5, 2, 0, 1, 0, 2, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 5, 2, 2, 0, 0, 0, 0, 1, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 4, 2, 1, 0, 0, 0, 1, 0, 0), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1) ) rownames(expected_counts_order_1) <- node_labels colnames(expected_counts_order_1) <- graphlet_labels # 2-step ego networks expected_counts_order_2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1), - c( 8, 10, 2, 6, 3, 0, 4, 1, 0), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(10, 14, 2, 11, 3, 1, 5, 1, 0), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(13, 13, 6, 15, 1, 1, 9, 1, 
1), - c(13, 13, 6, 15, 1, 1, 9, 1, 1), - c(11, 10, 5, 10 ,0 ,1, 8, 0, 1), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) + c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1), + c(7, 8, 10, 2, 6, 3, 0, 4, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 10, 14, 2, 11, 3, 1, 5, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(7, 11, 10, 5, 10, 0, 1, 8, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1) ) rownames(expected_counts_order_2) <- node_labels colnames(expected_counts_order_2) <- graphlet_labels - + # Count graphlets in each ego network of the graph with only counts requested - actual_counts_order_1 <- - count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1) - actual_counts_order_2 <- - count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2) + min_ego_nodes <- 0 + min_ego_edges <- 0 + + actual_counts_order_1 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + actual_counts_order_2 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) # Test that actual counts match expected with only counts requested (default) expect_equal(actual_counts_order_1, expected_counts_order_1) expect_equal(actual_counts_order_2, expected_counts_order_2) - + # Test that actual and returned ego networks match expected # 1. 
Define expected - expected_ego_networks_order_1 <- make_named_ego_graph(graph, order = 1) - expected_ego_networks_order_2 <- make_named_ego_graph(graph, order = 2) + expected_ego_networks_order_1 <- make_named_ego_graph(graph, + order = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + expected_ego_networks_order_2 <- make_named_ego_graph(graph, + order = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) expected_counts_with_networks_order_1 <- - list(graphlet_counts = expected_counts_order_1, - ego_networks = expected_ego_networks_order_1) - expected_counts_with_networks_order_2 <- - list(graphlet_counts = expected_counts_order_2, - ego_networks = expected_ego_networks_order_2) + list( + graphlet_counts = expected_counts_order_1, + ego_networks = expected_ego_networks_order_1 + ) + expected_counts_with_networks_order_2 <- + list( + graphlet_counts = expected_counts_order_2, + ego_networks = expected_ego_networks_order_2 + ) # 2. Calculate actual - actual_counts_with_networks_order_1 <- - count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, return_ego_networks = TRUE) - actual_counts_with_networks_order_2 <- - count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, return_ego_networks = TRUE) + actual_counts_with_networks_order_1 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE + ) + actual_counts_with_networks_order_2 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE + ) # Test that actual counts match expected with ego-networks requested expect_equal(actual_counts_with_networks_order_1$graphlet_counts, expected_counts_order_1) 
expect_equal(actual_counts_with_networks_order_2$graphlet_counts, expected_counts_order_2) - + # 3. Compare - # Comparison is not implemented for igraph objects, so convert all igraphs to + # Comparison is not implemented for igraph objects, so convert all igraphs to # indexed edge list and then compare. Do in-situ replacement of igraphs with # indexed edge lists to ensure we are checking full properties of returned # objects (i.e. named lists with matching elements). # 3a. Convert expected and actual ego networks from igraphs to indexed edges - expected_counts_with_networks_order_1$ego_networks <- - purrr::map(expected_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges) - expected_counts_with_networks_order_2$ego_networks <- - purrr::map(expected_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges) - actual_counts_with_networks_order_1$ego_networks <- - purrr::map(actual_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges) - actual_counts_with_networks_order_2$ego_networks <- - purrr::map(actual_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges) + expected_counts_with_networks_order_1$ego_networks <- + purrr::map( + expected_counts_with_networks_order_1$ego_networks, + graph_to_indexed_edges + ) + expected_counts_with_networks_order_2$ego_networks <- + purrr::map( + expected_counts_with_networks_order_2$ego_networks, + graph_to_indexed_edges + ) + actual_counts_with_networks_order_1$ego_networks <- + purrr::map( + actual_counts_with_networks_order_1$ego_networks, + graph_to_indexed_edges + ) + actual_counts_with_networks_order_2$ego_networks <- + purrr::map( + actual_counts_with_networks_order_2$ego_networks, + graph_to_indexed_edges + ) # 3b. 
Do comparison - expect_equal(actual_counts_with_networks_order_1, - expected_counts_with_networks_order_1) - expect_equal(actual_counts_with_networks_order_2, - expected_counts_with_networks_order_2) + expect_equal( + actual_counts_with_networks_order_1, + expected_counts_with_networks_order_1 + ) + expect_equal( + actual_counts_with_networks_order_2, + expected_counts_with_networks_order_2 + ) +}) + +context("ORCA interface: Ego-network graphlet counts") +test_that("ego_to_graphlet_counts: Ego-network 4-node graphlet counts match manually verified totals for test graph", { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set node and graphlet labels to use for row and col names in expected counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c("N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + + max_graphlet_size <- 4 + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # Set manually verified counts + # 1-step ego networks + expected_counts_order_1 <- rbind( + c(5, 6, 5, 2, 0, 1, 0, 2, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 5, 2, 2, 0, 0, 0, 0, 1, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 4, 2, 1, 0, 0, 0, 1, 0, 0), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1) + ) + rownames(expected_counts_order_1) <- node_labels + colnames(expected_counts_order_1) <- graphlet_labels + # 2-step ego networks + expected_counts_order_2 <- rbind( + c(10, 15, 18, 6, 21, 3, 1, 11, 1, 
1), + c(7, 8, 10, 2, 6, 3, 0, 4, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 10, 14, 2, 11, 3, 1, 5, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(7, 11, 10, 5, 10, 0, 1, 8, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1) + ) + rownames(expected_counts_order_2) <- node_labels + colnames(expected_counts_order_2) <- graphlet_labels + + # Count graphlets in each ego network of the graph with only counts requested + min_ego_nodes <- 0 + min_ego_edges <- 0 + + # Test that actual and returned ego graphlet counts match + # 1. Generate ego networks with previously tested function. + ego_networks_order_1 <- make_named_ego_graph(graph, + order = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + ego_networks_order_2 <- make_named_ego_graph(graph, + order = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + + # 2. Calculate counts with ego_to_graphlet_counts. + actual_counts_order_1 <- + ego_to_graphlet_counts(ego_networks_order_1, + max_graphlet_size = max_graphlet_size + ) + actual_counts_order_2 <- + ego_to_graphlet_counts(ego_networks_order_2, + max_graphlet_size = max_graphlet_size + ) + + # 3. 
Test that actual counts match expected + expect_equal(actual_counts_order_1, expected_counts_order_1) + expect_equal(actual_counts_order_2, expected_counts_order_2) }) # context("ORCA interface: Graphlet-based degree distributions") @@ -1155,7 +1509,7 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # gdd_orbit_default_actual <- gdd(graph, feature_type = "orbit") # gdd_graphlet_default_actual <- gdd(graph, feature_type = "graphlet") # gdd_default_default_actual <- gdd(graph) -# # Compare actual gdd with expected gdd +# # Compare actual gdd with expected gdd # expect_equal(gdd_orbit_4_actual, gdd_orbit_4_expected) # expect_equal(gdd_orbit_5_actual, gdd_orbit_5_expected) # expect_equal(gdd_graphlet_4_actual, gdd_graphlet_4_expected) @@ -1165,7 +1519,7 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # expect_equal(gdd_orbit_default_actual, gdd_orbit_4_expected) # expect_equal(gdd_graphlet_default_actual, gdd_graphlet_4_expected) # expect_equal(gdd_default_default_actual, gdd_orbit_4_expected) -# +# # # Check gdd throws error for invalid feature type # expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4)) # expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5)) @@ -1176,9 +1530,9 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2)) # expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3)) # expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6)) -# +# # }) -# +# # context("ORCA interface: Ego-network graphlet outputs for manually verified networks") # test_that("Ego-network 4-node graphlet counts match manually verified totals # and gdd gives expected discrete histograms",{ @@ -1202,20 +1556,20 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # c("n9","n10") # ) # graph <- 
igraph::graph_from_edgelist(elist, directed = FALSE) -# +# # # Set node and graphlet labels to use for row and col names in expected counts # node_labels <- igraph::V(graph)$name # graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") -# +# # # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 # max_graphlet_size <- 4 -# actual_counts_order_1 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, +# actual_counts_order_1 <- +# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, # neighbourhood_size = 1) -# actual_counts_order_2 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, +# actual_counts_order_2 <- +# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, # neighbourhood_size = 2) -# +# # # Set manually verified ego-network graphlet counts # # 1-step ego networks # expected_counts_order_1 <- rbind( @@ -1247,55 +1601,55 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # ) # rownames(expected_counts_order_2) <- node_labels # colnames(expected_counts_order_2) <- graphlet_labels -# +# # # Test that actual counts match expected with only counts requested (default) # expect_equal(actual_counts_order_1, expected_counts_order_1) # expect_equal(actual_counts_order_2, expected_counts_order_2) -# +# # # Test that actual counts and returned ego networks match expected # # 1. Define expected # expected_ego_networks_order_1 <- make_named_ego_graph(graph, order = 1) # expected_ego_networks_order_2 <- make_named_ego_graph(graph, order = 2) -# expected_counts_with_networks_order_1 <- +# expected_counts_with_networks_order_1 <- # list(graphlet_counts = expected_counts_order_1, # ego_networks = expected_ego_networks_order_1) -# expected_counts_with_networks_order_2 <- +# expected_counts_with_networks_order_2 <- # list(graphlet_counts = expected_counts_order_2, # ego_networks = expected_ego_networks_order_2) # # 2. 
Calculate actual -# actual_counts_with_networks_order_1 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, +# actual_counts_with_networks_order_1 <- +# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, # neighbourhood_size = 1, return_ego_networks = TRUE) -# actual_counts_with_networks_order_2 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, +# actual_counts_with_networks_order_2 <- +# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, # neighbourhood_size = 2, return_ego_networks = TRUE) # # 3. Compare -# # Comparison is not implemented for igraph objects, so convert all igraphs to +# # Comparison is not implemented for igraph objects, so convert all igraphs to # # indexed edge list and then compare. Do in-situ replacement of igraphs with # # indexed edge lists to ensure we are checking full properties of returned # # objects (i.e. named lists with matching elements). # # 3a. Convert expected and actual ego networks from igraphs to indexed edges -# expected_counts_with_networks_order_1$ego_networks <- -# purrr::map(expected_counts_with_networks_order_1$ego_networks, +# expected_counts_with_networks_order_1$ego_networks <- +# purrr::map(expected_counts_with_networks_order_1$ego_networks, # graph_to_indexed_edges) -# expected_counts_with_networks_order_2$ego_networks <- -# purrr::map(expected_counts_with_networks_order_2$ego_networks, +# expected_counts_with_networks_order_2$ego_networks <- +# purrr::map(expected_counts_with_networks_order_2$ego_networks, # graph_to_indexed_edges) -# actual_counts_with_networks_order_1$ego_networks <- -# purrr::map(actual_counts_with_networks_order_1$ego_networks, +# actual_counts_with_networks_order_1$ego_networks <- +# purrr::map(actual_counts_with_networks_order_1$ego_networks, # graph_to_indexed_edges) -# actual_counts_with_networks_order_2$ego_networks <- -# purrr::map(actual_counts_with_networks_order_2$ego_networks, +# 
actual_counts_with_networks_order_2$ego_networks <- +# purrr::map(actual_counts_with_networks_order_2$ego_networks, # graph_to_indexed_edges) # # 3b. Do comparison -# expect_equal(actual_counts_with_networks_order_1, +# expect_equal(actual_counts_with_networks_order_1, # expected_counts_with_networks_order_1) -# expect_equal(actual_counts_with_networks_order_2, +# expect_equal(actual_counts_with_networks_order_2, # expected_counts_with_networks_order_2) -# +# # # Test that gdd method gives the expected graphlet degree distributions # # 1-step ego-networks -# actual_gdd_order_1 <- gdd(graph, feature_type = "graphlet", +# actual_gdd_order_1 <- gdd(graph, feature_type = "graphlet", # max_graphlet_size = 4, ego_neighbourhood_size = 1) # expected_gdd_order_1 <- list( # G0 = dhist(locations = c(1, 4, 5, 6, 7), masses = c(2, 1, 2, 3, 2)), @@ -1310,7 +1664,7 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # ) # expect_equal(actual_gdd_order_1, expected_gdd_order_1) # # 2-step ego-networks -# actual_gdd_order_2 <- gdd(graph, feature_type = "graphlet", +# actual_gdd_order_2 <- gdd(graph, feature_type = "graphlet", # max_graphlet_size = 4, ego_neighbourhood_size = 2) # expected_gdd_order_2 <- list( # G0 = dhist(locations = c(5, 8, 9, 10, 11, 13, 15), masses = c(2, 1, 2, 1, 1, 2, 1)), @@ -1324,46 +1678,46 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # G8 = dhist(locations = c(0, 1), masses = c(4, 6)) # ) # expect_equal(actual_gdd_order_2, expected_gdd_order_2) -# +# # # Check gdd throws error for invalid feature type -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4, +# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4, # ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4, +# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4, # ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = 
"foo", max_graphlet_size = 5, +# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5, # ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5, +# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5, # ego_neighbourhood_size = 1)) # # We don't support orbit feature type for ego networks (i.e. neighbourhood > 0) -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 4, +# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 4, # ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 5, +# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 5, # ego_neighbourhood_size = 1)) # # Check gdd throws error for invalid maximum graphlet size -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2, +# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2, # ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2, +# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2, # ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3, +# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3, # ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3, +# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3, # ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6, +# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6, # ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6, +# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6, # ego_neighbourhood_size = 1)) # }) -# +# # context("ORCA interface: GDD for all graphs in a directory") # 
test_that("gdd_for_all_graphs works", { # # Set source directory and file properties for Virus PPI graph edge files # source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # edge_format = "ncol" # file_pattern = ".txt" -# +# # # Set number of threads to use at once for parallel processing. # num_threads = getOption("mc.cores", 2L) -# +# # # Use previously tested gdd code to calculate expected gdds # expected_gdd_fn <- function(feature_type, max_graphlet_size, ego_neighbourhood_size) { # gdds <- list( @@ -1376,12 +1730,12 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # names(gdds) <- c("EBV", "ECL", "HSV-1", "KSHV", "VZV") # gdds # } -# +# # # Use code under test to generate actual gdds # actual_gdd_fn <- function (feature_type, max_graphlet_size, ego_neighbourhood_size) { -# gdd_for_all_graphs(source_dir = source_dir, format = edge_format, -# pattern = file_pattern, feature_type = feature_type, -# max_graphlet_size = max_graphlet_size, +# gdd_for_all_graphs(source_dir = source_dir, format = edge_format, +# pattern = file_pattern, feature_type = feature_type, +# max_graphlet_size = max_graphlet_size, # ego_neighbourhood_size = ego_neighbourhood_size, # mc.cores = num_threads) # } diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 00000000..097b2416 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/ManyToMany.Rmd b/vignettes/ManyToMany.Rmd new file mode 100644 index 00000000..722a0c0b --- /dev/null +++ b/vignettes/ManyToMany.Rmd @@ -0,0 +1,227 @@ +--- +title: 'Simple and quick (default) usage 3: many to many comparisons' +#author: "Luis Ospina-Forero" +date: "23/06/2020" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{ManyToManyComp} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +--- + +```{r, include = FALSE} + knitr::opts_chunk$set( + collapse 
= TRUE, + comment = "#>" + ) +``` + +# Introduction + + In some situations there is a need to compare multiple graphs among each other. For such situations, the `netdist` package contains some initial shortcut functions that perform such calculation and may automatically (Unix) incorporate parallel processing. + + This vignette does not show details about the Netdis and NetEmd network comparison methods or their variants, please check ["Simple and quick (default) usage 1: pairwise comparisons"](default_pairwise_usage.html) for these details. Instead, this vignette shows the default usage of the shortcut functions for many-to-many comparisons. + + Please note that for high performance computations these shortcut functions may not be ideal, particularly in terms of RAM consumption. + + For other vignettes in this package see the ["Menu"](V-Menu.html). + +# Load required packages/libraries +```{r, packages, message= FALSE} +# Load packages/libraries +library("netdist") +library("igraph") +library("pheatmap") +``` + +# Compare multiple networks via NetEmd + +(Extracted from Wegner et al. (2017)): NetEmd is based on the idea that the information encapsulated in the shape of the degree distribution and other network properties which reflect the topological organization of the network. From an abstract point of view, NetEmd views the shape of a distribution as a property that is invariant under linear deformations i.e. translations and re-scalings of the axis. + +## Networks being compared + +Generation of regular grid, ring and tree-like networks with 400 nodes and 1600 nodes. The plots of these networks illustrate clearly their structural differences. 
+```{r, networks,fig.align='center',fig.dim=c(8,4)} +# Create networks +set.seed(3171) +gLat_1 <- igraph::graph.lattice(c(20,20)) +gLat_2 <- igraph::graph.lattice(c(40,40)) +gRing_1 <- igraph::make_ring(20^2) +gRing_2 <- igraph::make_ring(40^2) +gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) +gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) + +par(mfrow=c(1,2)) +plot(gLat_1,vertex.size=0.8,vertex.label=NA) +plot(gLat_2,vertex.size=0.8,vertex.label=NA) +plot(gRing_1,vertex.size=0.8,vertex.label=NA) +plot(gRing_2,vertex.size=0.8,vertex.label=NA) +plot(gTree_1,vertex.size=0.8,vertex.label=NA) +plot(gTree_2,vertex.size=0.8,vertex.label=NA) +``` + +## NetEmd using subgraph counts + +Subgraph count based NetEmd comparisons: +```{r, netemdorbits,fig.align='center'} +# NetEMD using subgraph counts. +glist <- list(Lat_1=gLat_1, Lat_2=gLat_2, Ring_1=gRing_1, Ring_2=gRing_2, Tree_1=gTree_1, Tree_2=gTree_2) + +netemdlist <- netemd_many_to_many(graphs = glist,smoothing_window_width = 1,mc.cores = 1) #Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. + +netemdlist +``` + +To read the results in a matrix form: +```{r,fig.align='center'} +# Creating a comparison matrix: +mat <- cross_comp_to_matrix(measure = netemdlist$netemds, cross_comparison_spec = netemdlist$comp_spec) +mat +``` + +Illustration of the multiple NetEmd comparisons based on subgraph counts. +```{r,netemdorbitsPLOT,fig.align='center',fig.dim=c(8,8)} +netemd.plot(netemdlist=netemdlist,clustering_method="ward.D",main="NetEmd subgraph counts") +``` + +## NetEmd using the Laplacian and Normalized Laplacian Spectrum + +Pre-compute the Laplacian and normalized Laplacian for each graph considered: +```{r, netemdspectrum} +# NetEMD using the Laplacian and normalized Laplacian Spectrum. +SPECT<-list() + +#This step may take several minutes.
+for(i in 1:length(glist)){ + Lapg <- igraph::laplacian_matrix(graph = glist[[i]],normalized = FALSE,sparse = FALSE) + NLap <- igraph::laplacian_matrix(graph = glist[[i]],normalized = TRUE,sparse = FALSE) + SPECT[[ names(glist)[i] ]] <- cbind(L.Spectra= eigen(Lapg)$values, NL.Spectra= eigen(NLap)$values) +} +str(SPECT) +``` + +Compute NetEmd: +```{r} +netemdlist <- netemd_many_to_many(dhists = SPECT,smoothing_window_width = 0) #Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. + +netemdlist +``` + + +### Illustration of the multiple NetEmd comparisons based on the Laplacian and Normalized Laplacian spectra +```{r,netemdspectrumPLOT ,fig.align='center',fig.dim=c(8,8)} +netemd.plot(netemdlist=netemdlist,clustering_method="ward.D",main="NetEmd Spectra") +``` + + +------------------------- + +# Compare two networks via Netdis and its variants + +(Extracted from Ali et al. (2014)): Netdis counts small subgraphs $w$ on $k$ nodes for all 2-step ego-networks, $k=3,4,5$. These counts are centred by subtracting the expected number of counts $E_w$. These centred counts of each network are then compared among one another thus leading to the Netdis statistic. + + +## Using Netdis with a reference or gold-standard graph to obtain the expectations $E_w$ + +The selection of a gold-standard graph as a substitute for a theoretical $E_w$ could be considered when such graph is known to be a good proxy for $E_w$, or alternatively as a good reference point for the comparison. This Netdis variant focuses on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard. + + +### Netdis using reference or gold-standard networks + +Generation of regular grid, ring and tree-like networks with 400 nodes and 1600 nodes. 
Reference graphs given by star-like graphs are used as illustration of reference graphs. The plots of these networks illustrate clearly their structural differences. + +```{r, netdisgoldstandnetworks,fig.align='center',fig.dim=c(8,4)} +# Create networks +set.seed(3171) +gLat_1 <- igraph::graph.lattice(c(20,20)) +gLat_2 <- igraph::graph.lattice(c(40,40)) +gRing_1 <- igraph::make_ring(20^2) +gRing_2 <- igraph::make_ring(40^2) +gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) +gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) + +# Create star graphs to be used as gold-standards +gst_1 <- igraph::as.undirected( graph.star(20^2) ) +gst_2 <- igraph::as.undirected( graph.star(40^2) ) + +par(mfrow=c(1,2)) +plot(gst_1,vertex.size=0.8,vertex.label=NA) +plot(gst_2,vertex.size=0.8,vertex.label=NA) +``` + + +Obtain the comparison via Netdis using each of the reference graph networks. +```{r,netdisgoldstand ,fig.align='center'} +glist <- list(Lat_1=gLat_1, Lat_2=gLat_2, Ring_1=gRing_1, Ring_2=gRing_2, Tree_1=gTree_1, Tree_2=gTree_2) + +# Netdis using the goldstd_1 graph as gold-standard reference point +netdis_mat_gst1 <- netdis_many_to_many(graphs = glist, ref_graph = gst_1) + +# Netdis using the goldstd_2 graph as gold-standard reference point +netdis_mat_gst2 <- netdis_many_to_many(graphs = glist, ref_graph = gst_2) + +netdis_mat_gst1 + +netdis_mat_gst2 +``` + +To read the results in a matrix form: +```{r,fig.align='center'} +# Creating a comparison matrix: +cross_comp_to_matrix(measure = netdis_mat_gst1$netdis, cross_comparison_spec = netdis_mat_gst1$comp_spec) + +cross_comp_to_matrix(measure = netdis_mat_gst2$netdis, cross_comparison_spec = netdis_mat_gst2$comp_spec) +``` + +Heatmap of the Netdis comparisons: +```{r,netdisgoldstandPLOT ,fig.align='center',fig.dim=c(8,8)} +#Network comparisons heatmap with Gold-Standard 1 +netdis.plot(netdislist = netdis_mat_gst1, whatrow = 2,main = "Netdis GoldStd-1") + +#Network comparisons
heatmap with Gold-Standard 2 +netdis.plot(netdislist = netdis_mat_gst2, whatrow = 2,main = "Netdis GoldStd-2") +``` + + +### Netdis-GP: Using a Geometric-Poisson approximation + +This variant focuses on detecting more general and global discrepancies between the ego-network structures. + +```{r, netdisGP} +# Netdis Geometric-Poisson comparisons +netdis_mat <- netdis_many_to_many(graphs = glist, ref_graph = NULL) +netdis_mat +``` + +```{r,netdisGPPLOT ,fig.align='center',fig.dim=c(8,8)} +netdis.plot(netdislist = netdis_mat, whatrow = 2,main = "Netdis-GP") +``` + +### Using Netdis with no expectation ($E_w=0$) +Comparing the networks via their observed ego-counts without centring them, (equivalent to using expectation equal to zero). This variant thus focuses on detecting small discrepancies between the networks. + + +```{r, netdiszero} +# Netdis with expectation zero +netdis_mat <- netdis_many_to_many(graphs = glist, ref_graph = 0) +netdis_mat +``` + +```{r,netdiszeroPLOT ,fig.align='center',fig.dim=c(8,8)} +netdis.plot(netdislist = netdis_mat, whatrow = 2,main = "Netdis-zero") +``` + +------------------------- + + +# Bibliography + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* L. Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. + +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008. 
diff --git a/vignettes/NetEmdTimeOrdering.Rmd b/vignettes/NetEmdTimeOrdering.Rmd new file mode 100644 index 00000000..01c91197 --- /dev/null +++ b/vignettes/NetEmdTimeOrdering.Rmd @@ -0,0 +1,119 @@ +--- +title: 'NetEmd: World trade' +#author: "Luis Ospina-Forero" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteEncoding{UTF-8} + %\VignetteIndexEntry{NetEmd: World Trade} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +chunk_output_type: console +--- + +# World trade networks + +The world trade data set consists of a small sample of world trade networks for the years 2001-2014, and pre-computed subgraph counts of a larger set of world trade networks (1985–2014). The world trade networks are based on the data set from [Feenstra et al., 2005] for the years 1985-2000 and on the United Nations division COMTRADE for the years 2001-2014. + +For an introduction to NetEmd and its variants see ["Simple and quick (default) usage 1: pairwise comparisons"](default_pairwise_usage.html). + +For many to many comparisons see [Simple and quick (default) usage 3: many to many comparisons](ManyToMany.html) + +For other vignettes in this package see the ["Menu"](V-Menu.html). + + +# Loading world trade network data included in the package + +The package contains the world trade networks and pre-computed subgraph/graphlet counts in the R data object ` worldtradesub`. This object contains a list of two lists. The first list is `worldtradesub$wtnets` which contains a small sample of the trade networks (2001-2014) and the second `worldtradesub$Counts` which contains pre-computed counts for a larger set of trade networks going from 1985 to 2014. + +```{r, message=FALSE} + library("netdist") + library("igraph") + data(worldtradesub) + summary(worldtradesub) + wtnets<- worldtradesub$wtnets + summary(wtnets) +``` + +These world trade networks are denser than typically sparse social networks. 
For example, the edge density for the network in 2001 is `r igraph::graph.density(worldtradesub$wtnets$wtn2001)`. Here is a plot of this network highlighting the relatively large number of edges: +```{r,fig.align='center',fig.dim=c(5,5)} + plot(wtnets$wtn2001,vertex.size=5,vertex.label.cex=0.4) +``` + + + +# Generate NetEmd measures between each pair of query graphs + +In this example **NetEmd** will consider orbit counts of subgraphs containing up to 5 nodes. If NetEmd is to be called a single time, then the command `netemd_many_to_many(graphs = wtnets)` would suffice. The following code provides such an example: +```{r} + # As the trade networks are considerable dense, this example first considers a small number of networks. + #This example may take some minutes to run. + netemd_result <- netemd_many_to_many(graphs = wtnets[1:4],mc.cores = 1) + + print(netemd_result) +``` +However, if there are pre-computed counts or features NetEmd can be called via these features instead. + +# Pre-computing subgraph counts + +World trade networks consist of relatively dense networks, thus leading to longer computational times for the calculation of the subgraph counts. Hence, it is advisable to pre-compute counts in case there is a need to call NetEmd multiple times. This may, for example, be the case when adding a new network to the data set. The following illustrates the extraction of subgraph counts for the small network sample. + +```{r} +# This example may take more than a few minutes to run (approx. 20 mins) , and it is not necessary to run it for the upcoming examples as a larger set of counts has been already computed. +if(FALSE){# It is not necessary to run, as these counts are already available in. 
+ Counts <- list() + for(i in 1:length(wtnets)){ + Counts[[ names(wtnets)[i] ]] <- count_orbits_per_node(graph = wtnets[[i]],max_graphlet_size = 5) + } +} +``` + +Now, with pre-computed counts NetEmd can be calculated more rapidly as the computations of the counts are often the bottle neck in the computational time of NetEmd. NetEmd will be called `r length(worldtradesub$Counts) * (length(worldtradesub$Counts) - 1)/2 ` times in order to obtain all pairwise comparisons between the trade networks from 1985 to 2014 (networks with pre-computed subgraph counts): +```{r} + # The pre-computed counts already in the package + Counts <- worldtradesub$Counts + + #Calling NetEmd + netemd_result <- netemd_many_to_many(dhists = Counts ,mc.cores = 1) + + #Results + netemd_matrix <- cross_comp_to_matrix(measure = netemd_result$netemds, cross_comparison_spec = netemd_result$comp_spec) + + print(netemd_matrix[1:10,1:5]) +``` + +# Evidence of change in world trade + +Based on the comparison of the world trade networks across the years, we can identify periods of time where possible considerable changes in world trade have occurred. The following heat map clearly shows the existence of two potential + changes in the world trade system, and which correspond to 1995-1996 and 2010-2011. +```{r,fig.align='center',fig.dim=c(8.5,8.5)} + netemd.plot(netemdlist=netemd_result,clustering_method="ward.D",main="NetEmd",docluster = FALSE) +``` + +The World Trade Organization (WTO) said the following about these years: + +* About 2010-2011: "*World trade growth decelerated sharply in 2011 as the global economy struggled under the influence of natural disasters, financial uncertainty and civil conflict. A slowdown in trade had been expected after the strong rebound of 2010 but the earthquake in Japan and flooding in Thailand shook global supply chains, and fears of sovereign default in the euro area weighed heavily in the closing months of the year. 
The civil war in Libya also reduced oil supplies and contributed to sharply higher prices. All of these factors combined to produce below average growth in trade in 2011*" WTO report 2012, section "World trade in 2011" accessed on February 4th 2021 (https://www.wto.org/english/res_e/booksp_e/anrep_e/world_trade_report12_e.pdf) and (https://www.wto.org/english/res_e/publications_e/wtr12_e.htm) + +* About 1995-1996: "*The WTO was created in January 1995, upon the completion of the Uruguay Round which established new rules for international trade in goods, services and intellectual property.*" WTO 1996 press releases accessed on February 4th 2021 (https://www.wto.org/english/news_e/pres96_e/anrep.htm). + + +-------------------- + +# Bibliography + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. + +* R. C. Feenstra, R. E. Lipsey, H. Deng, A. C. Ma, and H. Mo. World trade flows: 1962-2000. Technical report, National Bureau of Economic Research, 2005. See also https://cid.econ.ucdavis.edu/wix.html. + +* United Nations Statistics Division. United nations commodity trade statistics database (un comtrade). http://comtrade.un.org/, 2015. + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* L. Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008. 
+ + + + diff --git a/vignettes/NetEmdTimeOrdering_cache/html/__packages b/vignettes/NetEmdTimeOrdering_cache/html/__packages new file mode 100644 index 00000000..6878d0d0 --- /dev/null +++ b/vignettes/NetEmdTimeOrdering_cache/html/__packages @@ -0,0 +1,4 @@ +base +netdist +igraph +pheatmap diff --git a/vignettes/NetEmdTimeOrdering_cache/html/unnamed-chunk-3_fa1cab3be9f1ecfbe870c0476447ff7d.RData b/vignettes/NetEmdTimeOrdering_cache/html/unnamed-chunk-3_fa1cab3be9f1ecfbe870c0476447ff7d.RData new file mode 100644 index 00000000..5dd8be3a Binary files /dev/null and b/vignettes/NetEmdTimeOrdering_cache/html/unnamed-chunk-3_fa1cab3be9f1ecfbe870c0476447ff7d.RData differ diff --git a/vignettes/NetEmdTimeOrdering_cache/html/unnamed-chunk-3_fa1cab3be9f1ecfbe870c0476447ff7d.rdb b/vignettes/NetEmdTimeOrdering_cache/html/unnamed-chunk-3_fa1cab3be9f1ecfbe870c0476447ff7d.rdb new file mode 100644 index 00000000..27b4c9f0 Binary files /dev/null and b/vignettes/NetEmdTimeOrdering_cache/html/unnamed-chunk-3_fa1cab3be9f1ecfbe870c0476447ff7d.rdb differ diff --git a/vignettes/NetEmdTimeOrdering_cache/html/unnamed-chunk-3_fa1cab3be9f1ecfbe870c0476447ff7d.rdx b/vignettes/NetEmdTimeOrdering_cache/html/unnamed-chunk-3_fa1cab3be9f1ecfbe870c0476447ff7d.rdx new file mode 100644 index 00000000..774543e5 Binary files /dev/null and b/vignettes/NetEmdTimeOrdering_cache/html/unnamed-chunk-3_fa1cab3be9f1ecfbe870c0476447ff7d.rdx differ diff --git a/vignettes/NetEmdTimeOrdering_files/figure-html/unnamed-chunk-2-1.png b/vignettes/NetEmdTimeOrdering_files/figure-html/unnamed-chunk-2-1.png new file mode 100644 index 00000000..bb3b6037 Binary files /dev/null and b/vignettes/NetEmdTimeOrdering_files/figure-html/unnamed-chunk-2-1.png differ diff --git a/vignettes/NetEmdTimeOrdering_files/figure-html/unnamed-chunk-6-1.png b/vignettes/NetEmdTimeOrdering_files/figure-html/unnamed-chunk-6-1.png new file mode 100644 index 00000000..736a6554 Binary files /dev/null and 
b/vignettes/NetEmdTimeOrdering_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/vignettes/NetdisGPStepByStep.Rmd b/vignettes/NetdisGPStepByStep.Rmd new file mode 100644 index 00000000..f7ee738f --- /dev/null +++ b/vignettes/NetdisGPStepByStep.Rmd @@ -0,0 +1,253 @@ +--- +title: "Netdis-GP: step by step" +#author: "Martin O'Reilly, Jack Roberts, Luis Ospina-Forero" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis-GP: step by step} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +editor_options: +chunk_output_type: console + +--- + +```{r, include = FALSE} + knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" + ) +``` + +# Introduction + +This vignette explains some inner calculations of Netdis-GP, which the user does not need to go through, however we explain them here for those users that may want to use Netdis in a modular fashion. Netdis-GP is a version of Netdis that uses the Polya-Aeppli/Geometric-Poisson approximation for the subgraph/graphlet count expectations. + +For a simple Netdis-GP function call see instead ["Simple and quick (default) usage 1: pairwise comparisons"](default_pairwise_usage.html). + +For other vignettes in this package see the ["Menu"](V-Menu.html). + +# What is Netdis-GP? +(Extracted from Ali et al. (2014) and Ospina-Forero et al. (2018)): Netdis counts small subgraphs $w$ on $k$ nodes for all 2-step ego-networks, $k=3,4,5$. These counts are centred by subtracting the expected number of counts $E_w$. These centred counts of each network are then compared thus leading to the Netdis statistic. + +In Netdis-GP, the expected number of counts $E_w$ is computed independently for each graph based on a Geometric-Poisson (GP) approximation for the distribution of the number of occurrences of subgraph $w$ in the respective graph. 
It assumes that $N_{w,i} \sim GP(\lambda^{\rho(i)}_k, \theta^{\rho(i)}_w)$, where $\lambda^{\rho(i)}_k$ is the Poisson parameter indexed by the size of subgraph $w$ and the density bin $\rho(i)$; and where $\theta^{\rho(i)}_w$ is the geometric parameter indexed by subgraph $w$ and density bin $\rho(i)$. $E_w(G, \rho(i))$ is taken as the mean of the GP approximation, i.e. $\lambda^{\rho(i)}_k/\theta^{\rho(i)}_w$. As $\lambda^{\rho(i)}_k$ and $\theta^{\rho(i)}_w$ are not known, they are estimated as follows: + +Let $x_{w,d}^j$ be the number of subgraphs $w$ on the 2-step ego-network $j$ of density bin $d$, and let +\[ +\bar{X}_{w,d}=\frac{1}{q} \sum_{j=1}^q x_{w,d}^j, \qquad V^2_{w,d}=\frac{1}{q-1} \sum_{j=1}^q (x_{w,d}^j - \bar{X}_{w,d})^2 +, +\] +where $q$ is the number of ego-networks in density bin $d$. Then, +\[ +\hat{\lambda}^{d}_{k}= \frac{1}{l} \sum_{h \in A(k)} \frac{2 (\bar{X}_{h,d})^2}{V^2_{h,d}+\bar{X}_{h,d}} , \qquad \hat{\theta}^{d}_w= \frac{2\bar{X}_{w,d}}{V^2_{w,d}+\bar{X}_{w,d}}, +\] +where $l$ is the number of connected subgraphs of size $k$, for example, $l=6$ for $k=4$. These estimators are based on the moment estimators of a GP random variable and the proposal made by Picard et al.(2008), where the total count of each individual subgraph could be thought as the sum of the total subgraph counts over multiple ``clumps'' of edges that appear across the network. + +This variant focuses on detecting more general and global discrepancies between the ego-network structures. + + +## Netdis and its variants are constructed as follows: + +(Extracted from Ali et al. (2014)): Let $N_{w,i}(G)$ be the number of induced occurrences of small graphs $w$ in the 2-step ego network of vertex $i$. Now, bin all 2-step ego-networks of network $G$ according to their network density. Let $E_w(G,\rho)$ be the expected number of occurrences of $w$ in an ego-network whose density falls in density bin $\rho$. 
For a given network $G$ compute the centred subgraph counts as +\[ +S_w(G)=\sum\limits_{i }{\bigg (N_{w,i}(G)- E_w(G, \rho(i)) \bigg )}, +\] +where $i$ is a node in $G$ and $\rho(i)$ the density bin of the 2-step ego-network of node $i$. + +Now, to compare networks $G_1$ and $G_2$, set +$$ +\displaystyle +netD_2^S(k) = \tfrac{1}{ \sqrt{ M(k)} } \sum\limits_{w \in A(k)} +\bigg ({ \tfrac{S_w(G_1) S_w(G_2)} {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +}\bigg ), \quad k=3,4, 5, +$$ +where $A(k)$ is the set of connected subgraphs of size $k$, and where $M(k)$ is a normalising constant so that $netD_2^S(k)\in[-1,1]$. $M(k)$ is equal to +\[ +M(k) = \sum\limits_{w \in A(k)} +\left( \tfrac{ S_w(G_1)^2 }{\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} \right) +\sum\limits_{w \in A(k)} +\left(\tfrac{ S_w(G_2)^2 } {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +\right) +. +\] +The corresponding Netdis statistic is defined as +$$Netdis(k)=netd_2^S(k)=\tfrac{1}{2}(1-netD_2^S(k)) \in [0,1].$$ +Small values of Netdis suggest higher `similarity' between the networks. By default Netdis uses subgraphs on $k=4$ nodes. + + + +# Load required libraries +```{r,message=FALSE} +# Load libraries +library("netdist") +library("igraph") +library("purrr") +``` + +# Netdis-GP step by step + +## Networks being compared +Generation of tree-like networks with 400 nodes and 1600 nodes. + +```{r, netwokrs,fig.align='center',fig.dim=c(6,6)} +# Create networks +set.seed(34) +gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) +gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) + +plot(gTree_1,vertex.size=0.8,vertex.label=NA) +plot(gTree_2,vertex.size=0.8,vertex.label=NA) +``` + + + +## Set Netdis parameters + +Netdis uses some mostly internal parameters that define the size of the subgraphs/graphlets that are going to be used, as well as the length of the ego neighbourhood, the minimum size considered for the resulting ego-networks, and finally some parameters that control an ego-network binning function. 
By default, Netdis considers the following setup: + +```{r} +# Maximum subgraph size to calculate counts and netdis statistic for +max_subgraph_size <- 4 + +# Ego-network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego-networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego-network density binning parameters. Here, the minimum number of ego-networks per bin and the starting number of bins +min_bin_count <- 5 +num_bins <- 100 +``` + + +## Obtain list of ego networks +One of the first steps in Netdis is the extraction of all ego-networks in each of the query networks: +```{r} +# Get ego-networks for query graphs +ego_1 <- make_named_ego_graph(gTree_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(gTree_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) +head(ego_1,n=2) +head(ego_2,n=2) +``` + +## Count the number of nodes and the subgraphs in the ego-networks of each graph ($N_w$) +Once the ego-networks are extracted, the subgraph counts for all ego-network are obtained for each network being compared: +```{r} +# Subgraphs counts for ego-networks in query graphs +subgraph_counts_1 <- ego_to_graphlet_counts(ego_networks = ego_1, max_graphlet_size = max_subgraph_size) +subgraph_counts_2 <- ego_to_graphlet_counts(ego_networks = ego_2, max_graphlet_size = max_subgraph_size) + +head(subgraph_counts_1) +head(subgraph_counts_2) +``` + + +### Obtain binning for each ego-network according to the edge-density of the ego-networks of each individual graph ($\rho(.)$) +For each ego-network, its edge density is obtained, then the ego-networks are binned according to their edge density. 
This process is repeated for each of the query networks: +```{r} +# Get ego-network densities +densities_1 <- ego_network_density(graphlet_counts = subgraph_counts_1) +densities_2 <- ego_network_density(graphlet_counts = subgraph_counts_2) + +# Adaptively bin ego-network densities +binned_densities_1 <- binned_densities_adaptive(densities = densities_1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +binned_densities_2 <- binned_densities_adaptive(densities = densities_2, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +str(binned_densities_1) +str(binned_densities_2) +``` + +## Calculate expected subgraph counts in each density bin by the Geometric-Poisson approximation ($E_w$) +With the ego-network binning obtained, the Geometric-Poisson approximation of the expected subgraph counts, $E_w$, can be obtained for each subgraph $w$ and each density bin: +```{r} +binned_gp_subgraph_counts_1 <- density_binned_counts_gp(graphlet_counts = subgraph_counts_1, + density_interval_indexes = binned_densities_1$interval_indexes, + max_graphlet_size = max_subgraph_size) + +binned_gp_subgraph_counts_2 <- density_binned_counts_gp(graphlet_counts = subgraph_counts_2, + density_interval_indexes = binned_densities_2$interval_indexes, + max_graphlet_size = max_subgraph_size) +binned_gp_subgraph_counts_1 +binned_gp_subgraph_counts_2 +``` + +## Obtain the centred counts for each subgraph on the query graphs based on the Geometric-Poisson approximation ($N_w(G) - E_w(G)$) +With $E_w$ now obtained, Netdis-GP, can be compute as per its construction by first centring the observed counts: +```{r} +# Calculate expected subgraph counts for each ego network +exp_gp_subgraph_counts_1 <- netdis_expected_counts(graphlet_counts = subgraph_counts_1, + density_breaks = binned_densities_1$breaks, + density_binned_reference_counts = binned_gp_subgraph_counts_1, + max_graphlet_size = max_subgraph_size, + scale_fn=NULL) + + +exp_gp_subgraph_counts_2 <- 
netdis_expected_counts(graphlet_counts = subgraph_counts_2, + density_breaks = binned_densities_2$breaks, + density_binned_reference_counts = binned_gp_subgraph_counts_2, + max_graphlet_size = max_subgraph_size, + scale_fn=NULL) + +# Centre subgraph counts by subtracting expected counts +centred_subgraph_counts_1 <- netdis_subtract_exp_counts(graphlet_counts = subgraph_counts_1, + exp_graphlet_counts = exp_gp_subgraph_counts_1, + max_graphlet_size = max_subgraph_size) + +centred_subgraph_counts_2 <- netdis_subtract_exp_counts(graphlet_counts = subgraph_counts_2, + exp_graphlet_counts = exp_gp_subgraph_counts_2, + max_graphlet_size = max_subgraph_size) + +head(centred_subgraph_counts_1) +head(centred_subgraph_counts_2) +``` +## Sum centred subgraph counts across all ego networks ($S_w(G)$) + +After the counts are centred, the total count is computed for each subgraph in each query network: +```{r} +sum_subgraph_counts_1 <- colSums(centred_subgraph_counts_1) +sum_subgraph_counts_1 + +sum_subgraph_counts_2 <- colSums(centred_subgraph_counts_2) +sum_subgraph_counts_2 +``` + +## Calculate Netdis-GP statistics +Finally, the total centred counts can be used to obtain the Netdis statistic: +```{r} + +netdis_result <- netdis_uptok(centred_graphlet_count_vector_1 = sum_subgraph_counts_1, + centred_graphlet_count_vector_2 = sum_subgraph_counts_2, + max_graphlet_size = max_subgraph_size) + +print(netdis_result) +``` + +------------------------- + + + + +# Bibliography + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* L. Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. 
+ +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008. + diff --git a/vignettes/NetdisStepByStep.Rmd b/vignettes/NetdisStepByStep.Rmd new file mode 100644 index 00000000..9b69845e --- /dev/null +++ b/vignettes/NetdisStepByStep.Rmd @@ -0,0 +1,287 @@ +--- +title: "Netdis: step by step" +#author: "Martin O'Reilly, Jack Roberts, Luis Ospina-Forero" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis: step by step} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +editor_options: +chunk_output_type: console + +--- + +```{r, include = FALSE} + knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" + ) +``` + +# Introduction + +This vignette explains some inner calculations of Netdis which the user does not need to go through, however, we explain them here for those users that may want to use Netdis in a modular fashion. + +For a simple Netdis function call see instead ["Simple and quick (default) usage 1: pairwise comparisons"](default_pairwise_usage.html). + +For other vignettes in this package see the ["Menu"](V-Menu.html). + +# What is Netdis? +(Extracted from Ali et al. (2014)): Netdis counts small subgraphs $w$ on $k$ nodes for all 2-step ego-networks, $k=3,4,5$. These counts are centred by subtracting the expected number of counts $E_w$. These centred counts of each network are then compared among one another thus leading to the Netdis statistic. + +The selection of a gold-standard graph as a substitute for $E_w$ can be done when such graph is known to be a good proxy for $E_w$, or alternatively as a good reference point for the comparison. This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard summarized in $E_w$. 
+ +## Netdis is defined as follows: + +Let $N_{w,i}(G)$ be the number of induced occurrences of small graphs $w$ in the 2-step ego-network of vertex $i$. Now, bin all 2-step ego-networks of network $G$ according to their network density. Let $E_w(G,\rho)$ be the expected number of occurrences of $w$ in an ego-network whose density falls in density bin $\rho$. For a given network $G$ compute the centred subgraph counts as +\[ +S_w(G)=\sum\limits_{i }{\bigg (N_{w,i}(G)- E_w(G, \rho(i)) \bigg )}, +\] +where $i$ is a node in $G$ and $\rho(i)$ the density bin of the 2-step ego-network of node $i$. + +Now, to compare networks $G_1$ and $G_2$, set +$$ +\displaystyle +netD_2^S(k) = \tfrac{1}{ \sqrt{ M(k)} } \sum\limits_{w \in A(k)} +\bigg ({ \tfrac{S_w(G_1) S_w(G_2)} {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +}\bigg ), \quad k=3,4, 5, +$$ +where $A(k)$ is the set of connected subgraphs of size $k$, and where $M(k)$ is a normalising constant so that $netD_2^S(k)\in[-1,1]$. $M(k)$ is equal to +\[ +M(k) = \sum\limits_{w \in A(k)} +\left( \tfrac{ S_w(G_1)^2 }{\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} \right) +\sum\limits_{w \in A(k)} +\left(\tfrac{ S_w(G_2)^2 } {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +\right) +. +\] +The corresponding Netdis statistic is defined as +$$Netdis(k)=netd_2^S(k)=\tfrac{1}{2}(1-netD_2^S(k)) \in [0,1].$$ +Small values of Netdis suggest higher `similarity' between the networks. By default Netdis uses subgraphs on $k=4$ nodes. + + +___ + +# Netdis step by step + +## Load required libraries +```{r,message=FALSE} +# Load libraries +library("netdist") +library("igraph") +library("purrr") +``` + + +## Networks being compared +Generation of tree-like networks with 400 nodes and 1600 nodes. 
+ +```{r, netwokrs,fig.align='center',fig.dim=c(6,6)} +# Create networks +set.seed(34) +gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) +gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) + +plot(gTree_1,vertex.size=0.8,vertex.label=NA) +plot(gTree_2,vertex.size=0.8,vertex.label=NA) +``` + + + +## Set Netdis parameters + +Netdis uses some mostly internal parameters that define the size of the subgraphs/graphlets that are going to be used, as well as the length of the ego neighbourhood, the minimum size considered for the resulting ego-networks, and finally some parameters that control an ego-network binning function. By default, Netdis considers the following setup: + +```{r} +# Maximum subgraph size to calculate counts and netdis statistic for +max_subgraph_size <- 4 + +# Ego-network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego-networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego-network density binning parameters. 
Here, the minimum number of ego-networks per bin and the starting number of bins +min_bin_count <- 5 +num_bins <- 100 +``` + + +## Obtain list of ego-networks + +One of the first steps in Netdis is the extraction of all ego-networks in each of the query networks: +```{r} +# Get ego-networks for query graphs +ego_1 <- make_named_ego_graph(gTree_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(gTree_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) +tail(ego_1,n=2) +tail(ego_2,n=2) +``` + +## Count the number of nodes and the subgraphs in ego-networks of query graphs ($N_w$) + +Once the ego-networks are extracted, the subgraph counts for all ego-networks are obtained for each network being compared: +```{r} +# Subgraphs counts for ego-networks in query graphs +subgraph_counts_1 <- ego_to_graphlet_counts(ego_networks = ego_1, max_graphlet_size = max_subgraph_size) +subgraph_counts_2 <- ego_to_graphlet_counts(ego_networks = ego_2, max_graphlet_size = max_subgraph_size) + +tail(subgraph_counts_1) +tail(subgraph_counts_2) +``` + +## Calculate expected subgraph counts + +In Netdis users can consider the following variants: + +* Expectation via a gold-standard network (original proposal). +* Expectation via the Geometric-Poisson / Polya-Aeppli approximation. +* No expectation (equivalent to setting the expectation equal to zero). + +In this vignette the complete steps are given for "Expectation via a gold-standard network". The step by step computation of Netdis-GP is in [Netdis-GP: step by step](NetdisGPStepByStep.html). + + + +### Expectation via a gold-standard network +For this case the user must provide the gold-standard network of their choosing. This network will be used as a comparison reference point by Netdis. + +The following considers an Erdős–Rényi random graph with `r 30^2` nodes (generated with the edge density of `gTree_2`) as the gold-standard. 
+```{r,fig.align='center',fig.dim=c(6,6)} +# Network used as gold-standard +gst_1 <- erdos.renyi.game(n = 30^2,p.or.m = graph.density(graph = gTree_2)) +plot(gst_1,vertex.size=0.8,vertex.label=NA) +``` + +### Obtain the gold-standard ego-network counts and their binning according to their edge-density ($\rho(.)$) + +To calculate the expected counts, $E_w$, the counts of the ego-networks of the gold-standard network need to be obtained first: +```{r} +# Obtain subgraph counts and binning for gold-standard +ego_gst_1 <- make_named_ego_graph(graph = gst_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +subgraph_counts_gst_1 <- ego_to_graphlet_counts(ego_networks = ego_gst_1, + max_graphlet_size = max_subgraph_size) + +head(subgraph_counts_gst_1) +``` + +Subsequently, these ego-networks are binned according to their edge density: +```{r} +densities_gst_1<- ego_network_density(graphlet_counts = subgraph_counts_gst_1) + +# Adaptively bin ego-network densities +binned_densities_gst_1 <- binned_densities_adaptive(densities = densities_gst_1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +str(binned_densities_gst_1) +``` + +### Obtain the scaled mean counts for each density bin for the gold-standard network (canonical $E_w$) + +$E_w$ is estimated based on the average subgraph counts of ego-networks per density bin for each given subgraph. However, as the query networks and the gold-standard may have different number of nodes, the counts of the gold-standard network are first scaled to a "standard" or "canonical" scale from which they can be scaled back towards networks of different sizes. 
The following code first shows the computation of the subgraph counts for the ego-networks in the gold-standard network with their corresponding scaling: + +```{r} +# Scale ego-network subgraph counts by dividing by the total number of k-tuples in the +# ego-network (where k is the subgraph size) +scaled_subgraph_counts_ref <- scale_graphlet_counts_ego(graphlet_counts = subgraph_counts_gst_1, + max_graphlet_size =max_subgraph_size) +str(scaled_subgraph_counts_ref) +``` + +Finally, the standard or canonical $E_w$ can be obtained by taking the average per bin of the scaled subgraph counts: +```{r} +# Average of the scaled reference subgraph counts in each density bin +ref_binned_canonical_subgraph_counts <- mean_density_binned_graphlet_counts(graphlet_counts = scaled_subgraph_counts_ref, density_interval_indexes = binned_densities_gst_1$interval_indexes) + +ref_binned_canonical_subgraph_counts +``` + +## Obtain the centred counts for each subgraph on the query graphs based on the gold-standard counts ($N_w(G) - E_w(G)$) + +After obtaining the average scaled subgraph counts per density bin, the subgraph counts of the query networks can be centred: +```{r} +# Scale the reference counts of the gold-standard network to the sizes of each of the query ego-networks. 
+exp_subgraph_counts_1 <- netdis_expected_counts(graphlet_counts = subgraph_counts_1, + density_breaks = binned_densities_gst_1$breaks, + density_binned_reference_counts = ref_binned_canonical_subgraph_counts, + max_graphlet_size = max_subgraph_size, + scale_fn=count_graphlet_tuples) + + +exp_subgraph_counts_2 <- netdis_expected_counts(graphlet_counts = subgraph_counts_2, + density_breaks = binned_densities_gst_1$breaks, + density_binned_reference_counts = ref_binned_canonical_subgraph_counts, + max_graphlet_size = max_subgraph_size, + scale_fn=count_graphlet_tuples) + +# Centre subgraph counts by subtracting expected counts +centred_subgraph_counts_1 <- netdis_subtract_exp_counts(graphlet_counts = subgraph_counts_1, + exp_graphlet_counts = exp_subgraph_counts_1, + max_graphlet_size = max_subgraph_size) + +centred_subgraph_counts_2 <- netdis_subtract_exp_counts(graphlet_counts = subgraph_counts_2, + exp_graphlet_counts = exp_subgraph_counts_2, + max_graphlet_size = max_subgraph_size) + +tail(centred_subgraph_counts_1) +tail(centred_subgraph_counts_2) +``` + + +## Sum centred subgraph counts across all ego-networks ($S_w(G)$) +After the counts are centred, the total count is computed for each subgraph in each query network: +```{r} +sum_subgraph_counts_1 <- colSums(centred_subgraph_counts_1) +sum_subgraph_counts_1 + +sum_subgraph_counts_2 <- colSums(centred_subgraph_counts_2) +sum_subgraph_counts_2 +``` + +## Calculate the Netdis statistics +Finally, the total centred counts can be used to obtain the Netdis statistic: +```{r} + +netdis_result <- netdis_uptok(centred_graphlet_count_vector_1 = sum_subgraph_counts_1, + centred_graphlet_count_vector_2 = sum_subgraph_counts_2, + max_graphlet_size = max_subgraph_size) + +print(netdis_result) +``` + + + + +------------------------- + + + + +# Bibliography + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* L. 
Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. + +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008. + + + + + diff --git a/vignettes/NewNetdisCustomisations.Rmd b/vignettes/NewNetdisCustomisations.Rmd new file mode 100644 index 00000000..ba6c49b6 --- /dev/null +++ b/vignettes/NewNetdisCustomisations.Rmd @@ -0,0 +1,221 @@ +--- +title: "Netdis: customizations" +#author: "Jack Roberts, Luis Ospina-Forero" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteEncoding{UTF-8} + %\VignetteIndexEntry{Netdis: customizations} + %\VignetteEngine{knitr::rmarkdown} +editor_options: +chunk_output_type: console +--- + +```{r, include = FALSE} + knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" + ) +``` + +# Introduction + +This vignette explains variants of Netdis via global parameter changes of the functions applying Netdis, for example `netemd_one_to_one` and `netdis_many_to_many`. For a more in-depth view of the inner steps of Netdis please check ["Netdis: step by step"](NetdisStepByStep.html) and ["Netdis-GP: step by step"](NetdisGPStepByStep.html). + +Similarly, to see how to use pre-computed properties see ["Simple and quick (default) usage 2: pre-computed properties"](PreComputedProps.html) + +For other vignettes in this package see the ["Menu"](V-Menu.html). 
+ + +# Compare multiple networks via Netdis + +## Load required packages/libraries +```{r, packages, message= FALSE} +library("netdist") +library("igraph") +library("pheatmap") +``` + +## Networks being compared + +Generation of regular grid, ring and tree-like networks with 400 nodes and 1600 nodes. A star-like graph is generated so it can be used as input for the reference graph in the option `ref_graph`. +```{r,netdisgoldstandnetworks ,fig.align='center'} +# Create lattice, Ring and Tree like networks of sizes 20^2 and 40^2. +# Create networks +set.seed(3171) +gLat_1 <- graph.lattice(c(20,20)) +gLat_2 <- graph.lattice(c(40,40)) +gRing_1 <- make_ring(20^2) +gRing_2 <- make_ring(40^2) +gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) +gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) + +# Note: Ring_2 must map to gRing_2 (the 40^2-node ring), otherwise the +# "Ring_2" comparisons would silently duplicate Ring_1. +glist <- list(Lat_1=gLat_1, Lat_2=gLat_2, Ring_1=gRing_1, Ring_2=gRing_2, Tree_1=gTree_1, Tree_2=gTree_2) + +# Create a random graph to be used as a gold-standard +gst <- igraph::as.undirected(graph.star(1000)) +``` + +# Variants of Netdis: Changing the expectation + +## Using Netdis with a reference graph as a proxy for $E_w$ +For this variant a reference graph or gold-standard graph has to be given in `ref_graph`: +```{r,netdisgoldstand,fig.align='center',fig.dim=c(7,7)} +netdis_mat_gst <- netdis_many_to_many(graphs = glist, + ref_graph = gst + ) +netdis.plot(netdislist = netdis_mat_gst,whatrow = 2, main = "Netdis with reference graph") +``` + +## Using Netdis with a constant valued expectation, $E_w=k$ + +For this variant please set `ref_graph` to the desired constant $k$. In this example we consider $k=0$ and $k=5$. 
Considering $k=0$ is equivalent to computing Netdis without background expectations: +```{r,netdisconstant,fig.align='center',fig.dim=c(7,7)} +netdis_mat_zero <- netdis_many_to_many(graphs = glist, + ref_graph = 0 + ) +netdis.plot(netdislist = netdis_mat_zero,whatrow = 2, main = "Netdis Ew=0") + +netdis_mat_5 <- netdis_many_to_many(graphs = glist, + ref_graph = 5 + ) +netdis.plot(netdislist = netdis_mat_5,whatrow = 2, main = "Netdis Ew=5") +``` + + +## Using Netdis-GP, Geometric-Poisson approximation for $E_w$ +In order to obtain the Netdis-GP variant set `ref_graph=NULL` (default). +```{r,fig.align='center',fig.dim=c(7,7)} +netdisgp_mat <- netdis_many_to_many(graphs = glist, + ref_graph = NULL + ) +netdis.plot(netdisgp_mat, whatrow = 2, main = "Netdis-GP") +``` + +--------------------------- + +# Customizing the ego-network binning function + +The ego-network binning function takes as argument a vector of the ego-network edge densities. The function should return a named list that includes: the input `densities`, the resulting bin `breaks` (vector of density bin limits), and the vector `interval_indexes`, which states to what bin each of the individual elements in `densities` belongs to. See for example: +```{r,fig.align='center',fig.dim=c(7,7)} +# Create an ego-network edge density binning function. 
+mybinning <- function(densities) { + min_counts_per_interval <- 5 + num_intervals <- 3 + # + if( length(densities) < min_counts_per_interval) min_counts_per_interval <- length(densities) + breaks <- adaptive_breaks(densities, + min_count = min_counts_per_interval, + breaks = num_intervals + ) + interval_indexes <- interval_index(densities, + breaks = breaks, + out_of_range_intervals = FALSE + ) + list( + densities = densities, + interval_indexes = interval_indexes, + breaks = breaks + ) +} + +# Let us see an example output of the binning function +binning_example <- mybinning(runif(20)) +binning_example$breaks +binning_example$interval_indexes +binning_example$densities + +# Calculate Netdis +netdisgp_mat_mybin <- netdis_many_to_many(graphs = glist, + ref_graph = NULL, + binning_fn = mybinning + ) + +netdis.plot(netdislist = netdisgp_mat_mybin,whatrow = 2, main = "Netdis-GP with mybinning") +``` + +Note that whenever $E_w$ is taken as a constant value, then the binning will not have an effect on the computation of Netdis. + +--------------------------- + +# Other simple customization of the Netdis computation + +## Changing default function parameter values in the computation of Netdis + +The following shows possible changes to the subgraph/graphlet size, the radius of the ego-networks considered, the minimum number of nodes and edges that are allowed in the ego-networks used in Netdis, and finally, some parameters to modify the ego-network binning. The following are the default values: +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego-network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 +``` + +These values can be directly imputed and changed into the shortcut Netdis function calls. However, not all combinations may be possible. 
The following shows the use of subgraphs up to size 4, with 3-step ego-networks and where only ego-networks with at least 5 nodes and 4 edges can be considered. Furthermore, the binning of the ego-networks will be set to start with 20 bins and each bin will be required to have at least 10 elements. + +```{r,fig.align='center',fig.dim=c(7,7)} +# (We only recommend changing these default values for those users that have a clear understanding of graph theory behind it) +#(change values with care as not all combinations may be possible). + +#Defining a new binning function: +binning_fn <- purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 10, #10-egos required per bin + num_intervals = 20) #Start binning with 20 bins + +#Changing parameter values in Netdis: +netdisgp_mat_custom <- netdis_many_to_many(graphs = glist, + ref_graph = NULL, + max_graphlet_size = 4, #Subgraphs/graphlets up to size 4 considered. + neighbourhood_size = 3,# 3-step ego-networks + min_ego_nodes = 5, #ego-networks with at least five nodes + min_ego_edges = 4, #ego-networks with at least 4 edges + binning_fn = binning_fn #Providing a custom binning function + ) +``` + +Here the default parameters are used, and a heatmap of the result of Netdis with default parameters and Netdis with the previously modified parameters is given: +```{r ,fig.align='center',fig.dim=c(7,7)} +#Default binning +binning_fn <- purrr::partial(binned_densities_adaptive, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) +#Default computation of Netdis +netdisgp_mat <- netdis_many_to_many(graphs = glist, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn + ) +netdis.plot(netdislist = netdisgp_mat,whatrow = 2, main = "Netdis-GP: Default parameter values") +netdis.plot(netdislist = netdisgp_mat_custom,whatrow = 2, main = "Netdis-GP: illustrative 
parameter changes") + +``` + + +------------------------- + + + + +# Bibliography + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. + +* L. Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + + +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008. + + diff --git a/vignettes/PreComputedProps.Rmd b/vignettes/PreComputedProps.Rmd new file mode 100644 index 00000000..815f7e85 --- /dev/null +++ b/vignettes/PreComputedProps.Rmd @@ -0,0 +1,219 @@ +--- +title: "Simple and quick (default) usage 2: pre-computed properties" +#author: "Luis Ospina-Forero" +date: "23/06/2020" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{PreComputedProps} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +# Introduction + +In many situations there may be a need to pursue the network comparisons from pre-computed counts/features. Working from pre-computed features decreases the computational time to obtain results, particularly when the same networks are involved in multiple and subsequent comparisons, or when the same properties are used in other procedures. Similarly, advance users, could consider using other sets of pre-computed features, particularly for NetEmd. + +This vignette follows similar examples as ["Simple and quick (default) usage 1: pairwise comparisons"](default_pairwise_usage.html). 
However, this vignette shows the steps required to obtain counts/features and then how to use them in the network comparison methods. + +For other vignettes in this package see the ["Menu"](V-Menu.html). + +# Load required packages/libraries +```{r, packages, message= FALSE} +# Load packages/libraries +library("netdist") +library("igraph") +``` + +# Compare two networks via NetEmd + +The following sections show how to extract the subgraph counts, or other relevant features prior to running NetEmd or Netdis. Then it shows how to call NetEmd or Netdis using the pre-computed features. + +## Comparing two graphs with NetEmd via subgraph counts. + +NetEmd and Netdis use subgraph counts, however, NetEmd takes counts directly from the observed graph. Netdis takes counts from ego-networks of the observed graph. The following example shows how the counts are extracted for NetEmd via `count_orbits_per_node`. The user can call NetEmd by directly providing the matrix of counts, or by providing a processed version of these counts via `graph_features_to_histograms`. 
+ +```{r, netemd,fig.align='center',fig.dim=c(8,4)} +# Create lattice networks +gLat_1 <- igraph::graph.lattice(c(20,20)) +gLat_2 <- igraph::graph.lattice(c(44,44)) + +par(mfrow=c(1,2)) +plot(gLat_1,vertex.size=4,vertex.label=NA) +plot(gLat_2,vertex.size=4,vertex.label=NA) +``` + +The simple computation of NetEmd without pre-computed features: +```{r} +netemd_one_to_one(graph_1=gLat_1,graph_2=gLat_2,feature_type="orbit",smoothing_window_width = 1) +``` + + +### Providing a matrix of network features +```{r} +counts_1= count_orbits_per_node(graph = gLat_1,max_graphlet_size = 5) +counts_2= count_orbits_per_node(graph = gLat_2,max_graphlet_size = 5) +head(counts_1[,1:4]) + +netemd_one_to_one(dhists_1=counts_1, dhists_2=counts_2,smoothing_window_width = 1) +``` + +### Providing the network features as lists of dhist objects +```{r} +dhists_1<- graph_features_to_histograms(features_matrix = counts_1) +dhists_2<- graph_features_to_histograms(features_matrix = counts_2) + +netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2,smoothing_window_width = 1) +``` +___ + +## Comparing two graphs with NetEmd via their Laplacian spectrum + +Computation of the Laplacian and Normalized Laplacian: +```{r, netemdEigen} +# Networks +gLat_1 <- graph.lattice(c(20,20)) +gLat_2 <- graph.lattice(c(44,44)) + +#Laplacian +Lapg_1 <- igraph::laplacian_matrix(graph = gLat_1,normalized = FALSE,sparse = FALSE) +Lapg_2 <- igraph::laplacian_matrix(graph = gLat_2,normalized = FALSE,sparse = FALSE) + +#Normalized Laplacian +NLapg_1 <- igraph::laplacian_matrix(graph = gLat_1,normalized = TRUE,sparse = FALSE) +NLapg_2 <- igraph::laplacian_matrix(graph = gLat_2,normalized = TRUE,sparse = FALSE) + + +# Providing a matrix of network features (e.g. Spectra). (This may take a couple of minutes). 
+spec_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) +spec_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) + +head(spec_1) +``` + +Similarly to counts, all other features can be given as a matrix or as dhist objects: +```{r} +netemd_one_to_one(dhists_1 = spec_1,dhists_2 = spec_2, smoothing_window_width = 0) + + +# Providing pre-computed dhist objects from network features +dhists_1<- graph_features_to_histograms(spec_1) +dhists_2<- graph_features_to_histograms(spec_2) + +netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2, smoothing_window_width = 0) +``` +------------------------- + +# Compare two networks via Netdis and its variants + +Netdis uses counts from the resulting ego-networks of each of the nodes in a graph, but it also considers the mean or expectation of these ego-network counts ($E_w$). The following example shows how the user can compute and provide the properties required from each graph as well as any relevant property required to calculate $E_w$. The main properties can be obtained by means of `count_graphlets_ego`. + + +## Using Netdis with a gold-standard or reference graph to obtain $E_w$ + +The selection of a gold-standard graph as a substitute for $E_w$ could be done when such graph is known to be a good proxy for $E_w$, or alternatively as a good reference point for the comparison. This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard graph and which is summarized in $E_w$. + +```{r,netdisgoldstand} +# Set source directory for virus protein-protein interaction edge files stored in the netdist package. 
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as igraph objects +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") +``` + +For illustration purposes consider the lattice networks as possible gold-standard networks: +```{r,fig.align='center'} +# Lattice graphs to be used as gold-standard as a reference point comparison +goldstd_1 <- igraph::graph.lattice(c(20,20)) #Graph with 20^2 nodes +goldstd_2 <- igraph::graph.lattice(c(44,44)) #Graph with 44^2 nodes +``` + +Now obtain the subgraph counts for all networks. +```{r} +# Providing pre-calculated subgraph counts. +props_1 <- count_graphlets_ego(graph = graph_1) +props_2 <- count_graphlets_ego(graph = graph_2) +props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) +props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) +``` + +Compute Netdis using the pre-computed counts and any of the example gold-standard networks. +```{r} +netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_1) +netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_2) +``` + +Comparison to the result of Netdis without pre-computed counts. +```{r} +# Netdis using the goldstd_1 graph as gold-standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) + +# Netdis using the goldstd_2 graph as gold-standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) +``` + + +## Netdis-GP: Using a Geometric-Poisson approximation + +This Netdis variant focuses on detecting more meso-level discrepancies between the ego-network structures. + +```{r, netdisGP} +# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. 
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as igraph objects +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") + +# Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) + +# Providing pre-calculated subgraph counts. +props_1 <- count_graphlets_ego(graph = graph_1) +props_2 <- count_graphlets_ego(graph = graph_2) + +netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = NULL) +``` + + +## Using Netdis with no background expectation ($E_w=0$) +Comparing the networks via their observed ego counts without centring them, (equivalent to using expectation equal to zero). This variant focuses on detecting small discrepancies between the networks. + + +```{r,netdiszero} +# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as igraph objects +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") + +#Netdis using no expectations (or equivalently, expectation equal to zero). +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) + +# Providing pre-calculated subgraph counts. +props_1 <- count_graphlets_ego(graph = graph_1) +props_2 <- count_graphlets_ego(graph = graph_2) + +netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = 0) +``` + +------------------------- + + +# Bibliography + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* L. Ospina-Forero, C. M. Deane, and G. 
Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. + +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008. diff --git a/vignettes/Quick_start_net_dis.R b/vignettes/Quick_start_net_dis.R deleted file mode 100644 index c7bda001..00000000 --- a/vignettes/Quick_start_net_dis.R +++ /dev/null @@ -1,68 +0,0 @@ -## ------------------------------------------------------------------------ -library("netdist") -edge_format = "ncol" -# Load reference graph (used for Netdis. Not required for NetEMD) -ref_path = file.path(system.file(file.path("extdata", "random"), package = "netdist"), "ER_1250_10_1") -ref_graph <- read_simple_graph(ref_path, format = edge_format) - -# Set source directory and file properties for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -edge_format = "ncol" -file_pattern = "*" - -# Load all graphs in the source folder matching the filename pattern -query_graphs <- read_simple_graphs(source_dir, format = edge_format, - pattern = file_pattern) -print(names(query_graphs)) - -## ------------------------------------------------------------------------ -# Set the maximum graphlet size to compute counts for -max_graphlet_size <- 4 -neighbourhood_size <- 2 - -## ------------------------------------------------------------------------ -expected_count_fn <- netdis_expected_graphlet_counts_ego_fn( - ref_graph, max_graphlet_size, neighbourhood_size) - -## ------------------------------------------------------------------------ -centred_counts <- purrr::map(query_graphs, netdis_centred_graphlet_counts, - max_graphlet_size = 
max_graphlet_size, - neighbourhood_size = neighbourhood_size, - expected_ego_count_fn = expected_count_fn) - -## ------------------------------------------------------------------------ -# Netdis measure for graphlets of size 3 -res3 <- netdis_for_all_graphs(centred_counts, 3) -netdis3_mat <- cross_comp_to_matrix(res3$netdis, res3$comp_spec) -# Netdis measure for graphlets of size 4 -res4 <- netdis_for_all_graphs(centred_counts, 4) -netdis4_mat <- cross_comp_to_matrix(res4$netdis, res4$comp_spec) -netdis4_mat - -## ------------------------------------------------------------------------ -graphdists<-as.dist(netdis4_mat) -par(mfrow=c(1,2)) -cex=1 -# Dendrogram based on Netdis measure for graphlets of size 3 -title = paste("Netdis: graphlet size = ", 3, sep = "") -plot(phangorn::upgma(as.dist(netdis3_mat), method="average"), use.edge.length=FALSE, - edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, - cex.sub=cex, cex=cex) -# Dendrogram based on Netdis measure for graphlets of size 4 -title = paste("Netdis: graphlet size = ", 4, sep = "") -plot(phangorn::upgma(as.dist(netdis4_mat), method="average"), use.edge.length=FALSE, - edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, - cex.sub=cex, cex=cex) - -## ------------------------------------------------------------------------ -cex=1.5 -col <- colorRampPalette(colors = c("blue","white"))(100) -title = paste("Netdis: graphlet size = ", 3, sep = "") -heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) - -## ------------------------------------------------------------------------ -cex=1.5 -col <- colorRampPalette(colors = c("blue","white"))(100) -title = paste("Netdis: graphlet size = ", 4, sep = "") -heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) - diff --git a/vignettes/Quick_start_net_dis.html b/vignettes/Quick_start_net_dis.html deleted file mode 100644 
index a37c51ab..00000000 --- a/vignettes/Quick_start_net_dis.html +++ /dev/null @@ -1,385 +0,0 @@ - - - - - - - - - - - - - - - - -Quick start guide for Netdis - - - - - - - - - - - - - - - - - -

Quick start guide for Netdis

-

Martin O’Reilly

-

2018-02-07

- - - -
-

Virus PPI example for Netdis

-
-

Load graphs

-

Use read_simple_graphs to read graph data from all files in a directory that match a specific filename pattern in a format suitable for calculating graphlet-based feature counts using the ORCA package. We use igraph::read_graph to read graph data from files, so support all file formats it supports. See help for igraph::read_graph for a list of supported values for the format parameter and the igraph documentation for descriptions of each of the supported file formats.

-

The ORCA package we use to efficiently calculate graphlet and orbit counts requires that graphs are undirected, simple (i.e. have no self-loops or multiple edges) and connected (i.e. have no isolated vertices). Therefore, by default, graphs loaded by read_simple_graphs will be coerced to have the above properties. This can be avoided by setting the relevant as_undirected, remove_loops, remove_multiple or remove_isolates parameters to FALSE.

- -
## [1] "EBV"   "ECL"   "HSV-1" "KSHV"  "VZV"
-

In this example we will use counts of graphlets containing up to 4 nodes and consider ego-network neighbourhoods of size 2 (i.e. the immediate neighbours of each node plus their immediate neighbours).

- -
-
-
-

Generate a function to generate expected graphlet counts

-

Use netdis_expected_graphlet_counts_ego_fn to generate a function that calculates expected ego-network graphlet counts for query graphs based on the statistics of a provided reference graph.

- -
-
-

Generate centred graphlet counts for a set of query graphs

- -
-
-

Generate NetDis measures between each pair of query graphs

- -
##              EBV       ECL      HSV-1       KSHV        VZV
-## EBV   0.00000000 0.1749835 0.16526412 0.01969246 0.15971116
-## ECL   0.17498347 0.0000000 0.29176120 0.22155786 0.41716144
-## HSV-1 0.16526412 0.2917612 0.00000000 0.07602426 0.03434187
-## KSHV  0.01969246 0.2215579 0.07602426 0.00000000 0.13115524
-## VZV   0.15971116 0.4171614 0.03434187 0.13115524 0.00000000
-
-
-

Generate dendrograms

- -

- -

- -

-
- - - - - - - - diff --git a/vignettes/Quick_start_net_emd.html b/vignettes/Quick_start_net_emd.html deleted file mode 100644 index 6519a226..00000000 --- a/vignettes/Quick_start_net_emd.html +++ /dev/null @@ -1,157 +0,0 @@ - - - - - - - - - - - - - - - - -Quick start guide for NetEMD - - - - - - - - - - - - - - - - - -

Quick start guide for NetEMD

-

Martin O’Reilly

-

2017-06-05

- - - -
-

Virus PPI example for NetEMD

-
library("netdist")
-# Set source directory and file properties for Virus PPI graph edge files
-source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
-edge_format = "ncol"
-file_pattern = ".txt"
-
-# Calculate graphlet-based degree distributions for all orbits in graphlets 
-# comprising up to 4 nodes for all graphs. This only needs to be done once 
-# per graph (feature_type = "orbit", max_graphlet_size = 4).. 
-# If feature_type is set to "feature_type", orbit counts for orbits in the
-# same graphlet will be summed to generate graphlet counts
-# If max_graphlet_size is set to 5, graphlet-based degree distributions will  
-# be calculated for graphlets comprising up to 5 nodes.
-virus_gdds <- gdd_for_all_graphs(
-  source_dir = source_dir, format = edge_format, pattern = file_pattern, 
-  feature_type = "orbit", max_graphlet_size = 4)
-names(virus_gdds)
-
## [1] "EBV"   "ECL"   "HSV-1" "KSHV"  "VZV"
-
# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- 
-# based degree distributions using the default fast "optimise" method and no
-# smoothing (default). The "optimise" method uses the built-in R optimise
-# function to efficiently find the offset with the minimum EMD, but is not
-# guaranteed to find the global minimum if EMD as a function of offset
-# is non-convex and/or multimodal. The smoothing window width determines 
-# whether to calculate the NetEMD from the unaltered discrete GDD histograms
-# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" 
-# smoothing by "smearing" the discrete GDD histogram point masses across bins 
-# of unit width (smoothing_window_width = 1). Returns a named list containing:
-# (i) the NetEMDs and (ii) a table containing the graph names and indices 
-# within the input GDD list for each pair of graphs compared.
-res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0)
-
-# You can also specify method = "fixed_step" to use the much slower method of 
-# exhaustively evaluating the EMD at all offsets separated by a fixed step. 
-# The default step size is 1/2 the the minimum spacing between locations in 
-# either histogram after normalising to unit variance. However, you can 
-# specifiy your own fixed step using the optional "step_size" parameter.
-# Note that this step size is applied to the histograms after they have been 
-# normalised to unit variance
-
-# Convert to matrix for input to dendrogram method
-netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec)
-netemd_mat
-
##             EBV       ECL     HSV-1      KSHV       VZV
-## EBV   0.0000000 0.4876042 0.1662898 0.1607299 0.1994619
-## ECL   0.4876042 0.0000000 0.3986298 0.4024202 0.4029356
-## HSV-1 0.1662898 0.3986298 0.0000000 0.1581559 0.2164026
-## KSHV  0.1607299 0.4024202 0.1581559 0.0000000 0.2323955
-## VZV   0.1994619 0.4029356 0.2164026 0.2323955 0.0000000
-
cex=1
-# Dendrogram based on Netdis measure for graphlets of size 3
-title = paste("Netdis: graphlet size = ", 4, sep = "")
-plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, 
-     edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, 
-     cex.sub=cex, cex=cex)
-

-
# The gdd_for_all_graphs and net_emds_for_all_graphs functions will run in 
-# parallel using multiple threads where supported. The number of threads
-# used is determined by the global R option "mc.cores". You can inspect the 
-# current value of this using options("mc.cores") and set it with 
-# options("mc.cores" = <num_cores>). To fully utilise a modern consumer
-# processor, this should be set to 2x the number of available processor 
-# cores as each core supports two threads.
-
- - - - - - - - diff --git a/vignettes/V-Menu.Rmd b/vignettes/V-Menu.Rmd new file mode 100644 index 00000000..51c00df1 --- /dev/null +++ b/vignettes/V-Menu.Rmd @@ -0,0 +1,55 @@ +--- +title: "Netdis Vignette's Menu" +date: "`10-06-2020`" +#author: "Luis Ospina-Forero" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{V-Menu} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +# Introductory vignettes + +This package contains multiple vignettes that illustrate the simple ways of employing the network comparison methods 'Netdis', 'NetEmd' and their variants (e.g. `Netdis-GP'), and also the large flexibility of these methodologies. The following is a list of the available vignettes: + +* [Simple and quick (default) usage 1: pairwise comparisons](default_pairwise_usage.html). For comparisons between two networks. + +* [Simple and quick (default) usage 2: precomputed properties](PreComputedProps.html). For comparisons that use pre-computed properties. This may be useful if a user specific property is to be used; but also when the user wants to save computation time in a setting of multiple comparisons against the same network. + +* [Simple and quick (default) usage 3: many to many comparisons](ManyToMany.html). User friendly options to retrieve all pairwise comparisons among a set of $n$ networks. + +* ["Netdis: Protein interaction networks"](dendrogram_example_net_dis.html). Shows a potential application of Netdis for real world protein interaction networks. + +* ["NetEmd: World trade networks"](NetEmdTimeOrdering.html). Shows a potential application of NetEmd using world trade networks. + + +## Some more details of Netdis + +* ["Netdis: customizations"](NewNetdisCustomisations.html). Shows general parameter changes of Netdis. + +* ["Netdis: step by step"](NetdisStepByStep.html). Shows a breakdown of the Netdis computation. 
+ +* ["Netdis-GP: step by step"](NetdisGPStepByStep.html). Shows a breakdown of the Netdis-GP variant computation. + +------------------------- + + + + +# Bibliography + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. + +* L. Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008. diff --git a/vignettes/default_pairwise_usage.Rmd b/vignettes/default_pairwise_usage.Rmd new file mode 100644 index 00000000..74db95f1 --- /dev/null +++ b/vignettes/default_pairwise_usage.Rmd @@ -0,0 +1,229 @@ +--- +title: "Simple and quick (default) usage 1: pairwise comparisons" +date: "`10-06-2020`" +#author: "Luis Ospina-Forero" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{default_pairwise_usage} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( +collapse = TRUE, +comment = "#>" +) +``` + +# Introduction + +The `netdist` package currently considers two broad methodologies for network comparison, namely Netdis and NetEmd. Netdis considers multiple variants (via background expectations) to capture the dissimilarity between the local structure of networks exhibited by the occurrence of small subgraphs on 3, 4 and 5 nodes. NetEmd is also a method to capture the dissimilarity between networks using subgraph counts, but it has in addition been defined for any type of network features; for example eigen distributions. 
The variants of Netdis are controlled by the input selected for the background expectations, whereas the variants of NetEmd are controlled indirectly by the user by the selection of the network features being compared via NetEmd (by default this package uses subgraph counts as features). + +The following shows a quick introduction to the simplest functions of the package, and to some of the variants of Netdis and NetEmd. + +For other vignettes in this package see the ["Menu"](V-Menu.html). + + +# Load required packages/libraries +```{r, packages, message= FALSE} +# Load packages/libraries +library("netdist") +library("igraph") +``` + +## Load protein interaction graphs included in the netdist package +The `netdist` package also includes examples of a few real networks. These are protein interaction networks (PPI) of a few _Herpes_ virus and of the bacteria _Escherichia Coli_ . In these networks, each node represents a protein and each link represents an interaction between proteins. See `help(virusppi)`. + +Although the `virusppi` list of PPI networks is loaded along with the `netdist` package, the following code shows how to read a network data from a file in disk: +```{r, graphs,fig.align='center',fig.dim=c(6,6)} +# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as undirected igraph objects, with no loops, multiple edges or degree zero nodes. +graph_1 <- read_simple_graph(file = file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file = file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +graph_1 +#Note this graph is the same as +# virusppi$EBV + +# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. 
+graph_2 +#Note this graph is the same as +# virusppi$ECL + +#A simple visualization of the graphs. +plot(graph_1,vertex.size=4,vertex.label=NA) +plot(graph_2,vertex.size=4,vertex.label=NA) +``` + +Other networks loaded in this package are discussed in ["NetEmd: World trade networks"](NetEmdTimeOrdering.html). You can also see `?virusppi` and `?worldtradesub`. + +The next two sections will provide a basic introduction and usage of both NetEmd and Netdis. + +# Compare two networks via NetEmd + +## What is NetEmd? +(Extracted from Wegner et al. (2017)): +NetEmd is based on the idea that the information encapsulated in the shape of the degree distribution and other network properties reflects the topological organization of the network. From an abstract point of view, NetEmd views the shape of a distribution as a property that is invariant under linear deformations i.e$.$ translations and re-scalings of the axis. + +Based on the previous ideas NetEmd uses the following measure between distributions $p$ and $q$ that are supported on $\mathbb{R}$ and have non-zero, finite variances: +\begin{equation}\label{emdmet} +EMD^*(p,q)=\mathrm{inf}_{c\in\mathbb{R}}\left( EMD\big(\tilde{p}(\cdot+c),\tilde{q}(\cdot)\big)\right), +\end{equation} +where $EMD$ is the earth mover's distance and $\tilde{p}$ and $\tilde{q}$ are the distributions obtained by rescaling $p$ and $q$ to have variance 1. More precisely, $\tilde{p}$ is the distribution obtained from $p$ by the transformation $x\rightarrow \frac{x}{\sigma(p)}$, where $\sigma(p)$ is the standard deviation of $p$. For probability distributions $p$ and $q$ with support in $\mathbb{R}$ and bounded absolute first moment, the $EMD$ between $p$ and $q$ is given by $EMD(p,q)=\int_{-\infty}^\infty|F(x)-G(x)|\,\mathrm{d}x$, where $F$ and $G$ are the cumulative distribution functions of $p$ and $q$ respectively. 
+ +Now, for two networks $G$ and $G'$ and for a given set $T=\{t_1,t_2,...,t_m\}$ of network features, the $NetEmd$ measure corresponding to $T$ is: +\begin{equation}\label{eq:def_netemd} +NetEmd_T(G,G')=\frac{1}{m}\sum_{j=1}^{m} NetEmd_{t_j} (G,G'), +\end{equation} +where +\begin{equation} +NetEmd_{t_i} (G,G')=EMD^*(p_{t_i}(G),p_{t_i}(G')), +\end{equation} +and where $p_{t_i}(G)$ and $p_{t_i}(G')$ are the distributions of ${t_i}$ on $G$ and $G'$ respectively. $NetEmd_{t_i}$ can be shown to be a pseudometric between graphs for any feature $t$, that is it is non-negative, symmetric and satisfies the triangle inequality. + + +## Comparing two graphs with NetEmd +```{r, netemd,fig.align='center'} +# Set source directory for Virus protein-protein interaction network edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as igraph objects +# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# One to one NetEmd comparison. +netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",smoothing_window_width = 1)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. 
+``` + +## Comparing two graphs with NetEmd via their Laplacian spectrum +```{r, netemdEigen,fig.align='center'} +#Laplacian +Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE) +Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE) + +#Normalized Laplacian +NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE) +NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE) + +#Spectra (this may take a couple of minutes). +props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) +props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) + +head(props_1,n=3) +head(props_2,n=3) + +netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. +``` +------------------------- + +# Compare two networks via Netdis and its variants + +## What is Netdis? +(Extracted from Ali et al. (2014)): Netdis counts small subgraphs $w$ on $k$ nodes for all 2-step ego-networks, $k=3,4,5$. These counts are centred by subtracting the expected number of counts $E_w$. These centred counts of each network are then compared thus leading to the Netdis statistic. + +Netdis is constructed as follows: + +Let $N_{w,i}(G)$ be the number of induced occurrences of small graphs $w$ in the 2-step ego network of vertex $i$. Now, bin all 2-step ego-networks of network $G$ according to their network density. Let $E_w(G,\rho)$ be the expected number of occurrences of $w$ in an ego-network whose density falls in density bin $\rho$. For a given network $G$ compute the centred subgraph counts as +\[ +S_w(G)=\sum\limits_{i }{\bigg (N_{w,i}(G)- E_w(G, \rho(i)) \bigg )}, +\] +where $i$ is a node in $G$ and $\rho(i)$ the density bin of the 2-step ego-network of node $i$. 
+ +Now, to compare networks $G_1$ and $G_2$, set +$$ +\displaystyle +netD_2^S(k) = \tfrac{1}{ \sqrt{ M(k)} } \sum\limits_{w \in A(k)} +\bigg ({ \tfrac{S_w(G_1) S_w(G_2)} {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +}\bigg ), \quad k=3,4, 5, +$$ +where $A(k)$ is the set of connected subgraphs of size $k$, and where $M(k)$ is a normalising constant so that $netD_2^S(k)\in[-1,1]$. $M(k)$ is equal to +\[ +M(k) = \sum\limits_{w \in A(k)} +\left( \tfrac{ S_w(G_1)^2 }{\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} \right) +\sum\limits_{w \in A(k)} +\left(\tfrac{ S_w(G_2)^2 } {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +\right) +. +\] +The corresponding Netdis statistic is defined as +$$Netdis(k)=netd_2^S(k)=\tfrac{1}{2}(1-netD_2^S(k)) \in [0,1].$$ +Small values of Netdis suggest higher `similarity' between the networks. By default Netdis uses subgraphs on $k=4$ nodes. + + +## Using Netdis with a gold-standard graph to obtain $E_w$ + +The selection of a gold-standard graph as a substitute for $E_w$ could be done when such graph is known to be a good proxy for $E_w$, or alternatively as a good reference point for the comparison. This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard graph and which is summarized in $E_w$. 
+ +```{r,netdisgoldstand,fig.align='center',fig.dim=c(6,6)} +# Lattice graphs to be used as a gold-standard reference point +goldstd_1 <- igraph::graph.lattice(c(8,8)) #Graph with 8^2 nodes +goldstd_2 <- igraph::graph.lattice(c(44,44)) #Graph with 44^2 nodes + +plot(goldstd_1,vertex.size=4,vertex.label=NA) +plot(goldstd_2,vertex.size=4,vertex.label=NA) + + +# Netdis using the goldstd_1 graph as gold-standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) + +# Netdis using the goldstd_2 graph as gold-standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) +``` + +## Netdis-GP: Using a Geometric-Poisson approximation + +(Extracted from Ospina-Forero et al. (2018)): Instead of considering an approximation based on an observed gold-standard network whose selection may be difficult, $E_w$ is computed independently for each graph, based on a Geometric-Poisson (GP) approximation of the distribution of the number of occurrences of subgraph $w$. It assumes that $N_{w,i} \sim GP(\lambda^{\rho(i)}_k, \theta^{\rho(i)}_w)$, where $\lambda^{\rho(i)}_k$ is the Poisson parameter indexed by the size of subgraph $w$ and the density bin $\rho(i)$; and where $\theta^{\rho(i)}_w$ is the geometric parameter indexed by subgraph $w$ and density bin $\rho(i)$. $E_w(G, \rho(i))$ is taken as the mean of the GP approximation, i.e. $\lambda^{\rho(i)}_k/\theta^{\rho(i)}_w$. + +As $\lambda^{\rho(i)}_k$ and $\theta^{\rho(i)}_w$ are not known, they are estimated as follows: +Let $x_{w,d}^j$ be the number of subgraphs $w$ on the 2-step ego-network $j$ of density bin $d$, and let +\[ +\bar{X}_{w,d}=\frac{1}{q} \sum_{j=1}^q x_{w,d}^j, \qquad V^2_{w,d}=\frac{1}{q-1} \sum_{j=1}^q (x_{w,d}^j - \bar{X}_{w,d})^2 +, +\] +where $q$ is the number of ego-networks in density bin $d$. 
Then, +\[ +\hat{\lambda}^{d}_{k}= \frac{1}{l} \sum_{h \in A(k)} \frac{2 (\bar{X}_{h,d})^2}{V^2_{h,d}+\bar{X}_{h,d}} , \qquad \hat{\theta}^{d}_w= \frac{2\bar{X}_{w,d}}{V^2_{w,d}+\bar{X}_{w,d}}, +\] +where $l$ is the number of connected subgraphs of size $k$, for example, $l=6$ for $k=4$. These estimators are based on the moment estimators of a GP random variable and the proposal made by (Picard et al.(2008)), where the total count of each individual subgraph could be thought of as the sum of the total subgraph counts over multiple ``clumps'' of edges that appear across the network. + +This variant focuses on detecting more meso-level discrepancies between the ego-network structures. + +```{r, netdisGP} +#Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) +``` + + +## Using Netdis with no expectation ($E_w=0$) +Comparing the networks via their observed ego counts without centring them (equivalent to using expectation equal to zero). This variant thus focuses on detecting small discrepancies between the networks. + +```{r,netdiszero} +#Netdis using no expectations (or equivalently, expectation equal to zero). +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) + +``` + +------------------------- + + +# Bibliography + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. + +* L. Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs.
Journal of Computational Biology, 15(1):1–20, 2008. diff --git a/vignettes/dendrogram_example_net_dis.Rmd b/vignettes/dendrogram_example_net_dis.Rmd new file mode 100644 index 00000000..5c0a3445 --- /dev/null +++ b/vignettes/dendrogram_example_net_dis.Rmd @@ -0,0 +1,178 @@ +--- +title: "Netdis: Protein interaction networks" +#author: "Martin O'Reilly, Luis Ospina-Forero" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteEncoding{UTF-8} + %\VignetteIndexEntry{Netdis: Protein interaction networks} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +chunk_output_type: console +--- + +# Protein interaction networks + +In this vignette we show a potential application of Netdis for real world data. In this case the comparison between the protein network structures could provide some insight into the phylogenetic tree of organisms as suggested in Ali et al. (2014). In this application we show a proxy of the phylogenetic tree of 5 organisms, 4 _Herpes_ virus and _Escherichia Coli_. The tree is approximated via a dendrogram built from all the pairwise protein interaction network comparisons as suggested in Ali et al. (2014). The correct tree is found using Netdis and using subgraphs of size 4 (taxonomy ground truth: NCBI taxonomy database. https://www.ncbi.nlm.nih.gov/taxonomy). + +For an introduction to Netdis and its variants see ["Simple and quick (default) usage 1: pairwise comparisons"](default_pairwise_usage.html). + +For many to many comparisons see [Simple and quick (default) usage 3: many to many comparisons](ManyToMany.html) + +For other vignettes in this package see the ["Menu"](V-Menu.html). + +# Loading protein interaction networks and an ER reference graph + +The package contains the protein interaction networks of a few _Herpes_ virus and of _Escherichia Coli_. 
These networks can be read by using `read_simple_graphs`, which reads graph data from all files in a directory that match a specific filename pattern in a format suitable for calculating subgraph-based features by using the +[ORCA package](https://CRAN.R-project.org/package=orca). + +We use `igraph::read_graph` to read graph data from files. See help for `igraph::read_graph` for a list of +supported values for the `format` parameter. + +```{r, message=FALSE} +library("netdist") +library("igraph") +edge_format = "ncol" +# Load reference graph (used for Netdis. Not required for NetEmd +ref_path = file.path(system.file(file.path("extdata", "random"), package = "netdist"), + "ER_1250_10_1") +ref_graph <- read_simple_graph(ref_path, format = edge_format) + +# Set source directory and file properties for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), + package = "netdist") +edge_format <- "ncol" +file_pattern <- "*" + +# Load all graphs in the source folder matching the filename pattern +query_graphs <- read_simple_graphs(source_dir, + format = edge_format, + pattern = file_pattern) +print(names(query_graphs)) +``` + +```{r,fig.align='center',fig.dim=c(5,5)} +plot(query_graphs$EBV,vertex.label=NA,vertex.size=8) +plot(query_graphs$`HSV-1`,vertex.label=NA,vertex.size=8) +plot(query_graphs$KSHV,vertex.label=NA,vertex.size=8) +plot(query_graphs$VZV,vertex.label=NA,vertex.size=8) +plot(query_graphs$ECL,vertex.label=NA,vertex.size=4) +``` + +# Generate Netdis measures between each pair of query graphs + +In this example **Netdis** will use counts of subgraphs containing up to 4 nodes and +consider ego-networks (node ''neighbourhoods'') of size 2 (i.e. the immediate neighbours of +each node plus their immediate neighbours). 
+```{r} +# Set the maximum subgraph size to compute counts for +max_subgraph_size <- 4 +neighbourhood_size <- 2 +``` + +## Netdis using an ER reference graph +```{r} +# Calculate netdis measure for subgraphs up to size max_subgraph_size +netdis_result <- netdis_many_to_many(graphs = query_graphs, + ref_graph = ref_graph, + max_graphlet_size = max_subgraph_size, + neighbourhood_size = neighbourhood_size) + +# Netdis measure for subgraphs of size 3 +res3 <- netdis_result$netdis["netdis3", ] +netdis3_mat <- cross_comp_to_matrix(measure = res3, cross_comparison_spec = netdis_result$comp_spec) + +print("Netdis: subgraph size = 3") +print(netdis3_mat) + +# Netdis measure for subgraphs of size 4 +res4 <- netdis_result$netdis["netdis4", ] +netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec) + +print("Netdis: subgraph size = 4") +print(netdis4_mat) +``` + + +# Generate dendrograms +Dendrograms are created based on the network comparison values by joining networks with the ones with greater similarity. The specific details on how the links are made can be found in the help of `phangorn::upgma`. + +```{r,fig.align='center',fig.dim=c(6,6)} +par(mfrow = c(1, 2)) +cex <- 1 + +# Dendrogram based on Netdis measure for subgraphs of size 3 +title <- paste("Netdis: subgraph size = ", 3, sep = "") +plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) + +# Dendrogram based on Netdis measure for subgraphs of size 4 +title = paste("Netdis: subgraph size = ", 4, sep = "") +plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) +``` + + + +-------------------- + + +# Bibliography + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. 
Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* L. Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. + +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008. + + + + + + + diff --git a/vignettes/dendrogram_example_net_emd.Rmd b/vignettes/dendrogram_example_net_emd.Rmd new file mode 100644 index 00000000..4a70fb8d --- /dev/null +++ b/vignettes/dendrogram_example_net_emd.Rmd @@ -0,0 +1,80 @@ +--- +title: "Dendrogram example for NetEMD" +author: "Martin O'Reilly" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Dendrogram example for NetEMD} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- +## Virus PPI example for NetEMD +```{r, fig.show='hold'} +library("netdist") +# Set source directory and file properties for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +edge_format = "ncol" +file_pattern = ".txt" + +# Calculate graphlet-based degree distributions for all orbits in graphlets +# comprising up to 4 nodes for all graphs. This only needs to be done once +# per graph (feature_type = "orbit", max_graphlet_size = 4).. +# If feature_type is set to "feature_type", orbit counts for orbits in the +# same graphlet will be summed to generate graphlet counts +# If max_graphlet_size is set to 5, graphlet-based degree distributions will +# be calculated for graphlets comprising up to 5 nodes. 
+virus_gdds <- gdd_for_all_graphs( + source_dir = source_dir, format = edge_format, pattern = file_pattern, + feature_type = "orbit", max_graphlet_size = 4) +names(virus_gdds) + +# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- +# based degree distributions using the default fast "optimise" method and no +# smoothing (default). The "optimise" method uses the built-in R optimise +# function to efficiently find the offset with the minimum EMD, but is not +# guaranteed to find the global minimum if EMD as a function of offset +# is non-convex and/or multimodal. The smoothing window width determines +# whether to calculate the NetEMD from the unaltered discrete GDD histograms +# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" +# smoothing by "smearing" the discrete GDD histogram point masses across bins +# of unit width (smoothing_window_width = 1). Returns a named list containing: +# (i) the NetEMDs and (ii) a table containing the graph names and indices +# within the input GDD list for each pair of graphs compared. +res <- netemd_many_to_many(dhists= virus_gdds, smoothing_window_width = 0) + +# You can also specify method = "fixed_step" to use the much slower method of +# exhaustively evaluating the EMD at all offsets separated by a fixed step. +# The default step size is 1/2 the minimum spacing between locations in +# either histogram after normalising to unit variance. However, you can +# specify your own fixed step using the optional "step_size" parameter.
+# Note that this step size is applied to the histograms after they have been +# normalised to unit variance. + +# Convert to matrix for input to dendrogram method +netemd_mat <- cross_comp_to_matrix(res$netemds, res$comp_spec) +netemd_mat +``` + +```{r} +cex=1 +title = paste("NetEMD: max graphlet size = ", 4, sep = "") +plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, + edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, + cex.sub=cex, cex=cex) + +# The gdd_for_all_graphs and netemd_many_to_many functions will run in +# parallel using multiple threads where supported. The number of threads +# used is determined by the global R option "mc.cores". You can inspect the +# current value of this using options("mc.cores") and set it with +# options("mc.cores" = n), where n is the number of threads to use. To fully +# utilise a modern consumer processor, this should be set to 2x the number +# of available processor cores as each core supports two threads. +``` + +```{r} +cex=1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title = paste("NetEMD: max graphlet size = ", 4, sep = "") +heatmap(netemd_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) +``` + diff --git a/vignettes/netdis_customisations.Rmd b/vignettes/netdis_customisations.Rmd new file mode 100644 index 00000000..0c1c724a --- /dev/null +++ b/vignettes/netdis_customisations.Rmd @@ -0,0 +1,122 @@ +--- +title: "Usage of netdis with binning and expected counts customisations." +author: "Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis function customisations} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +``` + +## Load query graphs +```{r} +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +``` + +## Default Expected Counts with Reference Graph +```{r} + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +print(results$netdis) +print(results$comp_spec) +``` + +## With Modified Binning Parameters +```{r} + +binning_fn <- purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 10, + num_intervals = 50) + + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn) + +print(results$netdis) +print(results$comp_spec) + + +``` + +## With Modified Expected Counts: Geometric Poisson +```{r} +bin_counts_fn <- density_binned_counts_gp + +exp_counts_fn <- purrr::partial(netdis_expected_counts, + scale_fn = NULL) + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) +``` + +## With Modified Expected Counts: Simple Mean +```{r} +binning_fn 
<- single_density_bin +bin_counts_fn <- density_binned_counts +exp_counts_fn <- netdis_expected_counts + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) +``` \ No newline at end of file diff --git a/vignettes/netdis_pairwise_comparisons.Rmd b/vignettes/netdis_pairwise_comparisons.Rmd new file mode 100644 index 00000000..d5809c4d --- /dev/null +++ b/vignettes/netdis_pairwise_comparisons.Rmd @@ -0,0 +1,94 @@ +--- +title: "Usage of netdis interfaces for different pairwise comparison options." +author: "Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis pairwise comparisons} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. 
+max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +``` + +## Compare two graphs +```{r} +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Calculate netdis statistics +netdis_one_to_one(graph_1, graph_2, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) +``` + +## Compare one graph to many other graphs +```{r} +# Load query graphs +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +graph_1 <- graphs$EBV +graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] + +# Calculate netdis statistics +netdis_one_to_many(graph_1, graphs_compare, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) +``` + +## Do pairwise netdis calculations for many graphs +```{r} +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +print(results$netdis) +print(results$comp_spec) +``` \ No newline at 
end of file diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd new file mode 100644 index 00000000..a55f1d82 --- /dev/null +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -0,0 +1,150 @@ +--- +title: "Quick start guide for Netdis - 2 graphs" +author: "Martin O'Reilly, Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Quick start for Netdis - 2 graphs} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Load graphs +```{r} +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. 
+max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 +``` + +## Generate ego networks +```{r} +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) +``` + +## Count graphlets in ego networks +```{r} +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) +``` + +## Use a reference graph to calculate expected graphlet counts in ego network density bins +```{r} +# Load reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) + +# Scale ego-network graphlet counts by dividing by total number of k-tuples in +# ego-network (where k is graphlet size) +scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, + max_graphlet_size) + + +# Get ego-network densities +densities_ref <- ego_network_density(graphlet_counts_ref) + +# Adaptively bin ref ego-network densities +binned_densities <- binned_densities_adaptive(densities_ref, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ref_ego_density_bins <- binned_densities$breaks + +# 
Average ref graphlet counts across density bins +ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + scaled_graphlet_counts_ref, + binned_densities$interval_indexes) + +``` + + +## Centre graphlet counts of query graphs based on statistics of reference graph +```{r} +# Calculate expected graphlet counts (using ref graph ego network density bins) +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn=count_graphlet_tuples) + + +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn=count_graphlet_tuples) + +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) + +centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) +``` + +## Sum centred graphlet counts across all ego networks +```{r} +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) +``` + +## Calculate netdis statistics +```{r} + +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) +```