From ab0a5edc0e9fa290ca21c0499335a3d707ef37fc Mon Sep 17 00:00:00 2001 From: Martin O'Reilly Date: Thu, 2 May 2019 18:37:33 +0100 Subject: [PATCH 001/188] Add roxygen2 as suggested dependency --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 459e02e0..5e58c022 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,7 +26,8 @@ Suggests: testthat, knitr, phangorn, - rmarkdown RoxygenNote: 6.0.1 + rmarkdown, + roxygen2 VignetteBuilder: knitr Encoding: UTF-8 From 965f7dad629aa41930b76d587795a662e526788a Mon Sep 17 00:00:00 2001 From: Martin O'Reilly Date: Thu, 2 May 2019 18:37:52 +0100 Subject: [PATCH 002/188] Update documentation via Roxygen run --- DESCRIPTION | 2 +- man/binned_densities_adaptive.Rd | 3 ++- man/counts_from_observations.Rd | 3 +++ man/emd_fast_no_smoothing.Rd | 4 ++++ man/mean_density_binned_graphlet_counts.Rd | 3 ++- man/netdis_centred_graphlet_counts.Rd | 4 ++-- man/netdis_centred_graphlet_counts_ego.Rd | 5 +++-- man/read_simple_graph.Rd | 5 +++-- 8 files changed, 20 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5e58c022..ec10c988 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,8 +26,8 @@ Suggests: testthat, knitr, phangorn, -RoxygenNote: 6.0.1 rmarkdown, roxygen2 +RoxygenNote: 6.1.1 VignetteBuilder: knitr Encoding: UTF-8 diff --git a/man/binned_densities_adaptive.Rd b/man/binned_densities_adaptive.Rd index cb8dd9ce..5a1e380f 100644 --- a/man/binned_densities_adaptive.Rd +++ b/man/binned_densities_adaptive.Rd @@ -4,7 +4,8 @@ \alias{binned_densities_adaptive} \title{INTERNAL FUNCTION - Do not call directly} \usage{ -binned_densities_adaptive(densities, min_counts_per_interval, num_intervals) +binned_densities_adaptive(densities, min_counts_per_interval, + num_intervals) } \description{ Used by \code{netdis_expected_graphlet_counts_ego_fn} to diff --git a/man/counts_from_observations.Rd b/man/counts_from_observations.Rd index 501af571..ba548ecb 100644 --- a/man/counts_from_observations.Rd +++ b/man/counts_from_observations.Rd @@ -9,3 +9,6 @@ counts_from_observations(features) \arguments{ \item{features}{A Matrix with doubles.} } +\description{ +Count number of occurences +} diff --git a/man/emd_fast_no_smoothing.Rd b/man/emd_fast_no_smoothing.Rd index eae8b8fe..66421f81 100644 --- a/man/emd_fast_no_smoothing.Rd +++ b/man/emd_fast_no_smoothing.Rd @@ -16,3 +16,7 @@ emd_fast_no_smoothing(locations1, values1, locations2, values2) \item{values2}{Cumulative masses for ECDF 2} } +\description{ +Compute Earth Mover's Distance (EMD) between two Empirical Cumulative +Density Functions (ECDFs) +} diff --git a/man/mean_density_binned_graphlet_counts.Rd b/man/mean_density_binned_graphlet_counts.Rd index 079c99f0..4aac4a0d 100644 --- a/man/mean_density_binned_graphlet_counts.Rd +++ b/man/mean_density_binned_graphlet_counts.Rd @@ -4,7 +4,8 @@ \alias{mean_density_binned_graphlet_counts} \title{INTERNAL FUNCTION - Do not call directly} \usage{ -mean_density_binned_graphlet_counts(graphlet_counts, density_interval_indexes) +mean_density_binned_graphlet_counts(graphlet_counts, + density_interval_indexes) } \description{ Used by \code{netdis_expected_graphlet_counts_ego_fn} to diff --git a/man/netdis_centred_graphlet_counts.Rd b/man/netdis_centred_graphlet_counts.Rd index 71f0e93d..7b183f38 100644 --- a/man/netdis_centred_graphlet_counts.Rd +++ b/man/netdis_centred_graphlet_counts.Rd @@ -4,8 +4,8 @@ \alias{netdis_centred_graphlet_counts} \title{Generate Netdis centred graphlets counts by subtracting expected counts} \usage{ -netdis_centred_graphlet_counts(graph, max_graphlet_size, neighbourhood_size, - expected_ego_count_fn = NULL) +netdis_centred_graphlet_counts(graph, max_graphlet_size, + neighbourhood_size, expected_ego_count_fn = NULL) } \arguments{ \item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} diff --git a/man/netdis_centred_graphlet_counts_ego.Rd b/man/netdis_centred_graphlet_counts_ego.Rd index a6e00bcb..d5fba09d 100644 --- a/man/netdis_centred_graphlet_counts_ego.Rd +++ b/man/netdis_centred_graphlet_counts_ego.Rd @@ -4,8 +4,9 @@ \alias{netdis_centred_graphlet_counts_ego} \title{TODO: Remove @export prior to publishing} \usage{ -netdis_centred_graphlet_counts_ego(graph, max_graphlet_size, neighbourhood_size, - expected_ego_count_fn = NULL, min_ego_nodes = 3, min_ego_edges = 1) +netdis_centred_graphlet_counts_ego(graph, max_graphlet_size, + neighbourhood_size, expected_ego_count_fn = NULL, min_ego_nodes = 3, + min_ego_edges = 1) } \description{ TODO: Remove @export prior to publishing diff --git a/man/read_simple_graph.Rd b/man/read_simple_graph.Rd index 77f7aef6..d97588b7 100644 --- a/man/read_simple_graph.Rd +++ b/man/read_simple_graph.Rd @@ -4,8 +4,9 @@ \alias{read_simple_graph} \title{Read a graph from file, simplifying as requested} \usage{ -read_simple_graph(file, format, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE) +read_simple_graph(file, format, as_undirected = TRUE, + remove_loops = TRUE, remove_multiple = TRUE, + remove_isolates = TRUE) } \arguments{ \item{file}{Path to file containing graph data} From cb0fe734d641da2f412d59fda9638964367cefc6 Mon Sep 17 00:00:00 2001 From: Martin O'Reilly Date: Thu, 2 May 2019 18:52:40 +0100 Subject: [PATCH 003/188] Rename dendrogram vignettes --- vignettes/Quick_start_net_emd.html | 157 ------- ...net_dis.R => dendrogram_example_net_dis.R} | 0 ...dis.Rmd => dendrogram_example_net_dis.Rmd} | 0 ...s.html => dendrogram_example_net_dis.html} | 67 ++- ...net_emd.R => dendrogram_example_net_emd.R} | 21 + ...emd.Rmd => dendrogram_example_net_emd.Rmd} | 0 vignettes/dendrogram_example_net_emd.html | 394 ++++++++++++++++++ 7 files changed, 460 insertions(+), 179 deletions(-) delete mode 100644 vignettes/Quick_start_net_emd.html rename vignettes/{Quick_start_net_dis.R => dendrogram_example_net_dis.R} (100%) rename vignettes/{Quick_start_net_dis.Rmd => dendrogram_example_net_dis.Rmd} (100%) rename vignettes/{Quick_start_net_dis.html => dendrogram_example_net_dis.html} (97%) rename vignettes/{Quick_start_net_emd.R => dendrogram_example_net_emd.R} (69%) rename vignettes/{Quick_start_net_emd.Rmd => dendrogram_example_net_emd.Rmd} (100%) create mode 100644 vignettes/dendrogram_example_net_emd.html diff --git a/vignettes/Quick_start_net_emd.html b/vignettes/Quick_start_net_emd.html deleted file mode 100644 index 6519a226..00000000 --- a/vignettes/Quick_start_net_emd.html +++ /dev/null @@ -1,157 +0,0 @@ - - - - - - - - - - - - - - - - -Quick start guide for NetEMD - - - - - - - - - - - - - - - - - -

Quick start guide for NetEMD

-

Martin O’Reilly

-

2017-06-05

- - - -
-

Virus PPI example for NetEMD

-
library("netdist")
-# Set source directory and file properties for Virus PPI graph edge files
-source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
-edge_format = "ncol"
-file_pattern = ".txt"
-
-# Calculate graphlet-based degree distributions for all orbits in graphlets 
-# comprising up to 4 nodes for all graphs. This only needs to be done once 
-# per graph (feature_type = "orbit", max_graphlet_size = 4).. 
-# If feature_type is set to "feature_type", orbit counts for orbits in the
-# same graphlet will be summed to generate graphlet counts
-# If max_graphlet_size is set to 5, graphlet-based degree distributions will  
-# be calculated for graphlets comprising up to 5 nodes.
-virus_gdds <- gdd_for_all_graphs(
-  source_dir = source_dir, format = edge_format, pattern = file_pattern, 
-  feature_type = "orbit", max_graphlet_size = 4)
-names(virus_gdds)
-
## [1] "EBV"   "ECL"   "HSV-1" "KSHV"  "VZV"
-
# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- 
-# based degree distributions using the default fast "optimise" method and no
-# smoothing (default). The "optimise" method uses the built-in R optimise
-# function to efficiently find the offset with the minimum EMD, but is not
-# guaranteed to find the global minimum if EMD as a function of offset
-# is non-convex and/or multimodal. The smoothing window width determines 
-# whether to calculate the NetEMD from the unaltered discrete GDD histograms
-# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" 
-# smoothing by "smearing" the discrete GDD histogram point masses across bins 
-# of unit width (smoothing_window_width = 1). Returns a named list containing:
-# (i) the NetEMDs and (ii) a table containing the graph names and indices 
-# within the input GDD list for each pair of graphs compared.
-res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0)
-
-# You can also specify method = "fixed_step" to use the much slower method of 
-# exhaustively evaluating the EMD at all offsets separated by a fixed step. 
-# The default step size is 1/2 the the minimum spacing between locations in 
-# either histogram after normalising to unit variance. However, you can 
-# specifiy your own fixed step using the optional "step_size" parameter.
-# Note that this step size is applied to the histograms after they have been 
-# normalised to unit variance
-
-# Convert to matrix for input to dendrogram method
-netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec)
-netemd_mat
-
##             EBV       ECL     HSV-1      KSHV       VZV
-## EBV   0.0000000 0.4876042 0.1662898 0.1607299 0.1994619
-## ECL   0.4876042 0.0000000 0.3986298 0.4024202 0.4029356
-## HSV-1 0.1662898 0.3986298 0.0000000 0.1581559 0.2164026
-## KSHV  0.1607299 0.4024202 0.1581559 0.0000000 0.2323955
-## VZV   0.1994619 0.4029356 0.2164026 0.2323955 0.0000000
-
cex=1
-# Dendrogram based on Netdis measure for graphlets of size 3
-title = paste("Netdis: graphlet size = ", 4, sep = "")
-plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, 
-     edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, 
-     cex.sub=cex, cex=cex)
-

-
# The gdd_for_all_graphs and net_emds_for_all_graphs functions will run in 
-# parallel using multiple threads where supported. The number of threads
-# used is determined by the global R option "mc.cores". You can inspect the 
-# current value of this using options("mc.cores") and set it with 
-# options("mc.cores" = <num_cores>). To fully utilise a modern consumer
-# processor, this should be set to 2x the number of available processor 
-# cores as each core supports two threads.
-
- - - - - - - - diff --git a/vignettes/Quick_start_net_dis.R b/vignettes/dendrogram_example_net_dis.R similarity index 100% rename from vignettes/Quick_start_net_dis.R rename to vignettes/dendrogram_example_net_dis.R diff --git a/vignettes/Quick_start_net_dis.Rmd b/vignettes/dendrogram_example_net_dis.Rmd similarity index 100% rename from vignettes/Quick_start_net_dis.Rmd rename to vignettes/dendrogram_example_net_dis.Rmd diff --git a/vignettes/Quick_start_net_dis.html b/vignettes/dendrogram_example_net_dis.html similarity index 97% rename from vignettes/Quick_start_net_dis.html rename to vignettes/dendrogram_example_net_dis.html index a37c51ab..2113bddf 100644 --- a/vignettes/Quick_start_net_dis.html +++ b/vignettes/dendrogram_example_net_dis.html @@ -12,17 +12,17 @@ - + Quick start guide for Netdis - + @@ -271,6 +291,9 @@ code > span.fu { color: #900; font-weight: bold; } code > span.er { color: #a61717; background-color: #e3d2d2; } + + + @@ -279,14 +302,14 @@

Quick start guide for Netdis

-

Martin O’Reilly

-

2018-02-07

+

Martin O’Reilly

+

2019-05-02

-
+

Virus PPI example for Netdis

-
+

Load graphs

Use read_simple_graphs to read graph data from all files in a directory that match a specific filename pattern in a format suitable for calculating graphlet-based feature counts using the ORCA package. We use igraph::read_graph to read graph data from files, so support all file formats it supports. See help for igraph::read_graph for a list of supported values for the format parameter and the igraph documentation for descriptions of each of the supported file formats.

The ORCA package we use to efficiently calculate graphlet and orbit counts requires that graphs are undirected, simple (i.e. have no self-loops or multiple edges) and connected (i.e. have no isolated vertices). Therefore, by default, graphs loaded by read_simple_graphs will be coerced to have the above properties. This can be avoided by setting the relevant as_undirected, remove_loops, remove_multiple or remove_isolates parameters to FALSE.

@@ -310,22 +333,22 @@

Load graphs

-
-
-
+ + +

Generate a function to generate expected graphlet counts

Use netdis_expected_graphlet_counts_ego_fn to generate a function that calculates expected ego-network graphlet counts for query graphs based on the statistics of a provided reference graph.

-
-
+ +
-
+ +

Generate NetDis measures between each pair of query graphs

# Netdis measure for graphlets of size 3
 res3 <- netdis_for_all_graphs(centred_counts, 3)
@@ -340,8 +363,8 @@ 

Generate NetDis measures between each pair of query graphs

## HSV-1 0.16526412 0.2917612 0.00000000 0.07602426 0.03434187 ## KSHV 0.01969246 0.2215579 0.07602426 0.00000000 0.13115524 ## VZV 0.15971116 0.4171614 0.03434187 0.13115524 0.00000000
-
-
+ +
+ diff --git a/vignettes/Quick_start_net_emd.R b/vignettes/dendrogram_example_net_emd.R similarity index 69% rename from vignettes/Quick_start_net_emd.R rename to vignettes/dendrogram_example_net_emd.R index e997af69..3123b150 100644 --- a/vignettes/Quick_start_net_emd.R +++ b/vignettes/dendrogram_example_net_emd.R @@ -43,3 +43,24 @@ res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec) netemd_mat +## ------------------------------------------------------------------------ +cex=1 +title = paste("NetEMD: max graphlet size = ", 4, sep = "") +plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, + edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, + cex.sub=cex, cex=cex) + +# The gdd_for_all_graphs and net_emds_for_all_graphs functions will run in +# parallel using multiple threads where supported. The number of threads +# used is determined by the global R option "mc.cores". You can inspect the +# current value of this using options("mc.cores") and set it with +# options("mc.cores" = ). To fully utilise a modern consumer +# processor, this should be set to 2x the number of available processor +# cores as each core supports two threads. + +## ------------------------------------------------------------------------ +cex=1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title = paste("NetEMD: max graphlet size = ", 4, sep = "") +heatmap(netemd_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) + diff --git a/vignettes/Quick_start_net_emd.Rmd b/vignettes/dendrogram_example_net_emd.Rmd similarity index 100% rename from vignettes/Quick_start_net_emd.Rmd rename to vignettes/dendrogram_example_net_emd.Rmd diff --git a/vignettes/dendrogram_example_net_emd.html b/vignettes/dendrogram_example_net_emd.html new file mode 100644 index 00000000..31c0f980 --- /dev/null +++ b/vignettes/dendrogram_example_net_emd.html @@ -0,0 +1,394 @@ + + + + + + + + + + + + + + + + +Quick start guide for NetEMD + + + + + + + + + + + + + + + + + + + + + +

Quick start guide for NetEMD

+

Martin O’Reilly

+

2019-05-02

+ + + +
+

Virus PPI example for NetEMD

+ +
## [1] "EBV"   "ECL"   "HSV-1" "KSHV"  "VZV"
+
# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- 
+# based degree distributions using the default fast "optimise" method and no
+# smoothing (default). The "optimise" method uses the built-in R optimise
+# function to efficiently find the offset with the minimum EMD, but is not
+# guaranteed to find the global minimum if EMD as a function of offset
+# is non-convex and/or multimodal. The smoothing window width determines 
+# whether to calculate the NetEMD from the unaltered discrete GDD histograms
+# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" 
+# smoothing by "smearing" the discrete GDD histogram point masses across bins 
+# of unit width (smoothing_window_width = 1). Returns a named list containing:
+# (i) the NetEMDs and (ii) a table containing the graph names and indices 
+# within the input GDD list for each pair of graphs compared.
+res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0)
+
+# You can also specify method = "fixed_step" to use the much slower method of 
+# exhaustively evaluating the EMD at all offsets separated by a fixed step. 
+# The default step size is 1/2 the the minimum spacing between locations in 
+# either histogram after normalising to unit variance. However, you can 
+# specifiy your own fixed step using the optional "step_size" parameter.
+# Note that this step size is applied to the histograms after they have been 
+# normalised to unit variance
+
+# Convert to matrix for input to dendrogram method
+netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec)
+netemd_mat
+
##             EBV       ECL     HSV-1      KSHV       VZV
+## EBV   0.0000000 0.4876039 0.1662892 0.1607293 0.1994605
+## ECL   0.4876039 0.0000000 0.3986281 0.4024176 0.4029344
+## HSV-1 0.1662892 0.3986281 0.0000000 0.1581520 0.2164003
+## KSHV  0.1607293 0.4024176 0.1581520 0.0000000 0.2323936
+## VZV   0.1994605 0.4029344 0.2164003 0.2323936 0.0000000
+ +

+ + +

+
+ + + + + + + + From 0999e7039ce08a50370891da188e71da73f0f85a Mon Sep 17 00:00:00 2001 From: Martin O'Reilly Date: Thu, 2 May 2019 18:54:38 +0100 Subject: [PATCH 004/188] Update titles for vignetted to reflect file renames --- vignettes/dendrogram_example_net_dis.Rmd | 4 ++-- vignettes/dendrogram_example_net_dis.html | 4 ++-- vignettes/dendrogram_example_net_emd.Rmd | 4 ++-- vignettes/dendrogram_example_net_emd.html | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vignettes/dendrogram_example_net_dis.Rmd b/vignettes/dendrogram_example_net_dis.Rmd index b64bd751..24c53b7a 100644 --- a/vignettes/dendrogram_example_net_dis.Rmd +++ b/vignettes/dendrogram_example_net_dis.Rmd @@ -1,10 +1,10 @@ --- -title: "Quick start guide for Netdis" +title: "Dendrogram example for Netdis" author: "Martin O'Reilly" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Quick start for Netdis} + %\VignetteIndexEntry{Dendrogram example for Netdis} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/vignettes/dendrogram_example_net_dis.html b/vignettes/dendrogram_example_net_dis.html index 2113bddf..32819655 100644 --- a/vignettes/dendrogram_example_net_dis.html +++ b/vignettes/dendrogram_example_net_dis.html @@ -14,7 +14,7 @@ -Quick start guide for Netdis +Dendrogram example for Netdis @@ -301,7 +301,7 @@ -

Quick start guide for Netdis

+

Dendrogram example for Netdis

Martin O’Reilly

2019-05-02

diff --git a/vignettes/dendrogram_example_net_emd.Rmd b/vignettes/dendrogram_example_net_emd.Rmd index 73283132..72fd2750 100644 --- a/vignettes/dendrogram_example_net_emd.Rmd +++ b/vignettes/dendrogram_example_net_emd.Rmd @@ -1,10 +1,10 @@ --- -title: "Quick start guide for NetEMD" +title: "Dendrogram example for NetEMD" author: "Martin O'Reilly" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Quick start for NetEMD} + %\VignetteIndexEntry{Dendrogram example for NetEMD} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/vignettes/dendrogram_example_net_emd.html b/vignettes/dendrogram_example_net_emd.html index 31c0f980..8228129c 100644 --- a/vignettes/dendrogram_example_net_emd.html +++ b/vignettes/dendrogram_example_net_emd.html @@ -14,7 +14,7 @@ -Quick start guide for NetEMD +Dendrogram example for NetEMD @@ -301,7 +301,7 @@ -

Quick start guide for NetEMD

+

Dendrogram example for NetEMD

Martin O’Reilly

2019-05-02

From ebcf51f673d53c102e67a2383a4b71f6e8e041ab Mon Sep 17 00:00:00 2001 From: Martin O'Reilly Date: Thu, 2 May 2019 21:01:32 +0100 Subject: [PATCH 005/188] Partial Netdis quickstart vignette --- vignettes/quickstart_netdis.R | 38 ++++ vignettes/quickstart_netdis.Rmd | 64 ++++++ vignettes/quickstart_netdis.html | 371 +++++++++++++++++++++++++++++++ 3 files changed, 473 insertions(+) create mode 100644 vignettes/quickstart_netdis.R create mode 100644 vignettes/quickstart_netdis.Rmd create mode 100644 vignettes/quickstart_netdis.html diff --git a/vignettes/quickstart_netdis.R b/vignettes/quickstart_netdis.R new file mode 100644 index 00000000..1ba1135f --- /dev/null +++ b/vignettes/quickstart_netdis.R @@ -0,0 +1,38 @@ +## ------------------------------------------------------------------------ +# Load libraries +library("netdist") +library("purrr") + +## ------------------------------------------------------------------------ +# Maximum graphlet size to calculate counts for. +# We choose the specific graphlet size for the Netdis metric later. +max_graphlet_size = 4 + +## ------------------------------------------------------------------------ +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +# Load query graphs +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + +## ------------------------------------------------------------------------ + +# Set ego network neighbourhood size +neighbourhood_size = 2 +ego_networks <- purrr::map(graphs, make_named_ego_graph, + order = neighbourhood_size) + +## ------------------------------------------------------------------------ +ego_graphlet_counts <- purrr::map_depth(ego_networks, 2, count_graphlets_for_graph, + max_graphlet_size = max_graphlet_size) + +## ------------------------------------------------------------------------ +# Load reference graph +file <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(file, format = "ncol") +# Generate ego networks for reference graph +ref_ego_networks <- make_named_ego_graph(ref_graph, order = neighbourhood_size) +# Count graphlets for ego networks in reference graph +ref_ego_graphlet_counts <- purrr::map(ref_ego_networks, count_graphlets_for_graph, + max_graphlet_size = max_graphlet_size) + diff --git a/vignettes/quickstart_netdis.Rmd b/vignettes/quickstart_netdis.Rmd new file mode 100644 index 00000000..688efb2e --- /dev/null +++ b/vignettes/quickstart_netdis.Rmd @@ -0,0 +1,64 @@ +--- +title: "Quick start guide for Netdis" +author: "Martin O'Reilly" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Quick start for Netdis} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts for. +# We choose the specific graphlet size for the Netdis metric later. +max_graphlet_size = 4 +``` + +## Load graphs + +```{r} +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +# Load query graphs +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +``` + +## Generate ego networks +```{r} + +# Set ego network neighbourhood size +neighbourhood_size = 2 +ego_networks <- purrr::map(graphs, make_named_ego_graph, + order = neighbourhood_size) +``` + +## Count graphlets in ego networks +```{r} +ego_graphlet_counts <- purrr::map_depth(ego_networks, 2, count_graphlets_for_graph, + max_graphlet_size = max_graphlet_size) +``` + +## Define function for calculating expected counts for ego networks +Here we generate these from a reference graph. + +### Generate graphlet counts for reference graph. +```{r} +# Load reference graph +file <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(file, format = "ncol") +# Generate ego networks for reference graph +ref_ego_networks <- make_named_ego_graph(ref_graph, order = neighbourhood_size) +# Count graphlets for ego networks in reference graph +ref_ego_graphlet_counts <- purrr::map(ref_ego_networks, count_graphlets_for_graph, + max_graphlet_size = max_graphlet_size) +``` \ No newline at end of file diff --git a/vignettes/quickstart_netdis.html b/vignettes/quickstart_netdis.html new file mode 100644 index 00000000..c3f80279 --- /dev/null +++ b/vignettes/quickstart_netdis.html @@ -0,0 +1,371 @@ + + + + + + + + + + + + + + + + +Quick start guide for Netdis + + + + + + + + + + + + + + + + + + + + + +

Quick start guide for Netdis

+

Martin O’Reilly

+

2019-05-02

+ + + + + + + + + + + + + + + + + From c6de62808f3563aa43d517b6c98b2baaf2ebaeff Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 7 Jun 2019 09:59:45 +0100 Subject: [PATCH 006/188] Update dendrogram_example_net_dis.Rmd --- vignettes/dendrogram_example_net_dis.Rmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vignettes/dendrogram_example_net_dis.Rmd b/vignettes/dendrogram_example_net_dis.Rmd index 24c53b7a..d5da4e13 100644 --- a/vignettes/dendrogram_example_net_dis.Rmd +++ b/vignettes/dendrogram_example_net_dis.Rmd @@ -8,6 +8,8 @@ vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- +TEST + ## Virus PPI example for Netdis From 54ba2f8eb1232cf6e7ee88096d2b6aaee2e13cfe Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 7 Jun 2019 10:00:25 +0100 Subject: [PATCH 007/188] Update dendrogram_example_net_dis.Rmd --- vignettes/dendrogram_example_net_dis.Rmd | 2 -- 1 file changed, 2 deletions(-) diff --git a/vignettes/dendrogram_example_net_dis.Rmd b/vignettes/dendrogram_example_net_dis.Rmd index d5da4e13..24c53b7a 100644 --- a/vignettes/dendrogram_example_net_dis.Rmd +++ b/vignettes/dendrogram_example_net_dis.Rmd @@ -8,8 +8,6 @@ vignette: > %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- -TEST - ## Virus PPI example for Netdis From 6a2a702d9515cb54b944b7d2ca9deafb6852a146 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 7 Jun 2019 14:29:09 +0100 Subject: [PATCH 008/188] placeholders for next steps in netdis vignette --- R/orca_interface.R | 2 ++ vignettes/quickstart_netdis.Rmd | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/R/orca_interface.R b/R/orca_interface.R index 191fe8d8..d970c616 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -78,6 +78,8 @@ read_simple_graphs <- function(source_dir, format = "ncol", pattern = "*", remove_isolates = remove_isolates) }) # Perform any requested simplifications + # JACK: I don't think this is needed here - + # simplifications are already applied in read_simple_graph call above graphs <- purrr::map( graphs, simplify_graph, as_undirected = as_undirected, remove_loops = remove_loops, remove_multiple = remove_multiple, diff --git a/vignettes/quickstart_netdis.Rmd b/vignettes/quickstart_netdis.Rmd index 688efb2e..50687b04 100644 --- a/vignettes/quickstart_netdis.Rmd +++ b/vignettes/quickstart_netdis.Rmd @@ -56,9 +56,31 @@ Here we generate these from a reference graph. file <- system.file(file.path("extdata", "random", "ER_1250_10_1"), package = "netdist") ref_graph <- read_simple_graph(file, format = "ncol") + # Generate ego networks for reference graph ref_ego_networks <- make_named_ego_graph(ref_graph, order = neighbourhood_size) + # Count graphlets for ego networks in reference graph ref_ego_graphlet_counts <- purrr::map(ref_ego_networks, count_graphlets_for_graph, max_graphlet_size = max_graphlet_size) +``` + +## Centre counts according to size and density of ego network +```{r} +#query_ego_centred_graphlet_counts <- CENTRED_COUNTS(query_ego_subraph_counts, REF_COUNTS) +``` + +## Sum centred counts across all ego networks +```{r} +#query_sum_centred_graphlet_counts <- SUM_SUBGRAPH_COUNTS(query_ego_centred_subgraph_counts) +``` + +## Calculate netdis statistics +```{r} +#NETDIS_STATISTIC(query_sum_centred_graphlet_counts) +``` + +## Display results +```{r} + ``` \ No newline at end of file From 5a0ce2c2ce56b6eee24f7ade66822ead8d8519ae Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 23 Jul 2019 16:00:20 +0100 Subject: [PATCH 009/188] 2 graph vignette and fn changes in progress Moving towards: * A netdis function that takes two input graphs and a reference graph. * Sub-functions that follow through paper steps, sequentially applied with output of one providing input for next. --- R/measures_net_dis.R | 1 + R/orca_interface.R | 29 +++++++ vignettes/quickstart_netdis.Rmd | 25 ++---- vignettes/quickstart_netdis_2graphs.Rmd | 108 ++++++++++++++++++++++++ 4 files changed, 144 insertions(+), 19 deletions(-) create mode 100644 vignettes/quickstart_netdis_2graphs.Rmd diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index a4825638..55b21933 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -205,6 +205,7 @@ netdis_expected_graphlet_counts_ego_fn <- function( ego_networks <- res$ego_networks # Drop ego-networks that don't have the minimum number of nodes or edges + # JACK - why not put this in make_named_ego_graph? i.e. when generating ego networks in first place drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { (igraph::vcount(g) < min_ego_nodes) | (igraph::ecount(g) < min_ego_edges) })) diff --git a/R/orca_interface.R b/R/orca_interface.R index d970c616..b0070604 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -344,6 +344,7 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size # Reshape the list of per node single row graphlet count matrices to a single # ORCA format graphlet count matrix with one row per node ego_graphlet_counts <- t(simplify2array(ego_graphlet_counts)) + # Return either graphlet counts, or graphlet counts and ego_networks if(return_ego_networks) { return(list(graphlet_counts = ego_graphlet_counts, ego_networks = ego_networks)) @@ -352,6 +353,34 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size } } +#' ego_to_graphlet_counts +#' JACK To follow through logic of paper steps, wanted to pass +#' ego networks to the function for generating graphlet counts, +#' not the input query graph directly (as in count_graphlets_ego above). +#' +#' Calculates graphlet counts for previously generated ego networks. +#' @param ego_networks Named list of ego networks for a graph. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. +#' @return returns an RxC matrix +#' containing counts of each graphlet (columns, C) for each ego-network (rows, R). +#' Columns are labelled with graphlet IDs and rows are +#' labelled with the ID of the central node in each ego-network. +#' @export +ego_to_graphlet_counts <- function(ego_networks, max_graphlet_size = 4) { + # Generate graphlet counts for each node in each ego network (returns an ORCA + # format graphlet count matrix for each ego network) + ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, + max_graphlet_size = max_graphlet_size) + + # Reshape the list of per node single row graphlet count matrices to a single + # ORCA format graphlet count matrix with one row per node + ego_graphlet_counts <- t(simplify2array(ego_graphlet_counts)) + + # Return graphlet counts + return(ego_graphlet_counts) +} + #' Get ego-networks for a graph as a named list #' #' Simple wrapper for the \code{igraph::make_ego_graph} function that names diff --git a/vignettes/quickstart_netdis.Rmd b/vignettes/quickstart_netdis.Rmd index 50687b04..41f91d5d 100644 --- a/vignettes/quickstart_netdis.Rmd +++ b/vignettes/quickstart_netdis.Rmd @@ -50,7 +50,7 @@ ego_graphlet_counts <- purrr::map_depth(ego_networks, 2, count_graphlets_for_gra ## Define function for calculating expected counts for ego networks Here we generate these from a reference graph. -### Generate graphlet counts for reference graph. +### Generate scaled graphlet counts for reference graph. ```{r} # Load reference graph file <- system.file(file.path("extdata", "random", "ER_1250_10_1"), @@ -63,24 +63,11 @@ ref_ego_networks <- make_named_ego_graph(ref_graph, order = neighbourhood_size) # Count graphlets for ego networks in reference graph ref_ego_graphlet_counts <- purrr::map(ref_ego_networks, count_graphlets_for_graph, max_graphlet_size = max_graphlet_size) -``` -## Centre counts according to size and density of ego network -```{r} -#query_ego_centred_graphlet_counts <- CENTRED_COUNTS(query_ego_subraph_counts, REF_COUNTS) -``` - -## Sum centred counts across all ego networks -```{r} -#query_sum_centred_graphlet_counts <- SUM_SUBGRAPH_COUNTS(query_ego_centred_subgraph_counts) -``` +# Scale ego-network graphlet counts by dividing by total number of k-tuples in + # ego-network (where k is graphlet size) +ref_ego_graphlet_tuples <- + count_graphlet_tuples_ego(ref_ego_networks, max_graphlet_size = max_graphlet_size) -## Calculate netdis statistics -```{r} -#NETDIS_STATISTIC(query_sum_centred_graphlet_counts) +ref_ego_graphlet_counts <- scale_graphlet_count(ref_ego_graphlet_counts, ref_ego_graphlet_tuples) ``` - -## Display results -```{r} - -``` \ No newline at end of file diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd new file mode 100644 index 00000000..67987693 --- /dev/null +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -0,0 +1,108 @@ +--- +title: "Quick start guide for Netdis - 2 graphs" +author: "Martin O'Reilly" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Quick start for Netdis} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts for. +# We choose the specific graphlet size for the Netdis metric later. +max_graphlet_size = 4 +``` + +## Load graphs + +```{r} +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Load reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") +``` + +## Generate ego networks +```{r} + +# Set ego network neighbourhood size +neighbourhood_size = 2 + +# Get ego networks for both query graphs +ego_1 <- make_named_ego_graph(graph_1, order = neighbourhood_size) +ego_2 <- make_named_ego_graph(graph_2, order = neighbourhood_size) +``` + +## Count graphlets in ego networks +```{r} +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) +``` + + +## Define function for mapping ego networks to density bins +```{r} + +``` + +### Generate scaled graphlet counts for reference graph. +```{r} +# Generate ego networks for reference graph +ego_ref <- make_named_ego_graph(ref_graph, order = neighbourhood_size) + +# Count graphlets for ego networks in reference graph +graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) + +# Scale ego-network graphlet counts by dividing by total number of k-tuples in + # ego-network (where k is graphlet size) +ref_ego_graphlet_tuples <- + count_graphlet_tuples_ego(ego_ref, max_graphlet_size = max_graphlet_size) + +ref_ego_graphlet_counts <- scale_graphlet_count(graphlet_counts_ref, ref_ego_graphlet_tuples) +``` + +## Calculate average count of each graphlet type per ego network in each density bin of reference graph +```{r} + +``` + + +## Centre and scale graphlet counts of query graphs based on statistics of reference graph +```{r} +#query_ego_centred_graphlet_counts <- CENTRED_COUNTS(query_ego_subraph_counts, REF_COUNTS) +``` + +## Sum centred counts across all ego networks +```{r} +#query_sum_centred_graphlet_counts <- SUM_SUBGRAPH_COUNTS(query_ego_centred_subgraph_counts) +``` + +## Calculate netdis statistics +```{r} +#NETDIS_STATISTIC(query_sum_centred_graphlet_counts) +``` + +## Display results +```{r} + +``` \ No newline at end of file From ea9a19060ac8b7274b13f391e216404cf7da272e Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Wed, 24 Jul 2019 17:11:43 +0100 Subject: [PATCH 010/188] dropping of small ego networks moved to make_named_ego_graph Code to filter ego networks with fewer than threshold nodes/edges was duplicated many places. Added that functionality to make_named_ego_graph, where ego networks originally generated. Also, remove unnecessary duplication of simplifying in read_simple_graphs function, as it's already performed in read_simple_graph (which read_simple_graphs calls). --- R/measures_net_dis.R | 40 +++++++++-------- R/orca_interface.R | 36 ++++++++++----- vignettes/quickstart_netdis_2graphs.Rmd | 58 ++++++++++++++++++------- 3 files changed, 89 insertions(+), 45 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 55b21933..53182f8c 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -68,6 +68,10 @@ netdis <- function(centred_graphlet_counts1, centred_graphlet_counts2, #' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. #' @param neighbourhood_size The number of steps from the source node to include #' nodes for each ego-network. +#' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} +#' nodes are returned. +#' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} +#' edges are returned. #' @param return_ego_networks If \code{TRUE}, return ego-networks alongside #' graphlet counts to enable further processing. #' @return If \code{return_ego_networks = FALSE}, returns an RxC matrix @@ -83,20 +87,26 @@ netdis <- function(centred_graphlet_counts1, centred_graphlet_counts2, #' } #' @export count_graphlets_ego_scaled <- function( - graph, max_graphlet_size, neighbourhood_size, return_ego_networks = FALSE) { + graph, max_graphlet_size, neighbourhood_size, + min_ego_nodes = 3, min_ego_edges = 1, return_ego_networks = FALSE) { + # Calculate ego-network graphlet counts, also returning the ego networks for # use later in function ego_data <- count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, return_ego_networks = TRUE) ego_graphlet_counts <- ego_data$graphlet_counts ego_networks <- ego_data$ego_networks + # Scale ego-network graphlet counts by dividing by total number of k-tuples in # ego-network (where k is graphlet size) ego_graphlet_tuples <- count_graphlet_tuples_ego(ego_networks, max_graphlet_size = max_graphlet_size) ego_graphlet_counts <- scale_graphlet_count(ego_graphlet_counts, ego_graphlet_tuples) + # Return either graphlet counts, or graphlet counts and ego_networks if(return_ego_networks) { return(list(graphlet_counts = ego_graphlet_counts, @@ -141,17 +151,14 @@ netdis_centred_graphlet_counts_ego <- function( # Get unscaled ego-network graphlet counts res <- count_graphlets_ego( graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, return_ego_networks = TRUE) + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE) + actual_counts = res$graphlet_counts ego_networks <- res$ego_networks - # Drop ego-networks that don't have the minimum number of nodes or edges - drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { - (igraph::vcount(g) < min_ego_nodes) | (igraph::ecount(g) < min_ego_edges) - })) - actual_counts <- actual_counts[!drop_index,] - ego_networks <- ego_networks[!drop_index] - # Centre these counts by subtracting the expected counts if(is.null(expected_ego_count_fn)) { centred_counts = actual_counts @@ -200,18 +207,13 @@ netdis_expected_graphlet_counts_ego_fn <- function( # graph, also returning the ego networks themselves in order to calculate # their densities res <- count_graphlets_ego_scaled( - graph, max_graphlet_size, neighbourhood_size, return_ego_networks = TRUE) + graph, max_graphlet_size, neighbourhood_size, + min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, + return_ego_networks = TRUE) + scaled_graphlet_counts = res$graphlet_counts ego_networks <- res$ego_networks - # Drop ego-networks that don't have the minimum number of nodes or edges - # JACK - why not put this in make_named_ego_graph? i.e. when generating ego networks in first place - drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { - (igraph::vcount(g) < min_ego_nodes) | (igraph::ecount(g) < min_ego_edges) - })) - scaled_graphlet_counts <- scaled_graphlet_counts[!drop_index,] - ego_networks <- ego_networks[!drop_index] - # Get ego-network densities densities <- purrr::simplify(purrr::map_dbl(ego_networks, igraph::edge_density)) diff --git a/R/orca_interface.R b/R/orca_interface.R index b0070604..4000db68 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -77,13 +77,7 @@ read_simple_graphs <- function(source_dir, format = "ncol", pattern = "*", remove_multiple = remove_multiple, remove_isolates = remove_isolates) }) - # Perform any requested simplifications - # JACK: I don't think this is needed here - - # simplifications are already applied in read_simple_graph call above - graphs <- purrr::map( - graphs, simplify_graph, as_undirected = as_undirected, - remove_loops = remove_loops, remove_multiple = remove_multiple, - remove_isolates = remove_isolates) + # Name each graph with the name of the file it was read from (with any # extension moved) names <- purrr::simplify(purrr::map(strsplit(file_names, "\\."), @@ -318,6 +312,10 @@ count_graphlets_for_graph <- function(graph, max_graphlet_size) { #' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. #' @param neighbourhood_size The number of steps from the source node to include #' nodes for each ego-network. +#' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} +#' nodes are returned. +#' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} +#' edges are returned. #' @param return_ego_networks If \code{TRUE}, return ego-networks alongside #' graphlet counts to enable further processing. #' @return If \code{return_ego_networks = FALSE}, returns an RxC matrix @@ -332,11 +330,14 @@ count_graphlets_for_graph <- function(graph, max_graphlet_size) { #' \item \code{ego_networks}: The ego-networks of the query graph. #' } #' @export -count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size, +count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size, + min_ego_nodes = 3, min_ego_edges = 1, return_ego_networks = FALSE) { # Extract ego network for each node in original graph, naming each ego network # in the list with the name of the node the ego network is generated for - ego_networks <- make_named_ego_graph(graph, order = neighbourhood_size) + ego_networks <- make_named_ego_graph(graph, order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) # Generate graphlet counts for each node in each ego network (returns an ORCA # format graphlet count matrix for each ego network) ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, @@ -389,13 +390,26 @@ ego_to_graphlet_counts <- function(ego_networks, max_graphlet_size = 4) { #' @param graph An \code{igraph} object #' @param order The number of steps from the source node to include #' nodes for each ego-network. +#' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} +#' nodes are returned. +#' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} +#' edges are returned. #' @param ... Additional parameters to be passed to the underlying #' \code{igraph::make_ego_graph} function used. #' @export -make_named_ego_graph <- function(graph, order, ...) { +make_named_ego_graph <- function(graph, order, min_ego_nodes=3, + min_ego_edges=1, ...) { + ego_networks <- igraph::make_ego_graph(graph, order, ...) names(ego_networks) <- igraph::V(graph)$name - ego_networks + + # Drop ego-networks that don't have the minimum number of nodes or edges + drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { + (igraph::vcount(g) < min_ego_nodes) | (igraph::ecount(g) < min_ego_edges) + })) + ego_networks <- ego_networks[!drop_index] + + return(ego_networks) } #' Orbit to graphlet counts diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index 67987693..445f3dbe 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -21,6 +21,13 @@ library("purrr") # Maximum graphlet size to calculate counts for. # We choose the specific graphlet size for the Netdis metric later. max_graphlet_size = 4 + +# Ego network neighbourhood size +neighbourhood_size = 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 ``` ## Load graphs @@ -44,13 +51,16 @@ ref_graph <- read_simple_graph(ref_path, format = "ncol") ## Generate ego networks ```{r} - -# Set ego network neighbourhood size -neighbourhood_size = 2 - # Get ego networks for both query graphs -ego_1 <- make_named_ego_graph(graph_1, order = neighbourhood_size) -ego_2 <- make_named_ego_graph(graph_2, order = neighbourhood_size) +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) ``` ## Count graphlets in ego networks @@ -59,26 +69,44 @@ graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graph graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) ``` - -## Define function for mapping ego networks to density bins -```{r} - -``` - ### Generate scaled graphlet counts for reference graph. ```{r} # Generate ego networks for reference graph -ego_ref <- make_named_ego_graph(ref_graph, order = neighbourhood_size) +ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) # Count graphlets for ego networks in reference graph graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) # Scale ego-network graphlet counts by dividing by total number of k-tuples in # ego-network (where k is graphlet size) -ref_ego_graphlet_tuples <- +ego_graphlet_tuples_ref <- count_graphlet_tuples_ego(ego_ref, max_graphlet_size = max_graphlet_size) -ref_ego_graphlet_counts <- scale_graphlet_count(graphlet_counts_ref, ref_ego_graphlet_tuples) +scaled_graphlet_counts_ref <- scale_graphlet_count(graphlet_counts_ref, ego_graphlet_tuples_ref) +``` + +## Define function for mapping ego networks to density bins +```{r} + +min_bin_count <- 5 +num_bins <- 100 + + +# Get ego-network densities +densities_ref <- purrr::simplify(purrr::map_dbl(ego_ref, igraph::edge_density)) + +# Adaptively bin ego-network densities +binned_densities <- binned_densities_adaptive( + densities_ref, min_counts_per_interval = min_bin_count, num_intervals = num_bins) + +# Average graphlet counts across density bins +density_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + scaled_graphlet_counts_ref, binned_densities$interval_indexes) + + ``` ## Calculate average count of each graphlet type per ego network in each density bin of reference graph From 54fdabd6a2f39f44deb4148eb8b62a526bec3d7d Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Wed, 24 Jul 2019 17:15:18 +0100 Subject: [PATCH 011/188] rename index entry of 2 graphs vignette --- vignettes/quickstart_netdis_2graphs.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index 445f3dbe..37f6b0fa 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -1,10 +1,10 @@ --- title: "Quick start guide for Netdis - 2 graphs" -author: "Martin O'Reilly" +author: "Martin O'Reilly, Jack Roberts" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Quick start for Netdis} + %\VignetteIndexEntry{Quick start for Netdis - 2 graphs} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- From 016e7a041072d5b8ebb2be417cd521025e20012e Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 25 Jul 2019 14:48:37 +0100 Subject: [PATCH 012/188] min_ego_nodes and min_ego_edges added to test_orca_interface Default behaviour of ego network generating functions is now min_ego_nodes <- 3 and min_ego_edges <- 1. Set these to zero in function calls so orca tests now passing. Should implement tests to check that filtering of small ego networks is working as expected (i.e. when min_ego_edges and min_ego_nodes > 0) --- NAMESPACE | 1 + man/count_graphlets_ego.Rd | 8 +++- man/count_graphlets_ego_scaled.Rd | 8 +++- man/ego_to_graphlet_counts.Rd | 26 +++++++++++ man/make_named_ego_graph.Rd | 9 +++- tests/testthat/test_orca_interface.R | 50 ++++++++++++++++----- vignettes/quickstart_netdis_2graphs.Rmd | 58 ++++++++++++++++++------- 7 files changed, 129 insertions(+), 31 deletions(-) create mode 100644 man/ego_to_graphlet_counts.Rd diff --git a/NAMESPACE b/NAMESPACE index a49b57ba..caa0c098 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ export(dhist_mean_location) export(dhist_std) export(dhist_variance) export(ecmf_knots) +export(ego_to_graphlet_counts) export(emd) export(emd_cs) export(emd_fast_no_smoothing) diff --git a/man/count_graphlets_ego.Rd b/man/count_graphlets_ego.Rd index 262e616e..55702f61 100644 --- a/man/count_graphlets_ego.Rd +++ b/man/count_graphlets_ego.Rd @@ -5,7 +5,7 @@ \title{Ego-network graphlet counts} \usage{ count_graphlets_ego(graph, max_graphlet_size = 4, neighbourhood_size, - return_ego_networks = FALSE) + min_ego_nodes = 3, min_ego_edges = 1, return_ego_networks = FALSE) } \arguments{ \item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} @@ -16,6 +16,12 @@ Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} \item{neighbourhood_size}{The number of steps from the source node to include nodes for each ego-network.} +\item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} +nodes are returned.} + +\item{min_ego_edges}{Only ego networks with at least \code{min_ego_edges} +edges are returned.} + \item{return_ego_networks}{If \code{TRUE}, return ego-networks alongside graphlet counts to enable further processing.} } diff --git a/man/count_graphlets_ego_scaled.Rd b/man/count_graphlets_ego_scaled.Rd index fb6eb069..76046db8 100644 --- a/man/count_graphlets_ego_scaled.Rd +++ b/man/count_graphlets_ego_scaled.Rd @@ -5,7 +5,7 @@ \title{Scaled graphlet count for ego-networks} \usage{ count_graphlets_ego_scaled(graph, max_graphlet_size, neighbourhood_size, - return_ego_networks = FALSE) + min_ego_nodes = 3, min_ego_edges = 1, return_ego_networks = FALSE) } \arguments{ \item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} @@ -16,6 +16,12 @@ Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} \item{neighbourhood_size}{The number of steps from the source node to include nodes for each ego-network.} +\item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} +nodes are returned.} + +\item{min_ego_edges}{Only ego networks with at least \code{min_ego_edges} +edges are returned.} + \item{return_ego_networks}{If \code{TRUE}, return ego-networks alongside graphlet counts to enable further processing.} } diff --git a/man/ego_to_graphlet_counts.Rd b/man/ego_to_graphlet_counts.Rd new file mode 100644 index 00000000..9fe6b0b5 --- /dev/null +++ b/man/ego_to_graphlet_counts.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/orca_interface.R +\name{ego_to_graphlet_counts} +\alias{ego_to_graphlet_counts} +\title{ego_to_graphlet_counts +JACK To follow through logic of paper steps, wanted to pass +ego networks to the function for generating graphlet counts, +not the input query graph directly (as in count_graphlets_ego above).} +\usage{ +ego_to_graphlet_counts(ego_networks, max_graphlet_size = 4) +} +\arguments{ +\item{ego_networks}{Named list of ego networks for a graph.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +} +\value{ +returns an RxC matrix +containing counts of each graphlet (columns, C) for each ego-network (rows, R). +Columns are labelled with graphlet IDs and rows are +labelled with the ID of the central node in each ego-network. +} +\description{ +Calculates graphlet counts for previously generated ego networks. +} diff --git a/man/make_named_ego_graph.Rd b/man/make_named_ego_graph.Rd index 9bfeca25..037c0395 100644 --- a/man/make_named_ego_graph.Rd +++ b/man/make_named_ego_graph.Rd @@ -4,7 +4,8 @@ \alias{make_named_ego_graph} \title{Get ego-networks for a graph as a named list} \usage{ -make_named_ego_graph(graph, order, ...) +make_named_ego_graph(graph, order, min_ego_nodes = 3, + min_ego_edges = 1, ...) } \arguments{ \item{graph}{An \code{igraph} object} @@ -12,6 +13,12 @@ make_named_ego_graph(graph, order, ...) \item{order}{The number of steps from the source node to include nodes for each ego-network.} +\item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} +nodes are returned.} + +\item{min_ego_edges}{Only ego networks with at least \code{min_ego_edges} +edges are returned.} + \item{...}{Additional parameters to be passed to the underlying \code{igraph::make_ego_graph} function used.} } diff --git a/tests/testthat/test_orca_interface.R b/tests/testthat/test_orca_interface.R index fadcb428..73b8260f 100644 --- a/tests/testthat/test_orca_interface.R +++ b/tests/testthat/test_orca_interface.R @@ -956,6 +956,9 @@ test_that("make_named_ego_graph labels each ego-network with the correct node na # Test ego-networks of order 1. # We compare edgelists as igraphs do not implement comparison order <- 1 + min_ego_nodes <- 0 + min_ego_edges <- 0 + expected_ego_elists_o1 <- list( n1 = dplyr::arrange(data.frame(expected_ego_elist_n1_o1), X1, X2), n2 = dplyr::arrange(data.frame(expected_ego_elist_n2_o1), X1, X2), @@ -970,9 +973,13 @@ test_that("make_named_ego_graph labels each ego-network with the correct node na ) # Generate actual ego-networks and convert to edge lists for comparison actual_ego_elists_o1 <- - purrr::map(make_named_ego_graph(graph, order), function(g) { - dplyr::arrange(data.frame(igraph::as_edgelist(g)), X1, X2) - }) + purrr::map(make_named_ego_graph(graph, order, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges), + function(g) { + dplyr::arrange(data.frame(igraph::as_edgelist(g)), X1, X2) + } + ) expect_equal(actual_ego_elists_o1, expected_ego_elists_o1) }) @@ -1074,12 +1081,19 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall colnames(expected_counts_order_2) <- graphlet_labels # Count graphlets in each ego network of the graph with only counts requested + min_ego_nodes <- 0 + min_ego_edges <- 0 + actual_counts_order_1 <- - count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1) + count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) actual_counts_order_2 <- count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2) + neighbourhood_size = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) # Test that actual counts match expected with only counts requested (default) expect_equal(actual_counts_order_1, expected_counts_order_1) @@ -1087,8 +1101,12 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # Test that actual and returned ego networks match expected # 1. Define expected - expected_ego_networks_order_1 <- make_named_ego_graph(graph, order = 1) - expected_ego_networks_order_2 <- make_named_ego_graph(graph, order = 2) + expected_ego_networks_order_1 <- make_named_ego_graph(graph, order = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + expected_ego_networks_order_2 <- make_named_ego_graph(graph, order = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) expected_counts_with_networks_order_1 <- list(graphlet_counts = expected_counts_order_1, ego_networks = expected_ego_networks_order_1) @@ -1097,11 +1115,19 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall ego_networks = expected_ego_networks_order_2) # 2. Calculate actual actual_counts_with_networks_order_1 <- - count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, return_ego_networks = TRUE) + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE) actual_counts_with_networks_order_2 <- - count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, return_ego_networks = TRUE) + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE) # Test that actual counts match expected with ego-networks requested expect_equal(actual_counts_with_networks_order_1$graphlet_counts, expected_counts_order_1) expect_equal(actual_counts_with_networks_order_2$graphlet_counts, expected_counts_order_2) diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index 37f6b0fa..3d9b781c 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -28,6 +28,10 @@ neighbourhood_size = 2 # Minimum size of ego networks to consider min_ego_nodes <- 3 min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 ``` ## Load graphs @@ -51,7 +55,7 @@ ref_graph <- read_simple_graph(ref_path, format = "ncol") ## Generate ego networks ```{r} -# Get ego networks for both query graphs +# Get ego networks for query graphs and reference graph ego_1 <- make_named_ego_graph(graph_1, order = neighbourhood_size, min_ego_nodes = min_ego_nodes, @@ -61,24 +65,23 @@ ego_2 <- make_named_ego_graph(graph_2, order = neighbourhood_size, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges) + +ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) ``` ## Count graphlets in ego networks ```{r} +# Count graphlets for ego networks in query and reference graphs graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) +graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) ``` ### Generate scaled graphlet counts for reference graph. ```{r} -# Generate ego networks for reference graph -ego_ref <- make_named_ego_graph(ref_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -# Count graphlets for ego networks in reference graph -graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) # Scale ego-network graphlet counts by dividing by total number of k-tuples in # ego-network (where k is graphlet size) @@ -91,10 +94,6 @@ scaled_graphlet_counts_ref <- scale_graphlet_count(graphlet_counts_ref, ego_grap ## Define function for mapping ego networks to density bins ```{r} -min_bin_count <- 5 -num_bins <- 100 - - # Get ego-network densities densities_ref <- purrr::simplify(purrr::map_dbl(ego_ref, igraph::edge_density)) @@ -106,14 +105,41 @@ binned_densities <- binned_densities_adaptive( density_binned_graphlet_counts <- mean_density_binned_graphlet_counts( scaled_graphlet_counts_ref, binned_densities$interval_indexes) - ``` -## Calculate average count of each graphlet type per ego network in each density bin of reference graph +## Function to calculate expected graphlet counts for a query network + ```{r} -``` +# Ew : density_binned_graphlet_counts +# bins: binned_densities$breaks + +# fn (ego, Ew, bins) +# make partial with Ew, bins given + + +#netdis_expected_graphlet_counts_ego( +# graph, max_graphlet_size, neighbourhood_size, +# density_breaks, density_binned_reference_counts, +# min_ego_nodes = 3, min_ego_edges = 1) + + # Generate ego-networks for query graph + # no - pass in pre-calculated ego networks + # remove params graph, neighbourhood_size, min_ego_nodes, min_ego_edges + # add param ego_networks + + # Map over query graph ego-networks, using reference graph statistics to + # calculate expected graphlet counts for each ego-network. + expected_graphlet_counts <- + purrr::map(ego_networks, netdis_expected_graphlet_counts, + max_graphlet_size = max_graphlet_size, + density_breaks = density_breaks, + density_binned_reference_counts = density_binned_reference_counts) + names(expected_graphlet_counts) <- names(ego_networks) + # Simplify list to array + t(simplify2array(expected_graphlet_counts)) +``` ## Centre and scale graphlet counts of query graphs based on statistics of reference graph ```{r} From 6db8c4698d6797f256f3428d770b5fc6fa704417 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 25 Jul 2019 15:06:31 +0100 Subject: [PATCH 013/188] ego filtering args added to test_measures_net_dist ego network generating functions now take min_ego_edges and min_ego_nodes arguments to filter out small ego networks, which default to 1 and 3 respectively. Set to zero for tests. Filtering is probably happening explicitly elsewhere in the tests - could be removed with min_ego_edges and min_ego_nodes given non-zero values as appropriate. --- tests/testthat/test_measures_net_dis.R | 83 +++++++++++++++++++------- 1 file changed, 62 insertions(+), 21 deletions(-) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index f045651f..95cf4777 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -144,12 +144,20 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals",{ # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 max_graphlet_size <- 4 + min_ego_edges <- 0 + min_ego_nodes <- 0 + actual_counts_order_1 <- - count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1) + count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes, + neighbourhood_size = 1) actual_counts_order_2 <- - count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2) + count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes, + neighbourhood_size = 2) + graphlet_key <- graphlet_key(max_graphlet_size) k <- graphlet_key$node_count # Set manually verified counts @@ -190,8 +198,12 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals",{ # Test that actual counts and returned ego networks match expected # 1. Define expected - expected_ego_networks_order_1 <- make_named_ego_graph(graph, order = 1) - expected_ego_networks_order_2 <- make_named_ego_graph(graph, order = 2) + expected_ego_networks_order_1 <- make_named_ego_graph(graph, order = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes) + expected_ego_networks_order_2 <- make_named_ego_graph(graph, order = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes) expected_counts_with_networks_order_1 <- list(graphlet_counts = expected_counts_order_1, ego_networks = expected_ego_networks_order_1) @@ -201,10 +213,15 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals",{ # 2. Calculate actual actual_counts_with_networks_order_1 <- count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, return_ego_networks = TRUE) + neighbourhood_size = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes, + return_ego_networks = TRUE) actual_counts_with_networks_order_2 <- - count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, return_ego_networks = TRUE) + count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes, + neighbourhood_size = 2, return_ego_networks = TRUE) # 3. Compare # Comparison is not implemented for igraph objects, so convert all igraphs to @@ -469,9 +486,16 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) max_graphlet_size = 4 + min_ego_edges <- 0 + min_ego_nodes <- 0 + # Make graph ego networks - ego_networks_o1 <- make_named_ego_graph(graph, order = 1) - ego_networks_o2 <- make_named_ego_graph(graph, order = 2) + ego_networks_o1 <- make_named_ego_graph(graph, order = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes) + ego_networks_o2 <- make_named_ego_graph(graph, order = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes) # Set manually-verified node counts and densities # 1. Ego-networks of order 1 num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) @@ -587,8 +611,14 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no names(graphlet_sizes) <- graphlet_labels max_graphlet_size = 4 # Make graph ego networks - ego_networks_o1 <- make_named_ego_graph(graph, order = 1) - ego_networks_o2 <- make_named_ego_graph(graph, order = 2) + min_ego_nodes <- 0 + min_edgo_edges <- 0 + ego_networks_o1 <- make_named_ego_graph(graph, order = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_edgo_edges) + ego_networks_o2 <- make_named_ego_graph(graph, order = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_edgo_edges) # Set manually-verified node counts and densities # 1. Ego-networks of order 1 num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) @@ -662,8 +692,8 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)), c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)) ) - min_ego_nodes = 3 - min_ego_edges = 1 + min_ego_nodes <- 3 + min_ego_edges <- 1 # Drop rows for nodes with ewer than minumum required nodes and edges in ego # network scaled_reference_counts_o1 <- @@ -731,7 +761,8 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no neighbourhood_size = 1, density_breaks = breaks_o1, density_binned_reference_counts_o1, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges )) expect_equal(expected_expected_graphlet_counts_ego_o2, netdis_expected_graphlet_counts_ego( @@ -739,18 +770,28 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no neighbourhood_size = 2, density_breaks = breaks_o2, density_binned_reference_counts_o2, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges )) # Generate partially applied functions using function under test actual_expected_graphlet_counts_ego_fn_o1 <- netdis_expected_graphlet_counts_ego_fn( - graph, max_graphlet_size = max_graphlet_size, neighbourhood_size = 1, - min_bin_count = min_bin_count, num_bins = num_bins) + graph, max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_bin_count = min_bin_count, + num_bins = num_bins, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) actual_expected_graphlet_counts_ego_fn_o2 <- netdis_expected_graphlet_counts_ego_fn( - graph, max_graphlet_size = max_graphlet_size, neighbourhood_size = 2, - min_bin_count = min_bin_count, num_bins = num_bins) + graph, max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + min_bin_count = min_bin_count, + num_bins = num_bins, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) # Generate actual expected accounts by applying generated functions to test # graph actual_expected_graphlet_counts_ego_o1 <- From 00589e01718534552257b5bf1b243a8067ec6c07 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 25 Jul 2019 15:38:33 +0100 Subject: [PATCH 014/188] comment prototype code to check all tests passing --- vignettes/quickstart_netdis_2graphs.Rmd | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index 3d9b781c..95de67a5 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -131,14 +131,14 @@ density_binned_graphlet_counts <- mean_density_binned_graphlet_counts( # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. - expected_graphlet_counts <- - purrr::map(ego_networks, netdis_expected_graphlet_counts, - max_graphlet_size = max_graphlet_size, - density_breaks = density_breaks, - density_binned_reference_counts = density_binned_reference_counts) - names(expected_graphlet_counts) <- names(ego_networks) - # Simplify list to array - t(simplify2array(expected_graphlet_counts)) +# expected_graphlet_counts <- +# purrr::map(ego_networks, netdis_expected_graphlet_counts, +# max_graphlet_size = max_graphlet_size, +# density_breaks = density_breaks, +# density_binned_reference_counts = density_binned_reference_counts) +# names(expected_graphlet_counts) <- names(ego_networks) +# # Simplify list to array +# t(simplify2array(expected_graphlet_counts)) ``` ## Centre and scale graphlet counts of query graphs based on statistics of reference graph From 707ca648e07f80d42eea168595817a7ca1205352 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 25 Jul 2019 16:05:27 +0100 Subject: [PATCH 015/188] Update quickstart_netdis.Rmd --- vignettes/quickstart_netdis.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/quickstart_netdis.Rmd b/vignettes/quickstart_netdis.Rmd index 41f91d5d..644a21cd 100644 --- a/vignettes/quickstart_netdis.Rmd +++ b/vignettes/quickstart_netdis.Rmd @@ -69,5 +69,5 @@ ref_ego_graphlet_counts <- purrr::map(ref_ego_networks, count_graphlets_for_grap ref_ego_graphlet_tuples <- count_graphlet_tuples_ego(ref_ego_networks, max_graphlet_size = max_graphlet_size) -ref_ego_graphlet_counts <- scale_graphlet_count(ref_ego_graphlet_counts, ref_ego_graphlet_tuples) +#ref_ego_graphlet_counts <- scale_graphlet_count(ref_ego_graphlet_counts, ref_ego_graphlet_tuples) ``` From 9bdfe765404c693d2a01dcf3b221e4a13be7126e Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 26 Jul 2019 17:00:05 +0100 Subject: [PATCH 016/188] expected and centred counts calculated --- vignettes/quickstart_netdis_2graphs.Rmd | 48 ++++++++++--------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index 95de67a5..39d15ba2 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -91,59 +91,47 @@ ego_graphlet_tuples_ref <- scaled_graphlet_counts_ref <- scale_graphlet_count(graphlet_counts_ref, ego_graphlet_tuples_ref) ``` -## Define function for mapping ego networks to density bins +## Bin reference ego networks by density and calculate mean graphlet counts in each bin ```{r} # Get ego-network densities densities_ref <- purrr::simplify(purrr::map_dbl(ego_ref, igraph::edge_density)) -# Adaptively bin ego-network densities +# Adaptively bin ref ego-network densities binned_densities <- binned_densities_adaptive( densities_ref, min_counts_per_interval = min_bin_count, num_intervals = num_bins) -# Average graphlet counts across density bins +# Average ref graphlet counts across density bins density_binned_graphlet_counts <- mean_density_binned_graphlet_counts( scaled_graphlet_counts_ref, binned_densities$interval_indexes) ``` -## Function to calculate expected graphlet counts for a query network +## Calculate expected graphlet counts for query networks ```{r} -# Ew : density_binned_graphlet_counts -# bins: binned_densities$breaks +exp_graphlet_counts_1 <- purrr::map(ego_1, netdis_expected_graphlet_counts, + max_graphlet_size = max_graphlet_size, + density_breaks = binned_densities$breaks, + density_binned_reference_counts = density_binned_graphlet_counts) -# fn (ego, Ew, bins) -# make partial with Ew, bins given +exp_graphlet_counts_1 <- t(simplify2array(exp_graphlet_counts_1)) +exp_graphlet_counts_2 <- purrr::map(ego_2, netdis_expected_graphlet_counts, + max_graphlet_size = max_graphlet_size, + density_breaks = binned_densities$breaks, + density_binned_reference_counts = density_binned_graphlet_counts) +exp_graphlet_counts_2 <- t(simplify2array(exp_graphlet_counts_2)) -#netdis_expected_graphlet_counts_ego( -# graph, max_graphlet_size, neighbourhood_size, -# density_breaks, density_binned_reference_counts, -# min_ego_nodes = 3, min_ego_edges = 1) - - # Generate ego-networks for query graph - # no - pass in pre-calculated ego networks - # remove params graph, neighbourhood_size, min_ego_nodes, min_ego_edges - # add param ego_networks - - # Map over query graph ego-networks, using reference graph statistics to - # calculate expected graphlet counts for each ego-network. -# expected_graphlet_counts <- -# purrr::map(ego_networks, netdis_expected_graphlet_counts, -# max_graphlet_size = max_graphlet_size, -# density_breaks = density_breaks, -# density_binned_reference_counts = density_binned_reference_counts) -# names(expected_graphlet_counts) <- names(ego_networks) -# # Simplify list to array -# t(simplify2array(expected_graphlet_counts)) ``` -## Centre and scale graphlet counts of query graphs based on statistics of reference graph +## Centre graphlet counts of query graphs based on statistics of reference graph ```{r} -#query_ego_centred_graphlet_counts <- CENTRED_COUNTS(query_ego_subraph_counts, REF_COUNTS) +centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 + +centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 ``` ## Sum centred counts across all ego networks From aa43c750f5535adac87b0b7535a9ef1cf3c634e6 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Mon, 29 Jul 2019 15:13:43 +0100 Subject: [PATCH 017/188] working 2 graph vignette for netdis Vignette to calculate netdis for 2 query graphs and a reference network: * Follows steps of paper. * Output of previous step used as input to next step (as much as possible). * Calculates netdis for all graphlet sizes up to max_graphlet_size * No explicit calls to purrr - all hidden in functions for each step. * Needs a lot of tidying of function names, docs, removal of potentially obsolete functions etc. Questions: * What should be in top level netdis function - current netdis function takes centred graphlet counts (i.e. output of all but last step). Should that be parent function taking two query graphs, ref graph, and parameters? * How to implement comparison of many graphs? This may differ depending on choice above - e.g. when comparing many graphs ideally don't want to be re-calculating ego nets and graphlet counts each time. --- NAMESPACE | 4 + R/measures_net_dis.R | 92 ++++ ...netdis_expected_graphlet_counts_per_ego.Rd | 21 + man/netdis_uptok.Rd | 26 + man/scale_graphlet_counts_ego.Rd | 26 + vignettes/quickstart_netdis.R | 9 + vignettes/quickstart_netdis.html | 69 +-- vignettes/quickstart_netdis_2graphs.R | 117 +++++ vignettes/quickstart_netdis_2graphs.Rmd | 113 +++-- vignettes/quickstart_netdis_2graphs.html | 448 ++++++++++++++++++ 10 files changed, 835 insertions(+), 90 deletions(-) create mode 100644 man/netdis_expected_graphlet_counts_per_ego.Rd create mode 100644 man/netdis_uptok.Rd create mode 100644 man/scale_graphlet_counts_ego.Rd create mode 100644 vignettes/quickstart_netdis_2graphs.R create mode 100644 vignettes/quickstart_netdis_2graphs.html diff --git a/NAMESPACE b/NAMESPACE index caa0c098..ee580a7d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ export(dhist_mean_location) export(dhist_std) export(dhist_variance) export(ecmf_knots) +export(ego_network_density) export(ego_to_graphlet_counts) export(emd) export(emd_cs) @@ -53,7 +54,9 @@ export(netdis_centred_graphlet_counts_ego) export(netdis_expected_graphlet_counts) export(netdis_expected_graphlet_counts_ego) export(netdis_expected_graphlet_counts_ego_fn) +export(netdis_expected_graphlet_counts_per_ego) export(netdis_for_all_graphs) +export(netdis_uptok) export(normalise_dhist_mass) export(normalise_dhist_variance) export(orbit_key) @@ -61,6 +64,7 @@ export(orbit_to_graphlet_counts) export(read_simple_graph) export(read_simple_graphs) export(scale_graphlet_count) +export(scale_graphlet_counts_ego) export(shift_dhist) export(simplify_graph) export(sort_dhist) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 53182f8c..2dba7b43 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -56,6 +56,38 @@ netdis <- function(centred_graphlet_counts1, centred_graphlet_counts2, 0.5 * (1 - netds2) } +#' Netdis - graphlets up to max_graphlet_size +#' +#' Calculate Netdis statistic between two graphs from their Centred Graphlet +#' Counts (generated using \code{netdis_centred_graphlet_counts}). +#' @param centred_graphlet_counts1 Centred Graphlet Counts for graph 1 +#' @param centred_graphlet_counts2 Centred Graphlet Counts for graph 2 +#' @param max_graphlet_size The size of graphlets to use for the Netdis calculation +#' The size of a graphlet is the number of nodes it contains. Netdis is calculated +#' for all graphlets from size 3 to size max_graphlet_size. +#' @return Netdis statistic calculated using centred counts for graphlets of +#' the specified size +#' @export +netdis_uptok <- function(centred_graphlet_counts1, centred_graphlet_counts2, + max_graphlet_size) +{ + if ((max_graphlet_size > 5) | (max_graphlet_size < 3)) { + stop("max_graphlet_size must be 3, 4 or 5.") + } + + netdis_statistics <- purrr::map(3:max_graphlet_size, + netdis, + centred_graphlet_counts1=sum_graphlet_counts_1, + centred_graphlet_counts2=sum_graphlet_counts_2) + + netdis_statistics <- simplify2array(netdis_statistics) + + names(netdis_statistics) <- sapply("netdis", paste, 3:max_graphlet_size, sep="") + + return(netdis_statistics) +} + + #' Scaled graphlet count for ego-networks #' #' Calculates graphlet counts for the n-step ego-network of each node in a graph, @@ -268,6 +300,35 @@ netdis_expected_graphlet_counts_ego <- function( t(simplify2array(expected_graphlet_counts)) } +#' INTERNAL FUNCTION - Do not call directly +#' +#' JACK To follow through logic of paper steps, wanted to pass +#' ego networks to the function, not the input query graph +#' (as in netdis_expected_graphlet_counts_ego_fn above). +#' +#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to +#' generate a function for calculating expected ego-network graphlet counts +#' from the statistics of a provided reference graph. +#' Temporarily accessible during development. +#' TODO: Remove @export prior to publishing +#' @export +netdis_expected_graphlet_counts_per_ego <- function( + ego_networks, max_graphlet_size, + density_breaks, density_binned_reference_counts) { + + # Map over query graph ego-networks, using reference graph statistics to + # calculate expected graphlet counts for each ego-network. + expected_graphlet_counts <- + purrr::map(ego_networks, netdis_expected_graphlet_counts, + max_graphlet_size = max_graphlet_size, + density_breaks = density_breaks, + density_binned_reference_counts = density_binned_reference_counts) + names(expected_graphlet_counts) <- names(ego_networks) + + # Simplify list to array + t(simplify2array(expected_graphlet_counts)) +} + #' INTERNAL FUNCTION - Do not call directly #' #' Used by \code{netdis_expected_graphlet_counts_ego} to @@ -332,6 +393,37 @@ count_graphlet_tuples_ego <- function(ego_networks, max_graphlet_size) { graphlet_tuple_counts } +#' @export +ego_network_density <- function(ego_networks) { + densities <- purrr::simplify(purrr::map_dbl(ego_networks, + igraph::edge_density)) + + return(densities) +} + + + +#' Scale graphlet counts for an ego network by the n choose k possible +#' choices of k nodes in that ego-network, where n is the number of nodes +#' in the ego network and k is the number of nodes in the graphlet. +#' +#' @param ego_networks Pre-generated ego networks for an input graph. +#' @param graphlet_counts Pre-calculated graphlet counts for each ego_network. +#' @param max_graphlet_size Determines the maximum size of graphlets included +#' in graphlet_counts. +#' @return scaled graphlet counts. +#' @export +scale_graphlet_counts_ego <- function(ego_networks, graphlet_counts, + max_graphlet_size) { + ego_graphlet_tuples <- + count_graphlet_tuples_ego(ego_networks, max_graphlet_size = max_graphlet_size) + + scaled_graphlet_counts <- scale_graphlet_count(graphlet_counts, ego_graphlet_tuples) + + return (scaled_graphlet_counts) +} + + #' @export count_graphlet_tuples <- function(graph, max_graphlet_size) { graph_node_count <- igraph::vcount(graph) diff --git a/man/netdis_expected_graphlet_counts_per_ego.Rd b/man/netdis_expected_graphlet_counts_per_ego.Rd new file mode 100644 index 00000000..50d5dd4b --- /dev/null +++ b/man/netdis_expected_graphlet_counts_per_ego.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_expected_graphlet_counts_per_ego} +\alias{netdis_expected_graphlet_counts_per_ego} +\title{INTERNAL FUNCTION - Do not call directly} +\usage{ +netdis_expected_graphlet_counts_per_ego(ego_networks, max_graphlet_size, + density_breaks, density_binned_reference_counts) +} +\description{ +JACK To follow through logic of paper steps, wanted to pass +ego networks to the function, not the input query graph +(as in netdis_expected_graphlet_counts_ego_fn above). +} +\details{ +Used by \code{netdis_expected_graphlet_counts_ego_fn} to +generate a function for calculating expected ego-network graphlet counts +from the statistics of a provided reference graph. +Temporarily accessible during development. +TODO: Remove @export prior to publishing +} diff --git a/man/netdis_uptok.Rd b/man/netdis_uptok.Rd new file mode 100644 index 00000000..c31a602d --- /dev/null +++ b/man/netdis_uptok.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_uptok} +\alias{netdis_uptok} +\title{Netdis - graphlets up to max_graphlet_size} +\usage{ +netdis_uptok(centred_graphlet_counts1, centred_graphlet_counts2, + max_graphlet_size) +} +\arguments{ +\item{centred_graphlet_counts1}{Centred Graphlet Counts for graph 1} + +\item{centred_graphlet_counts2}{Centred Graphlet Counts for graph 2} + +\item{max_graphlet_size}{The size of graphlets to use for the Netdis calculation +The size of a graphlet is the number of nodes it contains. Netdis is calculated +for all graphlets from size 3 to size max_graphlet_size.} +} +\value{ +Netdis statistic calculated using centred counts for graphlets of +the specified size +} +\description{ +Calculate Netdis statistic between two graphs from their Centred Graphlet +Counts (generated using \code{netdis_centred_graphlet_counts}). +} diff --git a/man/scale_graphlet_counts_ego.Rd b/man/scale_graphlet_counts_ego.Rd new file mode 100644 index 00000000..46aa515a --- /dev/null +++ b/man/scale_graphlet_counts_ego.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{scale_graphlet_counts_ego} +\alias{scale_graphlet_counts_ego} +\title{Scale graphlet counts for an ego network by the n choose k possible +choices of k nodes in that ego-network, where n is the number of nodes +in the ego network and k is the number of nodes in the graphlet.} +\usage{ +scale_graphlet_counts_ego(ego_networks, graphlet_counts, max_graphlet_size) +} +\arguments{ +\item{ego_networks}{Pre-generated ego networks for an input graph.} + +\item{graphlet_counts}{Pre-calculated graphlet counts for each ego_network.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets included +in graphlet_counts.} +} +\value{ +scaled graphlet counts. +} +\description{ +Scale graphlet counts for an ego network by the n choose k possible +choices of k nodes in that ego-network, where n is the number of nodes +in the ego network and k is the number of nodes in the graphlet. +} diff --git a/vignettes/quickstart_netdis.R b/vignettes/quickstart_netdis.R index 1ba1135f..c223750e 100644 --- a/vignettes/quickstart_netdis.R +++ b/vignettes/quickstart_netdis.R @@ -30,9 +30,18 @@ ego_graphlet_counts <- purrr::map_depth(ego_networks, 2, count_graphlets_for_gra file <- system.file(file.path("extdata", "random", "ER_1250_10_1"), package = "netdist") ref_graph <- read_simple_graph(file, format = "ncol") + # Generate ego networks for reference graph ref_ego_networks <- make_named_ego_graph(ref_graph, order = neighbourhood_size) + # Count graphlets for ego networks in reference graph ref_ego_graphlet_counts <- purrr::map(ref_ego_networks, count_graphlets_for_graph, max_graphlet_size = max_graphlet_size) +# Scale ego-network graphlet counts by dividing by total number of k-tuples in + # ego-network (where k is graphlet size) +ref_ego_graphlet_tuples <- + count_graphlet_tuples_ego(ref_ego_networks, max_graphlet_size = max_graphlet_size) + +#ref_ego_graphlet_counts <- scale_graphlet_count(ref_ego_graphlet_counts, ref_ego_graphlet_tuples) + diff --git a/vignettes/quickstart_netdis.html b/vignettes/quickstart_netdis.html index c3f80279..278edd5c 100644 --- a/vignettes/quickstart_netdis.html +++ b/vignettes/quickstart_netdis.html @@ -12,7 +12,7 @@ - + Quick start guide for Netdis @@ -37,7 +37,7 @@ pre.numberSource a.sourceLine { position: relative; left: -4em; } pre.numberSource a.sourceLine::before - { content: attr(data-line-number); + { content: attr(title); position: relative; left: -1em; text-align: right; vertical-align: baseline; border: none; pointer-events: all; display: inline-block; -webkit-touch-callout: none; -webkit-user-select: none; @@ -303,55 +303,64 @@

Quick start guide for Netdis

Martin O’Reilly

-

2019-05-02

+

2019-07-29

Define function for calculating expected counts for ego networks

Here we generate these from a reference graph.

- diff --git a/vignettes/quickstart_netdis_2graphs.R b/vignettes/quickstart_netdis_2graphs.R new file mode 100644 index 00000000..0732236f --- /dev/null +++ b/vignettes/quickstart_netdis_2graphs.R @@ -0,0 +1,117 @@ +## ------------------------------------------------------------------------ +# Load libraries +library("netdist") +library("purrr") + +## ------------------------------------------------------------------------ +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Load reference graph +# JACK - need to deal with case where ref graph not used. +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +## ------------------------------------------------------------------------ +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size = 4 + +# Ego network neighbourhood size +neighbourhood_size = 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +## ------------------------------------------------------------------------ +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +## ------------------------------------------------------------------------ +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) + +graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) + +## ------------------------------------------------------------------------ + +# Scale ego-network graphlet counts by dividing by total number of k-tuples in +# ego-network (where k is graphlet size) +scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(ego_ref, + graphlet_counts_ref, + max_graphlet_size) + +# Get ego-network densities +densities_ref <- ego_network_density(ego_ref) + +# Adaptively bin ref ego-network densities +binned_densities <- binned_densities_adaptive(densities_ref, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ref_ego_density_bins <- binned_densities$breaks + +# Average ref graphlet counts across density bins +ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + scaled_graphlet_counts_ref, + binned_densities$interval_indexes) + + +## ------------------------------------------------------------------------ +# Calculate expected graphlet counts (using ref graph ego network density bins) +exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, + max_graphlet_size, + ref_ego_density_bins, + ref_binned_graphlet_counts) + + +exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, + max_graphlet_size, + ref_ego_density_bins, + ref_binned_graphlet_counts) + +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 + +centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 + +## ------------------------------------------------------------------------ +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) + +## ------------------------------------------------------------------------ +netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + + +## ------------------------------------------------------------------------ + + diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index 39d15ba2..e6f5a2e3 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -16,26 +16,7 @@ library("netdist") library("purrr") ``` -## Set Netdis parameters -```{r} -# Maximum graphlet size to calculate counts for. -# We choose the specific graphlet size for the Netdis metric later. -max_graphlet_size = 4 - -# Ego network neighbourhood size -neighbourhood_size = 2 - -# Minimum size of ego networks to consider -min_ego_nodes <- 3 -min_ego_edges <- 1 - -# Ego network density binning parameters -min_bin_count <- 5 -num_bins <- 100 -``` - ## Load graphs - ```{r} # Set source directory for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") @@ -48,12 +29,30 @@ graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") # Load reference graph +# JACK - need to deal with case where ref graph not used. ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), package = "netdist") ref_graph <- read_simple_graph(ref_path, format = "ncol") ``` -## Generate ego networks +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 +``` + +## Generate ego networks for query graphs ```{r} # Get ego networks for query graphs and reference graph ego_1 <- make_named_ego_graph(graph_1, @@ -72,79 +71,73 @@ ego_ref <- make_named_ego_graph(ref_graph, min_ego_edges = min_ego_edges) ``` -## Count graphlets in ego networks +## Count graphlets in ego networks for query graphs ```{r} # Count graphlets for ego networks in query and reference graphs graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) + graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) ``` -### Generate scaled graphlet counts for reference graph. +## Bin reference graph by ego network density and calculate mean graphlet counts in each bin ```{r} # Scale ego-network graphlet counts by dividing by total number of k-tuples in - # ego-network (where k is graphlet size) -ego_graphlet_tuples_ref <- - count_graphlet_tuples_ego(ego_ref, max_graphlet_size = max_graphlet_size) - -scaled_graphlet_counts_ref <- scale_graphlet_count(graphlet_counts_ref, ego_graphlet_tuples_ref) -``` - -## Bin reference ego networks by density and calculate mean graphlet counts in each bin -```{r} +# ego-network (where k is graphlet size) +scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(ego_ref, + graphlet_counts_ref, + max_graphlet_size) # Get ego-network densities -densities_ref <- purrr::simplify(purrr::map_dbl(ego_ref, igraph::edge_density)) +densities_ref <- ego_network_density(ego_ref) # Adaptively bin ref ego-network densities -binned_densities <- binned_densities_adaptive( - densities_ref, min_counts_per_interval = min_bin_count, num_intervals = num_bins) +binned_densities <- binned_densities_adaptive(densities_ref, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ref_ego_density_bins <- binned_densities$breaks # Average ref graphlet counts across density bins -density_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - scaled_graphlet_counts_ref, binned_densities$interval_indexes) +ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + scaled_graphlet_counts_ref, + binned_densities$interval_indexes) ``` -## Calculate expected graphlet counts for query networks - +## Centre graphlet counts of query graphs based on statistics of reference graph ```{r} +# Calculate expected graphlet counts (using ref graph ego network density bins) +exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, + max_graphlet_size, + ref_ego_density_bins, + ref_binned_graphlet_counts) -exp_graphlet_counts_1 <- purrr::map(ego_1, netdis_expected_graphlet_counts, - max_graphlet_size = max_graphlet_size, - density_breaks = binned_densities$breaks, - density_binned_reference_counts = density_binned_graphlet_counts) - -exp_graphlet_counts_1 <- t(simplify2array(exp_graphlet_counts_1)) -exp_graphlet_counts_2 <- purrr::map(ego_2, netdis_expected_graphlet_counts, - max_graphlet_size = max_graphlet_size, - density_breaks = binned_densities$breaks, - density_binned_reference_counts = density_binned_graphlet_counts) - -exp_graphlet_counts_2 <- t(simplify2array(exp_graphlet_counts_2)) - -``` +exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, + max_graphlet_size, + ref_ego_density_bins, + ref_binned_graphlet_counts) -## Centre graphlet counts of query graphs based on statistics of reference graph -```{r} +# Centre graphlet counts by subtracting expected counts centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 ``` -## Sum centred counts across all ego networks +## Sum centred graphlet counts across all ego networks ```{r} -#query_sum_centred_graphlet_counts <- SUM_SUBGRAPH_COUNTS(query_ego_centred_subgraph_counts) +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) ``` ## Calculate netdis statistics ```{r} -#NETDIS_STATISTIC(query_sum_centred_graphlet_counts) -``` -## Display results -```{r} +netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) ``` \ No newline at end of file diff --git a/vignettes/quickstart_netdis_2graphs.html b/vignettes/quickstart_netdis_2graphs.html new file mode 100644 index 00000000..13fda1ea --- /dev/null +++ b/vignettes/quickstart_netdis_2graphs.html @@ -0,0 +1,448 @@ + + + + + + + + + + + + + + + + +Quick start guide for Netdis - 2 graphs + + + + + + + + + + + + + + + + + + + + + +

Quick start guide for Netdis - 2 graphs

+

Martin O’Reilly, Jack Roberts

+

2019-07-29

+ + + + + + + + + + + +
+

Calculate netdis statistics

+ +
## [1] 0.1846655 0.1749835
+
+
+

Display results

+
+ + + + + + + + From c1125999b346d9d6cf2868ded302407457cc1d9e Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Mon, 29 Jul 2019 15:44:41 +0100 Subject: [PATCH 018/188] minor name tweaks --- vignettes/quickstart_netdis_2graphs.Rmd | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index e6f5a2e3..bcf139df 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -52,7 +52,7 @@ min_bin_count <- 5 num_bins <- 100 ``` -## Generate ego networks for query graphs +## Generate ego networks ```{r} # Get ego networks for query graphs and reference graph ego_1 <- make_named_ego_graph(graph_1, @@ -71,7 +71,7 @@ ego_ref <- make_named_ego_graph(ref_graph, min_ego_edges = min_ego_edges) ``` -## Count graphlets in ego networks for query graphs +## Count graphlets in ego networks ```{r} # Count graphlets for ego networks in query and reference graphs graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) @@ -136,8 +136,9 @@ sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) ## Calculate netdis statistics ```{r} -netdis_uptok(sum_graphlet_counts_1, - sum_graphlet_counts_2, - max_graphlet_size) +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) +print(netdis_result) ``` \ No newline at end of file From ac915cd752783c427a890edf8835cf8b23ea002d Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Mon, 29 Jul 2019 18:12:39 +0100 Subject: [PATCH 019/188] started refactoring according to style guides --- R/graph_binning.R | 80 ++++--- R/measures_net_dis.R | 491 ++++++++++++++++++++++++------------------- R/orca_interface.R | 337 +++++++++++++++-------------- 3 files changed, 501 insertions(+), 407 deletions(-) diff --git a/R/graph_binning.R b/R/graph_binning.R index 47a58bcd..8c36cbda 100644 --- a/R/graph_binning.R +++ b/R/graph_binning.R @@ -1,97 +1,107 @@ #' INTERNAL FUNCTION - Do not call directly -#' -#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to +#' +#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to #' generate a function for calculating expected ego-network graphlet counts #' from the statistics of a provided reference graph. -#' Temporarily accessible during development. +#' Temporarily accessible during development. #' TODO: Remove @export prior to publishing #' @export -binned_densities_adaptive <- function(densities, min_counts_per_interval, num_intervals) -{ +binned_densities_adaptive <- function(densities, + min_counts_per_interval, + num_intervals) { breaks <- adaptive_breaks(densities, min_count = min_counts_per_interval, breaks = num_intervals) interval_indexes <- interval_index(densities, breaks = breaks, out_of_range_intervals = FALSE) - list(densities = densities, interval_indexes = interval_indexes, breaks = breaks) + list( + densities = densities, + interval_indexes = interval_indexes, + breaks = breaks + ) } #' Bin values into intervals based on the provided breaks -#' +#' #' @param x The values to be binned #' @param breaks The boundaries between bins -#' @param out_of_range_intervals If \code{TRUE}, "out of range" values lying -#' below the first break or above the last break will be assigned to additional -#' unbounded lower and upper extrema intervals. If \code{FALSE} these "out of -#' range" values will be assigned to intervals bounded by the lowest or +#' @param out_of_range_intervals If \code{TRUE}, "out of range" values lying +#' below the first break or above the last break will be assigned to additional +#' unbounded lower and upper extrema intervals. If \code{FALSE} these "out of +#' range" values will be assigned to intervals bounded by the lowest or #' uppermost break respectively. #' @return A vector of bin indexes, one per value provided #' @export interval_index <- function(x, breaks, out_of_range_intervals = FALSE) { - # Get indexes for the intervals each value falls into. Setting - # all.inside = TRUE ensures that the minimum and maximum values will be + # Get indexes for the intervals each value falls into. Setting + # all.inside = TRUE ensures that the minimum and maximum values will be # assigned to the intervals they bound. findInterval(x, breaks, all.inside = !out_of_range_intervals) } #' Generate a set of breaks that attempt to be evenly spaced while ensuring each #' interval has the specified minimum count -#' +#' #' Starts by binning the variable by the breaks provided in \code{breaks} (if #' \code{breaks} is a vector), or generating a set of \code{breaks} at uniformly -#' spaced intervals (if \code{breaks} is a single number). It then iteratively -#' merges intervals with counts lower than \code{min_count} by removing breaks +#' spaced intervals (if \code{breaks} is a single number). It then iteratively +#' merges intervals with counts lower than \code{min_count} by removing breaks #' until all remaining intervals have counts of at least \code{min_count}. -#' +#' #' @param x The variable to be binned #' @param min_count The minimum count for each bin #' @param breaks Either a vector containing an intital set of breaks or a single -#' number indicating how many uniformly spaced intervals to use when constructing -#' the initial set of breaks. If a single number is provided, the minumum break -#' will be the minimum value of x and the maximum break will be the maximum -#' value of x. +#' number indicating how many uniformly spaced intervals to use when +#' constructing the initial set of breaks. If a single number is provided, the +#' minumum break will be the minimum value of x and the maximum break will be +#' the maximum value of x. #' #' @export adaptive_breaks <- function(x, min_count, breaks) { - if(length(breaks) == 1) { + if (length(breaks) == 1) { # Similarly to base::cut, we interpret a single number in breaks as the # number of intervals required and generate these evenly spaced min_x <- min(x) max_x <- max(x) - breaks = seq(from = min_x, to = max_x, length.out = breaks + 1) + breaks <- seq(from = min_x, to = max_x, length.out = breaks + 1) } # There is one less interval than there are breaks num_intervals <- length(breaks) - 1 # Get indexes for the intervals each value of x falls into. x_interval_indexes <- interval_index(x, breaks) # Find the lowest interval with fewer than the minimum required count. - # Not all intervals are guaranteed to have members in x. If they don't, they - # won't appear in x_interval_indexes. We therefore append the full list of - # indexes prior to counting and subtract 1 from all counts afterwards to get + # Not all intervals are guaranteed to have members in x. If they don't, they + # won't appear in x_interval_indexes. We therefore append the full list of + # indexes prior to counting and subtract 1 from all counts afterwards to get # an accurate count that includes indexes with no members with zero counts all_interval_indexes <- 1:num_intervals - interval_index_counts <- plyr::count(c(x_interval_indexes, all_interval_indexes)) + interval_index_counts <- plyr::count( + c(x_interval_indexes, all_interval_indexes) + ) interval_index_counts$freq <- interval_index_counts$freq - 1 - + # Find the first interval with fewer members than the minimum specified count - merge_position <- Position(function(i) i < min_count, interval_index_counts$freq) - # Not all intervals are guaranteed to have members, so convert the index + merge_position <- Position( + function(i) i < min_count, + interval_index_counts$freq + ) + # Not all intervals are guaranteed to have members, so convert the index # provided by Position into an index into the full interval list and then add merge_interval_index <- interval_index_counts$x[merge_position] - if(is.na(merge_interval_index)) { + if (is.na(merge_interval_index)) { # If all intervals have at least the minimum count, return the breaks return(breaks) } else { # Remove a break to merge the low count interval with one of its neighbours # and recursively call this function if (merge_interval_index == num_intervals) { - # If low interval is last one, we can only merge with the previous interval - # so remove lower break for low interval + # If low interval is last one, we can only merge with the previous + # interval so remove lower break for low interval merge_break_index <- merge_interval_index } else { - # In all other cases merge low interval with next inteval by removing + # In all other cases merge low interval with next inteval by removing # upper breal for low interval merge_break_index <- merge_interval_index + 1 } return(adaptive_breaks(x, min_count, breaks[-merge_break_index])) } -} \ No newline at end of file +} diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 2dba7b43..64045e54 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -1,33 +1,36 @@ #' Netdis between all graph pairs using provided Centred Graphlet Counts -#' @param centred_graphlet_counts List containing Centred Graphlet Counts for +#' @param centred_graphlet_counts List containing Centred Graphlet Counts for #' all graphs being compared #' @param graphlet_size The size of graphlets to use for the Netdis calculation #' (only counts for graphlets of the specified size will be used). The size of #' a graphlet is the number of nodes it contains. -#' @return Pairwise Netdis statistics between graphs calculated using centred +#' @return Pairwise Netdis statistics between graphs calculated using centred #' counts for graphlets of the specified size #' @export -netdis_for_all_graphs <- function( - centred_graphlet_counts, graphlet_size, mc.cores = getOption("mc.cores", 2L)) { +netdis_for_all_graphs <- function(centred_graphlet_counts, + graphlet_size, + mc.cores = getOption("mc.cores", 2L)) { comp_spec <- cross_comparison_spec(centred_graphlet_counts) - # NOTE: mcapply only works on unix-like systems with system level forking + # NOTE: mcapply only works on unix-like systems with system level forking # capability. This means it will work on Linux and OSX, but not Windows. # For now, we just revert to single threaded operation on Windows # TODO: Look into using the parLappy function on Windows - if(.Platform$OS.type != "unix") { - # Force cores to 1 if system is not unix-like as it will not support + if (.Platform$OS.type != "unix") { + # Force cores to 1 if system is not unix-like as it will not support # forking - mc.cores = 1 + mc.cores <- 1 } - netdis <- purrr::simplify(parallel::mcmapply(function(index_a, index_b) {netdis( - centred_graphlet_counts[[index_a]], centred_graphlet_counts[[index_b]], - graphlet_size = graphlet_size) + netdis <- purrr::simplify(parallel::mcmapply(function(index_a, index_b) { + netdis( + centred_graphlet_counts[[index_a]], centred_graphlet_counts[[index_b]], + graphlet_size = graphlet_size + ) }, comp_spec$index_a, comp_spec$index_b, SIMPLIFY = FALSE)) list(netdis = netdis, comp_spec = comp_spec) } #' Netdis -#' +#' #' Calculate Netdis statistic between two graphs from their Centred Graphlet #' Counts (generated using \code{netdis_centred_graphlet_counts}). #' @param centred_graphlet_counts1 Centred Graphlet Counts for graph 1 @@ -35,141 +38,168 @@ netdis_for_all_graphs <- function( #' @param graphlet_size The size of graphlets to use for the Netdis calculation #' (only counts for graphlets of the specified size will be used). The size of #' a graphlet is the number of nodes it contains. -#' @return Netdis statistic calculated using centred counts for graphlets of +#' @return Netdis statistic calculated using centred counts for graphlets of #' the specified size #' @export -netdis <- function(centred_graphlet_counts1, centred_graphlet_counts2, - graphlet_size) -{ - # Select subset of centred counts corresponding to graphlets of the +netdis <- function(centred_graphlet_counts1, centred_graphlet_counts2, + graphlet_size) { + # Select subset of centred counts corresponding to graphlets of the # specified size ids <- graphlet_ids_for_size(graphlet_size) counts1 <- centred_graphlet_counts1[ids] counts2 <- centred_graphlet_counts2[ids] - + # Calculate normalising constant - norm_const <- sum(counts1^2 / sqrt(counts1^2 + counts2^2),na.rm = TRUE) * - sum(counts2^2 / sqrt(counts1^2 + counts2^2),na.rm = TRUE) + norm_const <- sum(counts1^2 / sqrt(counts1^2 + counts2^2), na.rm = TRUE) * + sum(counts2^2 / sqrt(counts1^2 + counts2^2), na.rm = TRUE) # Calculate intermediate "netD" statistic that falls within range -1..1 - netds2 <- (1/sqrt(norm_const)) * sum((counts1 * counts2) / sqrt(counts1^2 + counts2^2),na.rm = TRUE) + netds2 <- (1 / sqrt(norm_const)) * + sum((counts1 * counts2) / + sqrt(counts1^2 + counts2^2), na.rm = TRUE) # Calculate corresponding "netd" Netdis statistic that falls within range 0..1 0.5 * (1 - netds2) -} +} #' Netdis - graphlets up to max_graphlet_size -#' +#' #' Calculate Netdis statistic between two graphs from their Centred Graphlet #' Counts (generated using \code{netdis_centred_graphlet_counts}). #' @param centred_graphlet_counts1 Centred Graphlet Counts for graph 1 #' @param centred_graphlet_counts2 Centred Graphlet Counts for graph 2 -#' @param max_graphlet_size The size of graphlets to use for the Netdis calculation -#' The size of a graphlet is the number of nodes it contains. Netdis is calculated -#' for all graphlets from size 3 to size max_graphlet_size. -#' @return Netdis statistic calculated using centred counts for graphlets of +#' @param max_graphlet_size max graphlet size to calculate Netdis for. +#' The size of a graphlet is the number of nodes it contains. Netdis is +#' calculated for all graphlets from size 3 to size max_graphlet_size. +#' @return Netdis statistic calculated using centred counts for graphlets of #' the specified size #' @export -netdis_uptok <- function(centred_graphlet_counts1, centred_graphlet_counts2, - max_graphlet_size) -{ +netdis_uptok <- function(centred_graphlet_counts1, centred_graphlet_counts2, + max_graphlet_size) { if ((max_graphlet_size > 5) | (max_graphlet_size < 3)) { stop("max_graphlet_size must be 3, 4 or 5.") } - netdis_statistics <- purrr::map(3:max_graphlet_size, - netdis, - centred_graphlet_counts1=sum_graphlet_counts_1, - centred_graphlet_counts2=sum_graphlet_counts_2) - + netdis_statistics <- purrr::map(3:max_graphlet_size, + netdis, + centred_graphlet_counts1 = sum_graphlet_counts_1, + centred_graphlet_counts2 = sum_graphlet_counts_2 + ) + netdis_statistics <- simplify2array(netdis_statistics) - - names(netdis_statistics) <- sapply("netdis", paste, 3:max_graphlet_size, sep="") - - return(netdis_statistics) -} + names(netdis_statistics) <- + sapply( + "netdis", + paste, + 3:max_graphlet_size, + sep = "" + ) + + netdis_statistics +} #' Scaled graphlet count for ego-networks -#' -#' Calculates graphlet counts for the n-step ego-network of each node in a graph, -#' scaled by dividing the graphlet counts for each ego-network by the total -#' number of possible groupings of nodes in the ego-network with the same number -#' of nodes as each graphlet. This scaling factor is choose(n, k), where n is the -#' number of nodes in the ego-network and k is the number of nodes in the graphlet. -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. +#' +#' Calculates graphlet counts for the n-step ego-network of each node in +#' a graph, scaled by dividing the graphlet counts for each ego-network by the +#' total number of possible groupings of nodes in the ego-network with the same +#' number of nodes as each graphlet. This scaling factor is choose(n, k), +#' where n is the number of nodes in the ego-network and k is the number of +#' nodes in the graphlet. +#' @param graph A connected, undirected, simple graph as an \code{igraph} +#' object. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. #' @param neighbourhood_size The number of steps from the source node to include #' nodes for each ego-network. -#' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} +#' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} #' nodes are returned. -#' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} +#' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} #' edges are returned. -#' @param return_ego_networks If \code{TRUE}, return ego-networks alongside -#' graphlet counts to enable further processing. -#' @return If \code{return_ego_networks = FALSE}, returns an RxC matrix -#' containing counts of each graphlet (columns, C) for each ego-network in the -#' input graph (rows, R). Columns are labelled with graphlet IDs and rows are +#' @param return_ego_networks If \code{TRUE}, return ego-networks alongside +#' graphlet counts to enable further processing. +#' @return If \code{return_ego_networks = FALSE}, returns an RxC matrix +#' containing counts of each graphlet (columns, C) for each ego-network in the +#' input graph (rows, R). Columns are labelled with graphlet IDs and rows are #' labelled with the ID of the central node in each ego-network (if nodes in the #' input graph are labelled). If \code{return_ego_networks = TRUE}, returns a #' list with the following elements: #' \itemize{ -#' \item \code{graphlet_counts}: A matrix containing graphlet counts for each +#' \item \code{graphlet_counts}: A matrix containing graphlet counts for each #' ego-network in the input graph as described above. #' \item \code{ego_networks}: The ego-networks of the query graph. #' } #' @export -count_graphlets_ego_scaled <- function( - graph, max_graphlet_size, neighbourhood_size, - min_ego_nodes = 3, min_ego_edges = 1, return_ego_networks = FALSE) { - +count_graphlets_ego_scaled <- function(graph, + max_graphlet_size, + neighbourhood_size, + min_ego_nodes = 3, + min_ego_edges = 1, + return_ego_networks = FALSE) { + # Calculate ego-network graphlet counts, also returning the ego networks for # use later in function - ego_data <- - count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - return_ego_networks = TRUE) + ego_data <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE + ) ego_graphlet_counts <- ego_data$graphlet_counts ego_networks <- ego_data$ego_networks - + # Scale ego-network graphlet counts by dividing by total number of k-tuples in # ego-network (where k is graphlet size) - ego_graphlet_tuples <- - count_graphlet_tuples_ego(ego_networks, max_graphlet_size = max_graphlet_size) - ego_graphlet_counts <- scale_graphlet_count(ego_graphlet_counts, ego_graphlet_tuples) - + ego_graphlet_tuples <- count_graphlet_tuples_ego( + ego_networks, + max_graphlet_size = max_graphlet_size + ) + ego_graphlet_counts <- scale_graphlet_count( + ego_graphlet_counts, + ego_graphlet_tuples + ) + # Return either graphlet counts, or graphlet counts and ego_networks - if(return_ego_networks) { - return(list(graphlet_counts = ego_graphlet_counts, - ego_networks = ego_networks)) + if (return_ego_networks) { + return(list( + graphlet_counts = ego_graphlet_counts, + ego_networks = ego_networks + )) } else { return(ego_graphlet_counts) } } #' Generate Netdis centred graphlets counts by subtracting expected counts -#' -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. +#' +#' @param graph A connected, undirected, simple graph as an +#' \code{igraph} object. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes +#' will be counted. #' @param neighbourhood_size The number of steps from the source node to include -#' nodes for each ego-network. +#' nodes for each ego-network. #' @param expected_ego_count_fn A function for generating expected ego-network -#' graphlet counts for a graph. This function should take a connected, -#' undirected, simple graph as an \code{igraph} object for its only argument. -#' Where \code{expected_ego_count_fn} is specific to particular values of -#' \code{max_graphlet_size} or \code{neighbourhood_size}, care should be taken +#' graphlet counts for a graph. This function should take a connected, +#' undirected, simple graph as an \code{igraph} object for its only argument. +#' Where \code{expected_ego_count_fn} is specific to particular values of +#' \code{max_graphlet_size} or \code{neighbourhood_size}, care should be taken #' to ensure that the values of these parameters passed to this function are #' consistent with those used when creating \code{expected_ego_count_fn}. #' @return A vector with centred counts for each graphlet type -#' @export -netdis_centred_graphlet_counts <- function( - graph, max_graphlet_size, neighbourhood_size, expected_ego_count_fn = - NULL) { +#' @export +netdis_centred_graphlet_counts <- function(graph, + max_graphlet_size, + neighbourhood_size, + expected_ego_count_fn = NULL) { # Get centred counts for each ego network centred_counts <- netdis_centred_graphlet_counts_ego( - graph, max_graphlet_size, neighbourhood_size, expected_ego_count_fn) + graph, + max_graphlet_size, + neighbourhood_size, + expected_ego_count_fn + ) # Sum centred counts over ego-networks apply(centred_counts, MARGIN = 2, FUN = sum) } @@ -177,173 +207,202 @@ netdis_centred_graphlet_counts <- function( #' TODO: Remove @export prior to publishing #' @export -netdis_centred_graphlet_counts_ego <- function( - graph, max_graphlet_size, neighbourhood_size, expected_ego_count_fn = NULL, - min_ego_nodes = 3, min_ego_edges = 1) { - # Get unscaled ego-network graphlet counts - res <- count_graphlets_ego( - graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - return_ego_networks = TRUE) - - actual_counts = res$graphlet_counts - ego_networks <- res$ego_networks - - # Centre these counts by subtracting the expected counts - if(is.null(expected_ego_count_fn)) { - centred_counts = actual_counts - } else { - centred_counts <- actual_counts - expected_ego_count_fn(graph) - } - centred_counts +netdis_centred_graphlet_counts_ego <- function(graph, + max_graphlet_size, + neighbourhood_size, + expected_ego_count_fn = NULL, + min_ego_nodes = 3, + min_ego_edges = 1) { + # Get unscaled ego-network graphlet counts + res <- count_graphlets_ego( + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE + ) + + actual_counts <- res$graphlet_counts + + # Centre these counts by subtracting the expected counts + if (is.null(expected_ego_count_fn)) { + centred_counts <- actual_counts + } else { + centred_counts <- actual_counts - expected_ego_count_fn(graph) + } + centred_counts } #' Generate Netdis expected graphlet count function -#' -#' Generates a function to calculate expected ego-network graphlet counts for +#' +#' Generates a function to calculate expected ego-network graphlet counts for #' query graphs based on the statistics of a provided reference graph. -#' -#' Generates graphlet counts for all ego-networks in the supplied reference graph -#' and then averages these graphlet counts over density bins to generate -#' density-dependent reference graphlet counts. Prior to averaging, the graphlet -#' counts are scaled in a size-dependent manner to permit ego-networks with -#' similar densities but different sizes to be averaged together. -#' -#' Returns a function that uses the density-dependent reference graphlet +#' +#' Generates graphlet counts for all ego-networks in the supplied +#' reference graph and then averages these graphlet counts over density bins to +#' generate density-dependent reference graphlet counts. Prior to averaging, +#' the graphlet counts are scaled in a size-dependent manner to permit +#' ego-networks with similar densities but different sizes to be averaged +#' together. +#' +#' Returns a function that uses the density-dependent reference graphlet #' counts to generate expected graphlet counts for all ego-networks in a query #' network. When doing so, it matches ego-networks to reference counts by #' density and reverses the scaling that was applied to the original reference #' counts in order to allow averaging across ego-networks with similar density #' but different numbers of nodes. -#' @param graph A connected, undirected, simple reference graph as an -#' \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. +#' @param graph A connected, undirected, simple reference graph as an +#' \code{igraph} object. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. #' @param neighbourhood_size The number of steps from the source node to include #' node in ego-network. -#' @return A function taking a connected, undirected, simple query graph as an -#' \code{igraph} object and returning an RxC matrix containing the expected -#' counts of each graphlet (columns, C) for each ego-network in the query graph -#' (rows, R). Columns are labelled with graphlet IDs and rows are labelled with -#' the ID of the central node in each ego-network (if nodes in the query graph +#' @return A function taking a connected, undirected, simple query graph as an +#' \code{igraph} object and returning an RxC matrix containing the expected +#' counts of each graphlet (columns, C) for each ego-network in the query graph +#' (rows, R). Columns are labelled with graphlet IDs and rows are labelled with +#' the ID of the central node in each ego-network (if nodes in the query graph #' are labelled) #' @export -netdis_expected_graphlet_counts_ego_fn <- function( - graph, max_graphlet_size, neighbourhood_size, - min_ego_nodes = 3, min_ego_edges = 1, - min_bin_count = 5, num_bins = 100) { - +netdis_expected_graphlet_counts_ego_fn <- function(graph, + max_graphlet_size, + neighbourhood_size, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100) { + # Calculate the scaled graphlet counts for all ego networks in the reference # graph, also returning the ego networks themselves in order to calculate # their densities res <- count_graphlets_ego_scaled( - graph, max_graphlet_size, neighbourhood_size, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - return_ego_networks = TRUE) - - scaled_graphlet_counts = res$graphlet_counts + graph, + max_graphlet_size, + neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE + ) + + scaled_graphlet_counts <- res$graphlet_counts ego_networks <- res$ego_networks - + # Get ego-network densities - densities <- purrr::simplify(purrr::map_dbl(ego_networks, igraph::edge_density)) - + densities <- purrr::simplify( + purrr::map_dbl(ego_networks, igraph::edge_density) + ) + # Adaptively bin ego-network densities binned_densities <- binned_densities_adaptive( - densities, min_counts_per_interval = min_bin_count, num_intervals = num_bins) - + densities, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins + ) + # Average graphlet counts across density bins density_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - scaled_graphlet_counts, binned_densities$interval_indexes) - + scaled_graphlet_counts, + binned_densities$interval_indexes + ) + # Return a partially applied function with the key reference graph information # built-in purrr::partial( netdis_expected_graphlet_counts_ego, max_graphlet_size = max_graphlet_size, neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, + min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, density_breaks = binned_densities$breaks, - density_binned_reference_counts = density_binned_graphlet_counts) + density_binned_reference_counts = density_binned_graphlet_counts + ) } #' INTERNAL FUNCTION - Do not call directly -#' -#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to +#' +#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to #' generate a function for calculating expected ego-network graphlet counts #' from the statistics of a provided reference graph. -#' Temporarily accessible during development. +#' Temporarily accessible during development. #' TODO: Remove @export prior to publishing #' @export -netdis_expected_graphlet_counts_ego <- function( - graph, max_graphlet_size, neighbourhood_size, - density_breaks, density_binned_reference_counts, - min_ego_nodes = 3, min_ego_edges = 1) { +netdis_expected_graphlet_counts_ego <- function(graph, + max_graphlet_size, + neighbourhood_size, + density_breaks, + density_binned_reference_counts, + min_ego_nodes = 3, + min_ego_edges = 1) { # Generate ego-networks for query graph ego_networks <- make_named_ego_graph(graph, neighbourhood_size) # Drop ego-networks that don't have the minimum number of nodes or edges - drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { + drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { (igraph::vcount(g) < min_ego_nodes) | (igraph::ecount(g) < min_ego_edges) })) ego_networks <- ego_networks[!drop_index] - # Map over query graph ego-networks, using reference graph statistics to + # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. - expected_graphlet_counts <- + expected_graphlet_counts <- purrr::map(ego_networks, netdis_expected_graphlet_counts, - max_graphlet_size = max_graphlet_size, - density_breaks = density_breaks, - density_binned_reference_counts = density_binned_reference_counts) + max_graphlet_size = max_graphlet_size, + density_breaks = density_breaks, + density_binned_reference_counts = density_binned_reference_counts + ) names(expected_graphlet_counts) <- names(ego_networks) # Simplify list to array t(simplify2array(expected_graphlet_counts)) } #' INTERNAL FUNCTION - Do not call directly -#' +#' #' JACK To follow through logic of paper steps, wanted to pass #' ego networks to the function, not the input query graph #' (as in netdis_expected_graphlet_counts_ego_fn above). -#' -#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to +#' +#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to #' generate a function for calculating expected ego-network graphlet counts #' from the statistics of a provided reference graph. -#' Temporarily accessible during development. +#' Temporarily accessible during development. #' TODO: Remove @export prior to publishing #' @export -netdis_expected_graphlet_counts_per_ego <- function( - ego_networks, max_graphlet_size, - density_breaks, density_binned_reference_counts) { +netdis_expected_graphlet_counts_per_ego <- function(ego_networks, + max_graphlet_size, + density_breaks, + density_binned_reference_counts) { - # Map over query graph ego-networks, using reference graph statistics to + # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. - expected_graphlet_counts <- + expected_graphlet_counts <- purrr::map(ego_networks, netdis_expected_graphlet_counts, - max_graphlet_size = max_graphlet_size, - density_breaks = density_breaks, - density_binned_reference_counts = density_binned_reference_counts) + max_graphlet_size = max_graphlet_size, + density_breaks = density_breaks, + density_binned_reference_counts = density_binned_reference_counts + ) names(expected_graphlet_counts) <- names(ego_networks) - + # Simplify list to array t(simplify2array(expected_graphlet_counts)) } #' INTERNAL FUNCTION - Do not call directly -#' -#' Used by \code{netdis_expected_graphlet_counts_ego} to -#' calculate expected graphlet counts for a query graph ego-network from the +#' +#' Used by \code{netdis_expected_graphlet_counts_ego} to +#' calculate expected graphlet counts for a query graph ego-network from the #' statistics of a provided reference graph. -#' Temporarily accessible during development. +#' Temporarily accessible during development. #' TODO: Remove @export prior to publishing #' @export -netdis_expected_graphlet_counts <- function( - graph, max_graphlet_size, density_breaks, density_binned_reference_counts) { +netdis_expected_graphlet_counts <- function(graph, + max_graphlet_size, + density_breaks, + density_binned_reference_counts) { # Look up average scaled graphlet counts for graphs of similar density # in the reference graph query_density <- igraph::edge_density(graph) matched_density_index <- interval_index(query_density, density_breaks) - matched_reference_counts <- density_binned_reference_counts[matched_density_index,] + matched_reference_counts <- + density_binned_reference_counts[matched_density_index, ] # Scale reference counts by multiplying the reference count for each graphlet # by the number of possible sets of k nodes in the query graph, where k is the # number of nodes in the graphlet @@ -351,23 +410,24 @@ netdis_expected_graphlet_counts <- function( } #' INTERNAL FUNCTION - Do not call directly -#' -#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to +#' +#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to #' generate a function for calculating expected ego-network graphlet counts #' from the statistics of a provided reference graph. -#' Temporarily accessible during development. +#' Temporarily accessible during development. #' TODO: Remove @export prior to publishing #' @export -mean_density_binned_graphlet_counts <- function( - graphlet_counts, density_interval_indexes) { - # The ego network graphlet counts are an E x G matrix with rows (E) representing - # ego networks and columns (G) representing graphlets. We want to calculate - # the mean count for each graphlet / density bin combination, so we will - # use tapply to average counts for each graphlet across density bins, using - # apply to map this operation over graphlets - mean_density_binned_graphlet_counts <- +mean_density_binned_graphlet_counts <- function(graphlet_counts, + density_interval_indexes) { + # The ego network graphlet counts are an E x G matrix with rows (E) + # representing ego networks and columns (G) representing graphlets. We want + # to calculate the mean count for each graphlet / density bin combination, + # so we will use tapply to average counts for each graphlet across density + # bins, using apply to map this operation over graphlets + mean_density_binned_graphlet_counts <- apply(graphlet_counts, MARGIN = 2, function(gc) { - tapply(gc, INDEX = density_interval_indexes, FUN = mean)}) + tapply(gc, INDEX = density_interval_indexes, FUN = mean) + }) } @@ -387,40 +447,46 @@ scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { #' @export count_graphlet_tuples_ego <- function(ego_networks, max_graphlet_size) { - graphlet_tuple_counts <- - t(simplify2array(purrr::map(ego_networks, count_graphlet_tuples, - max_graphlet_size = max_graphlet_size))) + graphlet_tuple_counts <- + t(simplify2array(purrr::map(ego_networks, count_graphlet_tuples, + max_graphlet_size = max_graphlet_size + ))) graphlet_tuple_counts } #' @export ego_network_density <- function(ego_networks) { - densities <- purrr::simplify(purrr::map_dbl(ego_networks, - igraph::edge_density)) - + densities <- purrr::simplify(purrr::map_dbl( + ego_networks, + igraph::edge_density + )) + return(densities) } -#' Scale graphlet counts for an ego network by the n choose k possible +#' Scale graphlet counts for an ego network by the n choose k possible #' choices of k nodes in that ego-network, where n is the number of nodes -#' in the ego network and k is the number of nodes in the graphlet. -#' +#' in the ego network and k is the number of nodes in the graphlet. +#' #' @param ego_networks Pre-generated ego networks for an input graph. #' @param graphlet_counts Pre-calculated graphlet counts for each ego_network. #' @param max_graphlet_size Determines the maximum size of graphlets included #' in graphlet_counts. #' @return scaled graphlet counts. #' @export -scale_graphlet_counts_ego <- function(ego_networks, graphlet_counts, +scale_graphlet_counts_ego <- function(ego_networks, graphlet_counts, max_graphlet_size) { - ego_graphlet_tuples <- - count_graphlet_tuples_ego(ego_networks, max_graphlet_size = max_graphlet_size) - - scaled_graphlet_counts <- scale_graphlet_count(graphlet_counts, ego_graphlet_tuples) - - return (scaled_graphlet_counts) + ego_graphlet_tuples <- count_graphlet_tuples_ego( + ego_networks, + max_graphlet_size = max_graphlet_size) + + scaled_graphlet_counts <- scale_graphlet_count( + graphlet_counts, + ego_graphlet_tuples) + + return(scaled_graphlet_counts) } @@ -430,9 +496,8 @@ count_graphlet_tuples <- function(graph, max_graphlet_size) { graphlet_key <- graphlet_key(max_graphlet_size) graphlet_node_counts <- graphlet_key$node_count graphlet_tuple_counts <- choose(graph_node_count, graphlet_node_counts) - graphlet_tuple_counts <- stats::setNames(graphlet_tuple_counts, graphlet_key$id) + graphlet_tuple_counts <- stats::setNames( + graphlet_tuple_counts, + graphlet_key$id) graphlet_tuple_counts } - - - diff --git a/R/orca_interface.R b/R/orca_interface.R index 4000db68..04e080ed 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -1,9 +1,9 @@ #' Integer index edge list from igraph -#' +#' #' Takes a igraph graph object and generates an edgelist where each edge is #' represented by the integer indexes of its vertices. Note that, where a graph #' has isolated vertices, the indexes for these vertices will not be present -#' in the edge list. Where a graph has no isolated vertices, the edge list will +#' in the edge list. Where a graph has no isolated vertices, the edge list will #' include all vertex indexes from 1 to numVertices. #' @param graph An igraph graph object #' @return A 2 x numEdges edgelist with vertices labelled with integer indices @@ -11,7 +11,7 @@ #' the label for the vertice represented by index N in the edgelist #' @export graph_to_indexed_edges <- function(graph) { - # Use igraph method to get edge list with edges specified using vertex ID + # Use igraph method to get edge list with edges specified using vertex ID # (indexes) rather than names edges <- igraph::get.edgelist(graph, names = FALSE) # Convert edge list from numeric to integer @@ -23,146 +23,164 @@ graph_to_indexed_edges <- function(graph) { } #' Graph from integer index edge list -#' -#' Takes an integer indexed edgelist (where each edge is represented by the +#' +#' Takes an integer indexed edgelist (where each edge is represented by the #' integer indexes of its vertices) and converts it to an igraph format graph. #' If the edge list has a "vertex_names" attribute, this will be used to name #' the vertices in the resultant graph. -#' @param indexed_edges A 2 x numEdges edgelist with vertices labelled with +#' @param indexed_edges A 2 x numEdges edgelist with vertices labelled with #' integer indices, with an optional "vertex_names" attribute #' @return An igraph graph object #' @export indexed_edges_to_graph <- function(indexed_edges) { graph <- igraph::graph_from_edgelist(indexed_edges) - graph <- igraph::set.vertex.attribute(graph, name = "name", value = attr(indexed_edges, "vertex_names")) + graph <- igraph::set.vertex.attribute( + graph, name = "name", + value = attr(indexed_edges, "vertex_names") + ) return(graph) } #' Read all graphs in a directory, simplifying as requested -#' +#' #' Reads graph data from all files in a directory matching the specified #' filename pattern. From each file, an a igraph graph object is constructed -#' and the requested subset of the following simplifications is made in the +#' and the requested subset of the following simplifications is made in the #' following order: #' 1. Makes the graph undirected #' 2. Removes loops (where both endpoints of an edge are the same vertex) -#' 3. Removes multiple edges (i.e. ensuring only one edge exists for each +#' 3. Removes multiple edges (i.e. ensuring only one edge exists for each #' pair of endpoints) -#' 4. Removes isolated vertices (i.e. vertices with no edges after the +#' 4. Removes isolated vertices (i.e. vertices with no edges after the #' previous alterations) #' @param source_dir Path to directory containing files with graph data -#' @param format Format of graph data. Any format supported by +#' @param format Format of graph data. Any format supported by #' \code{igraph::read_graph} can be used. #' @param pattern Pattern to use to filter filenames. Any pattern supported by #' \code{dir} can be used. #' @param as_undirected If TRUE make graph edges undirected #' @param remove_loops If TRUE, remove edgeds that connect a vertex to itself -#' @param remove_multiple If TRUE remove multiple edges connencting the same +#' @param remove_multiple If TRUE remove multiple edges connencting the same #' pair of vertices -#' @param remove_isolates If TRUE, remove vertices with no edges after the +#' @param remove_isolates If TRUE, remove vertices with no edges after the #' previous alterations have been made #' @return A named list of simplified igraph graph object, with the name of each #' graph set to the name of the file it was read from. #' @export -read_simple_graphs <- function(source_dir, format = "ncol", pattern = "*", - as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE) { +read_simple_graphs <- function(source_dir, + format = "ncol", + pattern = "*", + as_undirected = TRUE, + remove_loops = TRUE, + remove_multiple = TRUE, + remove_isolates = TRUE) { # Get list of all filenames in directory that match the pattern file_names <- dir(source_dir, pattern = pattern) - # Read graph data from each matched file as an igraph format graph, + # Read graph data from each matched file as an igraph format graph, # simplifying as requested - graphs <- purrr::map(file_names, function(file_name) { - read_simple_graph(file = file.path(source_dir, file_name), format = format, - as_undirected = as_undirected, remove_loops = remove_loops, - remove_multiple = remove_multiple, - remove_isolates = remove_isolates) - }) - + graphs <- purrr::map( + file_names, + function(file_name) { + read_simple_graph( + file = file.path(source_dir, file_name), + format = format, + as_undirected = as_undirected, + remove_loops = remove_loops, + remove_multiple = remove_multiple, + remove_isolates = remove_isolates + ) + } + ) + # Name each graph with the name of the file it was read from (with any # extension moved) - names <- purrr::simplify(purrr::map(strsplit(file_names, "\\."), - function(s) { - if(length(s) == 1) { - s - } else { - paste(utils::head(s, -1), collapse = ".") - } - })) + names <- purrr::simplify( + purrr::map( + strsplit(file_names, "\\."), + function(s) { + if (length(s) == 1) { + s + } else { + paste(utils::head(s, -1), collapse = ".") + } + } + ) + ) attr(graphs, "names") <- names return(graphs) } #' Read a graph from file, simplifying as requested -#' +#' #' Reads graph data from file, constructing an a igraph graph object, making the #' requested subset of the following simplifications in the following order: #' 1. Makes the graph undirected #' 2. Removes loops (where both endpoints of an edge are the same vertex) -#' 3. Removes multiple edges (i.e. ensuring only one edge exists for each +#' 3. Removes multiple edges (i.e. ensuring only one edge exists for each #' pair of endpoints) -#' 4. Removes isolated vertices (i.e. vertices with no edges after the +#' 4. Removes isolated vertices (i.e. vertices with no edges after the #' previous alterations) #' @param file Path to file containing graph data -#' @param format Format of graph data. All formats supported by +#' @param format Format of graph data. All formats supported by #' \code{igraph::read_graph} are supported. #' @param as_undirected If TRUE make graph edges undirected #' @param remove_loops If TRUE, remove edgeds that connect a vertex to itself -#' @param remove_multiple If TRUE remove multiple edges connencting the same +#' @param remove_multiple If TRUE remove multiple edges connencting the same #' pair of vertices -#' @param remove_isolates If TRUE, remove vertices with no edges after the +#' @param remove_isolates If TRUE, remove vertices with no edges after the #' previous alterations have been made #' @return A simplified igraph graph object #' @export -read_simple_graph <- function(file, format, as_undirected = TRUE, - remove_loops = TRUE, remove_multiple = TRUE, +read_simple_graph <- function(file, format, as_undirected = TRUE, + remove_loops = TRUE, remove_multiple = TRUE, remove_isolates = TRUE) { # Read graph from file. NOTE: igraph only supported the "directed" argument - # for some formats, but passes it to formats that don't support it, which + # for some formats, but passes it to formats that don't support it, which # then throw an error - if(format %in% c("edgelist", "ncol", "lgl", "dimacs", "dl")) { + if (format %in% c("edgelist", "ncol", "lgl", "dimacs", "dl")) { graph <- igraph::read_graph(file = file, format = format, directed = TRUE) } else { graph <- igraph::read_graph(file = file, format = format) } # Perform any requested simplifications - simplify_graph(graph, as_undirected = as_undirected, - remove_loops = remove_loops, remove_multiple = remove_multiple, + simplify_graph(graph, as_undirected = as_undirected, + remove_loops = remove_loops, remove_multiple = remove_multiple, remove_isolates = remove_isolates) } #' Simplify an igraph -#' +#' #' Takes a igraph graph object and makes the requested subset of the following #' simplifications in the following order: #' 1. Makes the graph undirected #' 2. Removes loops (where both endpoints of an edge are the same vertex) -#' 3. Removes multiple edges (i.e. ensuring only one edge exists for each +#' 3. Removes multiple edges (i.e. ensuring only one edge exists for each #' pair of endpoints) -#' 4. Removes isolated vertices (i.e. vertices with no edges after the +#' 4. Removes isolated vertices (i.e. vertices with no edges after the #' previous alterations) #' @param graph An graph or list of graphs in igraph format #' @param as_undirected If TRUE make graph edges undirected #' @param remove_loops If TRUE, remove edgeds that connect a vertex to itself -#' @param remove_multiple If TRUE remove multiple edges connencting the same +#' @param remove_multiple If TRUE remove multiple edges connencting the same #' pair of vertices -#' @param remove_isolates If TRUE, remove vertices with no edges after the +#' @param remove_isolates If TRUE, remove vertices with no edges after the #' previous alterations have been made #' @return A simplified igraph graph object #' @export -simplify_graph <- function(graph, as_undirected = TRUE, remove_loops = TRUE, +simplify_graph <- function(graph, as_undirected = TRUE, remove_loops = TRUE, remove_multiple = TRUE, remove_isolates = TRUE) { - if(as_undirected) { + if (as_undirected) { # Ensure graph is undirected graph <- igraph::as.undirected(graph, mode = "each") } - if(remove_loops || remove_multiple) { - # Remove loops (where both endpoints of an edge are the same vertex) and + if (remove_loops || remove_multiple) { + # Remove loops (where both endpoints of an edge are the same vertex) and # multiple edges (where two edges have the same endpoints [in the same order # for directed graphs]) - graph <- igraph::simplify(graph, remove.loops = remove_loops, + graph <- igraph::simplify(graph, remove.loops = remove_loops, remove.multiple = remove_multiple) } - if(remove_isolates) { + if (remove_isolates) { # Remove vertices that have no edges connecting them to other vertices # NOTE: Vertices that only connect to themselves will only be removed if # their self-connecting edges have been removed by setting remove_loops to @@ -173,50 +191,51 @@ simplify_graph <- function(graph, as_undirected = TRUE, remove_loops = TRUE, return(graph) } -#' Convert a matrix of node level features to a discrete histogram for each feature -#' -#' Converts a matrix of node level features (e.g. for ORCA output this is counts -#' of each graphlet or orbit at each graph vertex) to +#' Convert a matrix of node level features to a discrete histogram for +#' each feature. +#' +#' Converts a matrix of node level features (e.g. for ORCA output this is counts +#' of each graphlet or orbit at each graph vertex) to #' a set of discrete histograms (a histogram of counts for each distinct value -#' across all graph vertices for each feature with no binning) -#' @param A number of nodes (rows) by number of features (columns) matrix, where -#' the ij entry is the score of node i on feature j (e.g. for ORCA output this is -#' counts of each graphlet or orbit at each graph vertex) -#' @return Feature histograms: List of discrete histograms for each -#' feature +#' across all graph vertices for each feature with no binning) +#' @param A number of nodes (rows) by number of features (columns) matrix, where +#' the ij entry is the score of node i on feature j (e.g. for ORCA output this +#' is counts of each graphlet or orbit at each graph vertex) +#' @return Feature histograms: List of discrete histograms for each +#' feature #' @export -graph_features_to_histograms <- function(featuresMatrix) { - apply(featuresMatrix, 2, dhist_from_obs) +graph_features_to_histograms <- function(features_matrix) { + apply(features_matrix, 2, dhist_from_obs) } -graph_features_to_histogramsSLOW <- function(featuresMatrix) { - apply(featuresMatrix, 2, dhist_from_obsSLOW) +graph_features_to_histogramsSLOW <- function(features_matrix) { + apply(features_matrix, 2, dhist_from_obsSLOW) } #' Graphlet-based degree distributions (GDDs) -#' +#' #' Generates graphlet-based degree distributions from \code{igraph} graph object, #' using the ORCA fast graphlet orbit counting package. -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. +#' @param graph A connected, undirected, simple graph as an \code{igraph} object. #' @param feature_type Type of graphlet-based feature to count: "graphlet" #' counts the number of graphlets each node participates in; "orbit" calculates #' the number of graphlet orbits each node participates in. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. #' @param ego_neighbourhood_size The number of steps from the source node to include #' nodes for each ego-network. #' @return List of graphlet-based degree distributions, with each distribution #' represented as a \code{dhist} discrete histogram object. #' @export -gdd <- function(graph, feature_type = 'orbit', max_graphlet_size = 4, +gdd <- function(graph, feature_type = 'orbit', max_graphlet_size = 4, ego_neighbourhood_size = 0){ graph <- simplify_graph(graph) if(ego_neighbourhood_size > 0) { if(feature_type != 'graphlet') { stop("Feature type not supported for ego-networks") } else { - out <- count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, + out <- count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, neighbourhood_size = ego_neighbourhood_size) } } else if(feature_type == "orbit") { @@ -231,11 +250,11 @@ gdd <- function(graph, feature_type = 'orbit', max_graphlet_size = 4, } #' Count graphlet orbits for each node in a graph -#' -#' Calculates graphlet orbit counts for each node in an \code{igraph} graph +#' +#' Calculates graphlet orbit counts for each node in an \code{igraph} graph #' object, using the ORCA fast graphlet orbit counting package. -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' @param graph A connected, undirected, simple graph as an \code{igraph} object. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. #' @return ORCA-format matrix containing counts of each graphlet #' orbit (columns) at each node in the graph (rows). @@ -266,14 +285,14 @@ count_orbits_per_node <- function(graph, max_graphlet_size) { } #' Count graphlets for each node in a graph -#' -#' Calculates graphlet counts for each node in an \code{igraph} graph object, -#' using the ORCA fast graphlet orbit counting package. by summing orbits over +#' +#' Calculates graphlet counts for each node in an \code{igraph} graph object, +#' using the ORCA fast graphlet orbit counting package. by summing orbits over #' graphlets. -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' @param graph A connected, undirected, simple graph as an \code{igraph} object. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. -#' @return ORCA-format matrix containing counts of each graphlet (columns) at +#' @return ORCA-format matrix containing counts of each graphlet (columns) at #' each node in the graph (rows). #' @export count_graphlets_per_node <- function(graph, max_graphlet_size) { @@ -282,14 +301,14 @@ count_graphlets_per_node <- function(graph, max_graphlet_size) { } #' Count total number of graphlets in a graph -#' -#' Calculates total graphlet counts for a \code{igraph} graph object using the +#' +#' Calculates total graphlet counts for a \code{igraph} graph object using the #' ORCA fast graphlet orbit counting package. Per-node graphlet counts are #' calculated by summing orbits over graphlets. These are then divided by the -#' number of nodes comprising each graphlet to avoid counting the same graphlet +#' number of nodes comprising each graphlet to avoid counting the same graphlet #' multiple times. -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' @param graph A connected, undirected, simple graph as an \code{igraph} object. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. #' @return Vector containing counts of each graphlet for the graph. #' @export @@ -298,34 +317,34 @@ count_graphlets_for_graph <- function(graph, max_graphlet_size) { # Sum graphlet counts over all nodes (rows) total_counts <- colSums(node_counts) # To ensure we only count each graphlet present in an ego network once, divide - # the graphlet counts by the number of nodes that contribute to + # the graphlet counts by the number of nodes that contribute to # each graphlet type nodes_per_graphlet <- graphlet_key(max_graphlet_size)$node_count return(total_counts / nodes_per_graphlet) } #' Ego-network graphlet counts -#' +#' #' Calculates graphlet counts for the n-step ego-network of each node in a graph -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' @param graph A connected, undirected, simple graph as an \code{igraph} object. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. #' @param neighbourhood_size The number of steps from the source node to include #' nodes for each ego-network. -#' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} +#' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} #' nodes are returned. -#' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} +#' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} #' edges are returned. -#' @param return_ego_networks If \code{TRUE}, return ego-networks alongside -#' graphlet counts to enable further processing. -#' @return If \code{return_ego_networks = FALSE}, returns an RxC matrix -#' containing counts of each graphlet (columns, C) for each ego-network in the -#' input graph (rows, R). Columns are labelled with graphlet IDs and rows are +#' @param return_ego_networks If \code{TRUE}, return ego-networks alongside +#' graphlet counts to enable further processing. +#' @return If \code{return_ego_networks = FALSE}, returns an RxC matrix +#' containing counts of each graphlet (columns, C) for each ego-network in the +#' input graph (rows, R). Columns are labelled with graphlet IDs and rows are #' labelled with the ID of the central node in each ego-network (if nodes in the #' input graph are labelled). If \code{return_ego_networks = TRUE}, returns a #' list with the following elements: #' \itemize{ -#' \item \code{graphlet_counts}: A matrix containing graphlet counts for each +#' \item \code{graphlet_counts}: A matrix containing graphlet counts for each #' ego-network in the input graph as described above. #' \item \code{ego_networks}: The ego-networks of the query graph. #' } @@ -336,16 +355,16 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size # Extract ego network for each node in original graph, naming each ego network # in the list with the name of the node the ego network is generated for ego_networks <- make_named_ego_graph(graph, order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, + min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges) # Generate graphlet counts for each node in each ego network (returns an ORCA # format graphlet count matrix for each ego network) - ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, + ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, max_graphlet_size = max_graphlet_size) # Reshape the list of per node single row graphlet count matrices to a single # ORCA format graphlet count matrix with one row per node ego_graphlet_counts <- t(simplify2array(ego_graphlet_counts)) - + # Return either graphlet counts, or graphlet counts and ego_networks if(return_ego_networks) { return(list(graphlet_counts = ego_graphlet_counts, ego_networks = ego_networks)) @@ -358,26 +377,26 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size #' JACK To follow through logic of paper steps, wanted to pass #' ego networks to the function for generating graphlet counts, #' not the input query graph directly (as in count_graphlets_ego above). -#' +#' #' Calculates graphlet counts for previously generated ego networks. -#' @param ego_networks Named list of ego networks for a graph. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' @param ego_networks Named list of ego networks for a graph. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. -#' @return returns an RxC matrix -#' containing counts of each graphlet (columns, C) for each ego-network (rows, R). -#' Columns are labelled with graphlet IDs and rows are -#' labelled with the ID of the central node in each ego-network. +#' @return returns an RxC matrix +#' containing counts of each graphlet (columns, C) for each ego-network (rows, R). +#' Columns are labelled with graphlet IDs and rows are +#' labelled with the ID of the central node in each ego-network. #' @export ego_to_graphlet_counts <- function(ego_networks, max_graphlet_size = 4) { # Generate graphlet counts for each node in each ego network (returns an ORCA # format graphlet count matrix for each ego network) - ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, + ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, max_graphlet_size = max_graphlet_size) - + # Reshape the list of per node single row graphlet count matrices to a single # ORCA format graphlet count matrix with one row per node ego_graphlet_counts <- t(simplify2array(ego_graphlet_counts)) - + # Return graphlet counts return(ego_graphlet_counts) } @@ -390,31 +409,31 @@ ego_to_graphlet_counts <- function(ego_networks, max_graphlet_size = 4) { #' @param graph An \code{igraph} object #' @param order The number of steps from the source node to include #' nodes for each ego-network. -#' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} +#' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} #' nodes are returned. -#' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} +#' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} #' edges are returned. -#' @param ... Additional parameters to be passed to the underlying +#' @param ... Additional parameters to be passed to the underlying #' \code{igraph::make_ego_graph} function used. #' @export -make_named_ego_graph <- function(graph, order, min_ego_nodes=3, +make_named_ego_graph <- function(graph, order, min_ego_nodes=3, min_ego_edges=1, ...) { - + ego_networks <- igraph::make_ego_graph(graph, order, ...) names(ego_networks) <- igraph::V(graph)$name - + # Drop ego-networks that don't have the minimum number of nodes or edges - drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { + drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { (igraph::vcount(g) < min_ego_nodes) | (igraph::ecount(g) < min_ego_edges) })) ego_networks <- ego_networks[!drop_index] - + return(ego_networks) } #' Orbit to graphlet counts -#' -#' Converts graphlet orbit counts at each vertex to graphlet counts at each +#' +#' Converts graphlet orbit counts at each vertex to graphlet counts at each #' vertex by summing over all orbits contained within each graphlet #' @param orbit_counts ORCA-format matrix containing counts of each graphlet #' orbit (columns) at each vertex in the graph (rows) @@ -423,23 +442,23 @@ make_named_ego_graph <- function(graph, order, min_ego_nodes=3, #' @export orbit_to_graphlet_counts <- function(orbit_counts) { num_orbits <- dim(orbit_counts)[2] - # Indexes to select the orbit(s) that comprise each graphlet. Note that we - # define these in the zero-based indexing used in journal papers, but + # Indexes to select the orbit(s) that comprise each graphlet. Note that we + # define these in the zero-based indexing used in journal papers, but # need to add 1 to convert to the 1-based indexing used by R if(num_orbits == 15) { # Orbits for graphlets comprising up to 4 nodes max_nodes <- 4 - orbit_to_graphlet_map <- - purrr::map(list(0, 1:2, 3, 4:5, 6:7, 8, 9:11, 12:13, 14), + orbit_to_graphlet_map <- + purrr::map(list(0, 1:2, 3, 4:5, 6:7, 8, 9:11, 12:13, 14), function(indexes){ indexes + 1}) } else if(num_orbits == 73) { # Orbits for graphlets comprising up to 5 nodes max_nodes <- 5 - orbit_to_graphlet_map <- - purrr::map(list(0, 1:2, 3, 4:5, 6:7, 8, 9:11, 12:13, 14, 15:17, 18:21, - 22:23, 24:26, 27:30, 31:33, 34, 35:38, 39:42, 43:44, - 45:48, 49:50, 51:53, 54:55, 56:58, 59:61, 62:64, - 65:67, 68:69, 70:71, 72), + orbit_to_graphlet_map <- + purrr::map(list(0, 1:2, 3, 4:5, 6:7, 8, 9:11, 12:13, 14, 15:17, 18:21, + 22:23, 24:26, 27:30, 31:33, 34, 35:38, 39:42, 43:44, + 45:48, 49:50, 51:53, 54:55, 56:58, 59:61, 62:64, + 65:67, 68:69, 70:71, 72), function(indexes){ indexes + 1}) } else { stop(("Unsupported number of orbits")) @@ -458,13 +477,13 @@ orbit_to_graphlet_counts <- function(orbit_counts) { } #' Graphlet key -#' +#' #' Metdata about graphlet groups. #' @param max_graphlet_size Maximum number of nodes graphlets can contain #' @return Metadata list with the following named fields: #' \itemize{ #' \item \code{max_nodes}: Maximum number of nodes graphlets can contain -#' \item \code{id}: ID of each graphlet in format Gn, where n is in range 0 to +#' \item \code{id}: ID of each graphlet in format Gn, where n is in range 0 to #' num_graphlets #' \item \code{node_count}: Number of nodes contained within each graphlet #' } @@ -484,18 +503,18 @@ graphlet_key <- function(max_graphlet_size) { max_node_index <- length(node_count)-1 id <- purrr::simplify(purrr::map(0:max_node_index, function(index) { paste('G', index, sep = "")})) - name <- + name <- return(list(max_nodes = max_graphlet_size, id = id, node_count = node_count)) } #' Orbit key -#' +#' #' Metdata about orbit groups. #' @param max_graphlet_size Maximum number of nodes graphlets can contain #' @return Metadata list with the following named fields: #' \itemize{ #' \item \code{max_nodes}: Maximum number of nodes graphlets can contain -#' \item \code{id}: ID of each graphlet in format On, where n is in range 0 to +#' \item \code{id}: ID of each graphlet in format On, where n is in range 0 to #' num_orbits #' \item \code{node_count}: Number of nodes contained within each graphlet #' } @@ -515,7 +534,7 @@ orbit_key <- function(max_graphlet_size) { max_node_index <- length(node_count)-1 id <- purrr::simplify(purrr::map(0:max_node_index, function(index) { paste('O', index, sep = "")})) - name <- + name <- return(list(max_nodes = max_graphlet_size, id = id, node_count = node_count)) } @@ -533,65 +552,65 @@ graphlet_ids_for_size <- function(graphlet_size) { #' Load all graphs in a directory and calculates their Graphlet-based Degree #' Distributions (GDDs) -#' +#' #' Loads graphs from all files matching the given pattern in the given directory, -#' converts them to indexed edge lists compatible with the ORCA fast orbit -#' counting package and calculates the specified set of graphlet-based degree +#' converts them to indexed edge lists compatible with the ORCA fast orbit +#' counting package and calculates the specified set of graphlet-based degree #' distributions usingthe ORCA package. #' @param source_dir Path to graph directory #' @param format Format of graph files #' @param pattern Filename pattern to match graph files -#' @param feature_type Type of graphlet-based degree distributions. Can be +#' @param feature_type Type of graphlet-based degree distributions. Can be #' \code{graphlet} to count graphlets or \code{orbit} to count orbits. #' @return A named list where each element contains a set of GDDs for a single #' @param max_graphlet_size Maximum size of graphlets to use when generating GDD -#' @param ego_neighbourhood_size The number of steps from the source node to -#' include nodes for each ego-network. If set to 0, ego-networks will not be +#' @param ego_neighbourhood_size The number of steps from the source node to +#' include nodes for each ego-network. If set to 0, ego-networks will not be #' used #' @param mc.cores Number of cores to use for parallel processing. Defaults to #' the \code{mc.cores} option set in the R environment. #' @return A named list where each element contains a set of GDDs for a single -#' graph from the source directory. Each set of GDDs is itself a named list, +#' graph from the source directory. Each set of GDDs is itself a named list, #' where each GDD element is a \code{dhist} discrete histogram object. #' @export gdd_for_all_graphs <- function( - source_dir, format = "ncol", pattern = ".txt", feature_type = "orbit", + source_dir, format = "ncol", pattern = ".txt", feature_type = "orbit", max_graphlet_size = 4, ego_neighbourhood_size = 0, mc.cores = getOption("mc.cores", 2L)) { # Create function to read graph from file and generate GDD graphs <- read_simple_graphs( source_dir = source_dir, format = format, pattern = pattern) # Calculate specified GDDs for each graph - # NOTE: mcapply only works on unix-like systems with system level forking + # NOTE: mcapply only works on unix-like systems with system level forking # capability. This means it will work on Linux and OSX, but not Windows. # For now, we just revert to single threaded operation on Windows # TODO: Look into using the parLappy function on Windows if(.Platform$OS.type != "unix") { - # Force cores to 1 if system is not unix-like as it will not support + # Force cores to 1 if system is not unix-like as it will not support # forking mc.cores = 1 } - parallel::mcmapply(gdd, graphs, MoreArgs = - list(feature_type = feature_type, + parallel::mcmapply(gdd, graphs, MoreArgs = + list(feature_type = feature_type, max_graphlet_size = max_graphlet_size, - ego_neighbourhood_size = ego_neighbourhood_size), + ego_neighbourhood_size = ego_neighbourhood_size), SIMPLIFY = FALSE, mc.cores = mc.cores) } #' Generate a cross-comparison specification -#' +#' #' Creates a cross-comparison matrix with all possible pair-wise combinations #' of elements from the provided list. #' @param named_list A named list of items for which an exhaustive pair-wise #' cross-comparison is required. #' @return A matrix with one row for each possible pair-wise combination -#' of elements from the provided named list. The first and second columns -#' contain the names of the elements in the pair and the third and fourth +#' of elements from the provided named list. The first and second columns +#' contain the names of the elements in the pair and the third and fourth #'columns contain the indexes of these elements in the provided list. #' @export cross_comparison_spec <- function(named_list) { indexes <- as.data.frame(t(utils::combn(1:length(named_list),2))) - names <- as.data.frame(cbind(names(named_list)[indexes[,1]], + names <- as.data.frame(cbind(names(named_list)[indexes[,1]], names(named_list)[indexes[,2]])) spec <- cbind(names, indexes) colnames(spec) <- c("name_a", "name_b", "index_a", "index_b") @@ -599,14 +618,14 @@ cross_comparison_spec <- function(named_list) { } #' Convert a pair-wise cross-comparison into a matrix format -#' +#' #' Converts a pair-wise cross-comparison into a matrix format #' @param measure A list of pair-wise comparison measiures #' @param cross_comparison_spec A cross-comparison specification generated #' using \code{cross_comparison_spec} #' @return A square symmetric matrix with a zero diagonal, with elements #' Cij and Cji populated from the element from \code{measure} corresponding to -#' the row of \code{cross_comparison_spec} with \code{index_a = i} and +#' the row of \code{cross_comparison_spec} with \code{index_a = i} and #' \code{index_b = j} #' @export cross_comp_to_matrix <- function(measure, cross_comparison_spec) { From 85b9fb4b88a054af6541d55f9112c706b1023698 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 30 Jul 2019 11:58:23 +0100 Subject: [PATCH 020/188] applied styler to whole package --- R/data.R | 8 +- R/dhist.R | 248 +++-- R/emd.R | 197 ++-- R/graph_binning.R | 22 +- R/measures_net_dis.R | 73 +- R/measures_net_emd.R | 157 +-- R/net_emd_speed_benchmark.R | 43 +- R/netdist_package.R | 2 +- R/orca_interface.R | 228 ++-- R/utility_functions.R | 4 +- data-raw/virus.R | 14 +- tests/testthat.R | 2 +- tests/testthat/test-cpp.R | 2 +- tests/testthat/test_dhist.R | 560 ++++++---- tests/testthat/test_emd.R | 547 +++++---- tests/testthat/test_graph_binning.R | 101 +- tests/testthat/test_measures_net_dis.R | 849 +++++++------- tests/testthat/test_measures_net_emd.R | 409 ++++--- tests/testthat/test_orca_interface.R | 1414 ++++++++++++++---------- 19 files changed, 2754 insertions(+), 2126 deletions(-) diff --git a/R/data.R b/R/data.R index 43e2c487..ee5237fd 100644 --- a/R/data.R +++ b/R/data.R @@ -1,8 +1,8 @@ #' Protein-protein interaction (PPI) networks for 5 microorganisms #' -#' A dataset containing the protein-protein interaction networks for the +#' A dataset containing the protein-protein interaction networks for the #' following 5 microorganisms -#'\itemize{ +#' \itemize{ #' \item EBV #' \itemize{ #' \item Common name: Epstein Barr virus @@ -33,11 +33,11 @@ #' \item Scientific name: Human alphaherpesvirus 3 #' \item TaxonomyID: 10335 #' } -#'} +#' } #' #' @format A list of \code{igraph} objects. #' @source \strong{PPI data (EBV, HSV-1, KSHV, VZV):} Fossum E, Friedel CC, Rajagopala SV, Titz B, Baiker A, Schmidt T, et al. (2009) Evolutionarily Conserved Herpesviral Protein Interaction Networks. PLoS Pathog 5(9): e1000570. \url{https://doi.org/10.1371/journal.ppat.1000570}. Data from Table S2 in the supporting information. #' @source \strong{PPI data (ECL):} Peregrín-Alvarez JM, Xiong X, Su C, Parkinson J (2009) The Modular Organization of Protein Interactions in Escherichia coli. PLoS Comput Biol 5(10): e1000523. \url{https://doi.org/10.1371/journal.pcbi.1000523} #' @source \strong{Taxonomy ground truth:} NCBI taxonomy database. \url{https://www.ncbi.nlm.nih.gov/taxonomy} #' @encoding UTF-8 -"virusppi" \ No newline at end of file +"virusppi" diff --git a/R/dhist.R b/R/dhist.R index af572073..4d4134b7 100644 --- a/R/dhist.R +++ b/R/dhist.R @@ -1,17 +1,17 @@ # HISTOGRAM FUNCTIONS #' Discrete histogram constructor -#' -#' Creates a discrete histogram object of class \code{dhist}, with bin +#' +#' Creates a discrete histogram object of class \code{dhist}, with bin #' \code{locations} and \code{masses} set to the 1D numeric vectors provided. #' @param locations A 1D numeric vector specifying the discrete locations #' of the histogram bins -#' @param masses A 1D numeric vector specifying the mass present at each +#' @param masses A 1D numeric vector specifying the mass present at each #' location #' @param smoothing_window_width If greater than 0, the discrete histogram will #' be treated as having the mass at each location "smoothed" uniformly across #' a bin centred on the location and having width = \code{smoothing_window_width} #' (default = \code{0} - no smoothing) -#' @param sorted Whether or not to return a discrete histogram with locations +#' @param sorted Whether or not to return a discrete histogram with locations #' and masses sorted by ascending mass (default = \code{TRUE}) #' @return A sparse discrete histogram. Format is a \code{dhist} object, which #' is a list of class \code{dhist} with the following named elements: @@ -22,30 +22,32 @@ #' Note that locations where no mass is present are not included in the returned #' \code{dhist} object. Mass in these discrete histograms is treated as being #' present precisely at the specified location. Discrete histograms should not be used -#' for data where observations have been grouped into bins representing ranges +#' for data where observations have been grouped into bins representing ranges #' of observation values. #' @export dhist <- function(locations, masses, smoothing_window_width = 0, sorted = TRUE) { - if(!is_numeric_vector_1d(locations)) { + if (!is_numeric_vector_1d(locations)) { stop("Bin locations must be provided as a 1D numeric vector") } - if(!is_numeric_vector_1d(masses)) { + if (!is_numeric_vector_1d(masses)) { stop("Bin masses must be provided as a 1D numeric vector") } - if(length(locations) != length(masses)) { + if (length(locations) != length(masses)) { stop("The number of bin locations and masses provided must be equal") } - dhist <- list(locations = locations, masses = masses, - smoothing_window_width = smoothing_window_width) + dhist <- list( + locations = locations, masses = masses, + smoothing_window_width = smoothing_window_width + ) class(dhist) <- "dhist" - if(sorted == TRUE) { + if (sorted == TRUE) { dhist <- sort_dhist(dhist) } return(dhist) } #' Compare dhists -#' +#' #' Compares all fields of the dhist and only returns treu if they are all the #' same in both dhists #' @param dhist1 A discrete histogram as a \code{dhist} object @@ -53,23 +55,23 @@ dhist <- function(locations, masses, smoothing_window_width = 0, sorted = TRUE) `==.dhist` <- function(dhist1, dhist2) { class(dhist1) == class(dhist2) && all(mapply(`==`, dhist1$locations, dhist2$locations)) && - all(mapply(`==`, dhist1$masses, dhist2$masses)) && + all(mapply(`==`, dhist1$masses, dhist2$masses)) && dhist1$smoothing_window_width == dhist2$smoothing_window_width } -update_dhist <- +update_dhist <- function(dhist, locations = dhist$locations, masses = dhist$masses, - smoothing_window_width = dhist$smoothing_window_width) { + smoothing_window_width = dhist$smoothing_window_width) { dhist$locations <- locations dhist$masses <- masses dhist$smoothing_window_width <- smoothing_window_width return(dhist) - } + } #' Set dhist smoothing -#' -#' Returns a "smoothed" copy of a \code{dhist} object with its -#' \code{smoothing_window_width} attribute set to the value provided +#' +#' Returns a "smoothed" copy of a \code{dhist} object with its +#' \code{smoothing_window_width} attribute set to the value provided #' \code{smoothing_window_width} parameter. #' @param dhist A discrete histogram as a \code{dhist} object #' @param smoothing_window_width If greater than 0, the discrete histogram will @@ -84,8 +86,8 @@ as_smoothed_dhist <- function(dhist, smoothing_window_width) { } #' Remove dhist smoothing -#' -#' Returns an "unsmoothed" copy of a \code{dhist} object with its +#' +#' Returns an "unsmoothed" copy of a \code{dhist} object with its #' \code{smoothing_window_width} attribute set to 0. #' @param dhist A discrete histogram as a \code{dhist} object #' @return A copy of a \code{dhist} object with its \code{smoothing_window_width} @@ -97,40 +99,40 @@ as_unsmoothed_dhist <- function(dhist) { } #' Check if an object is a \code{dhist} discrete histogram -#' -#' Checks if the input object is of class \code{dhist}. If \code{fast_check} is -#' \code{TRUE} then the only check is whether the object has a class attribute of +#' +#' Checks if the input object is of class \code{dhist}. If \code{fast_check} is +#' \code{TRUE} then the only check is whether the object has a class attribute of #' \code{dhist}. If \code{fast_check} is \code{FALSE} (default), then checks #' are also made to ensure that the object has the structure required of a -#' \code{dhist} object. +#' \code{dhist} object. #' @param x An arbitrary object -#' @param fast_check Boolean flag indicating whether to perform only a -#' superficial fast check limited to checking the object's class attribute +#' @param fast_check Boolean flag indicating whether to perform only a +#' superficial fast check limited to checking the object's class attribute #' is set to \code{dhist} (default = \code{TRUE}) #' @export is_dhist <- function(x, fast_check = TRUE) { # Quick check that relies on user not to construct variables with "dhist" class # that do not have the required elements - has_class_attr <-(class(x) == "dhist") - if(fast_check) { + has_class_attr <- (class(x) == "dhist") + if (fast_check) { # Early return if fast check requested return(has_class_attr) } # Otherwise check structure has_locations <- purrr::contains(attr(x, "name"), "locations") has_masses <- purrr::contains(attr(x, "name"), "masses") - # Require list with correct class and presence of 1D numeric vector named + # Require list with correct class and presence of 1D numeric vector named # elements "locations" and "masses" return(has_class_attr - && purrr::is_list(x) - && has_locations - && has_masses - && is_numeric_vector_1d(x$locations) - && is_numeric_vector_1d(x$masses)) + && purrr::is_list(x) + && has_locations + && has_masses + && is_numeric_vector_1d(x$locations) + && is_numeric_vector_1d(x$masses)) } #' Discrete histogram from observations (Pure R slow version) -#' +#' #' Generate a sparse discrete histogram from a set of discrete numeric observations #' @param observations A vector of discrete numeric observations #' @return A sparse discrete histogram. Format is a \code{dhist} object, which @@ -142,14 +144,16 @@ is_dhist <- function(x, fast_check = TRUE) { #' @export dhist_from_obs_slow <- function(observations) { # Require 1D numeric vector - if(!is_numeric_vector_1d(observations)) { + if (!is_numeric_vector_1d(observations)) { stop("Observations must be provided as a 1D numeric vector") } # Identify unique observations locations <- sort(unique(observations)) # Count occurences of each unique obervation - counts <- sapply(locations, function(location) {sum(observations == location)}) + counts <- sapply(locations, function(location) { + sum(observations == location) + }) # Construct histogram object hist <- dhist(locations = locations, masses = counts) return(hist) @@ -157,7 +161,7 @@ dhist_from_obs_slow <- function(observations) { #' Discrete histogram from observations -#' +#' #' Generate a sparse discrete histogram from a set of discrete numeric observations #' @param observations A vector of discrete numeric observations #' @return A sparse discrete histogram. Format is a \code{dhist} object, which @@ -169,25 +173,25 @@ dhist_from_obs_slow <- function(observations) { #' @export dhist_from_obs <- function(observations) { # Require 1D numeric vector - if(!is_numeric_vector_1d(observations)) { + if (!is_numeric_vector_1d(observations)) { stop("Observations must be provided as a 1D numeric vector") } - if (any(is.na(observations))) { - stop("NA observed in features") - } + if (any(is.na(observations))) { + stop("NA observed in features") + } results <- counts_from_observations(matrix(observations)) # Construct histogram object - hist <- dhist(locations = results[,1], masses = results[,2]) + hist <- dhist(locations = results[, 1], masses = results[, 2]) return(hist) } -#' Generate interpolating empirical cumulative mass function (ECMF) for +#' Generate interpolating empirical cumulative mass function (ECMF) for #' a discrete histogram -#' +#' #' @param dhist A discrete histogram as a \code{dhist} object #' @return An interpolating ECMF as an \code{approxfun} object. This function -#' will return the interpolated cumulative mass for a vector of arbitrary +#' will return the interpolated cumulative mass for a vector of arbitrary #' locations. If \code{dhist$smoothing_window_width} is zero, the ECMF will be #' piecewise constant. If \code{dhist$smoothing_window_width} is one, the ECMF #' will be piece-wise linear. If \code{dhist$smoothing_window_width} is any @@ -199,7 +203,7 @@ dhist_ecmf <- function(dhist) { # Determine cumulative mass at each location cum_mass <- cumsum(dhist$masses) # Generate ECMF - if(dhist$smoothing_window_width == 0) { + if (dhist$smoothing_window_width == 0) { # Avoid any issues with floating point equality comparison completely when # no smoothing is occurring x_knots <- dhist$locations @@ -215,22 +219,22 @@ dhist_ecmf <- function(dhist) { upper_limits <- dhist$locations + hw cum_mass_lower <- cum_mass cum_mass_upper <- cum_mass - # 2. Set lower limit cumulative masses to have the same value as the + # 2. Set lower limit cumulative masses to have the same value as the # upper limit of the previous location. This ensures constant interpolation # between the upper limit of one location and the lower limit of the next - cum_mass_lower <- c(0, utils::head(cum_mass_upper, num_locs -1)) - # 3. Identify upper limits within machine precision of the lower limit of + cum_mass_lower <- c(0, utils::head(cum_mass_upper, num_locs - 1)) + # 3. Identify upper limits within machine precision of the lower limit of # the next location - diff <- abs(utils::head(upper_limits, num_locs -1) - - utils::tail(lower_limits, num_locs -1)) + diff <- abs(utils::head(upper_limits, num_locs - 1) - + utils::tail(lower_limits, num_locs - 1)) tolerance <- .Machine$double.eps drop_indexes <- which(diff <= tolerance) - # 4. Drop upper limits and associated cumulative masses where a lower + # 4. Drop upper limits and associated cumulative masses where a lower # limit exists at the same location (to within machine precision). # NOTE: We need to skip this step entirely if there are no upper limits to # drop as vector[-0] returns an empty vector rather than all entries in the # vector. - if(length(drop_indexes) > 0) { + if (length(drop_indexes) > 0) { upper_limits <- upper_limits[-drop_indexes] cum_mass_upper <- cum_mass_upper[-drop_indexes] } @@ -247,9 +251,11 @@ dhist_ecmf <- function(dhist) { } # Construct ECMF max_mass <- cum_mass[length(cum_mass)] - dhist_ecmf <- stats::approxfun(x = x_knots, y = cum_mass, - method = interpolation_method, yleft = 0, - yright = max_mass, f = 0, ties = min) + dhist_ecmf <- stats::approxfun( + x = x_knots, y = cum_mass, + method = interpolation_method, yleft = 0, + yright = max_mass, f = 0, ties = min + ) class(dhist_ecmf) <- c("dhist_ecmf", class(dhist_ecmf)) attr(dhist_ecmf, "type") <- interpolation_method return(dhist_ecmf) @@ -257,30 +263,30 @@ dhist_ecmf <- function(dhist) { #' Get "knots" for discrete histogram empirical cumulative mass function #' (ECMF). The "knots" are the x-values at which the y-value of the ECDM changes -#' gradient (i.e. the x-values between which the ECMF does its constant or +#' gradient (i.e. the x-values between which the ECMF does its constant or #' linear interpolates) -#' -#' @param dhist_ecmf An object of class \code{dhist_ecmf}, returned from a call +#' +#' @param dhist_ecmf An object of class \code{dhist_ecmf}, returned from a call #' to the \code{dhist_ecmf} function -#' @return x_knots A list of "knots" for the ECMF, containing all x-values at +#' @return x_knots A list of "knots" for the ECMF, containing all x-values at #' which the y-value changes gradient (i.e. the x-values between which the ECMF #' does its constant or linear interpolation) #' @export ecmf_knots <- function(dhist_ecmf) { # dhist_ecmf is a stats::approxfun object and is either a piecewise constant - # or piece-wise linear function, with the x argument of the underlying + # or piece-wise linear function, with the x argument of the underlying # approxfun set to the inflexion points (or knots) of the pricewise function - # We simply recover the value of the x argument by evaluating "x" in the + # We simply recover the value of the x argument by evaluating "x" in the # environment of the dhist_ecmf approxfun - eval(expression(x), envir=environment(dhist_ecmf)) + eval(expression(x), envir = environment(dhist_ecmf)) } -#' Calculate area between two discrete histogram empirical cumulative +#' Calculate area between two discrete histogram empirical cumulative #' mass functions (ECMFs) -#' -#' @param dhist_ecmf1 An object of class \code{dhist_ecmf}, returned from a call +#' +#' @param dhist_ecmf1 An object of class \code{dhist_ecmf}, returned from a call #' to the \code{dhist_ecmf} function -#' @param dhist_ecmf2 An object of class \code{dhist_ecmf}, returned from a call +#' @param dhist_ecmf2 An object of class \code{dhist_ecmf}, returned from a call #' to the \code{dhist_ecmf} function #' @return area The area between the two discrete histogram ECMFs, calculated as #' the integral of the absolute difference between the two ECMFs @@ -289,7 +295,7 @@ area_between_dhist_ecmfs <- function(dhist_ecmf1, dhist_ecmf2) { # Ensure ECMFs have compatible types ecmf_type1 <- attr(dhist_ecmf1, "type") ecmf_type2 <- attr(dhist_ecmf2, "type") - if(ecmf_type1 != ecmf_type2) { + if (ecmf_type1 != ecmf_type2) { stop("ECMFs must have the same type") } ecmf_type <- ecmf_type1 @@ -305,7 +311,7 @@ area_between_dhist_ecmfs <- function(dhist_ecmf1, dhist_ecmf2) { x_lower <- utils::head(x, num_segs) x_upper <- utils::tail(x, num_segs) # Depending on the ECDF type, we calculate the area between ECMFs differently - if(ecmf_type == "constant") { + if (ecmf_type == "constant") { # Area of each rectangular segment between ECMFs is the absolute difference # between the ECMFs at the lower limit of the segment * the width of the # segement @@ -313,7 +319,7 @@ area_between_dhist_ecmfs <- function(dhist_ecmf1, dhist_ecmf2) { ecm_diff_lower <- utils::head(ecm_diff, num_segs) segment_width <- abs(x_upper - x_lower) segment_areas <- ecm_diff_lower * segment_width - } else if(ecmf_type == "linear") { + } else if (ecmf_type == "linear") { # -------------------------------------------------------------- # Determine area between pairs of linear segments from each ECMF # -------------------------------------------------------------- @@ -324,8 +330,8 @@ area_between_dhist_ecmfs <- function(dhist_ecmf1, dhist_ecmf2) { # Determine if ECMFs intersect within each segment. The linear segments from # each ECMF will only intersect if the ordering of the y-components of their # start and end endpoints are different (i.e. the ECMF with the y-component - # at the start of the segment has the higher y-component at the end of the - # segment). An equivalent expression of this condition is that the signs + # at the start of the segment has the higher y-component at the end of the + # segment). An equivalent expression of this condition is that the signs # of the differences between the y-components of the two linear ECMF # segments will differ at the start (lower x-bound) and end (upper x-bound) # of a segment @@ -344,16 +350,20 @@ area_between_dhist_ecmfs <- function(dhist_ecmf1, dhist_ecmf2) { x_diff <- x_upper - x_lower segment_areas <- rep(NaN, num_segs) # Use bowtie area for bowties - segment_areas[bowtie] <- - segment_area_bowtie(x_diff = x_diff[bowtie], - y_diff_lower = y_diff_lower[bowtie], - y_diff_upper = y_diff_upper[bowtie]) + segment_areas[bowtie] <- + segment_area_bowtie( + x_diff = x_diff[bowtie], + y_diff_lower = y_diff_lower[bowtie], + y_diff_upper = y_diff_upper[bowtie] + ) # Use trapezium area for other shapes (trapeziums, triangles and zero-area # co-linear) - segment_areas[trapezium] <- - segment_area_trapezium(x_diff = x_diff[trapezium], - y_diff_lower = y_diff_lower[trapezium], - y_diff_upper = y_diff_upper[trapezium]) + segment_areas[trapezium] <- + segment_area_trapezium( + x_diff = x_diff[trapezium], + y_diff_lower = y_diff_lower[trapezium], + y_diff_upper = y_diff_upper[trapezium] + ) } else { stop("ECMF type not recognised") } @@ -373,19 +383,19 @@ segment_area_bowtie <- function(x_diff, y_diff_lower, y_diff_upper) { # opposite signs and are not both zero. # See issue #21 for verification that this approach is equivalent to the # previous approach when the above conditions hold. - segment_area <- 0.5 * x_diff * (y_diff_lower^2 + y_diff_upper^2) / + segment_area <- 0.5 * x_diff * (y_diff_lower^2 + y_diff_upper^2) / (abs(y_diff_lower) + abs(y_diff_upper)) } #' Area between two offset Empirical Cumulative Mass Functions (ECMFs) -#' -#' @param ecmf1 An Empirical Cululative Mass Function (ECMF) object of class +#' +#' @param ecmf1 An Empirical Cululative Mass Function (ECMF) object of class #' \code{dhist_ecmf} -#' @param ecmf2 An Empirical Cululative Mass Function (ECMF) object of class +#' @param ecmf2 An Empirical Cululative Mass Function (ECMF) object of class #' \code{dhist_ecmf} #' @param offset An offset to add to all locations of the first ECMF. Postive #' offsets will shift the ECMF to the right and negative ones to the left. -#' @return area The area between the two ECMFs, calculated as the integral of +#' @return area The area between the two ECMFs, calculated as the integral of #' the absolute difference between the two ECMFs area_between_offset_ecmfs <- function(ecmf1, ecmf2, offset) { # Construct ECMFs for each normalised histogram @@ -395,11 +405,11 @@ area_between_offset_ecmfs <- function(ecmf1, ecmf2, offset) { } #' Sort discrete histogram -#' -#' Sort a discrete histogram so that locations are in increasing (default) or +#' +#' Sort a discrete histogram so that locations are in increasing (default) or #' decreasing order #' @param dhist A discrete histogram as a \code{dhist} object -#' @param decreasing Logical indicating whether histograms should be sorted in +#' @param decreasing Logical indicating whether histograms should be sorted in #' increasing (default) or decreasing order of location #' @export sort_dhist <- function(dhist, decreasing = FALSE) { @@ -410,8 +420,8 @@ sort_dhist <- function(dhist, decreasing = FALSE) { } #' Shift discrete histogram -#' -#' Shift the locations of a discrete histogram rightwards on the x-axis by the +#' +#' Shift the locations of a discrete histogram rightwards on the x-axis by the #' specified amount #' @param dhist A discrete histogram as a \code{dhist} object #' @param shift The distance to add to all locations @@ -423,7 +433,7 @@ shift_dhist <- function(dhist, shift) { } #' Calculate mean location for a discrete histogram -#' +#' #' Calculates mean location for a discrete histogram by taking a weighted sum #' of each location weighted by the fraction of the total histogram mass at that #' location. @@ -431,25 +441,25 @@ shift_dhist <- function(dhist, shift) { #' @return The mass-weighted mean location #' @export dhist_mean_location <- function(dhist) { - sum((dhist$masses/ sum(dhist$masses)) * dhist$locations) + sum((dhist$masses / sum(dhist$masses)) * dhist$locations) } #' Calculate variance of a discrete histogram -#' +#' #' Calculates variance directly from the discrete histogram by using locations -#' weighted by masses. -#' NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses +#' weighted by masses. +#' NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses #' may not represent counts so N is not necessarily known #' @param dhist A discrete histogram as a \code{dhist} object #' @return Variance of histogram #' @export dhist_variance <- function(dhist) { mean_centred_locations <- dhist$locations - dhist_mean_location(dhist) - # Variance is E[X^2] - E[X]. However, for mean-centred data, E[X] is zero, + # Variance is E[X^2] - E[X]. However, for mean-centred data, E[X] is zero, # so variance is simply E[X^2]. Centring prior to squaring also helps avoid - # any potential integer overfloww issues (R uses a signed 32-bit integer + # any potential integer overfloww issues (R uses a signed 32-bit integer # representation, so cannot handle integers over ~2.1 billion) - if(dhist$smoothing_window_width == 0) { + if (dhist$smoothing_window_width == 0) { # For unsmoothed discrete histograms, the mass associated with each location # is located precisely at the lcoation. Therefore cariance (i.e. E[X^2]) # is the mass-weighted sum of the mean-centred locations @@ -459,22 +469,22 @@ dhist_variance <- function(dhist) { # uniformly across a bin centred on the location with width = smoothing_window_width # Variance (i.e. E[X^2]) is therefore the mass-weighted sum of the integrals # of x^2 over the mean-centred bins at each location. - hw = dhist$smoothing_window_width / 2 + hw <- dhist$smoothing_window_width / 2 bin_lowers <- mean_centred_locations - hw bin_uppers <- mean_centred_locations + hw # See comment in issue #21 on Github repository for verification that E[X^2] # is calculated as below for a uniform bin - bin_x2_integrals <- (bin_lowers^2 + bin_uppers^2 + bin_lowers*bin_uppers) / 3 + bin_x2_integrals <- (bin_lowers^2 + bin_uppers^2 + bin_lowers * bin_uppers) / 3 variance <- sum(dhist$masses * bin_x2_integrals) / sum(dhist$masses) } return(variance) } #' Calculate standard deviation of a discrete histogram -#' -#' Calculates standard deviation directly from the discrete histogram by using +#' +#' Calculates standard deviation directly from the discrete histogram by using #' locations weighted by masses. -#' NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses +#' NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses #' may not represent counts so N is not necessarily known #' @param dhist A discrete histogram as a \code{dhist} object #' @return Standard deviation of histogram @@ -484,21 +494,21 @@ dhist_std <- function(dhist) { } #' Centre a discrete histogram around its mean location -#' -#' Centres a discrete histogram around its mass-weighted mean location by +#' +#' Centres a discrete histogram around its mass-weighted mean location by #' subtracting the mass-weighted mean from each location. #' @param dhist A discrete histogram as a \code{dhist} object #' @return The mass-weighted mean location #' @export mean_centre_dhist <- function(dhist) { centred_locations <- dhist$locations - dhist_mean_location(dhist) - dhist <- update_dhist(dhist,locations = centred_locations) + dhist <- update_dhist(dhist, locations = centred_locations) return(dhist) } #' Normalise a discrete histogram to unit mass -#' -#' Normalises a discrete histogram to unit mass by dividing each mass by the +#' +#' Normalises a discrete histogram to unit mass by dividing each mass by the #' total of the non-normalised masses #' @param dhist A discrete histogram as a \code{dhist} object #' @return A discrete histogram normalised to have mass 1 @@ -511,18 +521,18 @@ normalise_dhist_mass <- function(dhist) { } #' Normalise a discrete histogram to unit variance -#' +#' #' Normalises a discrete histogram to unit variance by dividing each centred -#' location by the standard deviation of the discrete histogram before +#' location by the standard deviation of the discrete histogram before #' decentering #' @param dhist A discrete histogram as a \code{dhist} object #' @return A discrete histogram normalised to have variance 1 #' @export normalise_dhist_variance <- function(dhist) { - # Special case for histograms with only one location and no smoothing. + # Special case for histograms with only one location and no smoothing. # Variance is zero / undefined so normalisation fails. Just return bin centres # unchanged - if(length(dhist$locations) == 1 && dhist$smoothing_window_width == 0) { + if (length(dhist$locations) == 1 && dhist$smoothing_window_width == 0) { dhist <- dhist } else { # Centre locations on mean, divide centred locations by standard deviation @@ -534,7 +544,7 @@ normalise_dhist_variance <- function(dhist) { dhist <- update_dhist(dhist, locations = normalised_locations) # If smoothing_window_width not zero, then update it to reflect the variance # normalisation - if(dhist$smoothing_window_width != 0) { + if (dhist$smoothing_window_width != 0) { normalised_smoothing_window_width <- dhist$smoothing_window_width / std_dev dhist <- update_dhist(dhist, smoothing_window_width = normalised_smoothing_window_width) } @@ -543,9 +553,9 @@ normalise_dhist_variance <- function(dhist) { } #' Harmonise a pair of discrete histograms to share a common set of locations -#' +#' #' Where a location only exists in one histogram, add this location to the other -#' histogram with zero mass. This ensures that all location exist in both +#' histogram with zero mass. This ensures that all location exist in both #' histograms. #' @param dhist1 A discrete histogram as a \code{dhist} object #' @param dhist2 A discrete histogram as a \code{dhist} object @@ -562,7 +572,7 @@ harmonise_dhist_locations <- function(dhist1, dhist2) { masses1 <- c(dhist1$masses, rep(0, length(missing_locations1))) masses2 <- c(dhist2$masses, rep(0, length(missing_locations2))) # Construct a new histogram using the dhist constructor to ensure that the - # harmonised histograms have the same properties as if they had been + # harmonised histograms have the same properties as if they had been # constructed with the additional bins in the first place # (e.g. sorted by location) dhist1 <- update_dhist(dhist1, locations = locations1, masses = masses1) @@ -571,7 +581,7 @@ harmonise_dhist_locations <- function(dhist1, dhist2) { } #' Check if 1D numeric vector -#' +#' #' Check if a variable is a 1D numeric vector by checking that: #' \itemize{ #' \item \code{is_numeric(input)}: Input is vector, matrix, array or list of numbers diff --git a/R/emd.R b/R/emd.R index efaec31c..86575521 100644 --- a/R/emd.R +++ b/R/emd.R @@ -1,29 +1,29 @@ -#' Minimum Earth Mover's Distance (EMD) -#' -#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete +#' Minimum Earth Mover's Distance (EMD) +#' +#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete #' histograms. This is the minimum EMD between the two histograms across all #' possible offsets of histogram 1 against histogram 2. #' @param dhist1 A \code{dhist} discrete histogram object #' @param dhist2 A \code{dhist} discrete histogram object -#' @param method The method to use to find the minimum EMD across all potential +#' @param method The method to use to find the minimum EMD across all potential #' offsets for each pair of histograms. Default is "optimise" to use -#' R's built-in \code{stats::optimise} method to efficiently find the offset -#' with the minimal EMD. However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' R's built-in \code{stats::optimise} method to efficiently find the offset +#' with the minimal EMD. However, this is not guaranteed to find the global +#' minimum if multiple local minima EMDs exist. You can alternatively specify the +#' "exhaustive" method, which will exhaustively evaluate the EMD between the #' histograms at all offsets that are candidates for the minimal EMD. #' @return Earth Mover's Distance between the two discrete histograms #' @export min_emd <- function(dhist1, dhist2, method = "optimise") { - # Require input to be a pair of "dhist" discrete histograms - if(!(is_dhist(dhist1) && is_dhist(dhist2))) { + # Require input to be a pair of "dhist" discrete histograms + if (!(is_dhist(dhist1) && is_dhist(dhist2))) { stop("All inputs must be 'dhist' discrete histogram objects") } - if(method == "optimise") { + if (method == "optimise") { return(min_emd_optimise_fast(dhist1, dhist2)) - } else if(method == "optimiseRonly"){ + } else if (method == "optimiseRonly") { return(min_emd_optimise(dhist1, dhist2)) - } else if(method == "exhaustive"){ + } else if (method == "exhaustive") { return(min_emd_exhaustive(dhist1, dhist2)) } else { stop("Method not recognised. Must be 'exhaustive' or ' optimise'") @@ -33,9 +33,9 @@ min_emd <- function(dhist1, dhist2, method = "optimise") { #' Minimum Earth Mover's Distance (EMD) using fast optimiser search -#' -#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete -#' histograms by minimising the offset parameter of the \code{emd} function +#' +#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete +#' histograms by minimising the offset parameter of the \code{emd} function #' using the built-in \code{stats::optimise} method. #' @param dhist1 A \code{dhist} discrete histogram object #' @param dhist2 A \code{dhist} discrete histogram object @@ -43,15 +43,14 @@ min_emd <- function(dhist1, dhist2, method = "optimise") { #' @export min_emd_optimise_fast <- function(dhist1, dhist2) { # Can we run the C++ fast implementation (only works with no smoothing)? - if ((dhist1$smoothing_window_width==0) && (dhist2$smoothing_window_width==0)) - { + if ((dhist1$smoothing_window_width == 0) && (dhist2$smoothing_window_width == 0)) { # Determine minimum and maximum offset of range in which histograms overlap # (based on sliding histogram 1) min_offset <- min(dhist2$locations) - max(dhist1$locations) max_offset <- max(dhist2$locations) - min(dhist1$locations) # Set lower and upper range for optimise algorithm to be somewhat wider than # range defined by the minimum and maximum offset. This guards against a - # couple of issues that arise if the optimise range is exactly min_offset + # couple of issues that arise if the optimise range is exactly min_offset # to max_offset # 1) If lower and upper are equal, the optimise method will throw an error # 2) It seems that optimise is not guaranteed to explore its lower and upper @@ -63,27 +62,28 @@ min_emd_optimise_fast <- function(dhist1, dhist2) { # Define a single parameter function to minimise emd as a function of offset val1 <- cumsum(dhist1$masses) val2 <- cumsum(dhist2$masses) - val1 <- val1/val1[length(val1)] - val2 <- val2/val2[length(val2)] - loc1=dhist1$locations - loc2=dhist2$locations - count=0 + val1 <- val1 / val1[length(val1)] + val2 <- val2 / val2[length(val2)] + loc1 <- dhist1$locations + loc2 <- dhist2$locations + count <- 0 emd_offset <- function(offset) { - temp1<- emd_fast_no_smoothing(loc1+offset,val1,loc2,val2) + temp1 <- emd_fast_no_smoothing(loc1 + offset, val1, loc2, val2) temp1 } # Get solution from optimiser - soln <- stats::optimise(emd_offset, lower = min_offset, upper = max_offset, - tol = .Machine$double.eps*1000) - # Return mnimum EMD and associated offset - min_emd <- soln$objective - min_offset <- soln$minimum - return(list(min_emd = min_emd, min_offset = min_offset)) + soln <- stats::optimise(emd_offset, + lower = min_offset, upper = max_offset, + tol = .Machine$double.eps * 1000 + ) + # Return mnimum EMD and associated offset + min_emd <- soln$objective + min_offset <- soln$minimum + return(list(min_emd = min_emd, min_offset = min_offset)) } - else - { + else { # Fall back on other version if either dhist is smoothed - return(min_emd_optimise(dhist1, dhist2)); + return(min_emd_optimise(dhist1, dhist2)) } } @@ -91,9 +91,9 @@ min_emd_optimise_fast <- function(dhist1, dhist2) { #' Minimum Earth Mover's Distance (EMD) using optimiser search -#' -#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete -#' histograms by minimising the offset parameter of the \code{emd} function +#' +#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete +#' histograms by minimising the offset parameter of the \code{emd} function #' using the built-in \code{stats::optimise} method. #' @param dhist1 A \code{dhist} discrete histogram object #' @param dhist2 A \code{dhist} discrete histogram object @@ -104,10 +104,10 @@ min_emd_optimise <- function(dhist1, dhist2) { # (based on sliding histogram 1) min_offset <- min(dhist2$locations) - max(dhist1$locations) max_offset <- max(dhist2$locations) - min(dhist1$locations) - + # Set lower and upper range for optimise algorithm to be somewhat wider than # range defined by the minimum and maximum offset. This guards against a - # couple of issues that arise if the optimise range is exactly min_offset + # couple of issues that arise if the optimise range is exactly min_offset # to max_offset # 1) If lower and upper are equal, the optimise method will throw and error # 2) It seems that optimise is not guaranteed to explore its lower and upper @@ -116,7 +116,7 @@ min_emd_optimise <- function(dhist1, dhist2) { buffer <- 0.1 min_offset <- min_offset - buffer max_offset <- max_offset + buffer - + # Define a single parameter function to minimise emd as a function of offset emd_offset <- function(offset) { # Construct ECMFs for each normalised histogram @@ -124,11 +124,13 @@ min_emd_optimise <- function(dhist1, dhist2) { ecmf2 <- dhist_ecmf(dhist2) area_between_dhist_ecmfs(ecmf1, ecmf2) } - + # Get solution from optimiser - soln <- stats::optimise(emd_offset, lower = min_offset, upper = max_offset, - tol = .Machine$double.eps*1000) - + soln <- stats::optimise(emd_offset, + lower = min_offset, upper = max_offset, + tol = .Machine$double.eps * 1000 + ) + # Return mnimum EMD and associated offset min_emd <- soln$objective min_offset <- soln$minimum @@ -136,20 +138,20 @@ min_emd_optimise <- function(dhist1, dhist2) { } #' Minimum Earth Mover's Distance (EMD) using exhaustive search -#' -#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete +#' +#' Calculates the minimum Earth Mover's Distance (EMD) between two discrete #' histograms using an exhaustive search. -#' -#' When "sliding" two piecewise-linear empirical cumulative mass functions -#' (ECMFs) across each other to minimise the EMD between them, it is sufficient -#' to calculate the EMD at all offsets where any knots from the two ECMFs align +#' +#' When "sliding" two piecewise-linear empirical cumulative mass functions +#' (ECMFs) across each other to minimise the EMD between them, it is sufficient +#' to calculate the EMD at all offsets where any knots from the two ECMFs align #' to ensure that the offset with the global minimum EMD is found. #' -#'This is because of the piece-wise linear nature of the two ECMFs. Between any -#'two offsets where knots from the two ECMFs align, EMD will be either constant, -#'or uniformly increasing or decreasing. Therefore, there the EMD between two -#'sets of aligned knots cannot be smaller than the EMD at one or other of the -#'bounding offsets. +#' This is because of the piece-wise linear nature of the two ECMFs. Between any +#' two offsets where knots from the two ECMFs align, EMD will be either constant, +#' or uniformly increasing or decreasing. Therefore, there the EMD between two +#' sets of aligned knots cannot be smaller than the EMD at one or other of the +#' bounding offsets. #' @param dhist1 A \code{dhist} discrete histogram object #' @param dhist2 A \code{dhist} discrete histogram object #' @return Earth Mover's Distance between the two discrete histograms @@ -164,17 +166,18 @@ min_emd_exhaustive <- function(dhist1, dhist2) { cur_offset <- 0 # 0 so that adding first step shift gives initial offset # Set state variables distance_matrix <- NULL - while(step_shift < Inf) { + while (step_shift < Inf) { dhist1 <- shift_dhist(dhist1, step_shift) cur_offset <- cur_offset + step_shift cur_emd <- emd(dhist1, dhist2) - if(cur_emd < min_emd) { + if (cur_emd < min_emd) { min_emd <- cur_emd min_offset <- cur_offset } - res <- shift_to_next_alignment(dhist1$locations, dhist2$locations, - distance_matrix_prev = distance_matrix, - shift_prev = step_shift) + res <- shift_to_next_alignment(dhist1$locations, dhist2$locations, + distance_matrix_prev = distance_matrix, + shift_prev = step_shift + ) step_shift <- res$shift distance_matrix <- res$distance_matrix } @@ -182,31 +185,31 @@ min_emd_exhaustive <- function(dhist1, dhist2) { } #' Minimum shift to next alignment of two location vectors -#' -#' Calculate minimum right shift of first location vector to make any pair of +#' +#' Calculate minimum right shift of first location vector to make any pair of #' locations from the two vectors equal #' @param x1 First location vector. This vector is being shifted rightwards #' @param x2 Second location vector. This vector is remaining unchanged. -#' @return Minimum non-zero right-shift to apply to x1 to align at least one +#' @return Minimum non-zero right-shift to apply to x1 to align at least one #' element of x1 with at least one element of x2 -shift_to_next_alignment <- function(x1, x2, distance_matrix_prev = NULL, +shift_to_next_alignment <- function(x1, x2, distance_matrix_prev = NULL, shift_prev = NULL) { - if(!is.null(distance_matrix_prev) && !is.null(shift_prev)) { + if (!is.null(distance_matrix_prev) && !is.null(shift_prev)) { # If both distance matrix and shift from previous step provided, use these # to more efficiently calculate distance matrix distance_matrix <- (distance_matrix_prev - shift_prev) } else { - # Otherwise calculate distance matrix from scratch by calculating the + # Otherwise calculate distance matrix from scratch by calculating the # distance from each x1 to each x2 # NOTE: outer() generates a matrix with the first vector mapped to rows and - # the second vector mapped to columns, so the rows will be x2 and the + # the second vector mapped to columns, so the rows will be x2 and the # columns x1 distance_matrix <- outer(x2, x1, "-") - } + } # Calculate the distance from each x1 to each x2 - # outer() generates a matrix with the first vector mapped to rows and the + # outer() generates a matrix with the first vector mapped to rows and the # second vector mapped to columns - # We're stepping x1 from left to right across x2, so drop all negative + # We're stepping x1 from left to right across x2, so drop all negative # distances. Also drop zero distances as we want to step to the next alingment # even when x1 and x2 are already aligned distance_matrix[distance_matrix <= 0] <- Inf @@ -215,8 +218,8 @@ shift_to_next_alignment <- function(x1, x2, distance_matrix_prev = NULL, return(list(shift = min(distance_matrix), distance_matrix = distance_matrix)) } -#' Earth Mover's Distance (EMD) -#' +#' Earth Mover's Distance (EMD) +#' #' Calculates the Earth Mover's Distance (EMD) between two discrete histograms #' @param dhist1 A \code{dhist} discrete histogram object #' @param dhist2 A \code{dhist} discrete histogram object @@ -224,20 +227,20 @@ shift_to_next_alignment <- function(x1, x2, distance_matrix_prev = NULL, #' @export emd <- function(dhist1, dhist2) { # Require the inputs to be "dhist" objects - if(!(is_dhist(dhist1) && is_dhist(dhist2))) { + if (!(is_dhist(dhist1) && is_dhist(dhist2))) { stop("All inputs must be 'dhist' discrete histogram objects") } - # Use efficient difference of cumulative histogram method that can also + # Use efficient difference of cumulative histogram method that can also # handle non-integer bin masses and location differences emd_cs(dhist1, dhist2) } #' Earth Mover's Distance (EMD) using the difference of cumulative sums method -#' +#' #' Takes two discrete histograms and calculates the Wasserstein / Earth Mover's -#' Distance between the two histograms by summing the absolute difference +#' Distance between the two histograms by summing the absolute difference #' between the two cumulative histograms. -#' @references +#' @references #' Calculation of the Wasserstein Distance Between Probability Distributions on the Line #' S. S. Vallender, Theory of Probability & Its Applications 1974 18:4, 784-786 #' \url{http://dx.doi.org/10.1137/1118101} @@ -255,11 +258,11 @@ emd_cs <- function(dhist1, dhist2) { } #' Earth Mover's Distance (EMD) using linear programming (LP) -#' -#' Takes two sets of histogram bin masses and bin centres and calculates the +#' +#' Takes two sets of histogram bin masses and bin centres and calculates the #' Earth Mover's Distance between the two histograms by solving the Transport #' Problem using linear programming. -#' +#' #' WARNING: Linear Programming solution will only give a correct answer if all #' masses and distances between bin centres are integers. #' @param bin_masses1 Bin masses for histogram 1 @@ -271,49 +274,51 @@ emd_cs <- function(dhist1, dhist2) { emd_lp <- function(bin_masses1, bin_masses2, bin_centres1, bin_centres2) { num_bins1 <- length(bin_masses1) num_bins2 <- length(bin_masses2) - + # Check inputs: All bins in each histogram must have a mass and centre, so # the bin_mass and bin_centre vectors for each histogram must have the same # length. - if(length(bin_centres1) != num_bins1) { + if (length(bin_centres1) != num_bins1) { stop("Number of bin masses and bin centres provided for histogram 1 must be equal") } - if(length(bin_centres2) != num_bins2) { + if (length(bin_centres2) != num_bins2) { stop("Number of bin masses and bin centres provided for histogram 2 must be equal") } - + # Generate cost matrix cost_mat <- cost_matrix(bin_centres1, bin_centres2) - - # Linear Programming solver requires all bin masses and transportation costs + + # Linear Programming solver requires all bin masses and transportation costs # to be integers to generate correct answer - if(!isTRUE(all.equal(bin_masses1, floor(bin_masses1)))) { + if (!isTRUE(all.equal(bin_masses1, floor(bin_masses1)))) { stop("All bin masses for histogram 1 must be integers for accurate Linear Programming solution") } - if(!isTRUE(all.equal(bin_masses2, floor(bin_masses2)))) { + if (!isTRUE(all.equal(bin_masses2, floor(bin_masses2)))) { stop("All bin masses for histogram 2 must be integers for accurate Linear Programming solution") } - if(!isTRUE(all.equal(cost_mat, floor(cost_mat)))) { + if (!isTRUE(all.equal(cost_mat, floor(cost_mat)))) { stop("All costs must be integers for accurate Linear Programming solution") - } + } row_signs <- rep("==", num_bins1) col_signs <- rep("<=", num_bins2) - s <- lpSolve::lp.transport(cost.mat = cost_mat, row.signs = row_signs, - col.signs = col_signs, row.rhs = bin_masses1, - col.rhs = bin_masses2) + s <- lpSolve::lp.transport( + cost.mat = cost_mat, row.signs = row_signs, + col.signs = col_signs, row.rhs = bin_masses1, + col.rhs = bin_masses2 + ) return(s$objval) } #' Inter-bin cost matrix from bin centres -#' -#' Generates a matrix for the cost of moving a unit of mass between each bin in +#' +#' Generates a matrix for the cost of moving a unit of mass between each bin in #' histogram 1 and each bin in histogram 2. #' @param bin_centres1 Bin centres for histogram 1 #' @param bin_centres2 Bin centres for histogram 2 #' @return Cost matrix cost_matrix <- function(bin_centres1, bin_centres2) { # Calculate distances between all bins in network 1 and all bins in network 2 - num_bins1 <- length(bin_centres1) + num_bins1 <- length(bin_centres1) num_bins2 <- length(bin_centres2) loc_mat1 <- matrix(bin_centres1, nrow = num_bins1, ncol = num_bins2, byrow = FALSE) loc_mat2 <- matrix(bin_centres2, nrow = num_bins1, ncol = num_bins2, byrow = TRUE) diff --git a/R/graph_binning.R b/R/graph_binning.R index 8c36cbda..f04b57df 100644 --- a/R/graph_binning.R +++ b/R/graph_binning.R @@ -9,10 +9,14 @@ binned_densities_adaptive <- function(densities, min_counts_per_interval, num_intervals) { - breaks <- adaptive_breaks(densities, min_count = min_counts_per_interval, - breaks = num_intervals) - interval_indexes <- interval_index(densities, breaks = breaks, - out_of_range_intervals = FALSE) + breaks <- adaptive_breaks(densities, + min_count = min_counts_per_interval, + breaks = num_intervals + ) + interval_indexes <- interval_index(densities, + breaks = breaks, + out_of_range_intervals = FALSE + ) list( densities = densities, interval_indexes = interval_indexes, @@ -75,15 +79,15 @@ adaptive_breaks <- function(x, min_count, breaks) { # an accurate count that includes indexes with no members with zero counts all_interval_indexes <- 1:num_intervals interval_index_counts <- plyr::count( - c(x_interval_indexes, all_interval_indexes) - ) + c(x_interval_indexes, all_interval_indexes) + ) interval_index_counts$freq <- interval_index_counts$freq - 1 # Find the first interval with fewer members than the minimum specified count merge_position <- Position( - function(i) i < min_count, - interval_index_counts$freq - ) + function(i) i < min_count, + interval_index_counts$freq + ) # Not all intervals are guaranteed to have members, so convert the index # provided by Position into an index into the full interval list and then add merge_interval_index <- interval_index_counts$x[merge_position] diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 64045e54..c0806146 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -54,8 +54,8 @@ netdis <- function(centred_graphlet_counts1, centred_graphlet_counts2, sum(counts2^2 / sqrt(counts1^2 + counts2^2), na.rm = TRUE) # Calculate intermediate "netD" statistic that falls within range -1..1 netds2 <- (1 / sqrt(norm_const)) * - sum((counts1 * counts2) / - sqrt(counts1^2 + counts2^2), na.rm = TRUE) + sum((counts1 * counts2) / + sqrt(counts1^2 + counts2^2), na.rm = TRUE) # Calculate corresponding "netd" Netdis statistic that falls within range 0..1 0.5 * (1 - netds2) } @@ -152,13 +152,13 @@ count_graphlets_ego_scaled <- function(graph, # Scale ego-network graphlet counts by dividing by total number of k-tuples in # ego-network (where k is graphlet size) ego_graphlet_tuples <- count_graphlet_tuples_ego( - ego_networks, - max_graphlet_size = max_graphlet_size - ) + ego_networks, + max_graphlet_size = max_graphlet_size + ) ego_graphlet_counts <- scale_graphlet_count( - ego_graphlet_counts, - ego_graphlet_tuples - ) + ego_graphlet_counts, + ego_graphlet_tuples + ) # Return either graphlet counts, or graphlet counts and ego_networks if (return_ego_networks) { @@ -195,11 +195,11 @@ netdis_centred_graphlet_counts <- function(graph, expected_ego_count_fn = NULL) { # Get centred counts for each ego network centred_counts <- netdis_centred_graphlet_counts_ego( - graph, - max_graphlet_size, - neighbourhood_size, - expected_ego_count_fn - ) + graph, + max_graphlet_size, + neighbourhood_size, + expected_ego_count_fn + ) # Sum centred counts over ego-networks apply(centred_counts, MARGIN = 2, FUN = sum) } @@ -277,34 +277,34 @@ netdis_expected_graphlet_counts_ego_fn <- function(graph, # graph, also returning the ego networks themselves in order to calculate # their densities res <- count_graphlets_ego_scaled( - graph, - max_graphlet_size, - neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - return_ego_networks = TRUE - ) + graph, + max_graphlet_size, + neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE + ) scaled_graphlet_counts <- res$graphlet_counts ego_networks <- res$ego_networks # Get ego-network densities densities <- purrr::simplify( - purrr::map_dbl(ego_networks, igraph::edge_density) - ) + purrr::map_dbl(ego_networks, igraph::edge_density) + ) # Adaptively bin ego-network densities binned_densities <- binned_densities_adaptive( - densities, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins - ) + densities, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins + ) # Average graphlet counts across density bins density_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - scaled_graphlet_counts, - binned_densities$interval_indexes - ) + scaled_graphlet_counts, + binned_densities$interval_indexes + ) # Return a partially applied function with the key reference graph information # built-in @@ -479,12 +479,14 @@ ego_network_density <- function(ego_networks) { scale_graphlet_counts_ego <- function(ego_networks, graphlet_counts, max_graphlet_size) { ego_graphlet_tuples <- count_graphlet_tuples_ego( - ego_networks, - max_graphlet_size = max_graphlet_size) + ego_networks, + max_graphlet_size = max_graphlet_size + ) scaled_graphlet_counts <- scale_graphlet_count( - graphlet_counts, - ego_graphlet_tuples) + graphlet_counts, + ego_graphlet_tuples + ) return(scaled_graphlet_counts) } @@ -497,7 +499,8 @@ count_graphlet_tuples <- function(graph, max_graphlet_size) { graphlet_node_counts <- graphlet_key$node_count graphlet_tuple_counts <- choose(graph_node_count, graphlet_node_counts) graphlet_tuple_counts <- stats::setNames( - graphlet_tuple_counts, - graphlet_key$id) + graphlet_tuple_counts, + graphlet_key$id + ) graphlet_tuple_counts } diff --git a/R/measures_net_emd.R b/R/measures_net_emd.R index b45b667e..cf0fd9cb 100755 --- a/R/measures_net_emd.R +++ b/R/measures_net_emd.R @@ -1,61 +1,64 @@ -#' NetEMDs between all graph pairs using provided Graphlet-based Degree -#' Distributions -#' @param gdds List containing sets of Graphlet-based Degree Distributions for +#' NetEMDs between all graph pairs using provided Graphlet-based Degree +#' Distributions +#' @param gdds List containing sets of Graphlet-based Degree Distributions for #' all graphs being compared -#' @param method The method to use to find the minimum EMD across all potential +#' @param method The method to use to find the minimum EMD across all potential #' offsets for each pair of histograms. Default is "optimise" to use -#' R's built-in \code{stats::optimise} method to efficiently find the offset -#' with the minimal EMD. However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' R's built-in \code{stats::optimise} method to efficiently find the offset +#' with the minimal EMD. However, this is not guaranteed to find the global +#' minimum if multiple local minima EMDs exist. You can alternatively specify the +#' "exhaustive" method, which will exhaustively evaluate the EMD between the #' histograms at all offsets that are candidates for the minimal EMD. #' @param return_details Logical indicating whether to return the individual #' minimal EMDs and associated offsets for all pairs of histograms #' @param smoothing_window_width Width of "top-hat" smoothing window to apply to -#' "smear" point masses across a finite width in the real domain. Default is 0, -#' which results in no smoothing. Care should be taken to select a -#' \code{smoothing_window_width} that is appropriate for the discrete domain +#' "smear" point masses across a finite width in the real domain. Default is 0, +#' which results in no smoothing. Care should be taken to select a +#' \code{smoothing_window_width} that is appropriate for the discrete domain #' (e.g.for the integer domain a width of 1 is the natural choice) #' @param mc.cores Number of cores to use for parallel processing. Defaults to #' the \code{mc.cores} option set in the R environment. -#' @return NetEMD measures between all pairs of graphs for which GDDs +#' @return NetEMD measures between all pairs of graphs for which GDDs #' were provided. Format of returned data depends on the \code{return_details} #' parameter. If set to FALSE, a list is returned with the following named -#' elements:\code{net_emd}: a vector of NetEMDs for each pair of graphs, -#' \code{comp_spec}: a comaprison specification table containing the graph names +#' elements:\code{net_emd}: a vector of NetEMDs for each pair of graphs, +#' \code{comp_spec}: a comaprison specification table containing the graph names #' and indices within the input GDD list for each pair of graphs compared. #' If \code{return_details} is set to FALSE, the list also contains the following -#' matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD +#' matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD #' used to compute the NetEMD, \code{min_offsets}: the associated offsets giving #' the minimal EMD for each GDD #' @export net_emds_for_all_graphs <- function( - gdds, method = "optimise", smoothing_window_width = 0, - return_details = FALSE, mc.cores = getOption("mc.cores", 2L)) { + gdds, method = "optimise", smoothing_window_width = 0, + return_details = FALSE, mc.cores = getOption("mc.cores", 2L)) { comp_spec <- cross_comparison_spec(gdds) - # NOTE: mcapply only works on unix-like systems with system level forking + # NOTE: mcapply only works on unix-like systems with system level forking # capability. This means it will work on Linux and OSX, but not Windows. # For now, we just revert to single threaded operation on Windows # TODO: Look into using the parLappy function on Windows - if(.Platform$OS.type != "unix") { - # Force cores to 1 if system is not unix-like as it will not support + if (.Platform$OS.type != "unix") { + # Force cores to 1 if system is not unix-like as it will not support # forking - mc.cores = 1 + mc.cores <- 1 } num_features <- length(gdds[[1]]) - out <- purrr::simplify(parallel::mcmapply(function(index_a, index_b) {net_emd( - gdds[[index_a]], gdds[[index_b]], method = method, return_details = return_details, - smoothing_window_width = smoothing_window_width) - }, comp_spec$index_a, comp_spec$index_b, SIMPLIFY = FALSE, mc.cores = mc.cores)) - if(return_details) { - net_emds <- purrr::simplify(purrr::map(out, ~.$net_emd)) - min_emds <- matrix(purrr::simplify(purrr::map(out, ~.$min_emds)), ncol = num_features, byrow = TRUE) - colnames(min_emds) <- purrr::simplify(purrr::map(1:num_features, ~paste("MinEMD_O", .-1, sep = ""))) - min_offsets <- matrix(purrr::simplify(purrr::map(out, ~.$min_offsets)), ncol = num_features, byrow = TRUE) - colnames(min_offsets) <- purrr::simplify(purrr::map(1:num_features, ~paste("MinOffsets_O", .-1, sep = ""))) - min_offsets_std <- matrix(purrr::simplify(purrr::map(out, ~.$min_offsets_std)), ncol = num_features, byrow = TRUE) - colnames(min_offsets_std) <- purrr::simplify(purrr::map(1:num_features, ~paste("MinOffsetsStd_O", .-1, sep = ""))) - ret <- list(net_emds = net_emds, comp_spec = comp_spec, min_emds = min_emds, min_offsets = min_offsets,min_offsets_std = min_offsets_std) + out <- purrr::simplify(parallel::mcmapply(function(index_a, index_b) { + net_emd( + gdds[[index_a]], gdds[[index_b]], + method = method, return_details = return_details, + smoothing_window_width = smoothing_window_width + ) + }, comp_spec$index_a, comp_spec$index_b, SIMPLIFY = FALSE, mc.cores = mc.cores)) + if (return_details) { + net_emds <- purrr::simplify(purrr::map(out, ~ .$net_emd)) + min_emds <- matrix(purrr::simplify(purrr::map(out, ~ .$min_emds)), ncol = num_features, byrow = TRUE) + colnames(min_emds) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinEMD_O", . - 1, sep = ""))) + min_offsets <- matrix(purrr::simplify(purrr::map(out, ~ .$min_offsets)), ncol = num_features, byrow = TRUE) + colnames(min_offsets) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinOffsets_O", . - 1, sep = ""))) + min_offsets_std <- matrix(purrr::simplify(purrr::map(out, ~ .$min_offsets_std)), ncol = num_features, byrow = TRUE) + colnames(min_offsets_std) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinOffsetsStd_O", . - 1, sep = ""))) + ret <- list(net_emds = net_emds, comp_spec = comp_spec, min_emds = min_emds, min_offsets = min_offsets, min_offsets_std = min_offsets_std) } else { net_emds <- out ret <- list(net_emds = net_emds, comp_spec = comp_spec) @@ -63,7 +66,7 @@ net_emds_for_all_graphs <- function( } #' NetEMD Network Earth Mover's Distance -#' +#' #' Calculates the mean minimum Earth Mover's Distance (EMD) between two sets of #' discrete histograms after normalising each histogram to unit mass and variance. #' This is calculated as follows: @@ -72,45 +75,47 @@ net_emds_for_all_graphs <- function( #' 3. Take the average minimum EMD across all histogram pairs #' @param dhists1 A \code{dhist} discrete histogram object or a list of such objects #' @param dhists2 A \code{dhist} discrete histogram object or a list of such objects -#' @param method The method to use to find the minimum EMD across all potential +#' @param method The method to use to find the minimum EMD across all potential #' offsets for each pair of histograms. Default is "optimise" to use -#' R's built-in \code{stats::optimise} method to efficiently find the offset -#' with the minimal EMD. However, this is not guaranteed to find the global -#' minimum if multiple local minima EMDs exist. You can alternatively specify the -#' "exhaustive" method, which will exhaustively evaluate the EMD between the +#' R's built-in \code{stats::optimise} method to efficiently find the offset +#' with the minimal EMD. However, this is not guaranteed to find the global +#' minimum if multiple local minima EMDs exist. You can alternatively specify the +#' "exhaustive" method, which will exhaustively evaluate the EMD between the #' histograms at all offsets that are candidates for the minimal EMD. #' @param return_details Logical indicating whether to return the individual #' minimal EMDs and associated offsets for all pairs of histograms #' @param smoothing_window_width Width of "top-hat" smoothing window to apply to -#' "smear" point masses across a finite width in the real domain. Default is 0, -#' which results in no smoothing. Care should be taken to select a -#' \code{smoothing_window_width} that is appropriate for the discrete domain +#' "smear" point masses across a finite width in the real domain. Default is 0, +#' which results in no smoothing. Care should be taken to select a +#' \code{smoothing_window_width} that is appropriate for the discrete domain #' (e.g.for the integer domain a width of 1 is the natural choice) -#' @return NetEMD measure for the two sets of discrete histograms +#' @return NetEMD measure for the two sets of discrete histograms #' (\code{return_details = FALSE}) or a list with the following named elements -#' \code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_emds}: +#' \code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_emds}: #' the minimal EMD for each pair of histograms, \code{min_offsets}: the associated #' offsets giving the minimal EMD for each pair of histograms #' @export -net_emd <- function(dhists1, dhists2, method = "optimise", +net_emd <- function(dhists1, dhists2, method = "optimise", return_details = FALSE, smoothing_window_width = 0) { # Require either a pair of "dhist" discrete histograms or two lists of "dhist" # discrete histograms pair_of_dhist_lists <- all(purrr::map_lgl(dhists1, is_dhist)) && all(purrr::map_lgl(dhists2, is_dhist)) - + # If input is two lists of "dhist" discrete histograms, determine the minimum - # EMD and associated offset for pairs of histograms taken from the same + # EMD and associated offset for pairs of histograms taken from the same # position in each list - if(pair_of_dhist_lists) { + if (pair_of_dhist_lists) { details <- purrr::map2(dhists1, dhists2, function(dhist1, dhist2) { - net_emd_single_pair(dhist1, dhist2, method = method, - smoothing_window_width = smoothing_window_width) - }) + net_emd_single_pair(dhist1, dhist2, + method = method, + smoothing_window_width = smoothing_window_width + ) + }) # Collect the minimum EMDs and associated offsets for all histogram pairs min_emds <- purrr::simplify(purrr::transpose(details)$min_emd) min_offsets <- purrr::simplify(purrr::transpose(details)$min_offset) min_offsets_std <- purrr::simplify(purrr::transpose(details)$min_offset_std) - # The NetEMD is the arithmetic mean of the minimum EMDs for each pair of + # The NetEMD is the arithmetic mean of the minimum EMDs for each pair of # histograms arithmetic_mean <- sum(min_emds) / length(min_emds) net_emd <- arithmetic_mean @@ -118,8 +123,8 @@ net_emd <- function(dhists1, dhists2, method = "optimise", # the minumum EMD and associated offsets for the individual histograms # Note that the offsets represent shifts after the histograms have been # scaled to unit variance - if(return_details) { - return(list(net_emd = net_emd, min_emds = min_emds, min_offsets = min_offsets,min_offsets_std=min_offsets_std)) + if (return_details) { + return(list(net_emd = net_emd, min_emds = min_emds, min_offsets = min_offsets, min_offsets_std = min_offsets_std)) } else { return(arithmetic_mean) } @@ -127,58 +132,60 @@ net_emd <- function(dhists1, dhists2, method = "optimise", else { # Wrap each member of a single pair of histograms is a list and recursively # call this net_emd function. This ensures they are treated the same. - return(net_emd(list(dhists1), list(dhists2), method = method, - return_details = return_details, - smoothing_window_width = smoothing_window_width)) + return(net_emd(list(dhists1), list(dhists2), + method = method, + return_details = return_details, + smoothing_window_width = smoothing_window_width + )) } } -net_emd_single_pair <- function(dhist1, dhist2, method = "optimise", +net_emd_single_pair <- function(dhist1, dhist2, method = "optimise", smoothing_window_width = 0) { - # Present dhists as smoothed or unsmoothed histograms depending on the value + # Present dhists as smoothed or unsmoothed histograms depending on the value # of smoothing_window_width - # NOTE: This MUST be done prior to any variance normalisation as the - # calculation of variance differs depending on whether or not the histograms - # are smoothed (i.e. we need to ensure that the smoothing_window_width + # NOTE: This MUST be done prior to any variance normalisation as the + # calculation of variance differs depending on whether or not the histograms + # are smoothed (i.e. we need to ensure that the smoothing_window_width # attribute of the dhists is set to the smoothing_window_width parameter # provided by the caller) - # TODO: Consider moving the smoothing of histograms outside to the user's + # TODO: Consider moving the smoothing of histograms outside to the user's # calling code. It feels a bit untidy in here. - if(smoothing_window_width == 0) { + if (smoothing_window_width == 0) { dhist1 <- as_unsmoothed_dhist(dhist1) dhist2 <- as_unsmoothed_dhist(dhist2) } else { dhist1 <- as_smoothed_dhist(dhist1, smoothing_window_width) dhist2 <- as_smoothed_dhist(dhist2, smoothing_window_width) } - + # Store means and variances to calculate offset later mean1 <- dhist_mean_location(dhist1) mean2 <- dhist_mean_location(dhist2) - + var1 <- dhist_variance(dhist1) var2 <- dhist_variance(dhist2) - - # Mean centre histograms. This helps with numerical stability as, after + + # Mean centre histograms. This helps with numerical stability as, after # variance normalisation, the differences between locations are often small. # We want to avoid calculating small differences between large numbers as # floating point precision issues can result in accumulating inaccuracies. # Mean-centering histograms results in variance normalised locations being # clustered around zero, rather than some potentially large mean location. - dhist1<-mean_centre_dhist(dhist1) - dhist2<-mean_centre_dhist(dhist2) + dhist1 <- mean_centre_dhist(dhist1) + dhist2 <- mean_centre_dhist(dhist2) # Normalise histogram to unit mass and unit variance dhist1_norm <- normalise_dhist_variance(normalise_dhist_mass(dhist1)) dhist2_norm <- normalise_dhist_variance(normalise_dhist_mass(dhist2)) - + # Calculate minimal EMD result <- min_emd(dhist1_norm, dhist2_norm, method = method) # As we mean-centred the histograms prior to passing to min_emd(), the offset - # returned is not the "true" offset for the supplied histograms. We report + # returned is not the "true" offset for the supplied histograms. We report # this as the "standardised" offset. - result$min_offset_std <- result$min_offset - # We report the "true" offset as the offset with no mean-centring, so need to + result$min_offset_std <- result$min_offset + # We report the "true" offset as the offset with no mean-centring, so need to # adjust to reverse the earlier mean-centring result$min_offset <- result$min_offset + mean2 - mean1 return(result) diff --git a/R/net_emd_speed_benchmark.R b/R/net_emd_speed_benchmark.R index 285e3e14..62c97611 100644 --- a/R/net_emd_speed_benchmark.R +++ b/R/net_emd_speed_benchmark.R @@ -1,35 +1,34 @@ -netEMDSpeedTest <- function() -{ - ##load the data +netEMDSpeedTest <- function() { + ## load the data source_dir <- system.file(file.path("extdata", "random"), package = "netdist") print(source_dir) - edge_format = "ncol" - file_pattern = "" + edge_format <- "ncol" + file_pattern <- "" # source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # edge_format = "ncol" # file_pattern = ".txt" graphs <- read_simple_graphs(source_dir = source_dir, format = edge_format, pattern = file_pattern) - n1=names(graphs) - lab1=c() - gddBuildTime=c() - netEMDtime=c() + n1 <- names(graphs) + lab1 <- c() + gddBuildTime <- c() + netEMDtime <- c() for (i in 1:length(graphs)) { for (j in 1:(i)) { - g1=graphs[[i]] - g2=graphs[[j]] - lab1=append(lab1,paste(n1[i],n1[j],sep=',')) - print(paste(n1[i],n1[j],sep=',')) - fulltimeStart=Sys.time() - gdd1=gdd(g1) - gdd2=gdd(g2) - netEMDStart=Sys.time() - net_emd(gdd1,gdd2) - endTime=Sys.time() - gddBuildTime=append(gddBuildTime,as.double(netEMDStart-fulltimeStart)) - netEMDtime=append(netEMDtime,as.double(endTime-netEMDStart)) + g1 <- graphs[[i]] + g2 <- graphs[[j]] + lab1 <- append(lab1, paste(n1[i], n1[j], sep = ",")) + print(paste(n1[i], n1[j], sep = ",")) + fulltimeStart <- Sys.time() + gdd1 <- gdd(g1) + gdd2 <- gdd(g2) + netEMDStart <- Sys.time() + net_emd(gdd1, gdd2) + endTime <- Sys.time() + gddBuildTime <- append(gddBuildTime, as.double(netEMDStart - fulltimeStart)) + netEMDtime <- append(netEMDtime, as.double(endTime - netEMDStart)) } } list(gddBuildTime = gddBuildTime, netEMDtime = netEMDtime) -} \ No newline at end of file +} diff --git a/R/netdist_package.R b/R/netdist_package.R index b5f3704a..afc198d9 100644 --- a/R/netdist_package.R +++ b/R/netdist_package.R @@ -1,3 +1,3 @@ #' @useDynLib netdist, .registration=TRUE #' @importFrom Rcpp sourceCpp -NULL \ No newline at end of file +NULL diff --git a/R/orca_interface.R b/R/orca_interface.R index 04e080ed..9293fdd5 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -35,9 +35,10 @@ graph_to_indexed_edges <- function(graph) { indexed_edges_to_graph <- function(indexed_edges) { graph <- igraph::graph_from_edgelist(indexed_edges) graph <- igraph::set.vertex.attribute( - graph, name = "name", - value = attr(indexed_edges, "vertex_names") - ) + graph, + name = "name", + value = attr(indexed_edges, "vertex_names") + ) return(graph) } @@ -79,33 +80,33 @@ read_simple_graphs <- function(source_dir, # Read graph data from each matched file as an igraph format graph, # simplifying as requested graphs <- purrr::map( - file_names, - function(file_name) { - read_simple_graph( - file = file.path(source_dir, file_name), - format = format, - as_undirected = as_undirected, - remove_loops = remove_loops, - remove_multiple = remove_multiple, - remove_isolates = remove_isolates - ) - } - ) + file_names, + function(file_name) { + read_simple_graph( + file = file.path(source_dir, file_name), + format = format, + as_undirected = as_undirected, + remove_loops = remove_loops, + remove_multiple = remove_multiple, + remove_isolates = remove_isolates + ) + } + ) # Name each graph with the name of the file it was read from (with any # extension moved) names <- purrr::simplify( - purrr::map( - strsplit(file_names, "\\."), - function(s) { - if (length(s) == 1) { - s - } else { - paste(utils::head(s, -1), collapse = ".") - } - } - ) - ) + purrr::map( + strsplit(file_names, "\\."), + function(s) { + if (length(s) == 1) { + s + } else { + paste(utils::head(s, -1), collapse = ".") + } + } + ) + ) attr(graphs, "names") <- names return(graphs) } @@ -143,9 +144,11 @@ read_simple_graph <- function(file, format, as_undirected = TRUE, graph <- igraph::read_graph(file = file, format = format) } # Perform any requested simplifications - simplify_graph(graph, as_undirected = as_undirected, - remove_loops = remove_loops, remove_multiple = remove_multiple, - remove_isolates = remove_isolates) + simplify_graph(graph, + as_undirected = as_undirected, + remove_loops = remove_loops, remove_multiple = remove_multiple, + remove_isolates = remove_isolates + ) } #' Simplify an igraph @@ -168,7 +171,7 @@ read_simple_graph <- function(file, format, as_undirected = TRUE, #' @return A simplified igraph graph object #' @export simplify_graph <- function(graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE) { + remove_multiple = TRUE, remove_isolates = TRUE) { if (as_undirected) { # Ensure graph is undirected graph <- igraph::as.undirected(graph, mode = "each") @@ -177,8 +180,10 @@ simplify_graph <- function(graph, as_undirected = TRUE, remove_loops = TRUE, # Remove loops (where both endpoints of an edge are the same vertex) and # multiple edges (where two edges have the same endpoints [in the same order # for directed graphs]) - graph <- igraph::simplify(graph, remove.loops = remove_loops, - remove.multiple = remove_multiple) + graph <- igraph::simplify(graph, + remove.loops = remove_loops, + remove.multiple = remove_multiple + ) } if (remove_isolates) { # Remove vertices that have no edges connecting them to other vertices @@ -228,23 +233,25 @@ graph_features_to_histogramsSLOW <- function(features_matrix) { #' @return List of graphlet-based degree distributions, with each distribution #' represented as a \code{dhist} discrete histogram object. #' @export -gdd <- function(graph, feature_type = 'orbit', max_graphlet_size = 4, - ego_neighbourhood_size = 0){ +gdd <- function(graph, feature_type = "orbit", max_graphlet_size = 4, + ego_neighbourhood_size = 0) { graph <- simplify_graph(graph) - if(ego_neighbourhood_size > 0) { - if(feature_type != 'graphlet') { + if (ego_neighbourhood_size > 0) { + if (feature_type != "graphlet") { stop("Feature type not supported for ego-networks") } else { - out <- count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = ego_neighbourhood_size) + out <- count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = ego_neighbourhood_size + ) } - } else if(feature_type == "orbit") { + } else if (feature_type == "orbit") { out <- count_orbits_per_node(graph, max_graphlet_size = max_graphlet_size) - } else if(feature_type == "graphlet") { + } else if (feature_type == "graphlet") { out <- count_graphlets_per_node(graph, max_graphlet_size = max_graphlet_size) } else { - stop('gdd: unrecognised feature_type') + stop("gdd: unrecognised feature_type") } graph_features_to_histograms(out) } @@ -260,16 +267,16 @@ gdd <- function(graph, feature_type = 'orbit', max_graphlet_size = 4, #' orbit (columns) at each node in the graph (rows). #' @export count_orbits_per_node <- function(graph, max_graphlet_size) { - if(max_graphlet_size == 4) { + if (max_graphlet_size == 4) { orca_fn <- orca::count4 - } else if(max_graphlet_size == 5) { + } else if (max_graphlet_size == 5) { orca_fn <- orca::count5 } else { stop("Unsupported maximum graphlet size") } indexed_edges <- graph_to_indexed_edges(graph) num_edges <- dim(indexed_edges)[[1]] - if(num_edges >= 1) { + if (num_edges >= 1) { orbit_counts <- orca_fn(indexed_edges) } else { # ORCA functions expect at least one edge, so handle this case separately @@ -354,19 +361,22 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size return_ego_networks = FALSE) { # Extract ego network for each node in original graph, naming each ego network # in the list with the name of the node the ego network is generated for - ego_networks <- make_named_ego_graph(graph, order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + ego_networks <- make_named_ego_graph(graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) # Generate graphlet counts for each node in each ego network (returns an ORCA # format graphlet count matrix for each ego network) ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, - max_graphlet_size = max_graphlet_size) + max_graphlet_size = max_graphlet_size + ) # Reshape the list of per node single row graphlet count matrices to a single # ORCA format graphlet count matrix with one row per node ego_graphlet_counts <- t(simplify2array(ego_graphlet_counts)) # Return either graphlet counts, or graphlet counts and ego_networks - if(return_ego_networks) { + if (return_ego_networks) { return(list(graphlet_counts = ego_graphlet_counts, ego_networks = ego_networks)) } else { return(ego_graphlet_counts) @@ -391,7 +401,8 @@ ego_to_graphlet_counts <- function(ego_networks, max_graphlet_size = 4) { # Generate graphlet counts for each node in each ego network (returns an ORCA # format graphlet count matrix for each ego network) ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, - max_graphlet_size = max_graphlet_size) + max_graphlet_size = max_graphlet_size + ) # Reshape the list of per node single row graphlet count matrices to a single # ORCA format graphlet count matrix with one row per node @@ -416,9 +427,8 @@ ego_to_graphlet_counts <- function(ego_networks, max_graphlet_size = 4) { #' @param ... Additional parameters to be passed to the underlying #' \code{igraph::make_ego_graph} function used. #' @export -make_named_ego_graph <- function(graph, order, min_ego_nodes=3, - min_ego_edges=1, ...) { - +make_named_ego_graph <- function(graph, order, min_ego_nodes = 3, + min_ego_edges = 1, ...) { ego_networks <- igraph::make_ego_graph(graph, order, ...) names(ego_networks) <- igraph::V(graph)$name @@ -445,28 +455,39 @@ orbit_to_graphlet_counts <- function(orbit_counts) { # Indexes to select the orbit(s) that comprise each graphlet. Note that we # define these in the zero-based indexing used in journal papers, but # need to add 1 to convert to the 1-based indexing used by R - if(num_orbits == 15) { + if (num_orbits == 15) { # Orbits for graphlets comprising up to 4 nodes max_nodes <- 4 orbit_to_graphlet_map <- - purrr::map(list(0, 1:2, 3, 4:5, 6:7, 8, 9:11, 12:13, 14), - function(indexes){ indexes + 1}) - } else if(num_orbits == 73) { + purrr::map( + list(0, 1:2, 3, 4:5, 6:7, 8, 9:11, 12:13, 14), + function(indexes) { + indexes + 1 + } + ) + } else if (num_orbits == 73) { # Orbits for graphlets comprising up to 5 nodes max_nodes <- 5 orbit_to_graphlet_map <- - purrr::map(list(0, 1:2, 3, 4:5, 6:7, 8, 9:11, 12:13, 14, 15:17, 18:21, - 22:23, 24:26, 27:30, 31:33, 34, 35:38, 39:42, 43:44, - 45:48, 49:50, 51:53, 54:55, 56:58, 59:61, 62:64, - 65:67, 68:69, 70:71, 72), - function(indexes){ indexes + 1}) + purrr::map( + list( + 0, 1:2, 3, 4:5, 6:7, 8, 9:11, 12:13, 14, 15:17, 18:21, + 22:23, 24:26, 27:30, 31:33, 34, 35:38, 39:42, 43:44, + 45:48, 49:50, 51:53, 54:55, 56:58, 59:61, 62:64, + 65:67, 68:69, 70:71, 72 + ), + function(indexes) { + indexes + 1 + } + ) } else { stop(("Unsupported number of orbits")) } # Sum counts across orbits in graphlets - graphlet_counts <- sapply(orbit_to_graphlet_map, function(indexes){ - rowSums(orbit_counts[,indexes, drop = FALSE])}) - if(dim(orbit_counts)[[1]] == 1) { + graphlet_counts <- sapply(orbit_to_graphlet_map, function(indexes) { + rowSums(orbit_counts[, indexes, drop = FALSE]) + }) + if (dim(orbit_counts)[[1]] == 1) { # If orbit counts has only a single row, sapply returns a vector # rather than a matrix, so convert to a matrix by adding dim dim(graphlet_counts) <- c(1, length(graphlet_counts)) @@ -489,22 +510,23 @@ orbit_to_graphlet_counts <- function(orbit_counts) { #' } #' @export graphlet_key <- function(max_graphlet_size) { - if(max_graphlet_size == 2) { + if (max_graphlet_size == 2) { node_count <- c(2) - } else if(max_graphlet_size == 3) { - node_count <- c(2, rep(3,2)) - } else if(max_graphlet_size == 4) { - node_count <- c(2, rep(3,2), rep(4,6)) + } else if (max_graphlet_size == 3) { + node_count <- c(2, rep(3, 2)) + } else if (max_graphlet_size == 4) { + node_count <- c(2, rep(3, 2), rep(4, 6)) } else if (max_graphlet_size == 5) { - node_count <- c(2, rep(3,2), rep(4,6), rep(5, 21)) + node_count <- c(2, rep(3, 2), rep(4, 6), rep(5, 21)) } else { stop("Unsupported maximum graphlet size") } - max_node_index <- length(node_count)-1 + max_node_index <- length(node_count) - 1 id <- purrr::simplify(purrr::map(0:max_node_index, function(index) { - paste('G', index, sep = "")})) + paste("G", index, sep = "") + })) name <- - return(list(max_nodes = max_graphlet_size, id = id, node_count = node_count)) + return(list(max_nodes = max_graphlet_size, id = id, node_count = node_count)) } #' Orbit key @@ -520,20 +542,21 @@ graphlet_key <- function(max_graphlet_size) { #' } #' @export orbit_key <- function(max_graphlet_size) { - if(max_graphlet_size == 2) { + if (max_graphlet_size == 2) { node_count <- c(2) - } else if(max_graphlet_size == 3) { - node_count <- c(2, rep(3,3)) - } else if(max_graphlet_size == 4) { - node_count <- c(2, rep(3,3), rep(4,11)) + } else if (max_graphlet_size == 3) { + node_count <- c(2, rep(3, 3)) + } else if (max_graphlet_size == 4) { + node_count <- c(2, rep(3, 3), rep(4, 11)) } else if (max_graphlet_size == 5) { - node_count <- c(2, rep(3,3), rep(4,11), rep(5, 58)) + node_count <- c(2, rep(3, 3), rep(4, 11), rep(5, 58)) } else { stop("Unsupported maximum graphlet size") } - max_node_index <- length(node_count)-1 + max_node_index <- length(node_count) - 1 id <- purrr::simplify(purrr::map(0:max_node_index, function(index) { - paste('O', index, sep = "")})) + paste("O", index, sep = "") + })) name <- return(list(max_nodes = max_graphlet_size, id = id, node_count = node_count)) } @@ -547,7 +570,7 @@ orbit_key <- function(max_graphlet_size) { #' @export graphlet_ids_for_size <- function(graphlet_size) { graphlet_key <- graphlet_key(graphlet_size) - graphlet_key$id[graphlet_key$node_count==graphlet_size] + graphlet_key$id[graphlet_key$node_count == graphlet_size] } #' Load all graphs in a directory and calculates their Graphlet-based Degree @@ -574,27 +597,32 @@ graphlet_ids_for_size <- function(graphlet_size) { #' where each GDD element is a \code{dhist} discrete histogram object. #' @export gdd_for_all_graphs <- function( - source_dir, format = "ncol", pattern = ".txt", feature_type = "orbit", - max_graphlet_size = 4, ego_neighbourhood_size = 0, - mc.cores = getOption("mc.cores", 2L)) { + source_dir, format = "ncol", pattern = ".txt", feature_type = "orbit", + max_graphlet_size = 4, ego_neighbourhood_size = 0, + mc.cores = getOption("mc.cores", 2L)) { # Create function to read graph from file and generate GDD graphs <- read_simple_graphs( - source_dir = source_dir, format = format, pattern = pattern) + source_dir = source_dir, format = format, pattern = pattern + ) # Calculate specified GDDs for each graph # NOTE: mcapply only works on unix-like systems with system level forking # capability. This means it will work on Linux and OSX, but not Windows. # For now, we just revert to single threaded operation on Windows # TODO: Look into using the parLappy function on Windows - if(.Platform$OS.type != "unix") { + if (.Platform$OS.type != "unix") { # Force cores to 1 if system is not unix-like as it will not support # forking - mc.cores = 1 + mc.cores <- 1 } - parallel::mcmapply(gdd, graphs, MoreArgs = - list(feature_type = feature_type, - max_graphlet_size = max_graphlet_size, - ego_neighbourhood_size = ego_neighbourhood_size), - SIMPLIFY = FALSE, mc.cores = mc.cores) + parallel::mcmapply(gdd, graphs, + MoreArgs = + list( + feature_type = feature_type, + max_graphlet_size = max_graphlet_size, + ego_neighbourhood_size = ego_neighbourhood_size + ), + SIMPLIFY = FALSE, mc.cores = mc.cores + ) } #' Generate a cross-comparison specification @@ -606,12 +634,14 @@ gdd_for_all_graphs <- function( #' @return A matrix with one row for each possible pair-wise combination #' of elements from the provided named list. The first and second columns #' contain the names of the elements in the pair and the third and fourth -#'columns contain the indexes of these elements in the provided list. +#' columns contain the indexes of these elements in the provided list. #' @export cross_comparison_spec <- function(named_list) { - indexes <- as.data.frame(t(utils::combn(1:length(named_list),2))) - names <- as.data.frame(cbind(names(named_list)[indexes[,1]], - names(named_list)[indexes[,2]])) + indexes <- as.data.frame(t(utils::combn(1:length(named_list), 2))) + names <- as.data.frame(cbind( + names(named_list)[indexes[, 1]], + names(named_list)[indexes[, 2]] + )) spec <- cbind(names, indexes) colnames(spec) <- c("name_a", "name_b", "index_a", "index_b") return(spec) @@ -630,7 +660,7 @@ cross_comparison_spec <- function(named_list) { #' @export cross_comp_to_matrix <- function(measure, cross_comparison_spec) { num_items <- max(c(cross_comparison_spec$index_a, cross_comparison_spec$index_b)) - out <- matrix(data = 0, nrow = num_items, ncol = num_items); + out <- matrix(data = 0, nrow = num_items, ncol = num_items) out[cbind(cross_comparison_spec$index_a, cross_comparison_spec$index_b)] <- measure out[cbind(cross_comparison_spec$index_b, cross_comparison_spec$index_a)] <- measure row_labels <- rep("", num_items) diff --git a/R/utility_functions.R b/R/utility_functions.R index b5b1cf5b..25c58b6a 100755 --- a/R/utility_functions.R +++ b/R/utility_functions.R @@ -1,8 +1,8 @@ # VECTOR FUNCTIONS rotl_vec <- function(vec, lshift) { num_els <- length(vec) - select_mask <- ((1:num_els + lshift) %% num_els) - select_mask[select_mask==0] <- num_els + select_mask <- ((1:num_els + lshift) %% num_els) + select_mask[select_mask == 0] <- num_els return(vec[select_mask]) } diff --git a/data-raw/virus.R b/data-raw/virus.R index 9fb046f6..0d4d42a2 100644 --- a/data-raw/virus.R +++ b/data-raw/virus.R @@ -7,12 +7,12 @@ load_virus_data <- function(filename) { read_simple_graph(file = file.path(data_dir, filename), format = "ncol") } -virusppi <- list(EBV = load_virus_data("EBV.txt"), - ECL = load_virus_data("ECL.txt"), - `HSV-1` = load_virus_data("HSV-1.txt"), - KSHV = load_virus_data("KSHV.txt"), - VZV = load_virus_data("VZV.txt") - ) +virusppi <- list( + EBV = load_virus_data("EBV.txt"), + ECL = load_virus_data("ECL.txt"), + `HSV-1` = load_virus_data("HSV-1.txt"), + KSHV = load_virus_data("KSHV.txt"), + VZV = load_virus_data("VZV.txt") +) devtools::use_data(virusppi, overwrite = TRUE) - diff --git a/tests/testthat.R b/tests/testthat.R index 76669a6d..bc52a947 100755 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,4 @@ library(testthat) library(netdist) -test_check("netdist") \ No newline at end of file +test_check("netdist") diff --git a/tests/testthat/test-cpp.R b/tests/testthat/test-cpp.R index 37ffa32b..8c682729 100644 --- a/tests/testthat/test-cpp.R +++ b/tests/testthat/test-cpp.R @@ -1,4 +1,4 @@ context("C++") test_that("Catch unit tests pass", { - expect_cpp_tests_pass("netdist") + expect_cpp_tests_pass("netdist") }) diff --git a/tests/testthat/test_dhist.R b/tests/testthat/test_dhist.R index 4aae5e28..37dc59b2 100644 --- a/tests/testthat/test_dhist.R +++ b/tests/testthat/test_dhist.R @@ -1,6 +1,6 @@ context("dhist: Discrete histogram from observations") test_that("discrete_hist generates correct discrete histograms for random integer observations", { - # Method for generating random observations containing specific locations a + # Method for generating random observations containing specific locations a # specific number of times random_observations <- function(locations, counts) { # Construct vector containing each location replicated "count" times @@ -8,266 +8,314 @@ test_that("discrete_hist generates correct discrete histograms for random intege # Randomise the order of the observations sample(observations, size = length(observations), replace = FALSE) } - + set.seed(2684) num_tests <- 100 - + run_test <- function() { # Set parameters for generation of random observation sets num_observations <- 100 - location_range <- -(num_observations*3):(num_observations*3) - # Do not allow zero counts as these locations will not be present in the + location_range <- -(num_observations * 3):(num_observations * 3) + # Do not allow zero counts as these locations will not be present in the # observations generated from the locations and counts count_range <- 1:10 - + # Generate random observation sets locations <- sample(location_range, num_observations, replace = FALSE) counts <- sample(count_range, num_observations, replace = TRUE) - + # Construct vector containing each location replicated "count" times observations_orig <- purrr::simplify(purrr::map2(locations, counts, rep)) # Randomise the order of the observations observations <- sample(observations_orig, size = length(observations_orig), replace = FALSE) - + # Generate discrete histograms hist <- dhist_from_obs(observations) - - # discrete_hist will drop bins with zero counts, so remove these from the - # expected data (not necessary now we've restricted counts to be >= 1, but + + # discrete_hist will drop bins with zero counts, so remove these from the + # expected data (not necessary now we've restricted counts to be >= 1, but # the bug where we generated test locations with zero counts was so annoying # to identify that we're going with a belt and braces approach) non_zero_count_indexes <- counts != 0 expected_locations <- locations[non_zero_count_indexes] expected_counts <- counts[non_zero_count_indexes] - # dhist_from_obs will return results with bins ordered by ascending location, + # dhist_from_obs will return results with bins ordered by ascending location, # so sort expected data to match sorted_locations <- sort(expected_locations, index.return = TRUE) sorted_location_indexes <- sorted_locations$ix expected_locations <- expected_locations[sorted_location_indexes] expected_counts <- expected_counts[sorted_location_indexes] - - # Check that histogram locations and counts match those used to generate the + + # Check that histogram locations and counts match those used to generate the # observations expect_true(all.equal(hist$locations, expected_locations)) expect_true(all.equal(hist$masses, expected_counts)) } - - for(i in 1:num_tests) { + + for (i in 1:num_tests) { run_test() } }) context("dhist: constructor, equality operator and as_* transformation functions") test_that("dhist constuctor has correct locations and masses (default smoothing, unsorted)", { - locations1 = c(7, 42, 1, 21, 101, 9) - masses1 = c(15, 12, 16, 13, 11, 14) + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) actual1 <- dhist(locations = locations1, masses = masses1, sorted = FALSE) - locations2 = c(3, 0, -62, 7, 16, -58) - masses2 = c(23, 24, 26, 22, 21, 25) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) actual2 <- dhist(locations = locations2, masses = masses2, sorted = FALSE) - + expected_class <- "dhist" expected_smoothing_window_width <- 0 - - expected1 = list(locations = locations1, masses = masses1, - smoothing_window_width = expected_smoothing_window_width) + + expected1 <- list( + locations = locations1, masses = masses1, + smoothing_window_width = expected_smoothing_window_width + ) class(expected1) <- expected_class - - expected2 = list(locations = locations2, masses = masses2, - smoothing_window_width = expected_smoothing_window_width) + + expected2 <- list( + locations = locations2, masses = masses2, + smoothing_window_width = expected_smoothing_window_width + ) class(expected2) <- expected_class - + expect_equal(actual1, expected1) expect_equal(actual2, expected2) }) test_that("dhist constuctor has correct locations and masses (default smoothing, sorted)", { - locations1 = c(7, 42, 1, 21, 101, 9) - masses1 = c(15, 12, 16, 13, 11, 14) + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) actual1 <- dhist(locations = locations1, masses = masses1, sorted = TRUE) - locations2 = c(3, 0, -62, 7, 16, -58) - masses2 = c(23, 24, 26, 22, 21, 25) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) actual2 <- dhist(locations = locations2, masses = masses2, sorted = TRUE) - + expected_class <- "dhist" expected_smoothing_window_width <- 0 - - expected1 = list(locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13 ,12, 11), - smoothing_window_width = expected_smoothing_window_width) + + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = expected_smoothing_window_width + ) class(expected1) <- expected_class - - expected2 = list(locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = expected_smoothing_window_width) + + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = expected_smoothing_window_width + ) class(expected2) <- expected_class - + expect_equal(actual1, expected1) expect_equal(actual2, expected2) }) test_that("dhist constuctor has correct locations and masses (default smoothing, default sorting)", { - locations1 = c(7, 42, 1, 21, 101, 9) - masses1 = c(15, 12, 16, 13, 11, 14) + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) actual1 <- dhist(locations = locations1, masses = masses1) - locations2 = c(3, 0, -62, 7, 16, -58) - masses2 = c(23, 24, 26, 22, 21, 25) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) actual2 <- dhist(locations = locations2, masses = masses2) - + expected_class <- "dhist" expected_smoothing_window_width <- 0 - - expected1 = list(locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13 ,12, 11), - smoothing_window_width = expected_smoothing_window_width) + + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = expected_smoothing_window_width + ) class(expected1) <- expected_class - - expected2 = list(locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = expected_smoothing_window_width) + + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = expected_smoothing_window_width + ) class(expected2) <- expected_class - + expect_equal(actual1, expected1) expect_equal(actual2, expected2) }) test_that("dhist constuctor has correct locations and masses (specified smoothing, unsorted)", { smoothing_window_width <- 1 - - locations1 = c(7, 42, 1, 21, 101, 9) - masses1 = c(15, 12, 16, 13, 11, 14) - actual1 <- dhist(locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width, sorted = FALSE) - locations2 = c(3, 0, -62, 7, 16, -58) - masses2 = c(23, 24, 26, 22, 21, 25) - actual2 <- dhist(locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width, sorted = FALSE) - + + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width, sorted = FALSE + ) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width, sorted = FALSE + ) + expected_class <- "dhist" - - expected1 = list(locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width) + + expected1 <- list( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width + ) class(expected1) <- expected_class - - expected2 = list(locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width) + + expected2 <- list( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width + ) class(expected2) <- expected_class - + expect_equal(actual1, expected1) expect_equal(actual2, expected2) }) test_that("dhist constuctor has correct locations and masses (specified smoothing, sorted)", { smoothing_window_width <- 1 - - locations1 = c(7, 42, 1, 21, 101, 9) - masses1 = c(15, 12, 16, 13, 11, 14) - actual1 <- dhist(locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width, sorted = TRUE) - locations2 = c(3, 0, -62, 7, 16, -58) - masses2 = c(23, 24, 26, 22, 21, 25) - actual2 <- dhist(locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width, sorted = TRUE) - + + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width, sorted = TRUE + ) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width, sorted = TRUE + ) + expected_class <- "dhist" - - expected1 = list(locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13 ,12, 11), - smoothing_window_width = smoothing_window_width) + + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = smoothing_window_width + ) class(expected1) <- expected_class - - expected2 = list(locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = smoothing_window_width) + + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = smoothing_window_width + ) class(expected2) <- expected_class - + expect_equal(actual1, expected1) expect_equal(actual2, expected2) }) test_that("dhist constuctor has correct locations and masses (specified smoothing, default sorting)", { smoothing_window_width <- 1 - - locations1 = c(7, 42, 1, 21, 101, 9) - masses1 = c(15, 12, 16, 13, 11, 14) - actual1 <- dhist(locations = locations1, masses = masses1, - smoothing_window_width = smoothing_window_width) - locations2 = c(3, 0, -62, 7, 16, -58) - masses2 = c(23, 24, 26, 22, 21, 25) - actual2 <- dhist(locations = locations2, masses = masses2, - smoothing_window_width = smoothing_window_width) - + + locations1 <- c(7, 42, 1, 21, 101, 9) + masses1 <- c(15, 12, 16, 13, 11, 14) + actual1 <- dhist( + locations = locations1, masses = masses1, + smoothing_window_width = smoothing_window_width + ) + locations2 <- c(3, 0, -62, 7, 16, -58) + masses2 <- c(23, 24, 26, 22, 21, 25) + actual2 <- dhist( + locations = locations2, masses = masses2, + smoothing_window_width = smoothing_window_width + ) + expected_class <- "dhist" - - expected1 = list(locations = c(1, 7, 9, 21, 42, 101), - masses = c(16, 15, 14, 13 ,12, 11), - smoothing_window_width = smoothing_window_width) + + expected1 <- list( + locations = c(1, 7, 9, 21, 42, 101), + masses = c(16, 15, 14, 13, 12, 11), + smoothing_window_width = smoothing_window_width + ) class(expected1) <- expected_class - - expected2 = list(locations = c(-62, -58, 0, 3, 7, 16), - masses = c(26, 25, 24, 23, 22, 21), - smoothing_window_width = smoothing_window_width) + + expected2 <- list( + locations = c(-62, -58, 0, 3, 7, 16), + masses = c(26, 25, 24, 23, 22, 21), + smoothing_window_width = smoothing_window_width + ) class(expected2) <- expected_class - + expect_equal(actual1, expected1) expect_equal(actual2, expected2) }) test_that("as_smoothed_dhist sets smoothing_window_width correctly", { - dhist_pre <- dhist(locations <- c(7, 42, 1, 21, 101, 9), - masses = c(15, 12, 16, 13, 11, 14)) + dhist_pre <- dhist(locations <- c(7, 42, 1, 21, 101, 9), + masses = c(15, 12, 16, 13, 11, 14) + ) expected_smoothing_window_width_pre <- 0 expected_smoothing_window_width_post <- 1 - - expect_equal(dhist_pre$smoothing_window_width, - expected_smoothing_window_width_pre) + + expect_equal( + dhist_pre$smoothing_window_width, + expected_smoothing_window_width_pre + ) dhist_post <- as_smoothed_dhist(dhist_pre, expected_smoothing_window_width_post) - expect_equal(dhist_post$smoothing_window_width, - expected_smoothing_window_width_post) + expect_equal( + dhist_post$smoothing_window_width, + expected_smoothing_window_width_post + ) }) test_that("as_unsmoothed_dhist sets smoothing_window_width correctly", { - dhist_pre <- dhist(locations <- c(7, 42, 1, 21, 101, 9), - masses = c(15, 12, 16, 13, 11, 14), - smoothing_window_width <- 1) + dhist_pre <- dhist(locations <- c(7, 42, 1, 21, 101, 9), + masses = c(15, 12, 16, 13, 11, 14), + smoothing_window_width <- 1 + ) expected_smoothing_window_width_pre <- 1 expected_smoothing_window_width_post <- 0 - - expect_equal(dhist_pre$smoothing_window_width, - expected_smoothing_window_width_pre) + + expect_equal( + dhist_pre$smoothing_window_width, + expected_smoothing_window_width_pre + ) dhist_post <- as_smoothed_dhist(dhist_pre, expected_smoothing_window_width_post) - expect_equal(dhist_post$smoothing_window_width, - expected_smoothing_window_width_post) + expect_equal( + dhist_post$smoothing_window_width, + expected_smoothing_window_width_post + ) }) test_that("Identical dhists are considered equal", { - dhist1 <- dhist(locations <- c(7, 42, 1, 21, 101, 9), - masses = c(15, 12, 16, 13, 11, 14), - smoothing_window = 0) + dhist1 <- dhist(locations <- c(7, 42, 1, 21, 101, 9), + masses = c(15, 12, 16, 13, 11, 14), + smoothing_window = 0 + ) dhist2 <- dhist1 expect_true(dhist1 == dhist2) }) test_that("Non-identical dhists are NOT considered equal", { - dhist1 <- dhist(locations <- c(7, 42, 1, 21, 101, 9), - masses = c(15, 12, 16, 13, 11, 14), - smoothing_window = 0) - + dhist1 <- dhist(locations <- c(7, 42, 1, 21, 101, 9), + masses = c(15, 12, 16, 13, 11, 14), + smoothing_window = 0 + ) + # Change a single element of the locations field dhist2_one_location_mismatch <- dhist1 dhist2_one_location_mismatch$locations[3] <- dhist2_one_location_mismatch$locations[1] + 1 expect_false(dhist1 == dhist2_one_location_mismatch) - + # Change a single element of the masses field dhist2_one_mass_mismatch <- dhist1 dhist2_one_mass_mismatch$masses[2] <- dhist2_one_mass_mismatch$masses[1] + 1 expect_false(dhist1 == dhist2_one_mass_mismatch) - + # Change the smoothing window field dhist2_smoothing_mismatch <- dhist1 dhist2_smoothing_mismatch$smoothing_window_width <- 1 expect_false(dhist1 == dhist2_smoothing_mismatch) - + # Change class dhist2_class_mismatch <- dhist1 attr(dhist2_class_mismatch, "class") <- "mismatch" @@ -276,21 +324,21 @@ test_that("Non-identical dhists are NOT considered equal", { context("dhist: Discrete histogram variance") test_that("dhist_variance difference for smoothed and unsmoothed dhists is smoothing_window_width^2 / 12", { - dhist <- dhist(locations <- c(7, 42, 1, 21, 101, 9), masses = c(15, 12, 16, 13, 11, 14)) + dhist <- dhist(locations <- c(7, 42, 1, 21, 101, 9), masses = c(15, 12, 16, 13, 11, 14)) # Be careful: ensure that no smoothing window width results in overlapping bins smoothing_window_width_A <- 1 smoothing_window_width_B <- 2 dhist_unsmoothed <- as_unsmoothed_dhist(dhist) dhist_smoothed_A <- as_smoothed_dhist(dhist, smoothing_window_width_A) dhist_smoothed_B <- as_smoothed_dhist(dhist, smoothing_window_width_B) - + var_unsmoothed <- dhist_variance(dhist_unsmoothed) var_smoothed_A <- dhist_variance(dhist_smoothed_A) var_smoothed_B <- dhist_variance(dhist_smoothed_B) - + expected_var_smoothed_A <- var_unsmoothed + ((smoothing_window_width_A^2) / 12) expected_var_smoothed_B <- var_unsmoothed + ((smoothing_window_width_B^2) / 12) - + expect_equal(var_smoothed_A, expected_var_smoothed_A) expect_equal(var_smoothed_B, expected_var_smoothed_B) }) @@ -298,23 +346,27 @@ test_that("dhist_variance difference for smoothed and unsmoothed dhists is smoot test_that("dhist_variance returns sigma^2 for unsmoothed normal histograms", { num_hists <- 5 num_bins <- 100001 - + mus <- runif(num_hists, -10, 10) sigmas <- runif(num_hists, 0, 10) - - rand_locations <- function(mu, sigma) {return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins))} - + + rand_locations <- function(mu, sigma) { + return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) + } + rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { locations <- rand_locations(mu, sigma) masses <- dnorm(locations, mean = mu, sd = sigma) return(dhist(masses = masses, locations = locations)) }) - + actuals <- purrr::map_dbl(rand_dhists, dhist_variance) - expected <- purrr::map_dbl(sigmas, function(sigma) {return(sigma^2)}) - + expected <- purrr::map_dbl(sigmas, function(sigma) { + return(sigma^2) + }) + expect_equalish <- function(actual, expected) { - scaled_diff <- abs(actual - expected)/min(actual, expected) + scaled_diff <- abs(actual - expected) / min(actual, expected) max_diff <- 1e-4 return(expect_lte(scaled_diff, max_diff)) } @@ -326,25 +378,35 @@ test_that("normalise_dhist_mass output sums to 1", { # Generate histograms with random masses (no centres needed for this test) num_hists <- 10 num_bins <- 100 - + mass_min <- 0 mass_max <- 100 - rand_bin_masses <- function() {return(runif(num_bins, mass_min, mass_max))} + rand_bin_masses <- function() { + return(runif(num_bins, mass_min, mass_max)) + } bin_mass_lists <- replicate(num_hists, rand_bin_masses(), simplify = FALSE) # Locations are unimportant as they do not affect mass normalisation locations <- 1:num_bins smoothing_window_width <- 1 - + normalised_dhists <- purrr::map(bin_mass_lists, function(masses) { - normalise_dhist_mass(dhist(masses = masses, locations = locations, - smoothing_window_width = smoothing_window_width)) + normalise_dhist_mass(dhist( + masses = masses, locations = locations, + smoothing_window_width = smoothing_window_width + )) }) expected_total_mass <- 1 # Check total masses match expectations - purrr::map_dbl(normalised_dhists, function(dhist) {expect_equal(sum(dhist$masses), expected_total_mass)}) + purrr::map_dbl(normalised_dhists, function(dhist) { + expect_equal(sum(dhist$masses), expected_total_mass) + }) # Check other histogram properties unchanged - purrr::walk(normalised_dhists, function(dhist) {expect_equal(dhist$locations, locations)}) - purrr::walk(normalised_dhists, function(dhist) {expect_equal(dhist$smoothing_window_width, smoothing_window_width)}) + purrr::walk(normalised_dhists, function(dhist) { + expect_equal(dhist$locations, locations) + }) + purrr::walk(normalised_dhists, function(dhist) { + expect_equal(dhist$smoothing_window_width, smoothing_window_width) + }) }) context("dhist: Discrete histogram variance normalisation") @@ -352,60 +414,92 @@ test_that("normalise_histogram_variance output has variance of 1 for random inte # Generate histograms with random masses and random centres num_hists <- 10 num_bins <- 70 - + mass_min <- 0 mass_max <- 100 - rand_masses <- function() {return(runif(num_bins, mass_min, mass_max))} - + rand_masses <- function() { + return(runif(num_bins, mass_min, mass_max)) + } + centre_min <- -30 centre_max <- 70 - rand_locations <- function() {return(round(sample(centre_min:centre_max, num_bins), digits = 0))} - + rand_locations <- function() { + return(round(sample(centre_min:centre_max, num_bins), digits = 0)) + } + rand_dhists <- replicate(num_hists, dhist(masses = rand_masses(), locations = rand_locations()), simplify = FALSE) - + smoothing_window_width <- 1 rand_dhists_unsmoothed <- purrr::map(rand_dhists, as_unsmoothed_dhist) rand_dhists_smoothed <- purrr::map(rand_dhists, as_smoothed_dhist, smoothing_window_width = smoothing_window_width) - + expected_post_norm_smoothing_windows <- purrr::map_dbl(rand_dhists_smoothed, function(dhist) { - smoothing_window_width/dhist_std(dhist) - }) - - actual_dhist_unsmoothed <- purrr::map(rand_dhists_unsmoothed, function(dhist) {normalise_dhist_variance(dhist)}) - actual_dhist_smoothed <- purrr::map(rand_dhists_smoothed, function(dhist) {normalise_dhist_variance(dhist)}) + smoothing_window_width / dhist_std(dhist) + }) + + actual_dhist_unsmoothed <- purrr::map(rand_dhists_unsmoothed, function(dhist) { + normalise_dhist_variance(dhist) + }) + actual_dhist_smoothed <- purrr::map(rand_dhists_smoothed, function(dhist) { + normalise_dhist_variance(dhist) + }) expected_variance <- 1 # Check variance of normalised hostograms is as expected - purrr::walk(actual_dhist_unsmoothed, function(dhist) {expect_equal(dhist_variance(dhist), expected_variance)}) - purrr::walk(actual_dhist_smoothed, function(dhist) {expect_equal(dhist_variance(dhist), expected_variance)}) + purrr::walk(actual_dhist_unsmoothed, function(dhist) { + expect_equal(dhist_variance(dhist), expected_variance) + }) + purrr::walk(actual_dhist_smoothed, function(dhist) { + expect_equal(dhist_variance(dhist), expected_variance) + }) # Check smoothing window is as expected (0 for unsmoothe; smoothing_window_width/sigma for smoothed) - purrr::walk(actual_dhist_unsmoothed, function(dhist) {expect_equal(dhist$smoothing_window_width, 0)}) - purrr::walk2(actual_dhist_smoothed, expected_post_norm_smoothing_windows, - function(dhist, sww) {expect_equal(dhist$smoothing_window_width, sww)}) + purrr::walk(actual_dhist_unsmoothed, function(dhist) { + expect_equal(dhist$smoothing_window_width, 0) + }) + purrr::walk2( + actual_dhist_smoothed, expected_post_norm_smoothing_windows, + function(dhist, sww) { + expect_equal(dhist$smoothing_window_width, sww) + } + ) # Check masses unaltered - purrr::walk2(actual_dhist_unsmoothed, rand_dhists_unsmoothed, - function(actual, expected) {expect_equal(actual$masses, expected$masses)}) - purrr::walk2(actual_dhist_smoothed, rand_dhists_smoothed, - function(actual, expected) {expect_equal(actual$masses, expected$masses)}) + purrr::walk2( + actual_dhist_unsmoothed, rand_dhists_unsmoothed, + function(actual, expected) { + expect_equal(actual$masses, expected$masses) + } + ) + purrr::walk2( + actual_dhist_smoothed, rand_dhists_smoothed, + function(actual, expected) { + expect_equal(actual$masses, expected$masses) + } + ) }) test_that("normalise_histogram_variance output has variance of 1 for normal histograms", { num_hists <- 5 num_bins <- 100001 - + mus <- runif(num_hists, -10, 10) sigmas <- runif(num_hists, 0, 10) - - rand_locations <- function(mu, sigma) {return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins))} - + + rand_locations <- function(mu, sigma) { + return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) + } + rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { locations <- rand_locations(mu, sigma) masses <- dnorm(locations, mean = mu, sd = sigma) return(dhist(masses = masses, locations = locations)) }) - actuals <- purrr::map(rand_dhists, function(dhist) {dhist_variance(normalise_dhist_variance(dhist))}) + actuals <- purrr::map(rand_dhists, function(dhist) { + dhist_variance(normalise_dhist_variance(dhist)) + }) expected <- 1 - purrr::map_dbl(actuals, function(actual) {expect_equal(actual, expected)}) + purrr::map_dbl(actuals, function(actual) { + expect_equal(actual, expected) + }) }) context("dhist: Sort dhist") @@ -416,12 +510,12 @@ test_that("sort_dhist works", { class(dhist1) <- "dhist" dhist2 <- list(locations = c(3, 0, -62, 7, 16, -58), masses = c(23, 24, 26, 22, 21, 25)) class(dhist2) <- "dhist" - - expected1 = list(locations = c(1, 7, 9, 21, 42, 101), masses = c(16, 15, 14, 13 ,12, 11)) + + expected1 <- list(locations = c(1, 7, 9, 21, 42, 101), masses = c(16, 15, 14, 13, 12, 11)) class(expected1) <- "dhist" - expected2 = list(locations = c(-62, -58, 0, 3, 7, 16), masses = c(26, 25, 24, 23, 22, 21)) + expected2 <- list(locations = c(-62, -58, 0, 3, 7, 16), masses = c(26, 25, 24, 23, 22, 21)) class(expected2) <- "dhist" - + actual1 <- sort_dhist(dhist1) actual2 <- sort_dhist(dhist2) @@ -433,7 +527,7 @@ context("dhist: ECMF") test_that("dhist_ecmf returns correct step function when smoothing_window_width is zero", { dhist1 <- dhist(locations = c(1, 2, 4, 7, 11, 16, 22), masses = c(21, 22, 23, 27, 31, 36, 42)) dhist1_unsmoothed <- as_unsmoothed_dhist(dhist1) - + ecmf1 <- dhist_ecmf(dhist1) actual_knots1 <- ecmf_knots(ecmf1) actual_knots_ecds1 <- ecmf1(actual_knots1) @@ -441,14 +535,14 @@ test_that("dhist_ecmf returns correct step function when smoothing_window_width actual_inter_knots_ecds1 <- ecmf1(inter_knots_x) extra_knots <- c(actual_knots1[1] - 1, actual_knots1[length(actual_knots1)] + 1) actual_extra_knots_ecds1 <- ecmf1(extra_knots) - + cum_masses1 <- cumsum(dhist1$masses) max_cum_mass <- cum_masses1[length(cum_masses1)] expected_knots_ecds1 <- cum_masses1 - expected_inter_knots_ecds1 <- head(expected_knots_ecds1, length(expected_knots_ecds1) -1) + expected_inter_knots_ecds1 <- head(expected_knots_ecds1, length(expected_knots_ecds1) - 1) expected_extra_knots_ecds1 <- c(0, max_cum_mass) expected_knots1 <- dhist1$locations - + expect_equal(actual_knots1, expected_knots1) expect_equal(actual_knots_ecds1, expected_knots_ecds1) expect_equal(actual_inter_knots_ecds1, expected_inter_knots_ecds1) @@ -457,7 +551,7 @@ test_that("dhist_ecmf returns correct step function when smoothing_window_width context("dhist: Area between ECMFs (simple integer dhists)") test_that("area_between_dhist_ecmfs returns correct value for simple integer dhists", { - # Example dhists constructed by hand to result in lots of "bowtie" segments + # Example dhists constructed by hand to result in lots of "bowtie" segments # for smoothed ECMFs and to allow expected areas to be calculated by hand # Unsmoothed locations are on an integer grid, smoothed bin edges are on a # half-integer grid @@ -465,18 +559,18 @@ test_that("area_between_dhist_ecmfs returns correct value for simple integer dhi # Smoothed ECMF crossing points are on a quarter-integer grid dhistA <- dhist(locations = c(1, 3, 4), masses = c(2, 1, 1)) dhistB <- dhist(locations = c(0, 2, 4, 5), masses = c(0.5, 2, 0.5, 1)) - + # Set up smoothed and unsmoothed versions of histograms smoothing_window_width <- 1 dhistA_unsmoothed <- as_unsmoothed_dhist(dhistA) dhistB_unsmoothed <- as_unsmoothed_dhist(dhistB) dhistA_smoothed <- as_smoothed_dhist(dhistA, smoothing_window_width) dhistB_smoothed <- as_smoothed_dhist(dhistB, smoothing_window_width) - + # Set expected area expected_area_unsmoothed <- 4 expected_area_smoothed <- 3 - + # Generate ecmfs ecmfA_unsmoothed <- dhist_ecmf(dhistA_unsmoothed) ecmfB_unsmoothed <- dhist_ecmf(dhistB_unsmoothed) @@ -486,7 +580,7 @@ test_that("area_between_dhist_ecmfs returns correct value for simple integer dhi # Calculate area between ECMFs actual_area_unsmoothed <- area_between_dhist_ecmfs(ecmfA_unsmoothed, ecmfB_unsmoothed) actual_area_smoothed <- area_between_dhist_ecmfs(ecmfA_smoothed, ecmfB_smoothed) - + # Compare caculated areas with expected areas expect_equal(actual_area_unsmoothed, expected_area_unsmoothed) expect_equal(actual_area_smoothed, expected_area_smoothed) @@ -494,29 +588,29 @@ test_that("area_between_dhist_ecmfs returns correct value for simple integer dhi context("dhist: Area between ECMFs (non-integer normalised dhists)") test_that("area_between_dhist_ecmfs returns correct value for non-integer normalised dhists", { - - # Previous simple integer grid where both histograms have been separately - # normalised to unit mass and variance. Has locations and masses at a range + + # Previous simple integer grid where both histograms have been separately + # normalised to unit mass and variance. Has locations and masses at a range # of floating point locations. Has bowties, triangles and trapeziums. dhistA <- dhist(locations = c(1, 3, 4), masses = c(2, 1, 1)) dhistB <- dhist(locations = c(0, 2, 4, 5), masses = c(0.5, 2, 0.5, 1)) dhistA <- normalise_dhist_mass(normalise_dhist_variance(dhistA)) dhistB <- normalise_dhist_mass(normalise_dhist_variance(dhistB)) - + # Set up smoothed and unsmoothed versions of histograms smoothing_window_width <- 1 dhistA_unsmoothed <- as_unsmoothed_dhist(dhistA) dhistB_unsmoothed <- as_unsmoothed_dhist(dhistB) dhistA_smoothed <- as_smoothed_dhist(dhistA, smoothing_window_width) dhistB_smoothed <- as_smoothed_dhist(dhistB, smoothing_window_width) - + # Generate ecmfs ecmfA_unsmoothed <- dhist_ecmf(dhistA_unsmoothed) ecmfB_unsmoothed <- dhist_ecmf(dhistB_unsmoothed) ecmfA_smoothed <- dhist_ecmf(dhistA_smoothed) ecmfB_smoothed <- dhist_ecmf(dhistB_smoothed) - + # Define some functions to make calculation of manually measured areas easier rectangle_area <- function(width, height) { return(width * height) @@ -532,42 +626,46 @@ test_that("area_between_dhist_ecmfs returns correct value for non-integer normal # Actual grid counts preserved in data to facilitate less tedious manual # checking if required # --- Unsmoothed --- - area_A_unsmoothed <- rectangle_area(width = 10*0.02, height = 12.5*0.01) - area_B_unsmoothed <- rectangle_area(width = 50.5*0.02, height = 37.5*0.01) - area_C_unsmoothed <- rectangle_area(width = 26*0.02, height = 12.5*0.01) - area_D_unsmoothed <- rectangle_area(width = 34.5*0.02, height = 12.5*0.01) - area_E_unsmoothed <- rectangle_area(width = 26.5*0.02, height = 25*0.01) - expected_area_unsmoothed <- - sum(area_A_unsmoothed, area_B_unsmoothed, area_C_unsmoothed, - area_D_unsmoothed, area_E_unsmoothed) + area_A_unsmoothed <- rectangle_area(width = 10 * 0.02, height = 12.5 * 0.01) + area_B_unsmoothed <- rectangle_area(width = 50.5 * 0.02, height = 37.5 * 0.01) + area_C_unsmoothed <- rectangle_area(width = 26 * 0.02, height = 12.5 * 0.01) + area_D_unsmoothed <- rectangle_area(width = 34.5 * 0.02, height = 12.5 * 0.01) + area_E_unsmoothed <- rectangle_area(width = 26.5 * 0.02, height = 25 * 0.01) + expected_area_unsmoothed <- + sum( + area_A_unsmoothed, area_B_unsmoothed, area_C_unsmoothed, + area_D_unsmoothed, area_E_unsmoothed + ) # --- Smoothed --- - area_A_smoothed <- triangle_area(base = 2.75*0.01, height = 6.5*0.02) - area_B_smoothed <- triangle_area(base = 2.75*0.01, height = 3*0.02) - area_C_smoothed <- triangle_area(base = 18.5*0.01, height = 21*0.02) - area_D_smoothed <- trapezium_area(side_a = 18.5*0.01, side_b = 37.5*0.01, height = 14.5*0.02) - area_E_smoothed <- trapezium_area(side_a = 37.5*0.01, side_b = 37.5*0.01, height = 16*0.02) - area_F_smoothed <- triangle_area(base = 37.5*0.01, height = 22.5*0.02) - area_G_smoothed <- triangle_area(base = 7.5*0.01, height = 8*0.02) - area_H_smoothed <- triangle_area(base = 7.5*0.01, height = 11*0.02) - area_I_smoothed <- triangle_area(base = 12.5*0.01, height = 19.5*0.02) - area_J_smoothed <- trapezium_area(side_a = 12.5*0.01, side_b = 20*0.01, height = 30.5*0.02) - area_K_smoothed <- trapezium_area(side_a = 20*0.01, side_b = 18*0.01, height = 8*0.02) - area_L_smoothed <- triangle_area(base = 18*0.01, height = 22*0.02) - expected_area_smoothed <- - sum(area_A_smoothed, area_B_smoothed, area_C_smoothed, area_D_smoothed, - area_E_smoothed, area_F_smoothed, area_G_smoothed, area_H_smoothed, - area_I_smoothed, area_J_smoothed, area_K_smoothed, area_L_smoothed) - + area_A_smoothed <- triangle_area(base = 2.75 * 0.01, height = 6.5 * 0.02) + area_B_smoothed <- triangle_area(base = 2.75 * 0.01, height = 3 * 0.02) + area_C_smoothed <- triangle_area(base = 18.5 * 0.01, height = 21 * 0.02) + area_D_smoothed <- trapezium_area(side_a = 18.5 * 0.01, side_b = 37.5 * 0.01, height = 14.5 * 0.02) + area_E_smoothed <- trapezium_area(side_a = 37.5 * 0.01, side_b = 37.5 * 0.01, height = 16 * 0.02) + area_F_smoothed <- triangle_area(base = 37.5 * 0.01, height = 22.5 * 0.02) + area_G_smoothed <- triangle_area(base = 7.5 * 0.01, height = 8 * 0.02) + area_H_smoothed <- triangle_area(base = 7.5 * 0.01, height = 11 * 0.02) + area_I_smoothed <- triangle_area(base = 12.5 * 0.01, height = 19.5 * 0.02) + area_J_smoothed <- trapezium_area(side_a = 12.5 * 0.01, side_b = 20 * 0.01, height = 30.5 * 0.02) + area_K_smoothed <- trapezium_area(side_a = 20 * 0.01, side_b = 18 * 0.01, height = 8 * 0.02) + area_L_smoothed <- triangle_area(base = 18 * 0.01, height = 22 * 0.02) + expected_area_smoothed <- + sum( + area_A_smoothed, area_B_smoothed, area_C_smoothed, area_D_smoothed, + area_E_smoothed, area_F_smoothed, area_G_smoothed, area_H_smoothed, + area_I_smoothed, area_J_smoothed, area_K_smoothed, area_L_smoothed + ) + # Calculate area between ECMFs actual_area_unsmoothed <- area_between_dhist_ecmfs(ecmfA_unsmoothed, ecmfB_unsmoothed) actual_area_smoothed <- area_between_dhist_ecmfs(ecmfA_smoothed, ecmfB_smoothed) - + # Compare caculated areas with expected areas expect_equalish_manual <- function(actual, expected, relative_tolerance) { relative_diff <- abs(actual - expected) / expected expect_lte(relative_diff, relative_tolerance) } - + # Given manual measurement of areas between curves, consider area correct # if actual and expected areas are within 1% of each other expect_equalish_manual(actual_area_unsmoothed, expected_area_unsmoothed, 0.01) @@ -578,7 +676,7 @@ context("dhist: Harmonise dhist locations") test_that("harmonise_dhist_locations works A", { dhist1 <- dhist(masses = c(11, 12, 13), locations = c(1, 3, 5), smoothing_window_width = 1, sorted = FALSE) dhist2 <- dhist(masses = c(21, 22, 23), locations = c(2, 4, 6), smoothing_window_width = 1, sorted = FALSE) - + expected <- list( dhist1 = dhist(masses = c(11, 12, 13, 0, 0, 0), locations = c(1, 3, 5, 2, 4, 6), smoothing_window_width = 1, sorted = FALSE), dhist2 = dhist(masses = c(21, 22, 23, 0, 0, 0), locations = c(2, 4, 6, 1, 3, 5), smoothing_window_width = 1, sorted = FALSE) @@ -590,7 +688,7 @@ test_that("harmonise_dhist_locations works A", { test_that("harmonise_dhist_locations works B", { dhist1 <- dhist(masses = c(1, 1, 1), locations = c(1, 3, 5), smoothing_window_width = 1, sorted = FALSE) dhist2 <- dhist(masses = c(1, 1, 1), locations = c(4, 5, 6), smoothing_window_width = 1, sorted = FALSE) - + expected <- list( dhist1 = dhist(masses = c(1, 1, 1, 0, 0), locations = c(1, 3, 5, 4, 6), smoothing_window_width = 1, sorted = FALSE), dhist2 = dhist(masses = c(1, 1, 1, 0, 0), locations = c(4, 5, 6, 1, 3), smoothing_window_width = 1, sorted = FALSE) diff --git a/tests/testthat/test_emd.R b/tests/testthat/test_emd.R index 58e39c22..960a00c0 100644 --- a/tests/testthat/test_emd.R +++ b/tests/testthat/test_emd.R @@ -1,275 +1,336 @@ context("EMD: Cost matrix") # COST_MATRIX: Property-based tests -test_that("cost_matrix returns all zeros when all bin locations are identical",{ +test_that("cost_matrix returns all zeros when all bin locations are identical", { bin_centres1 <- c(1, 1, 1, 1, 1, 1, 1) bin_centres2 <- bin_centres1 - expected <- matrix(0, nrow = length(bin_centres1), - ncol = length(bin_centres2)) + expected <- matrix(0, + nrow = length(bin_centres1), + ncol = length(bin_centres2) + ) expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) }) test_that("cost_matrix returns zeros along diagonal when both sets of bin locations are the same", { - bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) - bin_centres2 <- bin_centres1 - expected <- rep(0, length(bin_centres1)) - expect_equal(diag(cost_matrix(bin_centres1, bin_centres2)), expected) - }) + bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) + bin_centres2 <- bin_centres1 + expected <- rep(0, length(bin_centres1)) + expect_equal(diag(cost_matrix(bin_centres1, bin_centres2)), expected) +}) test_that("cost_matrix returns zeros along diagonal and taxicab distance from all zeros for all other elements when both sets of bin locations are the same and are a sequence of consecutive integers", { - bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) - bin_centres2 <- bin_centres1 - num_bins <- length(bin_centres1) - expected <- toeplitz(1:num_bins)-1 - expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) - }) + bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) + bin_centres2 <- bin_centres1 + num_bins <- length(bin_centres1) + expected <- toeplitz(1:num_bins) - 1 + expect_equal(cost_matrix(bin_centres1, bin_centres2), expected) +}) test_that("cost_matrix is correct size when the two histograms are of different lengths", { - bin_centres1 <- c(1, 2, 3, 4, 5, 6, 7) - bin_centres2 <- c(8, 9, 10) - - cm <- cost_matrix(bin_centres1, bin_centres2) - - expect_equal(nrow(cm), length(bin_centres1)) - expect_equal(ncol(cm), length(bin_centres2)) - }) + bin_centres1 <- c(1, 2, 3, 4, 5, 6, 7) + bin_centres2 <- c(8, 9, 10) + + cm <- cost_matrix(bin_centres1, bin_centres2) + + expect_equal(nrow(cm), length(bin_centres1)) + expect_equal(ncol(cm), length(bin_centres2)) +}) context("EMD: EMD") # EMD: Property-based tests test_that("EMD methods return 0 when comparing a 1D feature distribution to - itself",{ - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) - bin_masses2 <- bin_masses1 - bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) - bin_centres2 <- bin_centres1 - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - - expected <- 0 - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - }) + itself", { + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) + bin_masses2 <- bin_masses1 + bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) + bin_centres2 <- bin_centres1 + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + + expected <- 0 + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) +}) test_that("EMD methods return numBins/2 when offsetting a symmetric discrete triangle distribution by 1", { - cost_fn <- function(triangle_width) { - move_dist <- ceiling((triangle_width+1)/2) - num_moves <- ceiling(triangle_width/2) - return(move_dist * num_moves) - } - - # Triangle(4, even), shifting by changing masses - bin_masses1 <- c(0, 1, 2, 3, 4, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 0, 1, 2, 3, 4, 4, 3, 2, 1) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - num_nonzero_bins <- sum(bin_masses1 > 0) - expected <- cost_fn(num_nonzero_bins) - emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(4, even), shifting by changing centres - bin_masses1 <- c(0, 1, 2, 3, 4, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 1, 2, 3, 4, 4, 3, 2, 1, 0) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) + 1 - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - num_nonzero_bins <- sum(bin_masses1 > 0) - expected <- cost_fn(num_nonzero_bins) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(5, odd), shifting by changing masses - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 0, 1, 2, 3, 4, 5, 4, 3, 2, 1) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - expected <- cost_fn(sum(bin_masses1 > 0)) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(5, odd), shifting by changing masses - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) + 1 - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - expected <- cost_fn(sum(bin_masses1 > 0)) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(5, even), shifting by changing masses - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - expected <- cost_fn(sum(bin_masses1 > 0)) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(5, even), shifting by changing centres - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) + 1 - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - expected <- cost_fn(sum(bin_masses1 > 0)) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(6, odd), shifting by changing masses - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - expected <- cost_fn(sum(bin_masses1 > 0)) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - # Triangle(6, odd), shifting by changing centres - bin_masses1 <- c(0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 0) - bin_masses2 <- c(0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 0) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) + 1 - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - expected <- cost_fn(sum(bin_masses1 > 0)) - expect_equal(emd_lp(bin_masses1, bin_masses2, - bin_centres1, bin_centres2), expected) - expect_equal(emd_cs(histogram1, histogram2), expected) - expect_equal(emd(histogram1, histogram2), expected) - - }) + cost_fn <- function(triangle_width) { + move_dist <- ceiling((triangle_width + 1) / 2) + num_moves <- ceiling(triangle_width / 2) + return(move_dist * num_moves) + } + + # Triangle(4, even), shifting by changing masses + bin_masses1 <- c(0, 1, 2, 3, 4, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 0, 1, 2, 3, 4, 4, 3, 2, 1) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + num_nonzero_bins <- sum(bin_masses1 > 0) + expected <- cost_fn(num_nonzero_bins) + emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(4, even), shifting by changing centres + bin_masses1 <- c(0, 1, 2, 3, 4, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 1, 2, 3, 4, 4, 3, 2, 1, 0) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + 1 + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + num_nonzero_bins <- sum(bin_masses1 > 0) + expected <- cost_fn(num_nonzero_bins) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(5, odd), shifting by changing masses + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 0, 1, 2, 3, 4, 5, 4, 3, 2, 1) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + expected <- cost_fn(sum(bin_masses1 > 0)) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(5, odd), shifting by changing masses + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + 1 + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + expected <- cost_fn(sum(bin_masses1 > 0)) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(5, even), shifting by changing masses + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + expected <- cost_fn(sum(bin_masses1 > 0)) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(5, even), shifting by changing centres + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1, 0) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + 1 + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + expected <- cost_fn(sum(bin_masses1 > 0)) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(6, odd), shifting by changing masses + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + expected <- cost_fn(sum(bin_masses1 > 0)) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) + + # Triangle(6, odd), shifting by changing centres + bin_masses1 <- c(0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 0) + bin_masses2 <- c(0, 1, 2, 3, 4, 5, 6, 5, 4, 3, 2, 1, 0) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + 1 + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + expected <- cost_fn(sum(bin_masses1 > 0)) + expect_equal(emd_lp( + bin_masses1, bin_masses2, + bin_centres1, bin_centres2 + ), expected) + expect_equal(emd_cs(histogram1, histogram2), expected) + expect_equal(emd(histogram1, histogram2), expected) +}) test_that("EMD methods return same result for densely and sparsely specified bins", { - sparse_bin_masses1 <- c(1, 1, 1, 1, 1, 1) - sparse_bin_masses2 <- c(1, 1, 1, 1, 1, 1) - sparse_bin_centres1 <- c(1, 2, 4, 7, 11, 16) - sparse_bin_centres2 <- c(21, 22, 24, 27, 31, 36) - sparse_histogram1 <- dhist(masses = sparse_bin_masses1, - locations = sparse_bin_centres1) - sparse_histogram2 <- dhist(masses = sparse_bin_masses2, - locations = sparse_bin_centres2) - - dense_bin_centres1 <- 1:36 - dense_bin_centres2 <- dense_bin_centres1 - bin_mass_sequence <- c(1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1) - bin_mass_padding <- rep(0, length(dense_bin_centres1) - - length(bin_mass_sequence)) - dense_bin_masses1 <- c(bin_mass_sequence, bin_mass_padding) - dense_bin_masses2 <- c(bin_mass_padding, bin_mass_sequence) - dense_histogram1 <- dhist(masses = dense_bin_masses1, - locations = dense_bin_centres1) - dense_histogram2 <- dhist(masses = dense_bin_masses2, - locations = dense_bin_centres2) - - expect_equal(emd_lp(dense_bin_masses1, dense_bin_masses2, - dense_bin_centres1, dense_bin_centres2), - emd_lp(sparse_bin_masses1, sparse_bin_masses2, - sparse_bin_centres1, sparse_bin_centres2)) - expect_equal(emd_cs(dense_histogram1, dense_histogram2), - emd_cs(sparse_histogram1,sparse_histogram2)) - expect_equal(emd(dense_histogram1, dense_histogram2), - emd(sparse_histogram1, sparse_histogram2)) - }) + sparse_bin_masses1 <- c(1, 1, 1, 1, 1, 1) + sparse_bin_masses2 <- c(1, 1, 1, 1, 1, 1) + sparse_bin_centres1 <- c(1, 2, 4, 7, 11, 16) + sparse_bin_centres2 <- c(21, 22, 24, 27, 31, 36) + sparse_histogram1 <- dhist( + masses = sparse_bin_masses1, + locations = sparse_bin_centres1 + ) + sparse_histogram2 <- dhist( + masses = sparse_bin_masses2, + locations = sparse_bin_centres2 + ) + + dense_bin_centres1 <- 1:36 + dense_bin_centres2 <- dense_bin_centres1 + bin_mass_sequence <- c(1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1) + bin_mass_padding <- rep(0, length(dense_bin_centres1) - + length(bin_mass_sequence)) + dense_bin_masses1 <- c(bin_mass_sequence, bin_mass_padding) + dense_bin_masses2 <- c(bin_mass_padding, bin_mass_sequence) + dense_histogram1 <- dhist( + masses = dense_bin_masses1, + locations = dense_bin_centres1 + ) + dense_histogram2 <- dhist( + masses = dense_bin_masses2, + locations = dense_bin_centres2 + ) + + expect_equal( + emd_lp( + dense_bin_masses1, dense_bin_masses2, + dense_bin_centres1, dense_bin_centres2 + ), + emd_lp( + sparse_bin_masses1, sparse_bin_masses2, + sparse_bin_centres1, sparse_bin_centres2 + ) + ) + expect_equal( + emd_cs(dense_histogram1, dense_histogram2), + emd_cs(sparse_histogram1, sparse_histogram2) + ) + expect_equal( + emd(dense_histogram1, dense_histogram2), + emd(sparse_histogram1, sparse_histogram2) + ) +}) test_that("EMD methods return same result when order of densely specified bins is changed", { - bin_masses1 <- c(1, 1, 1, 1, 0, 0, 0, 0, 0) - bin_masses2 <- c(0, 0, 0, 0, 0, 1, 1, 1, 1) - bin_centres1 <- 1:length(bin_masses1) - bin_centres2 <- 1:length(bin_masses2) - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - - permuted_indexes1 <- sample(1:length(bin_masses1), replace = FALSE) - permuted_indexes2 <- sample(1:length(bin_masses2), replace = FALSE) - - permuted_bin_masses1 <- bin_masses1[permuted_indexes1] - permuted_bin_centres1 <- bin_centres1[permuted_indexes1] - permuted_bin_masses2 <- bin_masses2[permuted_indexes2] - permuted_bin_centres2 <- bin_centres2[permuted_indexes2] - permuted_histogram1 <- dhist(masses = permuted_bin_masses1, - locations = permuted_bin_centres1) - permuted_histogram2 <- dhist(masses = permuted_bin_masses2, - locations = permuted_bin_centres2) - - expect_equal(emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2), - emd_lp(permuted_bin_masses1, permuted_bin_masses2, - permuted_bin_centres1, permuted_bin_centres2)) - expect_equal(emd_cs(histogram1, histogram2), - emd_cs(permuted_histogram1, permuted_histogram2)) - expect_equal(emd(histogram1, histogram2), - emd(permuted_histogram1, permuted_histogram2)) - }) + bin_masses1 <- c(1, 1, 1, 1, 0, 0, 0, 0, 0) + bin_masses2 <- c(0, 0, 0, 0, 0, 1, 1, 1, 1) + bin_centres1 <- 1:length(bin_masses1) + bin_centres2 <- 1:length(bin_masses2) + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + + permuted_indexes1 <- sample(1:length(bin_masses1), replace = FALSE) + permuted_indexes2 <- sample(1:length(bin_masses2), replace = FALSE) + + permuted_bin_masses1 <- bin_masses1[permuted_indexes1] + permuted_bin_centres1 <- bin_centres1[permuted_indexes1] + permuted_bin_masses2 <- bin_masses2[permuted_indexes2] + permuted_bin_centres2 <- bin_centres2[permuted_indexes2] + permuted_histogram1 <- dhist( + masses = permuted_bin_masses1, + locations = permuted_bin_centres1 + ) + permuted_histogram2 <- dhist( + masses = permuted_bin_masses2, + locations = permuted_bin_centres2 + ) + + expect_equal( + emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2), + emd_lp( + permuted_bin_masses1, permuted_bin_masses2, + permuted_bin_centres1, permuted_bin_centres2 + ) + ) + expect_equal( + emd_cs(histogram1, histogram2), + emd_cs(permuted_histogram1, permuted_histogram2) + ) + expect_equal( + emd(histogram1, histogram2), + emd(permuted_histogram1, permuted_histogram2) + ) +}) test_that("EMD methods return same result when order of sparsely specified bins is changed", { - bin_masses1 <- c(1, 1, 1, 1, 1, 1) - bin_masses2 <- c(1, 1, 1, 1, 1, 1) - bin_centres1 <- c(1, 2, 4, 8, 16, 32) - bin_centres2 <- c(-32, -16, -8, -4, -2, -1) - histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) - histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) - - permuted_indexes1 <- sample(1:length(bin_masses1), replace = FALSE) - permuted_indexes2 <- sample(1:length(bin_masses2), replace = FALSE) - - permuted_bin_masses1 <- bin_masses1[permuted_indexes1] - permuted_bin_centres1 <- bin_centres1[permuted_indexes1] - permuted_bin_masses2 <- bin_masses2[permuted_indexes2] - permuted_bin_centres2 <- bin_centres2[permuted_indexes2] - permuted_histogram1 <- dhist(masses = permuted_bin_masses1, - locations = permuted_bin_centres1) - permuted_histogram2 <- dhist(masses = permuted_bin_masses2, - locations = permuted_bin_centres2) - - expect_equal(emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2), - emd_lp(permuted_bin_masses1, permuted_bin_masses2, - permuted_bin_centres1, permuted_bin_centres2)) - expect_equal(emd_cs(histogram1, histogram2), - emd_cs(permuted_histogram1, permuted_histogram2)) - expect_equal(emd(histogram1, histogram2), - emd(permuted_histogram1, permuted_histogram2)) - }) + bin_masses1 <- c(1, 1, 1, 1, 1, 1) + bin_masses2 <- c(1, 1, 1, 1, 1, 1) + bin_centres1 <- c(1, 2, 4, 8, 16, 32) + bin_centres2 <- c(-32, -16, -8, -4, -2, -1) + histogram1 <- dhist(masses = bin_masses1, locations = bin_centres1) + histogram2 <- dhist(masses = bin_masses2, locations = bin_centres2) + + permuted_indexes1 <- sample(1:length(bin_masses1), replace = FALSE) + permuted_indexes2 <- sample(1:length(bin_masses2), replace = FALSE) + + permuted_bin_masses1 <- bin_masses1[permuted_indexes1] + permuted_bin_centres1 <- bin_centres1[permuted_indexes1] + permuted_bin_masses2 <- bin_masses2[permuted_indexes2] + permuted_bin_centres2 <- bin_centres2[permuted_indexes2] + permuted_histogram1 <- dhist( + masses = permuted_bin_masses1, + locations = permuted_bin_centres1 + ) + permuted_histogram2 <- dhist( + masses = permuted_bin_masses2, + locations = permuted_bin_centres2 + ) + + expect_equal( + emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2), + emd_lp( + permuted_bin_masses1, permuted_bin_masses2, + permuted_bin_centres1, permuted_bin_centres2 + ) + ) + expect_equal( + emd_cs(histogram1, histogram2), + emd_cs(permuted_histogram1, permuted_histogram2) + ) + expect_equal( + emd(histogram1, histogram2), + emd(permuted_histogram1, permuted_histogram2) + ) +}) context("EMD: Next step") test_that("next_step gives correct shift and matrix for simple x1, x2", { x1 <- c(-3000, -2000, -1000, 0, 1000, 2000, 3000, 4000) x2 <- c(-3100, -2100, -1100, 10, 100, 1100, 2100, 3100, 4100) - + expected_shift <- 10 expected_distance_matrix <- rbind( c(-100, -1100, -2100, -3100, -4100, -5100, -6100, -7100), @@ -282,7 +343,7 @@ test_that("next_step gives correct shift and matrix for simple x1, x2", { c(6100, 5100, 4100, 3100, 2100, 1100, 100, -900), c(7100, 6100, 5100, 4100, 3100, 2100, 1100, 100) ) - expected_distance_matrix[expected_distance_matrix<=0] <- Inf + expected_distance_matrix[expected_distance_matrix <= 0] <- Inf expected <- list(shift = 10, distance_matrix = expected_distance_matrix) actual <- shift_to_next_alignment(x1, x2) expect_equal(actual, expected) @@ -295,7 +356,7 @@ test_that("next_step gives correct shift for random x1, x2", { x1_min <- -1000 x1_max <- 1000 x1_prec <- 10 - x1 <- unique(sort(trunc(runif(27, x1_min, x1_max)/x1_prec) * x1_prec)) + x1 <- unique(sort(trunc(runif(27, x1_min, x1_max) / x1_prec) * x1_prec)) # Initialise x2 to a copy of x1 with all elements shifted right by 40% of # the minimum spacing between elements std_shift <- 0.4 * x1_prec @@ -304,7 +365,7 @@ test_that("next_step gives correct shift for random x1, x2", { min_shift <- 0.1 * x1_prec x2_rand_ind <- trunc(runif(1, 1, length(x2) + 1)) x2[x2_rand_ind] <- x2[x2_rand_ind] - std_shift + min_shift - + expected_shift <- min_shift actual_shift <- shift_to_next_alignment(x1, x2)$shift expect_equal(actual_shift, expected_shift) @@ -314,14 +375,14 @@ test_that("next_step gives correct shift for random x1, x2", { context("EMD: MinEMD") test_that("min_emd_ methods correctly compare a non-offset 1D feature - distribution to itself",{ + distribution to itself", { bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) bin_masses2 <- bin_masses1 bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) bin_centres2 <- bin_centres1 dhist1 <- dhist(masses = bin_masses1, locations = bin_centres1) dhist2 <- dhist(masses = bin_masses2, locations = bin_centres2) - + expected <- list(min_emd = 0, min_offset = 0) # Check min_emd_optimise actual_optimise <- min_emd_exhaustive(dhist1, dhist2) @@ -332,8 +393,8 @@ test_that("min_emd_ methods correctly compare a non-offset 1D feature }) test_that("min_emd_ methods correctly compare an offset 1D feature - distribution to itself",{ - offset = 10 + distribution to itself", { + offset <- 10 bin_masses1 <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) bin_masses2 <- bin_masses1 bin_centres1 <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) @@ -344,7 +405,7 @@ test_that("min_emd_ methods correctly compare an offset 1D feature bin_centres2 <- bin_centres1 + offset dhist1 <- dhist(masses = bin_masses1, locations = bin_centres1) dhist2 <- dhist(masses = bin_masses2, locations = bin_centres2) - + expected <- list(min_emd = 0, min_offset = offset) # Check min_emd_optimise actual_optimise <- min_emd_exhaustive(dhist1, dhist2) diff --git a/tests/testthat/test_graph_binning.R b/tests/testthat/test_graph_binning.R index 958efa80..2e063f02 100644 --- a/tests/testthat/test_graph_binning.R +++ b/tests/testthat/test_graph_binning.R @@ -1,67 +1,79 @@ context("Graph binning: Adaptive binning") test_that("adaptive_breaks merges 2 lowest bins where only first bin is below minimum", { min_count <- 5 - x <- c(1.5, rep(2.2, min_count), rep(3.5, min_count), rep(4.5, min_count), - rep(5.5, min_count), rep(6.5, min_count + 1)) + x <- c( + 1.5, rep(2.2, min_count), rep(3.5, min_count), rep(4.5, min_count), + rep(5.5, min_count), rep(6.5, min_count + 1) + ) initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 3, 4, 5, 6, 7) - + expect_equal(final_breaks_actual, final_breaks_expected) }) test_that("adaptive_breaks merges 3 lowest bins where lowest 2 combined are below minimum", { min_count <- 5 - x <- c(1.5, rep(2.2, 2), rep(3.5, min_count), rep(4.5, min_count), - rep(5.5, min_count), rep(6.5, min_count + 1)) + x <- c( + 1.5, rep(2.2, 2), rep(3.5, min_count), rep(4.5, min_count), + rep(5.5, min_count), rep(6.5, min_count + 1) + ) initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 4, 5, 6, 7) - + expect_equal(final_breaks_actual, final_breaks_expected) }) test_that("adaptive_breaks merges pair of bins in middle", { min_count <- 5 - x <- c(rep(1.6, min_count), rep(2.2, min_count), rep(3.5, 2), rep(4.5, 3), - rep(5.5, min_count), rep(6.5, min_count + 1)) + x <- c( + rep(1.6, min_count), rep(2.2, min_count), rep(3.5, 2), rep(4.5, 3), + rep(5.5, min_count), rep(6.5, min_count + 1) + ) initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 2, 3, 5, 6, 7) - + expect_equal(final_breaks_actual, final_breaks_expected) }) test_that("adaptive_breaks merges two spearated pairs of bins in middle", { min_count <- 5 - x <- c(rep(1.6, min_count), rep(2.2, 2), rep(3.5, 3), rep(4.5, min_count), - rep(5.5, 3), rep(6.5, 2), rep(7.8, min_count)) + x <- c( + rep(1.6, min_count), rep(2.2, 2), rep(3.5, 3), rep(4.5, min_count), + rep(5.5, 3), rep(6.5, 2), rep(7.8, min_count) + ) initial_breaks <- 1:8 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 2, 4, 5, 7, 8) - + expect_equal(final_breaks_actual, final_breaks_expected) }) test_that("adaptive_breaks merges 2 uppermost bins where both are below minimum", { min_count <- 5 - x <- c(rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), - rep(4.5, min_count), rep(5.5, 2), rep(6.5, 3)) + x <- c( + rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), + rep(4.5, min_count), rep(5.5, 2), rep(6.5, 3) + ) initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) - final_breaks_expected <- c(1, 2,3, 4, 5, 7) - + final_breaks_expected <- c(1, 2, 3, 4, 5, 7) + expect_equal(final_breaks_actual, final_breaks_expected) }) test_that("adaptive_breaks merges 2 uppermost bins where only last bin is below minimum", { min_count <- 5 - x <- c(rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), - rep(4.5, min_count), rep(5.5, min_count), rep(6.5, 3)) + x <- c( + rep(1.5, min_count), rep(2.2, min_count), rep(3.5, min_count), + rep(4.5, min_count), rep(5.5, min_count), rep(6.5, 3) + ) initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 2, 3, 4, 5, 7) - + expect_equal(final_breaks_actual, final_breaks_expected) }) @@ -71,7 +83,7 @@ test_that("adaptive_breaks merges bins with no members with the next bin", { initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 2, 6, 7) - + expect_equal(final_breaks_actual, final_breaks_expected) }) @@ -81,23 +93,27 @@ test_that("adaptive_breaks merges 2 bins below minimum, plus the empty bins betw initial_breaks <- 1:7 final_breaks_actual <- adaptive_breaks(x, min_count = min_count, breaks = initial_breaks) final_breaks_expected <- c(1, 2, 6, 7) - + expect_equal(final_breaks_actual, final_breaks_expected) }) context("Graph binning: Adaptively binned densities") test_that("binned_densities_adaptive works", { # Helper function - test_binning <- function(densities, min_counts_per_interval, num_intervals, - breaks, expected_breaks, expected_interval_indexes) { + test_binning <- function(densities, min_counts_per_interval, num_intervals, + breaks, expected_breaks, expected_interval_indexes) { # Set up expected output - expected <- list(densities = densities, - interval_indexes = expected_interval_indexes, - breaks = expected_breaks) + expected <- list( + densities = densities, + interval_indexes = expected_interval_indexes, + breaks = expected_breaks + ) # Calculate actual output actual <- binned_densities_adaptive( - densities, min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals) + densities, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) # Check actual matches expected expect_equal(actual, expected) } @@ -105,27 +121,32 @@ test_that("binned_densities_adaptive works", { densities <- c(0, 0.099, 0.2, 0.299, 0.4, 0.49, 0.6, 0.699, 0.8, 0.899, 1.0) min_counts_per_interval <- 2 num_intervals <- 100 - expected_breaks <-c(0, 0.1, 0.3, 0.5, 0.7, 1.0) + expected_breaks <- c(0, 0.1, 0.3, 0.5, 0.7, 1.0) expected_interval_indexes <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5) - test_binning(densities, min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals, expected_breaks = expected_breaks, - expected_interval_indexes = expected_interval_indexes) + test_binning(densities, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals, expected_breaks = expected_breaks, + expected_interval_indexes = expected_interval_indexes + ) # Test 2: densities <- c(0, 0.012, 0.099, 0.201, 0.299, 0.402, 0.49, 0.596, 0.699, 0.803, 0.899, 1.0) min_counts_per_interval <- 2 num_intervals <- 100 - expected_breaks <-c(0, 0.02, 0.21, 0.41, 0.6, 0.81, 1.0) + expected_breaks <- c(0, 0.02, 0.21, 0.41, 0.6, 0.81, 1.0) expected_interval_indexes <- c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6) - test_binning(densities, min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals, expected_breaks = expected_breaks, - expected_interval_indexes = expected_interval_indexes) + test_binning(densities, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals, expected_breaks = expected_breaks, + expected_interval_indexes = expected_interval_indexes + ) }) -expected_binned_graphlet_counts <- +expected_binned_graphlet_counts <- function(graphs, binning_fn, max_graphlet_size) { binned_graphs <- binning_fn(graphs) - ref_counts <- purrr::map(binned_graphs$graphs, count_graphlets_for_graph, - max_graphlet_size) + ref_counts <- purrr::map( + binned_graphs$graphs, count_graphlets_for_graph, + max_graphlet_size + ) ref_counts } - diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 95cf4777..31ad4454 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -2,10 +2,12 @@ context("Measures Netdis: Graphlet tuples") test_message <- paste("count_graphlet_tuples and count_graphlet_tuples_ego give", - "choose(node_count, graphlet_size) for each graph + graphlet", - "combination",sep = " ") + "choose(node_count, graphlet_size) for each graph + graphlet", + "combination", + sep = " " + ) test_that(test_message, { - # Create some test graphs with known node counts (this is the only graph + # Create some test graphs with known node counts (this is the only graph # property we care about for this test) graph_n11 <- igraph::erdos.renyi.game(11, p = 1, type = "gnp") graph_n37 <- igraph::erdos.renyi.game(37, p = 1, type = "gnp") @@ -13,22 +15,22 @@ test_that(test_message, { # Calculate expected graph tuple count for graphlets of various sizes. There # is 1 graphlet of size 1, 2 of size 3, 6 of size 4, and 21 of size 5 graphlet_tuple_counts <- function(n, max_graphlet_size) { - if(max_graphlet_size >= 2) { + if (max_graphlet_size >= 2) { tuple_counts <- rep(choose(n, 2), 1) } - if(max_graphlet_size >= 3) { + if (max_graphlet_size >= 3) { tuple_counts <- c(tuple_counts, rep(choose(n, 3), 2)) } - if(max_graphlet_size >= 4) { + if (max_graphlet_size >= 4) { tuple_counts <- c(tuple_counts, rep(choose(n, 4), 6)) } - if(max_graphlet_size >= 5) { + if (max_graphlet_size >= 5) { tuple_counts <- c(tuple_counts, rep(choose(n, 5), 21)) } tuple_counts <- setNames(tuple_counts, graphlet_key(max_graphlet_size)$id) tuple_counts } - + # === TEST count_graphlet_tuples === # Generate expected tuple counts for graphlets up to size 4 and 5 expected_tuple_count_n11_gs4 <- graphlet_tuple_counts(11, 4) @@ -51,13 +53,14 @@ test_that(test_message, { expect_equal(expected_tuple_count_n11_gs5, actual_tuple_count_n11_gs5) expect_equal(expected_tuple_count_n37_gs5, actual_tuple_count_n37_gs5) expect_equal(expected_tuple_count_n73_gs5, actual_tuple_count_n73_gs5) - + # === TEST count_graphlet_tuples_ego === - # NOTE: This test is not amazing, as graphlet_tuple_counts_ego is very similar + # NOTE: This test is not amazing, as graphlet_tuple_counts_ego is very similar # to the method under test. However, it's a simple method so maybe that's ok? graphlet_tuple_counts_ego <- function(ego_networks, max_graphlet_size) { t(sapply(ego_networks, FUN = function(g) { - graphlet_tuple_counts(length(igraph::V(g)), max_graphlet_size)})) + graphlet_tuple_counts(length(igraph::V(g)), max_graphlet_size) + })) } # Generate ego networks for each graph graph_n11_ego1 <- make_named_ego_graph(graph_n11, order = 1) @@ -68,7 +71,7 @@ test_that(test_message, { graph_n73_ego2 <- make_named_ego_graph(graph_n73, order = 2) # Generate expected tuple counts for graphlets up to size 4 and 5 # 1. For ego-networks of order 1 - expected_tuple_count_n11_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego1, 4) + expected_tuple_count_n11_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego1, 4) expected_tuple_count_n37_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n37_ego1, 4) expected_tuple_count_n73_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n73_ego1, 4) expected_tuple_count_n11_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n11_ego1, 5) @@ -81,7 +84,7 @@ test_that(test_message, { expected_tuple_count_n11_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n11_ego2, 5) expected_tuple_count_n37_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n37_ego2, 5) expected_tuple_count_n73_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n73_ego2, 5) - + # Calculate actual tuple counts # 1. For ego-networks of order 1 actual_tuple_count_n11_ego1_gs4 <- count_graphlet_tuples_ego(graph_n11_ego1, 4) @@ -97,7 +100,7 @@ test_that(test_message, { actual_tuple_count_n11_ego2_gs5 <- count_graphlet_tuples_ego(graph_n11_ego2, 5) actual_tuple_count_n37_ego2_gs5 <- count_graphlet_tuples_ego(graph_n37_ego2, 5) actual_tuple_count_n73_ego2_gs5 <- count_graphlet_tuples_ego(graph_n73_ego2, 5) - + # Compare expected with actual # 1. For ego-networks of order 1 expect_equal(expected_tuple_count_n11_ego1_gs4, actual_tuple_count_n11_ego1_gs4) @@ -116,48 +119,52 @@ test_that(test_message, { }) context("Measures Netdis: Ego-network scaled graphlet outputs for manually verified networks") -test_that("Ego-network 4-node graphlet counts match manually verified totals",{ +test_that("Ego-network 4-node graphlet counts match manually verified totals", { # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - + # Set node and graphlet labels to use for row and col names in expected counts node_labels <- igraph::V(graph)$name graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - + # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 max_graphlet_size <- 4 min_ego_edges <- 0 min_ego_nodes <- 0 - - actual_counts_order_1 <- - count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes, - neighbourhood_size = 1) - actual_counts_order_2 <- - count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes, - neighbourhood_size = 2) - + + actual_counts_order_1 <- + count_graphlets_ego_scaled(graph, + max_graphlet_size = max_graphlet_size, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes, + neighbourhood_size = 1 + ) + actual_counts_order_2 <- + count_graphlets_ego_scaled(graph, + max_graphlet_size = max_graphlet_size, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes, + neighbourhood_size = 2 + ) + graphlet_key <- graphlet_key(max_graphlet_size) k <- graphlet_key$node_count # Set manually verified counts @@ -169,7 +176,7 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals",{ c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) @@ -178,110 +185,134 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals",{ colnames(expected_counts_order_1) <- graphlet_labels # 2-step ego networks expected_counts_order_2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10 , k)), - c( 8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7 , k)), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5 , k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8 , k)), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5 , k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8 , k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8 , k)), - c(11, 10, 5, 10 ,0 ,1, 8, 0, 1) / zeros_to_ones(choose(7 , k)), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)) + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) ) rownames(expected_counts_order_2) <- node_labels colnames(expected_counts_order_2) <- graphlet_labels - + # Test that actual counts match expected with only counts requested (default) expect_equal(actual_counts_order_1, expected_counts_order_1) expect_equal(actual_counts_order_2, expected_counts_order_2) - + # Test that actual counts and returned ego networks match expected # 1. Define expected - expected_ego_networks_order_1 <- make_named_ego_graph(graph, order = 1, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes) - expected_ego_networks_order_2 <- make_named_ego_graph(graph, order = 2, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes) + expected_ego_networks_order_1 <- make_named_ego_graph(graph, + order = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + expected_ego_networks_order_2 <- make_named_ego_graph(graph, + order = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) expected_counts_with_networks_order_1 <- - list(graphlet_counts = expected_counts_order_1, - ego_networks = expected_ego_networks_order_1) - expected_counts_with_networks_order_2 <- - list(graphlet_counts = expected_counts_order_2, - ego_networks = expected_ego_networks_order_2) + list( + graphlet_counts = expected_counts_order_1, + ego_networks = expected_ego_networks_order_1 + ) + expected_counts_with_networks_order_2 <- + list( + graphlet_counts = expected_counts_order_2, + ego_networks = expected_ego_networks_order_2 + ) # 2. Calculate actual - actual_counts_with_networks_order_1 <- - count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes, - return_ego_networks = TRUE) - actual_counts_with_networks_order_2 <- - count_graphlets_ego_scaled(graph, max_graphlet_size = max_graphlet_size, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes, - neighbourhood_size = 2, return_ego_networks = TRUE) - + actual_counts_with_networks_order_1 <- + count_graphlets_ego_scaled(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes, + return_ego_networks = TRUE + ) + actual_counts_with_networks_order_2 <- + count_graphlets_ego_scaled(graph, + max_graphlet_size = max_graphlet_size, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes, + neighbourhood_size = 2, return_ego_networks = TRUE + ) + # 3. Compare - # Comparison is not implemented for igraph objects, so convert all igraphs to + # Comparison is not implemented for igraph objects, so convert all igraphs to # indexed edge list and then compare. Do in-situ replacement of igraphs with # indexed edge lists to ensure we are checking full properties of returned # objects (i.e. named lists with matching elements). # 3a. Convert expected and actual ego networks from igraphs to indexed edges - expected_counts_with_networks_order_1$ego_networks <- - purrr::map(expected_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges) - expected_counts_with_networks_order_2$ego_networks <- - purrr::map(expected_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges) - actual_counts_with_networks_order_1$ego_networks <- - purrr::map(actual_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges) - actual_counts_with_networks_order_2$ego_networks <- - purrr::map(actual_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges) + expected_counts_with_networks_order_1$ego_networks <- + purrr::map( + expected_counts_with_networks_order_1$ego_networks, + graph_to_indexed_edges + ) + expected_counts_with_networks_order_2$ego_networks <- + purrr::map( + expected_counts_with_networks_order_2$ego_networks, + graph_to_indexed_edges + ) + actual_counts_with_networks_order_1$ego_networks <- + purrr::map( + actual_counts_with_networks_order_1$ego_networks, + graph_to_indexed_edges + ) + actual_counts_with_networks_order_2$ego_networks <- + purrr::map( + actual_counts_with_networks_order_2$ego_networks, + graph_to_indexed_edges + ) # 3b. Do comparison - expect_equal(actual_counts_with_networks_order_1, - expected_counts_with_networks_order_1) - expect_equal(actual_counts_with_networks_order_2, - expected_counts_with_networks_order_2) + expect_equal( + actual_counts_with_networks_order_1, + expected_counts_with_networks_order_1 + ) + expect_equal( + actual_counts_with_networks_order_2, + expected_counts_with_networks_order_2 + ) }) context("Measures Netdis: Ego-network density-binned reference counts for manually verified networks") -test_that("Ego-network 4-node density-binned reference counts match manually verified totals",{ +test_that("Ego-network 4-node density-binned reference counts match manually verified totals", { # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - + # Set parameters for test - max_graphlet_size = 4 + max_graphlet_size <- 4 min_counts_per_interval <- 2 num_intervals <- 100 - + # Set node and graphlet labels to use for row and col names in expected counts node_labels <- igraph::V(graph)$name graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") # Set manually verified ego-network node counts and edge densities - #1 . Ego-networks of order 1 + # 1 . Ego-networks of order 1 expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) @@ -295,7 +326,7 @@ test_that("Ego-network 4-node density-binned reference counts match manually ver expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) # Order 2 expected densities should be: # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 - + # Set manually verified density bins for ego-networks # 1. Ego-networks of order 1 expected_breaks_o1 <- c(0.5, 0.605, 0.705, 1) @@ -307,13 +338,15 @@ test_that("Ego-network 4-node density-binned reference counts match manually ver ) # Check binned densities are as expected actual_binned_densities_o1 <- binned_densities_adaptive( - expected_densities_o1, min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals) + expected_densities_o1, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) expect_equal(actual_binned_densities_o1, expected_binned_densities_o1) # 2. Ego-networks of order 2 - expected_min_break_o2 <- 1/3 + expected_min_break_o2 <- 1 / 3 expected_max_break_o2 <- 0.6 - expected_initial_interval_o2 <- + expected_initial_interval_o2 <- (expected_max_break_o2 - expected_min_break_o2) / (num_intervals) # 0.00266666667 expected_breaks_o2 <- expected_min_break_o2 + (expected_initial_interval_o2 * c(0, 9, 50, 63, 100)) expected_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) @@ -324,10 +357,12 @@ test_that("Ego-network 4-node density-binned reference counts match manually ver ) # Check binned densities are as expected actual_binned_densities_o2 <- binned_densities_adaptive( - expected_densities_o2, min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals) + expected_densities_o2, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) expect_equal(actual_binned_densities_o2, expected_binned_densities_o2) - + # Set manually verified scaled ego-network graphlet counts graphlet_key <- graphlet_key(max_graphlet_size) k <- graphlet_key$node_count @@ -339,7 +374,7 @@ test_that("Ego-network 4-node density-binned reference counts match manually ver c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) @@ -348,57 +383,63 @@ test_that("Ego-network 4-node density-binned reference counts match manually ver colnames(expected_counts_o1) <- graphlet_labels # 2-step ego networks expected_counts_o2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10 , k)), - c( 8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7 , k)), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5 , k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8 , k)), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5 , k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8 , k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8 , k)), - c(11, 10, 5, 10 ,0 ,1, 8, 0, 1) / zeros_to_ones(choose(7 , k)), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)) + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) ) rownames(expected_counts_o2) <- node_labels colnames(expected_counts_o2) <- graphlet_labels - + # Calculate binned average expected counts based on manually verified counts # and density bins # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 - mean_counts_bin1_o1 <- (expected_counts_o1[1,] + expected_counts_o1[2,]) / 2 - mean_counts_bin2_o1 <- (expected_counts_o1[6,] + expected_counts_o1[7,] + - expected_counts_o1[8,]) / 3 - mean_counts_bin3_o1 <- (expected_counts_o1[3,] + expected_counts_o1[4,] + - expected_counts_o1[5,] + expected_counts_o1[9,] + - expected_counts_o1[10,]) / 5 + mean_counts_bin1_o1 <- (expected_counts_o1[1, ] + expected_counts_o1[2, ]) / 2 + mean_counts_bin2_o1 <- (expected_counts_o1[6, ] + expected_counts_o1[7, ] + + expected_counts_o1[8, ]) / 3 + mean_counts_bin3_o1 <- (expected_counts_o1[3, ] + expected_counts_o1[4, ] + + expected_counts_o1[5, ] + expected_counts_o1[9, ] + + expected_counts_o1[10, ]) / 5 expected_mean_density_binned_counts_o1 <- rbind( mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 ) rownames(expected_mean_density_binned_counts_o1) <- 1:3 # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 - mean_counts_bin1_o2 <- (expected_counts_o2[1,] + expected_counts_o2[4,]) / 2 - mean_counts_bin2_o2 <- (expected_counts_o2[2,] + expected_counts_o2[6,] + - expected_counts_o2[7,]) / 3 - mean_counts_bin3_o2 <- (expected_counts_o2[3,] + expected_counts_o2[5,]) / 2 - mean_counts_bin4_o2 <- (expected_counts_o2[8,] + expected_counts_o2[9,] + - expected_counts_o2[10,]) / 3 + mean_counts_bin1_o2 <- (expected_counts_o2[1, ] + expected_counts_o2[4, ]) / 2 + mean_counts_bin2_o2 <- (expected_counts_o2[2, ] + expected_counts_o2[6, ] + + expected_counts_o2[7, ]) / 3 + mean_counts_bin3_o2 <- (expected_counts_o2[3, ] + expected_counts_o2[5, ]) / 2 + mean_counts_bin4_o2 <- (expected_counts_o2[8, ] + expected_counts_o2[9, ] + + expected_counts_o2[10, ]) / 3 expected_mean_density_binned_counts_o2 <- rbind( - mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, + mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, mean_counts_bin4_o2 ) rownames(expected_mean_density_binned_counts_o2) <- 1:4 - + # Calculate actual output of function under test actual_mean_density_binned_counts_o1 <- mean_density_binned_graphlet_counts( - expected_counts_o1, expected_interval_indexes_o1) + expected_counts_o1, expected_interval_indexes_o1 + ) actual_mean_density_binned_counts_o2 <- mean_density_binned_graphlet_counts( - expected_counts_o2, expected_interval_indexes_o2) - + expected_counts_o2, expected_interval_indexes_o2 + ) + # Check actual output vs expected - expect_equal(actual_mean_density_binned_counts_o1, - expected_mean_density_binned_counts_o1) - expect_equal(actual_mean_density_binned_counts_o2, - expected_mean_density_binned_counts_o2) + expect_equal( + actual_mean_density_binned_counts_o1, + expected_mean_density_binned_counts_o1 + ) + expect_equal( + actual_mean_density_binned_counts_o2, + expected_mean_density_binned_counts_o2 + ) }) context("Measures Netdis: Expected graphlet counts") @@ -407,21 +448,22 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { rand_graph <- function(num_nodes, density) { max_edges <- choose(num_nodes, 2) num_edges <- density * max_edges - igraph::erdos.renyi.game(num_nodes, num_edges , "gnm", - loops = FALSE, directed = FALSE) + igraph::erdos.renyi.game(num_nodes, num_edges, "gnm", + loops = FALSE, directed = FALSE + ) } # Set up some dummy reference density breaks and scaled reference counts density_breaks <- c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0) scaled_reference_counts <- rbind( - c( 1, 2, 3, 4, 5, 6, 7, 8, 9), + c(1, 2, 3, 4, 5, 6, 7, 8, 9), c(11, 12, 13, 14, 15, 16, 17, 18, 19), c(21, 22, 23, 24, 25, 26, 27, 28, 29), c(31, 32, 33, 34, 35, 36, 37, 38, 39), c(41, 42, 43, 44, 45, 46, 47, 48, 49), c(51, 52, 53, 54, 55, 56, 57, 58, 59), - c(61, 62, 63, 64, 65, 66, 67, 68 ,69), + c(61, 62, 63, 64, 65, 66, 67, 68, 69), c(71, 72, 73, 74, 75, 76, 77, 78, 79), - c(81, 82, 83, 84, 85, 86 ,87, 88, 89), + c(81, 82, 83, 84, 85, 86, 87, 88, 89), c(91, 92, 93, 94, 95, 96, 97, 98, 99) ) graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") @@ -429,35 +471,37 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { rownames(scaled_reference_counts) <- 1:10 graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) names(graphlet_sizes) <- graphlet_labels - max_graphlet_size = 4 - + max_graphlet_size <- 4 + # Generate some test graphs densities <- c(0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95) density_indexes <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) num_nodes <- rep(120, 10) graphs <- purrr::map2(num_nodes, densities, rand_graph) - + # Helper function to calculate expected expected graphlet counts expected_expected_graphlet_counts_fn <- function(density_index, node_count) { - reference_counts <- scaled_reference_counts[density_index,] + reference_counts <- scaled_reference_counts[density_index, ] reference_counts * choose(node_count, graphlet_sizes) - } # Determine expected and actual expected graphlet counts - expected_expected_graphlet_counts <- + expected_expected_graphlet_counts <- purrr::map2(density_indexes, num_nodes, expected_expected_graphlet_counts_fn) - actual_expected_graphlet_counts <- - purrr::map(graphs, netdis_expected_graphlet_counts, - max_graphlet_size = max_graphlet_size, - density_breaks = density_breaks, - density_binned_reference_counts = scaled_reference_counts) + actual_expected_graphlet_counts <- + purrr::map(graphs, netdis_expected_graphlet_counts, + max_graphlet_size = max_graphlet_size, + density_breaks = density_breaks, + density_binned_reference_counts = scaled_reference_counts + ) # Loop over each graph and compare expected with actual # NOTE: v2.0.0 of testthat library made a breaking change that means using # map, mapply etc can cause failures under certain conditions # See: https://github.com/r-lib/testthat/releases/tag/v2.0.0 - for(i in 1:length(actual_expected_graphlet_counts)) { - expect_equal(actual_expected_graphlet_counts[i], - expected_expected_graphlet_counts[i]) + for (i in 1:length(actual_expected_graphlet_counts)) { + expect_equal( + actual_expected_graphlet_counts[i], + expected_expected_graphlet_counts[i] + ) } }) @@ -466,36 +510,40 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) - max_graphlet_size = 4 + max_graphlet_size <- 4 min_ego_edges <- 0 min_ego_nodes <- 0 - + # Make graph ego networks - ego_networks_o1 <- make_named_ego_graph(graph, order = 1, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes) - ego_networks_o2 <- make_named_ego_graph(graph, order = 2, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes) + ego_networks_o1 <- make_named_ego_graph(graph, + order = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + ego_networks_o2 <- make_named_ego_graph(graph, + order = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) # Set manually-verified node counts and densities # 1. Ego-networks of order 1 num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) @@ -515,24 +563,24 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes density_indexes_o2 <- c(4, 4, 5, 4, 5, 5, 5, 6, 6, 6) # Set dummy reference counts scaled_reference_counts <- rbind( - c( 1, 2, 3, 4, 5, 6, 7, 8, 9), + c(1, 2, 3, 4, 5, 6, 7, 8, 9), c(11, 12, 13, 14, 15, 16, 17, 18, 19), c(21, 22, 23, 24, 25, 26, 27, 28, 29), c(31, 32, 33, 34, 35, 36, 37, 38, 39), c(41, 42, 43, 44, 45, 46, 47, 48, 49), c(51, 52, 53, 54, 55, 56, 57, 58, 59), - c(61, 62, 63, 64, 65, 66, 67, 68 ,69), + c(61, 62, 63, 64, 65, 66, 67, 68, 69), c(71, 72, 73, 74, 75, 76, 77, 78, 79), - c(81, 82, 83, 84, 85, 86 ,87, 88, 89), + c(81, 82, 83, 84, 85, 86, 87, 88, 89), c(91, 92, 93, 94, 95, 96, 97, 98, 99) ) expected_dims <- dim(scaled_reference_counts) - min_ego_nodes = 3 - min_ego_edges = 1 - + min_ego_nodes <- 3 + min_ego_edges <- 1 + # Helper function to calculate expected expected graphlet counts expected_expected_graphlet_counts_fn <- function(density_index, node_count) { - reference_counts <- scaled_reference_counts[density_index,] + reference_counts <- scaled_reference_counts[density_index, ] reference_counts * choose(node_count, graphlet_sizes) } # Calculate expected graphlet counts. NOTE: We expect a matrix with graphlet @@ -553,7 +601,7 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes # Set row labels to ego network names rownames(expected_expected_graphlet_counts_ego_o1) <- names(ego_networks_o1) rownames(expected_expected_graphlet_counts_ego_o2) <- names(ego_networks_o2) - # Drop rows for nodes with ewer than minumum required nodes and edges in ego + # Drop rows for nodes with ewer than minumum required nodes and edges in ego # network expected_expected_graphlet_counts_ego_o1 <- expected_expected_graphlet_counts_ego_o1[ @@ -562,63 +610,75 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes expected_expected_graphlet_counts_ego_o2 <- expected_expected_graphlet_counts_ego_o2[ (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), - ] - + ] + # Calculate actual output of function under test - actual_expected_graphlet_counts_ego_o1 <- + actual_expected_graphlet_counts_ego_o1 <- netdis_expected_graphlet_counts_ego( - graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, density_breaks = breaks, + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, density_breaks = breaks, density_binned_reference_counts = scaled_reference_counts, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges) - actual_expected_graphlet_counts_ego_o2 <- + min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges + ) + actual_expected_graphlet_counts_ego_o2 <- netdis_expected_graphlet_counts_ego( - graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, density_breaks = breaks, + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, density_breaks = breaks, density_binned_reference_counts = scaled_reference_counts, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges) - + min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges + ) + # Compare actual to expected - expect_equal(actual_expected_graphlet_counts_ego_o1, - actual_expected_graphlet_counts_ego_o1) - expect_equal(actual_expected_graphlet_counts_ego_o2, - expected_expected_graphlet_counts_ego_o2) + expect_equal( + actual_expected_graphlet_counts_ego_o1, + actual_expected_graphlet_counts_ego_o1 + ) + expect_equal( + actual_expected_graphlet_counts_ego_o2, + expected_expected_graphlet_counts_ego_o2 + ) }) test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 nodes", { # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) names(graphlet_sizes) <- graphlet_labels - max_graphlet_size = 4 + max_graphlet_size <- 4 # Make graph ego networks min_ego_nodes <- 0 min_edgo_edges <- 0 - ego_networks_o1 <- make_named_ego_graph(graph, order = 1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_edgo_edges) - ego_networks_o2 <- make_named_ego_graph(graph, order = 2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_edgo_edges) + ego_networks_o1 <- make_named_ego_graph(graph, + order = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_edgo_edges + ) + ego_networks_o2 <- make_named_ego_graph(graph, + order = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_edgo_edges + ) # Set manually-verified node counts and densities # 1. Ego-networks of order 1 num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) @@ -632,37 +692,49 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no max_edges_o2 <- choose(num_nodes_o2, 2) densities_o2 <- num_edges_o2 / max_edges_o2 # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 - # Set manually determined density breaks and indexes, based on a min bin count + # Set manually determined density breaks and indexes, based on a min bin count # of 2 and an initial request for 100 bins - min_bin_count = 2 - num_bins = 100 - num_breaks = num_bins + 1 + min_bin_count <- 2 + num_bins <- 100 + num_breaks <- num_bins + 1 min_density_o1 <- 0.5 max_density_o1 <- 1.0 - breaks_o1 <- seq(min_density_o1, max_density_o1,length.out = num_breaks)[c(1, 22, 42, 101)] + breaks_o1 <- seq(min_density_o1, max_density_o1, length.out = num_breaks)[c(1, 22, 42, 101)] density_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) - min_density_o2 <- 1/3 + min_density_o2 <- 1 / 3 max_density_o2 <- 0.6 - breaks_o2 <- seq(min_density_o2, max_density_o2,length.out = num_breaks)[c(1, 10, 51, 64, 101)] + breaks_o2 <- seq(min_density_o2, max_density_o2, length.out = num_breaks)[c(1, 10, 51, 64, 101)] density_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) - # Guard against errors in manually determined breaks and indexes by checking + # Guard against errors in manually determined breaks and indexes by checking # against already tested code. This also lets us ensure we handle densities # falling exactly on a bin boundary the same as the code under test. comp_binned_densities_o1 <- binned_densities_adaptive( - densities_o1, min_counts_per_interval = min_bin_count, - num_intervals = num_bins) + densities_o1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins + ) comp_binned_densities_o2 <- binned_densities_adaptive( - densities_o2, min_counts_per_interval = min_bin_count, - num_intervals = num_bins) - expect_equal(comp_binned_densities_o1, - list(densities = densities_o1, - interval_indexes = density_indexes_o1, - breaks = breaks_o1)) - expect_equal(comp_binned_densities_o2, - list(densities = densities_o2, - interval_indexes = density_indexes_o2, - breaks = breaks_o2)) - + densities_o2, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins + ) + expect_equal( + comp_binned_densities_o1, + list( + densities = densities_o1, + interval_indexes = density_indexes_o1, + breaks = breaks_o1 + ) + ) + expect_equal( + comp_binned_densities_o2, + list( + densities = densities_o2, + interval_indexes = density_indexes_o2, + breaks = breaks_o2 + ) + ) + # Set manually verified scaled ego-network graphlet counts graphlet_key <- graphlet_key(max_graphlet_size) k <- graphlet_key$node_count @@ -674,69 +746,69 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) ) # 2-step ego networks scaled_reference_counts_o2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10 , k)), - c( 8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7 , k)), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5 , k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8 , k)), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5 , k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8 , k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8 , k)), - c(11, 10, 5, 10 ,0 ,1, 8, 0, 1) / zeros_to_ones(choose(7 , k)), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6 , k)) + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) ) min_ego_nodes <- 3 min_ego_edges <- 1 - # Drop rows for nodes with ewer than minumum required nodes and edges in ego + # Drop rows for nodes with ewer than minumum required nodes and edges in ego # network scaled_reference_counts_o1 <- scaled_reference_counts_o1[ (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), - ] + ] scaled_reference_counts_o2 <- scaled_reference_counts_o2[ (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), - ] + ] density_indexes_o1 <- density_indexes_o1[ (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges) ] density_indexes_o2 <- density_indexes_o2[ (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges) - ] + ] # Average manually verified scaled reference counts across density bins density_binned_reference_counts_o1 <- rbind( - (scaled_reference_counts_o1[1,] + scaled_reference_counts_o1[2,]) / 2, - (scaled_reference_counts_o1[4,] + scaled_reference_counts_o1[5,] + - scaled_reference_counts_o1[6,]) / 3, - ( scaled_reference_counts_o1[3,] + - scaled_reference_counts_o1[7,] + - scaled_reference_counts_o1[8,]) / 3 + (scaled_reference_counts_o1[1, ] + scaled_reference_counts_o1[2, ]) / 2, + (scaled_reference_counts_o1[4, ] + scaled_reference_counts_o1[5, ] + + scaled_reference_counts_o1[6, ]) / 3, + (scaled_reference_counts_o1[3, ] + + scaled_reference_counts_o1[7, ] + + scaled_reference_counts_o1[8, ]) / 3 ) rownames(density_binned_reference_counts_o1) <- 1:3 density_binned_reference_counts_o2 <- rbind( - (scaled_reference_counts_o2[1,] + scaled_reference_counts_o2[4,]) / 2, - (scaled_reference_counts_o2[2,] + scaled_reference_counts_o2[6,] + - scaled_reference_counts_o2[7,]) / 3, - (scaled_reference_counts_o2[3,] + scaled_reference_counts_o2[5,]) / 2, - (scaled_reference_counts_o2[8,] + scaled_reference_counts_o2[9,] + - scaled_reference_counts_o2[10,]) / 3 + (scaled_reference_counts_o2[1, ] + scaled_reference_counts_o2[4, ]) / 2, + (scaled_reference_counts_o2[2, ] + scaled_reference_counts_o2[6, ] + + scaled_reference_counts_o2[7, ]) / 3, + (scaled_reference_counts_o2[3, ] + scaled_reference_counts_o2[5, ]) / 2, + (scaled_reference_counts_o2[8, ] + scaled_reference_counts_o2[9, ] + + scaled_reference_counts_o2[10, ]) / 3 ) rownames(density_binned_reference_counts_o2) <- 1:4 - + # Helper functions to calculate expected expected graphlet counts expected_expected_graphlet_counts_o1_fn <- function(density_index, node_count) { - reference_counts <- density_binned_reference_counts_o1[density_index,] + reference_counts <- density_binned_reference_counts_o1[density_index, ] reference_counts * choose(node_count, graphlet_sizes) } expected_expected_graphlet_counts_o2_fn <- function(density_index, node_count) { - reference_counts <- density_binned_reference_counts_o2[density_index,] + reference_counts <- density_binned_reference_counts_o2[density_index, ] reference_counts * choose(node_count, graphlet_sizes) } # Calculate expected graphlet counts @@ -744,66 +816,79 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no density_indexes_o1, num_nodes_o1[(num_nodes_o1 >= min_ego_nodes)], expected_expected_graphlet_counts_o1_fn ))) - rownames(expected_expected_graphlet_counts_ego_o1) <- + rownames(expected_expected_graphlet_counts_ego_o1) <- names(ego_networks_o1[(num_nodes_o1 >= min_ego_nodes)]) expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( density_indexes_o2, num_nodes_o2[(num_nodes_o2 >= min_ego_nodes)], expected_expected_graphlet_counts_o2_fn ))) - rownames(expected_expected_graphlet_counts_ego_o2) <- + rownames(expected_expected_graphlet_counts_ego_o2) <- names(ego_networks_o2[(num_nodes_o2 >= min_ego_nodes)]) - - # Sanity check manually derived expected expected counts by comparing against + + # Sanity check manually derived expected expected counts by comparing against # pre-tested fully applied expected_graphlet_counts_ego function - expect_equal(expected_expected_graphlet_counts_ego_o1, - netdis_expected_graphlet_counts_ego( - graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - density_breaks = breaks_o1, - density_binned_reference_counts_o1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - )) - expect_equal(expected_expected_graphlet_counts_ego_o2, - netdis_expected_graphlet_counts_ego( - graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, - density_breaks = breaks_o2, - density_binned_reference_counts_o2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - )) - + expect_equal( + expected_expected_graphlet_counts_ego_o1, + netdis_expected_graphlet_counts_ego( + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + density_breaks = breaks_o1, + density_binned_reference_counts_o1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + ) + expect_equal( + expected_expected_graphlet_counts_ego_o2, + netdis_expected_graphlet_counts_ego( + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + density_breaks = breaks_o2, + density_binned_reference_counts_o2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + ) + # Generate partially applied functions using function under test - actual_expected_graphlet_counts_ego_fn_o1 <- + actual_expected_graphlet_counts_ego_fn_o1 <- netdis_expected_graphlet_counts_ego_fn( - graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - min_bin_count = min_bin_count, + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_bin_count = min_bin_count, num_bins = num_bins, - min_ego_nodes = min_ego_nodes, + min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges - ) - actual_expected_graphlet_counts_ego_fn_o2 <- + ) + actual_expected_graphlet_counts_ego_fn_o2 <- netdis_expected_graphlet_counts_ego_fn( - graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, - min_bin_count = min_bin_count, + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + min_bin_count = min_bin_count, num_bins = num_bins, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) # Generate actual expected accounts by applying generated functions to test # graph - actual_expected_graphlet_counts_ego_o1 <- + actual_expected_graphlet_counts_ego_o1 <- actual_expected_graphlet_counts_ego_fn_o1(graph) - actual_expected_graphlet_counts_ego_o2 <- + actual_expected_graphlet_counts_ego_o2 <- actual_expected_graphlet_counts_ego_fn_o2(graph) - + # Compare actual to expected - expect_equal(actual_expected_graphlet_counts_ego_o1, - expected_expected_graphlet_counts_ego_o1) - expect_equal(actual_expected_graphlet_counts_ego_o2, - expected_expected_graphlet_counts_ego_o2) + expect_equal( + actual_expected_graphlet_counts_ego_o1, + expected_expected_graphlet_counts_ego_o1 + ) + expect_equal( + actual_expected_graphlet_counts_ego_o2, + expected_expected_graphlet_counts_ego_o2 + ) }) context("Measures Netdis: Centered graphlet counts") @@ -846,72 +931,86 @@ test_that("netdis_centred_graphlet_counts_ego is correct", { c("n23", "n24") ) ref_graph <- igraph::graph_from_edgelist(ref_elist, directed = FALSE) - + query_elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) query_graph <- igraph::graph_from_edgelist(query_elist, directed = FALSE) - - max_graphlet_size = 4 + + max_graphlet_size <- 4 # Use pre-tested functions to generate ego-network graphlet counts # 1. Reference graph ego-network graphlet counts ref_o1 <- count_graphlets_ego( - ref_graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, return_ego_networks = TRUE) + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, return_ego_networks = TRUE + ) ego_counts_ref_o1 <- ref_o1$graphlet_counts ego_networks_ref_o1 <- ref_o1$ego_networks density_ref_o1 <- sapply(ego_networks_ref_o1, igraph::edge_density) - + ref_o2 <- count_graphlets_ego( - ref_graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, return_ego_networks = TRUE) + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, return_ego_networks = TRUE + ) ego_counts_ref_o2 <- ref_o2$graphlet_counts ego_networks_ref_o2 <- ref_o2$ego_networks density_ref_o2 <- sapply(ego_networks_ref_o2, igraph::edge_density) # 2. Query graph ego-network graphlet countsa query_o1 <- count_graphlets_ego( - query_graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, return_ego_networks = TRUE) + query_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, return_ego_networks = TRUE + ) ego_counts_query_o1 <- query_o1$graphlet_counts ego_networks_query_o1 <- query_o1$ego_networks density_query_o1 <- sapply(ego_networks_query_o1, igraph::edge_density) - + query_o2 <- count_graphlets_ego( - query_graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, return_ego_networks = TRUE) + query_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, return_ego_networks = TRUE + ) ego_counts_query_o2 <- query_o2$graphlet_counts ego_networks_query_o2 <- query_o2$ego_networks density_query_o2 <- sapply(ego_networks_query_o2, igraph::edge_density) - - centred_counts_k4 <- function(query_graphlet_count, ref_graphlet_count, - query_node_counts, ref_node_count, - min_nodes, min_edges, - min_bin_count, num_bins) { + + centred_counts_k4 <- function(query_graphlet_count, ref_graphlet_count, + query_node_counts, ref_node_count, + min_nodes, min_edges, + min_bin_count, num_bins) { graphlet_node_counts_k4 <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) # 1. Calculate scaling factors for each reference and query graphlet count # These are nCk, where n is the number of nodes in the network and # k is the number of nodes in the graphlet ref_scale_factor <- sapply( - graphlet_node_counts_k4, FUN <- function(k) {choose(ref_node_count, k)}) + graphlet_node_counts_k4, FUN <- function(k) { + choose(ref_node_count, k) + } + ) query_scale_factor <- sapply( - graphlet_node_counts_k4, FUN <- function(k) {choose(query_node_count, k)}) + graphlet_node_counts_k4, FUN <- function(k) { + choose(query_node_count, k) + } + ) # 2. Calculate scaled reference counts by dividing by ref_scale_factor ref_scaled_graphlet_count <- query_graphlet_count / ref_scale_factor - # + # } -}) \ No newline at end of file +}) diff --git a/tests/testthat/test_measures_net_emd.R b/tests/testthat/test_measures_net_emd.R index b3a4ea3d..c44c6891 100644 --- a/tests/testthat/test_measures_net_emd.R +++ b/tests/testthat/test_measures_net_emd.R @@ -1,65 +1,92 @@ - self_net_emd <- function(histogram, shift, method) { - net_emd(histogram, shift_dhist(histogram, shift), method = method) - } - expected <- 0 - - locations <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) - masses <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) - histogram <- dhist(locations = locations, masses = masses) - - expect_equal(self_net_emd(histogram, shift = 1, "optimise"), expected) - expect_equal(self_net_emd(histogram, shift = 1, "exhaustive"), expected) - expect_equal(self_net_emd(histogram, shift = 0.5, "optimise"), expected) - expect_equal(self_net_emd(histogram, shift = 0.5, "exhaustive"), expected) - expect_equal(self_net_emd(histogram, shift = 0.1, "optimise"), expected) - expect_equal(self_net_emd(histogram, shift = 0.1, "exhaustive"), expected) - expect_equal(self_net_emd(histogram, shift = 0.05, "optimise"), expected) - expect_equal(self_net_emd(histogram, shift = 0.05, "exhaustive"), expected) - expect_equal(self_net_emd(histogram, shift = 0.01, "optimise"), expected) - expect_equal(self_net_emd(histogram, shift = 0.01, "exhaustive"), expected) - expect_equal(self_net_emd(histogram, shift = 0, "optimise"), expected) - expect_equal(self_net_emd(histogram, shift = 0, "exhaustive"), expected) - - expect_self_net_emd_correct <- function(histogram, shift, method, - return_details = FALSE) { - self_net_emd <- net_emd(histogram, shift_dhist(histogram, shift), - method = method, return_details = return_details) - loc=histogram$locations - mass=histogram$masses - var=sum(loc*loc*mass)/sum(mass)-(sum(loc*mass)/sum(mass))^2 - expected <- list(net_emd = 0, min_emds = 0, min_offsets = shift, - min_offsets_std = 0) - expect_equal(self_net_emd, expected) - } - - locations <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) - masses <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) - histogram <- dhist(locations = locations, masses = masses) - expect_self_net_emd_correct(histogram, shift = 1, "optimise", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 1, "exhaustive", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.5, "optimise", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.5, "exhaustive", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.1, "optimise", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.1, "exhaustive", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.05, "optimise", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.05, "exhaustive", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.01, "optimise", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0.01, "exhaustive", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0, "optimise", - return_details = TRUE) - expect_self_net_emd_correct(histogram, shift = 0, "exhaustive", - return_details = TRUE) +self_net_emd <- function(histogram, shift, method) { + net_emd(histogram, shift_dhist(histogram, shift), method = method) +} +expected <- 0 + +locations <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) +masses <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) +histogram <- dhist(locations = locations, masses = masses) + +expect_equal(self_net_emd(histogram, shift = 1, "optimise"), expected) +expect_equal(self_net_emd(histogram, shift = 1, "exhaustive"), expected) +expect_equal(self_net_emd(histogram, shift = 0.5, "optimise"), expected) +expect_equal(self_net_emd(histogram, shift = 0.5, "exhaustive"), expected) +expect_equal(self_net_emd(histogram, shift = 0.1, "optimise"), expected) +expect_equal(self_net_emd(histogram, shift = 0.1, "exhaustive"), expected) +expect_equal(self_net_emd(histogram, shift = 0.05, "optimise"), expected) +expect_equal(self_net_emd(histogram, shift = 0.05, "exhaustive"), expected) +expect_equal(self_net_emd(histogram, shift = 0.01, "optimise"), expected) +expect_equal(self_net_emd(histogram, shift = 0.01, "exhaustive"), expected) +expect_equal(self_net_emd(histogram, shift = 0, "optimise"), expected) +expect_equal(self_net_emd(histogram, shift = 0, "exhaustive"), expected) + +expect_self_net_emd_correct <- function(histogram, shift, method, + return_details = FALSE) { + self_net_emd <- net_emd(histogram, shift_dhist(histogram, shift), + method = method, return_details = return_details + ) + loc <- histogram$locations + mass <- histogram$masses + var <- sum(loc * loc * mass) / sum(mass) - (sum(loc * mass) / sum(mass))^2 + expected <- list( + net_emd = 0, min_emds = 0, min_offsets = shift, + min_offsets_std = 0 + ) + expect_equal(self_net_emd, expected) +} + +locations <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) +masses <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) +histogram <- dhist(locations = locations, masses = masses) +expect_self_net_emd_correct(histogram, + shift = 1, "optimise", + return_details = TRUE +) +expect_self_net_emd_correct(histogram, + shift = 1, "exhaustive", + return_details = TRUE +) +expect_self_net_emd_correct(histogram, + shift = 0.5, "optimise", + return_details = TRUE +) +expect_self_net_emd_correct(histogram, + shift = 0.5, "exhaustive", + return_details = TRUE +) +expect_self_net_emd_correct(histogram, + shift = 0.1, "optimise", + return_details = TRUE +) +expect_self_net_emd_correct(histogram, + shift = 0.1, "exhaustive", + return_details = TRUE +) +expect_self_net_emd_correct(histogram, + shift = 0.05, "optimise", + return_details = TRUE +) +expect_self_net_emd_correct(histogram, + shift = 0.05, "exhaustive", + return_details = TRUE +) +expect_self_net_emd_correct(histogram, + shift = 0.01, "optimise", + return_details = TRUE +) +expect_self_net_emd_correct(histogram, + shift = 0.01, "exhaustive", + return_details = TRUE +) +expect_self_net_emd_correct(histogram, + shift = 0, "optimise", + return_details = TRUE +) +expect_self_net_emd_correct(histogram, + shift = 0, "exhaustive", + return_details = TRUE +) test_that("net_emd returns 0 when comparing any normal histogram against itself (no offset)", { num_hists <- 5 @@ -70,7 +97,7 @@ test_that("net_emd returns 0 when comparing any normal histogram against itself rand_locations <- function(mu, sigma) { return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) - } + } rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { locations <- rand_locations(mu, sigma) @@ -81,22 +108,21 @@ test_that("net_emd returns 0 when comparing any normal histogram against itself expected <- 0 actuals_opt <- purrr::map(rand_dhists, function(dhist) { net_emd(dhist, dhist, method = "optimise") - }) + }) purrr::walk(actuals_opt, function(actual) { expect_equal(actual, expected) - }) + }) actuals_exhaustive_default <- purrr::map(rand_dhists, function(dhist) { net_emd(dhist, dhist, method = "exhaustive") - }) + }) purrr::walk(actuals_exhaustive_default, function(actual) { expect_equal(actual, expected) - }) + }) }) test_that("net_emd returns 0 when comparing any normal histogram randomly offset against itself", { - num_hists <- 2 num_bins <- 101 num_offsets <- 3 @@ -107,7 +133,7 @@ test_that("net_emd returns 0 when comparing any normal histogram randomly offset rand_locations <- function(mu, sigma) { return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) - } + } rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { locations <- rand_locations(mu, sigma) @@ -119,29 +145,38 @@ test_that("net_emd returns 0 when comparing any normal histogram randomly offset net_emd_offset_self <- function(dhist, offsets, method) { net_emds <- purrr::map_dbl(offsets, function(offset) { - net_emd(dhist, shift_dhist(dhist, offset), method = method)}) + net_emd(dhist, shift_dhist(dhist, offset), method = method) + }) return(net_emds) } expected <- 0 - actuals_list_opt <- purrr::map2(rand_dhists, offset_lists, - function(dhist, offsets) { - net_emd_offset_self(dhist, offsets, method = "optimise")}) + actuals_list_opt <- purrr::map2( + rand_dhists, offset_lists, + function(dhist, offsets) { + net_emd_offset_self(dhist, offsets, method = "optimise") + } + ) purrr::walk(actuals_list_opt, function(actuals) { - purrr::walk(actuals, function(actual) { - expect_equal(actual, expected)}) + purrr::walk(actuals, function(actual) { + expect_equal(actual, expected) + }) }) - actuals_list_exhaustive <- purrr::map2(rand_dhists, offset_lists, - function(dhist, offsets) { - net_emd_offset_self(dhist, offsets, method = "exhaustive")}) + actuals_list_exhaustive <- purrr::map2( + rand_dhists, offset_lists, + function(dhist, offsets) { + net_emd_offset_self(dhist, offsets, method = "exhaustive") + } + ) purrr::walk(actuals_list_exhaustive, function(actuals) { - purrr::walk(actuals, function(actual) {expect_equal(actual, expected)}) + purrr::walk(actuals, function(actual) { + expect_equal(actual, expected) + }) }) }) test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any normal histogram randomly offset against itself", { - num_hists <- 2 num_bins <- 101 num_offsets <- 3 @@ -151,7 +186,8 @@ test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any offsets <- runif(num_offsets, -10, 10) rand_locations <- function(mu, sigma) { - return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins))} + return(seq(mu - 5 * sigma, mu + 5 * sigma, length.out = num_bins)) + } rand_dhists <- purrr::map2(mus, sigmas, function(mu, sigma) { locations <- rand_locations(mu, sigma) @@ -163,27 +199,33 @@ test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any expect_self_net_emd_correct <- function(histogram, shift, method, return_details = FALSE) { - self_net_emd <- net_emd(histogram, shift_dhist(histogram, shift), - method, return_details) - loc=histogram$locations - mass=histogram$masses - var=sum(loc*loc*mass)/sum(mass)-(sum(loc*mass)/sum(mass))^2 - expected <- list(net_emd = 0, min_emds = 0, min_offsets = shift, - min_offsets_std = 0) - expect_equal(self_net_emd, expected) - } + self_net_emd <- net_emd( + histogram, shift_dhist(histogram, shift), + method, return_details + ) + loc <- histogram$locations + mass <- histogram$masses + var <- sum(loc * loc * mass) / sum(mass) - (sum(loc * mass) / sum(mass))^2 + expected <- list( + net_emd = 0, min_emds = 0, min_offsets = shift, + min_offsets_std = 0 + ) + expect_equal(self_net_emd, expected) + } purrr::walk2(rand_dhists, offset_lists, function(dhist, offsets) { - purrr::walk(offsets, function(offset){ + purrr::walk(offsets, function(offset) { expect_self_net_emd_correct(dhist, offset, "optimise", - return_details = TRUE) + return_details = TRUE + ) }) }) purrr::walk2(rand_dhists, offset_lists, function(dhist, offsets) { - purrr::walk(offsets, function(offset){ + purrr::walk(offsets, function(offset) { expect_self_net_emd_correct(dhist, offset, "exhaustive", - return_details = TRUE) + return_details = TRUE + ) }) }) }) @@ -192,12 +234,12 @@ test_that("net_emd returns analytically derived non-zero solutions for distribut where the analytical solution is known", { # Helper functions to create dhists for a given value of "p" two_bin_dhist <- function(p) { - dhist(locations = c(0, 1), masses = c(p, 1-p)) + dhist(locations = c(0, 1), masses = c(p, 1 - p)) } three_bin_dhist <- function(p) { - dhist(locations = c(-1, 0, 1), masses = c(0.5*p*(1-p), 1-(p*(1-p)), 0.5*p*(1-p))) + dhist(locations = c(-1, 0, 1), masses = c(0.5 * p * (1 - p), 1 - (p * (1 - p)), 0.5 * p * (1 - p))) } - + # Helper function to test actual vs expected test_pair <- function(p, expected) { dhistA <- two_bin_dhist(p) @@ -207,32 +249,31 @@ test_that("net_emd returns analytically derived non-zero solutions for distribut # optimised NetEMD is ~1e-09, so set a slightly looser tolerance here expect_equal(net_emd(dhistA, dhistB, method = "optimise"), expected, tolerance = 1e-08) } - + # Test for p values with analytically calculated NetEMD - test_pair(1/2, 1) - test_pair(1/3, 1/sqrt(2)) - test_pair(1/5, 1/2) - + test_pair(1 / 2, 1) + test_pair(1 / 3, 1 / sqrt(2)) + test_pair(1 / 5, 1 / 2) }) context("Measures NetEMD: Virus PPI (EMD)") # EMD and NET_EMD: Virus PPI datasets test_that("emd return 0 when comparing graphlet orbit degree distributions of virus PPI graphs to themselves", { - # Load viurs PPI network data in ORCA-compatible edge list format - data_indexes <- 1:length(virusppi) - data_names <- attr(virusppi, "name") + # Load viurs PPI network data in ORCA-compatible edge list format + data_indexes <- 1:length(virusppi) + data_names <- attr(virusppi, "name") - # Calculate graphlet-based degree distributions up to graphlet order 4 - virus_gdd <- purrr::map(virusppi, gdd) + # Calculate graphlet-based degree distributions up to graphlet order 4 + virus_gdd <- purrr::map(virusppi, gdd) - # Map over virus PPI networks - purrr::walk(virus_gdd, function(gdd) { - purrr::walk(gdd, function(gdd_Ox) { - expect_equal(emd(gdd_Ox, gdd_Ox), 0) - }) - }) - }) + # Map over virus PPI networks + purrr::walk(virus_gdd, function(gdd) { + purrr::walk(gdd, function(gdd_Ox) { + expect_equal(emd(gdd_Ox, gdd_Ox), 0) + }) + }) +}) context("Measures NetEMD: Virus PPI (NetEMD)") test_that("net_emd return 0 when comparing graphlet orbit degree distributions @@ -253,8 +294,10 @@ test_that("net_emd return 0 when comparing graphlet orbit degree distributions # Map over virus PPI networks purrr::walk(virus_gdd, function(gdd) { purrr::walk(gdd, function(gdd_Ox) { - expect_equalish(net_emd(gdd_Ox, gdd_Ox, method = "optimise", - smoothing_window_width = 0), 0) + expect_equalish(net_emd(gdd_Ox, gdd_Ox, + method = "optimise", + smoothing_window_width = 0 + ), 0) }) }) }) @@ -263,97 +306,113 @@ context("Measures NetEMD: Random graphs (EMD)") # EMD and NET_EMD: Random graph datasets test_that("emd return 0 when comparing graphlet orbit degree distributions of random graphs to themselves", { - # Load random graph data in ORCA-compatible edge list format - random_graphs <- read_simple_graphs( - system.file(package = "netdist", "extdata", "random"), - format = "ncol", pattern = "*") - data_indexes <- 1:length(random_graphs) - data_names <- attr(random_graphs, "name") - - # Calculate graphlet-based degree distributions up to graphlet order 4 - random_gdd <- purrr::map(random_graphs, gdd) - - # Map over random graphs - purrr::walk(random_gdd, function(gdd) { - purrr::walk(gdd, function(gdd_Ox) { - expect_equal(emd(gdd_Ox, gdd_Ox), 0) - }) - }) - }) + # Load random graph data in ORCA-compatible edge list format + random_graphs <- read_simple_graphs( + system.file(package = "netdist", "extdata", "random"), + format = "ncol", pattern = "*" + ) + data_indexes <- 1:length(random_graphs) + data_names <- attr(random_graphs, "name") + + # Calculate graphlet-based degree distributions up to graphlet order 4 + random_gdd <- purrr::map(random_graphs, gdd) + + # Map over random graphs + purrr::walk(random_gdd, function(gdd) { + purrr::walk(gdd, function(gdd_Ox) { + expect_equal(emd(gdd_Ox, gdd_Ox), 0) + }) + }) +}) context("Measures NetEMD: Random graphs (NetEMD)") test_that("net_emd return 0 when comparing graphlet orbit degree distributions of random graphs to themselves", { - # Load random graph data in ORCA-compatible edge list format - random_graphs <- read_simple_graphs( - system.file(package = "netdist", "extdata", "random"), - format = "ncol", pattern = "*") - data_indexes <- 1:length(random_graphs) - data_names <- attr(random_graphs, "name") - - # Calculate graphlet-based degree distributions up to graphlet order 4 - random_gdd <- purrr::map(random_graphs, gdd) - - expect_equalish <- function(actual, expected) { - diff <- abs(actual - expected) - max_diff <- 1e-12 - return(expect_lte(diff, max_diff)) - } - - # Map over random graphs - purrr::walk(random_gdd, function(gdd) { - purrr::walk(gdd, function(gdd_Ox) { - expect_equalish(net_emd(gdd_Ox, gdd_Ox, method = "optimise", - smoothing_window_width = 0), 0) - }) - }) - }) + # Load random graph data in ORCA-compatible edge list format + random_graphs <- read_simple_graphs( + system.file(package = "netdist", "extdata", "random"), + format = "ncol", pattern = "*" + ) + data_indexes <- 1:length(random_graphs) + data_names <- attr(random_graphs, "name") + + # Calculate graphlet-based degree distributions up to graphlet order 4 + random_gdd <- purrr::map(random_graphs, gdd) + + expect_equalish <- function(actual, expected) { + diff <- abs(actual - expected) + max_diff <- 1e-12 + return(expect_lte(diff, max_diff)) + } + + # Map over random graphs + purrr::walk(random_gdd, function(gdd) { + purrr::walk(gdd, function(gdd_Ox) { + expect_equalish(net_emd(gdd_Ox, gdd_Ox, + method = "optimise", + smoothing_window_width = 0 + ), 0) + }) + }) +}) context("Measures NetEMD: All graphs in directory") test_that("net_emds_for_all_graphs works", { # Set source directory and file properties for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - edge_format = "ncol" - file_pattern = ".txt" + edge_format <- "ncol" + file_pattern <- ".txt" # Set number of threads to use at once for parallel processing. - num_threads = getOption("mc.cores", 2L) + num_threads <- getOption("mc.cores", 2L) # Use previously tested GDD code to generate inputs to function under test gdds_orbits_g4 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "orbit", max_graphlet_size = 4) + feature_type = "orbit", max_graphlet_size = 4 + ) gdds_orbits_g5 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "orbit", max_graphlet_size = 5) + feature_type = "orbit", max_graphlet_size = 5 + ) gdds_graphlets_g4 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "graphlet", max_graphlet_size = 4) + feature_type = "graphlet", max_graphlet_size = 4 + ) gdds_graphlets_g5 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "graphlet", max_graphlet_size = 5) + feature_type = "graphlet", max_graphlet_size = 5 + ) gdds_graphlets_g4_e1 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "graphlet", max_graphlet_size = 4, ego_neighbourhood_size = 1) + feature_type = "graphlet", max_graphlet_size = 4, ego_neighbourhood_size = 1 + ) gdds_graphlets_g5_e1 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 1) + feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 1 + ) gdds_graphlets_g4_e2 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "graphlet", max_graphlet_size = 4, ego_neighbourhood_size = 2) + feature_type = "graphlet", max_graphlet_size = 4, ego_neighbourhood_size = 2 + ) gdds_graphlets_g5_e2 <- gdd_for_all_graphs( source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 2) + feature_type = "graphlet", max_graphlet_size = 5, ego_neighbourhood_size = 2 + ) # Use previously tested NetEMD function to generate expected NetEMD scores # individually and combine into expected output for code under test - expected_net_emd_fn<- function(gdds) { - list(net_emds = c(net_emd(gdds$EBV, gdds$ECL), net_emd(gdds$EBV, gdds$HSV), - net_emd(gdds$EBV, gdds$KSHV), net_emd(gdds$EBV, gdds$VZV), - net_emd(gdds$ECL, gdds$HSV), net_emd(gdds$ECL, gdds$KSHV), - net_emd(gdds$ECL, gdds$VZV), net_emd(gdds$HSV, gdds$KSHV), - net_emd(gdds$HSV, gdds$VZV), net_emd(gdds$KSHV, gdds$VZV)), - comp_spec = cross_comparison_spec(gdds)) + expected_net_emd_fn <- function(gdds) { + list( + net_emds = c( + net_emd(gdds$EBV, gdds$ECL), net_emd(gdds$EBV, gdds$HSV), + net_emd(gdds$EBV, gdds$KSHV), net_emd(gdds$EBV, gdds$VZV), + net_emd(gdds$ECL, gdds$HSV), net_emd(gdds$ECL, gdds$KSHV), + net_emd(gdds$ECL, gdds$VZV), net_emd(gdds$HSV, gdds$KSHV), + net_emd(gdds$HSV, gdds$VZV), net_emd(gdds$KSHV, gdds$VZV) + ), + comp_spec = cross_comparison_spec(gdds) + ) } # Comparison function for clarity @@ -374,5 +433,3 @@ test_that("net_emds_for_all_graphs works", { compare_fn(gdds_graphlets_g4_e2) compare_fn(gdds_graphlets_g5_e2) }) - - diff --git a/tests/testthat/test_orca_interface.R b/tests/testthat/test_orca_interface.R index 73b8260f..0cc59366 100644 --- a/tests/testthat/test_orca_interface.R +++ b/tests/testthat/test_orca_interface.R @@ -3,73 +3,105 @@ test_that("Graph to indexed edge list round trip conversion works", { data_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") g_orig <- igraph::read_graph(file = file.path(data_dir, "EBV.txt"), format = "ncol") g_rtrip <- netdist::indexed_edges_to_graph(graph_to_indexed_edges(g_orig)) - expect_true(all.equal(igraph::get.edgelist(g_orig),igraph::get.edgelist(g_orig))) + expect_true(all.equal(igraph::get.edgelist(g_orig), igraph::get.edgelist(g_orig))) }) context("ORCA interface: Graphlet key") test_that("graphlet_key gives correct output for all supported max graphlet sizes", { correct_graphlet_key_2 <- list(max_nodes = 2, id = c("G0"), node_count = c(2)) - correct_graphlet_key_3 <- list(max_nodes = 3, id = c("G0", "G1", "G2"), - node_count = c(2, 3, 3)) - correct_graphlet_key_4 = list(max_nodes = 4, - id = c("G0", "G1", "G2", "G3", "G4", "G5", "G6", - "G7", "G8"), - node_count = c(2, 3, 3, 4, 4, 4, 4, 4, 4)) - correct_graphlet_key_5 <- list(max_nodes = 5, - id = c("G0", "G1", "G2", "G3", "G4", "G5", "G6", - "G7", "G8", "G9", "G10", "G11", "G12", - "G13", "G14", "G15", "G16", "G17", - "G18", "G19", "G20", "G21", "G22", - "G23", "G24", "G25", "G26", "G27", - "G28", "G29"), - node_count = c(2, 3, 3, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5)) + correct_graphlet_key_3 <- list( + max_nodes = 3, id = c("G0", "G1", "G2"), + node_count = c(2, 3, 3) + ) + correct_graphlet_key_4 <- list( + max_nodes = 4, + id = c( + "G0", "G1", "G2", "G3", "G4", "G5", "G6", + "G7", "G8" + ), + node_count = c(2, 3, 3, 4, 4, 4, 4, 4, 4) + ) + correct_graphlet_key_5 <- list( + max_nodes = 5, + id = c( + "G0", "G1", "G2", "G3", "G4", "G5", "G6", + "G7", "G8", "G9", "G10", "G11", "G12", + "G13", "G14", "G15", "G16", "G17", + "G18", "G19", "G20", "G21", "G22", + "G23", "G24", "G25", "G26", "G27", + "G28", "G29" + ), + node_count = c( + 2, 3, 3, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5 + ) + ) expect_equal(graphlet_key(2), correct_graphlet_key_2) expect_equal(graphlet_key(3), correct_graphlet_key_3) expect_equal(graphlet_key(4), correct_graphlet_key_4) expect_equal(graphlet_key(5), correct_graphlet_key_5) - }) +}) test_that("graphlet_key gives error for unsupported max graphlet sizes", { max_size_too_low <- c(1, 0, -1, -2, -3, -4, -5, -6) max_size_too_high <- c(6, 7, 8, 9, 10) max_size_not_int <- c(2.5, 3.5, 4.5) - purrr::map(max_size_too_low, function(s) {expect_error(graphlet_key(s))}) - purrr::map(max_size_too_high, function(s) {expect_error(graphlet_key(s))}) - purrr::map(max_size_not_int, function(s) {expect_error(graphlet_key(s))}) + purrr::map(max_size_too_low, function(s) { + expect_error(graphlet_key(s)) + }) + purrr::map(max_size_too_high, function(s) { + expect_error(graphlet_key(s)) + }) + purrr::map(max_size_not_int, function(s) { + expect_error(graphlet_key(s)) + }) }) context("ORCA interface: Orbit key") test_that("orbit_key gives correct output for all supported max graphlet sizes", { correct_orbit_key_2 <- list(max_nodes = 2, id = c("O0"), node_count = c(2)) - correct_orbit_key_3 <- list(max_nodes = 3, id = c("O0", "O1", "O2", "O3"), - node_count = c(2, 3, 3, 3)) - correct_orbit_key_4 = list(max_nodes = 4, - id = c("O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", - "O10", "O11", "O12", "O13", "O14"), - node_count = c(2, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4)) - correct_orbit_key_5 <- list(max_nodes = 5, - id = c("O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", - "O10", "O11", "O12", "O13", "O14", "O15", "O16", "O17", - "O18", "O19", "O20", "O21", "O22", - "O23", "O24", "O25", "O26", "O27", "O28", "O29", - "O30", "O31", "O32", "O33", "O34", "O35", "O36", "O37", - "O38", "O39", "O40", "O41", "O42", "O43", "O44", "O45", - "O46", "O47", "O48", "O49", "O50", "O51", "O52", "O53", - "O54", "O55", "O56", "O57", "O58", "O59", "O60", "O61", - "O62", "O63", "O64", "O65", "O66", "O67", "O68", "O69", - "O70", "O71", "O72"), - node_count = c(2, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5)) + correct_orbit_key_3 <- list( + max_nodes = 3, id = c("O0", "O1", "O2", "O3"), + node_count = c(2, 3, 3, 3) + ) + correct_orbit_key_4 <- list( + max_nodes = 4, + id = c( + "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", + "O10", "O11", "O12", "O13", "O14" + ), + node_count = c( + 2, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 + ) + ) + correct_orbit_key_5 <- list( + max_nodes = 5, + id = c( + "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", + "O10", "O11", "O12", "O13", "O14", "O15", "O16", "O17", + "O18", "O19", "O20", "O21", "O22", + "O23", "O24", "O25", "O26", "O27", "O28", "O29", + "O30", "O31", "O32", "O33", "O34", "O35", "O36", "O37", + "O38", "O39", "O40", "O41", "O42", "O43", "O44", "O45", + "O46", "O47", "O48", "O49", "O50", "O51", "O52", "O53", + "O54", "O55", "O56", "O57", "O58", "O59", "O60", "O61", + "O62", "O63", "O64", "O65", "O66", "O67", "O68", "O69", + "O70", "O71", "O72" + ), + node_count = c( + 2, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5 + ) + ) expect_equal(orbit_key(2), correct_orbit_key_2) expect_equal(orbit_key(3), correct_orbit_key_3) expect_equal(orbit_key(4), correct_orbit_key_4) @@ -79,26 +111,32 @@ test_that("orbit_key gives correct output for all supported max graphlet sizes", context("ORCA interface: Graph cross comparison") test_that("cross_comparison_spec works for virus PPI data", { # Load viurs PPI network data in ORCA-compatible edge list format - expected_name_A <- c(rep("EBV", 4), rep("ECL", 3), rep("HSV-1", 2), - rep("KSHV", 1), rep("VZV", 0)) + expected_name_A <- c( + rep("EBV", 4), rep("ECL", 3), rep("HSV-1", 2), + rep("KSHV", 1), rep("VZV", 0) + ) expected_index_A <- c(rep(1, 4), rep(2, 3), rep(3, 2), rep(4, 1), rep(5, 0)) - expected_name_B <- c(c("ECL", "HSV-1", "KSHV", "VZV"), c("HSV-1", "KSHV", "VZV"), - c("KSHV", "VZV"), c("VZV")) + expected_name_B <- c( + c("ECL", "HSV-1", "KSHV", "VZV"), c("HSV-1", "KSHV", "VZV"), + c("KSHV", "VZV"), c("VZV") + ) expected_index_B <- c(c(2, 3, 4, 5), c(3, 4, 5), c(4, 5), c(5)) - expected <- as.data.frame(cbind(expected_name_A, expected_name_B, - expected_index_A, expected_index_B)) + expected <- as.data.frame(cbind( + expected_name_A, expected_name_B, + expected_index_A, expected_index_B + )) colnames(expected) <- c("name_a", "name_b", "index_a", "index_b") - + actual <- cross_comparison_spec(virusppi) - + matched_output <- function(actual, expected) { dims_match <- identical(dim(as.matrix(expected)), dim(as.matrix(actual))) data_matches <- identical(as.matrix(expected), as.matrix(actual)) headers_match <- identical(colnames(expected), colnames(actual)) return(dims_match && data_matches && headers_match) } - - # Check that actual output matches one of the two acceptable outputs at each + + # Check that actual output matches one of the two acceptable outputs at each # cell expect_true(matched_output(actual, expected)) }) @@ -107,35 +145,47 @@ context("ORCA interface: Orbit count wrapper") test_that("Single and zero node graphs are gracefully handled", { single_node_graph <- igraph::graph_from_adjacency_matrix(0, mode = "undirected") zero_node_graph <- igraph::delete.vertices(single_node_graph, 1) - names4 <- c("O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", - "O10", "O11", "O12", "O13", "O14") - names5 <- c(names4, c("O15", "O16", "O17", "O18", "O19", "O20", "O21", "O22", - "O23", "O24", "O25", "O26", "O27", "O28", "O29", - "O30", "O31", "O32", "O33", "O34", "O35", "O36", "O37", - "O38", "O39", "O40", "O41", "O42", "O43", "O44", "O45", - "O46", "O47", "O48", "O49", "O50", "O51", "O52", "O53", - "O54", "O55", "O56", "O57", "O58", "O59", "O60", "O61", - "O62", "O63", "O64", "O65", "O66", "O67", "O68", "O69", - "O70", "O71", "O72")) + names4 <- c( + "O0", "O1", "O2", "O3", "O4", "O5", "O6", "O7", "O8", "O9", + "O10", "O11", "O12", "O13", "O14" + ) + names5 <- c(names4, c( + "O15", "O16", "O17", "O18", "O19", "O20", "O21", "O22", + "O23", "O24", "O25", "O26", "O27", "O28", "O29", + "O30", "O31", "O32", "O33", "O34", "O35", "O36", "O37", + "O38", "O39", "O40", "O41", "O42", "O43", "O44", "O45", + "O46", "O47", "O48", "O49", "O50", "O51", "O52", "O53", + "O54", "O55", "O56", "O57", "O58", "O59", "O60", "O61", + "O62", "O63", "O64", "O65", "O66", "O67", "O68", "O69", + "O70", "O71", "O72" + )) expected_zero_node_counts4 <- matrix(0, nrow = 0, ncol = length(names4)) colnames(expected_zero_node_counts4) <- names4 expected_zero_node_counts5 <- matrix(0, nrow = 0, ncol = length(names5)) colnames(expected_zero_node_counts5) <- names5 - + expected_single_node_counts4 <- matrix(0, nrow = 1, ncol = length(names4)) colnames(expected_single_node_counts4) <- names4 expected_single_node_counts5 <- matrix(0, nrow = 1, ncol = length(names5)) colnames(expected_single_node_counts5) <- names5 - - expect_equal(expected_zero_node_counts4, - count_orbits_per_node(zero_node_graph, max_graphlet_size = 4)) - expect_equal(expected_zero_node_counts5, - count_orbits_per_node(zero_node_graph, max_graphlet_size = 5)) - - expect_equal(expected_single_node_counts4, - count_orbits_per_node(single_node_graph, max_graphlet_size = 4)) - expect_equal(expected_single_node_counts5, - count_orbits_per_node(single_node_graph, max_graphlet_size = 5)) + + expect_equal( + expected_zero_node_counts4, + count_orbits_per_node(zero_node_graph, max_graphlet_size = 4) + ) + expect_equal( + expected_zero_node_counts5, + count_orbits_per_node(zero_node_graph, max_graphlet_size = 5) + ) + + expect_equal( + expected_single_node_counts4, + count_orbits_per_node(single_node_graph, max_graphlet_size = 4) + ) + expect_equal( + expected_single_node_counts5, + count_orbits_per_node(single_node_graph, max_graphlet_size = 5) + ) }) context("ORCA interface: Simplify graph") @@ -153,14 +203,14 @@ test_that("simplify_graph works", { rownames(adj_mat) <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7") colnames(adj_mat) <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7") graph <- igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed") - + # Helper functions to amend adjacency matrix to generate simplified graphs remove_loops <- function(adj_mat) { diag(adj_mat) <- 0 return(adj_mat) } remove_multiples <- function(adj_mat) { - adj_mat[adj_mat>1] <- 1 + adj_mat[adj_mat > 1] <- 1 return(adj_mat) } remove_isolates <- function(adj_mat) { @@ -170,143 +220,191 @@ test_that("simplify_graph works", { adj_mat <- adj_mat[keep_nodes, keep_nodes] return(adj_mat) } - + # Check "do nothing" option works expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed")), + igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) # Check directed -> undirected works expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(adj_mat, mode = "plus")), + igraph::graph_from_adjacency_matrix(adj_mat, mode = "plus") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) - + # 1: Check DIRECTED simplifications # 1a. Loop removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) # 1b. Multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 1c. Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) # 1ab. Loop + multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 1ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) # 1bc. Multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) # 1abc. Loop + multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) - + # 2: Check UNDIRECTED simplifications individually # 2a. Loop removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "plus")), + igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "plus") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) - # 2b. Multiple edge removal (use mode = "max" to avoid generating multiple + # 2b. Multiple edge removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 2c. Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus")), + igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) - # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple + # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 2ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) - # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple + # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = TRUE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) - # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple + # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max") + ), igraph::as_adjacency_matrix(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE)) - ) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE + )) + ) }) context("GDD: test simplify graph") @@ -324,14 +422,14 @@ test_that("gdd simplifies works", { rownames(adj_mat) <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7") colnames(adj_mat) <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7") graph <- igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed") - + # Helper functions to amend adjacency matrix to generate simplified graphs remove_loops <- function(adj_mat) { diag(adj_mat) <- 0 return(adj_mat) } remove_multiples <- function(adj_mat) { - adj_mat[adj_mat>1] <- 1 + adj_mat[adj_mat > 1] <- 1 return(adj_mat) } remove_isolates <- function(adj_mat) { @@ -341,182 +439,229 @@ test_that("gdd simplifies works", { adj_mat <- adj_mat[keep_nodes, keep_nodes] return(adj_mat) } - + # Check "do nothing" option works - t1<-gdd(igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed")) - t2<-gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = FALSE)) - expect_equal(t1,t2) + t1 <- gdd(igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed")) + t2 <- gdd(simplify_graph( + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = FALSE + )) + expect_equal(t1, t2) # Check directed -> undirected works expect_equal( gdd( - igraph::graph_from_adjacency_matrix(adj_mat, mode = "plus")), + igraph::graph_from_adjacency_matrix(adj_mat, mode = "plus") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) - + # 1: Check DIRECTED simplifications # 1a. Loop removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) # 1b. Multiple edge removal expect_equal( -gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed")), + gdd( + igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 1c. Isolate edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) # 1ab. Loop + multiple edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 1ac. Loop + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) # 1bc. Multiple + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) # 1abc. Loop + multiple + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed") + ), gdd(simplify_graph( - graph, as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE)) + graph, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) - + # 2: Check UNDIRECTED simplifications individually # 2a. Loop removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "plus")), + igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "plus") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) - # 2b. Multiple edge removal (use mode = "max" to avoid generating multiple + # 2b. Multiple edge removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 2c. Isolate edge removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus")), + igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) - # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple + # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = FALSE)) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 2ac. Loop + isolate removal expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = TRUE)) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) - # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple + # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = TRUE)) + graph, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) - # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple + # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( gdd( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max")), + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max") + ), gdd(simplify_graph( - graph, as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE)) - ) + graph, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE + )) + ) }) context("Features to Histograms Test ") test_that("Features to Histograms Test", { - #basic test - c1<-matrix(c(1,2,3,4,5),nrow=5) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(1,2,3,4,5)) - expect_equal(res[[1]]$masses,c(1,1,1,1,1)) - #multiple - c1<-matrix(c(1,1,3,4,5),nrow=5) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(1,3,4,5)) - expect_equal(res[[1]]$masses,c(2,1,1,1)) - #non-integer - c1<-matrix(c(0.1,0.1,0.3,0.4,0.5),nrow=5) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(0.1,0.3,0.4,0.5)) - expect_equal(res[[1]]$masses,c(2,1,1,1)) - #Negative - c1<-matrix(c(0.1,-0.1,0.3,-0.4,0.5),nrow=5) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(-0.4,-0.1,0.1,0.3,0.5)) - expect_equal(res[[1]]$masses,c(1,1,1,1,1)) - #negative multiple - c1<-matrix(c(0.1,-0.1,0.3,-0.4,0.5,-0.4),nrow=6) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(-0.4,-0.1,0.1,0.3,0.5)) - expect_equal(res[[1]]$masses,c(2,1,1,1,1)) - #small (testing machine precision) - c1<-matrix(c(10^-8,10^-9,10^-2,10^3,10^-8,10^-10),nrow=6) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(10^-10,10^-9,10^-8,10^-2,10^3)) - expect_equal(res[[1]]$masses,c(1,1,2,1,1)) - #irrational - c1<-matrix(c(pi,sqrt(2),sqrt(2)/pi,sqrt(3),sqrt(2),sqrt(2)/pi),nrow=6) - res<-graph_features_to_histograms(c1) - expect_equal(res[[1]]$locations,c(sqrt(2)/pi,sqrt(2),sqrt(3),pi)) - expect_equal(res[[1]]$masses,c(2,2,1,1)) + # basic test + c1 <- matrix(c(1, 2, 3, 4, 5), nrow = 5) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(1, 2, 3, 4, 5)) + expect_equal(res[[1]]$masses, c(1, 1, 1, 1, 1)) + # multiple + c1 <- matrix(c(1, 1, 3, 4, 5), nrow = 5) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(1, 3, 4, 5)) + expect_equal(res[[1]]$masses, c(2, 1, 1, 1)) + # non-integer + c1 <- matrix(c(0.1, 0.1, 0.3, 0.4, 0.5), nrow = 5) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(0.1, 0.3, 0.4, 0.5)) + expect_equal(res[[1]]$masses, c(2, 1, 1, 1)) + # Negative + c1 <- matrix(c(0.1, -0.1, 0.3, -0.4, 0.5), nrow = 5) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(-0.4, -0.1, 0.1, 0.3, 0.5)) + expect_equal(res[[1]]$masses, c(1, 1, 1, 1, 1)) + # negative multiple + c1 <- matrix(c(0.1, -0.1, 0.3, -0.4, 0.5, -0.4), nrow = 6) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(-0.4, -0.1, 0.1, 0.3, 0.5)) + expect_equal(res[[1]]$masses, c(2, 1, 1, 1, 1)) + # small (testing machine precision) + c1 <- matrix(c(10^-8, 10^-9, 10^-2, 10^3, 10^-8, 10^-10), nrow = 6) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(10^-10, 10^-9, 10^-8, 10^-2, 10^3)) + expect_equal(res[[1]]$masses, c(1, 1, 2, 1, 1)) + # irrational + c1 <- matrix(c(pi, sqrt(2), sqrt(2) / pi, sqrt(3), sqrt(2), sqrt(2) / pi), nrow = 6) + res <- graph_features_to_histograms(c1) + expect_equal(res[[1]]$locations, c(sqrt(2) / pi, sqrt(2), sqrt(3), pi)) + expect_equal(res[[1]]$masses, c(2, 2, 1, 1)) }) @@ -538,20 +683,22 @@ test_that("read_simple_graph works", { graph <- igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed") # Save graph to temp directory path <- file.path(tempdir(), "read_simple_graph_test_input.txt") - format = "graphml" + format <- "graphml" igraph::write_graph(graph, path, format = format) # Sanity check round trip of graph to file and back check_graph <- igraph::read_graph(file = path, format = format) - expect_equal(igraph::as_adjacency_matrix(graph), - igraph::as_adjacency_matrix(check_graph)) - + expect_equal( + igraph::as_adjacency_matrix(graph), + igraph::as_adjacency_matrix(check_graph) + ) + # Helper functions to amend adjacency matrix to generate simplified graphs remove_loops <- function(adj_mat) { diag(adj_mat) <- 0 return(adj_mat) } remove_multiples <- function(adj_mat) { - adj_mat[adj_mat>1] <- 1 + adj_mat[adj_mat > 1] <- 1 return(adj_mat) } remove_isolates <- function(adj_mat) { @@ -561,142 +708,190 @@ test_that("read_simple_graph works", { adj_mat <- adj_mat[keep_nodes, keep_nodes] return(adj_mat) } - + # Check "do nothing" option works expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) # Check directed -> undirected works expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(adj_mat, mode = "plus")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(adj_mat, mode = "plus") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) - + # 1: Check DIRECTED simplifications # 1a. Loop removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = FALSE + )) ) # 1b. Multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 1c. Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = TRUE)) + igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) # 1ab. Loop + multiple edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 1ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = TRUE)) + igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = TRUE + )) ) # 1bc. Multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = TRUE)) + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) # 1abc. Loop + multiple + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = FALSE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE)) + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "directed") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = FALSE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) - + # 2: Check UNDIRECTED simplifications individually # 2a. Loop removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "plus")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = FALSE)) - ) - # 2b. Multiple edge removal (use mode = "max" to avoid generating multiple + igraph::graph_from_adjacency_matrix(remove_loops(adj_mat), mode = "plus") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = FALSE + )) + ) + # 2b. Multiple edge removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(remove_multiples(adj_mat), mode = "max") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 2c. Isolate edge removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = FALSE, remove_isolates = TRUE)) - ) - # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple + igraph::graph_from_adjacency_matrix(remove_isolates(adj_mat), mode = "plus") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = FALSE, remove_isolates = TRUE + )) + ) + # 2ab. Loop + multiple edge removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = FALSE)) + igraph::graph_from_adjacency_matrix(remove_multiples(remove_loops(adj_mat)), mode = "max") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = FALSE + )) ) # 2ac. Loop + isolate removal expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = FALSE, remove_isolates = TRUE)) - ) - # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple + igraph::graph_from_adjacency_matrix(remove_isolates(remove_loops(adj_mat)), mode = "plus") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = FALSE, remove_isolates = TRUE + )) + ) + # 2bc. Multiple + isolate removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = FALSE, - remove_multiple = TRUE, remove_isolates = TRUE)) - ) - # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(adj_mat)), mode = "max") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = FALSE, + remove_multiple = TRUE, remove_isolates = TRUE + )) + ) + # 2abc. Loop + multiple + isolate removal (use mode = "max" to avoid generating multiple # edges where nodes are mutually connected in adjacency matrix) expect_equal( igraph::as_adjacency_matrix( - igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max")), - igraph::as_adjacency_matrix(read_simple_graph(file = path, format = format, - as_undirected = TRUE, remove_loops = TRUE, - remove_multiple = TRUE, remove_isolates = TRUE)) + igraph::graph_from_adjacency_matrix(remove_isolates(remove_multiples(remove_loops(adj_mat))), mode = "max") + ), + igraph::as_adjacency_matrix(read_simple_graph( + file = path, format = format, + as_undirected = TRUE, remove_loops = TRUE, + remove_multiple = TRUE, remove_isolates = TRUE + )) ) }) @@ -715,20 +910,20 @@ test_that("read_simple_files works (all files in a directory)", { colnames(adj_mat) <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7") graph <- igraph::graph_from_adjacency_matrix(adj_mat, mode = "directed") # Save graphs to temp directory - format = "graphml" + format <- "graphml" base_dir <- tempdir() - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_1.txt"), format = format) - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_2.txt"), format = format) - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_3.txt"), format = format) - igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_4.txt"), format = format) - + igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_1.txt"), format = format) + igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_2.txt"), format = format) + igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_3.txt"), format = format) + igraph::write_graph(graph, file = file.path(base_dir, "oltw54387eNS_4.txt"), format = format) + # Helper functions to amend adjacency matrix to generate simplified graphs remove_loops <- function(adj_mat) { diag(adj_mat) <- 0 return(adj_mat) } remove_multiples <- function(adj_mat) { - adj_mat[adj_mat>1] <- 1 + adj_mat[adj_mat > 1] <- 1 return(adj_mat) } remove_isolates <- function(adj_mat) { @@ -738,24 +933,31 @@ test_that("read_simple_files works (all files in a directory)", { adj_mat <- adj_mat[keep_nodes, keep_nodes] return(adj_mat) } - + # No simplification graphs_actual <- read_simple_graphs( - base_dir, format = format, pattern = "oltw54387eNS*", as_undirected = FALSE, - remove_loops = FALSE, remove_multiple = FALSE, remove_isolates = FALSE) - purrr::walk(graphs_actual, ~expect_equal( - igraph::as_adjacency_matrix(.x), igraph::as_adjacency_matrix(graph))) - + base_dir, + format = format, pattern = "oltw54387eNS*", as_undirected = FALSE, + remove_loops = FALSE, remove_multiple = FALSE, remove_isolates = FALSE + ) + purrr::walk(graphs_actual, ~ expect_equal( + igraph::as_adjacency_matrix(.x), igraph::as_adjacency_matrix(graph) + )) + # Full ORCA compatible simplification graphs_actual <- read_simple_graphs( - base_dir, format = format, pattern = "oltw54387eNS*", as_undirected = TRUE, - remove_loops = TRUE, remove_multiple = TRUE, remove_isolates = TRUE) - purrr::walk(graphs_actual, ~expect_equal( + base_dir, + format = format, pattern = "oltw54387eNS*", as_undirected = TRUE, + remove_loops = TRUE, remove_multiple = TRUE, remove_isolates = TRUE + ) + purrr::walk(graphs_actual, ~ expect_equal( igraph::as_adjacency_matrix(.x), igraph::as_adjacency_matrix( igraph::as.undirected( igraph::graph_from_adjacency_matrix( - remove_isolates(remove_multiples(remove_loops(adj_mat)))) - )) + remove_isolates(remove_multiples(remove_loops(adj_mat))) + ) + ) + ) )) }) @@ -786,7 +988,7 @@ test_that("orbit_to_graphlet_counts summation works", { g16_indexes <- c(35:38) + 1 g17_indexes <- c(39:42) + 1 g18_indexes <- c(43:44) + 1 - g19_indexes <- c(45:48) +1 + g19_indexes <- c(45:48) + 1 g20_indexes <- c(49:50) + 1 g21_indexes <- c(51:53) + 1 g22_indexes <- c(54:55) + 1 @@ -829,21 +1031,25 @@ test_that("orbit_to_graphlet_counts summation works", { g28_counts <- rowSums(orbit_counts_5[, g28_indexes, drop = FALSE]) g29_counts <- rowSums(orbit_counts_5[, g29_indexes, drop = FALSE]) # Define expected graphlet count matrix for graphlets up to 5 nodes - expected_graphlet_counts_5 <- - cbind(g0_counts, g1_counts, g2_counts, g3_counts, g4_counts, g5_counts, - g6_counts, g7_counts, g8_counts, g9_counts, g10_counts, g11_counts, - g12_counts, g13_counts, g14_counts, g15_counts, g16_counts, - g17_counts, g18_counts, g19_counts, g20_counts, g21_counts, - g22_counts, g23_counts, g24_counts, g25_counts, g26_counts, - g27_counts, g28_counts, g29_counts) - colnames(expected_graphlet_counts_5) <- - c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9","G10", - "G11", "G12", "G13", "G14", "G15", "G16", "G17", "G18", "G19", - "G20", "G21", "G22", "G23", "G24", "G25", "G26", "G27", "G28", - "G29") + expected_graphlet_counts_5 <- + cbind( + g0_counts, g1_counts, g2_counts, g3_counts, g4_counts, g5_counts, + g6_counts, g7_counts, g8_counts, g9_counts, g10_counts, g11_counts, + g12_counts, g13_counts, g14_counts, g15_counts, g16_counts, + g17_counts, g18_counts, g19_counts, g20_counts, g21_counts, + g22_counts, g23_counts, g24_counts, g25_counts, g26_counts, + g27_counts, g28_counts, g29_counts + ) + colnames(expected_graphlet_counts_5) <- + c( + "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8", "G9", "G10", + "G11", "G12", "G13", "G14", "G15", "G16", "G17", "G18", "G19", + "G20", "G21", "G22", "G23", "G24", "G25", "G26", "G27", "G28", + "G29" + ) # Define epected graphlet count matrix for graphlets up to 4 nodes by selecting # a subset of the matrix for graphlets up to 5 nodes - expected_graphlet_counts_4 <- expected_graphlet_counts_5[,1:9] + expected_graphlet_counts_4 <- expected_graphlet_counts_5[, 1:9] # Calculate actual graphlet counts from functions under test actual_graphlet_counts_4 <- orbit_to_graphlet_counts(orbit_counts_4) actual_graphlet_counts_5 <- orbit_to_graphlet_counts(orbit_counts_5) @@ -854,197 +1060,199 @@ test_that("orbit_to_graphlet_counts summation works", { context("ORCA interface: Named ego networks") test_that("make_named_ego_graph labels each ego-network with the correct node name", { - # Helper function to sort edgelists in consistent order + # Helper function to sort edgelists in consistent order sort_edge_list <- function(edge_list) { - edge_list[order(edge_list[,1],edge_list[,2], decreasing = FALSE),] + edge_list[order(edge_list[, 1], edge_list[, 2], decreasing = FALSE), ] } # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - # The expectation below is based on igraph::graph_from_edgelist adding nodes + # The expectation below is based on igraph::graph_from_edgelist adding nodes # in the order they appear in the edge list, and igraph::V returning them # in this same order - expected_node_names <- c("n1","n2","n3","n4","n5","n6","n7","n8","n9","n10") - + expected_node_names <- c("n1", "n2", "n3", "n4", "n5", "n6", "n7", "n8", "n9", "n10") + # Expected edgelists for ego networks of order 1 expected_ego_elist_n1_o1 <- rbind( - c("n1","n2"), - c("n1","n4"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6") + c("n1", "n2"), + c("n1", "n4"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6") ) expected_ego_elist_n2_o1 <- rbind( - c("n1","n2"), - c("n1","n4"), - c("n2","n3"), - c("n2","n4"), - c("n2","n5") + c("n1", "n2"), + c("n1", "n4"), + c("n2", "n3"), + c("n2", "n4"), + c("n2", "n5") ) expected_ego_elist_n3_o1 <- rbind( - c("n2","n3") + c("n2", "n3") ) expected_ego_elist_n4_o1 <- rbind( - c("n1","n2"), - c("n1","n4"), - c("n1","n6"), - c("n2","n4"), - c("n4","n6") + c("n1", "n2"), + c("n1", "n4"), + c("n1", "n6"), + c("n2", "n4"), + c("n4", "n6") ) expected_ego_elist_n5_o1 <- rbind( - c("n2","n5") + c("n2", "n5") ) expected_ego_elist_n6_o1 <- rbind( - c("n1","n4"), - c("n1","n6"), - c("n4","n6"), - c("n6","n8") + c("n1", "n4"), + c("n1", "n6"), + c("n4", "n6"), + c("n6", "n8") ) expected_ego_elist_n7_o1 <- rbind( - c("n1","n7"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n7"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) expected_ego_elist_n8_o1 <- rbind( - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) expected_ego_elist_n9_o1 <- rbind( - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) expected_ego_elist_n10_o1 <- rbind( - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") - ) - + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + # Test ego-networks of order 1. # We compare edgelists as igraphs do not implement comparison order <- 1 min_ego_nodes <- 0 min_ego_edges <- 0 - + expected_ego_elists_o1 <- list( - n1 = dplyr::arrange(data.frame(expected_ego_elist_n1_o1), X1, X2), - n2 = dplyr::arrange(data.frame(expected_ego_elist_n2_o1), X1, X2), - n3 = dplyr::arrange(data.frame(expected_ego_elist_n3_o1), X1, X2), - n4 = dplyr::arrange(data.frame(expected_ego_elist_n4_o1), X1, X2), - n5 = dplyr::arrange(data.frame(expected_ego_elist_n5_o1), X1, X2), - n6 = dplyr::arrange(data.frame(expected_ego_elist_n6_o1), X1, X2), - n7 = dplyr::arrange(data.frame(expected_ego_elist_n7_o1), X1, X2), - n8 = dplyr::arrange(data.frame(expected_ego_elist_n8_o1), X1, X2), - n9 = dplyr::arrange(data.frame(expected_ego_elist_n9_o1), X1, X2), + n1 = dplyr::arrange(data.frame(expected_ego_elist_n1_o1), X1, X2), + n2 = dplyr::arrange(data.frame(expected_ego_elist_n2_o1), X1, X2), + n3 = dplyr::arrange(data.frame(expected_ego_elist_n3_o1), X1, X2), + n4 = dplyr::arrange(data.frame(expected_ego_elist_n4_o1), X1, X2), + n5 = dplyr::arrange(data.frame(expected_ego_elist_n5_o1), X1, X2), + n6 = dplyr::arrange(data.frame(expected_ego_elist_n6_o1), X1, X2), + n7 = dplyr::arrange(data.frame(expected_ego_elist_n7_o1), X1, X2), + n8 = dplyr::arrange(data.frame(expected_ego_elist_n8_o1), X1, X2), + n9 = dplyr::arrange(data.frame(expected_ego_elist_n9_o1), X1, X2), n10 = dplyr::arrange(data.frame(expected_ego_elist_n10_o1), X1, X2) ) # Generate actual ego-networks and convert to edge lists for comparison - actual_ego_elists_o1 <- - purrr::map(make_named_ego_graph(graph, order, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges), - function(g) { - dplyr::arrange(data.frame(igraph::as_edgelist(g)), X1, X2) - } - ) + actual_ego_elists_o1 <- + purrr::map( + make_named_ego_graph(graph, order, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ), + function(g) { + dplyr::arrange(data.frame(igraph::as_edgelist(g)), X1, X2) + } + ) expect_equal(actual_ego_elists_o1, expected_ego_elists_o1) }) context("ORCA interface: Graphlet counts") test_that("count_graphlets_for_graph works", { - # Set up a small sample network with at least that contains at least one of + # Set up a small sample network with at least that contains at least one of # each graphlet elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - + # Setgraphlet labels to use for names in expected counts graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - + # Manually verified graphlet counts - expected_counts <- c(15, 18,6, 21,3,1, 11, 1, 1) + expected_counts <- c(15, 18, 6, 21, 3, 1, 11, 1, 1) names(expected_counts) <- graphlet_labels - + # Test actual_counts <- count_graphlets_for_graph(graph, max_graphlet_size = 4) expect_equal(expected_counts, actual_counts) }) context("ORCA interface: Ego-network graphlet counts") -test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manually verified totals for test graph",{ +test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manually verified totals for test graph", { # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets elist <- rbind( - c("n1","n2"), - c("n2","n3"), - c("n1","n4"), - c("n2","n5"), - c("n1","n6"), - c("n1","n7"), - c("n2","n4"), - c("n4","n6"), - c("n6","n8"), - c("n7","n8"), - c("n7","n9"), - c("n7","n10"), - c("n8","n9"), - c("n8","n10"), - c("n9","n10") + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") ) graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - + # Set node and graphlet labels to use for row and col names in expected counts node_labels <- igraph::V(graph)$name graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - + max_graphlet_size <- 4 graphlet_key <- graphlet_key(max_graphlet_size) k <- graphlet_key$node_count @@ -1057,7 +1265,7 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall c(5, 2, 2, 0, 0, 0, 0, 1, 0), c(1, 0, 0, 0, 0, 0, 0, 0, 0), c(4, 2, 1, 0, 0, 0, 1, 0, 0), - c(7, 3, 4, 0, 0, 0, 3, 0, 1), + c(7, 3, 4, 0, 0, 0, 3, 0, 1), c(7, 3, 4, 0, 0, 0, 3, 0, 1), c(6, 0, 4, 0, 0, 0, 0, 0, 1), c(6, 0, 4, 0, 0, 0, 0, 0, 1) @@ -1067,94 +1275,120 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # 2-step ego networks expected_counts_order_2 <- rbind( c(15, 18, 6, 21, 3, 1, 11, 1, 1), - c( 8, 10, 2, 6, 3, 0, 4, 1, 0), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(10, 14, 2, 11, 3, 1, 5, 1, 0), - c( 5, 5, 1, 0, 2, 0, 2, 0, 0), - c(13, 13, 6, 15, 1, 1, 9, 1, 1), - c(13, 13, 6, 15, 1, 1, 9, 1, 1), - c(11, 10, 5, 10 ,0 ,1, 8, 0, 1), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1), - c( 9, 8, 4, 4, 0, 1, 6, 0, 1) + c(8, 10, 2, 6, 3, 0, 4, 1, 0), + c(5, 5, 1, 0, 2, 0, 2, 0, 0), + c(10, 14, 2, 11, 3, 1, 5, 1, 0), + c(5, 5, 1, 0, 2, 0, 2, 0, 0), + c(13, 13, 6, 15, 1, 1, 9, 1, 1), + c(13, 13, 6, 15, 1, 1, 9, 1, 1), + c(11, 10, 5, 10, 0, 1, 8, 0, 1), + c(9, 8, 4, 4, 0, 1, 6, 0, 1), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) ) rownames(expected_counts_order_2) <- node_labels colnames(expected_counts_order_2) <- graphlet_labels - + # Count graphlets in each ego network of the graph with only counts requested min_ego_nodes <- 0 min_ego_edges <- 0 - - actual_counts_order_1 <- - count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - actual_counts_order_2 <- - count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + + actual_counts_order_1 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + actual_counts_order_2 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) # Test that actual counts match expected with only counts requested (default) expect_equal(actual_counts_order_1, expected_counts_order_1) expect_equal(actual_counts_order_2, expected_counts_order_2) - + # Test that actual and returned ego networks match expected # 1. Define expected - expected_ego_networks_order_1 <- make_named_ego_graph(graph, order = 1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - expected_ego_networks_order_2 <- make_named_ego_graph(graph, order = 2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + expected_ego_networks_order_1 <- make_named_ego_graph(graph, + order = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + expected_ego_networks_order_2 <- make_named_ego_graph(graph, + order = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) expected_counts_with_networks_order_1 <- - list(graphlet_counts = expected_counts_order_1, - ego_networks = expected_ego_networks_order_1) - expected_counts_with_networks_order_2 <- - list(graphlet_counts = expected_counts_order_2, - ego_networks = expected_ego_networks_order_2) + list( + graphlet_counts = expected_counts_order_1, + ego_networks = expected_ego_networks_order_1 + ) + expected_counts_with_networks_order_2 <- + list( + graphlet_counts = expected_counts_order_2, + ego_networks = expected_ego_networks_order_2 + ) # 2. Calculate actual - actual_counts_with_networks_order_1 <- - count_graphlets_ego(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - return_ego_networks = TRUE) - actual_counts_with_networks_order_2 <- - count_graphlets_ego(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - return_ego_networks = TRUE) + actual_counts_with_networks_order_1 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE + ) + actual_counts_with_networks_order_2 <- + count_graphlets_ego(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = TRUE + ) # Test that actual counts match expected with ego-networks requested expect_equal(actual_counts_with_networks_order_1$graphlet_counts, expected_counts_order_1) expect_equal(actual_counts_with_networks_order_2$graphlet_counts, expected_counts_order_2) - + # 3. Compare - # Comparison is not implemented for igraph objects, so convert all igraphs to + # Comparison is not implemented for igraph objects, so convert all igraphs to # indexed edge list and then compare. Do in-situ replacement of igraphs with # indexed edge lists to ensure we are checking full properties of returned # objects (i.e. named lists with matching elements). # 3a. Convert expected and actual ego networks from igraphs to indexed edges - expected_counts_with_networks_order_1$ego_networks <- - purrr::map(expected_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges) - expected_counts_with_networks_order_2$ego_networks <- - purrr::map(expected_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges) - actual_counts_with_networks_order_1$ego_networks <- - purrr::map(actual_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges) - actual_counts_with_networks_order_2$ego_networks <- - purrr::map(actual_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges) + expected_counts_with_networks_order_1$ego_networks <- + purrr::map( + expected_counts_with_networks_order_1$ego_networks, + graph_to_indexed_edges + ) + expected_counts_with_networks_order_2$ego_networks <- + purrr::map( + expected_counts_with_networks_order_2$ego_networks, + graph_to_indexed_edges + ) + actual_counts_with_networks_order_1$ego_networks <- + purrr::map( + actual_counts_with_networks_order_1$ego_networks, + graph_to_indexed_edges + ) + actual_counts_with_networks_order_2$ego_networks <- + purrr::map( + actual_counts_with_networks_order_2$ego_networks, + graph_to_indexed_edges + ) # 3b. Do comparison - expect_equal(actual_counts_with_networks_order_1, - expected_counts_with_networks_order_1) - expect_equal(actual_counts_with_networks_order_2, - expected_counts_with_networks_order_2) + expect_equal( + actual_counts_with_networks_order_1, + expected_counts_with_networks_order_1 + ) + expect_equal( + actual_counts_with_networks_order_2, + expected_counts_with_networks_order_2 + ) }) # context("ORCA interface: Graphlet-based degree distributions") @@ -1181,7 +1415,7 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # gdd_orbit_default_actual <- gdd(graph, feature_type = "orbit") # gdd_graphlet_default_actual <- gdd(graph, feature_type = "graphlet") # gdd_default_default_actual <- gdd(graph) -# # Compare actual gdd with expected gdd +# # Compare actual gdd with expected gdd # expect_equal(gdd_orbit_4_actual, gdd_orbit_4_expected) # expect_equal(gdd_orbit_5_actual, gdd_orbit_5_expected) # expect_equal(gdd_graphlet_4_actual, gdd_graphlet_4_expected) @@ -1191,7 +1425,7 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # expect_equal(gdd_orbit_default_actual, gdd_orbit_4_expected) # expect_equal(gdd_graphlet_default_actual, gdd_graphlet_4_expected) # expect_equal(gdd_default_default_actual, gdd_orbit_4_expected) -# +# # # Check gdd throws error for invalid feature type # expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4)) # expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5)) @@ -1202,9 +1436,9 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2)) # expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3)) # expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6)) -# +# # }) -# +# # context("ORCA interface: Ego-network graphlet outputs for manually verified networks") # test_that("Ego-network 4-node graphlet counts match manually verified totals # and gdd gives expected discrete histograms",{ @@ -1228,20 +1462,20 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # c("n9","n10") # ) # graph <- igraph::graph_from_edgelist(elist, directed = FALSE) -# +# # # Set node and graphlet labels to use for row and col names in expected counts # node_labels <- igraph::V(graph)$name # graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") -# +# # # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 # max_graphlet_size <- 4 -# actual_counts_order_1 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, +# actual_counts_order_1 <- +# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, # neighbourhood_size = 1) -# actual_counts_order_2 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, +# actual_counts_order_2 <- +# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, # neighbourhood_size = 2) -# +# # # Set manually verified ego-network graphlet counts # # 1-step ego networks # expected_counts_order_1 <- rbind( @@ -1273,55 +1507,55 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # ) # rownames(expected_counts_order_2) <- node_labels # colnames(expected_counts_order_2) <- graphlet_labels -# +# # # Test that actual counts match expected with only counts requested (default) # expect_equal(actual_counts_order_1, expected_counts_order_1) # expect_equal(actual_counts_order_2, expected_counts_order_2) -# +# # # Test that actual counts and returned ego networks match expected # # 1. Define expected # expected_ego_networks_order_1 <- make_named_ego_graph(graph, order = 1) # expected_ego_networks_order_2 <- make_named_ego_graph(graph, order = 2) -# expected_counts_with_networks_order_1 <- +# expected_counts_with_networks_order_1 <- # list(graphlet_counts = expected_counts_order_1, # ego_networks = expected_ego_networks_order_1) -# expected_counts_with_networks_order_2 <- +# expected_counts_with_networks_order_2 <- # list(graphlet_counts = expected_counts_order_2, # ego_networks = expected_ego_networks_order_2) # # 2. Calculate actual -# actual_counts_with_networks_order_1 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, +# actual_counts_with_networks_order_1 <- +# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, # neighbourhood_size = 1, return_ego_networks = TRUE) -# actual_counts_with_networks_order_2 <- -# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, +# actual_counts_with_networks_order_2 <- +# count_graphlets_ego(graph, max_graphlet_size = max_graphlet_size, # neighbourhood_size = 2, return_ego_networks = TRUE) # # 3. Compare -# # Comparison is not implemented for igraph objects, so convert all igraphs to +# # Comparison is not implemented for igraph objects, so convert all igraphs to # # indexed edge list and then compare. Do in-situ replacement of igraphs with # # indexed edge lists to ensure we are checking full properties of returned # # objects (i.e. named lists with matching elements). # # 3a. Convert expected and actual ego networks from igraphs to indexed edges -# expected_counts_with_networks_order_1$ego_networks <- -# purrr::map(expected_counts_with_networks_order_1$ego_networks, +# expected_counts_with_networks_order_1$ego_networks <- +# purrr::map(expected_counts_with_networks_order_1$ego_networks, # graph_to_indexed_edges) -# expected_counts_with_networks_order_2$ego_networks <- -# purrr::map(expected_counts_with_networks_order_2$ego_networks, +# expected_counts_with_networks_order_2$ego_networks <- +# purrr::map(expected_counts_with_networks_order_2$ego_networks, # graph_to_indexed_edges) -# actual_counts_with_networks_order_1$ego_networks <- -# purrr::map(actual_counts_with_networks_order_1$ego_networks, +# actual_counts_with_networks_order_1$ego_networks <- +# purrr::map(actual_counts_with_networks_order_1$ego_networks, # graph_to_indexed_edges) -# actual_counts_with_networks_order_2$ego_networks <- -# purrr::map(actual_counts_with_networks_order_2$ego_networks, +# actual_counts_with_networks_order_2$ego_networks <- +# purrr::map(actual_counts_with_networks_order_2$ego_networks, # graph_to_indexed_edges) # # 3b. Do comparison -# expect_equal(actual_counts_with_networks_order_1, +# expect_equal(actual_counts_with_networks_order_1, # expected_counts_with_networks_order_1) -# expect_equal(actual_counts_with_networks_order_2, +# expect_equal(actual_counts_with_networks_order_2, # expected_counts_with_networks_order_2) -# +# # # Test that gdd method gives the expected graphlet degree distributions # # 1-step ego-networks -# actual_gdd_order_1 <- gdd(graph, feature_type = "graphlet", +# actual_gdd_order_1 <- gdd(graph, feature_type = "graphlet", # max_graphlet_size = 4, ego_neighbourhood_size = 1) # expected_gdd_order_1 <- list( # G0 = dhist(locations = c(1, 4, 5, 6, 7), masses = c(2, 1, 2, 3, 2)), @@ -1336,7 +1570,7 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # ) # expect_equal(actual_gdd_order_1, expected_gdd_order_1) # # 2-step ego-networks -# actual_gdd_order_2 <- gdd(graph, feature_type = "graphlet", +# actual_gdd_order_2 <- gdd(graph, feature_type = "graphlet", # max_graphlet_size = 4, ego_neighbourhood_size = 2) # expected_gdd_order_2 <- list( # G0 = dhist(locations = c(5, 8, 9, 10, 11, 13, 15), masses = c(2, 1, 2, 1, 1, 2, 1)), @@ -1350,46 +1584,46 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # G8 = dhist(locations = c(0, 1), masses = c(4, 6)) # ) # expect_equal(actual_gdd_order_2, expected_gdd_order_2) -# +# # # Check gdd throws error for invalid feature type -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4, +# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4, # ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4, +# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 4, # ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5, +# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5, # ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5, +# expect_error(gdd(graph, feature_type = "foo", max_graphlet_size = 5, # ego_neighbourhood_size = 1)) # # We don't support orbit feature type for ego networks (i.e. neighbourhood > 0) -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 4, +# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 4, # ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 5, +# expect_error(gdd(graph, feature_type = "orbit", max_graphlet_size = 5, # ego_neighbourhood_size = 1)) # # Check gdd throws error for invalid maximum graphlet size -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2, +# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2, # ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2, +# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 2, # ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3, +# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3, # ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3, +# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 3, # ego_neighbourhood_size = 1)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6, +# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6, # ego_neighbourhood_size = 0)) -# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6, +# expect_error(gdd(graph, feature_type = "graphlet", max_graphlet_size = 6, # ego_neighbourhood_size = 1)) # }) -# +# # context("ORCA interface: GDD for all graphs in a directory") # test_that("gdd_for_all_graphs works", { # # Set source directory and file properties for Virus PPI graph edge files # source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # edge_format = "ncol" # file_pattern = ".txt" -# +# # # Set number of threads to use at once for parallel processing. # num_threads = getOption("mc.cores", 2L) -# +# # # Use previously tested gdd code to calculate expected gdds # expected_gdd_fn <- function(feature_type, max_graphlet_size, ego_neighbourhood_size) { # gdds <- list( @@ -1402,12 +1636,12 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # names(gdds) <- c("EBV", "ECL", "HSV-1", "KSHV", "VZV") # gdds # } -# +# # # Use code under test to generate actual gdds # actual_gdd_fn <- function (feature_type, max_graphlet_size, ego_neighbourhood_size) { -# gdd_for_all_graphs(source_dir = source_dir, format = edge_format, -# pattern = file_pattern, feature_type = feature_type, -# max_graphlet_size = max_graphlet_size, +# gdd_for_all_graphs(source_dir = source_dir, format = edge_format, +# pattern = file_pattern, feature_type = feature_type, +# max_graphlet_size = max_graphlet_size, # ego_neighbourhood_size = ego_neighbourhood_size, # mc.cores = num_threads) # } From 33ccb41fa489c467ea8d34f4ac4c23c528c4f904 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 30 Jul 2019 16:49:09 +0100 Subject: [PATCH 021/188] added function taking 2 graphs, output netdis statistic --- NAMESPACE | 1 + R/measures_net_dis.R | 106 +++++++++++- R/orca_interface.R | 18 +- man/adaptive_breaks.Rd | 12 +- man/area_between_dhist_ecmfs.Rd | 8 +- man/area_between_offset_ecmfs.Rd | 6 +- man/as_smoothed_dhist.Rd | 4 +- man/as_unsmoothed_dhist.Rd | 2 +- man/binned_densities_adaptive.Rd | 4 +- man/cost_matrix.Rd | 2 +- man/count_graphlets_ego.Rd | 16 +- man/count_graphlets_ego_scaled.Rd | 32 ++-- man/count_graphlets_for_graph.Rd | 6 +- man/count_graphlets_per_node.Rd | 8 +- man/count_orbits_per_node.Rd | 4 +- man/cross_comp_to_matrix.Rd | 2 +- man/cross_comparison_spec.Rd | 4 +- man/dhist.Rd | 8 +- man/dhist_ecmf.Rd | 6 +- man/dhist_std.Rd | 4 +- man/dhist_variance.Rd | 4 +- man/ecmf_knots.Rd | 8 +- man/ego_to_graphlet_counts.Rd | 8 +- man/emd_cs.Rd | 2 +- man/emd_lp.Rd | 2 +- man/gdd.Rd | 2 +- man/gdd_for_all_graphs.Rd | 12 +- man/graph_features_to_histograms.Rd | 17 +- man/graph_to_indexed_edges.Rd | 2 +- man/graphlet_key.Rd | 2 +- man/harmonise_dhist_locations.Rd | 2 +- man/indexed_edges_to_graph.Rd | 4 +- man/interval_index.Rd | 6 +- man/is_dhist.Rd | 8 +- man/make_named_ego_graph.Rd | 6 +- man/mean_centre_dhist.Rd | 2 +- man/mean_density_binned_graphlet_counts.Rd | 4 +- man/min_emd.Rd | 12 +- man/min_emd_exhaustive.Rd | 14 +- man/min_emd_optimise.Rd | 4 +- man/min_emd_optimise_fast.Rd | 4 +- man/net_emd.Rd | 20 +-- man/net_emds_for_all_graphs.Rd | 30 ++-- man/netdis.Rd | 2 +- man/netdis_centred_graphlet_counts.Rd | 16 +- man/netdis_expected_graphlet_counts.Rd | 6 +- man/netdis_expected_graphlet_counts_ego.Rd | 4 +- man/netdis_expected_graphlet_counts_ego_fn.Rd | 31 ++-- ...netdis_expected_graphlet_counts_per_ego.Rd | 4 +- man/netdis_for_all_graphs.Rd | 4 +- man/netdis_one_to_one.Rd | 38 +++++ man/netdis_uptok.Rd | 8 +- man/normalise_dhist_mass.Rd | 2 +- man/normalise_dhist_variance.Rd | 2 +- man/orbit_key.Rd | 2 +- man/orbit_to_graphlet_counts.Rd | 2 +- man/read_simple_graph.Rd | 10 +- man/read_simple_graphs.Rd | 12 +- man/scale_graphlet_counts_ego.Rd | 4 +- man/shift_dhist.Rd | 2 +- man/shift_to_next_alignment.Rd | 4 +- man/simplify_graph.Rd | 8 +- man/sort_dhist.Rd | 4 +- man/virusppi.Rd | 2 +- vignettes/quickstart_netdis_2graphs.R | 23 ++- vignettes/quickstart_netdis_2graphs.html | 158 ++++++++++-------- vignettes/quickstart_netdis_functions.Rmd | 120 +++++++++++++ 67 files changed, 597 insertions(+), 299 deletions(-) create mode 100644 man/netdis_one_to_one.Rd create mode 100644 vignettes/quickstart_netdis_functions.Rmd diff --git a/NAMESPACE b/NAMESPACE index ee580a7d..5c7bcef3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ export(netdis_expected_graphlet_counts_ego) export(netdis_expected_graphlet_counts_ego_fn) export(netdis_expected_graphlet_counts_per_ego) export(netdis_for_all_graphs) +export(netdis_one_to_one) export(netdis_uptok) export(normalise_dhist_mass) export(normalise_dhist_variance) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index c0806146..7668a777 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -1,3 +1,105 @@ +#' Netdis between two graphs +#' @param graph_1 First query graph +#' @param graph_2 Second query graph +#' @param ref_graph Reference graph +#' @param max_graphlet_size Generate graphlets up to this size +#' @param neighbourhood_size Ego network neighbourhood size +#' @param min_ego_nodes Filter ego networks which have fewer +#' than min_ego_nodes nodes +#' @param min_ego_edges Filter ego networks which have fewer +#' than min_ego_edges edges +#' @param min_bin_count Minimum number of ego networks in each density bin +#' @param num_bins Number of density bins to generate +#' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes +#' up to and including max_graphlet_size +#' @export +netdis_one_to_one <- function(graph_1, graph_2, + ref_graph, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100) { + ## ------------------------------------------------------------------------ + # Get ego networks for query graphs and reference graph + ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + + ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + + ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + + ## ------------------------------------------------------------------------ + # Count graphlets for ego networks in query and reference graphs + graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) + graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) + + graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) + + ## ------------------------------------------------------------------------ + # Scale ego-network graphlet counts by dividing by total number of k-tuples in + # ego-network (where k is graphlet size) + scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(ego_ref, + graphlet_counts_ref, + max_graphlet_size) + + # Get ego-network densities + densities_ref <- ego_network_density(ego_ref) + + # Adaptively bin ref ego-network densities + binned_densities <- binned_densities_adaptive(densities_ref, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + + ref_ego_density_bins <- binned_densities$breaks + + # Average ref graphlet counts across density bins + ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + scaled_graphlet_counts_ref, + binned_densities$interval_indexes) + + + ## ------------------------------------------------------------------------ + # Calculate expected graphlet counts (using ref graph ego network density bins) + exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, + max_graphlet_size, + ref_ego_density_bins, + ref_binned_graphlet_counts) + + + exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, + max_graphlet_size, + ref_ego_density_bins, + ref_binned_graphlet_counts) + + # Centre graphlet counts by subtracting expected counts + centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 + + centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 + + ## ------------------------------------------------------------------------ + # Sum centred graphlet counts across all ego networks + sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + + sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) + + ## ------------------------------------------------------------------------ + # Calculate netdis statistics + netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +} + #' Netdis between all graph pairs using provided Centred Graphlet Counts #' @param centred_graphlet_counts List containing Centred Graphlet Counts for #' all graphs being compared @@ -80,8 +182,8 @@ netdis_uptok <- function(centred_graphlet_counts1, centred_graphlet_counts2, netdis_statistics <- purrr::map(3:max_graphlet_size, netdis, - centred_graphlet_counts1 = sum_graphlet_counts_1, - centred_graphlet_counts2 = sum_graphlet_counts_2 + centred_graphlet_counts1 = centred_graphlet_counts1, + centred_graphlet_counts2 = centred_graphlet_counts2 ) netdis_statistics <- simplify2array(netdis_statistics) diff --git a/R/orca_interface.R b/R/orca_interface.R index 9293fdd5..47ebe7ed 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -627,17 +627,29 @@ gdd_for_all_graphs <- function( #' Generate a cross-comparison specification #' -#' Creates a cross-comparison matrix with all possible pair-wise combinations +#' Creates a cross-comparison matrix with pair-wise combinations #' of elements from the provided list. #' @param named_list A named list of items for which an exhaustive pair-wise #' cross-comparison is required. +#' @param how How to generate pair-wise combinations. Either "many-to-many" +#' (default) which generates all possible pair-wise combinations, or +#' "one-to-many" which generates all combinations between the first element +#' in named_list and the rest of the elements only. #' @return A matrix with one row for each possible pair-wise combination #' of elements from the provided named list. The first and second columns #' contain the names of the elements in the pair and the third and fourth #' columns contain the indexes of these elements in the provided list. #' @export -cross_comparison_spec <- function(named_list) { - indexes <- as.data.frame(t(utils::combn(1:length(named_list), 2))) +cross_comparison_spec <- function(named_list, how = "many-to-many") { + if (how == "one-to-many") { + indexes <- data.frame( + rep(1, length(named_list)-1), + 2:length(named_list) + ) + } else { + indexes <- as.data.frame(t(utils::combn(1:length(named_list), 2))) + } + names <- as.data.frame(cbind( names(named_list)[indexes[, 1]], names(named_list)[indexes[, 2]] diff --git a/man/adaptive_breaks.Rd b/man/adaptive_breaks.Rd index 497cfc31..d94b0e74 100644 --- a/man/adaptive_breaks.Rd +++ b/man/adaptive_breaks.Rd @@ -13,15 +13,15 @@ adaptive_breaks(x, min_count, breaks) \item{min_count}{The minimum count for each bin} \item{breaks}{Either a vector containing an intital set of breaks or a single -number indicating how many uniformly spaced intervals to use when constructing -the initial set of breaks. If a single number is provided, the minumum break -will be the minimum value of x and the maximum break will be the maximum -value of x.} +number indicating how many uniformly spaced intervals to use when +constructing the initial set of breaks. If a single number is provided, the +minumum break will be the minimum value of x and the maximum break will be +the maximum value of x.} } \description{ Starts by binning the variable by the breaks provided in \code{breaks} (if \code{breaks} is a vector), or generating a set of \code{breaks} at uniformly -spaced intervals (if \code{breaks} is a single number). It then iteratively -merges intervals with counts lower than \code{min_count} by removing breaks +spaced intervals (if \code{breaks} is a single number). It then iteratively +merges intervals with counts lower than \code{min_count} by removing breaks until all remaining intervals have counts of at least \code{min_count}. } diff --git a/man/area_between_dhist_ecmfs.Rd b/man/area_between_dhist_ecmfs.Rd index d5320d15..f123ec5b 100644 --- a/man/area_between_dhist_ecmfs.Rd +++ b/man/area_between_dhist_ecmfs.Rd @@ -2,16 +2,16 @@ % Please edit documentation in R/dhist.R \name{area_between_dhist_ecmfs} \alias{area_between_dhist_ecmfs} -\title{Calculate area between two discrete histogram empirical cumulative +\title{Calculate area between two discrete histogram empirical cumulative mass functions (ECMFs)} \usage{ area_between_dhist_ecmfs(dhist_ecmf1, dhist_ecmf2) } \arguments{ -\item{dhist_ecmf1}{An object of class \code{dhist_ecmf}, returned from a call +\item{dhist_ecmf1}{An object of class \code{dhist_ecmf}, returned from a call to the \code{dhist_ecmf} function} -\item{dhist_ecmf2}{An object of class \code{dhist_ecmf}, returned from a call +\item{dhist_ecmf2}{An object of class \code{dhist_ecmf}, returned from a call to the \code{dhist_ecmf} function} } \value{ @@ -19,6 +19,6 @@ area The area between the two discrete histogram ECMFs, calculated as the integral of the absolute difference between the two ECMFs } \description{ -Calculate area between two discrete histogram empirical cumulative +Calculate area between two discrete histogram empirical cumulative mass functions (ECMFs) } diff --git a/man/area_between_offset_ecmfs.Rd b/man/area_between_offset_ecmfs.Rd index 1c55beb7..9dc8149d 100644 --- a/man/area_between_offset_ecmfs.Rd +++ b/man/area_between_offset_ecmfs.Rd @@ -7,17 +7,17 @@ area_between_offset_ecmfs(ecmf1, ecmf2, offset) } \arguments{ -\item{ecmf1}{An Empirical Cululative Mass Function (ECMF) object of class +\item{ecmf1}{An Empirical Cululative Mass Function (ECMF) object of class \code{dhist_ecmf}} -\item{ecmf2}{An Empirical Cululative Mass Function (ECMF) object of class +\item{ecmf2}{An Empirical Cululative Mass Function (ECMF) object of class \code{dhist_ecmf}} \item{offset}{An offset to add to all locations of the first ECMF. Postive offsets will shift the ECMF to the right and negative ones to the left.} } \value{ -area The area between the two ECMFs, calculated as the integral of +area The area between the two ECMFs, calculated as the integral of the absolute difference between the two ECMFs } \description{ diff --git a/man/as_smoothed_dhist.Rd b/man/as_smoothed_dhist.Rd index 935b1043..1dfc0c14 100644 --- a/man/as_smoothed_dhist.Rd +++ b/man/as_smoothed_dhist.Rd @@ -18,7 +18,7 @@ A copy of a \code{dhist} object with its \code{smoothing_window_width} attribute set to the value provided \code{smoothing_window_width} parameter. } \description{ -Returns a "smoothed" copy of a \code{dhist} object with its -\code{smoothing_window_width} attribute set to the value provided +Returns a "smoothed" copy of a \code{dhist} object with its +\code{smoothing_window_width} attribute set to the value provided \code{smoothing_window_width} parameter. } diff --git a/man/as_unsmoothed_dhist.Rd b/man/as_unsmoothed_dhist.Rd index 0ce52261..d4304810 100644 --- a/man/as_unsmoothed_dhist.Rd +++ b/man/as_unsmoothed_dhist.Rd @@ -14,6 +14,6 @@ A copy of a \code{dhist} object with its \code{smoothing_window_width} attribute set to 0. } \description{ -Returns an "unsmoothed" copy of a \code{dhist} object with its +Returns an "unsmoothed" copy of a \code{dhist} object with its \code{smoothing_window_width} attribute set to 0. } diff --git a/man/binned_densities_adaptive.Rd b/man/binned_densities_adaptive.Rd index 5a1e380f..da9bdcc8 100644 --- a/man/binned_densities_adaptive.Rd +++ b/man/binned_densities_adaptive.Rd @@ -8,9 +8,9 @@ binned_densities_adaptive(densities, min_counts_per_interval, num_intervals) } \description{ -Used by \code{netdis_expected_graphlet_counts_ego_fn} to +Used by \code{netdis_expected_graphlet_counts_ego_fn} to generate a function for calculating expected ego-network graphlet counts from the statistics of a provided reference graph. -Temporarily accessible during development. +Temporarily accessible during development. TODO: Remove @export prior to publishing } diff --git a/man/cost_matrix.Rd b/man/cost_matrix.Rd index 5ae538e0..10d28f75 100755 --- a/man/cost_matrix.Rd +++ b/man/cost_matrix.Rd @@ -15,6 +15,6 @@ cost_matrix(bin_centres1, bin_centres2) Cost matrix } \description{ -Generates a matrix for the cost of moving a unit of mass between each bin in +Generates a matrix for the cost of moving a unit of mass between each bin in histogram 1 and each bin in histogram 2. } diff --git a/man/count_graphlets_ego.Rd b/man/count_graphlets_ego.Rd index 55702f61..edd489db 100644 --- a/man/count_graphlets_ego.Rd +++ b/man/count_graphlets_ego.Rd @@ -10,30 +10,30 @@ count_graphlets_ego(graph, max_graphlet_size = 4, neighbourhood_size, \arguments{ \item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} \item{neighbourhood_size}{The number of steps from the source node to include nodes for each ego-network.} -\item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} +\item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} nodes are returned.} -\item{min_ego_edges}{Only ego networks with at least \code{min_ego_edges} +\item{min_ego_edges}{Only ego networks with at least \code{min_ego_edges} edges are returned.} -\item{return_ego_networks}{If \code{TRUE}, return ego-networks alongside +\item{return_ego_networks}{If \code{TRUE}, return ego-networks alongside graphlet counts to enable further processing.} } \value{ -If \code{return_ego_networks = FALSE}, returns an RxC matrix -containing counts of each graphlet (columns, C) for each ego-network in the -input graph (rows, R). Columns are labelled with graphlet IDs and rows are +If \code{return_ego_networks = FALSE}, returns an RxC matrix +containing counts of each graphlet (columns, C) for each ego-network in the +input graph (rows, R). Columns are labelled with graphlet IDs and rows are labelled with the ID of the central node in each ego-network (if nodes in the input graph are labelled). If \code{return_ego_networks = TRUE}, returns a list with the following elements: \itemize{ - \item \code{graphlet_counts}: A matrix containing graphlet counts for each + \item \code{graphlet_counts}: A matrix containing graphlet counts for each ego-network in the input graph as described above. \item \code{ego_networks}: The ego-networks of the query graph. } diff --git a/man/count_graphlets_ego_scaled.Rd b/man/count_graphlets_ego_scaled.Rd index 76046db8..5dfb334f 100644 --- a/man/count_graphlets_ego_scaled.Rd +++ b/man/count_graphlets_ego_scaled.Rd @@ -8,40 +8,42 @@ count_graphlets_ego_scaled(graph, max_graphlet_size, neighbourhood_size, min_ego_nodes = 3, min_ego_edges = 1, return_ego_networks = FALSE) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graph}{A connected, undirected, simple graph as an \code{igraph} +object.} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} \item{neighbourhood_size}{The number of steps from the source node to include nodes for each ego-network.} -\item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} +\item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} nodes are returned.} -\item{min_ego_edges}{Only ego networks with at least \code{min_ego_edges} +\item{min_ego_edges}{Only ego networks with at least \code{min_ego_edges} edges are returned.} -\item{return_ego_networks}{If \code{TRUE}, return ego-networks alongside +\item{return_ego_networks}{If \code{TRUE}, return ego-networks alongside graphlet counts to enable further processing.} } \value{ -If \code{return_ego_networks = FALSE}, returns an RxC matrix -containing counts of each graphlet (columns, C) for each ego-network in the -input graph (rows, R). Columns are labelled with graphlet IDs and rows are +If \code{return_ego_networks = FALSE}, returns an RxC matrix +containing counts of each graphlet (columns, C) for each ego-network in the +input graph (rows, R). Columns are labelled with graphlet IDs and rows are labelled with the ID of the central node in each ego-network (if nodes in the input graph are labelled). If \code{return_ego_networks = TRUE}, returns a list with the following elements: \itemize{ - \item \code{graphlet_counts}: A matrix containing graphlet counts for each + \item \code{graphlet_counts}: A matrix containing graphlet counts for each ego-network in the input graph as described above. \item \code{ego_networks}: The ego-networks of the query graph. } } \description{ -Calculates graphlet counts for the n-step ego-network of each node in a graph, -scaled by dividing the graphlet counts for each ego-network by the total -number of possible groupings of nodes in the ego-network with the same number -of nodes as each graphlet. This scaling factor is choose(n, k), where n is the -number of nodes in the ego-network and k is the number of nodes in the graphlet. +Calculates graphlet counts for the n-step ego-network of each node in +a graph, scaled by dividing the graphlet counts for each ego-network by the +total number of possible groupings of nodes in the ego-network with the same +number of nodes as each graphlet. This scaling factor is choose(n, k), +where n is the number of nodes in the ego-network and k is the number of +nodes in the graphlet. } diff --git a/man/count_graphlets_for_graph.Rd b/man/count_graphlets_for_graph.Rd index 56db3c7d..696b78f9 100644 --- a/man/count_graphlets_for_graph.Rd +++ b/man/count_graphlets_for_graph.Rd @@ -9,16 +9,16 @@ count_graphlets_for_graph(graph, max_graphlet_size) \arguments{ \item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} } \value{ Vector containing counts of each graphlet for the graph. } \description{ -Calculates total graphlet counts for a \code{igraph} graph object using the +Calculates total graphlet counts for a \code{igraph} graph object using the ORCA fast graphlet orbit counting package. Per-node graphlet counts are calculated by summing orbits over graphlets. These are then divided by the -number of nodes comprising each graphlet to avoid counting the same graphlet +number of nodes comprising each graphlet to avoid counting the same graphlet multiple times. } diff --git a/man/count_graphlets_per_node.Rd b/man/count_graphlets_per_node.Rd index c8353aed..25fa1c67 100644 --- a/man/count_graphlets_per_node.Rd +++ b/man/count_graphlets_per_node.Rd @@ -9,15 +9,15 @@ count_graphlets_per_node(graph, max_graphlet_size) \arguments{ \item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} } \value{ -ORCA-format matrix containing counts of each graphlet (columns) at +ORCA-format matrix containing counts of each graphlet (columns) at each node in the graph (rows). } \description{ -Calculates graphlet counts for each node in an \code{igraph} graph object, -using the ORCA fast graphlet orbit counting package. by summing orbits over +Calculates graphlet counts for each node in an \code{igraph} graph object, +using the ORCA fast graphlet orbit counting package. by summing orbits over graphlets. } diff --git a/man/count_orbits_per_node.Rd b/man/count_orbits_per_node.Rd index d8233c32..76fd28cb 100644 --- a/man/count_orbits_per_node.Rd +++ b/man/count_orbits_per_node.Rd @@ -9,7 +9,7 @@ count_orbits_per_node(graph, max_graphlet_size) \arguments{ \item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} } \value{ @@ -17,6 +17,6 @@ ORCA-format matrix containing counts of each graphlet orbit (columns) at each node in the graph (rows). } \description{ -Calculates graphlet orbit counts for each node in an \code{igraph} graph +Calculates graphlet orbit counts for each node in an \code{igraph} graph object, using the ORCA fast graphlet orbit counting package. } diff --git a/man/cross_comp_to_matrix.Rd b/man/cross_comp_to_matrix.Rd index d26bfa48..7b76edd0 100644 --- a/man/cross_comp_to_matrix.Rd +++ b/man/cross_comp_to_matrix.Rd @@ -15,7 +15,7 @@ using \code{cross_comparison_spec}} \value{ A square symmetric matrix with a zero diagonal, with elements Cij and Cji populated from the element from \code{measure} corresponding to -the row of \code{cross_comparison_spec} with \code{index_a = i} and +the row of \code{cross_comparison_spec} with \code{index_a = i} and \code{index_b = j} } \description{ diff --git a/man/cross_comparison_spec.Rd b/man/cross_comparison_spec.Rd index 790a74b5..1f64d9b1 100644 --- a/man/cross_comparison_spec.Rd +++ b/man/cross_comparison_spec.Rd @@ -12,8 +12,8 @@ cross-comparison is required.} } \value{ A matrix with one row for each possible pair-wise combination -of elements from the provided named list. The first and second columns -contain the names of the elements in the pair and the third and fourth +of elements from the provided named list. The first and second columns +contain the names of the elements in the pair and the third and fourth columns contain the indexes of these elements in the provided list. } \description{ diff --git a/man/dhist.Rd b/man/dhist.Rd index cb9e6890..4566df12 100644 --- a/man/dhist.Rd +++ b/man/dhist.Rd @@ -10,7 +10,7 @@ dhist(locations, masses, smoothing_window_width = 0, sorted = TRUE) \item{locations}{A 1D numeric vector specifying the discrete locations of the histogram bins} -\item{masses}{A 1D numeric vector specifying the mass present at each +\item{masses}{A 1D numeric vector specifying the mass present at each location} \item{smoothing_window_width}{If greater than 0, the discrete histogram will @@ -18,7 +18,7 @@ be treated as having the mass at each location "smoothed" uniformly across a bin centred on the location and having width = \code{smoothing_window_width} (default = \code{0} - no smoothing)} -\item{sorted}{Whether or not to return a discrete histogram with locations +\item{sorted}{Whether or not to return a discrete histogram with locations and masses sorted by ascending mass (default = \code{TRUE})} } \value{ @@ -31,10 +31,10 @@ is a list of class \code{dhist} with the following named elements: Note that locations where no mass is present are not included in the returned \code{dhist} object. Mass in these discrete histograms is treated as being present precisely at the specified location. Discrete histograms should not be used -for data where observations have been grouped into bins representing ranges +for data where observations have been grouped into bins representing ranges of observation values. } \description{ -Creates a discrete histogram object of class \code{dhist}, with bin +Creates a discrete histogram object of class \code{dhist}, with bin \code{locations} and \code{masses} set to the 1D numeric vectors provided. } diff --git a/man/dhist_ecmf.Rd b/man/dhist_ecmf.Rd index 132b5f23..447f0966 100644 --- a/man/dhist_ecmf.Rd +++ b/man/dhist_ecmf.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/dhist.R \name{dhist_ecmf} \alias{dhist_ecmf} -\title{Generate interpolating empirical cumulative mass function (ECMF) for +\title{Generate interpolating empirical cumulative mass function (ECMF) for a discrete histogram} \usage{ dhist_ecmf(dhist) @@ -12,13 +12,13 @@ dhist_ecmf(dhist) } \value{ An interpolating ECMF as an \code{approxfun} object. This function -will return the interpolated cumulative mass for a vector of arbitrary +will return the interpolated cumulative mass for a vector of arbitrary locations. If \code{dhist$smoothing_window_width} is zero, the ECMF will be piecewise constant. If \code{dhist$smoothing_window_width} is one, the ECMF will be piece-wise linear. If \code{dhist$smoothing_window_width} is any other value, the ECMF will not correctly represent the cumulative mass. } \description{ -Generate interpolating empirical cumulative mass function (ECMF) for +Generate interpolating empirical cumulative mass function (ECMF) for a discrete histogram } diff --git a/man/dhist_std.Rd b/man/dhist_std.Rd index 544d6e6b..120064c5 100644 --- a/man/dhist_std.Rd +++ b/man/dhist_std.Rd @@ -13,8 +13,8 @@ dhist_std(dhist) Standard deviation of histogram } \description{ -Calculates standard deviation directly from the discrete histogram by using +Calculates standard deviation directly from the discrete histogram by using locations weighted by masses. -NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses +NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses may not represent counts so N is not necessarily known } diff --git a/man/dhist_variance.Rd b/man/dhist_variance.Rd index 6ea354f5..f35c7eb9 100644 --- a/man/dhist_variance.Rd +++ b/man/dhist_variance.Rd @@ -14,7 +14,7 @@ Variance of histogram } \description{ Calculates variance directly from the discrete histogram by using locations -weighted by masses. -NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses +weighted by masses. +NOTE: Does not apply bias correction (i.e. N-1 denominator) as masses may not represent counts so N is not necessarily known } diff --git a/man/ecmf_knots.Rd b/man/ecmf_knots.Rd index aed18724..5d75167d 100644 --- a/man/ecmf_knots.Rd +++ b/man/ecmf_knots.Rd @@ -4,23 +4,23 @@ \alias{ecmf_knots} \title{Get "knots" for discrete histogram empirical cumulative mass function (ECMF). The "knots" are the x-values at which the y-value of the ECDM changes -gradient (i.e. the x-values between which the ECMF does its constant or +gradient (i.e. the x-values between which the ECMF does its constant or linear interpolates)} \usage{ ecmf_knots(dhist_ecmf) } \arguments{ -\item{dhist_ecmf}{An object of class \code{dhist_ecmf}, returned from a call +\item{dhist_ecmf}{An object of class \code{dhist_ecmf}, returned from a call to the \code{dhist_ecmf} function} } \value{ -x_knots A list of "knots" for the ECMF, containing all x-values at +x_knots A list of "knots" for the ECMF, containing all x-values at which the y-value changes gradient (i.e. the x-values between which the ECMF does its constant or linear interpolation) } \description{ Get "knots" for discrete histogram empirical cumulative mass function (ECMF). The "knots" are the x-values at which the y-value of the ECDM changes -gradient (i.e. the x-values between which the ECMF does its constant or +gradient (i.e. the x-values between which the ECMF does its constant or linear interpolates) } diff --git a/man/ego_to_graphlet_counts.Rd b/man/ego_to_graphlet_counts.Rd index 9fe6b0b5..79aed9f8 100644 --- a/man/ego_to_graphlet_counts.Rd +++ b/man/ego_to_graphlet_counts.Rd @@ -12,13 +12,13 @@ ego_to_graphlet_counts(ego_networks, max_graphlet_size = 4) \arguments{ \item{ego_networks}{Named list of ego networks for a graph.} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} } \value{ -returns an RxC matrix -containing counts of each graphlet (columns, C) for each ego-network (rows, R). -Columns are labelled with graphlet IDs and rows are +returns an RxC matrix +containing counts of each graphlet (columns, C) for each ego-network (rows, R). +Columns are labelled with graphlet IDs and rows are labelled with the ID of the central node in each ego-network. } \description{ diff --git a/man/emd_cs.Rd b/man/emd_cs.Rd index 112b3977..1e580760 100755 --- a/man/emd_cs.Rd +++ b/man/emd_cs.Rd @@ -16,7 +16,7 @@ Earth Mover's Distance between the two input histograms } \description{ Takes two discrete histograms and calculates the Wasserstein / Earth Mover's -Distance between the two histograms by summing the absolute difference +Distance between the two histograms by summing the absolute difference between the two cumulative histograms. } \references{ diff --git a/man/emd_lp.Rd b/man/emd_lp.Rd index 9c90e3b9..f934de99 100755 --- a/man/emd_lp.Rd +++ b/man/emd_lp.Rd @@ -19,7 +19,7 @@ emd_lp(bin_masses1, bin_masses2, bin_centres1, bin_centres2) Earth Mover's Distance between the two input histograms } \description{ -Takes two sets of histogram bin masses and bin centres and calculates the +Takes two sets of histogram bin masses and bin centres and calculates the Earth Mover's Distance between the two histograms by solving the Transport Problem using linear programming. } diff --git a/man/gdd.Rd b/man/gdd.Rd index 83dc624e..e33a8699 100644 --- a/man/gdd.Rd +++ b/man/gdd.Rd @@ -14,7 +14,7 @@ gdd(graph, feature_type = "orbit", max_graphlet_size = 4, counts the number of graphlets each node participates in; "orbit" calculates the number of graphlet orbits each node participates in.} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} \item{ego_neighbourhood_size}{The number of steps from the source node to include diff --git a/man/gdd_for_all_graphs.Rd b/man/gdd_for_all_graphs.Rd index 51f4380d..98e55666 100644 --- a/man/gdd_for_all_graphs.Rd +++ b/man/gdd_for_all_graphs.Rd @@ -16,13 +16,13 @@ gdd_for_all_graphs(source_dir, format = "ncol", pattern = ".txt", \item{pattern}{Filename pattern to match graph files} -\item{feature_type}{Type of graphlet-based degree distributions. Can be +\item{feature_type}{Type of graphlet-based degree distributions. Can be \code{graphlet} to count graphlets or \code{orbit} to count orbits.} \item{max_graphlet_size}{Maximum size of graphlets to use when generating GDD} -\item{ego_neighbourhood_size}{The number of steps from the source node to -include nodes for each ego-network. If set to 0, ego-networks will not be +\item{ego_neighbourhood_size}{The number of steps from the source node to +include nodes for each ego-network. If set to 0, ego-networks will not be used} \item{mc.cores}{Number of cores to use for parallel processing. Defaults to @@ -32,12 +32,12 @@ the \code{mc.cores} option set in the R environment.} A named list where each element contains a set of GDDs for a single A named list where each element contains a set of GDDs for a single -graph from the source directory. Each set of GDDs is itself a named list, +graph from the source directory. Each set of GDDs is itself a named list, where each GDD element is a \code{dhist} discrete histogram object. } \description{ Loads graphs from all files matching the given pattern in the given directory, -converts them to indexed edge lists compatible with the ORCA fast orbit -counting package and calculates the specified set of graphlet-based degree +converts them to indexed edge lists compatible with the ORCA fast orbit +counting package and calculates the specified set of graphlet-based degree distributions usingthe ORCA package. } diff --git a/man/graph_features_to_histograms.Rd b/man/graph_features_to_histograms.Rd index a003a710..877ce9c1 100644 --- a/man/graph_features_to_histograms.Rd +++ b/man/graph_features_to_histograms.Rd @@ -2,22 +2,23 @@ % Please edit documentation in R/orca_interface.R \name{graph_features_to_histograms} \alias{graph_features_to_histograms} -\title{Convert a matrix of node level features to a discrete histogram for each feature} +\title{Convert a matrix of node level features to a discrete histogram for +each feature.} \usage{ -graph_features_to_histograms(featuresMatrix) +graph_features_to_histograms(features_matrix) } \arguments{ -\item{A}{number of nodes (rows) by number of features (columns) matrix, where -the ij entry is the score of node i on feature j (e.g. for ORCA output this is -counts of each graphlet or orbit at each graph vertex)} +\item{A}{number of nodes (rows) by number of features (columns) matrix, where +the ij entry is the score of node i on feature j (e.g. for ORCA output this +is counts of each graphlet or orbit at each graph vertex)} } \value{ -Feature histograms: List of discrete histograms for each +Feature histograms: List of discrete histograms for each feature } \description{ -Converts a matrix of node level features (e.g. for ORCA output this is counts -of each graphlet or orbit at each graph vertex) to +Converts a matrix of node level features (e.g. for ORCA output this is counts +of each graphlet or orbit at each graph vertex) to a set of discrete histograms (a histogram of counts for each distinct value across all graph vertices for each feature with no binning) } diff --git a/man/graph_to_indexed_edges.Rd b/man/graph_to_indexed_edges.Rd index fa72273d..02aa7cc0 100644 --- a/man/graph_to_indexed_edges.Rd +++ b/man/graph_to_indexed_edges.Rd @@ -18,6 +18,6 @@ the label for the vertice represented by index N in the edgelist Takes a igraph graph object and generates an edgelist where each edge is represented by the integer indexes of its vertices. Note that, where a graph has isolated vertices, the indexes for these vertices will not be present -in the edge list. Where a graph has no isolated vertices, the edge list will +in the edge list. Where a graph has no isolated vertices, the edge list will include all vertex indexes from 1 to numVertices. } diff --git a/man/graphlet_key.Rd b/man/graphlet_key.Rd index c2b4ce84..b60e6f59 100644 --- a/man/graphlet_key.Rd +++ b/man/graphlet_key.Rd @@ -13,7 +13,7 @@ graphlet_key(max_graphlet_size) Metadata list with the following named fields: \itemize{ \item \code{max_nodes}: Maximum number of nodes graphlets can contain - \item \code{id}: ID of each graphlet in format Gn, where n is in range 0 to + \item \code{id}: ID of each graphlet in format Gn, where n is in range 0 to num_graphlets \item \code{node_count}: Number of nodes contained within each graphlet } diff --git a/man/harmonise_dhist_locations.Rd b/man/harmonise_dhist_locations.Rd index 6b941442..18b7e34a 100644 --- a/man/harmonise_dhist_locations.Rd +++ b/man/harmonise_dhist_locations.Rd @@ -16,6 +16,6 @@ Harmonised histograms } \description{ Where a location only exists in one histogram, add this location to the other -histogram with zero mass. This ensures that all location exist in both +histogram with zero mass. This ensures that all location exist in both histograms. } diff --git a/man/indexed_edges_to_graph.Rd b/man/indexed_edges_to_graph.Rd index c51e223c..971da377 100644 --- a/man/indexed_edges_to_graph.Rd +++ b/man/indexed_edges_to_graph.Rd @@ -7,14 +7,14 @@ indexed_edges_to_graph(indexed_edges) } \arguments{ -\item{indexed_edges}{A 2 x numEdges edgelist with vertices labelled with +\item{indexed_edges}{A 2 x numEdges edgelist with vertices labelled with integer indices, with an optional "vertex_names" attribute} } \value{ An igraph graph object } \description{ -Takes an integer indexed edgelist (where each edge is represented by the +Takes an integer indexed edgelist (where each edge is represented by the integer indexes of its vertices) and converts it to an igraph format graph. If the edge list has a "vertex_names" attribute, this will be used to name the vertices in the resultant graph. diff --git a/man/interval_index.Rd b/man/interval_index.Rd index 2c8a539e..94a5f40a 100644 --- a/man/interval_index.Rd +++ b/man/interval_index.Rd @@ -12,9 +12,9 @@ interval_index(x, breaks, out_of_range_intervals = FALSE) \item{breaks}{The boundaries between bins} \item{out_of_range_intervals}{If \code{TRUE}, "out of range" values lying -below the first break or above the last break will be assigned to additional -unbounded lower and upper extrema intervals. If \code{FALSE} these "out of -range" values will be assigned to intervals bounded by the lowest or +below the first break or above the last break will be assigned to additional +unbounded lower and upper extrema intervals. If \code{FALSE} these "out of +range" values will be assigned to intervals bounded by the lowest or uppermost break respectively.} } \value{ diff --git a/man/is_dhist.Rd b/man/is_dhist.Rd index 429a4b3e..1f550d6f 100644 --- a/man/is_dhist.Rd +++ b/man/is_dhist.Rd @@ -9,13 +9,13 @@ is_dhist(x, fast_check = TRUE) \arguments{ \item{x}{An arbitrary object} -\item{fast_check}{Boolean flag indicating whether to perform only a -superficial fast check limited to checking the object's class attribute +\item{fast_check}{Boolean flag indicating whether to perform only a +superficial fast check limited to checking the object's class attribute is set to \code{dhist} (default = \code{TRUE})} } \description{ -Checks if the input object is of class \code{dhist}. If \code{fast_check} is -\code{TRUE} then the only check is whether the object has a class attribute of +Checks if the input object is of class \code{dhist}. If \code{fast_check} is +\code{TRUE} then the only check is whether the object has a class attribute of \code{dhist}. If \code{fast_check} is \code{FALSE} (default), then checks are also made to ensure that the object has the structure required of a \code{dhist} object. diff --git a/man/make_named_ego_graph.Rd b/man/make_named_ego_graph.Rd index 037c0395..1c0db5a7 100644 --- a/man/make_named_ego_graph.Rd +++ b/man/make_named_ego_graph.Rd @@ -13,13 +13,13 @@ make_named_ego_graph(graph, order, min_ego_nodes = 3, \item{order}{The number of steps from the source node to include nodes for each ego-network.} -\item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} +\item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} nodes are returned.} -\item{min_ego_edges}{Only ego networks with at least \code{min_ego_edges} +\item{min_ego_edges}{Only ego networks with at least \code{min_ego_edges} edges are returned.} -\item{...}{Additional parameters to be passed to the underlying +\item{...}{Additional parameters to be passed to the underlying \code{igraph::make_ego_graph} function used.} } \description{ diff --git a/man/mean_centre_dhist.Rd b/man/mean_centre_dhist.Rd index 933e68c9..b4b18ae1 100644 --- a/man/mean_centre_dhist.Rd +++ b/man/mean_centre_dhist.Rd @@ -13,6 +13,6 @@ mean_centre_dhist(dhist) The mass-weighted mean location } \description{ -Centres a discrete histogram around its mass-weighted mean location by +Centres a discrete histogram around its mass-weighted mean location by subtracting the mass-weighted mean from each location. } diff --git a/man/mean_density_binned_graphlet_counts.Rd b/man/mean_density_binned_graphlet_counts.Rd index 4aac4a0d..4f6e940f 100644 --- a/man/mean_density_binned_graphlet_counts.Rd +++ b/man/mean_density_binned_graphlet_counts.Rd @@ -8,9 +8,9 @@ mean_density_binned_graphlet_counts(graphlet_counts, density_interval_indexes) } \description{ -Used by \code{netdis_expected_graphlet_counts_ego_fn} to +Used by \code{netdis_expected_graphlet_counts_ego_fn} to generate a function for calculating expected ego-network graphlet counts from the statistics of a provided reference graph. -Temporarily accessible during development. +Temporarily accessible during development. TODO: Remove @export prior to publishing } diff --git a/man/min_emd.Rd b/man/min_emd.Rd index 3b78a6b8..2799f48a 100644 --- a/man/min_emd.Rd +++ b/man/min_emd.Rd @@ -11,19 +11,19 @@ min_emd(dhist1, dhist2, method = "optimise") \item{dhist2}{A \code{dhist} discrete histogram object} -\item{method}{The method to use to find the minimum EMD across all potential +\item{method}{The method to use to find the minimum EMD across all potential offsets for each pair of histograms. Default is "optimise" to use -R's built-in \code{stats::optimise} method to efficiently find the offset -with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the +R's built-in \code{stats::optimise} method to efficiently find the offset +with the minimal EMD. However, this is not guaranteed to find the global +minimum if multiple local minima EMDs exist. You can alternatively specify the +"exhaustive" method, which will exhaustively evaluate the EMD between the histograms at all offsets that are candidates for the minimal EMD.} } \value{ Earth Mover's Distance between the two discrete histograms } \description{ -Calculates the minimum Earth Mover's Distance (EMD) between two discrete +Calculates the minimum Earth Mover's Distance (EMD) between two discrete histograms. This is the minimum EMD between the two histograms across all possible offsets of histogram 1 against histogram 2. } diff --git a/man/min_emd_exhaustive.Rd b/man/min_emd_exhaustive.Rd index ca20cae5..cd02830b 100644 --- a/man/min_emd_exhaustive.Rd +++ b/man/min_emd_exhaustive.Rd @@ -15,18 +15,18 @@ min_emd_exhaustive(dhist1, dhist2) Earth Mover's Distance between the two discrete histograms } \description{ -Calculates the minimum Earth Mover's Distance (EMD) between two discrete +Calculates the minimum Earth Mover's Distance (EMD) between two discrete histograms using an exhaustive search. } \details{ -When "sliding" two piecewise-linear empirical cumulative mass functions -(ECMFs) across each other to minimise the EMD between them, it is sufficient -to calculate the EMD at all offsets where any knots from the two ECMFs align +When "sliding" two piecewise-linear empirical cumulative mass functions +(ECMFs) across each other to minimise the EMD between them, it is sufficient +to calculate the EMD at all offsets where any knots from the two ECMFs align to ensure that the offset with the global minimum EMD is found. -This is because of the piece-wise linear nature of the two ECMFs. Between any +This is because of the piece-wise linear nature of the two ECMFs. Between any two offsets where knots from the two ECMFs align, EMD will be either constant, -or uniformly increasing or decreasing. Therefore, there the EMD between two -sets of aligned knots cannot be smaller than the EMD at one or other of the +or uniformly increasing or decreasing. Therefore, there the EMD between two +sets of aligned knots cannot be smaller than the EMD at one or other of the bounding offsets. } diff --git a/man/min_emd_optimise.Rd b/man/min_emd_optimise.Rd index 25eb0e9e..19b04b1d 100644 --- a/man/min_emd_optimise.Rd +++ b/man/min_emd_optimise.Rd @@ -15,7 +15,7 @@ min_emd_optimise(dhist1, dhist2) Earth Mover's Distance between the two discrete histograms } \description{ -Calculates the minimum Earth Mover's Distance (EMD) between two discrete -histograms by minimising the offset parameter of the \code{emd} function +Calculates the minimum Earth Mover's Distance (EMD) between two discrete +histograms by minimising the offset parameter of the \code{emd} function using the built-in \code{stats::optimise} method. } diff --git a/man/min_emd_optimise_fast.Rd b/man/min_emd_optimise_fast.Rd index 8f9931e8..ecc1da25 100644 --- a/man/min_emd_optimise_fast.Rd +++ b/man/min_emd_optimise_fast.Rd @@ -15,7 +15,7 @@ min_emd_optimise_fast(dhist1, dhist2) Earth Mover's Distance between the two discrete histograms } \description{ -Calculates the minimum Earth Mover's Distance (EMD) between two discrete -histograms by minimising the offset parameter of the \code{emd} function +Calculates the minimum Earth Mover's Distance (EMD) between two discrete +histograms by minimising the offset parameter of the \code{emd} function using the built-in \code{stats::optimise} method. } diff --git a/man/net_emd.Rd b/man/net_emd.Rd index ef9d5851..dd0815d2 100755 --- a/man/net_emd.Rd +++ b/man/net_emd.Rd @@ -12,27 +12,27 @@ net_emd(dhists1, dhists2, method = "optimise", return_details = FALSE, \item{dhists2}{A \code{dhist} discrete histogram object or a list of such objects} -\item{method}{The method to use to find the minimum EMD across all potential +\item{method}{The method to use to find the minimum EMD across all potential offsets for each pair of histograms. Default is "optimise" to use -R's built-in \code{stats::optimise} method to efficiently find the offset -with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the +R's built-in \code{stats::optimise} method to efficiently find the offset +with the minimal EMD. However, this is not guaranteed to find the global +minimum if multiple local minima EMDs exist. You can alternatively specify the +"exhaustive" method, which will exhaustively evaluate the EMD between the histograms at all offsets that are candidates for the minimal EMD.} \item{return_details}{Logical indicating whether to return the individual minimal EMDs and associated offsets for all pairs of histograms} \item{smoothing_window_width}{Width of "top-hat" smoothing window to apply to -"smear" point masses across a finite width in the real domain. Default is 0, -which results in no smoothing. Care should be taken to select a -\code{smoothing_window_width} that is appropriate for the discrete domain +"smear" point masses across a finite width in the real domain. Default is 0, +which results in no smoothing. Care should be taken to select a +\code{smoothing_window_width} that is appropriate for the discrete domain (e.g.for the integer domain a width of 1 is the natural choice)} } \value{ -NetEMD measure for the two sets of discrete histograms +NetEMD measure for the two sets of discrete histograms (\code{return_details = FALSE}) or a list with the following named elements -\code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_emds}: +\code{net_emd}: the NetEMD for the set of histogram pairs, \code{min_emds}: the minimal EMD for each pair of histograms, \code{min_offsets}: the associated offsets giving the minimal EMD for each pair of histograms } diff --git a/man/net_emds_for_all_graphs.Rd b/man/net_emds_for_all_graphs.Rd index 071d9af2..e01b7ae4 100644 --- a/man/net_emds_for_all_graphs.Rd +++ b/man/net_emds_for_all_graphs.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/measures_net_emd.R \name{net_emds_for_all_graphs} \alias{net_emds_for_all_graphs} -\title{NetEMDs between all graph pairs using provided Graphlet-based Degree +\title{NetEMDs between all graph pairs using provided Graphlet-based Degree Distributions} \usage{ net_emds_for_all_graphs(gdds, method = "optimise", @@ -10,21 +10,21 @@ net_emds_for_all_graphs(gdds, method = "optimise", mc.cores = getOption("mc.cores", 2L)) } \arguments{ -\item{gdds}{List containing sets of Graphlet-based Degree Distributions for +\item{gdds}{List containing sets of Graphlet-based Degree Distributions for all graphs being compared} -\item{method}{The method to use to find the minimum EMD across all potential +\item{method}{The method to use to find the minimum EMD across all potential offsets for each pair of histograms. Default is "optimise" to use -R's built-in \code{stats::optimise} method to efficiently find the offset -with the minimal EMD. However, this is not guaranteed to find the global -minimum if multiple local minima EMDs exist. You can alternatively specify the -"exhaustive" method, which will exhaustively evaluate the EMD between the +R's built-in \code{stats::optimise} method to efficiently find the offset +with the minimal EMD. However, this is not guaranteed to find the global +minimum if multiple local minima EMDs exist. You can alternatively specify the +"exhaustive" method, which will exhaustively evaluate the EMD between the histograms at all offsets that are candidates for the minimal EMD.} \item{smoothing_window_width}{Width of "top-hat" smoothing window to apply to -"smear" point masses across a finite width in the real domain. Default is 0, -which results in no smoothing. Care should be taken to select a -\code{smoothing_window_width} that is appropriate for the discrete domain +"smear" point masses across a finite width in the real domain. Default is 0, +which results in no smoothing. Care should be taken to select a +\code{smoothing_window_width} that is appropriate for the discrete domain (e.g.for the integer domain a width of 1 is the natural choice)} \item{return_details}{Logical indicating whether to return the individual @@ -34,18 +34,18 @@ minimal EMDs and associated offsets for all pairs of histograms} the \code{mc.cores} option set in the R environment.} } \value{ -NetEMD measures between all pairs of graphs for which GDDs +NetEMD measures between all pairs of graphs for which GDDs were provided. Format of returned data depends on the \code{return_details} parameter. If set to FALSE, a list is returned with the following named -elements:\code{net_emd}: a vector of NetEMDs for each pair of graphs, -\code{comp_spec}: a comaprison specification table containing the graph names +elements:\code{net_emd}: a vector of NetEMDs for each pair of graphs, +\code{comp_spec}: a comaprison specification table containing the graph names and indices within the input GDD list for each pair of graphs compared. If \code{return_details} is set to FALSE, the list also contains the following -matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD +matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD used to compute the NetEMD, \code{min_offsets}: the associated offsets giving the minimal EMD for each GDD } \description{ -NetEMDs between all graph pairs using provided Graphlet-based Degree +NetEMDs between all graph pairs using provided Graphlet-based Degree Distributions } diff --git a/man/netdis.Rd b/man/netdis.Rd index 64843d89..9f477425 100644 --- a/man/netdis.Rd +++ b/man/netdis.Rd @@ -16,7 +16,7 @@ netdis(centred_graphlet_counts1, centred_graphlet_counts2, graphlet_size) a graphlet is the number of nodes it contains.} } \value{ -Netdis statistic calculated using centred counts for graphlets of +Netdis statistic calculated using centred counts for graphlets of the specified size } \description{ diff --git a/man/netdis_centred_graphlet_counts.Rd b/man/netdis_centred_graphlet_counts.Rd index 7b183f38..f9aaba3d 100644 --- a/man/netdis_centred_graphlet_counts.Rd +++ b/man/netdis_centred_graphlet_counts.Rd @@ -8,19 +8,21 @@ netdis_centred_graphlet_counts(graph, max_graphlet_size, neighbourhood_size, expected_ego_count_fn = NULL) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graph}{A connected, undirected, simple graph as an +\code{igraph} object.} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes +will be counted.} \item{neighbourhood_size}{The number of steps from the source node to include nodes for each ego-network.} \item{expected_ego_count_fn}{A function for generating expected ego-network -graphlet counts for a graph. This function should take a connected, -undirected, simple graph as an \code{igraph} object for its only argument. -Where \code{expected_ego_count_fn} is specific to particular values of -\code{max_graphlet_size} or \code{neighbourhood_size}, care should be taken +graphlet counts for a graph. This function should take a connected, +undirected, simple graph as an \code{igraph} object for its only argument. +Where \code{expected_ego_count_fn} is specific to particular values of +\code{max_graphlet_size} or \code{neighbourhood_size}, care should be taken to ensure that the values of these parameters passed to this function are consistent with those used when creating \code{expected_ego_count_fn}.} } diff --git a/man/netdis_expected_graphlet_counts.Rd b/man/netdis_expected_graphlet_counts.Rd index 896936ea..b3d3e557 100644 --- a/man/netdis_expected_graphlet_counts.Rd +++ b/man/netdis_expected_graphlet_counts.Rd @@ -8,9 +8,9 @@ netdis_expected_graphlet_counts(graph, max_graphlet_size, density_breaks, density_binned_reference_counts) } \description{ -Used by \code{netdis_expected_graphlet_counts_ego} to -calculate expected graphlet counts for a query graph ego-network from the +Used by \code{netdis_expected_graphlet_counts_ego} to +calculate expected graphlet counts for a query graph ego-network from the statistics of a provided reference graph. -Temporarily accessible during development. +Temporarily accessible during development. TODO: Remove @export prior to publishing } diff --git a/man/netdis_expected_graphlet_counts_ego.Rd b/man/netdis_expected_graphlet_counts_ego.Rd index 73cf94cf..ede22ee3 100644 --- a/man/netdis_expected_graphlet_counts_ego.Rd +++ b/man/netdis_expected_graphlet_counts_ego.Rd @@ -9,9 +9,9 @@ netdis_expected_graphlet_counts_ego(graph, max_graphlet_size, min_ego_nodes = 3, min_ego_edges = 1) } \description{ -Used by \code{netdis_expected_graphlet_counts_ego_fn} to +Used by \code{netdis_expected_graphlet_counts_ego_fn} to generate a function for calculating expected ego-network graphlet counts from the statistics of a provided reference graph. -Temporarily accessible during development. +Temporarily accessible during development. TODO: Remove @export prior to publishing } diff --git a/man/netdis_expected_graphlet_counts_ego_fn.Rd b/man/netdis_expected_graphlet_counts_ego_fn.Rd index 5655b8f0..328cf6bf 100644 --- a/man/netdis_expected_graphlet_counts_ego_fn.Rd +++ b/man/netdis_expected_graphlet_counts_ego_fn.Rd @@ -9,35 +9,36 @@ netdis_expected_graphlet_counts_ego_fn(graph, max_graphlet_size, min_bin_count = 5, num_bins = 100) } \arguments{ -\item{graph}{A connected, undirected, simple reference graph as an +\item{graph}{A connected, undirected, simple reference graph as an \code{igraph} object.} -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} \item{neighbourhood_size}{The number of steps from the source node to include node in ego-network.} } \value{ -A function taking a connected, undirected, simple query graph as an -\code{igraph} object and returning an RxC matrix containing the expected -counts of each graphlet (columns, C) for each ego-network in the query graph -(rows, R). Columns are labelled with graphlet IDs and rows are labelled with -the ID of the central node in each ego-network (if nodes in the query graph +A function taking a connected, undirected, simple query graph as an +\code{igraph} object and returning an RxC matrix containing the expected +counts of each graphlet (columns, C) for each ego-network in the query graph +(rows, R). Columns are labelled with graphlet IDs and rows are labelled with +the ID of the central node in each ego-network (if nodes in the query graph are labelled) } \description{ -Generates a function to calculate expected ego-network graphlet counts for +Generates a function to calculate expected ego-network graphlet counts for query graphs based on the statistics of a provided reference graph. } \details{ -Generates graphlet counts for all ego-networks in the supplied reference graph -and then averages these graphlet counts over density bins to generate -density-dependent reference graphlet counts. Prior to averaging, the graphlet -counts are scaled in a size-dependent manner to permit ego-networks with -similar densities but different sizes to be averaged together. +Generates graphlet counts for all ego-networks in the supplied +reference graph and then averages these graphlet counts over density bins to +generate density-dependent reference graphlet counts. Prior to averaging, +the graphlet counts are scaled in a size-dependent manner to permit +ego-networks with similar densities but different sizes to be averaged +together. -Returns a function that uses the density-dependent reference graphlet +Returns a function that uses the density-dependent reference graphlet counts to generate expected graphlet counts for all ego-networks in a query network. When doing so, it matches ego-networks to reference counts by density and reverses the scaling that was applied to the original reference diff --git a/man/netdis_expected_graphlet_counts_per_ego.Rd b/man/netdis_expected_graphlet_counts_per_ego.Rd index 50d5dd4b..d68c3114 100644 --- a/man/netdis_expected_graphlet_counts_per_ego.Rd +++ b/man/netdis_expected_graphlet_counts_per_ego.Rd @@ -13,9 +13,9 @@ ego networks to the function, not the input query graph (as in netdis_expected_graphlet_counts_ego_fn above). } \details{ -Used by \code{netdis_expected_graphlet_counts_ego_fn} to +Used by \code{netdis_expected_graphlet_counts_ego_fn} to generate a function for calculating expected ego-network graphlet counts from the statistics of a provided reference graph. -Temporarily accessible during development. +Temporarily accessible during development. TODO: Remove @export prior to publishing } diff --git a/man/netdis_for_all_graphs.Rd b/man/netdis_for_all_graphs.Rd index e8ebe66d..fb30bad8 100644 --- a/man/netdis_for_all_graphs.Rd +++ b/man/netdis_for_all_graphs.Rd @@ -8,7 +8,7 @@ netdis_for_all_graphs(centred_graphlet_counts, graphlet_size, mc.cores = getOption("mc.cores", 2L)) } \arguments{ -\item{centred_graphlet_counts}{List containing Centred Graphlet Counts for +\item{centred_graphlet_counts}{List containing Centred Graphlet Counts for all graphs being compared} \item{graphlet_size}{The size of graphlets to use for the Netdis calculation @@ -16,7 +16,7 @@ all graphs being compared} a graphlet is the number of nodes it contains.} } \value{ -Pairwise Netdis statistics between graphs calculated using centred +Pairwise Netdis statistics between graphs calculated using centred counts for graphlets of the specified size } \description{ diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd new file mode 100644 index 00000000..02447d0e --- /dev/null +++ b/man/netdis_one_to_one.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_one_to_one} +\alias{netdis_one_to_one} +\title{Netdis between two graphs} +\usage{ +netdis_one_to_one(graph_1, graph_2, ref_graph, max_graphlet_size = 4, + neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, + min_bin_count = 5, num_bins = 100) +} +\arguments{ +\item{graph_1}{First query graph} + +\item{graph_2}{Second query graph} + +\item{ref_graph}{Reference graph} + +\item{max_graphlet_size}{Generate graphlets up to this size} + +\item{neighbourhood_size}{Ego network neighbourhood size} + +\item{min_ego_nodes}{Filter ego networks which have fewer +than min_ego_nodes nodes} + +\item{min_ego_edges}{Filter ego networks which have fewer +than min_ego_edges edges} + +\item{min_bin_count}{Minimum number of ego networks in each density bin} + +\item{num_bins}{Number of density bins to generate} +} +\value{ +Netdis statistics between graph_1 and graph_2 for graphlet sizes +up to and including max_graphlet_size +} +\description{ +Netdis between two graphs +} diff --git a/man/netdis_uptok.Rd b/man/netdis_uptok.Rd index c31a602d..7e491b60 100644 --- a/man/netdis_uptok.Rd +++ b/man/netdis_uptok.Rd @@ -12,12 +12,12 @@ netdis_uptok(centred_graphlet_counts1, centred_graphlet_counts2, \item{centred_graphlet_counts2}{Centred Graphlet Counts for graph 2} -\item{max_graphlet_size}{The size of graphlets to use for the Netdis calculation -The size of a graphlet is the number of nodes it contains. Netdis is calculated -for all graphlets from size 3 to size max_graphlet_size.} +\item{max_graphlet_size}{max graphlet size to calculate Netdis for. +The size of a graphlet is the number of nodes it contains. Netdis is +calculated for all graphlets from size 3 to size max_graphlet_size.} } \value{ -Netdis statistic calculated using centred counts for graphlets of +Netdis statistic calculated using centred counts for graphlets of the specified size } \description{ diff --git a/man/normalise_dhist_mass.Rd b/man/normalise_dhist_mass.Rd index d4856d76..c732c203 100644 --- a/man/normalise_dhist_mass.Rd +++ b/man/normalise_dhist_mass.Rd @@ -13,6 +13,6 @@ normalise_dhist_mass(dhist) A discrete histogram normalised to have mass 1 } \description{ -Normalises a discrete histogram to unit mass by dividing each mass by the +Normalises a discrete histogram to unit mass by dividing each mass by the total of the non-normalised masses } diff --git a/man/normalise_dhist_variance.Rd b/man/normalise_dhist_variance.Rd index 2e3b65d9..ce7d0537 100644 --- a/man/normalise_dhist_variance.Rd +++ b/man/normalise_dhist_variance.Rd @@ -14,6 +14,6 @@ A discrete histogram normalised to have variance 1 } \description{ Normalises a discrete histogram to unit variance by dividing each centred -location by the standard deviation of the discrete histogram before +location by the standard deviation of the discrete histogram before decentering } diff --git a/man/orbit_key.Rd b/man/orbit_key.Rd index de20fd9f..7d4cd43f 100644 --- a/man/orbit_key.Rd +++ b/man/orbit_key.Rd @@ -13,7 +13,7 @@ orbit_key(max_graphlet_size) Metadata list with the following named fields: \itemize{ \item \code{max_nodes}: Maximum number of nodes graphlets can contain - \item \code{id}: ID of each graphlet in format On, where n is in range 0 to + \item \code{id}: ID of each graphlet in format On, where n is in range 0 to num_orbits \item \code{node_count}: Number of nodes contained within each graphlet } diff --git a/man/orbit_to_graphlet_counts.Rd b/man/orbit_to_graphlet_counts.Rd index e763c12d..835ecb6e 100644 --- a/man/orbit_to_graphlet_counts.Rd +++ b/man/orbit_to_graphlet_counts.Rd @@ -15,6 +15,6 @@ An ORCA-style matrix containing counts of each graphlet (columns) at each vertex in the graph (rows) } \description{ -Converts graphlet orbit counts at each vertex to graphlet counts at each +Converts graphlet orbit counts at each vertex to graphlet counts at each vertex by summing over all orbits contained within each graphlet } diff --git a/man/read_simple_graph.Rd b/man/read_simple_graph.Rd index d97588b7..3895b0ef 100644 --- a/man/read_simple_graph.Rd +++ b/man/read_simple_graph.Rd @@ -11,17 +11,17 @@ read_simple_graph(file, format, as_undirected = TRUE, \arguments{ \item{file}{Path to file containing graph data} -\item{format}{Format of graph data. All formats supported by +\item{format}{Format of graph data. All formats supported by \code{igraph::read_graph} are supported.} \item{as_undirected}{If TRUE make graph edges undirected} \item{remove_loops}{If TRUE, remove edgeds that connect a vertex to itself} -\item{remove_multiple}{If TRUE remove multiple edges connencting the same +\item{remove_multiple}{If TRUE remove multiple edges connencting the same pair of vertices} -\item{remove_isolates}{If TRUE, remove vertices with no edges after the +\item{remove_isolates}{If TRUE, remove vertices with no edges after the previous alterations have been made} } \value{ @@ -32,8 +32,8 @@ Reads graph data from file, constructing an a igraph graph object, making the requested subset of the following simplifications in the following order: 1. Makes the graph undirected 2. Removes loops (where both endpoints of an edge are the same vertex) - 3. Removes multiple edges (i.e. ensuring only one edge exists for each + 3. Removes multiple edges (i.e. ensuring only one edge exists for each pair of endpoints) - 4. Removes isolated vertices (i.e. vertices with no edges after the + 4. Removes isolated vertices (i.e. vertices with no edges after the previous alterations) } diff --git a/man/read_simple_graphs.Rd b/man/read_simple_graphs.Rd index 05624ae3..4e907180 100644 --- a/man/read_simple_graphs.Rd +++ b/man/read_simple_graphs.Rd @@ -11,7 +11,7 @@ read_simple_graphs(source_dir, format = "ncol", pattern = "*", \arguments{ \item{source_dir}{Path to directory containing files with graph data} -\item{format}{Format of graph data. Any format supported by +\item{format}{Format of graph data. Any format supported by \code{igraph::read_graph} can be used.} \item{pattern}{Pattern to use to filter filenames. Any pattern supported by @@ -21,10 +21,10 @@ read_simple_graphs(source_dir, format = "ncol", pattern = "*", \item{remove_loops}{If TRUE, remove edgeds that connect a vertex to itself} -\item{remove_multiple}{If TRUE remove multiple edges connencting the same +\item{remove_multiple}{If TRUE remove multiple edges connencting the same pair of vertices} -\item{remove_isolates}{If TRUE, remove vertices with no edges after the +\item{remove_isolates}{If TRUE, remove vertices with no edges after the previous alterations have been made} } \value{ @@ -34,12 +34,12 @@ graph set to the name of the file it was read from. \description{ Reads graph data from all files in a directory matching the specified filename pattern. From each file, an a igraph graph object is constructed -and the requested subset of the following simplifications is made in the +and the requested subset of the following simplifications is made in the following order: 1. Makes the graph undirected 2. Removes loops (where both endpoints of an edge are the same vertex) - 3. Removes multiple edges (i.e. ensuring only one edge exists for each + 3. Removes multiple edges (i.e. ensuring only one edge exists for each pair of endpoints) - 4. Removes isolated vertices (i.e. vertices with no edges after the + 4. Removes isolated vertices (i.e. vertices with no edges after the previous alterations) } diff --git a/man/scale_graphlet_counts_ego.Rd b/man/scale_graphlet_counts_ego.Rd index 46aa515a..c4bae9b1 100644 --- a/man/scale_graphlet_counts_ego.Rd +++ b/man/scale_graphlet_counts_ego.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{scale_graphlet_counts_ego} \alias{scale_graphlet_counts_ego} -\title{Scale graphlet counts for an ego network by the n choose k possible +\title{Scale graphlet counts for an ego network by the n choose k possible choices of k nodes in that ego-network, where n is the number of nodes in the ego network and k is the number of nodes in the graphlet.} \usage{ @@ -20,7 +20,7 @@ in graphlet_counts.} scaled graphlet counts. } \description{ -Scale graphlet counts for an ego network by the n choose k possible +Scale graphlet counts for an ego network by the n choose k possible choices of k nodes in that ego-network, where n is the number of nodes in the ego network and k is the number of nodes in the graphlet. } diff --git a/man/shift_dhist.Rd b/man/shift_dhist.Rd index 8ace8c44..967d569c 100644 --- a/man/shift_dhist.Rd +++ b/man/shift_dhist.Rd @@ -15,6 +15,6 @@ shift_dhist(dhist, shift) A shifted discrete histogram as a \code{dhist} object } \description{ -Shift the locations of a discrete histogram rightwards on the x-axis by the +Shift the locations of a discrete histogram rightwards on the x-axis by the specified amount } diff --git a/man/shift_to_next_alignment.Rd b/man/shift_to_next_alignment.Rd index c558f142..9974d404 100644 --- a/man/shift_to_next_alignment.Rd +++ b/man/shift_to_next_alignment.Rd @@ -13,10 +13,10 @@ shift_to_next_alignment(x1, x2, distance_matrix_prev = NULL, \item{x2}{Second location vector. This vector is remaining unchanged.} } \value{ -Minimum non-zero right-shift to apply to x1 to align at least one +Minimum non-zero right-shift to apply to x1 to align at least one element of x1 with at least one element of x2 } \description{ -Calculate minimum right shift of first location vector to make any pair of +Calculate minimum right shift of first location vector to make any pair of locations from the two vectors equal } diff --git a/man/simplify_graph.Rd b/man/simplify_graph.Rd index fa4f716a..f4e37722 100644 --- a/man/simplify_graph.Rd +++ b/man/simplify_graph.Rd @@ -14,10 +14,10 @@ simplify_graph(graph, as_undirected = TRUE, remove_loops = TRUE, \item{remove_loops}{If TRUE, remove edgeds that connect a vertex to itself} -\item{remove_multiple}{If TRUE remove multiple edges connencting the same +\item{remove_multiple}{If TRUE remove multiple edges connencting the same pair of vertices} -\item{remove_isolates}{If TRUE, remove vertices with no edges after the +\item{remove_isolates}{If TRUE, remove vertices with no edges after the previous alterations have been made} } \value{ @@ -28,8 +28,8 @@ Takes a igraph graph object and makes the requested subset of the following simplifications in the following order: 1. Makes the graph undirected 2. Removes loops (where both endpoints of an edge are the same vertex) - 3. Removes multiple edges (i.e. ensuring only one edge exists for each + 3. Removes multiple edges (i.e. ensuring only one edge exists for each pair of endpoints) - 4. Removes isolated vertices (i.e. vertices with no edges after the + 4. Removes isolated vertices (i.e. vertices with no edges after the previous alterations) } diff --git a/man/sort_dhist.Rd b/man/sort_dhist.Rd index 9a881332..a5b0331f 100644 --- a/man/sort_dhist.Rd +++ b/man/sort_dhist.Rd @@ -9,10 +9,10 @@ sort_dhist(dhist, decreasing = FALSE) \arguments{ \item{dhist}{A discrete histogram as a \code{dhist} object} -\item{decreasing}{Logical indicating whether histograms should be sorted in +\item{decreasing}{Logical indicating whether histograms should be sorted in increasing (default) or decreasing order of location} } \description{ -Sort a discrete histogram so that locations are in increasing (default) or +Sort a discrete histogram so that locations are in increasing (default) or decreasing order } diff --git a/man/virusppi.Rd b/man/virusppi.Rd index fa16de16..ddbb8eb2 100644 --- a/man/virusppi.Rd +++ b/man/virusppi.Rd @@ -17,7 +17,7 @@ virusppi } \description{ -A dataset containing the protein-protein interaction networks for the +A dataset containing the protein-protein interaction networks for the following 5 microorganisms \itemize{ \item EBV diff --git a/vignettes/quickstart_netdis_2graphs.R b/vignettes/quickstart_netdis_2graphs.R index 0732236f..cb64148a 100644 --- a/vignettes/quickstart_netdis_2graphs.R +++ b/vignettes/quickstart_netdis_2graphs.R @@ -22,10 +22,10 @@ ref_graph <- read_simple_graph(ref_path, format = "ncol") ## ------------------------------------------------------------------------ # Maximum graphlet size to calculate counts and netdis statistic for. -max_graphlet_size = 4 +max_graphlet_size <- 4 # Ego network neighbourhood size -neighbourhood_size = 2 +neighbourhood_size <- 2 # Minimum size of ego networks to consider min_ego_nodes <- 3 @@ -35,6 +35,16 @@ min_ego_edges <- 1 min_bin_count <- 5 num_bins <- 100 +## ------------------------------------------------------------------------ +netdis_one_to_one(graph_1, graph_2, + ref_graph, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100) + ## ------------------------------------------------------------------------ # Get ego networks for query graphs and reference graph ego_1 <- make_named_ego_graph(graph_1, @@ -107,11 +117,10 @@ sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) ## ------------------------------------------------------------------------ -netdis_uptok(sum_graphlet_counts_1, - sum_graphlet_counts_2, - max_graphlet_size) +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) -## ------------------------------------------------------------------------ - +print(netdis_result) diff --git a/vignettes/quickstart_netdis_2graphs.html b/vignettes/quickstart_netdis_2graphs.html index 13fda1ea..9c0a882b 100644 --- a/vignettes/quickstart_netdis_2graphs.html +++ b/vignettes/quickstart_netdis_2graphs.html @@ -12,7 +12,7 @@ - + Quick start guide for Netdis - 2 graphs @@ -303,7 +303,7 @@

Quick start guide for Netdis - 2 graphs

Martin O’Reilly, Jack Roberts

-

2019-07-29

+

2019-07-30

@@ -334,10 +334,10 @@

Load graphs

-
-

Generate ego networks for query graphs

- + -
-

Count graphlets in ego networks for query graphs

- +

Bin reference graph by ego network density and calculate mean graphlet counts in each bin

- +

Calculate netdis statistics

- -
## [1] 0.1846655 0.1749835
-
- diff --git a/vignettes/quickstart_netdis_functions.Rmd b/vignettes/quickstart_netdis_functions.Rmd new file mode 100644 index 00000000..79731db5 --- /dev/null +++ b/vignettes/quickstart_netdis_functions.Rmd @@ -0,0 +1,120 @@ +--- +title: "Quick start guide for usage of netdis functions" +author: "Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Quick start for Netdis functions} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Load graphs +```{r} +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Load reference graph +# JACK - need to deal with case where ref graph not used. +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +``` + +## Compare two graphs +```{r} +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Calculate netdis statistics +netdis_one_to_one(graph_1, graph_2, + ref_graph, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100) +``` + +## Compare one graph to many other graphs +```{r} +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graphs_compare <- 0 + +# Calculate netdis statistics +netdis_one_to_many(graph_1, graphs_compare, + ref_graph, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100) +``` + +## Do pairwise netdis calculations for many graphs +```{r} +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +graphs <- 0 + +# Calculate netdis statistics +netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100) +``` \ No newline at end of file From 70277bcf76bda2418619a4b2834e0cfc0adb5362 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 30 Jul 2019 19:10:54 +0100 Subject: [PATCH 022/188] functions for one-to-one, one-to-many and many-to-many --- NAMESPACE | 2 + R/measures_net_dis.R | 178 ++++++-- man/cross_comparison_spec.Rd | 9 +- man/netdis_many_to_many.Rd | 40 ++ man/netdis_one_to_many.Rd | 39 ++ vignettes/quickstart_netdis_2graphs.R | 10 - vignettes/quickstart_netdis_2graphs.html | 138 +++---- vignettes/quickstart_netdis_functions.R | 94 +++++ vignettes/quickstart_netdis_functions.Rmd | 12 +- vignettes/quickstart_netdis_functions.html | 448 +++++++++++++++++++++ 10 files changed, 841 insertions(+), 129 deletions(-) create mode 100644 man/netdis_many_to_many.Rd create mode 100644 man/netdis_one_to_many.Rd create mode 100644 vignettes/quickstart_netdis_functions.R create mode 100644 vignettes/quickstart_netdis_functions.html diff --git a/NAMESPACE b/NAMESPACE index 5c7bcef3..90921885 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,8 @@ export(netdis_expected_graphlet_counts_ego) export(netdis_expected_graphlet_counts_ego_fn) export(netdis_expected_graphlet_counts_per_ego) export(netdis_for_all_graphs) +export(netdis_many_to_many) +export(netdis_one_to_many) export(netdis_one_to_one) export(netdis_uptok) export(normalise_dhist_mass) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 7668a777..814889c7 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -21,29 +21,129 @@ netdis_one_to_one <- function(graph_1, graph_2, min_ego_edges = 1, min_bin_count = 5, num_bins = 100) { + + # bundle graphs into one vector with format needed for + # netdis many-to-many + graphs <- list(graph_1 = graph_1, graph_2 = graph_2) + + # calculate netdis + result <- netdis_many_to_many( + graphs, + ref_graph, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100 + ) + + # extract netdis statistics from list returned by netdis_many_to_many + result$netdis[, 1] +} + +#' Netdis comparisons between one graph and many other graphs +#' @param graph_1 query graph - this graph will be compared with +#' all graphs in graphs_compare +#' @param graphs_compare graphs graph_1 will be compared with +#' @param ref_graph Reference graph +#' @param max_graphlet_size Generate graphlets up to this size +#' @param neighbourhood_size Ego network neighbourhood size +#' @param min_ego_nodes Filter ego networks which have fewer +#' than min_ego_nodes nodes +#' @param min_ego_edges Filter ego networks which have fewer +#' than min_ego_edges edges +#' @param min_bin_count Minimum number of ego networks in each density bin +#' @param num_bins Number of density bins to generate +#' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes +#' up to and including max_graphlet_size +#' @export +netdis_one_to_many <- function(graph_1, graphs_compare, + ref_graph, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100) { + + # bundle graph_1 and graphs_compare to one vector, with + # graph_1 at start as needed for netdis_many_to_many call + graphs <- append(graphs_compare, list(graph_1=graph_1), after=0) + + # calculate netdis + result <- netdis_many_to_many( + graphs, + ref_graph, + comparisons = 'one-to-many', + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100 + ) + + # restructure netdis_many_to_many output + colnames(result$netdis) <- result$comp_spec$name_b + result$netdis +} + + +#' Netdis between all graph pairs +#' @param graphs Query graphs +#' @param ref_graph Reference graph +#' @param comparisons Which comparisons to perform between graphs. +#' Can be "many-to-many" (all pairwise combinations) or "one-to-many" +#' (compare first graph in graphs to all other graphs.) +#' @param max_graphlet_size Generate graphlets up to this size +#' @param neighbourhood_size Ego network neighbourhood size +#' @param min_ego_nodes Filter ego networks which have fewer +#' than min_ego_nodes nodes +#' @param min_ego_edges Filter ego networks which have fewer +#' than min_ego_edges edges +#' @param min_bin_count Minimum number of ego networks in each density bin +#' @param num_bins Number of density bins to generate +#' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes +#' up to and including max_graphlet_size +#' @export +netdis_many_to_many <- function(graphs, + ref_graph, + comparisons = 'many-to-many', + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100) { ## ------------------------------------------------------------------------ # Get ego networks for query graphs and reference graph - ego_1 <- make_named_ego_graph(graph_1, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - - ego_2 <- make_named_ego_graph(graph_2, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + ego_networks <- purrr::map( + graphs, make_named_ego_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) - ego_ref <- make_named_ego_graph(ref_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + ego_ref <- make_named_ego_graph( + ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) ## ------------------------------------------------------------------------ # Count graphlets for ego networks in query and reference graphs - graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) - graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) - - graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) + graphlet_counts <- purrr::map( + ego_networks, + ego_to_graphlet_counts, + max_graphlet_size = max_graphlet_size + ) + + graphlet_counts_ref <- ego_to_graphlet_counts( + ego_ref, + max_graphlet_size = max_graphlet_size + ) ## ------------------------------------------------------------------------ # Scale ego-network graphlet counts by dividing by total number of k-tuples in @@ -70,33 +170,41 @@ netdis_one_to_one <- function(graph_1, graph_2, ## ------------------------------------------------------------------------ # Calculate expected graphlet counts (using ref graph ego network density bins) - exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, - max_graphlet_size, - ref_ego_density_bins, - ref_binned_graphlet_counts) - - - exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, - max_graphlet_size, - ref_ego_density_bins, - ref_binned_graphlet_counts) + exp_graphlet_counts <- purrr::map( + ego_networks, + netdis_expected_graphlet_counts_per_ego, + max_graphlet_size = max_graphlet_size, + density_breaks = ref_ego_density_bins, + density_binned_reference_counts = ref_binned_graphlet_counts + ) # Centre graphlet counts by subtracting expected counts - centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 - - centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 + centred_graphlet_counts <- mapply("-", graphlet_counts, exp_graphlet_counts) ## ------------------------------------------------------------------------ # Sum centred graphlet counts across all ego networks - sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + sum_graphlet_counts <- lapply(centred_graphlet_counts, colSums) - sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) + ## ------------------------------------------------------------------------ + # Generate pairwise comparisons + comp_spec <- cross_comparison_spec(sum_graphlet_counts, how = comparisons) ## ------------------------------------------------------------------------ # Calculate netdis statistics - netdis_uptok(sum_graphlet_counts_1, - sum_graphlet_counts_2, - max_graphlet_size) + results <- parallel::mcmapply( + function(index_a, index_b) { + netdis_uptok( + sum_graphlet_counts[[index_a]], + sum_graphlet_counts[[index_b]], + max_graphlet_size = max_graphlet_size + ) + }, + comp_spec$index_a, + comp_spec$index_b, + SIMPLIFY = TRUE) + + + list(netdis = results, comp_spec = comp_spec) } diff --git a/man/cross_comparison_spec.Rd b/man/cross_comparison_spec.Rd index 1f64d9b1..35ab245f 100644 --- a/man/cross_comparison_spec.Rd +++ b/man/cross_comparison_spec.Rd @@ -4,11 +4,16 @@ \alias{cross_comparison_spec} \title{Generate a cross-comparison specification} \usage{ -cross_comparison_spec(named_list) +cross_comparison_spec(named_list, how = "many-to-many") } \arguments{ \item{named_list}{A named list of items for which an exhaustive pair-wise cross-comparison is required.} + +\item{how}{How to generate pair-wise combinations. Either "many-to-many" +(default) which generates all possible pair-wise combinations, or +"one-to-many" which generates all combinations between the first element +in named_list and the rest of the elements only.} } \value{ A matrix with one row for each possible pair-wise combination @@ -17,6 +22,6 @@ contain the names of the elements in the pair and the third and fourth columns contain the indexes of these elements in the provided list. } \description{ -Creates a cross-comparison matrix with all possible pair-wise combinations +Creates a cross-comparison matrix with pair-wise combinations of elements from the provided list. } diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd new file mode 100644 index 00000000..e648d8bf --- /dev/null +++ b/man/netdis_many_to_many.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_many_to_many} +\alias{netdis_many_to_many} +\title{Netdis between all graph pairs} +\usage{ +netdis_many_to_many(graphs, ref_graph, comparisons = "many-to-many", + max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, + min_ego_edges = 1, min_bin_count = 5, num_bins = 100) +} +\arguments{ +\item{graphs}{Query graphs} + +\item{ref_graph}{Reference graph} + +\item{comparisons}{Which comparisons to perform between graphs. +Can be "many-to-many" (all pairwise combinations) or "one-to-many" +(compare first graph in graphs to all other graphs.)} + +\item{max_graphlet_size}{Generate graphlets up to this size} + +\item{neighbourhood_size}{Ego network neighbourhood size} + +\item{min_ego_nodes}{Filter ego networks which have fewer +than min_ego_nodes nodes} + +\item{min_ego_edges}{Filter ego networks which have fewer +than min_ego_edges edges} + +\item{min_bin_count}{Minimum number of ego networks in each density bin} + +\item{num_bins}{Number of density bins to generate} +} +\value{ +Netdis statistics between graph_1 and graph_2 for graphlet sizes +up to and including max_graphlet_size +} +\description{ +Netdis between all graph pairs +} diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd new file mode 100644 index 00000000..860701ea --- /dev/null +++ b/man/netdis_one_to_many.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_one_to_many} +\alias{netdis_one_to_many} +\title{Netdis comparisons between one graph and many other graphs} +\usage{ +netdis_one_to_many(graph_1, graphs_compare, ref_graph, + max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, + min_ego_edges = 1, min_bin_count = 5, num_bins = 100) +} +\arguments{ +\item{graph_1}{query graph - this graph will be compared with +all graphs in graphs_compare} + +\item{graphs_compare}{graphs graph_1 will be compared with} + +\item{ref_graph}{Reference graph} + +\item{max_graphlet_size}{Generate graphlets up to this size} + +\item{neighbourhood_size}{Ego network neighbourhood size} + +\item{min_ego_nodes}{Filter ego networks which have fewer +than min_ego_nodes nodes} + +\item{min_ego_edges}{Filter ego networks which have fewer +than min_ego_edges edges} + +\item{min_bin_count}{Minimum number of ego networks in each density bin} + +\item{num_bins}{Number of density bins to generate} +} +\value{ +Netdis statistics between graph_1 and graph_2 for graphlet sizes +up to and including max_graphlet_size +} +\description{ +Netdis comparisons between one graph and many other graphs +} diff --git a/vignettes/quickstart_netdis_2graphs.R b/vignettes/quickstart_netdis_2graphs.R index cb64148a..a32a2533 100644 --- a/vignettes/quickstart_netdis_2graphs.R +++ b/vignettes/quickstart_netdis_2graphs.R @@ -35,16 +35,6 @@ min_ego_edges <- 1 min_bin_count <- 5 num_bins <- 100 -## ------------------------------------------------------------------------ -netdis_one_to_one(graph_1, graph_2, - ref_graph, - max_graphlet_size = 4, - neighbourhood_size = 2, - min_ego_nodes = 3, - min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100) - ## ------------------------------------------------------------------------ # Get ego networks for query graphs and reference graph ego_1 <- make_named_ego_graph(graph_1, diff --git a/vignettes/quickstart_netdis_2graphs.html b/vignettes/quickstart_netdis_2graphs.html index 9c0a882b..2758180e 100644 --- a/vignettes/quickstart_netdis_2graphs.html +++ b/vignettes/quickstart_netdis_2graphs.html @@ -346,98 +346,88 @@

Set Netdis parameters

# Ego network density binning parameters min_bin_count <- 5 num_bins <- 100
- -
##   netdis3   netdis4 
-## 0.1846655 0.1749835

Bin reference graph by ego network density and calculate mean graphlet counts in each bin

- +
diff --git a/vignettes/quickstart_netdis_functions.R b/vignettes/quickstart_netdis_functions.R new file mode 100644 index 00000000..6f38bf50 --- /dev/null +++ b/vignettes/quickstart_netdis_functions.R @@ -0,0 +1,94 @@ +## ------------------------------------------------------------------------ +# Load libraries +library("netdist") +library("purrr") + +## ------------------------------------------------------------------------ +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Load reference graph +# JACK - need to deal with case where ref graph not used. +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +## ------------------------------------------------------------------------ +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + + +## ------------------------------------------------------------------------ +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Calculate netdis statistics +netdis_one_to_one(graph_1, graph_2, + ref_graph, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100) + +## ------------------------------------------------------------------------ +# Load query graphs +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +graph_1 <- graphs$EBV +graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] + +# Calculate netdis statistics +netdis_one_to_many(graph_1, graphs_compare, + ref_graph, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100) + +## ------------------------------------------------------------------------ +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + +# Calculate netdis statistics +netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + min_bin_count = 5, + num_bins = 100) + diff --git a/vignettes/quickstart_netdis_functions.Rmd b/vignettes/quickstart_netdis_functions.Rmd index 79731db5..87db36f3 100644 --- a/vignettes/quickstart_netdis_functions.Rmd +++ b/vignettes/quickstart_netdis_functions.Rmd @@ -83,12 +83,9 @@ netdis_one_to_one(graph_1, graph_2, ## Compare one graph to many other graphs ```{r} # Load query graphs -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") - -graphs_compare <- 0 +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +graph_1 <- graphs$EBV +graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] # Calculate netdis statistics netdis_one_to_many(graph_1, graphs_compare, @@ -105,8 +102,7 @@ netdis_one_to_many(graph_1, graphs_compare, ```{r} # Load query graphs source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - -graphs <- 0 +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") # Calculate netdis statistics netdis_many_to_many(graphs, diff --git a/vignettes/quickstart_netdis_functions.html b/vignettes/quickstart_netdis_functions.html new file mode 100644 index 00000000..eb7981fa --- /dev/null +++ b/vignettes/quickstart_netdis_functions.html @@ -0,0 +1,448 @@ + + + + + + + + + + + + + + + + +Quick start guide for usage of netdis functions + + + + + + + + + + + + + + + + + + + + + +

Quick start guide for usage of netdis functions

+

Jack Roberts

+

2019-07-30

+ + + + + + + + +
+

Do pairwise netdis calculations for many graphs

+ +
## $netdis
+##              [,1]        [,2]       [,3]        [,4]      [,5]      [,6]
+## netdis3 0.1846655 0.008264222 0.01005385 0.006777578 0.2065762 0.2091241
+## netdis4 0.1749835 0.165264120 0.01969246 0.159711160 0.2917612 0.2215579
+##              [,7]         [,8]         [,9]        [,10]
+## netdis3 0.2075471 0.0001335756 0.0001748254 0.0005964448
+## netdis4 0.4171614 0.0760242643 0.0343418653 0.1311552411
+## 
+## $comp_spec
+##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+ + + + + + + + From 6f08ab7a930c0177f16745e7546a47abf1d6c06d Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 30 Jul 2019 19:18:16 +0100 Subject: [PATCH 023/188] Update quickstart_netdis_functions.Rmd --- vignettes/quickstart_netdis_functions.Rmd | 62 +++++++++-------------- 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/vignettes/quickstart_netdis_functions.Rmd b/vignettes/quickstart_netdis_functions.Rmd index 87db36f3..eef7a30d 100644 --- a/vignettes/quickstart_netdis_functions.Rmd +++ b/vignettes/quickstart_netdis_functions.Rmd @@ -16,25 +16,6 @@ library("netdist") library("purrr") ``` -## Load graphs -```{r} -# Set source directory for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - -# Load query graphs -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") - -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") - -# Load reference graph -# JACK - need to deal with case where ref graph not used. -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") -ref_graph <- read_simple_graph(ref_path, format = "ncol") -``` - ## Set Netdis parameters ```{r} # Maximum graphlet size to calculate counts and netdis statistic for. @@ -72,12 +53,12 @@ graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), # Calculate netdis statistics netdis_one_to_one(graph_1, graph_2, ref_graph, - max_graphlet_size = 4, - neighbourhood_size = 2, - min_ego_nodes = 3, - min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100) + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + min_bin_count = min_bin_count, + num_bins = num_bins) ``` ## Compare one graph to many other graphs @@ -90,12 +71,12 @@ graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] # Calculate netdis statistics netdis_one_to_many(graph_1, graphs_compare, ref_graph, - max_graphlet_size = 4, - neighbourhood_size = 2, - min_ego_nodes = 3, - min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100) + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + min_bin_count = min_bin_count, + num_bins = num_bins) ``` ## Do pairwise netdis calculations for many graphs @@ -105,12 +86,15 @@ source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") # Calculate netdis statistics -netdis_many_to_many(graphs, - ref_graph, - max_graphlet_size = 4, - neighbourhood_size = 2, - min_ego_nodes = 3, - min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100) +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + min_bin_count = min_bin_count, + num_bins = num_bins) + +print(results$netdis) +print(results$comp_spec) ``` \ No newline at end of file From c8fe4787ac47bb539f073847dbd34663a568c710 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 30 Aug 2019 14:42:30 +0100 Subject: [PATCH 024/188] vignette using geometric poisson approximation --- vignettes/netdis_2graphs_polya-aeppli.Rmd | 184 ++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 vignettes/netdis_2graphs_polya-aeppli.Rmd diff --git a/vignettes/netdis_2graphs_polya-aeppli.Rmd b/vignettes/netdis_2graphs_polya-aeppli.Rmd new file mode 100644 index 00000000..0142cb0a --- /dev/null +++ b/vignettes/netdis_2graphs_polya-aeppli.Rmd @@ -0,0 +1,184 @@ +--- +title: "Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation" +author: "Martin O'Reilly, Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis - 2 graphs with GP Approximation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Load graphs +```{r} +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 +``` + +## Generate ego networks +```{r} +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +``` + +## Count graphlets in ego networks +```{r} +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) + +``` + +## Bin ego networks by density +```{r} + +# Get ego-network densities +densities_1 <- ego_network_density(ego_1) +densities_2 <- ego_network_density(ego_2) + +# Adaptively bin ego-network densities +binned_densities_1 <- binned_densities_adaptive(densities_1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ego_density_bins_1 <- binned_densities_1$breaks + +binned_densities_2 <- binned_densities_adaptive(densities_2, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ego_density_bins_2 <- binned_densities_2$breaks +``` + +## Calculate expected graphlet counts in each bin using geometric poisson approximation +```{r} + +expected_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) { + + mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + graphlet_counts, + bin_indexes) + + exp_counts_bin <- function(bin_idx) { + counts <- graphlet_counts[bin_indexes == bin_idx, ] + means <- mean_binned_graphlet_counts[bin_idx,] + + mean_sub_counts <- sweep(counts, 2, means) + + Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1) + theta_d <- 2*means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) + + lambda_dk <- (1 / length(graphlet_idx)) * + sum( + 2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]) + ) + + exp_counts_dk <- append(exp_counts_dk, + lambda_dk / theta_d[graphlet_idx]) + } + + exp_counts_dk + } + + nbins <- length(unique(bin_indexes)) + expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins)) + + # deal with NAs caused by bins with zero counts for a graphlet + expected_counts_bin[is.nan(expected_counts_bin)] = 0 + + expected_counts_bin +} + + +binned_graphlet_counts_1 <- expected_counts_gp(graphlet_counts_1, + binned_densities_1$interval_indexes, + max_graphlet_size) + +binned_graphlet_counts_2 <- expected_counts_gp(graphlet_counts_2, + binned_densities_2$interval_indexes, + max_graphlet_size) + + +``` + +## Centre graphlet counts of query graphs using binned expected counts +```{r} +# Calculate expected graphlet counts for each ego network +exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, + max_graphlet_size, + ego_density_bins_1, + binned_graphlet_counts_1) + + +exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, + max_graphlet_size, + ego_density_bins_2, + binned_graphlet_counts_2) +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 + +centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 +``` + +## Sum centred graphlet counts across all ego networks +```{r} +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) +``` + +## Calculate netdis statistics +```{r} + +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) +``` \ No newline at end of file From 479f424328d784ce85af4ac5588503760bf9796a Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 30 Aug 2019 18:09:07 +0100 Subject: [PATCH 025/188] started constant counts vignette but bugs - netdis_expected_graphlet_counts applies scaling so shouldn't be used for GP approx or constant count subtraction as written --- R/measures_net_dis.R | 2 +- vignettes/netdis_2graphs_constant_exp.Rmd | 119 ++++++++++++++++++++++ vignettes/netdis_2graphs_polya-aeppli.Rmd | 2 - vignettes/quickstart_netdis_2graphs.Rmd | 26 ++--- 4 files changed, 133 insertions(+), 16 deletions(-) create mode 100644 vignettes/netdis_2graphs_constant_exp.Rmd diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 814889c7..59a71bbd 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -568,7 +568,7 @@ netdis_expected_graphlet_counts_ego <- function(graph, #' #' JACK To follow through logic of paper steps, wanted to pass #' ego networks to the function, not the input query graph -#' (as in netdis_expected_graphlet_counts_ego_fn above). +#' (as in netdis_expected_graphlet_counts_ego above). #' #' Used by \code{netdis_expected_graphlet_counts_ego_fn} to #' generate a function for calculating expected ego-network graphlet counts diff --git a/vignettes/netdis_2graphs_constant_exp.Rmd b/vignettes/netdis_2graphs_constant_exp.Rmd new file mode 100644 index 00000000..455884d0 --- /dev/null +++ b/vignettes/netdis_2graphs_constant_exp.Rmd @@ -0,0 +1,119 @@ +--- +title: "Netdis - 2 graphs with Constant Expected Counts for Each Graphlet" +author: "Martin O'Reilly, Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis - 2 graphs with constant expected counts} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Load graphs +```{r} +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 +``` + +## Generate ego networks +```{r} +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +``` + +## Count graphlets in ego networks +```{r} +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) +``` + +## Bin ego networks by density +```{r} +mean_graphlet_counts_1 <- colMeans(graphlet_counts_1) +mean_graphlet_counts_2 <- colMeans(graphlet_counts_2) + +# functions take expected counts as 2 dimensional vectors so create 2nd dim +dim(mean_graphlet_counts_1) <- c(1, length(mean_graphlet_counts_1)) +dim(mean_graphlet_counts_2) <- c(1, length(mean_graphlet_counts_2)) + +bins <- c(0, 1) +``` + + +## Centre graphlet counts of query graphs using binned expected counts +```{r} +# Calculate expected graphlet counts for each ego network +exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, + max_graphlet_size, + bins, + mean_graphlet_counts_1) + + +exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, + max_graphlet_size, + bins, + mean_graphlet_counts_2) +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 + +centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 +``` + +## Sum centred graphlet counts across all ego networks +```{r} +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) +``` + +## Calculate netdis statistics +```{r} + +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) +``` \ No newline at end of file diff --git a/vignettes/netdis_2graphs_polya-aeppli.Rmd b/vignettes/netdis_2graphs_polya-aeppli.Rmd index 0142cb0a..69656136 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.Rmd +++ b/vignettes/netdis_2graphs_polya-aeppli.Rmd @@ -135,7 +135,6 @@ expected_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) expected_counts_bin } - binned_graphlet_counts_1 <- expected_counts_gp(graphlet_counts_1, binned_densities_1$interval_indexes, max_graphlet_size) @@ -144,7 +143,6 @@ binned_graphlet_counts_2 <- expected_counts_gp(graphlet_counts_2, binned_densities_2$interval_indexes, max_graphlet_size) - ``` ## Centre graphlet counts of query graphs using binned expected counts diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index bcf139df..c357a544 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -28,11 +28,6 @@ graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") -# Load reference graph -# JACK - need to deal with case where ref graph not used. -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") -ref_graph <- read_simple_graph(ref_path, format = "ncol") ``` ## Set Netdis parameters @@ -64,11 +59,6 @@ ego_2 <- make_named_ego_graph(graph_2, order = neighbourhood_size, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges) - -ego_ref <- make_named_ego_graph(ref_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) ``` ## Count graphlets in ego networks @@ -76,12 +66,22 @@ ego_ref <- make_named_ego_graph(ref_graph, # Count graphlets for ego networks in query and reference graphs graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) - -graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) ``` -## Bin reference graph by ego network density and calculate mean graphlet counts in each bin +## Use a reference graph to calculate expected graphlet counts in ego network density bins ```{r} +# Load reference graph +# JACK - need to deal with case where ref graph not used. +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) # Scale ego-network graphlet counts by dividing by total number of k-tuples in # ego-network (where k is graphlet size) From e310b71c9154420112987bad95d3e89981bda133 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Mon, 2 Sep 2019 17:05:16 +0100 Subject: [PATCH 026/188] make scaling counts with graphlet tuples optional --- R/measures_net_dis.R | 22 +++++++++++++++------- vignettes/netdis_2graphs_constant_exp.Rmd | 8 +++++--- vignettes/netdis_2graphs_polya-aeppli.Rmd | 6 ++++-- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 59a71bbd..2226d73c 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -579,7 +579,8 @@ netdis_expected_graphlet_counts_ego <- function(graph, netdis_expected_graphlet_counts_per_ego <- function(ego_networks, max_graphlet_size, density_breaks, - density_binned_reference_counts) { + density_binned_reference_counts, + scale_counts_fn=count_graphlet_tuples) { # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. @@ -587,7 +588,8 @@ netdis_expected_graphlet_counts_per_ego <- function(ego_networks, purrr::map(ego_networks, netdis_expected_graphlet_counts, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, - density_binned_reference_counts = density_binned_reference_counts + density_binned_reference_counts = density_binned_reference_counts, + scale_counts_fn = scale_counts_fn ) names(expected_graphlet_counts) <- names(ego_networks) @@ -606,17 +608,23 @@ netdis_expected_graphlet_counts_per_ego <- function(ego_networks, netdis_expected_graphlet_counts <- function(graph, max_graphlet_size, density_breaks, - density_binned_reference_counts) { + density_binned_reference_counts, + scale_counts_fn=count_graphlet_tuples) { # Look up average scaled graphlet counts for graphs of similar density # in the reference graph query_density <- igraph::edge_density(graph) matched_density_index <- interval_index(query_density, density_breaks) matched_reference_counts <- density_binned_reference_counts[matched_density_index, ] - # Scale reference counts by multiplying the reference count for each graphlet - # by the number of possible sets of k nodes in the query graph, where k is the - # number of nodes in the graphlet - matched_reference_counts * count_graphlet_tuples(graph, max_graphlet_size) + + if (!is.null(scale_counts_fn)) { + # Scale reference counts e.g. by multiplying the reference count for each graphlet + # by the number of possible sets of k nodes in the query graph, where k is the + # number of nodes in the graphlet + matched_reference_counts <- matched_reference_counts * scale_counts_fn(graph, max_graphlet_size) + } + + matched_reference_counts } #' INTERNAL FUNCTION - Do not call directly diff --git a/vignettes/netdis_2graphs_constant_exp.Rmd b/vignettes/netdis_2graphs_constant_exp.Rmd index 455884d0..82ded85a 100644 --- a/vignettes/netdis_2graphs_constant_exp.Rmd +++ b/vignettes/netdis_2graphs_constant_exp.Rmd @@ -69,7 +69,7 @@ graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graph graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) ``` -## Bin ego networks by density +## Use mean graphlet counts as expected counts ```{r} mean_graphlet_counts_1 <- colMeans(graphlet_counts_1) mean_graphlet_counts_2 <- colMeans(graphlet_counts_2) @@ -88,13 +88,15 @@ bins <- c(0, 1) exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, max_graphlet_size, bins, - mean_graphlet_counts_1) + mean_graphlet_counts_1, + scale_counts_fn = NULL) exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, max_graphlet_size, bins, - mean_graphlet_counts_2) + mean_graphlet_counts_2, + scale_counts_fn = NULL) # Centre graphlet counts by subtracting expected counts centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 diff --git a/vignettes/netdis_2graphs_polya-aeppli.Rmd b/vignettes/netdis_2graphs_polya-aeppli.Rmd index 69656136..abd36c8f 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.Rmd +++ b/vignettes/netdis_2graphs_polya-aeppli.Rmd @@ -151,13 +151,15 @@ binned_graphlet_counts_2 <- expected_counts_gp(graphlet_counts_2, exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, max_graphlet_size, ego_density_bins_1, - binned_graphlet_counts_1) + binned_graphlet_counts_1, + scale_counts_fn = NULL) exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, max_graphlet_size, ego_density_bins_2, - binned_graphlet_counts_2) + binned_graphlet_counts_2, + scale_counts_fn = NULL) # Centre graphlet counts by subtracting expected counts centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 From 3bafe7c534c56253a4c6621d2517bbedbda8e0eb Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Wed, 4 Sep 2019 11:50:47 +0100 Subject: [PATCH 027/188] expected counts using query networks rather than ref network --- R/measures_net_dis.R | 253 ++++++++++++++----- vignettes/netdis_2graphs_constant_exp.Rmd | 11 +- vignettes/netdis_2graphs_polya-aeppli.Rmd | 14 +- vignettes/netdis_multigraph_polya-aeppli.Rmd | 191 ++++++++++++++ vignettes/quickstart_netdis_functions.Rmd | 12 +- 5 files changed, 402 insertions(+), 79 deletions(-) create mode 100644 vignettes/netdis_multigraph_polya-aeppli.Rmd diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 2226d73c..409dacc0 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -19,8 +19,9 @@ netdis_one_to_one <- function(graph_1, graph_2, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100) { + binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, num_intervals = 100), + bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego, max_graphlet_size = max_graphlet_size), + exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, max_graphlet_size = max_graphlet_size, scale_counts_fn=count_graphlet_tuples)) { # bundle graphs into one vector with format needed for # netdis many-to-many @@ -34,8 +35,9 @@ netdis_one_to_one <- function(graph_1, graph_2, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100 + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn ) # extract netdis statistics from list returned by netdis_many_to_many @@ -64,8 +66,9 @@ netdis_one_to_many <- function(graph_1, graphs_compare, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100) { + binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, num_intervals = 100), + bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego, max_graphlet_size = max_graphlet_size), + exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, max_graphlet_size = max_graphlet_size, scale_counts_fn=count_graphlet_tuples)) { # bundle graph_1 and graphs_compare to one vector, with # graph_1 at start as needed for netdis_many_to_many call @@ -80,8 +83,9 @@ netdis_one_to_many <- function(graph_1, graphs_compare, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100 + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn ) # restructure netdis_many_to_many output @@ -114,10 +118,18 @@ netdis_many_to_many <- function(graphs, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100) { + binning_fn = purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 5, + num_intervals = 100), + bin_counts_fn = purrr::partial(density_binned_counts, + max_graphlet_size = max_graphlet_size, + agg_fn = mean, + scale_fn = scale_graphlet_counts_ego), + exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, + max_graphlet_size = max_graphlet_size, + scale_counts_fn=count_graphlet_tuples)) { ## ------------------------------------------------------------------------ - # Get ego networks for query graphs and reference graph + # Get ego networks for query graphs ego_networks <- purrr::map( graphs, make_named_ego_graph, order = neighbourhood_size, @@ -125,59 +137,86 @@ netdis_many_to_many <- function(graphs, min_ego_edges = min_ego_edges ) - ego_ref <- make_named_ego_graph( - ref_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ) - ## ------------------------------------------------------------------------ - # Count graphlets for ego networks in query and reference graphs + # Count graphlets for ego networks in query graphs graphlet_counts <- purrr::map( ego_networks, ego_to_graphlet_counts, max_graphlet_size = max_graphlet_size ) - - graphlet_counts_ref <- ego_to_graphlet_counts( - ego_ref, - max_graphlet_size = max_graphlet_size - ) ## ------------------------------------------------------------------------ - # Scale ego-network graphlet counts by dividing by total number of k-tuples in - # ego-network (where k is graphlet size) - scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(ego_ref, - graphlet_counts_ref, - max_graphlet_size) - - # Get ego-network densities - densities_ref <- ego_network_density(ego_ref) - - # Adaptively bin ref ego-network densities - binned_densities <- binned_densities_adaptive(densities_ref, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) - - ref_ego_density_bins <- binned_densities$breaks - - # Average ref graphlet counts across density bins - ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - scaled_graphlet_counts_ref, - binned_densities$interval_indexes) - + # Case where expected counts calculated using a reference network + if (!is.null(ref_graph)) { + # Get ego networks + ego_ref <- make_named_ego_graph( + ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + + # Get ego network graphlet counts + graphlet_counts_ref <- ego_to_graphlet_counts( + ego_ref, + max_graphlet_size = max_graphlet_size + ) + + # Get ego-network densities + densities_ref <- ego_network_density(ego_ref) + + # bin ref ego-network densities + binned_densities <- binning_fn(densities_ref) + + ref_ego_density_bins <- binned_densities$breaks + # Average ref graphlet counts across density bins + ref_binned_graphlet_counts <- bin_counts_fn( + graphlet_counts_ref, + binned_densities$interval_indexes, + ego_networks = ego_ref + ) + + # Calculate expected graphlet counts (using ref graph ego network density bins) + exp_graphlet_counts <- purrr::map( + ego_networks, + exp_counts_fn, + density_breaks = ref_ego_density_bins, + density_binned_reference_counts = ref_binned_graphlet_counts + ) + ## ------------------------------------------------------------------------ - # Calculate expected graphlet counts (using ref graph ego network density bins) - exp_graphlet_counts <- purrr::map( - ego_networks, - netdis_expected_graphlet_counts_per_ego, - max_graphlet_size = max_graphlet_size, - density_breaks = ref_ego_density_bins, - density_binned_reference_counts = ref_binned_graphlet_counts - ) + } else { + # Case where expected counts calculated using query networks + + # Get ego-network densities + densities <- purrr::map(ego_networks, + ego_network_density) + + # bin ref ego-network densities + binned_densities <- purrr::map(densities, + binning_fn) + + # extract bin breaks and indexes from binning results + ego_density_bins <- purrr::map(binned_densities, + function(x) {x$breaks}) + ego_density_bin_indexes <- purrr::map(binned_densities, + function(x) {x$interval_indexes}) + + + # Calculate expected counts in each bin + binned_graphlet_counts <- mapply(bin_counts_fn, + graphlet_counts, + ego_density_bin_indexes) + + # Calculate expected graphlet counts for each ego network + exp_graphlet_counts <- mapply(exp_counts_fn, + ego_networks, + ego_density_bins, + binned_graphlet_counts) + } + ## ------------------------------------------------------------------------ # Centre graphlet counts by subtracting expected counts centred_graphlet_counts <- mapply("-", graphlet_counts, exp_graphlet_counts) @@ -580,7 +619,7 @@ netdis_expected_graphlet_counts_per_ego <- function(ego_networks, max_graphlet_size, density_breaks, density_binned_reference_counts, - scale_counts_fn=count_graphlet_tuples) { + scale_counts_fn=NULL) { # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. @@ -609,7 +648,7 @@ netdis_expected_graphlet_counts <- function(graph, max_graphlet_size, density_breaks, density_binned_reference_counts, - scale_counts_fn=count_graphlet_tuples) { + scale_counts_fn=NULL) { # Look up average scaled graphlet counts for graphs of similar density # in the reference graph query_density <- igraph::edge_density(graph) @@ -636,7 +675,8 @@ netdis_expected_graphlet_counts <- function(graph, #' TODO: Remove @export prior to publishing #' @export mean_density_binned_graphlet_counts <- function(graphlet_counts, - density_interval_indexes) { + density_interval_indexes, + agg_fn = mean) { # The ego network graphlet counts are an E x G matrix with rows (E) # representing ego networks and columns (G) representing graphlets. We want # to calculate the mean count for each graphlet / density bin combination, @@ -644,8 +684,106 @@ mean_density_binned_graphlet_counts <- function(graphlet_counts, # bins, using apply to map this operation over graphlets mean_density_binned_graphlet_counts <- apply(graphlet_counts, MARGIN = 2, function(gc) { - tapply(gc, INDEX = density_interval_indexes, FUN = mean) + tapply(gc, INDEX = density_interval_indexes, FUN = agg_fn) }) + + # if only 1 bin (i.e. no binning) will be left with a 1D list. + # convert it into a 2D list. + if (is.null(dim(mean_density_binned_graphlet_counts))) { + dim(mean_density_binned_graphlet_counts) <- c(1, length(mean_density_binned_graphlet_counts)) + colnames(mean_density_binned_graphlet_counts) <- colnames(graphlet_counts) + } + + mean_density_binned_graphlet_counts +} + +#' For case where don't want to use binning, return +#' a single bin which covers full range of possible +#' densities. +single_density_bin <- function(densities) { + + binned_densities <- list(densities = densities, + interval_indexes = rep(1, length(densities)), + breaks = c(0, 1)) +} + +#' INTERNAL FUNCTION - Do not call directly +#' +#' Used to +#' generate a function for calculating expected graphlet counts in each +#' density bin. +#' @param agg_fn Function to aggregate counts in each bin (default \code{agg_fn = mean}). +#' @param scale_fn Optional function to apply a transformation to graphlet_counts, must +#' have arguments graphlet_counts, ego_networks and max_graphlet_size. +#' @param ego_networks Optionally passed and used by scale_fn. +#' @param max_graphlet_size Optionally passed and used by scale_fn. +#' Temporarily accessible during development. +#' TODO: Remove @export prior to publishing +#' @export +density_binned_counts <- function(graphlet_counts, density_interval_indexes, + agg_fn = mean, + scale_fn = NULL, ego_networks = NULL, + max_graphlet_size = NULL) { + + if (!is.null(scale_fn)) { + # Scale ego-network graphlet counts e.g. + # by dividing by total number of k-tuples in + # ego-network (where k is graphlet size) + graphlet_counts <- scale_fn(graphlet_counts, + ego_networks = ego_networks, + max_graphlet_size = max_graphlet_size) + } + + mean_density_binned_graphlet_counts(graphlet_counts, + density_interval_indexes, + agg_fn = agg_fn) + +} + + +#' Calculate expected counts in density bins using geometric poisson (Polya-Aeppli) approximation +#' @export +density_binned_counts_gp <- function(graphlet_counts, + density_interval_indexes, + max_graphlet_size) { + + mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + graphlet_counts, + density_interval_indexes) + + exp_counts_bin <- function(bin_idx) { + counts <- graphlet_counts[bin_indexes == bin_idx, ] + means <- mean_binned_graphlet_counts[bin_idx,] + + mean_sub_counts <- sweep(counts, 2, means) + + Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1) + theta_d <- 2*means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) + + lambda_dk <- (1 / length(graphlet_idx)) * + sum( + 2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]) + ) + + exp_counts_dk <- append(exp_counts_dk, + lambda_dk / theta_d[graphlet_idx]) + } + + exp_counts_dk + } + + nbins <- length(unique(density_interval_indexes)) + expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins)) + + # deal with NAs caused by bins with zero counts for a graphlet + expected_counts_bin[is.nan(expected_counts_bin)] = 0 + + expected_counts_bin } @@ -656,6 +794,7 @@ zeros_to_ones <- function(v) { v } + #' @export scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { # Avoid divide by zero errors by replacing all zeros with ones in the diff --git a/vignettes/netdis_2graphs_constant_exp.Rmd b/vignettes/netdis_2graphs_constant_exp.Rmd index 82ded85a..6acab937 100644 --- a/vignettes/netdis_2graphs_constant_exp.Rmd +++ b/vignettes/netdis_2graphs_constant_exp.Rmd @@ -71,17 +71,16 @@ graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graph ## Use mean graphlet counts as expected counts ```{r} -mean_graphlet_counts_1 <- colMeans(graphlet_counts_1) -mean_graphlet_counts_2 <- colMeans(graphlet_counts_2) +# rep(1, nrow(graphlet_counts)): list of ones as bin index, i.e. everything in same bin +mean_graphlet_counts_1 <- density_binned_counts(graphlet_counts_1, + rep(1, nrow(graphlet_counts_1))) -# functions take expected counts as 2 dimensional vectors so create 2nd dim -dim(mean_graphlet_counts_1) <- c(1, length(mean_graphlet_counts_1)) -dim(mean_graphlet_counts_2) <- c(1, length(mean_graphlet_counts_2)) +mean_graphlet_counts_2 <- density_binned_counts(graphlet_counts_2, + rep(1, nrow(graphlet_counts_2))) bins <- c(0, 1) ``` - ## Centre graphlet counts of query graphs using binned expected counts ```{r} # Calculate expected graphlet counts for each ego network diff --git a/vignettes/netdis_2graphs_polya-aeppli.Rmd b/vignettes/netdis_2graphs_polya-aeppli.Rmd index abd36c8f..cfdde82a 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.Rmd +++ b/vignettes/netdis_2graphs_polya-aeppli.Rmd @@ -94,7 +94,7 @@ ego_density_bins_2 <- binned_densities_2$breaks ## Calculate expected graphlet counts in each bin using geometric poisson approximation ```{r} -expected_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) { +density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) { mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( graphlet_counts, @@ -135,13 +135,13 @@ expected_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) expected_counts_bin } -binned_graphlet_counts_1 <- expected_counts_gp(graphlet_counts_1, - binned_densities_1$interval_indexes, - max_graphlet_size) +binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1, + binned_densities_1$interval_indexes, + max_graphlet_size) -binned_graphlet_counts_2 <- expected_counts_gp(graphlet_counts_2, - binned_densities_2$interval_indexes, - max_graphlet_size) +binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, + binned_densities_2$interval_indexes, + max_graphlet_size) ``` diff --git a/vignettes/netdis_multigraph_polya-aeppli.Rmd b/vignettes/netdis_multigraph_polya-aeppli.Rmd new file mode 100644 index 00000000..50e0fed9 --- /dev/null +++ b/vignettes/netdis_multigraph_polya-aeppli.Rmd @@ -0,0 +1,191 @@ +--- +title: "Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation" +author: "Martin O'Reilly, Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis - 2 graphs with GP Approximation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Load graphs +```{r} +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + + +graphs <- list(EBV = graph_1, ECL = graph_2) +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 +``` + +## Generate ego networks +```{r} +# Get ego networks for query graphs +ego_networks <- purrr::map( + graphs, make_named_ego_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) +``` + +## Count graphlets in ego networks +```{r} +# Count graphlets for ego networks in query graphs +graphlet_counts <- purrr::map( + ego_networks, + ego_to_graphlet_counts, + max_graphlet_size = max_graphlet_size +) +``` + +## Bin ego networks by density +```{r} + +# Get ego-network densities +densities <- purrr::map(ego_networks, + ego_network_density) + +binning_fn = purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 5, + num_intervals = 100) +# Adaptively bin ego-network densities +binned_densities <- purrr::map(densities, + binning_fn) + +ego_density_bins <- purrr::map(binned_densities, function(x) {x$breaks}) +ego_density_bin_indexes <- purrr::map(binned_densities, function(x) {x$interval_indexes}) + +``` + +## Calculate expected graphlet counts in each bin using geometric poisson approximation +```{r} + +density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) { + + mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + graphlet_counts, + bin_indexes) + + exp_counts_bin <- function(bin_idx) { + counts <- graphlet_counts[bin_indexes == bin_idx, ] + means <- mean_binned_graphlet_counts[bin_idx,] + + mean_sub_counts <- sweep(counts, 2, means) + + Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1) + theta_d <- 2*means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) + + lambda_dk <- (1 / length(graphlet_idx)) * + sum( + 2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]) + ) + + exp_counts_dk <- append(exp_counts_dk, + lambda_dk / theta_d[graphlet_idx]) + } + + exp_counts_dk + } + + nbins <- length(unique(bin_indexes)) + expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins)) + + # deal with NAs caused by bins with zero counts for a graphlet + expected_counts_bin[is.nan(expected_counts_bin)] = 0 + + expected_counts_bin +} + + +bin_counts_fn = purrr::partial(density_binned_counts_gp, + max_graphlet_size = max_graphlet_size) + +binned_graphlet_counts <- mapply(bin_counts_fn, + graphlet_counts, + ego_density_bin_indexes) +``` + +## Centre graphlet counts of query graphs using binned expected counts +```{r} +# Calculate expected graphlet counts for each ego network +exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, + max_graphlet_size = max_graphlet_size, + scale_counts_fn=NULL) + +exp_graphlet_counts <- mapply(exp_counts_fn, + ego_networks, + ego_density_bins, + binned_graphlet_counts) + + + +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts <- mapply("-", graphlet_counts, exp_graphlet_counts) + +``` + +## Sum centred graphlet counts across all ego networks +```{r} +sum_graphlet_counts <- lapply(centred_graphlet_counts, colSums) +``` + +## Calculate netdis statistics +```{r} + +# Generate pairwise comparisons +comp_spec <- cross_comparison_spec(sum_graphlet_counts, how = "many-to-many") + +## ------------------------------------------------------------------------ +# Calculate netdis statistics +results <- parallel::mcmapply( + function(index_a, index_b) { + netdis_uptok( + sum_graphlet_counts[[index_a]], + sum_graphlet_counts[[index_b]], + max_graphlet_size = max_graphlet_size + ) + }, + comp_spec$index_a, + comp_spec$index_b, + SIMPLIFY = TRUE) + + +list(netdis = results, comp_spec = comp_spec) +``` \ No newline at end of file diff --git a/vignettes/quickstart_netdis_functions.Rmd b/vignettes/quickstart_netdis_functions.Rmd index eef7a30d..f9809f35 100644 --- a/vignettes/quickstart_netdis_functions.Rmd +++ b/vignettes/quickstart_netdis_functions.Rmd @@ -56,9 +56,7 @@ netdis_one_to_one(graph_1, graph_2, max_graphlet_size = max_graphlet_size, neighbourhood_size = neighbourhood_size, min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - min_bin_count = min_bin_count, - num_bins = num_bins) + min_ego_edges = min_ego_edges) ``` ## Compare one graph to many other graphs @@ -74,9 +72,7 @@ netdis_one_to_many(graph_1, graphs_compare, max_graphlet_size = max_graphlet_size, neighbourhood_size = neighbourhood_size, min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - min_bin_count = min_bin_count, - num_bins = num_bins) + min_ego_edges = min_ego_edges) ``` ## Do pairwise netdis calculations for many graphs @@ -91,9 +87,7 @@ results <- netdis_many_to_many(graphs, max_graphlet_size = max_graphlet_size, neighbourhood_size = neighbourhood_size, min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - min_bin_count = min_bin_count, - num_bins = num_bins) + min_ego_edges = min_ego_edges) print(results$netdis) print(results$comp_spec) From 8def02f55fb63ee2df185a0addcd6d4f00789f17 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Wed, 4 Sep 2019 17:08:37 +0100 Subject: [PATCH 028/188] vignette for use of customisation functions --- NAMESPACE | 2 + R/measures_net_dis.R | 50 ++++--- man/density_binned_counts.Rd | 27 ++++ man/density_binned_counts_gp.Rd | 12 ++ man/mean_density_binned_graphlet_counts.Rd | 2 +- man/netdis_expected_graphlet_counts.Rd | 2 +- ...netdis_expected_graphlet_counts_per_ego.Rd | 5 +- man/netdis_many_to_many.Rd | 8 +- man/netdis_one_to_many.Rd | 8 +- man/netdis_one_to_one.Rd | 7 +- man/single_density_bin.Rd | 15 +++ vignettes/netdis_customisations.Rmd | 125 ++++++++++++++++++ vignettes/netdis_multigraph_polya-aeppli.Rmd | 4 +- ...s.Rmd => quickstart_netdis_comparison.Rmd} | 4 +- 14 files changed, 243 insertions(+), 28 deletions(-) create mode 100644 man/density_binned_counts.Rd create mode 100644 man/density_binned_counts_gp.Rd create mode 100644 man/single_density_bin.Rd create mode 100644 vignettes/netdis_customisations.Rmd rename vignettes/{quickstart_netdis_functions.Rmd => quickstart_netdis_comparison.Rmd} (95%) diff --git a/NAMESPACE b/NAMESPACE index 90921885..db3a1ef2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,8 @@ export(count_orbits_per_node) export(counts_from_observations) export(cross_comp_to_matrix) export(cross_comparison_spec) +export(density_binned_counts) +export(density_binned_counts_gp) export(dhist) export(dhist_ecmf) export(dhist_from_obs) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 409dacc0..47ca8517 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -20,8 +20,8 @@ netdis_one_to_one <- function(graph_1, graph_2, min_ego_nodes = 3, min_ego_edges = 1, binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, num_intervals = 100), - bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego, max_graphlet_size = max_graphlet_size), - exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, max_graphlet_size = max_graphlet_size, scale_counts_fn=count_graphlet_tuples)) { + bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), + exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, scale_counts_fn=count_graphlet_tuples)) { # bundle graphs into one vector with format needed for # netdis many-to-many @@ -36,7 +36,6 @@ netdis_one_to_one <- function(graph_1, graph_2, min_ego_nodes = 3, min_ego_edges = 1, binning_fn = binning_fn, - bin_counts_fn = bin_counts_fn, exp_counts_fn = exp_counts_fn ) @@ -67,8 +66,8 @@ netdis_one_to_many <- function(graph_1, graphs_compare, min_ego_nodes = 3, min_ego_edges = 1, binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, num_intervals = 100), - bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego, max_graphlet_size = max_graphlet_size), - exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, max_graphlet_size = max_graphlet_size, scale_counts_fn=count_graphlet_tuples)) { + bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), + exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, scale_counts_fn=count_graphlet_tuples)) { # bundle graph_1 and graphs_compare to one vector, with # graph_1 at start as needed for netdis_many_to_many call @@ -122,11 +121,9 @@ netdis_many_to_many <- function(graphs, min_counts_per_interval = 5, num_intervals = 100), bin_counts_fn = purrr::partial(density_binned_counts, - max_graphlet_size = max_graphlet_size, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - max_graphlet_size = max_graphlet_size, scale_counts_fn=count_graphlet_tuples)) { ## ------------------------------------------------------------------------ # Get ego networks for query graphs @@ -174,7 +171,8 @@ netdis_many_to_many <- function(graphs, ref_binned_graphlet_counts <- bin_counts_fn( graphlet_counts_ref, binned_densities$interval_indexes, - ego_networks = ego_ref + ego_networks = ego_ref, + max_graphlet_size = max_graphlet_size ) # Calculate expected graphlet counts (using ref graph ego network density bins) @@ -182,7 +180,8 @@ netdis_many_to_many <- function(graphs, ego_networks, exp_counts_fn, density_breaks = ref_ego_density_bins, - density_binned_reference_counts = ref_binned_graphlet_counts + density_binned_reference_counts = ref_binned_graphlet_counts, + max_graphlet_size = max_graphlet_size ) ## ------------------------------------------------------------------------ @@ -198,8 +197,8 @@ netdis_many_to_many <- function(graphs, binning_fn) # extract bin breaks and indexes from binning results - ego_density_bins <- purrr::map(binned_densities, - function(x) {x$breaks}) + ego_density_bin_breaks <- purrr::map(binned_densities, + function(x) {x$breaks}) ego_density_bin_indexes <- purrr::map(binned_densities, function(x) {x$interval_indexes}) @@ -207,13 +206,17 @@ netdis_many_to_many <- function(graphs, # Calculate expected counts in each bin binned_graphlet_counts <- mapply(bin_counts_fn, graphlet_counts, - ego_density_bin_indexes) + ego_density_bin_indexes, + max_graphlet_size = max_graphlet_size, + SIMPLIFY = FALSE) # Calculate expected graphlet counts for each ego network exp_graphlet_counts <- mapply(exp_counts_fn, ego_networks, - ego_density_bins, - binned_graphlet_counts) + ego_density_bin_breaks, + binned_graphlet_counts, + max_graphlet_size = max_graphlet_size, + SIMPLIFY = FALSE) } ## ------------------------------------------------------------------------ @@ -583,6 +586,10 @@ netdis_expected_graphlet_counts_ego <- function(graph, density_binned_reference_counts, min_ego_nodes = 3, min_ego_edges = 1) { + + #print("netdis_expected_graphlet_counts_ego") + #print(density_binned_reference_counts) + # Generate ego-networks for query graph ego_networks <- make_named_ego_graph(graph, neighbourhood_size) # Drop ego-networks that don't have the minimum number of nodes or edges @@ -616,11 +623,15 @@ netdis_expected_graphlet_counts_ego <- function(graph, #' TODO: Remove @export prior to publishing #' @export netdis_expected_graphlet_counts_per_ego <- function(ego_networks, - max_graphlet_size, density_breaks, density_binned_reference_counts, + max_graphlet_size, scale_counts_fn=NULL) { - + + + #print("netdis_expected_graphlet_counts_per_ego") + #print(density_binned_reference_counts) + # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. expected_graphlet_counts <- @@ -649,10 +660,15 @@ netdis_expected_graphlet_counts <- function(graph, density_breaks, density_binned_reference_counts, scale_counts_fn=NULL) { + + #print("netdis_expected_graphlet_counts") + #print(density_binned_reference_counts) + # Look up average scaled graphlet counts for graphs of similar density # in the reference graph query_density <- igraph::edge_density(graph) matched_density_index <- interval_index(query_density, density_breaks) + matched_reference_counts <- density_binned_reference_counts[matched_density_index, ] @@ -752,7 +768,7 @@ density_binned_counts_gp <- function(graphlet_counts, density_interval_indexes) exp_counts_bin <- function(bin_idx) { - counts <- graphlet_counts[bin_indexes == bin_idx, ] + counts <- graphlet_counts[density_interval_indexes == bin_idx, ] means <- mean_binned_graphlet_counts[bin_idx,] mean_sub_counts <- sweep(counts, 2, means) diff --git a/man/density_binned_counts.Rd b/man/density_binned_counts.Rd new file mode 100644 index 00000000..b0cdefb6 --- /dev/null +++ b/man/density_binned_counts.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{density_binned_counts} +\alias{density_binned_counts} +\title{INTERNAL FUNCTION - Do not call directly} +\usage{ +density_binned_counts(graphlet_counts, density_interval_indexes, + agg_fn = mean, scale_fn = NULL, ego_networks = NULL, + max_graphlet_size = NULL) +} +\arguments{ +\item{agg_fn}{Function to aggregate counts in each bin (default \code{agg_fn = mean}).} + +\item{scale_fn}{Optional function to apply a transformation to graphlet_counts, must +have arguments graphlet_counts, ego_networks and max_graphlet_size.} + +\item{ego_networks}{Optionally passed and used by scale_fn.} + +\item{max_graphlet_size}{Optionally passed and used by scale_fn. +Temporarily accessible during development. +TODO: Remove @export prior to publishing} +} +\description{ +Used to +generate a function for calculating expected graphlet counts in each +density bin. +} diff --git a/man/density_binned_counts_gp.Rd b/man/density_binned_counts_gp.Rd new file mode 100644 index 00000000..1f2ae0b4 --- /dev/null +++ b/man/density_binned_counts_gp.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{density_binned_counts_gp} +\alias{density_binned_counts_gp} +\title{Calculate expected counts in density bins using geometric poisson (Polya-Aeppli) approximation} +\usage{ +density_binned_counts_gp(graphlet_counts, density_interval_indexes, + max_graphlet_size) +} +\description{ +Calculate expected counts in density bins using geometric poisson (Polya-Aeppli) approximation +} diff --git a/man/mean_density_binned_graphlet_counts.Rd b/man/mean_density_binned_graphlet_counts.Rd index 4f6e940f..a027db67 100644 --- a/man/mean_density_binned_graphlet_counts.Rd +++ b/man/mean_density_binned_graphlet_counts.Rd @@ -5,7 +5,7 @@ \title{INTERNAL FUNCTION - Do not call directly} \usage{ mean_density_binned_graphlet_counts(graphlet_counts, - density_interval_indexes) + density_interval_indexes, agg_fn = mean) } \description{ Used by \code{netdis_expected_graphlet_counts_ego_fn} to diff --git a/man/netdis_expected_graphlet_counts.Rd b/man/netdis_expected_graphlet_counts.Rd index b3d3e557..0aa352c1 100644 --- a/man/netdis_expected_graphlet_counts.Rd +++ b/man/netdis_expected_graphlet_counts.Rd @@ -5,7 +5,7 @@ \title{INTERNAL FUNCTION - Do not call directly} \usage{ netdis_expected_graphlet_counts(graph, max_graphlet_size, density_breaks, - density_binned_reference_counts) + density_binned_reference_counts, scale_counts_fn = NULL) } \description{ Used by \code{netdis_expected_graphlet_counts_ego} to diff --git a/man/netdis_expected_graphlet_counts_per_ego.Rd b/man/netdis_expected_graphlet_counts_per_ego.Rd index d68c3114..852a2673 100644 --- a/man/netdis_expected_graphlet_counts_per_ego.Rd +++ b/man/netdis_expected_graphlet_counts_per_ego.Rd @@ -5,12 +5,13 @@ \title{INTERNAL FUNCTION - Do not call directly} \usage{ netdis_expected_graphlet_counts_per_ego(ego_networks, max_graphlet_size, - density_breaks, density_binned_reference_counts) + density_breaks, density_binned_reference_counts, + scale_counts_fn = NULL) } \description{ JACK To follow through logic of paper steps, wanted to pass ego networks to the function, not the input query graph -(as in netdis_expected_graphlet_counts_ego_fn above). +(as in netdis_expected_graphlet_counts_ego above). } \details{ Used by \code{netdis_expected_graphlet_counts_ego_fn} to diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd index e648d8bf..ec3912e8 100644 --- a/man/netdis_many_to_many.Rd +++ b/man/netdis_many_to_many.Rd @@ -6,7 +6,13 @@ \usage{ netdis_many_to_many(graphs, ref_graph, comparisons = "many-to-many", max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, - min_ego_edges = 1, min_bin_count = 5, num_bins = 100) + min_ego_edges = 1, + binning_fn = purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 5, num_intervals = 100), + bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, + scale_fn = scale_graphlet_counts_ego), + exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, + scale_counts_fn = count_graphlet_tuples)) } \arguments{ \item{graphs}{Query graphs} diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd index 860701ea..4c6195e4 100644 --- a/man/netdis_one_to_many.Rd +++ b/man/netdis_one_to_many.Rd @@ -6,7 +6,13 @@ \usage{ netdis_one_to_many(graph_1, graphs_compare, ref_graph, max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, - min_ego_edges = 1, min_bin_count = 5, num_bins = 100) + min_ego_edges = 1, + binning_fn = purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 5, num_intervals = 100), + bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, + scale_fn = scale_graphlet_counts_ego), + exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, + scale_counts_fn = count_graphlet_tuples)) } \arguments{ \item{graph_1}{query graph - this graph will be compared with diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd index 02447d0e..a89430c6 100644 --- a/man/netdis_one_to_one.Rd +++ b/man/netdis_one_to_one.Rd @@ -6,7 +6,12 @@ \usage{ netdis_one_to_one(graph_1, graph_2, ref_graph, max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - min_bin_count = 5, num_bins = 100) + binning_fn = purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 5, num_intervals = 100), + bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, + scale_fn = scale_graphlet_counts_ego), + exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, + scale_counts_fn = count_graphlet_tuples)) } \arguments{ \item{graph_1}{First query graph} diff --git a/man/single_density_bin.Rd b/man/single_density_bin.Rd new file mode 100644 index 00000000..9d89d7ce --- /dev/null +++ b/man/single_density_bin.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{single_density_bin} +\alias{single_density_bin} +\title{For case where don't want to use binning, return +a single bin which covers full range of possible +densities.} +\usage{ +single_density_bin(densities) +} +\description{ +For case where don't want to use binning, return +a single bin which covers full range of possible +densities. +} diff --git a/vignettes/netdis_customisations.Rmd b/vignettes/netdis_customisations.Rmd new file mode 100644 index 00000000..342a8e99 --- /dev/null +++ b/vignettes/netdis_customisations.Rmd @@ -0,0 +1,125 @@ +--- +title: "Quick start guide for usage of netdis with different pairwise comparisons." +author: "Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis pairwise comparisons} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +``` + +## Load query graphs +```{r} +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +``` + +## Default Expected Counts with Reference Graph +```{r} + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +print(results$netdis) +print(results$comp_spec) +``` + +## With Modified Binning Parameters +```{r} + +binning_fn <- purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 10, + num_intervals = 50) + + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn) + +print(results$netdis) +print(results$comp_spec) + + +``` + +## With Modified Expected Counts: Geometric Poisson +```{r} +bin_counts_fn <- density_binned_counts_gp + +exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego, + scale_counts_fn = NULL) + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) +``` + +## With Modified Expected Counts: Simple Mean +```{r} + +binning_fn <- single_density_bin +bin_counts_fn <- density_binned_counts +exp_counts_fn <- netdis_expected_graphlet_counts_per_ego + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) + + +``` \ No newline at end of file diff --git a/vignettes/netdis_multigraph_polya-aeppli.Rmd b/vignettes/netdis_multigraph_polya-aeppli.Rmd index 50e0fed9..9f605aad 100644 --- a/vignettes/netdis_multigraph_polya-aeppli.Rmd +++ b/vignettes/netdis_multigraph_polya-aeppli.Rmd @@ -134,7 +134,7 @@ density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_ } -bin_counts_fn = purrr::partial(density_binned_counts_gp, +bin_counts_fn <- purrr::partial(density_binned_counts_gp, max_graphlet_size = max_graphlet_size) binned_graphlet_counts <- mapply(bin_counts_fn, @@ -145,7 +145,7 @@ binned_graphlet_counts <- mapply(bin_counts_fn, ## Centre graphlet counts of query graphs using binned expected counts ```{r} # Calculate expected graphlet counts for each ego network -exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, +exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego, max_graphlet_size = max_graphlet_size, scale_counts_fn=NULL) diff --git a/vignettes/quickstart_netdis_functions.Rmd b/vignettes/quickstart_netdis_comparison.Rmd similarity index 95% rename from vignettes/quickstart_netdis_functions.Rmd rename to vignettes/quickstart_netdis_comparison.Rmd index f9809f35..80caba22 100644 --- a/vignettes/quickstart_netdis_functions.Rmd +++ b/vignettes/quickstart_netdis_comparison.Rmd @@ -1,10 +1,10 @@ --- -title: "Quick start guide for usage of netdis functions" +title: "Quick start guide for usage of netdis with different pairwise comparisons." author: "Jack Roberts" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Quick start for Netdis functions} + %\VignetteIndexEntry{Netdis pairwise comparisons} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- From c01b003ead776c419462b593552f2ffc1c99cd27 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 5 Sep 2019 10:32:51 +0100 Subject: [PATCH 029/188] fix broken tests for old netdis interface --- R/measures_net_dis.R | 12 ++++++++---- tests/testthat/test_measures_net_dis.R | 21 ++++++++++++++------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 47ca8517..c69810b0 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -523,7 +523,8 @@ netdis_expected_graphlet_counts_ego_fn <- function(graph, min_ego_nodes = 3, min_ego_edges = 1, min_bin_count = 5, - num_bins = 100) { + num_bins = 100, + scale_counts_fn = NULL) { # Calculate the scaled graphlet counts for all ego networks in the reference # graph, also returning the ego networks themselves in order to calculate @@ -567,7 +568,8 @@ netdis_expected_graphlet_counts_ego_fn <- function(graph, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, density_breaks = binned_densities$breaks, - density_binned_reference_counts = density_binned_graphlet_counts + density_binned_reference_counts = density_binned_graphlet_counts, + scale_counts_fn = scale_counts_fn ) } @@ -585,7 +587,8 @@ netdis_expected_graphlet_counts_ego <- function(graph, density_breaks, density_binned_reference_counts, min_ego_nodes = 3, - min_ego_edges = 1) { + min_ego_edges = 1, + scale_counts_fn=NULL) { #print("netdis_expected_graphlet_counts_ego") #print(density_binned_reference_counts) @@ -603,7 +606,8 @@ netdis_expected_graphlet_counts_ego <- function(graph, purrr::map(ego_networks, netdis_expected_graphlet_counts, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, - density_binned_reference_counts = density_binned_reference_counts + density_binned_reference_counts = density_binned_reference_counts, + scale_counts_fn=scale_counts_fn ) names(expected_graphlet_counts) <- names(ego_networks) # Simplify list to array diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 31ad4454..1f692006 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -491,7 +491,8 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { purrr::map(graphs, netdis_expected_graphlet_counts, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, - density_binned_reference_counts = scaled_reference_counts + density_binned_reference_counts = scaled_reference_counts, + scale_counts_fn=count_graphlet_tuples ) # Loop over each graph and compare expected with actual # NOTE: v2.0.0 of testthat library made a breaking change that means using @@ -619,7 +620,8 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes max_graphlet_size = max_graphlet_size, neighbourhood_size = 1, density_breaks = breaks, density_binned_reference_counts = scaled_reference_counts, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges + min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, + scale_counts_fn=count_graphlet_tuples ) actual_expected_graphlet_counts_ego_o2 <- netdis_expected_graphlet_counts_ego( @@ -627,7 +629,8 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes max_graphlet_size = max_graphlet_size, neighbourhood_size = 2, density_breaks = breaks, density_binned_reference_counts = scaled_reference_counts, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges + min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, + scale_counts_fn=count_graphlet_tuples ) # Compare actual to expected @@ -836,7 +839,8 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no density_breaks = breaks_o1, density_binned_reference_counts_o1, min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges + min_ego_edges = min_ego_edges, + scale_counts_fn=count_graphlet_tuples ) ) expect_equal( @@ -848,7 +852,8 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no density_breaks = breaks_o2, density_binned_reference_counts_o2, min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges + min_ego_edges = min_ego_edges, + scale_counts_fn=count_graphlet_tuples ) ) @@ -861,7 +866,8 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no min_bin_count = min_bin_count, num_bins = num_bins, min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges + min_ego_edges = min_ego_edges, + scale_counts_fn=count_graphlet_tuples ) actual_expected_graphlet_counts_ego_fn_o2 <- netdis_expected_graphlet_counts_ego_fn( @@ -871,7 +877,8 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no min_bin_count = min_bin_count, num_bins = num_bins, min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges + min_ego_edges = min_ego_edges, + scale_counts_fn=count_graphlet_tuples ) # Generate actual expected accounts by applying generated functions to test # graph From f5c03b9e1efcd544ad856f288275ec8c42d6b70f Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 5 Sep 2019 13:42:00 +0100 Subject: [PATCH 030/188] fix vignette names --- R/measures_net_dis.R | 24 ++++++++++---------- tests/testthat/test_measures_net_dis.R | 14 ++++++------ vignettes/netdis_2graphs_constant_exp.Rmd | 8 +++---- vignettes/netdis_customisations.Rmd | 6 ++--- vignettes/netdis_multigraph_polya-aeppli.Rmd | 15 ++++-------- vignettes/quickstart_netdis_2graphs.Rmd | 10 ++++---- vignettes/quickstart_netdis_comparison.Rmd | 2 +- 7 files changed, 37 insertions(+), 42 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index c69810b0..f8740a22 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -21,7 +21,7 @@ netdis_one_to_one <- function(graph_1, graph_2, min_ego_edges = 1, binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, num_intervals = 100), bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, scale_counts_fn=count_graphlet_tuples)) { + exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, scale_fn=count_graphlet_tuples)) { # bundle graphs into one vector with format needed for # netdis many-to-many @@ -67,7 +67,7 @@ netdis_one_to_many <- function(graph_1, graphs_compare, min_ego_edges = 1, binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, num_intervals = 100), bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, scale_counts_fn=count_graphlet_tuples)) { + exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, scale_fn=count_graphlet_tuples)) { # bundle graph_1 and graphs_compare to one vector, with # graph_1 at start as needed for netdis_many_to_many call @@ -124,7 +124,7 @@ netdis_many_to_many <- function(graphs, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_counts_fn=count_graphlet_tuples)) { + scale_fn=count_graphlet_tuples)) { ## ------------------------------------------------------------------------ # Get ego networks for query graphs ego_networks <- purrr::map( @@ -524,7 +524,7 @@ netdis_expected_graphlet_counts_ego_fn <- function(graph, min_ego_edges = 1, min_bin_count = 5, num_bins = 100, - scale_counts_fn = NULL) { + scale_fn = NULL) { # Calculate the scaled graphlet counts for all ego networks in the reference # graph, also returning the ego networks themselves in order to calculate @@ -569,7 +569,7 @@ netdis_expected_graphlet_counts_ego_fn <- function(graph, min_ego_edges = min_ego_edges, density_breaks = binned_densities$breaks, density_binned_reference_counts = density_binned_graphlet_counts, - scale_counts_fn = scale_counts_fn + scale_fn = scale_fn ) } @@ -588,7 +588,7 @@ netdis_expected_graphlet_counts_ego <- function(graph, density_binned_reference_counts, min_ego_nodes = 3, min_ego_edges = 1, - scale_counts_fn=NULL) { + scale_fn=NULL) { #print("netdis_expected_graphlet_counts_ego") #print(density_binned_reference_counts) @@ -607,7 +607,7 @@ netdis_expected_graphlet_counts_ego <- function(graph, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, density_binned_reference_counts = density_binned_reference_counts, - scale_counts_fn=scale_counts_fn + scale_fn=scale_fn ) names(expected_graphlet_counts) <- names(ego_networks) # Simplify list to array @@ -630,7 +630,7 @@ netdis_expected_graphlet_counts_per_ego <- function(ego_networks, density_breaks, density_binned_reference_counts, max_graphlet_size, - scale_counts_fn=NULL) { + scale_fn=NULL) { #print("netdis_expected_graphlet_counts_per_ego") @@ -643,7 +643,7 @@ netdis_expected_graphlet_counts_per_ego <- function(ego_networks, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, density_binned_reference_counts = density_binned_reference_counts, - scale_counts_fn = scale_counts_fn + scale_fn = scale_fn ) names(expected_graphlet_counts) <- names(ego_networks) @@ -663,7 +663,7 @@ netdis_expected_graphlet_counts <- function(graph, max_graphlet_size, density_breaks, density_binned_reference_counts, - scale_counts_fn=NULL) { + scale_fn=NULL) { #print("netdis_expected_graphlet_counts") #print(density_binned_reference_counts) @@ -676,11 +676,11 @@ netdis_expected_graphlet_counts <- function(graph, matched_reference_counts <- density_binned_reference_counts[matched_density_index, ] - if (!is.null(scale_counts_fn)) { + if (!is.null(scale_fn)) { # Scale reference counts e.g. by multiplying the reference count for each graphlet # by the number of possible sets of k nodes in the query graph, where k is the # number of nodes in the graphlet - matched_reference_counts <- matched_reference_counts * scale_counts_fn(graph, max_graphlet_size) + matched_reference_counts <- matched_reference_counts * scale_fn(graph, max_graphlet_size) } matched_reference_counts diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 1f692006..60c14ebc 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -492,7 +492,7 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, density_binned_reference_counts = scaled_reference_counts, - scale_counts_fn=count_graphlet_tuples + scale_fn=count_graphlet_tuples ) # Loop over each graph and compare expected with actual # NOTE: v2.0.0 of testthat library made a breaking change that means using @@ -621,7 +621,7 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes neighbourhood_size = 1, density_breaks = breaks, density_binned_reference_counts = scaled_reference_counts, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_counts_fn=count_graphlet_tuples + scale_fn=count_graphlet_tuples ) actual_expected_graphlet_counts_ego_o2 <- netdis_expected_graphlet_counts_ego( @@ -630,7 +630,7 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes neighbourhood_size = 2, density_breaks = breaks, density_binned_reference_counts = scaled_reference_counts, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_counts_fn=count_graphlet_tuples + scale_fn=count_graphlet_tuples ) # Compare actual to expected @@ -840,7 +840,7 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no density_binned_reference_counts_o1, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_counts_fn=count_graphlet_tuples + scale_fn=count_graphlet_tuples ) ) expect_equal( @@ -853,7 +853,7 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no density_binned_reference_counts_o2, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_counts_fn=count_graphlet_tuples + scale_fn=count_graphlet_tuples ) ) @@ -867,7 +867,7 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no num_bins = num_bins, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_counts_fn=count_graphlet_tuples + scale_fn=count_graphlet_tuples ) actual_expected_graphlet_counts_ego_fn_o2 <- netdis_expected_graphlet_counts_ego_fn( @@ -878,7 +878,7 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no num_bins = num_bins, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_counts_fn=count_graphlet_tuples + scale_fn=count_graphlet_tuples ) # Generate actual expected accounts by applying generated functions to test # graph diff --git a/vignettes/netdis_2graphs_constant_exp.Rmd b/vignettes/netdis_2graphs_constant_exp.Rmd index 6acab937..898152cc 100644 --- a/vignettes/netdis_2graphs_constant_exp.Rmd +++ b/vignettes/netdis_2graphs_constant_exp.Rmd @@ -85,17 +85,17 @@ bins <- c(0, 1) ```{r} # Calculate expected graphlet counts for each ego network exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, - max_graphlet_size, bins, mean_graphlet_counts_1, - scale_counts_fn = NULL) + max_graphlet_size, + scale_fn = NULL) exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, - max_graphlet_size, bins, mean_graphlet_counts_2, - scale_counts_fn = NULL) + max_graphlet_size, + scale_fn = NULL) # Centre graphlet counts by subtracting expected counts centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 diff --git a/vignettes/netdis_customisations.Rmd b/vignettes/netdis_customisations.Rmd index 342a8e99..afcac7a8 100644 --- a/vignettes/netdis_customisations.Rmd +++ b/vignettes/netdis_customisations.Rmd @@ -1,10 +1,10 @@ --- -title: "Quick start guide for usage of netdis with different pairwise comparisons." +title: "Usage of netdis with binning and expected counts customisations." author: "Jack Roberts" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Netdis pairwise comparisons} + %\VignetteIndexEntry{Netdis function customisations} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -84,7 +84,7 @@ print(results$comp_spec) bin_counts_fn <- density_binned_counts_gp exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_counts_fn = NULL) + scale_fn = NULL) # Calculate netdis statistics results <- netdis_many_to_many(graphs, diff --git a/vignettes/netdis_multigraph_polya-aeppli.Rmd b/vignettes/netdis_multigraph_polya-aeppli.Rmd index 9f605aad..cd48f9f9 100644 --- a/vignettes/netdis_multigraph_polya-aeppli.Rmd +++ b/vignettes/netdis_multigraph_polya-aeppli.Rmd @@ -1,10 +1,10 @@ --- -title: "Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation" +title: "Netdis - Multiple graphs with Expected Counts from Geometric Poisson Approximation" author: "Martin O'Reilly, Jack Roberts" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Netdis - 2 graphs with GP Approximation} + %\VignetteIndexEntry{Netdis - multiple graphs with GP Approximation} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -22,14 +22,7 @@ library("purrr") source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query graphs -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") - -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") - - -graphs <- list(EBV = graph_1, ECL = graph_2) +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") ``` ## Set Netdis parameters @@ -147,7 +140,7 @@ binned_graphlet_counts <- mapply(bin_counts_fn, # Calculate expected graphlet counts for each ego network exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego, max_graphlet_size = max_graphlet_size, - scale_counts_fn=NULL) + scale_fn=NULL) exp_graphlet_counts <- mapply(exp_counts_fn, ego_networks, diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index c357a544..f65e5276 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -110,15 +110,17 @@ ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( ```{r} # Calculate expected graphlet counts (using ref graph ego network density bins) exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, - max_graphlet_size, ref_ego_density_bins, - ref_binned_graphlet_counts) + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn=count_graphlet_tuples) exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, - max_graphlet_size, ref_ego_density_bins, - ref_binned_graphlet_counts) + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn=count_graphlet_tuples) # Centre graphlet counts by subtracting expected counts centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 diff --git a/vignettes/quickstart_netdis_comparison.Rmd b/vignettes/quickstart_netdis_comparison.Rmd index 80caba22..b5aefe61 100644 --- a/vignettes/quickstart_netdis_comparison.Rmd +++ b/vignettes/quickstart_netdis_comparison.Rmd @@ -1,5 +1,5 @@ --- -title: "Quick start guide for usage of netdis with different pairwise comparisons." +title: "Usage of netdis with different pairwise comparison options." author: "Jack Roberts" date: "`r Sys.Date()`" output: rmarkdown::html_vignette From 036f1e1c220aaefe11245f9492868159b6609559 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 5 Sep 2019 14:02:33 +0100 Subject: [PATCH 031/188] fix old arg names and ordering in vignettes --- man/netdis_expected_graphlet_counts.Rd | 2 +- man/netdis_expected_graphlet_counts_ego.Rd | 2 +- man/netdis_expected_graphlet_counts_ego_fn.Rd | 2 +- ...netdis_expected_graphlet_counts_per_ego.Rd | 5 +- man/netdis_many_to_many.Rd | 2 +- man/netdis_one_to_many.Rd | 2 +- man/netdis_one_to_one.Rd | 2 +- vignettes/netdis_2graphs_constant_exp.R | 92 ++++ vignettes/netdis_2graphs_constant_exp.html | 426 +++++++++++++++ vignettes/netdis_2graphs_polya-aeppli.R | 154 ++++++ vignettes/netdis_2graphs_polya-aeppli.Rmd | 8 +- vignettes/netdis_2graphs_polya-aeppli.html | 485 +++++++++++++++++ vignettes/netdis_customisations.R | 101 ++++ vignettes/netdis_customisations.html | 494 +++++++++++++++++ vignettes/netdis_multigraph_polya-aeppli.R | 154 ++++++ vignettes/netdis_multigraph_polya-aeppli.html | 505 ++++++++++++++++++ vignettes/quickstart_netdis.html | 98 ++-- vignettes/quickstart_netdis_2graphs.R | 34 +- vignettes/quickstart_netdis_2graphs.html | 226 ++++---- vignettes/quickstart_netdis_comparison.R | 74 +++ vignettes/quickstart_netdis_comparison.html | 426 +++++++++++++++ 21 files changed, 3106 insertions(+), 188 deletions(-) create mode 100644 vignettes/netdis_2graphs_constant_exp.R create mode 100644 vignettes/netdis_2graphs_constant_exp.html create mode 100644 vignettes/netdis_2graphs_polya-aeppli.R create mode 100644 vignettes/netdis_2graphs_polya-aeppli.html create mode 100644 vignettes/netdis_customisations.R create mode 100644 vignettes/netdis_customisations.html create mode 100644 vignettes/netdis_multigraph_polya-aeppli.R create mode 100644 vignettes/netdis_multigraph_polya-aeppli.html create mode 100644 vignettes/quickstart_netdis_comparison.R create mode 100644 vignettes/quickstart_netdis_comparison.html diff --git a/man/netdis_expected_graphlet_counts.Rd b/man/netdis_expected_graphlet_counts.Rd index 0aa352c1..3afa593c 100644 --- a/man/netdis_expected_graphlet_counts.Rd +++ b/man/netdis_expected_graphlet_counts.Rd @@ -5,7 +5,7 @@ \title{INTERNAL FUNCTION - Do not call directly} \usage{ netdis_expected_graphlet_counts(graph, max_graphlet_size, density_breaks, - density_binned_reference_counts, scale_counts_fn = NULL) + density_binned_reference_counts, scale_fn = NULL) } \description{ Used by \code{netdis_expected_graphlet_counts_ego} to diff --git a/man/netdis_expected_graphlet_counts_ego.Rd b/man/netdis_expected_graphlet_counts_ego.Rd index ede22ee3..f7933b82 100644 --- a/man/netdis_expected_graphlet_counts_ego.Rd +++ b/man/netdis_expected_graphlet_counts_ego.Rd @@ -6,7 +6,7 @@ \usage{ netdis_expected_graphlet_counts_ego(graph, max_graphlet_size, neighbourhood_size, density_breaks, density_binned_reference_counts, - min_ego_nodes = 3, min_ego_edges = 1) + min_ego_nodes = 3, min_ego_edges = 1, scale_fn = NULL) } \description{ Used by \code{netdis_expected_graphlet_counts_ego_fn} to diff --git a/man/netdis_expected_graphlet_counts_ego_fn.Rd b/man/netdis_expected_graphlet_counts_ego_fn.Rd index 328cf6bf..30cbc379 100644 --- a/man/netdis_expected_graphlet_counts_ego_fn.Rd +++ b/man/netdis_expected_graphlet_counts_ego_fn.Rd @@ -6,7 +6,7 @@ \usage{ netdis_expected_graphlet_counts_ego_fn(graph, max_graphlet_size, neighbourhood_size, min_ego_nodes = 3, min_ego_edges = 1, - min_bin_count = 5, num_bins = 100) + min_bin_count = 5, num_bins = 100, scale_fn = NULL) } \arguments{ \item{graph}{A connected, undirected, simple reference graph as an diff --git a/man/netdis_expected_graphlet_counts_per_ego.Rd b/man/netdis_expected_graphlet_counts_per_ego.Rd index 852a2673..ffadc1df 100644 --- a/man/netdis_expected_graphlet_counts_per_ego.Rd +++ b/man/netdis_expected_graphlet_counts_per_ego.Rd @@ -4,9 +4,8 @@ \alias{netdis_expected_graphlet_counts_per_ego} \title{INTERNAL FUNCTION - Do not call directly} \usage{ -netdis_expected_graphlet_counts_per_ego(ego_networks, max_graphlet_size, - density_breaks, density_binned_reference_counts, - scale_counts_fn = NULL) +netdis_expected_graphlet_counts_per_ego(ego_networks, density_breaks, + density_binned_reference_counts, max_graphlet_size, scale_fn = NULL) } \description{ JACK To follow through logic of paper steps, wanted to pass diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd index ec3912e8..63940358 100644 --- a/man/netdis_many_to_many.Rd +++ b/man/netdis_many_to_many.Rd @@ -12,7 +12,7 @@ netdis_many_to_many(graphs, ref_graph, comparisons = "many-to-many", bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_counts_fn = count_graphlet_tuples)) + scale_fn = count_graphlet_tuples)) } \arguments{ \item{graphs}{Query graphs} diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd index 4c6195e4..6f95956e 100644 --- a/man/netdis_one_to_many.Rd +++ b/man/netdis_one_to_many.Rd @@ -12,7 +12,7 @@ netdis_one_to_many(graph_1, graphs_compare, ref_graph, bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_counts_fn = count_graphlet_tuples)) + scale_fn = count_graphlet_tuples)) } \arguments{ \item{graph_1}{query graph - this graph will be compared with diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd index a89430c6..0750f34e 100644 --- a/man/netdis_one_to_one.Rd +++ b/man/netdis_one_to_one.Rd @@ -11,7 +11,7 @@ netdis_one_to_one(graph_1, graph_2, ref_graph, max_graphlet_size = 4, bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_counts_fn = count_graphlet_tuples)) + scale_fn = count_graphlet_tuples)) } \arguments{ \item{graph_1}{First query graph} diff --git a/vignettes/netdis_2graphs_constant_exp.R b/vignettes/netdis_2graphs_constant_exp.R new file mode 100644 index 00000000..9b2f9c46 --- /dev/null +++ b/vignettes/netdis_2graphs_constant_exp.R @@ -0,0 +1,92 @@ +## ------------------------------------------------------------------------ +# Load libraries +library("netdist") +library("purrr") + +## ------------------------------------------------------------------------ +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + + +## ------------------------------------------------------------------------ +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +## ------------------------------------------------------------------------ +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + + +## ------------------------------------------------------------------------ +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) + +## ------------------------------------------------------------------------ +# rep(1, nrow(graphlet_counts)): list of ones as bin index, i.e. everything in same bin +mean_graphlet_counts_1 <- density_binned_counts(graphlet_counts_1, + rep(1, nrow(graphlet_counts_1))) + +mean_graphlet_counts_2 <- density_binned_counts(graphlet_counts_2, + rep(1, nrow(graphlet_counts_2))) + +bins <- c(0, 1) + +## ------------------------------------------------------------------------ +# Calculate expected graphlet counts for each ego network +exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, + bins, + mean_graphlet_counts_1, + max_graphlet_size, + scale_fn = NULL) + + +exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, + bins, + mean_graphlet_counts_2, + max_graphlet_size, + scale_fn = NULL) +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 + +centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 + +## ------------------------------------------------------------------------ +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) + +## ------------------------------------------------------------------------ + +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) + diff --git a/vignettes/netdis_2graphs_constant_exp.html b/vignettes/netdis_2graphs_constant_exp.html new file mode 100644 index 00000000..c3a8ab25 --- /dev/null +++ b/vignettes/netdis_2graphs_constant_exp.html @@ -0,0 +1,426 @@ + + + + + + + + + + + + + + + + +Netdis - 2 graphs with Constant Expected Counts for Each Graphlet + + + + + + + + + + + + + + + + + + + + + +

Netdis - 2 graphs with Constant Expected Counts for Each Graphlet

+

Martin O’Reilly, Jack Roberts

+

2019-09-05

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
+
+

Load graphs

+
# Set source directory for Virus PPI graph edge files
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+# Load query graphs
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Ego network density binning parameters
+min_bin_count <- 5
+num_bins <- 100
+
+
+

Generate ego networks

+
# Get ego networks for query graphs and reference graph
+ego_1 <- make_named_ego_graph(graph_1, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+ego_2 <- make_named_ego_graph(graph_2, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+
+

Count graphlets in ego networks

+
# Count graphlets for ego networks in query and reference graphs
+graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size)
+graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size)
+
+
+

Use mean graphlet counts as expected counts

+
# rep(1, nrow(graphlet_counts)): list of ones as bin index, i.e. everything in same bin
+mean_graphlet_counts_1 <- density_binned_counts(graphlet_counts_1,
+                                                rep(1, nrow(graphlet_counts_1)))
+
+mean_graphlet_counts_2 <- density_binned_counts(graphlet_counts_2,
+                                                rep(1, nrow(graphlet_counts_2)))
+
+bins <- c(0, 1)
+
+
+

Centre graphlet counts of query graphs using binned expected counts

+
# Calculate expected graphlet counts for each ego network
+exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, 
+                                                                 bins, 
+                                                                 mean_graphlet_counts_1,
+                                                                 max_graphlet_size,
+                                                                 scale_fn = NULL)
+
+
+exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, 
+                                                                 bins, 
+                                                                 mean_graphlet_counts_2,
+                                                                 max_graphlet_size,
+                                                                 scale_fn = NULL)
+# Centre graphlet counts by subtracting expected counts
+centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1
+
+centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2
+
+
+

Sum centred graphlet counts across all ego networks

+
sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1)
+
+sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2)
+
+
+

Calculate netdis statistics

+
netdis_result <- netdis_uptok(sum_graphlet_counts_1, 
+                              sum_graphlet_counts_2, 
+                              max_graphlet_size)
+
+print(netdis_result)
+
##   netdis3   netdis4 
+## 0.3116860 0.9592365
+
+ + + + + + + + diff --git a/vignettes/netdis_2graphs_polya-aeppli.R b/vignettes/netdis_2graphs_polya-aeppli.R new file mode 100644 index 00000000..70dc8f23 --- /dev/null +++ b/vignettes/netdis_2graphs_polya-aeppli.R @@ -0,0 +1,154 @@ +## ------------------------------------------------------------------------ +# Load libraries +library("netdist") +library("purrr") + +## ------------------------------------------------------------------------ +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + + +## ------------------------------------------------------------------------ +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +## ------------------------------------------------------------------------ +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + + +## ------------------------------------------------------------------------ +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) + + +## ------------------------------------------------------------------------ + +# Get ego-network densities +densities_1 <- ego_network_density(ego_1) +densities_2 <- ego_network_density(ego_2) + +# Adaptively bin ego-network densities +binned_densities_1 <- binned_densities_adaptive(densities_1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ego_density_bins_1 <- binned_densities_1$breaks + +binned_densities_2 <- binned_densities_adaptive(densities_2, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ego_density_bins_2 <- binned_densities_2$breaks + +## ------------------------------------------------------------------------ + +density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) { + + mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + graphlet_counts, + bin_indexes) + + exp_counts_bin <- function(bin_idx) { + counts <- graphlet_counts[bin_indexes == bin_idx, ] + means <- mean_binned_graphlet_counts[bin_idx,] + + mean_sub_counts <- sweep(counts, 2, means) + + Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1) + theta_d <- 2*means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) + + lambda_dk <- (1 / length(graphlet_idx)) * + sum( + 2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]) + ) + + exp_counts_dk <- append(exp_counts_dk, + lambda_dk / theta_d[graphlet_idx]) + } + + exp_counts_dk + } + + nbins <- length(unique(bin_indexes)) + expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins)) + + # deal with NAs caused by bins with zero counts for a graphlet + expected_counts_bin[is.nan(expected_counts_bin)] = 0 + + expected_counts_bin +} + +binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1, + binned_densities_1$interval_indexes, + max_graphlet_size) + +binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, + binned_densities_2$interval_indexes, + max_graphlet_size) + + +## ------------------------------------------------------------------------ +# Calculate expected graphlet counts for each ego network +exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, + ego_density_bins_1, + binned_graphlet_counts_1, + max_graphlet_size, + scale_fn = NULL) + + +exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, + ego_density_bins_2, + binned_graphlet_counts_2, + max_graphlet_size, + scale_fn = NULL) +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 + +centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 + +## ------------------------------------------------------------------------ +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) + +## ------------------------------------------------------------------------ + +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) + diff --git a/vignettes/netdis_2graphs_polya-aeppli.Rmd b/vignettes/netdis_2graphs_polya-aeppli.Rmd index cfdde82a..a5f2f7e9 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.Rmd +++ b/vignettes/netdis_2graphs_polya-aeppli.Rmd @@ -149,17 +149,17 @@ binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, ```{r} # Calculate expected graphlet counts for each ego network exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, - max_graphlet_size, ego_density_bins_1, binned_graphlet_counts_1, - scale_counts_fn = NULL) + max_graphlet_size, + scale_fn = NULL) exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, - max_graphlet_size, ego_density_bins_2, binned_graphlet_counts_2, - scale_counts_fn = NULL) + max_graphlet_size, + scale_fn = NULL) # Centre graphlet counts by subtracting expected counts centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 diff --git a/vignettes/netdis_2graphs_polya-aeppli.html b/vignettes/netdis_2graphs_polya-aeppli.html new file mode 100644 index 00000000..09d7f1cd --- /dev/null +++ b/vignettes/netdis_2graphs_polya-aeppli.html @@ -0,0 +1,485 @@ + + + + + + + + + + + + + + + + +Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation + + + + + + + + + + + + + + + + + + + + + +

Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation

+

Martin O’Reilly, Jack Roberts

+

2019-09-05

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
+
+

Load graphs

+
# Set source directory for Virus PPI graph edge files
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+# Load query graphs
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Ego network density binning parameters
+min_bin_count <- 5
+num_bins <- 100
+
+
+

Generate ego networks

+
# Get ego networks for query graphs and reference graph
+ego_1 <- make_named_ego_graph(graph_1, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+ego_2 <- make_named_ego_graph(graph_2, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+
+

Count graphlets in ego networks

+
# Count graphlets for ego networks in query and reference graphs
+graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size)
+graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size)
+
+
+

Bin ego networks by density

+
# Get ego-network densities
+densities_1 <- ego_network_density(ego_1)
+densities_2 <- ego_network_density(ego_2)
+
+# Adaptively bin ego-network densities
+binned_densities_1 <- binned_densities_adaptive(densities_1, 
+                                                min_counts_per_interval = min_bin_count, 
+                                                num_intervals = num_bins)
+
+ego_density_bins_1 <- binned_densities_1$breaks
+
+binned_densities_2 <- binned_densities_adaptive(densities_2, 
+                                                min_counts_per_interval = min_bin_count, 
+                                                num_intervals = num_bins)
+
+ego_density_bins_2 <- binned_densities_2$breaks
+
+
+

Calculate expected graphlet counts in each bin using geometric poisson approximation

+
density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) {
+  
+  mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
+                                  graphlet_counts, 
+                                  bin_indexes)
+  
+  exp_counts_bin <- function(bin_idx) {
+    counts <- graphlet_counts[bin_indexes == bin_idx, ]
+    means <- mean_binned_graphlet_counts[bin_idx,]
+    
+    mean_sub_counts <- sweep(counts, 2, means)
+    
+    Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1)
+    theta_d <- 2*means / (Vd_sq + means)
+
+    exp_counts_dk <- vector()
+    for (k in 2:max_graphlet_size) {
+      graphlet_idx <- graphlet_ids_for_size(k)
+      
+      lambda_dk <- (1 / length(graphlet_idx)) * 
+                   sum(
+                     2 * means[graphlet_idx]^2 /
+                     (Vd_sq[graphlet_idx] + means[graphlet_idx])
+                   )
+      
+      exp_counts_dk <- append(exp_counts_dk,
+                              lambda_dk / theta_d[graphlet_idx])
+    }
+    
+    exp_counts_dk
+  }
+  
+  nbins <- length(unique(bin_indexes))
+  expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins))
+  
+  # deal with NAs caused by bins with zero counts for a graphlet
+  expected_counts_bin[is.nan(expected_counts_bin)] = 0
+  
+  expected_counts_bin
+}
+
+binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1,
+                                                     binned_densities_1$interval_indexes,
+                                                     max_graphlet_size)
+
+binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2,
+                                                     binned_densities_2$interval_indexes,
+                                                     max_graphlet_size)
+
+
+

Centre graphlet counts of query graphs using binned expected counts

+
# Calculate expected graphlet counts for each ego network
+exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, 
+                                                                 ego_density_bins_1, 
+                                                                 binned_graphlet_counts_1,
+                                                                 max_graphlet_size,
+                                                                 scale_fn = NULL)
+
+
+exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, 
+                                                                 ego_density_bins_2, 
+                                                                 binned_graphlet_counts_2,
+                                                                 max_graphlet_size,
+                                                                 scale_fn = NULL)
+# Centre graphlet counts by subtracting expected counts
+centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1
+
+centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2
+
+
+

Sum centred graphlet counts across all ego networks

+
sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1)
+
+sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2)
+
+
+

Calculate netdis statistics

+
netdis_result <- netdis_uptok(sum_graphlet_counts_1, 
+                              sum_graphlet_counts_2, 
+                              max_graphlet_size)
+
+print(netdis_result)
+
##   netdis3   netdis4 
+## 0.8822527 0.1892755
+
+ + + + + + + + diff --git a/vignettes/netdis_customisations.R b/vignettes/netdis_customisations.R new file mode 100644 index 00000000..0026313e --- /dev/null +++ b/vignettes/netdis_customisations.R @@ -0,0 +1,101 @@ +## ------------------------------------------------------------------------ +# Load libraries +library("netdist") +library("purrr") + +## ------------------------------------------------------------------------ +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + + +## ------------------------------------------------------------------------ +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + +## ------------------------------------------------------------------------ + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +print(results$netdis) +print(results$comp_spec) + +## ------------------------------------------------------------------------ + +binning_fn <- purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 10, + num_intervals = 50) + + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn) + +print(results$netdis) +print(results$comp_spec) + + + +## ------------------------------------------------------------------------ +bin_counts_fn <- density_binned_counts_gp + +exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego, + scale_fn = NULL) + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) + +## ------------------------------------------------------------------------ + +binning_fn <- single_density_bin +bin_counts_fn <- density_binned_counts +exp_counts_fn <- netdis_expected_graphlet_counts_per_ego + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) + + + diff --git a/vignettes/netdis_customisations.html b/vignettes/netdis_customisations.html new file mode 100644 index 00000000..c60d9905 --- /dev/null +++ b/vignettes/netdis_customisations.html @@ -0,0 +1,494 @@ + + + + + + + + + + + + + + + + +Usage of netdis with binning and expected counts customisations. + + + + + + + + + + + + + + + + + + + + + +

Usage of netdis with binning and expected counts customisations.

+

Jack Roberts

+

2019-09-05

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Reference graph
+ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
+                        package = "netdist")
+ref_graph <- read_simple_graph(ref_path, format = "ncol")
+
+
+

Load query graphs

+
source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
+
+
+

Default Expected Counts with Reference Graph

+
# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges)
+
+print(results$netdis)
+
##              [,1]        [,2]       [,3]        [,4]      [,5]      [,6]
+## netdis3 0.1846655 0.008264222 0.01005385 0.006777578 0.2065762 0.2091241
+## netdis4 0.1749835 0.165264120 0.01969246 0.159711160 0.2917612 0.2215579
+##              [,7]         [,8]         [,9]        [,10]
+## netdis3 0.2075471 0.0001335756 0.0001748254 0.0005964448
+## netdis4 0.4171614 0.0760242643 0.0343418653 0.1311552411
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+
+

With Modified Binning Parameters

+
binning_fn <- purrr::partial(binned_densities_adaptive,
+                             min_counts_per_interval = 10,
+                             num_intervals = 50)
+
+
+# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges,
+                               binning_fn = binning_fn)
+
+print(results$netdis)
+
##               [,1]        [,2]        [,3]        [,4]      [,5]      [,6]
+## netdis3 0.08499773 0.005900766 0.009547675 0.007177066 0.1078916 0.1144589
+## netdis4 0.20037679 0.045244760 0.018904439 0.112043371 0.3361503 0.2631420
+##              [,7]         [,8]         [,9]        [,10]
+## netdis3 0.1101426 0.0006494388 2.478794e-05 0.0004097632
+## netdis4 0.4818139 0.0274434372 3.227187e-02 0.0928126401
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+
+

With Modified Expected Counts: Geometric Poisson

+
bin_counts_fn <- density_binned_counts_gp
+
+exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego,
+                                scale_fn = NULL)
+
+# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph = NULL,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges,
+                               bin_counts_fn = bin_counts_fn,
+                               exp_counts_fn = exp_counts_fn)
+
+print(results$netdis)
+
##              [,1]      [,2]      [,3]       [,4]       [,5]       [,6]
+## netdis3 0.8822527 0.9101084 0.8838054 0.96266771 0.04173551 0.03585169
+## netdis4 0.1892755 0.5752533 0.3719671 0.04604718 0.60705460 0.20370907
+##               [,7]         [,8]        [,9]       [,10]
+## netdis3 0.06271238 0.0004211575 0.005364888 0.009114229
+## netdis4 0.12978965 0.7178967193 0.490256248 0.371848474
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+
+

With Modified Expected Counts: Simple Mean

+
binning_fn <- single_density_bin
+bin_counts_fn <- density_binned_counts
+exp_counts_fn <- netdis_expected_graphlet_counts_per_ego
+
+# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph = NULL,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges,
+                               binning_fn = binning_fn,
+                               bin_counts_fn = bin_counts_fn,
+                               exp_counts_fn = exp_counts_fn)
+
+print(results$netdis)
+
##              [,1]      [,2]      [,3]       [,4]      [,5]      [,6]
+## netdis3 0.3116860 0.8254261 0.8768637 0.04053921 0.8531485 0.8226894
+## netdis4 0.9592365 0.2009423 0.7974697 0.21688688 0.7734930 0.2144558
+##              [,7]       [,8]      [,9]     [,10]
+## netdis3 0.2353732 0.01970843 0.8288649 0.9167543
+## netdis4 0.8030030 0.39992007 0.3300305 0.6301018
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+ + + + + + + + diff --git a/vignettes/netdis_multigraph_polya-aeppli.R b/vignettes/netdis_multigraph_polya-aeppli.R new file mode 100644 index 00000000..58e62b9f --- /dev/null +++ b/vignettes/netdis_multigraph_polya-aeppli.R @@ -0,0 +1,154 @@ +## ------------------------------------------------------------------------ +# Load libraries +library("netdist") +library("purrr") + +## ------------------------------------------------------------------------ +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + +## ------------------------------------------------------------------------ +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +## ------------------------------------------------------------------------ +# Get ego networks for query graphs +ego_networks <- purrr::map( + graphs, make_named_ego_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges +) + +## ------------------------------------------------------------------------ +# Count graphlets for ego networks in query graphs +graphlet_counts <- purrr::map( + ego_networks, + ego_to_graphlet_counts, + max_graphlet_size = max_graphlet_size +) + +## ------------------------------------------------------------------------ + +# Get ego-network densities +densities <- purrr::map(ego_networks, + ego_network_density) + +binning_fn = purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 5, + num_intervals = 100) +# Adaptively bin ego-network densities +binned_densities <- purrr::map(densities, + binning_fn) + +ego_density_bins <- purrr::map(binned_densities, function(x) {x$breaks}) +ego_density_bin_indexes <- purrr::map(binned_densities, function(x) {x$interval_indexes}) + + +## ------------------------------------------------------------------------ + +density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) { + + mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + graphlet_counts, + bin_indexes) + + exp_counts_bin <- function(bin_idx) { + counts <- graphlet_counts[bin_indexes == bin_idx, ] + means <- mean_binned_graphlet_counts[bin_idx,] + + mean_sub_counts <- sweep(counts, 2, means) + + Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1) + theta_d <- 2*means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) + + lambda_dk <- (1 / length(graphlet_idx)) * + sum( + 2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]) + ) + + exp_counts_dk <- append(exp_counts_dk, + lambda_dk / theta_d[graphlet_idx]) + } + + exp_counts_dk + } + + nbins <- length(unique(bin_indexes)) + expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins)) + + # deal with NAs caused by bins with zero counts for a graphlet + expected_counts_bin[is.nan(expected_counts_bin)] = 0 + + expected_counts_bin +} + + +bin_counts_fn <- purrr::partial(density_binned_counts_gp, + max_graphlet_size = max_graphlet_size) + +binned_graphlet_counts <- mapply(bin_counts_fn, + graphlet_counts, + ego_density_bin_indexes) + +## ------------------------------------------------------------------------ +# Calculate expected graphlet counts for each ego network +exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego, + max_graphlet_size = max_graphlet_size, + scale_fn=NULL) + +exp_graphlet_counts <- mapply(exp_counts_fn, + ego_networks, + ego_density_bins, + binned_graphlet_counts) + + + +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts <- mapply("-", graphlet_counts, exp_graphlet_counts) + + +## ------------------------------------------------------------------------ +sum_graphlet_counts <- lapply(centred_graphlet_counts, colSums) + +## ------------------------------------------------------------------------ + +# Generate pairwise comparisons +comp_spec <- cross_comparison_spec(sum_graphlet_counts, how = "many-to-many") + +## ------------------------------------------------------------------------ +# Calculate netdis statistics +results <- parallel::mcmapply( + function(index_a, index_b) { + netdis_uptok( + sum_graphlet_counts[[index_a]], + sum_graphlet_counts[[index_b]], + max_graphlet_size = max_graphlet_size + ) + }, + comp_spec$index_a, + comp_spec$index_b, + SIMPLIFY = TRUE) + + +list(netdis = results, comp_spec = comp_spec) + diff --git a/vignettes/netdis_multigraph_polya-aeppli.html b/vignettes/netdis_multigraph_polya-aeppli.html new file mode 100644 index 00000000..d23b9487 --- /dev/null +++ b/vignettes/netdis_multigraph_polya-aeppli.html @@ -0,0 +1,505 @@ + + + + + + + + + + + + + + + + +Netdis - Multiple graphs with Expected Counts from Geometric Poisson Approximation + + + + + + + + + + + + + + + + + + + + + +

Netdis - Multiple graphs with Expected Counts from Geometric Poisson Approximation

+

Martin O’Reilly, Jack Roberts

+

2019-09-05

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
+
+

Load graphs

+
# Set source directory for Virus PPI graph edge files
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+# Load query graphs
+graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Ego network density binning parameters
+min_bin_count <- 5
+num_bins <- 100
+
+
+

Generate ego networks

+
# Get ego networks for query graphs
+ego_networks <- purrr::map(
+  graphs, make_named_ego_graph,
+  order = neighbourhood_size, 
+  min_ego_nodes = min_ego_nodes, 
+  min_ego_edges = min_ego_edges
+)
+
+
+

Count graphlets in ego networks

+
# Count graphlets for ego networks in query graphs
+graphlet_counts <- purrr::map(
+  ego_networks,
+  ego_to_graphlet_counts,
+  max_graphlet_size = max_graphlet_size
+)
+
+
+

Bin ego networks by density

+
# Get ego-network densities
+densities <- purrr::map(ego_networks,
+                        ego_network_density)
+
+binning_fn = purrr::partial(binned_densities_adaptive,
+                            min_counts_per_interval = 5,
+                            num_intervals = 100)
+# Adaptively bin ego-network densities
+binned_densities <- purrr::map(densities,
+                               binning_fn)
+
+ego_density_bins <- purrr::map(binned_densities, function(x) {x$breaks})
+ego_density_bin_indexes <- purrr::map(binned_densities, function(x) {x$interval_indexes})
+
+
+

Calculate expected graphlet counts in each bin using geometric poisson approximation

+
density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) {
+  
+  mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
+                                  graphlet_counts, 
+                                  bin_indexes)
+  
+  exp_counts_bin <- function(bin_idx) {
+    counts <- graphlet_counts[bin_indexes == bin_idx, ]
+    means <- mean_binned_graphlet_counts[bin_idx,]
+    
+    mean_sub_counts <- sweep(counts, 2, means)
+    
+    Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1)
+    theta_d <- 2*means / (Vd_sq + means)
+
+    exp_counts_dk <- vector()
+    for (k in 2:max_graphlet_size) {
+      graphlet_idx <- graphlet_ids_for_size(k)
+      
+      lambda_dk <- (1 / length(graphlet_idx)) * 
+                   sum(
+                     2 * means[graphlet_idx]^2 /
+                     (Vd_sq[graphlet_idx] + means[graphlet_idx])
+                   )
+      
+      exp_counts_dk <- append(exp_counts_dk,
+                              lambda_dk / theta_d[graphlet_idx])
+    }
+    
+    exp_counts_dk
+  }
+  
+  nbins <- length(unique(bin_indexes))
+  expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins))
+  
+  # deal with NAs caused by bins with zero counts for a graphlet
+  expected_counts_bin[is.nan(expected_counts_bin)] = 0
+  
+  expected_counts_bin
+}
+
+
+bin_counts_fn <- purrr::partial(density_binned_counts_gp,
+                               max_graphlet_size = max_graphlet_size)
+
+binned_graphlet_counts <- mapply(bin_counts_fn,
+                                 graphlet_counts,
+                                 ego_density_bin_indexes)
+
+
+

Centre graphlet counts of query graphs using binned expected counts

+
# Calculate expected graphlet counts for each ego network
+exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego,
+                               max_graphlet_size = max_graphlet_size,
+                               scale_fn=NULL)
+
+exp_graphlet_counts <- mapply(exp_counts_fn,
+                              ego_networks,
+                              ego_density_bins,
+                              binned_graphlet_counts)
+
+
+
+# Centre graphlet counts by subtracting expected counts
+centred_graphlet_counts <-  mapply("-", graphlet_counts, exp_graphlet_counts)
+
+
+

Sum centred graphlet counts across all ego networks

+
sum_graphlet_counts <- lapply(centred_graphlet_counts, colSums)
+
+
+

Calculate netdis statistics

+
# Generate pairwise comparisons
+comp_spec <- cross_comparison_spec(sum_graphlet_counts, how = "many-to-many")
+
+## ------------------------------------------------------------------------
+# Calculate netdis statistics
+results <- parallel::mcmapply(
+    function(index_a, index_b) {
+      netdis_uptok(
+        sum_graphlet_counts[[index_a]], 
+        sum_graphlet_counts[[index_b]],
+        max_graphlet_size = max_graphlet_size
+      )
+    },
+    comp_spec$index_a,
+    comp_spec$index_b,
+    SIMPLIFY = TRUE)
+  
+  
+list(netdis = results, comp_spec = comp_spec)
+
## $netdis
+##              [,1]      [,2]      [,3]       [,4]       [,5]       [,6]
+## netdis3 0.8822527 0.9101084 0.8838054 0.96266771 0.04173551 0.03585169
+## netdis4 0.1892755 0.5752533 0.3719671 0.04604718 0.60705460 0.20370907
+##               [,7]         [,8]        [,9]       [,10]
+## netdis3 0.06271238 0.0004211575 0.005364888 0.009114229
+## netdis4 0.12978965 0.7178967193 0.490256248 0.371848474
+## 
+## $comp_spec
+##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+ + + + + + + + diff --git a/vignettes/quickstart_netdis.html b/vignettes/quickstart_netdis.html index 278edd5c..8f35b252 100644 --- a/vignettes/quickstart_netdis.html +++ b/vignettes/quickstart_netdis.html @@ -12,7 +12,7 @@ - + Quick start guide for Netdis @@ -20,9 +20,9 @@ + + + + + + + + + + + + + + + + + +

Usage of netdis with different pairwise comparison options.

+

Jack Roberts

+

2019-09-05

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Ego network density binning parameters
+min_bin_count <- 5
+num_bins <- 100
+
+# Reference graph
+ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
+                        package = "netdist")
+ref_graph <- read_simple_graph(ref_path, format = "ncol")
+
+
+

Compare two graphs

+
# Load query graphs
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+# Calculate netdis statistics
+netdis_one_to_one(graph_1, graph_2,
+                  ref_graph,
+                  max_graphlet_size = max_graphlet_size,
+                  neighbourhood_size = neighbourhood_size,
+                  min_ego_nodes = min_ego_nodes,
+                  min_ego_edges = min_ego_edges)
+
##   netdis3   netdis4 
+## 0.1846655 0.1749835
+
+
+

Compare one graph to many other graphs

+
# Load query graphs
+graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
+graph_1 <- graphs$EBV
+graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")]
+
+# Calculate netdis statistics
+netdis_one_to_many(graph_1, graphs_compare,
+                   ref_graph,
+                   max_graphlet_size = max_graphlet_size,
+                   neighbourhood_size = neighbourhood_size,
+                   min_ego_nodes = min_ego_nodes,
+                   min_ego_edges = min_ego_edges)
+
##               ECL       HSV-1       KSHV         VZV
+## netdis3 0.1846655 0.008264222 0.01005385 0.006777578
+## netdis4 0.1749835 0.165264120 0.01969246 0.159711160
+
+
+

Do pairwise netdis calculations for many graphs

+
# Load query graphs
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
+
+# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges)
+
+print(results$netdis)
+
##              [,1]        [,2]       [,3]        [,4]      [,5]      [,6]
+## netdis3 0.1846655 0.008264222 0.01005385 0.006777578 0.2065762 0.2091241
+## netdis4 0.1749835 0.165264120 0.01969246 0.159711160 0.2917612 0.2215579
+##              [,7]         [,8]         [,9]        [,10]
+## netdis3 0.2075471 0.0001335756 0.0001748254 0.0005964448
+## netdis4 0.4171614 0.0760242643 0.0343418653 0.1311552411
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+ + + + + + + + From 1d700425ff80b7ba01da5551e7ee89fe9b1aa91e Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 5 Sep 2019 15:09:46 +0100 Subject: [PATCH 032/188] export single_density_bin function --- NAMESPACE | 1 + R/measures_net_dis.R | 1 + vignettes/netdis_customisations.Rmd | 3 --- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index db3a1ef2..2bda06d9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -72,6 +72,7 @@ export(scale_graphlet_count) export(scale_graphlet_counts_ego) export(shift_dhist) export(simplify_graph) +export(single_density_bin) export(sort_dhist) export(zeros_to_ones) importFrom(Rcpp,sourceCpp) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index f8740a22..4eaddaa3 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -720,6 +720,7 @@ mean_density_binned_graphlet_counts <- function(graphlet_counts, #' For case where don't want to use binning, return #' a single bin which covers full range of possible #' densities. +#' @export single_density_bin <- function(densities) { binned_densities <- list(densities = densities, diff --git a/vignettes/netdis_customisations.Rmd b/vignettes/netdis_customisations.Rmd index afcac7a8..56baed73 100644 --- a/vignettes/netdis_customisations.Rmd +++ b/vignettes/netdis_customisations.Rmd @@ -102,7 +102,6 @@ print(results$comp_spec) ## With Modified Expected Counts: Simple Mean ```{r} - binning_fn <- single_density_bin bin_counts_fn <- density_binned_counts exp_counts_fn <- netdis_expected_graphlet_counts_per_ego @@ -120,6 +119,4 @@ results <- netdis_many_to_many(graphs, print(results$netdis) print(results$comp_spec) - - ``` \ No newline at end of file From 72982019ffb62b6a8559088606f3806cc7edfd75 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 5 Sep 2019 17:30:28 +0100 Subject: [PATCH 033/188] some docs and formatting improvements --- R/measures_net_dis.R | 68 +- tests/testthat/test_measures_new_net_dis.R | 1023 ++++++++++++++++++++ 2 files changed, 1070 insertions(+), 21 deletions(-) create mode 100644 tests/testthat/test_measures_new_net_dis.R diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 4eaddaa3..0858d874 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -19,9 +19,14 @@ netdis_one_to_one <- function(graph_1, graph_2, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, num_intervals = 100), - bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, scale_fn=count_graphlet_tuples)) { + binning_fn = purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 5, + num_intervals = 100), + bin_counts_fn = purrr::partial(density_binned_counts, + agg_fn = mean, + scale_fn = scale_graphlet_counts_ego), + exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, + scale_fn = count_graphlet_tuples)) { # bundle graphs into one vector with format needed for # netdis many-to-many @@ -65,19 +70,24 @@ netdis_one_to_many <- function(graph_1, graphs_compare, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, num_intervals = 100), - bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, scale_fn=count_graphlet_tuples)) { + binning_fn = purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 5, + num_intervals = 100), + bin_counts_fn = purrr::partial(density_binned_counts, + agg_fn = mean, + scale_fn = scale_graphlet_counts_ego), + exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, + scale_fn = count_graphlet_tuples)) { # bundle graph_1 and graphs_compare to one vector, with # graph_1 at start as needed for netdis_many_to_many call - graphs <- append(graphs_compare, list(graph_1=graph_1), after=0) + graphs <- append(graphs_compare, list(graph_1 = graph_1), after = 0) # calculate netdis result <- netdis_many_to_many( graphs, ref_graph, - comparisons = 'one-to-many', + comparisons = "one-to-many", max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, @@ -112,7 +122,7 @@ netdis_one_to_many <- function(graph_1, graphs_compare, #' @export netdis_many_to_many <- function(graphs, ref_graph, - comparisons = 'many-to-many', + comparisons = "many-to-many", max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, @@ -124,7 +134,7 @@ netdis_many_to_many <- function(graphs, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_fn=count_graphlet_tuples)) { + scale_fn = count_graphlet_tuples)) { ## ------------------------------------------------------------------------ # Get ego networks for query graphs ego_networks <- purrr::map( @@ -198,9 +208,13 @@ netdis_many_to_many <- function(graphs, # extract bin breaks and indexes from binning results ego_density_bin_breaks <- purrr::map(binned_densities, - function(x) {x$breaks}) + function(x) { + x$breaks + }) ego_density_bin_indexes <- purrr::map(binned_densities, - function(x) {x$interval_indexes}) + function(x) { + x$interval_indexes + }) # Calculate expected counts in each bin @@ -590,9 +604,6 @@ netdis_expected_graphlet_counts_ego <- function(graph, min_ego_edges = 1, scale_fn=NULL) { - #print("netdis_expected_graphlet_counts_ego") - #print(density_binned_reference_counts) - # Generate ego-networks for query graph ego_networks <- make_named_ego_graph(graph, neighbourhood_size) # Drop ego-networks that don't have the minimum number of nodes or edges @@ -633,9 +644,6 @@ netdis_expected_graphlet_counts_per_ego <- function(ego_networks, scale_fn=NULL) { - #print("netdis_expected_graphlet_counts_per_ego") - #print(density_binned_reference_counts) - # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. expected_graphlet_counts <- @@ -665,9 +673,6 @@ netdis_expected_graphlet_counts <- function(graph, density_binned_reference_counts, scale_fn=NULL) { - #print("netdis_expected_graphlet_counts") - #print(density_binned_reference_counts) - # Look up average scaled graphlet counts for graphs of similar density # in the reference graph query_density <- igraph::edge_density(graph) @@ -808,6 +813,11 @@ density_binned_counts_gp <- function(graphlet_counts, } +#' Replace zero values in a vector with ones. Used by +#' \code{scale_graphlet_count} to prevent divide by +#' zero errors. +#' @param v A vector. +#' TODO remove export #' @export zeros_to_ones <- function(v) { zero_index <- which(v == 0) @@ -816,6 +826,10 @@ zeros_to_ones <- function(v) { } +#' Divide graphlet counts by pre-computed scaling factor from +#' \code{count_graphlet_tuples} output. +#' @param graphlet_count Pre-computed graphlet counts. +#' @param graphlet_tuples Pre-computed \code{count_graphlet_tuples} output. #' @export scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { # Avoid divide by zero errors by replacing all zeros with ones in the @@ -823,6 +837,10 @@ scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { graphlet_count / zeros_to_ones(graphlet_tuples) } +#' Run count_graphlet_tuples across pre-computed ego networks. +#' @param ego_networks Pre-generated ego networks for an input graph. +#' @param max_graphlet_size Determines the maximum size of graphlets included +#' in the tuple counts. #' @export count_graphlet_tuples_ego <- function(ego_networks, max_graphlet_size) { graphlet_tuple_counts <- @@ -832,6 +850,8 @@ count_graphlet_tuples_ego <- function(ego_networks, max_graphlet_size) { graphlet_tuple_counts } +#' Calculate ego network edge densities. +#' @param ego_networks Pre-generated ego networks for an input graph. #' @export ego_network_density <- function(ego_networks) { densities <- purrr::simplify(purrr::map_dbl( @@ -870,6 +890,12 @@ scale_graphlet_counts_ego <- function(ego_networks, graphlet_counts, } +# For each graphlet calculate the number of possible sets of k nodes in the +# query graph, where k is the number of nodes in the graphlet. +#' @param graph A connected, undirected, simple graph as an \code{igraph} +#' object. +#' @param max_graphlet_size Determines the maximum size of graphlets included +#' in the tuple counts. #' @export count_graphlet_tuples <- function(graph, max_graphlet_size) { graph_node_count <- igraph::vcount(graph) diff --git a/tests/testthat/test_measures_new_net_dis.R b/tests/testthat/test_measures_new_net_dis.R new file mode 100644 index 00000000..60c14ebc --- /dev/null +++ b/tests/testthat/test_measures_new_net_dis.R @@ -0,0 +1,1023 @@ + +context("Measures Netdis: Graphlet tuples") +test_message <- + paste("count_graphlet_tuples and count_graphlet_tuples_ego give", + "choose(node_count, graphlet_size) for each graph + graphlet", + "combination", + sep = " " + ) +test_that(test_message, { + # Create some test graphs with known node counts (this is the only graph + # property we care about for this test) + graph_n11 <- igraph::erdos.renyi.game(11, p = 1, type = "gnp") + graph_n37 <- igraph::erdos.renyi.game(37, p = 1, type = "gnp") + graph_n73 <- igraph::erdos.renyi.game(73, p = 1, type = "gnp") + # Calculate expected graph tuple count for graphlets of various sizes. There + # is 1 graphlet of size 1, 2 of size 3, 6 of size 4, and 21 of size 5 + graphlet_tuple_counts <- function(n, max_graphlet_size) { + if (max_graphlet_size >= 2) { + tuple_counts <- rep(choose(n, 2), 1) + } + if (max_graphlet_size >= 3) { + tuple_counts <- c(tuple_counts, rep(choose(n, 3), 2)) + } + if (max_graphlet_size >= 4) { + tuple_counts <- c(tuple_counts, rep(choose(n, 4), 6)) + } + if (max_graphlet_size >= 5) { + tuple_counts <- c(tuple_counts, rep(choose(n, 5), 21)) + } + tuple_counts <- setNames(tuple_counts, graphlet_key(max_graphlet_size)$id) + tuple_counts + } + + # === TEST count_graphlet_tuples === + # Generate expected tuple counts for graphlets up to size 4 and 5 + expected_tuple_count_n11_gs4 <- graphlet_tuple_counts(11, 4) + expected_tuple_count_n37_gs4 <- graphlet_tuple_counts(37, 4) + expected_tuple_count_n73_gs4 <- graphlet_tuple_counts(73, 4) + expected_tuple_count_n11_gs5 <- graphlet_tuple_counts(11, 5) + expected_tuple_count_n37_gs5 <- graphlet_tuple_counts(37, 5) + expected_tuple_count_n73_gs5 <- graphlet_tuple_counts(73, 5) + # Generate actual tuple counts for graphlets up to size 4 and 5 + actual_tuple_count_n11_gs4 <- count_graphlet_tuples(graph_n11, 4) + actual_tuple_count_n37_gs4 <- count_graphlet_tuples(graph_n37, 4) + actual_tuple_count_n73_gs4 <- count_graphlet_tuples(graph_n73, 4) + actual_tuple_count_n11_gs5 <- count_graphlet_tuples(graph_n11, 5) + actual_tuple_count_n37_gs5 <- count_graphlet_tuples(graph_n37, 5) + actual_tuple_count_n73_gs5 <- count_graphlet_tuples(graph_n73, 5) + # Compare expected tuple counts with actual + expect_equal(expected_tuple_count_n11_gs4, actual_tuple_count_n11_gs4) + expect_equal(expected_tuple_count_n37_gs4, actual_tuple_count_n37_gs4) + expect_equal(expected_tuple_count_n73_gs4, actual_tuple_count_n73_gs4) + expect_equal(expected_tuple_count_n11_gs5, actual_tuple_count_n11_gs5) + expect_equal(expected_tuple_count_n37_gs5, actual_tuple_count_n37_gs5) + expect_equal(expected_tuple_count_n73_gs5, actual_tuple_count_n73_gs5) + + # === TEST count_graphlet_tuples_ego === + # NOTE: This test is not amazing, as graphlet_tuple_counts_ego is very similar + # to the method under test. However, it's a simple method so maybe that's ok? + graphlet_tuple_counts_ego <- function(ego_networks, max_graphlet_size) { + t(sapply(ego_networks, FUN = function(g) { + graphlet_tuple_counts(length(igraph::V(g)), max_graphlet_size) + })) + } + # Generate ego networks for each graph + graph_n11_ego1 <- make_named_ego_graph(graph_n11, order = 1) + graph_n37_ego1 <- make_named_ego_graph(graph_n37, order = 1) + graph_n73_ego1 <- make_named_ego_graph(graph_n73, order = 1) + graph_n11_ego2 <- make_named_ego_graph(graph_n11, order = 2) + graph_n37_ego2 <- make_named_ego_graph(graph_n37, order = 2) + graph_n73_ego2 <- make_named_ego_graph(graph_n73, order = 2) + # Generate expected tuple counts for graphlets up to size 4 and 5 + # 1. For ego-networks of order 1 + expected_tuple_count_n11_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego1, 4) + expected_tuple_count_n37_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n37_ego1, 4) + expected_tuple_count_n73_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n73_ego1, 4) + expected_tuple_count_n11_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n11_ego1, 5) + expected_tuple_count_n37_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n37_ego1, 5) + expected_tuple_count_n73_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n73_ego1, 5) + # 2. For ego-networks of order 2 + expected_tuple_count_n11_ego2_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego2, 4) + expected_tuple_count_n37_ego2_gs4 <- graphlet_tuple_counts_ego(graph_n37_ego2, 4) + expected_tuple_count_n73_ego2_gs4 <- graphlet_tuple_counts_ego(graph_n73_ego2, 4) + expected_tuple_count_n11_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n11_ego2, 5) + expected_tuple_count_n37_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n37_ego2, 5) + expected_tuple_count_n73_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n73_ego2, 5) + + # Calculate actual tuple counts + # 1. For ego-networks of order 1 + actual_tuple_count_n11_ego1_gs4 <- count_graphlet_tuples_ego(graph_n11_ego1, 4) + actual_tuple_count_n37_ego1_gs4 <- count_graphlet_tuples_ego(graph_n37_ego1, 4) + actual_tuple_count_n73_ego1_gs4 <- count_graphlet_tuples_ego(graph_n73_ego1, 4) + actual_tuple_count_n11_ego1_gs5 <- count_graphlet_tuples_ego(graph_n11_ego1, 5) + actual_tuple_count_n37_ego1_gs5 <- count_graphlet_tuples_ego(graph_n37_ego1, 5) + actual_tuple_count_n73_ego1_gs5 <- count_graphlet_tuples_ego(graph_n73_ego1, 5) + # 2. For ego-networks of order 2 + actual_tuple_count_n11_ego2_gs4 <- count_graphlet_tuples_ego(graph_n11_ego2, 4) + actual_tuple_count_n37_ego2_gs4 <- count_graphlet_tuples_ego(graph_n37_ego2, 4) + actual_tuple_count_n73_ego2_gs4 <- count_graphlet_tuples_ego(graph_n73_ego2, 4) + actual_tuple_count_n11_ego2_gs5 <- count_graphlet_tuples_ego(graph_n11_ego2, 5) + actual_tuple_count_n37_ego2_gs5 <- count_graphlet_tuples_ego(graph_n37_ego2, 5) + actual_tuple_count_n73_ego2_gs5 <- count_graphlet_tuples_ego(graph_n73_ego2, 5) + + # Compare expected with actual + # 1. For ego-networks of order 1 + expect_equal(expected_tuple_count_n11_ego1_gs4, actual_tuple_count_n11_ego1_gs4) + expect_equal(expected_tuple_count_n37_ego1_gs4, actual_tuple_count_n37_ego1_gs4) + expect_equal(expected_tuple_count_n73_ego1_gs4, actual_tuple_count_n73_ego1_gs4) + expect_equal(expected_tuple_count_n11_ego1_gs5, actual_tuple_count_n11_ego1_gs5) + expect_equal(expected_tuple_count_n37_ego1_gs5, actual_tuple_count_n37_ego1_gs5) + expect_equal(expected_tuple_count_n73_ego1_gs5, actual_tuple_count_n73_ego1_gs5) + # 2. For ego-networks of order 2 + expect_equal(expected_tuple_count_n11_ego2_gs4, actual_tuple_count_n11_ego2_gs4) + expect_equal(expected_tuple_count_n37_ego2_gs4, actual_tuple_count_n37_ego2_gs4) + expect_equal(expected_tuple_count_n73_ego2_gs4, actual_tuple_count_n73_ego2_gs4) + expect_equal(expected_tuple_count_n11_ego2_gs5, actual_tuple_count_n11_ego2_gs5) + expect_equal(expected_tuple_count_n37_ego2_gs5, actual_tuple_count_n37_ego2_gs5) + expect_equal(expected_tuple_count_n73_ego2_gs5, actual_tuple_count_n73_ego2_gs5) +}) + +context("Measures Netdis: Ego-network scaled graphlet outputs for manually verified networks") +test_that("Ego-network 4-node graphlet counts match manually verified totals", { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set node and graphlet labels to use for row and col names in expected counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + + # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 + max_graphlet_size <- 4 + min_ego_edges <- 0 + min_ego_nodes <- 0 + + actual_counts_order_1 <- + count_graphlets_ego_scaled(graph, + max_graphlet_size = max_graphlet_size, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes, + neighbourhood_size = 1 + ) + actual_counts_order_2 <- + count_graphlets_ego_scaled(graph, + max_graphlet_size = max_graphlet_size, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes, + neighbourhood_size = 2 + ) + + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # Set manually verified counts + # 1-step ego networks + expected_counts_order_1 <- rbind( + c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) + ) + rownames(expected_counts_order_1) <- node_labels + colnames(expected_counts_order_1) <- graphlet_labels + # 2-step ego networks + expected_counts_order_2 <- rbind( + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) + ) + rownames(expected_counts_order_2) <- node_labels + colnames(expected_counts_order_2) <- graphlet_labels + + # Test that actual counts match expected with only counts requested (default) + expect_equal(actual_counts_order_1, expected_counts_order_1) + expect_equal(actual_counts_order_2, expected_counts_order_2) + + # Test that actual counts and returned ego networks match expected + # 1. Define expected + expected_ego_networks_order_1 <- make_named_ego_graph(graph, + order = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + expected_ego_networks_order_2 <- make_named_ego_graph(graph, + order = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + expected_counts_with_networks_order_1 <- + list( + graphlet_counts = expected_counts_order_1, + ego_networks = expected_ego_networks_order_1 + ) + expected_counts_with_networks_order_2 <- + list( + graphlet_counts = expected_counts_order_2, + ego_networks = expected_ego_networks_order_2 + ) + # 2. Calculate actual + actual_counts_with_networks_order_1 <- + count_graphlets_ego_scaled(graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes, + return_ego_networks = TRUE + ) + actual_counts_with_networks_order_2 <- + count_graphlets_ego_scaled(graph, + max_graphlet_size = max_graphlet_size, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes, + neighbourhood_size = 2, return_ego_networks = TRUE + ) + + # 3. Compare + # Comparison is not implemented for igraph objects, so convert all igraphs to + # indexed edge list and then compare. Do in-situ replacement of igraphs with + # indexed edge lists to ensure we are checking full properties of returned + # objects (i.e. named lists with matching elements). + # 3a. Convert expected and actual ego networks from igraphs to indexed edges + expected_counts_with_networks_order_1$ego_networks <- + purrr::map( + expected_counts_with_networks_order_1$ego_networks, + graph_to_indexed_edges + ) + expected_counts_with_networks_order_2$ego_networks <- + purrr::map( + expected_counts_with_networks_order_2$ego_networks, + graph_to_indexed_edges + ) + actual_counts_with_networks_order_1$ego_networks <- + purrr::map( + actual_counts_with_networks_order_1$ego_networks, + graph_to_indexed_edges + ) + actual_counts_with_networks_order_2$ego_networks <- + purrr::map( + actual_counts_with_networks_order_2$ego_networks, + graph_to_indexed_edges + ) + # 3b. Do comparison + expect_equal( + actual_counts_with_networks_order_1, + expected_counts_with_networks_order_1 + ) + expect_equal( + actual_counts_with_networks_order_2, + expected_counts_with_networks_order_2 + ) +}) + +context("Measures Netdis: Ego-network density-binned reference counts for manually verified networks") +test_that("Ego-network 4-node density-binned reference counts match manually verified totals", { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set parameters for test + max_graphlet_size <- 4 + min_counts_per_interval <- 2 + num_intervals <- 100 + + # Set node and graphlet labels to use for row and col names in expected counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + + # Set manually verified ego-network node counts and edge densities + # 1 . Ego-networks of order 1 + expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) + expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) + # Order 1 expected densities should be: + # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 + # 2. Ego-networks of order 2 + expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) + expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) + # Order 2 expected densities should be: + # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 + + # Set manually verified density bins for ego-networks + # 1. Ego-networks of order 1 + expected_breaks_o1 <- c(0.5, 0.605, 0.705, 1) + expected_interval_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) + expected_binned_densities_o1 <- list( + densities = expected_densities_o1, + interval_indexes = expected_interval_indexes_o1, + breaks = expected_breaks_o1 + ) + # Check binned densities are as expected + actual_binned_densities_o1 <- binned_densities_adaptive( + expected_densities_o1, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) + expect_equal(actual_binned_densities_o1, expected_binned_densities_o1) + # 2. Ego-networks of order 2 + expected_min_break_o2 <- 1 / 3 + expected_max_break_o2 <- 0.6 + expected_initial_interval_o2 <- + (expected_max_break_o2 - expected_min_break_o2) / (num_intervals) # 0.00266666667 + expected_breaks_o2 <- expected_min_break_o2 + (expected_initial_interval_o2 * c(0, 9, 50, 63, 100)) + expected_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) + expected_binned_densities_o2 <- list( + densities = expected_densities_o2, + interval_indexes = expected_interval_indexes_o2, + breaks = expected_breaks_o2 + ) + # Check binned densities are as expected + actual_binned_densities_o2 <- binned_densities_adaptive( + expected_densities_o2, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) + expect_equal(actual_binned_densities_o2, expected_binned_densities_o2) + + # Set manually verified scaled ego-network graphlet counts + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # 1-step ego networks + expected_counts_o1 <- rbind( + c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) + ) + rownames(expected_counts_o1) <- node_labels + colnames(expected_counts_o1) <- graphlet_labels + # 2-step ego networks + expected_counts_o2 <- rbind( + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) + ) + rownames(expected_counts_o2) <- node_labels + colnames(expected_counts_o2) <- graphlet_labels + + # Calculate binned average expected counts based on manually verified counts + # and density bins + # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 + mean_counts_bin1_o1 <- (expected_counts_o1[1, ] + expected_counts_o1[2, ]) / 2 + mean_counts_bin2_o1 <- (expected_counts_o1[6, ] + expected_counts_o1[7, ] + + expected_counts_o1[8, ]) / 3 + mean_counts_bin3_o1 <- (expected_counts_o1[3, ] + expected_counts_o1[4, ] + + expected_counts_o1[5, ] + expected_counts_o1[9, ] + + expected_counts_o1[10, ]) / 5 + expected_mean_density_binned_counts_o1 <- rbind( + mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 + ) + rownames(expected_mean_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 + mean_counts_bin1_o2 <- (expected_counts_o2[1, ] + expected_counts_o2[4, ]) / 2 + mean_counts_bin2_o2 <- (expected_counts_o2[2, ] + expected_counts_o2[6, ] + + expected_counts_o2[7, ]) / 3 + mean_counts_bin3_o2 <- (expected_counts_o2[3, ] + expected_counts_o2[5, ]) / 2 + mean_counts_bin4_o2 <- (expected_counts_o2[8, ] + expected_counts_o2[9, ] + + expected_counts_o2[10, ]) / 3 + expected_mean_density_binned_counts_o2 <- rbind( + mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, + mean_counts_bin4_o2 + ) + rownames(expected_mean_density_binned_counts_o2) <- 1:4 + + # Calculate actual output of function under test + actual_mean_density_binned_counts_o1 <- mean_density_binned_graphlet_counts( + expected_counts_o1, expected_interval_indexes_o1 + ) + actual_mean_density_binned_counts_o2 <- mean_density_binned_graphlet_counts( + expected_counts_o2, expected_interval_indexes_o2 + ) + + # Check actual output vs expected + expect_equal( + actual_mean_density_binned_counts_o1, + expected_mean_density_binned_counts_o1 + ) + expect_equal( + actual_mean_density_binned_counts_o2, + expected_mean_density_binned_counts_o2 + ) +}) + +context("Measures Netdis: Expected graphlet counts") +test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { + # Helper function to generate graphs with known density and number of nodes + rand_graph <- function(num_nodes, density) { + max_edges <- choose(num_nodes, 2) + num_edges <- density * max_edges + igraph::erdos.renyi.game(num_nodes, num_edges, "gnm", + loops = FALSE, directed = FALSE + ) + } + # Set up some dummy reference density breaks and scaled reference counts + density_breaks <- c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0) + scaled_reference_counts <- rbind( + c(1, 2, 3, 4, 5, 6, 7, 8, 9), + c(11, 12, 13, 14, 15, 16, 17, 18, 19), + c(21, 22, 23, 24, 25, 26, 27, 28, 29), + c(31, 32, 33, 34, 35, 36, 37, 38, 39), + c(41, 42, 43, 44, 45, 46, 47, 48, 49), + c(51, 52, 53, 54, 55, 56, 57, 58, 59), + c(61, 62, 63, 64, 65, 66, 67, 68, 69), + c(71, 72, 73, 74, 75, 76, 77, 78, 79), + c(81, 82, 83, 84, 85, 86, 87, 88, 89), + c(91, 92, 93, 94, 95, 96, 97, 98, 99) + ) + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + colnames(scaled_reference_counts) <- graphlet_labels + rownames(scaled_reference_counts) <- 1:10 + graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) + names(graphlet_sizes) <- graphlet_labels + max_graphlet_size <- 4 + + # Generate some test graphs + densities <- c(0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95) + density_indexes <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + num_nodes <- rep(120, 10) + graphs <- purrr::map2(num_nodes, densities, rand_graph) + + # Helper function to calculate expected expected graphlet counts + expected_expected_graphlet_counts_fn <- function(density_index, node_count) { + reference_counts <- scaled_reference_counts[density_index, ] + reference_counts * choose(node_count, graphlet_sizes) + } + # Determine expected and actual expected graphlet counts + expected_expected_graphlet_counts <- + purrr::map2(density_indexes, num_nodes, expected_expected_graphlet_counts_fn) + actual_expected_graphlet_counts <- + purrr::map(graphs, netdis_expected_graphlet_counts, + max_graphlet_size = max_graphlet_size, + density_breaks = density_breaks, + density_binned_reference_counts = scaled_reference_counts, + scale_fn=count_graphlet_tuples + ) + # Loop over each graph and compare expected with actual + # NOTE: v2.0.0 of testthat library made a breaking change that means using + # map, mapply etc can cause failures under certain conditions + # See: https://github.com/r-lib/testthat/releases/tag/v2.0.0 + for (i in 1:length(actual_expected_graphlet_counts)) { + expect_equal( + actual_expected_graphlet_counts[i], + expected_expected_graphlet_counts[i] + ) + } +}) + +test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes", { + # Helper function to generate graphs with known density and number of nodes + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) + max_graphlet_size <- 4 + min_ego_edges <- 0 + min_ego_nodes <- 0 + + # Make graph ego networks + ego_networks_o1 <- make_named_ego_graph(graph, + order = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + ego_networks_o2 <- make_named_ego_graph(graph, + order = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + # Set manually-verified node counts and densities + # 1. Ego-networks of order 1 + num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edges_o1 <- choose(num_nodes_o1, 2) + densities_o1 <- num_edges_o1 / max_edges_o1 + # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 + # 2. Ego-networks of order 2 + num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edges_o2 <- choose(num_nodes_o2, 2) + densities_o2 <- num_edges_o2 / max_edges_o2 + # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 + # Set manually defined density breaks and indexes + breaks <- c(0, 0.11, 0.21, 0.31, 0.41, 0.51, 0.61, 0.71, 0.81, 0.91, 1.0) + density_indexes_o1 <- c(6, 5, 10, 9, 10, 7, 7, 7, 10, 10) + density_indexes_o2 <- c(4, 4, 5, 4, 5, 5, 5, 6, 6, 6) + # Set dummy reference counts + scaled_reference_counts <- rbind( + c(1, 2, 3, 4, 5, 6, 7, 8, 9), + c(11, 12, 13, 14, 15, 16, 17, 18, 19), + c(21, 22, 23, 24, 25, 26, 27, 28, 29), + c(31, 32, 33, 34, 35, 36, 37, 38, 39), + c(41, 42, 43, 44, 45, 46, 47, 48, 49), + c(51, 52, 53, 54, 55, 56, 57, 58, 59), + c(61, 62, 63, 64, 65, 66, 67, 68, 69), + c(71, 72, 73, 74, 75, 76, 77, 78, 79), + c(81, 82, 83, 84, 85, 86, 87, 88, 89), + c(91, 92, 93, 94, 95, 96, 97, 98, 99) + ) + expected_dims <- dim(scaled_reference_counts) + min_ego_nodes <- 3 + min_ego_edges <- 1 + + # Helper function to calculate expected expected graphlet counts + expected_expected_graphlet_counts_fn <- function(density_index, node_count) { + reference_counts <- scaled_reference_counts[density_index, ] + reference_counts * choose(node_count, graphlet_sizes) + } + # Calculate expected graphlet counts. NOTE: We expect a matrix with graphlet + # types as columns and ego networks for nodes in graph as rows + expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( + density_indexes_o1, num_nodes_o1, expected_expected_graphlet_counts_fn + ))) + expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( + density_indexes_o2, num_nodes_o2, expected_expected_graphlet_counts_fn + ))) + # Sanity check for expected output shape. Should be matrix with graphlet types + # as columns and nodes as rows + expect_equal(dim(expected_expected_graphlet_counts_ego_o1), expected_dims) + expect_equal(dim(expected_expected_graphlet_counts_ego_o2), expected_dims) + # Set column labels to graphlet names + colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels + colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels + # Set row labels to ego network names + rownames(expected_expected_graphlet_counts_ego_o1) <- names(ego_networks_o1) + rownames(expected_expected_graphlet_counts_ego_o2) <- names(ego_networks_o2) + # Drop rows for nodes with ewer than minumum required nodes and edges in ego + # network + expected_expected_graphlet_counts_ego_o1 <- + expected_expected_graphlet_counts_ego_o1[ + (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), + ] + expected_expected_graphlet_counts_ego_o2 <- + expected_expected_graphlet_counts_ego_o2[ + (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), + ] + + # Calculate actual output of function under test + actual_expected_graphlet_counts_ego_o1 <- + netdis_expected_graphlet_counts_ego( + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, density_breaks = breaks, + density_binned_reference_counts = scaled_reference_counts, + min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, + scale_fn=count_graphlet_tuples + ) + actual_expected_graphlet_counts_ego_o2 <- + netdis_expected_graphlet_counts_ego( + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, density_breaks = breaks, + density_binned_reference_counts = scaled_reference_counts, + min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, + scale_fn=count_graphlet_tuples + ) + + # Compare actual to expected + expect_equal( + actual_expected_graphlet_counts_ego_o1, + actual_expected_graphlet_counts_ego_o1 + ) + expect_equal( + actual_expected_graphlet_counts_ego_o2, + expected_expected_graphlet_counts_ego_o2 + ) +}) + +test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 nodes", { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) + names(graphlet_sizes) <- graphlet_labels + max_graphlet_size <- 4 + # Make graph ego networks + min_ego_nodes <- 0 + min_edgo_edges <- 0 + ego_networks_o1 <- make_named_ego_graph(graph, + order = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_edgo_edges + ) + ego_networks_o2 <- make_named_ego_graph(graph, + order = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_edgo_edges + ) + # Set manually-verified node counts and densities + # 1. Ego-networks of order 1 + num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edges_o1 <- choose(num_nodes_o1, 2) + densities_o1 <- num_edges_o1 / max_edges_o1 + # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 + # 2. Ego-networks of order 2 + num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edges_o2 <- choose(num_nodes_o2, 2) + densities_o2 <- num_edges_o2 / max_edges_o2 + # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 + # Set manually determined density breaks and indexes, based on a min bin count + # of 2 and an initial request for 100 bins + min_bin_count <- 2 + num_bins <- 100 + num_breaks <- num_bins + 1 + min_density_o1 <- 0.5 + max_density_o1 <- 1.0 + breaks_o1 <- seq(min_density_o1, max_density_o1, length.out = num_breaks)[c(1, 22, 42, 101)] + density_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) + min_density_o2 <- 1 / 3 + max_density_o2 <- 0.6 + breaks_o2 <- seq(min_density_o2, max_density_o2, length.out = num_breaks)[c(1, 10, 51, 64, 101)] + density_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) + # Guard against errors in manually determined breaks and indexes by checking + # against already tested code. This also lets us ensure we handle densities + # falling exactly on a bin boundary the same as the code under test. + comp_binned_densities_o1 <- binned_densities_adaptive( + densities_o1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins + ) + comp_binned_densities_o2 <- binned_densities_adaptive( + densities_o2, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins + ) + expect_equal( + comp_binned_densities_o1, + list( + densities = densities_o1, + interval_indexes = density_indexes_o1, + breaks = breaks_o1 + ) + ) + expect_equal( + comp_binned_densities_o2, + list( + densities = densities_o2, + interval_indexes = density_indexes_o2, + breaks = breaks_o2 + ) + ) + + # Set manually verified scaled ego-network graphlet counts + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # 1-step ego networks + scaled_reference_counts_o1 <- rbind( + c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) + ) + # 2-step ego networks + scaled_reference_counts_o2 <- rbind( + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) + ) + min_ego_nodes <- 3 + min_ego_edges <- 1 + # Drop rows for nodes with ewer than minumum required nodes and edges in ego + # network + scaled_reference_counts_o1 <- + scaled_reference_counts_o1[ + (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), + ] + scaled_reference_counts_o2 <- + scaled_reference_counts_o2[ + (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), + ] + density_indexes_o1 <- density_indexes_o1[ + (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges) + ] + density_indexes_o2 <- density_indexes_o2[ + (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges) + ] + # Average manually verified scaled reference counts across density bins + density_binned_reference_counts_o1 <- rbind( + (scaled_reference_counts_o1[1, ] + scaled_reference_counts_o1[2, ]) / 2, + (scaled_reference_counts_o1[4, ] + scaled_reference_counts_o1[5, ] + + scaled_reference_counts_o1[6, ]) / 3, + (scaled_reference_counts_o1[3, ] + + scaled_reference_counts_o1[7, ] + + scaled_reference_counts_o1[8, ]) / 3 + ) + rownames(density_binned_reference_counts_o1) <- 1:3 + density_binned_reference_counts_o2 <- rbind( + (scaled_reference_counts_o2[1, ] + scaled_reference_counts_o2[4, ]) / 2, + (scaled_reference_counts_o2[2, ] + scaled_reference_counts_o2[6, ] + + scaled_reference_counts_o2[7, ]) / 3, + (scaled_reference_counts_o2[3, ] + scaled_reference_counts_o2[5, ]) / 2, + (scaled_reference_counts_o2[8, ] + scaled_reference_counts_o2[9, ] + + scaled_reference_counts_o2[10, ]) / 3 + ) + rownames(density_binned_reference_counts_o2) <- 1:4 + + # Helper functions to calculate expected expected graphlet counts + expected_expected_graphlet_counts_o1_fn <- function(density_index, node_count) { + reference_counts <- density_binned_reference_counts_o1[density_index, ] + reference_counts * choose(node_count, graphlet_sizes) + } + expected_expected_graphlet_counts_o2_fn <- function(density_index, node_count) { + reference_counts <- density_binned_reference_counts_o2[density_index, ] + reference_counts * choose(node_count, graphlet_sizes) + } + # Calculate expected graphlet counts + expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( + density_indexes_o1, num_nodes_o1[(num_nodes_o1 >= min_ego_nodes)], + expected_expected_graphlet_counts_o1_fn + ))) + rownames(expected_expected_graphlet_counts_ego_o1) <- + names(ego_networks_o1[(num_nodes_o1 >= min_ego_nodes)]) + expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( + density_indexes_o2, num_nodes_o2[(num_nodes_o2 >= min_ego_nodes)], + expected_expected_graphlet_counts_o2_fn + ))) + rownames(expected_expected_graphlet_counts_ego_o2) <- + names(ego_networks_o2[(num_nodes_o2 >= min_ego_nodes)]) + + # Sanity check manually derived expected expected counts by comparing against + # pre-tested fully applied expected_graphlet_counts_ego function + expect_equal( + expected_expected_graphlet_counts_ego_o1, + netdis_expected_graphlet_counts_ego( + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + density_breaks = breaks_o1, + density_binned_reference_counts_o1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + scale_fn=count_graphlet_tuples + ) + ) + expect_equal( + expected_expected_graphlet_counts_ego_o2, + netdis_expected_graphlet_counts_ego( + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + density_breaks = breaks_o2, + density_binned_reference_counts_o2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + scale_fn=count_graphlet_tuples + ) + ) + + # Generate partially applied functions using function under test + actual_expected_graphlet_counts_ego_fn_o1 <- + netdis_expected_graphlet_counts_ego_fn( + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, + min_bin_count = min_bin_count, + num_bins = num_bins, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + scale_fn=count_graphlet_tuples + ) + actual_expected_graphlet_counts_ego_fn_o2 <- + netdis_expected_graphlet_counts_ego_fn( + graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, + min_bin_count = min_bin_count, + num_bins = num_bins, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + scale_fn=count_graphlet_tuples + ) + # Generate actual expected accounts by applying generated functions to test + # graph + actual_expected_graphlet_counts_ego_o1 <- + actual_expected_graphlet_counts_ego_fn_o1(graph) + actual_expected_graphlet_counts_ego_o2 <- + actual_expected_graphlet_counts_ego_fn_o2(graph) + + # Compare actual to expected + expect_equal( + actual_expected_graphlet_counts_ego_o1, + expected_expected_graphlet_counts_ego_o1 + ) + expect_equal( + actual_expected_graphlet_counts_ego_o2, + expected_expected_graphlet_counts_ego_o2 + ) +}) + +context("Measures Netdis: Centered graphlet counts") +test_that("netdis_centred_graphlet_counts_ego is correct", { + # Set up small sample networks each with each graphlet represented in at least + # one ego network + ref_elist <- rbind( + c("n1", "n2"), + c("n1", "n3"), + c("n1", "n4"), + c("n1", "n5"), + c("n1", "n6"), + c("n2", "n7"), + c("n2", "n8"), + c("n2", "n9"), + c("n9", "n10"), + c("n10", "n11"), + c("n11", "n12"), + c("n11", "n13"), + c("n2", "n14"), + c("n8", "n14"), + c("n12", "n15"), + c("n12", "n16"), + c("n15", "n17"), + c("n12", "n18"), + c("n15", "n18"), + c("n16", "n17"), + c("n16", "n18"), + c("n17", "n18"), + c("n16", "n19"), + c("n16", "n20"), + c("n16", "n21"), + c("n19", "n20"), + c("n19", "n21"), + c("n15", "n22"), + c("n15", "n23"), + c("n15", "n24"), + c("n22", "n23"), + c("n22", "n24"), + c("n23", "n24") + ) + ref_graph <- igraph::graph_from_edgelist(ref_elist, directed = FALSE) + + query_elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + query_graph <- igraph::graph_from_edgelist(query_elist, directed = FALSE) + + max_graphlet_size <- 4 + # Use pre-tested functions to generate ego-network graphlet counts + # 1. Reference graph ego-network graphlet counts + ref_o1 <- count_graphlets_ego( + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, return_ego_networks = TRUE + ) + ego_counts_ref_o1 <- ref_o1$graphlet_counts + ego_networks_ref_o1 <- ref_o1$ego_networks + density_ref_o1 <- sapply(ego_networks_ref_o1, igraph::edge_density) + + ref_o2 <- count_graphlets_ego( + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, return_ego_networks = TRUE + ) + ego_counts_ref_o2 <- ref_o2$graphlet_counts + ego_networks_ref_o2 <- ref_o2$ego_networks + density_ref_o2 <- sapply(ego_networks_ref_o2, igraph::edge_density) + + # 2. Query graph ego-network graphlet countsa + query_o1 <- count_graphlets_ego( + query_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 1, return_ego_networks = TRUE + ) + ego_counts_query_o1 <- query_o1$graphlet_counts + ego_networks_query_o1 <- query_o1$ego_networks + density_query_o1 <- sapply(ego_networks_query_o1, igraph::edge_density) + + query_o2 <- count_graphlets_ego( + query_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = 2, return_ego_networks = TRUE + ) + ego_counts_query_o2 <- query_o2$graphlet_counts + ego_networks_query_o2 <- query_o2$ego_networks + density_query_o2 <- sapply(ego_networks_query_o2, igraph::edge_density) + + centred_counts_k4 <- function(query_graphlet_count, ref_graphlet_count, + query_node_counts, ref_node_count, + min_nodes, min_edges, + min_bin_count, num_bins) { + graphlet_node_counts_k4 <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) + # 1. Calculate scaling factors for each reference and query graphlet count + # These are nCk, where n is the number of nodes in the network and + # k is the number of nodes in the graphlet + ref_scale_factor <- sapply( + graphlet_node_counts_k4, FUN <- function(k) { + choose(ref_node_count, k) + } + ) + query_scale_factor <- sapply( + graphlet_node_counts_k4, FUN <- function(k) { + choose(query_node_count, k) + } + ) + # 2. Calculate scaled reference counts by dividing by ref_scale_factor + ref_scaled_graphlet_count <- query_graphlet_count / ref_scale_factor + # + } +}) From 03d6262c03c92853b67764bbc24e81f96e2d6237 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 6 Sep 2019 12:03:36 +0100 Subject: [PATCH 034/188] add a docstring for all function parameters --- R/graph_binning.R | 3 + R/measures_net_dis.R | 422 ++++++++++++------ R/orca_interface.R | 7 +- man/binned_densities_adaptive.Rd | 9 +- man/count_graphlet_tuples_ego.Rd | 17 + man/density_binned_counts.Rd | 21 +- man/density_binned_counts_gp.Rd | 15 +- man/ego_network_density.Rd | 14 + man/graph_features_to_histograms.Rd | 7 +- man/mean_density_binned_graphlet_counts.Rd | 13 +- man/netdis_centred_graphlet_counts_ego.Rd | 35 +- man/netdis_expected_graphlet_counts.Rd | 26 +- man/netdis_expected_graphlet_counts_ego.Rd | 29 +- man/netdis_expected_graphlet_counts_ego_fn.Rd | 15 + ...netdis_expected_graphlet_counts_per_ego.Rd | 22 +- man/netdis_for_all_graphs.Rd | 4 + man/netdis_many_to_many.Rd | 27 +- man/netdis_one_to_many.Rd | 23 +- man/netdis_one_to_one.Rd | 23 +- man/scale_graphlet_count.Rd | 18 + man/single_density_bin.Rd | 14 +- man/zeros_to_ones.Rd | 19 + 22 files changed, 605 insertions(+), 178 deletions(-) create mode 100644 man/count_graphlet_tuples_ego.Rd create mode 100644 man/ego_network_density.Rd create mode 100644 man/scale_graphlet_count.Rd create mode 100644 man/zeros_to_ones.Rd diff --git a/R/graph_binning.R b/R/graph_binning.R index f04b57df..cf38bde9 100644 --- a/R/graph_binning.R +++ b/R/graph_binning.R @@ -4,6 +4,9 @@ #' generate a function for calculating expected ego-network graphlet counts #' from the statistics of a provided reference graph. #' Temporarily accessible during development. +#' @param densities Density values to use for binning. +#' @param min_counts_per_interval Minimum count for each bin. +#' @param num_intervals Initial number of density bins to generate. #' TODO: Remove @export prior to publishing #' @export binned_densities_adaptive <- function(densities, diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 0858d874..9460dd15 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -8,8 +8,24 @@ #' than min_ego_nodes nodes #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges -#' @param min_bin_count Minimum number of ego networks in each density bin -#' @param num_bins Number of density bins to generate +#' @param binning_fn Function used to bin ego network densities. Takes densities +#' as its single argument, and returns a named list including keys \code{breaks} +#' (list of bin edges) and \code{interval_indexes} (density bin index for each +#' ego network). (Default: \code{binned_densities_adaptive} with +#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100}). +#' @param bin_counts_fn Function used to calculate expected graphlet counts in +#' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +#' (bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. +#' (Default: \code{density_binned_counts} with \code{agg_fn = mean} and +#' \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the +#' approach used in the original netdis paper). +#' @param exp_counts_fn Function used to map from binned reference counts to +#' expected counts for each graphlet in each ego network of the query graphs. +#' Takes \code{ego_networks}, \code{density_bin_breaks}, +#' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. +#' (Default: \code{netdis_expected_graphlet_counts_per_ego} with +#' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in +#' the original netdis paper). #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size #' @export @@ -19,19 +35,22 @@ netdis_one_to_one <- function(graph_1, graph_2, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - binning_fn = purrr::partial(binned_densities_adaptive, - min_counts_per_interval = 5, - num_intervals = 100), - bin_counts_fn = purrr::partial(density_binned_counts, - agg_fn = mean, - scale_fn = scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples)) { - + binning_fn = purrr::partial( + binned_densities_adaptive, + min_counts_per_interval = 5, + num_intervals = 100), + bin_counts_fn = purrr::partial( + density_binned_counts, + agg_fn = mean, + scale_fn = scale_graphlet_counts_ego), + exp_counts_fn = purrr::partial( + netdis_expected_graphlet_counts_per_ego, + scale_fn = count_graphlet_tuples)) { + # bundle graphs into one vector with format needed for # netdis many-to-many graphs <- list(graph_1 = graph_1, graph_2 = graph_2) - + # calculate netdis result <- netdis_many_to_many( graphs, @@ -43,7 +62,7 @@ netdis_one_to_one <- function(graph_1, graph_2, binning_fn = binning_fn, exp_counts_fn = exp_counts_fn ) - + # extract netdis statistics from list returned by netdis_many_to_many result$netdis[, 1] } @@ -59,8 +78,24 @@ netdis_one_to_one <- function(graph_1, graph_2, #' than min_ego_nodes nodes #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges -#' @param min_bin_count Minimum number of ego networks in each density bin -#' @param num_bins Number of density bins to generate +#' @param binning_fn Function used to bin ego network densities. Takes densities +#' as its single argument, and returns a named list including keys \code{breaks} +#' (list of bin edges) and \code{interval_indexes} (density bin index for each +#' ego network). (Default: \code{binned_densities_adaptive} with +#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100}). +#' @param bin_counts_fn Function used to calculate expected graphlet counts in +#' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +#' (bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. +#' (Default: \code{density_binned_counts} with \code{agg_fn = mean} and +#' \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the +#' approach used in the original netdis paper). +#' @param exp_counts_fn Function used to map from binned reference counts to +#' expected counts for each graphlet in each ego network of the query graphs. +#' Takes \code{ego_networks}, \code{density_bin_breaks}, +#' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. +#' (Default: \code{netdis_expected_graphlet_counts_per_ego} with +#' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in +#' the original netdis paper). #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size #' @export @@ -70,15 +105,18 @@ netdis_one_to_many <- function(graph_1, graphs_compare, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - binning_fn = purrr::partial(binned_densities_adaptive, - min_counts_per_interval = 5, - num_intervals = 100), - bin_counts_fn = purrr::partial(density_binned_counts, - agg_fn = mean, - scale_fn = scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples)) { - + binning_fn = purrr::partial( + binned_densities_adaptive, + min_counts_per_interval = 5, + num_intervals = 100), + bin_counts_fn = purrr::partial( + density_binned_counts, + agg_fn = mean, + scale_fn = scale_graphlet_counts_ego), + exp_counts_fn = purrr::partial( + netdis_expected_graphlet_counts_per_ego, + scale_fn = count_graphlet_tuples)) { + # bundle graph_1 and graphs_compare to one vector, with # graph_1 at start as needed for netdis_many_to_many call graphs <- append(graphs_compare, list(graph_1 = graph_1), after = 0) @@ -96,7 +134,7 @@ netdis_one_to_many <- function(graph_1, graphs_compare, bin_counts_fn = bin_counts_fn, exp_counts_fn = exp_counts_fn ) - + # restructure netdis_many_to_many output colnames(result$netdis) <- result$comp_spec$name_b result$netdis @@ -115,10 +153,26 @@ netdis_one_to_many <- function(graph_1, graphs_compare, #' than min_ego_nodes nodes #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges -#' @param min_bin_count Minimum number of ego networks in each density bin -#' @param num_bins Number of density bins to generate -#' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes -#' up to and including max_graphlet_size +#' @param binning_fn Function used to bin ego network densities. Takes densities +#' as its single argument, and returns a named list including keys \code{breaks} +#' (list of bin edges) and \code{interval_indexes} (density bin index for each +#' ego network). (Default: \code{binned_densities_adaptive} with +#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100}). +#' @param bin_counts_fn Function used to calculate expected graphlet counts in +#' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +#' (bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. +#' (Default: \code{density_binned_counts} with \code{agg_fn = mean} and +#' \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the +#' approach used in the original netdis paper). +#' @param exp_counts_fn Function used to map from binned reference counts to +#' expected counts for each graphlet in each ego network of the query graphs. +#' Takes \code{ego_networks}, \code{density_bin_breaks}, +#' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. +#' (Default: \code{netdis_expected_graphlet_counts_per_ego} with +#' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in +#' the original netdis paper). +#' @return Netdis statistics between query graphs for graphlet sizes +#' up to and including max_graphlet_size. #' @export netdis_many_to_many <- function(graphs, ref_graph, @@ -127,23 +181,26 @@ netdis_many_to_many <- function(graphs, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - binning_fn = purrr::partial(binned_densities_adaptive, - min_counts_per_interval = 5, - num_intervals = 100), - bin_counts_fn = purrr::partial(density_binned_counts, - agg_fn = mean, - scale_fn = scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples)) { + binning_fn = purrr::partial( + binned_densities_adaptive, + min_counts_per_interval = 5, + num_intervals = 100), + bin_counts_fn = purrr::partial( + density_binned_counts, + agg_fn = mean, + scale_fn = scale_graphlet_counts_ego), + exp_counts_fn = purrr::partial( + netdis_expected_graphlet_counts_per_ego, + scale_fn = count_graphlet_tuples)) { ## ------------------------------------------------------------------------ # Get ego networks for query graphs ego_networks <- purrr::map( graphs, make_named_ego_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges ) - + ## ------------------------------------------------------------------------ # Count graphlets for ego networks in query graphs graphlet_counts <- purrr::map( @@ -151,41 +208,42 @@ netdis_many_to_many <- function(graphs, ego_to_graphlet_counts, max_graphlet_size = max_graphlet_size ) - + ## ------------------------------------------------------------------------ # Case where expected counts calculated using a reference network if (!is.null(ref_graph)) { # Get ego networks ego_ref <- make_named_ego_graph( - ref_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, + ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges ) - + # Get ego network graphlet counts graphlet_counts_ref <- ego_to_graphlet_counts( ego_ref, max_graphlet_size = max_graphlet_size ) - + # Get ego-network densities densities_ref <- ego_network_density(ego_ref) - + # bin ref ego-network densities binned_densities <- binning_fn(densities_ref) - + ref_ego_density_bins <- binned_densities$breaks - + # Average ref graphlet counts across density bins ref_binned_graphlet_counts <- bin_counts_fn( - graphlet_counts_ref, + graphlet_counts_ref, binned_densities$interval_indexes, ego_networks = ego_ref, max_graphlet_size = max_graphlet_size ) - - # Calculate expected graphlet counts (using ref graph ego network density bins) + + # Calculate expected graphlet counts (using ref + # graph ego network density bins) exp_graphlet_counts <- purrr::map( ego_networks, exp_counts_fn, @@ -193,19 +251,19 @@ netdis_many_to_many <- function(graphs, density_binned_reference_counts = ref_binned_graphlet_counts, max_graphlet_size = max_graphlet_size ) - + ## ------------------------------------------------------------------------ } else { # Case where expected counts calculated using query networks - + # Get ego-network densities densities <- purrr::map(ego_networks, ego_network_density) - + # bin ref ego-network densities binned_densities <- purrr::map(densities, binning_fn) - + # extract bin breaks and indexes from binning results ego_density_bin_breaks <- purrr::map(binned_densities, function(x) { @@ -215,15 +273,15 @@ netdis_many_to_many <- function(graphs, function(x) { x$interval_indexes }) - - + + # Calculate expected counts in each bin binned_graphlet_counts <- mapply(bin_counts_fn, graphlet_counts, ego_density_bin_indexes, max_graphlet_size = max_graphlet_size, SIMPLIFY = FALSE) - + # Calculate expected graphlet counts for each ego network exp_graphlet_counts <- mapply(exp_counts_fn, ego_networks, @@ -232,25 +290,25 @@ netdis_many_to_many <- function(graphs, max_graphlet_size = max_graphlet_size, SIMPLIFY = FALSE) } - - ## ------------------------------------------------------------------------ + + ## ------------------------------------------------------------------------ # Centre graphlet counts by subtracting expected counts centred_graphlet_counts <- mapply("-", graphlet_counts, exp_graphlet_counts) - + ## ------------------------------------------------------------------------ # Sum centred graphlet counts across all ego networks sum_graphlet_counts <- lapply(centred_graphlet_counts, colSums) - + ## ------------------------------------------------------------------------ # Generate pairwise comparisons comp_spec <- cross_comparison_spec(sum_graphlet_counts, how = comparisons) - + ## ------------------------------------------------------------------------ # Calculate netdis statistics results <- parallel::mcmapply( function(index_a, index_b) { netdis_uptok( - sum_graphlet_counts[[index_a]], + sum_graphlet_counts[[index_a]], sum_graphlet_counts[[index_b]], max_graphlet_size = max_graphlet_size ) @@ -258,10 +316,10 @@ netdis_many_to_many <- function(graphs, comp_spec$index_a, comp_spec$index_b, SIMPLIFY = TRUE) - - + + list(netdis = results, comp_spec = comp_spec) - + } #' Netdis between all graph pairs using provided Centred Graphlet Counts @@ -270,6 +328,9 @@ netdis_many_to_many <- function(graphs, #' @param graphlet_size The size of graphlets to use for the Netdis calculation #' (only counts for graphlets of the specified size will be used). The size of #' a graphlet is the number of nodes it contains. +#' @param mc.cores Number of cores to run on. NOTE: only works on unix-like +#' systems with system level forking capability. This means it will work on +#' Linux and OSX, but not Windows. #' @return Pairwise Netdis statistics between graphs calculated using centred #' counts for graphlets of the specified size #' @export @@ -471,6 +532,29 @@ netdis_centred_graphlet_counts <- function(graph, } +#' Generate Netdis centred graphlets counts by subtracting expected counts +#' +#' @param graph A connected, undirected, simple graph as an +#' \code{igraph} object. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes +#' will be counted. +#' @param neighbourhood_size The number of steps from the source node to include +#' nodes for each ego-network. +#' @param expected_ego_count_fn A function for generating expected ego-network +#' graphlet counts for a graph. This function should take a connected, +#' undirected, simple graph as an \code{igraph} object for its only argument. +#' Where \code{expected_ego_count_fn} is specific to particular values of +#' \code{max_graphlet_size} or \code{neighbourhood_size}, care should be taken +#' to ensure that the values of these parameters passed to this function are +#' consistent with those used when creating \code{expected_ego_count_fn}. +#' @param min_ego_nodes Filter ego networks which have fewer +#' than min_ego_nodes nodes +#' @param min_ego_edges Filter ego networks which have fewer +#' than min_ego_edges edges +#' @return A vector with centred counts for each graphlet type in +#' each ego network. +#' #' TODO: Remove @export prior to publishing #' @export netdis_centred_graphlet_counts_ego <- function(graph, @@ -524,6 +608,16 @@ netdis_centred_graphlet_counts_ego <- function(graph, #' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. #' @param neighbourhood_size The number of steps from the source node to include #' node in ego-network. +#' @param min_ego_nodes Filter ego networks which have fewer +#' than min_ego_nodes nodes +#' @param min_ego_edges Filter ego networks which have fewer +#' than min_ego_edges edges +#' @param min_bin_count Minimum count of ego networks in each density bin. +#' @param num_bins Initial number of density bins to generate. +#' @param scale_fn Optional function to scale calculated expected counts, taking +#' \code{graph} and \code{max_graphlet_size} as arguments, and returning a scale +#' factor that the looked up \code{density_binned_reference_counts} values will +#' be multiplied by. #' @return A function taking a connected, undirected, simple query graph as an #' \code{igraph} object and returning an RxC matrix containing the expected #' counts of each graphlet (columns, C) for each ego-network in the query graph @@ -593,6 +687,24 @@ netdis_expected_graphlet_counts_ego_fn <- function(graph, #' generate a function for calculating expected ego-network graphlet counts #' from the statistics of a provided reference graph. #' Temporarily accessible during development. +#' @param graph A connected, undirected, simple reference graph as an +#' \code{igraph} object. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. +#' @param neighbourhood_size The number of steps from the source node to include +#' node in ego-network. +#' @param density_breaks Density values defining bin edges. +#' @param density_binned_reference_counts Reference network graphlet counts for +#' each density bin. +#' @param min_ego_nodes Filter ego networks which have fewer +#' than min_ego_nodes nodes +#' @param min_ego_edges Filter ego networks which have fewer +#' than min_ego_edges edges +#' @param scale_fn Optional function to scale calculated expected counts, taking +#' \code{graph} and \code{max_graphlet_size} as arguments, and returning a scale +#' factor that the looked up \code{density_binned_reference_counts} values will +#' be multiplied by. +#' #' TODO: Remove @export prior to publishing #' @export netdis_expected_graphlet_counts_ego <- function(graph, @@ -603,7 +715,7 @@ netdis_expected_graphlet_counts_ego <- function(graph, min_ego_nodes = 3, min_ego_edges = 1, scale_fn=NULL) { - + # Generate ego-networks for query graph ego_networks <- make_named_ego_graph(graph, neighbourhood_size) # Drop ego-networks that don't have the minimum number of nodes or edges @@ -618,7 +730,7 @@ netdis_expected_graphlet_counts_ego <- function(graph, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, density_binned_reference_counts = density_binned_reference_counts, - scale_fn=scale_fn + scale_fn = scale_fn ) names(expected_graphlet_counts) <- names(ego_networks) # Simplify list to array @@ -634,16 +746,30 @@ netdis_expected_graphlet_counts_ego <- function(graph, #' Used by \code{netdis_expected_graphlet_counts_ego_fn} to #' generate a function for calculating expected ego-network graphlet counts #' from the statistics of a provided reference graph. -#' Temporarily accessible during development. +#' +#' @param ego_networks The number of steps from the source node to include +#' node in ego-network. +#' @param density_breaks Density values defining bin edges. +#' @param density_binned_reference_counts Reference network graphlet counts for +#' each density bin. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. +#' @param scale_fn Optional function to scale calculated expected counts, taking +#' \code{graph} and \code{max_graphlet_size} as arguments, and returning a scale +#' factor that the looked up \code{density_binned_reference_counts} values will +#' be multiplied by. +#' +#' #' Temporarily accessible during development. #' TODO: Remove @export prior to publishing #' @export -netdis_expected_graphlet_counts_per_ego <- function(ego_networks, - density_breaks, - density_binned_reference_counts, - max_graphlet_size, - scale_fn=NULL) { - - +netdis_expected_graphlet_counts_per_ego <- function( + ego_networks, + density_breaks, + density_binned_reference_counts, + max_graphlet_size, + scale_fn=NULL) { + + # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. expected_graphlet_counts <- @@ -662,8 +788,20 @@ netdis_expected_graphlet_counts_per_ego <- function(ego_networks, #' INTERNAL FUNCTION - Do not call directly #' #' Used by \code{netdis_expected_graphlet_counts_ego} to -#' calculate expected graphlet counts for a query graph ego-network from the -#' statistics of a provided reference graph. +#' calculate expected graphlet counts for a query graph +#' ego-network from the statistics of a provided reference +#' graph. +#' @param graph A connected, undirected, simple reference graph as an +#' \code{igraph} object. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. +#' @param density_breaks Density values defining bin edges. +#' @param density_binned_reference_counts Reference network graphlet counts for +#' each density bin. +#' @param scale_fn Optional function to scale calculated expected counts, taking +#' \code{graph} and \code{max_graphlet_size} as arguments, and returning a scale +#' factor that the looked up \code{density_binned_reference_counts} values will +#' be multiplied by. #' Temporarily accessible during development. #' TODO: Remove @export prior to publishing #' @export @@ -672,7 +810,7 @@ netdis_expected_graphlet_counts <- function(graph, density_breaks, density_binned_reference_counts, scale_fn=NULL) { - + # Look up average scaled graphlet counts for graphs of similar density # in the reference graph query_density <- igraph::edge_density(graph) @@ -680,14 +818,16 @@ netdis_expected_graphlet_counts <- function(graph, matched_reference_counts <- density_binned_reference_counts[matched_density_index, ] - + if (!is.null(scale_fn)) { - # Scale reference counts e.g. by multiplying the reference count for each graphlet - # by the number of possible sets of k nodes in the query graph, where k is the - # number of nodes in the graphlet - matched_reference_counts <- matched_reference_counts * scale_fn(graph, max_graphlet_size) + # Scale reference counts e.g. by multiplying the + # reference count for each graphlet by the number + # of possible sets of k nodes in the query graph, + # where k is the number of nodes in the graphlet. + matched_reference_counts <- matched_reference_counts * + scale_fn(graph, max_graphlet_size) } - + matched_reference_counts } @@ -696,6 +836,11 @@ netdis_expected_graphlet_counts <- function(graph, #' Used by \code{netdis_expected_graphlet_counts_ego_fn} to #' generate a function for calculating expected ego-network graphlet counts #' from the statistics of a provided reference graph. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +#' @param agg_fn Function to aggregate counts in each bin +#' (default \code{agg_fn = mean}). #' Temporarily accessible during development. #' TODO: Remove @export prior to publishing #' @export @@ -711,104 +856,116 @@ mean_density_binned_graphlet_counts <- function(graphlet_counts, apply(graphlet_counts, MARGIN = 2, function(gc) { tapply(gc, INDEX = density_interval_indexes, FUN = agg_fn) }) - + # if only 1 bin (i.e. no binning) will be left with a 1D list. # convert it into a 2D list. if (is.null(dim(mean_density_binned_graphlet_counts))) { - dim(mean_density_binned_graphlet_counts) <- c(1, length(mean_density_binned_graphlet_counts)) - colnames(mean_density_binned_graphlet_counts) <- colnames(graphlet_counts) + dim(mean_density_binned_graphlet_counts) <- + c(1, length(mean_density_binned_graphlet_counts)) + + colnames(mean_density_binned_graphlet_counts) <- + colnames(graphlet_counts) } - + mean_density_binned_graphlet_counts } -#' For case where don't want to use binning, return -#' a single bin which covers full range of possible -#' densities. +#' For case where don't want to use binning, return a single bin which covers +#' the full range of possible density values. +#' @param densities Ego network density values (only used to return +#' a list of indexes of the required length.) #' @export single_density_bin <- function(densities) { - - binned_densities <- list(densities = densities, - interval_indexes = rep(1, length(densities)), - breaks = c(0, 1)) + list(densities = densities, + interval_indexes = rep(1, length(densities)), + breaks = c(0, 1)) } #' INTERNAL FUNCTION - Do not call directly #' -#' Used to -#' generate a function for calculating expected graphlet counts in each -#' density bin. -#' @param agg_fn Function to aggregate counts in each bin (default \code{agg_fn = mean}). -#' @param scale_fn Optional function to apply a transformation to graphlet_counts, must -#' have arguments graphlet_counts, ego_networks and max_graphlet_size. +#' Used to calculate expected graphlet counts for each density bin. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +#' @param agg_fn Function to aggregate counts in each bin +#' (default \code{agg_fn = mean}). +#' @param scale_fn Optional function to apply a transformation +#' to graphlet_counts, must have arguments graphlet_counts, +#' ego_networks and max_graphlet_size. #' @param ego_networks Optionally passed and used by scale_fn. #' @param max_graphlet_size Optionally passed and used by scale_fn. -#' Temporarily accessible during development. -#' TODO: Remove @export prior to publishing #' @export -density_binned_counts <- function(graphlet_counts, density_interval_indexes, +density_binned_counts <- function(graphlet_counts, + density_interval_indexes, agg_fn = mean, - scale_fn = NULL, ego_networks = NULL, + scale_fn = NULL, + ego_networks = NULL, max_graphlet_size = NULL) { - + if (!is.null(scale_fn)) { # Scale ego-network graphlet counts e.g. - # by dividing by total number of k-tuples in + # by dividing by total number of k-tuples in # ego-network (where k is graphlet size) graphlet_counts <- scale_fn(graphlet_counts, ego_networks = ego_networks, max_graphlet_size = max_graphlet_size) } - + mean_density_binned_graphlet_counts(graphlet_counts, density_interval_indexes, agg_fn = agg_fn) - + } -#' Calculate expected counts in density bins using geometric poisson (Polya-Aeppli) approximation +#' Calculate expected counts in density bins using +#' geometric poisson (Polya-Aeppli) approximation. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +#' @param max_graphlet_size Determines the maximum size of graphlets +#' included in graphlet_counts. #' @export density_binned_counts_gp <- function(graphlet_counts, density_interval_indexes, max_graphlet_size) { - + mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - graphlet_counts, + graphlet_counts, density_interval_indexes) - + exp_counts_bin <- function(bin_idx) { counts <- graphlet_counts[density_interval_indexes == bin_idx, ] - means <- mean_binned_graphlet_counts[bin_idx,] - + means <- mean_binned_graphlet_counts[bin_idx, ] + mean_sub_counts <- sweep(counts, 2, means) - - Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1) - theta_d <- 2*means / (Vd_sq + means) - + + Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) + theta_d <- 2 * means / (Vd_sq + means) + exp_counts_dk <- vector() for (k in 2:max_graphlet_size) { graphlet_idx <- graphlet_ids_for_size(k) - - lambda_dk <- (1 / length(graphlet_idx)) * + + lambda_dk <- (1 / length(graphlet_idx)) * sum( 2 * means[graphlet_idx]^2 / (Vd_sq[graphlet_idx] + means[graphlet_idx]) ) - + exp_counts_dk <- append(exp_counts_dk, lambda_dk / theta_d[graphlet_idx]) } - + exp_counts_dk } - + nbins <- length(unique(density_interval_indexes)) expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins)) - + # deal with NAs caused by bins with zero counts for a graphlet - expected_counts_bin[is.nan(expected_counts_bin)] = 0 - + expected_counts_bin[is.nan(expected_counts_bin)] <- 0 + expected_counts_bin } @@ -890,8 +1047,9 @@ scale_graphlet_counts_ego <- function(ego_networks, graphlet_counts, } -# For each graphlet calculate the number of possible sets of k nodes in the -# query graph, where k is the number of nodes in the graphlet. +#' For each graphlet calculate the number of possible sets of k nodes in the +#' query graph, where k is the number of nodes in the graphlet. +#' #' @param graph A connected, undirected, simple graph as an \code{igraph} #' object. #' @param max_graphlet_size Determines the maximum size of graphlets included diff --git a/R/orca_interface.R b/R/orca_interface.R index 47ebe7ed..aef97b78 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -203,9 +203,10 @@ simplify_graph <- function(graph, as_undirected = TRUE, remove_loops = TRUE, #' of each graphlet or orbit at each graph vertex) to #' a set of discrete histograms (a histogram of counts for each distinct value #' across all graph vertices for each feature with no binning) -#' @param A number of nodes (rows) by number of features (columns) matrix, where -#' the ij entry is the score of node i on feature j (e.g. for ORCA output this -#' is counts of each graphlet or orbit at each graph vertex) +#' @param features_matrix A number of nodes (rows) by number of features +#' (columns) matrix, where the ij entry is the score of node i on feature j +#' (e.g. for ORCA output this is counts of each graphlet or orbit at each +#' graph vertex) #' @return Feature histograms: List of discrete histograms for each #' feature #' @export diff --git a/man/binned_densities_adaptive.Rd b/man/binned_densities_adaptive.Rd index da9bdcc8..d33cbc25 100644 --- a/man/binned_densities_adaptive.Rd +++ b/man/binned_densities_adaptive.Rd @@ -7,10 +7,17 @@ binned_densities_adaptive(densities, min_counts_per_interval, num_intervals) } +\arguments{ +\item{densities}{Density values to use for binning.} + +\item{min_counts_per_interval}{Minimum count for each bin.} + +\item{num_intervals}{Initial number of density bins to generate. +TODO: Remove @export prior to publishing} +} \description{ Used by \code{netdis_expected_graphlet_counts_ego_fn} to generate a function for calculating expected ego-network graphlet counts from the statistics of a provided reference graph. Temporarily accessible during development. -TODO: Remove @export prior to publishing } diff --git a/man/count_graphlet_tuples_ego.Rd b/man/count_graphlet_tuples_ego.Rd new file mode 100644 index 00000000..ab628b9f --- /dev/null +++ b/man/count_graphlet_tuples_ego.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{count_graphlet_tuples_ego} +\alias{count_graphlet_tuples_ego} +\title{Run count_graphlet_tuples across pre-computed ego networks.} +\usage{ +count_graphlet_tuples_ego(ego_networks, max_graphlet_size) +} +\arguments{ +\item{ego_networks}{Pre-generated ego networks for an input graph.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets included +in the tuple counts.} +} +\description{ +Run count_graphlet_tuples across pre-computed ego networks. +} diff --git a/man/density_binned_counts.Rd b/man/density_binned_counts.Rd index b0cdefb6..64db44af 100644 --- a/man/density_binned_counts.Rd +++ b/man/density_binned_counts.Rd @@ -9,19 +9,22 @@ density_binned_counts(graphlet_counts, density_interval_indexes, max_graphlet_size = NULL) } \arguments{ -\item{agg_fn}{Function to aggregate counts in each bin (default \code{agg_fn = mean}).} +\item{graphlet_counts}{Graphlet counts for a number of ego_networks.} -\item{scale_fn}{Optional function to apply a transformation to graphlet_counts, must -have arguments graphlet_counts, ego_networks and max_graphlet_size.} +\item{density_interval_indexes}{Density bin index for +each ego network.} + +\item{agg_fn}{Function to aggregate counts in each bin +(default \code{agg_fn = mean}).} + +\item{scale_fn}{Optional function to apply a transformation +to graphlet_counts, must have arguments graphlet_counts, +ego_networks and max_graphlet_size.} \item{ego_networks}{Optionally passed and used by scale_fn.} -\item{max_graphlet_size}{Optionally passed and used by scale_fn. -Temporarily accessible during development. -TODO: Remove @export prior to publishing} +\item{max_graphlet_size}{Optionally passed and used by scale_fn.} } \description{ -Used to -generate a function for calculating expected graphlet counts in each -density bin. +Used to calculate expected graphlet counts for each density bin. } diff --git a/man/density_binned_counts_gp.Rd b/man/density_binned_counts_gp.Rd index 1f2ae0b4..ade810ac 100644 --- a/man/density_binned_counts_gp.Rd +++ b/man/density_binned_counts_gp.Rd @@ -2,11 +2,22 @@ % Please edit documentation in R/measures_net_dis.R \name{density_binned_counts_gp} \alias{density_binned_counts_gp} -\title{Calculate expected counts in density bins using geometric poisson (Polya-Aeppli) approximation} +\title{Calculate expected counts in density bins using +geometric poisson (Polya-Aeppli) approximation.} \usage{ density_binned_counts_gp(graphlet_counts, density_interval_indexes, max_graphlet_size) } +\arguments{ +\item{graphlet_counts}{Graphlet counts for a number of ego_networks.} + +\item{density_interval_indexes}{Density bin index for +each ego network.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets +included in graphlet_counts.} +} \description{ -Calculate expected counts in density bins using geometric poisson (Polya-Aeppli) approximation +Calculate expected counts in density bins using +geometric poisson (Polya-Aeppli) approximation. } diff --git a/man/ego_network_density.Rd b/man/ego_network_density.Rd new file mode 100644 index 00000000..26d68250 --- /dev/null +++ b/man/ego_network_density.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{ego_network_density} +\alias{ego_network_density} +\title{Calculate ego network edge densities.} +\usage{ +ego_network_density(ego_networks) +} +\arguments{ +\item{ego_networks}{Pre-generated ego networks for an input graph.} +} +\description{ +Calculate ego network edge densities. +} diff --git a/man/graph_features_to_histograms.Rd b/man/graph_features_to_histograms.Rd index 877ce9c1..2479d593 100644 --- a/man/graph_features_to_histograms.Rd +++ b/man/graph_features_to_histograms.Rd @@ -8,9 +8,10 @@ each feature.} graph_features_to_histograms(features_matrix) } \arguments{ -\item{A}{number of nodes (rows) by number of features (columns) matrix, where -the ij entry is the score of node i on feature j (e.g. for ORCA output this -is counts of each graphlet or orbit at each graph vertex)} +\item{features_matrix}{A number of nodes (rows) by number of features +(columns) matrix, where the ij entry is the score of node i on feature j +(e.g. for ORCA output this is counts of each graphlet or orbit at each +graph vertex)} } \value{ Feature histograms: List of discrete histograms for each diff --git a/man/mean_density_binned_graphlet_counts.Rd b/man/mean_density_binned_graphlet_counts.Rd index a027db67..537589b4 100644 --- a/man/mean_density_binned_graphlet_counts.Rd +++ b/man/mean_density_binned_graphlet_counts.Rd @@ -7,10 +7,19 @@ mean_density_binned_graphlet_counts(graphlet_counts, density_interval_indexes, agg_fn = mean) } +\arguments{ +\item{graphlet_counts}{Graphlet counts for a number of ego_networks.} + +\item{density_interval_indexes}{Density bin index for +each ego network.} + +\item{agg_fn}{Function to aggregate counts in each bin +(default \code{agg_fn = mean}). +Temporarily accessible during development. +TODO: Remove @export prior to publishing} +} \description{ Used by \code{netdis_expected_graphlet_counts_ego_fn} to generate a function for calculating expected ego-network graphlet counts from the statistics of a provided reference graph. -Temporarily accessible during development. -TODO: Remove @export prior to publishing } diff --git a/man/netdis_centred_graphlet_counts_ego.Rd b/man/netdis_centred_graphlet_counts_ego.Rd index d5fba09d..b2f0b351 100644 --- a/man/netdis_centred_graphlet_counts_ego.Rd +++ b/man/netdis_centred_graphlet_counts_ego.Rd @@ -2,12 +2,43 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_centred_graphlet_counts_ego} \alias{netdis_centred_graphlet_counts_ego} -\title{TODO: Remove @export prior to publishing} +\title{Generate Netdis centred graphlets counts by subtracting expected counts} \usage{ netdis_centred_graphlet_counts_ego(graph, max_graphlet_size, neighbourhood_size, expected_ego_count_fn = NULL, min_ego_nodes = 3, min_ego_edges = 1) } -\description{ +\arguments{ +\item{graph}{A connected, undirected, simple graph as an +\code{igraph} object.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes +will be counted.} + +\item{neighbourhood_size}{The number of steps from the source node to include +nodes for each ego-network.} + +\item{expected_ego_count_fn}{A function for generating expected ego-network +graphlet counts for a graph. This function should take a connected, +undirected, simple graph as an \code{igraph} object for its only argument. +Where \code{expected_ego_count_fn} is specific to particular values of +\code{max_graphlet_size} or \code{neighbourhood_size}, care should be taken +to ensure that the values of these parameters passed to this function are +consistent with those used when creating \code{expected_ego_count_fn}.} + +\item{min_ego_nodes}{Filter ego networks which have fewer +than min_ego_nodes nodes} + +\item{min_ego_edges}{Filter ego networks which have fewer +than min_ego_edges edges} +} +\value{ +A vector with centred counts for each graphlet type in +each ego network. + TODO: Remove @export prior to publishing } +\description{ +Generate Netdis centred graphlets counts by subtracting expected counts +} diff --git a/man/netdis_expected_graphlet_counts.Rd b/man/netdis_expected_graphlet_counts.Rd index 3afa593c..839d94f4 100644 --- a/man/netdis_expected_graphlet_counts.Rd +++ b/man/netdis_expected_graphlet_counts.Rd @@ -7,10 +7,28 @@ netdis_expected_graphlet_counts(graph, max_graphlet_size, density_breaks, density_binned_reference_counts, scale_fn = NULL) } +\arguments{ +\item{graph}{A connected, undirected, simple reference graph as an +\code{igraph} object.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} + +\item{density_breaks}{Density values defining bin edges.} + +\item{density_binned_reference_counts}{Reference network graphlet counts for +each density bin.} + +\item{scale_fn}{Optional function to scale calculated expected counts, taking +\code{graph} and \code{max_graphlet_size} as arguments, and returning a scale +factor that the looked up \code{density_binned_reference_counts} values will +be multiplied by. +Temporarily accessible during development. +TODO: Remove @export prior to publishing} +} \description{ Used by \code{netdis_expected_graphlet_counts_ego} to -calculate expected graphlet counts for a query graph ego-network from the -statistics of a provided reference graph. -Temporarily accessible during development. -TODO: Remove @export prior to publishing +calculate expected graphlet counts for a query graph +ego-network from the statistics of a provided reference +graph. } diff --git a/man/netdis_expected_graphlet_counts_ego.Rd b/man/netdis_expected_graphlet_counts_ego.Rd index f7933b82..76fcd6b2 100644 --- a/man/netdis_expected_graphlet_counts_ego.Rd +++ b/man/netdis_expected_graphlet_counts_ego.Rd @@ -8,10 +8,37 @@ netdis_expected_graphlet_counts_ego(graph, max_graphlet_size, neighbourhood_size, density_breaks, density_binned_reference_counts, min_ego_nodes = 3, min_ego_edges = 1, scale_fn = NULL) } +\arguments{ +\item{graph}{A connected, undirected, simple reference graph as an +\code{igraph} object.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} + +\item{neighbourhood_size}{The number of steps from the source node to include +node in ego-network.} + +\item{density_breaks}{Density values defining bin edges.} + +\item{density_binned_reference_counts}{Reference network graphlet counts for +each density bin.} + +\item{min_ego_nodes}{Filter ego networks which have fewer +than min_ego_nodes nodes} + +\item{min_ego_edges}{Filter ego networks which have fewer +than min_ego_edges edges} + +\item{scale_fn}{Optional function to scale calculated expected counts, taking +\code{graph} and \code{max_graphlet_size} as arguments, and returning a scale +factor that the looked up \code{density_binned_reference_counts} values will +be multiplied by. + +TODO: Remove @export prior to publishing} +} \description{ Used by \code{netdis_expected_graphlet_counts_ego_fn} to generate a function for calculating expected ego-network graphlet counts from the statistics of a provided reference graph. Temporarily accessible during development. -TODO: Remove @export prior to publishing } diff --git a/man/netdis_expected_graphlet_counts_ego_fn.Rd b/man/netdis_expected_graphlet_counts_ego_fn.Rd index 30cbc379..9a7d4b90 100644 --- a/man/netdis_expected_graphlet_counts_ego_fn.Rd +++ b/man/netdis_expected_graphlet_counts_ego_fn.Rd @@ -17,6 +17,21 @@ Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} \item{neighbourhood_size}{The number of steps from the source node to include node in ego-network.} + +\item{min_ego_nodes}{Filter ego networks which have fewer +than min_ego_nodes nodes} + +\item{min_ego_edges}{Filter ego networks which have fewer +than min_ego_edges edges} + +\item{min_bin_count}{Minimum count of ego networks in each density bin.} + +\item{num_bins}{Initial number of density bins to generate.} + +\item{scale_fn}{Optional function to scale calculated expected counts, taking +\code{graph} and \code{max_graphlet_size} as arguments, and returning a scale +factor that the looked up \code{density_binned_reference_counts} values will +be multiplied by.} } \value{ A function taking a connected, undirected, simple query graph as an diff --git a/man/netdis_expected_graphlet_counts_per_ego.Rd b/man/netdis_expected_graphlet_counts_per_ego.Rd index ffadc1df..f94312dd 100644 --- a/man/netdis_expected_graphlet_counts_per_ego.Rd +++ b/man/netdis_expected_graphlet_counts_per_ego.Rd @@ -7,6 +7,26 @@ netdis_expected_graphlet_counts_per_ego(ego_networks, density_breaks, density_binned_reference_counts, max_graphlet_size, scale_fn = NULL) } +\arguments{ +\item{ego_networks}{The number of steps from the source node to include +node in ego-network.} + +\item{density_breaks}{Density values defining bin edges.} + +\item{density_binned_reference_counts}{Reference network graphlet counts for +each density bin.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} + +\item{scale_fn}{Optional function to scale calculated expected counts, taking +\code{graph} and \code{max_graphlet_size} as arguments, and returning a scale +factor that the looked up \code{density_binned_reference_counts} values will +be multiplied by. + +#' Temporarily accessible during development. +TODO: Remove @export prior to publishing} +} \description{ JACK To follow through logic of paper steps, wanted to pass ego networks to the function, not the input query graph @@ -16,6 +36,4 @@ ego networks to the function, not the input query graph Used by \code{netdis_expected_graphlet_counts_ego_fn} to generate a function for calculating expected ego-network graphlet counts from the statistics of a provided reference graph. -Temporarily accessible during development. -TODO: Remove @export prior to publishing } diff --git a/man/netdis_for_all_graphs.Rd b/man/netdis_for_all_graphs.Rd index fb30bad8..383a8a30 100644 --- a/man/netdis_for_all_graphs.Rd +++ b/man/netdis_for_all_graphs.Rd @@ -14,6 +14,10 @@ all graphs being compared} \item{graphlet_size}{The size of graphlets to use for the Netdis calculation (only counts for graphlets of the specified size will be used). The size of a graphlet is the number of nodes it contains.} + +\item{mc.cores}{Number of cores to run on. NOTE: only works on unix-like +systems with system level forking capability. This means it will work on +Linux and OSX, but not Windows.} } \value{ Pairwise Netdis statistics between graphs calculated using centred diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd index 63940358..fdf13239 100644 --- a/man/netdis_many_to_many.Rd +++ b/man/netdis_many_to_many.Rd @@ -33,13 +33,30 @@ than min_ego_nodes nodes} \item{min_ego_edges}{Filter ego networks which have fewer than min_ego_edges edges} -\item{min_bin_count}{Minimum number of ego networks in each density bin} - -\item{num_bins}{Number of density bins to generate} +\item{binning_fn}{Function used to bin ego network densities. Takes densities +as its single argument, and returns a named list including keys \code{breaks} +(list of bin edges) and \code{interval_indexes} (density bin index for each +ego network). (Default: \code{binned_densities_adaptive} with +\code{min_counts_per_interval = 5} and \code{num_intervals = 100}).} + +\item{bin_counts_fn}{Function used to calculate expected graphlet counts in +each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +(bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. +(Default: \code{density_binned_counts} with \code{agg_fn = mean} and +\code{scale_fn = scale_graphlet_counts_ego}, which mirrors the +approach used in the original netdis paper).} + +\item{exp_counts_fn}{Function used to map from binned reference counts to +expected counts for each graphlet in each ego network of the query graphs. +Takes \code{ego_networks}, \code{density_bin_breaks}, +\code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. +(Default: \code{netdis_expected_graphlet_counts_per_ego} with +\code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in +the original netdis paper).} } \value{ -Netdis statistics between graph_1 and graph_2 for graphlet sizes -up to and including max_graphlet_size +Netdis statistics between query graphs for graphlet sizes +up to and including max_graphlet_size. } \description{ Netdis between all graph pairs diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd index 6f95956e..6498d3ef 100644 --- a/man/netdis_one_to_many.Rd +++ b/man/netdis_one_to_many.Rd @@ -32,9 +32,26 @@ than min_ego_nodes nodes} \item{min_ego_edges}{Filter ego networks which have fewer than min_ego_edges edges} -\item{min_bin_count}{Minimum number of ego networks in each density bin} - -\item{num_bins}{Number of density bins to generate} +\item{binning_fn}{Function used to bin ego network densities. Takes densities +as its single argument, and returns a named list including keys \code{breaks} +(list of bin edges) and \code{interval_indexes} (density bin index for each +ego network). (Default: \code{binned_densities_adaptive} with +\code{min_counts_per_interval = 5} and \code{num_intervals = 100}).} + +\item{bin_counts_fn}{Function used to calculate expected graphlet counts in +each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +(bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. +(Default: \code{density_binned_counts} with \code{agg_fn = mean} and +\code{scale_fn = scale_graphlet_counts_ego}, which mirrors the +approach used in the original netdis paper).} + +\item{exp_counts_fn}{Function used to map from binned reference counts to +expected counts for each graphlet in each ego network of the query graphs. +Takes \code{ego_networks}, \code{density_bin_breaks}, +\code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. +(Default: \code{netdis_expected_graphlet_counts_per_ego} with +\code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in +the original netdis paper).} } \value{ Netdis statistics between graph_1 and graph_2 for graphlet sizes diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd index 0750f34e..78e151cd 100644 --- a/man/netdis_one_to_one.Rd +++ b/man/netdis_one_to_one.Rd @@ -30,9 +30,26 @@ than min_ego_nodes nodes} \item{min_ego_edges}{Filter ego networks which have fewer than min_ego_edges edges} -\item{min_bin_count}{Minimum number of ego networks in each density bin} - -\item{num_bins}{Number of density bins to generate} +\item{binning_fn}{Function used to bin ego network densities. Takes densities +as its single argument, and returns a named list including keys \code{breaks} +(list of bin edges) and \code{interval_indexes} (density bin index for each +ego network). (Default: \code{binned_densities_adaptive} with +\code{min_counts_per_interval = 5} and \code{num_intervals = 100}).} + +\item{bin_counts_fn}{Function used to calculate expected graphlet counts in +each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +(bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. +(Default: \code{density_binned_counts} with \code{agg_fn = mean} and +\code{scale_fn = scale_graphlet_counts_ego}, which mirrors the +approach used in the original netdis paper).} + +\item{exp_counts_fn}{Function used to map from binned reference counts to +expected counts for each graphlet in each ego network of the query graphs. +Takes \code{ego_networks}, \code{density_bin_breaks}, +\code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. +(Default: \code{netdis_expected_graphlet_counts_per_ego} with +\code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in +the original netdis paper).} } \value{ Netdis statistics between graph_1 and graph_2 for graphlet sizes diff --git a/man/scale_graphlet_count.Rd b/man/scale_graphlet_count.Rd new file mode 100644 index 00000000..8a313c27 --- /dev/null +++ b/man/scale_graphlet_count.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{scale_graphlet_count} +\alias{scale_graphlet_count} +\title{Divide graphlet counts by pre-computed scaling factor from +\code{count_graphlet_tuples} output.} +\usage{ +scale_graphlet_count(graphlet_count, graphlet_tuples) +} +\arguments{ +\item{graphlet_count}{Pre-computed graphlet counts.} + +\item{graphlet_tuples}{Pre-computed \code{count_graphlet_tuples} output.} +} +\description{ +Divide graphlet counts by pre-computed scaling factor from +\code{count_graphlet_tuples} output. +} diff --git a/man/single_density_bin.Rd b/man/single_density_bin.Rd index 9d89d7ce..ed801c45 100644 --- a/man/single_density_bin.Rd +++ b/man/single_density_bin.Rd @@ -2,14 +2,16 @@ % Please edit documentation in R/measures_net_dis.R \name{single_density_bin} \alias{single_density_bin} -\title{For case where don't want to use binning, return -a single bin which covers full range of possible -densities.} +\title{For case where don't want to use binning, return a single bin which covers +the full range of possible density values.} \usage{ single_density_bin(densities) } +\arguments{ +\item{densities}{Ego network density values (only used to return +a list of indexes of the required length.)} +} \description{ -For case where don't want to use binning, return -a single bin which covers full range of possible -densities. +For case where don't want to use binning, return a single bin which covers +the full range of possible density values. } diff --git a/man/zeros_to_ones.Rd b/man/zeros_to_ones.Rd new file mode 100644 index 00000000..ca662211 --- /dev/null +++ b/man/zeros_to_ones.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{zeros_to_ones} +\alias{zeros_to_ones} +\title{Replace zero values in a vector with ones. Used by +\code{scale_graphlet_count} to prevent divide by +zero errors.} +\usage{ +zeros_to_ones(v) +} +\arguments{ +\item{v}{A vector. +TODO remove export} +} +\description{ +Replace zero values in a vector with ones. Used by +\code{scale_graphlet_count} to prevent divide by +zero errors. +} From 12871d4ae482135e89c7cf01701d5b55c3a95de6 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 6 Sep 2019 14:02:54 +0100 Subject: [PATCH 035/188] dendogram vignette with new interface --- vignettes/dendrogram_example_net_dis_new.Rmd | 122 +++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 vignettes/dendrogram_example_net_dis_new.Rmd diff --git a/vignettes/dendrogram_example_net_dis_new.Rmd b/vignettes/dendrogram_example_net_dis_new.Rmd new file mode 100644 index 00000000..637eef3b --- /dev/null +++ b/vignettes/dendrogram_example_net_dis_new.Rmd @@ -0,0 +1,122 @@ +--- +title: "Dendrogram example for Netdis" +author: "Martin O'Reilly" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Dendrogram example for Netdis} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Virus PPI example for Netdis + +### Load graphs +Use `read_simple_graphs` to read graph data from all files in a directory that +match a specific filename pattern in a format suitable for calculating +graphlet-based feature counts using the +[ORCA package](https://CRAN.R-project.org/package=orca). +We use `igraph::read_graph` to read graph data from files, so support +all file formats it supports. See help for `igraph::read_graph` for a list of +supported values for the `format` parameter and the [igraph documentation](http://igraph.org/c/doc/igraph-Foreign.html#igraph_read_graph_edgelist) +for descriptions of each of the supported file formats. + +The ORCA package we use to efficiently calculate graphlet and orbit counts +requires that graphs are _undirected_, _simple_ (i.e. have no self-loops or +multiple edges) and _connected_ (i.e. have no isolated vertices). Therefore, by +default, graphs loaded by `read_simple_graphs` will be coerced to have the above +properties. This can be avoided by setting the relevant `as_undirected`, +`remove_loops`, `remove_multiple` or `remove_isolates` parameters to `FALSE`. +```{r} +library("netdist") +edge_format = "ncol" +# Load reference graph (used for Netdis. Not required for NetEMD) +ref_path = file.path(system.file(file.path("extdata", "random"), + package = "netdist"), + "ER_1250_10_1") +ref_graph <- read_simple_graph(ref_path, format = edge_format) + +# Set source directory and file properties for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), + package = "netdist") +edge_format <- "ncol" +file_pattern <- "*" + +# Load all graphs in the source folder matching the filename pattern +query_graphs <- read_simple_graphs(source_dir, + format = edge_format, + pattern = file_pattern) +print(names(query_graphs)) +``` + +In this example we will use counts of graphlets containing up to 4 nodes and +consider ego-network neighbourhoods of size 2 (i.e. the immediate neighbours of +each node plus their immediate neighbours). +```{r} +# Set the maximum graphlet size to compute counts for +max_graphlet_size <- 4 +neighbourhood_size <- 2 +``` + +## Generate NetDis measures between each pair of query graphs +```{r} + +# Calculate netdis measure for graphlets up to size max_graphlet_size +netdis_result <- netdis_many_to_many(query_graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size) + +# Netdis measure for graphlets of size 3 +res3 <- netdis_result$netdis["netdis3", ] +netdis3_mat <- cross_comp_to_matrix(res3, netdis_result$comp_spec) + +print(netdis3_mat) + +# Netdis measure for graphlets of size 4 +res4 <- netdis_result$netdis["netdis4", ] +netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec) + +print(netdis4_mat) +``` + +## Generate dendrograms +```{r} +graphdists <- as.dist(netdis4_mat) +par(mfrow = c(1, 2)) +cex <- 1 + +# Dendrogram based on Netdis measure for graphlets of size 3 +title <- paste("Netdis: graphlet size = ", 3, sep = "") +plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) + +# Dendrogram based on Netdis measure for graphlets of size 4 +title = paste("Netdis: graphlet size = ", 4, sep = "") +plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) +``` +```{r} +cex <- 1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title <- paste("Netdis: graphlet size = ", 3, sep = "") +heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) +``` +```{r} +cex <- 1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title <- paste("Netdis: graphlet size = ", 4, sep = "") +heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) +``` \ No newline at end of file From f6e566d06646cb52c62625037a18e0ff06fb36a3 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Mon, 9 Sep 2019 14:37:34 +0100 Subject: [PATCH 036/188] started adding tests for new functions --- tests/testthat/test_measures_net_dis.R | 253 ++++ tests/testthat/test_measures_new_net_dis.R | 1023 ----------------- tests/testthat/test_orca_interface.R | 95 ++ vignettes/dendrogram_example_net_dis.Rmd | 99 +- vignettes/dendrogram_example_net_dis_new.Rmd | 122 -- ...on.Rmd => netdis_pairwise_comparisons.Rmd} | 2 +- 6 files changed, 404 insertions(+), 1190 deletions(-) delete mode 100644 tests/testthat/test_measures_new_net_dis.R delete mode 100644 vignettes/dendrogram_example_net_dis_new.Rmd rename vignettes/{quickstart_netdis_comparison.Rmd => netdis_pairwise_comparisons.Rmd} (97%) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 60c14ebc..5615efec 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -279,6 +279,80 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals", { ) }) +context("Measures Netdis: Ego-network density values match those for manually verified networks") +test_that("Ego-network 4-node density values match manually verified totals", { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set parameters for test + max_graphlet_size <- 4 + min_counts_per_interval <- 2 + num_intervals <- 100 + min_ego_edges <- 0 + min_ego_nodes <- 0 + + # Set node and graphlet labels to use for row and col names in expected counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + + # Set manually verified ego-network node counts and edge densities + # 1 . Ego-networks of order 1 + expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) + expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) + names(expected_densities_o1) <- node_labels + # Order 1 expected densities should be: + # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 + # 2. Ego-networks of order 2 + expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) + expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) + names(expected_densities_o2) <- node_labels + # Order 2 expected densities should be: + # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 + + # Generate order 1 and 2 ego networks with previously tested function + ego_networks_o1 <- make_named_ego_graph(graph, + order = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + ego_networks_o2 <- make_named_ego_graph(graph, + order = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + + # Calculate densities + actual_densities_o1 <- ego_network_density(ego_networks_o1) + actual_densities_o2 <- ego_network_density(ego_networks_o2) + + # Check densities match expected values + expect_equal(actual_densities_o1, expected_densities_o1) + expect_equal(actual_densities_o2, expected_densities_o2) + +}) + context("Measures Netdis: Ego-network density-binned reference counts for manually verified networks") test_that("Ego-network 4-node density-binned reference counts match manually verified totals", { # Set up a small sample network with at least one ego-network that contains @@ -442,6 +516,185 @@ test_that("Ego-network 4-node density-binned reference counts match manually ver ) }) + + + + + + + + +# JACK ---------------------------------------------------------------------- +context("Measures Netdis: Ego-network density-binned counts for manually verified networks") +test_that("Ego-network 4-node density-binned counts match manually verified totals", { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set parameters for test + max_graphlet_size <- 4 + min_counts_per_interval <- 2 + num_intervals <- 100 + + # Set node and graphlet labels to use for row and col names in expected counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + + # Set manually verified ego-network node counts and edge densities + # 1 . Ego-networks of order 1 + expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) + expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) + # Order 1 expected densities should be: + # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 + # 2. Ego-networks of order 2 + expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) + expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) + # Order 2 expected densities should be: + # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 + + # Set manually verified density bins for ego-networks + # 1. Ego-networks of order 1 + expected_breaks_o1 <- c(0.5, 0.605, 0.705, 1) + expected_interval_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) + expected_binned_densities_o1 <- list( + densities = expected_densities_o1, + interval_indexes = expected_interval_indexes_o1, + breaks = expected_breaks_o1 + ) + # Check binned densities are as expected + actual_binned_densities_o1 <- binned_densities_adaptive( + expected_densities_o1, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) + expect_equal(actual_binned_densities_o1, expected_binned_densities_o1) + # 2. Ego-networks of order 2 + expected_min_break_o2 <- 1 / 3 + expected_max_break_o2 <- 0.6 + expected_initial_interval_o2 <- + (expected_max_break_o2 - expected_min_break_o2) / (num_intervals) # 0.00266666667 + expected_breaks_o2 <- expected_min_break_o2 + (expected_initial_interval_o2 * c(0, 9, 50, 63, 100)) + expected_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) + expected_binned_densities_o2 <- list( + densities = expected_densities_o2, + interval_indexes = expected_interval_indexes_o2, + breaks = expected_breaks_o2 + ) + # Check binned densities are as expected + actual_binned_densities_o2 <- binned_densities_adaptive( + expected_densities_o2, + min_counts_per_interval = min_counts_per_interval, + num_intervals = num_intervals + ) + expect_equal(actual_binned_densities_o2, expected_binned_densities_o2) + + # Set manually verified scaled ego-network graphlet counts + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # 1-step ego networks + expected_counts_o1 <- rbind( + c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) + ) + rownames(expected_counts_o1) <- node_labels + colnames(expected_counts_o1) <- graphlet_labels + # 2-step ego networks + expected_counts_o2 <- rbind( + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) + ) + rownames(expected_counts_o2) <- node_labels + colnames(expected_counts_o2) <- graphlet_labels + + # Calculate binned average expected counts based on manually verified counts + # and density bins + # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 + mean_counts_bin1_o1 <- (expected_counts_o1[1, ] + expected_counts_o1[2, ]) / 2 + mean_counts_bin2_o1 <- (expected_counts_o1[6, ] + expected_counts_o1[7, ] + + expected_counts_o1[8, ]) / 3 + mean_counts_bin3_o1 <- (expected_counts_o1[3, ] + expected_counts_o1[4, ] + + expected_counts_o1[5, ] + expected_counts_o1[9, ] + + expected_counts_o1[10, ]) / 5 + expected_mean_density_binned_counts_o1 <- rbind( + mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 + ) + rownames(expected_mean_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 + mean_counts_bin1_o2 <- (expected_counts_o2[1, ] + expected_counts_o2[4, ]) / 2 + mean_counts_bin2_o2 <- (expected_counts_o2[2, ] + expected_counts_o2[6, ] + + expected_counts_o2[7, ]) / 3 + mean_counts_bin3_o2 <- (expected_counts_o2[3, ] + expected_counts_o2[5, ]) / 2 + mean_counts_bin4_o2 <- (expected_counts_o2[8, ] + expected_counts_o2[9, ] + + expected_counts_o2[10, ]) / 3 + expected_mean_density_binned_counts_o2 <- rbind( + mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, + mean_counts_bin4_o2 + ) + rownames(expected_mean_density_binned_counts_o2) <- 1:4 + + # Calculate actual output of function under test + actual_mean_density_binned_counts_o1 <- mean_density_binned_graphlet_counts( + expected_counts_o1, expected_interval_indexes_o1 + ) + actual_mean_density_binned_counts_o2 <- mean_density_binned_graphlet_counts( + expected_counts_o2, expected_interval_indexes_o2 + ) + + # Check actual output vs expected + expect_equal( + actual_mean_density_binned_counts_o1, + expected_mean_density_binned_counts_o1 + ) + expect_equal( + actual_mean_density_binned_counts_o2, + expected_mean_density_binned_counts_o2 + ) +}) +# /JACK ---------------------------------------------------------------------- + + + + + + + context("Measures Netdis: Expected graphlet counts") test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { # Helper function to generate graphs with known density and number of nodes diff --git a/tests/testthat/test_measures_new_net_dis.R b/tests/testthat/test_measures_new_net_dis.R deleted file mode 100644 index 60c14ebc..00000000 --- a/tests/testthat/test_measures_new_net_dis.R +++ /dev/null @@ -1,1023 +0,0 @@ - -context("Measures Netdis: Graphlet tuples") -test_message <- - paste("count_graphlet_tuples and count_graphlet_tuples_ego give", - "choose(node_count, graphlet_size) for each graph + graphlet", - "combination", - sep = " " - ) -test_that(test_message, { - # Create some test graphs with known node counts (this is the only graph - # property we care about for this test) - graph_n11 <- igraph::erdos.renyi.game(11, p = 1, type = "gnp") - graph_n37 <- igraph::erdos.renyi.game(37, p = 1, type = "gnp") - graph_n73 <- igraph::erdos.renyi.game(73, p = 1, type = "gnp") - # Calculate expected graph tuple count for graphlets of various sizes. There - # is 1 graphlet of size 1, 2 of size 3, 6 of size 4, and 21 of size 5 - graphlet_tuple_counts <- function(n, max_graphlet_size) { - if (max_graphlet_size >= 2) { - tuple_counts <- rep(choose(n, 2), 1) - } - if (max_graphlet_size >= 3) { - tuple_counts <- c(tuple_counts, rep(choose(n, 3), 2)) - } - if (max_graphlet_size >= 4) { - tuple_counts <- c(tuple_counts, rep(choose(n, 4), 6)) - } - if (max_graphlet_size >= 5) { - tuple_counts <- c(tuple_counts, rep(choose(n, 5), 21)) - } - tuple_counts <- setNames(tuple_counts, graphlet_key(max_graphlet_size)$id) - tuple_counts - } - - # === TEST count_graphlet_tuples === - # Generate expected tuple counts for graphlets up to size 4 and 5 - expected_tuple_count_n11_gs4 <- graphlet_tuple_counts(11, 4) - expected_tuple_count_n37_gs4 <- graphlet_tuple_counts(37, 4) - expected_tuple_count_n73_gs4 <- graphlet_tuple_counts(73, 4) - expected_tuple_count_n11_gs5 <- graphlet_tuple_counts(11, 5) - expected_tuple_count_n37_gs5 <- graphlet_tuple_counts(37, 5) - expected_tuple_count_n73_gs5 <- graphlet_tuple_counts(73, 5) - # Generate actual tuple counts for graphlets up to size 4 and 5 - actual_tuple_count_n11_gs4 <- count_graphlet_tuples(graph_n11, 4) - actual_tuple_count_n37_gs4 <- count_graphlet_tuples(graph_n37, 4) - actual_tuple_count_n73_gs4 <- count_graphlet_tuples(graph_n73, 4) - actual_tuple_count_n11_gs5 <- count_graphlet_tuples(graph_n11, 5) - actual_tuple_count_n37_gs5 <- count_graphlet_tuples(graph_n37, 5) - actual_tuple_count_n73_gs5 <- count_graphlet_tuples(graph_n73, 5) - # Compare expected tuple counts with actual - expect_equal(expected_tuple_count_n11_gs4, actual_tuple_count_n11_gs4) - expect_equal(expected_tuple_count_n37_gs4, actual_tuple_count_n37_gs4) - expect_equal(expected_tuple_count_n73_gs4, actual_tuple_count_n73_gs4) - expect_equal(expected_tuple_count_n11_gs5, actual_tuple_count_n11_gs5) - expect_equal(expected_tuple_count_n37_gs5, actual_tuple_count_n37_gs5) - expect_equal(expected_tuple_count_n73_gs5, actual_tuple_count_n73_gs5) - - # === TEST count_graphlet_tuples_ego === - # NOTE: This test is not amazing, as graphlet_tuple_counts_ego is very similar - # to the method under test. However, it's a simple method so maybe that's ok? - graphlet_tuple_counts_ego <- function(ego_networks, max_graphlet_size) { - t(sapply(ego_networks, FUN = function(g) { - graphlet_tuple_counts(length(igraph::V(g)), max_graphlet_size) - })) - } - # Generate ego networks for each graph - graph_n11_ego1 <- make_named_ego_graph(graph_n11, order = 1) - graph_n37_ego1 <- make_named_ego_graph(graph_n37, order = 1) - graph_n73_ego1 <- make_named_ego_graph(graph_n73, order = 1) - graph_n11_ego2 <- make_named_ego_graph(graph_n11, order = 2) - graph_n37_ego2 <- make_named_ego_graph(graph_n37, order = 2) - graph_n73_ego2 <- make_named_ego_graph(graph_n73, order = 2) - # Generate expected tuple counts for graphlets up to size 4 and 5 - # 1. For ego-networks of order 1 - expected_tuple_count_n11_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego1, 4) - expected_tuple_count_n37_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n37_ego1, 4) - expected_tuple_count_n73_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n73_ego1, 4) - expected_tuple_count_n11_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n11_ego1, 5) - expected_tuple_count_n37_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n37_ego1, 5) - expected_tuple_count_n73_ego1_gs5 <- graphlet_tuple_counts_ego(graph_n73_ego1, 5) - # 2. For ego-networks of order 2 - expected_tuple_count_n11_ego2_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego2, 4) - expected_tuple_count_n37_ego2_gs4 <- graphlet_tuple_counts_ego(graph_n37_ego2, 4) - expected_tuple_count_n73_ego2_gs4 <- graphlet_tuple_counts_ego(graph_n73_ego2, 4) - expected_tuple_count_n11_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n11_ego2, 5) - expected_tuple_count_n37_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n37_ego2, 5) - expected_tuple_count_n73_ego2_gs5 <- graphlet_tuple_counts_ego(graph_n73_ego2, 5) - - # Calculate actual tuple counts - # 1. For ego-networks of order 1 - actual_tuple_count_n11_ego1_gs4 <- count_graphlet_tuples_ego(graph_n11_ego1, 4) - actual_tuple_count_n37_ego1_gs4 <- count_graphlet_tuples_ego(graph_n37_ego1, 4) - actual_tuple_count_n73_ego1_gs4 <- count_graphlet_tuples_ego(graph_n73_ego1, 4) - actual_tuple_count_n11_ego1_gs5 <- count_graphlet_tuples_ego(graph_n11_ego1, 5) - actual_tuple_count_n37_ego1_gs5 <- count_graphlet_tuples_ego(graph_n37_ego1, 5) - actual_tuple_count_n73_ego1_gs5 <- count_graphlet_tuples_ego(graph_n73_ego1, 5) - # 2. For ego-networks of order 2 - actual_tuple_count_n11_ego2_gs4 <- count_graphlet_tuples_ego(graph_n11_ego2, 4) - actual_tuple_count_n37_ego2_gs4 <- count_graphlet_tuples_ego(graph_n37_ego2, 4) - actual_tuple_count_n73_ego2_gs4 <- count_graphlet_tuples_ego(graph_n73_ego2, 4) - actual_tuple_count_n11_ego2_gs5 <- count_graphlet_tuples_ego(graph_n11_ego2, 5) - actual_tuple_count_n37_ego2_gs5 <- count_graphlet_tuples_ego(graph_n37_ego2, 5) - actual_tuple_count_n73_ego2_gs5 <- count_graphlet_tuples_ego(graph_n73_ego2, 5) - - # Compare expected with actual - # 1. For ego-networks of order 1 - expect_equal(expected_tuple_count_n11_ego1_gs4, actual_tuple_count_n11_ego1_gs4) - expect_equal(expected_tuple_count_n37_ego1_gs4, actual_tuple_count_n37_ego1_gs4) - expect_equal(expected_tuple_count_n73_ego1_gs4, actual_tuple_count_n73_ego1_gs4) - expect_equal(expected_tuple_count_n11_ego1_gs5, actual_tuple_count_n11_ego1_gs5) - expect_equal(expected_tuple_count_n37_ego1_gs5, actual_tuple_count_n37_ego1_gs5) - expect_equal(expected_tuple_count_n73_ego1_gs5, actual_tuple_count_n73_ego1_gs5) - # 2. For ego-networks of order 2 - expect_equal(expected_tuple_count_n11_ego2_gs4, actual_tuple_count_n11_ego2_gs4) - expect_equal(expected_tuple_count_n37_ego2_gs4, actual_tuple_count_n37_ego2_gs4) - expect_equal(expected_tuple_count_n73_ego2_gs4, actual_tuple_count_n73_ego2_gs4) - expect_equal(expected_tuple_count_n11_ego2_gs5, actual_tuple_count_n11_ego2_gs5) - expect_equal(expected_tuple_count_n37_ego2_gs5, actual_tuple_count_n37_ego2_gs5) - expect_equal(expected_tuple_count_n73_ego2_gs5, actual_tuple_count_n73_ego2_gs5) -}) - -context("Measures Netdis: Ego-network scaled graphlet outputs for manually verified networks") -test_that("Ego-network 4-node graphlet counts match manually verified totals", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - - # Set node and graphlet labels to use for row and col names in expected counts - node_labels <- igraph::V(graph)$name - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - - # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 - max_graphlet_size <- 4 - min_ego_edges <- 0 - min_ego_nodes <- 0 - - actual_counts_order_1 <- - count_graphlets_ego_scaled(graph, - max_graphlet_size = max_graphlet_size, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes, - neighbourhood_size = 1 - ) - actual_counts_order_2 <- - count_graphlets_ego_scaled(graph, - max_graphlet_size = max_graphlet_size, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes, - neighbourhood_size = 2 - ) - - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # Set manually verified counts - # 1-step ego networks - expected_counts_order_1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) - ) - rownames(expected_counts_order_1) <- node_labels - colnames(expected_counts_order_1) <- graphlet_labels - # 2-step ego networks - expected_counts_order_2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), - c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) - ) - rownames(expected_counts_order_2) <- node_labels - colnames(expected_counts_order_2) <- graphlet_labels - - # Test that actual counts match expected with only counts requested (default) - expect_equal(actual_counts_order_1, expected_counts_order_1) - expect_equal(actual_counts_order_2, expected_counts_order_2) - - # Test that actual counts and returned ego networks match expected - # 1. Define expected - expected_ego_networks_order_1 <- make_named_ego_graph(graph, - order = 1, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes - ) - expected_ego_networks_order_2 <- make_named_ego_graph(graph, - order = 2, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes - ) - expected_counts_with_networks_order_1 <- - list( - graphlet_counts = expected_counts_order_1, - ego_networks = expected_ego_networks_order_1 - ) - expected_counts_with_networks_order_2 <- - list( - graphlet_counts = expected_counts_order_2, - ego_networks = expected_ego_networks_order_2 - ) - # 2. Calculate actual - actual_counts_with_networks_order_1 <- - count_graphlets_ego_scaled(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes, - return_ego_networks = TRUE - ) - actual_counts_with_networks_order_2 <- - count_graphlets_ego_scaled(graph, - max_graphlet_size = max_graphlet_size, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes, - neighbourhood_size = 2, return_ego_networks = TRUE - ) - - # 3. Compare - # Comparison is not implemented for igraph objects, so convert all igraphs to - # indexed edge list and then compare. Do in-situ replacement of igraphs with - # indexed edge lists to ensure we are checking full properties of returned - # objects (i.e. named lists with matching elements). - # 3a. Convert expected and actual ego networks from igraphs to indexed edges - expected_counts_with_networks_order_1$ego_networks <- - purrr::map( - expected_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges - ) - expected_counts_with_networks_order_2$ego_networks <- - purrr::map( - expected_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges - ) - actual_counts_with_networks_order_1$ego_networks <- - purrr::map( - actual_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges - ) - actual_counts_with_networks_order_2$ego_networks <- - purrr::map( - actual_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges - ) - # 3b. Do comparison - expect_equal( - actual_counts_with_networks_order_1, - expected_counts_with_networks_order_1 - ) - expect_equal( - actual_counts_with_networks_order_2, - expected_counts_with_networks_order_2 - ) -}) - -context("Measures Netdis: Ego-network density-binned reference counts for manually verified networks") -test_that("Ego-network 4-node density-binned reference counts match manually verified totals", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - - # Set parameters for test - max_graphlet_size <- 4 - min_counts_per_interval <- 2 - num_intervals <- 100 - - # Set node and graphlet labels to use for row and col names in expected counts - node_labels <- igraph::V(graph)$name - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - - # Set manually verified ego-network node counts and edge densities - # 1 . Ego-networks of order 1 - expected_node_counts_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) - expected_edge_counts_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) - max_edge_counts_o1 <- choose(expected_node_counts_o1, 2) - expected_densities_o1 <- c(expected_edge_counts_o1 / max_edge_counts_o1) - # Order 1 expected densities should be: - # 0.6, 0.5, 1.0, 0.83, 1.0, 0.67, 0.7, 0.7, 1.0, 1.0 - # 2. Ego-networks of order 2 - expected_node_counts_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) - expected_edge_counts_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) - max_edge_counts_o2 <- choose(expected_node_counts_o2, 2) - expected_densities_o2 <- c(expected_edge_counts_o2 / max_edge_counts_o2) - # Order 2 expected densities should be: - # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 - - # Set manually verified density bins for ego-networks - # 1. Ego-networks of order 1 - expected_breaks_o1 <- c(0.5, 0.605, 0.705, 1) - expected_interval_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) - expected_binned_densities_o1 <- list( - densities = expected_densities_o1, - interval_indexes = expected_interval_indexes_o1, - breaks = expected_breaks_o1 - ) - # Check binned densities are as expected - actual_binned_densities_o1 <- binned_densities_adaptive( - expected_densities_o1, - min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals - ) - expect_equal(actual_binned_densities_o1, expected_binned_densities_o1) - # 2. Ego-networks of order 2 - expected_min_break_o2 <- 1 / 3 - expected_max_break_o2 <- 0.6 - expected_initial_interval_o2 <- - (expected_max_break_o2 - expected_min_break_o2) / (num_intervals) # 0.00266666667 - expected_breaks_o2 <- expected_min_break_o2 + (expected_initial_interval_o2 * c(0, 9, 50, 63, 100)) - expected_interval_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) - expected_binned_densities_o2 <- list( - densities = expected_densities_o2, - interval_indexes = expected_interval_indexes_o2, - breaks = expected_breaks_o2 - ) - # Check binned densities are as expected - actual_binned_densities_o2 <- binned_densities_adaptive( - expected_densities_o2, - min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals - ) - expect_equal(actual_binned_densities_o2, expected_binned_densities_o2) - - # Set manually verified scaled ego-network graphlet counts - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # 1-step ego networks - expected_counts_o1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) - ) - rownames(expected_counts_o1) <- node_labels - colnames(expected_counts_o1) <- graphlet_labels - # 2-step ego networks - expected_counts_o2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), - c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) - ) - rownames(expected_counts_o2) <- node_labels - colnames(expected_counts_o2) <- graphlet_labels - - # Calculate binned average expected counts based on manually verified counts - # and density bins - # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 - mean_counts_bin1_o1 <- (expected_counts_o1[1, ] + expected_counts_o1[2, ]) / 2 - mean_counts_bin2_o1 <- (expected_counts_o1[6, ] + expected_counts_o1[7, ] + - expected_counts_o1[8, ]) / 3 - mean_counts_bin3_o1 <- (expected_counts_o1[3, ] + expected_counts_o1[4, ] + - expected_counts_o1[5, ] + expected_counts_o1[9, ] + - expected_counts_o1[10, ]) / 5 - expected_mean_density_binned_counts_o1 <- rbind( - mean_counts_bin1_o1, mean_counts_bin2_o1, mean_counts_bin3_o1 - ) - rownames(expected_mean_density_binned_counts_o1) <- 1:3 - # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 - mean_counts_bin1_o2 <- (expected_counts_o2[1, ] + expected_counts_o2[4, ]) / 2 - mean_counts_bin2_o2 <- (expected_counts_o2[2, ] + expected_counts_o2[6, ] + - expected_counts_o2[7, ]) / 3 - mean_counts_bin3_o2 <- (expected_counts_o2[3, ] + expected_counts_o2[5, ]) / 2 - mean_counts_bin4_o2 <- (expected_counts_o2[8, ] + expected_counts_o2[9, ] + - expected_counts_o2[10, ]) / 3 - expected_mean_density_binned_counts_o2 <- rbind( - mean_counts_bin1_o2, mean_counts_bin2_o2, mean_counts_bin3_o2, - mean_counts_bin4_o2 - ) - rownames(expected_mean_density_binned_counts_o2) <- 1:4 - - # Calculate actual output of function under test - actual_mean_density_binned_counts_o1 <- mean_density_binned_graphlet_counts( - expected_counts_o1, expected_interval_indexes_o1 - ) - actual_mean_density_binned_counts_o2 <- mean_density_binned_graphlet_counts( - expected_counts_o2, expected_interval_indexes_o2 - ) - - # Check actual output vs expected - expect_equal( - actual_mean_density_binned_counts_o1, - expected_mean_density_binned_counts_o1 - ) - expect_equal( - actual_mean_density_binned_counts_o2, - expected_mean_density_binned_counts_o2 - ) -}) - -context("Measures Netdis: Expected graphlet counts") -test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { - # Helper function to generate graphs with known density and number of nodes - rand_graph <- function(num_nodes, density) { - max_edges <- choose(num_nodes, 2) - num_edges <- density * max_edges - igraph::erdos.renyi.game(num_nodes, num_edges, "gnm", - loops = FALSE, directed = FALSE - ) - } - # Set up some dummy reference density breaks and scaled reference counts - density_breaks <- c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0) - scaled_reference_counts <- rbind( - c(1, 2, 3, 4, 5, 6, 7, 8, 9), - c(11, 12, 13, 14, 15, 16, 17, 18, 19), - c(21, 22, 23, 24, 25, 26, 27, 28, 29), - c(31, 32, 33, 34, 35, 36, 37, 38, 39), - c(41, 42, 43, 44, 45, 46, 47, 48, 49), - c(51, 52, 53, 54, 55, 56, 57, 58, 59), - c(61, 62, 63, 64, 65, 66, 67, 68, 69), - c(71, 72, 73, 74, 75, 76, 77, 78, 79), - c(81, 82, 83, 84, 85, 86, 87, 88, 89), - c(91, 92, 93, 94, 95, 96, 97, 98, 99) - ) - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - colnames(scaled_reference_counts) <- graphlet_labels - rownames(scaled_reference_counts) <- 1:10 - graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) - names(graphlet_sizes) <- graphlet_labels - max_graphlet_size <- 4 - - # Generate some test graphs - densities <- c(0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95) - density_indexes <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) - num_nodes <- rep(120, 10) - graphs <- purrr::map2(num_nodes, densities, rand_graph) - - # Helper function to calculate expected expected graphlet counts - expected_expected_graphlet_counts_fn <- function(density_index, node_count) { - reference_counts <- scaled_reference_counts[density_index, ] - reference_counts * choose(node_count, graphlet_sizes) - } - # Determine expected and actual expected graphlet counts - expected_expected_graphlet_counts <- - purrr::map2(density_indexes, num_nodes, expected_expected_graphlet_counts_fn) - actual_expected_graphlet_counts <- - purrr::map(graphs, netdis_expected_graphlet_counts, - max_graphlet_size = max_graphlet_size, - density_breaks = density_breaks, - density_binned_reference_counts = scaled_reference_counts, - scale_fn=count_graphlet_tuples - ) - # Loop over each graph and compare expected with actual - # NOTE: v2.0.0 of testthat library made a breaking change that means using - # map, mapply etc can cause failures under certain conditions - # See: https://github.com/r-lib/testthat/releases/tag/v2.0.0 - for (i in 1:length(actual_expected_graphlet_counts)) { - expect_equal( - actual_expected_graphlet_counts[i], - expected_expected_graphlet_counts[i] - ) - } -}) - -test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes", { - # Helper function to generate graphs with known density and number of nodes - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) - max_graphlet_size <- 4 - min_ego_edges <- 0 - min_ego_nodes <- 0 - - # Make graph ego networks - ego_networks_o1 <- make_named_ego_graph(graph, - order = 1, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes - ) - ego_networks_o2 <- make_named_ego_graph(graph, - order = 2, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes - ) - # Set manually-verified node counts and densities - # 1. Ego-networks of order 1 - num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) - num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) - max_edges_o1 <- choose(num_nodes_o1, 2) - densities_o1 <- num_edges_o1 / max_edges_o1 - # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 - # 2. Ego-networks of order 2 - num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) - num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) - max_edges_o2 <- choose(num_nodes_o2, 2) - densities_o2 <- num_edges_o2 / max_edges_o2 - # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 - # Set manually defined density breaks and indexes - breaks <- c(0, 0.11, 0.21, 0.31, 0.41, 0.51, 0.61, 0.71, 0.81, 0.91, 1.0) - density_indexes_o1 <- c(6, 5, 10, 9, 10, 7, 7, 7, 10, 10) - density_indexes_o2 <- c(4, 4, 5, 4, 5, 5, 5, 6, 6, 6) - # Set dummy reference counts - scaled_reference_counts <- rbind( - c(1, 2, 3, 4, 5, 6, 7, 8, 9), - c(11, 12, 13, 14, 15, 16, 17, 18, 19), - c(21, 22, 23, 24, 25, 26, 27, 28, 29), - c(31, 32, 33, 34, 35, 36, 37, 38, 39), - c(41, 42, 43, 44, 45, 46, 47, 48, 49), - c(51, 52, 53, 54, 55, 56, 57, 58, 59), - c(61, 62, 63, 64, 65, 66, 67, 68, 69), - c(71, 72, 73, 74, 75, 76, 77, 78, 79), - c(81, 82, 83, 84, 85, 86, 87, 88, 89), - c(91, 92, 93, 94, 95, 96, 97, 98, 99) - ) - expected_dims <- dim(scaled_reference_counts) - min_ego_nodes <- 3 - min_ego_edges <- 1 - - # Helper function to calculate expected expected graphlet counts - expected_expected_graphlet_counts_fn <- function(density_index, node_count) { - reference_counts <- scaled_reference_counts[density_index, ] - reference_counts * choose(node_count, graphlet_sizes) - } - # Calculate expected graphlet counts. NOTE: We expect a matrix with graphlet - # types as columns and ego networks for nodes in graph as rows - expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( - density_indexes_o1, num_nodes_o1, expected_expected_graphlet_counts_fn - ))) - expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( - density_indexes_o2, num_nodes_o2, expected_expected_graphlet_counts_fn - ))) - # Sanity check for expected output shape. Should be matrix with graphlet types - # as columns and nodes as rows - expect_equal(dim(expected_expected_graphlet_counts_ego_o1), expected_dims) - expect_equal(dim(expected_expected_graphlet_counts_ego_o2), expected_dims) - # Set column labels to graphlet names - colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels - colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels - # Set row labels to ego network names - rownames(expected_expected_graphlet_counts_ego_o1) <- names(ego_networks_o1) - rownames(expected_expected_graphlet_counts_ego_o2) <- names(ego_networks_o2) - # Drop rows for nodes with ewer than minumum required nodes and edges in ego - # network - expected_expected_graphlet_counts_ego_o1 <- - expected_expected_graphlet_counts_ego_o1[ - (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), - ] - expected_expected_graphlet_counts_ego_o2 <- - expected_expected_graphlet_counts_ego_o2[ - (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), - ] - - # Calculate actual output of function under test - actual_expected_graphlet_counts_ego_o1 <- - netdis_expected_graphlet_counts_ego( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, density_breaks = breaks, - density_binned_reference_counts = scaled_reference_counts, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_fn=count_graphlet_tuples - ) - actual_expected_graphlet_counts_ego_o2 <- - netdis_expected_graphlet_counts_ego( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, density_breaks = breaks, - density_binned_reference_counts = scaled_reference_counts, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_fn=count_graphlet_tuples - ) - - # Compare actual to expected - expect_equal( - actual_expected_graphlet_counts_ego_o1, - actual_expected_graphlet_counts_ego_o1 - ) - expect_equal( - actual_expected_graphlet_counts_ego_o2, - expected_expected_graphlet_counts_ego_o2 - ) -}) - -test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 nodes", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) - names(graphlet_sizes) <- graphlet_labels - max_graphlet_size <- 4 - # Make graph ego networks - min_ego_nodes <- 0 - min_edgo_edges <- 0 - ego_networks_o1 <- make_named_ego_graph(graph, - order = 1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_edgo_edges - ) - ego_networks_o2 <- make_named_ego_graph(graph, - order = 2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_edgo_edges - ) - # Set manually-verified node counts and densities - # 1. Ego-networks of order 1 - num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) - num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) - max_edges_o1 <- choose(num_nodes_o1, 2) - densities_o1 <- num_edges_o1 / max_edges_o1 - # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 - # 2. Ego-networks of order 2 - num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) - num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) - max_edges_o2 <- choose(num_nodes_o2, 2) - densities_o2 <- num_edges_o2 / max_edges_o2 - # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 - # Set manually determined density breaks and indexes, based on a min bin count - # of 2 and an initial request for 100 bins - min_bin_count <- 2 - num_bins <- 100 - num_breaks <- num_bins + 1 - min_density_o1 <- 0.5 - max_density_o1 <- 1.0 - breaks_o1 <- seq(min_density_o1, max_density_o1, length.out = num_breaks)[c(1, 22, 42, 101)] - density_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) - min_density_o2 <- 1 / 3 - max_density_o2 <- 0.6 - breaks_o2 <- seq(min_density_o2, max_density_o2, length.out = num_breaks)[c(1, 10, 51, 64, 101)] - density_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) - # Guard against errors in manually determined breaks and indexes by checking - # against already tested code. This also lets us ensure we handle densities - # falling exactly on a bin boundary the same as the code under test. - comp_binned_densities_o1 <- binned_densities_adaptive( - densities_o1, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins - ) - comp_binned_densities_o2 <- binned_densities_adaptive( - densities_o2, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins - ) - expect_equal( - comp_binned_densities_o1, - list( - densities = densities_o1, - interval_indexes = density_indexes_o1, - breaks = breaks_o1 - ) - ) - expect_equal( - comp_binned_densities_o2, - list( - densities = densities_o2, - interval_indexes = density_indexes_o2, - breaks = breaks_o2 - ) - ) - - # Set manually verified scaled ego-network graphlet counts - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # 1-step ego networks - scaled_reference_counts_o1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) - ) - # 2-step ego networks - scaled_reference_counts_o2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), - c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) - ) - min_ego_nodes <- 3 - min_ego_edges <- 1 - # Drop rows for nodes with ewer than minumum required nodes and edges in ego - # network - scaled_reference_counts_o1 <- - scaled_reference_counts_o1[ - (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), - ] - scaled_reference_counts_o2 <- - scaled_reference_counts_o2[ - (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), - ] - density_indexes_o1 <- density_indexes_o1[ - (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges) - ] - density_indexes_o2 <- density_indexes_o2[ - (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges) - ] - # Average manually verified scaled reference counts across density bins - density_binned_reference_counts_o1 <- rbind( - (scaled_reference_counts_o1[1, ] + scaled_reference_counts_o1[2, ]) / 2, - (scaled_reference_counts_o1[4, ] + scaled_reference_counts_o1[5, ] + - scaled_reference_counts_o1[6, ]) / 3, - (scaled_reference_counts_o1[3, ] + - scaled_reference_counts_o1[7, ] + - scaled_reference_counts_o1[8, ]) / 3 - ) - rownames(density_binned_reference_counts_o1) <- 1:3 - density_binned_reference_counts_o2 <- rbind( - (scaled_reference_counts_o2[1, ] + scaled_reference_counts_o2[4, ]) / 2, - (scaled_reference_counts_o2[2, ] + scaled_reference_counts_o2[6, ] + - scaled_reference_counts_o2[7, ]) / 3, - (scaled_reference_counts_o2[3, ] + scaled_reference_counts_o2[5, ]) / 2, - (scaled_reference_counts_o2[8, ] + scaled_reference_counts_o2[9, ] + - scaled_reference_counts_o2[10, ]) / 3 - ) - rownames(density_binned_reference_counts_o2) <- 1:4 - - # Helper functions to calculate expected expected graphlet counts - expected_expected_graphlet_counts_o1_fn <- function(density_index, node_count) { - reference_counts <- density_binned_reference_counts_o1[density_index, ] - reference_counts * choose(node_count, graphlet_sizes) - } - expected_expected_graphlet_counts_o2_fn <- function(density_index, node_count) { - reference_counts <- density_binned_reference_counts_o2[density_index, ] - reference_counts * choose(node_count, graphlet_sizes) - } - # Calculate expected graphlet counts - expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( - density_indexes_o1, num_nodes_o1[(num_nodes_o1 >= min_ego_nodes)], - expected_expected_graphlet_counts_o1_fn - ))) - rownames(expected_expected_graphlet_counts_ego_o1) <- - names(ego_networks_o1[(num_nodes_o1 >= min_ego_nodes)]) - expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( - density_indexes_o2, num_nodes_o2[(num_nodes_o2 >= min_ego_nodes)], - expected_expected_graphlet_counts_o2_fn - ))) - rownames(expected_expected_graphlet_counts_ego_o2) <- - names(ego_networks_o2[(num_nodes_o2 >= min_ego_nodes)]) - - # Sanity check manually derived expected expected counts by comparing against - # pre-tested fully applied expected_graphlet_counts_ego function - expect_equal( - expected_expected_graphlet_counts_ego_o1, - netdis_expected_graphlet_counts_ego( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - density_breaks = breaks_o1, - density_binned_reference_counts_o1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - scale_fn=count_graphlet_tuples - ) - ) - expect_equal( - expected_expected_graphlet_counts_ego_o2, - netdis_expected_graphlet_counts_ego( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, - density_breaks = breaks_o2, - density_binned_reference_counts_o2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - scale_fn=count_graphlet_tuples - ) - ) - - # Generate partially applied functions using function under test - actual_expected_graphlet_counts_ego_fn_o1 <- - netdis_expected_graphlet_counts_ego_fn( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - min_bin_count = min_bin_count, - num_bins = num_bins, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - scale_fn=count_graphlet_tuples - ) - actual_expected_graphlet_counts_ego_fn_o2 <- - netdis_expected_graphlet_counts_ego_fn( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, - min_bin_count = min_bin_count, - num_bins = num_bins, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - scale_fn=count_graphlet_tuples - ) - # Generate actual expected accounts by applying generated functions to test - # graph - actual_expected_graphlet_counts_ego_o1 <- - actual_expected_graphlet_counts_ego_fn_o1(graph) - actual_expected_graphlet_counts_ego_o2 <- - actual_expected_graphlet_counts_ego_fn_o2(graph) - - # Compare actual to expected - expect_equal( - actual_expected_graphlet_counts_ego_o1, - expected_expected_graphlet_counts_ego_o1 - ) - expect_equal( - actual_expected_graphlet_counts_ego_o2, - expected_expected_graphlet_counts_ego_o2 - ) -}) - -context("Measures Netdis: Centered graphlet counts") -test_that("netdis_centred_graphlet_counts_ego is correct", { - # Set up small sample networks each with each graphlet represented in at least - # one ego network - ref_elist <- rbind( - c("n1", "n2"), - c("n1", "n3"), - c("n1", "n4"), - c("n1", "n5"), - c("n1", "n6"), - c("n2", "n7"), - c("n2", "n8"), - c("n2", "n9"), - c("n9", "n10"), - c("n10", "n11"), - c("n11", "n12"), - c("n11", "n13"), - c("n2", "n14"), - c("n8", "n14"), - c("n12", "n15"), - c("n12", "n16"), - c("n15", "n17"), - c("n12", "n18"), - c("n15", "n18"), - c("n16", "n17"), - c("n16", "n18"), - c("n17", "n18"), - c("n16", "n19"), - c("n16", "n20"), - c("n16", "n21"), - c("n19", "n20"), - c("n19", "n21"), - c("n15", "n22"), - c("n15", "n23"), - c("n15", "n24"), - c("n22", "n23"), - c("n22", "n24"), - c("n23", "n24") - ) - ref_graph <- igraph::graph_from_edgelist(ref_elist, directed = FALSE) - - query_elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - query_graph <- igraph::graph_from_edgelist(query_elist, directed = FALSE) - - max_graphlet_size <- 4 - # Use pre-tested functions to generate ego-network graphlet counts - # 1. Reference graph ego-network graphlet counts - ref_o1 <- count_graphlets_ego( - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, return_ego_networks = TRUE - ) - ego_counts_ref_o1 <- ref_o1$graphlet_counts - ego_networks_ref_o1 <- ref_o1$ego_networks - density_ref_o1 <- sapply(ego_networks_ref_o1, igraph::edge_density) - - ref_o2 <- count_graphlets_ego( - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, return_ego_networks = TRUE - ) - ego_counts_ref_o2 <- ref_o2$graphlet_counts - ego_networks_ref_o2 <- ref_o2$ego_networks - density_ref_o2 <- sapply(ego_networks_ref_o2, igraph::edge_density) - - # 2. Query graph ego-network graphlet countsa - query_o1 <- count_graphlets_ego( - query_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, return_ego_networks = TRUE - ) - ego_counts_query_o1 <- query_o1$graphlet_counts - ego_networks_query_o1 <- query_o1$ego_networks - density_query_o1 <- sapply(ego_networks_query_o1, igraph::edge_density) - - query_o2 <- count_graphlets_ego( - query_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, return_ego_networks = TRUE - ) - ego_counts_query_o2 <- query_o2$graphlet_counts - ego_networks_query_o2 <- query_o2$ego_networks - density_query_o2 <- sapply(ego_networks_query_o2, igraph::edge_density) - - centred_counts_k4 <- function(query_graphlet_count, ref_graphlet_count, - query_node_counts, ref_node_count, - min_nodes, min_edges, - min_bin_count, num_bins) { - graphlet_node_counts_k4 <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) - # 1. Calculate scaling factors for each reference and query graphlet count - # These are nCk, where n is the number of nodes in the network and - # k is the number of nodes in the graphlet - ref_scale_factor <- sapply( - graphlet_node_counts_k4, FUN <- function(k) { - choose(ref_node_count, k) - } - ) - query_scale_factor <- sapply( - graphlet_node_counts_k4, FUN <- function(k) { - choose(query_node_count, k) - } - ) - # 2. Calculate scaled reference counts by dividing by ref_scale_factor - ref_scaled_graphlet_count <- query_graphlet_count / ref_scale_factor - # - } -}) diff --git a/tests/testthat/test_orca_interface.R b/tests/testthat/test_orca_interface.R index 0cc59366..b6efd1f0 100644 --- a/tests/testthat/test_orca_interface.R +++ b/tests/testthat/test_orca_interface.R @@ -1391,6 +1391,101 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall ) }) +context("ORCA interface: Ego-network graphlet counts") +test_that("ego_to_graphlet_counts: Ego-network 4-node graphlet counts match manually verified totals for test graph", { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set node and graphlet labels to use for row and col names in expected counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + + max_graphlet_size <- 4 + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # Set manually verified counts + # 1-step ego networks + expected_counts_order_1 <- rbind( + c(6, 5, 2, 0, 1, 0, 2, 1, 0), + c(5, 5, 1, 0, 2, 0, 2, 0, 0), + c(1, 0, 0, 0, 0, 0, 0, 0, 0), + c(5, 2, 2, 0, 0, 0, 0, 1, 0), + c(1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 2, 1, 0, 0, 0, 1, 0, 0), + c(7, 3, 4, 0, 0, 0, 3, 0, 1), + c(7, 3, 4, 0, 0, 0, 3, 0, 1), + c(6, 0, 4, 0, 0, 0, 0, 0, 1), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) + ) + rownames(expected_counts_order_1) <- node_labels + colnames(expected_counts_order_1) <- graphlet_labels + # 2-step ego networks + expected_counts_order_2 <- rbind( + c(15, 18, 6, 21, 3, 1, 11, 1, 1), + c(8, 10, 2, 6, 3, 0, 4, 1, 0), + c(5, 5, 1, 0, 2, 0, 2, 0, 0), + c(10, 14, 2, 11, 3, 1, 5, 1, 0), + c(5, 5, 1, 0, 2, 0, 2, 0, 0), + c(13, 13, 6, 15, 1, 1, 9, 1, 1), + c(13, 13, 6, 15, 1, 1, 9, 1, 1), + c(11, 10, 5, 10, 0, 1, 8, 0, 1), + c(9, 8, 4, 4, 0, 1, 6, 0, 1), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) + ) + rownames(expected_counts_order_2) <- node_labels + colnames(expected_counts_order_2) <- graphlet_labels + + # Count graphlets in each ego network of the graph with only counts requested + min_ego_nodes <- 0 + min_ego_edges <- 0 + + # Test that actual and returned ego graphlet counts match + # 1. Generate ego networks with previously tested function. + ego_networks_order_1 <- make_named_ego_graph(graph, + order = 1, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + ego_networks_order_2 <- make_named_ego_graph(graph, + order = 2, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges + ) + + #2. Calculate counts with ego_to_graphlet_counts. + actual_counts_order_1 <- + ego_to_graphlet_counts(ego_networks_order_1, + max_graphlet_size = max_graphlet_size + ) + actual_counts_order_2 <- + ego_to_graphlet_counts(ego_networks_order_2, + max_graphlet_size = max_graphlet_size + ) + + # 3. Test that actual counts match expected + expect_equal(actual_counts_order_1, expected_counts_order_1) + expect_equal(actual_counts_order_2, expected_counts_order_2) + +}) + # context("ORCA interface: Graphlet-based degree distributions") # test_that("gdd works", { # graph <- netdist::virusppi$EBV diff --git a/vignettes/dendrogram_example_net_dis.Rmd b/vignettes/dendrogram_example_net_dis.Rmd index 24c53b7a..e8dce40e 100644 --- a/vignettes/dendrogram_example_net_dis.Rmd +++ b/vignettes/dendrogram_example_net_dis.Rmd @@ -31,17 +31,21 @@ properties. This can be avoided by setting the relevant `as_undirected`, library("netdist") edge_format = "ncol" # Load reference graph (used for Netdis. Not required for NetEMD) -ref_path = file.path(system.file(file.path("extdata", "random"), package = "netdist"), "ER_1250_10_1") +ref_path = file.path(system.file(file.path("extdata", "random"), + package = "netdist"), + "ER_1250_10_1") ref_graph <- read_simple_graph(ref_path, format = edge_format) # Set source directory and file properties for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -edge_format = "ncol" -file_pattern = "*" +source_dir <- system.file(file.path("extdata", "VRPINS"), + package = "netdist") +edge_format <- "ncol" +file_pattern <- "*" # Load all graphs in the source folder matching the filename pattern -query_graphs <- read_simple_graphs(source_dir, format = edge_format, - pattern = file_pattern) +query_graphs <- read_simple_graphs(source_dir, + format = edge_format, + pattern = file_pattern) print(names(query_graphs)) ``` @@ -54,60 +58,67 @@ max_graphlet_size <- 4 neighbourhood_size <- 2 ``` -## Generate a function to generate expected graphlet counts -Use `netdis_expected_graphlet_counts_ego_fn` to generate a function that -calculates expected ego-network graphlet counts for query graphs based on the -statistics of a provided reference graph. - +## Generate NetDis measures between each pair of query graphs ```{r} -expected_count_fn <- netdis_expected_graphlet_counts_ego_fn( - ref_graph, max_graphlet_size, neighbourhood_size) -``` -## Generate centred graphlet counts for a set of query graphs -```{r} -centred_counts <- purrr::map(query_graphs, netdis_centred_graphlet_counts, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - expected_ego_count_fn = expected_count_fn) -``` +# Calculate netdis measure for graphlets up to size max_graphlet_size +netdis_result <- netdis_many_to_many(query_graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size) -## Generate NetDis measures between each pair of query graphs -```{r} # Netdis measure for graphlets of size 3 -res3 <- netdis_for_all_graphs(centred_counts, 3) -netdis3_mat <- cross_comp_to_matrix(res3$netdis, res3$comp_spec) +res3 <- netdis_result$netdis["netdis3", ] +netdis3_mat <- cross_comp_to_matrix(res3, netdis_result$comp_spec) + +print("Netdis: graphlet size = 3") +print(netdis3_mat) + # Netdis measure for graphlets of size 4 -res4 <- netdis_for_all_graphs(centred_counts, 4) -netdis4_mat <- cross_comp_to_matrix(res4$netdis, res4$comp_spec) -netdis4_mat +res4 <- netdis_result$netdis["netdis4", ] +netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec) + +print("Netdis: graphlet size = 4") +print(netdis4_mat) ``` ## Generate dendrograms ```{r} -graphdists<-as.dist(netdis4_mat) -par(mfrow=c(1,2)) -cex=1 +graphdists <- as.dist(netdis4_mat) +par(mfrow = c(1, 2)) +cex <- 1 + # Dendrogram based on Netdis measure for graphlets of size 3 -title = paste("Netdis: graphlet size = ", 3, sep = "") -plot(phangorn::upgma(as.dist(netdis3_mat), method="average"), use.edge.length=FALSE, - edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, - cex.sub=cex, cex=cex) +title <- paste("Netdis: graphlet size = ", 3, sep = "") +plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) + # Dendrogram based on Netdis measure for graphlets of size 4 title = paste("Netdis: graphlet size = ", 4, sep = "") -plot(phangorn::upgma(as.dist(netdis4_mat), method="average"), use.edge.length=FALSE, - edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, - cex.sub=cex, cex=cex) +plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) ``` ```{r} -cex=1.5 +cex <- 1.5 col <- colorRampPalette(colors = c("blue","white"))(100) -title = paste("Netdis: graphlet size = ", 3, sep = "") -heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) +title <- paste("Netdis: graphlet size = ", 3, sep = "") +heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) ``` ```{r} -cex=1.5 +cex <- 1.5 col <- colorRampPalette(colors = c("blue","white"))(100) -title = paste("Netdis: graphlet size = ", 4, sep = "") -heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) +title <- paste("Netdis: graphlet size = ", 4, sep = "") +heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) ``` \ No newline at end of file diff --git a/vignettes/dendrogram_example_net_dis_new.Rmd b/vignettes/dendrogram_example_net_dis_new.Rmd deleted file mode 100644 index 637eef3b..00000000 --- a/vignettes/dendrogram_example_net_dis_new.Rmd +++ /dev/null @@ -1,122 +0,0 @@ ---- -title: "Dendrogram example for Netdis" -author: "Martin O'Reilly" -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Dendrogram example for Netdis} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -## Virus PPI example for Netdis - -### Load graphs -Use `read_simple_graphs` to read graph data from all files in a directory that -match a specific filename pattern in a format suitable for calculating -graphlet-based feature counts using the -[ORCA package](https://CRAN.R-project.org/package=orca). -We use `igraph::read_graph` to read graph data from files, so support -all file formats it supports. See help for `igraph::read_graph` for a list of -supported values for the `format` parameter and the [igraph documentation](http://igraph.org/c/doc/igraph-Foreign.html#igraph_read_graph_edgelist) -for descriptions of each of the supported file formats. - -The ORCA package we use to efficiently calculate graphlet and orbit counts -requires that graphs are _undirected_, _simple_ (i.e. have no self-loops or -multiple edges) and _connected_ (i.e. have no isolated vertices). Therefore, by -default, graphs loaded by `read_simple_graphs` will be coerced to have the above -properties. This can be avoided by setting the relevant `as_undirected`, -`remove_loops`, `remove_multiple` or `remove_isolates` parameters to `FALSE`. -```{r} -library("netdist") -edge_format = "ncol" -# Load reference graph (used for Netdis. Not required for NetEMD) -ref_path = file.path(system.file(file.path("extdata", "random"), - package = "netdist"), - "ER_1250_10_1") -ref_graph <- read_simple_graph(ref_path, format = edge_format) - -# Set source directory and file properties for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), - package = "netdist") -edge_format <- "ncol" -file_pattern <- "*" - -# Load all graphs in the source folder matching the filename pattern -query_graphs <- read_simple_graphs(source_dir, - format = edge_format, - pattern = file_pattern) -print(names(query_graphs)) -``` - -In this example we will use counts of graphlets containing up to 4 nodes and -consider ego-network neighbourhoods of size 2 (i.e. the immediate neighbours of -each node plus their immediate neighbours). -```{r} -# Set the maximum graphlet size to compute counts for -max_graphlet_size <- 4 -neighbourhood_size <- 2 -``` - -## Generate NetDis measures between each pair of query graphs -```{r} - -# Calculate netdis measure for graphlets up to size max_graphlet_size -netdis_result <- netdis_many_to_many(query_graphs, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size) - -# Netdis measure for graphlets of size 3 -res3 <- netdis_result$netdis["netdis3", ] -netdis3_mat <- cross_comp_to_matrix(res3, netdis_result$comp_spec) - -print(netdis3_mat) - -# Netdis measure for graphlets of size 4 -res4 <- netdis_result$netdis["netdis4", ] -netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec) - -print(netdis4_mat) -``` - -## Generate dendrograms -```{r} -graphdists <- as.dist(netdis4_mat) -par(mfrow = c(1, 2)) -cex <- 1 - -# Dendrogram based on Netdis measure for graphlets of size 3 -title <- paste("Netdis: graphlet size = ", 3, sep = "") -plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"), - use.edge.length = FALSE, - edge.width = cex*2, - main = title, - cex.lab = cex, cex.axis = cex, - cex.main = cex, cex.sub = cex, - cex = cex) - -# Dendrogram based on Netdis measure for graphlets of size 4 -title = paste("Netdis: graphlet size = ", 4, sep = "") -plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"), - use.edge.length = FALSE, - edge.width = cex*2, - main = title, - cex.lab = cex, cex.axis = cex, - cex.main = cex, cex.sub = cex, - cex = cex) -``` -```{r} -cex <- 1.5 -col <- colorRampPalette(colors = c("blue","white"))(100) -title <- paste("Netdis: graphlet size = ", 3, sep = "") -heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, - cexRow = cex, cexCol = cex, symm = TRUE) -``` -```{r} -cex <- 1.5 -col <- colorRampPalette(colors = c("blue","white"))(100) -title <- paste("Netdis: graphlet size = ", 4, sep = "") -heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title, - cexRow = cex, cexCol = cex, symm = TRUE) -``` \ No newline at end of file diff --git a/vignettes/quickstart_netdis_comparison.Rmd b/vignettes/netdis_pairwise_comparisons.Rmd similarity index 97% rename from vignettes/quickstart_netdis_comparison.Rmd rename to vignettes/netdis_pairwise_comparisons.Rmd index b5aefe61..d5809c4d 100644 --- a/vignettes/quickstart_netdis_comparison.Rmd +++ b/vignettes/netdis_pairwise_comparisons.Rmd @@ -1,5 +1,5 @@ --- -title: "Usage of netdis with different pairwise comparison options." +title: "Usage of netdis interfaces for different pairwise comparison options." author: "Jack Roberts" date: "`r Sys.Date()`" output: rmarkdown::html_vignette From f6fb92ec2da8d3770cfa16f637be066ddc154785 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Mon, 9 Sep 2019 18:38:23 +0100 Subject: [PATCH 037/188] few more tests --- tests/testthat/test_measures_net_dis.R | 300 +++++++++++++++++++++---- 1 file changed, 260 insertions(+), 40 deletions(-) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 5615efec..39615699 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -516,17 +516,117 @@ test_that("Ego-network 4-node density-binned reference counts match manually ver ) }) +context("Measures Netdis: scale_graphlet_counts_ego for manually verified networks") +test_that("Ego-network 4-node graphlet counts match manually verified totals", { + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + + # Set node and graphlet labels to use for row and col names in expected counts + node_labels <- igraph::V(graph)$name + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + + # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 + max_graphlet_size <- 4 + min_ego_edges <- 0 + min_ego_nodes <- 0 + + # Use previously tested functions to generate ego networks and calcualte graphlet + # counts. + #ego nets + ego_networks_o1 <- make_named_ego_graph(graph, + order = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + ego_networks_o2 <- make_named_ego_graph(graph, + order = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + + #graphlet counts + graphlet_counts_o1 <- + ego_to_graphlet_counts(ego_networks_o1, + max_graphlet_size = max_graphlet_size + ) + graphlet_counts_o2 <- + ego_to_graphlet_counts(ego_networks_o2, + max_graphlet_size = max_graphlet_size + ) + + + # Calculate scaled counts with scale_graphlet_counts_ego + # (function to test). + actual_counts_o1 <- + scale_graphlet_counts_ego(ego_networks_o1, + graphlet_counts_o1, + max_graphlet_size = max_graphlet_size + ) + actual_counts_o2 <- + scale_graphlet_counts_ego(ego_networks_o2, + graphlet_counts_o2, + max_graphlet_size = max_graphlet_size + ) + + graphlet_key <- graphlet_key(max_graphlet_size) + k <- graphlet_key$node_count + # Set manually verified counts + # 1-step ego networks + expected_counts_o1 <- rbind( + c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), + c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), + c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), + c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) + ) + rownames(expected_counts_o1) <- node_labels + colnames(expected_counts_o1) <- graphlet_labels + # 2-step ego networks + expected_counts_o2 <- rbind( + c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), + c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), + c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), + c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), + c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) + ) + rownames(expected_counts_o2) <- node_labels + colnames(expected_counts_o2) <- graphlet_labels + + # Test that actual counts match expected + expect_equal(actual_counts_o1, expected_counts_o1) + expect_equal(actual_counts_o2, expected_counts_o2) +}) - - - - - - - -# JACK ---------------------------------------------------------------------- context("Measures Netdis: Ego-network density-binned counts for manually verified networks") -test_that("Ego-network 4-node density-binned counts match manually verified totals", { +test_that("density_binned_counts output matches manually verified totals with different scaling and aggregation functions", { # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets elist <- rbind( @@ -552,6 +652,8 @@ test_that("Ego-network 4-node density-binned counts match manually verified tota max_graphlet_size <- 4 min_counts_per_interval <- 2 num_intervals <- 100 + min_ego_edges <- 0 + min_ego_nodes <- 0 # Set node and graphlet labels to use for row and col names in expected counts node_labels <- igraph::V(graph)$name @@ -582,13 +684,6 @@ test_that("Ego-network 4-node density-binned counts match manually verified tota interval_indexes = expected_interval_indexes_o1, breaks = expected_breaks_o1 ) - # Check binned densities are as expected - actual_binned_densities_o1 <- binned_densities_adaptive( - expected_densities_o1, - min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals - ) - expect_equal(actual_binned_densities_o1, expected_binned_densities_o1) # 2. Ego-networks of order 2 expected_min_break_o2 <- 1 / 3 expected_max_break_o2 <- 0.6 @@ -601,14 +696,7 @@ test_that("Ego-network 4-node density-binned counts match manually verified tota interval_indexes = expected_interval_indexes_o2, breaks = expected_breaks_o2 ) - # Check binned densities are as expected - actual_binned_densities_o2 <- binned_densities_adaptive( - expected_densities_o2, - min_counts_per_interval = min_counts_per_interval, - num_intervals = num_intervals - ) - expect_equal(actual_binned_densities_o2, expected_binned_densities_o2) - + # Set manually verified scaled ego-network graphlet counts graphlet_key <- graphlet_key(max_graphlet_size) k <- graphlet_key$node_count @@ -669,31 +757,163 @@ test_that("Ego-network 4-node density-binned counts match manually verified tota ) rownames(expected_mean_density_binned_counts_o2) <- 1:4 - # Calculate actual output of function under test - actual_mean_density_binned_counts_o1 <- mean_density_binned_graphlet_counts( - expected_counts_o1, expected_interval_indexes_o1 - ) - actual_mean_density_binned_counts_o2 <- mean_density_binned_graphlet_counts( - expected_counts_o2, expected_interval_indexes_o2 - ) + # density_binned_counts with default arguments should give + # mean graphlet count in each density bin + actual_density_binned_counts_o1 <- density_binned_counts( + expected_counts_o1, + expected_interval_indexes_o1) + + actual_density_binned_counts_o2 <- density_binned_counts( + expected_counts_o2, + expected_interval_indexes_o2) # Check actual output vs expected expect_equal( - actual_mean_density_binned_counts_o1, + actual_density_binned_counts_o1, expected_mean_density_binned_counts_o1 ) expect_equal( - actual_mean_density_binned_counts_o2, + actual_density_binned_counts_o2, expected_mean_density_binned_counts_o2 ) + + # Calculate max binned counts based on manually verified counts + # and density bins + # Order 1: Expected interval indexes: 1, 1, 3, 3, 3, 2, 2, 2, 3, 3 + # apply(x, 2, max): returns max of each column in x + max_counts_bin1_o1 <- apply(rbind(expected_counts_o1[1, ], expected_counts_o1[2, ]), 2, max) + max_counts_bin2_o1 <- apply(rbind(expected_counts_o1[6, ], expected_counts_o1[7, ], + expected_counts_o1[8, ]), 2, max) + max_counts_bin3_o1 <- apply(rbind(expected_counts_o1[3, ], expected_counts_o1[4, ], + expected_counts_o1[5, ], expected_counts_o1[9, ], + expected_counts_o1[10, ]), 2, max) + + expected_max_density_binned_counts_o1 <- rbind( + max_counts_bin1_o1, max_counts_bin2_o1, max_counts_bin3_o1 + ) + rownames(expected_max_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 + max_counts_bin1_o2 <- apply(rbind(expected_counts_o2[1, ], expected_counts_o2[4, ]), 2, max) + max_counts_bin2_o2 <- apply(rbind(expected_counts_o2[2, ], expected_counts_o2[6, ], + expected_counts_o2[7, ]), 2, max) + max_counts_bin3_o2 <- apply(rbind(expected_counts_o2[3, ], expected_counts_o2[5, ]), 2, max) + max_counts_bin4_o2 <- apply(rbind(expected_counts_o2[8, ], expected_counts_o2[9, ], + expected_counts_o2[10, ]), 2, max) + + expected_max_density_binned_counts_o2 <- rbind( + max_counts_bin1_o2, max_counts_bin2_o2, max_counts_bin3_o2, + max_counts_bin4_o2 + ) + rownames(expected_max_density_binned_counts_o2) <- 1:4 + + # density_binned_counts with agg_fn = max should give + # max graphlet count in each density bin + agg_fn <- max + scale_fn <- NULL + + actual_max_density_binned_counts_o1 <- density_binned_counts( + expected_counts_o1, + expected_interval_indexes_o1, + agg_fn = agg_fn, + scale_fn = scale_fn) + + actual_max_density_binned_counts_o2 <- density_binned_counts( + expected_counts_o2, + expected_interval_indexes_o2, + agg_fn = agg_fn, + scale_fn = scale_fn) + + # Check actual output vs expected + expect_equal( + actual_max_density_binned_counts_o1, + expected_max_density_binned_counts_o1 + ) + expect_equal( + actual_max_density_binned_counts_o2, + expected_max_density_binned_counts_o2 + ) + + # density_binned_counts with scale_fn = scale_graphlet_counts_ego + # should give mean graphlet counts in each density bin scaled by + # count_graphlet_tuples. + agg_fn <- mean + scale_fn <- scale_graphlet_counts_ego + + # generate ego networks using previously tested function + ego_networks_o1 <- make_named_ego_graph(graph, + order = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + ego_networks_o2 <- make_named_ego_graph(graph, + order = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + # calculate expected counts using previously tested function + expected_scaled_counts_o1 <- + scale_graphlet_counts_ego(ego_networks_o1, + expected_counts_o1, + max_graphlet_size = max_graphlet_size + ) + expected_scaled_counts_o2 <- + scale_graphlet_counts_ego(ego_networks_o2, + expected_counts_o2, + max_graphlet_size = max_graphlet_size + ) + + # calculate mean expected counts using expected density bins + mean_scaled_counts_bin1_o1 <- (expected_scaled_counts_o1[1, ] + expected_scaled_counts_o1[2, ]) / 2 + mean_scaled_counts_bin2_o1 <- (expected_scaled_counts_o1[6, ] + expected_scaled_counts_o1[7, ] + + expected_scaled_counts_o1[8, ]) / 3 + mean_scaled_counts_bin3_o1 <- (expected_scaled_counts_o1[3, ] + expected_scaled_counts_o1[4, ] + + expected_scaled_counts_o1[5, ] + expected_scaled_counts_o1[9, ] + + expected_scaled_counts_o1[10, ]) / 5 + expected_scaled_density_binned_counts_o1 <- rbind( + mean_scaled_counts_bin1_o1, mean_scaled_counts_bin2_o1, mean_scaled_counts_bin3_o1 + ) + rownames(expected_scaled_density_binned_counts_o1) <- 1:3 + # Order 2: Expected interval indexes: 1, 3, 3, 1, 3, 2, 2, 4, 4, 4 + mean_scaled_counts_bin1_o2 <- (expected_scaled_counts_o2[1, ] + expected_scaled_counts_o2[4, ]) / 2 + mean_scaled_counts_bin2_o2 <- (expected_scaled_counts_o2[2, ] + expected_scaled_counts_o2[6, ] + + expected_scaled_counts_o2[7, ]) / 3 + mean_scaled_counts_bin3_o2 <- (expected_scaled_counts_o2[3, ] + expected_scaled_counts_o2[5, ]) / 2 + mean_scaled_counts_bin4_o2 <- (expected_scaled_counts_o2[8, ] + expected_scaled_counts_o2[9, ] + + expected_scaled_counts_o2[10, ]) / 3 + expected_scaled_density_binned_counts_o2 <- rbind( + mean_scaled_counts_bin1_o2, mean_scaled_counts_bin2_o2, mean_scaled_counts_bin3_o2, + mean_scaled_counts_bin4_o2 + ) + rownames(expected_scaled_density_binned_counts_o2) <- 1:4 + + # Calculate scaled binned counts with density_binned_counts (function to test) + actual_scaled_density_binned_counts_o1 <- density_binned_counts( + expected_counts_o1, + expected_interval_indexes_o1, + agg_fn = agg_fn, + scale_fn = scale_fn, + ego_networks = ego_networks_o1, + max_graphlet_size = max_graphlet_size) + + actual_scaled_density_binned_counts_o2 <- density_binned_counts( + expected_counts_o2, + expected_interval_indexes_o2, + agg_fn = agg_fn, + scale_fn = scale_fn, + ego_networks = ego_networks_o2, + max_graphlet_size = max_graphlet_size) + + # Check actual output vs expected + expect_equal( + actual_scaled_density_binned_counts_o1, + expected_scaled_density_binned_counts_o1 + ) + expect_equal( + actual_scaled_density_binned_counts_o2, + expected_scaled_density_binned_counts_o2 + ) + }) -# /JACK ---------------------------------------------------------------------- - - - - - - context("Measures Netdis: Expected graphlet counts") test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { From 74e684961e5bd536830d4dd3a3622fb9aad8427a Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 10 Sep 2019 17:49:38 +0100 Subject: [PATCH 038/188] test for netdis_expected_graphlet_counts_per_ego --- R/measures_net_dis.R | 7 +- tests/testthat/test_measures_net_dis.R | 233 ++++++++++++++++++++++++- 2 files changed, 229 insertions(+), 11 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 9460dd15..17eb3d5b 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -782,7 +782,9 @@ netdis_expected_graphlet_counts_per_ego <- function( names(expected_graphlet_counts) <- names(ego_networks) # Simplify list to array - t(simplify2array(expected_graphlet_counts)) + expected_graphlet_counts <- t(simplify2array(expected_graphlet_counts)) + colnames(expected_graphlet_counts) <- graphlet_key(max_graphlet_size)$id + expected_graphlet_counts } #' INTERNAL FUNCTION - Do not call directly @@ -818,7 +820,7 @@ netdis_expected_graphlet_counts <- function(graph, matched_reference_counts <- density_binned_reference_counts[matched_density_index, ] - + if (!is.null(scale_fn)) { # Scale reference counts e.g. by multiplying the # reference count for each graphlet by the number @@ -827,7 +829,6 @@ netdis_expected_graphlet_counts <- function(graph, matched_reference_counts <- matched_reference_counts * scale_fn(graph, max_graphlet_size) } - matched_reference_counts } diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 39615699..85ec52da 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -952,6 +952,33 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { num_nodes <- rep(120, 10) graphs <- purrr::map2(num_nodes, densities, rand_graph) + # WITH scale_fn = NULL (bin counts directly with no scaling) + # Helper function to calculate expected expected graphlet counts + expected_expected_graphlet_counts_fn <- function(density_index) { + scaled_reference_counts[density_index, ] + } + # Determine expected and actual expected graphlet counts + expected_expected_graphlet_counts <- + purrr::map(density_indexes, expected_expected_graphlet_counts_fn) + actual_expected_graphlet_counts <- + purrr::map(graphs, netdis_expected_graphlet_counts, + max_graphlet_size = max_graphlet_size, + density_breaks = density_breaks, + density_binned_reference_counts = scaled_reference_counts, + scale_fn = NULL + ) + # Loop over each graph and compare expected with actual + # NOTE: v2.0.0 of testthat library made a breaking change that means using + # map, mapply etc can cause failures under certain conditions + # See: https://github.com/r-lib/testthat/releases/tag/v2.0.0 + for (i in 1:length(actual_expected_graphlet_counts)) { + expect_equal( + actual_expected_graphlet_counts[i], + expected_expected_graphlet_counts[i] + ) + } + + # WITH scale_fn = count_graphlet_tuples (default netdis from paper) # Helper function to calculate expected expected graphlet counts expected_expected_graphlet_counts_fn <- function(density_index, node_count) { reference_counts <- scaled_reference_counts[density_index, ] @@ -965,7 +992,7 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, density_binned_reference_counts = scaled_reference_counts, - scale_fn=count_graphlet_tuples + scale_fn = count_graphlet_tuples ) # Loop over each graph and compare expected with actual # NOTE: v2.0.0 of testthat library made a breaking change that means using @@ -979,6 +1006,7 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { } }) + test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes", { # Helper function to generate graphs with known density and number of nodes # Set up a small sample network with at least one ego-network that contains @@ -1094,7 +1122,7 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes neighbourhood_size = 1, density_breaks = breaks, density_binned_reference_counts = scaled_reference_counts, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_fn=count_graphlet_tuples + scale_fn = count_graphlet_tuples ) actual_expected_graphlet_counts_ego_o2 <- netdis_expected_graphlet_counts_ego( @@ -1103,13 +1131,195 @@ test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes neighbourhood_size = 2, density_breaks = breaks, density_binned_reference_counts = scaled_reference_counts, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_fn=count_graphlet_tuples + scale_fn = count_graphlet_tuples ) # Compare actual to expected expect_equal( actual_expected_graphlet_counts_ego_o1, - actual_expected_graphlet_counts_ego_o1 + expected_expected_graphlet_counts_ego_o1 + ) + expect_equal( + actual_expected_graphlet_counts_ego_o2, + expected_expected_graphlet_counts_ego_o2 + ) +}) + +test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 nodes", { + # Helper function to generate graphs with known density and number of nodes + # Set up a small sample network with at least one ego-network that contains + # at least one of each graphlets + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) + max_graphlet_size <- 4 + min_ego_edges <- 0 + min_ego_nodes <- 0 + + # Make graph ego networks + ego_networks_o1 <- make_named_ego_graph(graph, + order = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + ego_networks_o2 <- make_named_ego_graph(graph, + order = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes + ) + # Set manually-verified node counts and densities + # 1. Ego-networks of order 1 + num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) + num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) + max_edges_o1 <- choose(num_nodes_o1, 2) + densities_o1 <- num_edges_o1 / max_edges_o1 + # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 + # 2. Ego-networks of order 2 + num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) + num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) + max_edges_o2 <- choose(num_nodes_o2, 2) + densities_o2 <- num_edges_o2 / max_edges_o2 + # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 + # Set manually defined density breaks and indexes + breaks <- c(0, 0.11, 0.21, 0.31, 0.41, 0.51, 0.61, 0.71, 0.81, 0.91, 1.0) + density_indexes_o1 <- c(6, 5, 10, 9, 10, 7, 7, 7, 10, 10) + density_indexes_o2 <- c(4, 4, 5, 4, 5, 5, 5, 6, 6, 6) + # Set dummy reference counts + scaled_reference_counts <- rbind( + c(1, 2, 3, 4, 5, 6, 7, 8, 9), + c(11, 12, 13, 14, 15, 16, 17, 18, 19), + c(21, 22, 23, 24, 25, 26, 27, 28, 29), + c(31, 32, 33, 34, 35, 36, 37, 38, 39), + c(41, 42, 43, 44, 45, 46, 47, 48, 49), + c(51, 52, 53, 54, 55, 56, 57, 58, 59), + c(61, 62, 63, 64, 65, 66, 67, 68, 69), + c(71, 72, 73, 74, 75, 76, 77, 78, 79), + c(81, 82, 83, 84, 85, 86, 87, 88, 89), + c(91, 92, 93, 94, 95, 96, 97, 98, 99) + ) + expected_dims <- dim(scaled_reference_counts) + min_ego_nodes <- 3 + min_ego_edges <- 1 + + #------------------------------------------------------- + # With scale_fn = count_graphlet_tuples (default netdis paper) + #------------------------------------------------------- + # Helper function to calculate expected expected graphlet counts + expected_expected_graphlet_counts_fn <- function(density_index, node_count) { + reference_counts <- scaled_reference_counts[density_index, ] + reference_counts * choose(node_count, graphlet_sizes) + } + # Calculate expected graphlet counts. NOTE: We expect a matrix with graphlet + # types as columns and ego networks for nodes in graph as rows + expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( + density_indexes_o1, num_nodes_o1, expected_expected_graphlet_counts_fn + ))) + expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( + density_indexes_o2, num_nodes_o2, expected_expected_graphlet_counts_fn + ))) + # Sanity check for expected output shape. Should be matrix with graphlet types + # as columns and nodes as rows + expect_equal(dim(expected_expected_graphlet_counts_ego_o1), expected_dims) + expect_equal(dim(expected_expected_graphlet_counts_ego_o2), expected_dims) + # Set column labels to graphlet names + colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels + colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels + # Set row labels to ego network names + rownames(expected_expected_graphlet_counts_ego_o1) <- names(ego_networks_o1) + rownames(expected_expected_graphlet_counts_ego_o2) <- names(ego_networks_o2) + + # Calculate actual output of function under test + actual_expected_graphlet_counts_ego_o1 <- + netdis_expected_graphlet_counts_per_ego( + ego_networks_o1, + breaks, + scaled_reference_counts, + max_graphlet_size, + scale_fn = count_graphlet_tuples + ) + actual_expected_graphlet_counts_ego_o2 <- + netdis_expected_graphlet_counts_per_ego( + ego_networks_o2, + breaks, + scaled_reference_counts, + max_graphlet_size, + scale_fn = count_graphlet_tuples + ) + + # Compare actual to expected + expect_equal( + actual_expected_graphlet_counts_ego_o1, + expected_expected_graphlet_counts_ego_o1 + ) + expect_equal( + actual_expected_graphlet_counts_ego_o2, + expected_expected_graphlet_counts_ego_o2 + ) + + #------------------------------------------------------- + # With scale_fn = NULL (take reference counts directly) + #------------------------------------------------------- + # Helper function to calculate expected expected graphlet counts + expected_expected_graphlet_counts_fn <- function(density_index) { + scaled_reference_counts[density_index, ] + } + # Calculate expected graphlet counts. NOTE: We expect a matrix with graphlet + # types as columns and ego networks for nodes in graph as rows + expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map( + density_indexes_o1, expected_expected_graphlet_counts_fn + ))) + expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map( + density_indexes_o2, expected_expected_graphlet_counts_fn + ))) + # Sanity check for expected output shape. Should be matrix with graphlet types + # as columns and nodes as rows + expect_equal(dim(expected_expected_graphlet_counts_ego_o1), expected_dims) + expect_equal(dim(expected_expected_graphlet_counts_ego_o2), expected_dims) + # Set column labels to graphlet names + colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels + colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels + # Set row labels to ego network names + rownames(expected_expected_graphlet_counts_ego_o1) <- names(ego_networks_o1) + rownames(expected_expected_graphlet_counts_ego_o2) <- names(ego_networks_o2) + + # Calculate actual output of function under test + actual_expected_graphlet_counts_ego_o1 <- + netdis_expected_graphlet_counts_per_ego( + ego_networks_o1, + breaks, + scaled_reference_counts, + max_graphlet_size, + scale_fn = NULL + ) + actual_expected_graphlet_counts_ego_o2 <- + netdis_expected_graphlet_counts_per_ego( + ego_networks_o2, + breaks, + scaled_reference_counts, + max_graphlet_size, + scale_fn = NULL + ) + # Compare actual to expected + expect_equal( + actual_expected_graphlet_counts_ego_o1, + expected_expected_graphlet_counts_ego_o1 ) expect_equal( actual_expected_graphlet_counts_ego_o2, @@ -1313,7 +1523,7 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no density_binned_reference_counts_o1, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_fn=count_graphlet_tuples + scale_fn = count_graphlet_tuples ) ) expect_equal( @@ -1326,7 +1536,7 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no density_binned_reference_counts_o2, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_fn=count_graphlet_tuples + scale_fn = count_graphlet_tuples ) ) @@ -1340,7 +1550,7 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no num_bins = num_bins, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_fn=count_graphlet_tuples + scale_fn = count_graphlet_tuples ) actual_expected_graphlet_counts_ego_fn_o2 <- netdis_expected_graphlet_counts_ego_fn( @@ -1351,7 +1561,7 @@ test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 no num_bins = num_bins, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_fn=count_graphlet_tuples + scale_fn = count_graphlet_tuples ) # Generate actual expected accounts by applying generated functions to test # graph @@ -1494,3 +1704,10 @@ test_that("netdis_centred_graphlet_counts_ego is correct", { # } }) + + + +context("Netdis: Statistic calculation") +test_that("netdis statistic function is correct", { + +}) From 65c2ea79562ac5fa0ead2217005d3f07193d526d Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Wed, 11 Sep 2019 11:07:35 +0100 Subject: [PATCH 039/188] commit to mark pre-removal of old functions --- R/measures_net_dis.R | 18 +++++++++++++++++- R/orca_interface.R | 9 ++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 17eb3d5b..5d0438b1 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -1,3 +1,4 @@ +#' FLAGUSED #' Netdis between two graphs #' @param graph_1 First query graph #' @param graph_2 Second query graph @@ -67,6 +68,7 @@ netdis_one_to_one <- function(graph_1, graph_2, result$netdis[, 1] } +#' FLAGUSED #' Netdis comparisons between one graph and many other graphs #' @param graph_1 query graph - this graph will be compared with #' all graphs in graphs_compare @@ -141,6 +143,7 @@ netdis_one_to_many <- function(graph_1, graphs_compare, } +#' FLAGUSED #' Netdis between all graph pairs #' @param graphs Query graphs #' @param ref_graph Reference graph @@ -356,6 +359,7 @@ netdis_for_all_graphs <- function(centred_graphlet_counts, list(netdis = netdis, comp_spec = comp_spec) } +#' FLAGUSED #' Netdis #' #' Calculate Netdis statistic between two graphs from their Centred Graphlet @@ -387,6 +391,7 @@ netdis <- function(centred_graphlet_counts1, centred_graphlet_counts2, 0.5 * (1 - netds2) } +#' FLAGUSED #' Netdis - graphlets up to max_graphlet_size #' #' Calculate Netdis statistic between two graphs from their Centred Graphlet @@ -737,6 +742,7 @@ netdis_expected_graphlet_counts_ego <- function(graph, t(simplify2array(expected_graphlet_counts)) } +#' FLAGUSED #' INTERNAL FUNCTION - Do not call directly #' #' JACK To follow through logic of paper steps, wanted to pass @@ -787,6 +793,7 @@ netdis_expected_graphlet_counts_per_ego <- function( expected_graphlet_counts } +#' FLAGUSED #' INTERNAL FUNCTION - Do not call directly #' #' Used by \code{netdis_expected_graphlet_counts_ego} to @@ -832,6 +839,7 @@ netdis_expected_graphlet_counts <- function(graph, matched_reference_counts } +#' FLAGUSED #' INTERNAL FUNCTION - Do not call directly #' #' Used by \code{netdis_expected_graphlet_counts_ego_fn} to @@ -871,6 +879,7 @@ mean_density_binned_graphlet_counts <- function(graphlet_counts, mean_density_binned_graphlet_counts } +#' FLAGUSED #' For case where don't want to use binning, return a single bin which covers #' the full range of possible density values. #' @param densities Ego network density values (only used to return @@ -882,6 +891,7 @@ single_density_bin <- function(densities) { breaks = c(0, 1)) } +#' FLAGUSED #' INTERNAL FUNCTION - Do not call directly #' #' Used to calculate expected graphlet counts for each density bin. @@ -919,6 +929,7 @@ density_binned_counts <- function(graphlet_counts, } +#' FLAGUSED #' Calculate expected counts in density bins using #' geometric poisson (Polya-Aeppli) approximation. #' @param graphlet_counts Graphlet counts for a number of ego_networks. @@ -971,6 +982,7 @@ density_binned_counts_gp <- function(graphlet_counts, } +#' FLAGUSED #' Replace zero values in a vector with ones. Used by #' \code{scale_graphlet_count} to prevent divide by #' zero errors. @@ -984,6 +996,7 @@ zeros_to_ones <- function(v) { } +#' FLAGUSED #' Divide graphlet counts by pre-computed scaling factor from #' \code{count_graphlet_tuples} output. #' @param graphlet_count Pre-computed graphlet counts. @@ -995,6 +1008,7 @@ scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { graphlet_count / zeros_to_ones(graphlet_tuples) } +#' FLAGUSED #' Run count_graphlet_tuples across pre-computed ego networks. #' @param ego_networks Pre-generated ego networks for an input graph. #' @param max_graphlet_size Determines the maximum size of graphlets included @@ -1008,6 +1022,7 @@ count_graphlet_tuples_ego <- function(ego_networks, max_graphlet_size) { graphlet_tuple_counts } +#' FLAGUSED #' Calculate ego network edge densities. #' @param ego_networks Pre-generated ego networks for an input graph. #' @export @@ -1021,7 +1036,7 @@ ego_network_density <- function(ego_networks) { } - +#' FLAGUSED #' Scale graphlet counts for an ego network by the n choose k possible #' choices of k nodes in that ego-network, where n is the number of nodes #' in the ego network and k is the number of nodes in the graphlet. @@ -1048,6 +1063,7 @@ scale_graphlet_counts_ego <- function(ego_networks, graphlet_counts, } +#' FLAGUSED #' For each graphlet calculate the number of possible sets of k nodes in the #' query graph, where k is the number of nodes in the graphlet. #' diff --git a/R/orca_interface.R b/R/orca_interface.R index aef97b78..d77bedb6 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -42,6 +42,7 @@ indexed_edges_to_graph <- function(indexed_edges) { return(graph) } +#' FLAGUSED #' Read all graphs in a directory, simplifying as requested #' #' Reads graph data from all files in a directory matching the specified @@ -111,6 +112,7 @@ read_simple_graphs <- function(source_dir, return(graphs) } +#' FLAGUSED #' Read a graph from file, simplifying as requested #' #' Reads graph data from file, constructing an a igraph graph object, making the @@ -151,6 +153,7 @@ read_simple_graph <- function(file, format, as_undirected = TRUE, ) } +#' FLAGUSED #' Simplify an igraph #' #' Takes a igraph graph object and makes the requested subset of the following @@ -384,6 +387,7 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size } } +#' FLAGUSED #' ego_to_graphlet_counts #' JACK To follow through logic of paper steps, wanted to pass #' ego networks to the function for generating graphlet counts, @@ -413,6 +417,7 @@ ego_to_graphlet_counts <- function(ego_networks, max_graphlet_size = 4) { return(ego_graphlet_counts) } +#' FLAGUSED #' Get ego-networks for a graph as a named list #' #' Simple wrapper for the \code{igraph::make_ego_graph} function that names @@ -562,6 +567,7 @@ orbit_key <- function(max_graphlet_size) { return(list(max_nodes = max_graphlet_size, id = id, node_count = node_count)) } +#' FLAGUSED #' Graphlet IDs for size #' #' List IDs for all graphlets of a specified size @@ -626,6 +632,7 @@ gdd_for_all_graphs <- function( ) } +#' FLAGUSED #' Generate a cross-comparison specification #' #' Creates a cross-comparison matrix with pair-wise combinations @@ -644,7 +651,7 @@ gdd_for_all_graphs <- function( cross_comparison_spec <- function(named_list, how = "many-to-many") { if (how == "one-to-many") { indexes <- data.frame( - rep(1, length(named_list)-1), + rep(1, length(named_list) - 1), 2:length(named_list) ) } else { From 10d3ca7b9d09d3f28f6c245a0b9ebc74eb9fc63e Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Wed, 11 Sep 2019 11:31:33 +0100 Subject: [PATCH 040/188] delete unused functions in measures_net_dis --- NAMESPACE | 6 - R/measures_net_dis.R | 349 +---- man/count_graphlet_tuples.Rd | 22 + man/count_graphlet_tuples_ego.Rd | 4 +- man/count_graphlets_ego_scaled.Rd | 49 - man/cross_comparison_spec.Rd | 3 +- man/density_binned_counts.Rd | 5 +- man/density_binned_counts_gp.Rd | 6 +- man/ego_network_density.Rd | 4 +- man/ego_to_graphlet_counts.Rd | 3 +- man/graphlet_ids_for_size.Rd | 3 +- man/make_named_ego_graph.Rd | 3 +- man/mean_density_binned_graphlet_counts.Rd | 5 +- man/netdis.Rd | 3 +- man/netdis_centred_graphlet_counts.Rd | 34 - man/netdis_centred_graphlet_counts_ego.Rd | 44 - man/netdis_expected_graphlet_counts.Rd | 5 +- man/netdis_expected_graphlet_counts_ego.Rd | 44 - man/netdis_expected_graphlet_counts_ego_fn.Rd | 62 - ...netdis_expected_graphlet_counts_per_ego.Rd | 3 +- man/netdis_for_all_graphs.Rd | 28 - man/netdis_many_to_many.Rd | 12 +- man/netdis_one_to_many.Rd | 12 +- man/netdis_one_to_one.Rd | 12 +- man/netdis_uptok.Rd | 3 +- man/read_simple_graph.Rd | 3 +- man/read_simple_graphs.Rd | 3 +- man/scale_graphlet_count.Rd | 4 +- man/scale_graphlet_counts_ego.Rd | 4 +- man/simplify_graph.Rd | 3 +- man/single_density_bin.Rd | 4 +- man/zeros_to_ones.Rd | 4 +- tests/testthat/test_measures_net_dis.R | 1346 ++++++++--------- vignettes/dendrogram_example_net_dis.R | 89 +- vignettes/dendrogram_example_net_dis.html | 170 ++- vignettes/netdis_2graphs_constant_exp.R | 92 -- vignettes/netdis_2graphs_constant_exp.Rmd | 120 -- vignettes/netdis_2graphs_constant_exp.html | 426 ------ vignettes/netdis_customisations.R | 3 - vignettes/netdis_customisations.html | 4 +- vignettes/netdis_multigraph_polya-aeppli.R | 154 -- vignettes/netdis_multigraph_polya-aeppli.Rmd | 184 --- vignettes/netdis_multigraph_polya-aeppli.html | 505 ------- ...arison.R => netdis_pairwise_comparisons.R} | 0 ....html => netdis_pairwise_comparisons.html} | 8 +- vignettes/quickstart_netdis.R | 47 - vignettes/quickstart_netdis.Rmd | 73 - vignettes/quickstart_netdis.html | 382 ----- vignettes/quickstart_netdis_functions.R | 94 -- vignettes/quickstart_netdis_functions.html | 448 ------ 50 files changed, 926 insertions(+), 3968 deletions(-) create mode 100644 man/count_graphlet_tuples.Rd delete mode 100644 man/count_graphlets_ego_scaled.Rd delete mode 100644 man/netdis_centred_graphlet_counts.Rd delete mode 100644 man/netdis_centred_graphlet_counts_ego.Rd delete mode 100644 man/netdis_expected_graphlet_counts_ego.Rd delete mode 100644 man/netdis_expected_graphlet_counts_ego_fn.Rd delete mode 100644 man/netdis_for_all_graphs.Rd delete mode 100644 vignettes/netdis_2graphs_constant_exp.R delete mode 100644 vignettes/netdis_2graphs_constant_exp.Rmd delete mode 100644 vignettes/netdis_2graphs_constant_exp.html delete mode 100644 vignettes/netdis_multigraph_polya-aeppli.R delete mode 100644 vignettes/netdis_multigraph_polya-aeppli.Rmd delete mode 100644 vignettes/netdis_multigraph_polya-aeppli.html rename vignettes/{quickstart_netdis_comparison.R => netdis_pairwise_comparisons.R} (100%) rename vignettes/{quickstart_netdis_comparison.html => netdis_pairwise_comparisons.html} (98%) delete mode 100644 vignettes/quickstart_netdis.R delete mode 100644 vignettes/quickstart_netdis.Rmd delete mode 100644 vignettes/quickstart_netdis.html delete mode 100644 vignettes/quickstart_netdis_functions.R delete mode 100644 vignettes/quickstart_netdis_functions.html diff --git a/NAMESPACE b/NAMESPACE index 2bda06d9..69e2208c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,6 @@ export(binned_densities_adaptive) export(count_graphlet_tuples) export(count_graphlet_tuples_ego) export(count_graphlets_ego) -export(count_graphlets_ego_scaled) export(count_graphlets_for_graph) export(count_graphlets_per_node) export(count_orbits_per_node) @@ -51,13 +50,8 @@ export(min_emd_optimise_fast) export(net_emd) export(net_emds_for_all_graphs) export(netdis) -export(netdis_centred_graphlet_counts) -export(netdis_centred_graphlet_counts_ego) export(netdis_expected_graphlet_counts) -export(netdis_expected_graphlet_counts_ego) -export(netdis_expected_graphlet_counts_ego_fn) export(netdis_expected_graphlet_counts_per_ego) -export(netdis_for_all_graphs) export(netdis_many_to_many) export(netdis_one_to_many) export(netdis_one_to_one) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 5d0438b1..ae40a076 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -325,40 +325,6 @@ netdis_many_to_many <- function(graphs, } -#' Netdis between all graph pairs using provided Centred Graphlet Counts -#' @param centred_graphlet_counts List containing Centred Graphlet Counts for -#' all graphs being compared -#' @param graphlet_size The size of graphlets to use for the Netdis calculation -#' (only counts for graphlets of the specified size will be used). The size of -#' a graphlet is the number of nodes it contains. -#' @param mc.cores Number of cores to run on. NOTE: only works on unix-like -#' systems with system level forking capability. This means it will work on -#' Linux and OSX, but not Windows. -#' @return Pairwise Netdis statistics between graphs calculated using centred -#' counts for graphlets of the specified size -#' @export -netdis_for_all_graphs <- function(centred_graphlet_counts, - graphlet_size, - mc.cores = getOption("mc.cores", 2L)) { - comp_spec <- cross_comparison_spec(centred_graphlet_counts) - # NOTE: mcapply only works on unix-like systems with system level forking - # capability. This means it will work on Linux and OSX, but not Windows. - # For now, we just revert to single threaded operation on Windows - # TODO: Look into using the parLappy function on Windows - if (.Platform$OS.type != "unix") { - # Force cores to 1 if system is not unix-like as it will not support - # forking - mc.cores <- 1 - } - netdis <- purrr::simplify(parallel::mcmapply(function(index_a, index_b) { - netdis( - centred_graphlet_counts[[index_a]], centred_graphlet_counts[[index_b]], - graphlet_size = graphlet_size - ) - }, comp_spec$index_a, comp_spec$index_b, SIMPLIFY = FALSE)) - list(netdis = netdis, comp_spec = comp_spec) -} - #' FLAGUSED #' Netdis #' @@ -429,319 +395,6 @@ netdis_uptok <- function(centred_graphlet_counts1, centred_graphlet_counts2, netdis_statistics } -#' Scaled graphlet count for ego-networks -#' -#' Calculates graphlet counts for the n-step ego-network of each node in -#' a graph, scaled by dividing the graphlet counts for each ego-network by the -#' total number of possible groupings of nodes in the ego-network with the same -#' number of nodes as each graphlet. This scaling factor is choose(n, k), -#' where n is the number of nodes in the ego-network and k is the number of -#' nodes in the graphlet. -#' @param graph A connected, undirected, simple graph as an \code{igraph} -#' object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. -#' @param neighbourhood_size The number of steps from the source node to include -#' nodes for each ego-network. -#' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} -#' nodes are returned. -#' @param min_ego_edges Only ego networks with at least \code{min_ego_edges} -#' edges are returned. -#' @param return_ego_networks If \code{TRUE}, return ego-networks alongside -#' graphlet counts to enable further processing. -#' @return If \code{return_ego_networks = FALSE}, returns an RxC matrix -#' containing counts of each graphlet (columns, C) for each ego-network in the -#' input graph (rows, R). Columns are labelled with graphlet IDs and rows are -#' labelled with the ID of the central node in each ego-network (if nodes in the -#' input graph are labelled). If \code{return_ego_networks = TRUE}, returns a -#' list with the following elements: -#' \itemize{ -#' \item \code{graphlet_counts}: A matrix containing graphlet counts for each -#' ego-network in the input graph as described above. -#' \item \code{ego_networks}: The ego-networks of the query graph. -#' } -#' @export -count_graphlets_ego_scaled <- function(graph, - max_graphlet_size, - neighbourhood_size, - min_ego_nodes = 3, - min_ego_edges = 1, - return_ego_networks = FALSE) { - - # Calculate ego-network graphlet counts, also returning the ego networks for - # use later in function - ego_data <- - count_graphlets_ego(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - return_ego_networks = TRUE - ) - ego_graphlet_counts <- ego_data$graphlet_counts - ego_networks <- ego_data$ego_networks - - # Scale ego-network graphlet counts by dividing by total number of k-tuples in - # ego-network (where k is graphlet size) - ego_graphlet_tuples <- count_graphlet_tuples_ego( - ego_networks, - max_graphlet_size = max_graphlet_size - ) - ego_graphlet_counts <- scale_graphlet_count( - ego_graphlet_counts, - ego_graphlet_tuples - ) - - # Return either graphlet counts, or graphlet counts and ego_networks - if (return_ego_networks) { - return(list( - graphlet_counts = ego_graphlet_counts, - ego_networks = ego_networks - )) - } else { - return(ego_graphlet_counts) - } -} - -#' Generate Netdis centred graphlets counts by subtracting expected counts -#' -#' @param graph A connected, undirected, simple graph as an -#' \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes -#' will be counted. -#' @param neighbourhood_size The number of steps from the source node to include -#' nodes for each ego-network. -#' @param expected_ego_count_fn A function for generating expected ego-network -#' graphlet counts for a graph. This function should take a connected, -#' undirected, simple graph as an \code{igraph} object for its only argument. -#' Where \code{expected_ego_count_fn} is specific to particular values of -#' \code{max_graphlet_size} or \code{neighbourhood_size}, care should be taken -#' to ensure that the values of these parameters passed to this function are -#' consistent with those used when creating \code{expected_ego_count_fn}. -#' @return A vector with centred counts for each graphlet type -#' @export -netdis_centred_graphlet_counts <- function(graph, - max_graphlet_size, - neighbourhood_size, - expected_ego_count_fn = NULL) { - # Get centred counts for each ego network - centred_counts <- netdis_centred_graphlet_counts_ego( - graph, - max_graphlet_size, - neighbourhood_size, - expected_ego_count_fn - ) - # Sum centred counts over ego-networks - apply(centred_counts, MARGIN = 2, FUN = sum) -} - - -#' Generate Netdis centred graphlets counts by subtracting expected counts -#' -#' @param graph A connected, undirected, simple graph as an -#' \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes -#' will be counted. -#' @param neighbourhood_size The number of steps from the source node to include -#' nodes for each ego-network. -#' @param expected_ego_count_fn A function for generating expected ego-network -#' graphlet counts for a graph. This function should take a connected, -#' undirected, simple graph as an \code{igraph} object for its only argument. -#' Where \code{expected_ego_count_fn} is specific to particular values of -#' \code{max_graphlet_size} or \code{neighbourhood_size}, care should be taken -#' to ensure that the values of these parameters passed to this function are -#' consistent with those used when creating \code{expected_ego_count_fn}. -#' @param min_ego_nodes Filter ego networks which have fewer -#' than min_ego_nodes nodes -#' @param min_ego_edges Filter ego networks which have fewer -#' than min_ego_edges edges -#' @return A vector with centred counts for each graphlet type in -#' each ego network. -#' -#' TODO: Remove @export prior to publishing -#' @export -netdis_centred_graphlet_counts_ego <- function(graph, - max_graphlet_size, - neighbourhood_size, - expected_ego_count_fn = NULL, - min_ego_nodes = 3, - min_ego_edges = 1) { - # Get unscaled ego-network graphlet counts - res <- count_graphlets_ego( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - return_ego_networks = TRUE - ) - - actual_counts <- res$graphlet_counts - - # Centre these counts by subtracting the expected counts - if (is.null(expected_ego_count_fn)) { - centred_counts <- actual_counts - } else { - centred_counts <- actual_counts - expected_ego_count_fn(graph) - } - centred_counts -} - -#' Generate Netdis expected graphlet count function -#' -#' Generates a function to calculate expected ego-network graphlet counts for -#' query graphs based on the statistics of a provided reference graph. -#' -#' Generates graphlet counts for all ego-networks in the supplied -#' reference graph and then averages these graphlet counts over density bins to -#' generate density-dependent reference graphlet counts. Prior to averaging, -#' the graphlet counts are scaled in a size-dependent manner to permit -#' ego-networks with similar densities but different sizes to be averaged -#' together. -#' -#' Returns a function that uses the density-dependent reference graphlet -#' counts to generate expected graphlet counts for all ego-networks in a query -#' network. When doing so, it matches ego-networks to reference counts by -#' density and reverses the scaling that was applied to the original reference -#' counts in order to allow averaging across ego-networks with similar density -#' but different numbers of nodes. -#' @param graph A connected, undirected, simple reference graph as an -#' \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. -#' @param neighbourhood_size The number of steps from the source node to include -#' node in ego-network. -#' @param min_ego_nodes Filter ego networks which have fewer -#' than min_ego_nodes nodes -#' @param min_ego_edges Filter ego networks which have fewer -#' than min_ego_edges edges -#' @param min_bin_count Minimum count of ego networks in each density bin. -#' @param num_bins Initial number of density bins to generate. -#' @param scale_fn Optional function to scale calculated expected counts, taking -#' \code{graph} and \code{max_graphlet_size} as arguments, and returning a scale -#' factor that the looked up \code{density_binned_reference_counts} values will -#' be multiplied by. -#' @return A function taking a connected, undirected, simple query graph as an -#' \code{igraph} object and returning an RxC matrix containing the expected -#' counts of each graphlet (columns, C) for each ego-network in the query graph -#' (rows, R). Columns are labelled with graphlet IDs and rows are labelled with -#' the ID of the central node in each ego-network (if nodes in the query graph -#' are labelled) -#' @export -netdis_expected_graphlet_counts_ego_fn <- function(graph, - max_graphlet_size, - neighbourhood_size, - min_ego_nodes = 3, - min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100, - scale_fn = NULL) { - - # Calculate the scaled graphlet counts for all ego networks in the reference - # graph, also returning the ego networks themselves in order to calculate - # their densities - res <- count_graphlets_ego_scaled( - graph, - max_graphlet_size, - neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - return_ego_networks = TRUE - ) - - scaled_graphlet_counts <- res$graphlet_counts - ego_networks <- res$ego_networks - - # Get ego-network densities - densities <- purrr::simplify( - purrr::map_dbl(ego_networks, igraph::edge_density) - ) - - # Adaptively bin ego-network densities - binned_densities <- binned_densities_adaptive( - densities, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins - ) - - # Average graphlet counts across density bins - density_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - scaled_graphlet_counts, - binned_densities$interval_indexes - ) - - # Return a partially applied function with the key reference graph information - # built-in - purrr::partial( - netdis_expected_graphlet_counts_ego, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - density_breaks = binned_densities$breaks, - density_binned_reference_counts = density_binned_graphlet_counts, - scale_fn = scale_fn - ) -} - -#' INTERNAL FUNCTION - Do not call directly -#' -#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to -#' generate a function for calculating expected ego-network graphlet counts -#' from the statistics of a provided reference graph. -#' Temporarily accessible during development. -#' @param graph A connected, undirected, simple reference graph as an -#' \code{igraph} object. -#' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. -#' @param neighbourhood_size The number of steps from the source node to include -#' node in ego-network. -#' @param density_breaks Density values defining bin edges. -#' @param density_binned_reference_counts Reference network graphlet counts for -#' each density bin. -#' @param min_ego_nodes Filter ego networks which have fewer -#' than min_ego_nodes nodes -#' @param min_ego_edges Filter ego networks which have fewer -#' than min_ego_edges edges -#' @param scale_fn Optional function to scale calculated expected counts, taking -#' \code{graph} and \code{max_graphlet_size} as arguments, and returning a scale -#' factor that the looked up \code{density_binned_reference_counts} values will -#' be multiplied by. -#' -#' TODO: Remove @export prior to publishing -#' @export -netdis_expected_graphlet_counts_ego <- function(graph, - max_graphlet_size, - neighbourhood_size, - density_breaks, - density_binned_reference_counts, - min_ego_nodes = 3, - min_ego_edges = 1, - scale_fn=NULL) { - - # Generate ego-networks for query graph - ego_networks <- make_named_ego_graph(graph, neighbourhood_size) - # Drop ego-networks that don't have the minimum number of nodes or edges - drop_index <- purrr::simplify(purrr::map(ego_networks, function(g) { - (igraph::vcount(g) < min_ego_nodes) | (igraph::ecount(g) < min_ego_edges) - })) - ego_networks <- ego_networks[!drop_index] - # Map over query graph ego-networks, using reference graph statistics to - # calculate expected graphlet counts for each ego-network. - expected_graphlet_counts <- - purrr::map(ego_networks, netdis_expected_graphlet_counts, - max_graphlet_size = max_graphlet_size, - density_breaks = density_breaks, - density_binned_reference_counts = density_binned_reference_counts, - scale_fn = scale_fn - ) - names(expected_graphlet_counts) <- names(ego_networks) - # Simplify list to array - t(simplify2array(expected_graphlet_counts)) -} - #' FLAGUSED #' INTERNAL FUNCTION - Do not call directly #' @@ -1008,6 +661,7 @@ scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { graphlet_count / zeros_to_ones(graphlet_tuples) } + #' FLAGUSED #' Run count_graphlet_tuples across pre-computed ego networks. #' @param ego_networks Pre-generated ego networks for an input graph. @@ -1022,6 +676,7 @@ count_graphlet_tuples_ego <- function(ego_networks, max_graphlet_size) { graphlet_tuple_counts } + #' FLAGUSED #' Calculate ego network edge densities. #' @param ego_networks Pre-generated ego networks for an input graph. diff --git a/man/count_graphlet_tuples.Rd b/man/count_graphlet_tuples.Rd new file mode 100644 index 00000000..7975385e --- /dev/null +++ b/man/count_graphlet_tuples.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{count_graphlet_tuples} +\alias{count_graphlet_tuples} +\title{FLAGUSED +For each graphlet calculate the number of possible sets of k nodes in the +query graph, where k is the number of nodes in the graphlet.} +\usage{ +count_graphlet_tuples(graph, max_graphlet_size) +} +\arguments{ +\item{graph}{A connected, undirected, simple graph as an \code{igraph} +object.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets included +in the tuple counts.} +} +\description{ +FLAGUSED +For each graphlet calculate the number of possible sets of k nodes in the +query graph, where k is the number of nodes in the graphlet. +} diff --git a/man/count_graphlet_tuples_ego.Rd b/man/count_graphlet_tuples_ego.Rd index ab628b9f..ae3173a5 100644 --- a/man/count_graphlet_tuples_ego.Rd +++ b/man/count_graphlet_tuples_ego.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{count_graphlet_tuples_ego} \alias{count_graphlet_tuples_ego} -\title{Run count_graphlet_tuples across pre-computed ego networks.} +\title{FLAGUSED +Run count_graphlet_tuples across pre-computed ego networks.} \usage{ count_graphlet_tuples_ego(ego_networks, max_graphlet_size) } @@ -13,5 +14,6 @@ count_graphlet_tuples_ego(ego_networks, max_graphlet_size) in the tuple counts.} } \description{ +FLAGUSED Run count_graphlet_tuples across pre-computed ego networks. } diff --git a/man/count_graphlets_ego_scaled.Rd b/man/count_graphlets_ego_scaled.Rd deleted file mode 100644 index 5dfb334f..00000000 --- a/man/count_graphlets_ego_scaled.Rd +++ /dev/null @@ -1,49 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R -\name{count_graphlets_ego_scaled} -\alias{count_graphlets_ego_scaled} -\title{Scaled graphlet count for ego-networks} -\usage{ -count_graphlets_ego_scaled(graph, max_graphlet_size, neighbourhood_size, - min_ego_nodes = 3, min_ego_edges = 1, return_ego_networks = FALSE) -} -\arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} -object.} - -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} - -\item{neighbourhood_size}{The number of steps from the source node to include -nodes for each ego-network.} - -\item{min_ego_nodes}{Only ego networks with at least \code{min_ego_nodes} -nodes are returned.} - -\item{min_ego_edges}{Only ego networks with at least \code{min_ego_edges} -edges are returned.} - -\item{return_ego_networks}{If \code{TRUE}, return ego-networks alongside -graphlet counts to enable further processing.} -} -\value{ -If \code{return_ego_networks = FALSE}, returns an RxC matrix -containing counts of each graphlet (columns, C) for each ego-network in the -input graph (rows, R). Columns are labelled with graphlet IDs and rows are -labelled with the ID of the central node in each ego-network (if nodes in the -input graph are labelled). If \code{return_ego_networks = TRUE}, returns a -list with the following elements: -\itemize{ - \item \code{graphlet_counts}: A matrix containing graphlet counts for each - ego-network in the input graph as described above. - \item \code{ego_networks}: The ego-networks of the query graph. -} -} -\description{ -Calculates graphlet counts for the n-step ego-network of each node in -a graph, scaled by dividing the graphlet counts for each ego-network by the -total number of possible groupings of nodes in the ego-network with the same -number of nodes as each graphlet. This scaling factor is choose(n, k), -where n is the number of nodes in the ego-network and k is the number of -nodes in the graphlet. -} diff --git a/man/cross_comparison_spec.Rd b/man/cross_comparison_spec.Rd index 35ab245f..bf21f490 100644 --- a/man/cross_comparison_spec.Rd +++ b/man/cross_comparison_spec.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/orca_interface.R \name{cross_comparison_spec} \alias{cross_comparison_spec} -\title{Generate a cross-comparison specification} +\title{FLAGUSED +Generate a cross-comparison specification} \usage{ cross_comparison_spec(named_list, how = "many-to-many") } diff --git a/man/density_binned_counts.Rd b/man/density_binned_counts.Rd index 64db44af..d827e730 100644 --- a/man/density_binned_counts.Rd +++ b/man/density_binned_counts.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{density_binned_counts} \alias{density_binned_counts} -\title{INTERNAL FUNCTION - Do not call directly} +\title{FLAGUSED +INTERNAL FUNCTION - Do not call directly} \usage{ density_binned_counts(graphlet_counts, density_interval_indexes, agg_fn = mean, scale_fn = NULL, ego_networks = NULL, @@ -11,7 +12,7 @@ density_binned_counts(graphlet_counts, density_interval_indexes, \arguments{ \item{graphlet_counts}{Graphlet counts for a number of ego_networks.} -\item{density_interval_indexes}{Density bin index for +\item{density_interval_indexes}{Density bin index for each ego network.} \item{agg_fn}{Function to aggregate counts in each bin diff --git a/man/density_binned_counts_gp.Rd b/man/density_binned_counts_gp.Rd index ade810ac..a423c077 100644 --- a/man/density_binned_counts_gp.Rd +++ b/man/density_binned_counts_gp.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{density_binned_counts_gp} \alias{density_binned_counts_gp} -\title{Calculate expected counts in density bins using +\title{FLAGUSED +Calculate expected counts in density bins using geometric poisson (Polya-Aeppli) approximation.} \usage{ density_binned_counts_gp(graphlet_counts, density_interval_indexes, @@ -11,13 +12,14 @@ density_binned_counts_gp(graphlet_counts, density_interval_indexes, \arguments{ \item{graphlet_counts}{Graphlet counts for a number of ego_networks.} -\item{density_interval_indexes}{Density bin index for +\item{density_interval_indexes}{Density bin index for each ego network.} \item{max_graphlet_size}{Determines the maximum size of graphlets included in graphlet_counts.} } \description{ +FLAGUSED Calculate expected counts in density bins using geometric poisson (Polya-Aeppli) approximation. } diff --git a/man/ego_network_density.Rd b/man/ego_network_density.Rd index 26d68250..e4d36189 100644 --- a/man/ego_network_density.Rd +++ b/man/ego_network_density.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{ego_network_density} \alias{ego_network_density} -\title{Calculate ego network edge densities.} +\title{FLAGUSED +Calculate ego network edge densities.} \usage{ ego_network_density(ego_networks) } @@ -10,5 +11,6 @@ ego_network_density(ego_networks) \item{ego_networks}{Pre-generated ego networks for an input graph.} } \description{ +FLAGUSED Calculate ego network edge densities. } diff --git a/man/ego_to_graphlet_counts.Rd b/man/ego_to_graphlet_counts.Rd index 79aed9f8..b5f2d95d 100644 --- a/man/ego_to_graphlet_counts.Rd +++ b/man/ego_to_graphlet_counts.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/orca_interface.R \name{ego_to_graphlet_counts} \alias{ego_to_graphlet_counts} -\title{ego_to_graphlet_counts +\title{FLAGUSED +ego_to_graphlet_counts JACK To follow through logic of paper steps, wanted to pass ego networks to the function for generating graphlet counts, not the input query graph directly (as in count_graphlets_ego above).} diff --git a/man/graphlet_ids_for_size.Rd b/man/graphlet_ids_for_size.Rd index 9545ceb0..b97e8048 100644 --- a/man/graphlet_ids_for_size.Rd +++ b/man/graphlet_ids_for_size.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/orca_interface.R \name{graphlet_ids_for_size} \alias{graphlet_ids_for_size} -\title{Graphlet IDs for size} +\title{FLAGUSED +Graphlet IDs for size} \usage{ graphlet_ids_for_size(graphlet_size) } diff --git a/man/make_named_ego_graph.Rd b/man/make_named_ego_graph.Rd index 1c0db5a7..de5401fa 100644 --- a/man/make_named_ego_graph.Rd +++ b/man/make_named_ego_graph.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/orca_interface.R \name{make_named_ego_graph} \alias{make_named_ego_graph} -\title{Get ego-networks for a graph as a named list} +\title{FLAGUSED +Get ego-networks for a graph as a named list} \usage{ make_named_ego_graph(graph, order, min_ego_nodes = 3, min_ego_edges = 1, ...) diff --git a/man/mean_density_binned_graphlet_counts.Rd b/man/mean_density_binned_graphlet_counts.Rd index 537589b4..dc95201f 100644 --- a/man/mean_density_binned_graphlet_counts.Rd +++ b/man/mean_density_binned_graphlet_counts.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{mean_density_binned_graphlet_counts} \alias{mean_density_binned_graphlet_counts} -\title{INTERNAL FUNCTION - Do not call directly} +\title{FLAGUSED +INTERNAL FUNCTION - Do not call directly} \usage{ mean_density_binned_graphlet_counts(graphlet_counts, density_interval_indexes, agg_fn = mean) @@ -10,7 +11,7 @@ mean_density_binned_graphlet_counts(graphlet_counts, \arguments{ \item{graphlet_counts}{Graphlet counts for a number of ego_networks.} -\item{density_interval_indexes}{Density bin index for +\item{density_interval_indexes}{Density bin index for each ego network.} \item{agg_fn}{Function to aggregate counts in each bin diff --git a/man/netdis.Rd b/man/netdis.Rd index 9f477425..011e8594 100644 --- a/man/netdis.Rd +++ b/man/netdis.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis} \alias{netdis} -\title{Netdis} +\title{FLAGUSED +Netdis} \usage{ netdis(centred_graphlet_counts1, centred_graphlet_counts2, graphlet_size) } diff --git a/man/netdis_centred_graphlet_counts.Rd b/man/netdis_centred_graphlet_counts.Rd deleted file mode 100644 index f9aaba3d..00000000 --- a/man/netdis_centred_graphlet_counts.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R -\name{netdis_centred_graphlet_counts} -\alias{netdis_centred_graphlet_counts} -\title{Generate Netdis centred graphlets counts by subtracting expected counts} -\usage{ -netdis_centred_graphlet_counts(graph, max_graphlet_size, - neighbourhood_size, expected_ego_count_fn = NULL) -} -\arguments{ -\item{graph}{A connected, undirected, simple graph as an -\code{igraph} object.} - -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes -will be counted.} - -\item{neighbourhood_size}{The number of steps from the source node to include -nodes for each ego-network.} - -\item{expected_ego_count_fn}{A function for generating expected ego-network -graphlet counts for a graph. This function should take a connected, -undirected, simple graph as an \code{igraph} object for its only argument. -Where \code{expected_ego_count_fn} is specific to particular values of -\code{max_graphlet_size} or \code{neighbourhood_size}, care should be taken -to ensure that the values of these parameters passed to this function are -consistent with those used when creating \code{expected_ego_count_fn}.} -} -\value{ -A vector with centred counts for each graphlet type -} -\description{ -Generate Netdis centred graphlets counts by subtracting expected counts -} diff --git a/man/netdis_centred_graphlet_counts_ego.Rd b/man/netdis_centred_graphlet_counts_ego.Rd deleted file mode 100644 index b2f0b351..00000000 --- a/man/netdis_centred_graphlet_counts_ego.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R -\name{netdis_centred_graphlet_counts_ego} -\alias{netdis_centred_graphlet_counts_ego} -\title{Generate Netdis centred graphlets counts by subtracting expected counts} -\usage{ -netdis_centred_graphlet_counts_ego(graph, max_graphlet_size, - neighbourhood_size, expected_ego_count_fn = NULL, min_ego_nodes = 3, - min_ego_edges = 1) -} -\arguments{ -\item{graph}{A connected, undirected, simple graph as an -\code{igraph} object.} - -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes -will be counted.} - -\item{neighbourhood_size}{The number of steps from the source node to include -nodes for each ego-network.} - -\item{expected_ego_count_fn}{A function for generating expected ego-network -graphlet counts for a graph. This function should take a connected, -undirected, simple graph as an \code{igraph} object for its only argument. -Where \code{expected_ego_count_fn} is specific to particular values of -\code{max_graphlet_size} or \code{neighbourhood_size}, care should be taken -to ensure that the values of these parameters passed to this function are -consistent with those used when creating \code{expected_ego_count_fn}.} - -\item{min_ego_nodes}{Filter ego networks which have fewer -than min_ego_nodes nodes} - -\item{min_ego_edges}{Filter ego networks which have fewer -than min_ego_edges edges} -} -\value{ -A vector with centred counts for each graphlet type in -each ego network. - -TODO: Remove @export prior to publishing -} -\description{ -Generate Netdis centred graphlets counts by subtracting expected counts -} diff --git a/man/netdis_expected_graphlet_counts.Rd b/man/netdis_expected_graphlet_counts.Rd index 839d94f4..fe01b72e 100644 --- a/man/netdis_expected_graphlet_counts.Rd +++ b/man/netdis_expected_graphlet_counts.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_expected_graphlet_counts} \alias{netdis_expected_graphlet_counts} -\title{INTERNAL FUNCTION - Do not call directly} +\title{FLAGUSED +INTERNAL FUNCTION - Do not call directly} \usage{ netdis_expected_graphlet_counts(graph, max_graphlet_size, density_breaks, density_binned_reference_counts, scale_fn = NULL) @@ -28,7 +29,7 @@ TODO: Remove @export prior to publishing} } \description{ Used by \code{netdis_expected_graphlet_counts_ego} to -calculate expected graphlet counts for a query graph +calculate expected graphlet counts for a query graph ego-network from the statistics of a provided reference graph. } diff --git a/man/netdis_expected_graphlet_counts_ego.Rd b/man/netdis_expected_graphlet_counts_ego.Rd deleted file mode 100644 index 76fcd6b2..00000000 --- a/man/netdis_expected_graphlet_counts_ego.Rd +++ /dev/null @@ -1,44 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R -\name{netdis_expected_graphlet_counts_ego} -\alias{netdis_expected_graphlet_counts_ego} -\title{INTERNAL FUNCTION - Do not call directly} -\usage{ -netdis_expected_graphlet_counts_ego(graph, max_graphlet_size, - neighbourhood_size, density_breaks, density_binned_reference_counts, - min_ego_nodes = 3, min_ego_edges = 1, scale_fn = NULL) -} -\arguments{ -\item{graph}{A connected, undirected, simple reference graph as an -\code{igraph} object.} - -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} - -\item{neighbourhood_size}{The number of steps from the source node to include -node in ego-network.} - -\item{density_breaks}{Density values defining bin edges.} - -\item{density_binned_reference_counts}{Reference network graphlet counts for -each density bin.} - -\item{min_ego_nodes}{Filter ego networks which have fewer -than min_ego_nodes nodes} - -\item{min_ego_edges}{Filter ego networks which have fewer -than min_ego_edges edges} - -\item{scale_fn}{Optional function to scale calculated expected counts, taking -\code{graph} and \code{max_graphlet_size} as arguments, and returning a scale -factor that the looked up \code{density_binned_reference_counts} values will -be multiplied by. - -TODO: Remove @export prior to publishing} -} -\description{ -Used by \code{netdis_expected_graphlet_counts_ego_fn} to -generate a function for calculating expected ego-network graphlet counts -from the statistics of a provided reference graph. -Temporarily accessible during development. -} diff --git a/man/netdis_expected_graphlet_counts_ego_fn.Rd b/man/netdis_expected_graphlet_counts_ego_fn.Rd deleted file mode 100644 index 9a7d4b90..00000000 --- a/man/netdis_expected_graphlet_counts_ego_fn.Rd +++ /dev/null @@ -1,62 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R -\name{netdis_expected_graphlet_counts_ego_fn} -\alias{netdis_expected_graphlet_counts_ego_fn} -\title{Generate Netdis expected graphlet count function} -\usage{ -netdis_expected_graphlet_counts_ego_fn(graph, max_graphlet_size, - neighbourhood_size, min_ego_nodes = 3, min_ego_edges = 1, - min_bin_count = 5, num_bins = 100, scale_fn = NULL) -} -\arguments{ -\item{graph}{A connected, undirected, simple reference graph as an -\code{igraph} object.} - -\item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} - -\item{neighbourhood_size}{The number of steps from the source node to include -node in ego-network.} - -\item{min_ego_nodes}{Filter ego networks which have fewer -than min_ego_nodes nodes} - -\item{min_ego_edges}{Filter ego networks which have fewer -than min_ego_edges edges} - -\item{min_bin_count}{Minimum count of ego networks in each density bin.} - -\item{num_bins}{Initial number of density bins to generate.} - -\item{scale_fn}{Optional function to scale calculated expected counts, taking -\code{graph} and \code{max_graphlet_size} as arguments, and returning a scale -factor that the looked up \code{density_binned_reference_counts} values will -be multiplied by.} -} -\value{ -A function taking a connected, undirected, simple query graph as an -\code{igraph} object and returning an RxC matrix containing the expected -counts of each graphlet (columns, C) for each ego-network in the query graph -(rows, R). Columns are labelled with graphlet IDs and rows are labelled with -the ID of the central node in each ego-network (if nodes in the query graph -are labelled) -} -\description{ -Generates a function to calculate expected ego-network graphlet counts for -query graphs based on the statistics of a provided reference graph. -} -\details{ -Generates graphlet counts for all ego-networks in the supplied -reference graph and then averages these graphlet counts over density bins to -generate density-dependent reference graphlet counts. Prior to averaging, -the graphlet counts are scaled in a size-dependent manner to permit -ego-networks with similar densities but different sizes to be averaged -together. - -Returns a function that uses the density-dependent reference graphlet -counts to generate expected graphlet counts for all ego-networks in a query -network. When doing so, it matches ego-networks to reference counts by -density and reverses the scaling that was applied to the original reference -counts in order to allow averaging across ego-networks with similar density -but different numbers of nodes. -} diff --git a/man/netdis_expected_graphlet_counts_per_ego.Rd b/man/netdis_expected_graphlet_counts_per_ego.Rd index f94312dd..9abe227b 100644 --- a/man/netdis_expected_graphlet_counts_per_ego.Rd +++ b/man/netdis_expected_graphlet_counts_per_ego.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_expected_graphlet_counts_per_ego} \alias{netdis_expected_graphlet_counts_per_ego} -\title{INTERNAL FUNCTION - Do not call directly} +\title{FLAGUSED +INTERNAL FUNCTION - Do not call directly} \usage{ netdis_expected_graphlet_counts_per_ego(ego_networks, density_breaks, density_binned_reference_counts, max_graphlet_size, scale_fn = NULL) diff --git a/man/netdis_for_all_graphs.Rd b/man/netdis_for_all_graphs.Rd deleted file mode 100644 index 383a8a30..00000000 --- a/man/netdis_for_all_graphs.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/measures_net_dis.R -\name{netdis_for_all_graphs} -\alias{netdis_for_all_graphs} -\title{Netdis between all graph pairs using provided Centred Graphlet Counts} -\usage{ -netdis_for_all_graphs(centred_graphlet_counts, graphlet_size, - mc.cores = getOption("mc.cores", 2L)) -} -\arguments{ -\item{centred_graphlet_counts}{List containing Centred Graphlet Counts for -all graphs being compared} - -\item{graphlet_size}{The size of graphlets to use for the Netdis calculation -(only counts for graphlets of the specified size will be used). The size of -a graphlet is the number of nodes it contains.} - -\item{mc.cores}{Number of cores to run on. NOTE: only works on unix-like -systems with system level forking capability. This means it will work on -Linux and OSX, but not Windows.} -} -\value{ -Pairwise Netdis statistics between graphs calculated using centred -counts for graphlets of the specified size -} -\description{ -Netdis between all graph pairs using provided Centred Graphlet Counts -} diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd index fdf13239..82551939 100644 --- a/man/netdis_many_to_many.Rd +++ b/man/netdis_many_to_many.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_many_to_many} \alias{netdis_many_to_many} -\title{Netdis between all graph pairs} +\title{FLAGUSED +Netdis between all graph pairs} \usage{ netdis_many_to_many(graphs, ref_graph, comparisons = "many-to-many", max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, @@ -40,17 +41,17 @@ ego network). (Default: \code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} and \code{num_intervals = 100}).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in -each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} (bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. -(Default: \code{density_binned_counts} with \code{agg_fn = mean} and +(Default: \code{density_binned_counts} with \code{agg_fn = mean} and \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the approach used in the original netdis paper).} -\item{exp_counts_fn}{Function used to map from binned reference counts to +\item{exp_counts_fn}{Function used to map from binned reference counts to expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -(Default: \code{netdis_expected_graphlet_counts_per_ego} with +(Default: \code{netdis_expected_graphlet_counts_per_ego} with \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in the original netdis paper).} } @@ -59,5 +60,6 @@ Netdis statistics between query graphs for graphlet sizes up to and including max_graphlet_size. } \description{ +FLAGUSED Netdis between all graph pairs } diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd index 6498d3ef..e72e000c 100644 --- a/man/netdis_one_to_many.Rd +++ b/man/netdis_one_to_many.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_one_to_many} \alias{netdis_one_to_many} -\title{Netdis comparisons between one graph and many other graphs} +\title{FLAGUSED +Netdis comparisons between one graph and many other graphs} \usage{ netdis_one_to_many(graph_1, graphs_compare, ref_graph, max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, @@ -39,17 +40,17 @@ ego network). (Default: \code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} and \code{num_intervals = 100}).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in -each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} (bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. -(Default: \code{density_binned_counts} with \code{agg_fn = mean} and +(Default: \code{density_binned_counts} with \code{agg_fn = mean} and \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the approach used in the original netdis paper).} -\item{exp_counts_fn}{Function used to map from binned reference counts to +\item{exp_counts_fn}{Function used to map from binned reference counts to expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -(Default: \code{netdis_expected_graphlet_counts_per_ego} with +(Default: \code{netdis_expected_graphlet_counts_per_ego} with \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in the original netdis paper).} } @@ -58,5 +59,6 @@ Netdis statistics between graph_1 and graph_2 for graphlet sizes up to and including max_graphlet_size } \description{ +FLAGUSED Netdis comparisons between one graph and many other graphs } diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd index 78e151cd..65e8c89c 100644 --- a/man/netdis_one_to_one.Rd +++ b/man/netdis_one_to_one.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_one_to_one} \alias{netdis_one_to_one} -\title{Netdis between two graphs} +\title{FLAGUSED +Netdis between two graphs} \usage{ netdis_one_to_one(graph_1, graph_2, ref_graph, max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, @@ -37,17 +38,17 @@ ego network). (Default: \code{binned_densities_adaptive} with \code{min_counts_per_interval = 5} and \code{num_intervals = 100}).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in -each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} +each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} (bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. -(Default: \code{density_binned_counts} with \code{agg_fn = mean} and +(Default: \code{density_binned_counts} with \code{agg_fn = mean} and \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the approach used in the original netdis paper).} -\item{exp_counts_fn}{Function used to map from binned reference counts to +\item{exp_counts_fn}{Function used to map from binned reference counts to expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -(Default: \code{netdis_expected_graphlet_counts_per_ego} with +(Default: \code{netdis_expected_graphlet_counts_per_ego} with \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in the original netdis paper).} } @@ -56,5 +57,6 @@ Netdis statistics between graph_1 and graph_2 for graphlet sizes up to and including max_graphlet_size } \description{ +FLAGUSED Netdis between two graphs } diff --git a/man/netdis_uptok.Rd b/man/netdis_uptok.Rd index 7e491b60..4c335179 100644 --- a/man/netdis_uptok.Rd +++ b/man/netdis_uptok.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_uptok} \alias{netdis_uptok} -\title{Netdis - graphlets up to max_graphlet_size} +\title{FLAGUSED +Netdis - graphlets up to max_graphlet_size} \usage{ netdis_uptok(centred_graphlet_counts1, centred_graphlet_counts2, max_graphlet_size) diff --git a/man/read_simple_graph.Rd b/man/read_simple_graph.Rd index 3895b0ef..4db79754 100644 --- a/man/read_simple_graph.Rd +++ b/man/read_simple_graph.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/orca_interface.R \name{read_simple_graph} \alias{read_simple_graph} -\title{Read a graph from file, simplifying as requested} +\title{FLAGUSED +Read a graph from file, simplifying as requested} \usage{ read_simple_graph(file, format, as_undirected = TRUE, remove_loops = TRUE, remove_multiple = TRUE, diff --git a/man/read_simple_graphs.Rd b/man/read_simple_graphs.Rd index 4e907180..da0c9fce 100644 --- a/man/read_simple_graphs.Rd +++ b/man/read_simple_graphs.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/orca_interface.R \name{read_simple_graphs} \alias{read_simple_graphs} -\title{Read all graphs in a directory, simplifying as requested} +\title{FLAGUSED +Read all graphs in a directory, simplifying as requested} \usage{ read_simple_graphs(source_dir, format = "ncol", pattern = "*", as_undirected = TRUE, remove_loops = TRUE, remove_multiple = TRUE, diff --git a/man/scale_graphlet_count.Rd b/man/scale_graphlet_count.Rd index 8a313c27..6137b18d 100644 --- a/man/scale_graphlet_count.Rd +++ b/man/scale_graphlet_count.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{scale_graphlet_count} \alias{scale_graphlet_count} -\title{Divide graphlet counts by pre-computed scaling factor from +\title{FLAGUSED +Divide graphlet counts by pre-computed scaling factor from \code{count_graphlet_tuples} output.} \usage{ scale_graphlet_count(graphlet_count, graphlet_tuples) @@ -13,6 +14,7 @@ scale_graphlet_count(graphlet_count, graphlet_tuples) \item{graphlet_tuples}{Pre-computed \code{count_graphlet_tuples} output.} } \description{ +FLAGUSED Divide graphlet counts by pre-computed scaling factor from \code{count_graphlet_tuples} output. } diff --git a/man/scale_graphlet_counts_ego.Rd b/man/scale_graphlet_counts_ego.Rd index c4bae9b1..cba6fc93 100644 --- a/man/scale_graphlet_counts_ego.Rd +++ b/man/scale_graphlet_counts_ego.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{scale_graphlet_counts_ego} \alias{scale_graphlet_counts_ego} -\title{Scale graphlet counts for an ego network by the n choose k possible +\title{FLAGUSED +Scale graphlet counts for an ego network by the n choose k possible choices of k nodes in that ego-network, where n is the number of nodes in the ego network and k is the number of nodes in the graphlet.} \usage{ @@ -20,6 +21,7 @@ in graphlet_counts.} scaled graphlet counts. } \description{ +FLAGUSED Scale graphlet counts for an ego network by the n choose k possible choices of k nodes in that ego-network, where n is the number of nodes in the ego network and k is the number of nodes in the graphlet. diff --git a/man/simplify_graph.Rd b/man/simplify_graph.Rd index f4e37722..80ac4e57 100644 --- a/man/simplify_graph.Rd +++ b/man/simplify_graph.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/orca_interface.R \name{simplify_graph} \alias{simplify_graph} -\title{Simplify an igraph} +\title{FLAGUSED +Simplify an igraph} \usage{ simplify_graph(graph, as_undirected = TRUE, remove_loops = TRUE, remove_multiple = TRUE, remove_isolates = TRUE) diff --git a/man/single_density_bin.Rd b/man/single_density_bin.Rd index ed801c45..dda61c79 100644 --- a/man/single_density_bin.Rd +++ b/man/single_density_bin.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{single_density_bin} \alias{single_density_bin} -\title{For case where don't want to use binning, return a single bin which covers +\title{FLAGUSED +For case where don't want to use binning, return a single bin which covers the full range of possible density values.} \usage{ single_density_bin(densities) @@ -12,6 +13,7 @@ single_density_bin(densities) a list of indexes of the required length.)} } \description{ +FLAGUSED For case where don't want to use binning, return a single bin which covers the full range of possible density values. } diff --git a/man/zeros_to_ones.Rd b/man/zeros_to_ones.Rd index ca662211..4f5dc1c7 100644 --- a/man/zeros_to_ones.Rd +++ b/man/zeros_to_ones.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/measures_net_dis.R \name{zeros_to_ones} \alias{zeros_to_ones} -\title{Replace zero values in a vector with ones. Used by +\title{FLAGUSED +Replace zero values in a vector with ones. Used by \code{scale_graphlet_count} to prevent divide by zero errors.} \usage{ @@ -13,6 +14,7 @@ zeros_to_ones(v) TODO remove export} } \description{ +FLAGUSED Replace zero values in a vector with ones. Used by \code{scale_graphlet_count} to prevent divide by zero errors. diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 85ec52da..27152b5b 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -118,166 +118,166 @@ test_that(test_message, { expect_equal(expected_tuple_count_n73_ego2_gs5, actual_tuple_count_n73_ego2_gs5) }) -context("Measures Netdis: Ego-network scaled graphlet outputs for manually verified networks") -test_that("Ego-network 4-node graphlet counts match manually verified totals", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - - # Set node and graphlet labels to use for row and col names in expected counts - node_labels <- igraph::V(graph)$name - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - - # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 - max_graphlet_size <- 4 - min_ego_edges <- 0 - min_ego_nodes <- 0 - - actual_counts_order_1 <- - count_graphlets_ego_scaled(graph, - max_graphlet_size = max_graphlet_size, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes, - neighbourhood_size = 1 - ) - actual_counts_order_2 <- - count_graphlets_ego_scaled(graph, - max_graphlet_size = max_graphlet_size, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes, - neighbourhood_size = 2 - ) - - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # Set manually verified counts - # 1-step ego networks - expected_counts_order_1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) - ) - rownames(expected_counts_order_1) <- node_labels - colnames(expected_counts_order_1) <- graphlet_labels - # 2-step ego networks - expected_counts_order_2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), - c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) - ) - rownames(expected_counts_order_2) <- node_labels - colnames(expected_counts_order_2) <- graphlet_labels - - # Test that actual counts match expected with only counts requested (default) - expect_equal(actual_counts_order_1, expected_counts_order_1) - expect_equal(actual_counts_order_2, expected_counts_order_2) - - # Test that actual counts and returned ego networks match expected - # 1. Define expected - expected_ego_networks_order_1 <- make_named_ego_graph(graph, - order = 1, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes - ) - expected_ego_networks_order_2 <- make_named_ego_graph(graph, - order = 2, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes - ) - expected_counts_with_networks_order_1 <- - list( - graphlet_counts = expected_counts_order_1, - ego_networks = expected_ego_networks_order_1 - ) - expected_counts_with_networks_order_2 <- - list( - graphlet_counts = expected_counts_order_2, - ego_networks = expected_ego_networks_order_2 - ) - # 2. Calculate actual - actual_counts_with_networks_order_1 <- - count_graphlets_ego_scaled(graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes, - return_ego_networks = TRUE - ) - actual_counts_with_networks_order_2 <- - count_graphlets_ego_scaled(graph, - max_graphlet_size = max_graphlet_size, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes, - neighbourhood_size = 2, return_ego_networks = TRUE - ) - - # 3. Compare - # Comparison is not implemented for igraph objects, so convert all igraphs to - # indexed edge list and then compare. Do in-situ replacement of igraphs with - # indexed edge lists to ensure we are checking full properties of returned - # objects (i.e. named lists with matching elements). - # 3a. Convert expected and actual ego networks from igraphs to indexed edges - expected_counts_with_networks_order_1$ego_networks <- - purrr::map( - expected_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges - ) - expected_counts_with_networks_order_2$ego_networks <- - purrr::map( - expected_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges - ) - actual_counts_with_networks_order_1$ego_networks <- - purrr::map( - actual_counts_with_networks_order_1$ego_networks, - graph_to_indexed_edges - ) - actual_counts_with_networks_order_2$ego_networks <- - purrr::map( - actual_counts_with_networks_order_2$ego_networks, - graph_to_indexed_edges - ) - # 3b. Do comparison - expect_equal( - actual_counts_with_networks_order_1, - expected_counts_with_networks_order_1 - ) - expect_equal( - actual_counts_with_networks_order_2, - expected_counts_with_networks_order_2 - ) -}) +# context("Measures Netdis: Ego-network scaled graphlet outputs for manually verified networks") +# test_that("Ego-network 4-node graphlet counts match manually verified totals", { +# # Set up a small sample network with at least one ego-network that contains +# # at least one of each graphlets +# elist <- rbind( +# c("n1", "n2"), +# c("n2", "n3"), +# c("n1", "n4"), +# c("n2", "n5"), +# c("n1", "n6"), +# c("n1", "n7"), +# c("n2", "n4"), +# c("n4", "n6"), +# c("n6", "n8"), +# c("n7", "n8"), +# c("n7", "n9"), +# c("n7", "n10"), +# c("n8", "n9"), +# c("n8", "n10"), +# c("n9", "n10") +# ) +# graph <- igraph::graph_from_edgelist(elist, directed = FALSE) +# +# # Set node and graphlet labels to use for row and col names in expected counts +# node_labels <- igraph::V(graph)$name +# graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") +# +# # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 +# max_graphlet_size <- 4 +# min_ego_edges <- 0 +# min_ego_nodes <- 0 +# +# actual_counts_order_1 <- +# count_graphlets_ego_scaled(graph, +# max_graphlet_size = max_graphlet_size, +# min_ego_edges = min_ego_edges, +# min_ego_nodes = min_ego_nodes, +# neighbourhood_size = 1 +# ) +# actual_counts_order_2 <- +# count_graphlets_ego_scaled(graph, +# max_graphlet_size = max_graphlet_size, +# min_ego_edges = min_ego_edges, +# min_ego_nodes = min_ego_nodes, +# neighbourhood_size = 2 +# ) +# +# graphlet_key <- graphlet_key(max_graphlet_size) +# k <- graphlet_key$node_count +# # Set manually verified counts +# # 1-step ego networks +# expected_counts_order_1 <- rbind( +# c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), +# c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), +# c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), +# c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), +# c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), +# c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), +# c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), +# c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), +# c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), +# c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) +# ) +# rownames(expected_counts_order_1) <- node_labels +# colnames(expected_counts_order_1) <- graphlet_labels +# # 2-step ego networks +# expected_counts_order_2 <- rbind( +# c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), +# c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), +# c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), +# c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), +# c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), +# c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), +# c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), +# c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), +# c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), +# c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) +# ) +# rownames(expected_counts_order_2) <- node_labels +# colnames(expected_counts_order_2) <- graphlet_labels +# +# # Test that actual counts match expected with only counts requested (default) +# expect_equal(actual_counts_order_1, expected_counts_order_1) +# expect_equal(actual_counts_order_2, expected_counts_order_2) +# +# # Test that actual counts and returned ego networks match expected +# # 1. Define expected +# expected_ego_networks_order_1 <- make_named_ego_graph(graph, +# order = 1, +# min_ego_edges = min_ego_edges, +# min_ego_nodes = min_ego_nodes +# ) +# expected_ego_networks_order_2 <- make_named_ego_graph(graph, +# order = 2, +# min_ego_edges = min_ego_edges, +# min_ego_nodes = min_ego_nodes +# ) +# expected_counts_with_networks_order_1 <- +# list( +# graphlet_counts = expected_counts_order_1, +# ego_networks = expected_ego_networks_order_1 +# ) +# expected_counts_with_networks_order_2 <- +# list( +# graphlet_counts = expected_counts_order_2, +# ego_networks = expected_ego_networks_order_2 +# ) +# # 2. Calculate actual +# actual_counts_with_networks_order_1 <- +# count_graphlets_ego_scaled(graph, +# max_graphlet_size = max_graphlet_size, +# neighbourhood_size = 1, +# min_ego_edges = min_ego_edges, +# min_ego_nodes = min_ego_nodes, +# return_ego_networks = TRUE +# ) +# actual_counts_with_networks_order_2 <- +# count_graphlets_ego_scaled(graph, +# max_graphlet_size = max_graphlet_size, +# min_ego_edges = min_ego_edges, +# min_ego_nodes = min_ego_nodes, +# neighbourhood_size = 2, return_ego_networks = TRUE +# ) +# +# # 3. Compare +# # Comparison is not implemented for igraph objects, so convert all igraphs to +# # indexed edge list and then compare. Do in-situ replacement of igraphs with +# # indexed edge lists to ensure we are checking full properties of returned +# # objects (i.e. named lists with matching elements). +# # 3a. Convert expected and actual ego networks from igraphs to indexed edges +# expected_counts_with_networks_order_1$ego_networks <- +# purrr::map( +# expected_counts_with_networks_order_1$ego_networks, +# graph_to_indexed_edges +# ) +# expected_counts_with_networks_order_2$ego_networks <- +# purrr::map( +# expected_counts_with_networks_order_2$ego_networks, +# graph_to_indexed_edges +# ) +# actual_counts_with_networks_order_1$ego_networks <- +# purrr::map( +# actual_counts_with_networks_order_1$ego_networks, +# graph_to_indexed_edges +# ) +# actual_counts_with_networks_order_2$ego_networks <- +# purrr::map( +# actual_counts_with_networks_order_2$ego_networks, +# graph_to_indexed_edges +# ) +# # 3b. Do comparison +# expect_equal( +# actual_counts_with_networks_order_1, +# expected_counts_with_networks_order_1 +# ) +# expect_equal( +# actual_counts_with_networks_order_2, +# expected_counts_with_networks_order_2 +# ) +# }) context("Measures Netdis: Ego-network density values match those for manually verified networks") test_that("Ego-network 4-node density values match manually verified totals", { @@ -1007,143 +1007,143 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { }) -test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes", { - # Helper function to generate graphs with known density and number of nodes - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) - max_graphlet_size <- 4 - min_ego_edges <- 0 - min_ego_nodes <- 0 - - # Make graph ego networks - ego_networks_o1 <- make_named_ego_graph(graph, - order = 1, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes - ) - ego_networks_o2 <- make_named_ego_graph(graph, - order = 2, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes - ) - # Set manually-verified node counts and densities - # 1. Ego-networks of order 1 - num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) - num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) - max_edges_o1 <- choose(num_nodes_o1, 2) - densities_o1 <- num_edges_o1 / max_edges_o1 - # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 - # 2. Ego-networks of order 2 - num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) - num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) - max_edges_o2 <- choose(num_nodes_o2, 2) - densities_o2 <- num_edges_o2 / max_edges_o2 - # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 - # Set manually defined density breaks and indexes - breaks <- c(0, 0.11, 0.21, 0.31, 0.41, 0.51, 0.61, 0.71, 0.81, 0.91, 1.0) - density_indexes_o1 <- c(6, 5, 10, 9, 10, 7, 7, 7, 10, 10) - density_indexes_o2 <- c(4, 4, 5, 4, 5, 5, 5, 6, 6, 6) - # Set dummy reference counts - scaled_reference_counts <- rbind( - c(1, 2, 3, 4, 5, 6, 7, 8, 9), - c(11, 12, 13, 14, 15, 16, 17, 18, 19), - c(21, 22, 23, 24, 25, 26, 27, 28, 29), - c(31, 32, 33, 34, 35, 36, 37, 38, 39), - c(41, 42, 43, 44, 45, 46, 47, 48, 49), - c(51, 52, 53, 54, 55, 56, 57, 58, 59), - c(61, 62, 63, 64, 65, 66, 67, 68, 69), - c(71, 72, 73, 74, 75, 76, 77, 78, 79), - c(81, 82, 83, 84, 85, 86, 87, 88, 89), - c(91, 92, 93, 94, 95, 96, 97, 98, 99) - ) - expected_dims <- dim(scaled_reference_counts) - min_ego_nodes <- 3 - min_ego_edges <- 1 - - # Helper function to calculate expected expected graphlet counts - expected_expected_graphlet_counts_fn <- function(density_index, node_count) { - reference_counts <- scaled_reference_counts[density_index, ] - reference_counts * choose(node_count, graphlet_sizes) - } - # Calculate expected graphlet counts. NOTE: We expect a matrix with graphlet - # types as columns and ego networks for nodes in graph as rows - expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( - density_indexes_o1, num_nodes_o1, expected_expected_graphlet_counts_fn - ))) - expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( - density_indexes_o2, num_nodes_o2, expected_expected_graphlet_counts_fn - ))) - # Sanity check for expected output shape. Should be matrix with graphlet types - # as columns and nodes as rows - expect_equal(dim(expected_expected_graphlet_counts_ego_o1), expected_dims) - expect_equal(dim(expected_expected_graphlet_counts_ego_o2), expected_dims) - # Set column labels to graphlet names - colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels - colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels - # Set row labels to ego network names - rownames(expected_expected_graphlet_counts_ego_o1) <- names(ego_networks_o1) - rownames(expected_expected_graphlet_counts_ego_o2) <- names(ego_networks_o2) - # Drop rows for nodes with ewer than minumum required nodes and edges in ego - # network - expected_expected_graphlet_counts_ego_o1 <- - expected_expected_graphlet_counts_ego_o1[ - (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), - ] - expected_expected_graphlet_counts_ego_o2 <- - expected_expected_graphlet_counts_ego_o2[ - (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), - ] - - # Calculate actual output of function under test - actual_expected_graphlet_counts_ego_o1 <- - netdis_expected_graphlet_counts_ego( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, density_breaks = breaks, - density_binned_reference_counts = scaled_reference_counts, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_fn = count_graphlet_tuples - ) - actual_expected_graphlet_counts_ego_o2 <- - netdis_expected_graphlet_counts_ego( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, density_breaks = breaks, - density_binned_reference_counts = scaled_reference_counts, - min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - scale_fn = count_graphlet_tuples - ) - - # Compare actual to expected - expect_equal( - actual_expected_graphlet_counts_ego_o1, - expected_expected_graphlet_counts_ego_o1 - ) - expect_equal( - actual_expected_graphlet_counts_ego_o2, - expected_expected_graphlet_counts_ego_o2 - ) -}) +# test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes", { +# # Helper function to generate graphs with known density and number of nodes +# # Set up a small sample network with at least one ego-network that contains +# # at least one of each graphlets +# elist <- rbind( +# c("n1", "n2"), +# c("n2", "n3"), +# c("n1", "n4"), +# c("n2", "n5"), +# c("n1", "n6"), +# c("n1", "n7"), +# c("n2", "n4"), +# c("n4", "n6"), +# c("n6", "n8"), +# c("n7", "n8"), +# c("n7", "n9"), +# c("n7", "n10"), +# c("n8", "n9"), +# c("n8", "n10"), +# c("n9", "n10") +# ) +# graph <- igraph::graph_from_edgelist(elist, directed = FALSE) +# graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") +# graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) +# max_graphlet_size <- 4 +# min_ego_edges <- 0 +# min_ego_nodes <- 0 +# +# # Make graph ego networks +# ego_networks_o1 <- make_named_ego_graph(graph, +# order = 1, +# min_ego_edges = min_ego_edges, +# min_ego_nodes = min_ego_nodes +# ) +# ego_networks_o2 <- make_named_ego_graph(graph, +# order = 2, +# min_ego_edges = min_ego_edges, +# min_ego_nodes = min_ego_nodes +# ) +# # Set manually-verified node counts and densities +# # 1. Ego-networks of order 1 +# num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) +# num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) +# max_edges_o1 <- choose(num_nodes_o1, 2) +# densities_o1 <- num_edges_o1 / max_edges_o1 +# # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 +# # 2. Ego-networks of order 2 +# num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) +# num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) +# max_edges_o2 <- choose(num_nodes_o2, 2) +# densities_o2 <- num_edges_o2 / max_edges_o2 +# # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 +# # Set manually defined density breaks and indexes +# breaks <- c(0, 0.11, 0.21, 0.31, 0.41, 0.51, 0.61, 0.71, 0.81, 0.91, 1.0) +# density_indexes_o1 <- c(6, 5, 10, 9, 10, 7, 7, 7, 10, 10) +# density_indexes_o2 <- c(4, 4, 5, 4, 5, 5, 5, 6, 6, 6) +# # Set dummy reference counts +# scaled_reference_counts <- rbind( +# c(1, 2, 3, 4, 5, 6, 7, 8, 9), +# c(11, 12, 13, 14, 15, 16, 17, 18, 19), +# c(21, 22, 23, 24, 25, 26, 27, 28, 29), +# c(31, 32, 33, 34, 35, 36, 37, 38, 39), +# c(41, 42, 43, 44, 45, 46, 47, 48, 49), +# c(51, 52, 53, 54, 55, 56, 57, 58, 59), +# c(61, 62, 63, 64, 65, 66, 67, 68, 69), +# c(71, 72, 73, 74, 75, 76, 77, 78, 79), +# c(81, 82, 83, 84, 85, 86, 87, 88, 89), +# c(91, 92, 93, 94, 95, 96, 97, 98, 99) +# ) +# expected_dims <- dim(scaled_reference_counts) +# min_ego_nodes <- 3 +# min_ego_edges <- 1 +# +# # Helper function to calculate expected expected graphlet counts +# expected_expected_graphlet_counts_fn <- function(density_index, node_count) { +# reference_counts <- scaled_reference_counts[density_index, ] +# reference_counts * choose(node_count, graphlet_sizes) +# } +# # Calculate expected graphlet counts. NOTE: We expect a matrix with graphlet +# # types as columns and ego networks for nodes in graph as rows +# expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( +# density_indexes_o1, num_nodes_o1, expected_expected_graphlet_counts_fn +# ))) +# expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( +# density_indexes_o2, num_nodes_o2, expected_expected_graphlet_counts_fn +# ))) +# # Sanity check for expected output shape. Should be matrix with graphlet types +# # as columns and nodes as rows +# expect_equal(dim(expected_expected_graphlet_counts_ego_o1), expected_dims) +# expect_equal(dim(expected_expected_graphlet_counts_ego_o2), expected_dims) +# # Set column labels to graphlet names +# colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels +# colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels +# # Set row labels to ego network names +# rownames(expected_expected_graphlet_counts_ego_o1) <- names(ego_networks_o1) +# rownames(expected_expected_graphlet_counts_ego_o2) <- names(ego_networks_o2) +# # Drop rows for nodes with ewer than minumum required nodes and edges in ego +# # network +# expected_expected_graphlet_counts_ego_o1 <- +# expected_expected_graphlet_counts_ego_o1[ +# (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), +# ] +# expected_expected_graphlet_counts_ego_o2 <- +# expected_expected_graphlet_counts_ego_o2[ +# (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), +# ] +# +# # Calculate actual output of function under test +# actual_expected_graphlet_counts_ego_o1 <- +# netdis_expected_graphlet_counts_ego( +# graph, +# max_graphlet_size = max_graphlet_size, +# neighbourhood_size = 1, density_breaks = breaks, +# density_binned_reference_counts = scaled_reference_counts, +# min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, +# scale_fn = count_graphlet_tuples +# ) +# actual_expected_graphlet_counts_ego_o2 <- +# netdis_expected_graphlet_counts_ego( +# graph, +# max_graphlet_size = max_graphlet_size, +# neighbourhood_size = 2, density_breaks = breaks, +# density_binned_reference_counts = scaled_reference_counts, +# min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, +# scale_fn = count_graphlet_tuples +# ) +# +# # Compare actual to expected +# expect_equal( +# actual_expected_graphlet_counts_ego_o1, +# expected_expected_graphlet_counts_ego_o1 +# ) +# expect_equal( +# actual_expected_graphlet_counts_ego_o2, +# expected_expected_graphlet_counts_ego_o2 +# ) +# }) test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 nodes", { # Helper function to generate graphs with known density and number of nodes @@ -1327,383 +1327,383 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n ) }) -test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 nodes", { - # Set up a small sample network with at least one ego-network that contains - # at least one of each graphlets - elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - graph <- igraph::graph_from_edgelist(elist, directed = FALSE) - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") - graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) - names(graphlet_sizes) <- graphlet_labels - max_graphlet_size <- 4 - # Make graph ego networks - min_ego_nodes <- 0 - min_edgo_edges <- 0 - ego_networks_o1 <- make_named_ego_graph(graph, - order = 1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_edgo_edges - ) - ego_networks_o2 <- make_named_ego_graph(graph, - order = 2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_edgo_edges - ) - # Set manually-verified node counts and densities - # 1. Ego-networks of order 1 - num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) - num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) - max_edges_o1 <- choose(num_nodes_o1, 2) - densities_o1 <- num_edges_o1 / max_edges_o1 - # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 - # 2. Ego-networks of order 2 - num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) - num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) - max_edges_o2 <- choose(num_nodes_o2, 2) - densities_o2 <- num_edges_o2 / max_edges_o2 - # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 - # Set manually determined density breaks and indexes, based on a min bin count - # of 2 and an initial request for 100 bins - min_bin_count <- 2 - num_bins <- 100 - num_breaks <- num_bins + 1 - min_density_o1 <- 0.5 - max_density_o1 <- 1.0 - breaks_o1 <- seq(min_density_o1, max_density_o1, length.out = num_breaks)[c(1, 22, 42, 101)] - density_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) - min_density_o2 <- 1 / 3 - max_density_o2 <- 0.6 - breaks_o2 <- seq(min_density_o2, max_density_o2, length.out = num_breaks)[c(1, 10, 51, 64, 101)] - density_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) - # Guard against errors in manually determined breaks and indexes by checking - # against already tested code. This also lets us ensure we handle densities - # falling exactly on a bin boundary the same as the code under test. - comp_binned_densities_o1 <- binned_densities_adaptive( - densities_o1, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins - ) - comp_binned_densities_o2 <- binned_densities_adaptive( - densities_o2, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins - ) - expect_equal( - comp_binned_densities_o1, - list( - densities = densities_o1, - interval_indexes = density_indexes_o1, - breaks = breaks_o1 - ) - ) - expect_equal( - comp_binned_densities_o2, - list( - densities = densities_o2, - interval_indexes = density_indexes_o2, - breaks = breaks_o2 - ) - ) - - # Set manually verified scaled ego-network graphlet counts - graphlet_key <- graphlet_key(max_graphlet_size) - k <- graphlet_key$node_count - # 1-step ego networks - scaled_reference_counts_o1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), - c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), - c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) - ) - # 2-step ego networks - scaled_reference_counts_o2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), - c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), - c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), - c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) - ) - min_ego_nodes <- 3 - min_ego_edges <- 1 - # Drop rows for nodes with ewer than minumum required nodes and edges in ego - # network - scaled_reference_counts_o1 <- - scaled_reference_counts_o1[ - (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), - ] - scaled_reference_counts_o2 <- - scaled_reference_counts_o2[ - (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), - ] - density_indexes_o1 <- density_indexes_o1[ - (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges) - ] - density_indexes_o2 <- density_indexes_o2[ - (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges) - ] - # Average manually verified scaled reference counts across density bins - density_binned_reference_counts_o1 <- rbind( - (scaled_reference_counts_o1[1, ] + scaled_reference_counts_o1[2, ]) / 2, - (scaled_reference_counts_o1[4, ] + scaled_reference_counts_o1[5, ] + - scaled_reference_counts_o1[6, ]) / 3, - (scaled_reference_counts_o1[3, ] + - scaled_reference_counts_o1[7, ] + - scaled_reference_counts_o1[8, ]) / 3 - ) - rownames(density_binned_reference_counts_o1) <- 1:3 - density_binned_reference_counts_o2 <- rbind( - (scaled_reference_counts_o2[1, ] + scaled_reference_counts_o2[4, ]) / 2, - (scaled_reference_counts_o2[2, ] + scaled_reference_counts_o2[6, ] + - scaled_reference_counts_o2[7, ]) / 3, - (scaled_reference_counts_o2[3, ] + scaled_reference_counts_o2[5, ]) / 2, - (scaled_reference_counts_o2[8, ] + scaled_reference_counts_o2[9, ] + - scaled_reference_counts_o2[10, ]) / 3 - ) - rownames(density_binned_reference_counts_o2) <- 1:4 - - # Helper functions to calculate expected expected graphlet counts - expected_expected_graphlet_counts_o1_fn <- function(density_index, node_count) { - reference_counts <- density_binned_reference_counts_o1[density_index, ] - reference_counts * choose(node_count, graphlet_sizes) - } - expected_expected_graphlet_counts_o2_fn <- function(density_index, node_count) { - reference_counts <- density_binned_reference_counts_o2[density_index, ] - reference_counts * choose(node_count, graphlet_sizes) - } - # Calculate expected graphlet counts - expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( - density_indexes_o1, num_nodes_o1[(num_nodes_o1 >= min_ego_nodes)], - expected_expected_graphlet_counts_o1_fn - ))) - rownames(expected_expected_graphlet_counts_ego_o1) <- - names(ego_networks_o1[(num_nodes_o1 >= min_ego_nodes)]) - expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( - density_indexes_o2, num_nodes_o2[(num_nodes_o2 >= min_ego_nodes)], - expected_expected_graphlet_counts_o2_fn - ))) - rownames(expected_expected_graphlet_counts_ego_o2) <- - names(ego_networks_o2[(num_nodes_o2 >= min_ego_nodes)]) - - # Sanity check manually derived expected expected counts by comparing against - # pre-tested fully applied expected_graphlet_counts_ego function - expect_equal( - expected_expected_graphlet_counts_ego_o1, - netdis_expected_graphlet_counts_ego( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - density_breaks = breaks_o1, - density_binned_reference_counts_o1, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - scale_fn = count_graphlet_tuples - ) - ) - expect_equal( - expected_expected_graphlet_counts_ego_o2, - netdis_expected_graphlet_counts_ego( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, - density_breaks = breaks_o2, - density_binned_reference_counts_o2, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - scale_fn = count_graphlet_tuples - ) - ) - - # Generate partially applied functions using function under test - actual_expected_graphlet_counts_ego_fn_o1 <- - netdis_expected_graphlet_counts_ego_fn( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, - min_bin_count = min_bin_count, - num_bins = num_bins, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - scale_fn = count_graphlet_tuples - ) - actual_expected_graphlet_counts_ego_fn_o2 <- - netdis_expected_graphlet_counts_ego_fn( - graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, - min_bin_count = min_bin_count, - num_bins = num_bins, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - scale_fn = count_graphlet_tuples - ) - # Generate actual expected accounts by applying generated functions to test - # graph - actual_expected_graphlet_counts_ego_o1 <- - actual_expected_graphlet_counts_ego_fn_o1(graph) - actual_expected_graphlet_counts_ego_o2 <- - actual_expected_graphlet_counts_ego_fn_o2(graph) - - # Compare actual to expected - expect_equal( - actual_expected_graphlet_counts_ego_o1, - expected_expected_graphlet_counts_ego_o1 - ) - expect_equal( - actual_expected_graphlet_counts_ego_o2, - expected_expected_graphlet_counts_ego_o2 - ) -}) - -context("Measures Netdis: Centered graphlet counts") -test_that("netdis_centred_graphlet_counts_ego is correct", { - # Set up small sample networks each with each graphlet represented in at least - # one ego network - ref_elist <- rbind( - c("n1", "n2"), - c("n1", "n3"), - c("n1", "n4"), - c("n1", "n5"), - c("n1", "n6"), - c("n2", "n7"), - c("n2", "n8"), - c("n2", "n9"), - c("n9", "n10"), - c("n10", "n11"), - c("n11", "n12"), - c("n11", "n13"), - c("n2", "n14"), - c("n8", "n14"), - c("n12", "n15"), - c("n12", "n16"), - c("n15", "n17"), - c("n12", "n18"), - c("n15", "n18"), - c("n16", "n17"), - c("n16", "n18"), - c("n17", "n18"), - c("n16", "n19"), - c("n16", "n20"), - c("n16", "n21"), - c("n19", "n20"), - c("n19", "n21"), - c("n15", "n22"), - c("n15", "n23"), - c("n15", "n24"), - c("n22", "n23"), - c("n22", "n24"), - c("n23", "n24") - ) - ref_graph <- igraph::graph_from_edgelist(ref_elist, directed = FALSE) - - query_elist <- rbind( - c("n1", "n2"), - c("n2", "n3"), - c("n1", "n4"), - c("n2", "n5"), - c("n1", "n6"), - c("n1", "n7"), - c("n2", "n4"), - c("n4", "n6"), - c("n6", "n8"), - c("n7", "n8"), - c("n7", "n9"), - c("n7", "n10"), - c("n8", "n9"), - c("n8", "n10"), - c("n9", "n10") - ) - query_graph <- igraph::graph_from_edgelist(query_elist, directed = FALSE) - - max_graphlet_size <- 4 - # Use pre-tested functions to generate ego-network graphlet counts - # 1. Reference graph ego-network graphlet counts - ref_o1 <- count_graphlets_ego( - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, return_ego_networks = TRUE - ) - ego_counts_ref_o1 <- ref_o1$graphlet_counts - ego_networks_ref_o1 <- ref_o1$ego_networks - density_ref_o1 <- sapply(ego_networks_ref_o1, igraph::edge_density) - - ref_o2 <- count_graphlets_ego( - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, return_ego_networks = TRUE - ) - ego_counts_ref_o2 <- ref_o2$graphlet_counts - ego_networks_ref_o2 <- ref_o2$ego_networks - density_ref_o2 <- sapply(ego_networks_ref_o2, igraph::edge_density) - - # 2. Query graph ego-network graphlet countsa - query_o1 <- count_graphlets_ego( - query_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 1, return_ego_networks = TRUE - ) - ego_counts_query_o1 <- query_o1$graphlet_counts - ego_networks_query_o1 <- query_o1$ego_networks - density_query_o1 <- sapply(ego_networks_query_o1, igraph::edge_density) +# test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 nodes", { +# # Set up a small sample network with at least one ego-network that contains +# # at least one of each graphlets +# elist <- rbind( +# c("n1", "n2"), +# c("n2", "n3"), +# c("n1", "n4"), +# c("n2", "n5"), +# c("n1", "n6"), +# c("n1", "n7"), +# c("n2", "n4"), +# c("n4", "n6"), +# c("n6", "n8"), +# c("n7", "n8"), +# c("n7", "n9"), +# c("n7", "n10"), +# c("n8", "n9"), +# c("n8", "n10"), +# c("n9", "n10") +# ) +# graph <- igraph::graph_from_edgelist(elist, directed = FALSE) +# graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") +# graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) +# names(graphlet_sizes) <- graphlet_labels +# max_graphlet_size <- 4 +# # Make graph ego networks +# min_ego_nodes <- 0 +# min_edgo_edges <- 0 +# ego_networks_o1 <- make_named_ego_graph(graph, +# order = 1, +# min_ego_nodes = min_ego_nodes, +# min_ego_edges = min_edgo_edges +# ) +# ego_networks_o2 <- make_named_ego_graph(graph, +# order = 2, +# min_ego_nodes = min_ego_nodes, +# min_ego_edges = min_edgo_edges +# ) +# # Set manually-verified node counts and densities +# # 1. Ego-networks of order 1 +# num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) +# num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) +# max_edges_o1 <- choose(num_nodes_o1, 2) +# densities_o1 <- num_edges_o1 / max_edges_o1 +# # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 +# # 2. Ego-networks of order 2 +# num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) +# num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) +# max_edges_o2 <- choose(num_nodes_o2, 2) +# densities_o2 <- num_edges_o2 / max_edges_o2 +# # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 +# # Set manually determined density breaks and indexes, based on a min bin count +# # of 2 and an initial request for 100 bins +# min_bin_count <- 2 +# num_bins <- 100 +# num_breaks <- num_bins + 1 +# min_density_o1 <- 0.5 +# max_density_o1 <- 1.0 +# breaks_o1 <- seq(min_density_o1, max_density_o1, length.out = num_breaks)[c(1, 22, 42, 101)] +# density_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) +# min_density_o2 <- 1 / 3 +# max_density_o2 <- 0.6 +# breaks_o2 <- seq(min_density_o2, max_density_o2, length.out = num_breaks)[c(1, 10, 51, 64, 101)] +# density_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) +# # Guard against errors in manually determined breaks and indexes by checking +# # against already tested code. This also lets us ensure we handle densities +# # falling exactly on a bin boundary the same as the code under test. +# comp_binned_densities_o1 <- binned_densities_adaptive( +# densities_o1, +# min_counts_per_interval = min_bin_count, +# num_intervals = num_bins +# ) +# comp_binned_densities_o2 <- binned_densities_adaptive( +# densities_o2, +# min_counts_per_interval = min_bin_count, +# num_intervals = num_bins +# ) +# expect_equal( +# comp_binned_densities_o1, +# list( +# densities = densities_o1, +# interval_indexes = density_indexes_o1, +# breaks = breaks_o1 +# ) +# ) +# expect_equal( +# comp_binned_densities_o2, +# list( +# densities = densities_o2, +# interval_indexes = density_indexes_o2, +# breaks = breaks_o2 +# ) +# ) +# +# # Set manually verified scaled ego-network graphlet counts +# graphlet_key <- graphlet_key(max_graphlet_size) +# k <- graphlet_key$node_count +# # 1-step ego networks +# scaled_reference_counts_o1 <- rbind( +# c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), +# c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), +# c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), +# c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), +# c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), +# c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), +# c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), +# c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), +# c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), +# c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) +# ) +# # 2-step ego networks +# scaled_reference_counts_o2 <- rbind( +# c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), +# c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), +# c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), +# c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), +# c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), +# c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), +# c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), +# c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), +# c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), +# c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) +# ) +# min_ego_nodes <- 3 +# min_ego_edges <- 1 +# # Drop rows for nodes with ewer than minumum required nodes and edges in ego +# # network +# scaled_reference_counts_o1 <- +# scaled_reference_counts_o1[ +# (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), +# ] +# scaled_reference_counts_o2 <- +# scaled_reference_counts_o2[ +# (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), +# ] +# density_indexes_o1 <- density_indexes_o1[ +# (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges) +# ] +# density_indexes_o2 <- density_indexes_o2[ +# (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges) +# ] +# # Average manually verified scaled reference counts across density bins +# density_binned_reference_counts_o1 <- rbind( +# (scaled_reference_counts_o1[1, ] + scaled_reference_counts_o1[2, ]) / 2, +# (scaled_reference_counts_o1[4, ] + scaled_reference_counts_o1[5, ] + +# scaled_reference_counts_o1[6, ]) / 3, +# (scaled_reference_counts_o1[3, ] + +# scaled_reference_counts_o1[7, ] + +# scaled_reference_counts_o1[8, ]) / 3 +# ) +# rownames(density_binned_reference_counts_o1) <- 1:3 +# density_binned_reference_counts_o2 <- rbind( +# (scaled_reference_counts_o2[1, ] + scaled_reference_counts_o2[4, ]) / 2, +# (scaled_reference_counts_o2[2, ] + scaled_reference_counts_o2[6, ] + +# scaled_reference_counts_o2[7, ]) / 3, +# (scaled_reference_counts_o2[3, ] + scaled_reference_counts_o2[5, ]) / 2, +# (scaled_reference_counts_o2[8, ] + scaled_reference_counts_o2[9, ] + +# scaled_reference_counts_o2[10, ]) / 3 +# ) +# rownames(density_binned_reference_counts_o2) <- 1:4 +# +# # Helper functions to calculate expected expected graphlet counts +# expected_expected_graphlet_counts_o1_fn <- function(density_index, node_count) { +# reference_counts <- density_binned_reference_counts_o1[density_index, ] +# reference_counts * choose(node_count, graphlet_sizes) +# } +# expected_expected_graphlet_counts_o2_fn <- function(density_index, node_count) { +# reference_counts <- density_binned_reference_counts_o2[density_index, ] +# reference_counts * choose(node_count, graphlet_sizes) +# } +# # Calculate expected graphlet counts +# expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( +# density_indexes_o1, num_nodes_o1[(num_nodes_o1 >= min_ego_nodes)], +# expected_expected_graphlet_counts_o1_fn +# ))) +# rownames(expected_expected_graphlet_counts_ego_o1) <- +# names(ego_networks_o1[(num_nodes_o1 >= min_ego_nodes)]) +# expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( +# density_indexes_o2, num_nodes_o2[(num_nodes_o2 >= min_ego_nodes)], +# expected_expected_graphlet_counts_o2_fn +# ))) +# rownames(expected_expected_graphlet_counts_ego_o2) <- +# names(ego_networks_o2[(num_nodes_o2 >= min_ego_nodes)]) +# +# # Sanity check manually derived expected expected counts by comparing against +# # pre-tested fully applied expected_graphlet_counts_ego function +# expect_equal( +# expected_expected_graphlet_counts_ego_o1, +# netdis_expected_graphlet_counts_ego( +# graph, +# max_graphlet_size = max_graphlet_size, +# neighbourhood_size = 1, +# density_breaks = breaks_o1, +# density_binned_reference_counts_o1, +# min_ego_nodes = min_ego_nodes, +# min_ego_edges = min_ego_edges, +# scale_fn = count_graphlet_tuples +# ) +# ) +# expect_equal( +# expected_expected_graphlet_counts_ego_o2, +# netdis_expected_graphlet_counts_ego( +# graph, +# max_graphlet_size = max_graphlet_size, +# neighbourhood_size = 2, +# density_breaks = breaks_o2, +# density_binned_reference_counts_o2, +# min_ego_nodes = min_ego_nodes, +# min_ego_edges = min_ego_edges, +# scale_fn = count_graphlet_tuples +# ) +# ) +# +# # Generate partially applied functions using function under test +# actual_expected_graphlet_counts_ego_fn_o1 <- +# netdis_expected_graphlet_counts_ego_fn( +# graph, +# max_graphlet_size = max_graphlet_size, +# neighbourhood_size = 1, +# min_bin_count = min_bin_count, +# num_bins = num_bins, +# min_ego_nodes = min_ego_nodes, +# min_ego_edges = min_ego_edges, +# scale_fn = count_graphlet_tuples +# ) +# actual_expected_graphlet_counts_ego_fn_o2 <- +# netdis_expected_graphlet_counts_ego_fn( +# graph, +# max_graphlet_size = max_graphlet_size, +# neighbourhood_size = 2, +# min_bin_count = min_bin_count, +# num_bins = num_bins, +# min_ego_nodes = min_ego_nodes, +# min_ego_edges = min_ego_edges, +# scale_fn = count_graphlet_tuples +# ) +# # Generate actual expected accounts by applying generated functions to test +# # graph +# actual_expected_graphlet_counts_ego_o1 <- +# actual_expected_graphlet_counts_ego_fn_o1(graph) +# actual_expected_graphlet_counts_ego_o2 <- +# actual_expected_graphlet_counts_ego_fn_o2(graph) +# +# # Compare actual to expected +# expect_equal( +# actual_expected_graphlet_counts_ego_o1, +# expected_expected_graphlet_counts_ego_o1 +# ) +# expect_equal( +# actual_expected_graphlet_counts_ego_o2, +# expected_expected_graphlet_counts_ego_o2 +# ) +# }) - query_o2 <- count_graphlets_ego( - query_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = 2, return_ego_networks = TRUE - ) - ego_counts_query_o2 <- query_o2$graphlet_counts - ego_networks_query_o2 <- query_o2$ego_networks - density_query_o2 <- sapply(ego_networks_query_o2, igraph::edge_density) - - centred_counts_k4 <- function(query_graphlet_count, ref_graphlet_count, - query_node_counts, ref_node_count, - min_nodes, min_edges, - min_bin_count, num_bins) { - graphlet_node_counts_k4 <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) - # 1. Calculate scaling factors for each reference and query graphlet count - # These are nCk, where n is the number of nodes in the network and - # k is the number of nodes in the graphlet - ref_scale_factor <- sapply( - graphlet_node_counts_k4, FUN <- function(k) { - choose(ref_node_count, k) - } - ) - query_scale_factor <- sapply( - graphlet_node_counts_k4, FUN <- function(k) { - choose(query_node_count, k) - } - ) - # 2. Calculate scaled reference counts by dividing by ref_scale_factor - ref_scaled_graphlet_count <- query_graphlet_count / ref_scale_factor - # - } -}) +# context("Measures Netdis: Centered graphlet counts") +# test_that("netdis_centred_graphlet_counts_ego is correct", { +# # Set up small sample networks each with each graphlet represented in at least +# # one ego network +# ref_elist <- rbind( +# c("n1", "n2"), +# c("n1", "n3"), +# c("n1", "n4"), +# c("n1", "n5"), +# c("n1", "n6"), +# c("n2", "n7"), +# c("n2", "n8"), +# c("n2", "n9"), +# c("n9", "n10"), +# c("n10", "n11"), +# c("n11", "n12"), +# c("n11", "n13"), +# c("n2", "n14"), +# c("n8", "n14"), +# c("n12", "n15"), +# c("n12", "n16"), +# c("n15", "n17"), +# c("n12", "n18"), +# c("n15", "n18"), +# c("n16", "n17"), +# c("n16", "n18"), +# c("n17", "n18"), +# c("n16", "n19"), +# c("n16", "n20"), +# c("n16", "n21"), +# c("n19", "n20"), +# c("n19", "n21"), +# c("n15", "n22"), +# c("n15", "n23"), +# c("n15", "n24"), +# c("n22", "n23"), +# c("n22", "n24"), +# c("n23", "n24") +# ) +# ref_graph <- igraph::graph_from_edgelist(ref_elist, directed = FALSE) +# +# query_elist <- rbind( +# c("n1", "n2"), +# c("n2", "n3"), +# c("n1", "n4"), +# c("n2", "n5"), +# c("n1", "n6"), +# c("n1", "n7"), +# c("n2", "n4"), +# c("n4", "n6"), +# c("n6", "n8"), +# c("n7", "n8"), +# c("n7", "n9"), +# c("n7", "n10"), +# c("n8", "n9"), +# c("n8", "n10"), +# c("n9", "n10") +# ) +# query_graph <- igraph::graph_from_edgelist(query_elist, directed = FALSE) +# +# max_graphlet_size <- 4 +# # Use pre-tested functions to generate ego-network graphlet counts +# # 1. Reference graph ego-network graphlet counts +# ref_o1 <- count_graphlets_ego( +# ref_graph, +# max_graphlet_size = max_graphlet_size, +# neighbourhood_size = 1, return_ego_networks = TRUE +# ) +# ego_counts_ref_o1 <- ref_o1$graphlet_counts +# ego_networks_ref_o1 <- ref_o1$ego_networks +# density_ref_o1 <- sapply(ego_networks_ref_o1, igraph::edge_density) +# +# ref_o2 <- count_graphlets_ego( +# ref_graph, +# max_graphlet_size = max_graphlet_size, +# neighbourhood_size = 2, return_ego_networks = TRUE +# ) +# ego_counts_ref_o2 <- ref_o2$graphlet_counts +# ego_networks_ref_o2 <- ref_o2$ego_networks +# density_ref_o2 <- sapply(ego_networks_ref_o2, igraph::edge_density) +# +# # 2. Query graph ego-network graphlet countsa +# query_o1 <- count_graphlets_ego( +# query_graph, +# max_graphlet_size = max_graphlet_size, +# neighbourhood_size = 1, return_ego_networks = TRUE +# ) +# ego_counts_query_o1 <- query_o1$graphlet_counts +# ego_networks_query_o1 <- query_o1$ego_networks +# density_query_o1 <- sapply(ego_networks_query_o1, igraph::edge_density) +# +# query_o2 <- count_graphlets_ego( +# query_graph, +# max_graphlet_size = max_graphlet_size, +# neighbourhood_size = 2, return_ego_networks = TRUE +# ) +# ego_counts_query_o2 <- query_o2$graphlet_counts +# ego_networks_query_o2 <- query_o2$ego_networks +# density_query_o2 <- sapply(ego_networks_query_o2, igraph::edge_density) +# +# centred_counts_k4 <- function(query_graphlet_count, ref_graphlet_count, +# query_node_counts, ref_node_count, +# min_nodes, min_edges, +# min_bin_count, num_bins) { +# graphlet_node_counts_k4 <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) +# # 1. Calculate scaling factors for each reference and query graphlet count +# # These are nCk, where n is the number of nodes in the network and +# # k is the number of nodes in the graphlet +# ref_scale_factor <- sapply( +# graphlet_node_counts_k4, FUN <- function(k) { +# choose(ref_node_count, k) +# } +# ) +# query_scale_factor <- sapply( +# graphlet_node_counts_k4, FUN <- function(k) { +# choose(query_node_count, k) +# } +# ) +# # 2. Calculate scaled reference counts by dividing by ref_scale_factor +# ref_scaled_graphlet_count <- query_graphlet_count / ref_scale_factor +# # +# } +# }) diff --git a/vignettes/dendrogram_example_net_dis.R b/vignettes/dendrogram_example_net_dis.R index c7bda001..a37a143f 100644 --- a/vignettes/dendrogram_example_net_dis.R +++ b/vignettes/dendrogram_example_net_dis.R @@ -2,17 +2,21 @@ library("netdist") edge_format = "ncol" # Load reference graph (used for Netdis. Not required for NetEMD) -ref_path = file.path(system.file(file.path("extdata", "random"), package = "netdist"), "ER_1250_10_1") +ref_path = file.path(system.file(file.path("extdata", "random"), + package = "netdist"), + "ER_1250_10_1") ref_graph <- read_simple_graph(ref_path, format = edge_format) # Set source directory and file properties for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -edge_format = "ncol" -file_pattern = "*" +source_dir <- system.file(file.path("extdata", "VRPINS"), + package = "netdist") +edge_format <- "ncol" +file_pattern <- "*" # Load all graphs in the source folder matching the filename pattern -query_graphs <- read_simple_graphs(source_dir, format = edge_format, - pattern = file_pattern) +query_graphs <- read_simple_graphs(source_dir, + format = edge_format, + pattern = file_pattern) print(names(query_graphs)) ## ------------------------------------------------------------------------ @@ -21,48 +25,63 @@ max_graphlet_size <- 4 neighbourhood_size <- 2 ## ------------------------------------------------------------------------ -expected_count_fn <- netdis_expected_graphlet_counts_ego_fn( - ref_graph, max_graphlet_size, neighbourhood_size) -## ------------------------------------------------------------------------ -centred_counts <- purrr::map(query_graphs, netdis_centred_graphlet_counts, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - expected_ego_count_fn = expected_count_fn) +# Calculate netdis measure for graphlets up to size max_graphlet_size +netdis_result <- netdis_many_to_many(query_graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size) -## ------------------------------------------------------------------------ # Netdis measure for graphlets of size 3 -res3 <- netdis_for_all_graphs(centred_counts, 3) -netdis3_mat <- cross_comp_to_matrix(res3$netdis, res3$comp_spec) +res3 <- netdis_result$netdis["netdis3", ] +netdis3_mat <- cross_comp_to_matrix(res3, netdis_result$comp_spec) + +print("Netdis: graphlet size = 3") +print(netdis3_mat) + # Netdis measure for graphlets of size 4 -res4 <- netdis_for_all_graphs(centred_counts, 4) -netdis4_mat <- cross_comp_to_matrix(res4$netdis, res4$comp_spec) -netdis4_mat +res4 <- netdis_result$netdis["netdis4", ] +netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec) + +print("Netdis: graphlet size = 4") +print(netdis4_mat) ## ------------------------------------------------------------------------ -graphdists<-as.dist(netdis4_mat) -par(mfrow=c(1,2)) -cex=1 +graphdists <- as.dist(netdis4_mat) +par(mfrow = c(1, 2)) +cex <- 1 + # Dendrogram based on Netdis measure for graphlets of size 3 -title = paste("Netdis: graphlet size = ", 3, sep = "") -plot(phangorn::upgma(as.dist(netdis3_mat), method="average"), use.edge.length=FALSE, - edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, - cex.sub=cex, cex=cex) +title <- paste("Netdis: graphlet size = ", 3, sep = "") +plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) + # Dendrogram based on Netdis measure for graphlets of size 4 title = paste("Netdis: graphlet size = ", 4, sep = "") -plot(phangorn::upgma(as.dist(netdis4_mat), method="average"), use.edge.length=FALSE, - edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, - cex.sub=cex, cex=cex) +plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) ## ------------------------------------------------------------------------ -cex=1.5 +cex <- 1.5 col <- colorRampPalette(colors = c("blue","white"))(100) -title = paste("Netdis: graphlet size = ", 3, sep = "") -heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) +title <- paste("Netdis: graphlet size = ", 3, sep = "") +heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) ## ------------------------------------------------------------------------ -cex=1.5 +cex <- 1.5 col <- colorRampPalette(colors = c("blue","white"))(100) -title = paste("Netdis: graphlet size = ", 4, sep = "") -heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) +title <- paste("Netdis: graphlet size = ", 4, sep = "") +heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) diff --git a/vignettes/dendrogram_example_net_dis.html b/vignettes/dendrogram_example_net_dis.html index 32819655..f53bacfc 100644 --- a/vignettes/dendrogram_example_net_dis.html +++ b/vignettes/dendrogram_example_net_dis.html @@ -12,7 +12,7 @@ - + Dendrogram example for Netdis @@ -20,9 +20,9 @@ - - - - - - - - - - - - - - - - - -

Netdis - 2 graphs with Constant Expected Counts for Each Graphlet

-

Martin O’Reilly, Jack Roberts

-

2019-09-05

- - - -
-

Load required libraries

-
# Load libraries
-library("netdist")
-library("purrr")
-
-
-

Load graphs

-
# Set source directory for Virus PPI graph edge files
-source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
-
-# Load query graphs
-graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
-                             format = "ncol")
-
-graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
-                             format = "ncol")
-
-
-

Set Netdis parameters

-
# Maximum graphlet size to calculate counts and netdis statistic for.
-max_graphlet_size <- 4
-
-# Ego network neighbourhood size
-neighbourhood_size <- 2
-
-# Minimum size of ego networks to consider
-min_ego_nodes <- 3
-min_ego_edges <- 1
-
-# Ego network density binning parameters
-min_bin_count <- 5
-num_bins <- 100
-
-
-

Generate ego networks

-
# Get ego networks for query graphs and reference graph
-ego_1 <- make_named_ego_graph(graph_1, 
-                              order = neighbourhood_size, 
-                              min_ego_nodes = min_ego_nodes, 
-                              min_ego_edges = min_ego_edges)
-
-ego_2 <- make_named_ego_graph(graph_2, 
-                              order = neighbourhood_size, 
-                              min_ego_nodes = min_ego_nodes, 
-                              min_ego_edges = min_ego_edges)
-
-
-

Count graphlets in ego networks

-
# Count graphlets for ego networks in query and reference graphs
-graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size)
-graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size)
-
-
-

Use mean graphlet counts as expected counts

-
# rep(1, nrow(graphlet_counts)): list of ones as bin index, i.e. everything in same bin
-mean_graphlet_counts_1 <- density_binned_counts(graphlet_counts_1,
-                                                rep(1, nrow(graphlet_counts_1)))
-
-mean_graphlet_counts_2 <- density_binned_counts(graphlet_counts_2,
-                                                rep(1, nrow(graphlet_counts_2)))
-
-bins <- c(0, 1)
-
-
-

Centre graphlet counts of query graphs using binned expected counts

-
# Calculate expected graphlet counts for each ego network
-exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, 
-                                                                 bins, 
-                                                                 mean_graphlet_counts_1,
-                                                                 max_graphlet_size,
-                                                                 scale_fn = NULL)
-
-
-exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, 
-                                                                 bins, 
-                                                                 mean_graphlet_counts_2,
-                                                                 max_graphlet_size,
-                                                                 scale_fn = NULL)
-# Centre graphlet counts by subtracting expected counts
-centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1
-
-centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2
-
-
-

Sum centred graphlet counts across all ego networks

-
sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1)
-
-sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2)
-
-
-

Calculate netdis statistics

-
netdis_result <- netdis_uptok(sum_graphlet_counts_1, 
-                              sum_graphlet_counts_2, 
-                              max_graphlet_size)
-
-print(netdis_result)
-
##   netdis3   netdis4 
-## 0.3116860 0.9592365
-
- - - - - - - - diff --git a/vignettes/netdis_customisations.R b/vignettes/netdis_customisations.R index 0026313e..e7ede873 100644 --- a/vignettes/netdis_customisations.R +++ b/vignettes/netdis_customisations.R @@ -78,7 +78,6 @@ print(results$netdis) print(results$comp_spec) ## ------------------------------------------------------------------------ - binning_fn <- single_density_bin bin_counts_fn <- density_binned_counts exp_counts_fn <- netdis_expected_graphlet_counts_per_ego @@ -97,5 +96,3 @@ results <- netdis_many_to_many(graphs, print(results$netdis) print(results$comp_spec) - - diff --git a/vignettes/netdis_customisations.html b/vignettes/netdis_customisations.html index c60d9905..0eed1bf1 100644 --- a/vignettes/netdis_customisations.html +++ b/vignettes/netdis_customisations.html @@ -12,7 +12,7 @@ - + Usage of netdis with binning and expected counts customisations. @@ -305,7 +305,7 @@

Usage of netdis with binning and expected counts customisations.

Jack Roberts

-

2019-09-05

+

2019-09-11

diff --git a/vignettes/netdis_multigraph_polya-aeppli.R b/vignettes/netdis_multigraph_polya-aeppli.R deleted file mode 100644 index 58e62b9f..00000000 --- a/vignettes/netdis_multigraph_polya-aeppli.R +++ /dev/null @@ -1,154 +0,0 @@ -## ------------------------------------------------------------------------ -# Load libraries -library("netdist") -library("purrr") - -## ------------------------------------------------------------------------ -# Set source directory for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - -# Load query graphs -graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") - -## ------------------------------------------------------------------------ -# Maximum graphlet size to calculate counts and netdis statistic for. -max_graphlet_size <- 4 - -# Ego network neighbourhood size -neighbourhood_size <- 2 - -# Minimum size of ego networks to consider -min_ego_nodes <- 3 -min_ego_edges <- 1 - -# Ego network density binning parameters -min_bin_count <- 5 -num_bins <- 100 - -## ------------------------------------------------------------------------ -# Get ego networks for query graphs -ego_networks <- purrr::map( - graphs, make_named_ego_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges -) - -## ------------------------------------------------------------------------ -# Count graphlets for ego networks in query graphs -graphlet_counts <- purrr::map( - ego_networks, - ego_to_graphlet_counts, - max_graphlet_size = max_graphlet_size -) - -## ------------------------------------------------------------------------ - -# Get ego-network densities -densities <- purrr::map(ego_networks, - ego_network_density) - -binning_fn = purrr::partial(binned_densities_adaptive, - min_counts_per_interval = 5, - num_intervals = 100) -# Adaptively bin ego-network densities -binned_densities <- purrr::map(densities, - binning_fn) - -ego_density_bins <- purrr::map(binned_densities, function(x) {x$breaks}) -ego_density_bin_indexes <- purrr::map(binned_densities, function(x) {x$interval_indexes}) - - -## ------------------------------------------------------------------------ - -density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) { - - mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - graphlet_counts, - bin_indexes) - - exp_counts_bin <- function(bin_idx) { - counts <- graphlet_counts[bin_indexes == bin_idx, ] - means <- mean_binned_graphlet_counts[bin_idx,] - - mean_sub_counts <- sweep(counts, 2, means) - - Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1) - theta_d <- 2*means / (Vd_sq + means) - - exp_counts_dk <- vector() - for (k in 2:max_graphlet_size) { - graphlet_idx <- graphlet_ids_for_size(k) - - lambda_dk <- (1 / length(graphlet_idx)) * - sum( - 2 * means[graphlet_idx]^2 / - (Vd_sq[graphlet_idx] + means[graphlet_idx]) - ) - - exp_counts_dk <- append(exp_counts_dk, - lambda_dk / theta_d[graphlet_idx]) - } - - exp_counts_dk - } - - nbins <- length(unique(bin_indexes)) - expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins)) - - # deal with NAs caused by bins with zero counts for a graphlet - expected_counts_bin[is.nan(expected_counts_bin)] = 0 - - expected_counts_bin -} - - -bin_counts_fn <- purrr::partial(density_binned_counts_gp, - max_graphlet_size = max_graphlet_size) - -binned_graphlet_counts <- mapply(bin_counts_fn, - graphlet_counts, - ego_density_bin_indexes) - -## ------------------------------------------------------------------------ -# Calculate expected graphlet counts for each ego network -exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego, - max_graphlet_size = max_graphlet_size, - scale_fn=NULL) - -exp_graphlet_counts <- mapply(exp_counts_fn, - ego_networks, - ego_density_bins, - binned_graphlet_counts) - - - -# Centre graphlet counts by subtracting expected counts -centred_graphlet_counts <- mapply("-", graphlet_counts, exp_graphlet_counts) - - -## ------------------------------------------------------------------------ -sum_graphlet_counts <- lapply(centred_graphlet_counts, colSums) - -## ------------------------------------------------------------------------ - -# Generate pairwise comparisons -comp_spec <- cross_comparison_spec(sum_graphlet_counts, how = "many-to-many") - -## ------------------------------------------------------------------------ -# Calculate netdis statistics -results <- parallel::mcmapply( - function(index_a, index_b) { - netdis_uptok( - sum_graphlet_counts[[index_a]], - sum_graphlet_counts[[index_b]], - max_graphlet_size = max_graphlet_size - ) - }, - comp_spec$index_a, - comp_spec$index_b, - SIMPLIFY = TRUE) - - -list(netdis = results, comp_spec = comp_spec) - diff --git a/vignettes/netdis_multigraph_polya-aeppli.Rmd b/vignettes/netdis_multigraph_polya-aeppli.Rmd deleted file mode 100644 index cd48f9f9..00000000 --- a/vignettes/netdis_multigraph_polya-aeppli.Rmd +++ /dev/null @@ -1,184 +0,0 @@ ---- -title: "Netdis - Multiple graphs with Expected Counts from Geometric Poisson Approximation" -author: "Martin O'Reilly, Jack Roberts" -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Netdis - multiple graphs with GP Approximation} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -## Load required libraries -```{r} -# Load libraries -library("netdist") -library("purrr") -``` - -## Load graphs -```{r} -# Set source directory for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - -# Load query graphs -graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") -``` - -## Set Netdis parameters -```{r} -# Maximum graphlet size to calculate counts and netdis statistic for. -max_graphlet_size <- 4 - -# Ego network neighbourhood size -neighbourhood_size <- 2 - -# Minimum size of ego networks to consider -min_ego_nodes <- 3 -min_ego_edges <- 1 - -# Ego network density binning parameters -min_bin_count <- 5 -num_bins <- 100 -``` - -## Generate ego networks -```{r} -# Get ego networks for query graphs -ego_networks <- purrr::map( - graphs, make_named_ego_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges -) -``` - -## Count graphlets in ego networks -```{r} -# Count graphlets for ego networks in query graphs -graphlet_counts <- purrr::map( - ego_networks, - ego_to_graphlet_counts, - max_graphlet_size = max_graphlet_size -) -``` - -## Bin ego networks by density -```{r} - -# Get ego-network densities -densities <- purrr::map(ego_networks, - ego_network_density) - -binning_fn = purrr::partial(binned_densities_adaptive, - min_counts_per_interval = 5, - num_intervals = 100) -# Adaptively bin ego-network densities -binned_densities <- purrr::map(densities, - binning_fn) - -ego_density_bins <- purrr::map(binned_densities, function(x) {x$breaks}) -ego_density_bin_indexes <- purrr::map(binned_densities, function(x) {x$interval_indexes}) - -``` - -## Calculate expected graphlet counts in each bin using geometric poisson approximation -```{r} - -density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) { - - mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - graphlet_counts, - bin_indexes) - - exp_counts_bin <- function(bin_idx) { - counts <- graphlet_counts[bin_indexes == bin_idx, ] - means <- mean_binned_graphlet_counts[bin_idx,] - - mean_sub_counts <- sweep(counts, 2, means) - - Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1) - theta_d <- 2*means / (Vd_sq + means) - - exp_counts_dk <- vector() - for (k in 2:max_graphlet_size) { - graphlet_idx <- graphlet_ids_for_size(k) - - lambda_dk <- (1 / length(graphlet_idx)) * - sum( - 2 * means[graphlet_idx]^2 / - (Vd_sq[graphlet_idx] + means[graphlet_idx]) - ) - - exp_counts_dk <- append(exp_counts_dk, - lambda_dk / theta_d[graphlet_idx]) - } - - exp_counts_dk - } - - nbins <- length(unique(bin_indexes)) - expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins)) - - # deal with NAs caused by bins with zero counts for a graphlet - expected_counts_bin[is.nan(expected_counts_bin)] = 0 - - expected_counts_bin -} - - -bin_counts_fn <- purrr::partial(density_binned_counts_gp, - max_graphlet_size = max_graphlet_size) - -binned_graphlet_counts <- mapply(bin_counts_fn, - graphlet_counts, - ego_density_bin_indexes) -``` - -## Centre graphlet counts of query graphs using binned expected counts -```{r} -# Calculate expected graphlet counts for each ego network -exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego, - max_graphlet_size = max_graphlet_size, - scale_fn=NULL) - -exp_graphlet_counts <- mapply(exp_counts_fn, - ego_networks, - ego_density_bins, - binned_graphlet_counts) - - - -# Centre graphlet counts by subtracting expected counts -centred_graphlet_counts <- mapply("-", graphlet_counts, exp_graphlet_counts) - -``` - -## Sum centred graphlet counts across all ego networks -```{r} -sum_graphlet_counts <- lapply(centred_graphlet_counts, colSums) -``` - -## Calculate netdis statistics -```{r} - -# Generate pairwise comparisons -comp_spec <- cross_comparison_spec(sum_graphlet_counts, how = "many-to-many") - -## ------------------------------------------------------------------------ -# Calculate netdis statistics -results <- parallel::mcmapply( - function(index_a, index_b) { - netdis_uptok( - sum_graphlet_counts[[index_a]], - sum_graphlet_counts[[index_b]], - max_graphlet_size = max_graphlet_size - ) - }, - comp_spec$index_a, - comp_spec$index_b, - SIMPLIFY = TRUE) - - -list(netdis = results, comp_spec = comp_spec) -``` \ No newline at end of file diff --git a/vignettes/netdis_multigraph_polya-aeppli.html b/vignettes/netdis_multigraph_polya-aeppli.html deleted file mode 100644 index d23b9487..00000000 --- a/vignettes/netdis_multigraph_polya-aeppli.html +++ /dev/null @@ -1,505 +0,0 @@ - - - - - - - - - - - - - - - - -Netdis - Multiple graphs with Expected Counts from Geometric Poisson Approximation - - - - - - - - - - - - - - - - - - - - - -

Netdis - Multiple graphs with Expected Counts from Geometric Poisson Approximation

-

Martin O’Reilly, Jack Roberts

-

2019-09-05

- - - -
-

Load required libraries

-
# Load libraries
-library("netdist")
-library("purrr")
-
-
-

Load graphs

-
# Set source directory for Virus PPI graph edge files
-source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
-
-# Load query graphs
-graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
-
-
-

Set Netdis parameters

-
# Maximum graphlet size to calculate counts and netdis statistic for.
-max_graphlet_size <- 4
-
-# Ego network neighbourhood size
-neighbourhood_size <- 2
-
-# Minimum size of ego networks to consider
-min_ego_nodes <- 3
-min_ego_edges <- 1
-
-# Ego network density binning parameters
-min_bin_count <- 5
-num_bins <- 100
-
-
-

Generate ego networks

-
# Get ego networks for query graphs
-ego_networks <- purrr::map(
-  graphs, make_named_ego_graph,
-  order = neighbourhood_size, 
-  min_ego_nodes = min_ego_nodes, 
-  min_ego_edges = min_ego_edges
-)
-
-
-

Count graphlets in ego networks

-
# Count graphlets for ego networks in query graphs
-graphlet_counts <- purrr::map(
-  ego_networks,
-  ego_to_graphlet_counts,
-  max_graphlet_size = max_graphlet_size
-)
-
-
-

Bin ego networks by density

-
# Get ego-network densities
-densities <- purrr::map(ego_networks,
-                        ego_network_density)
-
-binning_fn = purrr::partial(binned_densities_adaptive,
-                            min_counts_per_interval = 5,
-                            num_intervals = 100)
-# Adaptively bin ego-network densities
-binned_densities <- purrr::map(densities,
-                               binning_fn)
-
-ego_density_bins <- purrr::map(binned_densities, function(x) {x$breaks})
-ego_density_bin_indexes <- purrr::map(binned_densities, function(x) {x$interval_indexes})
-
-
-

Calculate expected graphlet counts in each bin using geometric poisson approximation

-
density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) {
-  
-  mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
-                                  graphlet_counts, 
-                                  bin_indexes)
-  
-  exp_counts_bin <- function(bin_idx) {
-    counts <- graphlet_counts[bin_indexes == bin_idx, ]
-    means <- mean_binned_graphlet_counts[bin_idx,]
-    
-    mean_sub_counts <- sweep(counts, 2, means)
-    
-    Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1)
-    theta_d <- 2*means / (Vd_sq + means)
-
-    exp_counts_dk <- vector()
-    for (k in 2:max_graphlet_size) {
-      graphlet_idx <- graphlet_ids_for_size(k)
-      
-      lambda_dk <- (1 / length(graphlet_idx)) * 
-                   sum(
-                     2 * means[graphlet_idx]^2 /
-                     (Vd_sq[graphlet_idx] + means[graphlet_idx])
-                   )
-      
-      exp_counts_dk <- append(exp_counts_dk,
-                              lambda_dk / theta_d[graphlet_idx])
-    }
-    
-    exp_counts_dk
-  }
-  
-  nbins <- length(unique(bin_indexes))
-  expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins))
-  
-  # deal with NAs caused by bins with zero counts for a graphlet
-  expected_counts_bin[is.nan(expected_counts_bin)] = 0
-  
-  expected_counts_bin
-}
-
-
-bin_counts_fn <- purrr::partial(density_binned_counts_gp,
-                               max_graphlet_size = max_graphlet_size)
-
-binned_graphlet_counts <- mapply(bin_counts_fn,
-                                 graphlet_counts,
-                                 ego_density_bin_indexes)
-
-
-

Centre graphlet counts of query graphs using binned expected counts

-
# Calculate expected graphlet counts for each ego network
-exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego,
-                               max_graphlet_size = max_graphlet_size,
-                               scale_fn=NULL)
-
-exp_graphlet_counts <- mapply(exp_counts_fn,
-                              ego_networks,
-                              ego_density_bins,
-                              binned_graphlet_counts)
-
-
-
-# Centre graphlet counts by subtracting expected counts
-centred_graphlet_counts <-  mapply("-", graphlet_counts, exp_graphlet_counts)
-
-
-

Sum centred graphlet counts across all ego networks

-
sum_graphlet_counts <- lapply(centred_graphlet_counts, colSums)
-
-
-

Calculate netdis statistics

-
# Generate pairwise comparisons
-comp_spec <- cross_comparison_spec(sum_graphlet_counts, how = "many-to-many")
-
-## ------------------------------------------------------------------------
-# Calculate netdis statistics
-results <- parallel::mcmapply(
-    function(index_a, index_b) {
-      netdis_uptok(
-        sum_graphlet_counts[[index_a]], 
-        sum_graphlet_counts[[index_b]],
-        max_graphlet_size = max_graphlet_size
-      )
-    },
-    comp_spec$index_a,
-    comp_spec$index_b,
-    SIMPLIFY = TRUE)
-  
-  
-list(netdis = results, comp_spec = comp_spec)
-
## $netdis
-##              [,1]      [,2]      [,3]       [,4]       [,5]       [,6]
-## netdis3 0.8822527 0.9101084 0.8838054 0.96266771 0.04173551 0.03585169
-## netdis4 0.1892755 0.5752533 0.3719671 0.04604718 0.60705460 0.20370907
-##               [,7]         [,8]        [,9]       [,10]
-## netdis3 0.06271238 0.0004211575 0.005364888 0.009114229
-## netdis4 0.12978965 0.7178967193 0.490256248 0.371848474
-## 
-## $comp_spec
-##    name_a name_b index_a index_b
-## 1     EBV    ECL       1       2
-## 2     EBV  HSV-1       1       3
-## 3     EBV   KSHV       1       4
-## 4     EBV    VZV       1       5
-## 5     ECL  HSV-1       2       3
-## 6     ECL   KSHV       2       4
-## 7     ECL    VZV       2       5
-## 8   HSV-1   KSHV       3       4
-## 9   HSV-1    VZV       3       5
-## 10   KSHV    VZV       4       5
-
- - - - - - - - diff --git a/vignettes/quickstart_netdis_comparison.R b/vignettes/netdis_pairwise_comparisons.R similarity index 100% rename from vignettes/quickstart_netdis_comparison.R rename to vignettes/netdis_pairwise_comparisons.R diff --git a/vignettes/quickstart_netdis_comparison.html b/vignettes/netdis_pairwise_comparisons.html similarity index 98% rename from vignettes/quickstart_netdis_comparison.html rename to vignettes/netdis_pairwise_comparisons.html index 2c889039..a8e28f93 100644 --- a/vignettes/quickstart_netdis_comparison.html +++ b/vignettes/netdis_pairwise_comparisons.html @@ -12,9 +12,9 @@ - + -Usage of netdis with different pairwise comparison options. +Usage of netdis interfaces for different pairwise comparison options. @@ -303,9 +303,9 @@ -

Usage of netdis with different pairwise comparison options.

+

Usage of netdis interfaces for different pairwise comparison options.

Jack Roberts

-

2019-09-05

+

2019-09-11

diff --git a/vignettes/quickstart_netdis.R b/vignettes/quickstart_netdis.R deleted file mode 100644 index c223750e..00000000 --- a/vignettes/quickstart_netdis.R +++ /dev/null @@ -1,47 +0,0 @@ -## ------------------------------------------------------------------------ -# Load libraries -library("netdist") -library("purrr") - -## ------------------------------------------------------------------------ -# Maximum graphlet size to calculate counts for. -# We choose the specific graphlet size for the Netdis metric later. -max_graphlet_size = 4 - -## ------------------------------------------------------------------------ -# Set source directory for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -# Load query graphs -graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") - -## ------------------------------------------------------------------------ - -# Set ego network neighbourhood size -neighbourhood_size = 2 -ego_networks <- purrr::map(graphs, make_named_ego_graph, - order = neighbourhood_size) - -## ------------------------------------------------------------------------ -ego_graphlet_counts <- purrr::map_depth(ego_networks, 2, count_graphlets_for_graph, - max_graphlet_size = max_graphlet_size) - -## ------------------------------------------------------------------------ -# Load reference graph -file <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") -ref_graph <- read_simple_graph(file, format = "ncol") - -# Generate ego networks for reference graph -ref_ego_networks <- make_named_ego_graph(ref_graph, order = neighbourhood_size) - -# Count graphlets for ego networks in reference graph -ref_ego_graphlet_counts <- purrr::map(ref_ego_networks, count_graphlets_for_graph, - max_graphlet_size = max_graphlet_size) - -# Scale ego-network graphlet counts by dividing by total number of k-tuples in - # ego-network (where k is graphlet size) -ref_ego_graphlet_tuples <- - count_graphlet_tuples_ego(ref_ego_networks, max_graphlet_size = max_graphlet_size) - -#ref_ego_graphlet_counts <- scale_graphlet_count(ref_ego_graphlet_counts, ref_ego_graphlet_tuples) - diff --git a/vignettes/quickstart_netdis.Rmd b/vignettes/quickstart_netdis.Rmd deleted file mode 100644 index 644a21cd..00000000 --- a/vignettes/quickstart_netdis.Rmd +++ /dev/null @@ -1,73 +0,0 @@ ---- -title: "Quick start guide for Netdis" -author: "Martin O'Reilly" -date: "`r Sys.Date()`" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Quick start for Netdis} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -## Load required libraries -```{r} -# Load libraries -library("netdist") -library("purrr") -``` - -## Set Netdis parameters -```{r} -# Maximum graphlet size to calculate counts for. -# We choose the specific graphlet size for the Netdis metric later. -max_graphlet_size = 4 -``` - -## Load graphs - -```{r} -# Set source directory for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -# Load query graphs -graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") -``` - -## Generate ego networks -```{r} - -# Set ego network neighbourhood size -neighbourhood_size = 2 -ego_networks <- purrr::map(graphs, make_named_ego_graph, - order = neighbourhood_size) -``` - -## Count graphlets in ego networks -```{r} -ego_graphlet_counts <- purrr::map_depth(ego_networks, 2, count_graphlets_for_graph, - max_graphlet_size = max_graphlet_size) -``` - -## Define function for calculating expected counts for ego networks -Here we generate these from a reference graph. - -### Generate scaled graphlet counts for reference graph. -```{r} -# Load reference graph -file <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") -ref_graph <- read_simple_graph(file, format = "ncol") - -# Generate ego networks for reference graph -ref_ego_networks <- make_named_ego_graph(ref_graph, order = neighbourhood_size) - -# Count graphlets for ego networks in reference graph -ref_ego_graphlet_counts <- purrr::map(ref_ego_networks, count_graphlets_for_graph, - max_graphlet_size = max_graphlet_size) - -# Scale ego-network graphlet counts by dividing by total number of k-tuples in - # ego-network (where k is graphlet size) -ref_ego_graphlet_tuples <- - count_graphlet_tuples_ego(ref_ego_networks, max_graphlet_size = max_graphlet_size) - -#ref_ego_graphlet_counts <- scale_graphlet_count(ref_ego_graphlet_counts, ref_ego_graphlet_tuples) -``` diff --git a/vignettes/quickstart_netdis.html b/vignettes/quickstart_netdis.html deleted file mode 100644 index 8f35b252..00000000 --- a/vignettes/quickstart_netdis.html +++ /dev/null @@ -1,382 +0,0 @@ - - - - - - - - - - - - - - - - -Quick start guide for Netdis - - - - - - - - - - - - - - - - - - - - - -

Quick start guide for Netdis

-

Martin O’Reilly

-

2019-09-05

- - - -
-

Load required libraries

-
# Load libraries
-library("netdist")
-library("purrr")
-
-
-

Set Netdis parameters

-
# Maximum graphlet size to calculate counts for.
-# We choose the specific graphlet size for the Netdis metric later.
-max_graphlet_size = 4
-
-
-

Load graphs

-
# Set source directory for Virus PPI graph edge files
-source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
-# Load query graphs
-graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
-
-
-

Generate ego networks

-
# Set ego network neighbourhood size
-neighbourhood_size = 2
-ego_networks <- purrr::map(graphs, make_named_ego_graph, 
-                           order = neighbourhood_size)
-
-
-

Count graphlets in ego networks

-
ego_graphlet_counts <- purrr::map_depth(ego_networks, 2, count_graphlets_for_graph,
-                              max_graphlet_size = max_graphlet_size)
-
-
-

Define function for calculating expected counts for ego networks

-

Here we generate these from a reference graph.

-
-

Generate scaled graphlet counts for reference graph.

-
# Load reference graph
-file <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
-                    package = "netdist")
-ref_graph <- read_simple_graph(file, format = "ncol")
-
-# Generate ego networks for reference graph
-ref_ego_networks <- make_named_ego_graph(ref_graph, order = neighbourhood_size)
-
-# Count graphlets for ego networks in reference graph
-ref_ego_graphlet_counts <- purrr::map(ref_ego_networks, count_graphlets_for_graph,
-                              max_graphlet_size = max_graphlet_size)
-
-# Scale ego-network graphlet counts by dividing by total number of k-tuples in
-  # ego-network (where k is graphlet size)
-ref_ego_graphlet_tuples <- 
-    count_graphlet_tuples_ego(ref_ego_networks, max_graphlet_size = max_graphlet_size)
-
-#ref_ego_graphlet_counts <- scale_graphlet_count(ref_ego_graphlet_counts, ref_ego_graphlet_tuples)
-
-
- - - - - - - - diff --git a/vignettes/quickstart_netdis_functions.R b/vignettes/quickstart_netdis_functions.R deleted file mode 100644 index 6f38bf50..00000000 --- a/vignettes/quickstart_netdis_functions.R +++ /dev/null @@ -1,94 +0,0 @@ -## ------------------------------------------------------------------------ -# Load libraries -library("netdist") -library("purrr") - -## ------------------------------------------------------------------------ -# Set source directory for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - -# Load query graphs -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") - -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") - -# Load reference graph -# JACK - need to deal with case where ref graph not used. -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") -ref_graph <- read_simple_graph(ref_path, format = "ncol") - -## ------------------------------------------------------------------------ -# Maximum graphlet size to calculate counts and netdis statistic for. -max_graphlet_size <- 4 - -# Ego network neighbourhood size -neighbourhood_size <- 2 - -# Minimum size of ego networks to consider -min_ego_nodes <- 3 -min_ego_edges <- 1 - -# Ego network density binning parameters -min_bin_count <- 5 -num_bins <- 100 - -# Reference graph -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") -ref_graph <- read_simple_graph(ref_path, format = "ncol") - - -## ------------------------------------------------------------------------ -# Load query graphs -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") - -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") - -# Calculate netdis statistics -netdis_one_to_one(graph_1, graph_2, - ref_graph, - max_graphlet_size = 4, - neighbourhood_size = 2, - min_ego_nodes = 3, - min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100) - -## ------------------------------------------------------------------------ -# Load query graphs -graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") -graph_1 <- graphs$EBV -graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] - -# Calculate netdis statistics -netdis_one_to_many(graph_1, graphs_compare, - ref_graph, - max_graphlet_size = 4, - neighbourhood_size = 2, - min_ego_nodes = 3, - min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100) - -## ------------------------------------------------------------------------ -# Load query graphs -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") - -# Calculate netdis statistics -netdis_many_to_many(graphs, - ref_graph, - max_graphlet_size = 4, - neighbourhood_size = 2, - min_ego_nodes = 3, - min_ego_edges = 1, - min_bin_count = 5, - num_bins = 100) - diff --git a/vignettes/quickstart_netdis_functions.html b/vignettes/quickstart_netdis_functions.html deleted file mode 100644 index eb7981fa..00000000 --- a/vignettes/quickstart_netdis_functions.html +++ /dev/null @@ -1,448 +0,0 @@ - - - - - - - - - - - - - - - - -Quick start guide for usage of netdis functions - - - - - - - - - - - - - - - - - - - - - -

Quick start guide for usage of netdis functions

-

Jack Roberts

-

2019-07-30

- - - - - - - - -
-

Do pairwise netdis calculations for many graphs

- -
## $netdis
-##              [,1]        [,2]       [,3]        [,4]      [,5]      [,6]
-## netdis3 0.1846655 0.008264222 0.01005385 0.006777578 0.2065762 0.2091241
-## netdis4 0.1749835 0.165264120 0.01969246 0.159711160 0.2917612 0.2215579
-##              [,7]         [,8]         [,9]        [,10]
-## netdis3 0.2075471 0.0001335756 0.0001748254 0.0005964448
-## netdis4 0.4171614 0.0760242643 0.0343418653 0.1311552411
-## 
-## $comp_spec
-##    name_a name_b index_a index_b
-## 1     EBV    ECL       1       2
-## 2     EBV  HSV-1       1       3
-## 3     EBV   KSHV       1       4
-## 4     EBV    VZV       1       5
-## 5     ECL  HSV-1       2       3
-## 6     ECL   KSHV       2       4
-## 7     ECL    VZV       2       5
-## 8   HSV-1   KSHV       3       4
-## 9   HSV-1    VZV       3       5
-## 10   KSHV    VZV       4       5
-
- - - - - - - - From 2d2c2e12e5b372dffddcf2778f63b9dc74ff9182 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Wed, 11 Sep 2019 13:11:21 +0100 Subject: [PATCH 041/188] rough test for netdis statistic functions --- tests/testthat/test_measures_net_dis.R | 66 +++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 27152b5b..f4530b33 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1705,9 +1705,71 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n # } # }) - +context("Netdis: Geometric Poisson") +test_that("expected counts using geometric poisson approximation are correct", { + +}) context("Netdis: Statistic calculation") -test_that("netdis statistic function is correct", { +test_that("netdis statistic function output matches manually verified result", { + + # arbitrary counts of correct size for graphlets up to size 5 + counts_1 <- c(11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, + 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14) + counts_2 <- c(12, 11, 6, 10, 15, 7, 10, 8, 7, 7, 7, 13, 9, 14, 7, 12, + 7, 10, 9, 11, 7, 7, 11, 8, 10, 14, 8, 16, 14, 10) + + # add graphlet names + ids <- graphlet_key(5)$id + names(counts_1) <- ids + names(counts_2) <- ids + + # manually verified results + expected_netdis_3 <- 0.03418796 + expected_netdis_4 <- 0.02091792 + expected_netdis_5 <- 0.03826385 + + # check function to test + actual_netdis_3 <- netdis(counts_1, counts_2, 3) + actual_netdis_4 <- netdis(counts_1, counts_2, 4) + actual_netdis_5 <- netdis(counts_1, counts_2, 5) + + expect_equal(expected_netdis_3, actual_netdis_3) + expect_equal(expected_netdis_4, actual_netdis_4) + expect_equal(expected_netdis_5, actual_netdis_5) + +}) +test_that("netdis_uptok gives expected netdis result for graphlets up to size k", { + # arbitrary counts of correct size for graphlets up to size 5 + counts_1 <- c(11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, + 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14) + counts_2 <- c(12, 11, 6, 10, 15, 7, 10, 8, 7, 7, 7, 13, 9, 14, 7, 12, + 7, 10, 9, 11, 7, 7, 11, 8, 10, 14, 8, 16, 14, 10) + + # add graphlet names + ids <- graphlet_key(5)$id + names(counts_1) <- ids + names(counts_2) <- ids + + # manually verified results + expected_netdis <- c(0.03418796, 0.02091792, 0.03826385) + names(expected_netdis) <- c("netdis3", "netdis4", "netdis5") + + # check function to test + actual_netdis <- netdis_uptok(counts_1, counts_2, 5) + + expect_equal(expected_netdis, actual_netdis) +}) + +context("Netdis: full calculation pipeline") +test_that("netdis_many_to_many gives expected result", { + +}) + +context("Netdis: functions for different pairwise comparisons") +test_that("netdis_one_to_one gives expected result", { + +}) +test_that("netdis_one_to_many gives expected result", { }) From 6dffaefb309231ffacfc9be3bfc7501d76099eb9 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Wed, 11 Sep 2019 13:23:41 +0100 Subject: [PATCH 042/188] netdis_one_to_one test --- tests/testthat/test_measures_net_dis.R | 35 +++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index f4530b33..d30ddf9c 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1763,12 +1763,45 @@ test_that("netdis_uptok gives expected netdis result for graphlets up to size k" context("Netdis: full calculation pipeline") test_that("netdis_many_to_many gives expected result", { - + }) context("Netdis: functions for different pairwise comparisons") test_that("netdis_one_to_one gives expected result", { + # Set source directory for Virus PPI graph edge files + source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + + # Load query and reference graphs + graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + + graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") + ref_graph <- read_simple_graph(ref_path, format = "ncol") + + # set parameters + max_graphlet_size <- 4 + neighbourhood_size <- 2 + min_ego_nodes <- 3 + min_ego_edges <- 1 + + # manually verified results + expected_netdis <- c(0.1846655, 0.1749835) + names(expected_netdis) <- c("netdis3", "netdis4") + + # check function to test + actual_netdis <- netdis_one_to_one(graph_1, + graph_2, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + + expect_equal(expected_netdis, actual_netdis) }) test_that("netdis_one_to_many gives expected result", { From 8034b02b5db676608b7a76b16f2402b12e653296 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Mon, 23 Sep 2019 13:55:43 +0100 Subject: [PATCH 043/188] add tolerance to expect_equal --- tests/testthat/test_measures_net_dis.R | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index d30ddf9c..3f1e3504 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1805,4 +1805,34 @@ test_that("netdis_one_to_one gives expected result", { }) test_that("netdis_one_to_many gives expected result", { + # Load query and reference graphs + graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + graph_1 <- graphs$EBV + graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] + + ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") + ref_graph <- read_simple_graph(ref_path, format = "ncol") + + # set parameters + max_graphlet_size <- 4 + neighbourhood_size <- 2 + min_ego_nodes <- 3 + min_ego_edges <- 1 + + # manually verified results + # ECL HSV-1 KSHV VZV + # netdis3 0.1846655 0.008264222 0.01005385 0.006777578 + # netdis4 0.1749835 0.165264120 0.01969246 0.159711160 + + # Calculate netdis statistics + actual_netdis <- netdis_one_to_many(graph_1, graphs_compare, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + + + }) From fcb36dbd58936781650c9f9db00347827f8d9a77 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Mon, 23 Sep 2019 13:57:12 +0100 Subject: [PATCH 044/188] add tolerance (fix previous commit) --- tests/testthat/test_measures_net_dis.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 3f1e3504..632b2a36 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1801,7 +1801,7 @@ test_that("netdis_one_to_one gives expected result", { min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges) - expect_equal(expected_netdis, actual_netdis) + expect_equal(expected_netdis, actual_netdis, tolerance = .002, scale = 1) }) test_that("netdis_one_to_many gives expected result", { From 3a685c2a941d75985bea2ee00054fea81eb5b51a Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Mon, 23 Sep 2019 14:26:07 +0100 Subject: [PATCH 045/188] add missing definition of source_dir --- tests/testthat/test_measures_net_dis.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 632b2a36..227cd523 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1804,6 +1804,8 @@ test_that("netdis_one_to_one gives expected result", { expect_equal(expected_netdis, actual_netdis, tolerance = .002, scale = 1) }) test_that("netdis_one_to_many gives expected result", { + # Set source directory for Virus PPI graph edge files + source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") # Load query and reference graphs graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") @@ -1827,11 +1829,11 @@ test_that("netdis_one_to_many gives expected result", { # Calculate netdis statistics actual_netdis <- netdis_one_to_many(graph_1, graphs_compare, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) From c535b141899f20507c76f851af768535b2070855 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Mon, 23 Sep 2019 14:35:28 +0100 Subject: [PATCH 046/188] remove commented tests using old functions --- tests/testthat/test_measures_net_dis.R | 687 +------------------------ 1 file changed, 9 insertions(+), 678 deletions(-) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 227cd523..15cbc8af 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -118,167 +118,6 @@ test_that(test_message, { expect_equal(expected_tuple_count_n73_ego2_gs5, actual_tuple_count_n73_ego2_gs5) }) -# context("Measures Netdis: Ego-network scaled graphlet outputs for manually verified networks") -# test_that("Ego-network 4-node graphlet counts match manually verified totals", { -# # Set up a small sample network with at least one ego-network that contains -# # at least one of each graphlets -# elist <- rbind( -# c("n1", "n2"), -# c("n2", "n3"), -# c("n1", "n4"), -# c("n2", "n5"), -# c("n1", "n6"), -# c("n1", "n7"), -# c("n2", "n4"), -# c("n4", "n6"), -# c("n6", "n8"), -# c("n7", "n8"), -# c("n7", "n9"), -# c("n7", "n10"), -# c("n8", "n9"), -# c("n8", "n10"), -# c("n9", "n10") -# ) -# graph <- igraph::graph_from_edgelist(elist, directed = FALSE) -# -# # Set node and graphlet labels to use for row and col names in expected counts -# node_labels <- igraph::V(graph)$name -# graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") -# -# # Count graphlets in each ego network of the graph with neighbourhood sizes of 1 and 2 -# max_graphlet_size <- 4 -# min_ego_edges <- 0 -# min_ego_nodes <- 0 -# -# actual_counts_order_1 <- -# count_graphlets_ego_scaled(graph, -# max_graphlet_size = max_graphlet_size, -# min_ego_edges = min_ego_edges, -# min_ego_nodes = min_ego_nodes, -# neighbourhood_size = 1 -# ) -# actual_counts_order_2 <- -# count_graphlets_ego_scaled(graph, -# max_graphlet_size = max_graphlet_size, -# min_ego_edges = min_ego_edges, -# min_ego_nodes = min_ego_nodes, -# neighbourhood_size = 2 -# ) -# -# graphlet_key <- graphlet_key(max_graphlet_size) -# k <- graphlet_key$node_count -# # Set manually verified counts -# # 1-step ego networks -# expected_counts_order_1 <- rbind( -# c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), -# c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), -# c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), -# c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), -# c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), -# c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), -# c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), -# c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), -# c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), -# c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) -# ) -# rownames(expected_counts_order_1) <- node_labels -# colnames(expected_counts_order_1) <- graphlet_labels -# # 2-step ego networks -# expected_counts_order_2 <- rbind( -# c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), -# c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), -# c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), -# c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), -# c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), -# c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), -# c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), -# c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), -# c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), -# c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) -# ) -# rownames(expected_counts_order_2) <- node_labels -# colnames(expected_counts_order_2) <- graphlet_labels -# -# # Test that actual counts match expected with only counts requested (default) -# expect_equal(actual_counts_order_1, expected_counts_order_1) -# expect_equal(actual_counts_order_2, expected_counts_order_2) -# -# # Test that actual counts and returned ego networks match expected -# # 1. Define expected -# expected_ego_networks_order_1 <- make_named_ego_graph(graph, -# order = 1, -# min_ego_edges = min_ego_edges, -# min_ego_nodes = min_ego_nodes -# ) -# expected_ego_networks_order_2 <- make_named_ego_graph(graph, -# order = 2, -# min_ego_edges = min_ego_edges, -# min_ego_nodes = min_ego_nodes -# ) -# expected_counts_with_networks_order_1 <- -# list( -# graphlet_counts = expected_counts_order_1, -# ego_networks = expected_ego_networks_order_1 -# ) -# expected_counts_with_networks_order_2 <- -# list( -# graphlet_counts = expected_counts_order_2, -# ego_networks = expected_ego_networks_order_2 -# ) -# # 2. Calculate actual -# actual_counts_with_networks_order_1 <- -# count_graphlets_ego_scaled(graph, -# max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 1, -# min_ego_edges = min_ego_edges, -# min_ego_nodes = min_ego_nodes, -# return_ego_networks = TRUE -# ) -# actual_counts_with_networks_order_2 <- -# count_graphlets_ego_scaled(graph, -# max_graphlet_size = max_graphlet_size, -# min_ego_edges = min_ego_edges, -# min_ego_nodes = min_ego_nodes, -# neighbourhood_size = 2, return_ego_networks = TRUE -# ) -# -# # 3. Compare -# # Comparison is not implemented for igraph objects, so convert all igraphs to -# # indexed edge list and then compare. Do in-situ replacement of igraphs with -# # indexed edge lists to ensure we are checking full properties of returned -# # objects (i.e. named lists with matching elements). -# # 3a. Convert expected and actual ego networks from igraphs to indexed edges -# expected_counts_with_networks_order_1$ego_networks <- -# purrr::map( -# expected_counts_with_networks_order_1$ego_networks, -# graph_to_indexed_edges -# ) -# expected_counts_with_networks_order_2$ego_networks <- -# purrr::map( -# expected_counts_with_networks_order_2$ego_networks, -# graph_to_indexed_edges -# ) -# actual_counts_with_networks_order_1$ego_networks <- -# purrr::map( -# actual_counts_with_networks_order_1$ego_networks, -# graph_to_indexed_edges -# ) -# actual_counts_with_networks_order_2$ego_networks <- -# purrr::map( -# actual_counts_with_networks_order_2$ego_networks, -# graph_to_indexed_edges -# ) -# # 3b. Do comparison -# expect_equal( -# actual_counts_with_networks_order_1, -# expected_counts_with_networks_order_1 -# ) -# expect_equal( -# actual_counts_with_networks_order_2, -# expected_counts_with_networks_order_2 -# ) -# }) - context("Measures Netdis: Ego-network density values match those for manually verified networks") test_that("Ego-network 4-node density values match manually verified totals", { # Set up a small sample network with at least one ego-network that contains @@ -1006,145 +845,6 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { } }) - -# test_that("netdis_expected_graphlet_counts_ego works for graphlets up to 4 nodes", { -# # Helper function to generate graphs with known density and number of nodes -# # Set up a small sample network with at least one ego-network that contains -# # at least one of each graphlets -# elist <- rbind( -# c("n1", "n2"), -# c("n2", "n3"), -# c("n1", "n4"), -# c("n2", "n5"), -# c("n1", "n6"), -# c("n1", "n7"), -# c("n2", "n4"), -# c("n4", "n6"), -# c("n6", "n8"), -# c("n7", "n8"), -# c("n7", "n9"), -# c("n7", "n10"), -# c("n8", "n9"), -# c("n8", "n10"), -# c("n9", "n10") -# ) -# graph <- igraph::graph_from_edgelist(elist, directed = FALSE) -# graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") -# graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) -# max_graphlet_size <- 4 -# min_ego_edges <- 0 -# min_ego_nodes <- 0 -# -# # Make graph ego networks -# ego_networks_o1 <- make_named_ego_graph(graph, -# order = 1, -# min_ego_edges = min_ego_edges, -# min_ego_nodes = min_ego_nodes -# ) -# ego_networks_o2 <- make_named_ego_graph(graph, -# order = 2, -# min_ego_edges = min_ego_edges, -# min_ego_nodes = min_ego_nodes -# ) -# # Set manually-verified node counts and densities -# # 1. Ego-networks of order 1 -# num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) -# num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) -# max_edges_o1 <- choose(num_nodes_o1, 2) -# densities_o1 <- num_edges_o1 / max_edges_o1 -# # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 -# # 2. Ego-networks of order 2 -# num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) -# num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) -# max_edges_o2 <- choose(num_nodes_o2, 2) -# densities_o2 <- num_edges_o2 / max_edges_o2 -# # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 -# # Set manually defined density breaks and indexes -# breaks <- c(0, 0.11, 0.21, 0.31, 0.41, 0.51, 0.61, 0.71, 0.81, 0.91, 1.0) -# density_indexes_o1 <- c(6, 5, 10, 9, 10, 7, 7, 7, 10, 10) -# density_indexes_o2 <- c(4, 4, 5, 4, 5, 5, 5, 6, 6, 6) -# # Set dummy reference counts -# scaled_reference_counts <- rbind( -# c(1, 2, 3, 4, 5, 6, 7, 8, 9), -# c(11, 12, 13, 14, 15, 16, 17, 18, 19), -# c(21, 22, 23, 24, 25, 26, 27, 28, 29), -# c(31, 32, 33, 34, 35, 36, 37, 38, 39), -# c(41, 42, 43, 44, 45, 46, 47, 48, 49), -# c(51, 52, 53, 54, 55, 56, 57, 58, 59), -# c(61, 62, 63, 64, 65, 66, 67, 68, 69), -# c(71, 72, 73, 74, 75, 76, 77, 78, 79), -# c(81, 82, 83, 84, 85, 86, 87, 88, 89), -# c(91, 92, 93, 94, 95, 96, 97, 98, 99) -# ) -# expected_dims <- dim(scaled_reference_counts) -# min_ego_nodes <- 3 -# min_ego_edges <- 1 -# -# # Helper function to calculate expected expected graphlet counts -# expected_expected_graphlet_counts_fn <- function(density_index, node_count) { -# reference_counts <- scaled_reference_counts[density_index, ] -# reference_counts * choose(node_count, graphlet_sizes) -# } -# # Calculate expected graphlet counts. NOTE: We expect a matrix with graphlet -# # types as columns and ego networks for nodes in graph as rows -# expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( -# density_indexes_o1, num_nodes_o1, expected_expected_graphlet_counts_fn -# ))) -# expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( -# density_indexes_o2, num_nodes_o2, expected_expected_graphlet_counts_fn -# ))) -# # Sanity check for expected output shape. Should be matrix with graphlet types -# # as columns and nodes as rows -# expect_equal(dim(expected_expected_graphlet_counts_ego_o1), expected_dims) -# expect_equal(dim(expected_expected_graphlet_counts_ego_o2), expected_dims) -# # Set column labels to graphlet names -# colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels -# colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels -# # Set row labels to ego network names -# rownames(expected_expected_graphlet_counts_ego_o1) <- names(ego_networks_o1) -# rownames(expected_expected_graphlet_counts_ego_o2) <- names(ego_networks_o2) -# # Drop rows for nodes with ewer than minumum required nodes and edges in ego -# # network -# expected_expected_graphlet_counts_ego_o1 <- -# expected_expected_graphlet_counts_ego_o1[ -# (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), -# ] -# expected_expected_graphlet_counts_ego_o2 <- -# expected_expected_graphlet_counts_ego_o2[ -# (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), -# ] -# -# # Calculate actual output of function under test -# actual_expected_graphlet_counts_ego_o1 <- -# netdis_expected_graphlet_counts_ego( -# graph, -# max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 1, density_breaks = breaks, -# density_binned_reference_counts = scaled_reference_counts, -# min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, -# scale_fn = count_graphlet_tuples -# ) -# actual_expected_graphlet_counts_ego_o2 <- -# netdis_expected_graphlet_counts_ego( -# graph, -# max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 2, density_breaks = breaks, -# density_binned_reference_counts = scaled_reference_counts, -# min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, -# scale_fn = count_graphlet_tuples -# ) -# -# # Compare actual to expected -# expect_equal( -# actual_expected_graphlet_counts_ego_o1, -# expected_expected_graphlet_counts_ego_o1 -# ) -# expect_equal( -# actual_expected_graphlet_counts_ego_o2, -# expected_expected_graphlet_counts_ego_o2 -# ) -# }) - test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 nodes", { # Helper function to generate graphs with known density and number of nodes # Set up a small sample network with at least one ego-network that contains @@ -1327,384 +1027,6 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n ) }) -# test_that("netdis_expected_graphlet_counts_ego_fn works for graphlets up to 4 nodes", { -# # Set up a small sample network with at least one ego-network that contains -# # at least one of each graphlets -# elist <- rbind( -# c("n1", "n2"), -# c("n2", "n3"), -# c("n1", "n4"), -# c("n2", "n5"), -# c("n1", "n6"), -# c("n1", "n7"), -# c("n2", "n4"), -# c("n4", "n6"), -# c("n6", "n8"), -# c("n7", "n8"), -# c("n7", "n9"), -# c("n7", "n10"), -# c("n8", "n9"), -# c("n8", "n10"), -# c("n9", "n10") -# ) -# graph <- igraph::graph_from_edgelist(elist, directed = FALSE) -# graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") -# graphlet_sizes <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) -# names(graphlet_sizes) <- graphlet_labels -# max_graphlet_size <- 4 -# # Make graph ego networks -# min_ego_nodes <- 0 -# min_edgo_edges <- 0 -# ego_networks_o1 <- make_named_ego_graph(graph, -# order = 1, -# min_ego_nodes = min_ego_nodes, -# min_ego_edges = min_edgo_edges -# ) -# ego_networks_o2 <- make_named_ego_graph(graph, -# order = 2, -# min_ego_nodes = min_ego_nodes, -# min_ego_edges = min_edgo_edges -# ) -# # Set manually-verified node counts and densities -# # 1. Ego-networks of order 1 -# num_nodes_o1 <- c(5, 5, 2, 4, 2, 4, 5, 5, 4, 4) -# num_edges_o1 <- c(6, 5, 1, 5, 1, 4, 7, 7, 6, 6) -# max_edges_o1 <- choose(num_nodes_o1, 2) -# densities_o1 <- num_edges_o1 / max_edges_o1 -# # Order 1 densities should be: 0.6000000 0.5000000 1.0000000 0.8333333 1.0000000 0.6666667 0.7000000 0.7000000 1.0000000 1.0000000 -# # 2. Ego-networks of order 2 -# num_nodes_o2 <- c(10, 7, 5, 8, 5, 8, 8, 7, 6, 6) -# num_edges_o2 <- c(15, 8, 5, 10, 5, 13, 13, 11, 9, 9) -# max_edges_o2 <- choose(num_nodes_o2, 2) -# densities_o2 <- num_edges_o2 / max_edges_o2 -# # Order 2 densities should be: 0.3333333 0.3809524 0.5000000 0.3571429 0.5000000 0.4642857 0.4642857 0.5238095 0.6000000 0.6000000 -# # Set manually determined density breaks and indexes, based on a min bin count -# # of 2 and an initial request for 100 bins -# min_bin_count <- 2 -# num_bins <- 100 -# num_breaks <- num_bins + 1 -# min_density_o1 <- 0.5 -# max_density_o1 <- 1.0 -# breaks_o1 <- seq(min_density_o1, max_density_o1, length.out = num_breaks)[c(1, 22, 42, 101)] -# density_indexes_o1 <- c(1, 1, 3, 3, 3, 2, 2, 2, 3, 3) -# min_density_o2 <- 1 / 3 -# max_density_o2 <- 0.6 -# breaks_o2 <- seq(min_density_o2, max_density_o2, length.out = num_breaks)[c(1, 10, 51, 64, 101)] -# density_indexes_o2 <- c(1, 2, 3, 1, 3, 2, 2, 4, 4, 4) -# # Guard against errors in manually determined breaks and indexes by checking -# # against already tested code. This also lets us ensure we handle densities -# # falling exactly on a bin boundary the same as the code under test. -# comp_binned_densities_o1 <- binned_densities_adaptive( -# densities_o1, -# min_counts_per_interval = min_bin_count, -# num_intervals = num_bins -# ) -# comp_binned_densities_o2 <- binned_densities_adaptive( -# densities_o2, -# min_counts_per_interval = min_bin_count, -# num_intervals = num_bins -# ) -# expect_equal( -# comp_binned_densities_o1, -# list( -# densities = densities_o1, -# interval_indexes = density_indexes_o1, -# breaks = breaks_o1 -# ) -# ) -# expect_equal( -# comp_binned_densities_o2, -# list( -# densities = densities_o2, -# interval_indexes = density_indexes_o2, -# breaks = breaks_o2 -# ) -# ) -# -# # Set manually verified scaled ego-network graphlet counts -# graphlet_key <- graphlet_key(max_graphlet_size) -# k <- graphlet_key$node_count -# # 1-step ego networks -# scaled_reference_counts_o1 <- rbind( -# c(6, 5, 2, 0, 1, 0, 2, 1, 0) / zeros_to_ones(choose(5, k)), -# c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), -# c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), -# c(5, 2, 2, 0, 0, 0, 0, 1, 0) / zeros_to_ones(choose(4, k)), -# c(1, 0, 0, 0, 0, 0, 0, 0, 0) / zeros_to_ones(choose(2, k)), -# c(4, 2, 1, 0, 0, 0, 1, 0, 0) / zeros_to_ones(choose(4, k)), -# c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), -# c(7, 3, 4, 0, 0, 0, 3, 0, 1) / zeros_to_ones(choose(5, k)), -# c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)), -# c(6, 0, 4, 0, 0, 0, 0, 0, 1) / zeros_to_ones(choose(4, k)) -# ) -# # 2-step ego networks -# scaled_reference_counts_o2 <- rbind( -# c(15, 18, 6, 21, 3, 1, 11, 1, 1) / zeros_to_ones(choose(10, k)), -# c(8, 10, 2, 6, 3, 0, 4, 1, 0) / zeros_to_ones(choose(7, k)), -# c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), -# c(10, 14, 2, 11, 3, 1, 5, 1, 0) / zeros_to_ones(choose(8, k)), -# c(5, 5, 1, 0, 2, 0, 2, 0, 0) / zeros_to_ones(choose(5, k)), -# c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), -# c(13, 13, 6, 15, 1, 1, 9, 1, 1) / zeros_to_ones(choose(8, k)), -# c(11, 10, 5, 10, 0, 1, 8, 0, 1) / zeros_to_ones(choose(7, k)), -# c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)), -# c(9, 8, 4, 4, 0, 1, 6, 0, 1) / zeros_to_ones(choose(6, k)) -# ) -# min_ego_nodes <- 3 -# min_ego_edges <- 1 -# # Drop rows for nodes with ewer than minumum required nodes and edges in ego -# # network -# scaled_reference_counts_o1 <- -# scaled_reference_counts_o1[ -# (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges), -# ] -# scaled_reference_counts_o2 <- -# scaled_reference_counts_o2[ -# (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges), -# ] -# density_indexes_o1 <- density_indexes_o1[ -# (num_nodes_o1 >= min_ego_nodes) & (num_edges_o1 >= min_ego_edges) -# ] -# density_indexes_o2 <- density_indexes_o2[ -# (num_nodes_o2 >= min_ego_nodes) & (num_edges_o2 >= min_ego_edges) -# ] -# # Average manually verified scaled reference counts across density bins -# density_binned_reference_counts_o1 <- rbind( -# (scaled_reference_counts_o1[1, ] + scaled_reference_counts_o1[2, ]) / 2, -# (scaled_reference_counts_o1[4, ] + scaled_reference_counts_o1[5, ] + -# scaled_reference_counts_o1[6, ]) / 3, -# (scaled_reference_counts_o1[3, ] + -# scaled_reference_counts_o1[7, ] + -# scaled_reference_counts_o1[8, ]) / 3 -# ) -# rownames(density_binned_reference_counts_o1) <- 1:3 -# density_binned_reference_counts_o2 <- rbind( -# (scaled_reference_counts_o2[1, ] + scaled_reference_counts_o2[4, ]) / 2, -# (scaled_reference_counts_o2[2, ] + scaled_reference_counts_o2[6, ] + -# scaled_reference_counts_o2[7, ]) / 3, -# (scaled_reference_counts_o2[3, ] + scaled_reference_counts_o2[5, ]) / 2, -# (scaled_reference_counts_o2[8, ] + scaled_reference_counts_o2[9, ] + -# scaled_reference_counts_o2[10, ]) / 3 -# ) -# rownames(density_binned_reference_counts_o2) <- 1:4 -# -# # Helper functions to calculate expected expected graphlet counts -# expected_expected_graphlet_counts_o1_fn <- function(density_index, node_count) { -# reference_counts <- density_binned_reference_counts_o1[density_index, ] -# reference_counts * choose(node_count, graphlet_sizes) -# } -# expected_expected_graphlet_counts_o2_fn <- function(density_index, node_count) { -# reference_counts <- density_binned_reference_counts_o2[density_index, ] -# reference_counts * choose(node_count, graphlet_sizes) -# } -# # Calculate expected graphlet counts -# expected_expected_graphlet_counts_ego_o1 <- t(simplify2array(purrr::map2( -# density_indexes_o1, num_nodes_o1[(num_nodes_o1 >= min_ego_nodes)], -# expected_expected_graphlet_counts_o1_fn -# ))) -# rownames(expected_expected_graphlet_counts_ego_o1) <- -# names(ego_networks_o1[(num_nodes_o1 >= min_ego_nodes)]) -# expected_expected_graphlet_counts_ego_o2 <- t(simplify2array(purrr::map2( -# density_indexes_o2, num_nodes_o2[(num_nodes_o2 >= min_ego_nodes)], -# expected_expected_graphlet_counts_o2_fn -# ))) -# rownames(expected_expected_graphlet_counts_ego_o2) <- -# names(ego_networks_o2[(num_nodes_o2 >= min_ego_nodes)]) -# -# # Sanity check manually derived expected expected counts by comparing against -# # pre-tested fully applied expected_graphlet_counts_ego function -# expect_equal( -# expected_expected_graphlet_counts_ego_o1, -# netdis_expected_graphlet_counts_ego( -# graph, -# max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 1, -# density_breaks = breaks_o1, -# density_binned_reference_counts_o1, -# min_ego_nodes = min_ego_nodes, -# min_ego_edges = min_ego_edges, -# scale_fn = count_graphlet_tuples -# ) -# ) -# expect_equal( -# expected_expected_graphlet_counts_ego_o2, -# netdis_expected_graphlet_counts_ego( -# graph, -# max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 2, -# density_breaks = breaks_o2, -# density_binned_reference_counts_o2, -# min_ego_nodes = min_ego_nodes, -# min_ego_edges = min_ego_edges, -# scale_fn = count_graphlet_tuples -# ) -# ) -# -# # Generate partially applied functions using function under test -# actual_expected_graphlet_counts_ego_fn_o1 <- -# netdis_expected_graphlet_counts_ego_fn( -# graph, -# max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 1, -# min_bin_count = min_bin_count, -# num_bins = num_bins, -# min_ego_nodes = min_ego_nodes, -# min_ego_edges = min_ego_edges, -# scale_fn = count_graphlet_tuples -# ) -# actual_expected_graphlet_counts_ego_fn_o2 <- -# netdis_expected_graphlet_counts_ego_fn( -# graph, -# max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 2, -# min_bin_count = min_bin_count, -# num_bins = num_bins, -# min_ego_nodes = min_ego_nodes, -# min_ego_edges = min_ego_edges, -# scale_fn = count_graphlet_tuples -# ) -# # Generate actual expected accounts by applying generated functions to test -# # graph -# actual_expected_graphlet_counts_ego_o1 <- -# actual_expected_graphlet_counts_ego_fn_o1(graph) -# actual_expected_graphlet_counts_ego_o2 <- -# actual_expected_graphlet_counts_ego_fn_o2(graph) -# -# # Compare actual to expected -# expect_equal( -# actual_expected_graphlet_counts_ego_o1, -# expected_expected_graphlet_counts_ego_o1 -# ) -# expect_equal( -# actual_expected_graphlet_counts_ego_o2, -# expected_expected_graphlet_counts_ego_o2 -# ) -# }) - -# context("Measures Netdis: Centered graphlet counts") -# test_that("netdis_centred_graphlet_counts_ego is correct", { -# # Set up small sample networks each with each graphlet represented in at least -# # one ego network -# ref_elist <- rbind( -# c("n1", "n2"), -# c("n1", "n3"), -# c("n1", "n4"), -# c("n1", "n5"), -# c("n1", "n6"), -# c("n2", "n7"), -# c("n2", "n8"), -# c("n2", "n9"), -# c("n9", "n10"), -# c("n10", "n11"), -# c("n11", "n12"), -# c("n11", "n13"), -# c("n2", "n14"), -# c("n8", "n14"), -# c("n12", "n15"), -# c("n12", "n16"), -# c("n15", "n17"), -# c("n12", "n18"), -# c("n15", "n18"), -# c("n16", "n17"), -# c("n16", "n18"), -# c("n17", "n18"), -# c("n16", "n19"), -# c("n16", "n20"), -# c("n16", "n21"), -# c("n19", "n20"), -# c("n19", "n21"), -# c("n15", "n22"), -# c("n15", "n23"), -# c("n15", "n24"), -# c("n22", "n23"), -# c("n22", "n24"), -# c("n23", "n24") -# ) -# ref_graph <- igraph::graph_from_edgelist(ref_elist, directed = FALSE) -# -# query_elist <- rbind( -# c("n1", "n2"), -# c("n2", "n3"), -# c("n1", "n4"), -# c("n2", "n5"), -# c("n1", "n6"), -# c("n1", "n7"), -# c("n2", "n4"), -# c("n4", "n6"), -# c("n6", "n8"), -# c("n7", "n8"), -# c("n7", "n9"), -# c("n7", "n10"), -# c("n8", "n9"), -# c("n8", "n10"), -# c("n9", "n10") -# ) -# query_graph <- igraph::graph_from_edgelist(query_elist, directed = FALSE) -# -# max_graphlet_size <- 4 -# # Use pre-tested functions to generate ego-network graphlet counts -# # 1. Reference graph ego-network graphlet counts -# ref_o1 <- count_graphlets_ego( -# ref_graph, -# max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 1, return_ego_networks = TRUE -# ) -# ego_counts_ref_o1 <- ref_o1$graphlet_counts -# ego_networks_ref_o1 <- ref_o1$ego_networks -# density_ref_o1 <- sapply(ego_networks_ref_o1, igraph::edge_density) -# -# ref_o2 <- count_graphlets_ego( -# ref_graph, -# max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 2, return_ego_networks = TRUE -# ) -# ego_counts_ref_o2 <- ref_o2$graphlet_counts -# ego_networks_ref_o2 <- ref_o2$ego_networks -# density_ref_o2 <- sapply(ego_networks_ref_o2, igraph::edge_density) -# -# # 2. Query graph ego-network graphlet countsa -# query_o1 <- count_graphlets_ego( -# query_graph, -# max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 1, return_ego_networks = TRUE -# ) -# ego_counts_query_o1 <- query_o1$graphlet_counts -# ego_networks_query_o1 <- query_o1$ego_networks -# density_query_o1 <- sapply(ego_networks_query_o1, igraph::edge_density) -# -# query_o2 <- count_graphlets_ego( -# query_graph, -# max_graphlet_size = max_graphlet_size, -# neighbourhood_size = 2, return_ego_networks = TRUE -# ) -# ego_counts_query_o2 <- query_o2$graphlet_counts -# ego_networks_query_o2 <- query_o2$ego_networks -# density_query_o2 <- sapply(ego_networks_query_o2, igraph::edge_density) -# -# centred_counts_k4 <- function(query_graphlet_count, ref_graphlet_count, -# query_node_counts, ref_node_count, -# min_nodes, min_edges, -# min_bin_count, num_bins) { -# graphlet_node_counts_k4 <- c(2, 3, 3, 4, 4, 4, 4, 4, 4) -# # 1. Calculate scaling factors for each reference and query graphlet count -# # These are nCk, where n is the number of nodes in the network and -# # k is the number of nodes in the graphlet -# ref_scale_factor <- sapply( -# graphlet_node_counts_k4, FUN <- function(k) { -# choose(ref_node_count, k) -# } -# ) -# query_scale_factor <- sapply( -# graphlet_node_counts_k4, FUN <- function(k) { -# choose(query_node_count, k) -# } -# ) -# # 2. Calculate scaled reference counts by dividing by ref_scale_factor -# ref_scaled_graphlet_count <- query_graphlet_count / ref_scale_factor -# # -# } -# }) - context("Netdis: Geometric Poisson") test_that("expected counts using geometric poisson approximation are correct", { @@ -1712,6 +1034,7 @@ test_that("expected counts using geometric poisson approximation are correct", { context("Netdis: Statistic calculation") test_that("netdis statistic function output matches manually verified result", { + # TODO Rewrite with more realistic counts. # arbitrary counts of correct size for graphlets up to size 5 counts_1 <- c(11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, @@ -1740,6 +1063,8 @@ test_that("netdis statistic function output matches manually verified result", { }) test_that("netdis_uptok gives expected netdis result for graphlets up to size k", { + # TODO Rewrite with more realistic counts. + # arbitrary counts of correct size for graphlets up to size 5 counts_1 <- c(11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14) @@ -1768,6 +1093,9 @@ test_that("netdis_many_to_many gives expected result", { context("Netdis: functions for different pairwise comparisons") test_that("netdis_one_to_one gives expected result", { + # TODO This test is not robust. Rewrite with basic network that gives known + # result. + # Set source directory for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") @@ -1804,6 +1132,9 @@ test_that("netdis_one_to_one gives expected result", { expect_equal(expected_netdis, actual_netdis, tolerance = .002, scale = 1) }) test_that("netdis_one_to_many gives expected result", { + # TODO This test is not robust. Rewrite with basic network that gives known + # result. + # Set source directory for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") From a4adf5319a35bd5c7af4885f6a27632f068377b5 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Mon, 23 Sep 2019 16:24:19 +0100 Subject: [PATCH 047/188] many to many test --- tests/testthat/test_measures_net_dis.R | 77 ++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 15cbc8af..2ac90ec8 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1029,6 +1029,7 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n context("Netdis: Geometric Poisson") test_that("expected counts using geometric poisson approximation are correct", { + # TODO write this test }) @@ -1088,6 +1089,69 @@ test_that("netdis_uptok gives expected netdis result for graphlets up to size k" context("Netdis: full calculation pipeline") test_that("netdis_many_to_many gives expected result", { + # TODO write this test + # TODO This test is not robust. Rewrite with basic network that gives known + # result. + + # Set source directory for Virus PPI graph edge files + source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + + # Load query and reference graphs + graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + graphs <- graphs[c("EBV", "ECL", "HSV-1", "KSHV")] + + ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") + ref_graph <- read_simple_graph(ref_path, format = "ncol") + + # set parameters + max_graphlet_size <- 4 + neighbourhood_size <- 2 + min_ego_nodes <- 3 + min_ego_edges <- 1 + + # manually verified results + # $netdis + # [,1] [,2] [,3] [,4] [,5] [,6] + # netdis3 0.1846655 0.008264222 0.01005385 0.2065762 0.2091241 0.0001335756 + # netdis4 0.1749835 0.165264120 0.01969246 0.2917612 0.2215579 0.0760242643 + # + # $comp_spec + # name_a name_b index_a index_b + # 1 EBV ECL 1 2 + # 2 EBV HSV-1 1 3 + # 3 EBV KSHV 1 4 + # 4 ECL HSV-1 2 3 + # 5 ECL KSHV 2 4 + # 6 HSV-1 KSHV 3 4 + expected_netdis_netdis <- matrix(nrow = 2, ncol = 6) + expected_netdis_netdis[1,] <- c(0.1846655, 0.008264222, 0.01005385, + 0.2065762, 0.2091241, 0.0001335756) + expected_netdis_netdis[2,] <- c(0.1749835, 0.165264120, 0.01969246, + 0.2917612, 0.2215579, 0.0760242643) + rownames(expected_netdis_netdis) <- c("netdis3", "netdis4") + + expected_netdis_comp_spec <- cross_comparison_spec( + list("EBV" = c(), + "ECL" = c(), + "HSV-1" = c(), + "KSHV" = c()) + ) + + expected_netdis = list(netdis = expected_netdis_netdis, + comp_spec = expected_netdis_comp_spec) + + + # Calculate netdis statistics + actual_netdis <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + + # Check results as expected + expect_equal(expected_netdis, actual_netdis, tolerance = .001, scale = 1) }) @@ -1129,7 +1193,7 @@ test_that("netdis_one_to_one gives expected result", { min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges) - expect_equal(expected_netdis, actual_netdis, tolerance = .002, scale = 1) + expect_equal(expected_netdis, actual_netdis, tolerance = .001, scale = 1) }) test_that("netdis_one_to_many gives expected result", { # TODO This test is not robust. Rewrite with basic network that gives known @@ -1154,9 +1218,14 @@ test_that("netdis_one_to_many gives expected result", { min_ego_edges <- 1 # manually verified results - # ECL HSV-1 KSHV VZV + # ECL HSV-1 KSHV VZV # netdis3 0.1846655 0.008264222 0.01005385 0.006777578 # netdis4 0.1749835 0.165264120 0.01969246 0.159711160 + expected_netdis <- matrix(nrow = 2, ncol = 4) + colnames(expected_netdis) <- c("ECL", "HSV-1", "KSHV", "VZV") + rownames(expected_netdis) <- c("netdis3", "netdis4") + expected_netdis[1,] <- c(0.1846655, 0.008264222, 0.01005385, 0.006777578) + expected_netdis[2,] <- c(0.1749835, 0.165264120, 0.01969246, 0.159711160) # Calculate netdis statistics actual_netdis <- netdis_one_to_many(graph_1, graphs_compare, @@ -1166,6 +1235,6 @@ test_that("netdis_one_to_many gives expected result", { min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges) - - + # Check results as expected + expect_equal(expected_netdis, actual_netdis, tolerance = .001, scale = 1) }) From 613f559d232fa7aabbaa2a3c1a3d6426fcb8d694 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Wed, 25 Sep 2019 13:56:15 +0100 Subject: [PATCH 048/188] tidy comments --- R/measures_net_dis.R | 21 --------------------- R/orca_interface.R | 10 ---------- tests/testthat/test_measures_net_dis.R | 1 - 3 files changed, 32 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index ae40a076..3015a797 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -1,4 +1,3 @@ -#' FLAGUSED #' Netdis between two graphs #' @param graph_1 First query graph #' @param graph_2 Second query graph @@ -68,7 +67,6 @@ netdis_one_to_one <- function(graph_1, graph_2, result$netdis[, 1] } -#' FLAGUSED #' Netdis comparisons between one graph and many other graphs #' @param graph_1 query graph - this graph will be compared with #' all graphs in graphs_compare @@ -143,7 +141,6 @@ netdis_one_to_many <- function(graph_1, graphs_compare, } -#' FLAGUSED #' Netdis between all graph pairs #' @param graphs Query graphs #' @param ref_graph Reference graph @@ -325,7 +322,6 @@ netdis_many_to_many <- function(graphs, } -#' FLAGUSED #' Netdis #' #' Calculate Netdis statistic between two graphs from their Centred Graphlet @@ -357,7 +353,6 @@ netdis <- function(centred_graphlet_counts1, centred_graphlet_counts2, 0.5 * (1 - netds2) } -#' FLAGUSED #' Netdis - graphlets up to max_graphlet_size #' #' Calculate Netdis statistic between two graphs from their Centred Graphlet @@ -395,13 +390,8 @@ netdis_uptok <- function(centred_graphlet_counts1, centred_graphlet_counts2, netdis_statistics } -#' FLAGUSED #' INTERNAL FUNCTION - Do not call directly #' -#' JACK To follow through logic of paper steps, wanted to pass -#' ego networks to the function, not the input query graph -#' (as in netdis_expected_graphlet_counts_ego above). -#' #' Used by \code{netdis_expected_graphlet_counts_ego_fn} to #' generate a function for calculating expected ego-network graphlet counts #' from the statistics of a provided reference graph. @@ -446,7 +436,6 @@ netdis_expected_graphlet_counts_per_ego <- function( expected_graphlet_counts } -#' FLAGUSED #' INTERNAL FUNCTION - Do not call directly #' #' Used by \code{netdis_expected_graphlet_counts_ego} to @@ -492,7 +481,6 @@ netdis_expected_graphlet_counts <- function(graph, matched_reference_counts } -#' FLAGUSED #' INTERNAL FUNCTION - Do not call directly #' #' Used by \code{netdis_expected_graphlet_counts_ego_fn} to @@ -532,7 +520,6 @@ mean_density_binned_graphlet_counts <- function(graphlet_counts, mean_density_binned_graphlet_counts } -#' FLAGUSED #' For case where don't want to use binning, return a single bin which covers #' the full range of possible density values. #' @param densities Ego network density values (only used to return @@ -544,7 +531,6 @@ single_density_bin <- function(densities) { breaks = c(0, 1)) } -#' FLAGUSED #' INTERNAL FUNCTION - Do not call directly #' #' Used to calculate expected graphlet counts for each density bin. @@ -582,7 +568,6 @@ density_binned_counts <- function(graphlet_counts, } -#' FLAGUSED #' Calculate expected counts in density bins using #' geometric poisson (Polya-Aeppli) approximation. #' @param graphlet_counts Graphlet counts for a number of ego_networks. @@ -635,7 +620,6 @@ density_binned_counts_gp <- function(graphlet_counts, } -#' FLAGUSED #' Replace zero values in a vector with ones. Used by #' \code{scale_graphlet_count} to prevent divide by #' zero errors. @@ -649,7 +633,6 @@ zeros_to_ones <- function(v) { } -#' FLAGUSED #' Divide graphlet counts by pre-computed scaling factor from #' \code{count_graphlet_tuples} output. #' @param graphlet_count Pre-computed graphlet counts. @@ -662,7 +645,6 @@ scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { } -#' FLAGUSED #' Run count_graphlet_tuples across pre-computed ego networks. #' @param ego_networks Pre-generated ego networks for an input graph. #' @param max_graphlet_size Determines the maximum size of graphlets included @@ -677,7 +659,6 @@ count_graphlet_tuples_ego <- function(ego_networks, max_graphlet_size) { } -#' FLAGUSED #' Calculate ego network edge densities. #' @param ego_networks Pre-generated ego networks for an input graph. #' @export @@ -691,7 +672,6 @@ ego_network_density <- function(ego_networks) { } -#' FLAGUSED #' Scale graphlet counts for an ego network by the n choose k possible #' choices of k nodes in that ego-network, where n is the number of nodes #' in the ego network and k is the number of nodes in the graphlet. @@ -718,7 +698,6 @@ scale_graphlet_counts_ego <- function(ego_networks, graphlet_counts, } -#' FLAGUSED #' For each graphlet calculate the number of possible sets of k nodes in the #' query graph, where k is the number of nodes in the graphlet. #' diff --git a/R/orca_interface.R b/R/orca_interface.R index d77bedb6..35b6b5f1 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -42,7 +42,6 @@ indexed_edges_to_graph <- function(indexed_edges) { return(graph) } -#' FLAGUSED #' Read all graphs in a directory, simplifying as requested #' #' Reads graph data from all files in a directory matching the specified @@ -112,7 +111,6 @@ read_simple_graphs <- function(source_dir, return(graphs) } -#' FLAGUSED #' Read a graph from file, simplifying as requested #' #' Reads graph data from file, constructing an a igraph graph object, making the @@ -153,7 +151,6 @@ read_simple_graph <- function(file, format, as_undirected = TRUE, ) } -#' FLAGUSED #' Simplify an igraph #' #' Takes a igraph graph object and makes the requested subset of the following @@ -387,11 +384,7 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size } } -#' FLAGUSED #' ego_to_graphlet_counts -#' JACK To follow through logic of paper steps, wanted to pass -#' ego networks to the function for generating graphlet counts, -#' not the input query graph directly (as in count_graphlets_ego above). #' #' Calculates graphlet counts for previously generated ego networks. #' @param ego_networks Named list of ego networks for a graph. @@ -417,7 +410,6 @@ ego_to_graphlet_counts <- function(ego_networks, max_graphlet_size = 4) { return(ego_graphlet_counts) } -#' FLAGUSED #' Get ego-networks for a graph as a named list #' #' Simple wrapper for the \code{igraph::make_ego_graph} function that names @@ -567,7 +559,6 @@ orbit_key <- function(max_graphlet_size) { return(list(max_nodes = max_graphlet_size, id = id, node_count = node_count)) } -#' FLAGUSED #' Graphlet IDs for size #' #' List IDs for all graphlets of a specified size @@ -632,7 +623,6 @@ gdd_for_all_graphs <- function( ) } -#' FLAGUSED #' Generate a cross-comparison specification #' #' Creates a cross-comparison matrix with pair-wise combinations diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 2ac90ec8..b23179a5 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1089,7 +1089,6 @@ test_that("netdis_uptok gives expected netdis result for graphlets up to size k" context("Netdis: full calculation pipeline") test_that("netdis_many_to_many gives expected result", { - # TODO write this test # TODO This test is not robust. Rewrite with basic network that gives known # result. From 4a57a1190dc8da48fc22ebf33c9701c34d628f5f Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 26 Sep 2019 15:26:06 +0100 Subject: [PATCH 049/188] adding number of nodes to graphlet_counts objects --- R/measures_net_dis.R | 4 ++++ R/orca_interface.R | 14 +++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 3015a797..fee6f4f8 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -201,6 +201,8 @@ netdis_many_to_many <- function(graphs, min_ego_edges = min_ego_edges ) + rm(graphs) + ## ------------------------------------------------------------------------ # Count graphlets for ego networks in query graphs graphlet_counts <- purrr::map( @@ -219,6 +221,8 @@ netdis_many_to_many <- function(graphs, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges ) + + rm(ref_graph) # Get ego network graphlet counts graphlet_counts_ref <- ego_to_graphlet_counts( diff --git a/R/orca_interface.R b/R/orca_interface.R index 35b6b5f1..028af0a4 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -384,6 +384,14 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size } } +#' ego_network_node_counts +#' +#' Calculates number of nodes in each ego network. +#' @param ego_networks Named list of ego networks for a graph. +ego_network_node_counts <- function(ego_networks) { + simplify2array(purrr::map(ego_networks, igraph::vcount)) +} + #' ego_to_graphlet_counts #' #' Calculates graphlet counts for previously generated ego networks. @@ -401,10 +409,14 @@ ego_to_graphlet_counts <- function(ego_networks, max_graphlet_size = 4) { ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, max_graphlet_size = max_graphlet_size ) - + # Reshape the list of per node single row graphlet count matrices to a single # ORCA format graphlet count matrix with one row per node ego_graphlet_counts <- t(simplify2array(ego_graphlet_counts)) + + # Add node counts column + N <- ego_network_node_counts(ego_networks) + ego_graphlet_counts <- cbind(N, ego_graphlet_counts) # Return graphlet counts return(ego_graphlet_counts) From 19a6bd3d83d26c24927beb14892aad9e69ad2505 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 26 Sep 2019 16:35:56 +0100 Subject: [PATCH 050/188] count graphlet tuples using graphlet counts object only --- R/measures_net_dis.R | 26 +++++++++++++++---------- vignettes/quickstart_netdis_2graphs.Rmd | 4 ++-- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index fee6f4f8..4c7060f4 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -654,11 +654,11 @@ scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { #' @param max_graphlet_size Determines the maximum size of graphlets included #' in the tuple counts. #' @export -count_graphlet_tuples_ego <- function(ego_networks, max_graphlet_size) { +count_graphlet_tuples_ego <- function(graphlet_counts, max_graphlet_size) { graphlet_tuple_counts <- - t(simplify2array(purrr::map(ego_networks, count_graphlet_tuples, - max_graphlet_size = max_graphlet_size - ))) + t(apply(graphlet_counts, 1, + count_graphlet_tuples, max_graphlet_size = max_graphlet_size)) + graphlet_tuple_counts } @@ -686,10 +686,10 @@ ego_network_density <- function(ego_networks) { #' in graphlet_counts. #' @return scaled graphlet counts. #' @export -scale_graphlet_counts_ego <- function(ego_networks, graphlet_counts, +scale_graphlet_counts_ego <- function(graphlet_counts, max_graphlet_size) { ego_graphlet_tuples <- count_graphlet_tuples_ego( - ego_networks, + graphlet_counts, max_graphlet_size = max_graphlet_size ) @@ -710,14 +710,20 @@ scale_graphlet_counts_ego <- function(ego_networks, graphlet_counts, #' @param max_graphlet_size Determines the maximum size of graphlets included #' in the tuple counts. #' @export -count_graphlet_tuples <- function(graph, max_graphlet_size) { - graph_node_count <- igraph::vcount(graph) +count_graphlet_tuples <- function(graph_graphlet_counts, max_graphlet_size) { + # extract node counts from graph_graphlet_counts + N <- graph_graphlet_counts["N"] + graphlet_key <- graphlet_key(max_graphlet_size) graphlet_node_counts <- graphlet_key$node_count - graphlet_tuple_counts <- choose(graph_node_count, graphlet_node_counts) + + graphlet_tuple_counts <- choose(N, graphlet_node_counts) + graphlet_tuple_counts <- stats::setNames( graphlet_tuple_counts, graphlet_key$id ) - graphlet_tuple_counts + + # add node counts back to object + graphlet_tuple_counts <- c(N, graphlet_tuple_counts) } diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index f65e5276..12d1c131 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -85,10 +85,10 @@ graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_g # Scale ego-network graphlet counts by dividing by total number of k-tuples in # ego-network (where k is graphlet size) -scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(ego_ref, - graphlet_counts_ref, +scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, max_graphlet_size) + # Get ego-network densities densities_ref <- ego_network_density(ego_ref) From 323612ac2bc396780c26d0607e1d840bb13db9e7 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 26 Sep 2019 16:55:11 +0100 Subject: [PATCH 051/188] density from graphlet counts objects --- R/measures_net_dis.R | 24 ++++++++++++++---------- vignettes/quickstart_netdis_2graphs.Rmd | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 4c7060f4..29414cbf 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -261,7 +261,7 @@ netdis_many_to_many <- function(graphs, # Case where expected counts calculated using query networks # Get ego-network densities - densities <- purrr::map(ego_networks, + densities <- purrr::map(graphlet_counts, ego_network_density) # bin ref ego-network densities @@ -663,18 +663,22 @@ count_graphlet_tuples_ego <- function(graphlet_counts, max_graphlet_size) { } -#' Calculate ego network edge densities. -#' @param ego_networks Pre-generated ego networks for an input graph. +#' Calculate edge density for a single graph. +#' @param graphlet_counts Vector of pre-calculated graphlet, edge and node +#' counts. Must have named items "N" (node counts) and "G0" (edge counts). #' @export -ego_network_density <- function(ego_networks) { - densities <- purrr::simplify(purrr::map_dbl( - ego_networks, - igraph::edge_density - )) - - return(densities) +density_from_counts <- function(graphlet_counts) { + graphlet_counts["G0"] / choose(graphlet_counts["N"], 2) } +#' Calculate ego network edge densities. +#' @param graphlet_counts Matrix of pre-generated graphlet, edge and node counts +#' (columns) for each ego network (rows). Columns must include "N" (node counts) +#' and "G0" (edge counts). +#' @export +ego_network_density <- function(graphlet_counts) { + apply(graphlet_counts, 1, density_from_counts) +} #' Scale graphlet counts for an ego network by the n choose k possible #' choices of k nodes in that ego-network, where n is the number of nodes diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index 12d1c131..69e2d0ba 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -90,7 +90,7 @@ scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, # Get ego-network densities -densities_ref <- ego_network_density(ego_ref) +densities_ref <- ego_network_density(graphlet_counts_ref) # Adaptively bin ref ego-network densities binned_densities <- binned_densities_adaptive(densities_ref, From 3c3cd16c04e950bb873325130e43f11dcaa7ba16 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 27 Sep 2019 17:25:46 +0100 Subject: [PATCH 052/188] expected counts functions using graphlet counts --- R/measures_net_dis.R | 38 +++++++++++-------------- vignettes/quickstart_netdis_2graphs.Rmd | 4 +-- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 29414cbf..9dbc1eea 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -211,6 +211,8 @@ netdis_many_to_many <- function(graphs, max_graphlet_size = max_graphlet_size ) + rm(ego_networks) + ## ------------------------------------------------------------------------ # Case where expected counts calculated using a reference network if (!is.null(ref_graph)) { @@ -231,7 +233,7 @@ netdis_many_to_many <- function(graphs, ) # Get ego-network densities - densities_ref <- ego_network_density(ego_ref) + densities_ref <- ego_network_density(graphlet_counts_ref) # bin ref ego-network densities binned_densities <- binning_fn(densities_ref) @@ -242,14 +244,13 @@ netdis_many_to_many <- function(graphs, ref_binned_graphlet_counts <- bin_counts_fn( graphlet_counts_ref, binned_densities$interval_indexes, - ego_networks = ego_ref, max_graphlet_size = max_graphlet_size ) # Calculate expected graphlet counts (using ref # graph ego network density bins) exp_graphlet_counts <- purrr::map( - ego_networks, + graphlet_counts, exp_counts_fn, density_breaks = ref_ego_density_bins, density_binned_reference_counts = ref_binned_graphlet_counts, @@ -288,7 +289,7 @@ netdis_many_to_many <- function(graphs, # Calculate expected graphlet counts for each ego network exp_graphlet_counts <- mapply(exp_counts_fn, - ego_networks, + graphlet_counts, ego_density_bin_breaks, binned_graphlet_counts, max_graphlet_size = max_graphlet_size, @@ -416,7 +417,7 @@ netdis_uptok <- function(centred_graphlet_counts1, centred_graphlet_counts2, #' TODO: Remove @export prior to publishing #' @export netdis_expected_graphlet_counts_per_ego <- function( - ego_networks, + graphlet_counts, density_breaks, density_binned_reference_counts, max_graphlet_size, @@ -425,18 +426,13 @@ netdis_expected_graphlet_counts_per_ego <- function( # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. - expected_graphlet_counts <- - purrr::map(ego_networks, netdis_expected_graphlet_counts, - max_graphlet_size = max_graphlet_size, - density_breaks = density_breaks, - density_binned_reference_counts = density_binned_reference_counts, - scale_fn = scale_fn - ) - names(expected_graphlet_counts) <- names(ego_networks) - - # Simplify list to array - expected_graphlet_counts <- t(simplify2array(expected_graphlet_counts)) - colnames(expected_graphlet_counts) <- graphlet_key(max_graphlet_size)$id + expected_graphlet_counts <- t(apply( + graphlet_counts, 1, netdis_expected_graphlet_counts, + max_graphlet_size = max_graphlet_size, + density_breaks = density_breaks, + density_binned_reference_counts = density_binned_reference_counts, + scale_fn = scale_fn)) + expected_graphlet_counts } @@ -460,7 +456,7 @@ netdis_expected_graphlet_counts_per_ego <- function( #' Temporarily accessible during development. #' TODO: Remove @export prior to publishing #' @export -netdis_expected_graphlet_counts <- function(graph, +netdis_expected_graphlet_counts <- function(graphlet_counts, max_graphlet_size, density_breaks, density_binned_reference_counts, @@ -468,7 +464,7 @@ netdis_expected_graphlet_counts <- function(graph, # Look up average scaled graphlet counts for graphs of similar density # in the reference graph - query_density <- igraph::edge_density(graph) + query_density <- density_from_counts(graphlet_counts) matched_density_index <- interval_index(query_density, density_breaks) matched_reference_counts <- @@ -480,7 +476,7 @@ netdis_expected_graphlet_counts <- function(graph, # of possible sets of k nodes in the query graph, # where k is the number of nodes in the graphlet. matched_reference_counts <- matched_reference_counts * - scale_fn(graph, max_graphlet_size) + scale_fn(graphlet_counts, max_graphlet_size) } matched_reference_counts } @@ -553,7 +549,6 @@ density_binned_counts <- function(graphlet_counts, density_interval_indexes, agg_fn = mean, scale_fn = NULL, - ego_networks = NULL, max_graphlet_size = NULL) { if (!is.null(scale_fn)) { @@ -561,7 +556,6 @@ density_binned_counts <- function(graphlet_counts, # by dividing by total number of k-tuples in # ego-network (where k is graphlet size) graphlet_counts <- scale_fn(graphlet_counts, - ego_networks = ego_networks, max_graphlet_size = max_graphlet_size) } diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index 69e2d0ba..6bcfd610 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -109,14 +109,14 @@ ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( ## Centre graphlet counts of query graphs based on statistics of reference graph ```{r} # Calculate expected graphlet counts (using ref graph ego network density bins) -exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, +exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_1, ref_ego_density_bins, ref_binned_graphlet_counts, max_graphlet_size, scale_fn=count_graphlet_tuples) -exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, +exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_2, ref_ego_density_bins, ref_binned_graphlet_counts, max_graphlet_size, From 8d75dd1184a029133e6b26711278682d11021884 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 8 Oct 2019 16:56:41 +0100 Subject: [PATCH 053/188] stopped passing around ego networks + started to implement possibility to pass in graphlet counts and a constant expected counts value --- R/measures_net_dis.R | 107 ++++++++++++++-------- R/orca_interface.R | 13 +-- vignettes/netdis_2graphs_polya-aeppli.Rmd | 16 ++-- vignettes/quickstart_netdis_2graphs.Rmd | 8 +- 4 files changed, 90 insertions(+), 54 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 9dbc1eea..f9d77e6e 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -191,47 +191,50 @@ netdis_many_to_many <- function(graphs, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial( netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples)) { + scale_fn = count_graphlet_tuples), + graphlet_counts = NULL, + graphlet_counts_ref = NULL) { ## ------------------------------------------------------------------------ - # Get ego networks for query graphs - ego_networks <- purrr::map( - graphs, make_named_ego_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ) - + # Generate ego networks and count graphlets for query graphs. + # But if some graphlet counts have been provided we can skip this step. + if (is.null(graphlet_counts)) { + graphlet_counts <- purrr::map( + graphs, + count_graphlets_ego, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + } rm(graphs) - - ## ------------------------------------------------------------------------ - # Count graphlets for ego networks in query graphs - graphlet_counts <- purrr::map( - ego_networks, - ego_to_graphlet_counts, - max_graphlet_size = max_graphlet_size - ) - rm(ego_networks) + ## ------------------------------------------------------------------------ + # If a number has been passed as ref_graph, treat it as a constant expected + # counts value (e.g. if zero counts will not be centred) + if (is.numeric(ref_graph)) { + # TODO set exp_graphlet_counts to matrices of constant (ref_graph) ## ------------------------------------------------------------------------ - # Case where expected counts calculated using a reference network - if (!is.null(ref_graph)) { - # Get ego networks - ego_ref <- make_named_ego_graph( - ref_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges - ) + # If a reference graph passed, use it to calculate expected counts for all + # query graphs. + } else if (!is.null(ref_graph) || !is.null(graphlet_counts_ref)) { + # Generate ego networks and calculate graphlet counts + # But if some ref graphlet counts provided can skip this step + if (is.null(graphlet_counts_ref)) { + graphlet_counts_ref <- count_graphlets_ego( + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + } rm(ref_graph) - - # Get ego network graphlet counts - graphlet_counts_ref <- ego_to_graphlet_counts( - ego_ref, - max_graphlet_size = max_graphlet_size - ) - + # Get ego-network densities densities_ref <- ego_network_density(graphlet_counts_ref) @@ -258,9 +261,9 @@ netdis_many_to_many <- function(graphs, ) ## ------------------------------------------------------------------------ + # If no reference passed, calculate expected counts using query networks + # themselves. } else { - # Case where expected counts calculated using query networks - # Get ego-network densities densities <- purrr::map(graphlet_counts, ego_network_density) @@ -298,7 +301,10 @@ netdis_many_to_many <- function(graphs, ## ------------------------------------------------------------------------ # Centre graphlet counts by subtracting expected counts - centred_graphlet_counts <- mapply("-", graphlet_counts, exp_graphlet_counts) + centred_graphlet_counts <- mapply(netdis_centred_graphlet_counts, + graphlet_counts, + exp_graphlet_counts, + max_graphlet_size = max_graphlet_size) ## ------------------------------------------------------------------------ # Sum centred graphlet counts across all ego networks @@ -395,6 +401,33 @@ netdis_uptok <- function(centred_graphlet_counts1, centred_graphlet_counts2, netdis_statistics } + +#' netdis_centred_graphlet_counts +#' +#' Centre counts by subtracting expected graphlet counts from actual graphlet +#' counts. +#' @param graphlet_counts Ego network graphlet counts for a query graph +#' @param exp_graphlet_counts Pre-calculated expected counts for each graphlet +#' type for each ego network. +#' @param max_graphlet_size max graphlet size to calculate centred counts for. +#' @return graphlet_counts minus exp_graphlet_counts for graphlets up to size +#' max_graphlet_size. +#' @export +netdis_centred_graphlet_counts <- function( + graphlet_counts, + exp_graphlet_counts, + max_graphlet_size) { + + # extract columns for graphlets up to size max_graphlet_size + id <- graphlet_key(max_graphlet_size)$id + graphlet_counts <- graphlet_counts[, id] + exp_graphlet_counts <- exp_graphlet_counts[, id] + + # centre counts + graphlet_counts - exp_graphlet_counts + +} + #' INTERNAL FUNCTION - Do not call directly #' #' Used by \code{netdis_expected_graphlet_counts_ego_fn} to diff --git a/R/orca_interface.R b/R/orca_interface.R index 028af0a4..e6b887c2 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -367,15 +367,10 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges ) - # Generate graphlet counts for each node in each ego network (returns an ORCA - # format graphlet count matrix for each ego network) - ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, - max_graphlet_size = max_graphlet_size - ) - # Reshape the list of per node single row graphlet count matrices to a single - # ORCA format graphlet count matrix with one row per node - ego_graphlet_counts <- t(simplify2array(ego_graphlet_counts)) - + + # Generate graphlet counts for each node in each ego network + ego_graphlet_counts <- ego_to_graphlet_counts(ego_networks, max_graphlet_size) + # Return either graphlet counts, or graphlet counts and ego_networks if (return_ego_networks) { return(list(graphlet_counts = ego_graphlet_counts, ego_networks = ego_networks)) diff --git a/vignettes/netdis_2graphs_polya-aeppli.Rmd b/vignettes/netdis_2graphs_polya-aeppli.Rmd index a5f2f7e9..3ebb51ce 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.Rmd +++ b/vignettes/netdis_2graphs_polya-aeppli.Rmd @@ -74,8 +74,8 @@ graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graph ```{r} # Get ego-network densities -densities_1 <- ego_network_density(ego_1) -densities_2 <- ego_network_density(ego_2) +densities_1 <- ego_network_density(graphlet_counts_1) +densities_2 <- ego_network_density(graphlet_counts_2) # Adaptively bin ego-network densities binned_densities_1 <- binned_densities_adaptive(densities_1, @@ -148,22 +148,26 @@ binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, ## Centre graphlet counts of query graphs using binned expected counts ```{r} # Calculate expected graphlet counts for each ego network -exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, +exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_1, ego_density_bins_1, binned_graphlet_counts_1, max_graphlet_size, scale_fn = NULL) -exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, +exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_2, ego_density_bins_2, binned_graphlet_counts_2, max_graphlet_size, scale_fn = NULL) # Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 +centred_graphlet_counts_1 <- netdis_centred_graphlet_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) -centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 +centred_graphlet_counts_2 <- netdis_centred_graphlet_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) ``` ## Sum centred graphlet counts across all ego networks diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index 6bcfd610..3fb06e6e 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -123,9 +123,13 @@ exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts scale_fn=count_graphlet_tuples) # Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 +centred_graphlet_counts_1 <- netdis_centred_graphlet_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) -centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 +centred_graphlet_counts_2 <- netdis_centred_graphlet_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) ``` ## Sum centred graphlet counts across all ego networks From ecc6b1d33670a4ea37b6f0c157a71437dfc65bc1 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 10 Oct 2019 14:04:48 +0100 Subject: [PATCH 054/188] add option to pass constant exp counts value --- R/measures_net_dis.R | 184 ++++++++++++++++++++++++++++++++----------- 1 file changed, 137 insertions(+), 47 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index f9d77e6e..b005f72d 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -1,24 +1,40 @@ #' Netdis between two graphs -#' @param graph_1 First query graph -#' @param graph_2 Second query graph -#' @param ref_graph Reference graph -#' @param max_graphlet_size Generate graphlets up to this size -#' @param neighbourhood_size Ego network neighbourhood size +#' +#' @param graph_1 A simplified igraph graph object. +#' +#' @param graph_2 A simplified igraph graph object. +#' +#' @param ref_graph Controls how expected counts are calculated. Either: +#' 1) A numeric value - used as a constant expected counts value for all query +#' graphs (DEFAULT: 0). +#' 2) A simplified \code{igraph} object - used as a reference graph from which +#' expected counts are calculated for all query graphs. +#' 3) NULL - Expected counts will be calculated based on the properties of the +#' query graphs themselves. +#' +#' @param max_graphlet_size Generate graphlets up to this size. +#' +#' @param neighbourhood_size Ego network neighbourhood size. +#' #' @param min_ego_nodes Filter ego networks which have fewer -#' than min_ego_nodes nodes +#' than min_ego_nodes nodes. +#' #' @param min_ego_edges Filter ego networks which have fewer -#' than min_ego_edges edges +#' than min_ego_edges edges. +#' #' @param binning_fn Function used to bin ego network densities. Takes densities #' as its single argument, and returns a named list including keys \code{breaks} #' (list of bin edges) and \code{interval_indexes} (density bin index for each #' ego network). (Default: \code{binned_densities_adaptive} with #' \code{min_counts_per_interval = 5} and \code{num_intervals = 100}). +#' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -#' (bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. +#' (bin indexes) and \code{max_graphlet_size} as arguments. #' (Default: \code{density_binned_counts} with \code{agg_fn = mean} and #' \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the #' approach used in the original netdis paper). +#' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, @@ -26,11 +42,13 @@ #' (Default: \code{netdis_expected_graphlet_counts_per_ego} with #' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in #' the original netdis paper). +#' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size +#' #' @export netdis_one_to_one <- function(graph_1, graph_2, - ref_graph, + ref_graph = 0, max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, @@ -67,28 +85,45 @@ netdis_one_to_one <- function(graph_1, graph_2, result$netdis[, 1] } -#' Netdis comparisons between one graph and many other graphs -#' @param graph_1 query graph - this graph will be compared with -#' all graphs in graphs_compare -#' @param graphs_compare graphs graph_1 will be compared with -#' @param ref_graph Reference graph -#' @param max_graphlet_size Generate graphlets up to this size -#' @param neighbourhood_size Ego network neighbourhood size +#' Netdis comparisons between one graph and many other graphs. +#' +#' @param graph_1 Query graph - this graph will be compared with +#' all graphs in graphs_compare. A simplified igraph graph object. +#' +#' @param graphs_compare Graphs graph_1 will be compared with. A named list of +#' simplified igraph graph objects. +#' +#' @param ref_graph Controls how expected counts are calculated. Either: +#' 1) A numeric value - used as a constant expected counts value for all query +#' graphs (DEFAULT: 0). +#' 2) A simplified \code{igraph} object - used as a reference graph from which +#' expected counts are calculated for all query graphs. +#' 3) NULL - Expected counts will be calculated based on the properties of the +#' query graphs themselves. +#' +#' @param max_graphlet_size Generate graphlets up to this size. +#' +#' @param neighbourhood_size Ego network neighbourhood size. +#' #' @param min_ego_nodes Filter ego networks which have fewer -#' than min_ego_nodes nodes +#' than min_ego_nodes nodes. +#' #' @param min_ego_edges Filter ego networks which have fewer -#' than min_ego_edges edges +#' than min_ego_edges edges. +#' #' @param binning_fn Function used to bin ego network densities. Takes densities #' as its single argument, and returns a named list including keys \code{breaks} #' (list of bin edges) and \code{interval_indexes} (density bin index for each #' ego network). (Default: \code{binned_densities_adaptive} with #' \code{min_counts_per_interval = 5} and \code{num_intervals = 100}). +#' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -#' (bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. +#' (bin indexes) and \code{max_graphlet_size} as arguments. #' (Default: \code{density_binned_counts} with \code{agg_fn = mean} and #' \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the #' approach used in the original netdis paper). +#' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, @@ -96,26 +131,27 @@ netdis_one_to_one <- function(graph_1, graph_2, #' (Default: \code{netdis_expected_graphlet_counts_per_ego} with #' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in #' the original netdis paper). +#' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size #' @export netdis_one_to_many <- function(graph_1, graphs_compare, - ref_graph, - max_graphlet_size = 4, - neighbourhood_size = 2, - min_ego_nodes = 3, - min_ego_edges = 1, - binning_fn = purrr::partial( - binned_densities_adaptive, - min_counts_per_interval = 5, - num_intervals = 100), - bin_counts_fn = purrr::partial( - density_binned_counts, - agg_fn = mean, - scale_fn = scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial( - netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples)) { + ref_graph = 0, + max_graphlet_size = 4, + neighbourhood_size = 2, + min_ego_nodes = 3, + min_ego_edges = 1, + binning_fn = purrr::partial( + binned_densities_adaptive, + min_counts_per_interval = 5, + num_intervals = 100), + bin_counts_fn = purrr::partial( + density_binned_counts, + agg_fn = mean, + scale_fn = scale_graphlet_counts_ego), + exp_counts_fn = purrr::partial( + netdis_expected_graphlet_counts_per_ego, + scale_fn = count_graphlet_tuples)) { # bundle graph_1 and graphs_compare to one vector, with # graph_1 at start as needed for netdis_many_to_many call @@ -142,28 +178,46 @@ netdis_one_to_many <- function(graph_1, graphs_compare, #' Netdis between all graph pairs -#' @param graphs Query graphs -#' @param ref_graph Reference graph +#' +#' @param graphs A named list of simplified igraph graph objects (undirected +#' graphs excluding loops, multiple edges and isolated vertices), such as those +#' obtained by using \code{read_simple_graphs}. +#' +#' @param ref_graph Controls how expected counts are calculated. Either: +#' 1) A numeric value - used as a constant expected counts value for all query +#' graphs (DEFAULT: 0). +#' 2) A simplified \code{igraph} object - used as a reference graph from which +#' expected counts are calculated for all query graphs. +#' 3) NULL - Expected counts will be calculated based on the properties of the +#' query graphs themselves. +#' #' @param comparisons Which comparisons to perform between graphs. #' Can be "many-to-many" (all pairwise combinations) or "one-to-many" #' (compare first graph in graphs to all other graphs.) -#' @param max_graphlet_size Generate graphlets up to this size -#' @param neighbourhood_size Ego network neighbourhood size +#' +#' @param max_graphlet_size Generate graphlets up to this size. +#' +#' @param neighbourhood_size Ego network neighbourhood size. +#' #' @param min_ego_nodes Filter ego networks which have fewer -#' than min_ego_nodes nodes +#' than min_ego_nodes nodes. +#' #' @param min_ego_edges Filter ego networks which have fewer -#' than min_ego_edges edges +#' than min_ego_edges edges. +#' #' @param binning_fn Function used to bin ego network densities. Takes densities #' as its single argument, and returns a named list including keys \code{breaks} #' (list of bin edges) and \code{interval_indexes} (density bin index for each #' ego network). (Default: \code{binned_densities_adaptive} with #' \code{min_counts_per_interval = 5} and \code{num_intervals = 100}). +#' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -#' (bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. +#' (bin indexes) and \code{max_graphlet_size} as arguments. #' (Default: \code{density_binned_counts} with \code{agg_fn = mean} and #' \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the #' approach used in the original netdis paper). +#' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, @@ -171,11 +225,27 @@ netdis_one_to_many <- function(graph_1, graphs_compare, #' (Default: \code{netdis_expected_graphlet_counts_per_ego} with #' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in #' the original netdis paper). +#' +#' @param graphlet_counts Pre-generated graphlet counts. If the +#' \code{graphlet_counts} argument is defined then \code{graphs} will not be +#' used. +#' A named list of matrices containing counts of each graphlet (columns) for +#' each ego-network in the input graph (rows). Columns are labelled with +#' graphlet IDs and rows are labelled with the ID of the central node in each +#' ego-network. As well as graphlet counts, each matrix must contain an +#' additional column labelled "N" including the node count for +#' each ego network. +#' +#' @param graphlet_counts_ref Pre-generated reference graphlet counts. If the +#' \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not +#' be used. +#' #' @return Netdis statistics between query graphs for graphlet sizes #' up to and including max_graphlet_size. +#' #' @export -netdis_many_to_many <- function(graphs, - ref_graph, +netdis_many_to_many <- function(graphs = NULL, + ref_graph = 0, comparisons = "many-to-many", max_graphlet_size = 4, neighbourhood_size = 2, @@ -194,6 +264,12 @@ netdis_many_to_many <- function(graphs, scale_fn = count_graphlet_tuples), graphlet_counts = NULL, graphlet_counts_ref = NULL) { + + ## ------------------------------------------------------------------------ + if (is.null(graphs) & is.null(graphlet_counts)) { + stop("One of graphs and graphlet_counts must be supplied.") + } + ## ------------------------------------------------------------------------ # Generate ego networks and count graphlets for query graphs. # But if some graphlet counts have been provided we can skip this step. @@ -212,10 +288,12 @@ netdis_many_to_many <- function(graphs, ## ------------------------------------------------------------------------ # If a number has been passed as ref_graph, treat it as a constant expected - # counts value (e.g. if zero counts will not be centred) + # counts value (e.g. if ref_graph = 0 then graphlet_counts used directly). if (is.numeric(ref_graph)) { - # TODO set exp_graphlet_counts to matrices of constant (ref_graph) - + exp_graphlet_counts <- purrr::map(graphlet_counts, + netdis_const_expected_counts, + const = ref_graph) + ## ------------------------------------------------------------------------ # If a reference graph passed, use it to calculate expected counts for all # query graphs. @@ -651,6 +729,18 @@ density_binned_counts_gp <- function(graphlet_counts, } +#' Create matrix of constant value to use as expected counts. +#' @param graphlet_counts Ego network graphlet counts matrix to create expected +#' counts for. +#' @param const Constant expected counts value to use. +#' @return Counts of value const with same shape and names as graphlet_counts. +netdis_const_expected_counts <- function(graphlet_counts, const) { + exp_counts <- graphlet_counts + exp_counts[,] <- const + exp_counts +} + + #' Replace zero values in a vector with ones. Used by #' \code{scale_graphlet_count} to prevent divide by #' zero errors. From 5f8969ca8e64f0d135f75c47b256d5f297d5f653 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 10 Oct 2019 14:37:01 +0100 Subject: [PATCH 055/188] update one_to_one and one_to_many fns to reflect many_to_many changes --- R/measures_net_dis.R | 147 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 124 insertions(+), 23 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index b005f72d..07c3dcd3 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -42,12 +42,21 @@ #' (Default: \code{netdis_expected_graphlet_counts_per_ego} with #' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in #' the original netdis paper). -#' +#' +#' @param graphlet_counts_1 Pre-generated graphlet counts for the first query +#' graph. If the \code{graphlet_counts_1} argument is defined then +#' \code{graph_1} will not be used. +#' +#' @param graphlet_counts_2 Pre-generated graphlet counts for the second query +#' graph. If the \code{graphlet_counts_2} argument is defined then +#' \code{graph_2} will not be used. +#' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size #' #' @export -netdis_one_to_one <- function(graph_1, graph_2, +netdis_one_to_one <- function(graph_1 = NULL, + graph_2 = NULL, ref_graph = 0, max_graphlet_size = 4, neighbourhood_size = 2, @@ -63,24 +72,65 @@ netdis_one_to_one <- function(graph_1, graph_2, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial( netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples)) { + scale_fn = count_graphlet_tuples), + graphlet_counts_1 = NULL, + graphlet_counts_2 = NULL) { - # bundle graphs into one vector with format needed for - # netdis many-to-many - graphs <- list(graph_1 = graph_1, graph_2 = graph_2) + ## ------------------------------------------------------------------------ + # Check arguments + if (is.null(graph_1) & is.null(graphlet_counts_1)) { + stop("One of graph_1 and graphlet_counts_1 must be supplied.") + } + if (is.null(graph_2) & is.null(graphlet_counts_2)) { + stop("One of graph_2 and graphlet_counts_2 must be supplied.") + } + + ## ------------------------------------------------------------------------ + # Generate graphlet counts and bundle them into named list with format needed + # for netdis_many_to_many. + + if (is.null(graphlet_counts_1)) { + graphlet_counts_1 <- count_graphlets_ego( + graph_1, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + } + rm(graph_1) + + if (is.null(graphlet_counts_2)) { + graphlet_counts_2 <- count_graphlets_ego( + graph_2, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + } + rm(graph_2) + + graphlet_counts <- list(graph_1 = graphlet_counts_1, + graph_2 = graphlet_counts_2) + ## ------------------------------------------------------------------------ # calculate netdis result <- netdis_many_to_many( - graphs, - ref_graph, + graphs = NULL, + ref_graph = ref_graph, max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, binning_fn = binning_fn, - exp_counts_fn = exp_counts_fn + exp_counts_fn = exp_counts_fn, + graphlet_counts = graphlet_counts ) - + + ## ------------------------------------------------------------------------ # extract netdis statistics from list returned by netdis_many_to_many result$netdis[, 1] } @@ -132,10 +182,19 @@ netdis_one_to_one <- function(graph_1, graph_2, #' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in #' the original netdis paper). #' +#' @param graphlet_counts_1 Pre-generated graphlet counts for the first query +#' graph. If the \code{graphlet_counts_1} argument is defined then +#' \code{graph_1} will not be used. +#' +#' @param graphlet_counts_compare Named list of pre-generated graphlet counts +#' for the remaining query graphs. If the \code{graphlet_counts_compare} +#' argument is defined then \code{graphs_compare} will not be used. +#' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size #' @export -netdis_one_to_many <- function(graph_1, graphs_compare, +netdis_one_to_many <- function(graph_1 = NULL, + graphs_compare = NULL, ref_graph = 0, max_graphlet_size = 4, neighbourhood_size = 2, @@ -151,16 +210,56 @@ netdis_one_to_many <- function(graph_1, graphs_compare, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial( netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples)) { - - # bundle graph_1 and graphs_compare to one vector, with - # graph_1 at start as needed for netdis_many_to_many call - graphs <- append(graphs_compare, list(graph_1 = graph_1), after = 0) - + scale_fn = count_graphlet_tuples), + graphlet_counts_1 = NULL, + graphlet_counts_compare = NULL) { + ## ------------------------------------------------------------------------ + # Check arguments + if (is.null(graph_1) & is.null(graphlet_counts_1)) { + stop("One of graph_1 and graphlet_counts_1 must be supplied.") + } + if (is.null(graphs_compare) & is.null(graphlet_counts_compare)) { + stop("One of graph_2 and graphlet_counts_2 must be supplied.") + } + + ## ------------------------------------------------------------------------ + # Generate graphlet counts and bundle them into named list with format needed + # for netdis_many_to_many. + + if (is.null(graphlet_counts_1)) { + graphlet_counts_1 <- count_graphlets_ego( + graph_1, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + } + rm(graph_1) + + if (is.null(graphlet_counts_compare)) { + graphlet_counts_compare <- purrr::map( + graphs_compare, + count_graphlets_ego, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + } + rm(graphs_compare) + + graphlet_counts <- append(graphlet_counts_compare, + list(graph_1 = graphlet_counts_1), + after = 0) + + ## ------------------------------------------------------------------------ # calculate netdis result <- netdis_many_to_many( - graphs, - ref_graph, + graphs = NULL, + ref_graph = ref_graph, comparisons = "one-to-many", max_graphlet_size = 4, neighbourhood_size = 2, @@ -168,9 +267,11 @@ netdis_one_to_many <- function(graph_1, graphs_compare, min_ego_edges = 1, binning_fn = binning_fn, bin_counts_fn = bin_counts_fn, - exp_counts_fn = exp_counts_fn + exp_counts_fn = exp_counts_fn, + graphlet_counts = graphlet_counts ) + ## ------------------------------------------------------------------------ # restructure netdis_many_to_many output colnames(result$netdis) <- result$comp_spec$name_b result$netdis @@ -266,13 +367,14 @@ netdis_many_to_many <- function(graphs = NULL, graphlet_counts_ref = NULL) { ## ------------------------------------------------------------------------ + # Check arguments if (is.null(graphs) & is.null(graphlet_counts)) { stop("One of graphs and graphlet_counts must be supplied.") } ## ------------------------------------------------------------------------ # Generate ego networks and count graphlets for query graphs. - # But if some graphlet counts have been provided we can skip this step. + # But if graphlet counts have already been provided we can skip this step. if (is.null(graphlet_counts)) { graphlet_counts <- purrr::map( graphs, @@ -288,7 +390,7 @@ netdis_many_to_many <- function(graphs = NULL, ## ------------------------------------------------------------------------ # If a number has been passed as ref_graph, treat it as a constant expected - # counts value (e.g. if ref_graph = 0 then graphlet_counts used directly). + # counts value (e.g. if ref_graph = 0 then no centring of counts). if (is.numeric(ref_graph)) { exp_graphlet_counts <- purrr::map(graphlet_counts, netdis_const_expected_counts, @@ -406,7 +508,6 @@ netdis_many_to_many <- function(graphs = NULL, comp_spec$index_b, SIMPLIFY = TRUE) - list(netdis = results, comp_spec = comp_spec) } From 49f5b74943d78688821a08c4bd5b2dc7545d850f Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 11 Oct 2019 11:49:06 +0100 Subject: [PATCH 056/188] rebuild --- NAMESPACE | 2 + man/count_graphlet_tuples.Rd | 12 ++- man/count_graphlet_tuples_ego.Rd | 10 +-- man/cross_comparison_spec.Rd | 3 +- man/density_binned_counts.Rd | 10 +-- man/density_binned_counts_gp.Rd | 4 +- man/density_from_counts.Rd | 15 ++++ man/ego_network_density.Rd | 10 +-- man/ego_network_node_counts.Rd | 14 ++++ man/ego_to_graphlet_counts.Rd | 6 +- man/graphlet_ids_for_size.Rd | 3 +- man/make_named_ego_graph.Rd | 3 +- man/mean_density_binned_graphlet_counts.Rd | 3 +- man/netdis.Rd | 3 +- man/netdis_centred_graphlet_counts.Rd | 25 ++++++ man/netdis_const_expected_counts.Rd | 20 +++++ man/netdis_expected_graphlet_counts.Rd | 13 ++- ...netdis_expected_graphlet_counts_per_ego.Rd | 16 ++-- man/netdis_many_to_many.Rd | 49 +++++++---- man/netdis_one_to_many.Rd | 48 +++++++---- man/netdis_one_to_one.Rd | 42 ++++++---- man/netdis_uptok.Rd | 3 +- man/read_simple_graph.Rd | 3 +- man/read_simple_graphs.Rd | 3 +- man/scale_graphlet_count.Rd | 4 +- man/scale_graphlet_counts_ego.Rd | 10 +-- man/simplify_graph.Rd | 3 +- man/single_density_bin.Rd | 4 +- man/zeros_to_ones.Rd | 4 +- vignettes/dendrogram_example_net_dis.html | 4 +- vignettes/netdis_2graphs_polya-aeppli.R | 16 ++-- vignettes/netdis_2graphs_polya-aeppli.html | 22 ++--- vignettes/netdis_pairwise_comparisons.html | 4 +- vignettes/quickstart_netdis_2graphs.R | 19 +++-- vignettes/quickstart_netdis_2graphs.Rmd | 1 - vignettes/quickstart_netdis_2graphs.html | 81 ++++++++++--------- 36 files changed, 297 insertions(+), 195 deletions(-) create mode 100644 man/density_from_counts.Rd create mode 100644 man/ego_network_node_counts.Rd create mode 100644 man/netdis_centred_graphlet_counts.Rd create mode 100644 man/netdis_const_expected_counts.Rd diff --git a/NAMESPACE b/NAMESPACE index 69e2208c..e4e578d1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(cross_comp_to_matrix) export(cross_comparison_spec) export(density_binned_counts) export(density_binned_counts_gp) +export(density_from_counts) export(dhist) export(dhist_ecmf) export(dhist_from_obs) @@ -50,6 +51,7 @@ export(min_emd_optimise_fast) export(net_emd) export(net_emds_for_all_graphs) export(netdis) +export(netdis_centred_graphlet_counts) export(netdis_expected_graphlet_counts) export(netdis_expected_graphlet_counts_per_ego) export(netdis_many_to_many) diff --git a/man/count_graphlet_tuples.Rd b/man/count_graphlet_tuples.Rd index 7975385e..def9d19b 100644 --- a/man/count_graphlet_tuples.Rd +++ b/man/count_graphlet_tuples.Rd @@ -2,21 +2,19 @@ % Please edit documentation in R/measures_net_dis.R \name{count_graphlet_tuples} \alias{count_graphlet_tuples} -\title{FLAGUSED -For each graphlet calculate the number of possible sets of k nodes in the +\title{For each graphlet calculate the number of possible sets of k nodes in the query graph, where k is the number of nodes in the graphlet.} \usage{ -count_graphlet_tuples(graph, max_graphlet_size) +count_graphlet_tuples(graph_graphlet_counts, max_graphlet_size) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} -object.} - \item{max_graphlet_size}{Determines the maximum size of graphlets included in the tuple counts.} + +\item{graph}{A connected, undirected, simple graph as an \code{igraph} +object.} } \description{ -FLAGUSED For each graphlet calculate the number of possible sets of k nodes in the query graph, where k is the number of nodes in the graphlet. } diff --git a/man/count_graphlet_tuples_ego.Rd b/man/count_graphlet_tuples_ego.Rd index ae3173a5..12aa77ec 100644 --- a/man/count_graphlet_tuples_ego.Rd +++ b/man/count_graphlet_tuples_ego.Rd @@ -2,18 +2,16 @@ % Please edit documentation in R/measures_net_dis.R \name{count_graphlet_tuples_ego} \alias{count_graphlet_tuples_ego} -\title{FLAGUSED -Run count_graphlet_tuples across pre-computed ego networks.} +\title{Run count_graphlet_tuples across pre-computed ego networks.} \usage{ -count_graphlet_tuples_ego(ego_networks, max_graphlet_size) +count_graphlet_tuples_ego(graphlet_counts, max_graphlet_size) } \arguments{ -\item{ego_networks}{Pre-generated ego networks for an input graph.} - \item{max_graphlet_size}{Determines the maximum size of graphlets included in the tuple counts.} + +\item{ego_networks}{Pre-generated ego networks for an input graph.} } \description{ -FLAGUSED Run count_graphlet_tuples across pre-computed ego networks. } diff --git a/man/cross_comparison_spec.Rd b/man/cross_comparison_spec.Rd index bf21f490..35ab245f 100644 --- a/man/cross_comparison_spec.Rd +++ b/man/cross_comparison_spec.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/orca_interface.R \name{cross_comparison_spec} \alias{cross_comparison_spec} -\title{FLAGUSED -Generate a cross-comparison specification} +\title{Generate a cross-comparison specification} \usage{ cross_comparison_spec(named_list, how = "many-to-many") } diff --git a/man/density_binned_counts.Rd b/man/density_binned_counts.Rd index d827e730..ecf1f0a3 100644 --- a/man/density_binned_counts.Rd +++ b/man/density_binned_counts.Rd @@ -2,12 +2,10 @@ % Please edit documentation in R/measures_net_dis.R \name{density_binned_counts} \alias{density_binned_counts} -\title{FLAGUSED -INTERNAL FUNCTION - Do not call directly} +\title{INTERNAL FUNCTION - Do not call directly} \usage{ density_binned_counts(graphlet_counts, density_interval_indexes, - agg_fn = mean, scale_fn = NULL, ego_networks = NULL, - max_graphlet_size = NULL) + agg_fn = mean, scale_fn = NULL, max_graphlet_size = NULL) } \arguments{ \item{graphlet_counts}{Graphlet counts for a number of ego_networks.} @@ -22,9 +20,9 @@ each ego network.} to graphlet_counts, must have arguments graphlet_counts, ego_networks and max_graphlet_size.} -\item{ego_networks}{Optionally passed and used by scale_fn.} - \item{max_graphlet_size}{Optionally passed and used by scale_fn.} + +\item{ego_networks}{Optionally passed and used by scale_fn.} } \description{ Used to calculate expected graphlet counts for each density bin. diff --git a/man/density_binned_counts_gp.Rd b/man/density_binned_counts_gp.Rd index a423c077..fe5669a0 100644 --- a/man/density_binned_counts_gp.Rd +++ b/man/density_binned_counts_gp.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{density_binned_counts_gp} \alias{density_binned_counts_gp} -\title{FLAGUSED -Calculate expected counts in density bins using +\title{Calculate expected counts in density bins using geometric poisson (Polya-Aeppli) approximation.} \usage{ density_binned_counts_gp(graphlet_counts, density_interval_indexes, @@ -19,7 +18,6 @@ each ego network.} included in graphlet_counts.} } \description{ -FLAGUSED Calculate expected counts in density bins using geometric poisson (Polya-Aeppli) approximation. } diff --git a/man/density_from_counts.Rd b/man/density_from_counts.Rd new file mode 100644 index 00000000..e5c50f79 --- /dev/null +++ b/man/density_from_counts.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{density_from_counts} +\alias{density_from_counts} +\title{Calculate edge density for a single graph.} +\usage{ +density_from_counts(graphlet_counts) +} +\arguments{ +\item{graphlet_counts}{Vector of pre-calculated graphlet, edge and node +counts. Must have named items "N" (node counts) and "G0" (edge counts).} +} +\description{ +Calculate edge density for a single graph. +} diff --git a/man/ego_network_density.Rd b/man/ego_network_density.Rd index e4d36189..43114243 100644 --- a/man/ego_network_density.Rd +++ b/man/ego_network_density.Rd @@ -2,15 +2,15 @@ % Please edit documentation in R/measures_net_dis.R \name{ego_network_density} \alias{ego_network_density} -\title{FLAGUSED -Calculate ego network edge densities.} +\title{Calculate ego network edge densities.} \usage{ -ego_network_density(ego_networks) +ego_network_density(graphlet_counts) } \arguments{ -\item{ego_networks}{Pre-generated ego networks for an input graph.} +\item{graphlet_counts}{Matrix of pre-generated graphlet, edge and node counts +(columns) for each ego network (rows). Columns must include "N" (node counts) +and "G0" (edge counts).} } \description{ -FLAGUSED Calculate ego network edge densities. } diff --git a/man/ego_network_node_counts.Rd b/man/ego_network_node_counts.Rd new file mode 100644 index 00000000..4ddc0a6b --- /dev/null +++ b/man/ego_network_node_counts.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/orca_interface.R +\name{ego_network_node_counts} +\alias{ego_network_node_counts} +\title{ego_network_node_counts} +\usage{ +ego_network_node_counts(ego_networks) +} +\arguments{ +\item{ego_networks}{Named list of ego networks for a graph.} +} +\description{ +Calculates number of nodes in each ego network. +} diff --git a/man/ego_to_graphlet_counts.Rd b/man/ego_to_graphlet_counts.Rd index b5f2d95d..a94ff274 100644 --- a/man/ego_to_graphlet_counts.Rd +++ b/man/ego_to_graphlet_counts.Rd @@ -2,11 +2,7 @@ % Please edit documentation in R/orca_interface.R \name{ego_to_graphlet_counts} \alias{ego_to_graphlet_counts} -\title{FLAGUSED -ego_to_graphlet_counts -JACK To follow through logic of paper steps, wanted to pass -ego networks to the function for generating graphlet counts, -not the input query graph directly (as in count_graphlets_ego above).} +\title{ego_to_graphlet_counts} \usage{ ego_to_graphlet_counts(ego_networks, max_graphlet_size = 4) } diff --git a/man/graphlet_ids_for_size.Rd b/man/graphlet_ids_for_size.Rd index b97e8048..9545ceb0 100644 --- a/man/graphlet_ids_for_size.Rd +++ b/man/graphlet_ids_for_size.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/orca_interface.R \name{graphlet_ids_for_size} \alias{graphlet_ids_for_size} -\title{FLAGUSED -Graphlet IDs for size} +\title{Graphlet IDs for size} \usage{ graphlet_ids_for_size(graphlet_size) } diff --git a/man/make_named_ego_graph.Rd b/man/make_named_ego_graph.Rd index de5401fa..1c0db5a7 100644 --- a/man/make_named_ego_graph.Rd +++ b/man/make_named_ego_graph.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/orca_interface.R \name{make_named_ego_graph} \alias{make_named_ego_graph} -\title{FLAGUSED -Get ego-networks for a graph as a named list} +\title{Get ego-networks for a graph as a named list} \usage{ make_named_ego_graph(graph, order, min_ego_nodes = 3, min_ego_edges = 1, ...) diff --git a/man/mean_density_binned_graphlet_counts.Rd b/man/mean_density_binned_graphlet_counts.Rd index dc95201f..69dd8b01 100644 --- a/man/mean_density_binned_graphlet_counts.Rd +++ b/man/mean_density_binned_graphlet_counts.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{mean_density_binned_graphlet_counts} \alias{mean_density_binned_graphlet_counts} -\title{FLAGUSED -INTERNAL FUNCTION - Do not call directly} +\title{INTERNAL FUNCTION - Do not call directly} \usage{ mean_density_binned_graphlet_counts(graphlet_counts, density_interval_indexes, agg_fn = mean) diff --git a/man/netdis.Rd b/man/netdis.Rd index 011e8594..9f477425 100644 --- a/man/netdis.Rd +++ b/man/netdis.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis} \alias{netdis} -\title{FLAGUSED -Netdis} +\title{Netdis} \usage{ netdis(centred_graphlet_counts1, centred_graphlet_counts2, graphlet_size) } diff --git a/man/netdis_centred_graphlet_counts.Rd b/man/netdis_centred_graphlet_counts.Rd new file mode 100644 index 00000000..e4e932af --- /dev/null +++ b/man/netdis_centred_graphlet_counts.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_centred_graphlet_counts} +\alias{netdis_centred_graphlet_counts} +\title{netdis_centred_graphlet_counts} +\usage{ +netdis_centred_graphlet_counts(graphlet_counts, exp_graphlet_counts, + max_graphlet_size) +} +\arguments{ +\item{graphlet_counts}{Ego network graphlet counts for a query graph} + +\item{exp_graphlet_counts}{Pre-calculated expected counts for each graphlet +type for each ego network.} + +\item{max_graphlet_size}{max graphlet size to calculate centred counts for.} +} +\value{ +graphlet_counts minus exp_graphlet_counts for graphlets up to size +max_graphlet_size. +} +\description{ +Centre counts by subtracting expected graphlet counts from actual graphlet +counts. +} diff --git a/man/netdis_const_expected_counts.Rd b/man/netdis_const_expected_counts.Rd new file mode 100644 index 00000000..5eb624a6 --- /dev/null +++ b/man/netdis_const_expected_counts.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_const_expected_counts} +\alias{netdis_const_expected_counts} +\title{Create matrix of constant value to use as expected counts.} +\usage{ +netdis_const_expected_counts(graphlet_counts, const) +} +\arguments{ +\item{graphlet_counts}{Ego network graphlet counts matrix to create expected +counts for.} + +\item{const}{Constant expected counts value to use.} +} +\value{ +Counts of value const with same shape and names as graphlet_counts. +} +\description{ +Create matrix of constant value to use as expected counts. +} diff --git a/man/netdis_expected_graphlet_counts.Rd b/man/netdis_expected_graphlet_counts.Rd index fe01b72e..16e05899 100644 --- a/man/netdis_expected_graphlet_counts.Rd +++ b/man/netdis_expected_graphlet_counts.Rd @@ -2,16 +2,12 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_expected_graphlet_counts} \alias{netdis_expected_graphlet_counts} -\title{FLAGUSED -INTERNAL FUNCTION - Do not call directly} +\title{INTERNAL FUNCTION - Do not call directly} \usage{ -netdis_expected_graphlet_counts(graph, max_graphlet_size, density_breaks, - density_binned_reference_counts, scale_fn = NULL) +netdis_expected_graphlet_counts(graphlet_counts, max_graphlet_size, + density_breaks, density_binned_reference_counts, scale_fn = NULL) } \arguments{ -\item{graph}{A connected, undirected, simple reference graph as an -\code{igraph} object.} - \item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} @@ -26,6 +22,9 @@ factor that the looked up \code{density_binned_reference_counts} values will be multiplied by. Temporarily accessible during development. TODO: Remove @export prior to publishing} + +\item{graph}{A connected, undirected, simple reference graph as an +\code{igraph} object.} } \description{ Used by \code{netdis_expected_graphlet_counts_ego} to diff --git a/man/netdis_expected_graphlet_counts_per_ego.Rd b/man/netdis_expected_graphlet_counts_per_ego.Rd index 9abe227b..d1ca4346 100644 --- a/man/netdis_expected_graphlet_counts_per_ego.Rd +++ b/man/netdis_expected_graphlet_counts_per_ego.Rd @@ -2,16 +2,12 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_expected_graphlet_counts_per_ego} \alias{netdis_expected_graphlet_counts_per_ego} -\title{FLAGUSED -INTERNAL FUNCTION - Do not call directly} +\title{INTERNAL FUNCTION - Do not call directly} \usage{ -netdis_expected_graphlet_counts_per_ego(ego_networks, density_breaks, +netdis_expected_graphlet_counts_per_ego(graphlet_counts, density_breaks, density_binned_reference_counts, max_graphlet_size, scale_fn = NULL) } \arguments{ -\item{ego_networks}{The number of steps from the source node to include -node in ego-network.} - \item{density_breaks}{Density values defining bin edges.} \item{density_binned_reference_counts}{Reference network graphlet counts for @@ -27,13 +23,11 @@ be multiplied by. #' Temporarily accessible during development. TODO: Remove @export prior to publishing} + +\item{ego_networks}{The number of steps from the source node to include +node in ego-network.} } \description{ -JACK To follow through logic of paper steps, wanted to pass -ego networks to the function, not the input query graph -(as in netdis_expected_graphlet_counts_ego above). -} -\details{ Used by \code{netdis_expected_graphlet_counts_ego_fn} to generate a function for calculating expected ego-network graphlet counts from the statistics of a provided reference graph. diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd index 82551939..d4f1bf22 100644 --- a/man/netdis_many_to_many.Rd +++ b/man/netdis_many_to_many.Rd @@ -2,37 +2,45 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_many_to_many} \alias{netdis_many_to_many} -\title{FLAGUSED -Netdis between all graph pairs} +\title{Netdis between all graph pairs} \usage{ -netdis_many_to_many(graphs, ref_graph, comparisons = "many-to-many", - max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, - min_ego_edges = 1, +netdis_many_to_many(graphs = NULL, ref_graph = 0, + comparisons = "many-to-many", max_graphlet_size = 4, + neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, num_intervals = 100), bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples)) + scale_fn = count_graphlet_tuples), graphlet_counts = NULL, + graphlet_counts_ref = NULL) } \arguments{ -\item{graphs}{Query graphs} +\item{graphs}{A named list of simplified igraph graph objects (undirected +graphs excluding loops, multiple edges and isolated vertices), such as those +obtained by using \code{read_simple_graphs}.} -\item{ref_graph}{Reference graph} +\item{ref_graph}{Controls how expected counts are calculated. Either: +1) A numeric value - used as a constant expected counts value for all query +graphs (DEFAULT: 0). +2) A simplified \code{igraph} object - used as a reference graph from which +expected counts are calculated for all query graphs. +3) NULL - Expected counts will be calculated based on the properties of the +query graphs themselves.} \item{comparisons}{Which comparisons to perform between graphs. Can be "many-to-many" (all pairwise combinations) or "one-to-many" (compare first graph in graphs to all other graphs.)} -\item{max_graphlet_size}{Generate graphlets up to this size} +\item{max_graphlet_size}{Generate graphlets up to this size.} -\item{neighbourhood_size}{Ego network neighbourhood size} +\item{neighbourhood_size}{Ego network neighbourhood size.} \item{min_ego_nodes}{Filter ego networks which have fewer -than min_ego_nodes nodes} +than min_ego_nodes nodes.} \item{min_ego_edges}{Filter ego networks which have fewer -than min_ego_edges edges} +than min_ego_edges edges.} \item{binning_fn}{Function used to bin ego network densities. Takes densities as its single argument, and returns a named list including keys \code{breaks} @@ -42,7 +50,7 @@ ego network). (Default: \code{binned_densities_adaptive} with \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -(bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. +(bin indexes) and \code{max_graphlet_size} as arguments. (Default: \code{density_binned_counts} with \code{agg_fn = mean} and \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the approach used in the original netdis paper).} @@ -54,12 +62,25 @@ Takes \code{ego_networks}, \code{density_bin_breaks}, (Default: \code{netdis_expected_graphlet_counts_per_ego} with \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in the original netdis paper).} + +\item{graphlet_counts}{Pre-generated graphlet counts. If the +\code{graphlet_counts} argument is defined then \code{graphs} will not be +used. +A named list of matrices containing counts of each graphlet (columns) for +each ego-network in the input graph (rows). Columns are labelled with +graphlet IDs and rows are labelled with the ID of the central node in each +ego-network. As well as graphlet counts, each matrix must contain an +additional column labelled "N" including the node count for +each ego network.} + +\item{graphlet_counts_ref}{Pre-generated reference graphlet counts. If the +\code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not +be used.} } \value{ Netdis statistics between query graphs for graphlet sizes up to and including max_graphlet_size. } \description{ -FLAGUSED Netdis between all graph pairs } diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd index e72e000c..efeb74f7 100644 --- a/man/netdis_one_to_many.Rd +++ b/man/netdis_one_to_many.Rd @@ -2,36 +2,43 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_one_to_many} \alias{netdis_one_to_many} -\title{FLAGUSED -Netdis comparisons between one graph and many other graphs} +\title{Netdis comparisons between one graph and many other graphs.} \usage{ -netdis_one_to_many(graph_1, graphs_compare, ref_graph, - max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, - min_ego_edges = 1, +netdis_one_to_many(graph_1 = NULL, graphs_compare = NULL, + ref_graph = 0, max_graphlet_size = 4, neighbourhood_size = 2, + min_ego_nodes = 3, min_ego_edges = 1, binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, num_intervals = 100), bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples)) + scale_fn = count_graphlet_tuples), graphlet_counts_1 = NULL, + graphlet_counts_compare = NULL) } \arguments{ -\item{graph_1}{query graph - this graph will be compared with -all graphs in graphs_compare} +\item{graph_1}{Query graph - this graph will be compared with +all graphs in graphs_compare. A simplified igraph graph object.} -\item{graphs_compare}{graphs graph_1 will be compared with} +\item{graphs_compare}{Graphs graph_1 will be compared with. A named list of +simplified igraph graph objects.} -\item{ref_graph}{Reference graph} +\item{ref_graph}{Controls how expected counts are calculated. Either: +1) A numeric value - used as a constant expected counts value for all query +graphs (DEFAULT: 0). +2) A simplified \code{igraph} object - used as a reference graph from which +expected counts are calculated for all query graphs. +3) NULL - Expected counts will be calculated based on the properties of the +query graphs themselves.} -\item{max_graphlet_size}{Generate graphlets up to this size} +\item{max_graphlet_size}{Generate graphlets up to this size.} -\item{neighbourhood_size}{Ego network neighbourhood size} +\item{neighbourhood_size}{Ego network neighbourhood size.} \item{min_ego_nodes}{Filter ego networks which have fewer -than min_ego_nodes nodes} +than min_ego_nodes nodes.} \item{min_ego_edges}{Filter ego networks which have fewer -than min_ego_edges edges} +than min_ego_edges edges.} \item{binning_fn}{Function used to bin ego network densities. Takes densities as its single argument, and returns a named list including keys \code{breaks} @@ -41,7 +48,7 @@ ego network). (Default: \code{binned_densities_adaptive} with \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -(bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. +(bin indexes) and \code{max_graphlet_size} as arguments. (Default: \code{density_binned_counts} with \code{agg_fn = mean} and \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the approach used in the original netdis paper).} @@ -53,12 +60,19 @@ Takes \code{ego_networks}, \code{density_bin_breaks}, (Default: \code{netdis_expected_graphlet_counts_per_ego} with \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in the original netdis paper).} + +\item{graphlet_counts_1}{Pre-generated graphlet counts for the first query +graph. If the \code{graphlet_counts_1} argument is defined then +\code{graph_1} will not be used.} + +\item{graphlet_counts_compare}{Named list of pre-generated graphlet counts +for the remaining query graphs. If the \code{graphlet_counts_compare} +argument is defined then \code{graphs_compare} will not be used.} } \value{ Netdis statistics between graph_1 and graph_2 for graphlet sizes up to and including max_graphlet_size } \description{ -FLAGUSED -Netdis comparisons between one graph and many other graphs +Netdis comparisons between one graph and many other graphs. } diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd index 65e8c89c..14ffcb90 100644 --- a/man/netdis_one_to_one.Rd +++ b/man/netdis_one_to_one.Rd @@ -2,34 +2,41 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_one_to_one} \alias{netdis_one_to_one} -\title{FLAGUSED -Netdis between two graphs} +\title{Netdis between two graphs} \usage{ -netdis_one_to_one(graph_1, graph_2, ref_graph, max_graphlet_size = 4, - neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, +netdis_one_to_one(graph_1 = NULL, graph_2 = NULL, ref_graph = 0, + max_graphlet_size = 4, neighbourhood_size = 2, min_ego_nodes = 3, + min_ego_edges = 1, binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, num_intervals = 100), bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples)) + scale_fn = count_graphlet_tuples), graphlet_counts_1 = NULL, + graphlet_counts_2 = NULL) } \arguments{ -\item{graph_1}{First query graph} +\item{graph_1}{A simplified igraph graph object.} -\item{graph_2}{Second query graph} +\item{graph_2}{A simplified igraph graph object.} -\item{ref_graph}{Reference graph} +\item{ref_graph}{Controls how expected counts are calculated. Either: +1) A numeric value - used as a constant expected counts value for all query +graphs (DEFAULT: 0). +2) A simplified \code{igraph} object - used as a reference graph from which +expected counts are calculated for all query graphs. +3) NULL - Expected counts will be calculated based on the properties of the +query graphs themselves.} -\item{max_graphlet_size}{Generate graphlets up to this size} +\item{max_graphlet_size}{Generate graphlets up to this size.} -\item{neighbourhood_size}{Ego network neighbourhood size} +\item{neighbourhood_size}{Ego network neighbourhood size.} \item{min_ego_nodes}{Filter ego networks which have fewer -than min_ego_nodes nodes} +than min_ego_nodes nodes.} \item{min_ego_edges}{Filter ego networks which have fewer -than min_ego_edges edges} +than min_ego_edges edges.} \item{binning_fn}{Function used to bin ego network densities. Takes densities as its single argument, and returns a named list including keys \code{breaks} @@ -39,7 +46,7 @@ ego network). (Default: \code{binned_densities_adaptive} with \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -(bin indexes), \code{ego_networks} and \code{max_graphlet_size} as arguments. +(bin indexes) and \code{max_graphlet_size} as arguments. (Default: \code{density_binned_counts} with \code{agg_fn = mean} and \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the approach used in the original netdis paper).} @@ -51,12 +58,19 @@ Takes \code{ego_networks}, \code{density_bin_breaks}, (Default: \code{netdis_expected_graphlet_counts_per_ego} with \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in the original netdis paper).} + +\item{graphlet_counts_1}{Pre-generated graphlet counts for the first query +graph. If the \code{graphlet_counts_1} argument is defined then +\code{graph_1} will not be used.} + +\item{graphlet_counts_2}{Pre-generated graphlet counts for the second query +graph. If the \code{graphlet_counts_2} argument is defined then +\code{graph_2} will not be used.} } \value{ Netdis statistics between graph_1 and graph_2 for graphlet sizes up to and including max_graphlet_size } \description{ -FLAGUSED Netdis between two graphs } diff --git a/man/netdis_uptok.Rd b/man/netdis_uptok.Rd index 4c335179..7e491b60 100644 --- a/man/netdis_uptok.Rd +++ b/man/netdis_uptok.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_uptok} \alias{netdis_uptok} -\title{FLAGUSED -Netdis - graphlets up to max_graphlet_size} +\title{Netdis - graphlets up to max_graphlet_size} \usage{ netdis_uptok(centred_graphlet_counts1, centred_graphlet_counts2, max_graphlet_size) diff --git a/man/read_simple_graph.Rd b/man/read_simple_graph.Rd index 4db79754..3895b0ef 100644 --- a/man/read_simple_graph.Rd +++ b/man/read_simple_graph.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/orca_interface.R \name{read_simple_graph} \alias{read_simple_graph} -\title{FLAGUSED -Read a graph from file, simplifying as requested} +\title{Read a graph from file, simplifying as requested} \usage{ read_simple_graph(file, format, as_undirected = TRUE, remove_loops = TRUE, remove_multiple = TRUE, diff --git a/man/read_simple_graphs.Rd b/man/read_simple_graphs.Rd index da0c9fce..4e907180 100644 --- a/man/read_simple_graphs.Rd +++ b/man/read_simple_graphs.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/orca_interface.R \name{read_simple_graphs} \alias{read_simple_graphs} -\title{FLAGUSED -Read all graphs in a directory, simplifying as requested} +\title{Read all graphs in a directory, simplifying as requested} \usage{ read_simple_graphs(source_dir, format = "ncol", pattern = "*", as_undirected = TRUE, remove_loops = TRUE, remove_multiple = TRUE, diff --git a/man/scale_graphlet_count.Rd b/man/scale_graphlet_count.Rd index 6137b18d..8a313c27 100644 --- a/man/scale_graphlet_count.Rd +++ b/man/scale_graphlet_count.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{scale_graphlet_count} \alias{scale_graphlet_count} -\title{FLAGUSED -Divide graphlet counts by pre-computed scaling factor from +\title{Divide graphlet counts by pre-computed scaling factor from \code{count_graphlet_tuples} output.} \usage{ scale_graphlet_count(graphlet_count, graphlet_tuples) @@ -14,7 +13,6 @@ scale_graphlet_count(graphlet_count, graphlet_tuples) \item{graphlet_tuples}{Pre-computed \code{count_graphlet_tuples} output.} } \description{ -FLAGUSED Divide graphlet counts by pre-computed scaling factor from \code{count_graphlet_tuples} output. } diff --git a/man/scale_graphlet_counts_ego.Rd b/man/scale_graphlet_counts_ego.Rd index cba6fc93..28ae9ffc 100644 --- a/man/scale_graphlet_counts_ego.Rd +++ b/man/scale_graphlet_counts_ego.Rd @@ -2,26 +2,24 @@ % Please edit documentation in R/measures_net_dis.R \name{scale_graphlet_counts_ego} \alias{scale_graphlet_counts_ego} -\title{FLAGUSED -Scale graphlet counts for an ego network by the n choose k possible +\title{Scale graphlet counts for an ego network by the n choose k possible choices of k nodes in that ego-network, where n is the number of nodes in the ego network and k is the number of nodes in the graphlet.} \usage{ -scale_graphlet_counts_ego(ego_networks, graphlet_counts, max_graphlet_size) +scale_graphlet_counts_ego(graphlet_counts, max_graphlet_size) } \arguments{ -\item{ego_networks}{Pre-generated ego networks for an input graph.} - \item{graphlet_counts}{Pre-calculated graphlet counts for each ego_network.} \item{max_graphlet_size}{Determines the maximum size of graphlets included in graphlet_counts.} + +\item{ego_networks}{Pre-generated ego networks for an input graph.} } \value{ scaled graphlet counts. } \description{ -FLAGUSED Scale graphlet counts for an ego network by the n choose k possible choices of k nodes in that ego-network, where n is the number of nodes in the ego network and k is the number of nodes in the graphlet. diff --git a/man/simplify_graph.Rd b/man/simplify_graph.Rd index 80ac4e57..f4e37722 100644 --- a/man/simplify_graph.Rd +++ b/man/simplify_graph.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/orca_interface.R \name{simplify_graph} \alias{simplify_graph} -\title{FLAGUSED -Simplify an igraph} +\title{Simplify an igraph} \usage{ simplify_graph(graph, as_undirected = TRUE, remove_loops = TRUE, remove_multiple = TRUE, remove_isolates = TRUE) diff --git a/man/single_density_bin.Rd b/man/single_density_bin.Rd index dda61c79..ed801c45 100644 --- a/man/single_density_bin.Rd +++ b/man/single_density_bin.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{single_density_bin} \alias{single_density_bin} -\title{FLAGUSED -For case where don't want to use binning, return a single bin which covers +\title{For case where don't want to use binning, return a single bin which covers the full range of possible density values.} \usage{ single_density_bin(densities) @@ -13,7 +12,6 @@ single_density_bin(densities) a list of indexes of the required length.)} } \description{ -FLAGUSED For case where don't want to use binning, return a single bin which covers the full range of possible density values. } diff --git a/man/zeros_to_ones.Rd b/man/zeros_to_ones.Rd index 4f5dc1c7..ca662211 100644 --- a/man/zeros_to_ones.Rd +++ b/man/zeros_to_ones.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{zeros_to_ones} \alias{zeros_to_ones} -\title{FLAGUSED -Replace zero values in a vector with ones. Used by +\title{Replace zero values in a vector with ones. Used by \code{scale_graphlet_count} to prevent divide by zero errors.} \usage{ @@ -14,7 +13,6 @@ zeros_to_ones(v) TODO remove export} } \description{ -FLAGUSED Replace zero values in a vector with ones. Used by \code{scale_graphlet_count} to prevent divide by zero errors. diff --git a/vignettes/dendrogram_example_net_dis.html b/vignettes/dendrogram_example_net_dis.html index f53bacfc..5f7c9f70 100644 --- a/vignettes/dendrogram_example_net_dis.html +++ b/vignettes/dendrogram_example_net_dis.html @@ -12,7 +12,7 @@ - + Dendrogram example for Netdis @@ -305,7 +305,7 @@

Dendrogram example for Netdis

Martin O’Reilly

-

2019-09-11

+

2019-10-11

diff --git a/vignettes/netdis_2graphs_polya-aeppli.R b/vignettes/netdis_2graphs_polya-aeppli.R index 70dc8f23..db2ae759 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.R +++ b/vignettes/netdis_2graphs_polya-aeppli.R @@ -52,8 +52,8 @@ graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graph ## ------------------------------------------------------------------------ # Get ego-network densities -densities_1 <- ego_network_density(ego_1) -densities_2 <- ego_network_density(ego_2) +densities_1 <- ego_network_density(graphlet_counts_1) +densities_2 <- ego_network_density(graphlet_counts_2) # Adaptively bin ego-network densities binned_densities_1 <- binned_densities_adaptive(densities_1, @@ -122,22 +122,26 @@ binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, ## ------------------------------------------------------------------------ # Calculate expected graphlet counts for each ego network -exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, +exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_1, ego_density_bins_1, binned_graphlet_counts_1, max_graphlet_size, scale_fn = NULL) -exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, +exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_2, ego_density_bins_2, binned_graphlet_counts_2, max_graphlet_size, scale_fn = NULL) # Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 +centred_graphlet_counts_1 <- netdis_centred_graphlet_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) -centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 +centred_graphlet_counts_2 <- netdis_centred_graphlet_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) ## ------------------------------------------------------------------------ sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) diff --git a/vignettes/netdis_2graphs_polya-aeppli.html b/vignettes/netdis_2graphs_polya-aeppli.html index 09d7f1cd..e84ffcef 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.html +++ b/vignettes/netdis_2graphs_polya-aeppli.html @@ -12,7 +12,7 @@ - + Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation @@ -305,7 +305,7 @@

Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation

Martin O’Reilly, Jack Roberts

-

2019-09-05

+

2019-10-10

@@ -365,8 +365,8 @@

Count graphlets in ego networks

Bin ego networks by density

# Get ego-network densities
-densities_1 <- ego_network_density(ego_1)
-densities_2 <- ego_network_density(ego_2)
+densities_1 <- ego_network_density(graphlet_counts_1)
+densities_2 <- ego_network_density(graphlet_counts_2)
 
 # Adaptively bin ego-network densities
 binned_densities_1 <- binned_densities_adaptive(densities_1, 
@@ -435,22 +435,26 @@ 

Calculate expected graphlet counts in each bin using geometric poisson appro

Centre graphlet counts of query graphs using binned expected counts

# Calculate expected graphlet counts for each ego network
-exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, 
+exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_1, 
                                                                  ego_density_bins_1, 
                                                                  binned_graphlet_counts_1,
                                                                  max_graphlet_size,
                                                                  scale_fn = NULL)
 
 
-exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, 
+exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_2, 
                                                                  ego_density_bins_2, 
                                                                  binned_graphlet_counts_2,
                                                                  max_graphlet_size,
                                                                  scale_fn = NULL)
 # Centre graphlet counts by subtracting expected counts
-centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1
-
-centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2
+centred_graphlet_counts_1 <- netdis_centred_graphlet_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) + +centred_graphlet_counts_2 <- netdis_centred_graphlet_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size)

Sum centred graphlet counts across all ego networks

diff --git a/vignettes/netdis_pairwise_comparisons.html b/vignettes/netdis_pairwise_comparisons.html index a8e28f93..f1dc7259 100644 --- a/vignettes/netdis_pairwise_comparisons.html +++ b/vignettes/netdis_pairwise_comparisons.html @@ -12,7 +12,7 @@ - + Usage of netdis interfaces for different pairwise comparison options. @@ -305,7 +305,7 @@

Usage of netdis interfaces for different pairwise comparison options.

Jack Roberts

-

2019-09-11

+

2019-10-11

diff --git a/vignettes/quickstart_netdis_2graphs.R b/vignettes/quickstart_netdis_2graphs.R index f00710e8..080e1f6a 100644 --- a/vignettes/quickstart_netdis_2graphs.R +++ b/vignettes/quickstart_netdis_2graphs.R @@ -49,7 +49,6 @@ graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graph ## ------------------------------------------------------------------------ # Load reference graph -# JACK - need to deal with case where ref graph not used. ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), package = "netdist") ref_graph <- read_simple_graph(ref_path, format = "ncol") @@ -63,12 +62,12 @@ graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_g # Scale ego-network graphlet counts by dividing by total number of k-tuples in # ego-network (where k is graphlet size) -scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(ego_ref, - graphlet_counts_ref, +scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, max_graphlet_size) + # Get ego-network densities -densities_ref <- ego_network_density(ego_ref) +densities_ref <- ego_network_density(graphlet_counts_ref) # Adaptively bin ref ego-network densities binned_densities <- binned_densities_adaptive(densities_ref, @@ -85,23 +84,27 @@ ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( ## ------------------------------------------------------------------------ # Calculate expected graphlet counts (using ref graph ego network density bins) -exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, +exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_1, ref_ego_density_bins, ref_binned_graphlet_counts, max_graphlet_size, scale_fn=count_graphlet_tuples) -exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, +exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_2, ref_ego_density_bins, ref_binned_graphlet_counts, max_graphlet_size, scale_fn=count_graphlet_tuples) # Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1 +centred_graphlet_counts_1 <- netdis_centred_graphlet_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) -centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2 +centred_graphlet_counts_2 <- netdis_centred_graphlet_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) ## ------------------------------------------------------------------------ sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index 3fb06e6e..da3edc9c 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -71,7 +71,6 @@ graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graph ## Use a reference graph to calculate expected graphlet counts in ego network density bins ```{r} # Load reference graph -# JACK - need to deal with case where ref graph not used. ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), package = "netdist") ref_graph <- read_simple_graph(ref_path, format = "ncol") diff --git a/vignettes/quickstart_netdis_2graphs.html b/vignettes/quickstart_netdis_2graphs.html index b9ab33a8..d913896e 100644 --- a/vignettes/quickstart_netdis_2graphs.html +++ b/vignettes/quickstart_netdis_2graphs.html @@ -12,7 +12,7 @@ - + Quick start guide for Netdis - 2 graphs @@ -305,7 +305,7 @@

Quick start guide for Netdis - 2 graphs

Martin O’Reilly, Jack Roberts

-

2019-09-05

+

2019-10-11

@@ -365,59 +365,62 @@

Count graphlets in ego networks

Use a reference graph to calculate expected graphlet counts in ego network density bins

# Load reference graph
-# JACK - need to deal with case where ref graph not used.
-ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
-                        package = "netdist")
-ref_graph <- read_simple_graph(ref_path, format = "ncol")
-
-ego_ref <- make_named_ego_graph(ref_graph, 
-                                order = neighbourhood_size, 
-                                min_ego_nodes = min_ego_nodes, 
-                                min_ego_edges = min_ego_edges)
-
-graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size)
-
-# Scale ego-network graphlet counts by dividing by total number of k-tuples in
-# ego-network (where k is graphlet size)
-scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(ego_ref, 
-                                                        graphlet_counts_ref, 
-                                                        max_graphlet_size)
-
-# Get ego-network densities
-densities_ref <- ego_network_density(ego_ref)
-
-# Adaptively bin ref ego-network densities
-binned_densities <- binned_densities_adaptive(densities_ref, 
-                                              min_counts_per_interval = min_bin_count, 
-                                              num_intervals = num_bins)
-
-ref_ego_density_bins <- binned_densities$breaks
-
-# Average ref graphlet counts across density bins
-ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
-                                  scaled_graphlet_counts_ref, 
-                                  binned_densities$interval_indexes)
+ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) + +# Scale ego-network graphlet counts by dividing by total number of k-tuples in +# ego-network (where k is graphlet size) +scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, + max_graphlet_size) + + +# Get ego-network densities +densities_ref <- ego_network_density(graphlet_counts_ref) + +# Adaptively bin ref ego-network densities +binned_densities <- binned_densities_adaptive(densities_ref, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ref_ego_density_bins <- binned_densities$breaks + +# Average ref graphlet counts across density bins +ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + scaled_graphlet_counts_ref, + binned_densities$interval_indexes)

Centre graphlet counts of query graphs based on statistics of reference graph

# Calculate expected graphlet counts (using ref graph ego network density bins)
-exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(ego_1, 
+exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_1, 
                                                                  ref_ego_density_bins, 
                                                                  ref_binned_graphlet_counts,
                                                                  max_graphlet_size,
                                                                  scale_fn=count_graphlet_tuples)
 
 
-exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(ego_2, 
+exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_2, 
                                                                  ref_ego_density_bins, 
                                                                  ref_binned_graphlet_counts,
                                                                  max_graphlet_size,
                                                                  scale_fn=count_graphlet_tuples)
 
 # Centre graphlet counts by subtracting expected counts
-centred_graphlet_counts_1 <- graphlet_counts_1 - exp_graphlet_counts_1
-
-centred_graphlet_counts_2 <- graphlet_counts_2 - exp_graphlet_counts_2
+centred_graphlet_counts_1 <- netdis_centred_graphlet_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) + +centred_graphlet_counts_2 <- netdis_centred_graphlet_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size)

Sum centred graphlet counts across all ego networks

From b62c25c63c237cbadb69dd4621fd98de7168a137 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 11 Oct 2019 13:39:15 +0100 Subject: [PATCH 057/188] update orca tests for objects with node counts --- tests/testthat/test_orca_interface.R | 84 ++++++++++++++-------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/tests/testthat/test_orca_interface.R b/tests/testthat/test_orca_interface.R index b6efd1f0..fa088f8e 100644 --- a/tests/testthat/test_orca_interface.R +++ b/tests/testthat/test_orca_interface.R @@ -1251,7 +1251,7 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # Set node and graphlet labels to use for row and col names in expected counts node_labels <- igraph::V(graph)$name - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + graphlet_labels <- c("N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") max_graphlet_size <- 4 graphlet_key <- graphlet_key(max_graphlet_size) @@ -1259,31 +1259,31 @@ test_that("count_graphlets_ego: Ego-network 4-node graphlet counts match manuall # Set manually verified counts # 1-step ego networks expected_counts_order_1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0), - c(5, 5, 1, 0, 2, 0, 2, 0, 0), - c(1, 0, 0, 0, 0, 0, 0, 0, 0), - c(5, 2, 2, 0, 0, 0, 0, 1, 0), - c(1, 0, 0, 0, 0, 0, 0, 0, 0), - c(4, 2, 1, 0, 0, 0, 1, 0, 0), - c(7, 3, 4, 0, 0, 0, 3, 0, 1), - c(7, 3, 4, 0, 0, 0, 3, 0, 1), - c(6, 0, 4, 0, 0, 0, 0, 0, 1), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) + c(5, 6, 5, 2, 0, 1, 0, 2, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 5, 2, 2, 0, 0, 0, 0, 1, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 4, 2, 1, 0, 0, 0, 1, 0, 0), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1) ) rownames(expected_counts_order_1) <- node_labels colnames(expected_counts_order_1) <- graphlet_labels # 2-step ego networks expected_counts_order_2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1), - c(8, 10, 2, 6, 3, 0, 4, 1, 0), - c(5, 5, 1, 0, 2, 0, 2, 0, 0), - c(10, 14, 2, 11, 3, 1, 5, 1, 0), - c(5, 5, 1, 0, 2, 0, 2, 0, 0), - c(13, 13, 6, 15, 1, 1, 9, 1, 1), - c(13, 13, 6, 15, 1, 1, 9, 1, 1), - c(11, 10, 5, 10, 0, 1, 8, 0, 1), - c(9, 8, 4, 4, 0, 1, 6, 0, 1), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) + c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1), + c(7, 8, 10, 2, 6, 3, 0, 4, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 10, 14, 2, 11, 3, 1, 5, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(7, 11, 10, 5, 10, 0, 1, 8, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1) ) rownames(expected_counts_order_2) <- node_labels colnames(expected_counts_order_2) <- graphlet_labels @@ -1416,7 +1416,7 @@ test_that("ego_to_graphlet_counts: Ego-network 4-node graphlet counts match manu # Set node and graphlet labels to use for row and col names in expected counts node_labels <- igraph::V(graph)$name - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + graphlet_labels <- c("N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") max_graphlet_size <- 4 graphlet_key <- graphlet_key(max_graphlet_size) @@ -1424,31 +1424,31 @@ test_that("ego_to_graphlet_counts: Ego-network 4-node graphlet counts match manu # Set manually verified counts # 1-step ego networks expected_counts_order_1 <- rbind( - c(6, 5, 2, 0, 1, 0, 2, 1, 0), - c(5, 5, 1, 0, 2, 0, 2, 0, 0), - c(1, 0, 0, 0, 0, 0, 0, 0, 0), - c(5, 2, 2, 0, 0, 0, 0, 1, 0), - c(1, 0, 0, 0, 0, 0, 0, 0, 0), - c(4, 2, 1, 0, 0, 0, 1, 0, 0), - c(7, 3, 4, 0, 0, 0, 3, 0, 1), - c(7, 3, 4, 0, 0, 0, 3, 0, 1), - c(6, 0, 4, 0, 0, 0, 0, 0, 1), - c(6, 0, 4, 0, 0, 0, 0, 0, 1) + c(5, 6, 5, 2, 0, 1, 0, 2, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 5, 2, 2, 0, 0, 0, 0, 1, 0), + c(2, 1, 0, 0, 0, 0, 0, 0, 0, 0), + c(4, 4, 2, 1, 0, 0, 0, 1, 0, 0), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(5, 7, 3, 4, 0, 0, 0, 3, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1), + c(4, 6, 0, 4, 0, 0, 0, 0, 0, 1) ) rownames(expected_counts_order_1) <- node_labels colnames(expected_counts_order_1) <- graphlet_labels # 2-step ego networks expected_counts_order_2 <- rbind( - c(15, 18, 6, 21, 3, 1, 11, 1, 1), - c(8, 10, 2, 6, 3, 0, 4, 1, 0), - c(5, 5, 1, 0, 2, 0, 2, 0, 0), - c(10, 14, 2, 11, 3, 1, 5, 1, 0), - c(5, 5, 1, 0, 2, 0, 2, 0, 0), - c(13, 13, 6, 15, 1, 1, 9, 1, 1), - c(13, 13, 6, 15, 1, 1, 9, 1, 1), - c(11, 10, 5, 10, 0, 1, 8, 0, 1), - c(9, 8, 4, 4, 0, 1, 6, 0, 1), - c(9, 8, 4, 4, 0, 1, 6, 0, 1) + c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1), + c(7, 8, 10, 2, 6, 3, 0, 4, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 10, 14, 2, 11, 3, 1, 5, 1, 0), + c(5, 5, 5, 1, 0, 2, 0, 2, 0, 0), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(8, 13, 13, 6, 15, 1, 1, 9, 1, 1), + c(7, 11, 10, 5, 10, 0, 1, 8, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1), + c(6, 9, 8, 4, 4, 0, 1, 6, 0, 1) ) rownames(expected_counts_order_2) <- node_labels colnames(expected_counts_order_2) <- graphlet_labels From c05cff46209f5c98db4286e6424b63163b0a2752 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 11 Oct 2019 15:20:52 +0100 Subject: [PATCH 058/188] fix tuple count tests --- R/measures_net_dis.R | 2 +- R/orca_interface.R | 19 ++++------- man/ego_network_node_counts.Rd | 14 -------- tests/testthat/test_measures_net_dis.R | 44 +++++++++++++++----------- tests/testthat/test_orca_interface.R | 4 +-- 5 files changed, 35 insertions(+), 48 deletions(-) delete mode 100644 man/ego_network_node_counts.Rd diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 07c3dcd3..49937779 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -947,5 +947,5 @@ count_graphlet_tuples <- function(graph_graphlet_counts, max_graphlet_size) { ) # add node counts back to object - graphlet_tuple_counts <- c(N, graphlet_tuple_counts) + # graphlet_tuple_counts <- c(N, graphlet_tuple_counts) } diff --git a/R/orca_interface.R b/R/orca_interface.R index e6b887c2..9444ef21 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -328,7 +328,12 @@ count_graphlets_for_graph <- function(graph, max_graphlet_size) { # the graphlet counts by the number of nodes that contribute to # each graphlet type nodes_per_graphlet <- graphlet_key(max_graphlet_size)$node_count - return(total_counts / nodes_per_graphlet) + total_counts <- total_counts / nodes_per_graphlet + + # add overall graph node count to total_counts + N <- igraph::vcount(graph) + total_counts <- c(N = N, total_counts) + total_counts } #' Ego-network graphlet counts @@ -379,14 +384,6 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size } } -#' ego_network_node_counts -#' -#' Calculates number of nodes in each ego network. -#' @param ego_networks Named list of ego networks for a graph. -ego_network_node_counts <- function(ego_networks) { - simplify2array(purrr::map(ego_networks, igraph::vcount)) -} - #' ego_to_graphlet_counts #' #' Calculates graphlet counts for previously generated ego networks. @@ -409,10 +406,6 @@ ego_to_graphlet_counts <- function(ego_networks, max_graphlet_size = 4) { # ORCA format graphlet count matrix with one row per node ego_graphlet_counts <- t(simplify2array(ego_graphlet_counts)) - # Add node counts column - N <- ego_network_node_counts(ego_networks) - ego_graphlet_counts <- cbind(N, ego_graphlet_counts) - # Return graphlet counts return(ego_graphlet_counts) } diff --git a/man/ego_network_node_counts.Rd b/man/ego_network_node_counts.Rd deleted file mode 100644 index 4ddc0a6b..00000000 --- a/man/ego_network_node_counts.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/orca_interface.R -\name{ego_network_node_counts} -\alias{ego_network_node_counts} -\title{ego_network_node_counts} -\usage{ -ego_network_node_counts(ego_networks) -} -\arguments{ -\item{ego_networks}{Named list of ego networks for a graph.} -} -\description{ -Calculates number of nodes in each ego network. -} diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index b23179a5..fc5c70ec 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -12,6 +12,12 @@ test_that(test_message, { graph_n11 <- igraph::erdos.renyi.game(11, p = 1, type = "gnp") graph_n37 <- igraph::erdos.renyi.game(37, p = 1, type = "gnp") graph_n73 <- igraph::erdos.renyi.game(73, p = 1, type = "gnp") + + # calculate graphlet counts object using previously tested function + graphlet_counts_n11 <- count_graphlets_for_graph(graph_n11, 5) + graphlet_counts_n37 <- count_graphlets_for_graph(graph_n37, 5) + graphlet_counts_n73 <- count_graphlets_for_graph(graph_n73, 5) + # Calculate expected graph tuple count for graphlets of various sizes. There # is 1 graphlet of size 1, 2 of size 3, 6 of size 4, and 21 of size 5 graphlet_tuple_counts <- function(n, max_graphlet_size) { @@ -39,13 +45,15 @@ test_that(test_message, { expected_tuple_count_n11_gs5 <- graphlet_tuple_counts(11, 5) expected_tuple_count_n37_gs5 <- graphlet_tuple_counts(37, 5) expected_tuple_count_n73_gs5 <- graphlet_tuple_counts(73, 5) + # Generate actual tuple counts for graphlets up to size 4 and 5 - actual_tuple_count_n11_gs4 <- count_graphlet_tuples(graph_n11, 4) - actual_tuple_count_n37_gs4 <- count_graphlet_tuples(graph_n37, 4) - actual_tuple_count_n73_gs4 <- count_graphlet_tuples(graph_n73, 4) - actual_tuple_count_n11_gs5 <- count_graphlet_tuples(graph_n11, 5) - actual_tuple_count_n37_gs5 <- count_graphlet_tuples(graph_n37, 5) - actual_tuple_count_n73_gs5 <- count_graphlet_tuples(graph_n73, 5) + actual_tuple_count_n11_gs4 <- count_graphlet_tuples(graphlet_counts_n11, 4) + actual_tuple_count_n37_gs4 <- count_graphlet_tuples(graphlet_counts_n37, 4) + actual_tuple_count_n73_gs4 <- count_graphlet_tuples(graphlet_counts_n73, 4) + actual_tuple_count_n11_gs5 <- count_graphlet_tuples(graphlet_counts_n11, 5) + actual_tuple_count_n37_gs5 <- count_graphlet_tuples(graphlet_counts_n37, 5) + actual_tuple_count_n73_gs5 <- count_graphlet_tuples(graphlet_counts_n73, 5) + # Compare expected tuple counts with actual expect_equal(expected_tuple_count_n11_gs4, actual_tuple_count_n11_gs4) expect_equal(expected_tuple_count_n37_gs4, actual_tuple_count_n37_gs4) @@ -57,18 +65,19 @@ test_that(test_message, { # === TEST count_graphlet_tuples_ego === # NOTE: This test is not amazing, as graphlet_tuple_counts_ego is very similar # to the method under test. However, it's a simple method so maybe that's ok? - graphlet_tuple_counts_ego <- function(ego_networks, max_graphlet_size) { - t(sapply(ego_networks, FUN = function(g) { - graphlet_tuple_counts(length(igraph::V(g)), max_graphlet_size) - })) + graphlet_tuple_counts_ego <- function(graphlet_counts_ego, max_graphlet_size) { + t(apply(graphlet_counts_ego, 1, + count_graphlet_tuples, max_graphlet_size = max_graphlet_size)) + } - # Generate ego networks for each graph - graph_n11_ego1 <- make_named_ego_graph(graph_n11, order = 1) - graph_n37_ego1 <- make_named_ego_graph(graph_n37, order = 1) - graph_n73_ego1 <- make_named_ego_graph(graph_n73, order = 1) - graph_n11_ego2 <- make_named_ego_graph(graph_n11, order = 2) - graph_n37_ego2 <- make_named_ego_graph(graph_n37, order = 2) - graph_n73_ego2 <- make_named_ego_graph(graph_n73, order = 2) + # Generate ego network graphlet counts for each graph + graph_n11_ego1 <- count_graphlets_ego(graph_n11, neighbourhood_size = 1) + graph_n37_ego1 <- count_graphlets_ego(graph_n37, neighbourhood_size = 1) + graph_n73_ego1 <- count_graphlets_ego(graph_n73, neighbourhood_size = 1) + graph_n11_ego2 <- count_graphlets_ego(graph_n11, neighbourhood_size = 2) + graph_n37_ego2 <- count_graphlets_ego(graph_n37, neighbourhood_size = 2) + graph_n73_ego2 <- count_graphlets_ego(graph_n73, neighbourhood_size = 2) + # Generate expected tuple counts for graphlets up to size 4 and 5 # 1. For ego-networks of order 1 expected_tuple_count_n11_ego1_gs4 <- graphlet_tuple_counts_ego(graph_n11_ego1, 4) @@ -102,7 +111,6 @@ test_that(test_message, { actual_tuple_count_n73_ego2_gs5 <- count_graphlet_tuples_ego(graph_n73_ego2, 5) # Compare expected with actual - # 1. For ego-networks of order 1 expect_equal(expected_tuple_count_n11_ego1_gs4, actual_tuple_count_n11_ego1_gs4) expect_equal(expected_tuple_count_n37_ego1_gs4, actual_tuple_count_n37_ego1_gs4) expect_equal(expected_tuple_count_n73_ego1_gs4, actual_tuple_count_n73_ego1_gs4) diff --git a/tests/testthat/test_orca_interface.R b/tests/testthat/test_orca_interface.R index fa088f8e..44e16cf1 100644 --- a/tests/testthat/test_orca_interface.R +++ b/tests/testthat/test_orca_interface.R @@ -1215,10 +1215,10 @@ test_that("count_graphlets_for_graph works", { graph <- igraph::graph_from_edgelist(elist, directed = FALSE) # Setgraphlet labels to use for names in expected counts - graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + graphlet_labels <- c("N", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") # Manually verified graphlet counts - expected_counts <- c(15, 18, 6, 21, 3, 1, 11, 1, 1) + expected_counts <- c(10, 15, 18, 6, 21, 3, 1, 11, 1, 1) names(expected_counts) <- graphlet_labels # Test From bf40c3d61317fc771a6dc640fecc116f0685b913 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 11 Oct 2019 15:25:45 +0100 Subject: [PATCH 059/188] fix density test --- tests/testthat/test_measures_net_dis.R | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index fc5c70ec..a33f7502 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -178,21 +178,22 @@ test_that("Ego-network 4-node density values match manually verified totals", { # Order 2 expected densities should be: # 0.33, 0.38, 0.50, 0.36, 0.50, 0.46, 0.46, 0.52, 0.60, 0.60 - # Generate order 1 and 2 ego networks with previously tested function - ego_networks_o1 <- make_named_ego_graph(graph, - order = 1, + # Generate order 1 and 2 ego network graphlet counts + # with previously tested function + graphlet_counts_ego_o1 <- count_graphlets_ego(graph, + neighbourhood_size = 1, min_ego_edges = min_ego_edges, min_ego_nodes = min_ego_nodes ) - ego_networks_o2 <- make_named_ego_graph(graph, - order = 2, + graphlet_counts_ego_o2 <- count_graphlets_ego(graph, + neighbourhood_size = 2, min_ego_edges = min_ego_edges, min_ego_nodes = min_ego_nodes ) # Calculate densities - actual_densities_o1 <- ego_network_density(ego_networks_o1) - actual_densities_o2 <- ego_network_density(ego_networks_o2) + actual_densities_o1 <- ego_network_density(graphlet_counts_ego_o1) + actual_densities_o2 <- ego_network_density(graphlet_counts_ego_o2) # Check densities match expected values expect_equal(actual_densities_o1, expected_densities_o1) From 48defe3eadfbb4bebd5d72413fa128690cef15e2 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 11 Oct 2019 15:41:06 +0100 Subject: [PATCH 060/188] scale graphlet counts only for columns present in tuple counts --- R/measures_net_dis.R | 2 +- tests/testthat/test_measures_net_dis.R | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 49937779..82102be0 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -863,7 +863,7 @@ zeros_to_ones <- function(v) { scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { # Avoid divide by zero errors by replacing all zeros with ones in the # divisor - graphlet_count / zeros_to_ones(graphlet_tuples) + graphlet_count[, colnames(graphlet_tuples)] / zeros_to_ones(graphlet_tuples) } diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index a33f7502..92ca7305 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -424,13 +424,11 @@ test_that("Ego-network 4-node graphlet counts match manually verified totals", { # Calculate scaled counts with scale_graphlet_counts_ego # (function to test). actual_counts_o1 <- - scale_graphlet_counts_ego(ego_networks_o1, - graphlet_counts_o1, - max_graphlet_size = max_graphlet_size + scale_graphlet_counts_ego(graphlet_counts_o1, + max_graphlet_size = max_graphlet_size ) actual_counts_o2 <- - scale_graphlet_counts_ego(ego_networks_o2, - graphlet_counts_o2, + scale_graphlet_counts_ego(graphlet_counts_o2, max_graphlet_size = max_graphlet_size ) From aac03f1a3ae9b5490771b6cd99ce7803fbf74c44 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 11 Oct 2019 16:03:37 +0100 Subject: [PATCH 061/188] fix netdis_expected_graphlet_counts test --- tests/testthat/test_measures_net_dis.R | 28 ++++++++------------------ 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 92ca7305..c543a766 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -685,26 +685,13 @@ test_that("density_binned_counts output matches manually verified totals with di agg_fn <- mean scale_fn <- scale_graphlet_counts_ego - # generate ego networks using previously tested function - ego_networks_o1 <- make_named_ego_graph(graph, - order = 1, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes - ) - ego_networks_o2 <- make_named_ego_graph(graph, - order = 2, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes - ) # calculate expected counts using previously tested function expected_scaled_counts_o1 <- - scale_graphlet_counts_ego(ego_networks_o1, - expected_counts_o1, + scale_graphlet_counts_ego(expected_counts_o1, max_graphlet_size = max_graphlet_size ) expected_scaled_counts_o2 <- - scale_graphlet_counts_ego(ego_networks_o2, - expected_counts_o2, + scale_graphlet_counts_ego(expected_counts_o2, max_graphlet_size = max_graphlet_size ) @@ -738,7 +725,6 @@ test_that("density_binned_counts output matches manually verified totals with di expected_interval_indexes_o1, agg_fn = agg_fn, scale_fn = scale_fn, - ego_networks = ego_networks_o1, max_graphlet_size = max_graphlet_size) actual_scaled_density_binned_counts_o2 <- density_binned_counts( @@ -746,7 +732,6 @@ test_that("density_binned_counts output matches manually verified totals with di expected_interval_indexes_o2, agg_fn = agg_fn, scale_fn = scale_fn, - ego_networks = ego_networks_o2, max_graphlet_size = max_graphlet_size) # Check actual output vs expected @@ -797,7 +782,9 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { density_indexes <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) num_nodes <- rep(120, 10) graphs <- purrr::map2(num_nodes, densities, rand_graph) - + graphlet_counts <- purrr::map(graphs, + count_graphlets_for_graph, + max_graphlet_size = max_graphlet_size) # WITH scale_fn = NULL (bin counts directly with no scaling) # Helper function to calculate expected expected graphlet counts expected_expected_graphlet_counts_fn <- function(density_index) { @@ -807,12 +794,13 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { expected_expected_graphlet_counts <- purrr::map(density_indexes, expected_expected_graphlet_counts_fn) actual_expected_graphlet_counts <- - purrr::map(graphs, netdis_expected_graphlet_counts, + purrr::map(graphlet_counts, netdis_expected_graphlet_counts, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, density_binned_reference_counts = scaled_reference_counts, scale_fn = NULL ) + # Loop over each graph and compare expected with actual # NOTE: v2.0.0 of testthat library made a breaking change that means using # map, mapply etc can cause failures under certain conditions @@ -834,7 +822,7 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { expected_expected_graphlet_counts <- purrr::map2(density_indexes, num_nodes, expected_expected_graphlet_counts_fn) actual_expected_graphlet_counts <- - purrr::map(graphs, netdis_expected_graphlet_counts, + purrr::map(graphlet_counts, netdis_expected_graphlet_counts, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, density_binned_reference_counts = scaled_reference_counts, From 1a978754fbcb044761d9bf12499824b1c82afa07 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 11 Oct 2019 16:26:45 +0100 Subject: [PATCH 062/188] fix expected graphlet counts test --- R/measures_net_dis.R | 2 +- tests/testthat/test_measures_net_dis.R | 39 ++++++++++++++------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 82102be0..5f3ba628 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -681,7 +681,7 @@ netdis_expected_graphlet_counts <- function(graphlet_counts, matched_reference_counts <- density_binned_reference_counts[matched_density_index, ] - + rownames(matched_reference_counts) <- rownames(graphlet_counts) if (!is.null(scale_fn)) { # Scale reference counts e.g. by multiplying the # reference count for each graphlet by the number diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index c543a766..7ab7cba9 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -868,16 +868,16 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n min_ego_edges <- 0 min_ego_nodes <- 0 - # Make graph ego networks - ego_networks_o1 <- make_named_ego_graph(graph, - order = 1, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes + # Get ego network graphlet counts + graphlet_counts_ego_o1 <- count_graphlets_ego(graph, + neighbourhood_size = 1, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes ) - ego_networks_o2 <- make_named_ego_graph(graph, - order = 2, - min_ego_edges = min_ego_edges, - min_ego_nodes = min_ego_nodes + graphlet_counts_ego_o2 <- count_graphlets_ego(graph, + neighbourhood_size = 2, + min_ego_edges = min_ego_edges, + min_ego_nodes = min_ego_nodes ) # Set manually-verified node counts and densities # 1. Ego-networks of order 1 @@ -909,6 +909,7 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n c(81, 82, 83, 84, 85, 86, 87, 88, 89), c(91, 92, 93, 94, 95, 96, 97, 98, 99) ) + colnames(scaled_reference_counts) <- graphlet_labels expected_dims <- dim(scaled_reference_counts) min_ego_nodes <- 3 min_ego_edges <- 1 @@ -937,13 +938,13 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels # Set row labels to ego network names - rownames(expected_expected_graphlet_counts_ego_o1) <- names(ego_networks_o1) - rownames(expected_expected_graphlet_counts_ego_o2) <- names(ego_networks_o2) + rownames(expected_expected_graphlet_counts_ego_o1) <- rownames(graphlet_counts_ego_o1) + rownames(expected_expected_graphlet_counts_ego_o2) <- rownames(graphlet_counts_ego_o1) # Calculate actual output of function under test actual_expected_graphlet_counts_ego_o1 <- netdis_expected_graphlet_counts_per_ego( - ego_networks_o1, + graphlet_counts_ego_o1, breaks, scaled_reference_counts, max_graphlet_size, @@ -951,13 +952,13 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n ) actual_expected_graphlet_counts_ego_o2 <- netdis_expected_graphlet_counts_per_ego( - ego_networks_o2, + graphlet_counts_ego_o2, breaks, scaled_reference_counts, max_graphlet_size, scale_fn = count_graphlet_tuples ) - + # Compare actual to expected expect_equal( actual_expected_graphlet_counts_ego_o1, @@ -991,13 +992,13 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n colnames(expected_expected_graphlet_counts_ego_o1) <- graphlet_labels colnames(expected_expected_graphlet_counts_ego_o2) <- graphlet_labels # Set row labels to ego network names - rownames(expected_expected_graphlet_counts_ego_o1) <- names(ego_networks_o1) - rownames(expected_expected_graphlet_counts_ego_o2) <- names(ego_networks_o2) + rownames(expected_expected_graphlet_counts_ego_o1) <- rownames(graphlet_counts_ego_o1) + rownames(expected_expected_graphlet_counts_ego_o2) <- rownames(graphlet_counts_ego_o2) # Calculate actual output of function under test actual_expected_graphlet_counts_ego_o1 <- netdis_expected_graphlet_counts_per_ego( - ego_networks_o1, + graphlet_counts_ego_o1, breaks, scaled_reference_counts, max_graphlet_size, @@ -1005,12 +1006,14 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n ) actual_expected_graphlet_counts_ego_o2 <- netdis_expected_graphlet_counts_per_ego( - ego_networks_o2, + graphlet_counts_ego_o2, breaks, scaled_reference_counts, max_graphlet_size, scale_fn = NULL ) + print(actual_expected_graphlet_counts_ego_o1) + print(expected_expected_graphlet_counts_ego_o1) # Compare actual to expected expect_equal( actual_expected_graphlet_counts_ego_o1, From 09eb6e78ede524da75a1eb8c4b8669fac10d53d4 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 11 Oct 2019 16:54:46 +0100 Subject: [PATCH 063/188] remove print statements --- R/measures_net_dis.R | 2 +- tests/testthat/test_measures_net_dis.R | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 5f3ba628..82102be0 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -681,7 +681,7 @@ netdis_expected_graphlet_counts <- function(graphlet_counts, matched_reference_counts <- density_binned_reference_counts[matched_density_index, ] - rownames(matched_reference_counts) <- rownames(graphlet_counts) + if (!is.null(scale_fn)) { # Scale reference counts e.g. by multiplying the # reference count for each graphlet by the number diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 7ab7cba9..5a8bc4e2 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1012,8 +1012,7 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n max_graphlet_size, scale_fn = NULL ) - print(actual_expected_graphlet_counts_ego_o1) - print(expected_expected_graphlet_counts_ego_o1) + # Compare actual to expected expect_equal( actual_expected_graphlet_counts_ego_o1, From 18f33f539f2b752acdeeeeed2c0a34a1a8ac82cc Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 11 Oct 2019 17:24:11 +0100 Subject: [PATCH 064/188] fix some lint errors --- R/measures_net_dis.R | 155 +++++++++++++++++++++---------------------- R/orca_interface.R | 113 +++++++++++++++++++------------ 2 files changed, 146 insertions(+), 122 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 82102be0..c22a98ea 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -1,40 +1,40 @@ #' Netdis between two graphs -#' +#' #' @param graph_1 A simplified igraph graph object. -#' +#' #' @param graph_2 A simplified igraph graph object. -#' -#' @param ref_graph Controls how expected counts are calculated. Either: -#' 1) A numeric value - used as a constant expected counts value for all query +#' +#' @param ref_graph Controls how expected counts are calculated. Either: +#' 1) A numeric value - used as a constant expected counts value for all query #' graphs (DEFAULT: 0). #' 2) A simplified \code{igraph} object - used as a reference graph from which #' expected counts are calculated for all query graphs. -#' 3) NULL - Expected counts will be calculated based on the properties of the +#' 3) NULL - Expected counts will be calculated based on the properties of the #' query graphs themselves. -#' +#' #' @param max_graphlet_size Generate graphlets up to this size. -#' +#' #' @param neighbourhood_size Ego network neighbourhood size. -#' +#' #' @param min_ego_nodes Filter ego networks which have fewer #' than min_ego_nodes nodes. -#' +#' #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges. -#' +#' #' @param binning_fn Function used to bin ego network densities. Takes densities #' as its single argument, and returns a named list including keys \code{breaks} #' (list of bin edges) and \code{interval_indexes} (density bin index for each #' ego network). (Default: \code{binned_densities_adaptive} with #' \code{min_counts_per_interval = 5} and \code{num_intervals = 100}). -#' +#' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} #' (bin indexes) and \code{max_graphlet_size} as arguments. #' (Default: \code{density_binned_counts} with \code{agg_fn = mean} and #' \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the #' approach used in the original netdis paper). -#' +#' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, @@ -42,18 +42,18 @@ #' (Default: \code{netdis_expected_graphlet_counts_per_ego} with #' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in #' the original netdis paper). -#' +#' #' @param graphlet_counts_1 Pre-generated graphlet counts for the first query #' graph. If the \code{graphlet_counts_1} argument is defined then #' \code{graph_1} will not be used. -#' +#' #' @param graphlet_counts_2 Pre-generated graphlet counts for the second query #' graph. If the \code{graphlet_counts_2} argument is defined then #' \code{graph_2} will not be used. -#' +#' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size -#' +#' #' @export netdis_one_to_one <- function(graph_1 = NULL, graph_2 = NULL, @@ -84,11 +84,11 @@ netdis_one_to_one <- function(graph_1 = NULL, if (is.null(graph_2) & is.null(graphlet_counts_2)) { stop("One of graph_2 and graphlet_counts_2 must be supplied.") } - + ## ------------------------------------------------------------------------ # Generate graphlet counts and bundle them into named list with format needed # for netdis_many_to_many. - + if (is.null(graphlet_counts_1)) { graphlet_counts_1 <- count_graphlets_ego( graph_1, @@ -100,7 +100,7 @@ netdis_one_to_one <- function(graph_1 = NULL, ) } rm(graph_1) - + if (is.null(graphlet_counts_2)) { graphlet_counts_2 <- count_graphlets_ego( graph_2, @@ -111,8 +111,8 @@ netdis_one_to_one <- function(graph_1 = NULL, return_ego_networks = FALSE ) } - rm(graph_2) - + rm(graph_2) + graphlet_counts <- list(graph_1 = graphlet_counts_1, graph_2 = graphlet_counts_2) @@ -129,51 +129,51 @@ netdis_one_to_one <- function(graph_1 = NULL, exp_counts_fn = exp_counts_fn, graphlet_counts = graphlet_counts ) - + ## ------------------------------------------------------------------------ # extract netdis statistics from list returned by netdis_many_to_many result$netdis[, 1] } #' Netdis comparisons between one graph and many other graphs. -#' +#' #' @param graph_1 Query graph - this graph will be compared with #' all graphs in graphs_compare. A simplified igraph graph object. -#' +#' #' @param graphs_compare Graphs graph_1 will be compared with. A named list of #' simplified igraph graph objects. -#' -#' @param ref_graph Controls how expected counts are calculated. Either: -#' 1) A numeric value - used as a constant expected counts value for all query +#' +#' @param ref_graph Controls how expected counts are calculated. Either: +#' 1) A numeric value - used as a constant expected counts value for all query #' graphs (DEFAULT: 0). #' 2) A simplified \code{igraph} object - used as a reference graph from which #' expected counts are calculated for all query graphs. -#' 3) NULL - Expected counts will be calculated based on the properties of the +#' 3) NULL - Expected counts will be calculated based on the properties of the #' query graphs themselves. -#' +#' #' @param max_graphlet_size Generate graphlets up to this size. -#' +#' #' @param neighbourhood_size Ego network neighbourhood size. -#' +#' #' @param min_ego_nodes Filter ego networks which have fewer #' than min_ego_nodes nodes. -#' +#' #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges. -#' +#' #' @param binning_fn Function used to bin ego network densities. Takes densities #' as its single argument, and returns a named list including keys \code{breaks} #' (list of bin edges) and \code{interval_indexes} (density bin index for each #' ego network). (Default: \code{binned_densities_adaptive} with #' \code{min_counts_per_interval = 5} and \code{num_intervals = 100}). -#' +#' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} #' (bin indexes) and \code{max_graphlet_size} as arguments. #' (Default: \code{density_binned_counts} with \code{agg_fn = mean} and #' \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the #' approach used in the original netdis paper). -#' +#' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, @@ -185,11 +185,11 @@ netdis_one_to_one <- function(graph_1 = NULL, #' @param graphlet_counts_1 Pre-generated graphlet counts for the first query #' graph. If the \code{graphlet_counts_1} argument is defined then #' \code{graph_1} will not be used. -#' +#' #' @param graphlet_counts_compare Named list of pre-generated graphlet counts #' for the remaining query graphs. If the \code{graphlet_counts_compare} #' argument is defined then \code{graphs_compare} will not be used. -#' +#' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size #' @export @@ -221,11 +221,11 @@ netdis_one_to_many <- function(graph_1 = NULL, if (is.null(graphs_compare) & is.null(graphlet_counts_compare)) { stop("One of graph_2 and graphlet_counts_2 must be supplied.") } - + ## ------------------------------------------------------------------------ # Generate graphlet counts and bundle them into named list with format needed # for netdis_many_to_many. - + if (is.null(graphlet_counts_1)) { graphlet_counts_1 <- count_graphlets_ego( graph_1, @@ -237,7 +237,7 @@ netdis_one_to_many <- function(graph_1 = NULL, ) } rm(graph_1) - + if (is.null(graphlet_counts_compare)) { graphlet_counts_compare <- purrr::map( graphs_compare, @@ -249,12 +249,12 @@ netdis_one_to_many <- function(graph_1 = NULL, return_ego_networks = FALSE ) } - rm(graphs_compare) - + rm(graphs_compare) + graphlet_counts <- append(graphlet_counts_compare, list(graph_1 = graphlet_counts_1), after = 0) - + ## ------------------------------------------------------------------------ # calculate netdis result <- netdis_many_to_many( @@ -279,46 +279,46 @@ netdis_one_to_many <- function(graph_1 = NULL, #' Netdis between all graph pairs -#' +#' #' @param graphs A named list of simplified igraph graph objects (undirected #' graphs excluding loops, multiple edges and isolated vertices), such as those #' obtained by using \code{read_simple_graphs}. -#' -#' @param ref_graph Controls how expected counts are calculated. Either: -#' 1) A numeric value - used as a constant expected counts value for all query +#' +#' @param ref_graph Controls how expected counts are calculated. Either: +#' 1) A numeric value - used as a constant expected counts value for all query #' graphs (DEFAULT: 0). #' 2) A simplified \code{igraph} object - used as a reference graph from which #' expected counts are calculated for all query graphs. -#' 3) NULL - Expected counts will be calculated based on the properties of the +#' 3) NULL - Expected counts will be calculated based on the properties of the #' query graphs themselves. -#' +#' #' @param comparisons Which comparisons to perform between graphs. #' Can be "many-to-many" (all pairwise combinations) or "one-to-many" #' (compare first graph in graphs to all other graphs.) -#' +#' #' @param max_graphlet_size Generate graphlets up to this size. -#' +#' #' @param neighbourhood_size Ego network neighbourhood size. -#' +#' #' @param min_ego_nodes Filter ego networks which have fewer #' than min_ego_nodes nodes. -#' +#' #' @param min_ego_edges Filter ego networks which have fewer #' than min_ego_edges edges. -#' +#' #' @param binning_fn Function used to bin ego network densities. Takes densities #' as its single argument, and returns a named list including keys \code{breaks} #' (list of bin edges) and \code{interval_indexes} (density bin index for each #' ego network). (Default: \code{binned_densities_adaptive} with #' \code{min_counts_per_interval = 5} and \code{num_intervals = 100}). -#' +#' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} #' (bin indexes) and \code{max_graphlet_size} as arguments. #' (Default: \code{density_binned_counts} with \code{agg_fn = mean} and #' \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the #' approach used in the original netdis paper). -#' +#' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, @@ -326,9 +326,9 @@ netdis_one_to_many <- function(graph_1 = NULL, #' (Default: \code{netdis_expected_graphlet_counts_per_ego} with #' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in #' the original netdis paper). -#' +#' #' @param graphlet_counts Pre-generated graphlet counts. If the -#' \code{graphlet_counts} argument is defined then \code{graphs} will not be +#' \code{graphlet_counts} argument is defined then \code{graphs} will not be #' used. #' A named list of matrices containing counts of each graphlet (columns) for #' each ego-network in the input graph (rows). Columns are labelled with @@ -336,14 +336,14 @@ netdis_one_to_many <- function(graph_1 = NULL, #' ego-network. As well as graphlet counts, each matrix must contain an #' additional column labelled "N" including the node count for #' each ego network. -#' +#' #' @param graphlet_counts_ref Pre-generated reference graphlet counts. If the #' \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not #' be used. #' #' @return Netdis statistics between query graphs for graphlet sizes #' up to and including max_graphlet_size. -#' +#' #' @export netdis_many_to_many <- function(graphs = NULL, ref_graph = 0, @@ -365,13 +365,13 @@ netdis_many_to_many <- function(graphs = NULL, scale_fn = count_graphlet_tuples), graphlet_counts = NULL, graphlet_counts_ref = NULL) { - + ## ------------------------------------------------------------------------ # Check arguments if (is.null(graphs) & is.null(graphlet_counts)) { stop("One of graphs and graphlet_counts must be supplied.") } - + ## ------------------------------------------------------------------------ # Generate ego networks and count graphlets for query graphs. # But if graphlet counts have already been provided we can skip this step. @@ -400,7 +400,7 @@ netdis_many_to_many <- function(graphs = NULL, # If a reference graph passed, use it to calculate expected counts for all # query graphs. } else if (!is.null(ref_graph) || !is.null(graphlet_counts_ref)) { - + # Generate ego networks and calculate graphlet counts # But if some ref graphlet counts provided can skip this step if (is.null(graphlet_counts_ref)) { @@ -414,7 +414,7 @@ netdis_many_to_many <- function(graphs = NULL, ) } rm(ref_graph) - + # Get ego-network densities densities_ref <- ego_network_density(graphlet_counts_ref) @@ -586,7 +586,7 @@ netdis_uptok <- function(centred_graphlet_counts1, centred_graphlet_counts2, #' Centre counts by subtracting expected graphlet counts from actual graphlet #' counts. #' @param graphlet_counts Ego network graphlet counts for a query graph -#' @param exp_graphlet_counts Pre-calculated expected counts for each graphlet +#' @param exp_graphlet_counts Pre-calculated expected counts for each graphlet #' type for each ego network. #' @param max_graphlet_size max graphlet size to calculate centred counts for. #' @return graphlet_counts minus exp_graphlet_counts for graphlets up to size @@ -597,11 +597,11 @@ netdis_centred_graphlet_counts <- function( exp_graphlet_counts, max_graphlet_size) { - # extract columns for graphlets up to size max_graphlet_size + # extract columns for graphlets up to size max_graphlet_size id <- graphlet_key(max_graphlet_size)$id graphlet_counts <- graphlet_counts[, id] exp_graphlet_counts <- exp_graphlet_counts[, id] - + # centre counts graphlet_counts - exp_graphlet_counts @@ -644,7 +644,7 @@ netdis_expected_graphlet_counts_per_ego <- function( density_breaks = density_breaks, density_binned_reference_counts = density_binned_reference_counts, scale_fn = scale_fn)) - + expected_graphlet_counts } @@ -681,7 +681,7 @@ netdis_expected_graphlet_counts <- function(graphlet_counts, matched_reference_counts <- density_binned_reference_counts[matched_density_index, ] - + if (!is.null(scale_fn)) { # Scale reference counts e.g. by multiplying the # reference count for each graphlet by the number @@ -837,7 +837,7 @@ density_binned_counts_gp <- function(graphlet_counts, #' @return Counts of value const with same shape and names as graphlet_counts. netdis_const_expected_counts <- function(graphlet_counts, const) { exp_counts <- graphlet_counts - exp_counts[,] <- const + exp_counts[, ] <- const exp_counts } @@ -876,13 +876,13 @@ count_graphlet_tuples_ego <- function(graphlet_counts, max_graphlet_size) { graphlet_tuple_counts <- t(apply(graphlet_counts, 1, count_graphlet_tuples, max_graphlet_size = max_graphlet_size)) - + graphlet_tuple_counts } #' Calculate edge density for a single graph. -#' @param graphlet_counts Vector of pre-calculated graphlet, edge and node +#' @param graphlet_counts Vector of pre-calculated graphlet, edge and node #' counts. Must have named items "N" (node counts) and "G0" (edge counts). #' @export density_from_counts <- function(graphlet_counts) { @@ -935,17 +935,14 @@ scale_graphlet_counts_ego <- function(graphlet_counts, count_graphlet_tuples <- function(graph_graphlet_counts, max_graphlet_size) { # extract node counts from graph_graphlet_counts N <- graph_graphlet_counts["N"] - + graphlet_key <- graphlet_key(max_graphlet_size) graphlet_node_counts <- graphlet_key$node_count - + graphlet_tuple_counts <- choose(N, graphlet_node_counts) - + graphlet_tuple_counts <- stats::setNames( graphlet_tuple_counts, graphlet_key$id ) - - # add node counts back to object - # graphlet_tuple_counts <- c(N, graphlet_tuple_counts) } diff --git a/R/orca_interface.R b/R/orca_interface.R index 9444ef21..0db02c2e 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -203,7 +203,7 @@ simplify_graph <- function(graph, as_undirected = TRUE, remove_loops = TRUE, #' of each graphlet or orbit at each graph vertex) to #' a set of discrete histograms (a histogram of counts for each distinct value #' across all graph vertices for each feature with no binning) -#' @param features_matrix A number of nodes (rows) by number of features +#' @param features_matrix A number of nodes (rows) by number of features #' (columns) matrix, where the ij entry is the score of node i on feature j #' (e.g. for ORCA output this is counts of each graphlet or orbit at each #' graph vertex) @@ -221,16 +221,17 @@ graph_features_to_histogramsSLOW <- function(features_matrix) { #' Graphlet-based degree distributions (GDDs) #' -#' Generates graphlet-based degree distributions from \code{igraph} graph object, +#' Generates graphlet-based degree distributions from \code{igraph} graph object #' using the ORCA fast graphlet orbit counting package. -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. +#' @param graph A connected, undirected, simple graph as an \code{igraph} object #' @param feature_type Type of graphlet-based feature to count: "graphlet" #' counts the number of graphlets each node participates in; "orbit" calculates #' the number of graphlet orbits each node participates in. #' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. -#' @param ego_neighbourhood_size The number of steps from the source node to include -#' nodes for each ego-network. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. +#' @param ego_neighbourhood_size The number of steps from the source node to +#' include nodes for each ego-network. #' @return List of graphlet-based degree distributions, with each distribution #' represented as a \code{dhist} discrete histogram object. #' @export @@ -249,7 +250,8 @@ gdd <- function(graph, feature_type = "orbit", max_graphlet_size = 4, } else if (feature_type == "orbit") { out <- count_orbits_per_node(graph, max_graphlet_size = max_graphlet_size) } else if (feature_type == "graphlet") { - out <- count_graphlets_per_node(graph, max_graphlet_size = max_graphlet_size) + out <- count_graphlets_per_node(graph, + max_graphlet_size = max_graphlet_size) } else { stop("gdd: unrecognised feature_type") @@ -261,9 +263,10 @@ gdd <- function(graph, feature_type = "orbit", max_graphlet_size = 4, #' #' Calculates graphlet orbit counts for each node in an \code{igraph} graph #' object, using the ORCA fast graphlet orbit counting package. -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. +#' @param graph A connected, undirected, simple graph as an \code{igraph} object #' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. #' @return ORCA-format matrix containing counts of each graphlet #' orbit (columns) at each node in the graph (rows). #' @export @@ -297,14 +300,16 @@ count_orbits_per_node <- function(graph, max_graphlet_size) { #' Calculates graphlet counts for each node in an \code{igraph} graph object, #' using the ORCA fast graphlet orbit counting package. by summing orbits over #' graphlets. -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. +#' @param graph A connected, undirected, simple graph as an \code{igraph} object #' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. #' @return ORCA-format matrix containing counts of each graphlet (columns) at #' each node in the graph (rows). #' @export count_graphlets_per_node <- function(graph, max_graphlet_size) { - orbit_counts <- count_orbits_per_node(graph, max_graphlet_size = max_graphlet_size) + orbit_counts <- count_orbits_per_node(graph, + max_graphlet_size = max_graphlet_size) orbit_to_graphlet_counts(orbit_counts) } @@ -315,9 +320,10 @@ count_graphlets_per_node <- function(graph, max_graphlet_size) { #' calculated by summing orbits over graphlets. These are then divided by the #' number of nodes comprising each graphlet to avoid counting the same graphlet #' multiple times. -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. +#' @param graph A connected, undirected, simple graph as an \code{igraph} object #' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. #' @return Vector containing counts of each graphlet for the graph. #' @export count_graphlets_for_graph <- function(graph, max_graphlet_size) { @@ -329,7 +335,7 @@ count_graphlets_for_graph <- function(graph, max_graphlet_size) { # each graphlet type nodes_per_graphlet <- graphlet_key(max_graphlet_size)$node_count total_counts <- total_counts / nodes_per_graphlet - + # add overall graph node count to total_counts N <- igraph::vcount(graph) total_counts <- c(N = N, total_counts) @@ -339,9 +345,10 @@ count_graphlets_for_graph <- function(graph, max_graphlet_size) { #' Ego-network graphlet counts #' #' Calculates graphlet counts for the n-step ego-network of each node in a graph -#' @param graph A connected, undirected, simple graph as an \code{igraph} object. +#' @param graph A connected, undirected, simple graph as an \code{igraph} object #' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. #' @param neighbourhood_size The number of steps from the source node to include #' nodes for each ego-network. #' @param min_ego_nodes Only ego networks with at least \code{min_ego_nodes} @@ -362,8 +369,11 @@ count_graphlets_for_graph <- function(graph, max_graphlet_size) { #' \item \code{ego_networks}: The ego-networks of the query graph. #' } #' @export -count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size, - min_ego_nodes = 3, min_ego_edges = 1, +count_graphlets_ego <- function(graph, + max_graphlet_size = 4, + neighbourhood_size, + min_ego_nodes = 3, + min_ego_edges = 1, return_ego_networks = FALSE) { # Extract ego network for each node in original graph, naming each ego network # in the list with the name of the node the ego network is generated for @@ -372,13 +382,14 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges ) - + # Generate graphlet counts for each node in each ego network ego_graphlet_counts <- ego_to_graphlet_counts(ego_networks, max_graphlet_size) - + # Return either graphlet counts, or graphlet counts and ego_networks if (return_ego_networks) { - return(list(graphlet_counts = ego_graphlet_counts, ego_networks = ego_networks)) + return(list(graphlet_counts = ego_graphlet_counts, + ego_networks = ego_networks)) } else { return(ego_graphlet_counts) } @@ -389,9 +400,11 @@ count_graphlets_ego <- function(graph, max_graphlet_size = 4, neighbourhood_size #' Calculates graphlet counts for previously generated ego networks. #' @param ego_networks Named list of ego networks for a graph. #' @param max_graphlet_size Determines the maximum size of graphlets to count. -#' Only graphlets containing up to \code{max_graphlet_size} nodes will be counted. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. #' @return returns an RxC matrix -#' containing counts of each graphlet (columns, C) for each ego-network (rows, R). +#' containing counts of each graphlet (columns, C) for each ego-network +#' (rows, R). #' Columns are labelled with graphlet IDs and rows are #' labelled with the ID of the central node in each ego-network. #' @export @@ -401,11 +414,11 @@ ego_to_graphlet_counts <- function(ego_networks, max_graphlet_size = 4) { ego_graphlet_counts <- purrr::map(ego_networks, count_graphlets_for_graph, max_graphlet_size = max_graphlet_size ) - + # Reshape the list of per node single row graphlet count matrices to a single # ORCA format graphlet count matrix with one row per node ego_graphlet_counts <- t(simplify2array(ego_graphlet_counts)) - + # Return graphlet counts return(ego_graphlet_counts) } @@ -524,7 +537,9 @@ graphlet_key <- function(max_graphlet_size) { paste("G", index, sep = "") })) name <- - return(list(max_nodes = max_graphlet_size, id = id, node_count = node_count)) + return(list(max_nodes = max_graphlet_size, + id = id, + node_count = node_count)) } #' Orbit key @@ -556,7 +571,9 @@ orbit_key <- function(max_graphlet_size) { paste("O", index, sep = "") })) name <- - return(list(max_nodes = max_graphlet_size, id = id, node_count = node_count)) + return(list(max_nodes = max_graphlet_size, + id = id, + node_count = node_count)) } #' Graphlet IDs for size @@ -574,10 +591,10 @@ graphlet_ids_for_size <- function(graphlet_size) { #' Load all graphs in a directory and calculates their Graphlet-based Degree #' Distributions (GDDs) #' -#' Loads graphs from all files matching the given pattern in the given directory, -#' converts them to indexed edge lists compatible with the ORCA fast orbit -#' counting package and calculates the specified set of graphlet-based degree -#' distributions usingthe ORCA package. +#' Loads graphs from all files matching the given pattern in the given +#' directory, converts them to indexed edge lists compatible with the ORCA fast +#' orbit counting package and calculates the specified set of graphlet-based +#' degree distributions usingthe ORCA package. #' @param source_dir Path to graph directory #' @param format Format of graph files #' @param pattern Filename pattern to match graph files @@ -594,9 +611,12 @@ graphlet_ids_for_size <- function(graphlet_size) { #' graph from the source directory. Each set of GDDs is itself a named list, #' where each GDD element is a \code{dhist} discrete histogram object. #' @export -gdd_for_all_graphs <- function( - source_dir, format = "ncol", pattern = ".txt", feature_type = "orbit", - max_graphlet_size = 4, ego_neighbourhood_size = 0, +gdd_for_all_graphs <- function(source_dir, + format = "ncol", + pattern = ".txt", + feature_type = "orbit", + max_graphlet_size = 4, + ego_neighbourhood_size = 0, mc.cores = getOption("mc.cores", 2L)) { # Create function to read graph from file and generate GDD graphs <- read_simple_graphs( @@ -647,7 +667,7 @@ cross_comparison_spec <- function(named_list, how = "many-to-many") { } else { indexes <- as.data.frame(t(utils::combn(1:length(named_list), 2))) } - + names <- as.data.frame(cbind( names(named_list)[indexes[, 1]], names(named_list)[indexes[, 2]] @@ -669,17 +689,24 @@ cross_comparison_spec <- function(named_list, how = "many-to-many") { #' \code{index_b = j} #' @export cross_comp_to_matrix <- function(measure, cross_comparison_spec) { - num_items <- max(c(cross_comparison_spec$index_a, cross_comparison_spec$index_b)) + num_items <- max(c(cross_comparison_spec$index_a, + cross_comparison_spec$index_b)) out <- matrix(data = 0, nrow = num_items, ncol = num_items) - out[cbind(cross_comparison_spec$index_a, cross_comparison_spec$index_b)] <- measure - out[cbind(cross_comparison_spec$index_b, cross_comparison_spec$index_a)] <- measure + out[cbind(cross_comparison_spec$index_a, + cross_comparison_spec$index_b)] <- measure + out[cbind(cross_comparison_spec$index_b, + cross_comparison_spec$index_a)] <- measure row_labels <- rep("", num_items) - row_labels[cross_comparison_spec$index_a] <- as.character(cross_comparison_spec$name_a) - row_labels[cross_comparison_spec$index_b] <- as.character(cross_comparison_spec$name_b) + row_labels[cross_comparison_spec$index_a] <- as.character( + cross_comparison_spec$name_a) + row_labels[cross_comparison_spec$index_b] <- as.character( + cross_comparison_spec$name_b) rownames(out) <- row_labels col_labels <- rep("", num_items) - col_labels[cross_comparison_spec$index_a] <- as.character(cross_comparison_spec$name_a) - col_labels[cross_comparison_spec$index_b] <- as.character(cross_comparison_spec$name_b) + col_labels[cross_comparison_spec$index_a] <- as.character( + cross_comparison_spec$name_a) + col_labels[cross_comparison_spec$index_b] <- as.character( + cross_comparison_spec$name_b) colnames(out) <- col_labels return(out) } From 9a1dd4ba0d985d8ebf1bad65e109b3f11ae67198 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 11 Oct 2019 17:42:16 +0100 Subject: [PATCH 065/188] update some docstrings --- R/measures_net_dis.R | 27 +++++++++---------- man/count_graphlet_tuples.Rd | 5 ++-- man/count_graphlet_tuples_ego.Rd | 5 ++-- man/count_graphlets_ego.Rd | 5 ++-- man/count_graphlets_for_graph.Rd | 5 ++-- man/count_graphlets_per_node.Rd | 5 ++-- man/count_orbits_per_node.Rd | 5 ++-- man/density_binned_counts.Rd | 9 +++---- man/density_from_counts.Rd | 2 +- man/ego_to_graphlet_counts.Rd | 6 +++-- man/gdd.Rd | 11 ++++---- man/gdd_for_all_graphs.Rd | 8 +++--- man/graph_features_to_histograms.Rd | 2 +- man/netdis_centred_graphlet_counts.Rd | 2 +- man/netdis_expected_graphlet_counts.Rd | 5 ++-- ...netdis_expected_graphlet_counts_per_ego.Rd | 12 ++++----- man/netdis_many_to_many.Rd | 8 +++--- man/netdis_one_to_many.Rd | 6 ++--- man/netdis_one_to_one.Rd | 6 ++--- 19 files changed, 69 insertions(+), 65 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index c22a98ea..3ac3ec7e 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -613,17 +613,17 @@ netdis_centred_graphlet_counts <- function( #' generate a function for calculating expected ego-network graphlet counts #' from the statistics of a provided reference graph. #' -#' @param ego_networks The number of steps from the source node to include -#' node in ego-network. +#' @param graphlet_counts Matrix of graphlet and node counts (columns) for a +#' nummber of ego networks (rows). #' @param density_breaks Density values defining bin edges. #' @param density_binned_reference_counts Reference network graphlet counts for #' each density bin. #' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. #' @param scale_fn Optional function to scale calculated expected counts, taking -#' \code{graph} and \code{max_graphlet_size} as arguments, and returning a scale -#' factor that the looked up \code{density_binned_reference_counts} values will -#' be multiplied by. +#' \code{graphlet_counts} and \code{max_graphlet_size} as arguments, +#' and returning a scale factor that the looked up +#' \code{density_binned_reference_counts} values will be multiplied by. #' #' #' Temporarily accessible during development. #' TODO: Remove @export prior to publishing @@ -654,8 +654,7 @@ netdis_expected_graphlet_counts_per_ego <- function( #' calculate expected graphlet counts for a query graph #' ego-network from the statistics of a provided reference #' graph. -#' @param graph A connected, undirected, simple reference graph as an -#' \code{igraph} object. +#' @param graphlet_counts Node and graphlet counts for a graph. #' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. #' @param density_breaks Density values defining bin edges. @@ -746,15 +745,15 @@ single_density_bin <- function(densities) { #' INTERNAL FUNCTION - Do not call directly #' #' Used to calculate expected graphlet counts for each density bin. -#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param graphlet_counts Graphlet and node counts (columns) for a number of +#' ego_networks (rows). #' @param density_interval_indexes Density bin index for #' each ego network. #' @param agg_fn Function to aggregate counts in each bin #' (default \code{agg_fn = mean}). #' @param scale_fn Optional function to apply a transformation -#' to graphlet_counts, must have arguments graphlet_counts, -#' ego_networks and max_graphlet_size. -#' @param ego_networks Optionally passed and used by scale_fn. +#' to graphlet_counts, must have arguments graphlet_counts and +#' max_graphlet_size. #' @param max_graphlet_size Optionally passed and used by scale_fn. #' @export density_binned_counts <- function(graphlet_counts, @@ -868,7 +867,8 @@ scale_graphlet_count <- function(graphlet_count, graphlet_tuples) { #' Run count_graphlet_tuples across pre-computed ego networks. -#' @param ego_networks Pre-generated ego networks for an input graph. +#' @param graphlet_counts Matrix of graphlet and node counts (columns) for a +#' number of ego networks (rows). #' @param max_graphlet_size Determines the maximum size of graphlets included #' in the tuple counts. #' @export @@ -927,8 +927,7 @@ scale_graphlet_counts_ego <- function(graphlet_counts, #' For each graphlet calculate the number of possible sets of k nodes in the #' query graph, where k is the number of nodes in the graphlet. #' -#' @param graph A connected, undirected, simple graph as an \code{igraph} -#' object. +#' @param graph_graphlet_counts Node and graphlet counts for a single graph. #' @param max_graphlet_size Determines the maximum size of graphlets included #' in the tuple counts. #' @export diff --git a/man/count_graphlet_tuples.Rd b/man/count_graphlet_tuples.Rd index def9d19b..09ddcc14 100644 --- a/man/count_graphlet_tuples.Rd +++ b/man/count_graphlet_tuples.Rd @@ -8,11 +8,10 @@ query graph, where k is the number of nodes in the graphlet.} count_graphlet_tuples(graph_graphlet_counts, max_graphlet_size) } \arguments{ +\item{graph_graphlet_counts}{Node and graphlet counts for a single graph.} + \item{max_graphlet_size}{Determines the maximum size of graphlets included in the tuple counts.} - -\item{graph}{A connected, undirected, simple graph as an \code{igraph} -object.} } \description{ For each graphlet calculate the number of possible sets of k nodes in the diff --git a/man/count_graphlet_tuples_ego.Rd b/man/count_graphlet_tuples_ego.Rd index 12aa77ec..7466db81 100644 --- a/man/count_graphlet_tuples_ego.Rd +++ b/man/count_graphlet_tuples_ego.Rd @@ -7,10 +7,11 @@ count_graphlet_tuples_ego(graphlet_counts, max_graphlet_size) } \arguments{ +\item{graphlet_counts}{Matrix of graphlet and node counts (columns) for a +number of ego networks (rows).} + \item{max_graphlet_size}{Determines the maximum size of graphlets included in the tuple counts.} - -\item{ego_networks}{Pre-generated ego networks for an input graph.} } \description{ Run count_graphlet_tuples across pre-computed ego networks. diff --git a/man/count_graphlets_ego.Rd b/man/count_graphlets_ego.Rd index edd489db..ca9d4322 100644 --- a/man/count_graphlets_ego.Rd +++ b/man/count_graphlets_ego.Rd @@ -8,10 +8,11 @@ count_graphlets_ego(graph, max_graphlet_size = 4, neighbourhood_size, min_ego_nodes = 3, min_ego_edges = 1, return_ego_networks = FALSE) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graph}{A connected, undirected, simple graph as an \code{igraph} object} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted.} \item{neighbourhood_size}{The number of steps from the source node to include nodes for each ego-network.} diff --git a/man/count_graphlets_for_graph.Rd b/man/count_graphlets_for_graph.Rd index 696b78f9..c91c107e 100644 --- a/man/count_graphlets_for_graph.Rd +++ b/man/count_graphlets_for_graph.Rd @@ -7,10 +7,11 @@ count_graphlets_for_graph(graph, max_graphlet_size) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graph}{A connected, undirected, simple graph as an \code{igraph} object} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted.} } \value{ Vector containing counts of each graphlet for the graph. diff --git a/man/count_graphlets_per_node.Rd b/man/count_graphlets_per_node.Rd index 25fa1c67..cf38ea2f 100644 --- a/man/count_graphlets_per_node.Rd +++ b/man/count_graphlets_per_node.Rd @@ -7,10 +7,11 @@ count_graphlets_per_node(graph, max_graphlet_size) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graph}{A connected, undirected, simple graph as an \code{igraph} object} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted.} } \value{ ORCA-format matrix containing counts of each graphlet (columns) at diff --git a/man/count_orbits_per_node.Rd b/man/count_orbits_per_node.Rd index 76fd28cb..04d5fd35 100644 --- a/man/count_orbits_per_node.Rd +++ b/man/count_orbits_per_node.Rd @@ -7,10 +7,11 @@ count_orbits_per_node(graph, max_graphlet_size) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graph}{A connected, undirected, simple graph as an \code{igraph} object} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted.} } \value{ ORCA-format matrix containing counts of each graphlet diff --git a/man/density_binned_counts.Rd b/man/density_binned_counts.Rd index ecf1f0a3..3343676a 100644 --- a/man/density_binned_counts.Rd +++ b/man/density_binned_counts.Rd @@ -8,7 +8,8 @@ density_binned_counts(graphlet_counts, density_interval_indexes, agg_fn = mean, scale_fn = NULL, max_graphlet_size = NULL) } \arguments{ -\item{graphlet_counts}{Graphlet counts for a number of ego_networks.} +\item{graphlet_counts}{Graphlet and node counts (columns) for a number of +ego_networks (rows).} \item{density_interval_indexes}{Density bin index for each ego network.} @@ -17,12 +18,10 @@ each ego network.} (default \code{agg_fn = mean}).} \item{scale_fn}{Optional function to apply a transformation -to graphlet_counts, must have arguments graphlet_counts, -ego_networks and max_graphlet_size.} +to graphlet_counts, must have arguments graphlet_counts and +max_graphlet_size.} \item{max_graphlet_size}{Optionally passed and used by scale_fn.} - -\item{ego_networks}{Optionally passed and used by scale_fn.} } \description{ Used to calculate expected graphlet counts for each density bin. diff --git a/man/density_from_counts.Rd b/man/density_from_counts.Rd index e5c50f79..6436ca4e 100644 --- a/man/density_from_counts.Rd +++ b/man/density_from_counts.Rd @@ -7,7 +7,7 @@ density_from_counts(graphlet_counts) } \arguments{ -\item{graphlet_counts}{Vector of pre-calculated graphlet, edge and node +\item{graphlet_counts}{Vector of pre-calculated graphlet, edge and node counts. Must have named items "N" (node counts) and "G0" (edge counts).} } \description{ diff --git a/man/ego_to_graphlet_counts.Rd b/man/ego_to_graphlet_counts.Rd index a94ff274..7ae7260e 100644 --- a/man/ego_to_graphlet_counts.Rd +++ b/man/ego_to_graphlet_counts.Rd @@ -10,11 +10,13 @@ ego_to_graphlet_counts(ego_networks, max_graphlet_size = 4) \item{ego_networks}{Named list of ego networks for a graph.} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted.} } \value{ returns an RxC matrix -containing counts of each graphlet (columns, C) for each ego-network (rows, R). +containing counts of each graphlet (columns, C) for each ego-network +(rows, R). Columns are labelled with graphlet IDs and rows are labelled with the ID of the central node in each ego-network. } diff --git a/man/gdd.Rd b/man/gdd.Rd index e33a8699..961be4f9 100644 --- a/man/gdd.Rd +++ b/man/gdd.Rd @@ -8,23 +8,24 @@ gdd(graph, feature_type = "orbit", max_graphlet_size = 4, ego_neighbourhood_size = 0) } \arguments{ -\item{graph}{A connected, undirected, simple graph as an \code{igraph} object.} +\item{graph}{A connected, undirected, simple graph as an \code{igraph} object} \item{feature_type}{Type of graphlet-based feature to count: "graphlet" counts the number of graphlets each node participates in; "orbit" calculates the number of graphlet orbits each node participates in.} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. -Only graphlets containing up to \code{max_graphlet_size} nodes will be counted.} +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted.} -\item{ego_neighbourhood_size}{The number of steps from the source node to include -nodes for each ego-network.} +\item{ego_neighbourhood_size}{The number of steps from the source node to +include nodes for each ego-network.} } \value{ List of graphlet-based degree distributions, with each distribution represented as a \code{dhist} discrete histogram object. } \description{ -Generates graphlet-based degree distributions from \code{igraph} graph object, +Generates graphlet-based degree distributions from \code{igraph} graph object using the ORCA fast graphlet orbit counting package. } diff --git a/man/gdd_for_all_graphs.Rd b/man/gdd_for_all_graphs.Rd index 98e55666..2f3c6c7b 100644 --- a/man/gdd_for_all_graphs.Rd +++ b/man/gdd_for_all_graphs.Rd @@ -36,8 +36,8 @@ graph from the source directory. Each set of GDDs is itself a named list, where each GDD element is a \code{dhist} discrete histogram object. } \description{ -Loads graphs from all files matching the given pattern in the given directory, -converts them to indexed edge lists compatible with the ORCA fast orbit -counting package and calculates the specified set of graphlet-based degree -distributions usingthe ORCA package. +Loads graphs from all files matching the given pattern in the given +directory, converts them to indexed edge lists compatible with the ORCA fast +orbit counting package and calculates the specified set of graphlet-based +degree distributions usingthe ORCA package. } diff --git a/man/graph_features_to_histograms.Rd b/man/graph_features_to_histograms.Rd index 2479d593..82014cf4 100644 --- a/man/graph_features_to_histograms.Rd +++ b/man/graph_features_to_histograms.Rd @@ -8,7 +8,7 @@ each feature.} graph_features_to_histograms(features_matrix) } \arguments{ -\item{features_matrix}{A number of nodes (rows) by number of features +\item{features_matrix}{A number of nodes (rows) by number of features (columns) matrix, where the ij entry is the score of node i on feature j (e.g. for ORCA output this is counts of each graphlet or orbit at each graph vertex)} diff --git a/man/netdis_centred_graphlet_counts.Rd b/man/netdis_centred_graphlet_counts.Rd index e4e932af..7ce5d9ae 100644 --- a/man/netdis_centred_graphlet_counts.Rd +++ b/man/netdis_centred_graphlet_counts.Rd @@ -10,7 +10,7 @@ netdis_centred_graphlet_counts(graphlet_counts, exp_graphlet_counts, \arguments{ \item{graphlet_counts}{Ego network graphlet counts for a query graph} -\item{exp_graphlet_counts}{Pre-calculated expected counts for each graphlet +\item{exp_graphlet_counts}{Pre-calculated expected counts for each graphlet type for each ego network.} \item{max_graphlet_size}{max graphlet size to calculate centred counts for.} diff --git a/man/netdis_expected_graphlet_counts.Rd b/man/netdis_expected_graphlet_counts.Rd index 16e05899..4c94c5b2 100644 --- a/man/netdis_expected_graphlet_counts.Rd +++ b/man/netdis_expected_graphlet_counts.Rd @@ -8,6 +8,8 @@ netdis_expected_graphlet_counts(graphlet_counts, max_graphlet_size, density_breaks, density_binned_reference_counts, scale_fn = NULL) } \arguments{ +\item{graphlet_counts}{Node and graphlet counts for a graph.} + \item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} @@ -22,9 +24,6 @@ factor that the looked up \code{density_binned_reference_counts} values will be multiplied by. Temporarily accessible during development. TODO: Remove @export prior to publishing} - -\item{graph}{A connected, undirected, simple reference graph as an -\code{igraph} object.} } \description{ Used by \code{netdis_expected_graphlet_counts_ego} to diff --git a/man/netdis_expected_graphlet_counts_per_ego.Rd b/man/netdis_expected_graphlet_counts_per_ego.Rd index d1ca4346..f96707eb 100644 --- a/man/netdis_expected_graphlet_counts_per_ego.Rd +++ b/man/netdis_expected_graphlet_counts_per_ego.Rd @@ -8,6 +8,9 @@ netdis_expected_graphlet_counts_per_ego(graphlet_counts, density_breaks, density_binned_reference_counts, max_graphlet_size, scale_fn = NULL) } \arguments{ +\item{graphlet_counts}{Matrix of graphlet and node counts (columns) for a +nummber of ego networks (rows).} + \item{density_breaks}{Density values defining bin edges.} \item{density_binned_reference_counts}{Reference network graphlet counts for @@ -17,15 +20,12 @@ each density bin.} Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} \item{scale_fn}{Optional function to scale calculated expected counts, taking -\code{graph} and \code{max_graphlet_size} as arguments, and returning a scale -factor that the looked up \code{density_binned_reference_counts} values will -be multiplied by. +\code{graphlet_counts} and \code{max_graphlet_size} as arguments, +and returning a scale factor that the looked up +\code{density_binned_reference_counts} values will be multiplied by. #' Temporarily accessible during development. TODO: Remove @export prior to publishing} - -\item{ego_networks}{The number of steps from the source node to include -node in ego-network.} } \description{ Used by \code{netdis_expected_graphlet_counts_ego_fn} to diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd index d4f1bf22..983dbafb 100644 --- a/man/netdis_many_to_many.Rd +++ b/man/netdis_many_to_many.Rd @@ -20,12 +20,12 @@ netdis_many_to_many(graphs = NULL, ref_graph = 0, graphs excluding loops, multiple edges and isolated vertices), such as those obtained by using \code{read_simple_graphs}.} -\item{ref_graph}{Controls how expected counts are calculated. Either: -1) A numeric value - used as a constant expected counts value for all query +\item{ref_graph}{Controls how expected counts are calculated. Either: +1) A numeric value - used as a constant expected counts value for all query graphs (DEFAULT: 0). 2) A simplified \code{igraph} object - used as a reference graph from which expected counts are calculated for all query graphs. -3) NULL - Expected counts will be calculated based on the properties of the +3) NULL - Expected counts will be calculated based on the properties of the query graphs themselves.} \item{comparisons}{Which comparisons to perform between graphs. @@ -64,7 +64,7 @@ Takes \code{ego_networks}, \code{density_bin_breaks}, the original netdis paper).} \item{graphlet_counts}{Pre-generated graphlet counts. If the -\code{graphlet_counts} argument is defined then \code{graphs} will not be +\code{graphlet_counts} argument is defined then \code{graphs} will not be used. A named list of matrices containing counts of each graphlet (columns) for each ego-network in the input graph (rows). Columns are labelled with diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd index efeb74f7..5386dc17 100644 --- a/man/netdis_one_to_many.Rd +++ b/man/netdis_one_to_many.Rd @@ -22,12 +22,12 @@ all graphs in graphs_compare. A simplified igraph graph object.} \item{graphs_compare}{Graphs graph_1 will be compared with. A named list of simplified igraph graph objects.} -\item{ref_graph}{Controls how expected counts are calculated. Either: -1) A numeric value - used as a constant expected counts value for all query +\item{ref_graph}{Controls how expected counts are calculated. Either: +1) A numeric value - used as a constant expected counts value for all query graphs (DEFAULT: 0). 2) A simplified \code{igraph} object - used as a reference graph from which expected counts are calculated for all query graphs. -3) NULL - Expected counts will be calculated based on the properties of the +3) NULL - Expected counts will be calculated based on the properties of the query graphs themselves.} \item{max_graphlet_size}{Generate graphlets up to this size.} diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd index 14ffcb90..5b3a5c66 100644 --- a/man/netdis_one_to_one.Rd +++ b/man/netdis_one_to_one.Rd @@ -20,12 +20,12 @@ netdis_one_to_one(graph_1 = NULL, graph_2 = NULL, ref_graph = 0, \item{graph_2}{A simplified igraph graph object.} -\item{ref_graph}{Controls how expected counts are calculated. Either: -1) A numeric value - used as a constant expected counts value for all query +\item{ref_graph}{Controls how expected counts are calculated. Either: +1) A numeric value - used as a constant expected counts value for all query graphs (DEFAULT: 0). 2) A simplified \code{igraph} object - used as a reference graph from which expected counts are calculated for all query graphs. -3) NULL - Expected counts will be calculated based on the properties of the +3) NULL - Expected counts will be calculated based on the properties of the query graphs themselves.} \item{max_graphlet_size}{Generate graphlets up to this size.} From 678e93febff06970a92a11c5d208191d65222b44 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 15 Oct 2019 17:09:41 +0100 Subject: [PATCH 066/188] couple of tests to improve coverage --- tests/testthat/test_measures_net_dis.R | 93 ++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 5a8bc4e2..d0e686f0 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1234,3 +1234,96 @@ test_that("netdis_one_to_many gives expected result", { # Check results as expected expect_equal(expected_netdis, actual_netdis, tolerance = .001, scale = 1) }) + +context("Netdis: error if no query graphs or graphlet counts provided") +test_that("netdis functions error when no query graphs provided", { + # dummy values to use for other parameters + ref_graph <- NULL + max_graphlet_size <- 4 + neighbourhood_size <- 2 + min_ego_nodes <- 3 + min_ego_edges <- 1 + comparisons <- "many-to-many" + + # one to one function + expect_error( + netdis_one_to_one( + graph_1 = NULL, + graph_2 = NULL, + ref_graph = ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + graphlet_counts_1 = NULL, + graphlet_counts_2 = NULL + ) + ) + + # one to many function + expect_error( + netdis_one_to_many( + graph_1 = NULL, + graphs_compare = NULL, + ref_graph = ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + graphlet_counts_1 = NULL, + graphlet_counts_compare = NULL + ) + ) + + # many to many function + expect_error( + netdis_many_to_many( + graphs = NULL, + comparisons = comparisons, + ref_graph = ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + graphlet_counts = NULL + ) + ) +}) + +context("Netdis: constant expected counts") +test_that("netdis_many_to_many correctly interprets numeric ref_graph value", { + # TODO + +}) + +context("Netdis: expected counts using query networks") +test_that("netdis_many_to_many calculates expected counts from query networks if + ref_graph is NULL", { + # TODO + +}) + +context("Netdis: error if max_graphlet_size is not 3, 4 or 5") +test_that("netdis_uptok errors for unsupported max_graphlet_size", { + # dummy counts values + counts_1 <- c(11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, + 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14) + counts_2 <- c(12, 11, 6, 10, 15, 7, 10, 8, 7, 7, 7, 13, 9, 14, 7, 12, + 7, 10, 9, 11, 7, 7, 11, 8, 10, 14, 8, 16, 14, 10) + ids <- graphlet_key(5)$id + names(counts_1) <- ids + names(counts_2) <- ids + + # graphlet size greater than 5 + expect_error(netdis_uptok(counts_1, counts_2, 6)) + + # graphlet size less than 3 + expect_error(netdis_uptok(counts_1, counts_2, 2)) + +}) + +context("Netdis: works correctly when using a single density bin") +test_that("netdis single density bin works correctly", { + # TODO + +}) From ab03a0090ba154d06b9bdb1d2507159195000664 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 17 Oct 2019 15:19:06 +0100 Subject: [PATCH 067/188] add additional cases to argument checking test --- tests/testthat/test_measures_net_dis.R | 52 ++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index d0e686f0..0a0fe655 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1245,10 +1245,29 @@ test_that("netdis functions error when no query graphs provided", { min_ego_edges <- 1 comparisons <- "many-to-many" + elist <- rbind( + c("n1", "n2"), + c("n2", "n3"), + c("n1", "n4"), + c("n2", "n5"), + c("n1", "n6"), + c("n1", "n7"), + c("n2", "n4"), + c("n4", "n6"), + c("n6", "n8"), + c("n7", "n8"), + c("n7", "n9"), + c("n7", "n10"), + c("n8", "n9"), + c("n8", "n10"), + c("n9", "n10") + ) + graph <- igraph::graph_from_edgelist(elist, directed = FALSE) + # one to one function expect_error( netdis_one_to_one( - graph_1 = NULL, + graph_1 = graph, graph_2 = NULL, ref_graph = ref_graph, max_graphlet_size = max_graphlet_size, @@ -1259,11 +1278,24 @@ test_that("netdis functions error when no query graphs provided", { graphlet_counts_2 = NULL ) ) - + expect_error( + netdis_one_to_one( + graph_1 = NULL, + graph_2 = graph, + ref_graph = ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + graphlet_counts_1 = NULL, + graphlet_counts_2 = NULL + ) + ) + # one to many function expect_error( netdis_one_to_many( - graph_1 = NULL, + graph_1 = graph, graphs_compare = NULL, ref_graph = ref_graph, max_graphlet_size = max_graphlet_size, @@ -1274,7 +1306,19 @@ test_that("netdis functions error when no query graphs provided", { graphlet_counts_compare = NULL ) ) - + expect_error( + netdis_one_to_many( + graph_1 = NULL, + graphs_compare = list(graph_1 = graph, graph_2 = graph), + ref_graph = ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + graphlet_counts_1 = NULL, + graphlet_counts_compare = NULL + ) + ) # many to many function expect_error( netdis_many_to_many( From 77cfb01f3d9872a687b4a063a715322e42dbb334 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 18 Oct 2019 17:26:03 +0100 Subject: [PATCH 068/188] update some docstrings --- R/measures_net_dis.R | 68 +++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 3ac3ec7e..196a4a8e 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -512,10 +512,11 @@ netdis_many_to_many <- function(graphs = NULL, } -#' Netdis +#' Netdis - for one graphlet size #' #' Calculate Netdis statistic between two graphs from their Centred Graphlet -#' Counts (generated using \code{netdis_centred_graphlet_counts}). +#' Counts (generated using \code{netdis_centred_graphlet_counts}) for graphlets +#' of size \code{graphlet_size}. #' @param centred_graphlet_counts1 Centred Graphlet Counts for graph 1 #' @param centred_graphlet_counts2 Centred Graphlet Counts for graph 2 #' @param graphlet_size The size of graphlets to use for the Netdis calculation @@ -543,10 +544,11 @@ netdis <- function(centred_graphlet_counts1, centred_graphlet_counts2, 0.5 * (1 - netds2) } -#' Netdis - graphlets up to max_graphlet_size +#' Netdis - for all graphlet sizes up to max_graphlet_size #' #' Calculate Netdis statistic between two graphs from their Centred Graphlet -#' Counts (generated using \code{netdis_centred_graphlet_counts}). +#' Counts (generated using \code{netdis_centred_graphlet_counts}) for all +#' graphlet sizes up to \code{max_graphlet_size}. #' @param centred_graphlet_counts1 Centred Graphlet Counts for graph 1 #' @param centred_graphlet_counts2 Centred Graphlet Counts for graph 2 #' @param max_graphlet_size max graphlet size to calculate Netdis for. @@ -586,8 +588,7 @@ netdis_uptok <- function(centred_graphlet_counts1, centred_graphlet_counts2, #' Centre counts by subtracting expected graphlet counts from actual graphlet #' counts. #' @param graphlet_counts Ego network graphlet counts for a query graph -#' @param exp_graphlet_counts Pre-calculated expected counts for each graphlet -#' type for each ego network. +#' @param exp_graphlet_counts Expected graphlet counts for each ego network. #' @param max_graphlet_size max graphlet size to calculate centred counts for. #' @return graphlet_counts minus exp_graphlet_counts for graphlets up to size #' max_graphlet_size. @@ -607,11 +608,10 @@ netdis_centred_graphlet_counts <- function( } -#' INTERNAL FUNCTION - Do not call directly -#' -#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to -#' generate a function for calculating expected ego-network graphlet counts -#' from the statistics of a provided reference graph. +#' netdis_expected_graphlet_counts_per_ego +#' +#' Calculates expected graphlet counts for each ego network based on its density +#' and pre-calculated reference density bins and graphlet counts for each bin. #' #' @param graphlet_counts Matrix of graphlet and node counts (columns) for a #' nummber of ego networks (rows). @@ -648,22 +648,23 @@ netdis_expected_graphlet_counts_per_ego <- function( expected_graphlet_counts } +#' netdis_expected_graphlet_counts #' INTERNAL FUNCTION - Do not call directly +#' +#' Calculates expected graphlet counts for one ego network based on its density +#' and pre-calculated reference density bins and graphlet counts for each bin. #' -#' Used by \code{netdis_expected_graphlet_counts_ego} to -#' calculate expected graphlet counts for a query graph -#' ego-network from the statistics of a provided reference -#' graph. -#' @param graphlet_counts Node and graphlet counts for a graph. +#' @param graphlet_counts Node and graphlet counts for an ego network. #' @param max_graphlet_size Determines the maximum size of graphlets to count. #' Only graphlets containing up to \code{max_graphlet_size} nodes are counted. #' @param density_breaks Density values defining bin edges. #' @param density_binned_reference_counts Reference network graphlet counts for #' each density bin. #' @param scale_fn Optional function to scale calculated expected counts, taking -#' \code{graph} and \code{max_graphlet_size} as arguments, and returning a scale -#' factor that the looked up \code{density_binned_reference_counts} values will -#' be multiplied by. +#' \code{graphlet_counts} and \code{max_graphlet_size} as arguments, and +#' returning a scale factor that the looked up +#' \code{density_binned_reference_counts} values will be multiplied by. +#' #' Temporarily accessible during development. #' TODO: Remove @export prior to publishing #' @export @@ -692,18 +693,17 @@ netdis_expected_graphlet_counts <- function(graphlet_counts, matched_reference_counts } -#' INTERNAL FUNCTION - Do not call directly +#' mean_density_binned_graphlet_counts +#' +#' Calculate mean (dy default) graphlet counts for ego networks in each density +#' bin. #' -#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to -#' generate a function for calculating expected ego-network graphlet counts -#' from the statistics of a provided reference graph. #' @param graphlet_counts Graphlet counts for a number of ego_networks. #' @param density_interval_indexes Density bin index for -#' each ego network. +#' each ego network in graphlet_counts. #' @param agg_fn Function to aggregate counts in each bin #' (default \code{agg_fn = mean}). -#' Temporarily accessible during development. -#' TODO: Remove @export prior to publishing +#' #' @export mean_density_binned_graphlet_counts <- function(graphlet_counts, density_interval_indexes, @@ -732,7 +732,7 @@ mean_density_binned_graphlet_counts <- function(graphlet_counts, } #' For case where don't want to use binning, return a single bin which covers -#' the full range of possible density values. +#' the full range of possible density values (0 to 1). #' @param densities Ego network density values (only used to return #' a list of indexes of the required length.) #' @export @@ -742,19 +742,21 @@ single_density_bin <- function(densities) { breaks = c(0, 1)) } -#' INTERNAL FUNCTION - Do not call directly -#' -#' Used to calculate expected graphlet counts for each density bin. +#' Used to calculate aggregated graphlet counts for each density bin. +#' #' @param graphlet_counts Graphlet and node counts (columns) for a number of #' ego_networks (rows). #' @param density_interval_indexes Density bin index for #' each ego network. #' @param agg_fn Function to aggregate counts in each bin #' (default \code{agg_fn = mean}). -#' @param scale_fn Optional function to apply a transformation -#' to graphlet_counts, must have arguments graphlet_counts and -#' max_graphlet_size. +#' @param scale_fn Optional function to apply a transformation/scaling +#' to the raw graphlet_counts. Must have arguments \code{graphlet_counts} and +#' \code{max_graphlet_size}, and return a transformed \code{graphlet_counts} +#' object with the same number of rows as the input, and columns for all +#' graphlets up to \code{max_graphlet_size}. #' @param max_graphlet_size Optionally passed and used by scale_fn. +#' #' @export density_binned_counts <- function(graphlet_counts, density_interval_indexes, From 4741511c9be5e8bc47dd5d518c363554667eaa58 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 7 Nov 2019 17:40:24 +0000 Subject: [PATCH 069/188] started refactor to avoid storing all expected counts in memory --- R/measures_net_dis.R | 91 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 196a4a8e..41b6fe7e 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -597,7 +597,7 @@ netdis_centred_graphlet_counts <- function( graphlet_counts, exp_graphlet_counts, max_graphlet_size) { - + # extract columns for graphlets up to size max_graphlet_size id <- graphlet_key(max_graphlet_size)$id graphlet_counts <- graphlet_counts[, id] @@ -608,6 +608,95 @@ netdis_centred_graphlet_counts <- function( } +# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +netdis_centred_graphlet_counts_new <- function( + graphlet_counts, + ref_ego_density_bins, + ref_binned_graphlet_counts, + binning_fn, + bin_counts_fn, + exp_counts_fn, + max_graphlet_size) { + + ## ------------------------------------------------------------------------ + # If a number has been passed as ref_binned_graphlet_counts, treat it as a + # constant expected counts value (e.g. if ref_binned_graphlet_counts = 0 + # then no centring of counts). + if (is.numeric(ref_binned_graphlet_counts)) { + exp_graphlet_counts <- netdis_const_expected_counts( + graphlet_counts, + const = ref_graph) + + ## ------------------------------------------------------------------------ + # If reference bins and counts passed, use them to calculate + # expected counts + } else if (!is.null(ref_ego_density_bins) && + !is.null(ref_binned_graphlet_counts)) { + + # Calculate expected graphlet counts (using ref + # graph ego network density bins) + exp_graphlet_counts <- exp_counts_fn( + graphlet_counts, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size = max_graphlet_size) + + ## ------------------------------------------------------------------------ + # If NULL passed as ref bins and counts, calculate expected counts using + # query network itself. + } else if (is.null(ref_ego_density_bins) && + is.null(ref_binned_graphlet_counts)) { + # Get ego-network densities + densities <- ego_network_density(graphlet_counts) + + # bin ref ego-network densities + binned_densities <- binning_fn(densities, + binning_fn) + + # extract bin breaks and indexes from binning results + ego_density_bin_breaks <- binned_densities$breaks + ego_density_bin_indexes <- binned_densities$interval_indexes + + # Calculate expected counts in each bin + binned_graphlet_counts <- bin_counts_fn( + graphlet_counts, + ego_density_bin_indexes, + max_graphlet_size = max_graphlet_size) + + # Calculate expected graphlet counts for each ego network + exp_graphlet_counts <- exp_counts_fn( + graphlet_counts, + ego_density_bin_breaks, + binned_graphlet_counts, + max_graphlet_size = max_graphlet_size) + + ## ------------------------------------------------------------------------ + # Invalid combination of ref_ego_density_bins and ref_binned_graphlet_counts + } else { + stop("Invalid combination of ref_ego_density_bins and + ref_binned_graphlet_counts. Options are: + - Both NULL: calculate expected counts using query network. + - List of bin edges and matrix of binned counts: Reference graph values + for calculating expected counts. + - Constant numeric ref_binned_graphlet_counts: Use as constant expected + counts value.") + } + + ## ------------------------------------------------------------------------ + # Centre counts + # extract columns for graphlets up to size max_graphlet_size + id <- graphlet_key(max_graphlet_size)$id + graphlet_counts <- graphlet_counts[, id] + exp_graphlet_counts <- exp_graphlet_counts[, id] + + # Subtract expected counts from actual graphlet counts + graphlet_counts - exp_graphlet_counts + +} +# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + #' netdis_expected_graphlet_counts_per_ego #' #' Calculates expected graphlet counts for each ego network based on its density From bf681ee7f5ed9cb9b5439e9a80b40c1e70f8e522 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Fri, 8 Nov 2019 17:31:21 +0000 Subject: [PATCH 070/188] new centred counts function working --- NAMESPACE | 1 + R/measures_net_dis.R | 184 ++++++++++-------- man/density_binned_counts.Rd | 12 +- man/mean_density_binned_graphlet_counts.Rd | 13 +- man/netdis.Rd | 5 +- man/netdis_centred_graphlet_counts.Rd | 36 +++- man/netdis_expected_graphlet_counts.Rd | 18 +- ...netdis_expected_graphlet_counts_per_ego.Rd | 7 +- man/netdis_subtract_exp_counts.Rd | 24 +++ man/netdis_uptok.Rd | 5 +- man/single_density_bin.Rd | 4 +- vignettes/netdis_2graphs_polya-aeppli.R | 8 +- vignettes/netdis_2graphs_polya-aeppli.Rmd | 8 +- vignettes/netdis_2graphs_polya-aeppli.html | 12 +- vignettes/quickstart_netdis_2graphs.R | 12 +- vignettes/quickstart_netdis_2graphs.Rmd | 13 +- vignettes/quickstart_netdis_2graphs.html | 16 +- 17 files changed, 223 insertions(+), 155 deletions(-) create mode 100644 man/netdis_subtract_exp_counts.Rd diff --git a/NAMESPACE b/NAMESPACE index e4e578d1..fcd4939d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -57,6 +57,7 @@ export(netdis_expected_graphlet_counts_per_ego) export(netdis_many_to_many) export(netdis_one_to_many) export(netdis_one_to_one) +export(netdis_subtract_exp_counts) export(netdis_uptok) export(normalise_dhist_mass) export(normalise_dhist_variance) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 41b6fe7e..dbc13919 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -389,12 +389,20 @@ netdis_many_to_many <- function(graphs = NULL, rm(graphs) ## ------------------------------------------------------------------------ + # Centre counts # If a number has been passed as ref_graph, treat it as a constant expected # counts value (e.g. if ref_graph = 0 then no centring of counts). - if (is.numeric(ref_graph)) { - exp_graphlet_counts <- purrr::map(graphlet_counts, - netdis_const_expected_counts, - const = ref_graph) + if (is.numeric(ref_graph) && length(ref_graph) == 1) { + + centred_graphlet_counts <- purrr::map( + graphlet_counts, + netdis_centred_graphlet_counts, + ref_ego_density_bins = NULL, + ref_binned_graphlet_counts = ref_graph, + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, + max_graphlet_size = max_graphlet_size) ## ------------------------------------------------------------------------ # If a reference graph passed, use it to calculate expected counts for all @@ -429,67 +437,42 @@ netdis_many_to_many <- function(graphs = NULL, binned_densities$interval_indexes, max_graphlet_size = max_graphlet_size ) - - # Calculate expected graphlet counts (using ref - # graph ego network density bins) - exp_graphlet_counts <- purrr::map( + + # Calculate centred counts using ref graph + centred_graphlet_counts <- purrr::map( graphlet_counts, - exp_counts_fn, - density_breaks = ref_ego_density_bins, - density_binned_reference_counts = ref_binned_graphlet_counts, - max_graphlet_size = max_graphlet_size - ) + netdis_centred_graphlet_counts, + ref_ego_density_bins = ref_ego_density_bins, + ref_binned_graphlet_counts = ref_binned_graphlet_counts, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn, + max_graphlet_size = max_graphlet_size) ## ------------------------------------------------------------------------ # If no reference passed, calculate expected counts using query networks # themselves. } else { - # Get ego-network densities - densities <- purrr::map(graphlet_counts, - ego_network_density) - - # bin ref ego-network densities - binned_densities <- purrr::map(densities, - binning_fn) - - # extract bin breaks and indexes from binning results - ego_density_bin_breaks <- purrr::map(binned_densities, - function(x) { - x$breaks - }) - ego_density_bin_indexes <- purrr::map(binned_densities, - function(x) { - x$interval_indexes - }) - - - # Calculate expected counts in each bin - binned_graphlet_counts <- mapply(bin_counts_fn, - graphlet_counts, - ego_density_bin_indexes, - max_graphlet_size = max_graphlet_size, - SIMPLIFY = FALSE) - - # Calculate expected graphlet counts for each ego network - exp_graphlet_counts <- mapply(exp_counts_fn, - graphlet_counts, - ego_density_bin_breaks, - binned_graphlet_counts, - max_graphlet_size = max_graphlet_size, - SIMPLIFY = FALSE) + + centred_graphlet_counts <- purrr::map( + graphlet_counts, + netdis_centred_graphlet_counts, + ref_ego_density_bins = NULL, + ref_binned_graphlet_counts = NULL, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn, + max_graphlet_size = max_graphlet_size) + } - - ## ------------------------------------------------------------------------ - # Centre graphlet counts by subtracting expected counts - centred_graphlet_counts <- mapply(netdis_centred_graphlet_counts, - graphlet_counts, - exp_graphlet_counts, - max_graphlet_size = max_graphlet_size) - + rm(graphlet_counts) + ## ------------------------------------------------------------------------ # Sum centred graphlet counts across all ego networks sum_graphlet_counts <- lapply(centred_graphlet_counts, colSums) - + + rm(centred_graphlet_counts) + ## ------------------------------------------------------------------------ # Generate pairwise comparisons comp_spec <- cross_comparison_spec(sum_graphlet_counts, how = comparisons) @@ -585,32 +568,44 @@ netdis_uptok <- function(centred_graphlet_counts1, centred_graphlet_counts2, #' netdis_centred_graphlet_counts #' -#' Centre counts by subtracting expected graphlet counts from actual graphlet -#' counts. +#' Calculate expected graphlet counts for each ego network in a query graph and +#' centre the actual counts by subtracting those calculated expected count +#' values. #' @param graphlet_counts Ego network graphlet counts for a query graph -#' @param exp_graphlet_counts Expected graphlet counts for each ego network. +#' +#' @param ref_ego_density_bins Either a list of previously calculated ego +#' network density bin edges from a reference network, or \code{NULL}, in +#' which case density bins are generated using the query graph itself. +#' +#' @param ref_binned_graphlet_counts Either expected graphlet counts for each +#' ego network density bin from a reference network (a matrix with columns +#' labelled by graphlet ID and rows by density bin index), \code{NULL}, in +#' which case density binned counts are generated using the query graph itself, +#' or a constant numeric value to subtract from all graphlet counts. +#' +#' @param binning_fn Function used to bin ego network densities. Only needed if +#' \code{ref_ego_density_bins} and \code{ref_binned_graphlet_counts} are +#' \code{NULL}. Takes densities as its single argument, and returns a named list +#' including keys \code{breaks} (list of bin edges) and \code{interval_indexes} +#' (density bin index for each ego network). +#' +#' @param bin_counts_fn Function used to calculate expected graphlet counts in +#' each density bin. Only needed if \code{ref_ego_density_bins} and +#' \code{ref_binned_graphlet_counts} are \code{NULL}. Takes +#' \code{graphlet_counts}, \code{interval_indexes} (bin indexes) and +#' \code{max_graphlet_size} as arguments. +#' +#' @param exp_counts_fn Function used to map from binned reference counts to +#' expected counts for each graphlet in each ego network of the query graphs. +#' Takes \code{ego_networks}, \code{density_bin_breaks}, +#' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. +#' #' @param max_graphlet_size max graphlet size to calculate centred counts for. +#' #' @return graphlet_counts minus exp_graphlet_counts for graphlets up to size #' max_graphlet_size. #' @export netdis_centred_graphlet_counts <- function( - graphlet_counts, - exp_graphlet_counts, - max_graphlet_size) { - - # extract columns for graphlets up to size max_graphlet_size - id <- graphlet_key(max_graphlet_size)$id - graphlet_counts <- graphlet_counts[, id] - exp_graphlet_counts <- exp_graphlet_counts[, id] - - # centre counts - graphlet_counts - exp_graphlet_counts - -} - -# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -netdis_centred_graphlet_counts_new <- function( graphlet_counts, ref_ego_density_bins, ref_binned_graphlet_counts, @@ -623,17 +618,17 @@ netdis_centred_graphlet_counts_new <- function( # If a number has been passed as ref_binned_graphlet_counts, treat it as a # constant expected counts value (e.g. if ref_binned_graphlet_counts = 0 # then no centring of counts). - if (is.numeric(ref_binned_graphlet_counts)) { + if (is.numeric(ref_binned_graphlet_counts) && + length(ref_binned_graphlet_counts) == 1) { exp_graphlet_counts <- netdis_const_expected_counts( graphlet_counts, - const = ref_graph) + const = ref_binned_graphlet_counts) ## ------------------------------------------------------------------------ # If reference bins and counts passed, use them to calculate # expected counts } else if (!is.null(ref_ego_density_bins) && !is.null(ref_binned_graphlet_counts)) { - # Calculate expected graphlet counts (using ref # graph ego network density bins) exp_graphlet_counts <- exp_counts_fn( @@ -651,8 +646,7 @@ netdis_centred_graphlet_counts_new <- function( densities <- ego_network_density(graphlet_counts) # bin ref ego-network densities - binned_densities <- binning_fn(densities, - binning_fn) + binned_densities <- binning_fn(densities) # extract bin breaks and indexes from binning results ego_density_bin_breaks <- binned_densities$breaks @@ -684,8 +678,32 @@ netdis_centred_graphlet_counts_new <- function( } ## ------------------------------------------------------------------------ - # Centre counts - # extract columns for graphlets up to size max_graphlet_size + # Subtract expected counts from actual graphlet counts + netdis_subtract_exp_counts(graphlet_counts, + exp_graphlet_counts, + max_graphlet_size) +} + + +#' netdis_subtract_exp_counts +#' +#' Subtract expected graphlet counts from actual graphlet counts. +#' +#' @param graphlet_counts Matrix of graphlet counts (columns) for a +#' nummber of ego networks (rows). +#' @param exp_graphlet_counts Matrix of expected graphlet counts (columns) for a +#' nummber of ego networks (rows). +#' @param max_graphlet_size Do the subtraction for graphlets up to this size. +#' +#' #' Temporarily accessible during development. +#' TODO: Remove @export prior to publishing +#' @export +netdis_subtract_exp_counts <- function( + graphlet_counts, + exp_graphlet_counts, + max_graphlet_size) { + + # select columns for graphlets up to size max_graphlet_size id <- graphlet_key(max_graphlet_size)$id graphlet_counts <- graphlet_counts[, id] exp_graphlet_counts <- exp_graphlet_counts[, id] @@ -694,8 +712,6 @@ netdis_centred_graphlet_counts_new <- function( graphlet_counts - exp_graphlet_counts } -# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! #' netdis_expected_graphlet_counts_per_ego #' diff --git a/man/density_binned_counts.Rd b/man/density_binned_counts.Rd index 3343676a..be3f5cbb 100644 --- a/man/density_binned_counts.Rd +++ b/man/density_binned_counts.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{density_binned_counts} \alias{density_binned_counts} -\title{INTERNAL FUNCTION - Do not call directly} +\title{Used to calculate aggregated graphlet counts for each density bin.} \usage{ density_binned_counts(graphlet_counts, density_interval_indexes, agg_fn = mean, scale_fn = NULL, max_graphlet_size = NULL) @@ -17,12 +17,14 @@ each ego network.} \item{agg_fn}{Function to aggregate counts in each bin (default \code{agg_fn = mean}).} -\item{scale_fn}{Optional function to apply a transformation -to graphlet_counts, must have arguments graphlet_counts and -max_graphlet_size.} +\item{scale_fn}{Optional function to apply a transformation/scaling +to the raw graphlet_counts. Must have arguments \code{graphlet_counts} and +\code{max_graphlet_size}, and return a transformed \code{graphlet_counts} +object with the same number of rows as the input, and columns for all +graphlets up to \code{max_graphlet_size}.} \item{max_graphlet_size}{Optionally passed and used by scale_fn.} } \description{ -Used to calculate expected graphlet counts for each density bin. +Used to calculate aggregated graphlet counts for each density bin. } diff --git a/man/mean_density_binned_graphlet_counts.Rd b/man/mean_density_binned_graphlet_counts.Rd index 69dd8b01..69af07e0 100644 --- a/man/mean_density_binned_graphlet_counts.Rd +++ b/man/mean_density_binned_graphlet_counts.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{mean_density_binned_graphlet_counts} \alias{mean_density_binned_graphlet_counts} -\title{INTERNAL FUNCTION - Do not call directly} +\title{mean_density_binned_graphlet_counts} \usage{ mean_density_binned_graphlet_counts(graphlet_counts, density_interval_indexes, agg_fn = mean) @@ -11,15 +11,12 @@ mean_density_binned_graphlet_counts(graphlet_counts, \item{graphlet_counts}{Graphlet counts for a number of ego_networks.} \item{density_interval_indexes}{Density bin index for -each ego network.} +each ego network in graphlet_counts.} \item{agg_fn}{Function to aggregate counts in each bin -(default \code{agg_fn = mean}). -Temporarily accessible during development. -TODO: Remove @export prior to publishing} +(default \code{agg_fn = mean}).} } \description{ -Used by \code{netdis_expected_graphlet_counts_ego_fn} to -generate a function for calculating expected ego-network graphlet counts -from the statistics of a provided reference graph. +Calculate mean (dy default) graphlet counts for ego networks in each density +bin. } diff --git a/man/netdis.Rd b/man/netdis.Rd index 9f477425..d539bb1a 100644 --- a/man/netdis.Rd +++ b/man/netdis.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis} \alias{netdis} -\title{Netdis} +\title{Netdis - for one graphlet size} \usage{ netdis(centred_graphlet_counts1, centred_graphlet_counts2, graphlet_size) } @@ -21,5 +21,6 @@ the specified size } \description{ Calculate Netdis statistic between two graphs from their Centred Graphlet -Counts (generated using \code{netdis_centred_graphlet_counts}). +Counts (generated using \code{netdis_centred_graphlet_counts}) for graphlets +of size \code{graphlet_size}. } diff --git a/man/netdis_centred_graphlet_counts.Rd b/man/netdis_centred_graphlet_counts.Rd index 7ce5d9ae..2e233be9 100644 --- a/man/netdis_centred_graphlet_counts.Rd +++ b/man/netdis_centred_graphlet_counts.Rd @@ -4,14 +4,39 @@ \alias{netdis_centred_graphlet_counts} \title{netdis_centred_graphlet_counts} \usage{ -netdis_centred_graphlet_counts(graphlet_counts, exp_graphlet_counts, +netdis_centred_graphlet_counts(graphlet_counts, ref_ego_density_bins, + ref_binned_graphlet_counts, binning_fn, bin_counts_fn, exp_counts_fn, max_graphlet_size) } \arguments{ \item{graphlet_counts}{Ego network graphlet counts for a query graph} -\item{exp_graphlet_counts}{Pre-calculated expected counts for each graphlet -type for each ego network.} +\item{ref_ego_density_bins}{Either a list of previously calculated ego +network density bin edges from a reference network, or \code{NULL}, in +which case density bins are generated using the query graph itself.} + +\item{ref_binned_graphlet_counts}{Either expected graphlet counts for each +ego network density bin from a reference network (a matrix with columns +labelled by graphlet ID and rows by density bin index), \code{NULL}, in +which case density binned counts are generated using the query graph itself, +or a constant numeric value to subtract from all graphlet counts.} + +\item{binning_fn}{Function used to bin ego network densities. Only needed if +\code{ref_ego_density_bins} and \code{ref_binned_graphlet_counts} are +\code{NULL}. Takes densities as its single argument, and returns a named list +including keys \code{breaks} (list of bin edges) and \code{interval_indexes} +(density bin index for each ego network).} + +\item{bin_counts_fn}{Function used to calculate expected graphlet counts in +each density bin. Only needed if \code{ref_ego_density_bins} and +\code{ref_binned_graphlet_counts} are \code{NULL}. Takes +\code{graphlet_counts}, \code{interval_indexes} (bin indexes) and +\code{max_graphlet_size} as arguments.} + +\item{exp_counts_fn}{Function used to map from binned reference counts to +expected counts for each graphlet in each ego network of the query graphs. +Takes \code{ego_networks}, \code{density_bin_breaks}, +\code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments.} \item{max_graphlet_size}{max graphlet size to calculate centred counts for.} } @@ -20,6 +45,7 @@ graphlet_counts minus exp_graphlet_counts for graphlets up to size max_graphlet_size. } \description{ -Centre counts by subtracting expected graphlet counts from actual graphlet -counts. +Calculate expected graphlet counts for each ego network in a query graph and +centre the actual counts by subtracting those calculated expected count +values. } diff --git a/man/netdis_expected_graphlet_counts.Rd b/man/netdis_expected_graphlet_counts.Rd index 4c94c5b2..2cee4166 100644 --- a/man/netdis_expected_graphlet_counts.Rd +++ b/man/netdis_expected_graphlet_counts.Rd @@ -2,13 +2,14 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_expected_graphlet_counts} \alias{netdis_expected_graphlet_counts} -\title{INTERNAL FUNCTION - Do not call directly} +\title{netdis_expected_graphlet_counts +INTERNAL FUNCTION - Do not call directly} \usage{ netdis_expected_graphlet_counts(graphlet_counts, max_graphlet_size, density_breaks, density_binned_reference_counts, scale_fn = NULL) } \arguments{ -\item{graphlet_counts}{Node and graphlet counts for a graph.} +\item{graphlet_counts}{Node and graphlet counts for an ego network.} \item{max_graphlet_size}{Determines the maximum size of graphlets to count. Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} @@ -19,15 +20,14 @@ Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} each density bin.} \item{scale_fn}{Optional function to scale calculated expected counts, taking -\code{graph} and \code{max_graphlet_size} as arguments, and returning a scale -factor that the looked up \code{density_binned_reference_counts} values will -be multiplied by. +\code{graphlet_counts} and \code{max_graphlet_size} as arguments, and +returning a scale factor that the looked up +\code{density_binned_reference_counts} values will be multiplied by. + Temporarily accessible during development. TODO: Remove @export prior to publishing} } \description{ -Used by \code{netdis_expected_graphlet_counts_ego} to -calculate expected graphlet counts for a query graph -ego-network from the statistics of a provided reference -graph. +Calculates expected graphlet counts for one ego network based on its density +and pre-calculated reference density bins and graphlet counts for each bin. } diff --git a/man/netdis_expected_graphlet_counts_per_ego.Rd b/man/netdis_expected_graphlet_counts_per_ego.Rd index f96707eb..8c11906b 100644 --- a/man/netdis_expected_graphlet_counts_per_ego.Rd +++ b/man/netdis_expected_graphlet_counts_per_ego.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_expected_graphlet_counts_per_ego} \alias{netdis_expected_graphlet_counts_per_ego} -\title{INTERNAL FUNCTION - Do not call directly} +\title{netdis_expected_graphlet_counts_per_ego} \usage{ netdis_expected_graphlet_counts_per_ego(graphlet_counts, density_breaks, density_binned_reference_counts, max_graphlet_size, scale_fn = NULL) @@ -28,7 +28,6 @@ and returning a scale factor that the looked up TODO: Remove @export prior to publishing} } \description{ -Used by \code{netdis_expected_graphlet_counts_ego_fn} to -generate a function for calculating expected ego-network graphlet counts -from the statistics of a provided reference graph. +Calculates expected graphlet counts for each ego network based on its density +and pre-calculated reference density bins and graphlet counts for each bin. } diff --git a/man/netdis_subtract_exp_counts.Rd b/man/netdis_subtract_exp_counts.Rd new file mode 100644 index 00000000..e6b59314 --- /dev/null +++ b/man/netdis_subtract_exp_counts.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{netdis_subtract_exp_counts} +\alias{netdis_subtract_exp_counts} +\title{netdis_subtract_exp_counts} +\usage{ +netdis_subtract_exp_counts(graphlet_counts, exp_graphlet_counts, + max_graphlet_size) +} +\arguments{ +\item{graphlet_counts}{Matrix of graphlet counts (columns) for a +nummber of ego networks (rows).} + +\item{exp_graphlet_counts}{Matrix of expected graphlet counts (columns) for a +nummber of ego networks (rows).} + +\item{max_graphlet_size}{Do the subtraction for graphlets up to this size. + +#' Temporarily accessible during development. +TODO: Remove @export prior to publishing} +} +\description{ +Subtract expected graphlet counts from actual graphlet counts. +} diff --git a/man/netdis_uptok.Rd b/man/netdis_uptok.Rd index 7e491b60..e819edcf 100644 --- a/man/netdis_uptok.Rd +++ b/man/netdis_uptok.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{netdis_uptok} \alias{netdis_uptok} -\title{Netdis - graphlets up to max_graphlet_size} +\title{Netdis - for all graphlet sizes up to max_graphlet_size} \usage{ netdis_uptok(centred_graphlet_counts1, centred_graphlet_counts2, max_graphlet_size) @@ -22,5 +22,6 @@ the specified size } \description{ Calculate Netdis statistic between two graphs from their Centred Graphlet -Counts (generated using \code{netdis_centred_graphlet_counts}). +Counts (generated using \code{netdis_centred_graphlet_counts}) for all +graphlet sizes up to \code{max_graphlet_size}. } diff --git a/man/single_density_bin.Rd b/man/single_density_bin.Rd index ed801c45..2c3cd29f 100644 --- a/man/single_density_bin.Rd +++ b/man/single_density_bin.Rd @@ -3,7 +3,7 @@ \name{single_density_bin} \alias{single_density_bin} \title{For case where don't want to use binning, return a single bin which covers -the full range of possible density values.} +the full range of possible density values (0 to 1).} \usage{ single_density_bin(densities) } @@ -13,5 +13,5 @@ a list of indexes of the required length.)} } \description{ For case where don't want to use binning, return a single bin which covers -the full range of possible density values. +the full range of possible density values (0 to 1). } diff --git a/vignettes/netdis_2graphs_polya-aeppli.R b/vignettes/netdis_2graphs_polya-aeppli.R index db2ae759..72593e0b 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.R +++ b/vignettes/netdis_2graphs_polya-aeppli.R @@ -135,13 +135,13 @@ exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts max_graphlet_size, scale_fn = NULL) # Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- netdis_centred_graphlet_counts(graphlet_counts_1, +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, exp_graphlet_counts_1, max_graphlet_size) -centred_graphlet_counts_2 <- netdis_centred_graphlet_counts(graphlet_counts_2, - exp_graphlet_counts_2, - max_graphlet_size) +centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) ## ------------------------------------------------------------------------ sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) diff --git a/vignettes/netdis_2graphs_polya-aeppli.Rmd b/vignettes/netdis_2graphs_polya-aeppli.Rmd index 3ebb51ce..6ab3ce6c 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.Rmd +++ b/vignettes/netdis_2graphs_polya-aeppli.Rmd @@ -161,13 +161,13 @@ exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts max_graphlet_size, scale_fn = NULL) # Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- netdis_centred_graphlet_counts(graphlet_counts_1, +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, exp_graphlet_counts_1, max_graphlet_size) -centred_graphlet_counts_2 <- netdis_centred_graphlet_counts(graphlet_counts_2, - exp_graphlet_counts_2, - max_graphlet_size) +centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) ``` ## Sum centred graphlet counts across all ego networks diff --git a/vignettes/netdis_2graphs_polya-aeppli.html b/vignettes/netdis_2graphs_polya-aeppli.html index e84ffcef..f4119998 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.html +++ b/vignettes/netdis_2graphs_polya-aeppli.html @@ -12,7 +12,7 @@ - + Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation @@ -305,7 +305,7 @@

Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation

Martin O’Reilly, Jack Roberts

-

2019-10-10

+

2019-11-08

@@ -448,13 +448,13 @@

Centre graphlet counts of query graphs using binned expected counts

max_graphlet_size, scale_fn = NULL) # Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- netdis_centred_graphlet_counts(graphlet_counts_1, +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, exp_graphlet_counts_1, max_graphlet_size) -centred_graphlet_counts_2 <- netdis_centred_graphlet_counts(graphlet_counts_2, - exp_graphlet_counts_2, - max_graphlet_size)
+centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size)

Sum centred graphlet counts across all ego networks

diff --git a/vignettes/quickstart_netdis_2graphs.R b/vignettes/quickstart_netdis_2graphs.R index 080e1f6a..a58154e6 100644 --- a/vignettes/quickstart_netdis_2graphs.R +++ b/vignettes/quickstart_netdis_2graphs.R @@ -98,13 +98,13 @@ exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts scale_fn=count_graphlet_tuples) # Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- netdis_centred_graphlet_counts(graphlet_counts_1, - exp_graphlet_counts_1, - max_graphlet_size) +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) -centred_graphlet_counts_2 <- netdis_centred_graphlet_counts(graphlet_counts_2, - exp_graphlet_counts_2, - max_graphlet_size) +centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) ## ------------------------------------------------------------------------ sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index da3edc9c..0990ed15 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -105,6 +105,7 @@ ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( ``` + ## Centre graphlet counts of query graphs based on statistics of reference graph ```{r} # Calculate expected graphlet counts (using ref graph ego network density bins) @@ -122,13 +123,13 @@ exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts scale_fn=count_graphlet_tuples) # Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- netdis_centred_graphlet_counts(graphlet_counts_1, - exp_graphlet_counts_1, - max_graphlet_size) +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) -centred_graphlet_counts_2 <- netdis_centred_graphlet_counts(graphlet_counts_2, - exp_graphlet_counts_2, - max_graphlet_size) +centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) ``` ## Sum centred graphlet counts across all ego networks diff --git a/vignettes/quickstart_netdis_2graphs.html b/vignettes/quickstart_netdis_2graphs.html index d913896e..24600d12 100644 --- a/vignettes/quickstart_netdis_2graphs.html +++ b/vignettes/quickstart_netdis_2graphs.html @@ -12,7 +12,7 @@ - + Quick start guide for Netdis - 2 graphs @@ -305,7 +305,7 @@

Quick start guide for Netdis - 2 graphs

Martin O’Reilly, Jack Roberts

-

2019-10-11

+

2019-11-08

@@ -414,13 +414,13 @@

Centre graphlet counts of query graphs based on statistics of reference grap scale_fn=count_graphlet_tuples) # Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- netdis_centred_graphlet_counts(graphlet_counts_1, - exp_graphlet_counts_1, - max_graphlet_size) +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) -centred_graphlet_counts_2 <- netdis_centred_graphlet_counts(graphlet_counts_2, - exp_graphlet_counts_2, - max_graphlet_size)

+centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size)

Sum centred graphlet counts across all ego networks

From 05d30432e1c3f1e598bf4299a0c1b3f8a338a572 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 12 Nov 2019 14:18:16 +0000 Subject: [PATCH 071/188] Fix for NA exp counts in geometric poisson And added test for geometric poisson --- R/measures_net_dis.R | 93 ++++++++++------ man/density_binned_counts_gp.Rd | 4 +- man/exp_counts_bin_gp.Rd | 26 +++++ tests/testthat/test_measures_net_dis.R | 46 +++++++- vignettes/netdis_2graphs_polya-aeppli.R | 84 ++++++++------ vignettes/netdis_2graphs_polya-aeppli.Rmd | 87 +++++++++------ vignettes/netdis_2graphs_polya-aeppli.html | 121 ++++++++++++--------- 7 files changed, 307 insertions(+), 154 deletions(-) create mode 100644 man/exp_counts_bin_gp.Rd diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index dbc13919..6277c55b 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -126,6 +126,7 @@ netdis_one_to_one <- function(graph_1 = NULL, min_ego_nodes = 3, min_ego_edges = 1, binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, exp_counts_fn = exp_counts_fn, graphlet_counts = graphlet_counts ) @@ -883,8 +884,51 @@ density_binned_counts <- function(graphlet_counts, } +#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY +#' Used by \code{density_binned_counts_gp} +#' Calculate expected counts with geometric poisson (Polya-Aeppli) +#' approximation for a single density bin. +#' @param bin_idx Density bin index to calculate expected counts for. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +exp_counts_bin_gp <- function(bin_idx, graphlet_counts, + density_interval_indexes, + mean_binned_graphlet_counts, + max_graphlet_size) { + # extract ego networks belonging to input density bin index + counts <- graphlet_counts[density_interval_indexes == bin_idx, ] + + # mean graphlet counts in this density bin + means <- mean_binned_graphlet_counts[bin_idx, ] + + # subtract mean graphlet counts from actual graphlet counts + mean_sub_counts <- sweep(counts, 2, means) + + # variance in graphlet counts across ego networks in this density bin + Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) + + # GP theta parameter for each graphlet id in this density bin + theta_d <- 2 * means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) + + # GP lambda parameter for graphlet size k in this density bin + lambda_dk <- mean(2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]), + na.rm = TRUE) + + # Expected counts for graphlet size k in this density bin + exp_counts_dk <- append(exp_counts_dk, + lambda_dk / theta_d[graphlet_idx]) + } + + exp_counts_dk +} -#' Calculate expected counts in density bins using +#' Calculate expected counts in density bins using the #' geometric poisson (Polya-Aeppli) approximation. #' @param graphlet_counts Graphlet counts for a number of ego_networks. #' @param density_interval_indexes Density bin index for @@ -895,47 +939,28 @@ density_binned_counts <- function(graphlet_counts, density_binned_counts_gp <- function(graphlet_counts, density_interval_indexes, max_graphlet_size) { - + # mean graphlet counts in each ego network density bin mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( graphlet_counts, density_interval_indexes) - - exp_counts_bin <- function(bin_idx) { - counts <- graphlet_counts[density_interval_indexes == bin_idx, ] - means <- mean_binned_graphlet_counts[bin_idx, ] - - mean_sub_counts <- sweep(counts, 2, means) - - Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) - theta_d <- 2 * means / (Vd_sq + means) - - exp_counts_dk <- vector() - for (k in 2:max_graphlet_size) { - graphlet_idx <- graphlet_ids_for_size(k) - - lambda_dk <- (1 / length(graphlet_idx)) * - sum( - 2 * means[graphlet_idx]^2 / - (Vd_sq[graphlet_idx] + means[graphlet_idx]) - ) - - exp_counts_dk <- append(exp_counts_dk, - lambda_dk / theta_d[graphlet_idx]) - } - - exp_counts_dk - } - + + # calculate expected counts for each density bin index nbins <- length(unique(density_interval_indexes)) - expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins)) - - # deal with NAs caused by bins with zero counts for a graphlet + expected_counts_bin <- t(sapply( + 1:nbins, + exp_counts_bin_gp, + graphlet_counts = graphlet_counts, + density_interval_indexes = density_interval_indexes, + mean_binned_graphlet_counts = mean_binned_graphlet_counts, + max_graphlet_size = max_graphlet_size + )) + + # remove NAs caused by bins with zero counts for a graphlet expected_counts_bin[is.nan(expected_counts_bin)] <- 0 - + expected_counts_bin } - #' Create matrix of constant value to use as expected counts. #' @param graphlet_counts Ego network graphlet counts matrix to create expected #' counts for. diff --git a/man/density_binned_counts_gp.Rd b/man/density_binned_counts_gp.Rd index fe5669a0..b2cca7a0 100644 --- a/man/density_binned_counts_gp.Rd +++ b/man/density_binned_counts_gp.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/measures_net_dis.R \name{density_binned_counts_gp} \alias{density_binned_counts_gp} -\title{Calculate expected counts in density bins using +\title{Calculate expected counts in density bins using the geometric poisson (Polya-Aeppli) approximation.} \usage{ density_binned_counts_gp(graphlet_counts, density_interval_indexes, @@ -18,6 +18,6 @@ each ego network.} included in graphlet_counts.} } \description{ -Calculate expected counts in density bins using +Calculate expected counts in density bins using the geometric poisson (Polya-Aeppli) approximation. } diff --git a/man/exp_counts_bin_gp.Rd b/man/exp_counts_bin_gp.Rd new file mode 100644 index 00000000..b1fa64ac --- /dev/null +++ b/man/exp_counts_bin_gp.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/measures_net_dis.R +\name{exp_counts_bin_gp} +\alias{exp_counts_bin_gp} +\title{INTERNAL FUNCTION - DO NOT CALL DIRECTLY +Used by \code{density_binned_counts_gp} +Calculate expected counts with geometric poisson (Polya-Aeppli) +approximation for a single density bin.} +\usage{ +exp_counts_bin_gp(bin_idx, graphlet_counts, density_interval_indexes, + mean_binned_graphlet_counts, max_graphlet_size) +} +\arguments{ +\item{bin_idx}{Density bin index to calculate expected counts for.} + +\item{graphlet_counts}{Graphlet counts for a number of ego_networks.} + +\item{density_interval_indexes}{Density bin index for +each ego network.} +} +\description{ +INTERNAL FUNCTION - DO NOT CALL DIRECTLY +Used by \code{density_binned_counts_gp} +Calculate expected counts with geometric poisson (Polya-Aeppli) +approximation for a single density bin. +} diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 0a0fe655..5144697c 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1340,11 +1340,49 @@ test_that("netdis_many_to_many correctly interprets numeric ref_graph value", { }) -context("Netdis: expected counts using query networks") -test_that("netdis_many_to_many calculates expected counts from query networks if - ref_graph is NULL", { - # TODO +context("Netdis: Geometric Poisson Approximation") +test_that("netdis_one_to_one gives expected result when using geometric Poisson + approximation", { + # TODO This test is not robust. Rewrite with basic network that gives known + # result. + + # Set source directory for Virus PPI graph edge files + source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + + # Load query and reference graphs + graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + + graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + + # set parameters + max_graphlet_size <- 4 + neighbourhood_size <- 2 + min_ego_nodes <- 3 + min_ego_edges <- 1 + + # manually verified result for graphlets of size 4 + expected_netdis4 <- 0.1892716 + + # check function to test + bin_counts_fn <- density_binned_counts_gp + + exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego, + scale_fn = NULL) + actual_netdis <- netdis_one_to_one(graph_1, + graph_2, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + print(actual_netdis) + expect_equal(expected_netdis4, actual_netdis[["netdis4"]], + tolerance = .0001, scale = 1) }) context("Netdis: error if max_graphlet_size is not 3, 4 or 5") diff --git a/vignettes/netdis_2graphs_polya-aeppli.R b/vignettes/netdis_2graphs_polya-aeppli.R index 72593e0b..bf96a36f 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.R +++ b/vignettes/netdis_2graphs_polya-aeppli.R @@ -70,44 +70,65 @@ ego_density_bins_2 <- binned_densities_2$breaks ## ------------------------------------------------------------------------ -density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) { +#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY +#' Calculate expected counts with geometric poisson (Polya-Aeppli) +#' approximation for a single density bin. +#' @param bin_idx Density bin index to calculate expected counts for. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +exp_counts_bin_gp <- function(bin_idx, graphlet_counts, + density_interval_indexes, + mean_binned_graphlet_counts) { + counts <- graphlet_counts[density_interval_indexes == bin_idx, ] + means <- mean_binned_graphlet_counts[bin_idx, ] - mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - graphlet_counts, - bin_indexes) + mean_sub_counts <- sweep(counts, 2, means) - exp_counts_bin <- function(bin_idx) { - counts <- graphlet_counts[bin_indexes == bin_idx, ] - means <- mean_binned_graphlet_counts[bin_idx,] - - mean_sub_counts <- sweep(counts, 2, means) + Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) + theta_d <- 2 * means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) - Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1) - theta_d <- 2*means / (Vd_sq + means) - - exp_counts_dk <- vector() - for (k in 2:max_graphlet_size) { - graphlet_idx <- graphlet_ids_for_size(k) - - lambda_dk <- (1 / length(graphlet_idx)) * - sum( - 2 * means[graphlet_idx]^2 / - (Vd_sq[graphlet_idx] + means[graphlet_idx]) - ) - - exp_counts_dk <- append(exp_counts_dk, - lambda_dk / theta_d[graphlet_idx]) - } + lambda_dk <- mean(2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]), + na.rm = TRUE) - exp_counts_dk + exp_counts_dk <- append(exp_counts_dk, + lambda_dk / theta_d[graphlet_idx]) } - nbins <- length(unique(bin_indexes)) - expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins)) - + exp_counts_dk +} + +#' Calculate expected counts in density bins using the +#' geometric poisson (Polya-Aeppli) approximation. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +#' @param max_graphlet_size Determines the maximum size of graphlets +#' included in graphlet_counts. +#' @export +density_binned_counts_gp <- function(graphlet_counts, + density_interval_indexes, + max_graphlet_size) { + + mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + graphlet_counts, + density_interval_indexes) + + nbins <- length(unique(density_interval_indexes)) + expected_counts_bin <- t(sapply(1:nbins, + exp_counts_bin_gp, + graphlet_counts = graphlet_counts, + density_interval_indexes = density_interval_indexes, + mean_binned_graphlet_counts = mean_binned_graphlet_counts)) + # deal with NAs caused by bins with zero counts for a graphlet - expected_counts_bin[is.nan(expected_counts_bin)] = 0 - + expected_counts_bin[is.nan(expected_counts_bin)] <- 0 + expected_counts_bin } @@ -119,7 +140,6 @@ binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, binned_densities_2$interval_indexes, max_graphlet_size) - ## ------------------------------------------------------------------------ # Calculate expected graphlet counts for each ego network exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_1, diff --git a/vignettes/netdis_2graphs_polya-aeppli.Rmd b/vignettes/netdis_2graphs_polya-aeppli.Rmd index 6ab3ce6c..15ae3a49 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.Rmd +++ b/vignettes/netdis_2graphs_polya-aeppli.Rmd @@ -94,44 +94,67 @@ ego_density_bins_2 <- binned_densities_2$breaks ## Calculate expected graphlet counts in each bin using geometric poisson approximation ```{r} -density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) { +#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY +#' Calculate expected counts with geometric poisson (Polya-Aeppli) +#' approximation for a single density bin. +#' @param bin_idx Density bin index to calculate expected counts for. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +exp_counts_bin_gp <- function(bin_idx, graphlet_counts, + density_interval_indexes, + mean_binned_graphlet_counts, + max_graphlet_size) { + counts <- graphlet_counts[density_interval_indexes == bin_idx, ] + means <- mean_binned_graphlet_counts[bin_idx, ] - mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - graphlet_counts, - bin_indexes) + mean_sub_counts <- sweep(counts, 2, means) - exp_counts_bin <- function(bin_idx) { - counts <- graphlet_counts[bin_indexes == bin_idx, ] - means <- mean_binned_graphlet_counts[bin_idx,] - - mean_sub_counts <- sweep(counts, 2, means) + Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) + theta_d <- 2 * means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) - Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1) - theta_d <- 2*means / (Vd_sq + means) - - exp_counts_dk <- vector() - for (k in 2:max_graphlet_size) { - graphlet_idx <- graphlet_ids_for_size(k) - - lambda_dk <- (1 / length(graphlet_idx)) * - sum( - 2 * means[graphlet_idx]^2 / - (Vd_sq[graphlet_idx] + means[graphlet_idx]) - ) - - exp_counts_dk <- append(exp_counts_dk, - lambda_dk / theta_d[graphlet_idx]) - } + lambda_dk <- mean(2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]), + na.rm = TRUE) - exp_counts_dk + exp_counts_dk <- append(exp_counts_dk, + lambda_dk / theta_d[graphlet_idx]) } - nbins <- length(unique(bin_indexes)) - expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins)) - + exp_counts_dk +} + +#' Calculate expected counts in density bins using the +#' geometric poisson (Polya-Aeppli) approximation. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +#' @param max_graphlet_size Determines the maximum size of graphlets +#' included in graphlet_counts. +#' @export +density_binned_counts_gp <- function(graphlet_counts, + density_interval_indexes, + max_graphlet_size) { + + mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + graphlet_counts, + density_interval_indexes) + + nbins <- length(unique(density_interval_indexes)) + expected_counts_bin <- t(sapply(1:nbins, + exp_counts_bin_gp, + graphlet_counts = graphlet_counts, + density_interval_indexes = density_interval_indexes, + mean_binned_graphlet_counts = mean_binned_graphlet_counts, + max_graphlet_size = max_graphlet_size)) + # deal with NAs caused by bins with zero counts for a graphlet - expected_counts_bin[is.nan(expected_counts_bin)] = 0 - + expected_counts_bin[is.nan(expected_counts_bin)] <- 0 + expected_counts_bin } @@ -142,7 +165,6 @@ binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1, binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, binned_densities_2$interval_indexes, max_graphlet_size) - ``` ## Centre graphlet counts of query graphs using binned expected counts @@ -170,6 +192,7 @@ centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, max_graphlet_size) ``` + ## Sum centred graphlet counts across all ego networks ```{r} sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) diff --git a/vignettes/netdis_2graphs_polya-aeppli.html b/vignettes/netdis_2graphs_polya-aeppli.html index f4119998..8d090fea 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.html +++ b/vignettes/netdis_2graphs_polya-aeppli.html @@ -12,7 +12,7 @@ - + Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation @@ -305,7 +305,7 @@

Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation

Martin O’Reilly, Jack Roberts

-

2019-11-08

+

2019-11-12

@@ -383,54 +383,75 @@

Bin ego networks by density

Calculate expected graphlet counts in each bin using geometric poisson approximation

-
density_binned_counts_gp <- function(graphlet_counts, bin_indexes, max_graphlet_size) {
-  
-  mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
-                                  graphlet_counts, 
-                                  bin_indexes)
-  
-  exp_counts_bin <- function(bin_idx) {
-    counts <- graphlet_counts[bin_indexes == bin_idx, ]
-    means <- mean_binned_graphlet_counts[bin_idx,]
-    
-    mean_sub_counts <- sweep(counts, 2, means)
-    
-    Vd_sq <- colSums(mean_sub_counts^2)/(nrow(mean_sub_counts)-1)
-    theta_d <- 2*means / (Vd_sq + means)
-
-    exp_counts_dk <- vector()
-    for (k in 2:max_graphlet_size) {
-      graphlet_idx <- graphlet_ids_for_size(k)
-      
-      lambda_dk <- (1 / length(graphlet_idx)) * 
-                   sum(
-                     2 * means[graphlet_idx]^2 /
-                     (Vd_sq[graphlet_idx] + means[graphlet_idx])
-                   )
-      
-      exp_counts_dk <- append(exp_counts_dk,
-                              lambda_dk / theta_d[graphlet_idx])
-    }
-    
-    exp_counts_dk
-  }
-  
-  nbins <- length(unique(bin_indexes))
-  expected_counts_bin <- t(mapply(exp_counts_bin, bin_idx = 1:nbins))
-  
-  # deal with NAs caused by bins with zero counts for a graphlet
-  expected_counts_bin[is.nan(expected_counts_bin)] = 0
-  
-  expected_counts_bin
-}
-
-binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1,
-                                                     binned_densities_1$interval_indexes,
-                                                     max_graphlet_size)
+
#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY
+#' Calculate expected counts with geometric poisson (Polya-Aeppli)
+#' approximation for a single density bin.
+#' @param bin_idx Density bin index to calculate expected counts for.
+#' @param graphlet_counts Graphlet counts for a number of ego_networks.
+#' @param density_interval_indexes Density bin index for
+#' each ego network.
+exp_counts_bin_gp <- function(bin_idx, graphlet_counts,
+                              density_interval_indexes,
+                              mean_binned_graphlet_counts) {
+  counts <- graphlet_counts[density_interval_indexes == bin_idx, ]
+  means <- mean_binned_graphlet_counts[bin_idx, ]
+  
+  mean_sub_counts <- sweep(counts, 2, means)
+  
+  Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1)
+  theta_d <- 2 * means / (Vd_sq + means)
+  
+  exp_counts_dk <- vector()
+  for (k in 2:max_graphlet_size) {
+    graphlet_idx <- graphlet_ids_for_size(k)
+    
+    lambda_dk <- mean(2 * means[graphlet_idx]^2 /
+                        (Vd_sq[graphlet_idx] + means[graphlet_idx]),
+                      na.rm = TRUE)
+    
+    exp_counts_dk <- append(exp_counts_dk,
+                            lambda_dk / theta_d[graphlet_idx])
+  }
+  
+  exp_counts_dk
+}
+
+#' Calculate expected counts in density bins using the
+#' geometric poisson (Polya-Aeppli) approximation.
+#' @param graphlet_counts Graphlet counts for a number of ego_networks.
+#' @param density_interval_indexes Density bin index for
+#' each ego network.
+#' @param max_graphlet_size Determines the maximum size of graphlets
+#' included in graphlet_counts.
+#' @export
+density_binned_counts_gp <- function(graphlet_counts,
+                                     density_interval_indexes,
+                                     max_graphlet_size) {
 
-binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2,
-                                                     binned_densities_2$interval_indexes,
-                                                     max_graphlet_size)
+ mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + graphlet_counts, + density_interval_indexes) + + nbins <- length(unique(density_interval_indexes)) + expected_counts_bin <- t(sapply(1:nbins, + exp_counts_bin_gp, + graphlet_counts = graphlet_counts, + density_interval_indexes = density_interval_indexes, + mean_binned_graphlet_counts = mean_binned_graphlet_counts)) + + # deal with NAs caused by bins with zero counts for a graphlet + expected_counts_bin[is.nan(expected_counts_bin)] <- 0 + + expected_counts_bin +} + +binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1, + binned_densities_1$interval_indexes, + max_graphlet_size) + +binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, + binned_densities_2$interval_indexes, + max_graphlet_size)

Centre graphlet counts of query graphs using binned expected counts

@@ -470,7 +491,7 @@

Calculate netdis statistics

print(netdis_result)
##   netdis3   netdis4 
-## 0.8822527 0.1892755
+## 0.8822527 0.1892716 From 6e8df4814c9e634619206c04fe19dfffdddd2631 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 12 Nov 2019 15:27:43 +0000 Subject: [PATCH 072/188] add a test for using constant expected counts --- R/measures_net_dis.R | 20 +++------ tests/testthat/test_measures_net_dis.R | 61 +++++++++++++++----------- 2 files changed, 42 insertions(+), 39 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 6277c55b..ca0c440b 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -771,9 +771,6 @@ netdis_expected_graphlet_counts_per_ego <- function( #' returning a scale factor that the looked up #' \code{density_binned_reference_counts} values will be multiplied by. #' -#' Temporarily accessible during development. -#' TODO: Remove @export prior to publishing -#' @export netdis_expected_graphlet_counts <- function(graphlet_counts, max_graphlet_size, density_breaks, @@ -890,17 +887,18 @@ density_binned_counts <- function(graphlet_counts, #' approximation for a single density bin. #' @param bin_idx Density bin index to calculate expected counts for. #' @param graphlet_counts Graphlet counts for a number of ego_networks. -#' @param density_interval_indexes Density bin index for -#' each ego network. +#' @param density_interval_indexes Density bin indexes for each ego network in +#' \code{graphlet_counts}. +#' @param max_graphlet_size Determines the maximum size of graphlets +#' included in graphlet_counts. exp_counts_bin_gp <- function(bin_idx, graphlet_counts, density_interval_indexes, - mean_binned_graphlet_counts, max_graphlet_size) { # extract ego networks belonging to input density bin index counts <- graphlet_counts[density_interval_indexes == bin_idx, ] # mean graphlet counts in this density bin - means <- mean_binned_graphlet_counts[bin_idx, ] + means <- colMeans(counts) # subtract mean graphlet counts from actual graphlet counts mean_sub_counts <- sweep(counts, 2, means) @@ -939,11 +937,7 @@ exp_counts_bin_gp <- function(bin_idx, graphlet_counts, density_binned_counts_gp <- function(graphlet_counts, density_interval_indexes, max_graphlet_size) { - # mean graphlet counts in each ego network density bin - mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - graphlet_counts, - density_interval_indexes) - + # calculate expected counts for each density bin index nbins <- length(unique(density_interval_indexes)) expected_counts_bin <- t(sapply( @@ -951,7 +945,6 @@ density_binned_counts_gp <- function(graphlet_counts, exp_counts_bin_gp, graphlet_counts = graphlet_counts, density_interval_indexes = density_interval_indexes, - mean_binned_graphlet_counts = mean_binned_graphlet_counts, max_graphlet_size = max_graphlet_size )) @@ -1034,7 +1027,6 @@ ego_network_density <- function(graphlet_counts) { #' choices of k nodes in that ego-network, where n is the number of nodes #' in the ego network and k is the number of nodes in the graphlet. #' -#' @param ego_networks Pre-generated ego networks for an input graph. #' @param graphlet_counts Pre-calculated graphlet counts for each ego_network. #' @param max_graphlet_size Determines the maximum size of graphlets included #' in graphlet_counts. diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 5144697c..3777680a 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1024,16 +1024,8 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n ) }) -context("Netdis: Geometric Poisson") -test_that("expected counts using geometric poisson approximation are correct", { - # TODO write this test - -}) - context("Netdis: Statistic calculation") test_that("netdis statistic function output matches manually verified result", { - # TODO Rewrite with more realistic counts. - # arbitrary counts of correct size for graphlets up to size 5 counts_1 <- c(11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14) @@ -1061,8 +1053,6 @@ test_that("netdis statistic function output matches manually verified result", { }) test_that("netdis_uptok gives expected netdis result for graphlets up to size k", { - # TODO Rewrite with more realistic counts. - # arbitrary counts of correct size for graphlets up to size 5 counts_1 <- c(11, 11, 13, 9, 12, 10, 14, 9, 13, 10, 10, 7, 9, 12, 6, 12, 9, 12, 9, 7, 15, 7, 5, 12, 16, 10, 10, 8, 9, 14) @@ -1086,9 +1076,6 @@ test_that("netdis_uptok gives expected netdis result for graphlets up to size k" context("Netdis: full calculation pipeline") test_that("netdis_many_to_many gives expected result", { - # TODO This test is not robust. Rewrite with basic network that gives known - # result. - # Set source directory for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") @@ -1153,9 +1140,6 @@ test_that("netdis_many_to_many gives expected result", { context("Netdis: functions for different pairwise comparisons") test_that("netdis_one_to_one gives expected result", { - # TODO This test is not robust. Rewrite with basic network that gives known - # result. - # Set source directory for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") @@ -1192,9 +1176,6 @@ test_that("netdis_one_to_one gives expected result", { expect_equal(expected_netdis, actual_netdis, tolerance = .001, scale = 1) }) test_that("netdis_one_to_many gives expected result", { - # TODO This test is not robust. Rewrite with basic network that gives known - # result. - # Set source directory for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") @@ -1335,17 +1316,45 @@ test_that("netdis functions error when no query graphs provided", { }) context("Netdis: constant expected counts") -test_that("netdis_many_to_many correctly interprets numeric ref_graph value", { - # TODO +test_that("netdis_centred_graphlet_counts correctly interprets numeric + ref_binned_graphlet_counts value", { + # dummy counts + graphlet_counts <- rbind( + c(1, 2, 3, 4, 5, 6, 7, 8, 9), + c(11, 12, 13, 14, 15, 16, 17, 18, 19), + c(21, 22, 23, 24, 25, 26, 27, 28, 29), + c(31, 32, 33, 34, 35, 36, 37, 38, 39), + c(41, 42, 43, 44, 45, 46, 47, 48, 49), + c(51, 52, 53, 54, 55, 56, 57, 58, 59), + c(61, 62, 63, 64, 65, 66, 67, 68, 69), + c(71, 72, 73, 74, 75, 76, 77, 78, 79), + c(81, 82, 83, 84, 85, 86, 87, 88, 89), + c(91, 92, 93, 94, 95, 96, 97, 98, 99) + ) + graphlet_labels <- c("G0", "G1", "G2", "G3", "G4", "G5", "G6", "G7", "G8") + colnames(graphlet_counts) <- graphlet_labels + rownames(graphlet_counts) <- 1:10 + max_graphlet_size <- 4 + + # netdis_centred_graphlet_counts with ref_binned_graphlet_counts=0 should + # perform no centring, i.e. centred_counts should equal input graphlet_counts + centred_counts <- netdis_centred_graphlet_counts( + graphlet_counts = graphlet_counts, + ref_ego_density_bins = NULL, + ref_binned_graphlet_counts = 0, + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, + max_graphlet_size = 4 + ) + + expect_equal(centred_counts, graphlet_counts) }) context("Netdis: Geometric Poisson Approximation") test_that("netdis_one_to_one gives expected result when using geometric Poisson approximation", { - # TODO This test is not robust. Rewrite with basic network that gives known - # result. - # Set source directory for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") @@ -1363,6 +1372,8 @@ test_that("netdis_one_to_one gives expected result when using geometric Poisson min_ego_edges <- 1 # manually verified result for graphlets of size 4 + # verified using a different implementation of geometric poisson with these + # networks. expected_netdis4 <- 0.1892716 # check function to test @@ -1380,7 +1391,7 @@ test_that("netdis_one_to_one gives expected result when using geometric Poisson min_ego_edges = min_ego_edges, bin_counts_fn = bin_counts_fn, exp_counts_fn = exp_counts_fn) - print(actual_netdis) + expect_equal(expected_netdis4, actual_netdis[["netdis4"]], tolerance = .0001, scale = 1) }) From 41944e32a5cd6d7ab9e64d056bb3ffb0c7dcd548 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 10 Mar 2020 15:15:28 +0000 Subject: [PATCH 073/188] Rename some long variables --- NAMESPACE | 3 +- R/graph_binning.R | 8 ++--- R/measures_net_dis.R | 29 +++++++------------ man/binned_densities_adaptive.Rd | 8 ++--- man/exp_counts_bin_gp.Rd | 9 ++++-- ...s_per_ego.Rd => netdis_expected_counts.Rd} | 8 ++--- ...ounts.Rd => netdis_expected_counts_ego.Rd} | 13 ++++----- man/netdis_many_to_many.Rd | 6 ++-- man/netdis_one_to_many.Rd | 6 ++-- man/netdis_one_to_one.Rd | 6 ++-- man/scale_graphlet_counts_ego.Rd | 2 -- tests/testthat/test_measures_net_dis.R | 18 ++++++------ vignettes/netdis_2graphs_polya-aeppli.Rmd | 4 +-- vignettes/netdis_customisations.Rmd | 4 +-- vignettes/quickstart_netdis_2graphs.Rmd | 4 +-- 15 files changed, 57 insertions(+), 71 deletions(-) rename man/{netdis_expected_graphlet_counts_per_ego.Rd => netdis_expected_counts.Rd} (83%) rename man/{netdis_expected_graphlet_counts.Rd => netdis_expected_counts_ego.Rd} (78%) diff --git a/NAMESPACE b/NAMESPACE index fcd4939d..3c26f728 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,8 +52,7 @@ export(net_emd) export(net_emds_for_all_graphs) export(netdis) export(netdis_centred_graphlet_counts) -export(netdis_expected_graphlet_counts) -export(netdis_expected_graphlet_counts_per_ego) +export(netdis_expected_counts) export(netdis_many_to_many) export(netdis_one_to_many) export(netdis_one_to_one) diff --git a/R/graph_binning.R b/R/graph_binning.R index cf38bde9..bd51f7bb 100644 --- a/R/graph_binning.R +++ b/R/graph_binning.R @@ -1,9 +1,7 @@ -#' INTERNAL FUNCTION - Do not call directly +#' binned_densities_adaptive #' -#' Used by \code{netdis_expected_graphlet_counts_ego_fn} to -#' generate a function for calculating expected ego-network graphlet counts -#' from the statistics of a provided reference graph. -#' Temporarily accessible during development. +#' Adaptive binning function guaranteeing a minimum number of entries in each +#' bin. #' @param densities Density values to use for binning. #' @param min_counts_per_interval Minimum count for each bin. #' @param num_intervals Initial number of density bins to generate. diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index ca0c440b..de26a7ef 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -39,7 +39,7 @@ #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -#' (Default: \code{netdis_expected_graphlet_counts_per_ego} with +#' (Default: \code{netdis_expected_counts} with #' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in #' the original netdis paper). #' @@ -71,7 +71,7 @@ netdis_one_to_one <- function(graph_1 = NULL, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial( - netdis_expected_graphlet_counts_per_ego, + netdis_expected_counts, scale_fn = count_graphlet_tuples), graphlet_counts_1 = NULL, graphlet_counts_2 = NULL) { @@ -179,7 +179,7 @@ netdis_one_to_one <- function(graph_1 = NULL, #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -#' (Default: \code{netdis_expected_graphlet_counts_per_ego} with +#' (Default: \code{netdis_expected_counts} with #' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in #' the original netdis paper). #' @@ -210,7 +210,7 @@ netdis_one_to_many <- function(graph_1 = NULL, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial( - netdis_expected_graphlet_counts_per_ego, + netdis_expected_counts, scale_fn = count_graphlet_tuples), graphlet_counts_1 = NULL, graphlet_counts_compare = NULL) { @@ -324,7 +324,7 @@ netdis_one_to_many <- function(graph_1 = NULL, #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -#' (Default: \code{netdis_expected_graphlet_counts_per_ego} with +#' (Default: \code{netdis_expected_counts} with #' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in #' the original netdis paper). #' @@ -362,7 +362,7 @@ netdis_many_to_many <- function(graphs = NULL, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), exp_counts_fn = purrr::partial( - netdis_expected_graphlet_counts_per_ego, + netdis_expected_counts, scale_fn = count_graphlet_tuples), graphlet_counts = NULL, graphlet_counts_ref = NULL) { @@ -696,9 +696,6 @@ netdis_centred_graphlet_counts <- function( #' nummber of ego networks (rows). #' @param max_graphlet_size Do the subtraction for graphlets up to this size. #' -#' #' Temporarily accessible during development. -#' TODO: Remove @export prior to publishing -#' @export netdis_subtract_exp_counts <- function( graphlet_counts, exp_graphlet_counts, @@ -714,7 +711,7 @@ netdis_subtract_exp_counts <- function( } -#' netdis_expected_graphlet_counts_per_ego +#' netdis_expected_counts #' #' Calculates expected graphlet counts for each ego network based on its density #' and pre-calculated reference density bins and graphlet counts for each bin. @@ -731,10 +728,8 @@ netdis_subtract_exp_counts <- function( #' and returning a scale factor that the looked up #' \code{density_binned_reference_counts} values will be multiplied by. #' -#' #' Temporarily accessible during development. -#' TODO: Remove @export prior to publishing #' @export -netdis_expected_graphlet_counts_per_ego <- function( +netdis_expected_counts <- function( graphlet_counts, density_breaks, density_binned_reference_counts, @@ -745,7 +740,7 @@ netdis_expected_graphlet_counts_per_ego <- function( # Map over query graph ego-networks, using reference graph statistics to # calculate expected graphlet counts for each ego-network. expected_graphlet_counts <- t(apply( - graphlet_counts, 1, netdis_expected_graphlet_counts, + graphlet_counts, 1, netdis_expected_counts_ego, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, density_binned_reference_counts = density_binned_reference_counts, @@ -754,7 +749,7 @@ netdis_expected_graphlet_counts_per_ego <- function( expected_graphlet_counts } -#' netdis_expected_graphlet_counts +#' netdis_expected_counts_ego #' INTERNAL FUNCTION - Do not call directly #' #' Calculates expected graphlet counts for one ego network based on its density @@ -771,7 +766,7 @@ netdis_expected_graphlet_counts_per_ego <- function( #' returning a scale factor that the looked up #' \code{density_binned_reference_counts} values will be multiplied by. #' -netdis_expected_graphlet_counts <- function(graphlet_counts, +netdis_expected_counts_ego <- function(graphlet_counts, max_graphlet_size, density_breaks, density_binned_reference_counts, @@ -970,8 +965,6 @@ netdis_const_expected_counts <- function(graphlet_counts, const) { #' \code{scale_graphlet_count} to prevent divide by #' zero errors. #' @param v A vector. -#' TODO remove export -#' @export zeros_to_ones <- function(v) { zero_index <- which(v == 0) v[zero_index] <- 1 diff --git a/man/binned_densities_adaptive.Rd b/man/binned_densities_adaptive.Rd index d33cbc25..82ea2638 100644 --- a/man/binned_densities_adaptive.Rd +++ b/man/binned_densities_adaptive.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/graph_binning.R \name{binned_densities_adaptive} \alias{binned_densities_adaptive} -\title{INTERNAL FUNCTION - Do not call directly} +\title{binned_densities_adaptive} \usage{ binned_densities_adaptive(densities, min_counts_per_interval, num_intervals) @@ -16,8 +16,6 @@ binned_densities_adaptive(densities, min_counts_per_interval, TODO: Remove @export prior to publishing} } \description{ -Used by \code{netdis_expected_graphlet_counts_ego_fn} to -generate a function for calculating expected ego-network graphlet counts -from the statistics of a provided reference graph. -Temporarily accessible during development. +Adaptive binning function guaranteeing a minimum number of entries in each +bin. } diff --git a/man/exp_counts_bin_gp.Rd b/man/exp_counts_bin_gp.Rd index b1fa64ac..eaf5c2ce 100644 --- a/man/exp_counts_bin_gp.Rd +++ b/man/exp_counts_bin_gp.Rd @@ -8,15 +8,18 @@ Calculate expected counts with geometric poisson (Polya-Aeppli) approximation for a single density bin.} \usage{ exp_counts_bin_gp(bin_idx, graphlet_counts, density_interval_indexes, - mean_binned_graphlet_counts, max_graphlet_size) + max_graphlet_size) } \arguments{ \item{bin_idx}{Density bin index to calculate expected counts for.} \item{graphlet_counts}{Graphlet counts for a number of ego_networks.} -\item{density_interval_indexes}{Density bin index for -each ego network.} +\item{density_interval_indexes}{Density bin indexes for each ego network in +\code{graphlet_counts}.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets +included in graphlet_counts.} } \description{ INTERNAL FUNCTION - DO NOT CALL DIRECTLY diff --git a/man/netdis_expected_graphlet_counts_per_ego.Rd b/man/netdis_expected_counts.Rd similarity index 83% rename from man/netdis_expected_graphlet_counts_per_ego.Rd rename to man/netdis_expected_counts.Rd index 8c11906b..e3cabe27 100644 --- a/man/netdis_expected_graphlet_counts_per_ego.Rd +++ b/man/netdis_expected_counts.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/measures_net_dis.R -\name{netdis_expected_graphlet_counts_per_ego} -\alias{netdis_expected_graphlet_counts_per_ego} -\title{netdis_expected_graphlet_counts_per_ego} +\name{netdis_expected_counts} +\alias{netdis_expected_counts} +\title{netdis_expected_counts} \usage{ -netdis_expected_graphlet_counts_per_ego(graphlet_counts, density_breaks, +netdis_expected_counts(graphlet_counts, density_breaks, density_binned_reference_counts, max_graphlet_size, scale_fn = NULL) } \arguments{ diff --git a/man/netdis_expected_graphlet_counts.Rd b/man/netdis_expected_counts_ego.Rd similarity index 78% rename from man/netdis_expected_graphlet_counts.Rd rename to man/netdis_expected_counts_ego.Rd index 2cee4166..2f59d491 100644 --- a/man/netdis_expected_graphlet_counts.Rd +++ b/man/netdis_expected_counts_ego.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/measures_net_dis.R -\name{netdis_expected_graphlet_counts} -\alias{netdis_expected_graphlet_counts} -\title{netdis_expected_graphlet_counts +\name{netdis_expected_counts_ego} +\alias{netdis_expected_counts_ego} +\title{netdis_expected_counts_ego INTERNAL FUNCTION - Do not call directly} \usage{ -netdis_expected_graphlet_counts(graphlet_counts, max_graphlet_size, +netdis_expected_counts_ego(graphlet_counts, max_graphlet_size, density_breaks, density_binned_reference_counts, scale_fn = NULL) } \arguments{ @@ -22,10 +22,7 @@ each density bin.} \item{scale_fn}{Optional function to scale calculated expected counts, taking \code{graphlet_counts} and \code{max_graphlet_size} as arguments, and returning a scale factor that the looked up -\code{density_binned_reference_counts} values will be multiplied by. - -Temporarily accessible during development. -TODO: Remove @export prior to publishing} +\code{density_binned_reference_counts} values will be multiplied by.} } \description{ Calculates expected graphlet counts for one ego network based on its density diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd index 983dbafb..27d0f933 100644 --- a/man/netdis_many_to_many.Rd +++ b/man/netdis_many_to_many.Rd @@ -11,8 +11,8 @@ netdis_many_to_many(graphs = NULL, ref_graph = 0, min_counts_per_interval = 5, num_intervals = 100), bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples), graphlet_counts = NULL, + exp_counts_fn = purrr::partial(netdis_expected_counts, scale_fn = + count_graphlet_tuples), graphlet_counts = NULL, graphlet_counts_ref = NULL) } \arguments{ @@ -59,7 +59,7 @@ approach used in the original netdis paper).} expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -(Default: \code{netdis_expected_graphlet_counts_per_ego} with +(Default: \code{netdis_expected_counts} with \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in the original netdis paper).} diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd index 5386dc17..f52632d2 100644 --- a/man/netdis_one_to_many.Rd +++ b/man/netdis_one_to_many.Rd @@ -11,8 +11,8 @@ netdis_one_to_many(graph_1 = NULL, graphs_compare = NULL, min_counts_per_interval = 5, num_intervals = 100), bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples), graphlet_counts_1 = NULL, + exp_counts_fn = purrr::partial(netdis_expected_counts, scale_fn = + count_graphlet_tuples), graphlet_counts_1 = NULL, graphlet_counts_compare = NULL) } \arguments{ @@ -57,7 +57,7 @@ approach used in the original netdis paper).} expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -(Default: \code{netdis_expected_graphlet_counts_per_ego} with +(Default: \code{netdis_expected_counts} with \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in the original netdis paper).} diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd index 5b3a5c66..960e6dfa 100644 --- a/man/netdis_one_to_one.Rd +++ b/man/netdis_one_to_one.Rd @@ -11,8 +11,8 @@ netdis_one_to_one(graph_1 = NULL, graph_2 = NULL, ref_graph = 0, min_counts_per_interval = 5, num_intervals = 100), bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_graphlet_counts_per_ego, - scale_fn = count_graphlet_tuples), graphlet_counts_1 = NULL, + exp_counts_fn = purrr::partial(netdis_expected_counts, scale_fn = + count_graphlet_tuples), graphlet_counts_1 = NULL, graphlet_counts_2 = NULL) } \arguments{ @@ -55,7 +55,7 @@ approach used in the original netdis paper).} expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -(Default: \code{netdis_expected_graphlet_counts_per_ego} with +(Default: \code{netdis_expected_counts} with \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in the original netdis paper).} diff --git a/man/scale_graphlet_counts_ego.Rd b/man/scale_graphlet_counts_ego.Rd index 28ae9ffc..15f062a7 100644 --- a/man/scale_graphlet_counts_ego.Rd +++ b/man/scale_graphlet_counts_ego.Rd @@ -13,8 +13,6 @@ scale_graphlet_counts_ego(graphlet_counts, max_graphlet_size) \item{max_graphlet_size}{Determines the maximum size of graphlets included in graphlet_counts.} - -\item{ego_networks}{Pre-generated ego networks for an input graph.} } \value{ scaled graphlet counts. diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 3777680a..447d1b6d 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -747,7 +747,7 @@ test_that("density_binned_counts output matches manually verified totals with di }) context("Measures Netdis: Expected graphlet counts") -test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { +test_that("netdis_expected_counts_ego works for graphlets up to 4 nodes", { # Helper function to generate graphs with known density and number of nodes rand_graph <- function(num_nodes, density) { max_edges <- choose(num_nodes, 2) @@ -794,7 +794,7 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { expected_expected_graphlet_counts <- purrr::map(density_indexes, expected_expected_graphlet_counts_fn) actual_expected_graphlet_counts <- - purrr::map(graphlet_counts, netdis_expected_graphlet_counts, + purrr::map(graphlet_counts, netdis_expected_counts_ego, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, density_binned_reference_counts = scaled_reference_counts, @@ -822,7 +822,7 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { expected_expected_graphlet_counts <- purrr::map2(density_indexes, num_nodes, expected_expected_graphlet_counts_fn) actual_expected_graphlet_counts <- - purrr::map(graphlet_counts, netdis_expected_graphlet_counts, + purrr::map(graphlet_counts, netdis_expected_counts_ego, max_graphlet_size = max_graphlet_size, density_breaks = density_breaks, density_binned_reference_counts = scaled_reference_counts, @@ -840,7 +840,7 @@ test_that("netdis_expected_graphlet_counts works for graphlets up to 4 nodes", { } }) -test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 nodes", { +test_that("netdis_expected_counts works for graphlets up to 4 nodes", { # Helper function to generate graphs with known density and number of nodes # Set up a small sample network with at least one ego-network that contains # at least one of each graphlets @@ -943,7 +943,7 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n # Calculate actual output of function under test actual_expected_graphlet_counts_ego_o1 <- - netdis_expected_graphlet_counts_per_ego( + netdis_expected_counts( graphlet_counts_ego_o1, breaks, scaled_reference_counts, @@ -951,7 +951,7 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n scale_fn = count_graphlet_tuples ) actual_expected_graphlet_counts_ego_o2 <- - netdis_expected_graphlet_counts_per_ego( + netdis_expected_counts( graphlet_counts_ego_o2, breaks, scaled_reference_counts, @@ -997,7 +997,7 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n # Calculate actual output of function under test actual_expected_graphlet_counts_ego_o1 <- - netdis_expected_graphlet_counts_per_ego( + netdis_expected_counts( graphlet_counts_ego_o1, breaks, scaled_reference_counts, @@ -1005,7 +1005,7 @@ test_that("netdis_expected_graphlet_counts_per_ego works for graphlets up to 4 n scale_fn = NULL ) actual_expected_graphlet_counts_ego_o2 <- - netdis_expected_graphlet_counts_per_ego( + netdis_expected_counts( graphlet_counts_ego_o2, breaks, scaled_reference_counts, @@ -1379,7 +1379,7 @@ test_that("netdis_one_to_one gives expected result when using geometric Poisson # check function to test bin_counts_fn <- density_binned_counts_gp - exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego, + exp_counts_fn <- purrr::partial(netdis_expected_counts, scale_fn = NULL) actual_netdis <- netdis_one_to_one(graph_1, diff --git a/vignettes/netdis_2graphs_polya-aeppli.Rmd b/vignettes/netdis_2graphs_polya-aeppli.Rmd index 15ae3a49..ac421d69 100644 --- a/vignettes/netdis_2graphs_polya-aeppli.Rmd +++ b/vignettes/netdis_2graphs_polya-aeppli.Rmd @@ -170,14 +170,14 @@ binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, ## Centre graphlet counts of query graphs using binned expected counts ```{r} # Calculate expected graphlet counts for each ego network -exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_1, +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, ego_density_bins_1, binned_graphlet_counts_1, max_graphlet_size, scale_fn = NULL) -exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_2, +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, ego_density_bins_2, binned_graphlet_counts_2, max_graphlet_size, diff --git a/vignettes/netdis_customisations.Rmd b/vignettes/netdis_customisations.Rmd index 56baed73..0c1c724a 100644 --- a/vignettes/netdis_customisations.Rmd +++ b/vignettes/netdis_customisations.Rmd @@ -83,7 +83,7 @@ print(results$comp_spec) ```{r} bin_counts_fn <- density_binned_counts_gp -exp_counts_fn <- purrr::partial(netdis_expected_graphlet_counts_per_ego, +exp_counts_fn <- purrr::partial(netdis_expected_counts, scale_fn = NULL) # Calculate netdis statistics @@ -104,7 +104,7 @@ print(results$comp_spec) ```{r} binning_fn <- single_density_bin bin_counts_fn <- density_binned_counts -exp_counts_fn <- netdis_expected_graphlet_counts_per_ego +exp_counts_fn <- netdis_expected_counts # Calculate netdis statistics results <- netdis_many_to_many(graphs, diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index 0990ed15..51eb0330 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -109,14 +109,14 @@ ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( ## Centre graphlet counts of query graphs based on statistics of reference graph ```{r} # Calculate expected graphlet counts (using ref graph ego network density bins) -exp_graphlet_counts_1 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_1, +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, ref_ego_density_bins, ref_binned_graphlet_counts, max_graphlet_size, scale_fn=count_graphlet_tuples) -exp_graphlet_counts_2 <- netdis_expected_graphlet_counts_per_ego(graphlet_counts_2, +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, ref_ego_density_bins, ref_binned_graphlet_counts, max_graphlet_size, From b3941a452bb4cdae395d4d6b2d4c05ef8aafd4f8 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Thu, 12 Mar 2020 16:09:53 +0000 Subject: [PATCH 074/188] rebuild vignettes and docs --- NAMESPACE | 1 - R/measures_net_dis.R | 2 +- man/netdis_expected_counts.Rd | 5 +- man/netdis_subtract_exp_counts.Rd | 5 +- man/zeros_to_ones.Rd | 3 +- vignettes/dendrogram_example_net_dis.html | 4 +- vignettes/dendrogram_example_net_emd.html | 146 +++++++++++---------- vignettes/netdis_2graphs_polya-aeppli.R | 14 +- vignettes/netdis_2graphs_polya-aeppli.Rmd | 4 +- vignettes/netdis_2graphs_polya-aeppli.html | 134 +++++++++---------- vignettes/netdis_customisations.R | 4 +- vignettes/netdis_customisations.html | 12 +- vignettes/netdis_pairwise_comparisons.html | 4 +- vignettes/quickstart_netdis_2graphs.R | 4 +- vignettes/quickstart_netdis_2graphs.html | 8 +- 15 files changed, 174 insertions(+), 176 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 3c26f728..521414db 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -70,6 +70,5 @@ export(shift_dhist) export(simplify_graph) export(single_density_bin) export(sort_dhist) -export(zeros_to_ones) importFrom(Rcpp,sourceCpp) useDynLib(netdist, .registration=TRUE) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index de26a7ef..b2dfe6dc 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -695,7 +695,7 @@ netdis_centred_graphlet_counts <- function( #' @param exp_graphlet_counts Matrix of expected graphlet counts (columns) for a #' nummber of ego networks (rows). #' @param max_graphlet_size Do the subtraction for graphlets up to this size. -#' +#' @export netdis_subtract_exp_counts <- function( graphlet_counts, exp_graphlet_counts, diff --git a/man/netdis_expected_counts.Rd b/man/netdis_expected_counts.Rd index e3cabe27..e938fff9 100644 --- a/man/netdis_expected_counts.Rd +++ b/man/netdis_expected_counts.Rd @@ -22,10 +22,7 @@ Only graphlets containing up to \code{max_graphlet_size} nodes are counted.} \item{scale_fn}{Optional function to scale calculated expected counts, taking \code{graphlet_counts} and \code{max_graphlet_size} as arguments, and returning a scale factor that the looked up -\code{density_binned_reference_counts} values will be multiplied by. - -#' Temporarily accessible during development. -TODO: Remove @export prior to publishing} +\code{density_binned_reference_counts} values will be multiplied by.} } \description{ Calculates expected graphlet counts for each ego network based on its density diff --git a/man/netdis_subtract_exp_counts.Rd b/man/netdis_subtract_exp_counts.Rd index e6b59314..f8ba0e7c 100644 --- a/man/netdis_subtract_exp_counts.Rd +++ b/man/netdis_subtract_exp_counts.Rd @@ -14,10 +14,7 @@ nummber of ego networks (rows).} \item{exp_graphlet_counts}{Matrix of expected graphlet counts (columns) for a nummber of ego networks (rows).} -\item{max_graphlet_size}{Do the subtraction for graphlets up to this size. - -#' Temporarily accessible during development. -TODO: Remove @export prior to publishing} +\item{max_graphlet_size}{Do the subtraction for graphlets up to this size.} } \description{ Subtract expected graphlet counts from actual graphlet counts. diff --git a/man/zeros_to_ones.Rd b/man/zeros_to_ones.Rd index ca662211..a33206bf 100644 --- a/man/zeros_to_ones.Rd +++ b/man/zeros_to_ones.Rd @@ -9,8 +9,7 @@ zero errors.} zeros_to_ones(v) } \arguments{ -\item{v}{A vector. -TODO remove export} +\item{v}{A vector.} } \description{ Replace zero values in a vector with ones. Used by diff --git a/vignettes/dendrogram_example_net_dis.html b/vignettes/dendrogram_example_net_dis.html index 5f7c9f70..32cbc351 100644 --- a/vignettes/dendrogram_example_net_dis.html +++ b/vignettes/dendrogram_example_net_dis.html @@ -12,7 +12,7 @@ - + Dendrogram example for Netdis @@ -305,7 +305,7 @@

Dendrogram example for Netdis

Martin O’Reilly

-

2019-10-11

+

2020-03-12

diff --git a/vignettes/dendrogram_example_net_emd.html b/vignettes/dendrogram_example_net_emd.html index 8228129c..4d695791 100644 --- a/vignettes/dendrogram_example_net_emd.html +++ b/vignettes/dendrogram_example_net_emd.html @@ -12,7 +12,7 @@ - + Dendrogram example for NetEMD @@ -20,9 +20,9 @@ + + + + + + + + + + + +

Netdis Vignette’s Menu

+

Luis Ospina-Forero

+

10-06-2020

+ + + +
+

Netdis introductory Vignettes

+

This package contains multiple vignettes that illustrate the simple ways of employing the network comparison methods ‘Netdis’, ‘NetEmd’ and their variants (e.g. `Netdis Geometric-Poisson’), and also the large flexibility of these methodologies. The following is a list of the available vignettes:

+ +
+ + + + + + + + + + + diff --git a/doc/default_pairwise_usage.R b/doc/default_pairwise_usage.R new file mode 100644 index 00000000..5faef82e --- /dev/null +++ b/doc/default_pairwise_usage.R @@ -0,0 +1,89 @@ +## ---- include = FALSE--------------------------------------------------------- +knitr::opts_chunk$set( +collapse = TRUE, +comment = "#>" +) + +## ---- packages, message= FALSE------------------------------------------------ +# Load packages/libraries +library("netdist") +library("igraph") + +## ---- graphs,fig.align='center'----------------------------------------------- +# Set source directory for Virus PPI graph edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +print(source_dir) + +# Load query graphs as igraph objects +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +graph_1 + +# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +graph_2 + +#A simple visualization of the graphs. +plot(graph_1,vertex.size=0.5,vertex.label=NA) +plot(graph_2,vertex.size=0.5,vertex.label=NA) + +## ---- netemd,fig.align='center'----------------------------------------------- +# Set source directory for Virus PPI graph edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as igraph objects +# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +#The one to one network comparison. +netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5) + +## ---- netemdEigen,fig.align='center'------------------------------------------ +#Laplacian +Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE) +Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE) + +#Normalized Laplacian +NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE) +NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE) + +#Spectra (This may take a couple of minutes). +props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) +props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) + +netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0) + +## ----netdisgoldstand,fig.align='center'--------------------------------------- +# Lattice graphs tto be used as reference point comparison. +goldstd_1 <- igraph::graph.lattice(c(round(sqrt(igraph::vcount(graph_1))),round(sqrt(igraph::vcount(graph_1))))) +goldstd_2 <- igraph::graph.lattice(c(round(sqrt(igraph::vcount(graph_2))),round(sqrt(igraph::vcount(graph_2))))) + +plot(goldstd_1,vertex.size=0.8,vertex.label=NA) +plot(goldstd_2,vertex.size=0.5,vertex.label=NA) + + +# Netdis using the er_goldstd_1 graph as gold standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) + +# Netdis using the er_goldstd_2 graph as gold standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) + +## ---- netdisGP---------------------------------------------------------------- +#Netdis using the Geometric Poisson approximation for the background subgraph expectations. +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) + +## ----netdiszero--------------------------------------------------------------- +#Netdis using no expecations (or equivalently, expectation equal to zero). +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) + + diff --git a/doc/default_pairwise_usage.Rmd b/doc/default_pairwise_usage.Rmd new file mode 100644 index 00000000..c4caeb33 --- /dev/null +++ b/doc/default_pairwise_usage.Rmd @@ -0,0 +1,215 @@ +--- +title: "Default pairwise usage of the network comparison methods" +date: "`10-06-2020`" +author: "Luis Ospina-Forero" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{default_pairwise_usage} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( +collapse = TRUE, +comment = "#>" +) +``` + +# Introduction + +The \code{netdist} package currently considers to broad methodologies for network comparison, namely Netdis and NetEmd. Netdis considers multiple variants (via background expectations) to capture the dissimilarity between the local structure of networks exhibited by the occurrence of small subgraphs. NetEmd is also a method to capture the dissimilarity between networks using subgraph counts, but it has also been defined for any type of network features that the user defines; for example eigen distributions. The variantes of Netdis are controlled by the input selected for the background expectations, whereas the variants of NetEmd are controlled directly by the user in the selection of the network features being compared (by default this pacakge uses subgraph counts). + +The following shows a quick introduction to the most simple functions of the package, and to some of the variants of Netdis and NetEmd. + +# Load required packages/libraries +```{r, packages, message= FALSE} +# Load packages/libraries +library("netdist") +library("igraph") +``` + +## Load graphs included in the netdist package +```{r, graphs,fig.align='center'} +# Set source directory for Virus PPI graph edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +print(source_dir) + +# Load query graphs as igraph objects +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +graph_1 + +# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +graph_2 + +#A simple visualization of the graphs. +plot(graph_1,vertex.size=0.5,vertex.label=NA) +plot(graph_2,vertex.size=0.5,vertex.label=NA) +``` + + + +# Compare two networks via NetEmd. + +## What is NetEmd? +(Extracted from Wegner et al. (2017)): +NetEmd is based on the idea that the information encapsulated in the shape of the degree distribution and other network properties reflects the topological organization of the network. From an abstract point of view NetEmd views the shape of a distribution as a property that is invariant under linear deformations i.e$.$ translations and re-scalings of the axis. + +Based on these NetEmd uses the following measure between distributions $p$ and $q$ that are supported on $\mathbb{R}$ and have non-zero, finite variances: +\begin{equation}\label{emdmet} +EMD^*(p,q)=\mathrm{inf}_{c\in\mathbb{R}}\left( EMD\big(\tilde{p}(\cdot+c),\tilde{q}(\cdot)\big)\right), +\end{equation} +where $EMD$ is the earth mover's distance and $\tilde{p}$ and $\tilde{q}$ are the distributions obtained by rescaling $p$ and $q$ to have variance 1. More precisely, $\tilde{p}$ is the distribution obtained from $p$ by the transformation $x\rightarrow \frac{x}{\sigma(p)}$, where $\sigma(p)$ is the standard deviation of $p$. For probability distributions $p$ and $q$ with support in $\mathbb{R}$ and bounded absolute first moment, the $EMD$ between $p$ and $q$ is given by $EMD(p,q)=\int_{-\infty}^\infty|F(x)-G(x)|\,\mathrm{d}x$, where $F$ and $G$ are the cumulative distribution functions of $p$ and $q$ respectively. + +Now, for two networks $G$ and $G'$ and for a given set $T=\{t_1,t_2,...,t_m\}$ of network features, the $NetEmd$ measure corresponding to $T$ is: +\begin{equation}\label{eq:def_netemd} +NetEmd_T(G,G')=\frac{1}{m}\sum_{j=1}^{m} NetEmd_{t_j} (G,G'), +\end{equation} +where +\begin{equation} +NetEmd_{t_i} (G,G')=EMD^*(p_{t_i}(G),p_{t_i}(G')), +\end{equation} +and where $p_{t_i}(G)$ and $p_{t_i}(G')$ are the distributions of ${t_i}$ on $G$ and $G'$ respectively. $NetEmd_{t_i}$ can be shown to be a pseudometric between graphs for any feature $t$, that is it is non-negative, symmetric and satisfies the triangle inequality. + + +## Comparing two graphs with NetEmd. +```{r, netemd,fig.align='center'} +# Set source directory for Virus PPI graph edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as igraph objects +# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +#The one to one network comparison. +netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5) +``` + +## Comparing two graphs with NetEmd via their Laplacian spectrum. +```{r, netemdEigen,fig.align='center'} +#Laplacian +Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE) +Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE) + +#Normalized Laplacian +NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE) +NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE) + +#Spectra (This may take a couple of minutes). +props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) +props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) + +netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0) +``` +------------------------- + +# Compare two networks via Netdis and its variants. + +## What is Netdis? +(Extracted from Ali et al. (2014)): Netdis counts small subgraphs $w$ on $k$ nodes for all 2-step ego-networks, $k=3,4,5$. These counts are centred by subtracting the expected number of counts $E_w$. The centred counts between the networks are compared to form the Netdis statistic. + +Netdis is constructed as follows: + +Let $N_{w,i}(G)$ be the number of induced occurrences of small graphs $w$ in the 2-step ego network of vertex $i$. Now, bin all 2-step ego-networks of network $G$ according to their network density. Let $E_w(G,d)$ be the expected number of occurrences of $w$ in an ego-network whose density falls in density bin $d$. For a given network $G$ compute the centred subgraph counts as +\[ +S_w(G)=\sum\limits_{i }{\bigg (N_{w,i}(G)- E_w(G, \rho(i)) \bigg )}, +\] +where $i$ is a node in $G$ and $\rho(i)$ the density bin of the 2-step ego-network of node $i$. + +Now, to compare networks $G_1$ and $G_2$, set +$$ +\displaystyle +netD_2^S(k) = \tfrac{1}{ \sqrt{ M(k)} } \sum\limits_{w \in A(k)} +\bigg ({ \tfrac{S_w(G_1) S_w(G_2)} {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +}\bigg ), \quad k=3,4, 5, +$$ +where $A(k)$ is the set of connected subgraphs of size $k$, and where $M(k)$ is a normalising constant so that $netD_2^S(k)\in[-1,1]$. $M(k)$ is equal to +\[ +M(k) = \sum\limits_{w \in A(k)} +\left( \tfrac{ S_w(G_1)^2 }{\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} \right) +\sum\limits_{w \in A(k)} +\left(\tfrac{ S_w(G_2)^2 } {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +\right) +. +\] +The corresponding Netdis statistic is defined as +$$Netdis(k)=netd_2^S(k)=\tfrac{1}{2}(1-netD_2^S(k)) \in [0,1].$$ +Small values of Netdis suggest higher `similarity' between the networks. By default Netdis uses subgraphs on $k=4$ nodes. + + +## Using netdis with an gold standard graph as $E_w$. +The use of gold standard graph as a substitute for $E_w$ can be used when such graph is known to be a good proxy for $E_w$, or alternatively a good reference point of comparison. + +```{r,netdisgoldstand,fig.align='center'} +# Lattice graphs tto be used as reference point comparison. +goldstd_1 <- igraph::graph.lattice(c(round(sqrt(igraph::vcount(graph_1))),round(sqrt(igraph::vcount(graph_1))))) +goldstd_2 <- igraph::graph.lattice(c(round(sqrt(igraph::vcount(graph_2))),round(sqrt(igraph::vcount(graph_2))))) + +plot(goldstd_1,vertex.size=0.8,vertex.label=NA) +plot(goldstd_2,vertex.size=0.5,vertex.label=NA) + + +# Netdis using the er_goldstd_1 graph as gold standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) + +# Netdis using the er_goldstd_2 graph as gold standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) +``` + +## Netdis-GP: Using a Geometric Poisson approximation + +(Extracted from Ospina-Forero et al. (2018)): Instead of considering an approximation based on an observed gold-standard network whose selection may be difficult, $E_w$ is computed, independently for each graph, based on a geometric Poisson (GP) approximation for the distribution of the number of occurrences of subgraph $w$. It is assumes that $N_{w,i} \sim GP(\lambda^{\rho(i)}_k, \theta^{\rho(i)}_w)$, where $\lambda^{\rho(i)}_k$ is the Poisson parameter indexed by the size of subgraph $w$ and the density bin $\rho(i)$; and where $\theta^{\rho(i)}_w$ is the geometric parameter indexed by subgraph $w$ and density bin $\rho(i)$. $E_w(G, \rho(i))$ is taken as the mean of the GP approximation, i.e. $\lambda^{\rho(i)}_k/\theta^{\rho(i)}_w$. + +As $\lambda^{\rho(i)}_k$ and $\theta^{\rho(i)}_w$ are not known, they are estimated as follows: +Let $x_{w,d}^j$ be the number of subgraphs $w$ on the 2-step ego-network $j$ of density bin $d$, and let +\[ +\bar{X}_{w,d}=\frac{1}{q} \sum_{j=1}^q x_{w,d}^j, \qquad V^2_{w,d}=\frac{1}{q-1} \sum_{j=1}^q (x_{w,d}^j - \bar{X}_{w,d})^2 +, +\] +where $q$ is the number of ego-networks in density bin $d$. Then, +\[ +\hat{\lambda}^{d}_{k}= \frac{1}{l} \sum_{h \in A(k)} \frac{2 (\bar{X}_{h,d})^2}{V^2_{h,d}+\bar{X}_{h,d}} , \qquad \hat{\theta}^{d}_w= \frac{2\bar{X}_{w,d}}{V^2_{w,d}+\bar{X}_{w,d}}, +\] +where $l$ is the number of connected subgraphs of size $k$, for example, $l=6$ for $k=4$. These estimators are based on the moment estimators of a GP random variable and the proposal made by (Picard et al.(2008)), where the total count of each individual subgraph could be thought as the sum of the total subgraph counts over multiple ``clumps'' of edges that appear across the network. + +```{r, netdisGP} +#Netdis using the Geometric Poisson approximation for the background subgraph expectations. +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) +``` + + +## Using Netdis with no expectation ($E_w=0$) +```{r,netdiszero} +#Netdis using no expecations (or equivalently, expectation equal to zero). +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) + +``` + +------------------------- + + + + + + +# Bibliography + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* L. Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. + +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008. \ No newline at end of file diff --git a/doc/default_pairwise_usage.html b/doc/default_pairwise_usage.html new file mode 100644 index 00000000..8a4ed507 --- /dev/null +++ b/doc/default_pairwise_usage.html @@ -0,0 +1,529 @@ + + + + + + + + + + + + + + + +Default pairwise usage of the network comparison methods + + + + + + + + + + + + + + + + + + + + + + +

Default pairwise usage of the network comparison methods

+

Luis Ospina-Forero

+

10-06-2020

+ + + +
+

Introduction

+

The package currently considers to broad methodologies for network comparison, namely Netdis and NetEmd. Netdis considers multiple variants (via background expectations) to capture the dissimilarity between the local structure of networks exhibited by the occurrence of small subgraphs. NetEmd is also a method to capture the dissimilarity between networks using subgraph counts, but it has also been defined for any type of network features that the user defines; for example eigen distributions. The variantes of Netdis are controlled by the input selected for the background expectations, whereas the variants of NetEmd are controlled directly by the user in the selection of the network features being compared (by default this pacakge uses subgraph counts).

+

The following shows a quick introduction to the most simple functions of the package, and to some of the variants of Netdis and NetEmd.

+
+
+

Load required packages/libraries

+
# Load packages/libraries
+library("netdist")
+library("igraph")
+
+

Load graphs included in the netdist package

+
# Set source directory for Virus PPI graph edge files stored in the netdist package.
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+print(source_dir)
+#> [1] "/private/var/folders/bv/2vl4fwcn3_x_yvcszq53645h0000gp/T/RtmpRljMhU/temp_libpatha2174f842f1/netdist/extdata/VRPINS"
+
+# Load query graphs as igraph objects
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges.
+graph_1
+#> IGRAPH 90584f3 UN-- 60 208 -- 
+#> + attr: name (v/c)
+#> + edges from 90584f3 (vertex names):
+#>  [1] A73  --BALF3 A73  --BARF0 A73  --BBLF2 A73  --BDRF1 A73  --BFRF4
+#>  [6] A73  --BGLF2 A73  --BGLF3 A73  --BGLF5 A73  --BLLF2 A73  --BTRF1
+#> [11] BALF3--BBLF2 BALF3--BDRF1 BALF3--BFRF4 BALF3--BGLF5 BALF3--BTRF1
+#> [16] BALF3--BALF1 BALF3--BALF2 BALF3--BORF1 BALF3--BALF4 BALF3--BFLF2
+#> [21] BALF3--BPLF1 BALF3--BALF5 BALF3--BBLF4 BALF3--BDLF2 BALF3--BdRF1
+#> [26] BALF3--BERF3 BALF3--BHRF1 BALF3--LMP2A BARF0--BBLF2 BARF0--BFRF4
+#> [31] BARF0--BSRF1 BARF0--BALF4 BARF0--BPLF1 BARF0--BALF5 BARF0--BDLF2
+#> [36] BARF0--BdRF1 BARF0--BERF3 BARF0--BGLF1 BARF0--LMP2A BBLF2--BDRF1
+#> + ... omitted several edges
+
+# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges.
+graph_2
+#> IGRAPH 39bbb71 UN-- 1941 3989 -- 
+#> + attr: name (v/c)
+#> + edges from 39bbb71 (vertex names):
+#>  [1] B1882--B1888 B1882--B1945 B1882--B1946 B1882--B1886 B1882--B1887
+#>  [6] B1882--B1939 B1882--B1938 B1882--B1884 B1882--B1883 B1882--B3210
+#> [11] B1882--B1881 B1882--B4355 B1882--B1922 B1882--B1890 B1882--B1889
+#> [16] B1888--B1886 B1888--B1887 B1888--B1884 B1888--B1883 B1888--B1881
+#> [21] B1888--B4355 B1888--B1890 B1888--B1889 B1888--B1421 B1888--B3072
+#> [26] B1888--B1885 B0728--B0729 B0728--B0724 B0728--B0726 B0728--B0727
+#> [31] B0729--B0724 B0729--B3734 B0729--B0726 B0729--B0727 B0729--B0720
+#> [36] B0729--B3236 B1812--B3360 B1812--B1260 B1812--B1261 B1812--B1263
+#> + ... omitted several edges
+
+#A simple visualization of the graphs.
+plot(graph_1,vertex.size=0.5,vertex.label=NA)
+

+
plot(graph_2,vertex.size=0.5,vertex.label=NA)
+

+
+
+
+

Compare two networks via NetEmd.

+
+

What is NetEmd?

+

(Extracted from Wegner et al. (2017)): NetEmd is based on the idea that the information encapsulated in the shape of the degree distribution and other network properties reflects the topological organization of the network. From an abstract point of view NetEmd views the shape of a distribution as a property that is invariant under linear deformations i.e\(.\) translations and re-scalings of the axis.

+

Based on these NetEmd uses the following measure between distributions \(p\) and \(q\) that are supported on \(\mathbb{R}\) and have non-zero, finite variances: \[\begin{equation}\label{emdmet} +EMD^*(p,q)=\mathrm{inf}_{c\in\mathbb{R}}\left( EMD\big(\tilde{p}(\cdot+c),\tilde{q}(\cdot)\big)\right), +\end{equation}\] where \(EMD\) is the earth mover’s distance and \(\tilde{p}\) and \(\tilde{q}\) are the distributions obtained by rescaling \(p\) and \(q\) to have variance 1. More precisely, \(\tilde{p}\) is the distribution obtained from \(p\) by the transformation \(x\rightarrow \frac{x}{\sigma(p)}\), where \(\sigma(p)\) is the standard deviation of \(p\). For probability distributions \(p\) and \(q\) with support in \(\mathbb{R}\) and bounded absolute first moment, the \(EMD\) between \(p\) and \(q\) is given by \(EMD(p,q)=\int_{-\infty}^\infty|F(x)-G(x)|\,\mathrm{d}x\), where \(F\) and \(G\) are the cumulative distribution functions of \(p\) and \(q\) respectively.

+

Now, for two networks \(G\) and \(G'\) and for a given set \(T=\{t_1,t_2,...,t_m\}\) of network features, the \(NetEmd\) measure corresponding to \(T\) is: \[\begin{equation}\label{eq:def_netemd} +NetEmd_T(G,G')=\frac{1}{m}\sum_{j=1}^{m} NetEmd_{t_j} (G,G'), +\end{equation}\] where \[\begin{equation} +NetEmd_{t_i} (G,G')=EMD^*(p_{t_i}(G),p_{t_i}(G')), +\end{equation}\] and where \(p_{t_i}(G)\) and \(p_{t_i}(G')\) are the distributions of \({t_i}\) on \(G\) and \(G'\) respectively. \(NetEmd_{t_i}\) can be shown to be a pseudometric between graphs for any feature \(t\), that is it is non-negative, symmetric and satisfies the triangle inequality.

+
+
+

Comparing two graphs with NetEmd.

+
# Set source directory for Virus PPI graph edge files stored in the netdist package.
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+# Load query graphs as igraph objects
+# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges.
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges.
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+#The one to one network comparison.
+netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5)
+#> [1] 0.5174161
+
+
+

Comparing two graphs with NetEmd via their Laplacian spectrum.

+
#Laplacian
+Lapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = FALSE,sparse = FALSE)
+Lapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = FALSE,sparse = FALSE)
+
+#Normalized Laplacian
+NLapg_1 <- igraph::laplacian_matrix(graph = graph_1,normalized = TRUE,sparse = FALSE)
+NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = FALSE)
+
+#Spectra (This may take a couple of minutes).
+props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) 
+props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) 
+
+netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)
+#> [1] 0.1818771
+
+
+
+
+

Compare two networks via Netdis and its variants.

+
+

What is Netdis?

+

(Extracted from Ali et al. (2014)): Netdis counts small subgraphs \(w\) on \(k\) nodes for all 2-step ego-networks, \(k=3,4,5\). These counts are centred by subtracting the expected number of counts \(E_w\). The centred counts between the networks are compared to form the Netdis statistic.

+

Netdis is constructed as follows:

+

Let \(N_{w,i}(G)\) be the number of induced occurrences of small graphs \(w\) in the 2-step ego network of vertex \(i\). Now, bin all 2-step ego-networks of network \(G\) according to their network density. Let \(E_w(G,d)\) be the expected number of occurrences of \(w\) in an ego-network whose density falls in density bin \(d\). For a given network \(G\) compute the centred subgraph counts as \[ +S_w(G)=\sum\limits_{i }{\bigg (N_{w,i}(G)- E_w(G, \rho(i)) \bigg )}, +\] where \(i\) is a node in \(G\) and \(\rho(i)\) the density bin of the 2-step ego-network of node \(i\).

+

Now, to compare networks \(G_1\) and \(G_2\), set \[ +\displaystyle +netD_2^S(k) = \tfrac{1}{ \sqrt{ M(k)} } \sum\limits_{w \in A(k)} +\bigg ({ \tfrac{S_w(G_1) S_w(G_2)} {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +}\bigg ), \quad k=3,4, 5, +\] where \(A(k)\) is the set of connected subgraphs of size \(k\), and where \(M(k)\) is a normalising constant so that \(netD_2^S(k)\in[-1,1]\). \(M(k)\) is equal to \[ +M(k) = \sum\limits_{w \in A(k)} +\left( \tfrac{ S_w(G_1)^2 }{\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} \right) +\sum\limits_{w \in A(k)} +\left(\tfrac{ S_w(G_2)^2 } {\sqrt{S_w(G_1)^2 + S_w(G_2)^2}} +\right) +. +\] The corresponding Netdis statistic is defined as \[Netdis(k)=netd_2^S(k)=\tfrac{1}{2}(1-netD_2^S(k)) \in [0,1].\] Small values of Netdis suggest higher `similarity’ between the networks. By default Netdis uses subgraphs on \(k=4\) nodes.

+
+
+

Using netdis with an gold standard graph as \(E_w\).

+

The use of gold standard graph as a substitute for \(E_w\) can be used when such graph is known to be a good proxy for \(E_w\), or alternatively a good reference point of comparison.

+
# Lattice graphs tto be used as reference point comparison.
+goldstd_1 <- igraph::graph.lattice(c(round(sqrt(igraph::vcount(graph_1))),round(sqrt(igraph::vcount(graph_1))))) 
+goldstd_2 <- igraph::graph.lattice(c(round(sqrt(igraph::vcount(graph_2))),round(sqrt(igraph::vcount(graph_2))))) 
+
+plot(goldstd_1,vertex.size=0.8,vertex.label=NA)
+

+
plot(goldstd_2,vertex.size=0.5,vertex.label=NA)
+

+

+
+# Netdis using the er_goldstd_1 graph as gold standard reference point
+netdis_one_to_one(graph_1= graph_1, graph_2= graph_2,  ref_graph = goldstd_1)
+#>   netdis3   netdis4 
+#> 0.1422771 0.2517043
+
+# Netdis using the er_goldstd_2 graph as gold standard reference point
+netdis_one_to_one(graph_1= graph_1, graph_2= graph_2,  ref_graph = goldstd_2)
+#>   netdis3   netdis4 
+#> 0.1401654 0.2505384
+
+
+

Netdis-GP: Using a Geometric Poisson approximation

+

(Extracted from Ospina-Forero et al. (2018)): Instead of considering an approximation based on an observed gold-standard network whose selection may be difficult, \(E_w\) is computed, independently for each graph, based on a geometric Poisson (GP) approximation for the distribution of the number of occurrences of subgraph \(w\). It is assumes that \(N_{w,i} \sim GP(\lambda^{\rho(i)}_k, \theta^{\rho(i)}_w)\), where \(\lambda^{\rho(i)}_k\) is the Poisson parameter indexed by the size of subgraph \(w\) and the density bin \(\rho(i)\); and where \(\theta^{\rho(i)}_w\) is the geometric parameter indexed by subgraph \(w\) and density bin \(\rho(i)\). \(E_w(G, \rho(i))\) is taken as the mean of the GP approximation, i.e. \(\lambda^{\rho(i)}_k/\theta^{\rho(i)}_w\).

+

As \(\lambda^{\rho(i)}_k\) and \(\theta^{\rho(i)}_w\) are not known, they are estimated as follows: Let \(x_{w,d}^j\) be the number of subgraphs \(w\) on the 2-step ego-network \(j\) of density bin \(d\), and let \[ +\bar{X}_{w,d}=\frac{1}{q} \sum_{j=1}^q x_{w,d}^j, \qquad V^2_{w,d}=\frac{1}{q-1} \sum_{j=1}^q (x_{w,d}^j - \bar{X}_{w,d})^2 +, +\] where \(q\) is the number of ego-networks in density bin \(d\). Then, \[ +\hat{\lambda}^{d}_{k}= \frac{1}{l} \sum_{h \in A(k)} \frac{2 (\bar{X}_{h,d})^2}{V^2_{h,d}+\bar{X}_{h,d}} , \qquad \hat{\theta}^{d}_w= \frac{2\bar{X}_{w,d}}{V^2_{w,d}+\bar{X}_{w,d}}, +\] where \(l\) is the number of connected subgraphs of size \(k\), for example, \(l=6\) for \(k=4\). These estimators are based on the moment estimators of a GP random variable and the proposal made by (Picard et al.(2008)), where the total count of each individual subgraph could be thought as the sum of the total subgraph counts over multiple ``clumps’’ of edges that appear across the network.

+
#Netdis using the Geometric Poisson approximation for the background subgraph expectations.
+netdis_one_to_one(graph_1= graph_1, graph_2= graph_2,  ref_graph = NULL)
+#>    netdis3    netdis4 
+#> 0.03536235 0.38972699
+
+
+

Using Netdis with no expectation (\(E_w=0\))

+
#Netdis using no expecations (or equivalently, expectation equal to zero).
+netdis_one_to_one(graph_1= graph_1, graph_2= graph_2,  ref_graph = 0)
+#>    netdis3    netdis4 
+#> 0.00761545 0.02106628
+
+
+
+
+

Bibliography

+
    +
  • W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014.

  • +
  • L. Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018.

  • +
  • A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017.

  • +
  • F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008.

  • +
+
+ + + + + + + + + + + diff --git a/doc/dendrogram_example_net_dis.R b/doc/dendrogram_example_net_dis.R new file mode 100644 index 00000000..17c93627 --- /dev/null +++ b/doc/dendrogram_example_net_dis.R @@ -0,0 +1,87 @@ +## ----------------------------------------------------------------------------- +library("netdist") +edge_format = "ncol" +# Load reference graph (used for Netdis. Not required for NetEMD) +ref_path = file.path(system.file(file.path("extdata", "random"), + package = "netdist"), + "ER_1250_10_1") +ref_graph <- read_simple_graph(ref_path, format = edge_format) + +# Set source directory and file properties for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), + package = "netdist") +edge_format <- "ncol" +file_pattern <- "*" + +# Load all graphs in the source folder matching the filename pattern +query_graphs <- read_simple_graphs(source_dir, + format = edge_format, + pattern = file_pattern) +print(names(query_graphs)) + +## ----------------------------------------------------------------------------- +# Set the maximum graphlet size to compute counts for +max_graphlet_size <- 4 +neighbourhood_size <- 2 + +## ----------------------------------------------------------------------------- + +# Calculate netdis measure for graphlets up to size max_graphlet_size +netdis_result <- netdis_many_to_many(query_graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size) + +# Netdis measure for graphlets of size 3 +res3 <- netdis_result$netdis["netdis3", ] +netdis3_mat <- cross_comp_to_matrix(res3, netdis_result$comp_spec) + +print("Netdis: graphlet size = 3") +print(netdis3_mat) + +# Netdis measure for graphlets of size 4 +res4 <- netdis_result$netdis["netdis4", ] +netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec) + +print("Netdis: graphlet size = 4") +print(netdis4_mat) + +## ----------------------------------------------------------------------------- +graphdists <- as.dist(netdis4_mat) +par(mfrow = c(1, 2)) +cex <- 1 + +# Dendrogram based on Netdis measure for graphlets of size 3 +title <- paste("Netdis: graphlet size = ", 3, sep = "") +plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) + +# Dendrogram based on Netdis measure for graphlets of size 4 +title = paste("Netdis: graphlet size = ", 4, sep = "") +plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) + +## ----------------------------------------------------------------------------- +cex <- 1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title <- paste("Netdis: graphlet size = ", 3, sep = "") +heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) + +## ----------------------------------------------------------------------------- +cex <- 1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title <- paste("Netdis: graphlet size = ", 4, sep = "") +heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) + diff --git a/doc/dendrogram_example_net_dis.Rmd b/doc/dendrogram_example_net_dis.Rmd new file mode 100644 index 00000000..e8dce40e --- /dev/null +++ b/doc/dendrogram_example_net_dis.Rmd @@ -0,0 +1,124 @@ +--- +title: "Dendrogram example for Netdis" +author: "Martin O'Reilly" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Dendrogram example for Netdis} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Virus PPI example for Netdis + +### Load graphs +Use `read_simple_graphs` to read graph data from all files in a directory that +match a specific filename pattern in a format suitable for calculating +graphlet-based feature counts using the +[ORCA package](https://CRAN.R-project.org/package=orca). +We use `igraph::read_graph` to read graph data from files, so support +all file formats it supports. See help for `igraph::read_graph` for a list of +supported values for the `format` parameter and the [igraph documentation](http://igraph.org/c/doc/igraph-Foreign.html#igraph_read_graph_edgelist) +for descriptions of each of the supported file formats. + +The ORCA package we use to efficiently calculate graphlet and orbit counts +requires that graphs are _undirected_, _simple_ (i.e. have no self-loops or +multiple edges) and _connected_ (i.e. have no isolated vertices). Therefore, by +default, graphs loaded by `read_simple_graphs` will be coerced to have the above +properties. This can be avoided by setting the relevant `as_undirected`, +`remove_loops`, `remove_multiple` or `remove_isolates` parameters to `FALSE`. +```{r} +library("netdist") +edge_format = "ncol" +# Load reference graph (used for Netdis. Not required for NetEMD) +ref_path = file.path(system.file(file.path("extdata", "random"), + package = "netdist"), + "ER_1250_10_1") +ref_graph <- read_simple_graph(ref_path, format = edge_format) + +# Set source directory and file properties for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), + package = "netdist") +edge_format <- "ncol" +file_pattern <- "*" + +# Load all graphs in the source folder matching the filename pattern +query_graphs <- read_simple_graphs(source_dir, + format = edge_format, + pattern = file_pattern) +print(names(query_graphs)) +``` + +In this example we will use counts of graphlets containing up to 4 nodes and +consider ego-network neighbourhoods of size 2 (i.e. the immediate neighbours of +each node plus their immediate neighbours). +```{r} +# Set the maximum graphlet size to compute counts for +max_graphlet_size <- 4 +neighbourhood_size <- 2 +``` + +## Generate NetDis measures between each pair of query graphs +```{r} + +# Calculate netdis measure for graphlets up to size max_graphlet_size +netdis_result <- netdis_many_to_many(query_graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size) + +# Netdis measure for graphlets of size 3 +res3 <- netdis_result$netdis["netdis3", ] +netdis3_mat <- cross_comp_to_matrix(res3, netdis_result$comp_spec) + +print("Netdis: graphlet size = 3") +print(netdis3_mat) + +# Netdis measure for graphlets of size 4 +res4 <- netdis_result$netdis["netdis4", ] +netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec) + +print("Netdis: graphlet size = 4") +print(netdis4_mat) +``` + +## Generate dendrograms +```{r} +graphdists <- as.dist(netdis4_mat) +par(mfrow = c(1, 2)) +cex <- 1 + +# Dendrogram based on Netdis measure for graphlets of size 3 +title <- paste("Netdis: graphlet size = ", 3, sep = "") +plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) + +# Dendrogram based on Netdis measure for graphlets of size 4 +title = paste("Netdis: graphlet size = ", 4, sep = "") +plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"), + use.edge.length = FALSE, + edge.width = cex*2, + main = title, + cex.lab = cex, cex.axis = cex, + cex.main = cex, cex.sub = cex, + cex = cex) +``` +```{r} +cex <- 1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title <- paste("Netdis: graphlet size = ", 3, sep = "") +heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) +``` +```{r} +cex <- 1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title <- paste("Netdis: graphlet size = ", 4, sep = "") +heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title, + cexRow = cex, cexCol = cex, symm = TRUE) +``` \ No newline at end of file diff --git a/doc/dendrogram_example_net_dis.html b/doc/dendrogram_example_net_dis.html new file mode 100644 index 00000000..a11367e9 --- /dev/null +++ b/doc/dendrogram_example_net_dis.html @@ -0,0 +1,445 @@ + + + + + + + + + + + + + + + + +Dendrogram example for Netdis + + + + + + + + + + + + + + + + + + + + + + +

Dendrogram example for Netdis

+

Martin O’Reilly

+

2020-06-22

+ + + +
+

Virus PPI example for Netdis

+
+

Load graphs

+

Use read_simple_graphs to read graph data from all files in a directory that match a specific filename pattern in a format suitable for calculating graphlet-based feature counts using the ORCA package. We use igraph::read_graph to read graph data from files, so support all file formats it supports. See help for igraph::read_graph for a list of supported values for the format parameter and the igraph documentation for descriptions of each of the supported file formats.

+

The ORCA package we use to efficiently calculate graphlet and orbit counts requires that graphs are undirected, simple (i.e. have no self-loops or multiple edges) and connected (i.e. have no isolated vertices). Therefore, by default, graphs loaded by read_simple_graphs will be coerced to have the above properties. This can be avoided by setting the relevant as_undirected, remove_loops, remove_multiple or remove_isolates parameters to FALSE.

+
library("netdist")
+edge_format = "ncol"
+# Load reference graph (used for Netdis. Not required for NetEMD)
+ref_path = file.path(system.file(file.path("extdata", "random"),
+                                 package = "netdist"),
+                     "ER_1250_10_1")
+ref_graph <- read_simple_graph(ref_path, format = edge_format)
+
+# Set source directory and file properties for Virus PPI graph edge files
+source_dir <- system.file(file.path("extdata", "VRPINS"),
+                          package = "netdist")
+edge_format <- "ncol"
+file_pattern <- "*"
+
+# Load all graphs in the source folder matching the filename pattern
+query_graphs <- read_simple_graphs(source_dir,
+                                   format = edge_format, 
+                                   pattern = file_pattern)
+print(names(query_graphs))
+
## [1] "EBV"   "ECL"   "HSV-1" "KSHV"  "VZV"
+

In this example we will use counts of graphlets containing up to 4 nodes and consider ego-network neighbourhoods of size 2 (i.e. the immediate neighbours of each node plus their immediate neighbours).

+
# Set the maximum graphlet size to compute counts for
+max_graphlet_size <- 4
+neighbourhood_size <- 2
+
+
+
+

Generate NetDis measures between each pair of query graphs

+
# Calculate netdis measure for graphlets up to size max_graphlet_size
+netdis_result <- netdis_many_to_many(query_graphs,
+                                     ref_graph,
+                                     max_graphlet_size = max_graphlet_size,
+                                     neighbourhood_size = neighbourhood_size)
+
+# Netdis measure for graphlets of size 3
+res3 <- netdis_result$netdis["netdis3", ]
+netdis3_mat <- cross_comp_to_matrix(res3, netdis_result$comp_spec)
+
+print("Netdis: graphlet size = 3")
+
## [1] "Netdis: graphlet size = 3"
+
print(netdis3_mat)
+
##               EBV       ECL        HSV-1         KSHV          VZV
+## EBV   0.000000000 0.1846655 0.0082642217 0.0100538469 0.0067775779
+## ECL   0.184665514 0.0000000 0.2065761911 0.2091240549 0.2075471192
+## HSV-1 0.008264222 0.2065762 0.0000000000 0.0001335756 0.0001748254
+## KSHV  0.010053847 0.2091241 0.0001335756 0.0000000000 0.0005964448
+## VZV   0.006777578 0.2075471 0.0001748254 0.0005964448 0.0000000000
+
# Netdis measure for graphlets of size 4
+res4 <- netdis_result$netdis["netdis4", ]
+netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec)
+
+print("Netdis: graphlet size = 4")
+
## [1] "Netdis: graphlet size = 4"
+
print(netdis4_mat)
+
##              EBV       ECL      HSV-1       KSHV        VZV
+## EBV   0.00000000 0.1749835 0.16526412 0.01969246 0.15971116
+## ECL   0.17498347 0.0000000 0.29176120 0.22155786 0.41716144
+## HSV-1 0.16526412 0.2917612 0.00000000 0.07602426 0.03434187
+## KSHV  0.01969246 0.2215579 0.07602426 0.00000000 0.13115524
+## VZV   0.15971116 0.4171614 0.03434187 0.13115524 0.00000000
+
+
+

Generate dendrograms

+
graphdists <- as.dist(netdis4_mat)
+par(mfrow = c(1, 2))
+cex <- 1
+
+# Dendrogram based on Netdis measure for graphlets of size 3
+title <- paste("Netdis: graphlet size = ", 3, sep = "")
+plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"),
+     use.edge.length = FALSE, 
+     edge.width = cex*2,
+     main = title,
+     cex.lab = cex, cex.axis = cex,
+     cex.main = cex, cex.sub = cex,
+     cex = cex)
+
+# Dendrogram based on Netdis measure for graphlets of size 4
+title = paste("Netdis: graphlet size = ", 4, sep = "")
+plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"),
+     use.edge.length = FALSE, 
+     edge.width = cex*2,
+     main = title,
+     cex.lab = cex, cex.axis = cex,
+     cex.main = cex, cex.sub = cex,
+     cex = cex)
+

+
cex <- 1.5
+col <- colorRampPalette(colors = c("blue","white"))(100)
+title <- paste("Netdis: graphlet size = ", 3, sep = "")
+heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title,
+        cexRow = cex, cexCol = cex, symm = TRUE)
+

+
cex <- 1.5
+col <- colorRampPalette(colors = c("blue","white"))(100)
+title <- paste("Netdis: graphlet size = ", 4, sep = "")
+heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title,
+        cexRow = cex, cexCol = cex, symm = TRUE)
+

+
+ + + + + + + + + + + diff --git a/doc/dendrogram_example_net_emd.R b/doc/dendrogram_example_net_emd.R new file mode 100644 index 00000000..6af69634 --- /dev/null +++ b/doc/dendrogram_example_net_emd.R @@ -0,0 +1,66 @@ +## ---- fig.show='hold'--------------------------------------------------------- +library("netdist") +# Set source directory and file properties for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +edge_format = "ncol" +file_pattern = ".txt" + +# Calculate graphlet-based degree distributions for all orbits in graphlets +# comprising up to 4 nodes for all graphs. This only needs to be done once +# per graph (feature_type = "orbit", max_graphlet_size = 4).. +# If feature_type is set to "feature_type", orbit counts for orbits in the +# same graphlet will be summed to generate graphlet counts +# If max_graphlet_size is set to 5, graphlet-based degree distributions will +# be calculated for graphlets comprising up to 5 nodes. +virus_gdds <- gdd_for_all_graphs( + source_dir = source_dir, format = edge_format, pattern = file_pattern, + feature_type = "orbit", max_graphlet_size = 4) +names(virus_gdds) + +# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- +# based degree distributions using the default fast "optimise" method and no +# smoothing (default). The "optimise" method uses the built-in R optimise +# function to efficiently find the offset with the minimum EMD, but is not +# guaranteed to find the global minimum if EMD as a function of offset +# is non-convex and/or multimodal. The smoothing window width determines +# whether to calculate the NetEMD from the unaltered discrete GDD histograms +# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" +# smoothing by "smearing" the discrete GDD histogram point masses across bins +# of unit width (smoothing_window_width = 1). Returns a named list containing: +# (i) the NetEMDs and (ii) a table containing the graph names and indices +# within the input GDD list for each pair of graphs compared. +res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) + +# You can also specify method = "fixed_step" to use the much slower method of +# exhaustively evaluating the EMD at all offsets separated by a fixed step. +# The default step size is 1/2 the the minimum spacing between locations in +# either histogram after normalising to unit variance. However, you can +# specifiy your own fixed step using the optional "step_size" parameter. +# Note that this step size is applied to the histograms after they have been +# normalised to unit variance + +# Convert to matrix for input to dendrogram method +netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec) +netemd_mat + +## ----------------------------------------------------------------------------- +cex=1 +title = paste("NetEMD: max graphlet size = ", 4, sep = "") +plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, + edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, + cex.sub=cex, cex=cex) + +# The gdd_for_all_graphs and net_emds_for_all_graphs functions will run in +# parallel using multiple threads where supported. The number of threads +# used is determined by the global R option "mc.cores". You can inspect the +# current value of this using options("mc.cores") and set it with +# options("mc.cores" = ). To fully utilise a modern consumer +# processor, this should be set to 2x the number of available processor +# cores as each core supports two threads. + +## ----------------------------------------------------------------------------- +cex=1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title = paste("NetEMD: max graphlet size = ", 4, sep = "") +heatmap(netemd_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) + diff --git a/doc/dendrogram_example_net_emd.Rmd b/doc/dendrogram_example_net_emd.Rmd new file mode 100644 index 00000000..72fd2750 --- /dev/null +++ b/doc/dendrogram_example_net_emd.Rmd @@ -0,0 +1,80 @@ +--- +title: "Dendrogram example for NetEMD" +author: "Martin O'Reilly" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Dendrogram example for NetEMD} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- +## Virus PPI example for NetEMD +```{r, fig.show='hold'} +library("netdist") +# Set source directory and file properties for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +edge_format = "ncol" +file_pattern = ".txt" + +# Calculate graphlet-based degree distributions for all orbits in graphlets +# comprising up to 4 nodes for all graphs. This only needs to be done once +# per graph (feature_type = "orbit", max_graphlet_size = 4).. +# If feature_type is set to "feature_type", orbit counts for orbits in the +# same graphlet will be summed to generate graphlet counts +# If max_graphlet_size is set to 5, graphlet-based degree distributions will +# be calculated for graphlets comprising up to 5 nodes. +virus_gdds <- gdd_for_all_graphs( + source_dir = source_dir, format = edge_format, pattern = file_pattern, + feature_type = "orbit", max_graphlet_size = 4) +names(virus_gdds) + +# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- +# based degree distributions using the default fast "optimise" method and no +# smoothing (default). The "optimise" method uses the built-in R optimise +# function to efficiently find the offset with the minimum EMD, but is not +# guaranteed to find the global minimum if EMD as a function of offset +# is non-convex and/or multimodal. The smoothing window width determines +# whether to calculate the NetEMD from the unaltered discrete GDD histograms +# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" +# smoothing by "smearing" the discrete GDD histogram point masses across bins +# of unit width (smoothing_window_width = 1). Returns a named list containing: +# (i) the NetEMDs and (ii) a table containing the graph names and indices +# within the input GDD list for each pair of graphs compared. +res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) + +# You can also specify method = "fixed_step" to use the much slower method of +# exhaustively evaluating the EMD at all offsets separated by a fixed step. +# The default step size is 1/2 the the minimum spacing between locations in +# either histogram after normalising to unit variance. However, you can +# specifiy your own fixed step using the optional "step_size" parameter. +# Note that this step size is applied to the histograms after they have been +# normalised to unit variance + +# Convert to matrix for input to dendrogram method +netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec) +netemd_mat +``` + +```{r} +cex=1 +title = paste("NetEMD: max graphlet size = ", 4, sep = "") +plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, + edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, + cex.sub=cex, cex=cex) + +# The gdd_for_all_graphs and net_emds_for_all_graphs functions will run in +# parallel using multiple threads where supported. The number of threads +# used is determined by the global R option "mc.cores". You can inspect the +# current value of this using options("mc.cores") and set it with +# options("mc.cores" = ). To fully utilise a modern consumer +# processor, this should be set to 2x the number of available processor +# cores as each core supports two threads. +``` + +```{r} +cex=1.5 +col <- colorRampPalette(colors = c("blue","white"))(100) +title = paste("NetEMD: max graphlet size = ", 4, sep = "") +heatmap(netemd_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) +``` + diff --git a/doc/dendrogram_example_net_emd.html b/doc/dendrogram_example_net_emd.html new file mode 100644 index 00000000..4a4d357e --- /dev/null +++ b/doc/dendrogram_example_net_emd.html @@ -0,0 +1,409 @@ + + + + + + + + + + + + + + + + +Dendrogram example for NetEMD + + + + + + + + + + + + + + + + + + + + + + +

Dendrogram example for NetEMD

+

Martin O’Reilly

+

2020-06-22

+ + + +
+

Virus PPI example for NetEMD

+
library("netdist")
+# Set source directory and file properties for Virus PPI graph edge files
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+edge_format = "ncol"
+file_pattern = ".txt"
+
+# Calculate graphlet-based degree distributions for all orbits in graphlets 
+# comprising up to 4 nodes for all graphs. This only needs to be done once 
+# per graph (feature_type = "orbit", max_graphlet_size = 4).. 
+# If feature_type is set to "feature_type", orbit counts for orbits in the
+# same graphlet will be summed to generate graphlet counts
+# If max_graphlet_size is set to 5, graphlet-based degree distributions will  
+# be calculated for graphlets comprising up to 5 nodes.
+virus_gdds <- gdd_for_all_graphs(
+  source_dir = source_dir, format = edge_format, pattern = file_pattern, 
+  feature_type = "orbit", max_graphlet_size = 4)
+names(virus_gdds)
+
## [1] "EBV"   "ECL"   "HSV-1" "KSHV"  "VZV"
+
# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- 
+# based degree distributions using the default fast "optimise" method and no
+# smoothing (default). The "optimise" method uses the built-in R optimise
+# function to efficiently find the offset with the minimum EMD, but is not
+# guaranteed to find the global minimum if EMD as a function of offset
+# is non-convex and/or multimodal. The smoothing window width determines 
+# whether to calculate the NetEMD from the unaltered discrete GDD histograms
+# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" 
+# smoothing by "smearing" the discrete GDD histogram point masses across bins 
+# of unit width (smoothing_window_width = 1). Returns a named list containing:
+# (i) the NetEMDs and (ii) a table containing the graph names and indices 
+# within the input GDD list for each pair of graphs compared.
+res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0)
+
+# You can also specify method = "fixed_step" to use the much slower method of 
+# exhaustively evaluating the EMD at all offsets separated by a fixed step. 
+# The default step size is 1/2 the the minimum spacing between locations in 
+# either histogram after normalising to unit variance. However, you can 
+# specifiy your own fixed step using the optional "step_size" parameter.
+# Note that this step size is applied to the histograms after they have been 
+# normalised to unit variance
+
+# Convert to matrix for input to dendrogram method
+netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec)
+netemd_mat
+
##             EBV       ECL     HSV-1      KSHV       VZV
+## EBV   0.0000000 0.4876039 0.1662892 0.1607293 0.1994605
+## ECL   0.4876039 0.0000000 0.3986281 0.4024176 0.4029344
+## HSV-1 0.1662892 0.3986281 0.0000000 0.1581520 0.2164003
+## KSHV  0.1607293 0.4024176 0.1581520 0.0000000 0.2323936
+## VZV   0.1994605 0.4029344 0.2164003 0.2323936 0.0000000
+
cex=1
+title = paste("NetEMD: max graphlet size = ", 4, sep = "")
+plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, 
+     edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, 
+     cex.sub=cex, cex=cex)
+

+
# The gdd_for_all_graphs and net_emds_for_all_graphs functions will run in 
+# parallel using multiple threads where supported. The number of threads
+# used is determined by the global R option "mc.cores". You can inspect the 
+# current value of this using options("mc.cores") and set it with 
+# options("mc.cores" = <num_cores>). To fully utilise a modern consumer
+# processor, this should be set to 2x the number of available processor 
+# cores as each core supports two threads.
+
cex=1.5
+col <- colorRampPalette(colors = c("blue","white"))(100)
+title = paste("NetEMD: max graphlet size = ", 4, sep = "")
+heatmap(netemd_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE)
+

+
+ + + + + + + + + + + diff --git a/doc/netdis_2graphs_polya-aeppli.R b/doc/netdis_2graphs_polya-aeppli.R new file mode 100644 index 00000000..4d73ce38 --- /dev/null +++ b/doc/netdis_2graphs_polya-aeppli.R @@ -0,0 +1,180 @@ +## ----------------------------------------------------------------------------- +# Load libraries +library("netdist") +library("purrr") + +## ----------------------------------------------------------------------------- +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + + +## ----------------------------------------------------------------------------- +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +## ----------------------------------------------------------------------------- +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + + +## ----------------------------------------------------------------------------- +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) + + +## ----------------------------------------------------------------------------- + +# Get ego-network densities +densities_1 <- ego_network_density(graphlet_counts_1) +densities_2 <- ego_network_density(graphlet_counts_2) + +# Adaptively bin ego-network densities +binned_densities_1 <- binned_densities_adaptive(densities_1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ego_density_bins_1 <- binned_densities_1$breaks + +binned_densities_2 <- binned_densities_adaptive(densities_2, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ego_density_bins_2 <- binned_densities_2$breaks + +## ----------------------------------------------------------------------------- + +#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY +#' Calculate expected counts with geometric poisson (Polya-Aeppli) +#' approximation for a single density bin. +#' @param bin_idx Density bin index to calculate expected counts for. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +exp_counts_bin_gp <- function(bin_idx, graphlet_counts, + density_interval_indexes, + mean_binned_graphlet_counts, + max_graphlet_size) { + counts <- graphlet_counts[density_interval_indexes == bin_idx, ] + means <- mean_binned_graphlet_counts[bin_idx, ] + + mean_sub_counts <- sweep(counts, 2, means) + + Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) + theta_d <- 2 * means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) + + lambda_dk <- mean(2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]), + na.rm = TRUE) + + exp_counts_dk <- append(exp_counts_dk, + lambda_dk / theta_d[graphlet_idx]) + } + + exp_counts_dk +} + +#' Calculate expected counts in density bins using the +#' geometric poisson (Polya-Aeppli) approximation. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +#' @param max_graphlet_size Determines the maximum size of graphlets +#' included in graphlet_counts. +#' @export +density_binned_counts_gp <- function(graphlet_counts, + density_interval_indexes, + max_graphlet_size) { + + mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + graphlet_counts, + density_interval_indexes) + + nbins <- length(unique(density_interval_indexes)) + expected_counts_bin <- t(sapply(1:nbins, + exp_counts_bin_gp, + graphlet_counts = graphlet_counts, + density_interval_indexes = density_interval_indexes, + mean_binned_graphlet_counts = mean_binned_graphlet_counts, + max_graphlet_size = max_graphlet_size)) + + # deal with NAs caused by bins with zero counts for a graphlet + expected_counts_bin[is.nan(expected_counts_bin)] <- 0 + + expected_counts_bin +} + +binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1, + binned_densities_1$interval_indexes, + max_graphlet_size) + +binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, + binned_densities_2$interval_indexes, + max_graphlet_size) + +## ----------------------------------------------------------------------------- +# Calculate expected graphlet counts for each ego network +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, + ego_density_bins_1, + binned_graphlet_counts_1, + max_graphlet_size, + scale_fn = NULL) + + +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, + ego_density_bins_2, + binned_graphlet_counts_2, + max_graphlet_size, + scale_fn = NULL) +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) + +centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) + +## ----------------------------------------------------------------------------- +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) + +## ----------------------------------------------------------------------------- + +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) + diff --git a/doc/netdis_2graphs_polya-aeppli.Rmd b/doc/netdis_2graphs_polya-aeppli.Rmd new file mode 100644 index 00000000..ac4c1191 --- /dev/null +++ b/doc/netdis_2graphs_polya-aeppli.Rmd @@ -0,0 +1,211 @@ +--- +title: "Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation" +author: "Martin O'Reilly, Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis - 2 graphs with GP Approximation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Load graphs +```{r} +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 +``` + +## Generate ego networks +```{r} +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +``` + +## Count graphlets in ego networks +```{r} +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) + +``` + +## Bin ego networks by density +```{r} + +# Get ego-network densities +densities_1 <- ego_network_density(graphlet_counts_1) +densities_2 <- ego_network_density(graphlet_counts_2) + +# Adaptively bin ego-network densities +binned_densities_1 <- binned_densities_adaptive(densities_1, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ego_density_bins_1 <- binned_densities_1$breaks + +binned_densities_2 <- binned_densities_adaptive(densities_2, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ego_density_bins_2 <- binned_densities_2$breaks +``` + +## Calculate expected graphlet counts in each bin using geometric poisson approximation +```{r} + +#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY +#' Calculate expected counts with geometric poisson (Polya-Aeppli) +#' approximation for a single density bin. +#' @param bin_idx Density bin index to calculate expected counts for. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +exp_counts_bin_gp <- function(bin_idx, graphlet_counts, + density_interval_indexes, + mean_binned_graphlet_counts, + max_graphlet_size) { + counts <- graphlet_counts[density_interval_indexes == bin_idx, ] + means <- mean_binned_graphlet_counts[bin_idx, ] + + mean_sub_counts <- sweep(counts, 2, means) + + Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) + theta_d <- 2 * means / (Vd_sq + means) + + exp_counts_dk <- vector() + for (k in 2:max_graphlet_size) { + graphlet_idx <- graphlet_ids_for_size(k) + + lambda_dk <- mean(2 * means[graphlet_idx]^2 / + (Vd_sq[graphlet_idx] + means[graphlet_idx]), + na.rm = TRUE) + + exp_counts_dk <- append(exp_counts_dk, + lambda_dk / theta_d[graphlet_idx]) + } + + exp_counts_dk +} + +#' Calculate expected counts in density bins using the +#' geometric poisson (Polya-Aeppli) approximation. +#' @param graphlet_counts Graphlet counts for a number of ego_networks. +#' @param density_interval_indexes Density bin index for +#' each ego network. +#' @param max_graphlet_size Determines the maximum size of graphlets +#' included in graphlet_counts. +#' @export +density_binned_counts_gp <- function(graphlet_counts, + density_interval_indexes, + max_graphlet_size) { + + mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + graphlet_counts, + density_interval_indexes) + + nbins <- length(unique(density_interval_indexes)) + expected_counts_bin <- t(sapply(1:nbins, + exp_counts_bin_gp, + graphlet_counts = graphlet_counts, + density_interval_indexes = density_interval_indexes, + mean_binned_graphlet_counts = mean_binned_graphlet_counts, + max_graphlet_size = max_graphlet_size)) + + # deal with NAs caused by bins with zero counts for a graphlet + expected_counts_bin[is.nan(expected_counts_bin)] <- 0 + + expected_counts_bin +} + +binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1, + binned_densities_1$interval_indexes, + max_graphlet_size) + +binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, + binned_densities_2$interval_indexes, + max_graphlet_size) +``` + +## Centre graphlet counts of query graphs using binned expected counts +```{r} +# Calculate expected graphlet counts for each ego network +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, + ego_density_bins_1, + binned_graphlet_counts_1, + max_graphlet_size, + scale_fn = NULL) + + +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, + ego_density_bins_2, + binned_graphlet_counts_2, + max_graphlet_size, + scale_fn = NULL) +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) + +centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) +``` + + +## Sum centred graphlet counts across all ego networks +```{r} +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) +``` + +## Calculate netdis statistics +```{r} + +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) +``` \ No newline at end of file diff --git a/doc/netdis_2graphs_polya-aeppli.html b/doc/netdis_2graphs_polya-aeppli.html new file mode 100644 index 00000000..2c496350 --- /dev/null +++ b/doc/netdis_2graphs_polya-aeppli.html @@ -0,0 +1,530 @@ + + + + + + + + + + + + + + + + +Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation + + + + + + + + + + + + + + + + + + + + + + +

Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation

+

Martin O’Reilly, Jack Roberts

+

2020-06-22

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
## 
+## Attaching package: 'purrr'
+
## The following objects are masked from 'package:igraph':
+## 
+##     compose, simplify
+
+
+

Load graphs

+
# Set source directory for Virus PPI graph edge files
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+# Load query graphs
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Ego network density binning parameters
+min_bin_count <- 5
+num_bins <- 100
+
+
+

Generate ego networks

+
# Get ego networks for query graphs and reference graph
+ego_1 <- make_named_ego_graph(graph_1, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+ego_2 <- make_named_ego_graph(graph_2, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+
+

Count graphlets in ego networks

+
# Count graphlets for ego networks in query and reference graphs
+graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size)
+graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size)
+
+
+

Bin ego networks by density

+
# Get ego-network densities
+densities_1 <- ego_network_density(graphlet_counts_1)
+densities_2 <- ego_network_density(graphlet_counts_2)
+
+# Adaptively bin ego-network densities
+binned_densities_1 <- binned_densities_adaptive(densities_1, 
+                                                min_counts_per_interval = min_bin_count, 
+                                                num_intervals = num_bins)
+
+ego_density_bins_1 <- binned_densities_1$breaks
+
+binned_densities_2 <- binned_densities_adaptive(densities_2, 
+                                                min_counts_per_interval = min_bin_count, 
+                                                num_intervals = num_bins)
+
+ego_density_bins_2 <- binned_densities_2$breaks
+
+
+

Calculate expected graphlet counts in each bin using geometric poisson approximation

+
#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY
+#' Calculate expected counts with geometric poisson (Polya-Aeppli)
+#' approximation for a single density bin.
+#' @param bin_idx Density bin index to calculate expected counts for.
+#' @param graphlet_counts Graphlet counts for a number of ego_networks.
+#' @param density_interval_indexes Density bin index for
+#' each ego network.
+exp_counts_bin_gp <- function(bin_idx, graphlet_counts,
+                              density_interval_indexes,
+                              mean_binned_graphlet_counts,
+                              max_graphlet_size) {
+  counts <- graphlet_counts[density_interval_indexes == bin_idx, ]
+  means <- mean_binned_graphlet_counts[bin_idx, ]
+  
+  mean_sub_counts <- sweep(counts, 2, means)
+  
+  Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1)
+  theta_d <- 2 * means / (Vd_sq + means)
+  
+  exp_counts_dk <- vector()
+  for (k in 2:max_graphlet_size) {
+    graphlet_idx <- graphlet_ids_for_size(k)
+    
+    lambda_dk <- mean(2 * means[graphlet_idx]^2 /
+                        (Vd_sq[graphlet_idx] + means[graphlet_idx]),
+                      na.rm = TRUE)
+    
+    exp_counts_dk <- append(exp_counts_dk,
+                            lambda_dk / theta_d[graphlet_idx])
+  }
+  
+  exp_counts_dk
+}
+
+#' Calculate expected counts in density bins using the
+#' geometric poisson (Polya-Aeppli) approximation.
+#' @param graphlet_counts Graphlet counts for a number of ego_networks.
+#' @param density_interval_indexes Density bin index for
+#' each ego network.
+#' @param max_graphlet_size Determines the maximum size of graphlets
+#' included in graphlet_counts.
+#' @export
+density_binned_counts_gp <- function(graphlet_counts,
+                                     density_interval_indexes,
+                                     max_graphlet_size) {
+
+  mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
+    graphlet_counts,
+    density_interval_indexes)
+
+  nbins <- length(unique(density_interval_indexes))
+  expected_counts_bin <- t(sapply(1:nbins,
+                                  exp_counts_bin_gp,
+                                  graphlet_counts = graphlet_counts,
+                                  density_interval_indexes = density_interval_indexes,
+                                  mean_binned_graphlet_counts = mean_binned_graphlet_counts,
+                                  max_graphlet_size = max_graphlet_size))
+
+  # deal with NAs caused by bins with zero counts for a graphlet
+  expected_counts_bin[is.nan(expected_counts_bin)] <- 0
+
+  expected_counts_bin
+}
+
+binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1,
+                                                     binned_densities_1$interval_indexes,
+                                                     max_graphlet_size)
+
+binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2,
+                                                     binned_densities_2$interval_indexes,
+                                                     max_graphlet_size)
+
+
+

Centre graphlet counts of query graphs using binned expected counts

+
# Calculate expected graphlet counts for each ego network
+exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, 
+                                                                 ego_density_bins_1, 
+                                                                 binned_graphlet_counts_1,
+                                                                 max_graphlet_size,
+                                                                 scale_fn = NULL)
+
+
+exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, 
+                                                                 ego_density_bins_2, 
+                                                                 binned_graphlet_counts_2,
+                                                                 max_graphlet_size,
+                                                                 scale_fn = NULL)
+# Centre graphlet counts by subtracting expected counts
+centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1,
+                                                        exp_graphlet_counts_1,
+                                                        max_graphlet_size)
+
+centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2,
+                                                        exp_graphlet_counts_2,
+                                                        max_graphlet_size)
+
+
+

Sum centred graphlet counts across all ego networks

+
sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1)
+
+sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2)
+
+
+

Calculate netdis statistics

+
netdis_result <- netdis_uptok(sum_graphlet_counts_1, 
+                              sum_graphlet_counts_2, 
+                              max_graphlet_size)
+
+print(netdis_result)
+
##   netdis3   netdis4 
+## 0.8822527 0.1892716
+
+ + + + + + + + + + + diff --git a/doc/netdis_customisations.R b/doc/netdis_customisations.R new file mode 100644 index 00000000..2c5c0071 --- /dev/null +++ b/doc/netdis_customisations.R @@ -0,0 +1,98 @@ +## ----------------------------------------------------------------------------- +# Load libraries +library("netdist") +library("purrr") + +## ----------------------------------------------------------------------------- +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + + +## ----------------------------------------------------------------------------- +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + +## ----------------------------------------------------------------------------- + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +print(results$netdis) +print(results$comp_spec) + +## ----------------------------------------------------------------------------- + +binning_fn <- purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 10, + num_intervals = 50) + + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn) + +print(results$netdis) +print(results$comp_spec) + + + +## ----------------------------------------------------------------------------- +bin_counts_fn <- density_binned_counts_gp + +exp_counts_fn <- purrr::partial(netdis_expected_counts, + scale_fn = NULL) + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) + +## ----------------------------------------------------------------------------- +binning_fn <- single_density_bin +bin_counts_fn <- density_binned_counts +exp_counts_fn <- netdis_expected_counts + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) + diff --git a/doc/netdis_customisations.Rmd b/doc/netdis_customisations.Rmd new file mode 100644 index 00000000..0c1c724a --- /dev/null +++ b/doc/netdis_customisations.Rmd @@ -0,0 +1,122 @@ +--- +title: "Usage of netdis with binning and expected counts customisations." +author: "Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis function customisations} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +``` + +## Load query graphs +```{r} +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +``` + +## Default Expected Counts with Reference Graph +```{r} + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +print(results$netdis) +print(results$comp_spec) +``` + +## With Modified Binning Parameters +```{r} + +binning_fn <- purrr::partial(binned_densities_adaptive, + min_counts_per_interval = 10, + num_intervals = 50) + + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn) + +print(results$netdis) +print(results$comp_spec) + + +``` + +## With Modified Expected Counts: Geometric Poisson +```{r} +bin_counts_fn <- density_binned_counts_gp + +exp_counts_fn <- purrr::partial(netdis_expected_counts, + scale_fn = NULL) + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) +``` + +## With Modified Expected Counts: Simple Mean +```{r} +binning_fn <- single_density_bin +bin_counts_fn <- density_binned_counts +exp_counts_fn <- netdis_expected_counts + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph = NULL, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + binning_fn = binning_fn, + bin_counts_fn = bin_counts_fn, + exp_counts_fn = exp_counts_fn) + +print(results$netdis) +print(results$comp_spec) +``` \ No newline at end of file diff --git a/doc/netdis_customisations.html b/doc/netdis_customisations.html new file mode 100644 index 00000000..9607767a --- /dev/null +++ b/doc/netdis_customisations.html @@ -0,0 +1,507 @@ + + + + + + + + + + + + + + + + +Usage of netdis with binning and expected counts customisations. + + + + + + + + + + + + + + + + + + + + + + +

Usage of netdis with binning and expected counts customisations.

+

Jack Roberts

+

2020-06-22

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Reference graph
+ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
+                        package = "netdist")
+ref_graph <- read_simple_graph(ref_path, format = "ncol")
+
+
+

Load query graphs

+
source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
+
+
+

Default Expected Counts with Reference Graph

+
# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges)
+
+print(results$netdis)
+
##              [,1]        [,2]       [,3]        [,4]      [,5]      [,6]
+## netdis3 0.1846655 0.008264222 0.01005385 0.006777578 0.2065762 0.2091241
+## netdis4 0.1749835 0.165264120 0.01969246 0.159711160 0.2917612 0.2215579
+##              [,7]         [,8]         [,9]        [,10]
+## netdis3 0.2075471 0.0001335756 0.0001748254 0.0005964448
+## netdis4 0.4171614 0.0760242643 0.0343418653 0.1311552411
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+
+

With Modified Binning Parameters

+
binning_fn <- purrr::partial(binned_densities_adaptive,
+                             min_counts_per_interval = 10,
+                             num_intervals = 50)
+
+
+# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges,
+                               binning_fn = binning_fn)
+
+print(results$netdis)
+
##               [,1]        [,2]        [,3]        [,4]      [,5]      [,6]
+## netdis3 0.08499773 0.005900766 0.009547675 0.007177066 0.1078916 0.1144589
+## netdis4 0.20037679 0.045244760 0.018904439 0.112043371 0.3361503 0.2631420
+##              [,7]         [,8]         [,9]        [,10]
+## netdis3 0.1101426 0.0006494388 2.478794e-05 0.0004097632
+## netdis4 0.4818139 0.0274434372 3.227187e-02 0.0928126401
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+
+

With Modified Expected Counts: Geometric Poisson

+
bin_counts_fn <- density_binned_counts_gp
+
+exp_counts_fn <- purrr::partial(netdis_expected_counts,
+                                scale_fn = NULL)
+
+# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph = NULL,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges,
+                               bin_counts_fn = bin_counts_fn,
+                               exp_counts_fn = exp_counts_fn)
+
+print(results$netdis)
+
##              [,1]      [,2]      [,3]       [,4]       [,5]       [,6]
+## netdis3 0.8822527 0.9101084 0.8838054 0.96266771 0.04173551 0.03585169
+## netdis4 0.1892716 0.5735233 0.3719671 0.04604718 0.60270399 0.20370737
+##               [,7]         [,8]        [,9]       [,10]
+## netdis3 0.06271238 0.0004211575 0.005364888 0.009114229
+## netdis4 0.12978637 0.7173089685 0.487688692 0.371848474
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+
+

With Modified Expected Counts: Simple Mean

+
binning_fn <- single_density_bin
+bin_counts_fn <- density_binned_counts
+exp_counts_fn <- netdis_expected_counts
+
+# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph = NULL,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges,
+                               binning_fn = binning_fn,
+                               bin_counts_fn = bin_counts_fn,
+                               exp_counts_fn = exp_counts_fn)
+
+print(results$netdis)
+
##              [,1]      [,2]      [,3]       [,4]      [,5]      [,6]      [,7]
+## netdis3 0.3116860 0.8254261 0.8768637 0.04053921 0.8531485 0.8226894 0.2353732
+## netdis4 0.9592365 0.2009423 0.7974697 0.21688688 0.7734930 0.2144558 0.8030030
+##               [,8]      [,9]     [,10]
+## netdis3 0.01970843 0.8288649 0.9167543
+## netdis4 0.39992007 0.3300305 0.6301018
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+ + + + + + + + + + + diff --git a/doc/netdis_pairwise_comparisons.R b/doc/netdis_pairwise_comparisons.R new file mode 100644 index 00000000..8261d7a4 --- /dev/null +++ b/doc/netdis_pairwise_comparisons.R @@ -0,0 +1,74 @@ +## ----------------------------------------------------------------------------- +# Load libraries +library("netdist") +library("purrr") + +## ----------------------------------------------------------------------------- +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + + +## ----------------------------------------------------------------------------- +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Calculate netdis statistics +netdis_one_to_one(graph_1, graph_2, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +## ----------------------------------------------------------------------------- +# Load query graphs +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +graph_1 <- graphs$EBV +graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] + +# Calculate netdis statistics +netdis_one_to_many(graph_1, graphs_compare, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +## ----------------------------------------------------------------------------- +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +print(results$netdis) +print(results$comp_spec) + diff --git a/doc/netdis_pairwise_comparisons.Rmd b/doc/netdis_pairwise_comparisons.Rmd new file mode 100644 index 00000000..d5809c4d --- /dev/null +++ b/doc/netdis_pairwise_comparisons.Rmd @@ -0,0 +1,94 @@ +--- +title: "Usage of netdis interfaces for different pairwise comparison options." +author: "Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Netdis pairwise comparisons} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +# Reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +``` + +## Compare two graphs +```{r} +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +# Calculate netdis statistics +netdis_one_to_one(graph_1, graph_2, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) +``` + +## Compare one graph to many other graphs +```{r} +# Load query graphs +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") +graph_1 <- graphs$EBV +graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] + +# Calculate netdis statistics +netdis_one_to_many(graph_1, graphs_compare, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) +``` + +## Do pairwise netdis calculations for many graphs +```{r} +# Load query graphs +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") +graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") + +# Calculate netdis statistics +results <- netdis_many_to_many(graphs, + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +print(results$netdis) +print(results$comp_spec) +``` \ No newline at end of file diff --git a/doc/netdis_pairwise_comparisons.html b/doc/netdis_pairwise_comparisons.html new file mode 100644 index 00000000..ee4d70e1 --- /dev/null +++ b/doc/netdis_pairwise_comparisons.html @@ -0,0 +1,439 @@ + + + + + + + + + + + + + + + + +Usage of netdis interfaces for different pairwise comparison options. + + + + + + + + + + + + + + + + + + + + + + +

Usage of netdis interfaces for different pairwise comparison options.

+

Jack Roberts

+

2020-06-22

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Ego network density binning parameters
+min_bin_count <- 5
+num_bins <- 100
+
+# Reference graph
+ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
+                        package = "netdist")
+ref_graph <- read_simple_graph(ref_path, format = "ncol")
+
+
+

Compare two graphs

+
# Load query graphs
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+# Calculate netdis statistics
+netdis_one_to_one(graph_1, graph_2,
+                  ref_graph,
+                  max_graphlet_size = max_graphlet_size,
+                  neighbourhood_size = neighbourhood_size,
+                  min_ego_nodes = min_ego_nodes,
+                  min_ego_edges = min_ego_edges)
+
##   netdis3   netdis4 
+## 0.1846655 0.1749835
+
+
+

Compare one graph to many other graphs

+
# Load query graphs
+graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
+graph_1 <- graphs$EBV
+graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")]
+
+# Calculate netdis statistics
+netdis_one_to_many(graph_1, graphs_compare,
+                   ref_graph,
+                   max_graphlet_size = max_graphlet_size,
+                   neighbourhood_size = neighbourhood_size,
+                   min_ego_nodes = min_ego_nodes,
+                   min_ego_edges = min_ego_edges)
+
##               ECL       HSV-1       KSHV         VZV
+## netdis3 0.1846655 0.008264222 0.01005385 0.006777578
+## netdis4 0.1749835 0.165264120 0.01969246 0.159711160
+
+
+

Do pairwise netdis calculations for many graphs

+
# Load query graphs
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
+
+# Calculate netdis statistics
+results <- netdis_many_to_many(graphs,
+                               ref_graph,
+                               max_graphlet_size = max_graphlet_size,
+                               neighbourhood_size = neighbourhood_size,
+                               min_ego_nodes = min_ego_nodes,
+                               min_ego_edges = min_ego_edges)
+
+print(results$netdis)
+
##              [,1]        [,2]       [,3]        [,4]      [,5]      [,6]
+## netdis3 0.1846655 0.008264222 0.01005385 0.006777578 0.2065762 0.2091241
+## netdis4 0.1749835 0.165264120 0.01969246 0.159711160 0.2917612 0.2215579
+##              [,7]         [,8]         [,9]        [,10]
+## netdis3 0.2075471 0.0001335756 0.0001748254 0.0005964448
+## netdis4 0.4171614 0.0760242643 0.0343418653 0.1311552411
+
print(results$comp_spec)
+
##    name_a name_b index_a index_b
+## 1     EBV    ECL       1       2
+## 2     EBV  HSV-1       1       3
+## 3     EBV   KSHV       1       4
+## 4     EBV    VZV       1       5
+## 5     ECL  HSV-1       2       3
+## 6     ECL   KSHV       2       4
+## 7     ECL    VZV       2       5
+## 8   HSV-1   KSHV       3       4
+## 9   HSV-1    VZV       3       5
+## 10   KSHV    VZV       4       5
+
+ + + + + + + + + + + diff --git a/doc/quickstart_netdis_2graphs.R b/doc/quickstart_netdis_2graphs.R new file mode 100644 index 00000000..a8189f71 --- /dev/null +++ b/doc/quickstart_netdis_2graphs.R @@ -0,0 +1,121 @@ +## ----------------------------------------------------------------------------- +# Load libraries +library("netdist") +library("purrr") + +## ----------------------------------------------------------------------------- +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + + +## ----------------------------------------------------------------------------- +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 + +## ----------------------------------------------------------------------------- +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +## ----------------------------------------------------------------------------- +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) + +## ----------------------------------------------------------------------------- +# Load reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) + +# Scale ego-network graphlet counts by dividing by total number of k-tuples in +# ego-network (where k is graphlet size) +scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, + max_graphlet_size) + + +# Get ego-network densities +densities_ref <- ego_network_density(graphlet_counts_ref) + +# Adaptively bin ref ego-network densities +binned_densities <- binned_densities_adaptive(densities_ref, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ref_ego_density_bins <- binned_densities$breaks + +# Average ref graphlet counts across density bins +ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + scaled_graphlet_counts_ref, + binned_densities$interval_indexes) + + +## ----------------------------------------------------------------------------- +# Calculate expected graphlet counts (using ref graph ego network density bins) +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn=count_graphlet_tuples) + + +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn=count_graphlet_tuples) + +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) + +centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) + +## ----------------------------------------------------------------------------- +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) + +## ----------------------------------------------------------------------------- + +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) + diff --git a/doc/quickstart_netdis_2graphs.Rmd b/doc/quickstart_netdis_2graphs.Rmd new file mode 100644 index 00000000..51eb0330 --- /dev/null +++ b/doc/quickstart_netdis_2graphs.Rmd @@ -0,0 +1,150 @@ +--- +title: "Quick start guide for Netdis - 2 graphs" +author: "Martin O'Reilly, Jack Roberts" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Quick start for Netdis - 2 graphs} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Load required libraries +```{r} +# Load libraries +library("netdist") +library("purrr") +``` + +## Load graphs +```{r} +# Set source directory for Virus PPI graph edge files +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), + format = "ncol") + +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), + format = "ncol") + +``` + +## Set Netdis parameters +```{r} +# Maximum graphlet size to calculate counts and netdis statistic for. +max_graphlet_size <- 4 + +# Ego network neighbourhood size +neighbourhood_size <- 2 + +# Minimum size of ego networks to consider +min_ego_nodes <- 3 +min_ego_edges <- 1 + +# Ego network density binning parameters +min_bin_count <- 5 +num_bins <- 100 +``` + +## Generate ego networks +```{r} +# Get ego networks for query graphs and reference graph +ego_1 <- make_named_ego_graph(graph_1, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +ego_2 <- make_named_ego_graph(graph_2, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) +``` + +## Count graphlets in ego networks +```{r} +# Count graphlets for ego networks in query and reference graphs +graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) +graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) +``` + +## Use a reference graph to calculate expected graphlet counts in ego network density bins +```{r} +# Load reference graph +ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), + package = "netdist") +ref_graph <- read_simple_graph(ref_path, format = "ncol") + +ego_ref <- make_named_ego_graph(ref_graph, + order = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges) + +graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) + +# Scale ego-network graphlet counts by dividing by total number of k-tuples in +# ego-network (where k is graphlet size) +scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, + max_graphlet_size) + + +# Get ego-network densities +densities_ref <- ego_network_density(graphlet_counts_ref) + +# Adaptively bin ref ego-network densities +binned_densities <- binned_densities_adaptive(densities_ref, + min_counts_per_interval = min_bin_count, + num_intervals = num_bins) + +ref_ego_density_bins <- binned_densities$breaks + +# Average ref graphlet counts across density bins +ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( + scaled_graphlet_counts_ref, + binned_densities$interval_indexes) + +``` + + +## Centre graphlet counts of query graphs based on statistics of reference graph +```{r} +# Calculate expected graphlet counts (using ref graph ego network density bins) +exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn=count_graphlet_tuples) + + +exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, + ref_ego_density_bins, + ref_binned_graphlet_counts, + max_graphlet_size, + scale_fn=count_graphlet_tuples) + +# Centre graphlet counts by subtracting expected counts +centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, + exp_graphlet_counts_1, + max_graphlet_size) + +centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, + exp_graphlet_counts_2, + max_graphlet_size) +``` + +## Sum centred graphlet counts across all ego networks +```{r} +sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) + +sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) +``` + +## Calculate netdis statistics +```{r} + +netdis_result <- netdis_uptok(sum_graphlet_counts_1, + sum_graphlet_counts_2, + max_graphlet_size) + +print(netdis_result) +``` \ No newline at end of file diff --git a/doc/quickstart_netdis_2graphs.html b/doc/quickstart_netdis_2graphs.html new file mode 100644 index 00000000..3863ac42 --- /dev/null +++ b/doc/quickstart_netdis_2graphs.html @@ -0,0 +1,468 @@ + + + + + + + + + + + + + + + + +Quick start guide for Netdis - 2 graphs + + + + + + + + + + + + + + + + + + + + + + +

Quick start guide for Netdis - 2 graphs

+

Martin O’Reilly, Jack Roberts

+

2020-06-22

+ + + +
+

Load required libraries

+
# Load libraries
+library("netdist")
+library("purrr")
+
+
+

Load graphs

+
# Set source directory for Virus PPI graph edge files
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+# Load query graphs
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")
+
+
+

Set Netdis parameters

+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Ego network density binning parameters
+min_bin_count <- 5
+num_bins <- 100
+
+
+

Generate ego networks

+
# Get ego networks for query graphs and reference graph
+ego_1 <- make_named_ego_graph(graph_1, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+ego_2 <- make_named_ego_graph(graph_2, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+
+

Count graphlets in ego networks

+
# Count graphlets for ego networks in query and reference graphs
+graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size)
+graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size)
+
+
+

Use a reference graph to calculate expected graphlet counts in ego network density bins

+
# Load reference graph
+ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
+                        package = "netdist")
+ref_graph <- read_simple_graph(ref_path, format = "ncol")
+
+ego_ref <- make_named_ego_graph(ref_graph, 
+                                order = neighbourhood_size, 
+                                min_ego_nodes = min_ego_nodes, 
+                                min_ego_edges = min_ego_edges)
+
+graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size)
+
+# Scale ego-network graphlet counts by dividing by total number of k-tuples in
+# ego-network (where k is graphlet size)
+scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, 
+                                                        max_graphlet_size)
+
+
+# Get ego-network densities
+densities_ref <- ego_network_density(graphlet_counts_ref)
+
+# Adaptively bin ref ego-network densities
+binned_densities <- binned_densities_adaptive(densities_ref, 
+                                              min_counts_per_interval = min_bin_count, 
+                                              num_intervals = num_bins)
+
+ref_ego_density_bins <- binned_densities$breaks
+
+# Average ref graphlet counts across density bins
+ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
+                                  scaled_graphlet_counts_ref, 
+                                  binned_densities$interval_indexes)
+
+
+

Centre graphlet counts of query graphs based on statistics of reference graph

+
# Calculate expected graphlet counts (using ref graph ego network density bins)
+exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, 
+                                                                 ref_ego_density_bins, 
+                                                                 ref_binned_graphlet_counts,
+                                                                 max_graphlet_size,
+                                                                 scale_fn=count_graphlet_tuples)
+
+
+exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, 
+                                                                 ref_ego_density_bins, 
+                                                                 ref_binned_graphlet_counts,
+                                                                 max_graphlet_size,
+                                                                 scale_fn=count_graphlet_tuples)
+
+# Centre graphlet counts by subtracting expected counts
+centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1,
+                                                        exp_graphlet_counts_1,
+                                                        max_graphlet_size)
+
+centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2,
+                                                        exp_graphlet_counts_2,
+                                                        max_graphlet_size)
+
+
+

Sum centred graphlet counts across all ego networks

+
sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1)
+
+sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2)
+
+
+

Calculate netdis statistics

+
netdis_result <- netdis_uptok(sum_graphlet_counts_1, 
+                              sum_graphlet_counts_2, 
+                              max_graphlet_size)
+
+print(netdis_result)
+
##   netdis3   netdis4 
+## 0.1846655 0.1749835
+
+ + + + + + + + + + + diff --git a/tests/testthat/test_measures_net_emd.R b/tests/testthat/test_measures_net_emd.R index fb17de2b..ddbebea9 100644 --- a/tests/testthat/test_measures_net_emd.R +++ b/tests/testthat/test_measures_net_emd.R @@ -1,6 +1,6 @@ self_net_emd <- function(histogram, shift, method) { - net_emd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), method = method) + netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), method = method) } expected <- 0 @@ -21,9 +21,9 @@ expect_equal(self_net_emd(histogram, shift = 0.01, "exhaustive"), expected) expect_equal(self_net_emd(histogram, shift = 0, "optimise"), expected) expect_equal(self_net_emd(histogram, shift = 0, "exhaustive"), expected) -expect_self_net_emd_correct <- function(histogram, shift, method, +expect_self_netemd_correct <- function(histogram, shift, method, return_details = FALSE) { - self_net_emd <- net_emd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), + self_net_emd <- netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift), method = method, return_details = return_details ) loc <- histogram$locations @@ -39,51 +39,51 @@ expect_self_net_emd_correct <- function(histogram, shift, method, locations <- c(-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5) masses <- c(0, 1, 2, 3, 4, 5, 4, 3, 2, 1, 0) histogram <- dhist(locations = locations, masses = masses) -expect_self_net_emd_correct(histogram, +expect_self_netemd_correct(histogram, shift = 1, "optimise", return_details = TRUE ) -expect_self_net_emd_correct(histogram, +expect_self_netemd_correct(histogram, shift = 1, "exhaustive", return_details = TRUE ) -expect_self_net_emd_correct(histogram, +expect_self_netemd_correct(histogram, shift = 0.5, "optimise", return_details = TRUE ) -expect_self_net_emd_correct(histogram, +expect_self_netemd_correct(histogram, shift = 0.5, "exhaustive", return_details = TRUE ) -expect_self_net_emd_correct(histogram, +expect_self_netemd_correct(histogram, shift = 0.1, "optimise", return_details = TRUE ) -expect_self_net_emd_correct(histogram, +expect_self_netemd_correct(histogram, shift = 0.1, "exhaustive", return_details = TRUE ) -expect_self_net_emd_correct(histogram, +expect_self_netemd_correct(histogram, shift = 0.05, "optimise", return_details = TRUE ) -expect_self_net_emd_correct(histogram, +expect_self_netemd_correct(histogram, shift = 0.05, "exhaustive", return_details = TRUE ) -expect_self_net_emd_correct(histogram, +expect_self_netemd_correct(histogram, shift = 0.01, "optimise", return_details = TRUE ) -expect_self_net_emd_correct(histogram, +expect_self_netemd_correct(histogram, shift = 0.01, "exhaustive", return_details = TRUE ) -expect_self_net_emd_correct(histogram, +expect_self_netemd_correct(histogram, shift = 0, "optimise", return_details = TRUE ) -expect_self_net_emd_correct(histogram, +expect_self_netemd_correct(histogram, shift = 0, "exhaustive", return_details = TRUE ) @@ -107,14 +107,14 @@ test_that("net_emd returns 0 when comparing any normal histogram against itself expected <- 0 actuals_opt <- purrr::map(rand_dhists, function(dhist) { - net_emd_one_to_one(dhists_1 = dhist, dhists_2 = dhist, method = "optimise") + netemd_one_to_one(dhists_1 = dhist, dhists_2 = dhist, method = "optimise") }) purrr::walk(actuals_opt, function(actual) { expect_equal(actual, expected) }) actuals_exhaustive_default <- purrr::map(rand_dhists, function(dhist) { - net_emd_one_to_one(dhists_1 = dhist, dhists_2 = dhist, method = "exhaustive") + netemd_one_to_one(dhists_1 = dhist, dhists_2 = dhist, method = "exhaustive") }) purrr::walk(actuals_exhaustive_default, function(actual) { expect_equal(actual, expected) @@ -143,9 +143,9 @@ test_that("net_emd returns 0 when comparing any normal histogram randomly offset offset_lists <- replicate(num_hists, offsets, simplify = FALSE) - net_emd_offset_self <- function(dhist, offsets, method) { + netemd_offset_self <- function(dhist, offsets, method) { net_emds <- purrr::map_dbl(offsets, function(offset) { - net_emd_one_to_one(dhists_1 = dhist, dhists_2 = shift_dhist(dhist, offset), method = method) + netemd_one_to_one(dhists_1 = dhist, dhists_2 = shift_dhist(dhist, offset), method = method) }) return(net_emds) } @@ -154,7 +154,7 @@ test_that("net_emd returns 0 when comparing any normal histogram randomly offset actuals_list_opt <- purrr::map2( rand_dhists, offset_lists, function(dhist, offsets) { - net_emd_offset_self(dhist, offsets, method = "optimise") + netemd_offset_self(dhist, offsets, method = "optimise") } ) purrr::walk(actuals_list_opt, function(actuals) { @@ -165,7 +165,7 @@ test_that("net_emd returns 0 when comparing any normal histogram randomly offset actuals_list_exhaustive <- purrr::map2( rand_dhists, offset_lists, function(dhist, offsets) { - net_emd_offset_self(dhist, offsets, method = "exhaustive") + netemd_offset_self(dhist, offsets, method = "exhaustive") } ) purrr::walk(actuals_list_exhaustive, function(actuals) { @@ -197,9 +197,9 @@ test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any offset_lists <- replicate(num_hists, offsets, simplify = FALSE) - expect_self_net_emd_correct <- + expect_self_netemd_correct <- function(histogram, shift, method, return_details = FALSE) { - self_net_emd <- net_emd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift),method = method, return_details = return_details + self_net_emd <- netemd_one_to_one(dhists_1 = histogram, dhists_2 = shift_dhist(histogram, shift),method = method, return_details = return_details ) loc <- histogram$locations mass <- histogram$masses @@ -213,7 +213,7 @@ test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any purrr::walk2(rand_dhists, offset_lists, function(dhist, offsets) { purrr::walk(offsets, function(offset) { - expect_self_net_emd_correct(dhist, offset, "optimise", + expect_self_netemd_correct(dhist, offset, "optimise", return_details = TRUE ) }) @@ -221,7 +221,7 @@ test_that("net_emd returns min_emd = 0 and min_offset = 0 when comparing any purrr::walk2(rand_dhists, offset_lists, function(dhist, offsets) { purrr::walk(offsets, function(offset) { - expect_self_net_emd_correct(dhist, offset, "exhaustive", + expect_self_netemd_correct(dhist, offset, "exhaustive", return_details = TRUE ) }) @@ -242,10 +242,10 @@ test_that("net_emd returns analytically derived non-zero solutions for distribut test_pair <- function(p, expected) { dhistA <- two_bin_dhist(p) dhistB <- three_bin_dhist(p) - expect_equal(net_emd_one_to_one(dhists_1 = dhistA, dhists_2 = dhistB, method = "exhaustive"), expected, tolerance = 1e-12) + expect_equal(netemd_one_to_one(dhists_1 = dhistA, dhists_2 = dhistB, method = "exhaustive"), expected, tolerance = 1e-12) # Even setting the stats::optimise method tolerance to machine double precision, the # optimised NetEMD is ~1e-09, so set a slightly looser tolerance here - expect_equal(net_emd_one_to_one(dhists_1 = dhistA, dhists_2 = dhistB, method = "optimise"), expected, tolerance = 1e-08) + expect_equal(netemd_one_to_one(dhists_1 = dhistA, dhists_2 = dhistB, method = "optimise"), expected, tolerance = 1e-08) } # Test for p values with analytically calculated NetEMD @@ -292,7 +292,7 @@ test_that("net_emd return 0 when comparing graphlet orbit degree distributions # Map over virus PPI networks purrr::walk(virus_gdd, function(gdd) { purrr::walk(gdd, function(gdd_Ox) { - expect_equalish(net_emd_one_to_one(dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, + expect_equalish(netemd_one_to_one(dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, method = "optimise", smoothing_window_width = 0 ), 0) @@ -346,7 +346,7 @@ test_that("net_emd return 0 when comparing graphlet orbit degree distributions # Map over random graphs purrr::walk(random_gdd, function(gdd) { purrr::walk(gdd, function(gdd_Ox) { - expect_equalish(net_emd_one_to_one(dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, + expect_equalish(netemd_one_to_one(dhists_1 = gdd_Ox, dhists_2 = gdd_Ox, method = "optimise", smoothing_window_width = 0 ), 0) @@ -400,14 +400,14 @@ test_that("net_emds_for_all_graphs works", { # Use previously tested NetEMD function to generate expected NetEMD scores # individually and combine into expected output for code under test - expected_net_emd_fn <- function(gdds) { + expected_netemd_fn <- function(gdds) { list( net_emds = c( - net_emd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$ECL), net_emd_one_to_one(dhists_1 =gdds$EBV, dhists_2 = gdds$HSV), - net_emd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$KSHV), net_emd_one_to_one(dhists_1 =gdds$EBV, dhists_2 = gdds$VZV), - net_emd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$HSV), net_emd_one_to_one(dhists_1 =gdds$ECL, dhists_2 = gdds$KSHV), - net_emd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$VZV), net_emd_one_to_one(dhists_1 =gdds$HSV, dhists_2 = gdds$KSHV), - net_emd_one_to_one(dhists_1 = gdds$HSV, dhists_2 = gdds$VZV), net_emd_one_to_one(dhists_1 =gdds$KSHV, dhists_2 = gdds$VZV) + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$ECL), netemd_one_to_one(dhists_1 =gdds$EBV, dhists_2 = gdds$HSV), + netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$KSHV), netemd_one_to_one(dhists_1 =gdds$EBV, dhists_2 = gdds$VZV), + netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$HSV), netemd_one_to_one(dhists_1 =gdds$ECL, dhists_2 = gdds$KSHV), + netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 =gdds$HSV, dhists_2 = gdds$KSHV), + netemd_one_to_one(dhists_1 = gdds$HSV, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 =gdds$KSHV, dhists_2 = gdds$VZV) ), comp_spec = cross_comparison_spec(gdds) ) @@ -415,7 +415,7 @@ test_that("net_emds_for_all_graphs works", { # Comparison function for clarity compare_fn <- function(gdds) { - expect_equal(net_emds_for_all_graphs(gdds), expected_net_emd_fn(gdds)) + expect_equal(net_emds_for_all_graphs(gdds), expected_netemd_fn(gdds)) } # Map over test parameters, comparing actual gdds to expected diff --git a/vignettes/V-Menu.Rmd b/vignettes/V-Menu.Rmd index efa8fe00..6ada76e1 100644 --- a/vignettes/V-Menu.Rmd +++ b/vignettes/V-Menu.Rmd @@ -22,8 +22,8 @@ knitr::opts_chunk$set( This package contains multiple vignettes that illustrate the simple ways of employing the network comparison methods 'Netdis', 'NetEmd' and their variants (e.g. `Netdis Geometric-Poisson'), and also the large flexibility of these methodologies. The following is a list of the available vignettes: -* [Default usage](default_pairwise_usage.html): -* [Simple and quick use](netdis_pairwise_comparisons.html): +* [Default, simple and quick use usage 1](default_pairwise_usage.html): +* [Default, simple and quick use usage 2](netdis_pairwise_comparisons.html): * [Dendrogram Examples](dendrogram_example_net_dis.html). diff --git a/vignettes/default_pairwise_usage.Rmd b/vignettes/default_pairwise_usage.Rmd index 853c5a54..c4caeb33 100644 --- a/vignettes/default_pairwise_usage.Rmd +++ b/vignettes/default_pairwise_usage.Rmd @@ -94,7 +94,7 @@ graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") #The one to one network comparison. -net_emd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5) +netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5) ``` ## Comparing two graphs with NetEmd via their Laplacian spectrum. @@ -111,7 +111,7 @@ NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = F props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) -net_emd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0) +netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0) ``` ------------------------- diff --git a/vignettes/dendrogram_example_net_dis.R b/vignettes/dendrogram_example_net_dis.R index 6ef454de..17c93627 100644 --- a/vignettes/dendrogram_example_net_dis.R +++ b/vignettes/dendrogram_example_net_dis.R @@ -1,5 +1,4 @@ -## ------------------------------------------------------------------------ -#A CHANGE TO FILE- COMMENT. +## ----------------------------------------------------------------------------- library("netdist") edge_format = "ncol" # Load reference graph (used for Netdis. Not required for NetEMD) @@ -20,12 +19,12 @@ query_graphs <- read_simple_graphs(source_dir, pattern = file_pattern) print(names(query_graphs)) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- # Set the maximum graphlet size to compute counts for max_graphlet_size <- 4 neighbourhood_size <- 2 -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- # Calculate netdis measure for graphlets up to size max_graphlet_size netdis_result <- netdis_many_to_many(query_graphs, @@ -47,7 +46,7 @@ netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec) print("Netdis: graphlet size = 4") print(netdis4_mat) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- graphdists <- as.dist(netdis4_mat) par(mfrow = c(1, 2)) cex <- 1 @@ -72,14 +71,14 @@ plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"), cex.main = cex, cex.sub = cex, cex = cex) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- cex <- 1.5 col <- colorRampPalette(colors = c("blue","white"))(100) title <- paste("Netdis: graphlet size = ", 3, sep = "") heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) -## ------------------------------------------------------------------------ +## ----------------------------------------------------------------------------- cex <- 1.5 col <- colorRampPalette(colors = c("blue","white"))(100) title <- paste("Netdis: graphlet size = ", 4, sep = "") diff --git a/vignettes/dendrogram_example_net_dis.html b/vignettes/dendrogram_example_net_dis.html index 32cbc351..a11367e9 100644 --- a/vignettes/dendrogram_example_net_dis.html +++ b/vignettes/dendrogram_example_net_dis.html @@ -1,38 +1,50 @@ - + - + - + - + Dendrogram example for Netdis + @@ -305,7 +315,7 @@

Dendrogram example for Netdis

Martin O’Reilly

-

2020-03-12

+

2020-06-22

@@ -401,23 +411,26 @@

Generate dendrograms

cex.lab = cex, cex.axis = cex, cex.main = cex, cex.sub = cex, cex = cex) -

+

cex <- 1.5
 col <- colorRampPalette(colors = c("blue","white"))(100)
 title <- paste("Netdis: graphlet size = ", 3, sep = "")
 heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title,
         cexRow = cex, cexCol = cex, symm = TRUE)
-

+

cex <- 1.5
 col <- colorRampPalette(colors = c("blue","white"))(100)
 title <- paste("Netdis: graphlet size = ", 4, sep = "")
 heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title,
         cexRow = cex, cexCol = cex, symm = TRUE)
-

+

+ + + @@ -305,7 +315,7 @@

Dendrogram example for NetEMD

Martin O’Reilly

-

2020-03-12

+

2020-06-22

@@ -365,7 +375,7 @@

Virus PPI example for NetEMD

plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, cex.sub=cex, cex=cex) -

+

# The gdd_for_all_graphs and net_emds_for_all_graphs functions will run in 
 # parallel using multiple threads where supported. The number of threads
 # used is determined by the global R option "mc.cores". You can inspect the 
@@ -377,11 +387,14 @@ 

Virus PPI example for NetEMD

col <- colorRampPalette(colors = c("blue","white"))(100) title = paste("NetEMD: max graphlet size = ", 4, sep = "") heatmap(netemd_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE)
-

+

+ + + @@ -305,7 +315,7 @@

Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation

Martin O’Reilly, Jack Roberts

-

2020-03-12

+

2020-06-22

@@ -314,190 +324,198 @@

Load required libraries

# Load libraries
 library("netdist")
 library("purrr")
+
## 
+## Attaching package: 'purrr'
+
## The following objects are masked from 'package:igraph':
+## 
+##     compose, simplify

Load graphs

-
# Set source directory for Virus PPI graph edge files
-source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
-
-# Load query graphs
-graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
-                             format = "ncol")
-
-graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
-                             format = "ncol")
+
# Set source directory for Virus PPI graph edge files
+source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
+
+# Load query graphs
+graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
+                             format = "ncol")
+
+graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
+                             format = "ncol")

Set Netdis parameters

-
# Maximum graphlet size to calculate counts and netdis statistic for.
-max_graphlet_size <- 4
-
-# Ego network neighbourhood size
-neighbourhood_size <- 2
-
-# Minimum size of ego networks to consider
-min_ego_nodes <- 3
-min_ego_edges <- 1
-
-# Ego network density binning parameters
-min_bin_count <- 5
-num_bins <- 100
+
# Maximum graphlet size to calculate counts and netdis statistic for.
+max_graphlet_size <- 4
+
+# Ego network neighbourhood size
+neighbourhood_size <- 2
+
+# Minimum size of ego networks to consider
+min_ego_nodes <- 3
+min_ego_edges <- 1
+
+# Ego network density binning parameters
+min_bin_count <- 5
+num_bins <- 100

Generate ego networks

-
# Get ego networks for query graphs and reference graph
-ego_1 <- make_named_ego_graph(graph_1, 
-                              order = neighbourhood_size, 
-                              min_ego_nodes = min_ego_nodes, 
-                              min_ego_edges = min_ego_edges)
-
-ego_2 <- make_named_ego_graph(graph_2, 
-                              order = neighbourhood_size, 
-                              min_ego_nodes = min_ego_nodes, 
-                              min_ego_edges = min_ego_edges)
+
# Get ego networks for query graphs and reference graph
+ego_1 <- make_named_ego_graph(graph_1, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)
+
+ego_2 <- make_named_ego_graph(graph_2, 
+                              order = neighbourhood_size, 
+                              min_ego_nodes = min_ego_nodes, 
+                              min_ego_edges = min_ego_edges)

Count graphlets in ego networks

-
# Count graphlets for ego networks in query and reference graphs
-graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size)
-graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size)
+
# Count graphlets for ego networks in query and reference graphs
+graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size)
+graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size)

Bin ego networks by density

-
# Get ego-network densities
-densities_1 <- ego_network_density(graphlet_counts_1)
-densities_2 <- ego_network_density(graphlet_counts_2)
-
-# Adaptively bin ego-network densities
-binned_densities_1 <- binned_densities_adaptive(densities_1, 
-                                                min_counts_per_interval = min_bin_count, 
-                                                num_intervals = num_bins)
-
-ego_density_bins_1 <- binned_densities_1$breaks
-
-binned_densities_2 <- binned_densities_adaptive(densities_2, 
-                                                min_counts_per_interval = min_bin_count, 
-                                                num_intervals = num_bins)
-
-ego_density_bins_2 <- binned_densities_2$breaks
+
# Get ego-network densities
+densities_1 <- ego_network_density(graphlet_counts_1)
+densities_2 <- ego_network_density(graphlet_counts_2)
+
+# Adaptively bin ego-network densities
+binned_densities_1 <- binned_densities_adaptive(densities_1, 
+                                                min_counts_per_interval = min_bin_count, 
+                                                num_intervals = num_bins)
+
+ego_density_bins_1 <- binned_densities_1$breaks
+
+binned_densities_2 <- binned_densities_adaptive(densities_2, 
+                                                min_counts_per_interval = min_bin_count, 
+                                                num_intervals = num_bins)
+
+ego_density_bins_2 <- binned_densities_2$breaks

Calculate expected graphlet counts in each bin using geometric poisson approximation

-
#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY
-#' Calculate expected counts with geometric poisson (Polya-Aeppli)
-#' approximation for a single density bin.
-#' @param bin_idx Density bin index to calculate expected counts for.
-#' @param graphlet_counts Graphlet counts for a number of ego_networks.
-#' @param density_interval_indexes Density bin index for
-#' each ego network.
-exp_counts_bin_gp <- function(bin_idx, graphlet_counts,
-                              density_interval_indexes,
-                              mean_binned_graphlet_counts,
-                              max_graphlet_size) {
-  counts <- graphlet_counts[density_interval_indexes == bin_idx, ]
-  means <- mean_binned_graphlet_counts[bin_idx, ]
-  
-  mean_sub_counts <- sweep(counts, 2, means)
-  
-  Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1)
-  theta_d <- 2 * means / (Vd_sq + means)
-  
-  exp_counts_dk <- vector()
-  for (k in 2:max_graphlet_size) {
-    graphlet_idx <- graphlet_ids_for_size(k)
-    
-    lambda_dk <- mean(2 * means[graphlet_idx]^2 /
-                        (Vd_sq[graphlet_idx] + means[graphlet_idx]),
-                      na.rm = TRUE)
-    
-    exp_counts_dk <- append(exp_counts_dk,
-                            lambda_dk / theta_d[graphlet_idx])
-  }
-  
-  exp_counts_dk
-}
-
-#' Calculate expected counts in density bins using the
-#' geometric poisson (Polya-Aeppli) approximation.
-#' @param graphlet_counts Graphlet counts for a number of ego_networks.
-#' @param density_interval_indexes Density bin index for
-#' each ego network.
-#' @param max_graphlet_size Determines the maximum size of graphlets
-#' included in graphlet_counts.
-#' @export
-density_binned_counts_gp <- function(graphlet_counts,
-                                     density_interval_indexes,
-                                     max_graphlet_size) {
-
-  mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
-    graphlet_counts,
-    density_interval_indexes)
-
-  nbins <- length(unique(density_interval_indexes))
-  expected_counts_bin <- t(sapply(1:nbins,
-                                  exp_counts_bin_gp,
-                                  graphlet_counts = graphlet_counts,
-                                  density_interval_indexes = density_interval_indexes,
-                                  mean_binned_graphlet_counts = mean_binned_graphlet_counts,
-                                  max_graphlet_size = max_graphlet_size))
-
-  # deal with NAs caused by bins with zero counts for a graphlet
-  expected_counts_bin[is.nan(expected_counts_bin)] <- 0
-
-  expected_counts_bin
-}
-
-binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1,
-                                                     binned_densities_1$interval_indexes,
-                                                     max_graphlet_size)
-
-binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2,
-                                                     binned_densities_2$interval_indexes,
-                                                     max_graphlet_size)
+
#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY
+#' Calculate expected counts with geometric poisson (Polya-Aeppli)
+#' approximation for a single density bin.
+#' @param bin_idx Density bin index to calculate expected counts for.
+#' @param graphlet_counts Graphlet counts for a number of ego_networks.
+#' @param density_interval_indexes Density bin index for
+#' each ego network.
+exp_counts_bin_gp <- function(bin_idx, graphlet_counts,
+                              density_interval_indexes,
+                              mean_binned_graphlet_counts,
+                              max_graphlet_size) {
+  counts <- graphlet_counts[density_interval_indexes == bin_idx, ]
+  means <- mean_binned_graphlet_counts[bin_idx, ]
+  
+  mean_sub_counts <- sweep(counts, 2, means)
+  
+  Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1)
+  theta_d <- 2 * means / (Vd_sq + means)
+  
+  exp_counts_dk <- vector()
+  for (k in 2:max_graphlet_size) {
+    graphlet_idx <- graphlet_ids_for_size(k)
+    
+    lambda_dk <- mean(2 * means[graphlet_idx]^2 /
+                        (Vd_sq[graphlet_idx] + means[graphlet_idx]),
+                      na.rm = TRUE)
+    
+    exp_counts_dk <- append(exp_counts_dk,
+                            lambda_dk / theta_d[graphlet_idx])
+  }
+  
+  exp_counts_dk
+}
+
+#' Calculate expected counts in density bins using the
+#' geometric poisson (Polya-Aeppli) approximation.
+#' @param graphlet_counts Graphlet counts for a number of ego_networks.
+#' @param density_interval_indexes Density bin index for
+#' each ego network.
+#' @param max_graphlet_size Determines the maximum size of graphlets
+#' included in graphlet_counts.
+#' @export
+density_binned_counts_gp <- function(graphlet_counts,
+                                     density_interval_indexes,
+                                     max_graphlet_size) {
+
+  mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
+    graphlet_counts,
+    density_interval_indexes)
+
+  nbins <- length(unique(density_interval_indexes))
+  expected_counts_bin <- t(sapply(1:nbins,
+                                  exp_counts_bin_gp,
+                                  graphlet_counts = graphlet_counts,
+                                  density_interval_indexes = density_interval_indexes,
+                                  mean_binned_graphlet_counts = mean_binned_graphlet_counts,
+                                  max_graphlet_size = max_graphlet_size))
+
+  # deal with NAs caused by bins with zero counts for a graphlet
+  expected_counts_bin[is.nan(expected_counts_bin)] <- 0
+
+  expected_counts_bin
+}
+
+binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1,
+                                                     binned_densities_1$interval_indexes,
+                                                     max_graphlet_size)
+
+binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2,
+                                                     binned_densities_2$interval_indexes,
+                                                     max_graphlet_size)

Centre graphlet counts of query graphs using binned expected counts

-
# Calculate expected graphlet counts for each ego network
-exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, 
-                                                                 ego_density_bins_1, 
-                                                                 binned_graphlet_counts_1,
-                                                                 max_graphlet_size,
-                                                                 scale_fn = NULL)
-
-
-exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, 
-                                                                 ego_density_bins_2, 
-                                                                 binned_graphlet_counts_2,
-                                                                 max_graphlet_size,
-                                                                 scale_fn = NULL)
-# Centre graphlet counts by subtracting expected counts
-centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1,
-                                                        exp_graphlet_counts_1,
-                                                        max_graphlet_size)
-
-centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2,
-                                                        exp_graphlet_counts_2,
-                                                        max_graphlet_size)
+
# Calculate expected graphlet counts for each ego network
+exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, 
+                                                                 ego_density_bins_1, 
+                                                                 binned_graphlet_counts_1,
+                                                                 max_graphlet_size,
+                                                                 scale_fn = NULL)
+
+
+exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, 
+                                                                 ego_density_bins_2, 
+                                                                 binned_graphlet_counts_2,
+                                                                 max_graphlet_size,
+                                                                 scale_fn = NULL)
+# Centre graphlet counts by subtracting expected counts
+centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1,
+                                                        exp_graphlet_counts_1,
+                                                        max_graphlet_size)
+
+centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2,
+                                                        exp_graphlet_counts_2,
+                                                        max_graphlet_size)

Sum centred graphlet counts across all ego networks

-
sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1)
-
-sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2)
+
sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1)
+
+sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2)

Calculate netdis statistics

-
netdis_result <- netdis_uptok(sum_graphlet_counts_1, 
-                              sum_graphlet_counts_2, 
-                              max_graphlet_size)
-
-print(netdis_result)
+
netdis_result <- netdis_uptok(sum_graphlet_counts_1, 
+                              sum_graphlet_counts_2, 
+                              max_graphlet_size)
+
+print(netdis_result)
##   netdis3   netdis4 
 ## 0.8822527 0.1892716
+ + + @@ -305,7 +315,7 @@

Usage of netdis with binning and expected counts customisations.

Jack Roberts

-

2020-03-12

+

2020-06-22

@@ -458,12 +468,12 @@

With Modified Expected Counts: Simple Mean

exp_counts_fn = exp_counts_fn) print(results$netdis) -
##              [,1]      [,2]      [,3]       [,4]      [,5]      [,6]
-## netdis3 0.3116860 0.8254261 0.8768637 0.04053921 0.8531485 0.8226894
-## netdis4 0.9592365 0.2009423 0.7974697 0.21688688 0.7734930 0.2144558
-##              [,7]       [,8]      [,9]     [,10]
-## netdis3 0.2353732 0.01970843 0.8288649 0.9167543
-## netdis4 0.8030030 0.39992007 0.3300305 0.6301018
+
##              [,1]      [,2]      [,3]       [,4]      [,5]      [,6]      [,7]
+## netdis3 0.3116860 0.8254261 0.8768637 0.04053921 0.8531485 0.8226894 0.2353732
+## netdis4 0.9592365 0.2009423 0.7974697 0.21688688 0.7734930 0.2144558 0.8030030
+##               [,8]      [,9]     [,10]
+## netdis3 0.01970843 0.8288649 0.9167543
+## netdis4 0.39992007 0.3300305 0.6301018
print(results$comp_spec)
##    name_a name_b index_a index_b
 ## 1     EBV    ECL       1       2
@@ -480,6 +490,9 @@ 

With Modified Expected Counts: Simple Mean

+ + + @@ -305,7 +315,7 @@

Usage of netdis interfaces for different pairwise comparison options.

Jack Roberts

-

2020-03-12

+

2020-06-22

@@ -412,6 +422,9 @@

Do pairwise netdis calculations for many graphs

+ + + @@ -305,7 +315,7 @@

Quick start guide for Netdis - 2 graphs

Martin O’Reilly, Jack Roberts

-

2020-03-12

+

2020-06-22

@@ -441,6 +451,9 @@

Calculate netdis statistics

+ + + - - - - - - - - - - - - - - - - - - - - -

Dendrogram example for Netdis

-

Martin O’Reilly

-

2020-06-22

- - - -
-

Virus PPI example for Netdis

-
-

Load graphs

-

Use read_simple_graphs to read graph data from all files in a directory that match a specific filename pattern in a format suitable for calculating graphlet-based feature counts using the ORCA package. We use igraph::read_graph to read graph data from files, so support all file formats it supports. See help for igraph::read_graph for a list of supported values for the format parameter and the igraph documentation for descriptions of each of the supported file formats.

-

The ORCA package we use to efficiently calculate graphlet and orbit counts requires that graphs are undirected, simple (i.e. have no self-loops or multiple edges) and connected (i.e. have no isolated vertices). Therefore, by default, graphs loaded by read_simple_graphs will be coerced to have the above properties. This can be avoided by setting the relevant as_undirected, remove_loops, remove_multiple or remove_isolates parameters to FALSE.

-
library("netdist")
-edge_format = "ncol"
-# Load reference graph (used for Netdis. Not required for NetEMD)
-ref_path = file.path(system.file(file.path("extdata", "random"),
-                                 package = "netdist"),
-                     "ER_1250_10_1")
-ref_graph <- read_simple_graph(ref_path, format = edge_format)
-
-# Set source directory and file properties for Virus PPI graph edge files
-source_dir <- system.file(file.path("extdata", "VRPINS"),
-                          package = "netdist")
-edge_format <- "ncol"
-file_pattern <- "*"
-
-# Load all graphs in the source folder matching the filename pattern
-query_graphs <- read_simple_graphs(source_dir,
-                                   format = edge_format, 
-                                   pattern = file_pattern)
-print(names(query_graphs))
-
## [1] "EBV"   "ECL"   "HSV-1" "KSHV"  "VZV"
-

In this example we will use counts of graphlets containing up to 4 nodes and consider ego-network neighbourhoods of size 2 (i.e. the immediate neighbours of each node plus their immediate neighbours).

-
# Set the maximum graphlet size to compute counts for
-max_graphlet_size <- 4
-neighbourhood_size <- 2
-
-
-
-

Generate NetDis measures between each pair of query graphs

-
# Calculate netdis measure for graphlets up to size max_graphlet_size
-netdis_result <- netdis_many_to_many(query_graphs,
-                                     ref_graph,
-                                     max_graphlet_size = max_graphlet_size,
-                                     neighbourhood_size = neighbourhood_size)
-
-# Netdis measure for graphlets of size 3
-res3 <- netdis_result$netdis["netdis3", ]
-netdis3_mat <- cross_comp_to_matrix(res3, netdis_result$comp_spec)
-
-print("Netdis: graphlet size = 3")
-
## [1] "Netdis: graphlet size = 3"
-
print(netdis3_mat)
-
##               EBV       ECL        HSV-1         KSHV          VZV
-## EBV   0.000000000 0.1846655 0.0082642217 0.0100538469 0.0067775779
-## ECL   0.184665514 0.0000000 0.2065761911 0.2091240549 0.2075471192
-## HSV-1 0.008264222 0.2065762 0.0000000000 0.0001335756 0.0001748254
-## KSHV  0.010053847 0.2091241 0.0001335756 0.0000000000 0.0005964448
-## VZV   0.006777578 0.2075471 0.0001748254 0.0005964448 0.0000000000
-
# Netdis measure for graphlets of size 4
-res4 <- netdis_result$netdis["netdis4", ]
-netdis4_mat <- cross_comp_to_matrix(res4, netdis_result$comp_spec)
-
-print("Netdis: graphlet size = 4")
-
## [1] "Netdis: graphlet size = 4"
-
print(netdis4_mat)
-
##              EBV       ECL      HSV-1       KSHV        VZV
-## EBV   0.00000000 0.1749835 0.16526412 0.01969246 0.15971116
-## ECL   0.17498347 0.0000000 0.29176120 0.22155786 0.41716144
-## HSV-1 0.16526412 0.2917612 0.00000000 0.07602426 0.03434187
-## KSHV  0.01969246 0.2215579 0.07602426 0.00000000 0.13115524
-## VZV   0.15971116 0.4171614 0.03434187 0.13115524 0.00000000
-
-
-

Generate dendrograms

-
graphdists <- as.dist(netdis4_mat)
-par(mfrow = c(1, 2))
-cex <- 1
-
-# Dendrogram based on Netdis measure for graphlets of size 3
-title <- paste("Netdis: graphlet size = ", 3, sep = "")
-plot(phangorn::upgma(as.dist(netdis3_mat), method = "average"),
-     use.edge.length = FALSE, 
-     edge.width = cex*2,
-     main = title,
-     cex.lab = cex, cex.axis = cex,
-     cex.main = cex, cex.sub = cex,
-     cex = cex)
-
-# Dendrogram based on Netdis measure for graphlets of size 4
-title = paste("Netdis: graphlet size = ", 4, sep = "")
-plot(phangorn::upgma(as.dist(netdis4_mat), method = "average"),
-     use.edge.length = FALSE, 
-     edge.width = cex*2,
-     main = title,
-     cex.lab = cex, cex.axis = cex,
-     cex.main = cex, cex.sub = cex,
-     cex = cex)
-

-
cex <- 1.5
-col <- colorRampPalette(colors = c("blue","white"))(100)
-title <- paste("Netdis: graphlet size = ", 3, sep = "")
-heatmap(netdis3_mat, Rowv = NULL, Colv = NULL, col = col, main = title,
-        cexRow = cex, cexCol = cex, symm = TRUE)
-

-
cex <- 1.5
-col <- colorRampPalette(colors = c("blue","white"))(100)
-title <- paste("Netdis: graphlet size = ", 4, sep = "")
-heatmap(netdis4_mat, Rowv = NULL, Colv = NULL, col = col, main = title,
-        cexRow = cex, cexCol = cex, symm = TRUE)
-

-
- - - - - - - - - - - diff --git a/vignettes/dendrogram_example_net_emd.R b/vignettes/dendrogram_example_net_emd.R deleted file mode 100644 index 6af69634..00000000 --- a/vignettes/dendrogram_example_net_emd.R +++ /dev/null @@ -1,66 +0,0 @@ -## ---- fig.show='hold'--------------------------------------------------------- -library("netdist") -# Set source directory and file properties for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -edge_format = "ncol" -file_pattern = ".txt" - -# Calculate graphlet-based degree distributions for all orbits in graphlets -# comprising up to 4 nodes for all graphs. This only needs to be done once -# per graph (feature_type = "orbit", max_graphlet_size = 4).. -# If feature_type is set to "feature_type", orbit counts for orbits in the -# same graphlet will be summed to generate graphlet counts -# If max_graphlet_size is set to 5, graphlet-based degree distributions will -# be calculated for graphlets comprising up to 5 nodes. -virus_gdds <- gdd_for_all_graphs( - source_dir = source_dir, format = edge_format, pattern = file_pattern, - feature_type = "orbit", max_graphlet_size = 4) -names(virus_gdds) - -# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- -# based degree distributions using the default fast "optimise" method and no -# smoothing (default). The "optimise" method uses the built-in R optimise -# function to efficiently find the offset with the minimum EMD, but is not -# guaranteed to find the global minimum if EMD as a function of offset -# is non-convex and/or multimodal. The smoothing window width determines -# whether to calculate the NetEMD from the unaltered discrete GDD histograms -# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" -# smoothing by "smearing" the discrete GDD histogram point masses across bins -# of unit width (smoothing_window_width = 1). Returns a named list containing: -# (i) the NetEMDs and (ii) a table containing the graph names and indices -# within the input GDD list for each pair of graphs compared. -res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) - -# You can also specify method = "fixed_step" to use the much slower method of -# exhaustively evaluating the EMD at all offsets separated by a fixed step. -# The default step size is 1/2 the the minimum spacing between locations in -# either histogram after normalising to unit variance. However, you can -# specifiy your own fixed step using the optional "step_size" parameter. -# Note that this step size is applied to the histograms after they have been -# normalised to unit variance - -# Convert to matrix for input to dendrogram method -netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec) -netemd_mat - -## ----------------------------------------------------------------------------- -cex=1 -title = paste("NetEMD: max graphlet size = ", 4, sep = "") -plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, - edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, - cex.sub=cex, cex=cex) - -# The gdd_for_all_graphs and net_emds_for_all_graphs functions will run in -# parallel using multiple threads where supported. The number of threads -# used is determined by the global R option "mc.cores". You can inspect the -# current value of this using options("mc.cores") and set it with -# options("mc.cores" = ). To fully utilise a modern consumer -# processor, this should be set to 2x the number of available processor -# cores as each core supports two threads. - -## ----------------------------------------------------------------------------- -cex=1.5 -col <- colorRampPalette(colors = c("blue","white"))(100) -title = paste("NetEMD: max graphlet size = ", 4, sep = "") -heatmap(netemd_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE) - diff --git a/vignettes/dendrogram_example_net_emd.html b/vignettes/dendrogram_example_net_emd.html deleted file mode 100644 index 4a4d357e..00000000 --- a/vignettes/dendrogram_example_net_emd.html +++ /dev/null @@ -1,409 +0,0 @@ - - - - - - - - - - - - - - - - -Dendrogram example for NetEMD - - - - - - - - - - - - - - - - - - - - - - -

Dendrogram example for NetEMD

-

Martin O’Reilly

-

2020-06-22

- - - -
-

Virus PPI example for NetEMD

-
library("netdist")
-# Set source directory and file properties for Virus PPI graph edge files
-source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
-edge_format = "ncol"
-file_pattern = ".txt"
-
-# Calculate graphlet-based degree distributions for all orbits in graphlets 
-# comprising up to 4 nodes for all graphs. This only needs to be done once 
-# per graph (feature_type = "orbit", max_graphlet_size = 4).. 
-# If feature_type is set to "feature_type", orbit counts for orbits in the
-# same graphlet will be summed to generate graphlet counts
-# If max_graphlet_size is set to 5, graphlet-based degree distributions will  
-# be calculated for graphlets comprising up to 5 nodes.
-virus_gdds <- gdd_for_all_graphs(
-  source_dir = source_dir, format = edge_format, pattern = file_pattern, 
-  feature_type = "orbit", max_graphlet_size = 4)
-names(virus_gdds)
-
## [1] "EBV"   "ECL"   "HSV-1" "KSHV"  "VZV"
-
# Compute NetEMDs between all virus PPI graphs based on the computed graphlet- 
-# based degree distributions using the default fast "optimise" method and no
-# smoothing (default). The "optimise" method uses the built-in R optimise
-# function to efficiently find the offset with the minimum EMD, but is not
-# guaranteed to find the global minimum if EMD as a function of offset
-# is non-convex and/or multimodal. The smoothing window width determines 
-# whether to calculate the NetEMD from the unaltered discrete GDD histograms
-# (smoothing_window_width = 0; default) or to first apply "nearest neighbour" 
-# smoothing by "smearing" the discrete GDD histogram point masses across bins 
-# of unit width (smoothing_window_width = 1). Returns a named list containing:
-# (i) the NetEMDs and (ii) a table containing the graph names and indices 
-# within the input GDD list for each pair of graphs compared.
-res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0)
-
-# You can also specify method = "fixed_step" to use the much slower method of 
-# exhaustively evaluating the EMD at all offsets separated by a fixed step. 
-# The default step size is 1/2 the the minimum spacing between locations in 
-# either histogram after normalising to unit variance. However, you can 
-# specifiy your own fixed step using the optional "step_size" parameter.
-# Note that this step size is applied to the histograms after they have been 
-# normalised to unit variance
-
-# Convert to matrix for input to dendrogram method
-netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec)
-netemd_mat
-
##             EBV       ECL     HSV-1      KSHV       VZV
-## EBV   0.0000000 0.4876039 0.1662892 0.1607293 0.1994605
-## ECL   0.4876039 0.0000000 0.3986281 0.4024176 0.4029344
-## HSV-1 0.1662892 0.3986281 0.0000000 0.1581520 0.2164003
-## KSHV  0.1607293 0.4024176 0.1581520 0.0000000 0.2323936
-## VZV   0.1994605 0.4029344 0.2164003 0.2323936 0.0000000
-
cex=1
-title = paste("NetEMD: max graphlet size = ", 4, sep = "")
-plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FALSE, 
-     edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, 
-     cex.sub=cex, cex=cex)
-

-
# The gdd_for_all_graphs and net_emds_for_all_graphs functions will run in 
-# parallel using multiple threads where supported. The number of threads
-# used is determined by the global R option "mc.cores". You can inspect the 
-# current value of this using options("mc.cores") and set it with 
-# options("mc.cores" = <num_cores>). To fully utilise a modern consumer
-# processor, this should be set to 2x the number of available processor 
-# cores as each core supports two threads.
-
cex=1.5
-col <- colorRampPalette(colors = c("blue","white"))(100)
-title = paste("NetEMD: max graphlet size = ", 4, sep = "")
-heatmap(netemd_mat, Rowv = NULL, Colv = NULL, col = col, main = title, cexRow = cex, cexCol = cex, symm = TRUE)
-

-
- - - - - - - - - - - diff --git a/vignettes/netdis_2graphs_polya-aeppli.R b/vignettes/netdis_2graphs_polya-aeppli.R deleted file mode 100644 index 4d73ce38..00000000 --- a/vignettes/netdis_2graphs_polya-aeppli.R +++ /dev/null @@ -1,180 +0,0 @@ -## ----------------------------------------------------------------------------- -# Load libraries -library("netdist") -library("purrr") - -## ----------------------------------------------------------------------------- -# Set source directory for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - -# Load query graphs -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") - -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") - - -## ----------------------------------------------------------------------------- -# Maximum graphlet size to calculate counts and netdis statistic for. -max_graphlet_size <- 4 - -# Ego network neighbourhood size -neighbourhood_size <- 2 - -# Minimum size of ego networks to consider -min_ego_nodes <- 3 -min_ego_edges <- 1 - -# Ego network density binning parameters -min_bin_count <- 5 -num_bins <- 100 - -## ----------------------------------------------------------------------------- -# Get ego networks for query graphs and reference graph -ego_1 <- make_named_ego_graph(graph_1, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -ego_2 <- make_named_ego_graph(graph_2, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - - -## ----------------------------------------------------------------------------- -# Count graphlets for ego networks in query and reference graphs -graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) -graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) - - -## ----------------------------------------------------------------------------- - -# Get ego-network densities -densities_1 <- ego_network_density(graphlet_counts_1) -densities_2 <- ego_network_density(graphlet_counts_2) - -# Adaptively bin ego-network densities -binned_densities_1 <- binned_densities_adaptive(densities_1, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) - -ego_density_bins_1 <- binned_densities_1$breaks - -binned_densities_2 <- binned_densities_adaptive(densities_2, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) - -ego_density_bins_2 <- binned_densities_2$breaks - -## ----------------------------------------------------------------------------- - -#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY -#' Calculate expected counts with geometric poisson (Polya-Aeppli) -#' approximation for a single density bin. -#' @param bin_idx Density bin index to calculate expected counts for. -#' @param graphlet_counts Graphlet counts for a number of ego_networks. -#' @param density_interval_indexes Density bin index for -#' each ego network. -exp_counts_bin_gp <- function(bin_idx, graphlet_counts, - density_interval_indexes, - mean_binned_graphlet_counts, - max_graphlet_size) { - counts <- graphlet_counts[density_interval_indexes == bin_idx, ] - means <- mean_binned_graphlet_counts[bin_idx, ] - - mean_sub_counts <- sweep(counts, 2, means) - - Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) - theta_d <- 2 * means / (Vd_sq + means) - - exp_counts_dk <- vector() - for (k in 2:max_graphlet_size) { - graphlet_idx <- graphlet_ids_for_size(k) - - lambda_dk <- mean(2 * means[graphlet_idx]^2 / - (Vd_sq[graphlet_idx] + means[graphlet_idx]), - na.rm = TRUE) - - exp_counts_dk <- append(exp_counts_dk, - lambda_dk / theta_d[graphlet_idx]) - } - - exp_counts_dk -} - -#' Calculate expected counts in density bins using the -#' geometric poisson (Polya-Aeppli) approximation. -#' @param graphlet_counts Graphlet counts for a number of ego_networks. -#' @param density_interval_indexes Density bin index for -#' each ego network. -#' @param max_graphlet_size Determines the maximum size of graphlets -#' included in graphlet_counts. -#' @export -density_binned_counts_gp <- function(graphlet_counts, - density_interval_indexes, - max_graphlet_size) { - - mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - graphlet_counts, - density_interval_indexes) - - nbins <- length(unique(density_interval_indexes)) - expected_counts_bin <- t(sapply(1:nbins, - exp_counts_bin_gp, - graphlet_counts = graphlet_counts, - density_interval_indexes = density_interval_indexes, - mean_binned_graphlet_counts = mean_binned_graphlet_counts, - max_graphlet_size = max_graphlet_size)) - - # deal with NAs caused by bins with zero counts for a graphlet - expected_counts_bin[is.nan(expected_counts_bin)] <- 0 - - expected_counts_bin -} - -binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1, - binned_densities_1$interval_indexes, - max_graphlet_size) - -binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2, - binned_densities_2$interval_indexes, - max_graphlet_size) - -## ----------------------------------------------------------------------------- -# Calculate expected graphlet counts for each ego network -exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, - ego_density_bins_1, - binned_graphlet_counts_1, - max_graphlet_size, - scale_fn = NULL) - - -exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, - ego_density_bins_2, - binned_graphlet_counts_2, - max_graphlet_size, - scale_fn = NULL) -# Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, - exp_graphlet_counts_1, - max_graphlet_size) - -centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, - exp_graphlet_counts_2, - max_graphlet_size) - -## ----------------------------------------------------------------------------- -sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) - -sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) - -## ----------------------------------------------------------------------------- - -netdis_result <- netdis_uptok(sum_graphlet_counts_1, - sum_graphlet_counts_2, - max_graphlet_size) - -print(netdis_result) - diff --git a/vignettes/netdis_2graphs_polya-aeppli.html b/vignettes/netdis_2graphs_polya-aeppli.html deleted file mode 100644 index 2c496350..00000000 --- a/vignettes/netdis_2graphs_polya-aeppli.html +++ /dev/null @@ -1,530 +0,0 @@ - - - - - - - - - - - - - - - - -Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation - - - - - - - - - - - - - - - - - - - - - - -

Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation

-

Martin O’Reilly, Jack Roberts

-

2020-06-22

- - - -
-

Load required libraries

-
# Load libraries
-library("netdist")
-library("purrr")
-
## 
-## Attaching package: 'purrr'
-
## The following objects are masked from 'package:igraph':
-## 
-##     compose, simplify
-
-
-

Load graphs

-
# Set source directory for Virus PPI graph edge files
-source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
-
-# Load query graphs
-graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
-                             format = "ncol")
-
-graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
-                             format = "ncol")
-
-
-

Set Netdis parameters

-
# Maximum graphlet size to calculate counts and netdis statistic for.
-max_graphlet_size <- 4
-
-# Ego network neighbourhood size
-neighbourhood_size <- 2
-
-# Minimum size of ego networks to consider
-min_ego_nodes <- 3
-min_ego_edges <- 1
-
-# Ego network density binning parameters
-min_bin_count <- 5
-num_bins <- 100
-
-
-

Generate ego networks

-
# Get ego networks for query graphs and reference graph
-ego_1 <- make_named_ego_graph(graph_1, 
-                              order = neighbourhood_size, 
-                              min_ego_nodes = min_ego_nodes, 
-                              min_ego_edges = min_ego_edges)
-
-ego_2 <- make_named_ego_graph(graph_2, 
-                              order = neighbourhood_size, 
-                              min_ego_nodes = min_ego_nodes, 
-                              min_ego_edges = min_ego_edges)
-
-
-

Count graphlets in ego networks

-
# Count graphlets for ego networks in query and reference graphs
-graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size)
-graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size)
-
-
-

Bin ego networks by density

-
# Get ego-network densities
-densities_1 <- ego_network_density(graphlet_counts_1)
-densities_2 <- ego_network_density(graphlet_counts_2)
-
-# Adaptively bin ego-network densities
-binned_densities_1 <- binned_densities_adaptive(densities_1, 
-                                                min_counts_per_interval = min_bin_count, 
-                                                num_intervals = num_bins)
-
-ego_density_bins_1 <- binned_densities_1$breaks
-
-binned_densities_2 <- binned_densities_adaptive(densities_2, 
-                                                min_counts_per_interval = min_bin_count, 
-                                                num_intervals = num_bins)
-
-ego_density_bins_2 <- binned_densities_2$breaks
-
-
-

Calculate expected graphlet counts in each bin using geometric poisson approximation

-
#' INTERNAL FUNCTION - DO NOT CALL DIRECTLY
-#' Calculate expected counts with geometric poisson (Polya-Aeppli)
-#' approximation for a single density bin.
-#' @param bin_idx Density bin index to calculate expected counts for.
-#' @param graphlet_counts Graphlet counts for a number of ego_networks.
-#' @param density_interval_indexes Density bin index for
-#' each ego network.
-exp_counts_bin_gp <- function(bin_idx, graphlet_counts,
-                              density_interval_indexes,
-                              mean_binned_graphlet_counts,
-                              max_graphlet_size) {
-  counts <- graphlet_counts[density_interval_indexes == bin_idx, ]
-  means <- mean_binned_graphlet_counts[bin_idx, ]
-  
-  mean_sub_counts <- sweep(counts, 2, means)
-  
-  Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1)
-  theta_d <- 2 * means / (Vd_sq + means)
-  
-  exp_counts_dk <- vector()
-  for (k in 2:max_graphlet_size) {
-    graphlet_idx <- graphlet_ids_for_size(k)
-    
-    lambda_dk <- mean(2 * means[graphlet_idx]^2 /
-                        (Vd_sq[graphlet_idx] + means[graphlet_idx]),
-                      na.rm = TRUE)
-    
-    exp_counts_dk <- append(exp_counts_dk,
-                            lambda_dk / theta_d[graphlet_idx])
-  }
-  
-  exp_counts_dk
-}
-
-#' Calculate expected counts in density bins using the
-#' geometric poisson (Polya-Aeppli) approximation.
-#' @param graphlet_counts Graphlet counts for a number of ego_networks.
-#' @param density_interval_indexes Density bin index for
-#' each ego network.
-#' @param max_graphlet_size Determines the maximum size of graphlets
-#' included in graphlet_counts.
-#' @export
-density_binned_counts_gp <- function(graphlet_counts,
-                                     density_interval_indexes,
-                                     max_graphlet_size) {
-
-  mean_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
-    graphlet_counts,
-    density_interval_indexes)
-
-  nbins <- length(unique(density_interval_indexes))
-  expected_counts_bin <- t(sapply(1:nbins,
-                                  exp_counts_bin_gp,
-                                  graphlet_counts = graphlet_counts,
-                                  density_interval_indexes = density_interval_indexes,
-                                  mean_binned_graphlet_counts = mean_binned_graphlet_counts,
-                                  max_graphlet_size = max_graphlet_size))
-
-  # deal with NAs caused by bins with zero counts for a graphlet
-  expected_counts_bin[is.nan(expected_counts_bin)] <- 0
-
-  expected_counts_bin
-}
-
-binned_graphlet_counts_1 <- density_binned_counts_gp(graphlet_counts_1,
-                                                     binned_densities_1$interval_indexes,
-                                                     max_graphlet_size)
-
-binned_graphlet_counts_2 <- density_binned_counts_gp(graphlet_counts_2,
-                                                     binned_densities_2$interval_indexes,
-                                                     max_graphlet_size)
-
-
-

Centre graphlet counts of query graphs using binned expected counts

-
# Calculate expected graphlet counts for each ego network
-exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, 
-                                                                 ego_density_bins_1, 
-                                                                 binned_graphlet_counts_1,
-                                                                 max_graphlet_size,
-                                                                 scale_fn = NULL)
-
-
-exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, 
-                                                                 ego_density_bins_2, 
-                                                                 binned_graphlet_counts_2,
-                                                                 max_graphlet_size,
-                                                                 scale_fn = NULL)
-# Centre graphlet counts by subtracting expected counts
-centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1,
-                                                        exp_graphlet_counts_1,
-                                                        max_graphlet_size)
-
-centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2,
-                                                        exp_graphlet_counts_2,
-                                                        max_graphlet_size)
-
-
-

Sum centred graphlet counts across all ego networks

-
sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1)
-
-sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2)
-
-
-

Calculate netdis statistics

-
netdis_result <- netdis_uptok(sum_graphlet_counts_1, 
-                              sum_graphlet_counts_2, 
-                              max_graphlet_size)
-
-print(netdis_result)
-
##   netdis3   netdis4 
-## 0.8822527 0.1892716
-
- - - - - - - - - - - diff --git a/vignettes/netdis_customisations.R b/vignettes/netdis_customisations.R deleted file mode 100644 index 2c5c0071..00000000 --- a/vignettes/netdis_customisations.R +++ /dev/null @@ -1,98 +0,0 @@ -## ----------------------------------------------------------------------------- -# Load libraries -library("netdist") -library("purrr") - -## ----------------------------------------------------------------------------- -# Maximum graphlet size to calculate counts and netdis statistic for. -max_graphlet_size <- 4 - -# Ego network neighbourhood size -neighbourhood_size <- 2 - -# Minimum size of ego networks to consider -min_ego_nodes <- 3 -min_ego_edges <- 1 - -# Reference graph -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") -ref_graph <- read_simple_graph(ref_path, format = "ncol") - - -## ----------------------------------------------------------------------------- -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") - -## ----------------------------------------------------------------------------- - -# Calculate netdis statistics -results <- netdis_many_to_many(graphs, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -print(results$netdis) -print(results$comp_spec) - -## ----------------------------------------------------------------------------- - -binning_fn <- purrr::partial(binned_densities_adaptive, - min_counts_per_interval = 10, - num_intervals = 50) - - -# Calculate netdis statistics -results <- netdis_many_to_many(graphs, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - binning_fn = binning_fn) - -print(results$netdis) -print(results$comp_spec) - - - -## ----------------------------------------------------------------------------- -bin_counts_fn <- density_binned_counts_gp - -exp_counts_fn <- purrr::partial(netdis_expected_counts, - scale_fn = NULL) - -# Calculate netdis statistics -results <- netdis_many_to_many(graphs, - ref_graph = NULL, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - bin_counts_fn = bin_counts_fn, - exp_counts_fn = exp_counts_fn) - -print(results$netdis) -print(results$comp_spec) - -## ----------------------------------------------------------------------------- -binning_fn <- single_density_bin -bin_counts_fn <- density_binned_counts -exp_counts_fn <- netdis_expected_counts - -# Calculate netdis statistics -results <- netdis_many_to_many(graphs, - ref_graph = NULL, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges, - binning_fn = binning_fn, - bin_counts_fn = bin_counts_fn, - exp_counts_fn = exp_counts_fn) - -print(results$netdis) -print(results$comp_spec) - diff --git a/vignettes/netdis_customisations.html b/vignettes/netdis_customisations.html deleted file mode 100644 index 9607767a..00000000 --- a/vignettes/netdis_customisations.html +++ /dev/null @@ -1,507 +0,0 @@ - - - - - - - - - - - - - - - - -Usage of netdis with binning and expected counts customisations. - - - - - - - - - - - - - - - - - - - - - - -

Usage of netdis with binning and expected counts customisations.

-

Jack Roberts

-

2020-06-22

- - - -
-

Load required libraries

-
# Load libraries
-library("netdist")
-library("purrr")
-
-
-

Set Netdis parameters

-
# Maximum graphlet size to calculate counts and netdis statistic for.
-max_graphlet_size <- 4
-
-# Ego network neighbourhood size
-neighbourhood_size <- 2
-
-# Minimum size of ego networks to consider
-min_ego_nodes <- 3
-min_ego_edges <- 1
-
-# Reference graph
-ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
-                        package = "netdist")
-ref_graph <- read_simple_graph(ref_path, format = "ncol")
-
-
-

Load query graphs

-
source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
-graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
-
-
-

Default Expected Counts with Reference Graph

-
# Calculate netdis statistics
-results <- netdis_many_to_many(graphs,
-                               ref_graph,
-                               max_graphlet_size = max_graphlet_size,
-                               neighbourhood_size = neighbourhood_size,
-                               min_ego_nodes = min_ego_nodes,
-                               min_ego_edges = min_ego_edges)
-
-print(results$netdis)
-
##              [,1]        [,2]       [,3]        [,4]      [,5]      [,6]
-## netdis3 0.1846655 0.008264222 0.01005385 0.006777578 0.2065762 0.2091241
-## netdis4 0.1749835 0.165264120 0.01969246 0.159711160 0.2917612 0.2215579
-##              [,7]         [,8]         [,9]        [,10]
-## netdis3 0.2075471 0.0001335756 0.0001748254 0.0005964448
-## netdis4 0.4171614 0.0760242643 0.0343418653 0.1311552411
-
print(results$comp_spec)
-
##    name_a name_b index_a index_b
-## 1     EBV    ECL       1       2
-## 2     EBV  HSV-1       1       3
-## 3     EBV   KSHV       1       4
-## 4     EBV    VZV       1       5
-## 5     ECL  HSV-1       2       3
-## 6     ECL   KSHV       2       4
-## 7     ECL    VZV       2       5
-## 8   HSV-1   KSHV       3       4
-## 9   HSV-1    VZV       3       5
-## 10   KSHV    VZV       4       5
-
-
-

With Modified Binning Parameters

-
binning_fn <- purrr::partial(binned_densities_adaptive,
-                             min_counts_per_interval = 10,
-                             num_intervals = 50)
-
-
-# Calculate netdis statistics
-results <- netdis_many_to_many(graphs,
-                               ref_graph,
-                               max_graphlet_size = max_graphlet_size,
-                               neighbourhood_size = neighbourhood_size,
-                               min_ego_nodes = min_ego_nodes,
-                               min_ego_edges = min_ego_edges,
-                               binning_fn = binning_fn)
-
-print(results$netdis)
-
##               [,1]        [,2]        [,3]        [,4]      [,5]      [,6]
-## netdis3 0.08499773 0.005900766 0.009547675 0.007177066 0.1078916 0.1144589
-## netdis4 0.20037679 0.045244760 0.018904439 0.112043371 0.3361503 0.2631420
-##              [,7]         [,8]         [,9]        [,10]
-## netdis3 0.1101426 0.0006494388 2.478794e-05 0.0004097632
-## netdis4 0.4818139 0.0274434372 3.227187e-02 0.0928126401
-
print(results$comp_spec)
-
##    name_a name_b index_a index_b
-## 1     EBV    ECL       1       2
-## 2     EBV  HSV-1       1       3
-## 3     EBV   KSHV       1       4
-## 4     EBV    VZV       1       5
-## 5     ECL  HSV-1       2       3
-## 6     ECL   KSHV       2       4
-## 7     ECL    VZV       2       5
-## 8   HSV-1   KSHV       3       4
-## 9   HSV-1    VZV       3       5
-## 10   KSHV    VZV       4       5
-
-
-

With Modified Expected Counts: Geometric Poisson

-
bin_counts_fn <- density_binned_counts_gp
-
-exp_counts_fn <- purrr::partial(netdis_expected_counts,
-                                scale_fn = NULL)
-
-# Calculate netdis statistics
-results <- netdis_many_to_many(graphs,
-                               ref_graph = NULL,
-                               max_graphlet_size = max_graphlet_size,
-                               neighbourhood_size = neighbourhood_size,
-                               min_ego_nodes = min_ego_nodes,
-                               min_ego_edges = min_ego_edges,
-                               bin_counts_fn = bin_counts_fn,
-                               exp_counts_fn = exp_counts_fn)
-
-print(results$netdis)
-
##              [,1]      [,2]      [,3]       [,4]       [,5]       [,6]
-## netdis3 0.8822527 0.9101084 0.8838054 0.96266771 0.04173551 0.03585169
-## netdis4 0.1892716 0.5735233 0.3719671 0.04604718 0.60270399 0.20370737
-##               [,7]         [,8]        [,9]       [,10]
-## netdis3 0.06271238 0.0004211575 0.005364888 0.009114229
-## netdis4 0.12978637 0.7173089685 0.487688692 0.371848474
-
print(results$comp_spec)
-
##    name_a name_b index_a index_b
-## 1     EBV    ECL       1       2
-## 2     EBV  HSV-1       1       3
-## 3     EBV   KSHV       1       4
-## 4     EBV    VZV       1       5
-## 5     ECL  HSV-1       2       3
-## 6     ECL   KSHV       2       4
-## 7     ECL    VZV       2       5
-## 8   HSV-1   KSHV       3       4
-## 9   HSV-1    VZV       3       5
-## 10   KSHV    VZV       4       5
-
-
-

With Modified Expected Counts: Simple Mean

-
binning_fn <- single_density_bin
-bin_counts_fn <- density_binned_counts
-exp_counts_fn <- netdis_expected_counts
-
-# Calculate netdis statistics
-results <- netdis_many_to_many(graphs,
-                               ref_graph = NULL,
-                               max_graphlet_size = max_graphlet_size,
-                               neighbourhood_size = neighbourhood_size,
-                               min_ego_nodes = min_ego_nodes,
-                               min_ego_edges = min_ego_edges,
-                               binning_fn = binning_fn,
-                               bin_counts_fn = bin_counts_fn,
-                               exp_counts_fn = exp_counts_fn)
-
-print(results$netdis)
-
##              [,1]      [,2]      [,3]       [,4]      [,5]      [,6]      [,7]
-## netdis3 0.3116860 0.8254261 0.8768637 0.04053921 0.8531485 0.8226894 0.2353732
-## netdis4 0.9592365 0.2009423 0.7974697 0.21688688 0.7734930 0.2144558 0.8030030
-##               [,8]      [,9]     [,10]
-## netdis3 0.01970843 0.8288649 0.9167543
-## netdis4 0.39992007 0.3300305 0.6301018
-
print(results$comp_spec)
-
##    name_a name_b index_a index_b
-## 1     EBV    ECL       1       2
-## 2     EBV  HSV-1       1       3
-## 3     EBV   KSHV       1       4
-## 4     EBV    VZV       1       5
-## 5     ECL  HSV-1       2       3
-## 6     ECL   KSHV       2       4
-## 7     ECL    VZV       2       5
-## 8   HSV-1   KSHV       3       4
-## 9   HSV-1    VZV       3       5
-## 10   KSHV    VZV       4       5
-
- - - - - - - - - - - diff --git a/vignettes/netdis_pairwise_comparisons.R b/vignettes/netdis_pairwise_comparisons.R deleted file mode 100644 index 8261d7a4..00000000 --- a/vignettes/netdis_pairwise_comparisons.R +++ /dev/null @@ -1,74 +0,0 @@ -## ----------------------------------------------------------------------------- -# Load libraries -library("netdist") -library("purrr") - -## ----------------------------------------------------------------------------- -# Maximum graphlet size to calculate counts and netdis statistic for. -max_graphlet_size <- 4 - -# Ego network neighbourhood size -neighbourhood_size <- 2 - -# Minimum size of ego networks to consider -min_ego_nodes <- 3 -min_ego_edges <- 1 - -# Ego network density binning parameters -min_bin_count <- 5 -num_bins <- 100 - -# Reference graph -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") -ref_graph <- read_simple_graph(ref_path, format = "ncol") - - -## ----------------------------------------------------------------------------- -# Load query graphs -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") - -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") - -# Calculate netdis statistics -netdis_one_to_one(graph_1, graph_2, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -## ----------------------------------------------------------------------------- -# Load query graphs -graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") -graph_1 <- graphs$EBV -graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")] - -# Calculate netdis statistics -netdis_one_to_many(graph_1, graphs_compare, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -## ----------------------------------------------------------------------------- -# Load query graphs -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") -graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*") - -# Calculate netdis statistics -results <- netdis_many_to_many(graphs, - ref_graph, - max_graphlet_size = max_graphlet_size, - neighbourhood_size = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -print(results$netdis) -print(results$comp_spec) - diff --git a/vignettes/netdis_pairwise_comparisons.html b/vignettes/netdis_pairwise_comparisons.html deleted file mode 100644 index ee4d70e1..00000000 --- a/vignettes/netdis_pairwise_comparisons.html +++ /dev/null @@ -1,439 +0,0 @@ - - - - - - - - - - - - - - - - -Usage of netdis interfaces for different pairwise comparison options. - - - - - - - - - - - - - - - - - - - - - - -

Usage of netdis interfaces for different pairwise comparison options.

-

Jack Roberts

-

2020-06-22

- - - -
-

Load required libraries

-
# Load libraries
-library("netdist")
-library("purrr")
-
-
-

Set Netdis parameters

-
# Maximum graphlet size to calculate counts and netdis statistic for.
-max_graphlet_size <- 4
-
-# Ego network neighbourhood size
-neighbourhood_size <- 2
-
-# Minimum size of ego networks to consider
-min_ego_nodes <- 3
-min_ego_edges <- 1
-
-# Ego network density binning parameters
-min_bin_count <- 5
-num_bins <- 100
-
-# Reference graph
-ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
-                        package = "netdist")
-ref_graph <- read_simple_graph(ref_path, format = "ncol")
-
-
-

Compare two graphs

-
# Load query graphs
-source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
-
-graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
-                             format = "ncol")
-
-graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
-                             format = "ncol")
-
-# Calculate netdis statistics
-netdis_one_to_one(graph_1, graph_2,
-                  ref_graph,
-                  max_graphlet_size = max_graphlet_size,
-                  neighbourhood_size = neighbourhood_size,
-                  min_ego_nodes = min_ego_nodes,
-                  min_ego_edges = min_ego_edges)
-
##   netdis3   netdis4 
-## 0.1846655 0.1749835
-
-
-

Compare one graph to many other graphs

-
# Load query graphs
-graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
-graph_1 <- graphs$EBV
-graphs_compare <- graphs[c("ECL", "HSV-1", "KSHV", "VZV")]
-
-# Calculate netdis statistics
-netdis_one_to_many(graph_1, graphs_compare,
-                   ref_graph,
-                   max_graphlet_size = max_graphlet_size,
-                   neighbourhood_size = neighbourhood_size,
-                   min_ego_nodes = min_ego_nodes,
-                   min_ego_edges = min_ego_edges)
-
##               ECL       HSV-1       KSHV         VZV
-## netdis3 0.1846655 0.008264222 0.01005385 0.006777578
-## netdis4 0.1749835 0.165264120 0.01969246 0.159711160
-
-
-

Do pairwise netdis calculations for many graphs

-
# Load query graphs
-source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
-graphs <- read_simple_graphs(source_dir, format = "ncol", pattern = "*")
-
-# Calculate netdis statistics
-results <- netdis_many_to_many(graphs,
-                               ref_graph,
-                               max_graphlet_size = max_graphlet_size,
-                               neighbourhood_size = neighbourhood_size,
-                               min_ego_nodes = min_ego_nodes,
-                               min_ego_edges = min_ego_edges)
-
-print(results$netdis)
-
##              [,1]        [,2]       [,3]        [,4]      [,5]      [,6]
-## netdis3 0.1846655 0.008264222 0.01005385 0.006777578 0.2065762 0.2091241
-## netdis4 0.1749835 0.165264120 0.01969246 0.159711160 0.2917612 0.2215579
-##              [,7]         [,8]         [,9]        [,10]
-## netdis3 0.2075471 0.0001335756 0.0001748254 0.0005964448
-## netdis4 0.4171614 0.0760242643 0.0343418653 0.1311552411
-
print(results$comp_spec)
-
##    name_a name_b index_a index_b
-## 1     EBV    ECL       1       2
-## 2     EBV  HSV-1       1       3
-## 3     EBV   KSHV       1       4
-## 4     EBV    VZV       1       5
-## 5     ECL  HSV-1       2       3
-## 6     ECL   KSHV       2       4
-## 7     ECL    VZV       2       5
-## 8   HSV-1   KSHV       3       4
-## 9   HSV-1    VZV       3       5
-## 10   KSHV    VZV       4       5
-
- - - - - - - - - - - diff --git a/vignettes/quickstart_netdis_2graphs.R b/vignettes/quickstart_netdis_2graphs.R deleted file mode 100644 index a8189f71..00000000 --- a/vignettes/quickstart_netdis_2graphs.R +++ /dev/null @@ -1,121 +0,0 @@ -## ----------------------------------------------------------------------------- -# Load libraries -library("netdist") -library("purrr") - -## ----------------------------------------------------------------------------- -# Set source directory for Virus PPI graph edge files -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - -# Load query graphs -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"), - format = "ncol") - -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), - format = "ncol") - - -## ----------------------------------------------------------------------------- -# Maximum graphlet size to calculate counts and netdis statistic for. -max_graphlet_size <- 4 - -# Ego network neighbourhood size -neighbourhood_size <- 2 - -# Minimum size of ego networks to consider -min_ego_nodes <- 3 -min_ego_edges <- 1 - -# Ego network density binning parameters -min_bin_count <- 5 -num_bins <- 100 - -## ----------------------------------------------------------------------------- -# Get ego networks for query graphs and reference graph -ego_1 <- make_named_ego_graph(graph_1, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -ego_2 <- make_named_ego_graph(graph_2, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -## ----------------------------------------------------------------------------- -# Count graphlets for ego networks in query and reference graphs -graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size) -graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size) - -## ----------------------------------------------------------------------------- -# Load reference graph -ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), - package = "netdist") -ref_graph <- read_simple_graph(ref_path, format = "ncol") - -ego_ref <- make_named_ego_graph(ref_graph, - order = neighbourhood_size, - min_ego_nodes = min_ego_nodes, - min_ego_edges = min_ego_edges) - -graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size) - -# Scale ego-network graphlet counts by dividing by total number of k-tuples in -# ego-network (where k is graphlet size) -scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, - max_graphlet_size) - - -# Get ego-network densities -densities_ref <- ego_network_density(graphlet_counts_ref) - -# Adaptively bin ref ego-network densities -binned_densities <- binned_densities_adaptive(densities_ref, - min_counts_per_interval = min_bin_count, - num_intervals = num_bins) - -ref_ego_density_bins <- binned_densities$breaks - -# Average ref graphlet counts across density bins -ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts( - scaled_graphlet_counts_ref, - binned_densities$interval_indexes) - - -## ----------------------------------------------------------------------------- -# Calculate expected graphlet counts (using ref graph ego network density bins) -exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, - ref_ego_density_bins, - ref_binned_graphlet_counts, - max_graphlet_size, - scale_fn=count_graphlet_tuples) - - -exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, - ref_ego_density_bins, - ref_binned_graphlet_counts, - max_graphlet_size, - scale_fn=count_graphlet_tuples) - -# Centre graphlet counts by subtracting expected counts -centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1, - exp_graphlet_counts_1, - max_graphlet_size) - -centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2, - exp_graphlet_counts_2, - max_graphlet_size) - -## ----------------------------------------------------------------------------- -sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1) - -sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2) - -## ----------------------------------------------------------------------------- - -netdis_result <- netdis_uptok(sum_graphlet_counts_1, - sum_graphlet_counts_2, - max_graphlet_size) - -print(netdis_result) - diff --git a/vignettes/quickstart_netdis_2graphs.html b/vignettes/quickstart_netdis_2graphs.html deleted file mode 100644 index 3863ac42..00000000 --- a/vignettes/quickstart_netdis_2graphs.html +++ /dev/null @@ -1,468 +0,0 @@ - - - - - - - - - - - - - - - - -Quick start guide for Netdis - 2 graphs - - - - - - - - - - - - - - - - - - - - - - -

Quick start guide for Netdis - 2 graphs

-

Martin O’Reilly, Jack Roberts

-

2020-06-22

- - - -
-

Load required libraries

-
# Load libraries
-library("netdist")
-library("purrr")
-
-
-

Load graphs

-
# Set source directory for Virus PPI graph edge files
-source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist")
-
-# Load query graphs
-graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),
-                             format = "ncol")
-
-graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),
-                             format = "ncol")
-
-
-

Set Netdis parameters

-
# Maximum graphlet size to calculate counts and netdis statistic for.
-max_graphlet_size <- 4
-
-# Ego network neighbourhood size
-neighbourhood_size <- 2
-
-# Minimum size of ego networks to consider
-min_ego_nodes <- 3
-min_ego_edges <- 1
-
-# Ego network density binning parameters
-min_bin_count <- 5
-num_bins <- 100
-
-
-

Generate ego networks

-
# Get ego networks for query graphs and reference graph
-ego_1 <- make_named_ego_graph(graph_1, 
-                              order = neighbourhood_size, 
-                              min_ego_nodes = min_ego_nodes, 
-                              min_ego_edges = min_ego_edges)
-
-ego_2 <- make_named_ego_graph(graph_2, 
-                              order = neighbourhood_size, 
-                              min_ego_nodes = min_ego_nodes, 
-                              min_ego_edges = min_ego_edges)
-
-
-

Count graphlets in ego networks

-
# Count graphlets for ego networks in query and reference graphs
-graphlet_counts_1 <- ego_to_graphlet_counts(ego_1, max_graphlet_size = max_graphlet_size)
-graphlet_counts_2 <- ego_to_graphlet_counts(ego_2, max_graphlet_size = max_graphlet_size)
-
-
-

Use a reference graph to calculate expected graphlet counts in ego network density bins

-
# Load reference graph
-ref_path <- system.file(file.path("extdata", "random", "ER_1250_10_1"), 
-                        package = "netdist")
-ref_graph <- read_simple_graph(ref_path, format = "ncol")
-
-ego_ref <- make_named_ego_graph(ref_graph, 
-                                order = neighbourhood_size, 
-                                min_ego_nodes = min_ego_nodes, 
-                                min_ego_edges = min_ego_edges)
-
-graphlet_counts_ref <- ego_to_graphlet_counts(ego_ref, max_graphlet_size = max_graphlet_size)
-
-# Scale ego-network graphlet counts by dividing by total number of k-tuples in
-# ego-network (where k is graphlet size)
-scaled_graphlet_counts_ref <- scale_graphlet_counts_ego(graphlet_counts_ref, 
-                                                        max_graphlet_size)
-
-
-# Get ego-network densities
-densities_ref <- ego_network_density(graphlet_counts_ref)
-
-# Adaptively bin ref ego-network densities
-binned_densities <- binned_densities_adaptive(densities_ref, 
-                                              min_counts_per_interval = min_bin_count, 
-                                              num_intervals = num_bins)
-
-ref_ego_density_bins <- binned_densities$breaks
-
-# Average ref graphlet counts across density bins
-ref_binned_graphlet_counts <- mean_density_binned_graphlet_counts(
-                                  scaled_graphlet_counts_ref, 
-                                  binned_densities$interval_indexes)
-
-
-

Centre graphlet counts of query graphs based on statistics of reference graph

-
# Calculate expected graphlet counts (using ref graph ego network density bins)
-exp_graphlet_counts_1 <- netdis_expected_counts(graphlet_counts_1, 
-                                                                 ref_ego_density_bins, 
-                                                                 ref_binned_graphlet_counts,
-                                                                 max_graphlet_size,
-                                                                 scale_fn=count_graphlet_tuples)
-
-
-exp_graphlet_counts_2 <- netdis_expected_counts(graphlet_counts_2, 
-                                                                 ref_ego_density_bins, 
-                                                                 ref_binned_graphlet_counts,
-                                                                 max_graphlet_size,
-                                                                 scale_fn=count_graphlet_tuples)
-
-# Centre graphlet counts by subtracting expected counts
-centred_graphlet_counts_1 <- netdis_subtract_exp_counts(graphlet_counts_1,
-                                                        exp_graphlet_counts_1,
-                                                        max_graphlet_size)
-
-centred_graphlet_counts_2 <- netdis_subtract_exp_counts(graphlet_counts_2,
-                                                        exp_graphlet_counts_2,
-                                                        max_graphlet_size)
-
-
-

Sum centred graphlet counts across all ego networks

-
sum_graphlet_counts_1 <- colSums(centred_graphlet_counts_1)
-
-sum_graphlet_counts_2 <- colSums(centred_graphlet_counts_2)
-
-
-

Calculate netdis statistics

-
netdis_result <- netdis_uptok(sum_graphlet_counts_1, 
-                              sum_graphlet_counts_2, 
-                              max_graphlet_size)
-
-print(netdis_result)
-
##   netdis3   netdis4 
-## 0.1846655 0.1749835
-
- - - - - - - - - - - From a59cc6f8ddadd4f6ab1b43181f571f6ddd0b62f7 Mon Sep 17 00:00:00 2001 From: leospinaf Date: Tue, 23 Jun 2020 10:49:05 +0100 Subject: [PATCH 108/188] test --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 43b3164f..26da5aaa 100755 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # Network Comparison -An R package implementing the Netdis and NetEMD alignment-free network comparison measures. +An R package implementing the Netdis and NetEMD alignment-free network comparison measures. + ### :warning: BETA: Package under construction (pre-release) :warning: Until this package hits release 1.0 anything can change with no notice. From dd82b1688313084e86ce24d921eef3a6c6bd4112 Mon Sep 17 00:00:00 2001 From: Luis Ospina Date: Tue, 23 Jun 2020 14:11:32 +0100 Subject: [PATCH 109/188] Latest package source built that is in use for pairwise comparisons calling networks or precomputed counts. --- doc/default_pairwise_usage.Rmd | 2 +- doc/default_pairwise_usage.html | 17 ++++++++--------- doc/dendrogram_example_net_dis.html | 4 ++-- doc/dendrogram_example_net_emd.html | 4 ++-- doc/netdis_2graphs_polya-aeppli.html | 4 ++-- doc/netdis_customisations.html | 4 ++-- doc/netdis_pairwise_comparisons.html | 4 ++-- doc/quickstart_netdis_2graphs.html | 4 ++-- vignettes/V-Menu.Rmd | 2 -- 9 files changed, 21 insertions(+), 24 deletions(-) diff --git a/doc/default_pairwise_usage.Rmd b/doc/default_pairwise_usage.Rmd index 0361125c..0ae05e20 100644 --- a/doc/default_pairwise_usage.Rmd +++ b/doc/default_pairwise_usage.Rmd @@ -194,7 +194,7 @@ netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) ## Using Netdis with no expectation ($E_w=0$) Comparing the networks via their observed ego counts without centering them, (equivalent to using expectation equal to zero). This variant thus focuses on detecting small discrepancies between the networks. -#' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) + ```{r,netdiszero} #Netdis using no expectations (or equivalently, expectation equal to zero). netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) diff --git a/doc/default_pairwise_usage.html b/doc/default_pairwise_usage.html index 73b5c434..ee3d8ee9 100644 --- a/doc/default_pairwise_usage.html +++ b/doc/default_pairwise_usage.html @@ -343,9 +343,9 @@

Load graphs included in the netdist package

# Herpes virus EBV protein-protein interaction graph with 60 nodes and 208 edges. graph_1 -#> IGRAPH a5c2f10 UN-- 60 208 -- +#> IGRAPH a7e984b UN-- 60 208 -- #> + attr: name (v/c) -#> + edges from a5c2f10 (vertex names): +#> + edges from a7e984b (vertex names): #> [1] A73 --BALF3 A73 --BARF0 A73 --BBLF2 A73 --BDRF1 A73 --BFRF4 #> [6] A73 --BGLF2 A73 --BGLF3 A73 --BGLF5 A73 --BLLF2 A73 --BTRF1 #> [11] BALF3--BBLF2 BALF3--BDRF1 BALF3--BFRF4 BALF3--BGLF5 BALF3--BTRF1 @@ -358,9 +358,9 @@

Load graphs included in the netdist package

# Herpes virus ECL protein-protein interaction graph with 1941 nodes and 3989 edges. graph_2 -#> IGRAPH c7e8990 UN-- 1941 3989 -- +#> IGRAPH 120e432 UN-- 1941 3989 -- #> + attr: name (v/c) -#> + edges from c7e8990 (vertex names): +#> + edges from 120e432 (vertex names): #> [1] B1882--B1888 B1882--B1945 B1882--B1946 B1882--B1886 B1882--B1887 #> [6] B1882--B1939 B1882--B1938 B1882--B1884 B1882--B1883 B1882--B3210 #> [11] B1882--B1881 B1882--B4355 B1882--B1922 B1882--B1890 B1882--B1889 @@ -373,9 +373,9 @@

Load graphs included in the netdist package

#A simple visualization of the graphs. plot(graph_1,vertex.size=0.5,vertex.label=NA)
-

+

plot(graph_2,vertex.size=0.5,vertex.label=NA)
-

+

@@ -460,9 +460,9 @@

Using netdis with a gold-standard graph to obtain goldstd_2 <- igraph::graph.lattice(c(44,44)) #Graph with 44^2 nodes plot(goldstd_1,vertex.size=0.8,vertex.label=NA)

-

+

plot(goldstd_2,vertex.size=0.5,vertex.label=NA)
-

+


 
 # Netdis using the goldstd_1 graph as gold-standard reference point
@@ -493,7 +493,6 @@ 

Netdis-GP: Using a Geometric-Poisson approximation

Using Netdis with no expectation (\(E_w=0\))

Comparing the networks via their observed ego counts without centering them, (equivalent to using expectation equal to zero). This variant thus focuses on detecting small discrepancies between the networks.

-

#’ netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0)

#Netdis using no expectations (or equivalently, expectation equal to zero).
 netdis_one_to_one(graph_1= graph_1, graph_2= graph_2,  ref_graph = 0)
 #>    netdis3    netdis4 
diff --git a/doc/dendrogram_example_net_dis.html b/doc/dendrogram_example_net_dis.html
index a11367e9..9f102866 100644
--- a/doc/dendrogram_example_net_dis.html
+++ b/doc/dendrogram_example_net_dis.html
@@ -12,7 +12,7 @@
 
 
 
-
+
 
 Dendrogram example for Netdis
 
@@ -315,7 +315,7 @@
 
 

Dendrogram example for Netdis

Martin O’Reilly

-

2020-06-22

+

2020-06-23

diff --git a/doc/dendrogram_example_net_emd.html b/doc/dendrogram_example_net_emd.html index 4a4d357e..e6428ef2 100644 --- a/doc/dendrogram_example_net_emd.html +++ b/doc/dendrogram_example_net_emd.html @@ -12,7 +12,7 @@ - + Dendrogram example for NetEMD @@ -315,7 +315,7 @@

Dendrogram example for NetEMD

Martin O’Reilly

-

2020-06-22

+

2020-06-23

diff --git a/doc/netdis_2graphs_polya-aeppli.html b/doc/netdis_2graphs_polya-aeppli.html index 2c496350..3eccf297 100644 --- a/doc/netdis_2graphs_polya-aeppli.html +++ b/doc/netdis_2graphs_polya-aeppli.html @@ -12,7 +12,7 @@ - + Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation @@ -315,7 +315,7 @@

Netdis - 2 graphs with Expected Counts from Geometric Poisson Approximation

Martin O’Reilly, Jack Roberts

-

2020-06-22

+

2020-06-23

diff --git a/doc/netdis_customisations.html b/doc/netdis_customisations.html index 9607767a..46baec27 100644 --- a/doc/netdis_customisations.html +++ b/doc/netdis_customisations.html @@ -12,7 +12,7 @@ - + Usage of netdis with binning and expected counts customisations. @@ -315,7 +315,7 @@

Usage of netdis with binning and expected counts customisations.

Jack Roberts

-

2020-06-22

+

2020-06-23

diff --git a/doc/netdis_pairwise_comparisons.html b/doc/netdis_pairwise_comparisons.html index ee4d70e1..be8d206b 100644 --- a/doc/netdis_pairwise_comparisons.html +++ b/doc/netdis_pairwise_comparisons.html @@ -12,7 +12,7 @@ - + Usage of netdis interfaces for different pairwise comparison options. @@ -315,7 +315,7 @@

Usage of netdis interfaces for different pairwise comparison options.

Jack Roberts

-

2020-06-22

+

2020-06-23

diff --git a/doc/quickstart_netdis_2graphs.html b/doc/quickstart_netdis_2graphs.html index 3863ac42..e46e059e 100644 --- a/doc/quickstart_netdis_2graphs.html +++ b/doc/quickstart_netdis_2graphs.html @@ -12,7 +12,7 @@ - + Quick start guide for Netdis - 2 graphs @@ -315,7 +315,7 @@

Quick start guide for Netdis - 2 graphs

Martin O’Reilly, Jack Roberts

-

2020-06-22

+

2020-06-23

diff --git a/vignettes/V-Menu.Rmd b/vignettes/V-Menu.Rmd index 6ada76e1..2eae3bef 100644 --- a/vignettes/V-Menu.Rmd +++ b/vignettes/V-Menu.Rmd @@ -23,7 +23,5 @@ This package contains multiple vignettes that illustrate the simple ways of empl * [Default, simple and quick use usage 1](default_pairwise_usage.html): -* [Default, simple and quick use usage 2](netdis_pairwise_comparisons.html): -* [Dendrogram Examples](dendrogram_example_net_dis.html). From d33ccbb0c552033e83b47bda40a7596431630e14 Mon Sep 17 00:00:00 2001 From: Luis Ospina Date: Wed, 24 Jun 2020 16:45:29 +0100 Subject: [PATCH 110/188] Important changes to functions for netdis_gp, mostly for the handling of zero variance and divisions by zero. --- R/measures_net_dis.R | 15 ++- vignettes/PreComputedProps.Rmd | 174 +++++++++++++++++++++++++++ vignettes/V-Menu.Rmd | 4 +- vignettes/default_pairwise_usage.Rmd | 4 +- 4 files changed, 192 insertions(+), 5 deletions(-) create mode 100644 vignettes/PreComputedProps.Rmd diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index a3d3dfb3..f23cb892 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -85,7 +85,7 @@ #' netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) #' #' -#' #Imputing pre-calculated subgraph counts instead of subgraphs. +#' #Providing pre-calculated subgraph counts. #' #' props_1 <- count_graphlets_ego(graph = graph_1) #' props_2 <- count_graphlets_ego(graph = graph_2) @@ -530,7 +530,7 @@ netdis_many_to_many <- function(graphs = NULL, ## ------------------------------------------------------------------------ # If no reference passed, calculate expected counts using query networks - # themselves. + # themselves. Geometric-Poisson GP SHOULD BE THE DEFALUT } else { centred_graphlet_counts <- purrr::map( graphlet_counts, @@ -986,6 +986,10 @@ exp_counts_bin_gp <- function(bin_idx, graphlet_counts, # variance in graphlet counts across ego networks in this density bin Vd_sq <- colSums(mean_sub_counts^2) / (nrow(mean_sub_counts) - 1) + # Dealing with zero variance HERE + ind_zerovar <- (Vd_sq < .00000001) + if(sum(ind_zerovar) > 0) Vd_sq[ind_zerovar] <- 0.1 + # GP theta parameter for each graphlet id in this density bin theta_d <- 2 * means / (Vd_sq + means) @@ -1004,8 +1008,15 @@ exp_counts_bin_gp <- function(bin_idx, graphlet_counts, exp_counts_dk, lambda_dk / theta_d[graphlet_idx] ) + } + # Dealing with divisions by zero. + ind <- is.na(exp_counts_dk) + ind <- ind | is.infinite(exp_counts_dk) + if(sum(ind) > 0) exp_counts_dk[ind & ind_zerovar[-1]] <- 0 + + exp_counts_dk } diff --git a/vignettes/PreComputedProps.Rmd b/vignettes/PreComputedProps.Rmd new file mode 100644 index 00000000..089ac96b --- /dev/null +++ b/vignettes/PreComputedProps.Rmd @@ -0,0 +1,174 @@ +--- +title: "Default, simple and quick use usage 2: precomputed properties" +author: "Luis Ospina-Forero" +date: "23/06/2020" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{PreComputedProps} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +# Introduction + +In many situations there may be a need to pursue the network comparisons from pre-computed counts. This decreases the computational time to obtain the results, particularly when the same networks are involved in multiple comparisons or when the same properties are used in other procedures. Also, performing the comparisons from pre-computed properties allows greater flexibility in the use of the network comparison methods and their variants. + +This Vignette follows similar examples as ["Default, simple and quick use usage 1: pairwise comparisons"](default_pairwise_usage.html) but shows the modifications required to obtain properties and to then use them in the network comparison methods. + +# Load required packages/libraries +```{r, packages, message= FALSE} +# Load packages/libraries +library("netdist") +library("igraph") +``` + +# Compare two networks via NetEmd. + +## Comparing two graphs with NetEmd via subgraph counts. +```{r, netemd,fig.align='center'} +# Create lattice networks +gLat_1 <- graph.lattice(c(8,8)) +gLat_2 <- graph.lattice(c(44,44)) + +netemd_one_to_one(graph_1=gLat_1,graph_2=gLat_2,feature_type="orbit",max_graphlet_size=5) + +#Providing a matrix of network features +counts_1= count_orbits_per_node(graph = gLat_1,max_graphlet_size = 5) +counts_2= count_orbits_per_node(graph = gLat_2,max_graphlet_size = 5) + +netemd_one_to_one(dhists_1=counts_1, dhists_2=counts_2) + +#Providing the network features as lists of dhist objects +dhists_1<- graph_features_to_histograms(counts_1) +dhists_2<- graph_features_to_histograms(counts_2) + +netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2) +``` + +## Comparing two graphs with NetEmd via their Laplacian spectrum. +```{r, netemdEigen,fig.align='center'} +# Networks +gLat_1 <- graph.lattice(c(8,8)) +gLat_2 <- graph.lattice(c(44,44)) + +#Laplacian +Lapg_1 <- igraph::laplacian_matrix(graph = gLat_1,normalized = FALSE,sparse = FALSE) +Lapg_2 <- igraph::laplacian_matrix(graph = gLat_2,normalized = FALSE,sparse = FALSE) + +#Normalized Laplacian +NLapg_1 <- igraph::laplacian_matrix(graph = gLat_1,normalized = TRUE,sparse = FALSE) +NLapg_2 <- igraph::laplacian_matrix(graph = gLat_2,normalized = TRUE,sparse = FALSE) + +# Providing a matrix of network features (e.g. Spectra). Spectra (This may take a couple of minutes). +spec_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) +spec_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) + +netemd_one_to_one(dhists_1 = spec_1,dhists_2 = spec_2, smoothing_window_width = 0) + + +# Providing pre-computed dhist objects from network features +dhists_1<- graph_features_to_histograms(spec_1) +dhists_2<- graph_features_to_histograms(spec_2) + +netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2, smoothing_window_width = 0) +``` +------------------------- + +# Compare two networks via Netdis and its variants. + +## Using netdis with a gold-standard graph to obtain $E_w$. +The selection of a gold-standard graph as a substitute for $E_w$ could be done when such graph is known to be a good proxy for $E_w$, or alternatively as a good reference point for the comparison. This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard (summarized in $E_w$). + +```{r,netdisgoldstand,fig.align='center'} +# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as igraph objects +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") + +# Lattice graphs to be used as gold-standard as a reference point comparison +goldstd_1 <- igraph::graph.lattice(c(8,8)) #Graph with 8^2 nodes +goldstd_2 <- igraph::graph.lattice(c(44,44)) #Graph with 44^2 nodes + +# Netdis using the goldstd_1 graph as gold-standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_1) + +# Netdis using the goldstd_2 graph as gold-standard reference point +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) + +# Providing pre-calculated subgraph counts. +props_1 <- count_graphlets_ego(graph = graph_1) +props_2 <- count_graphlets_ego(graph = graph_2) +props_goldstd_1 <- count_graphlets_ego(graph = goldstd_1) +props_goldstd_2 <- count_graphlets_ego(graph = goldstd_2) + + netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_1) + netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, graphlet_counts_ref = props_goldstd_2) +``` + + +## Netdis-GP: Using a Geometric-Poisson approximation + +This variant focuses on detecting more general and global discrepancies between the ego-network structures. + +```{r, netdisGP} +# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as igraph objects +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") + +# Netdis using the Geometric-Poisson approximation as a way to obtain background expectations. +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = NULL) + +# Providing pre-calculated subgraph counts. +props_1 <- count_graphlets_ego(graph = graph_1) +props_2 <- count_graphlets_ego(graph = graph_2) + +netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = NULL) +``` + + +## Using Netdis with no expectation ($E_w=0$) +Comparing the networks via their observed ego counts without centering them, (equivalent to using expectation equal to zero). This variant thus focuses on detecting small discrepancies between the networks. + + +```{r,netdiszero} +# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as igraph objects +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") + +#Netdis using no expectations (or equivalently, expectation equal to zero). +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) + +# Providing pre-calculated subgraph counts. +props_1 <- count_graphlets_ego(graph = graph_1) +props_2 <- count_graphlets_ego(graph = graph_2) + +netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = 0) +``` + +------------------------- + + +# Bibliography + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* L. Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. + +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008. \ No newline at end of file diff --git a/vignettes/V-Menu.Rmd b/vignettes/V-Menu.Rmd index 2eae3bef..93723f7b 100644 --- a/vignettes/V-Menu.Rmd +++ b/vignettes/V-Menu.Rmd @@ -22,6 +22,8 @@ knitr::opts_chunk$set( This package contains multiple vignettes that illustrate the simple ways of employing the network comparison methods 'Netdis', 'NetEmd' and their variants (e.g. `Netdis Geometric-Poisson'), and also the large flexibility of these methodologies. The following is a list of the available vignettes: -* [Default, simple and quick use usage 1](default_pairwise_usage.html): +* [Default, simple and quick use usage 1: pairwise comparisons](default_pairwise_usage.html): +* [Default, simple and quick use usage 2: precomputed properties](PreComputedProps.html): +* [Default, simple and quick use usage 3: many to many comparions](ManyToMany.html): diff --git a/vignettes/default_pairwise_usage.Rmd b/vignettes/default_pairwise_usage.Rmd index 0ae05e20..a8ffcebc 100644 --- a/vignettes/default_pairwise_usage.Rmd +++ b/vignettes/default_pairwise_usage.Rmd @@ -1,5 +1,5 @@ --- -title: "Default pairwise usage of the network comparison methods" +title: "Default, simple and quick use usage 1: pairwise comparisons" date: "`10-06-2020`" author: "Luis Ospina-Forero" output: rmarkdown::html_vignette @@ -7,7 +7,7 @@ vignette: > %\VignetteIndexEntry{default_pairwise_usage} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} ---- + --- ```{r, include = FALSE} knitr::opts_chunk$set( From 25476ef1e26155e54d111538e986c0eab8f8ed2f Mon Sep 17 00:00:00 2001 From: Luis Ospina Date: Wed, 24 Jun 2020 18:02:34 +0100 Subject: [PATCH 111/188] Current work on many_to_many vignette (trial for pushing) --- vignettes/ManyToMany.Rmd | 220 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 vignettes/ManyToMany.Rmd diff --git a/vignettes/ManyToMany.Rmd b/vignettes/ManyToMany.Rmd new file mode 100644 index 00000000..649a7abf --- /dev/null +++ b/vignettes/ManyToMany.Rmd @@ -0,0 +1,220 @@ +--- +title: 'Default, simple and quick use usage 3: many to many comparions' +author: "Luis Ospina-Forero" +date: "23/06/2020" +output: rmarkdown::html_vignette +vignette: > +%\VignetteIndexEntry{ManyToMany} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + --- + + ```{r, include = FALSE} + knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" + ) + ``` + + # Introduction + + In some situations there is a need to compare multiple graphs among each other. For such situations, the `netdist` package contains some initial shortcut functions that perform such calculation and may incorporate parallel processing. + + This Vignette follows similar examples as ["Default, simple and quick use usage 1: pairwise comparisons"](default_pairwise_usage.html) but shows the default usage of the shortcut functions for many-to-many comparisons. + + # Load required packages/libraries + ```{r, packages, message= FALSE} + # Load packages/libraries +library("netdist") +library("igraph") +library("pheatmap") +``` + +# Compare two networks via NetEmd. + +## Comparing two graphs with NetEmd via subgraph counts. +```{r, netemd,fig.align='center'} +# Create lattice networks +gLat_1 <- graph.lattice(c(8,8)) +gLat_2 <- graph.lattice(c(44,44)) +gRing_1 <- make_ring(8^2) +gRing_2 <- make_ring(44^2) +gTree_1 <- make_tree(n = 8^2) +gTree_2 <- make_ring(n = 44^2) + + +#Providing a matrix of network features +counts_1= count_orbits_per_node(graph = gLat_1,max_graphlet_size = 5) +counts_2= count_orbits_per_node(graph = gLat_2,max_graphlet_size = 5) + +netemd_one_to_one(dhists_1=counts_1, dhists_2=counts_2) + +#Providing the network features as lists of dhist objects +dhists_1<- graph_features_to_histograms(counts_1) +dhists_2<- graph_features_to_histograms(counts_2) + +netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2) +``` + +## Comparing two graphs with NetEmd via their Laplacian spectrum. +```{r, netemdEigen,fig.align='center'} +# Networks +gLat_1 <- graph.lattice(c(8,8)) +gLat_2 <- graph.lattice(c(44,44)) + +#Laplacian +Lapg_1 <- igraph::laplacian_matrix(graph = gLat_1,normalized = FALSE,sparse = FALSE) +Lapg_2 <- igraph::laplacian_matrix(graph = gLat_2,normalized = FALSE,sparse = FALSE) + +#Normalized Laplacian +NLapg_1 <- igraph::laplacian_matrix(graph = gLat_1,normalized = TRUE,sparse = FALSE) +NLapg_2 <- igraph::laplacian_matrix(graph = gLat_2,normalized = TRUE,sparse = FALSE) + +# Providing a matrix of network features (e.g. Spectra). Spectra (This may take a couple of minutes). +spec_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) +spec_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) + +netemd_one_to_one(dhists_1 = spec_1,dhists_2 = spec_2, smoothing_window_width = 0) + + +# Providing pre-computed dhist objects from network features +dhists_1<- graph_features_to_histograms(spec_1) +dhists_2<- graph_features_to_histograms(spec_2) + +netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2, smoothing_window_width = 0) +``` +------------------------- + +# Compare two networks via Netdis and its variants. + +## Using netdis with a gold-standard graph to obtain $E_w$. +The selection of a gold-standard graph as a substitute for $E_w$ could be done when such graph is known to be a good proxy for $E_w$, or alternatively as a good reference point for the comparison. This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard (summarized in $E_w$). + +```{r,netdisgoldstand,fig.align='center'} +# Create lattice, Ring and Tree like networks of sizes 8^2 and 44^2. +set.seed(3171) +gLat_1 <- graph.lattice(c(8,8)) +gLat_2 <- graph.lattice(c(44,44)) +gRing_1 <- make_ring(8^2) +gRing_2 <- make_ring(44^2) +gTree_1 <- make_tree(n = 8^2,children = 3) +gTree_2 <- make_tree(n = 44^2,children = 3) + +# Create a random graph to be used as a gold-standard +gst_1 <- graph.star(8^2,) +gst_2 <- graph.star(44^2) + + +# Netdis using the goldstd_1 graph as gold-standard reference point +glist <- list(Lat_1=gLat_1, Lat_2=gLat_2, Ring_1=gRing_1, Ring_2=gRing_1, Tree_1=gTree_1, Tree_2=gTree_2) + +netdis_mat_gst1 <- netdis_many_to_many(graphs = glist, ref_graph = gst_1) +netdis_mat_gst2 <- netdis_many_to_many(graphs = glist, ref_graph = gst_2) + +netdis_mat_gst1 + +netdis_mat_gst2 +``` + +```{r,netdisgoldstandPLOT,fig.align='center'} +# Creating matrices to plot the the comparison +mat_gst1 <- matrix(nrow=6,ncol=6,0) +mat_gst2 <- matrix(nrow=6,ncol=6,0) +vnames <- rep(NA,6) +for(i in 1:nrow(netdis_mat_gst1$comp_spec)){ + ind1 <- netdis_mat_gst1$comp_spec[i,"index_a"] + ind2 <- netdis_mat_gst1$comp_spec[i,"index_b"] + mat_gst1[ind1,ind2] <- netdis_mat_gst1$netdis["netdis4",i] + mat_gst1[ind2,ind1] <- mat_gst1[ind1,ind2] + # + ind1 <- netdis_mat_gst2$comp_spec[i,"index_a"] + ind2 <- netdis_mat_gst2$comp_spec[i,"index_b"] + mat_gst2[ind1,ind2] <- netdis_mat_gst2$netdis["netdis4",i] + mat_gst2[ind2,ind1] <- mat_gst2[ind1,ind2] + # + vnames[ind1] <- as.character(netdis_mat_gst1$comp_spec[i,"name_a"]) + vnames[ind2] <- as.character(netdis_mat_gst1$comp_spec[i,"name_b"]) +} +rownames(mat_gst1) <- vnames +colnames(mat_gst1) <- vnames +rownames(mat_gst2) <- vnames +colnames(mat_gst2) <- vnames + +#Network comparisons heatmap with Gold-Standard 1 +legend1 <- c(seq(min(mat_gst1),max(mat_gst1),length.out = 5)) +levels1 <- round(legend1,digits = 2); levels1[length(levels1)]="Netdis" +pheatmap::pheatmap(mat = as.dist(mat_gst1),cluster_rows = TRUE,clustering_method = "average",angle_col=45,main = "Netdis GoldStd-1",treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) + +#Network comparisons heatmap with Gold-Standard 2 +legend2 <- c(seq(min(mat_gst2),max(mat_gst2),length.out = 5)) +levels2 <- round(legend2,digits = 2); levels2[length(levels2)]="Netdis" +pheatmap::pheatmap(mat = as.dist(mat_gst2),cluster_rows = TRUE,clustering_method = "average",angle_col=45,main = "Netdis GoldStd-1",treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend2,legend_labels = levels2) +``` + + +## Netdis-GP: Using a Geometric-Poisson approximation + +This variant focuses on detecting more general and global discrepancies between the ego-network structures. + +```{r, netdisGP} +# Netdis Geometric-Poisson comparisons +netdis_mat_gst1 <- netdis_many_to_many(graphs = glist, ref_graph = NULL) +netdis_mat_gst1 +``` + +```{r,netdisGPPLOT,fig.align='center'} +# Creating matrices to plot the the comparison +mat_gst1 <- matrix(nrow=6,ncol=6,0) +vnames <- rep(NA,6) +for(i in 1:nrow(netdis_mat_gst1$comp_spec)){ + ind1 <- netdis_mat_gst1$comp_spec[i,"index_a"] + ind2 <- netdis_mat_gst1$comp_spec[i,"index_b"] + mat_gst1[ind1,ind2] <- netdis_mat_gst1$netdis["netdis4",i] + mat_gst1[ind2,ind1] <- mat_gst1[ind1,ind2] + # + vnames[ind1] <- as.character(netdis_mat_gst1$comp_spec[i,"name_a"]) + vnames[ind2] <- as.character(netdis_mat_gst1$comp_spec[i,"name_b"]) +} +rownames(mat_gst1) <- vnames +colnames(mat_gst1) <- vnames + +#Network comparisons heatmap with Gold-Standard 1 +legend1 <- c(seq(min(mat_gst1),max(mat_gst1),length.out = 5)) +levels1 <- round(legend1,digits = 2); levels1[length(levels1)]="Netdis-GP" +pheatmap::pheatmap(mat = as.dist(mat_gst1),cluster_rows = TRUE,clustering_method = "average",angle_col=45,main = "Netdis GoldStd-1",treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) +``` + +## Using Netdis with no expectation ($E_w=0$) +Comparing the networks via their observed ego counts without centering them, (equivalent to using expectation equal to zero). This variant thus focuses on detecting small discrepancies between the networks. + + +```{r,netdiszero} +# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. +source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") + +# Load query graphs as igraph objects +graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") +graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") + +#Netdis using no expectations (or equivalently, expectation equal to zero). +netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) + +# Providing pre-calculated subgraph counts. +props_1 <- count_graphlets_ego(graph = graph_1) +props_2 <- count_graphlets_ego(graph = graph_2) + +netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = 0) +``` + +------------------------- + + +# Bibliography + +* W. Ali, T. Rito, G. Reinert, F. Sun, and C. M. Deane. Alignment-free protein interaction network comparison. Bioinformatics, 30:i430–i437, 2014. + +* L. Ospina-Forero, C. M. Deane, and G. Reinert. Assessment of model fit via network comparison methods based on subgraph counts. Journal of Complex Networks, page cny017, August 2018. + +* A. E. Wegner, L. Ospina-Forero, R. E. Gaunt, C. M. Deane, and G. Reinert. Identifying networks with common organizational principles. Journal of Complex networks, 2017. + +* F. Picard, J.-J. Daudin, M. Koskas, S. Schbath, and S. Robin. Assessing the exceptionality of network motifs. Journal of Computational Biology, 15(1):1–20, 2008. \ No newline at end of file From 420f86d87d8a94f1a4dd9c42438e3f22f1cc3047 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Sat, 27 Jun 2020 12:49:06 +0100 Subject: [PATCH 112/188] make geometric poisson default if ref_graph null --- DESCRIPTION | 2 +- R/measures_net_dis.R | 153 ++++++++++++------------ man/netdis_many_to_many.Rd | 27 ++--- man/netdis_one_to_many.Rd | 27 ++--- man/netdis_one_to_one.Rd | 29 ++--- man/virusppi.Rd | 4 +- tests/testthat/test_measures_net_dis.R | 8 -- vignettes/quickstart_netdis_2graphs.Rmd | 2 +- 8 files changed, 120 insertions(+), 132 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b9c2c7d3..80254188 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,6 +28,6 @@ Suggests: phangorn, rmarkdown, roxygen2 -RoxygenNote: 7.0.2 +RoxygenNote: 7.1.0 VignetteBuilder: knitr Encoding: UTF-8 diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index f23cb892..c5fad69f 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -40,24 +40,24 @@ #' @param binning_fn Function used to bin ego network densities. Takes densities #' as its single argument, and returns a named list including keys \code{breaks} #' (list of bin edges) and \code{interval_indexes} (density bin index for each -#' ego network). (Default: \code{binned_densities_adaptive} with -#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100}). +#' ego network). If NULL uses \code{binned_densities_adaptive} with +#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100} +#' (Default: NULL). #' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -#' (bin indexes) and \code{max_graphlet_size} as arguments. -#' (Default: \code{density_binned_counts} with \code{agg_fn = mean} and -#' \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the -#' approach used in the original netdis paper). +#' (bin indexes) and \code{max_graphlet_size} as arguments. If NULL uses the +#' approach from the original Netdis paper if \code{ref_graph} is not +#' NULL, or geometric poisson if \code{ref_graph} is NULL (Default: NULL). #' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -#' (Default: \code{netdis_expected_counts} with -#' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in -#' the original netdis paper). -#' +#' If NULL uses the approach from the original Netdis paper if \code{ref_graph} +#' is not NULL, or geometric poisson if \code{ref_graph} is NULL +#' (Default: NULL). +#' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size. #' @@ -109,20 +109,9 @@ netdis_one_to_one <- function(graph_1 = NULL, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - binning_fn = purrr::partial( - binned_densities_adaptive, - min_counts_per_interval = 5, - num_intervals = 100 - ), - bin_counts_fn = purrr::partial( - density_binned_counts, - agg_fn = mean, - scale_fn = scale_graphlet_counts_ego - ), - exp_counts_fn = purrr::partial( - netdis_expected_counts, - scale_fn = count_graphlet_tuples - ), + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, graphlet_counts_1 = NULL, graphlet_counts_2 = NULL, graphlet_counts_ref= NULL) { @@ -203,9 +192,6 @@ netdis_one_to_one <- function(graph_1 = NULL, } - - - #' Netdis comparisons between one graph and many other graphs. #' #' @param graph_1 Query graph - this graph will be compared with @@ -235,23 +221,23 @@ netdis_one_to_one <- function(graph_1 = NULL, #' @param binning_fn Function used to bin ego network densities. Takes densities #' as its single argument, and returns a named list including keys \code{breaks} #' (list of bin edges) and \code{interval_indexes} (density bin index for each -#' ego network). (Default: \code{binned_densities_adaptive} with -#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100}). +#' ego network). If NULL uses \code{binned_densities_adaptive} with +#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100} +#' (Default: NULL). #' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -#' (bin indexes) and \code{max_graphlet_size} as arguments. -#' (Default: \code{density_binned_counts} with \code{agg_fn = mean} and -#' \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the -#' approach used in the original netdis paper). +#' (bin indexes) and \code{max_graphlet_size} as arguments. If NULL uses the +#' approach from the original Netdis paper if \code{ref_graph} is not +#' NULL, or geometric poisson if \code{ref_graph} is NULL (Default: NULL). #' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -#' (Default: \code{netdis_expected_counts} with -#' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in -#' the original netdis paper). +#' If NULL uses the approach from the original Netdis paper if \code{ref_graph} +#' is not NULL, or geometric poisson if \code{ref_graph} is NULL +#' (Default: NULL). #' #' @param graphlet_counts_1 Pre-generated graphlet counts for the first query #' graph. If the \code{graphlet_counts_1} argument is defined then @@ -271,20 +257,9 @@ netdis_one_to_many <- function(graph_1 = NULL, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - binning_fn = purrr::partial( - binned_densities_adaptive, - min_counts_per_interval = 5, - num_intervals = 100 - ), - bin_counts_fn = purrr::partial( - density_binned_counts, - agg_fn = mean, - scale_fn = scale_graphlet_counts_ego - ), - exp_counts_fn = purrr::partial( - netdis_expected_counts, - scale_fn = count_graphlet_tuples - ), + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, graphlet_counts_1 = NULL, graphlet_counts_compare = NULL) { ## ------------------------------------------------------------------------ @@ -398,23 +373,23 @@ netdis_one_to_many <- function(graph_1 = NULL, #' @param binning_fn Function used to bin ego network densities. Takes densities #' as its single argument, and returns a named list including keys \code{breaks} #' (list of bin edges) and \code{interval_indexes} (density bin index for each -#' ego network). (Default: \code{binned_densities_adaptive} with -#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100}). +#' ego network). If NULL uses \code{binned_densities_adaptive} with +#' \code{min_counts_per_interval = 5} and \code{num_intervals = 100} +#' (Default: NULL). #' #' @param bin_counts_fn Function used to calculate expected graphlet counts in #' each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -#' (bin indexes) and \code{max_graphlet_size} as arguments. -#' (Default: \code{density_binned_counts} with \code{agg_fn = mean} and -#' \code{scale_fn = scale_graphlet_counts_ego}, which mirrors the -#' approach used in the original netdis paper). +#' (bin indexes) and \code{max_graphlet_size} as arguments. If NULL uses the +#' approach from the original Netdis paper if \code{ref_graph} is not +#' NULL, or geometric poisson if \code{ref_graph} is NULL (Default: NULL). #' #' @param exp_counts_fn Function used to map from binned reference counts to #' expected counts for each graphlet in each ego network of the query graphs. #' Takes \code{ego_networks}, \code{density_bin_breaks}, #' \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -#' (Default: \code{netdis_expected_counts} with -#' \code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in -#' the original netdis paper). +#' If NULL uses the approach from the original Netdis paper if \code{ref_graph} +#' is not NULL, or geometric poisson if \code{ref_graph} is NULL +#' (Default: NULL). #' #' @return Netdis statistics between query graphs for graphlet sizes #' up to and including max_graphlet_size. @@ -427,29 +402,57 @@ netdis_many_to_many <- function(graphs = NULL, neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - binning_fn = purrr::partial( - binned_densities_adaptive, - min_counts_per_interval = 5, - num_intervals = 100 - ), - bin_counts_fn = purrr::partial( - density_binned_counts, - agg_fn = mean, - scale_fn = scale_graphlet_counts_ego - ), - exp_counts_fn = purrr::partial( - netdis_expected_counts, - scale_fn = count_graphlet_tuples - ), + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, graphlet_counts = NULL, graphlet_counts_ref = NULL) { ## ------------------------------------------------------------------------ - # Check arguments + # Check arguments and set functions appropriately if (is.null(graphs) & is.null(graphlet_counts)) { stop("One of graphs and graphlet_counts must be supplied.") } + # Set default binning_fn if none supplied + if (is.null(binning_fn)) { + binning_fn <- purrr::partial( + binned_densities_adaptive, + min_counts_per_interval = 5, + num_intervals = 100 + ) + } + + # If no ref_graph supplied, default to geometric poisson unless user-defined + # functions have been provided. + if (is.null(ref_graph)) { + if (is.null(bin_counts_fn)) { + bin_counts_fn <- density_binned_counts_gp + } + if (is.null(exp_counts_fn)) { + exp_counts_fn <- purrr::partial( + netdis_expected_counts, + scale_fn = NULL + ) + } + # If a ref_graph value supplied (including a constant), default to approach + # from original netdis paper, unless user-defined functions provided. + } else { + if (is.null(bin_counts_fn)) { + bin_counts_fn <- purrr::partial( + density_binned_counts, + agg_fn = mean, + scale_fn = scale_graphlet_counts_ego + ) + } + if (is.null(exp_counts_fn)) { + exp_counts_fn <- purrr::partial( + netdis_expected_counts, + scale_fn = count_graphlet_tuples + ) + } + } + ## ------------------------------------------------------------------------ # Generate ego networks and count graphlets for query graphs. # But if graphlet counts have already been provided we can skip this step. @@ -488,7 +491,7 @@ netdis_many_to_many <- function(graphs = NULL, } else if (!is.null(ref_graph) | !is.null(graphlet_counts_ref)) { # Generate ego networks and calculate graphlet counts - # But if some ref graphlet counts provided can skip this step + # But if graphlet_counts_ref provided can skip this step if (is.null(graphlet_counts_ref)) { graphlet_counts_ref <- count_graphlets_ego( ref_graph, diff --git a/man/netdis_many_to_many.Rd b/man/netdis_many_to_many.Rd index 1ec3bf46..f02e4f7c 100644 --- a/man/netdis_many_to_many.Rd +++ b/man/netdis_many_to_many.Rd @@ -12,12 +12,9 @@ netdis_many_to_many( neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, - num_intervals = 100), - bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = - scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_counts, scale_fn = - count_graphlet_tuples), + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, graphlet_counts = NULL, graphlet_counts_ref = NULL ) @@ -52,23 +49,23 @@ than min_ego_edges edges.} \item{binning_fn}{Function used to bin ego network densities. Takes densities as its single argument, and returns a named list including keys \code{breaks} (list of bin edges) and \code{interval_indexes} (density bin index for each -ego network). (Default: \code{binned_densities_adaptive} with -\code{min_counts_per_interval = 5} and \code{num_intervals = 100}).} +ego network). If NULL uses \code{binned_densities_adaptive} with +\code{min_counts_per_interval = 5} and \code{num_intervals = 100} +(Default: NULL).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -(bin indexes) and \code{max_graphlet_size} as arguments. -(Default: \code{density_binned_counts} with \code{agg_fn = mean} and -\code{scale_fn = scale_graphlet_counts_ego}, which mirrors the -approach used in the original netdis paper).} +(bin indexes) and \code{max_graphlet_size} as arguments. If NULL uses the +approach from the original Netdis paper if \code{ref_graph} is not +NULL, or geometric poisson if \code{ref_graph} is NULL (Default: NULL).} \item{exp_counts_fn}{Function used to map from binned reference counts to expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -(Default: \code{netdis_expected_counts} with -\code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in -the original netdis paper).} +If NULL uses the approach from the original Netdis paper if \code{ref_graph} +is not NULL, or geometric poisson if \code{ref_graph} is NULL +(Default: NULL).} \item{graphlet_counts}{Pre-generated graphlet counts. If the \code{graphlet_counts} argument is defined then \code{graphs} will not be diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd index be32f267..ed19a20f 100644 --- a/man/netdis_one_to_many.Rd +++ b/man/netdis_one_to_many.Rd @@ -12,12 +12,9 @@ netdis_one_to_many( neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, - num_intervals = 100), - bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = - scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_counts, scale_fn = - count_graphlet_tuples), + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, graphlet_counts_1 = NULL, graphlet_counts_compare = NULL ) @@ -50,23 +47,23 @@ than min_ego_edges edges.} \item{binning_fn}{Function used to bin ego network densities. Takes densities as its single argument, and returns a named list including keys \code{breaks} (list of bin edges) and \code{interval_indexes} (density bin index for each -ego network). (Default: \code{binned_densities_adaptive} with -\code{min_counts_per_interval = 5} and \code{num_intervals = 100}).} +ego network). If NULL uses \code{binned_densities_adaptive} with +\code{min_counts_per_interval = 5} and \code{num_intervals = 100} +(Default: NULL).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -(bin indexes) and \code{max_graphlet_size} as arguments. -(Default: \code{density_binned_counts} with \code{agg_fn = mean} and -\code{scale_fn = scale_graphlet_counts_ego}, which mirrors the -approach used in the original netdis paper).} +(bin indexes) and \code{max_graphlet_size} as arguments. If NULL uses the +approach from the original Netdis paper if \code{ref_graph} is not +NULL, or geometric poisson if \code{ref_graph} is NULL (Default: NULL).} \item{exp_counts_fn}{Function used to map from binned reference counts to expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -(Default: \code{netdis_expected_counts} with -\code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in -the original netdis paper).} +If NULL uses the approach from the original Netdis paper if \code{ref_graph} +is not NULL, or geometric poisson if \code{ref_graph} is NULL +(Default: NULL).} \item{graphlet_counts_1}{Pre-generated graphlet counts for the first query graph. If the \code{graphlet_counts_1} argument is defined then diff --git a/man/netdis_one_to_one.Rd b/man/netdis_one_to_one.Rd index 1b58467d..55a11a81 100644 --- a/man/netdis_one_to_one.Rd +++ b/man/netdis_one_to_one.Rd @@ -12,12 +12,9 @@ netdis_one_to_one( neighbourhood_size = 2, min_ego_nodes = 3, min_ego_edges = 1, - binning_fn = purrr::partial(binned_densities_adaptive, min_counts_per_interval = 5, - num_intervals = 100), - bin_counts_fn = purrr::partial(density_binned_counts, agg_fn = mean, scale_fn = - scale_graphlet_counts_ego), - exp_counts_fn = purrr::partial(netdis_expected_counts, scale_fn = - count_graphlet_tuples), + binning_fn = NULL, + bin_counts_fn = NULL, + exp_counts_fn = NULL, graphlet_counts_1 = NULL, graphlet_counts_2 = NULL, graphlet_counts_ref = NULL @@ -49,23 +46,23 @@ than min_ego_edges edges.} \item{binning_fn}{Function used to bin ego network densities. Takes densities as its single argument, and returns a named list including keys \code{breaks} (list of bin edges) and \code{interval_indexes} (density bin index for each -ego network). (Default: \code{binned_densities_adaptive} with -\code{min_counts_per_interval = 5} and \code{num_intervals = 100}).} +ego network). If NULL uses \code{binned_densities_adaptive} with +\code{min_counts_per_interval = 5} and \code{num_intervals = 100} +(Default: NULL).} \item{bin_counts_fn}{Function used to calculate expected graphlet counts in each density bin. Takes \code{graphlet_counts}, \code{interval_indexes} -(bin indexes) and \code{max_graphlet_size} as arguments. -(Default: \code{density_binned_counts} with \code{agg_fn = mean} and -\code{scale_fn = scale_graphlet_counts_ego}, which mirrors the -approach used in the original netdis paper).} +(bin indexes) and \code{max_graphlet_size} as arguments. If NULL uses the +approach from the original Netdis paper if \code{ref_graph} is not +NULL, or geometric poisson if \code{ref_graph} is NULL (Default: NULL).} \item{exp_counts_fn}{Function used to map from binned reference counts to expected counts for each graphlet in each ego network of the query graphs. Takes \code{ego_networks}, \code{density_bin_breaks}, \code{binned_graphlet_counts}, and \code{max_graphlet_size} as arguments. -(Default: \code{netdis_expected_counts} with -\code{scale_fn = count_graphlet_tuples}, which mirrors the approach used in -the original netdis paper).} +If NULL uses the approach from the original Netdis paper if \code{ref_graph} +is not NULL, or geometric poisson if \code{ref_graph} is NULL +(Default: NULL).} \item{graphlet_counts_1}{Pre-generated graphlet counts for the first query graph. If the \code{graphlet_counts_1} argument is defined then @@ -110,7 +107,7 @@ netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = goldstd_2) - #Imputing pre-calculated subgraph counts instead of subgraphs. + #Providing pre-calculated subgraph counts. props_1 <- count_graphlets_ego(graph = graph_1) props_2 <- count_graphlets_ego(graph = graph_2) diff --git a/man/virusppi.Rd b/man/virusppi.Rd index ddbb8eb2..6c905b2c 100644 --- a/man/virusppi.Rd +++ b/man/virusppi.Rd @@ -5,7 +5,9 @@ \name{virusppi} \alias{virusppi} \title{Protein-protein interaction (PPI) networks for 5 microorganisms} -\format{A list of \code{igraph} objects.} +\format{ +A list of \code{igraph} objects. +} \source{ \strong{PPI data (EBV, HSV-1, KSHV, VZV):} Fossum E, Friedel CC, Rajagopala SV, Titz B, Baiker A, Schmidt T, et al. (2009) Evolutionarily Conserved Herpesviral Protein Interaction Networks. PLoS Pathog 5(9): e1000570. \url{https://doi.org/10.1371/journal.ppat.1000570}. Data from Table S2 in the supporting information. diff --git a/tests/testthat/test_measures_net_dis.R b/tests/testthat/test_measures_net_dis.R index 70eee456..ac1a68c2 100644 --- a/tests/testthat/test_measures_net_dis.R +++ b/tests/testthat/test_measures_net_dis.R @@ -1414,12 +1414,6 @@ test_that("netdis_one_to_one gives expected result when using geometric Poisson expected_netdis4 <- 0.1892716 # check function to test - bin_counts_fn <- density_binned_counts_gp - - exp_counts_fn <- purrr::partial(netdis_expected_counts, - scale_fn = NULL - ) - actual_netdis <- netdis_one_to_one(graph_1, graph_2, ref_graph = NULL, @@ -1427,8 +1421,6 @@ test_that("netdis_one_to_one gives expected result when using geometric Poisson neighbourhood_size = neighbourhood_size, min_ego_nodes = min_ego_nodes, min_ego_edges = min_ego_edges, - bin_counts_fn = bin_counts_fn, - exp_counts_fn = exp_counts_fn ) expect_equal(expected_netdis4, actual_netdis[["netdis4"]], diff --git a/vignettes/quickstart_netdis_2graphs.Rmd b/vignettes/quickstart_netdis_2graphs.Rmd index 51eb0330..a55f1d82 100644 --- a/vignettes/quickstart_netdis_2graphs.Rmd +++ b/vignettes/quickstart_netdis_2graphs.Rmd @@ -147,4 +147,4 @@ netdis_result <- netdis_uptok(sum_graphlet_counts_1, max_graphlet_size) print(netdis_result) -``` \ No newline at end of file +``` From 1ab8407535262651c034592a994a6eeda4e1efb7 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Sat, 27 Jun 2020 13:24:18 +0100 Subject: [PATCH 113/188] check for null graphlet_counts_ref as well as ref_graph --- R/measures_net_dis.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index c5fad69f..80a268df 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -118,10 +118,10 @@ netdis_one_to_one <- function(graph_1 = NULL, ## ------------------------------------------------------------------------ # Check arguments - if (is.null(graph_1) & is.null(graphlet_counts_1)) { + if (is.null(graph_1) && is.null(graphlet_counts_1)) { stop("One of graph_1 and graphlet_counts_1 must be supplied.") } - if (is.null(graph_2) & is.null(graphlet_counts_2)) { + if (is.null(graph_2) && is.null(graphlet_counts_2)) { stop("One of graph_2 and graphlet_counts_2 must be supplied.") } ## ------------------------------------------------------------------------ @@ -158,7 +158,7 @@ netdis_one_to_one <- function(graph_1 = NULL, ) if(!is.null(ref_graph)){ - if (!is.numeric(ref_graph) & is.null(graphlet_counts_ref)) { + if (!is.numeric(ref_graph) && is.null(graphlet_counts_ref)) { graphlet_counts_ref <- count_graphlets_ego( ref_graph, max_graphlet_size = max_graphlet_size, @@ -264,10 +264,10 @@ netdis_one_to_many <- function(graph_1 = NULL, graphlet_counts_compare = NULL) { ## ------------------------------------------------------------------------ # Check arguments - if (is.null(graph_1) & is.null(graphlet_counts_1)) { + if (is.null(graph_1) && is.null(graphlet_counts_1)) { stop("One of graph_1 and graphlet_counts_1 must be supplied.") } - if (is.null(graphs_compare) & is.null(graphlet_counts_compare)) { + if (is.null(graphs_compare) && is.null(graphlet_counts_compare)) { stop("One of graph_2 and graphlet_counts_2 must be supplied.") } @@ -410,7 +410,7 @@ netdis_many_to_many <- function(graphs = NULL, ## ------------------------------------------------------------------------ # Check arguments and set functions appropriately - if (is.null(graphs) & is.null(graphlet_counts)) { + if (is.null(graphs) && is.null(graphlet_counts)) { stop("One of graphs and graphlet_counts must be supplied.") } @@ -425,7 +425,7 @@ netdis_many_to_many <- function(graphs = NULL, # If no ref_graph supplied, default to geometric poisson unless user-defined # functions have been provided. - if (is.null(ref_graph)) { + if (is.null(ref_graph) && is.null(graphlet_counts_ref)) { if (is.null(bin_counts_fn)) { bin_counts_fn <- density_binned_counts_gp } From 370c28a23406a44cdde5658e993b8ea84b65f76a Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 30 Jun 2020 16:56:24 +0100 Subject: [PATCH 114/188] add graphlet_counts_ref argument to netdis_one_to_many --- R/measures_net_dis.R | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 80a268df..2f7a1283 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -247,6 +247,10 @@ netdis_one_to_one <- function(graph_1 = NULL, #' for the remaining query graphs. If the \code{graphlet_counts_compare} #' argument is defined then \code{graphs_compare} will not be used. #' +#' @param graphlet_counts_ref Pre-generated reference graphlet counts. If the +#' \code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not +#' be used. +#' #' @return Netdis statistics between graph_1 and graph_2 for graphlet sizes #' up to and including max_graphlet_size #' @export @@ -261,7 +265,8 @@ netdis_one_to_many <- function(graph_1 = NULL, bin_counts_fn = NULL, exp_counts_fn = NULL, graphlet_counts_1 = NULL, - graphlet_counts_compare = NULL) { + graphlet_counts_compare = NULL, + graphlet_counts_ref= NULL) { ## ------------------------------------------------------------------------ # Check arguments if (is.null(graph_1) && is.null(graphlet_counts_1)) { @@ -305,6 +310,20 @@ netdis_one_to_many <- function(graph_1 = NULL, after = 0 ) + if(!is.null(ref_graph)){ + if (!is.numeric(ref_graph) && is.null(graphlet_counts_ref)) { + graphlet_counts_ref <- count_graphlets_ego( + ref_graph, + max_graphlet_size = max_graphlet_size, + neighbourhood_size = neighbourhood_size, + min_ego_nodes = min_ego_nodes, + min_ego_edges = min_ego_edges, + return_ego_networks = FALSE + ) + ref_graph <- NULL + } + } + ## ------------------------------------------------------------------------ # calculate netdis result <- netdis_many_to_many( @@ -488,7 +507,7 @@ netdis_many_to_many <- function(graphs = NULL, ## ------------------------------------------------------------------------ # If there are no graphlet_counts_ref, and If a reference graph passed, use it to calculate expected counts for all # query graphs. - } else if (!is.null(ref_graph) | !is.null(graphlet_counts_ref)) { + } else if (!is.null(ref_graph) || !is.null(graphlet_counts_ref)) { # Generate ego networks and calculate graphlet counts # But if graphlet_counts_ref provided can skip this step From 11c6507ef676ab3cf3cb4bd69f775bbf81e91ff8 Mon Sep 17 00:00:00 2001 From: Jack Roberts Date: Tue, 30 Jun 2020 17:07:47 +0100 Subject: [PATCH 115/188] bug fix missing graphlet_counts_ref --- R/measures_net_dis.R | 3 ++- man/netdis_one_to_many.Rd | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index 2f7a1283..d7dc969c 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -337,7 +337,8 @@ netdis_one_to_many <- function(graph_1 = NULL, binning_fn = binning_fn, bin_counts_fn = bin_counts_fn, exp_counts_fn = exp_counts_fn, - graphlet_counts = graphlet_counts + graphlet_counts = graphlet_counts, + graphlet_counts_ref = graphlet_counts_ref ) ## ------------------------------------------------------------------------ diff --git a/man/netdis_one_to_many.Rd b/man/netdis_one_to_many.Rd index ed19a20f..04102ae1 100644 --- a/man/netdis_one_to_many.Rd +++ b/man/netdis_one_to_many.Rd @@ -16,7 +16,8 @@ netdis_one_to_many( bin_counts_fn = NULL, exp_counts_fn = NULL, graphlet_counts_1 = NULL, - graphlet_counts_compare = NULL + graphlet_counts_compare = NULL, + graphlet_counts_ref = NULL ) } \arguments{ @@ -72,6 +73,10 @@ graph. If the \code{graphlet_counts_1} argument is defined then \item{graphlet_counts_compare}{Named list of pre-generated graphlet counts for the remaining query graphs. If the \code{graphlet_counts_compare} argument is defined then \code{graphs_compare} will not be used.} + +\item{graphlet_counts_ref}{Pre-generated reference graphlet counts. If the +\code{graphlet_counts_ref} argument is defined then \code{ref_graph} will not +be used.} } \value{ Netdis statistics between graph_1 and graph_2 for graphlet sizes From 722f83e48c295bfd5b5d4495495623a40ec85ab7 Mon Sep 17 00:00:00 2001 From: Luis Ospina Date: Mon, 13 Jul 2020 13:30:34 +0100 Subject: [PATCH 116/188] Correct default functionallit of netdis call for GP variant. Renaming of netemd_many_to_many that is currently causing some sort of issue, and changes to vignette of many to many comparisons. --- NAMESPACE | 1 - R/measures_net_dis.R | 2 +- R/measures_net_emd.R | 83 ++++-- R/orca_interface.R | 2 +- doc/dendrogram_example_net_emd.R | 4 +- doc/dendrogram_example_net_emd.Rmd | 2 +- man/gdd.Rd | 2 +- ...r_all_graphs.Rd => netemd_many_to_many.Rd} | 30 +- man/virusppi.Rd | 4 +- tests/testthat/test_measures_net_emd.R | 4 +- vignettes/ManyToMany.Rmd | 260 ++++++++++-------- vignettes/PreComputedProps.Rmd | 12 +- vignettes/default_pairwise_usage.Rmd | 10 +- vignettes/dendrogram_example_net_emd.Rmd | 6 +- 14 files changed, 248 insertions(+), 174 deletions(-) rename man/{net_emds_for_all_graphs.Rd => netemd_many_to_many.Rd} (63%) diff --git a/NAMESPACE b/NAMESPACE index bec918fe..f2241414 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,7 +48,6 @@ export(min_emd) export(min_emd_exhaustive) export(min_emd_optimise) export(min_emd_optimise_fast) -export(net_emds_for_all_graphs) export(netdis) export(netdis_centred_graphlet_counts) export(netdis_expected_counts) diff --git a/R/measures_net_dis.R b/R/measures_net_dis.R index d7dc969c..05b8fa86 100644 --- a/R/measures_net_dis.R +++ b/R/measures_net_dis.R @@ -490,7 +490,7 @@ netdis_many_to_many <- function(graphs = NULL, rm(graphs) ## ------------------------------------------------------------------------ - # Centre counts + # Centred counts # If there are no graphlet_counts_ref, and a number has been passed as ref_graph, treat it as a constant expected # counts value (e.g. if ref_graph = 0 then no centring of counts). if (is.numeric(ref_graph) && length(ref_graph) == 1 && is.null(graphlet_counts_ref)) { diff --git a/R/measures_net_emd.R b/R/measures_net_emd.R index 53ca27f5..f9a7b497 100755 --- a/R/measures_net_emd.R +++ b/R/measures_net_emd.R @@ -139,26 +139,10 @@ netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2= } - - - - - - - - - - - - - - - - #' NetEMDs between all graph pairs using provided Graphlet-based Degree #' Distributions -#' @param gdds List containing sets of Graphlet-based Degree Distributions for -#' all graphs being compared +#' @param graphs A list of network/graph objects from the \code{igraph} package. \code{graphs} can be set to \code{NULL} (default) if dhists is provided. +#' @param dhists A list whose elements contain either: A list of \code{dhist} discrete histogram objects for each graph, or a list a matrix of network features (each column representing a feature). \code{dhists} can be set to \code{NULL} (default) if \code{graphs} is provided. #' @param method The method to use to find the minimum EMD across all potential #' offsets for each pair of histograms. Default is "optimise" to use #' R's built-in \code{stats::optimise} method to efficiently find the offset @@ -175,6 +159,14 @@ netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2= #' (e.g.for the integer domain a width of 1 is the natural choice) #' @param mc.cores Number of cores to use for parallel processing. Defaults to #' the \code{mc.cores} option set in the R environment. +#' @param feature_type Type of graphlet-based feature to count: "graphlet" +#' counts the number of graphlets each node participates in; "orbit" calculates +#' the number of graphlet orbits each node participates in. +#' @param max_graphlet_size Determines the maximum size of graphlets to count. +#' Only graphlets containing up to \code{max_graphlet_size} nodes will be +#' counted. Possible values are 3,4, and 5 (default). +#' @param ego_neighbourhood_size The number of steps from the source node to +#' include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0. #' @return NetEMD measures between all pairs of graphs for which GDDs #' were provided. Format of returned data depends on the \code{return_details} #' parameter. If set to FALSE, a list is returned with the following named @@ -185,11 +177,12 @@ netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2= #' matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD #' used to compute the NetEMD, \code{min_offsets}: the associated offsets giving #' the minimal EMD for each GDD -#' @export -net_emds_for_all_graphs <- function( - gdds, method = "optimise", smoothing_window_width = 0, - return_details = FALSE, mc.cores = getOption("mc.cores", 2L)) { - comp_spec <- cross_comparison_spec(gdds) +#' @export +#' +netemd_many_to_many<- function(graphs=NULL,dhists=NULL, method = "optimise", smoothing_window_width = 0, + return_details = FALSE, mc.cores = getOption("mc.cores", 2L),feature_type="orbit",max_graphlet_size = 5,ego_neighbourhood_size = 0) { + if(max_graphlet_size > 4 & mc.cores > 1) print(paste("This function will compute orbits of graphlets up to size 5 using ", mc.cores," cores. Depending on the density and size of the graphs, this may lead to a large compsumption of RAM.")) + # NOTE: mcapply only works on unix-like systems with system level forking # capability. This means it will work on Linux and OSX, but not Windows. # For now, we just revert to single threaded operation on Windows @@ -199,9 +192,45 @@ net_emds_for_all_graphs <- function( # forking mc.cores <- 1 } - num_features <- length(gdds[[1]]) + ## ------------------------------------------------------------------------ + # Check arguments 1 + which_imput_type <- NULL + if(!is.null(graphs) & is.null(dhists)){ + if ( !all(( unlist(sapply(X = graphs, FUN = igraph::is.igraph)) ) ) ) { + stop("Graphs need to be igraph graph objects, or a list of dhists network features should be supplied.") + } + which_imput_type <- "Graphs" + } + if (!is.null(dhists) ) { + if (all(( unlist(sapply(X = dhists, FUN = is.matrix)) ) ) ) { which_imput_type <- "dhist" } + if (all(( unlist(sapply(X = dhists, FUN = is.matrix)) ) ) ) { which_imput_type <- "Matrix" } + } + ## ------------------------------------------------------------------------ + # Check arguments 2 + # If dhists is a list of matrices of network features then transform them to dhist objects. + if(which_imput_type == "Matrix"){ + dhists <- sapply(X = dhists,FUN = graph_features_to_histograms, simplify = FALSE ) + } + ## ------------------------------------------------------------------------ + # Check arguments 3 + #If input is graph then get graphlet counts + if(which_imput_type == "Graphs"){ + dhists <- parallel::mcmapply(gdd, graphs, + MoreArgs = + list( + feature_type = feature_type, + max_graphlet_size = max_graphlet_size, + ego_neighbourhood_size = ego_neighbourhood_size + ), + SIMPLIFY = FALSE, mc.cores = mc.cores + ) + } + rm(graphs) + ## ------------------------------------------------------------------------ + comp_spec <- cross_comparison_spec(dhists) + num_features <- length(dhists[[1]]) out <- purrr::simplify(parallel::mcmapply(function(index_a, index_b) { - netemd_one_to_one(dhists_1 = gdds[[index_a]], dhists_2 = gdds[[index_b]], + netemd_one_to_one(dhists_1 = dhists[[index_a]], dhists_2 = dhists[[index_b]], method = method, return_details = return_details, smoothing_window_width = smoothing_window_width ) @@ -214,10 +243,10 @@ net_emds_for_all_graphs <- function( colnames(min_offsets) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinOffsets_O", . - 1, sep = ""))) min_offsets_std <- matrix(purrr::simplify(purrr::map(out, ~ .$min_offsets_std)), ncol = num_features, byrow = TRUE) colnames(min_offsets_std) <- purrr::simplify(purrr::map(1:num_features, ~ paste("MinOffsetsStd_O", . - 1, sep = ""))) - ret <- list(net_emds = net_emds, comp_spec = comp_spec, min_emds = min_emds, min_offsets = min_offsets, min_offsets_std = min_offsets_std) + ret <- list(netemds = net_emds, comp_spec = comp_spec, min_emds = min_emds, min_offsets = min_offsets, min_offsets_std = min_offsets_std) } else { net_emds <- out - ret <- list(net_emds = net_emds, comp_spec = comp_spec) + ret <- list(netemds = net_emds, comp_spec = comp_spec) } } diff --git a/R/orca_interface.R b/R/orca_interface.R index 1db42292..8bfa4878 100644 --- a/R/orca_interface.R +++ b/R/orca_interface.R @@ -228,7 +228,7 @@ graph_features_to_histogramsSLOW <- function(features_matrix) { #' Graphlet-based degree distributions (GDDs) #' -#' Generates graphlet-based degree distributions from \code{igraph} graph object +#' Short-cut function to create graphlet-based degree distributions from \code{igraph} graph object #' using the ORCA fast graphlet orbit counting package. #' @param graph A connected, undirected, simple graph as an \code{igraph} object #' @param feature_type Type of graphlet-based feature to count: "graphlet" diff --git a/doc/dendrogram_example_net_emd.R b/doc/dendrogram_example_net_emd.R index 6af69634..394b8a01 100644 --- a/doc/dendrogram_example_net_emd.R +++ b/doc/dendrogram_example_net_emd.R @@ -29,7 +29,7 @@ names(virus_gdds) # of unit width (smoothing_window_width = 1). Returns a named list containing: # (i) the NetEMDs and (ii) a table containing the graph names and indices # within the input GDD list for each pair of graphs compared. -res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) +res <- netemd_many_to_many(dhists= virus_gdds, smoothing_window_width = 0) # You can also specify method = "fixed_step" to use the much slower method of # exhaustively evaluating the EMD at all offsets separated by a fixed step. @@ -50,7 +50,7 @@ plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FAL edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, cex.sub=cex, cex=cex) -# The gdd_for_all_graphs and net_emds_for_all_graphs functions will run in +# The gdd_for_all_graphs and netemd_many_to_many functions will run in # parallel using multiple threads where supported. The number of threads # used is determined by the global R option "mc.cores". You can inspect the # current value of this using options("mc.cores") and set it with diff --git a/doc/dendrogram_example_net_emd.Rmd b/doc/dendrogram_example_net_emd.Rmd index 72fd2750..a684d9df 100644 --- a/doc/dendrogram_example_net_emd.Rmd +++ b/doc/dendrogram_example_net_emd.Rmd @@ -51,7 +51,7 @@ res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) # normalised to unit variance # Convert to matrix for input to dendrogram method -netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec) +netemd_mat <- cross_comp_to_matrix(res$netemds, res$comp_spec) netemd_mat ``` diff --git a/man/gdd.Rd b/man/gdd.Rd index 1dc26f0c..d1563d66 100644 --- a/man/gdd.Rd +++ b/man/gdd.Rd @@ -30,6 +30,6 @@ List of graphlet-based degree distributions, with each distribution represented as a \code{dhist} discrete histogram object. } \description{ -Generates graphlet-based degree distributions from \code{igraph} graph object +Short-cut function to create graphlet-based degree distributions from \code{igraph} graph object using the ORCA fast graphlet orbit counting package. } diff --git a/man/net_emds_for_all_graphs.Rd b/man/netemd_many_to_many.Rd similarity index 63% rename from man/net_emds_for_all_graphs.Rd rename to man/netemd_many_to_many.Rd index e71227e9..902bcb1d 100644 --- a/man/net_emds_for_all_graphs.Rd +++ b/man/netemd_many_to_many.Rd @@ -1,21 +1,26 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/measures_net_emd.R -\name{net_emds_for_all_graphs} -\alias{net_emds_for_all_graphs} +\name{netemd_many_to_many} +\alias{netemd_many_to_many} \title{NetEMDs between all graph pairs using provided Graphlet-based Degree Distributions} \usage{ -net_emds_for_all_graphs( - gdds, +netemd_many_to_many( + graphs = NULL, + dhists = NULL, method = "optimise", smoothing_window_width = 0, return_details = FALSE, - mc.cores = getOption("mc.cores", 2L) + mc.cores = getOption("mc.cores", 2L), + feature_type = "orbit", + max_graphlet_size = 5, + ego_neighbourhood_size = 0 ) } \arguments{ -\item{gdds}{List containing sets of Graphlet-based Degree Distributions for -all graphs being compared} +\item{graphs}{A list of network/graph objects from the \code{igraph} package. \code{graphs} can be set to \code{NULL} (default) if dhists is provided.} + +\item{dhists}{A list whose elements contain either: A list of \code{dhist} discrete histogram objects for each graph, or a list a matrix of network features (each column representing a feature). \code{dhists} can be set to \code{NULL} (default) if \code{graphs} is provided.} \item{method}{The method to use to find the minimum EMD across all potential offsets for each pair of histograms. Default is "optimise" to use @@ -36,6 +41,17 @@ minimal EMDs and associated offsets for all pairs of histograms} \item{mc.cores}{Number of cores to use for parallel processing. Defaults to the \code{mc.cores} option set in the R environment.} + +\item{feature_type}{Type of graphlet-based feature to count: "graphlet" +counts the number of graphlets each node participates in; "orbit" calculates +the number of graphlet orbits each node participates in.} + +\item{max_graphlet_size}{Determines the maximum size of graphlets to count. +Only graphlets containing up to \code{max_graphlet_size} nodes will be +counted. Possible values are 3,4, and 5 (default).} + +\item{ego_neighbourhood_size}{The number of steps from the source node to +include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0.} } \value{ NetEMD measures between all pairs of graphs for which GDDs diff --git a/man/virusppi.Rd b/man/virusppi.Rd index 6c905b2c..ddbb8eb2 100644 --- a/man/virusppi.Rd +++ b/man/virusppi.Rd @@ -5,9 +5,7 @@ \name{virusppi} \alias{virusppi} \title{Protein-protein interaction (PPI) networks for 5 microorganisms} -\format{ -A list of \code{igraph} objects. -} +\format{A list of \code{igraph} objects.} \source{ \strong{PPI data (EBV, HSV-1, KSHV, VZV):} Fossum E, Friedel CC, Rajagopala SV, Titz B, Baiker A, Schmidt T, et al. (2009) Evolutionarily Conserved Herpesviral Protein Interaction Networks. PLoS Pathog 5(9): e1000570. \url{https://doi.org/10.1371/journal.ppat.1000570}. Data from Table S2 in the supporting information. diff --git a/tests/testthat/test_measures_net_emd.R b/tests/testthat/test_measures_net_emd.R index ddbebea9..42cbc654 100644 --- a/tests/testthat/test_measures_net_emd.R +++ b/tests/testthat/test_measures_net_emd.R @@ -355,7 +355,7 @@ test_that("net_emd return 0 when comparing graphlet orbit degree distributions }) context("Measures NetEMD: All graphs in directory") -test_that("net_emds_for_all_graphs works", { +test_that("netemd_many_to_many works", { # Set source directory and file properties for Virus PPI graph edge files source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") edge_format <- "ncol" @@ -415,7 +415,7 @@ test_that("net_emds_for_all_graphs works", { # Comparison function for clarity compare_fn <- function(gdds) { - expect_equal(net_emds_for_all_graphs(gdds), expected_netemd_fn(gdds)) + expect_equal(netemd_many_to_many(dhists=gdds), expected_netemd_fn(gdds)) } # Map over test parameters, comparing actual gdds to expected diff --git a/vignettes/ManyToMany.Rmd b/vignettes/ManyToMany.Rmd index 649a7abf..3d625233 100644 --- a/vignettes/ManyToMany.Rmd +++ b/vignettes/ManyToMany.Rmd @@ -4,107 +4,152 @@ author: "Luis Ospina-Forero" date: "23/06/2020" output: rmarkdown::html_vignette vignette: > -%\VignetteIndexEntry{ManyToMany} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{PreComputedProps} %\VignetteEncoding{UTF-8} - --- + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +--- - ```{r, include = FALSE} +```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) - ``` +``` # Introduction - In some situations there is a need to compare multiple graphs among each other. For such situations, the `netdist` package contains some initial shortcut functions that perform such calculation and may incorporate parallel processing. + In some situations there is a need to compare multiple graphs among each other. For such situations, the `netdist` package contains some initial shortcut functions that perform such calculation and may automatically (Unix) incorporate parallel processing. + + This vignette does not show details about the Netdis and NetEmd network comparison methods or their variants, please check ["Default, simple and quick use usage 1: pairwise comparisons"](default_pairwise_usage.html) for these details. Instead, this vignette shows the default usage of the shortcut functions for many-to-many comparisons. - This Vignette follows similar examples as ["Default, simple and quick use usage 1: pairwise comparisons"](default_pairwise_usage.html) but shows the default usage of the shortcut functions for many-to-many comparisons. + Please note that for high performance computation, this shortcut functions may or may not be ideal, particualry in terms of RAM consumption. - # Load required packages/libraries - ```{r, packages, message= FALSE} +# Load required packages/libraries +```{r, packages, message= FALSE} # Load packages/libraries library("netdist") library("igraph") library("pheatmap") ``` -# Compare two networks via NetEmd. +# Compare Multiple networks via NetEmd. -## Comparing two graphs with NetEmd via subgraph counts. -```{r, netemd,fig.align='center'} -# Create lattice networks -gLat_1 <- graph.lattice(c(8,8)) -gLat_2 <- graph.lattice(c(44,44)) -gRing_1 <- make_ring(8^2) -gRing_2 <- make_ring(44^2) -gTree_1 <- make_tree(n = 8^2) -gTree_2 <- make_ring(n = 44^2) +(Extracted from Wegner et al. (2017)): NetEmd is based on the idea that the information encapsulated in the shape of the degree distribution and other network properties which reflect the topological organization of the network. From an abstract point of view, NetEmd views the shape of a distribution as a property that is invariant under linear deformations i.e$.$ translations and re-scalings of the axis. +## Networks being compared. -#Providing a matrix of network features -counts_1= count_orbits_per_node(graph = gLat_1,max_graphlet_size = 5) -counts_2= count_orbits_per_node(graph = gLat_2,max_graphlet_size = 5) +Generation of regular grid, ring and tree-like networks with 400 nodes and 1600 nodes. The plots of these networks illustrate clearly their structural differences. +```{r, netwokrs,fig.align='center'} +# Create networks +set.seed(3171) +gLat_1 <- graph.lattice(c(20,20)) +gLat_2 <- graph.lattice(c(40,40)) +gRing_1 <- make_ring(20^2) +gRing_2 <- make_ring(40^2) +gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) +gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) + +plot(gLat_1,vertex.size=0.8,vertex.label=NA) +plot(gRing_1,vertex.size=0.8,vertex.label=NA) +plot(gTree_1,vertex.size=0.8,vertex.label=NA) +``` -netemd_one_to_one(dhists_1=counts_1, dhists_2=counts_2) +## NetEmd using orbit Counts + +Orbit based NetEmd comparisons. +```{r, netemdorbits,fig.align='center'} +# NetEMD using subgraph counts. +glist <- list(Lat_1=gLat_1, Lat_2=gLat_2, Ring_1=gRing_1, Ring_2=gRing_1, Tree_1=gTree_1, Tree_2=gTree_2) -#Providing the network features as lists of dhist objects -dhists_1<- graph_features_to_histograms(counts_1) -dhists_2<- graph_features_to_histograms(counts_2) +netemd_mat <- netemd_many_to_many(graphs = glist,smoothing_window_width = 1) #Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. -netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2) +netemd_mat ``` -## Comparing two graphs with NetEmd via their Laplacian spectrum. -```{r, netemdEigen,fig.align='center'} -# Networks -gLat_1 <- graph.lattice(c(8,8)) -gLat_2 <- graph.lattice(c(44,44)) -#Laplacian -Lapg_1 <- igraph::laplacian_matrix(graph = gLat_1,normalized = FALSE,sparse = FALSE) -Lapg_2 <- igraph::laplacian_matrix(graph = gLat_2,normalized = FALSE,sparse = FALSE) +Illustration of the multiple NetEmd comparisons based on orbit counts. +```{r,netemdorbitsPLOT,fig.align='center'} +# Creating matrices to plot the Network comparison +g_NETEMDcomp <- graph.edgelist(el = as.matrix(netemd_mat$comp_spec[,1:2]),directed = FALSE) +edge_attr(graph = g_NETEMDcomp,name = "NetEmdOrbits") <- netemd_mat$netemds +mat <- get.adjacency(graph = g_NETEMDcomp,type = "both",attr = "NetEmdOrbits",names = TRUE,sparse = FALSE) +vnames <- rownames(mat) +mat -#Normalized Laplacian -NLapg_1 <- igraph::laplacian_matrix(graph = gLat_1,normalized = TRUE,sparse = FALSE) -NLapg_2 <- igraph::laplacian_matrix(graph = gLat_2,normalized = TRUE,sparse = FALSE) +#Network comparisons heatmap with Gold-Standard 1 +legend1 <- c(seq(min(mat),max(mat),length.out = 5)) +levels1 <- round(legend1,digits = 2); levels1[length(levels1)]="NetEmd" +pheatmap::pheatmap(mat = as.dist(mat),cluster_rows = TRUE,cluster_cols = TRUE,clustering_method = "ward.D",angle_col=45,main = "NetEmd Orbit Counts",labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) +``` + +## NetEmd using the Laplacian and Normalized Laplacian Spectrum. +```{r, netemdspectrum,fig.align='center'} +# NetEMD using the Laplacian and normalized Laplacian Spectrum. +SPECT<-list() +#this step may take several minutes. +for(i in 1:length(glist)){ + Lapg <- igraph::laplacian_matrix(graph = glist[[i]],normalized = FALSE,sparse = FALSE) + NLap <- igraph::laplacian_matrix(graph = glist[[i]],normalized = TRUE,sparse = FALSE) + SPECT[[ names(glist)[i] ]] <- cbind(L.Spectra= eigen(Lapg)$values, NL.Spectra= eigen(NLap)$values) +} -# Providing a matrix of network features (e.g. Spectra). Spectra (This may take a couple of minutes). -spec_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) -spec_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) +netemd_mat <- netemd_many_to_many(dhists = SPECT,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. -netemd_one_to_one(dhists_1 = spec_1,dhists_2 = spec_2, smoothing_window_width = 0) +netemd_mat +``` -# Providing pre-computed dhist objects from network features -dhists_1<- graph_features_to_histograms(spec_1) -dhists_2<- graph_features_to_histograms(spec_2) +Illustration of the multiple NetEmd comparisons based on the Laplacian and Normalized Laplacian spectra. +```{r,netemdspectrumPLOT,fig.align='center'} +# Creating matrices to plot the Network comparison +g_NETEMDcomp <- graph.edgelist(el = as.matrix(netemd_mat$comp_spec[,1:2]),directed = FALSE) +edge_attr(graph = g_NETEMDcomp,name = "NetEmdOrbits") <- netemd_mat$netemds +mat <- get.adjacency(graph = g_NETEMDcomp,type = "both",attr = "NetEmdOrbits",names = TRUE,sparse = FALSE) +vnames <- rownames(mat) +mat -netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2, smoothing_window_width = 0) +#Network comparisons heatmap with Gold-Standard 1 +legend1 <- c(seq(min(mat),max(mat),length.out = 5)) +levels1 <- round(legend1,digits = 2); levels1[length(levels1)]="NetEmd" +pheatmap::pheatmap(mat = as.dist(mat),cluster_rows = TRUE,cluster_cols = TRUE,clustering_method = "ward.D",angle_col=45,main = "NetEmd Spectra",labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) ``` + + ------------------------- # Compare two networks via Netdis and its variants. +(Extracted from Ali et al. (2014)): Netdis counts small subgraphs $w$ on $k$ nodes for all 2-step ego-networks, $k=3,4,5$. These counts are centred by subtracting the expected number of counts $E_w$. These centred counts of each network are then compared thus leading to the Netdis statistic. -## Using netdis with a gold-standard graph to obtain $E_w$. -The selection of a gold-standard graph as a substitute for $E_w$ could be done when such graph is known to be a good proxy for $E_w$, or alternatively as a good reference point for the comparison. This option will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard (summarized in $E_w$). +## Using Netdis with a reference or gold-standard graph to obtain the expectations $E_w$. +The selection of a gold-standard graph as a substitute for a theoretical $E_w$ could be done when such graph is known to be a good proxy for $E_w$, or alternatively as a good reference point for the comparison. This netdis variant will focus on detecting discrepancies between the networks relative to the ego-network structure of the reference network / gold-standard given in $E_w$. -```{r,netdisgoldstand,fig.align='center'} -# Create lattice, Ring and Tree like networks of sizes 8^2 and 44^2. + +# Netdis using reference or goldstandard networks. + +Generation of regular grid, ring and tree-like networks with 400 nodes and 1600 nodes. Reference graphs given by start-like graphs are used as illustration of reference graphs. The plots of these networks illustrate clearly their structural differences. +```{r,netdisgoldstandnetworks,fig.align='center'} +# Create lattice, Ring and Tree like networks of sizes 20^2 and 40^2. +# Create networks set.seed(3171) -gLat_1 <- graph.lattice(c(8,8)) -gLat_2 <- graph.lattice(c(44,44)) -gRing_1 <- make_ring(8^2) -gRing_2 <- make_ring(44^2) -gTree_1 <- make_tree(n = 8^2,children = 3) -gTree_2 <- make_tree(n = 44^2,children = 3) +gLat_1 <- graph.lattice(c(20,20)) +gLat_2 <- graph.lattice(c(40,40)) +gRing_1 <- make_ring(20^2) +gRing_2 <- make_ring(40^2) +gTree_1 <- igraph::as.undirected( make_tree(n = 20^2,children = 3) ) +gTree_2 <- igraph::as.undirected( make_tree(n = 40^2,children = 3) ) # Create a random graph to be used as a gold-standard -gst_1 <- graph.star(8^2,) -gst_2 <- graph.star(44^2) +gst_1 <- graph.star(20^2,) +gst_2 <- graph.star(40^2) + +plot(gst_1,vertex.size=0.8,vertex.label=NA) +``` +Obtain the comparison via Netdis using each of the reference graph networks. +```{r,netdisgoldstand,fig.align='center'} # Netdis using the goldstd_1 graph as gold-standard reference point glist <- list(Lat_1=gLat_1, Lat_2=gLat_2, Ring_1=gRing_1, Ring_2=gRing_1, Tree_1=gTree_1, Tree_2=gTree_2) @@ -116,39 +161,31 @@ netdis_mat_gst1 netdis_mat_gst2 ``` + ```{r,netdisgoldstandPLOT,fig.align='center'} -# Creating matrices to plot the the comparison -mat_gst1 <- matrix(nrow=6,ncol=6,0) -mat_gst2 <- matrix(nrow=6,ncol=6,0) -vnames <- rep(NA,6) -for(i in 1:nrow(netdis_mat_gst1$comp_spec)){ - ind1 <- netdis_mat_gst1$comp_spec[i,"index_a"] - ind2 <- netdis_mat_gst1$comp_spec[i,"index_b"] - mat_gst1[ind1,ind2] <- netdis_mat_gst1$netdis["netdis4",i] - mat_gst1[ind2,ind1] <- mat_gst1[ind1,ind2] - # - ind1 <- netdis_mat_gst2$comp_spec[i,"index_a"] - ind2 <- netdis_mat_gst2$comp_spec[i,"index_b"] - mat_gst2[ind1,ind2] <- netdis_mat_gst2$netdis["netdis4",i] - mat_gst2[ind2,ind1] <- mat_gst2[ind1,ind2] - # - vnames[ind1] <- as.character(netdis_mat_gst1$comp_spec[i,"name_a"]) - vnames[ind2] <- as.character(netdis_mat_gst1$comp_spec[i,"name_b"]) -} -rownames(mat_gst1) <- vnames -colnames(mat_gst1) <- vnames -rownames(mat_gst2) <- vnames -colnames(mat_gst2) <- vnames +# Creating matrices to plot the Network comparison with Gold-Standard 1 +g_NETDIScomp_1 <- graph.edgelist(el = as.matrix(netdis_mat_gst1$comp_spec[,1:2]),directed = FALSE) +edge_attr(graph = g_NETDIScomp_1,name = "Netdis4") <- netdis_mat_gst1$netdis[2,] +mat_gst1 <- get.adjacency(graph = g_NETDIScomp_1,type = "both",attr = "Netdis4",names = TRUE,sparse = FALSE) +vnames <- rownames(mat_gst1) +mat_gst1 + +# Creating matrices to plot the Network comparison with Gold-Standard 2 +g_NETDIScomp_2 <- graph.edgelist(el = as.matrix(netdis_mat_gst2$comp_spec[,1:2]),directed = FALSE) +edge_attr(graph = g_NETDIScomp_2,name = "Netdis4") <- netdis_mat_gst2$netdis[2,] +mat_gst2 <- get.adjacency(graph = g_NETDIScomp_2,type = "both",attr = "Netdis4",names = TRUE,sparse = FALSE) +vnames <- rownames(mat_gst2) +mat_gst2 #Network comparisons heatmap with Gold-Standard 1 legend1 <- c(seq(min(mat_gst1),max(mat_gst1),length.out = 5)) levels1 <- round(legend1,digits = 2); levels1[length(levels1)]="Netdis" -pheatmap::pheatmap(mat = as.dist(mat_gst1),cluster_rows = TRUE,clustering_method = "average",angle_col=45,main = "Netdis GoldStd-1",treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) +pheatmap::pheatmap(mat = as.dist(mat_gst1),cluster_rows = TRUE,clustering_method = "ward.D",angle_col=45,main = "Netdis GoldStd-1",treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) #Network comparisons heatmap with Gold-Standard 2 legend2 <- c(seq(min(mat_gst2),max(mat_gst2),length.out = 5)) levels2 <- round(legend2,digits = 2); levels2[length(levels2)]="Netdis" -pheatmap::pheatmap(mat = as.dist(mat_gst2),cluster_rows = TRUE,clustering_method = "average",angle_col=45,main = "Netdis GoldStd-1",treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend2,legend_labels = levels2) +pheatmap::pheatmap(mat = as.dist(mat_gst2),cluster_rows = TRUE,clustering_method = "ward.D",angle_col=45,main = "Netdis GoldStd-2",treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend2,legend_labels = levels2,kmeans_k = NA) ``` @@ -158,52 +195,47 @@ This variant focuses on detecting more general and global discrepancies between ```{r, netdisGP} # Netdis Geometric-Poisson comparisons -netdis_mat_gst1 <- netdis_many_to_many(graphs = glist, ref_graph = NULL) -netdis_mat_gst1 +netdis_mat <- netdis_many_to_many(graphs = glist, ref_graph = NULL) +netdis_mat ``` ```{r,netdisGPPLOT,fig.align='center'} -# Creating matrices to plot the the comparison -mat_gst1 <- matrix(nrow=6,ncol=6,0) -vnames <- rep(NA,6) -for(i in 1:nrow(netdis_mat_gst1$comp_spec)){ - ind1 <- netdis_mat_gst1$comp_spec[i,"index_a"] - ind2 <- netdis_mat_gst1$comp_spec[i,"index_b"] - mat_gst1[ind1,ind2] <- netdis_mat_gst1$netdis["netdis4",i] - mat_gst1[ind2,ind1] <- mat_gst1[ind1,ind2] - # - vnames[ind1] <- as.character(netdis_mat_gst1$comp_spec[i,"name_a"]) - vnames[ind2] <- as.character(netdis_mat_gst1$comp_spec[i,"name_b"]) -} -rownames(mat_gst1) <- vnames -colnames(mat_gst1) <- vnames +# Creating matrices to plot the Network comparison with Gold-Standard 1 +g_NETDIScomp_GP <- graph.edgelist(el = as.matrix(netdis_mat$comp_spec[,1:2]),directed = FALSE) +edge_attr(graph = g_NETDIScomp_GP,name = "Netdis4") <- netdis_mat$netdis[2,] +mat_GP <- get.adjacency(graph = g_NETDIScomp_GP,type = "both",attr = "Netdis4",names = TRUE,sparse = FALSE) +vnames <- rownames(mat_GP) +mat_GP #Network comparisons heatmap with Gold-Standard 1 -legend1 <- c(seq(min(mat_gst1),max(mat_gst1),length.out = 5)) -levels1 <- round(legend1,digits = 2); levels1[length(levels1)]="Netdis-GP" -pheatmap::pheatmap(mat = as.dist(mat_gst1),cluster_rows = TRUE,clustering_method = "average",angle_col=45,main = "Netdis GoldStd-1",treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) +legend1 <- c(seq(min(mat_GP),max(mat_GP),length.out = 5)) +levels1 <- round(legend1,digits = 2); levels1[length(levels1)]="Netdis" +pheatmap::pheatmap(mat = as.dist(mat_GP),cluster_rows = TRUE,clustering_method = "ward.D",angle_col=45,main = "Netdis-GP",treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) ``` ## Using Netdis with no expectation ($E_w=0$) Comparing the networks via their observed ego counts without centering them, (equivalent to using expectation equal to zero). This variant thus focuses on detecting small discrepancies between the networks. -```{r,netdiszero} -# Set source directory for Virus protein-protein interaction edge files stored in the netdist package. -source_dir <- system.file(file.path("extdata", "VRPINS"), package = "netdist") - -# Load query graphs as igraph objects -graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") -graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") -#Netdis using no expectations (or equivalently, expectation equal to zero). -netdis_one_to_one(graph_1= graph_1, graph_2= graph_2, ref_graph = 0) +```{r, netdiszero} +# Netdis Geometric-Poisson comparisons +netdis_mat <- netdis_many_to_many(graphs = glist, ref_graph = 0) +netdis_mat +``` -# Providing pre-calculated subgraph counts. -props_1 <- count_graphlets_ego(graph = graph_1) -props_2 <- count_graphlets_ego(graph = graph_2) +```{r,netdiszeroPLOT,fig.align='center'} +# Creating matrices to plot the Network comparison with Gold-Standard 1 +g_NETDIScomp_zero <- graph.edgelist(el = as.matrix(netdis_mat$comp_spec[,1:2]),directed = FALSE) +edge_attr(graph = g_NETDIScomp_zero,name = "Netdis4") <- netdis_mat$netdis[2,] +mat_zero <- get.adjacency(graph = g_NETDIScomp_zero,type = "both",attr = "Netdis4",names = TRUE,sparse = FALSE) +vnames <- rownames(mat_zero) +mat_zero -netdis_one_to_one(graphlet_counts_1= props_1,graphlet_counts_2= props_2, ref_graph = 0) +#Network comparisons heatmap with Gold-Standard 1 +legend1 <- c(seq(min(mat_zero),max(mat_zero),length.out = 5)) +levels1 <- round(legend1,digits = 2); levels1[length(levels1)]="Netdis" +pheatmap::pheatmap(mat = as.dist(mat_zero),cluster_rows = TRUE,clustering_method = "ward.D",angle_col=45,main = "Netdis-zero",treeheight_row = 80,labels_row = vnames,labels_col = vnames,display_numbers = TRUE,legend_breaks = legend1,legend_labels = levels1) ``` ------------------------- diff --git a/vignettes/PreComputedProps.Rmd b/vignettes/PreComputedProps.Rmd index 089ac96b..3d8e8b66 100644 --- a/vignettes/PreComputedProps.Rmd +++ b/vignettes/PreComputedProps.Rmd @@ -34,28 +34,28 @@ library("igraph") ## Comparing two graphs with NetEmd via subgraph counts. ```{r, netemd,fig.align='center'} # Create lattice networks -gLat_1 <- graph.lattice(c(8,8)) +gLat_1 <- graph.lattice(c(20,20)) gLat_2 <- graph.lattice(c(44,44)) -netemd_one_to_one(graph_1=gLat_1,graph_2=gLat_2,feature_type="orbit",max_graphlet_size=5) +netemd_one_to_one(graph_1=gLat_1,graph_2=gLat_2,feature_type="orbit",smoothing_window_width = 1) #Providing a matrix of network features counts_1= count_orbits_per_node(graph = gLat_1,max_graphlet_size = 5) counts_2= count_orbits_per_node(graph = gLat_2,max_graphlet_size = 5) -netemd_one_to_one(dhists_1=counts_1, dhists_2=counts_2) +netemd_one_to_one(dhists_1=counts_1, dhists_2=counts_2,smoothing_window_width = 1) #Providing the network features as lists of dhist objects dhists_1<- graph_features_to_histograms(counts_1) dhists_2<- graph_features_to_histograms(counts_2) -netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2) +netemd_one_to_one(dhists_1=dhists_1, dhists_2=dhists_2,smoothing_window_width = 1) ``` ## Comparing two graphs with NetEmd via their Laplacian spectrum. ```{r, netemdEigen,fig.align='center'} # Networks -gLat_1 <- graph.lattice(c(8,8)) +gLat_1 <- graph.lattice(c(20,20)) gLat_2 <- graph.lattice(c(44,44)) #Laplacian @@ -95,7 +95,7 @@ graph_1 <- read_simple_graph(file.path(source_dir, "EBV.txt"),format = "ncol") graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"),format = "ncol") # Lattice graphs to be used as gold-standard as a reference point comparison -goldstd_1 <- igraph::graph.lattice(c(8,8)) #Graph with 8^2 nodes +goldstd_1 <- igraph::graph.lattice(c(20,20)) #Graph with 8^2 nodes goldstd_2 <- igraph::graph.lattice(c(44,44)) #Graph with 44^2 nodes # Netdis using the goldstd_1 graph as gold-standard reference point diff --git a/vignettes/default_pairwise_usage.Rmd b/vignettes/default_pairwise_usage.Rmd index a8ffcebc..e5ff1e64 100644 --- a/vignettes/default_pairwise_usage.Rmd +++ b/vignettes/default_pairwise_usage.Rmd @@ -7,7 +7,7 @@ vignette: > %\VignetteIndexEntry{default_pairwise_usage} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} - --- +--- ```{r, include = FALSE} knitr::opts_chunk$set( @@ -59,7 +59,7 @@ plot(graph_2,vertex.size=0.5,vertex.label=NA) ## What is NetEmd? (Extracted from Wegner et al. (2017)): -NetEmd is based on the idea that the information encapsulated in the shape of the degree distribution and other network properties reflects the topological organization of the network. From an abstract point of view NetEmd views the shape of a distribution as a property that is invariant under linear deformations i.e$.$ translations and re-scalings of the axis. +NetEmd is based on the idea that the information encapsulated in the shape of the degree distribution and other network properties which reflect the topological organization of the network. From an abstract point of view, NetEmd views the shape of a distribution as a property that is invariant under linear deformations i.e$.$ translations and re-scalings of the axis. Based on these NetEmd uses the following measure between distributions $p$ and $q$ that are supported on $\mathbb{R}$ and have non-zero, finite variances: \begin{equation}\label{emdmet} @@ -93,7 +93,7 @@ graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") # One to one NetEmd comparison. -netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5) +netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",smoothing_window_width = 1)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. ``` ## Comparing two graphs with NetEmd via their Laplacian spectrum. @@ -110,14 +110,14 @@ NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = F props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) -netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0) +netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. ``` ------------------------- # Compare two networks via Netdis and its variants. ## What is Netdis? -(Extracted from Ali et al. (2014)): Netdis counts small subgraphs $w$ on $k$ nodes for all 2-step ego-networks, $k=3,4,5$. These counts are centred by subtracting the expected number of counts $E_w$. The centred counts between the networks are compared to form the Netdis statistic. +(Extracted from Ali et al. (2014)): Netdis counts small subgraphs $w$ on $k$ nodes for all 2-step ego-networks, $k=3,4,5$. These counts are centred by subtracting the expected number of counts $E_w$. These centred counts of each network are then compared thus leading to the Netdis statistic. Netdis is constructed as follows: diff --git a/vignettes/dendrogram_example_net_emd.Rmd b/vignettes/dendrogram_example_net_emd.Rmd index 72fd2750..41434cb1 100644 --- a/vignettes/dendrogram_example_net_emd.Rmd +++ b/vignettes/dendrogram_example_net_emd.Rmd @@ -40,7 +40,7 @@ names(virus_gdds) # of unit width (smoothing_window_width = 1). Returns a named list containing: # (i) the NetEMDs and (ii) a table containing the graph names and indices # within the input GDD list for each pair of graphs compared. -res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) +res <- netemd_many_to_many(virus_gdds, smoothing_window_width = 0) # You can also specify method = "fixed_step" to use the much slower method of # exhaustively evaluating the EMD at all offsets separated by a fixed step. @@ -51,7 +51,7 @@ res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) # normalised to unit variance # Convert to matrix for input to dendrogram method -netemd_mat <- cross_comp_to_matrix(res$net_emds, res$comp_spec) +netemd_mat <- cross_comp_to_matrix(res$netemds, res$comp_spec) netemd_mat ``` @@ -62,7 +62,7 @@ plot(phangorn::upgma(as.dist(netemd_mat), method="average"), use.edge.length=FAL edge.width=cex*2, main=title, cex.lab=cex, cex.axis=cex, cex.main=cex, cex.sub=cex, cex=cex) -# The gdd_for_all_graphs and net_emds_for_all_graphs functions will run in +# The gdd_for_all_graphs and netemd_many_to_many functions will run in # parallel using multiple threads where supported. The number of threads # used is determined by the global R option "mc.cores". You can inspect the # current value of this using options("mc.cores") and set it with From 2d34b25a1238ad4f5c14b094d26df63189aa8d9f Mon Sep 17 00:00:00 2001 From: Luis Ospina Date: Mon, 13 Jul 2020 14:18:32 +0100 Subject: [PATCH 117/188] Correct renaming of net emd many to many. Package documentation compiling correctly. --- NAMESPACE | 1 + R/measures_net_emd.R | 22 ++++++++++++---------- doc/dendrogram_example_net_emd.Rmd | 2 +- vignettes/dendrogram_example_net_emd.Rmd | 2 +- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f2241414..476e7e9a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ export(netdis_one_to_many) export(netdis_one_to_one) export(netdis_subtract_exp_counts) export(netdis_uptok) +export(netemd_many_to_many) export(netemd_one_to_one) export(netemd_single_pair) export(normalise_dhist_mass) diff --git a/R/measures_net_emd.R b/R/measures_net_emd.R index f9a7b497..46b0a89c 100755 --- a/R/measures_net_emd.R +++ b/R/measures_net_emd.R @@ -25,11 +25,6 @@ #' which results in no smoothing. Care should be taken to select a #' \code{smoothing_window_width} that is appropriate for the discrete domain #' (e.g.for the integer domain a width of 1 is the natural choice) -#' @return NetEMD measure for the two sets of discrete histograms (or graphs). If -#' (\code{return_details = FALSE}) then a list with the following named elements is returned -#' \code{net_emd}: the NetEMD for the set of histogram pairs (or graphs), \code{min_emds}: -#' the minimal EMD for each pair of histograms, \code{min_offsets}: the associated -#' offsets giving the minimal EMD for each pair of histograms #' @param feature_type Type of graphlet-based feature to count: "graphlet" #' counts the number of graphlets each node participates in; "orbit" calculates #' the number of graphlet orbits each node participates in. @@ -38,6 +33,11 @@ #' counted. Possible values are 3,4, and 5 (default). #' @param ego_neighbourhood_size The number of steps from the source node to #' include nodes for each ego-network. NetEmd was proposed for individual nodes alone, hence the default value is 0. +#' @return NetEMD measure for the two sets of discrete histograms (or graphs). If +#' (\code{return_details = FALSE}) then a list with the following named elements is returned +#' \code{net_emd}: the NetEMD for the set of histogram pairs (or graphs), \code{min_emds}: +#' the minimal EMD for each pair of histograms, \code{min_offsets}: the associated +#' offsets giving the minimal EMD for each pair of histograms #' @examples #' require(igraph) #' goldstd_1 <- graph.lattice(c(8,8)) @@ -177,8 +177,7 @@ netemd_one_to_one <- function(graph_1=NULL,graph_2=NULL,dhists_1=NULL, dhists_2= #' matrices for each graph pair: \code{min_emds}: the minimal EMD for each GDD #' used to compute the NetEMD, \code{min_offsets}: the associated offsets giving #' the minimal EMD for each GDD -#' @export -#' +#' @export netemd_many_to_many<- function(graphs=NULL,dhists=NULL, method = "optimise", smoothing_window_width = 0, return_details = FALSE, mc.cores = getOption("mc.cores", 2L),feature_type="orbit",max_graphlet_size = 5,ego_neighbourhood_size = 0) { if(max_graphlet_size > 4 & mc.cores > 1) print(paste("This function will compute orbits of graphlets up to size 5 using ", mc.cores," cores. Depending on the density and size of the graphs, this may lead to a large compsumption of RAM.")) @@ -202,8 +201,12 @@ netemd_many_to_many<- function(graphs=NULL,dhists=NULL, method = "optimise", smo which_imput_type <- "Graphs" } if (!is.null(dhists) ) { - if (all(( unlist(sapply(X = dhists, FUN = is.matrix)) ) ) ) { which_imput_type <- "dhist" } - if (all(( unlist(sapply(X = dhists, FUN = is.matrix)) ) ) ) { which_imput_type <- "Matrix" } + if (all(( unlist(sapply(X = dhists, FUN = is.matrix)) ) ) ) {which_imput_type <- "Matrix"} + if (all(( unlist(sapply(X = dhists, FUN = + function(l){ all(( unlist(sapply(X = l, FUN = is_dhist)) ) ) } + )) ) ) ) {which_imput_type <- "dhist"} else { + warning("dhists does not conform to a Matrix or dhist class for all elmenents/netwroks in the list.") + } } ## ------------------------------------------------------------------------ # Check arguments 2 @@ -250,7 +253,6 @@ netemd_many_to_many<- function(graphs=NULL,dhists=NULL, method = "optimise", smo } } - #' Internal function to compute the minimum Earth Mover's Distance between standarized and translated histograms #' #' Calculates the minimum Earth Mover's Distance (EMD) between two diff --git a/doc/dendrogram_example_net_emd.Rmd b/doc/dendrogram_example_net_emd.Rmd index a684d9df..3817b5a5 100644 --- a/doc/dendrogram_example_net_emd.Rmd +++ b/doc/dendrogram_example_net_emd.Rmd @@ -40,7 +40,7 @@ names(virus_gdds) # of unit width (smoothing_window_width = 1). Returns a named list containing: # (i) the NetEMDs and (ii) a table containing the graph names and indices # within the input GDD list for each pair of graphs compared. -res <- net_emds_for_all_graphs(virus_gdds, smoothing_window_width = 0) +res <- netemd_many_to_many(dhists= virus_gdds, smoothing_window_width = 0) # You can also specify method = "fixed_step" to use the much slower method of # exhaustively evaluating the EMD at all offsets separated by a fixed step. diff --git a/vignettes/dendrogram_example_net_emd.Rmd b/vignettes/dendrogram_example_net_emd.Rmd index 41434cb1..4a70fb8d 100644 --- a/vignettes/dendrogram_example_net_emd.Rmd +++ b/vignettes/dendrogram_example_net_emd.Rmd @@ -40,7 +40,7 @@ names(virus_gdds) # of unit width (smoothing_window_width = 1). Returns a named list containing: # (i) the NetEMDs and (ii) a table containing the graph names and indices # within the input GDD list for each pair of graphs compared. -res <- netemd_many_to_many(virus_gdds, smoothing_window_width = 0) +res <- netemd_many_to_many(dhists= virus_gdds, smoothing_window_width = 0) # You can also specify method = "fixed_step" to use the much slower method of # exhaustively evaluating the EMD at all offsets separated by a fixed step. From df9a4673eda5281ad996baaf9a66eaa7ff79ccf1 Mon Sep 17 00:00:00 2001 From: Luis Ospina Date: Mon, 13 Jul 2020 15:54:03 +0100 Subject: [PATCH 118/188] ManyToMany renaming and vignette working. --- R/measures_net_emd.R | 1 + tests/testthat/test_measures_net_emd.R | 6 +++--- vignettes/ManyToMany.Rmd | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/R/measures_net_emd.R b/R/measures_net_emd.R index 46b0a89c..f1d2acaf 100755 --- a/R/measures_net_emd.R +++ b/R/measures_net_emd.R @@ -251,6 +251,7 @@ netemd_many_to_many<- function(graphs=NULL,dhists=NULL, method = "optimise", smo net_emds <- out ret <- list(netemds = net_emds, comp_spec = comp_spec) } + return(ret) } #' Internal function to compute the minimum Earth Mover's Distance between standarized and translated histograms diff --git a/tests/testthat/test_measures_net_emd.R b/tests/testthat/test_measures_net_emd.R index 42cbc654..6a99c71b 100644 --- a/tests/testthat/test_measures_net_emd.R +++ b/tests/testthat/test_measures_net_emd.R @@ -144,10 +144,10 @@ test_that("net_emd returns 0 when comparing any normal histogram randomly offset offset_lists <- replicate(num_hists, offsets, simplify = FALSE) netemd_offset_self <- function(dhist, offsets, method) { - net_emds <- purrr::map_dbl(offsets, function(offset) { + netemds <- purrr::map_dbl(offsets, function(offset) { netemd_one_to_one(dhists_1 = dhist, dhists_2 = shift_dhist(dhist, offset), method = method) }) - return(net_emds) + return(netemds) } expected <- 0 @@ -402,7 +402,7 @@ test_that("netemd_many_to_many works", { # individually and combine into expected output for code under test expected_netemd_fn <- function(gdds) { list( - net_emds = c( + netemds = c( netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$ECL), netemd_one_to_one(dhists_1 =gdds$EBV, dhists_2 = gdds$HSV), netemd_one_to_one(dhists_1 = gdds$EBV, dhists_2 = gdds$KSHV), netemd_one_to_one(dhists_1 =gdds$EBV, dhists_2 = gdds$VZV), netemd_one_to_one(dhists_1 = gdds$ECL, dhists_2 = gdds$HSV), netemd_one_to_one(dhists_1 =gdds$ECL, dhists_2 = gdds$KSHV), diff --git a/vignettes/ManyToMany.Rmd b/vignettes/ManyToMany.Rmd index 3d625233..1bfe7a84 100644 --- a/vignettes/ManyToMany.Rmd +++ b/vignettes/ManyToMany.Rmd @@ -1,10 +1,10 @@ --- -title: 'Default, simple and quick use usage 3: many to many comparions' +title: 'Default, simple and quick use usage 3: many to many comparisons' author: "Luis Ospina-Forero" date: "23/06/2020" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{PreComputedProps} + %\VignetteIndexEntry{ManyToManyComp} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} editor_options: From 4f0f7cea6ea16a7d4fcab08d737c92c9556da6af Mon Sep 17 00:00:00 2001 From: Luis Ospina Date: Mon, 13 Jul 2020 21:25:11 +0100 Subject: [PATCH 119/188] Build checks done and passed with zero errors --- Meta/vignette.rds | Bin 473 -> 532 bytes doc/V-Menu.Rmd | 4 +- doc/V-Menu.html | 4 +- doc/default_pairwise_usage.R | 4 +- doc/default_pairwise_usage.Rmd | 10 ++--- doc/default_pairwise_usage.html | 34 +++++++-------- doc/dendrogram_example_net_dis.html | 4 +- doc/dendrogram_example_net_emd.R | 2 +- doc/dendrogram_example_net_emd.Rmd | 2 +- doc/dendrogram_example_net_emd.html | 62 +++++++++++++-------------- doc/netdis_2graphs_polya-aeppli.html | 4 +- doc/netdis_customisations.html | 4 +- doc/netdis_pairwise_comparisons.html | 4 +- doc/quickstart_netdis_2graphs.Rmd | 2 +- doc/quickstart_netdis_2graphs.html | 4 +- vignettes/ManyToMany.Rmd | 2 +- 16 files changed, 75 insertions(+), 71 deletions(-) diff --git a/Meta/vignette.rds b/Meta/vignette.rds index 2ce1eb8b9f4262bf7afc32d72176154781a215f4..eda91ed9f5d925cc45b3531058d27a84712d2535 100644 GIT binary patch literal 532 zcmV+v0_*)BiwFP!000001GSXTZ__Xo$FsC6TY)h#evAz^bK=w<_8*{?aX>`{O%u1s zO5D{Vj*ac8>y5vfBi&2-Tsdu;?XVr}*w3Fn+h2cQe|3W(2!mE948j%+TL;i}pzTBJ z2JK)MzDEfg(>%42qFhh8VNq`AHS7Nr2@)p>%MvZGH5C!Np+coB%9xE3Ztg)bk$BYJ zkY=+>S@d4?6hO!pox!iw#6(J_Guop}sg!%zJH+DLSSh%nmdni4M8;B+@8w2! zLtlo2bsdYFVD!@>5$TaLvN$xEayubkN940oTHbI`YUZ8{@o}ETMVM5DMd}zScO&A7 zipc(|pcw!7qY#snYI+y-&WB)UA?S*j#~Z9Uu_ARZ-r&u9V1+q7?1kq}FHD;3>2XK( z{;bJVYfz7Sz2;DF@no|_J?bfXv3?A_NNO{{KbX$0j?TiX$aIu5I_61MfKhpLk zx64UuG`PUwmQ&)%fGK!sSExJR*la2_%&QyiYB}w@&7MOG|3WvP&;PBSC{C$yCvw4V WLM`o&!AlHg|8D`(J6A2Z1^@t0WD1=C literal 473 zcmV;~0Ve(*iwFP!000001Fe%!Pr@)5#^*o;jERYfe^9ZWJi)1)UopeVp6P0)?V48is$Ro< zTM-%JG}eK@RF0`4L8|a8={~Xuh;~GhNb*;SSwJS3iI@Zl(LqGj8WdrXL!FtBSr=Ry zPK;C_c>IAom}j)Ay8yZ%SjSgZ#-3)O_yr zNs?xfyf*vA+Stu|iyI@jF&c7-?z4k_hpziQfG*}sPzPP2d3tC}^$6V#(3KF9PbkY2 zuu=m)Op|aHMtPZlItI$-jvR>qIe(bRgED=b)SWy4tj#4_lIM6u>5Mds?K_p!&KlUP zcpMCsRgc59ip!^9r^5Pixc2G-a;R;npe&Tv=x~+2;QvXPxqe|XYuP$0H5NbzD{Qs2 zN3p5wqnB6;kh(RL8SiCU%r)`;@cpNKs=nzqAkWPNGos*U4T9Ah8e4~NK2g1xR0h2p z7;e~`ZuN*TZ)0szzat-GF3miDP($)@*B-WGn*V9K>2&(FoKYBKWiztDW`s5F4$bQm PGyDDlvDLaQ)&&3n88YOA diff --git a/doc/V-Menu.Rmd b/doc/V-Menu.Rmd index 2eae3bef..93723f7b 100644 --- a/doc/V-Menu.Rmd +++ b/doc/V-Menu.Rmd @@ -22,6 +22,8 @@ knitr::opts_chunk$set( This package contains multiple vignettes that illustrate the simple ways of employing the network comparison methods 'Netdis', 'NetEmd' and their variants (e.g. `Netdis Geometric-Poisson'), and also the large flexibility of these methodologies. The following is a list of the available vignettes: -* [Default, simple and quick use usage 1](default_pairwise_usage.html): +* [Default, simple and quick use usage 1: pairwise comparisons](default_pairwise_usage.html): +* [Default, simple and quick use usage 2: precomputed properties](PreComputedProps.html): +* [Default, simple and quick use usage 3: many to many comparions](ManyToMany.html): diff --git a/doc/V-Menu.html b/doc/V-Menu.html index a41e987a..35637614 100644 --- a/doc/V-Menu.html +++ b/doc/V-Menu.html @@ -235,7 +235,9 @@

10-06-2020

Netdis introductory Vignettes

This package contains multiple vignettes that illustrate the simple ways of employing the network comparison methods ‘Netdis’, ‘NetEmd’ and their variants (e.g. `Netdis Geometric-Poisson’), and also the large flexibility of these methodologies. The following is a list of the available vignettes:

diff --git a/doc/default_pairwise_usage.R b/doc/default_pairwise_usage.R index 881c5cde..a56afaa6 100644 --- a/doc/default_pairwise_usage.R +++ b/doc/default_pairwise_usage.R @@ -45,7 +45,7 @@ graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") # One to one NetEmd comparison. -netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5) +netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",smoothing_window_width = 1)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. ## ---- netemdEigen,fig.align='center'------------------------------------------ #Laplacian @@ -60,7 +60,7 @@ NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = F props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) -netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0) +netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. ## ----netdisgoldstand,fig.align='center'--------------------------------------- # Lattice graphs to be used as gold-standard as a reference point comparison diff --git a/doc/default_pairwise_usage.Rmd b/doc/default_pairwise_usage.Rmd index 0ae05e20..e5ff1e64 100644 --- a/doc/default_pairwise_usage.Rmd +++ b/doc/default_pairwise_usage.Rmd @@ -1,5 +1,5 @@ --- -title: "Default pairwise usage of the network comparison methods" +title: "Default, simple and quick use usage 1: pairwise comparisons" date: "`10-06-2020`" author: "Luis Ospina-Forero" output: rmarkdown::html_vignette @@ -59,7 +59,7 @@ plot(graph_2,vertex.size=0.5,vertex.label=NA) ## What is NetEmd? (Extracted from Wegner et al. (2017)): -NetEmd is based on the idea that the information encapsulated in the shape of the degree distribution and other network properties reflects the topological organization of the network. From an abstract point of view NetEmd views the shape of a distribution as a property that is invariant under linear deformations i.e$.$ translations and re-scalings of the axis. +NetEmd is based on the idea that the information encapsulated in the shape of the degree distribution and other network properties which reflect the topological organization of the network. From an abstract point of view, NetEmd views the shape of a distribution as a property that is invariant under linear deformations i.e$.$ translations and re-scalings of the axis. Based on these NetEmd uses the following measure between distributions $p$ and $q$ that are supported on $\mathbb{R}$ and have non-zero, finite variances: \begin{equation}\label{emdmet} @@ -93,7 +93,7 @@ graph_2 <- read_simple_graph(file.path(source_dir, "ECL.txt"), format = "ncol") # One to one NetEmd comparison. -netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",max_graphlet_size=5) +netemd_one_to_one(graph_1=graph_1,graph_2=graph_2,feature_type="orbit",smoothing_window_width = 1)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. ``` ## Comparing two graphs with NetEmd via their Laplacian spectrum. @@ -110,14 +110,14 @@ NLapg_2 <- igraph::laplacian_matrix(graph = graph_2,normalized = TRUE,sparse = F props_1 <- cbind(L.Spectra= eigen(Lapg_1)$values, NL.Spectra= eigen(NLapg_1)$values) props_2 <- cbind(L.Spectra= eigen(Lapg_2)$values, NL.Spectra= eigen(NLapg_2)$values) -netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0) +netemd_one_to_one(dhists_1 = props_1,dhists_2 = props_2,smoothing_window_width = 0)#Use of smoothing window 1 is given for discrete integer distributions. If the network features are considered continuous variables smoothing_window_width equal to zero is recommended. ``` ------------------------- # Compare two networks via Netdis and its variants. ## What is Netdis? -(Extracted from Ali et al. (2014)): Netdis counts small subgraphs $w$ on $k$ nodes for all 2-step ego-networks, $k=3,4,5$. These counts are centred by subtracting the expected number of counts $E_w$. The centred counts between the networks are compared to form the Netdis statistic. +(Extracted from Ali et al. (2014)): Netdis counts small subgraphs $w$ on $k$ nodes for all 2-step ego-networks, $k=3,4,5$. These counts are centred by subtracting the expected number of counts $E_w$. These centred counts of each network are then compared thus leading to the Netdis statistic. Netdis is constructed as follows: diff --git a/doc/default_pairwise_usage.html b/doc/default_pairwise_usage.html index ee3d8ee9..13f24329 100644 --- a/doc/default_pairwise_usage.html +++ b/doc/default_pairwise_usage.html @@ -13,7 +13,7 @@ -Default pairwise usage of the network comparison methods +Default, simple and quick use usage 1: pairwise comparisons