Merge pull request #57 from JuliaDynamics/PR-Hegger_Cao_dimension

hegger dimension and automated Cao's method
JuliaDynamics · Nov 14, 2020 · c20d2fa · c20d2fa · Datseris · Nov 14, 2020
2 parents 82ce3bd + 8817dbb
commit c20d2fa
Show file tree

Hide file tree

Showing 18 changed files with 1,044 additions and 680 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,15 @@
+# v1.12.0
+* Possible delay times in `optimal_traditional_de` are now `1:100` for increased accuracy.
+* New method for univariate non-unified delay embedding by Hegger, Kantz
+* It is now possible to `embed` in one dimension (which just returns the vector as a Dataset)
+* New function `optimal_traditional_de` for automated delay embeddings
+* `delay_afnn, delay_ifnn, delay_fnn, delay_f1nn` are part of public API now.
+* The argument `γs` and the function `reconstruct` is starting to be phased out in
+  favor of `ds` and `embed`.
+
+## Deprecations
+* Using `estimate_dimension` is deprecated in favor of either calling `afnn, fnn, ...` directly or using the function `optimal_traditional_de`
+
 # v1.11.0
 * Dropped RecipesBase
 * Now exports `dimension` (there was a problem where we have forgotten to export `dimension` from here, and `DynamicalSystemsBase` was overwritting it)

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "DelayEmbeddings"
 uuid = "5732040d-69e3-5649-938a-b6b4f237613f"
 repo = "https://github.com/JuliaDynamics/DelayEmbeddings.jl.git"
-version = "1.11.0"
+version = "1.12.0"
 
 [deps]
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
@@ -21,7 +21,7 @@ NearestNeighbors = "0.4"
 Neighborhood = "0.1, 0.2"
 StaticArrays = "0.8, 0.11, 0.12"
 StatsBase = "0.24, 0.32, 0.33"
-julia = "1"
+julia = "1.5"
 
 [extras]
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"

diff --git a/src/DelayEmbeddings.jl b/src/DelayEmbeddings.jl
@@ -6,14 +6,16 @@ module DelayEmbeddings
 
 include("dataset.jl")
 include("embeddings.jl")
-include("various.jl")
+include("utils.jl")
 include("neighborhoods.jl")
 
-include("estimate_delay.jl")
-include("estimate_dimension.jl")
-include("pecora.jl")
-include("uzal_cost.jl")
-include("MDOP.jl")
-include("garcia_almeida.jl")
+include("traditional_de/estimate_delay.jl")
+include("traditional_de/estimate_dimension.jl")
+include("traditional_de/automated.jl")
+
+include("unified_de/pecora.jl")
+include("unified_de/uzal_cost.jl")
+include("unified_de/MDOP.jl")
+include("unified_de/garcia_almeida.jl")
 
 end
diff --git a/src/dataset.jl b/src/dataset.jl
@@ -1,7 +1,7 @@
 using StaticArrays, LinearAlgebra
 using Base.Iterators: flatten
 
-export Dataset, AbstractDataset, minima, maxima
+export Dataset, AbstractDataset, SVector, minima, maxima
 export minmaxima, columns, regularize, dimension
 
 abstract type AbstractDataset{D, T} end
@@ -101,7 +101,6 @@ end
 ###########################################################################
 # Concrete implementation
 ###########################################################################
-
 """
     Dataset{D, T} <: AbstractDataset{D,T}
 A dedicated interface for datasets.
@@ -146,8 +145,7 @@ Dataset(s::Dataset) = s
 ###########################################################################
 # Dataset(Vectors of stuff)
 ###########################################################################
-Dataset(s::AbstractVector{T}) where {T<:Number} =
-Dataset{1, T}(reinterpret(SVector{1, T}, s))
+Dataset(s::AbstractVector{T}) where {T} = Dataset(SVector.(s))
 
 function Dataset(v::Vector{<:AbstractArray{T}}) where {T<:Number}
     D = length(v[1])

diff --git a/src/embeddings.jl b/src/embeddings.jl
@@ -134,14 +134,14 @@ Systems and Turbulence*, Lecture Notes in Mathematics **366**, Springer (1981)
 """
 function reconstruct(s::AbstractVector{T}, γ, τ) where {T}
     if γ == 0
-        return Dataset{1, T}(s)
+        return Dataset(s)
     end
     de::DelayEmbedding{γ} = DelayEmbedding(Val{γ}(), τ)
     return reconstruct(s, de)
 end
 function reconstruct(s::AbstractVector{T}, γ, τ, w) where {T}
     if γ == 0
-        return Dataset{1, T}(s)
+        return Dataset(s)
     end
     de = WeightedDelayEmbedding(Val{γ}(), τ, w)
     return reconstruct(s, de)

diff --git a/src/traditional_de/automated.jl b/src/traditional_de/automated.jl
@@ -0,0 +1,187 @@
+export optimal_traditional_de
+
+"""
+    optimal_traditional_de(s, method = "afnn", dmethod = "mi_min"; kwargs...) → 𝒟, τ, E
+
+Produce an optimal delay embedding `𝒟` of the given timeseries `s` by
+using the traditional approach of first finding an optimal (and constant) delay
+time using [`estimate_delay`](@ref) with the given `dmethod`, and then an optimal
+embedding dimension, by calculating an appropriate statistic for each dimension `d ∈ 1:dmax`.
+Return the embedding `𝒟`, the optimal delay time `τ`
+(the optimal embedding dimension `d` is just `size(𝒟, 2)`) and the actual
+statistic `E` used to estimate optimal `d`.
+
+Notice that `E` is a function of the embedding dimension, which ranges from 1 to `dmax`.
+
+For calculating `E` to estimate the dimension we use the given `method` which can be:
+
+* `"afnn"` (default) is Cao's "Averaged False Nearest Neighbors" method[^Cao1997],
+    which gives a ratio of distances between nearest neighbors.
+* `"ifnn"` is the "Improved False Nearest Neighbors" from Hegger & Kantz[^Hegger1999],
+    which gives the fraction of false nearest neighbors.
+* `"fnn"` is Kennel's "False Nearest Neighbors" method[^Kennel1992], which gives
+    the number of points that cease to be "nearest neighbors" when the dimension
+    increases.
+* `"f1nn"` is Krakovská's "False First Nearest Neighbors" method[^Krakovská2015],
+    which gives the ratio of pairs of points that cease to be "nearest neighbors"
+    when the dimension increases.
+
+For more details, see individual methods: [`delay_afnn`](@ref), [`delay_ifnn`](@ref),
+[`delay_fnn`](@ref), [`delay_f1nn`](@ref).
+The special keywords `` denote for which possible embedding
+dimensions should the statistics be computed for.
+
+!!! warn "Careful in automated methods"
+    While this method is automated if you want to be **really sure** of the results,
+    you should directly calculate the statistic and plot its values versus the
+    dimensions.
+
+## Keywords
+The keywords
+```
+τs = 1:100, dmax = 10
+```
+denote which delay times and embedding dimensions `ds ∈ 1:dmax` to consider when calculating
+optimal embedding. All remaining keywords are propagated to the low level functions:
+```
+fnn_thres::Real = 0.05, slope_thres::Real= 0.2, w::Int=1,
+rtol=10.0, atol=2.0, τs = 1:100, metric = Euclidean(), r::Real=2.0
+```
+
+## Description
+We estimate the optimal embedding dimension based on the given delay time gained
+from `dmethod` as follows: For Cao's method the optimal dimension is reached,
+when the slope of the `E₁`-statistic (output from `"afnn"`) falls below the
+threshold `slope_thres` (Default is .05) and the according stochastic test turns
+out to be false, i.e. when the `E₂`-statistic is not "equal" to 1 for all en-
+countered dimensions. We treat `E₂`-values as equal to 1, when `1-E₂ ≤ fnn_thres`.
+For all the other methods we return the optimal embedding dimension
+when the corresponding FNN-statistic (output from `"ifnn"`, `"fnn"` or `"f1nn"`)
+falls below the fnn-threshold `fnn_thres` (Default is .05) AND the slope of the
+statistic falls below the threshold `slope_thres`. Note that with noise
+contaminated time series, one might need to adjust `fnn_thres` according to the
+noise level.
+
+See also the file `test/compare_different_dimension_estimations.jl` for a comparison.
+
+[^Cao1997]: Liangyue Cao, [Physica D, pp. 43-50 (1997)](https://www.sciencedirect.com/science/article/pii/S0167278997001188?via%3Dihub)
+
+[^Kennel1992]: M. Kennel *et al.*, [Phys. Review A **45**(6), (1992)](https://journals.aps.org/pra/abstract/10.1103/PhysRevA.45.3403).
+
+[^Krakovská2015]: Anna Krakovská *et al.*, [J. Complex Sys. 932750 (2015)](https://doi.org/10.1155/2015/932750)
+
+[^Hegger1999]: Hegger & Kantz, [Improved false nearest neighbor method to detect determinism in time series data. Physical Review E 60, 4970](https://doi.org/10.1103/PhysRevE.60.4970).
+"""
+function optimal_traditional_de(s::AbstractVector, dimensionmethod::String = "afnn",
+        delaymethod::String= "mi_min";
+        fnn_thres::Real = 0.05, slope_thres::Real = .05, dmax::Int = 10, w::Int=1,
+        rtol=10.0, atol=2.0, τs = 1:100, metric = Euclidean(), r::Real=2.0,
+    )
+
+    # TODO: This function needs to be reworked to use dimensions 2:dmax.
+    # starting with dimension 1:dmax is pointless, because 1 can **never** be
+    # a proper embedding dimension!!!
+
+    @assert dimensionmethod ∈ ("afnn", "fnn", "ifnn", "f1nn")
+    τ = estimate_delay(s, delaymethod, τs)
+    ds = 1:dmax
+    γs = ds .- 1 # TODO: This must be updated to dimension in 2.0
+
+    if dimensionmethod=="afnn"
+        dimension_statistic = delay_afnn(s, τ, ds, metric)
+        Y, τ = find_cao_optimal(s, τ, dimension_statistic, slope_thres)
+        E2 = stochastic_indicator(s, τ, ds)
+        flag = is_stochastic(E2, fnn_thres)
+        flag && println("Stochastic signal, valid embedding NOT achieved ⨉.")
+    elseif dimensionmethod=="fnn"
+        dimension_statistic = delay_fnn(s, τ, ds; rtol, atol)
+        Y, τ = find_fnn_optimal(s, τ, dimension_statistic, fnn_thres, slope_thres)
+    elseif dimensionmethod=="ifnn"
+        dimension_statistic = delay_ifnn(s, τ, ds; r, w, metric)
+        Y, τ = find_fnn_optimal(s, τ, dimension_statistic, fnn_thres, slope_thres)
+    elseif dimensionmethod=="f1nn"
+        dimension_statistic = delay_f1nn(s, τ, ds, metric)
+        Y, τ = find_fnn_optimal(s, τ, dimension_statistic, fnn_thres, slope_thres)
+    end
+    return Y, τ, dimension_statistic
+end
+
+
+"""
+Helper function for selecting the appropriate embedding dimension from the
+statistic when using Kennel's, Hegger's or Krakovskas's method.
+"""
+function find_fnn_optimal(s::Vector{T}, τ::Int, rat::Vector, fnn_thres::Real,
+                                                    slope_thres::Real) where {T}
+    @assert length(rat) > 1
+    y = abs.(diff(rat))
+    flag = false
+    m = 0
+    for i = 2:length(rat)
+        if rat[i] ≤ fnn_thres && y[i-1] ≤ slope_thres
+            m = i
+            break
+        elseif rat[i] > rat[i-1]
+            flag = true
+            m = i-1
+            break
+        end
+    end
+    if m == length(rat) || m == 0
+        println("Sufficiently small FNNs NOT reached. "*
+                "Valid embedding NOT achieved ⨉.")
+    elseif flag
+        println("Algorithm stopped due to increasing FNNs. "*
+        "Double-check the FNN-statistic.")
+    else
+        println("Algorithm stopped due to sufficiently small FNNs. "*
+                "Valid embedding achieved ✓.")
+    end
+    Y = embed(s, max(1, m), τ) # you can embed in 1 dimension in latest version
+    return Y, τ
+end
+
+
+"""
+Helper function for selecting the appropriate embedding dimension from the
+statistic when using Cao's method.
+"""
+function find_cao_optimal(s::Vector{T}, τ::Int, rat::Vector, thres::Real) where {T}
+    m = 0
+    y = abs.(diff(rat))
+    for i = 1:length(rat)-1
+        if y[i] ≤ thres && rat[i] > 0.5
+            m = i
+            break
+        end
+    end
+    if m == 0
+        println("NO convergence of E₁-statistic. "*
+                "Valid embedding NOT achieved ⨉.")
+    else
+        Y =  m > 1 ? embed(s, m, τ) : s
+        println("Algorithm stopped due to convergence of E₁-statistic. "*
+                "Valid embedding achieved ✓.")
+    end
+    Y = embed(s, max(1, m), τ) # you can embed in 1 dimension in latest version
+    return Y, τ
+end
+
+"""
+Helper function for Cao's method, whether to decide if the input signal is
+stochastic or not.
+"""
+function is_stochastic(rat::Vector{T}, thres::Real) where {T}
+    cnt = 0
+    for i = 1:length(rat)
+        if abs(1-rat[i]) ≥ thres
+            cnt += 1
+        end
+    end
+    if cnt == 0
+        flag = true
+    else
+        flag = false
+    end
+    return flag
+end
diff --git a/src/estimate_delay.jl → src/traditional_de/estimate_delay.jl b/src/estimate_delay.jl → src/traditional_de/estimate_delay.jl
@@ -5,7 +5,7 @@ export estimate_delay, exponential_decay_fit, autocor
 #                               Estimate Delay Times                                #
 #####################################################################################
 """
-    estimate_delay(s, method::String [, τs = 1:2:100]; kwargs...) -> τ
+    estimate_delay(s, method::String [, τs = 1:100]; kwargs...) -> τ
 
 Estimate an optimal delay to be used in [`reconstruct`](@ref) or [`embed`](@ref).
 The `method` can be one of the following:
@@ -28,7 +28,7 @@ The method `mi_min` is significantly more accurate than the others and also retu
 good results for most timeseries. It is however the slowest method (but still quite fast!).
 """
 function estimate_delay(x::AbstractVector, method::String,
-    τs = 1:2:min(100, length(x)); kwargs...)
+    τs = 1:min(100, length(x)); kwargs...)
 
     issorted(τs) || error("`τs` must be sorted")
 
@@ -59,7 +59,7 @@ function estimate_delay(x::AbstractVector, method::String,
         return round(Int,τ)
     elseif method=="exp_extrema"
         c = autocor(x, τs; demean=true)
-        max_ind, min_ind = localextrema(c)
+        max_ind, min_ind = findlocalextrema(c)
         idxs = sort!(append!(max_ind, min_ind))
         ca = abs.(c[idxs])
         τa = τs[idxs]
@@ -115,44 +115,9 @@ function exponential_decay_fit(X, Y, weight = :equal)
     end
 end
 
-"""
-    localextrema(y) -> max_ind, min_ind
-Find the local extrema of given array `y`, by scanning point-by-point. Return the
-indices of the maxima (`max_ind`) and the indices of the minima (`min_ind`).
-"""
-function localextrema(y)
-    @inbounds begin
-        l = length(y)
-        i = 1
-        maxargs = Int[]
-        minargs = Int[]
-        if y[1] > y[2]
-            push!(maxargs, 1)
-        elseif y[1] < y[2]
-            push!(minargs, 1)
-        end
-
-        for i in 2:l-1
-            left = i-1
-            right = i+1
-            if  y[left] < y[i] > y[right]
-                push!(maxargs, i)
-            elseif y[left] > y[i] < y[right]
-                push!(minargs, i)
-            end
-        end
-
-        if y[l] > y[l-1]
-            push!(maxargs, l)
-        elseif y[l] < y[l-1]
-            push!(minargs, l)
-        end
-        return maxargs, minargs
-    end
-end
 
 #####################################################################################
-#                               Estimate Delay Times                                #
+#                               Mutual information                                  #
 #####################################################################################
 export mutualinformation
 """