From 2cae9eb2aca644bfe579ac0095a866ba880ba08b Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 25 Jan 2025 12:26:11 +0100 Subject: [PATCH 1/3] reorganize docs --- docs/src/api.md | 87 +++++++++++++++++++++++++------------- src/Datasets/Datasets.jl | 3 +- src/Datasets/generators.jl | 61 +++++++++++++++++++++++--- 3 files changed, 114 insertions(+), 37 deletions(-) diff --git a/docs/src/api.md b/docs/src/api.md index 36d6a9b..326d4bd 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -1,61 +1,88 @@ +```@meta +CollapsedDocStrings = true +``` + # API Reference -## Index -```@index -Order = [:type, :function] -Modules = [MLUtils] -Pages = ["api.md"] +## Core API + +```@docs +getobs +getobs! +numobs ``` -## Docs +## Lazy Transforms + +```@docs +filterobs +groupobs +joinobs +mapobs +shuffleobs +``` + +## Batching, Iteration, and Views ```@docs batch batchsize batchseq BatchView -chunk -DataLoader eachobs +DataLoader +obsview +ObsView +randobs +``` + +## Partitioning + +```@docs +leavepout +kfolds +splitobs +``` + +## Array Constructors + +```@docs +falses_like fill_like -filterobs +ones_like +trues_like +zeros_like +``` + +## Resampling + +```@docs +oversample +undersample +``` + +## Operations + +```@docs +chunk flatten -getobs -getobs! 
-joinobs group_counts group_indices -groupobs -kfolds -leavepout -mapobs -numobs normalise -obsview -ObsView -ones_like -oversample -randobs rpad_constant -shuffleobs -splitobs stack unbatch -undersample unsqueeze unstack -zeros_like ``` - -## Datasets Docs +## Datasets ```@docs Datasets.load_iris Datasets.make_sin Datasets.make_spiral Datasets.make_poly +Datasets.make_moons ``` - - diff --git a/src/Datasets/Datasets.jl b/src/Datasets/Datasets.jl index 06204c7..99b16c5 100644 --- a/src/Datasets/Datasets.jl +++ b/src/Datasets/Datasets.jl @@ -8,6 +8,7 @@ export load_iris include("generators.jl") export make_spiral, make_poly, - make_sin + make_sin, + make_moons end \ No newline at end of file diff --git a/src/Datasets/generators.jl b/src/Datasets/generators.jl index 8cf28c4..9fe64a3 100644 --- a/src/Datasets/generators.jl +++ b/src/Datasets/generators.jl @@ -3,7 +3,9 @@ make_sin(n, start, stop; noise = 0.3, f_rand = randn) -> x, y Generates `n` noisy equally spaces samples of a sinus from `start` to `stop` -by adding `noise .* f_rand(length(x))` to the result of `fun(x)`. +by adding `noise .* f_rand(length(x))` to the result of `sin(x)`. + +Returns the vector `x` with the samples and the noisy response `y`. """ function make_sin(n::Int = 50, start::Real = 0, stop::Real = 2π; noise::Real = 0.3, f_rand::Function = randn) x = collect(range(start, stop=stop, length=n)) @@ -15,10 +17,15 @@ end make_poly(coef, x; noise = 0.01, f_rand = randn) -> x, y Generates a noisy response for a polynomial of degree `length(coef)` -using the vector `x` as input and adding `noise .* f_randn(length(x))` to the result. +and with the coefficients given by `coef`. The response is generated +by elementwise computation of the polynomial on the elements of `x` +and adding `noise .* f_rand(length(x))` to the result. + The vector `coef` contains the coefficients for the terms of the polynome.
The first element of `coef` denotes the coefficient for the term with the highest degree, while the last element of `coef` denotes the intercept. + +Return the input `x` and the noisy response `y`. """ function make_poly(coef::AbstractVector{R}, x::AbstractVector{T}; noise::Real = 0.1, f_rand::Function = randn) where {T<:Real,R<:Real} @@ -32,16 +39,18 @@ function make_poly(coef::AbstractVector{R}, x::AbstractVector{T}; end end y .+= noise .* f_rand.() - x_vec, y + return x_vec, y end - """ make_spiral(n, a, theta, b; noise = 0.01, f_rand = randn) -> x, y Generates `n` noisy responses for a spiral with two labels. Uses the radius, angle and scaling arguments to space the points in 2D space and adding `noise .* f_randn(n)` to the response. + +Returns the 2 x n matrix `x` with the coordinates of the samples and the +vector `y` with the labels. """ function make_spiral(n::Int = 97, a::Real = 6.5, theta::Real = 16.0, b::Real=104.0; noise::Real = 0.1, f_rand::Function = randn) @@ -62,5 +71,45 @@ function make_spiral(n::Int = 97, a::Real = 6.5, theta::Real = 16.0, b::Real=104 end x[1, :] .+= noise .* f_rand.() x[2, :] .+= noise .* f_rand.() - x, y -end \ No newline at end of file + return x, y +end + + +""" + make_moons(n; noise=0.0, f_rand=randn, shuffle=true) -> x, y + +Generate a dataset with two interleaving half circles. The number of samples +is given by `n_samples` and the noise level can be controlled by the `noise` argument. + +Returns a 2 x n matrix with the the samples. +Set `shuffle=false` to keep the order of the samples. 
+""" +function make_moons(n_samples::Union{Int, Tuple{Int, Int}} = 100; + noise::Number = 0.0, f_rand = randn, shuffle::Bool = true) + + T = Float64 + rng = Randon.default_rng() + if n_samples isa Tuple + @assert length(n_samples) == 2 + n_samples_1, n_samples_2 = n_samples + else + n_samples_1 = n_samples ÷ 2 + n_samples_2 = n_samples - n_samples_1 + end + t_min, t_max = T(0), T(π) + t_inner = rand(rng, T, n_samples_1) * (t_max - t_min) .+ t_min + t_outer = rand(rng, T, n_samples_2) * (t_max - t_min) .+ t_min + outer_circ_x = cos.(t_outer) + outer_circ_y = sin.(t_outer) .+ T(1) + inner_circ_x = 1 .- cos.(t_inner) + inner_circ_y = 1 .- sin.(t_inner) .- T(1) + + x = [outer_circ_x outer_circ_y; inner_circ_x inner_circ_y] + x = permutedims(x, (2, 1)) + x .+= T(noise) * f_rand(rng, T, size(z)) + y = [fill(1, n_samples_1); fill(2, n_samples_2)] + if shuffle + x, y = getobs(shuffleobs((x, y))) + end + return x, y +end From 4befdca2319edbdacc8762eb0ae9140b7e86ab43 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 25 Jan 2025 12:38:44 +0100 Subject: [PATCH 2/3] fix --- .gitignore | 1 + src/Datasets/Datasets.jl | 1 + src/Datasets/generators.jl | 28 +++++++++++++++++----------- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 48f8ce3..2709867 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ /Manifest.toml docs/build .vscode +/test.jl \ No newline at end of file diff --git a/src/Datasets/Datasets.jl b/src/Datasets/Datasets.jl index 99b16c5..d5a2597 100644 --- a/src/Datasets/Datasets.jl +++ b/src/Datasets/Datasets.jl @@ -1,6 +1,7 @@ module Datasets using Random using DelimitedFiles: readdlm +using MLUtils: getobs, shuffleobs include("load_datasets.jl") export load_iris diff --git a/src/Datasets/generators.jl b/src/Datasets/generators.jl index 9fe64a3..dcd7cf9 100644 --- a/src/Datasets/generators.jl +++ b/src/Datasets/generators.jl @@ -78,17 +78,23 @@ end """ make_moons(n; noise=0.0, f_rand=randn, shuffle=true) -> x, 
y -Generate a dataset with two interleaving half circles. The number of samples -is given by `n_samples` and the noise level can be controlled by the `noise` argument. +Generate a dataset with two interleaving half circles. + +If `n` is an integer, the number of samples is `n`: the first half circle gets +`n ÷ 2` samples and the second one the remaining ones. If `n` is a tuple, the first element +denotes the number of samples for the first (upper, label `1`) half circle and the second element +denotes the number of samples for the second (lower, label `2`) half circle. + +The noise level can be controlled by the `noise` argument. -Returns a 2 x n matrix with the the samples. Set `shuffle=false` to keep the order of the samples. + +Returns a 2 x n matrix `x` with the samples and a vector `y` with the labels (`1` or `2`). """ function make_moons(n_samples::Union{Int, Tuple{Int, Int}} = 100; noise::Number = 0.0, f_rand = randn, shuffle::Bool = true) - T = Float64 - rng = Randon.default_rng() + rng = Random.default_rng() if n_samples isa Tuple @assert length(n_samples) == 2 n_samples_1, n_samples_2 = n_samples @@ -96,17 +102,17 @@ function make_moons(n_samples::Union{Int, Tuple{Int, Int}} = 100; n_samples_1 = n_samples ÷ 2 n_samples_2 = n_samples - n_samples_1 end - t_min, t_max = T(0), T(π) - t_inner = rand(rng, T, n_samples_1) * (t_max - t_min) .+ t_min - t_outer = rand(rng, T, n_samples_2) * (t_max - t_min) .+ t_min + t_min, t_max = 0.0, π + t_outer = rand(rng, n_samples_1) * (t_max - t_min) .+ t_min + t_inner = rand(rng, n_samples_2) * (t_max - t_min) .+ t_min outer_circ_x = cos.(t_outer) - outer_circ_y = sin.(t_outer) .+ T(1) + outer_circ_y = sin.(t_outer) inner_circ_x = 1 .- cos.(t_inner) - inner_circ_y = 1 .- sin.(t_inner) .- T(1) + inner_circ_y = 0.5 .- sin.(t_inner) x = [outer_circ_x outer_circ_y; inner_circ_x inner_circ_y] x = permutedims(x, (2, 1)) - x .+= T(noise) * f_rand(rng, T, size(z)) + x .+= noise .* f_rand(rng, size(x)) y = [fill(1, n_samples_1); fill(2, n_samples_2)] if shuffle x, y = getobs(shuffleobs((x, y))) From
48581fc9a41e949139c506105e6ef1817b1953e1 Mon Sep 17 00:00:00 2001 From: Carlo Lucibello Date: Sat, 25 Jan 2025 12:45:53 +0100 Subject: [PATCH 3/3] tests --- test/Datasets/generators.jl | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/Datasets/generators.jl b/test/Datasets/generators.jl index f3a3cf7..afa976a 100644 --- a/test/Datasets/generators.jl +++ b/test/Datasets/generators.jl @@ -28,3 +28,20 @@ end # test_plot = scatterplot(xtmp[1, 1:97], xtmp[2, 1:97], title="Spiral Function", color=:blue, name="pos") # print(scatterplot!(test_plot, xtmp[1, 98:194], xtmp[2, 98:194], color=:yellow, name="neg" )) end + + +@testset "make_moons" begin + x, y = Datasets.make_moons(100, noise=0, shuffle=false) + @test size(x) == (2, 100) + @test size(y) == (100,) + @test all(==(1), y[1:50]) + @test all(==(2), y[51:100]) + @test minimum(x[1,1:50]) >= -1 + @test maximum(x[1,1:50]) <= 1 + @test minimum(x[2,1:50]) >= 0 + @test maximum(x[2,1:50]) <= 1 + @test minimum(x[1,51:100]) >= 0 + @test maximum(x[1,51:100]) <= 2 + @test minimum(x[2,51:100]) >= -0.5 + @test maximum(x[2,51:100]) <= 0.5 +end \ No newline at end of file