Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add make_moons generator + some doc reorg #185

Merged
merged 3 commits into from
Jan 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
/Manifest.toml
docs/build
.vscode
/test.jl
87 changes: 57 additions & 30 deletions docs/src/api.md
Original file line number Diff line number Diff line change
@@ -1,61 +1,88 @@
```@meta
CollapsedDocStrings = true
```

# API Reference

## Index

```@index
Order = [:type, :function]
Modules = [MLUtils]
Pages = ["api.md"]
## Core API

```@docs
getobs
getobs!
numobs
```

## Docs
## Lazy Transforms

```@docs
filterobs
groupobs
joinobs
mapobs
shuffleobs
```

## Batching, Iteration, and Views

```@docs
batch
batchsize
batchseq
BatchView
chunk
DataLoader
eachobs
DataLoader
obsview
ObsView
randobs
```

## Partitioning

```@docs
leavepout
kfolds
splitobs
```

## Array Constructors

```@docs
falses_like
fill_like
filterobs
ones_like
trues_like
zeros_like
```

## Resampling

```@docs
oversample
undersample
```

## Operations

```@docs
chunk
flatten
getobs
getobs!
joinobs
group_counts
group_indices
groupobs
kfolds
leavepout
mapobs
numobs
normalise
obsview
ObsView
ones_like
oversample
randobs
rpad_constant
shuffleobs
splitobs
stack
unbatch
undersample
unsqueeze
unstack
zeros_like
```


## Datasets Docs
## Datasets

```@docs
Datasets.load_iris
Datasets.make_sin
Datasets.make_spiral
Datasets.make_poly
Datasets.make_moons
```


4 changes: 3 additions & 1 deletion src/Datasets/Datasets.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
module Datasets
using Random
using DelimitedFiles: readdlm
using MLUtils: getobs, shuffleobs

include("load_datasets.jl")
export load_iris

include("generators.jl")
export make_spiral,
make_poly,
make_sin
make_sin,
make_moons

end
67 changes: 61 additions & 6 deletions src/Datasets/generators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
make_sin(n, start, stop; noise = 0.3, f_rand = randn) -> x, y

Generates `n` noisy equally spaced samples of a sine wave from `start` to `stop`
by adding `noise .* f_rand(length(x))` to the result of `fun(x)`.
by adding `noise .* f_rand(length(x))` to the result of `sin(x)`.

Returns the vector `x` with the samples and the noisy response `y`.
"""
function make_sin(n::Int = 50, start::Real = 0, stop::Real = 2π; noise::Real = 0.3, f_rand::Function = randn)
x = collect(range(start, stop=stop, length=n))
Expand All @@ -15,10 +17,15 @@ end
make_poly(coef, x; noise = 0.01, f_rand = randn) -> x, y

Generates a noisy response for a polynomial of degree `length(coef) - 1`
using the vector `x` as input and adding `noise .* f_randn(length(x))` to the result.
and with the coefficients given by `coef`. The response is generated
by elementwise evaluation of the polynomial on the elements of `x`
and adding `noise .* f_rand(length(x))` to the result.

The vector `coef` contains the coefficients for the terms of the polynomial.
The first element of `coef` denotes the coefficient for the term with
the highest degree, while the last element of `coef` denotes the intercept.

Return the input `x` and the noisy response `y`.
"""
function make_poly(coef::AbstractVector{R}, x::AbstractVector{T};
noise::Real = 0.1, f_rand::Function = randn) where {T<:Real,R<:Real}
Expand All @@ -32,16 +39,18 @@ function make_poly(coef::AbstractVector{R}, x::AbstractVector{T};
end
end
y .+= noise .* f_rand.()
x_vec, y
return x_vec, y
end


"""
make_spiral(n, a, theta, b; noise = 0.01, f_rand = randn) -> x, y

Generates `n` noisy responses for a spiral with two labels. Uses the radius, angle
and scaling arguments to space the points in 2D space and adding `noise .* f_rand(n)`
to the response.

Returns the 2 x n matrix `x` with the coordinates of the samples and the
vector `y` with the labels.
"""
function make_spiral(n::Int = 97, a::Real = 6.5, theta::Real = 16.0, b::Real=104.0;
noise::Real = 0.1, f_rand::Function = randn)
Expand All @@ -62,5 +71,51 @@ function make_spiral(n::Int = 97, a::Real = 6.5, theta::Real = 16.0, b::Real=104
end
x[1, :] .+= noise .* f_rand.()
x[2, :] .+= noise .* f_rand.()
x, y
end
return x, y
end


"""
make_moons(n; noise=0.0, f_rand=randn, shuffle=true) -> x, y

Generate a dataset with two interleaving half circles.

If `n` is an integer, the number of samples is `n` and the number of samples
for each half circle is `n ÷ 2`. If `n` is a tuple, the first element of the tuple
denotes the number of samples for the first half circle and the second element
denotes the number of samples for the second half circle.

The noise level can be controlled by the `noise` argument.

Set `shuffle=false` to keep the order of the samples.

Returns a 2 x n matrix with the the samples.
"""
function make_moons(n_samples::Union{Int, Tuple{Int, Int}} = 100;
noise::Number = 0.0, f_rand = randn, shuffle::Bool = true)

rng = Random.default_rng()
if n_samples isa Tuple
@assert length(n_samples) == 2
n_samples_1, n_samples_2 = n_samples
else
n_samples_1 = n_samples ÷ 2
n_samples_2 = n_samples - n_samples_1
end
t_min, t_max = 0.0, π
t_inner = rand(rng, n_samples_1) * (t_max - t_min) .+ t_min
t_outer = rand(rng, n_samples_2) * (t_max - t_min) .+ t_min
outer_circ_x = cos.(t_outer)
outer_circ_y = sin.(t_outer)
inner_circ_x = 1 .- cos.(t_inner)
inner_circ_y = 0.5 .- sin.(t_inner)

x = [outer_circ_x outer_circ_y; inner_circ_x inner_circ_y]
x = permutedims(x, (2, 1))
x .+= noise .* f_rand(rng, size(x))
y = [fill(1, n_samples_1); fill(2, n_samples_2)]
if shuffle
x, y = getobs(shuffleobs((x, y)))
end
return x, y
end
17 changes: 17 additions & 0 deletions test/Datasets/generators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,20 @@ end
# test_plot = scatterplot(xtmp[1, 1:97], xtmp[2, 1:97], title="Spiral Function", color=:blue, name="pos")
# print(scatterplot!(test_plot, xtmp[1, 98:194], xtmp[2, 98:194], color=:yellow, name="neg" ))
end


@testset "make_moons" begin
    # Noise-free, unshuffled draw of 100 points, so the two halves of the
    # output can be inspected separately by column range.
    samples, labels = Datasets.make_moons(100, noise=0, shuffle=false)
    @test size(samples) == (2, 100)
    @test size(labels) == (100,)

    first_half, second_half = 1:50, 51:100

    # Columns 1:50 carry label 1, columns 51:100 carry label 2.
    @test all(==(1), labels[first_half])
    @test all(==(2), labels[second_half])

    # Columns 1:50: both coordinates stay within [-1, 1].
    lo, hi = extrema(samples[1, first_half])
    @test lo >= -1
    @test hi <= 1
    lo, hi = extrema(samples[2, first_half])
    @test lo >= -1
    @test hi <= 1

    # Columns 51:100: first coordinate in [0, 2], second in [-0.5, 0.5].
    lo, hi = extrema(samples[1, second_half])
    @test lo >= 0
    @test hi <= 2
    lo, hi = extrema(samples[2, second_half])
    @test lo >= -0.5
    @test hi <= 0.5
end
Loading