diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index dd911f3..623860f 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -1,11 +1,15 @@ name: TagBot on: - schedule: - - cron: 0 0 * * * + issue_comment: + types: + - created + workflow_dispatch: jobs: TagBot: + if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' runs-on: ubuntu-latest steps: - uses: JuliaRegistries/TagBot@v1 with: token: ${{ secrets.GITHUB_TOKEN }} + ssh: ${{ secrets.DOCUMENTER_KEY }} \ No newline at end of file diff --git a/Project.toml b/Project.toml index adff472..58a980d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,22 +1,24 @@ name = "PopGenSims" uuid = "e670a858-c023-4621-870e-30d0f49b8322" authors = ["Pavel Dimens and contributors"] -version = "0.0.3" +version = "0.0.4" [deps] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" InvertedIndices = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" PopGen = "af524d12-c74b-11e9-22a8-3b091653023f" +RandomNumbers = "e6cf234a-135c-5ec9-84dd-332b85af5143" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" [compat] DataFrames = "0.21, 0.22" InvertedIndices = "1.0" -PooledArrays = "0.5" -StaticArrays = "0.12, 1.0" -PopGen = "0.0.3, 0.1, 0.4" +PooledArrays = "1.0, 1.1" +PopGen = "0.4" +RandomNumbers = "1.4" +StaticArrays = "1.0" StatsBase = "0.33" julia = "1.4" diff --git a/README.md b/README.md index 0c391ea..34d21d8 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,14 @@ -![logo](PopGenSims.png) -[![alt text](https://img.shields.io/badge/docs-stable-informational?style=for-the-badge&logo=Read%20The%20Docs&logoColor=white)](https://pdimens.github.io/PopGen.jl/docs/simulations/simulate_samples) +![logo](misc/PopGenSims.png) +[![alt text](https://img.shields.io/badge/docs-stable-informational?style=for-the-badge&logo=Read%20The%20Docs&logoColor=white)](https://pdimens.github.io/PopGen.jl/docs/simulations/simulate_samples) +[![alt text](https://img.shields.io/badge/slack-join%20PopGen.jl-9d72b1?style=for-the-badge&logo=slack)](https://join.slack.com/t/popgenjl/shared_invite/zt-deam65n8-DuBs2z1oDtsbBuRplJW~Pg) +![build_status](https://img.shields.io/github/workflow/status/pdimens/PopGenSims.jl/CI_dev?label=Dev%20Build&logo=GitHub&style=for-the-badge) + + ## Create population genetics simulations -This package builds off of [PopGen.jl](http://github.com/pdimens/PopGen.jl) and +This package builds off of [PopGen.jl](http://github.com/BioJulia/PopGen.jl) and simulates offspring that would be generated under certain conditions. With this package you can simulate the offspring of specific individuals, simulate full-sibs, half-sibs, unrelated individuals, and parent-offspring pairs for use with PopGen.jl. You can also randomly generate samples given population-specific allele frequencies. ### Installation -```julia -julia>] - -pkg(v1.5)> add "http://github.com/pdimens/PopGenSims.jl" -``` - -### Usage -Documentation for PopGenSims.jl has been integrated into [the PopGen.jl docs](https://pdimens.github.io/PopGen.jl/docs/simulations/simulate_samples) +![installation](misc/install.png) \ No newline at end of file diff --git a/PopGenSims.png b/misc/PopGenSims.png similarity index 100% rename from PopGenSims.png rename to misc/PopGenSims.png diff --git a/misc/install.png b/misc/install.png new file mode 100644 index 0000000..a472730 Binary files /dev/null and b/misc/install.png differ diff --git a/src/Cross.jl b/src/Cross.jl index 340638d..012252f 100644 --- a/src/Cross.jl +++ b/src/Cross.jl @@ -1,7 +1,7 @@ export cross, backcross function sample_genotype(geno::T, n_alleles::Int) where T<:Genotype - sample(SVector(geno), n_alleles, replace = false) + sample(Xoroshiro128Star(), SVector(geno), n_alleles, replace = false) end function sample_genotype(geno::Missing, n_alleles::Int) @@ -12,7 +12,7 @@ function haploid_cross!(data::DataFrame, p1::T, p2::T; n::Int) where T <: GenoAr iter_df = DataFrames.groupby(data, :name) for simulation in iter_df all_alleles = getindex.(collect.(zip.(p1,p2)), 1) - offspring_geno = Tuple(Base.Iterators.flatten(rand.(all_alleles, 1)) |> collect) + offspring_geno = Tuple(Base.Iterators.flatten(rand.(Xoroshiro128Star(), all_alleles, 1)) |> collect) miss_idx = reduce(union, findall.(i -> i == (missing,), offspring_geno)) simulation.genotype[Not(miss_idx)] .= offspring_geno[Not(miss_idx)] end @@ -71,7 +71,6 @@ function cross(data::PopData, parent1::String, parent2::String; n::Int = 100, ge out_population = fill(generation, n * length(loci)) out_geno = similar(p1, n * length(loci)) out_loci = DataFrame(:name => out_offspring, :population => out_population, :locus => out_loci_names, :genotype => out_geno) - #categorical!(out_loci, [:name, :population, :locus], compress = true) out_loci.name = PooledArray(out_loci.name) out_loci.population = PooledArray(out_loci.population) out_loci.locus = PooledArray(out_loci.locus) diff --git a/src/PopGenSims.jl b/src/PopGenSims.jl index cc466a0..0f3bee0 100644 --- a/src/PopGenSims.jl +++ b/src/PopGenSims.jl @@ -1,15 +1,17 @@ module PopGenSims using DataFrames, PooledArrays, StaticArrays -using StatsBase: sample +using StatsBase: sample, Weights +using RandomNumbers.Xorshifts using PopGen: - PopData, + allele_freq, Genotype, GenoArray, get_genotypes, + PopData, read_from, - write_to, - sort + sort, + write_to include("Cross.jl") include("Samples.jl") diff --git a/src/Samples.jl b/src/Samples.jl index 19e4fe1..f45e9ac 100644 --- a/src/Samples.jl +++ b/src/Samples.jl @@ -31,7 +31,7 @@ julia> sample_locus(d, 3, 3) function sample_locus(locus::Dict, n::Int, ploidy::Signed) isempty(locus) && return fill(missing, n) k,v = collect(keys(locus)), collect(values(locus)) - alleles = [sample(k, Weights(v), n) for i in 1:ploidy] + alleles = [sample(Xoroshiro128Star(), k, Weights(v), n) for i in 1:ploidy] Tuple.(sort.(eachrow(hcat(alleles...)))) end @@ -55,34 +55,38 @@ PopData Object Coordinates: absent julia> sims.meta -1700×5 DataFrame -│ Row │ name │ population │ ploidy │ longitude │ latitude │ -│ │ String │ String │ Int64 │ Missing │ Missing │ -├──────┼──────────┼────────────┼────────┼───────────┼──────────┤ -│ 1 │ sim_1 │ 1 │ 2 │ missing │ missing │ -│ 2 │ sim_2 │ 1 │ 2 │ missing │ missing │ -│ 3 │ sim_3 │ 1 │ 2 │ missing │ missing │ -│ 4 │ sim_4 │ 1 │ 2 │ missing │ missing │ -⋮ -│ 1696 │ sim_1696 │ 17 │ 2 │ missing │ missing │ -│ 1697 │ sim_1697 │ 17 │ 2 │ missing │ missing │ -│ 1698 │ sim_1698 │ 17 │ 2 │ missing │ missing │ -│ 1699 │ sim_1699 │ 17 │ 2 │ missing │ missing │ -│ 1700 │ sim_1700 │ 17 │ 2 │ missing │ missing │ + 1700×5 DataFrame + Row │ name population ploidy longitude latitude + │ String String Int8 Missing Missing +──────┼─────────────────────────────────────────────────── + 1 │ sim_1 1 2 missing missing + 2 │ sim_2 1 2 missing missing + 3 │ sim_3 1 2 missing missing + 4 │ sim_4 1 2 missing missing + 5 │ sim_5 1 2 missing missing + ⋮ │ ⋮ ⋮ ⋮ ⋮ ⋮ + 1697 │ sim_1697 17 2 missing missing + 1698 │ sim_1698 17 2 missing missing + 1699 │ sim_1699 17 2 missing missing + 1700 │ sim_1700 17 2 missing missing + 1691 rows omitted + julia> sims.loci 15300×4 DataFrame -│ Row │ name │ population │ locus │ genotype │ -│ │ String │ String │ String │ Tuple…? │ -├───────┼──────────┼────────────┼────────┼────────────┤ -│ 1 │ sim_1 │ 1 │ fca8 │ (135, 135) │ -│ 2 │ sim_1 │ 1 │ fca23 │ (132, 140) │ -│ 3 │ sim_1 │ 1 │ fca43 │ (139, 139) │ -│ 4 │ sim_1 │ 1 │ fca45 │ (126, 126) │ -⋮ -│ 15297 │ sim_1700 │ 17 │ fca78 │ (142, 142) │ -│ 15298 │ sim_1700 │ 17 │ fca90 │ (199, 199) │ -│ 15299 │ sim_1700 │ 17 │ fca96 │ (113, 113) │ -│ 15300 │ sim_1700 │ 17 │ fca37 │ (208, 208) │ + Row │ name population locus genotype + │ String String String Tuple…? +───────┼────────────────────────────────────────── + 1 │ sim_1 1 fca8 (135, 143) + 2 │ sim_1 1 fca23 (136, 146) + 3 │ sim_1 1 fca43 (141, 145) + 4 │ sim_1 1 fca45 (120, 126) + 5 │ sim_1 1 fca77 (156, 156) + ⋮ │ ⋮ ⋮ ⋮ ⋮ + 15297 │ sim_1700 17 fca78 (150, 150) + 15298 │ sim_1700 17 fca90 (197, 197) + 15299 │ sim_1700 17 fca96 (113, 113) + 15300 │ sim_1700 17 fca37 (208, 208) + 15291 rows omitted ``` """ function simulate(data::PopData; n::Int = 100) @@ -113,6 +117,13 @@ function simulate(data::PopData; n::Int = 100) for pop in pops out_gdf[(population = pop,)].genotype .= reduce(hcat, geno_gdf[(population = pop,)].frq) |> permutedims |> vec end + transform!( + geno_out, + :name => PooledArray => :name, + :population => PooledArray => :population, + :locus => PooledArray => :locus, + :genotype + ) # regenerate meta info meta_df = DataFrames.combine( @@ -120,7 +131,7 @@ function simulate(data::PopData; n::Int = 100) :name => first => :name, :population => first => :population ) - meta_df[!, :ploidy] .= 2 + meta_df[!, :ploidy] .= ploidy meta_df[!, :longitude] .= missing meta_df[!, :latitude] .= missing diff --git a/src/Sibship.jl b/src/Sibship.jl index 8fe95b1..7d60500 100644 --- a/src/Sibship.jl +++ b/src/Sibship.jl @@ -9,21 +9,21 @@ function _cross(parent1::Vector{Vector{T}}, parent2::Vector{Vector{T}}) where T ploidy = length(first(parent1)) ploidy == 1 && error("Haploid crosses are not yet supported. Please file and issue or pull request") if ploidy == 2 - p1_contrib = rand.(parent1) - p2_contrib = rand.(parent2) + p1_contrib = rand.(Xoroshiro128Star(),parent1) + p2_contrib = rand.(Xoroshiro128Star(),parent2) geno_out = sort.(zip(p1_contrib, p2_contrib)) elseif iseven(ploidy) n_allele = ploidy ÷ 2 - p1_contrib = sample.(parent1, n_allele, replace = false) - p2_contrib = sample.(parent2, n_allele, replace = false) + p1_contrib = sample.(Xoroshiro128Star(),parent1, n_allele, replace = false) + p2_contrib = sample.(Xoroshiro128Star(),parent2, n_allele, replace = false) geno_out = Tuple.(sort!.(append!.(p1_contrib, p2_contrib))) else # special method to provide a 50% chance of one parent giving more alleles than the other - rng = rand() + rng = rand(Xoroshiro128Star()) contrib_1 = ploidy ÷ 2 contrib_2 = ploidy - contrib_1 - p1_contrib = rng > 0.5 ? sample.(parent1, contrib_1, replace = false) : sample.(parent1, contrib_2, replace = false) - p2_contrib = rng > 0.5 ? sample.(parent2, contrib_2, replace = false) : sample.(parent2, contrib_1, replace = false) + p1_contrib = rng > 0.5 ? sample.(Xoroshiro128Star(),parent1, contrib_1, replace = false) : sample.(Xoroshiro128Star(),parent1, contrib_2, replace = false) + p2_contrib = rng > 0.5 ? sample.(Xoroshiro128Star(),parent2, contrib_2, replace = false) : sample.(Xoroshiro128Star(),parent2, contrib_1, replace = false) geno_out = Tuple.(sort!.(append!.(p1_contrib, p2_contrib))) end return geno_out @@ -195,9 +195,9 @@ julia> fullsib_sims.meta_df100×5 DataFrame """ function simulate_sibship(data::PopData; n::Int = 500, relationship::String = "nothing", ploidy::Int = 0) if relationship == "nothing" - error("Please use the keyword \'relationship\' and specify one of: \n- \"fullsib\" \n- \"halfsib\" \n- \"unrelated\"\n- \"parent-offspring\"") + throw(ArgumentError("Please use the keyword \'relationship\' and specify one of: \n- \"fullsib\" \n- \"halfsib\" \n- \"unrelated\"\n- \"parent-offspring\"")) elseif relationship ∉ ["fullsib", "halfsib", "unrelated", "parent-offspring"] - error("relationship = \"$relationship\" is invalid, please specify one of: \n- \"fullsib\" \n- \"halfsib\" \n- \"unrelated\"\n- \"parent-offspring\"") + throw(ArgumentError("relationship = \"$relationship\" is invalid, please specify one of: \n- \"fullsib\" \n- \"halfsib\" \n- \"unrelated\"\n- \"parent-offspring\"")) end # automatic ploidy finding if ploidy == 0 diff --git a/src/Utils.jl b/src/Utils.jl index a8b820b..241ba3b 100644 --- a/src/Utils.jl +++ b/src/Utils.jl @@ -59,7 +59,6 @@ function Base.append!(data::PopData, data2::PopData) append!(data.meta, data2.meta) - data.loci.name = decompress(data.loci.name) append!(data.loci, data2.loci) return data end @@ -125,7 +124,6 @@ function append(data::PopData, data2::PopData) append!(tmp.meta, tmp2.meta) - tmp.loci.name = decompress(tmp.loci.name) append!(tmp.loci, tmp2.loci) return tmp end @@ -135,20 +133,16 @@ function allele_pool(locus::T) where T <: GenoArray Tuple(Base.Iterators.flatten(skipmissing(locus))) end - -function Base.sort(x::NTuple{N,T}) where N where T <: Signed - Tuple(sort(SVector(x))) -end - function allele_pool(data::PopData) # index dataframe by locus idx_df = groupby(data.loci, [:locus]) # instantiate dict to store alleles - allele_dict = Dict{String,NTuple}() + #allele_dict = Dict{String,Tuple}() # pull out loci names loc = getindex.(keys(idx_df), :locus) - [allele_dict[i] = allele_pool(idx_df[(;locus = i)].genotype) for i in loc] - return String.(loc), allele_dict + allele_dict = Dict(i => allele_pool(idx_df[(;locus = i)].genotype) for i in loc) + #[allele_dict[i] = allele_pool(idx_df[(;locus = i)].genotype) for i in loc] + return string.(loc), allele_dict end """ @@ -162,7 +156,7 @@ an individual with a given `ploidy`. Returns a Vector of genotypes. ``` julia> cats = nancycats() ; julia> loc, alleles = allele_pool(cats) ; -julia> simulate_parent(alleles, loc, ploidy = 2) +julia> simulate_sample(alleles, loc, ploidy = 2) 9-element Array{Array{Int16,1},1}: [139, 129] [146, 146] @@ -175,6 +169,6 @@ julia> simulate_parent(alleles, loc, ploidy = 2) [208, 208] ``` """ -function simulate_sample(alleles::Dict{String,NTuple}, loc::Vector{String}; ploidy::Int) - map(i -> rand(alleles[i], ploidy) ,loc) +function simulate_sample(alleles::Dict{String,<:Tuple}, loc::Vector{String}; ploidy::Int) + map(i -> rand(Xoroshiro128Star(), alleles[i], ploidy) ,loc) end diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..b2d8dfa --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,3 @@ +[deps] +PopGen = "af524d12-c74b-11e9-22a8-3b091653023f" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/cross.jl b/test/cross.jl new file mode 100644 index 0000000..f41b9a5 --- /dev/null +++ b/test/cross.jl @@ -0,0 +1,35 @@ +module TestCross + +using PopGen +using PopGenSims +using Test + +cats = @nancycats ; + +@testset "sample_genotypes" begin + @test length(PopGenSims.sample_genotype(cats.loci.genotype[30], 2)) == 2 + @test eltype(PopGenSims.sample_genotype(cats.loci.genotype[30],1)) == eltype(cats.loci.genotype[30]) + @test length(PopGenSims.sample_genotype(missing, 2)) == 1 + @test first(PopGenSims.sample_genotype(missing, 2)) === missing +end + +@testset "crosses" begin + f1 = cross(cats, "N111", "N107", n = 10) + @test typeof(f1) == PopData + @test length(f1.meta.name) == 10 + @test length(f1.loci.name) == 90 + f1 = cross(cats, "N111", "N107", n = 10, generation = "firstgen") + @test typeof(f1) == PopData + @test length(f1.meta.name) == 10 + @test length(f1.loci.name) == 90 + f2 = cross(cats => "N111", f1 => "firstgen_offspring_10", n = 10) + @test typeof(f2) == PopData + @test length(f2.meta.name) == 10 + @test length(f2.loci.name) == 90 + f2 = cross(cats => "N111", f1 => "firstgen_offspring_10", n = 10, generation = "F2") + @test typeof(f2) == PopData + @test length(f2.meta.name) == 10 + @test length(f2.loci.name) == 90 +end + +end # module TestCross \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index cb38b21..d2cde92 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,36 @@ +fatalerrors = length(ARGS) > 0 && ARGS[1] == "-f" +quiet = length(ARGS) > 0 && ARGS[1] == "-q" +anyerrors = false + +using PopGen using PopGenSims using Test -@testset "PopGenSims.jl" begin - # Write your tests here. +all_tests = [ + "cross.jl", + "samples.jl", + "utils.jl", + "sibship.jl", +] + +println("Running tests:") + +for a_test in all_tests + try + include(a_test) + println("\t\033[1m\033[32mPASSED\033[0m: $(a_test)") + catch e + global anyerrors = true + println("\t\033[1m\033[31mFAILED\033[0m: $(a_test)") + if fatalerrors + rethrow(e) + elseif !quiet + showerror(stdout, e, backtrace()) + println() + end + end end + +if anyerrors + throw("Tests failed :(") +end \ No newline at end of file diff --git a/test/samples.jl b/test/samples.jl new file mode 100644 index 0000000..2b464e6 --- /dev/null +++ b/test/samples.jl @@ -0,0 +1,26 @@ +module SamplesTest + +using PopGen +using PopGenSims +using Test + +cats = @nancycats ; + +@testset "sample_locus" begin + d = Dict(133 => 0.125, 135 => 0.5625, 143 => 0.25, 137 => 0.0625) + @test PopGenSims.sample_locus(d, 3, 2) |> length == 3 + @test PopGenSims.sample_locus(d, 3, 2) |> first |> length == 2 + @test PopGenSims.sample_locus(d, 3, 3) |> length == 3 + @test PopGenSims.sample_locus(d, 3, 3) |> first |> length == 3 +end + + + +@testset "simulate" begin + sims = simulate(cats , n = 100) + @test typeof(sims) == PopData + @test length(sims.meta.name) == 1700 + @test length(sims.loci.name) == 15300 +end + +end # module \ No newline at end of file diff --git a/test/sibship.jl b/test/sibship.jl new file mode 100644 index 0000000..7f96d7d --- /dev/null +++ b/test/sibship.jl @@ -0,0 +1,39 @@ +module TestSibship + +using PopGen +using PopGenSims +using Test + +cats = @nancycats ; +sims_un = simulate_sibship(cats, n = 50, relationship= "unrelated") +sims_hs = simulate_sibship(cats, n = 50, relationship= "halfsib") +sims_fs = simulate_sibship(cats, n = 50, relationship= "fullsib") +sims_po = simulate_sibship(cats, n = 50, relationship= "parent-offspring") + + +@testset "unrelated" begin + @test typeof(sims_un) == PopData + @test length(sims_un.meta.name) == 100 + @test length(sims_un.loci.name) == 900 +end + +@testset "halfsib" begin + @test typeof(sims_hs) == PopData + @test length(sims_hs.meta.name) == 100 + @test length(sims_hs.loci.name) == 900 +end + +@testset "fullsib" begin + @test typeof(sims_fs) == PopData + @test length(sims_fs.meta.name) == 100 + @test length(sims_fs.loci.name) == 900 +end + +@testset "parent offspring" begin + @test typeof(sims_po) == PopData + @test length(sims_po.meta.name) == 100 + @test length(sims_po.loci.name) == 900 +end + + +end # module \ No newline at end of file diff --git a/test/utils.jl b/test/utils.jl new file mode 100644 index 0000000..1c2d3f7 --- /dev/null +++ b/test/utils.jl @@ -0,0 +1,35 @@ +module TestUtils + +using PopGen +using PopGenSims +using Test + +cats = @nancycats ; +cats2 = deepcopy(cats) + +@testset "append PopData" begin + @test length(append(cats, cats2).meta.name) == 2 * length(cats.meta.name) + @test length(append(cats, cats2).loci.name) == 2 * length(cats.loci.name) + @test typeof(append(cats, cats2)) == PopData + append!(cats2, cats) + @test length(cats2.meta.name) == 2 * length(cats.meta.name) + @test length(cats2.loci.name) == 2 * length(cats.loci.name) + @test typeof(cats2) == PopData +end + +@testset "allele pool" begin + @test PopGenSims.allele_pool(cats.loci.genotype[1:30]) |> length == 56 + @test PopGenSims.allele_pool(cats.loci.genotype[1:30]) |> typeof <: NTuple + a,b = PopGenSims.allele_pool(cats) + @test eltype(a) == String + @test length(a) == 9 + @test typeof(b) <: Dict + @test length(b) == length(a) + c = PopGenSims.simulate_sample(b, a, ploidy = 2) + @test length(c) == length(a) == length(b) + @test all(length.(c) .== 2) + c3 = PopGenSims.simulate_sample(b, a, ploidy = 3) + @test all(length.(c3) .== 3) +end + +end # module \ No newline at end of file